diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,65144 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999462510077936, + "eval_steps": 500, + "global_step": 9302, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.52533925718378, + "learning_rate": 3.5714285714285714e-06, + "loss": 2.1829, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.8917807391691538, + "learning_rate": 7.142857142857143e-06, + "loss": 1.9838, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 2.0748449554888, + "learning_rate": 1.0714285714285714e-05, + "loss": 1.9405, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 0.6850439438507572, + "learning_rate": 1.4285714285714285e-05, + "loss": 2.0572, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 5.233502306710884, + "learning_rate": 1.7857142857142855e-05, + "loss": 2.017, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 2.1190437540863045, + "learning_rate": 2.1428571428571428e-05, + "loss": 2.1505, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 78.75075566973746, + "learning_rate": 2.5e-05, + "loss": 1.9595, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 5.491814243878795, + "learning_rate": 2.857142857142857e-05, + "loss": 1.9188, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 1.0583872857696868, + "learning_rate": 3.214285714285714e-05, + "loss": 2.2103, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 0.885577656564717, + "learning_rate": 3.571428571428571e-05, + "loss": 2.0415, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 2.4388687007921046, + "learning_rate": 3.928571428571428e-05, + "loss": 2.0554, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 0.5874308565723774, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.9838, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 0.7501093864144527, + "learning_rate": 4.642857142857143e-05, + "loss": 2.103, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 0.7408455109981018, + "learning_rate": 5e-05, + "loss": 1.8546, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 1.0611197819252878, + "learning_rate": 5.357142857142857e-05, + "loss": 1.9603, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 0.9561526566868136, + "learning_rate": 5.714285714285714e-05, + "loss": 1.9864, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 0.8141552130974743, + "learning_rate": 6.0714285714285715e-05, + "loss": 1.8761, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 0.9098149261285031, + "learning_rate": 6.428571428571427e-05, + "loss": 1.767, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 0.8866212598000296, + "learning_rate": 6.785714285714285e-05, + "loss": 2.1052, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 2.1741636005388276, + "learning_rate": 7.142857142857142e-05, + "loss": 1.9065, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 1.3333604730387096, + "learning_rate": 7.5e-05, + "loss": 2.1443, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 1.3112663647673257, + "learning_rate": 7.857142857142857e-05, + "loss": 1.9296, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 1.4304082960837077, + "learning_rate": 8.214285714285714e-05, + "loss": 1.8629, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 1.5095743351765647, + "learning_rate": 8.571428571428571e-05, + "loss": 2.096, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 3.5458230630716185, + "learning_rate": 8.928571428571429e-05, + "loss": 2.0096, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 2.761767871209414, + "learning_rate": 9.285714285714286e-05, + "loss": 1.9721, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 5.81257268388485, + "learning_rate": 9.642857142857143e-05, + "loss": 2.0577, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 4.486947255640102, + "learning_rate": 0.0001, + "loss": 1.916, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 2.0720761542391175, + "learning_rate": 0.00010357142857142858, + "loss": 1.9207, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 1.6505238857303257, + "learning_rate": 0.00010714285714285714, + "loss": 2.0371, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 9.291313495395931, + "learning_rate": 0.00011071428571428571, + "loss": 2.0557, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 1.7313893024574583, + "learning_rate": 0.00011428571428571428, + "loss": 1.9889, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 1.5935351470037225, + "learning_rate": 0.00011785714285714286, + "loss": 1.9542, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 1.066218796348364, + "learning_rate": 0.00012142857142857143, + "loss": 1.9928, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 1.3713446928171047, + "learning_rate": 0.000125, + "loss": 1.9124, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 1.3523693102135053, + "learning_rate": 0.00012857142857142855, + "loss": 2.0267, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 0.9515514517206934, + "learning_rate": 0.00013214285714285715, + "loss": 1.8685, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 0.9729557725179657, + "learning_rate": 0.0001357142857142857, + "loss": 1.7549, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 1.564147690807114, + "learning_rate": 0.0001392857142857143, + "loss": 1.9658, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 1.002970099447507, + "learning_rate": 0.00014285714285714284, + "loss": 2.0966, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 1.0123033933136292, + "learning_rate": 0.00014642857142857144, + "loss": 1.9739, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 1.7633814778573, + "learning_rate": 0.00015, + "loss": 1.847, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 1.1686064851334461, + "learning_rate": 0.0001535714285714286, + "loss": 1.9012, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 0.9361446756319182, + "learning_rate": 0.00015714285714285713, + "loss": 1.9118, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 1.5200884270670607, + "learning_rate": 0.00016071428571428573, + "loss": 1.9476, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 1.450100365815453, + "learning_rate": 0.00016428571428571428, + "loss": 1.7931, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 1.288950921055788, + "learning_rate": 0.00016785714285714285, + "loss": 1.9861, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 0.803610469800681, + "learning_rate": 0.00017142857142857143, + "loss": 1.9061, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 1.4356333181084613, + "learning_rate": 0.000175, + "loss": 1.8542, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 1.2358575078876701, + "learning_rate": 0.00017857142857142857, + "loss": 1.9251, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 1.0864304720316573, + "learning_rate": 0.00018214285714285714, + "loss": 1.8825, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 1.1309351656853563, + "learning_rate": 0.00018571428571428572, + "loss": 1.846, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 0.9981874490118918, + "learning_rate": 0.0001892857142857143, + "loss": 1.7579, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 1.065892064141929, + "learning_rate": 0.00019285714285714286, + "loss": 1.8208, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 0.8816359706176079, + "learning_rate": 0.00019642857142857144, + "loss": 1.7919, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 1.0578385928836473, + "learning_rate": 0.0002, + "loss": 1.8116, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 0.9421173828756452, + "learning_rate": 0.00020357142857142858, + "loss": 1.7467, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 1.6289823592081432, + "learning_rate": 0.00020714285714285716, + "loss": 1.655, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 1.4741147658119709, + "learning_rate": 0.00021071428571428573, + "loss": 1.7672, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 2.57246021644073, + "learning_rate": 0.00021428571428571427, + "loss": 1.6553, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 1.1375051314898432, + "learning_rate": 0.00021785714285714287, + "loss": 1.7275, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 1.3522322434105656, + "learning_rate": 0.00022142857142857142, + "loss": 1.6659, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 1.3498064220242347, + "learning_rate": 0.00022500000000000002, + "loss": 1.7152, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 1.4251392957299767, + "learning_rate": 0.00022857142857142857, + "loss": 1.5624, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 1.119320800018593, + "learning_rate": 0.00023214285714285717, + "loss": 1.667, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 1.0918414494076893, + "learning_rate": 0.0002357142857142857, + "loss": 1.5963, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 0.9746103076509893, + "learning_rate": 0.0002392857142857143, + "loss": 1.504, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 0.9492418576791255, + "learning_rate": 0.00024285714285714286, + "loss": 1.6585, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 0.8174319326574438, + "learning_rate": 0.00024642857142857143, + "loss": 1.5783, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 1.184127183820156, + "learning_rate": 0.00025, + "loss": 1.6589, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 0.6662930221337083, + "learning_rate": 0.0002535714285714286, + "loss": 1.5221, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 1.2301561957715066, + "learning_rate": 0.0002571428571428571, + "loss": 1.7142, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 0.7534991533961916, + "learning_rate": 0.0002607142857142857, + "loss": 1.5588, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 0.5609113126608429, + "learning_rate": 0.0002642857142857143, + "loss": 1.5584, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 0.958306688822885, + "learning_rate": 0.00026785714285714287, + "loss": 1.523, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 0.6114223761007516, + "learning_rate": 0.0002714285714285714, + "loss": 1.6552, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 0.6799693126275542, + "learning_rate": 0.000275, + "loss": 1.6153, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 0.846263726121709, + "learning_rate": 0.0002785714285714286, + "loss": 1.5276, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 0.5330196945414835, + "learning_rate": 0.00028214285714285716, + "loss": 1.5136, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 0.6961419376772497, + "learning_rate": 0.0002857142857142857, + "loss": 1.4979, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 0.44617572572355957, + "learning_rate": 0.0002892857142857143, + "loss": 1.5388, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 0.5404931034315101, + "learning_rate": 0.0002928571428571429, + "loss": 1.53, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 0.625345659039276, + "learning_rate": 0.00029642857142857145, + "loss": 1.5604, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 0.3973305228158473, + "learning_rate": 0.0003, + "loss": 1.5368, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 0.5150470529051457, + "learning_rate": 0.00030357142857142855, + "loss": 1.5111, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 0.5165971204137647, + "learning_rate": 0.0003071428571428572, + "loss": 1.6526, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 0.5061721865580466, + "learning_rate": 0.00031071428571428575, + "loss": 1.5447, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 0.7102067251701049, + "learning_rate": 0.00031428571428571427, + "loss": 1.4482, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 0.5244806059205487, + "learning_rate": 0.00031785714285714284, + "loss": 1.6764, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 0.5221727806541643, + "learning_rate": 0.00032142857142857147, + "loss": 1.6043, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 0.7848075737761744, + "learning_rate": 0.00032500000000000004, + "loss": 1.5074, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 0.4290039738082572, + "learning_rate": 0.00032857142857142856, + "loss": 1.5482, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 0.5232736843606722, + "learning_rate": 0.00033214285714285713, + "loss": 1.5139, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 0.44862425899364705, + "learning_rate": 0.0003357142857142857, + "loss": 1.5531, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 0.4858044328676179, + "learning_rate": 0.00033928571428571433, + "loss": 1.5697, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.40783462592279174, + "learning_rate": 0.00034285714285714285, + "loss": 1.4921, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 0.4833210430418762, + "learning_rate": 0.0003464285714285714, + "loss": 1.6326, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 0.3349518955445, + "learning_rate": 0.00035, + "loss": 1.481, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 0.5119405076678242, + "learning_rate": 0.0003535714285714286, + "loss": 1.5565, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 0.4644397775386018, + "learning_rate": 0.00035714285714285714, + "loss": 1.5632, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.3798960509359317, + "learning_rate": 0.0003607142857142857, + "loss": 1.6424, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 0.38836469924030503, + "learning_rate": 0.0003642857142857143, + "loss": 1.5311, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 0.40155819248605457, + "learning_rate": 0.0003678571428571429, + "loss": 1.4503, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 0.5577097012658978, + "learning_rate": 0.00037142857142857143, + "loss": 1.4728, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 0.3302891581452391, + "learning_rate": 0.000375, + "loss": 1.4451, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.5105057674282057, + "learning_rate": 0.0003785714285714286, + "loss": 1.4823, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 0.34139026979805664, + "learning_rate": 0.0003821428571428571, + "loss": 1.5526, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 0.3526373961195756, + "learning_rate": 0.0003857142857142857, + "loss": 1.6479, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 0.36739582888447553, + "learning_rate": 0.0003892857142857143, + "loss": 1.5424, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 0.31509373847267547, + "learning_rate": 0.0003928571428571429, + "loss": 1.7238, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.7305039688024733, + "learning_rate": 0.0003964285714285714, + "loss": 1.5271, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 0.3581584781956524, + "learning_rate": 0.0004, + "loss": 1.6029, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 0.3224198059244875, + "learning_rate": 0.0004035714285714286, + "loss": 1.5386, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 0.3471545607154616, + "learning_rate": 0.00040714285714285717, + "loss": 1.468, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 0.3904049444253878, + "learning_rate": 0.0004107142857142857, + "loss": 1.6968, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.3121284899151761, + "learning_rate": 0.0004142857142857143, + "loss": 1.7429, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 0.3244004142779458, + "learning_rate": 0.0004178571428571429, + "loss": 1.5028, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 0.5252291278727379, + "learning_rate": 0.00042142857142857146, + "loss": 1.4616, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 0.3378902942457399, + "learning_rate": 0.000425, + "loss": 1.4332, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 0.28884442583413833, + "learning_rate": 0.00042857142857142855, + "loss": 1.3443, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 0.35609856553313685, + "learning_rate": 0.0004321428571428572, + "loss": 1.5219, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 0.35554901725913257, + "learning_rate": 0.00043571428571428575, + "loss": 1.5723, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 0.3440599128217032, + "learning_rate": 0.00043928571428571427, + "loss": 1.3977, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 0.31192531792308886, + "learning_rate": 0.00044285714285714284, + "loss": 1.5228, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 0.32046127353392895, + "learning_rate": 0.00044642857142857147, + "loss": 1.4763, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 0.3101840268627974, + "learning_rate": 0.00045000000000000004, + "loss": 1.4582, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 0.4288094131576573, + "learning_rate": 0.00045357142857142856, + "loss": 1.5662, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 0.44158522598033534, + "learning_rate": 0.00045714285714285713, + "loss": 1.4745, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 0.3239538132892933, + "learning_rate": 0.0004607142857142857, + "loss": 1.4899, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 0.2871588061072913, + "learning_rate": 0.00046428571428571433, + "loss": 1.6439, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 0.38892543132373336, + "learning_rate": 0.00046785714285714285, + "loss": 1.4552, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 0.3825473937891277, + "learning_rate": 0.0004714285714285714, + "loss": 1.5597, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 0.4309140645349847, + "learning_rate": 0.000475, + "loss": 1.5602, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 0.4017136556479656, + "learning_rate": 0.0004785714285714286, + "loss": 1.5129, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 0.28489478417342257, + "learning_rate": 0.00048214285714285715, + "loss": 1.4758, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 0.3099684276288773, + "learning_rate": 0.0004857142857142857, + "loss": 1.5706, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 0.29120739116594524, + "learning_rate": 0.0004892857142857142, + "loss": 1.5954, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 0.36657854067868006, + "learning_rate": 0.0004928571428571429, + "loss": 1.5739, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 0.4317725139749333, + "learning_rate": 0.0004964285714285715, + "loss": 1.4752, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 0.31764086869505503, + "learning_rate": 0.0005, + "loss": 1.6824, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 0.3385632495847182, + "learning_rate": 0.0005035714285714285, + "loss": 1.5375, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 0.37461770956336343, + "learning_rate": 0.0005071428571428572, + "loss": 1.412, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 0.3399190654827032, + "learning_rate": 0.0005107142857142857, + "loss": 1.4806, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 0.4216971640700005, + "learning_rate": 0.0005142857142857142, + "loss": 1.5229, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 0.3746707384688907, + "learning_rate": 0.0005178571428571429, + "loss": 1.5051, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 0.360108642576056, + "learning_rate": 0.0005214285714285714, + "loss": 1.4608, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 0.3508119200146609, + "learning_rate": 0.0005250000000000001, + "loss": 1.5631, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 0.3979211809075355, + "learning_rate": 0.0005285714285714286, + "loss": 1.7342, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 0.2965752820354549, + "learning_rate": 0.0005321428571428571, + "loss": 1.4366, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 0.36692441991683633, + "learning_rate": 0.0005357142857142857, + "loss": 1.6711, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 0.3433996604400371, + "learning_rate": 0.0005392857142857143, + "loss": 1.5016, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 0.3136388055796795, + "learning_rate": 0.0005428571428571428, + "loss": 1.4356, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 0.30192687104297594, + "learning_rate": 0.0005464285714285714, + "loss": 1.4473, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 0.33129198015257594, + "learning_rate": 0.00055, + "loss": 1.4901, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 0.3165111191353476, + "learning_rate": 0.0005535714285714287, + "loss": 1.4939, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 0.29059641041159867, + "learning_rate": 0.0005571428571428572, + "loss": 1.5284, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 0.30213428826667454, + "learning_rate": 0.0005607142857142857, + "loss": 1.6462, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 0.2845219847958041, + "learning_rate": 0.0005642857142857143, + "loss": 1.526, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 0.35986971493563835, + "learning_rate": 0.0005678571428571428, + "loss": 1.4807, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 0.43297526779692946, + "learning_rate": 0.0005714285714285714, + "loss": 1.4925, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 0.309817795851931, + "learning_rate": 0.000575, + "loss": 1.4316, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 0.3532850398670643, + "learning_rate": 0.0005785714285714286, + "loss": 1.3771, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 0.2921544010366211, + "learning_rate": 0.0005821428571428572, + "loss": 1.3419, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 0.29158256657962844, + "learning_rate": 0.0005857142857142858, + "loss": 1.3892, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 0.26959751530851456, + "learning_rate": 0.0005892857142857143, + "loss": 1.5987, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 0.27858915227839154, + "learning_rate": 0.0005928571428571429, + "loss": 1.6159, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 0.27887679232979334, + "learning_rate": 0.0005964285714285714, + "loss": 1.6343, + "step": 167 + }, + { + "epoch": 0.02, + "grad_norm": 0.2902063856837706, + "learning_rate": 0.0006, + "loss": 1.5628, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 0.26242952479313464, + "learning_rate": 0.0006035714285714286, + "loss": 1.6507, + "step": 169 + }, + { + "epoch": 0.02, + "grad_norm": 0.2593759252347874, + "learning_rate": 0.0006071428571428571, + "loss": 1.5475, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 0.31011666182187436, + "learning_rate": 0.0006107142857142858, + "loss": 1.5178, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 0.25639111105134527, + "learning_rate": 0.0006142857142857143, + "loss": 1.498, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 0.31652842035082357, + "learning_rate": 0.0006178571428571429, + "loss": 1.4955, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 0.2987038846515858, + "learning_rate": 0.0006214285714285715, + "loss": 1.4423, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 0.3263219499540308, + "learning_rate": 0.000625, + "loss": 1.7098, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 0.32379973578168375, + "learning_rate": 0.0006285714285714285, + "loss": 1.5546, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 0.30673266201950766, + "learning_rate": 0.0006321428571428572, + "loss": 1.5595, + "step": 177 + }, + { + "epoch": 0.02, + "grad_norm": 0.31025981831403515, + "learning_rate": 0.0006357142857142857, + "loss": 1.5682, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 0.2631500803146406, + "learning_rate": 0.0006392857142857142, + "loss": 1.4778, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 0.263772189519905, + "learning_rate": 0.0006428571428571429, + "loss": 1.5322, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 0.3170171832408717, + "learning_rate": 0.0006464285714285715, + "loss": 1.5399, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 0.25839825036928027, + "learning_rate": 0.0006500000000000001, + "loss": 1.5088, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 0.2693335388497259, + "learning_rate": 0.0006535714285714286, + "loss": 1.3628, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 0.3374590349944099, + "learning_rate": 0.0006571428571428571, + "loss": 1.5421, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 0.2880039486353611, + "learning_rate": 0.0006607142857142857, + "loss": 1.5958, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 0.2532226088788996, + "learning_rate": 0.0006642857142857143, + "loss": 1.544, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 0.31236526445694107, + "learning_rate": 0.0006678571428571428, + "loss": 1.4988, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 0.296432786908887, + "learning_rate": 0.0006714285714285714, + "loss": 1.5167, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 0.3319120080791588, + "learning_rate": 0.000675, + "loss": 1.4906, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 0.2671474694497889, + "learning_rate": 0.0006785714285714287, + "loss": 1.506, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 0.29230866138530726, + "learning_rate": 0.0006821428571428572, + "loss": 1.5605, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 0.29237517306757743, + "learning_rate": 0.0006857142857142857, + "loss": 1.5592, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 0.23998407955562542, + "learning_rate": 0.0006892857142857143, + "loss": 1.5919, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 0.33358955469999935, + "learning_rate": 0.0006928571428571428, + "loss": 1.5007, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 0.2588549835888095, + "learning_rate": 0.0006964285714285714, + "loss": 1.583, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 0.2985579696888121, + "learning_rate": 0.0007, + "loss": 1.5594, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 0.2837689469382843, + "learning_rate": 0.0007035714285714286, + "loss": 1.4313, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 0.26100830899090055, + "learning_rate": 0.0007071428571428572, + "loss": 1.5072, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 0.2858482873084629, + "learning_rate": 0.0007107142857142858, + "loss": 1.6042, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 0.28657146589590393, + "learning_rate": 0.0007142857142857143, + "loss": 1.5303, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 0.29257530177935365, + "learning_rate": 0.0007178571428571429, + "loss": 1.6107, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 0.3042172997552271, + "learning_rate": 0.0007214285714285714, + "loss": 1.5416, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 0.30426651346545824, + "learning_rate": 0.000725, + "loss": 1.5159, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 0.26088662022664416, + "learning_rate": 0.0007285714285714286, + "loss": 1.511, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 0.22126894553880855, + "learning_rate": 0.0007321428571428571, + "loss": 1.3603, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 0.22402688666484616, + "learning_rate": 0.0007357142857142858, + "loss": 1.4555, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 0.22146626840818323, + "learning_rate": 0.0007392857142857144, + "loss": 1.345, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 0.234618942149512, + "learning_rate": 0.0007428571428571429, + "loss": 1.4908, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 0.29634089484886406, + "learning_rate": 0.0007464285714285715, + "loss": 1.5035, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 0.25852822888367205, + "learning_rate": 0.00075, + "loss": 1.6435, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 0.292756900485729, + "learning_rate": 0.0007535714285714285, + "loss": 1.5641, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 0.2039367704094928, + "learning_rate": 0.0007571428571428572, + "loss": 1.4156, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 0.25648074373805585, + "learning_rate": 0.0007607142857142857, + "loss": 1.4194, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 0.2552155993199289, + "learning_rate": 0.0007642857142857142, + "loss": 1.4379, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 0.2585414529627138, + "learning_rate": 0.0007678571428571429, + "loss": 1.5787, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 0.2546374137167054, + "learning_rate": 0.0007714285714285715, + "loss": 1.5555, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 0.27081684679812273, + "learning_rate": 0.0007750000000000001, + "loss": 1.3571, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 0.24325223266409077, + "learning_rate": 0.0007785714285714286, + "loss": 1.5655, + "step": 218 + }, + { + "epoch": 0.02, + "grad_norm": 0.24276431673429932, + "learning_rate": 0.0007821428571428571, + "loss": 1.4558, + "step": 219 + }, + { + "epoch": 0.02, + "grad_norm": 0.2366434468516385, + "learning_rate": 0.0007857142857142857, + "loss": 1.5843, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 0.23005762850876868, + "learning_rate": 0.0007892857142857143, + "loss": 1.4706, + "step": 221 + }, + { + "epoch": 0.02, + "grad_norm": 0.21661794426819167, + "learning_rate": 0.0007928571428571428, + "loss": 1.5066, + "step": 222 + }, + { + "epoch": 0.02, + "grad_norm": 0.24364550930674125, + "learning_rate": 0.0007964285714285714, + "loss": 1.4984, + "step": 223 + }, + { + "epoch": 0.02, + "grad_norm": 0.24544373452185395, + "learning_rate": 0.0008, + "loss": 1.5392, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 0.3164181294977973, + "learning_rate": 0.0008035714285714287, + "loss": 1.4713, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 0.2057971791624359, + "learning_rate": 0.0008071428571428572, + "loss": 1.6402, + "step": 226 + }, + { + "epoch": 0.02, + "grad_norm": 0.22646235028301068, + "learning_rate": 0.0008107142857142857, + "loss": 1.6311, + "step": 227 + }, + { + "epoch": 0.02, + "grad_norm": 0.2434939861749518, + "learning_rate": 0.0008142857142857143, + "loss": 1.4522, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 0.25565301421024494, + "learning_rate": 0.0008178571428571428, + "loss": 1.4731, + "step": 229 + }, + { + "epoch": 0.02, + "grad_norm": 0.2145112325466064, + "learning_rate": 0.0008214285714285714, + "loss": 1.508, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 0.24235525781497383, + "learning_rate": 0.000825, + "loss": 1.5515, + "step": 231 + }, + { + "epoch": 0.02, + "grad_norm": 0.24808658822607518, + "learning_rate": 0.0008285714285714286, + "loss": 1.4308, + "step": 232 + }, + { + "epoch": 0.03, + "grad_norm": 0.25257648565419344, + "learning_rate": 0.0008321428571428573, + "loss": 1.5598, + "step": 233 + }, + { + "epoch": 0.03, + "grad_norm": 0.2052328252860909, + "learning_rate": 0.0008357142857142858, + "loss": 1.4058, + "step": 234 + }, + { + "epoch": 0.03, + "grad_norm": 0.30554429044632603, + "learning_rate": 0.0008392857142857143, + "loss": 1.4984, + "step": 235 + }, + { + "epoch": 0.03, + "grad_norm": 0.2684656793305302, + "learning_rate": 0.0008428571428571429, + "loss": 1.5596, + "step": 236 + }, + { + "epoch": 0.03, + "grad_norm": 0.24919207229849352, + "learning_rate": 0.0008464285714285714, + "loss": 1.5213, + "step": 237 + }, + { + "epoch": 0.03, + "grad_norm": 0.23262987526628107, + "learning_rate": 0.00085, + "loss": 1.6697, + "step": 238 + }, + { + "epoch": 0.03, + "grad_norm": 0.206144476035226, + "learning_rate": 0.0008535714285714286, + "loss": 1.6066, + "step": 239 + }, + { + "epoch": 0.03, + "grad_norm": 0.1916608965478554, + "learning_rate": 0.0008571428571428571, + "loss": 1.5925, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 0.18887206567344306, + "learning_rate": 0.0008607142857142858, + "loss": 1.5143, + "step": 241 + }, + { + "epoch": 0.03, + "grad_norm": 0.18012370414006335, + "learning_rate": 0.0008642857142857144, + "loss": 1.5059, + "step": 242 + }, + { + "epoch": 0.03, + "grad_norm": 0.23347982878855053, + "learning_rate": 0.0008678571428571429, + "loss": 1.5216, + "step": 243 + }, + { + "epoch": 0.03, + "grad_norm": 0.2113463845182561, + "learning_rate": 0.0008714285714285715, + "loss": 1.6146, + "step": 244 + }, + { + "epoch": 0.03, + "grad_norm": 0.2069102374474599, + "learning_rate": 0.000875, + "loss": 1.4846, + "step": 245 + }, + { + "epoch": 0.03, + "grad_norm": 0.20672558766688756, + "learning_rate": 0.0008785714285714285, + "loss": 1.4937, + "step": 246 + }, + { + "epoch": 0.03, + "grad_norm": 0.21223510647754198, + "learning_rate": 0.0008821428571428572, + "loss": 1.5495, + "step": 247 + }, + { + "epoch": 0.03, + "grad_norm": 0.2702576482389292, + "learning_rate": 0.0008857142857142857, + "loss": 1.4101, + "step": 248 + }, + { + "epoch": 0.03, + "grad_norm": 0.22179795777881917, + "learning_rate": 0.0008892857142857142, + "loss": 1.677, + "step": 249 + }, + { + "epoch": 0.03, + "grad_norm": 0.2231004575446933, + "learning_rate": 0.0008928571428571429, + "loss": 1.6451, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 0.19345695749788838, + "learning_rate": 0.0008964285714285715, + "loss": 1.4773, + "step": 251 + }, + { + "epoch": 0.03, + "grad_norm": 0.2164011557010416, + "learning_rate": 0.0009000000000000001, + "loss": 1.4006, + "step": 252 + }, + { + "epoch": 0.03, + "grad_norm": 0.20521116028853958, + "learning_rate": 0.0009035714285714286, + "loss": 1.4699, + "step": 253 + }, + { + "epoch": 0.03, + "grad_norm": 0.21510699790233587, + "learning_rate": 0.0009071428571428571, + "loss": 1.3917, + "step": 254 + }, + { + "epoch": 0.03, + "grad_norm": 0.18317267392579845, + "learning_rate": 0.0009107142857142857, + "loss": 1.5035, + "step": 255 + }, + { + "epoch": 0.03, + "grad_norm": 0.2783009940687441, + "learning_rate": 0.0009142857142857143, + "loss": 1.5334, + "step": 256 + }, + { + "epoch": 0.03, + "grad_norm": 0.2054533522968223, + "learning_rate": 0.0009178571428571428, + "loss": 1.3404, + "step": 257 + }, + { + "epoch": 0.03, + "grad_norm": 0.21102299105750338, + "learning_rate": 0.0009214285714285714, + "loss": 1.3101, + "step": 258 + }, + { + "epoch": 0.03, + "grad_norm": 0.18963226471367908, + "learning_rate": 0.000925, + "loss": 1.3663, + "step": 259 + }, + { + "epoch": 0.03, + "grad_norm": 0.2184587395865125, + "learning_rate": 0.0009285714285714287, + "loss": 1.5179, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 0.20605484569236143, + "learning_rate": 0.0009321428571428572, + "loss": 1.5511, + "step": 261 + }, + { + "epoch": 0.03, + "grad_norm": 0.2791226050077258, + "learning_rate": 0.0009357142857142857, + "loss": 1.5101, + "step": 262 + }, + { + "epoch": 0.03, + "grad_norm": 0.19037886699486145, + "learning_rate": 0.0009392857142857143, + "loss": 1.4686, + "step": 263 + }, + { + "epoch": 0.03, + "grad_norm": 0.1981693942215429, + "learning_rate": 0.0009428571428571429, + "loss": 1.3974, + "step": 264 + }, + { + "epoch": 0.03, + "grad_norm": 0.22826376509236745, + "learning_rate": 0.0009464285714285714, + "loss": 1.5342, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 0.19772832728602222, + "learning_rate": 0.00095, + "loss": 1.4764, + "step": 266 + }, + { + "epoch": 0.03, + "grad_norm": 0.2584142237517251, + "learning_rate": 0.0009535714285714286, + "loss": 1.4261, + "step": 267 + }, + { + "epoch": 0.03, + "grad_norm": 0.2144015182353289, + "learning_rate": 0.0009571428571428573, + "loss": 1.5228, + "step": 268 + }, + { + "epoch": 0.03, + "grad_norm": 0.17217525622544874, + "learning_rate": 0.0009607142857142858, + "loss": 1.5678, + "step": 269 + }, + { + "epoch": 0.03, + "grad_norm": 0.1996368107538457, + "learning_rate": 0.0009642857142857143, + "loss": 1.5514, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 0.20732376738259717, + "learning_rate": 0.0009678571428571429, + "loss": 1.5184, + "step": 271 + }, + { + "epoch": 0.03, + "grad_norm": 0.19770305356463294, + "learning_rate": 0.0009714285714285714, + "loss": 1.6127, + "step": 272 + }, + { + "epoch": 0.03, + "grad_norm": 0.18981688274012634, + "learning_rate": 0.000975, + "loss": 1.5789, + "step": 273 + }, + { + "epoch": 0.03, + "grad_norm": 0.20918579733765114, + "learning_rate": 0.0009785714285714285, + "loss": 1.408, + "step": 274 + }, + { + "epoch": 0.03, + "grad_norm": 0.20927524823711904, + "learning_rate": 0.0009821428571428572, + "loss": 1.4623, + "step": 275 + }, + { + "epoch": 0.03, + "grad_norm": 0.17053415437999953, + "learning_rate": 0.0009857142857142857, + "loss": 1.4153, + "step": 276 + }, + { + "epoch": 0.03, + "grad_norm": 0.17353729231705273, + "learning_rate": 0.0009892857142857142, + "loss": 1.4628, + "step": 277 + }, + { + "epoch": 0.03, + "grad_norm": 0.19297924630966062, + "learning_rate": 0.000992857142857143, + "loss": 1.4706, + "step": 278 + }, + { + "epoch": 0.03, + "grad_norm": 0.18001888908262587, + "learning_rate": 0.0009964285714285715, + "loss": 1.5269, + "step": 279 + }, + { + "epoch": 0.03, + "grad_norm": 0.18830669239089334, + "learning_rate": 0.001, + "loss": 1.5408, + "step": 280 + }, + { + "epoch": 0.03, + "grad_norm": 0.17643323360815724, + "learning_rate": 0.0009999999696866382, + "loss": 1.4809, + "step": 281 + }, + { + "epoch": 0.03, + "grad_norm": 0.2040443477107501, + "learning_rate": 0.000999999878746556, + "loss": 1.5866, + "step": 282 + }, + { + "epoch": 0.03, + "grad_norm": 0.1793843334457987, + "learning_rate": 0.000999999727179765, + "loss": 1.4915, + "step": 283 + }, + { + "epoch": 0.03, + "grad_norm": 0.2035576998367501, + "learning_rate": 0.0009999995149862835, + "loss": 1.5028, + "step": 284 + }, + { + "epoch": 0.03, + "grad_norm": 0.18822278840465206, + "learning_rate": 0.0009999992421661369, + "loss": 1.5095, + "step": 285 + }, + { + "epoch": 0.03, + "grad_norm": 0.22599095980346726, + "learning_rate": 0.0009999989087193582, + "loss": 1.4853, + "step": 286 + }, + { + "epoch": 0.03, + "grad_norm": 0.19948992318384096, + "learning_rate": 0.0009999985146459881, + "loss": 1.4658, + "step": 287 + }, + { + "epoch": 0.03, + "grad_norm": 0.2077750753103998, + "learning_rate": 0.0009999980599460746, + "loss": 1.4903, + "step": 288 + }, + { + "epoch": 0.03, + "grad_norm": 0.2042826255511603, + "learning_rate": 0.0009999975446196726, + "loss": 1.5716, + "step": 289 + }, + { + "epoch": 0.03, + "grad_norm": 0.1809148903047096, + "learning_rate": 0.0009999969686668442, + "loss": 1.5172, + "step": 290 + }, + { + "epoch": 0.03, + "grad_norm": 0.17323034884698793, + "learning_rate": 0.0009999963320876598, + "loss": 1.4453, + "step": 291 + }, + { + "epoch": 0.03, + "grad_norm": 0.20664753532753552, + "learning_rate": 0.0009999956348821966, + "loss": 1.6475, + "step": 292 + }, + { + "epoch": 0.03, + "grad_norm": 0.5668139338801725, + "learning_rate": 0.0009999948770505387, + "loss": 1.4983, + "step": 293 + }, + { + "epoch": 0.03, + "grad_norm": 0.20916989817361614, + "learning_rate": 0.0009999940585927781, + "loss": 1.5331, + "step": 294 + }, + { + "epoch": 0.03, + "grad_norm": 0.20045916684556297, + "learning_rate": 0.0009999931795090142, + "loss": 1.4464, + "step": 295 + }, + { + "epoch": 0.03, + "grad_norm": 0.1991163579687259, + "learning_rate": 0.0009999922397993537, + "loss": 1.5759, + "step": 296 + }, + { + "epoch": 0.03, + "grad_norm": 0.26448205876932024, + "learning_rate": 0.0009999912394639103, + "loss": 1.4446, + "step": 297 + }, + { + "epoch": 0.03, + "grad_norm": 0.19273562442141076, + "learning_rate": 0.0009999901785028053, + "loss": 1.5955, + "step": 298 + }, + { + "epoch": 0.03, + "grad_norm": 0.16933403058061827, + "learning_rate": 0.0009999890569161677, + "loss": 1.4588, + "step": 299 + }, + { + "epoch": 0.03, + "grad_norm": 0.1830637464672982, + "learning_rate": 0.000999987874704133, + "loss": 1.6497, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 0.17282182901114648, + "learning_rate": 0.0009999866318668449, + "loss": 1.4537, + "step": 301 + }, + { + "epoch": 0.03, + "grad_norm": 0.18143770203988144, + "learning_rate": 0.0009999853284044537, + "loss": 1.5288, + "step": 302 + }, + { + "epoch": 0.03, + "grad_norm": 0.21283164213442005, + "learning_rate": 0.000999983964317118, + "loss": 1.6833, + "step": 303 + }, + { + "epoch": 0.03, + "grad_norm": 0.16871865453821608, + "learning_rate": 0.0009999825396050028, + "loss": 1.4128, + "step": 304 + }, + { + "epoch": 0.03, + "grad_norm": 0.17788316189586986, + "learning_rate": 0.000999981054268281, + "loss": 1.4958, + "step": 305 + }, + { + "epoch": 0.03, + "grad_norm": 0.17628497956862654, + "learning_rate": 0.0009999795083071327, + "loss": 1.5826, + "step": 306 + }, + { + "epoch": 0.03, + "grad_norm": 0.17774954930179934, + "learning_rate": 0.0009999779017217452, + "loss": 1.5999, + "step": 307 + }, + { + "epoch": 0.03, + "grad_norm": 0.16138411201607197, + "learning_rate": 0.0009999762345123135, + "loss": 1.448, + "step": 308 + }, + { + "epoch": 0.03, + "grad_norm": 0.14636359639747132, + "learning_rate": 0.0009999745066790397, + "loss": 1.5426, + "step": 309 + }, + { + "epoch": 0.03, + "grad_norm": 0.17395839471064034, + "learning_rate": 0.0009999727182221335, + "loss": 1.4609, + "step": 310 + }, + { + "epoch": 0.03, + "grad_norm": 0.1596077796811495, + "learning_rate": 0.0009999708691418112, + "loss": 1.5566, + "step": 311 + }, + { + "epoch": 0.03, + "grad_norm": 0.17254640291130927, + "learning_rate": 0.0009999689594382976, + "loss": 1.5635, + "step": 312 + }, + { + "epoch": 0.03, + "grad_norm": 0.21093878480731898, + "learning_rate": 0.0009999669891118238, + "loss": 1.4798, + "step": 313 + }, + { + "epoch": 0.03, + "grad_norm": 0.14639537332716396, + "learning_rate": 0.0009999649581626292, + "loss": 1.3929, + "step": 314 + }, + { + "epoch": 0.03, + "grad_norm": 0.18027095555325548, + "learning_rate": 0.0009999628665909597, + "loss": 1.6339, + "step": 315 + }, + { + "epoch": 0.03, + "grad_norm": 0.18042986872074054, + "learning_rate": 0.0009999607143970689, + "loss": 1.547, + "step": 316 + }, + { + "epoch": 0.03, + "grad_norm": 0.1586215202794954, + "learning_rate": 0.0009999585015812178, + "loss": 1.4079, + "step": 317 + }, + { + "epoch": 0.03, + "grad_norm": 0.17834825140193727, + "learning_rate": 0.0009999562281436747, + "loss": 1.4367, + "step": 318 + }, + { + "epoch": 0.03, + "grad_norm": 0.17982659450306954, + "learning_rate": 0.0009999538940847156, + "loss": 1.6693, + "step": 319 + }, + { + "epoch": 0.03, + "grad_norm": 0.164622497777815, + "learning_rate": 0.0009999514994046231, + "loss": 1.5315, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 0.2006206552979814, + "learning_rate": 0.0009999490441036877, + "loss": 1.5092, + "step": 321 + }, + { + "epoch": 0.03, + "grad_norm": 0.16907762296767967, + "learning_rate": 0.0009999465281822072, + "loss": 1.5472, + "step": 322 + }, + { + "epoch": 0.03, + "grad_norm": 0.19421538970369667, + "learning_rate": 0.0009999439516404863, + "loss": 1.5062, + "step": 323 + }, + { + "epoch": 0.03, + "grad_norm": 0.16267349147204174, + "learning_rate": 0.000999941314478838, + "loss": 1.4601, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 0.17687924044685271, + "learning_rate": 0.0009999386166975816, + "loss": 1.4468, + "step": 325 + }, + { + "epoch": 0.04, + "grad_norm": 0.1658946269536119, + "learning_rate": 0.0009999358582970443, + "loss": 1.3812, + "step": 326 + }, + { + "epoch": 0.04, + "grad_norm": 0.19961931592072243, + "learning_rate": 0.000999933039277561, + "loss": 1.451, + "step": 327 + }, + { + "epoch": 0.04, + "grad_norm": 0.18247338360131146, + "learning_rate": 0.0009999301596394727, + "loss": 1.5772, + "step": 328 + }, + { + "epoch": 0.04, + "grad_norm": 0.18531837950187607, + "learning_rate": 0.0009999272193831293, + "loss": 1.499, + "step": 329 + }, + { + "epoch": 0.04, + "grad_norm": 0.17578485273271977, + "learning_rate": 0.000999924218508887, + "loss": 1.4941, + "step": 330 + }, + { + "epoch": 0.04, + "grad_norm": 0.1716177706116155, + "learning_rate": 0.0009999211570171098, + "loss": 1.3478, + "step": 331 + }, + { + "epoch": 0.04, + "grad_norm": 0.15907076461451958, + "learning_rate": 0.0009999180349081686, + "loss": 1.5074, + "step": 332 + }, + { + "epoch": 0.04, + "grad_norm": 0.14106955725340992, + "learning_rate": 0.0009999148521824424, + "loss": 1.6304, + "step": 333 + }, + { + "epoch": 0.04, + "grad_norm": 0.18473843834209136, + "learning_rate": 0.0009999116088403167, + "loss": 1.3852, + "step": 334 + }, + { + "epoch": 0.04, + "grad_norm": 0.12962084897330353, + "learning_rate": 0.0009999083048821851, + "loss": 1.5796, + "step": 335 + }, + { + "epoch": 0.04, + "grad_norm": 0.21469389717837936, + "learning_rate": 0.000999904940308448, + "loss": 1.5622, + "step": 336 + }, + { + "epoch": 0.04, + "grad_norm": 0.13965253223738205, + "learning_rate": 0.0009999015151195135, + "loss": 1.4831, + "step": 337 + }, + { + "epoch": 0.04, + "grad_norm": 0.14771018846987435, + "learning_rate": 0.000999898029315797, + "loss": 1.4489, + "step": 338 + }, + { + "epoch": 0.04, + "grad_norm": 0.14126087838350085, + "learning_rate": 0.0009998944828977208, + "loss": 1.4695, + "step": 339 + }, + { + "epoch": 0.04, + "grad_norm": 0.13103131840592866, + "learning_rate": 0.0009998908758657153, + "loss": 1.3719, + "step": 340 + }, + { + "epoch": 0.04, + "grad_norm": 0.15938251830581013, + "learning_rate": 0.0009998872082202176, + "loss": 1.6759, + "step": 341 + }, + { + "epoch": 0.04, + "grad_norm": 0.14062844047046466, + "learning_rate": 0.0009998834799616726, + "loss": 1.5435, + "step": 342 + }, + { + "epoch": 0.04, + "grad_norm": 0.14069295274521235, + "learning_rate": 0.0009998796910905324, + "loss": 1.5798, + "step": 343 + }, + { + "epoch": 0.04, + "grad_norm": 0.14899985863071655, + "learning_rate": 0.0009998758416072562, + "loss": 1.4963, + "step": 344 + }, + { + "epoch": 0.04, + "grad_norm": 0.15403995672516727, + "learning_rate": 0.000999871931512311, + "loss": 1.5071, + "step": 345 + }, + { + "epoch": 0.04, + "grad_norm": 0.15965488508555337, + "learning_rate": 0.0009998679608061705, + "loss": 1.4844, + "step": 346 + }, + { + "epoch": 0.04, + "grad_norm": 0.1397205113440922, + "learning_rate": 0.0009998639294893166, + "loss": 1.3994, + "step": 347 + }, + { + "epoch": 0.04, + "grad_norm": 0.16837720499970224, + "learning_rate": 0.0009998598375622382, + "loss": 1.6294, + "step": 348 + }, + { + "epoch": 0.04, + "grad_norm": 0.14290960526839144, + "learning_rate": 0.0009998556850254307, + "loss": 1.393, + "step": 349 + }, + { + "epoch": 0.04, + "grad_norm": 0.1647613660769307, + "learning_rate": 0.0009998514718793986, + "loss": 1.5986, + "step": 350 + }, + { + "epoch": 0.04, + "grad_norm": 0.126604301560717, + "learning_rate": 0.000999847198124652, + "loss": 1.5888, + "step": 351 + }, + { + "epoch": 0.04, + "grad_norm": 0.1625076881295614, + "learning_rate": 0.0009998428637617094, + "loss": 1.4581, + "step": 352 + }, + { + "epoch": 0.04, + "grad_norm": 0.1354795426114141, + "learning_rate": 0.0009998384687910967, + "loss": 1.5685, + "step": 353 + }, + { + "epoch": 0.04, + "grad_norm": 0.1960614626166613, + "learning_rate": 0.000999834013213346, + "loss": 1.4762, + "step": 354 + }, + { + "epoch": 0.04, + "grad_norm": 0.15410756920154223, + "learning_rate": 0.0009998294970289983, + "loss": 1.5645, + "step": 355 + }, + { + "epoch": 0.04, + "grad_norm": 0.18398803112017642, + "learning_rate": 0.0009998249202386007, + "loss": 1.4068, + "step": 356 + }, + { + "epoch": 0.04, + "grad_norm": 0.19952868680271008, + "learning_rate": 0.0009998202828427085, + "loss": 1.5943, + "step": 357 + }, + { + "epoch": 0.04, + "grad_norm": 0.17194494265665142, + "learning_rate": 0.000999815584841884, + "loss": 1.5148, + "step": 358 + }, + { + "epoch": 0.04, + "grad_norm": 0.16781358427269294, + "learning_rate": 0.0009998108262366965, + "loss": 1.4017, + "step": 359 + }, + { + "epoch": 0.04, + "grad_norm": 0.17487641147080715, + "learning_rate": 0.0009998060070277232, + "loss": 1.4302, + "step": 360 + }, + { + "epoch": 0.04, + "grad_norm": 0.14854649570076, + "learning_rate": 0.0009998011272155485, + "loss": 1.435, + "step": 361 + }, + { + "epoch": 0.04, + "grad_norm": 0.13955296740945117, + "learning_rate": 0.0009997961868007642, + "loss": 1.3177, + "step": 362 + }, + { + "epoch": 0.04, + "grad_norm": 0.17180197963616256, + "learning_rate": 0.0009997911857839688, + "loss": 1.4593, + "step": 363 + }, + { + "epoch": 0.04, + "grad_norm": 0.1450351295687952, + "learning_rate": 0.0009997861241657694, + "loss": 1.4266, + "step": 364 + }, + { + "epoch": 0.04, + "grad_norm": 0.17459394228876687, + "learning_rate": 0.0009997810019467793, + "loss": 1.4735, + "step": 365 + }, + { + "epoch": 0.04, + "grad_norm": 0.21035233323036612, + "learning_rate": 0.0009997758191276197, + "loss": 1.5312, + "step": 366 + }, + { + "epoch": 0.04, + "grad_norm": 0.1522354572488286, + "learning_rate": 0.0009997705757089192, + "loss": 1.4431, + "step": 367 + }, + { + "epoch": 0.04, + "grad_norm": 0.15293858625262405, + "learning_rate": 0.0009997652716913134, + "loss": 1.6787, + "step": 368 + }, + { + "epoch": 0.04, + "grad_norm": 0.1353046205100757, + "learning_rate": 0.0009997599070754454, + "loss": 1.4503, + "step": 369 + }, + { + "epoch": 0.04, + "grad_norm": 0.17227780219049219, + "learning_rate": 0.0009997544818619657, + "loss": 1.4085, + "step": 370 + }, + { + "epoch": 0.04, + "grad_norm": 0.1437172909509328, + "learning_rate": 0.000999748996051532, + "loss": 1.5073, + "step": 371 + }, + { + "epoch": 0.04, + "grad_norm": 0.1504855289987324, + "learning_rate": 0.0009997434496448099, + "loss": 1.5767, + "step": 372 + }, + { + "epoch": 0.04, + "grad_norm": 0.13326666119325065, + "learning_rate": 0.0009997378426424715, + "loss": 1.4447, + "step": 373 + }, + { + "epoch": 0.04, + "grad_norm": 0.12945116710966437, + "learning_rate": 0.0009997321750451968, + "loss": 1.5107, + "step": 374 + }, + { + "epoch": 0.04, + "grad_norm": 0.1633221149641643, + "learning_rate": 0.0009997264468536732, + "loss": 1.5965, + "step": 375 + }, + { + "epoch": 0.04, + "grad_norm": 0.14094383798016874, + "learning_rate": 0.000999720658068595, + "loss": 1.5529, + "step": 376 + }, + { + "epoch": 0.04, + "grad_norm": 0.1424006539876174, + "learning_rate": 0.0009997148086906642, + "loss": 1.4084, + "step": 377 + }, + { + "epoch": 0.04, + "grad_norm": 0.1659677155871326, + "learning_rate": 0.0009997088987205903, + "loss": 1.6574, + "step": 378 + }, + { + "epoch": 0.04, + "grad_norm": 0.16406857672291258, + "learning_rate": 0.0009997029281590892, + "loss": 1.4398, + "step": 379 + }, + { + "epoch": 0.04, + "grad_norm": 0.14181116310622616, + "learning_rate": 0.0009996968970068857, + "loss": 1.4507, + "step": 380 + }, + { + "epoch": 0.04, + "grad_norm": 0.14808479870132174, + "learning_rate": 0.0009996908052647105, + "loss": 1.5897, + "step": 381 + }, + { + "epoch": 0.04, + "grad_norm": 0.17030159407878712, + "learning_rate": 0.0009996846529333027, + "loss": 1.5398, + "step": 382 + }, + { + "epoch": 0.04, + "grad_norm": 0.16969702631552683, + "learning_rate": 0.0009996784400134078, + "loss": 1.4812, + "step": 383 + }, + { + "epoch": 0.04, + "grad_norm": 0.17226460579810668, + "learning_rate": 0.0009996721665057795, + "loss": 1.469, + "step": 384 + }, + { + "epoch": 0.04, + "grad_norm": 0.1721386837516993, + "learning_rate": 0.0009996658324111785, + "loss": 1.5467, + "step": 385 + }, + { + "epoch": 0.04, + "grad_norm": 0.14892597511153813, + "learning_rate": 0.0009996594377303725, + "loss": 1.5863, + "step": 386 + }, + { + "epoch": 0.04, + "grad_norm": 0.1575483529907697, + "learning_rate": 0.0009996529824641374, + "loss": 1.4743, + "step": 387 + }, + { + "epoch": 0.04, + "grad_norm": 0.1456398333006279, + "learning_rate": 0.0009996464666132553, + "loss": 1.5276, + "step": 388 + }, + { + "epoch": 0.04, + "grad_norm": 0.15489743549736446, + "learning_rate": 0.0009996398901785167, + "loss": 1.482, + "step": 389 + }, + { + "epoch": 0.04, + "grad_norm": 0.1583860129059049, + "learning_rate": 0.000999633253160719, + "loss": 1.5561, + "step": 390 + }, + { + "epoch": 0.04, + "grad_norm": 0.17262883033342483, + "learning_rate": 0.0009996265555606667, + "loss": 1.4898, + "step": 391 + }, + { + "epoch": 0.04, + "grad_norm": 0.16439961527862196, + "learning_rate": 0.000999619797379172, + "loss": 1.5272, + "step": 392 + }, + { + "epoch": 0.04, + "grad_norm": 0.15828301920189175, + "learning_rate": 0.0009996129786170546, + "loss": 1.4813, + "step": 393 + }, + { + "epoch": 0.04, + "grad_norm": 0.1731170888583976, + "learning_rate": 0.000999606099275141, + "loss": 1.4245, + "step": 394 + }, + { + "epoch": 0.04, + "grad_norm": 0.1559416559722423, + "learning_rate": 0.0009995991593542656, + "loss": 1.5365, + "step": 395 + }, + { + "epoch": 0.04, + "grad_norm": 0.18583660593985893, + "learning_rate": 0.0009995921588552695, + "loss": 1.4156, + "step": 396 + }, + { + "epoch": 0.04, + "grad_norm": 0.1707241641157939, + "learning_rate": 0.000999585097779002, + "loss": 1.5137, + "step": 397 + }, + { + "epoch": 0.04, + "grad_norm": 0.1407581665746861, + "learning_rate": 0.0009995779761263189, + "loss": 1.5203, + "step": 398 + }, + { + "epoch": 0.04, + "grad_norm": 0.1388777158799526, + "learning_rate": 0.000999570793898084, + "loss": 1.5761, + "step": 399 + }, + { + "epoch": 0.04, + "grad_norm": 0.16295821219358653, + "learning_rate": 0.000999563551095168, + "loss": 1.6157, + "step": 400 + }, + { + "epoch": 0.04, + "grad_norm": 0.1539280261207723, + "learning_rate": 0.0009995562477184492, + "loss": 1.416, + "step": 401 + }, + { + "epoch": 0.04, + "grad_norm": 0.14246096761905572, + "learning_rate": 0.0009995488837688132, + "loss": 1.5411, + "step": 402 + }, + { + "epoch": 0.04, + "grad_norm": 0.15716424737797968, + "learning_rate": 0.0009995414592471527, + "loss": 1.4782, + "step": 403 + }, + { + "epoch": 0.04, + "grad_norm": 0.14283758228041246, + "learning_rate": 0.000999533974154368, + "loss": 1.4147, + "step": 404 + }, + { + "epoch": 0.04, + "grad_norm": 0.15328695456751443, + "learning_rate": 0.000999526428491367, + "loss": 1.5153, + "step": 405 + }, + { + "epoch": 0.04, + "grad_norm": 0.1436552033181903, + "learning_rate": 0.0009995188222590645, + "loss": 1.5375, + "step": 406 + }, + { + "epoch": 0.04, + "grad_norm": 0.1454954430103715, + "learning_rate": 0.0009995111554583825, + "loss": 1.34, + "step": 407 + }, + { + "epoch": 0.04, + "grad_norm": 0.16441780739523218, + "learning_rate": 0.0009995034280902509, + "loss": 1.5478, + "step": 408 + }, + { + "epoch": 0.04, + "grad_norm": 0.13139245357731225, + "learning_rate": 0.0009994956401556065, + "loss": 1.3986, + "step": 409 + }, + { + "epoch": 0.04, + "grad_norm": 0.1609687663696494, + "learning_rate": 0.0009994877916553937, + "loss": 1.651, + "step": 410 + }, + { + "epoch": 0.04, + "grad_norm": 0.1774917612967736, + "learning_rate": 0.0009994798825905644, + "loss": 1.4559, + "step": 411 + }, + { + "epoch": 0.04, + "grad_norm": 0.12610286657532213, + "learning_rate": 0.000999471912962077, + "loss": 1.3864, + "step": 412 + }, + { + "epoch": 0.04, + "grad_norm": 0.15091836283777837, + "learning_rate": 0.0009994638827708986, + "loss": 1.3917, + "step": 413 + }, + { + "epoch": 0.04, + "grad_norm": 0.1274380080548309, + "learning_rate": 0.0009994557920180024, + "loss": 1.4435, + "step": 414 + }, + { + "epoch": 0.04, + "grad_norm": 0.1403056631620941, + "learning_rate": 0.0009994476407043694, + "loss": 1.5053, + "step": 415 + }, + { + "epoch": 0.04, + "grad_norm": 0.13960834407799336, + "learning_rate": 0.000999439428830988, + "loss": 1.4803, + "step": 416 + }, + { + "epoch": 0.04, + "grad_norm": 0.14244926848342568, + "learning_rate": 0.0009994311563988542, + "loss": 1.5339, + "step": 417 + }, + { + "epoch": 0.04, + "grad_norm": 0.14869433694885337, + "learning_rate": 0.0009994228234089708, + "loss": 1.4244, + "step": 418 + }, + { + "epoch": 0.05, + "grad_norm": 0.14528003568402328, + "learning_rate": 0.0009994144298623485, + "loss": 1.4579, + "step": 419 + }, + { + "epoch": 0.05, + "grad_norm": 0.13406799469371342, + "learning_rate": 0.0009994059757600048, + "loss": 1.5282, + "step": 420 + }, + { + "epoch": 0.05, + "grad_norm": 0.14796217619302254, + "learning_rate": 0.0009993974611029646, + "loss": 1.5673, + "step": 421 + }, + { + "epoch": 0.05, + "grad_norm": 0.1370563358302789, + "learning_rate": 0.0009993888858922605, + "loss": 1.4697, + "step": 422 + }, + { + "epoch": 0.05, + "grad_norm": 0.12170722368190066, + "learning_rate": 0.0009993802501289326, + "loss": 1.4959, + "step": 423 + }, + { + "epoch": 0.05, + "grad_norm": 0.10003558537131502, + "learning_rate": 0.0009993715538140276, + "loss": 1.4291, + "step": 424 + }, + { + "epoch": 0.05, + "grad_norm": 0.1334012983526422, + "learning_rate": 0.0009993627969486002, + "loss": 1.558, + "step": 425 + }, + { + "epoch": 0.05, + "grad_norm": 0.12602036845602907, + "learning_rate": 0.000999353979533712, + "loss": 1.3761, + "step": 426 + }, + { + "epoch": 0.05, + "grad_norm": 0.09994334678040766, + "learning_rate": 0.0009993451015704324, + "loss": 1.3765, + "step": 427 + }, + { + "epoch": 0.05, + "grad_norm": 0.1429494796596272, + "learning_rate": 0.0009993361630598377, + "loss": 1.4477, + "step": 428 + }, + { + "epoch": 0.05, + "grad_norm": 0.1459824322844078, + "learning_rate": 0.0009993271640030116, + "loss": 1.5808, + "step": 429 + }, + { + "epoch": 0.05, + "grad_norm": 0.13516497528709895, + "learning_rate": 0.0009993181044010454, + "loss": 1.5225, + "step": 430 + }, + { + "epoch": 0.05, + "grad_norm": 0.12460002385750782, + "learning_rate": 0.000999308984255038, + "loss": 1.5176, + "step": 431 + }, + { + "epoch": 0.05, + "grad_norm": 0.12165124868540639, + "learning_rate": 0.0009992998035660945, + "loss": 1.4966, + "step": 432 + }, + { + "epoch": 0.05, + "grad_norm": 0.11682420685092966, + "learning_rate": 0.0009992905623353286, + "loss": 1.4564, + "step": 433 + }, + { + "epoch": 0.05, + "grad_norm": 0.13472134097524974, + "learning_rate": 0.0009992812605638605, + "loss": 1.4921, + "step": 434 + }, + { + "epoch": 0.05, + "grad_norm": 0.11455037083596988, + "learning_rate": 0.0009992718982528186, + "loss": 1.4622, + "step": 435 + }, + { + "epoch": 0.05, + "grad_norm": 0.13050817933155195, + "learning_rate": 0.0009992624754033377, + "loss": 1.5472, + "step": 436 + }, + { + "epoch": 0.05, + "grad_norm": 0.15767461304956595, + "learning_rate": 0.0009992529920165602, + "loss": 1.6125, + "step": 437 + }, + { + "epoch": 0.05, + "grad_norm": 0.146525866672167, + "learning_rate": 0.0009992434480936366, + "loss": 1.4863, + "step": 438 + }, + { + "epoch": 0.05, + "grad_norm": 0.1321979814402815, + "learning_rate": 0.0009992338436357235, + "loss": 1.5751, + "step": 439 + }, + { + "epoch": 0.05, + "grad_norm": 0.1292076236415919, + "learning_rate": 0.0009992241786439858, + "loss": 1.4775, + "step": 440 + }, + { + "epoch": 0.05, + "grad_norm": 0.13500666830362648, + "learning_rate": 0.0009992144531195955, + "loss": 1.5966, + "step": 441 + }, + { + "epoch": 0.05, + "grad_norm": 0.13695901467453922, + "learning_rate": 0.0009992046670637316, + "loss": 1.5805, + "step": 442 + }, + { + "epoch": 0.05, + "grad_norm": 0.13922716501708265, + "learning_rate": 0.0009991948204775807, + "loss": 1.5072, + "step": 443 + }, + { + "epoch": 0.05, + "grad_norm": 0.14647658645269154, + "learning_rate": 0.000999184913362337, + "loss": 1.3592, + "step": 444 + }, + { + "epoch": 0.05, + "grad_norm": 0.12899480292336285, + "learning_rate": 0.0009991749457192013, + "loss": 1.4513, + "step": 445 + }, + { + "epoch": 0.05, + "grad_norm": 0.12393656639381344, + "learning_rate": 0.0009991649175493827, + "loss": 1.4687, + "step": 446 + }, + { + "epoch": 0.05, + "grad_norm": 0.16754428326189647, + "learning_rate": 0.000999154828854097, + "loss": 1.4184, + "step": 447 + }, + { + "epoch": 0.05, + "grad_norm": 0.14210675981117007, + "learning_rate": 0.0009991446796345676, + "loss": 1.4588, + "step": 448 + }, + { + "epoch": 0.05, + "grad_norm": 0.1377863515377502, + "learning_rate": 0.0009991344698920246, + "loss": 1.5647, + "step": 449 + }, + { + "epoch": 0.05, + "grad_norm": 0.15677195035868116, + "learning_rate": 0.0009991241996277068, + "loss": 1.4818, + "step": 450 + }, + { + "epoch": 0.05, + "grad_norm": 0.14291065447977097, + "learning_rate": 0.0009991138688428588, + "loss": 1.4337, + "step": 451 + }, + { + "epoch": 0.05, + "grad_norm": 0.1500952470396202, + "learning_rate": 0.0009991034775387335, + "loss": 1.5161, + "step": 452 + }, + { + "epoch": 0.05, + "grad_norm": 0.14513044258257118, + "learning_rate": 0.000999093025716591, + "loss": 1.5096, + "step": 453 + }, + { + "epoch": 0.05, + "grad_norm": 0.16032928911490424, + "learning_rate": 0.000999082513377698, + "loss": 1.4002, + "step": 454 + }, + { + "epoch": 0.05, + "grad_norm": 0.17202853259574485, + "learning_rate": 0.0009990719405233303, + "loss": 1.4792, + "step": 455 + }, + { + "epoch": 0.05, + "grad_norm": 0.15835946799615688, + "learning_rate": 0.000999061307154769, + "loss": 1.4615, + "step": 456 + }, + { + "epoch": 0.05, + "grad_norm": 0.16951071663191195, + "learning_rate": 0.0009990506132733037, + "loss": 1.4748, + "step": 457 + }, + { + "epoch": 0.05, + "grad_norm": 0.1398963494982491, + "learning_rate": 0.000999039858880231, + "loss": 1.5173, + "step": 458 + }, + { + "epoch": 0.05, + "grad_norm": 0.12997095802643854, + "learning_rate": 0.000999029043976855, + "loss": 1.2695, + "step": 459 + }, + { + "epoch": 0.05, + "grad_norm": 0.11616314177597527, + "learning_rate": 0.000999018168564487, + "loss": 1.4087, + "step": 460 + }, + { + "epoch": 0.05, + "grad_norm": 0.17461980067454186, + "learning_rate": 0.0009990072326444455, + "loss": 1.5803, + "step": 461 + }, + { + "epoch": 0.05, + "grad_norm": 0.14272014194613272, + "learning_rate": 0.0009989962362180569, + "loss": 1.4678, + "step": 462 + }, + { + "epoch": 0.05, + "grad_norm": 0.16570305570684601, + "learning_rate": 0.0009989851792866543, + "loss": 1.6578, + "step": 463 + }, + { + "epoch": 0.05, + "grad_norm": 0.11998225014078424, + "learning_rate": 0.0009989740618515787, + "loss": 1.4982, + "step": 464 + }, + { + "epoch": 0.05, + "grad_norm": 0.13537021139680433, + "learning_rate": 0.0009989628839141775, + "loss": 1.5582, + "step": 465 + }, + { + "epoch": 0.05, + "grad_norm": 0.14086644816346083, + "learning_rate": 0.0009989516454758066, + "loss": 1.4891, + "step": 466 + }, + { + "epoch": 0.05, + "grad_norm": 0.1358567824453613, + "learning_rate": 0.0009989403465378284, + "loss": 1.4645, + "step": 467 + }, + { + "epoch": 0.05, + "grad_norm": 0.1401683725102955, + "learning_rate": 0.0009989289871016132, + "loss": 1.5034, + "step": 468 + }, + { + "epoch": 0.05, + "grad_norm": 0.14130289874302648, + "learning_rate": 0.0009989175671685383, + "loss": 1.6208, + "step": 469 + }, + { + "epoch": 0.05, + "grad_norm": 0.1476505273386164, + "learning_rate": 0.0009989060867399884, + "loss": 1.4666, + "step": 470 + }, + { + "epoch": 0.05, + "grad_norm": 0.1494861721735433, + "learning_rate": 0.0009988945458173552, + "loss": 1.458, + "step": 471 + }, + { + "epoch": 0.05, + "grad_norm": 0.14147325490478396, + "learning_rate": 0.0009988829444020385, + "loss": 1.6521, + "step": 472 + }, + { + "epoch": 0.05, + "grad_norm": 0.15133662684402113, + "learning_rate": 0.0009988712824954451, + "loss": 1.5475, + "step": 473 + }, + { + "epoch": 0.05, + "grad_norm": 0.15736937108778778, + "learning_rate": 0.0009988595600989886, + "loss": 1.5526, + "step": 474 + }, + { + "epoch": 0.05, + "grad_norm": 0.13815053807297753, + "learning_rate": 0.0009988477772140908, + "loss": 1.4171, + "step": 475 + }, + { + "epoch": 0.05, + "grad_norm": 0.15170448316189497, + "learning_rate": 0.00099883593384218, + "loss": 1.4429, + "step": 476 + }, + { + "epoch": 0.05, + "grad_norm": 0.14523768793650085, + "learning_rate": 0.0009988240299846926, + "loss": 1.6591, + "step": 477 + }, + { + "epoch": 0.05, + "grad_norm": 0.15517158286559127, + "learning_rate": 0.0009988120656430719, + "loss": 1.4886, + "step": 478 + }, + { + "epoch": 0.05, + "grad_norm": 0.13328382363243924, + "learning_rate": 0.0009988000408187685, + "loss": 1.4964, + "step": 479 + }, + { + "epoch": 0.05, + "grad_norm": 0.12208130405531557, + "learning_rate": 0.0009987879555132405, + "loss": 1.3444, + "step": 480 + }, + { + "epoch": 0.05, + "grad_norm": 0.1371216763034477, + "learning_rate": 0.0009987758097279534, + "loss": 1.5955, + "step": 481 + }, + { + "epoch": 0.05, + "grad_norm": 0.11997966531046428, + "learning_rate": 0.0009987636034643798, + "loss": 1.462, + "step": 482 + }, + { + "epoch": 0.05, + "grad_norm": 0.15171065892823024, + "learning_rate": 0.0009987513367239996, + "loss": 1.3923, + "step": 483 + }, + { + "epoch": 0.05, + "grad_norm": 0.13082361088442068, + "learning_rate": 0.0009987390095083004, + "loss": 1.4156, + "step": 484 + }, + { + "epoch": 0.05, + "grad_norm": 0.12932044553418595, + "learning_rate": 0.0009987266218187772, + "loss": 1.4468, + "step": 485 + }, + { + "epoch": 0.05, + "grad_norm": 0.1475713076816688, + "learning_rate": 0.0009987141736569314, + "loss": 1.4624, + "step": 486 + }, + { + "epoch": 0.05, + "grad_norm": 0.14178318017974623, + "learning_rate": 0.000998701665024273, + "loss": 1.3946, + "step": 487 + }, + { + "epoch": 0.05, + "grad_norm": 0.16305685355062952, + "learning_rate": 0.0009986890959223181, + "loss": 1.5231, + "step": 488 + }, + { + "epoch": 0.05, + "grad_norm": 0.1686444442480543, + "learning_rate": 0.0009986764663525913, + "loss": 1.6783, + "step": 489 + }, + { + "epoch": 0.05, + "grad_norm": 0.15549050758741176, + "learning_rate": 0.0009986637763166237, + "loss": 1.62, + "step": 490 + }, + { + "epoch": 0.05, + "grad_norm": 0.13218940398691503, + "learning_rate": 0.0009986510258159541, + "loss": 1.4141, + "step": 491 + }, + { + "epoch": 0.05, + "grad_norm": 0.14336555374093515, + "learning_rate": 0.0009986382148521283, + "loss": 1.545, + "step": 492 + }, + { + "epoch": 0.05, + "grad_norm": 0.12407347653401202, + "learning_rate": 0.0009986253434267, + "loss": 1.4171, + "step": 493 + }, + { + "epoch": 0.05, + "grad_norm": 0.15672602263906668, + "learning_rate": 0.00099861241154123, + "loss": 1.5143, + "step": 494 + }, + { + "epoch": 0.05, + "grad_norm": 0.13339275812986198, + "learning_rate": 0.000998599419197286, + "loss": 1.4366, + "step": 495 + }, + { + "epoch": 0.05, + "grad_norm": 0.13385666580192612, + "learning_rate": 0.0009985863663964434, + "loss": 1.4831, + "step": 496 + }, + { + "epoch": 0.05, + "grad_norm": 0.1704216054223673, + "learning_rate": 0.000998573253140285, + "loss": 1.365, + "step": 497 + }, + { + "epoch": 0.05, + "grad_norm": 0.1419854321243664, + "learning_rate": 0.0009985600794304007, + "loss": 1.3812, + "step": 498 + }, + { + "epoch": 0.05, + "grad_norm": 0.1294114515649719, + "learning_rate": 0.0009985468452683882, + "loss": 1.4495, + "step": 499 + }, + { + "epoch": 0.05, + "grad_norm": 0.16788358525374333, + "learning_rate": 0.0009985335506558519, + "loss": 1.6707, + "step": 500 + }, + { + "epoch": 0.05, + "grad_norm": 0.1633606897699836, + "learning_rate": 0.0009985201955944039, + "loss": 1.6144, + "step": 501 + }, + { + "epoch": 0.05, + "grad_norm": 0.13574456823443043, + "learning_rate": 0.0009985067800856635, + "loss": 1.5012, + "step": 502 + }, + { + "epoch": 0.05, + "grad_norm": 0.12805624518518505, + "learning_rate": 0.0009984933041312573, + "loss": 1.469, + "step": 503 + }, + { + "epoch": 0.05, + "grad_norm": 0.14562449161498103, + "learning_rate": 0.0009984797677328194, + "loss": 1.557, + "step": 504 + }, + { + "epoch": 0.05, + "grad_norm": 0.13274042544010126, + "learning_rate": 0.0009984661708919913, + "loss": 1.5591, + "step": 505 + }, + { + "epoch": 0.05, + "grad_norm": 0.13783829279522242, + "learning_rate": 0.0009984525136104215, + "loss": 1.4461, + "step": 506 + }, + { + "epoch": 0.05, + "grad_norm": 0.14150450438484263, + "learning_rate": 0.000998438795889766, + "loss": 1.4764, + "step": 507 + }, + { + "epoch": 0.05, + "grad_norm": 0.17191114370741217, + "learning_rate": 0.0009984250177316881, + "loss": 1.5333, + "step": 508 + }, + { + "epoch": 0.05, + "grad_norm": 0.16755229866536533, + "learning_rate": 0.0009984111791378582, + "loss": 1.5334, + "step": 509 + }, + { + "epoch": 0.05, + "grad_norm": 0.15257417846103322, + "learning_rate": 0.0009983972801099548, + "loss": 1.4058, + "step": 510 + }, + { + "epoch": 0.05, + "grad_norm": 0.15478833011778742, + "learning_rate": 0.000998383320649663, + "loss": 1.5657, + "step": 511 + }, + { + "epoch": 0.06, + "grad_norm": 0.12880041372659604, + "learning_rate": 0.0009983693007586752, + "loss": 1.6036, + "step": 512 + }, + { + "epoch": 0.06, + "grad_norm": 0.14047382096291214, + "learning_rate": 0.0009983552204386916, + "loss": 1.5327, + "step": 513 + }, + { + "epoch": 0.06, + "grad_norm": 0.12671081085946684, + "learning_rate": 0.0009983410796914197, + "loss": 1.5109, + "step": 514 + }, + { + "epoch": 0.06, + "grad_norm": 0.13315349868182663, + "learning_rate": 0.0009983268785185735, + "loss": 1.4648, + "step": 515 + }, + { + "epoch": 0.06, + "grad_norm": 0.12974308859167147, + "learning_rate": 0.0009983126169218755, + "loss": 1.4275, + "step": 516 + }, + { + "epoch": 0.06, + "grad_norm": 0.10708978863759408, + "learning_rate": 0.0009982982949030546, + "loss": 1.4725, + "step": 517 + }, + { + "epoch": 0.06, + "grad_norm": 0.11232917145575787, + "learning_rate": 0.0009982839124638475, + "loss": 1.5388, + "step": 518 + }, + { + "epoch": 0.06, + "grad_norm": 0.13263578439023072, + "learning_rate": 0.0009982694696059982, + "loss": 1.5457, + "step": 519 + }, + { + "epoch": 0.06, + "grad_norm": 0.1180388204627775, + "learning_rate": 0.0009982549663312581, + "loss": 1.4521, + "step": 520 + }, + { + "epoch": 0.06, + "grad_norm": 0.1069615275333495, + "learning_rate": 0.0009982404026413854, + "loss": 1.4397, + "step": 521 + }, + { + "epoch": 0.06, + "grad_norm": 0.11920448256909753, + "learning_rate": 0.0009982257785381464, + "loss": 1.3893, + "step": 522 + }, + { + "epoch": 0.06, + "grad_norm": 0.13802661336336072, + "learning_rate": 0.0009982110940233138, + "loss": 1.3956, + "step": 523 + }, + { + "epoch": 0.06, + "grad_norm": 0.12589487672919536, + "learning_rate": 0.0009981963490986686, + "loss": 1.6111, + "step": 524 + }, + { + "epoch": 0.06, + "grad_norm": 0.1388066167759072, + "learning_rate": 0.0009981815437659985, + "loss": 1.4834, + "step": 525 + }, + { + "epoch": 0.06, + "grad_norm": 0.11860878855329496, + "learning_rate": 0.0009981666780270989, + "loss": 1.5376, + "step": 526 + }, + { + "epoch": 0.06, + "grad_norm": 0.1291593285336083, + "learning_rate": 0.000998151751883772, + "loss": 1.5099, + "step": 527 + }, + { + "epoch": 0.06, + "grad_norm": 0.13856146675132416, + "learning_rate": 0.0009981367653378278, + "loss": 1.4985, + "step": 528 + }, + { + "epoch": 0.06, + "grad_norm": 0.1428411422383799, + "learning_rate": 0.0009981217183910834, + "loss": 1.4989, + "step": 529 + }, + { + "epoch": 0.06, + "grad_norm": 0.13401950703669366, + "learning_rate": 0.0009981066110453633, + "loss": 1.5726, + "step": 530 + }, + { + "epoch": 0.06, + "grad_norm": 0.11238669635358409, + "learning_rate": 0.0009980914433024997, + "loss": 1.4765, + "step": 531 + }, + { + "epoch": 0.06, + "grad_norm": 0.14635913263213093, + "learning_rate": 0.0009980762151643313, + "loss": 1.5009, + "step": 532 + }, + { + "epoch": 0.06, + "grad_norm": 0.12077582640456455, + "learning_rate": 0.0009980609266327044, + "loss": 1.402, + "step": 533 + }, + { + "epoch": 0.06, + "grad_norm": 0.13303350194920682, + "learning_rate": 0.0009980455777094733, + "loss": 1.6252, + "step": 534 + }, + { + "epoch": 0.06, + "grad_norm": 0.13303747826576684, + "learning_rate": 0.0009980301683964988, + "loss": 1.5057, + "step": 535 + }, + { + "epoch": 0.06, + "grad_norm": 0.14834784322318495, + "learning_rate": 0.0009980146986956495, + "loss": 1.4399, + "step": 536 + }, + { + "epoch": 0.06, + "grad_norm": 0.1385305315636288, + "learning_rate": 0.000997999168608801, + "loss": 1.5656, + "step": 537 + }, + { + "epoch": 0.06, + "grad_norm": 0.1264521765532537, + "learning_rate": 0.0009979835781378367, + "loss": 1.4, + "step": 538 + }, + { + "epoch": 0.06, + "grad_norm": 0.15363887930073927, + "learning_rate": 0.0009979679272846462, + "loss": 1.5321, + "step": 539 + }, + { + "epoch": 0.06, + "grad_norm": 0.12205084836277039, + "learning_rate": 0.0009979522160511282, + "loss": 1.4562, + "step": 540 + }, + { + "epoch": 0.06, + "grad_norm": 0.11622818650530785, + "learning_rate": 0.000997936444439187, + "loss": 1.4356, + "step": 541 + }, + { + "epoch": 0.06, + "grad_norm": 0.12491589190127463, + "learning_rate": 0.0009979206124507355, + "loss": 1.3947, + "step": 542 + }, + { + "epoch": 0.06, + "grad_norm": 0.12922947991275946, + "learning_rate": 0.0009979047200876932, + "loss": 1.49, + "step": 543 + }, + { + "epoch": 0.06, + "grad_norm": 0.13489214169177516, + "learning_rate": 0.000997888767351987, + "loss": 1.5153, + "step": 544 + }, + { + "epoch": 0.06, + "grad_norm": 0.12878593154366547, + "learning_rate": 0.0009978727542455511, + "loss": 1.4892, + "step": 545 + }, + { + "epoch": 0.06, + "grad_norm": 0.13996023103395688, + "learning_rate": 0.0009978566807703274, + "loss": 1.5035, + "step": 546 + }, + { + "epoch": 0.06, + "grad_norm": 0.12336944314011723, + "learning_rate": 0.0009978405469282647, + "loss": 1.3817, + "step": 547 + }, + { + "epoch": 0.06, + "grad_norm": 0.1420723471733146, + "learning_rate": 0.0009978243527213196, + "loss": 1.5272, + "step": 548 + }, + { + "epoch": 0.06, + "grad_norm": 0.14789430461564015, + "learning_rate": 0.0009978080981514553, + "loss": 1.5767, + "step": 549 + }, + { + "epoch": 0.06, + "grad_norm": 0.1267611684634138, + "learning_rate": 0.0009977917832206431, + "loss": 1.4002, + "step": 550 + }, + { + "epoch": 0.06, + "grad_norm": 0.13572961076613538, + "learning_rate": 0.0009977754079308608, + "loss": 1.3569, + "step": 551 + }, + { + "epoch": 0.06, + "grad_norm": 0.12869792779865574, + "learning_rate": 0.0009977589722840942, + "loss": 1.4989, + "step": 552 + }, + { + "epoch": 0.06, + "grad_norm": 0.12884701104585888, + "learning_rate": 0.0009977424762823363, + "loss": 1.4594, + "step": 553 + }, + { + "epoch": 0.06, + "grad_norm": 0.1496458530775414, + "learning_rate": 0.000997725919927587, + "loss": 1.6697, + "step": 554 + }, + { + "epoch": 0.06, + "grad_norm": 0.16089313194101512, + "learning_rate": 0.0009977093032218544, + "loss": 1.4965, + "step": 555 + }, + { + "epoch": 0.06, + "grad_norm": 0.12898716410042255, + "learning_rate": 0.0009976926261671523, + "loss": 1.5562, + "step": 556 + }, + { + "epoch": 0.06, + "grad_norm": 0.13909417311407565, + "learning_rate": 0.000997675888765504, + "loss": 1.4994, + "step": 557 + }, + { + "epoch": 0.06, + "grad_norm": 0.1364769705088755, + "learning_rate": 0.0009976590910189382, + "loss": 1.6546, + "step": 558 + }, + { + "epoch": 0.06, + "grad_norm": 0.1407938612275754, + "learning_rate": 0.0009976422329294919, + "loss": 1.6437, + "step": 559 + }, + { + "epoch": 0.06, + "grad_norm": 0.12342258081030302, + "learning_rate": 0.0009976253144992093, + "loss": 1.4347, + "step": 560 + }, + { + "epoch": 0.06, + "grad_norm": 0.11970500471927006, + "learning_rate": 0.0009976083357301417, + "loss": 1.2459, + "step": 561 + }, + { + "epoch": 0.06, + "grad_norm": 0.11644855616958355, + "learning_rate": 0.0009975912966243478, + "loss": 1.4681, + "step": 562 + }, + { + "epoch": 0.06, + "grad_norm": 0.1399132278939777, + "learning_rate": 0.0009975741971838938, + "loss": 1.4792, + "step": 563 + }, + { + "epoch": 0.06, + "grad_norm": 0.12385326071720139, + "learning_rate": 0.000997557037410853, + "loss": 1.4996, + "step": 564 + }, + { + "epoch": 0.06, + "grad_norm": 0.13958038993004748, + "learning_rate": 0.0009975398173073062, + "loss": 1.5275, + "step": 565 + }, + { + "epoch": 0.06, + "grad_norm": 0.11635118724721759, + "learning_rate": 0.000997522536875341, + "loss": 1.5998, + "step": 566 + }, + { + "epoch": 0.06, + "grad_norm": 0.1135776748032988, + "learning_rate": 0.0009975051961170532, + "loss": 1.4715, + "step": 567 + }, + { + "epoch": 0.06, + "grad_norm": 0.14026212563791537, + "learning_rate": 0.0009974877950345452, + "loss": 1.5512, + "step": 568 + }, + { + "epoch": 0.06, + "grad_norm": 0.13372265079333248, + "learning_rate": 0.000997470333629927, + "loss": 1.5538, + "step": 569 + }, + { + "epoch": 0.06, + "grad_norm": 0.09829929826672491, + "learning_rate": 0.0009974528119053156, + "loss": 1.404, + "step": 570 + }, + { + "epoch": 0.06, + "grad_norm": 0.1400205748949228, + "learning_rate": 0.0009974352298628359, + "loss": 1.4348, + "step": 571 + }, + { + "epoch": 0.06, + "grad_norm": 0.12816133710759073, + "learning_rate": 0.0009974175875046196, + "loss": 1.5441, + "step": 572 + }, + { + "epoch": 0.06, + "grad_norm": 0.11826809528085243, + "learning_rate": 0.0009973998848328061, + "loss": 1.4434, + "step": 573 + }, + { + "epoch": 0.06, + "grad_norm": 0.11123840372385792, + "learning_rate": 0.0009973821218495415, + "loss": 1.5314, + "step": 574 + }, + { + "epoch": 0.06, + "grad_norm": 0.1042235432756459, + "learning_rate": 0.0009973642985569803, + "loss": 1.5121, + "step": 575 + }, + { + "epoch": 0.06, + "grad_norm": 0.11749153433901713, + "learning_rate": 0.0009973464149572828, + "loss": 1.5447, + "step": 576 + }, + { + "epoch": 0.06, + "grad_norm": 0.15188595873537555, + "learning_rate": 0.000997328471052618, + "loss": 1.5764, + "step": 577 + }, + { + "epoch": 0.06, + "grad_norm": 0.11940122268611729, + "learning_rate": 0.0009973104668451617, + "loss": 1.4156, + "step": 578 + }, + { + "epoch": 0.06, + "grad_norm": 0.1230749446399393, + "learning_rate": 0.0009972924023370967, + "loss": 1.5258, + "step": 579 + }, + { + "epoch": 0.06, + "grad_norm": 0.11807230862866323, + "learning_rate": 0.0009972742775306133, + "loss": 1.5498, + "step": 580 + }, + { + "epoch": 0.06, + "grad_norm": 0.12698953546566902, + "learning_rate": 0.0009972560924279097, + "loss": 1.5633, + "step": 581 + }, + { + "epoch": 0.06, + "grad_norm": 0.12048242833327667, + "learning_rate": 0.0009972378470311904, + "loss": 1.4828, + "step": 582 + }, + { + "epoch": 0.06, + "grad_norm": 0.12821890991850848, + "learning_rate": 0.0009972195413426679, + "loss": 1.3623, + "step": 583 + }, + { + "epoch": 0.06, + "grad_norm": 0.1364893555564914, + "learning_rate": 0.000997201175364562, + "loss": 1.4673, + "step": 584 + }, + { + "epoch": 0.06, + "grad_norm": 0.10110545251686494, + "learning_rate": 0.0009971827490990993, + "loss": 1.5114, + "step": 585 + }, + { + "epoch": 0.06, + "grad_norm": 0.12792269127038894, + "learning_rate": 0.0009971642625485144, + "loss": 1.3687, + "step": 586 + }, + { + "epoch": 0.06, + "grad_norm": 0.11988262596625479, + "learning_rate": 0.0009971457157150485, + "loss": 1.4693, + "step": 587 + }, + { + "epoch": 0.06, + "grad_norm": 0.14192927684022144, + "learning_rate": 0.0009971271086009507, + "loss": 1.5541, + "step": 588 + }, + { + "epoch": 0.06, + "grad_norm": 0.1335038186372788, + "learning_rate": 0.0009971084412084771, + "loss": 1.3357, + "step": 589 + }, + { + "epoch": 0.06, + "grad_norm": 0.11027670743622948, + "learning_rate": 0.0009970897135398913, + "loss": 1.4499, + "step": 590 + }, + { + "epoch": 0.06, + "grad_norm": 0.10736693798136951, + "learning_rate": 0.000997070925597464, + "loss": 1.3992, + "step": 591 + }, + { + "epoch": 0.06, + "grad_norm": 0.10885656905907487, + "learning_rate": 0.0009970520773834733, + "loss": 1.4657, + "step": 592 + }, + { + "epoch": 0.06, + "grad_norm": 0.09049890929626538, + "learning_rate": 0.0009970331689002046, + "loss": 1.4588, + "step": 593 + }, + { + "epoch": 0.06, + "grad_norm": 0.10742491587756407, + "learning_rate": 0.0009970142001499505, + "loss": 1.3222, + "step": 594 + }, + { + "epoch": 0.06, + "grad_norm": 0.1267830112432258, + "learning_rate": 0.0009969951711350114, + "loss": 1.5056, + "step": 595 + }, + { + "epoch": 0.06, + "grad_norm": 0.11243589034871186, + "learning_rate": 0.0009969760818576941, + "loss": 1.4724, + "step": 596 + }, + { + "epoch": 0.06, + "grad_norm": 0.1280646485857755, + "learning_rate": 0.0009969569323203138, + "loss": 1.5029, + "step": 597 + }, + { + "epoch": 0.06, + "grad_norm": 0.13283166676950728, + "learning_rate": 0.000996937722525192, + "loss": 1.5652, + "step": 598 + }, + { + "epoch": 0.06, + "grad_norm": 0.11385817819290642, + "learning_rate": 0.0009969184524746585, + "loss": 1.3798, + "step": 599 + }, + { + "epoch": 0.06, + "grad_norm": 0.1104480250063874, + "learning_rate": 0.000996899122171049, + "loss": 1.3786, + "step": 600 + }, + { + "epoch": 0.06, + "grad_norm": 0.1153007090104645, + "learning_rate": 0.0009968797316167082, + "loss": 1.397, + "step": 601 + }, + { + "epoch": 0.06, + "grad_norm": 0.12277337551873949, + "learning_rate": 0.0009968602808139869, + "loss": 1.6086, + "step": 602 + }, + { + "epoch": 0.06, + "grad_norm": 0.12101584666596928, + "learning_rate": 0.0009968407697652434, + "loss": 1.4888, + "step": 603 + }, + { + "epoch": 0.06, + "grad_norm": 0.15500860601806254, + "learning_rate": 0.000996821198472844, + "loss": 1.5664, + "step": 604 + }, + { + "epoch": 0.07, + "grad_norm": 0.1178828194892715, + "learning_rate": 0.0009968015669391613, + "loss": 1.5685, + "step": 605 + }, + { + "epoch": 0.07, + "grad_norm": 0.12335417800873578, + "learning_rate": 0.000996781875166576, + "loss": 1.5528, + "step": 606 + }, + { + "epoch": 0.07, + "grad_norm": 0.12233699469045906, + "learning_rate": 0.0009967621231574753, + "loss": 1.3711, + "step": 607 + }, + { + "epoch": 0.07, + "grad_norm": 0.11591524885822231, + "learning_rate": 0.000996742310914255, + "loss": 1.4826, + "step": 608 + }, + { + "epoch": 0.07, + "grad_norm": 0.14044536601206312, + "learning_rate": 0.0009967224384393168, + "loss": 1.6158, + "step": 609 + }, + { + "epoch": 0.07, + "grad_norm": 0.1381027639030021, + "learning_rate": 0.0009967025057350705, + "loss": 1.4215, + "step": 610 + }, + { + "epoch": 0.07, + "grad_norm": 0.125010089541578, + "learning_rate": 0.000996682512803933, + "loss": 1.4867, + "step": 611 + }, + { + "epoch": 0.07, + "grad_norm": 0.12207318684054917, + "learning_rate": 0.0009966624596483285, + "loss": 1.3042, + "step": 612 + }, + { + "epoch": 0.07, + "grad_norm": 0.10878370098894088, + "learning_rate": 0.0009966423462706884, + "loss": 1.4636, + "step": 613 + }, + { + "epoch": 0.07, + "grad_norm": 0.13328193000271166, + "learning_rate": 0.0009966221726734517, + "loss": 1.3897, + "step": 614 + }, + { + "epoch": 0.07, + "grad_norm": 0.11689483535778034, + "learning_rate": 0.0009966019388590644, + "loss": 1.6064, + "step": 615 + }, + { + "epoch": 0.07, + "grad_norm": 0.1301761197932106, + "learning_rate": 0.00099658164482998, + "loss": 1.6075, + "step": 616 + }, + { + "epoch": 0.07, + "grad_norm": 0.1346747568866436, + "learning_rate": 0.0009965612905886592, + "loss": 1.3378, + "step": 617 + }, + { + "epoch": 0.07, + "grad_norm": 0.12025260657555636, + "learning_rate": 0.00099654087613757, + "loss": 1.3927, + "step": 618 + }, + { + "epoch": 0.07, + "grad_norm": 0.14311098764679006, + "learning_rate": 0.0009965204014791879, + "loss": 1.5443, + "step": 619 + }, + { + "epoch": 0.07, + "grad_norm": 0.11841687436158872, + "learning_rate": 0.0009964998666159952, + "loss": 1.538, + "step": 620 + }, + { + "epoch": 0.07, + "grad_norm": 0.11248266795214713, + "learning_rate": 0.000996479271550482, + "loss": 1.4871, + "step": 621 + }, + { + "epoch": 0.07, + "grad_norm": 0.13459548022776915, + "learning_rate": 0.0009964586162851455, + "loss": 1.3075, + "step": 622 + }, + { + "epoch": 0.07, + "grad_norm": 0.13005288898830575, + "learning_rate": 0.0009964379008224901, + "loss": 1.4756, + "step": 623 + }, + { + "epoch": 0.07, + "grad_norm": 0.12150477440478978, + "learning_rate": 0.0009964171251650277, + "loss": 1.3786, + "step": 624 + }, + { + "epoch": 0.07, + "grad_norm": 0.11943429960726923, + "learning_rate": 0.0009963962893152778, + "loss": 1.4501, + "step": 625 + }, + { + "epoch": 0.07, + "grad_norm": 0.12382509486585465, + "learning_rate": 0.0009963753932757662, + "loss": 1.5678, + "step": 626 + }, + { + "epoch": 0.07, + "grad_norm": 0.15440640002770115, + "learning_rate": 0.000996354437049027, + "loss": 1.5596, + "step": 627 + }, + { + "epoch": 0.07, + "grad_norm": 0.11091940708776127, + "learning_rate": 0.0009963334206376012, + "loss": 1.4617, + "step": 628 + }, + { + "epoch": 0.07, + "grad_norm": 0.11455971413894141, + "learning_rate": 0.0009963123440440368, + "loss": 1.3403, + "step": 629 + }, + { + "epoch": 0.07, + "grad_norm": 0.15444295040707226, + "learning_rate": 0.0009962912072708897, + "loss": 1.5607, + "step": 630 + }, + { + "epoch": 0.07, + "grad_norm": 0.1059896049882209, + "learning_rate": 0.0009962700103207228, + "loss": 1.4402, + "step": 631 + }, + { + "epoch": 0.07, + "grad_norm": 0.10979744315974159, + "learning_rate": 0.0009962487531961063, + "loss": 1.509, + "step": 632 + }, + { + "epoch": 0.07, + "grad_norm": 0.1271977452069575, + "learning_rate": 0.0009962274358996178, + "loss": 1.5976, + "step": 633 + }, + { + "epoch": 0.07, + "grad_norm": 0.11723721165067925, + "learning_rate": 0.0009962060584338417, + "loss": 1.5683, + "step": 634 + }, + { + "epoch": 0.07, + "grad_norm": 0.11516730530763199, + "learning_rate": 0.0009961846208013704, + "loss": 1.4661, + "step": 635 + }, + { + "epoch": 0.07, + "grad_norm": 0.1176706269429394, + "learning_rate": 0.0009961631230048032, + "loss": 1.4224, + "step": 636 + }, + { + "epoch": 0.07, + "grad_norm": 0.1072376383600733, + "learning_rate": 0.0009961415650467467, + "loss": 1.5348, + "step": 637 + }, + { + "epoch": 0.07, + "grad_norm": 0.12294971323557731, + "learning_rate": 0.000996119946929815, + "loss": 1.6004, + "step": 638 + }, + { + "epoch": 0.07, + "grad_norm": 0.11965915531049516, + "learning_rate": 0.0009960982686566294, + "loss": 1.3978, + "step": 639 + }, + { + "epoch": 0.07, + "grad_norm": 0.09157392129523097, + "learning_rate": 0.0009960765302298184, + "loss": 1.4615, + "step": 640 + }, + { + "epoch": 0.07, + "grad_norm": 0.11716155321247809, + "learning_rate": 0.0009960547316520182, + "loss": 1.5232, + "step": 641 + }, + { + "epoch": 0.07, + "grad_norm": 0.1170791482693603, + "learning_rate": 0.0009960328729258711, + "loss": 1.4715, + "step": 642 + }, + { + "epoch": 0.07, + "grad_norm": 0.12259710702230697, + "learning_rate": 0.0009960109540540284, + "loss": 1.5141, + "step": 643 + }, + { + "epoch": 0.07, + "grad_norm": 0.12107772105788704, + "learning_rate": 0.0009959889750391472, + "loss": 1.461, + "step": 644 + }, + { + "epoch": 0.07, + "grad_norm": 0.10557229425276307, + "learning_rate": 0.0009959669358838932, + "loss": 1.5069, + "step": 645 + }, + { + "epoch": 0.07, + "grad_norm": 0.09790643592491724, + "learning_rate": 0.0009959448365909384, + "loss": 1.4338, + "step": 646 + }, + { + "epoch": 0.07, + "grad_norm": 0.12391777117899554, + "learning_rate": 0.0009959226771629622, + "loss": 1.4601, + "step": 647 + }, + { + "epoch": 0.07, + "grad_norm": 0.11025736594706048, + "learning_rate": 0.0009959004576026516, + "loss": 1.5652, + "step": 648 + }, + { + "epoch": 0.07, + "grad_norm": 0.10460029117111654, + "learning_rate": 0.000995878177912701, + "loss": 1.4367, + "step": 649 + }, + { + "epoch": 0.07, + "grad_norm": 0.10927989562994969, + "learning_rate": 0.0009958558380958116, + "loss": 1.4751, + "step": 650 + }, + { + "epoch": 0.07, + "grad_norm": 0.12173129880732504, + "learning_rate": 0.0009958334381546927, + "loss": 1.5573, + "step": 651 + }, + { + "epoch": 0.07, + "grad_norm": 0.12838332720662465, + "learning_rate": 0.0009958109780920598, + "loss": 1.4785, + "step": 652 + }, + { + "epoch": 0.07, + "grad_norm": 0.12814839246956414, + "learning_rate": 0.0009957884579106363, + "loss": 1.5043, + "step": 653 + }, + { + "epoch": 0.07, + "grad_norm": 0.12270162223081243, + "learning_rate": 0.0009957658776131536, + "loss": 1.6063, + "step": 654 + }, + { + "epoch": 0.07, + "grad_norm": 0.13428707781101876, + "learning_rate": 0.0009957432372023486, + "loss": 1.4086, + "step": 655 + }, + { + "epoch": 0.07, + "grad_norm": 0.13177426626485697, + "learning_rate": 0.000995720536680967, + "loss": 1.4597, + "step": 656 + }, + { + "epoch": 0.07, + "grad_norm": 0.12414797214999132, + "learning_rate": 0.0009956977760517615, + "loss": 1.4479, + "step": 657 + }, + { + "epoch": 0.07, + "grad_norm": 0.12911162849346897, + "learning_rate": 0.0009956749553174918, + "loss": 1.6136, + "step": 658 + }, + { + "epoch": 0.07, + "grad_norm": 0.12716929664557133, + "learning_rate": 0.0009956520744809248, + "loss": 1.4265, + "step": 659 + }, + { + "epoch": 0.07, + "grad_norm": 0.1179166264201855, + "learning_rate": 0.000995629133544835, + "loss": 1.4051, + "step": 660 + }, + { + "epoch": 0.07, + "grad_norm": 0.14156709119896776, + "learning_rate": 0.000995606132512004, + "loss": 1.4486, + "step": 661 + }, + { + "epoch": 0.07, + "grad_norm": 0.14670926950068083, + "learning_rate": 0.000995583071385221, + "loss": 1.6208, + "step": 662 + }, + { + "epoch": 0.07, + "grad_norm": 0.10603558357744294, + "learning_rate": 0.000995559950167282, + "loss": 1.4793, + "step": 663 + }, + { + "epoch": 0.07, + "grad_norm": 0.1224055075034946, + "learning_rate": 0.0009955367688609905, + "loss": 1.4483, + "step": 664 + }, + { + "epoch": 0.07, + "grad_norm": 0.1395496872898413, + "learning_rate": 0.0009955135274691573, + "loss": 1.5365, + "step": 665 + }, + { + "epoch": 0.07, + "grad_norm": 0.1079570336167769, + "learning_rate": 0.0009954902259946008, + "loss": 1.4479, + "step": 666 + }, + { + "epoch": 0.07, + "grad_norm": 0.13238288662712247, + "learning_rate": 0.0009954668644401462, + "loss": 1.6067, + "step": 667 + }, + { + "epoch": 0.07, + "grad_norm": 0.10990400407308812, + "learning_rate": 0.0009954434428086259, + "loss": 1.4747, + "step": 668 + }, + { + "epoch": 0.07, + "grad_norm": 0.12106482867993244, + "learning_rate": 0.0009954199611028802, + "loss": 1.4404, + "step": 669 + }, + { + "epoch": 0.07, + "grad_norm": 0.10078426132952148, + "learning_rate": 0.0009953964193257564, + "loss": 1.5387, + "step": 670 + }, + { + "epoch": 0.07, + "grad_norm": 0.09989641475889491, + "learning_rate": 0.0009953728174801088, + "loss": 1.2998, + "step": 671 + }, + { + "epoch": 0.07, + "grad_norm": 0.1222248510617004, + "learning_rate": 0.0009953491555687991, + "loss": 1.5839, + "step": 672 + }, + { + "epoch": 0.07, + "grad_norm": 0.10030915780605011, + "learning_rate": 0.0009953254335946969, + "loss": 1.6116, + "step": 673 + }, + { + "epoch": 0.07, + "grad_norm": 0.10147470567669492, + "learning_rate": 0.000995301651560678, + "loss": 1.4606, + "step": 674 + }, + { + "epoch": 0.07, + "grad_norm": 0.12189077525868597, + "learning_rate": 0.0009952778094696262, + "loss": 1.5718, + "step": 675 + }, + { + "epoch": 0.07, + "grad_norm": 0.12389923995874418, + "learning_rate": 0.0009952539073244326, + "loss": 1.5397, + "step": 676 + }, + { + "epoch": 0.07, + "grad_norm": 0.1079829259763348, + "learning_rate": 0.0009952299451279954, + "loss": 1.5822, + "step": 677 + }, + { + "epoch": 0.07, + "grad_norm": 0.13025108271421548, + "learning_rate": 0.00099520592288322, + "loss": 1.3588, + "step": 678 + }, + { + "epoch": 0.07, + "grad_norm": 0.09252175398506655, + "learning_rate": 0.0009951818405930194, + "loss": 1.3074, + "step": 679 + }, + { + "epoch": 0.07, + "grad_norm": 0.12190380184039762, + "learning_rate": 0.0009951576982603133, + "loss": 1.5712, + "step": 680 + }, + { + "epoch": 0.07, + "grad_norm": 0.12012031597622391, + "learning_rate": 0.0009951334958880292, + "loss": 1.5322, + "step": 681 + }, + { + "epoch": 0.07, + "grad_norm": 0.11291624944914505, + "learning_rate": 0.0009951092334791016, + "loss": 1.4539, + "step": 682 + }, + { + "epoch": 0.07, + "grad_norm": 0.0931588349385203, + "learning_rate": 0.0009950849110364729, + "loss": 1.3937, + "step": 683 + }, + { + "epoch": 0.07, + "grad_norm": 0.1053379448621117, + "learning_rate": 0.0009950605285630916, + "loss": 1.4337, + "step": 684 + }, + { + "epoch": 0.07, + "grad_norm": 0.10645922517410703, + "learning_rate": 0.0009950360860619147, + "loss": 1.3934, + "step": 685 + }, + { + "epoch": 0.07, + "grad_norm": 0.10572018719307143, + "learning_rate": 0.0009950115835359054, + "loss": 1.4284, + "step": 686 + }, + { + "epoch": 0.07, + "grad_norm": 0.09603500722299851, + "learning_rate": 0.0009949870209880354, + "loss": 1.4436, + "step": 687 + }, + { + "epoch": 0.07, + "grad_norm": 0.1171822427524059, + "learning_rate": 0.0009949623984212824, + "loss": 1.4438, + "step": 688 + }, + { + "epoch": 0.07, + "grad_norm": 0.10775144263060174, + "learning_rate": 0.0009949377158386323, + "loss": 1.5625, + "step": 689 + }, + { + "epoch": 0.07, + "grad_norm": 0.11412434036504529, + "learning_rate": 0.0009949129732430778, + "loss": 1.501, + "step": 690 + }, + { + "epoch": 0.07, + "grad_norm": 0.10324234472443945, + "learning_rate": 0.0009948881706376192, + "loss": 1.6314, + "step": 691 + }, + { + "epoch": 0.07, + "grad_norm": 0.12233091194141885, + "learning_rate": 0.0009948633080252636, + "loss": 1.5398, + "step": 692 + }, + { + "epoch": 0.07, + "grad_norm": 0.10981153329077435, + "learning_rate": 0.000994838385409026, + "loss": 1.5315, + "step": 693 + }, + { + "epoch": 0.07, + "grad_norm": 0.11124117283144365, + "learning_rate": 0.000994813402791928, + "loss": 1.468, + "step": 694 + }, + { + "epoch": 0.07, + "grad_norm": 0.10859983867961821, + "learning_rate": 0.000994788360176999, + "loss": 1.4983, + "step": 695 + }, + { + "epoch": 0.07, + "grad_norm": 0.10679836737127918, + "learning_rate": 0.0009947632575672757, + "loss": 1.509, + "step": 696 + }, + { + "epoch": 0.07, + "grad_norm": 0.1452761412325607, + "learning_rate": 0.0009947380949658017, + "loss": 1.5244, + "step": 697 + }, + { + "epoch": 0.08, + "grad_norm": 0.10376320328347177, + "learning_rate": 0.0009947128723756281, + "loss": 1.4568, + "step": 698 + }, + { + "epoch": 0.08, + "grad_norm": 0.13737925652052532, + "learning_rate": 0.0009946875897998131, + "loss": 1.5241, + "step": 699 + }, + { + "epoch": 0.08, + "grad_norm": 0.12421770725172457, + "learning_rate": 0.0009946622472414223, + "loss": 1.5322, + "step": 700 + }, + { + "epoch": 0.08, + "grad_norm": 0.12125159942786252, + "learning_rate": 0.000994636844703529, + "loss": 1.3998, + "step": 701 + }, + { + "epoch": 0.08, + "grad_norm": 0.13120907346940802, + "learning_rate": 0.0009946113821892128, + "loss": 1.4633, + "step": 702 + }, + { + "epoch": 0.08, + "grad_norm": 0.15391223685243563, + "learning_rate": 0.0009945858597015613, + "loss": 1.4451, + "step": 703 + }, + { + "epoch": 0.08, + "grad_norm": 0.1360647191323169, + "learning_rate": 0.0009945602772436692, + "loss": 1.4341, + "step": 704 + }, + { + "epoch": 0.08, + "grad_norm": 0.10529263459373553, + "learning_rate": 0.0009945346348186385, + "loss": 1.4727, + "step": 705 + }, + { + "epoch": 0.08, + "grad_norm": 0.11431211623838632, + "learning_rate": 0.0009945089324295785, + "loss": 1.4321, + "step": 706 + }, + { + "epoch": 0.08, + "grad_norm": 0.13598904701131048, + "learning_rate": 0.0009944831700796053, + "loss": 1.5741, + "step": 707 + }, + { + "epoch": 0.08, + "grad_norm": 0.1355862716894886, + "learning_rate": 0.0009944573477718435, + "loss": 1.4294, + "step": 708 + }, + { + "epoch": 0.08, + "grad_norm": 0.12494946367200017, + "learning_rate": 0.0009944314655094232, + "loss": 1.4831, + "step": 709 + }, + { + "epoch": 0.08, + "grad_norm": 0.11809725685709474, + "learning_rate": 0.0009944055232954832, + "loss": 1.4706, + "step": 710 + }, + { + "epoch": 0.08, + "grad_norm": 0.1198085117849122, + "learning_rate": 0.0009943795211331693, + "loss": 1.579, + "step": 711 + }, + { + "epoch": 0.08, + "grad_norm": 0.13247770297281755, + "learning_rate": 0.0009943534590256338, + "loss": 1.5206, + "step": 712 + }, + { + "epoch": 0.08, + "grad_norm": 0.11403899759015178, + "learning_rate": 0.000994327336976037, + "loss": 1.5245, + "step": 713 + }, + { + "epoch": 0.08, + "grad_norm": 0.1273405472522352, + "learning_rate": 0.0009943011549875466, + "loss": 1.4184, + "step": 714 + }, + { + "epoch": 0.08, + "grad_norm": 0.1358059212062907, + "learning_rate": 0.000994274913063337, + "loss": 1.5701, + "step": 715 + }, + { + "epoch": 0.08, + "grad_norm": 0.10773623159909808, + "learning_rate": 0.0009942486112065902, + "loss": 1.5458, + "step": 716 + }, + { + "epoch": 0.08, + "grad_norm": 0.11792824796737497, + "learning_rate": 0.0009942222494204954, + "loss": 1.5711, + "step": 717 + }, + { + "epoch": 0.08, + "grad_norm": 0.11176245858069334, + "learning_rate": 0.000994195827708249, + "loss": 1.4227, + "step": 718 + }, + { + "epoch": 0.08, + "grad_norm": 0.11809148806007971, + "learning_rate": 0.0009941693460730548, + "loss": 1.4618, + "step": 719 + }, + { + "epoch": 0.08, + "grad_norm": 0.12523379292415296, + "learning_rate": 0.0009941428045181235, + "loss": 1.4765, + "step": 720 + }, + { + "epoch": 0.08, + "grad_norm": 0.11416568669285194, + "learning_rate": 0.0009941162030466736, + "loss": 1.4385, + "step": 721 + }, + { + "epoch": 0.08, + "grad_norm": 0.11968357238475509, + "learning_rate": 0.0009940895416619307, + "loss": 1.4373, + "step": 722 + }, + { + "epoch": 0.08, + "grad_norm": 0.15665080698553804, + "learning_rate": 0.0009940628203671275, + "loss": 1.5316, + "step": 723 + }, + { + "epoch": 0.08, + "grad_norm": 0.10064658632776849, + "learning_rate": 0.0009940360391655042, + "loss": 1.5352, + "step": 724 + }, + { + "epoch": 0.08, + "grad_norm": 0.10251226529146976, + "learning_rate": 0.0009940091980603076, + "loss": 1.4791, + "step": 725 + }, + { + "epoch": 0.08, + "grad_norm": 0.10378073760870751, + "learning_rate": 0.000993982297054793, + "loss": 1.5029, + "step": 726 + }, + { + "epoch": 0.08, + "grad_norm": 0.10539285260130456, + "learning_rate": 0.0009939553361522217, + "loss": 1.4734, + "step": 727 + }, + { + "epoch": 0.08, + "grad_norm": 0.1100171833779912, + "learning_rate": 0.0009939283153558628, + "loss": 1.353, + "step": 728 + }, + { + "epoch": 0.08, + "grad_norm": 0.11572299828458692, + "learning_rate": 0.000993901234668993, + "loss": 1.4491, + "step": 729 + }, + { + "epoch": 0.08, + "grad_norm": 0.10411765589679892, + "learning_rate": 0.0009938740940948958, + "loss": 1.4687, + "step": 730 + }, + { + "epoch": 0.08, + "grad_norm": 0.1213664111538291, + "learning_rate": 0.0009938468936368618, + "loss": 1.4684, + "step": 731 + }, + { + "epoch": 0.08, + "grad_norm": 0.11803576700547622, + "learning_rate": 0.0009938196332981897, + "loss": 1.5678, + "step": 732 + }, + { + "epoch": 0.08, + "grad_norm": 0.1189255355811123, + "learning_rate": 0.0009937923130821844, + "loss": 1.4901, + "step": 733 + }, + { + "epoch": 0.08, + "grad_norm": 0.1140209620430202, + "learning_rate": 0.0009937649329921589, + "loss": 1.3862, + "step": 734 + }, + { + "epoch": 0.08, + "grad_norm": 0.12863941643968582, + "learning_rate": 0.000993737493031433, + "loss": 1.4651, + "step": 735 + }, + { + "epoch": 0.08, + "grad_norm": 0.11018747815379674, + "learning_rate": 0.0009937099932033338, + "loss": 1.4941, + "step": 736 + }, + { + "epoch": 0.08, + "grad_norm": 0.11099330218951524, + "learning_rate": 0.000993682433511196, + "loss": 1.5038, + "step": 737 + }, + { + "epoch": 0.08, + "grad_norm": 0.11147184255727886, + "learning_rate": 0.000993654813958361, + "loss": 1.4039, + "step": 738 + }, + { + "epoch": 0.08, + "grad_norm": 0.10734348471301973, + "learning_rate": 0.000993627134548178, + "loss": 1.4491, + "step": 739 + }, + { + "epoch": 0.08, + "grad_norm": 0.10710535668640941, + "learning_rate": 0.0009935993952840032, + "loss": 1.39, + "step": 740 + }, + { + "epoch": 0.08, + "grad_norm": 0.13094386533114225, + "learning_rate": 0.0009935715961692, + "loss": 1.5734, + "step": 741 + }, + { + "epoch": 0.08, + "grad_norm": 0.12488034601195075, + "learning_rate": 0.000993543737207139, + "loss": 1.5388, + "step": 742 + }, + { + "epoch": 0.08, + "grad_norm": 0.12403785629260212, + "learning_rate": 0.0009935158184011985, + "loss": 1.2961, + "step": 743 + }, + { + "epoch": 0.08, + "grad_norm": 0.11652789448505044, + "learning_rate": 0.0009934878397547635, + "loss": 1.4781, + "step": 744 + }, + { + "epoch": 0.08, + "grad_norm": 0.11516732956770669, + "learning_rate": 0.0009934598012712268, + "loss": 1.4467, + "step": 745 + }, + { + "epoch": 0.08, + "grad_norm": 0.12245986579048859, + "learning_rate": 0.000993431702953988, + "loss": 1.4653, + "step": 746 + }, + { + "epoch": 0.08, + "grad_norm": 0.10983200899730307, + "learning_rate": 0.0009934035448064538, + "loss": 1.5349, + "step": 747 + }, + { + "epoch": 0.08, + "grad_norm": 0.09167922119798957, + "learning_rate": 0.000993375326832039, + "loss": 1.4424, + "step": 748 + }, + { + "epoch": 0.08, + "grad_norm": 0.1069490038423996, + "learning_rate": 0.0009933470490341649, + "loss": 1.447, + "step": 749 + }, + { + "epoch": 0.08, + "grad_norm": 0.11574964570738946, + "learning_rate": 0.0009933187114162603, + "loss": 1.4137, + "step": 750 + }, + { + "epoch": 0.08, + "grad_norm": 0.13373086874940893, + "learning_rate": 0.0009932903139817611, + "loss": 1.4324, + "step": 751 + }, + { + "epoch": 0.08, + "grad_norm": 0.12154833008669642, + "learning_rate": 0.0009932618567341108, + "loss": 1.4614, + "step": 752 + }, + { + "epoch": 0.08, + "grad_norm": 0.12863675940870534, + "learning_rate": 0.0009932333396767596, + "loss": 1.6192, + "step": 753 + }, + { + "epoch": 0.08, + "grad_norm": 0.12602366816139124, + "learning_rate": 0.0009932047628131659, + "loss": 1.4902, + "step": 754 + }, + { + "epoch": 0.08, + "grad_norm": 0.11828428253206899, + "learning_rate": 0.000993176126146794, + "loss": 1.5057, + "step": 755 + }, + { + "epoch": 0.08, + "grad_norm": 0.11447959551486472, + "learning_rate": 0.0009931474296811169, + "loss": 1.4539, + "step": 756 + }, + { + "epoch": 0.08, + "grad_norm": 0.11652175026432009, + "learning_rate": 0.0009931186734196136, + "loss": 1.5865, + "step": 757 + }, + { + "epoch": 0.08, + "grad_norm": 0.15953279308615217, + "learning_rate": 0.0009930898573657712, + "loss": 1.5144, + "step": 758 + }, + { + "epoch": 0.08, + "grad_norm": 0.11445487382353405, + "learning_rate": 0.0009930609815230838, + "loss": 1.5207, + "step": 759 + }, + { + "epoch": 0.08, + "grad_norm": 0.09523137267014015, + "learning_rate": 0.0009930320458950523, + "loss": 1.4406, + "step": 760 + }, + { + "epoch": 0.08, + "grad_norm": 0.11500133371745935, + "learning_rate": 0.0009930030504851856, + "loss": 1.5745, + "step": 761 + }, + { + "epoch": 0.08, + "grad_norm": 0.11011675099438246, + "learning_rate": 0.0009929739952969994, + "loss": 1.4388, + "step": 762 + }, + { + "epoch": 0.08, + "grad_norm": 0.11046935647517428, + "learning_rate": 0.0009929448803340166, + "loss": 1.5235, + "step": 763 + }, + { + "epoch": 0.08, + "grad_norm": 0.11679227263335894, + "learning_rate": 0.0009929157055997677, + "loss": 1.4837, + "step": 764 + }, + { + "epoch": 0.08, + "grad_norm": 0.10443444082409932, + "learning_rate": 0.0009928864710977902, + "loss": 1.5236, + "step": 765 + }, + { + "epoch": 0.08, + "grad_norm": 0.11647164928646948, + "learning_rate": 0.0009928571768316288, + "loss": 1.4907, + "step": 766 + }, + { + "epoch": 0.08, + "grad_norm": 0.10566648958207446, + "learning_rate": 0.0009928278228048357, + "loss": 1.4907, + "step": 767 + }, + { + "epoch": 0.08, + "grad_norm": 0.09826190297521414, + "learning_rate": 0.00099279840902097, + "loss": 1.4079, + "step": 768 + }, + { + "epoch": 0.08, + "grad_norm": 0.1017904451693403, + "learning_rate": 0.0009927689354835981, + "loss": 1.4102, + "step": 769 + }, + { + "epoch": 0.08, + "grad_norm": 0.09993971482709918, + "learning_rate": 0.0009927394021962941, + "loss": 1.6505, + "step": 770 + }, + { + "epoch": 0.08, + "grad_norm": 0.10591663082599534, + "learning_rate": 0.0009927098091626388, + "loss": 1.4566, + "step": 771 + }, + { + "epoch": 0.08, + "grad_norm": 0.11498870565123132, + "learning_rate": 0.0009926801563862205, + "loss": 1.6038, + "step": 772 + }, + { + "epoch": 0.08, + "grad_norm": 0.11407235573294057, + "learning_rate": 0.0009926504438706348, + "loss": 1.5839, + "step": 773 + }, + { + "epoch": 0.08, + "grad_norm": 0.12308677147643597, + "learning_rate": 0.0009926206716194842, + "loss": 1.5441, + "step": 774 + }, + { + "epoch": 0.08, + "grad_norm": 0.10318309978776286, + "learning_rate": 0.0009925908396363789, + "loss": 1.5983, + "step": 775 + }, + { + "epoch": 0.08, + "grad_norm": 0.10265958152079739, + "learning_rate": 0.0009925609479249362, + "loss": 1.467, + "step": 776 + }, + { + "epoch": 0.08, + "grad_norm": 0.1041780857939972, + "learning_rate": 0.0009925309964887803, + "loss": 1.3516, + "step": 777 + }, + { + "epoch": 0.08, + "grad_norm": 0.1127904379754784, + "learning_rate": 0.0009925009853315432, + "loss": 1.3867, + "step": 778 + }, + { + "epoch": 0.08, + "grad_norm": 0.10759215513795695, + "learning_rate": 0.0009924709144568636, + "loss": 1.4878, + "step": 779 + }, + { + "epoch": 0.08, + "grad_norm": 0.10524365237305855, + "learning_rate": 0.0009924407838683878, + "loss": 1.4779, + "step": 780 + }, + { + "epoch": 0.08, + "grad_norm": 0.09420759561249992, + "learning_rate": 0.0009924105935697693, + "loss": 1.475, + "step": 781 + }, + { + "epoch": 0.08, + "grad_norm": 0.10835188218183127, + "learning_rate": 0.0009923803435646689, + "loss": 1.5956, + "step": 782 + }, + { + "epoch": 0.08, + "grad_norm": 0.10517000343653546, + "learning_rate": 0.0009923500338567541, + "loss": 1.4006, + "step": 783 + }, + { + "epoch": 0.08, + "grad_norm": 0.10588943505093504, + "learning_rate": 0.0009923196644497003, + "loss": 1.4583, + "step": 784 + }, + { + "epoch": 0.08, + "grad_norm": 0.09840148056749254, + "learning_rate": 0.00099228923534719, + "loss": 1.5177, + "step": 785 + }, + { + "epoch": 0.08, + "grad_norm": 0.087873993730566, + "learning_rate": 0.0009922587465529128, + "loss": 1.4561, + "step": 786 + }, + { + "epoch": 0.08, + "grad_norm": 0.10730475413220047, + "learning_rate": 0.0009922281980705653, + "loss": 1.4842, + "step": 787 + }, + { + "epoch": 0.08, + "grad_norm": 0.09819906057900965, + "learning_rate": 0.0009921975899038521, + "loss": 1.443, + "step": 788 + }, + { + "epoch": 0.08, + "grad_norm": 0.10673849388513212, + "learning_rate": 0.0009921669220564842, + "loss": 1.4846, + "step": 789 + }, + { + "epoch": 0.08, + "grad_norm": 0.09585259890060156, + "learning_rate": 0.00099213619453218, + "loss": 1.4982, + "step": 790 + }, + { + "epoch": 0.09, + "grad_norm": 0.10886473711799136, + "learning_rate": 0.0009921054073346659, + "loss": 1.5461, + "step": 791 + }, + { + "epoch": 0.09, + "grad_norm": 0.11529970337773958, + "learning_rate": 0.0009920745604676743, + "loss": 1.483, + "step": 792 + }, + { + "epoch": 0.09, + "grad_norm": 0.1049014082110331, + "learning_rate": 0.0009920436539349459, + "loss": 1.428, + "step": 793 + }, + { + "epoch": 0.09, + "grad_norm": 0.11341000507473294, + "learning_rate": 0.0009920126877402283, + "loss": 1.4127, + "step": 794 + }, + { + "epoch": 0.09, + "grad_norm": 0.0972092028209167, + "learning_rate": 0.000991981661887276, + "loss": 1.5701, + "step": 795 + }, + { + "epoch": 0.09, + "grad_norm": 0.10979701692895823, + "learning_rate": 0.0009919505763798509, + "loss": 1.4882, + "step": 796 + }, + { + "epoch": 0.09, + "grad_norm": 0.12347853346750738, + "learning_rate": 0.0009919194312217226, + "loss": 1.5197, + "step": 797 + }, + { + "epoch": 0.09, + "grad_norm": 0.12862095545623062, + "learning_rate": 0.0009918882264166671, + "loss": 1.4307, + "step": 798 + }, + { + "epoch": 0.09, + "grad_norm": 0.11011053871767552, + "learning_rate": 0.0009918569619684686, + "loss": 1.5358, + "step": 799 + }, + { + "epoch": 0.09, + "grad_norm": 0.10923348365641565, + "learning_rate": 0.0009918256378809178, + "loss": 1.3628, + "step": 800 + }, + { + "epoch": 0.09, + "grad_norm": 0.10848730542687637, + "learning_rate": 0.0009917942541578127, + "loss": 1.4926, + "step": 801 + }, + { + "epoch": 0.09, + "grad_norm": 0.09815586793245867, + "learning_rate": 0.0009917628108029588, + "loss": 1.4147, + "step": 802 + }, + { + "epoch": 0.09, + "grad_norm": 0.12150522280091537, + "learning_rate": 0.000991731307820169, + "loss": 1.3304, + "step": 803 + }, + { + "epoch": 0.09, + "grad_norm": 0.10961537259089094, + "learning_rate": 0.0009916997452132624, + "loss": 1.4825, + "step": 804 + }, + { + "epoch": 0.09, + "grad_norm": 0.11017075427928086, + "learning_rate": 0.0009916681229860669, + "loss": 1.5064, + "step": 805 + }, + { + "epoch": 0.09, + "grad_norm": 0.10544774278956268, + "learning_rate": 0.0009916364411424164, + "loss": 1.4638, + "step": 806 + }, + { + "epoch": 0.09, + "grad_norm": 0.121938065418309, + "learning_rate": 0.0009916046996861522, + "loss": 1.4857, + "step": 807 + }, + { + "epoch": 0.09, + "grad_norm": 0.12330109961033686, + "learning_rate": 0.0009915728986211237, + "loss": 1.3688, + "step": 808 + }, + { + "epoch": 0.09, + "grad_norm": 0.1371285563688456, + "learning_rate": 0.0009915410379511865, + "loss": 1.5859, + "step": 809 + }, + { + "epoch": 0.09, + "grad_norm": 0.1053850031308369, + "learning_rate": 0.0009915091176802035, + "loss": 1.3979, + "step": 810 + }, + { + "epoch": 0.09, + "grad_norm": 0.10649860867690676, + "learning_rate": 0.000991477137812046, + "loss": 1.5179, + "step": 811 + }, + { + "epoch": 0.09, + "grad_norm": 0.13922844265421294, + "learning_rate": 0.0009914450983505908, + "loss": 1.3672, + "step": 812 + }, + { + "epoch": 0.09, + "grad_norm": 0.1287741960027465, + "learning_rate": 0.0009914129992997232, + "loss": 1.465, + "step": 813 + }, + { + "epoch": 0.09, + "grad_norm": 0.10611709028639908, + "learning_rate": 0.0009913808406633354, + "loss": 1.568, + "step": 814 + }, + { + "epoch": 0.09, + "grad_norm": 0.10666038719575555, + "learning_rate": 0.0009913486224453266, + "loss": 1.4613, + "step": 815 + }, + { + "epoch": 0.09, + "grad_norm": 0.09552780948354576, + "learning_rate": 0.0009913163446496032, + "loss": 1.3986, + "step": 816 + }, + { + "epoch": 0.09, + "grad_norm": 0.10458514944773992, + "learning_rate": 0.0009912840072800796, + "loss": 1.4997, + "step": 817 + }, + { + "epoch": 0.09, + "grad_norm": 0.10781289805463203, + "learning_rate": 0.0009912516103406759, + "loss": 1.4163, + "step": 818 + }, + { + "epoch": 0.09, + "grad_norm": 0.11066849525641222, + "learning_rate": 0.0009912191538353212, + "loss": 1.3647, + "step": 819 + }, + { + "epoch": 0.09, + "grad_norm": 0.11037102460141801, + "learning_rate": 0.0009911866377679505, + "loss": 1.4986, + "step": 820 + }, + { + "epoch": 0.09, + "grad_norm": 0.10550276331452135, + "learning_rate": 0.0009911540621425066, + "loss": 1.4142, + "step": 821 + }, + { + "epoch": 0.09, + "grad_norm": 0.09357877084866194, + "learning_rate": 0.0009911214269629394, + "loss": 1.4046, + "step": 822 + }, + { + "epoch": 0.09, + "grad_norm": 0.10821731143868805, + "learning_rate": 0.000991088732233206, + "loss": 1.4454, + "step": 823 + }, + { + "epoch": 0.09, + "grad_norm": 0.10885502691894844, + "learning_rate": 0.0009910559779572707, + "loss": 1.4416, + "step": 824 + }, + { + "epoch": 0.09, + "grad_norm": 0.09442699393531002, + "learning_rate": 0.0009910231641391052, + "loss": 1.5394, + "step": 825 + }, + { + "epoch": 0.09, + "grad_norm": 0.10525545876158422, + "learning_rate": 0.0009909902907826883, + "loss": 1.546, + "step": 826 + }, + { + "epoch": 0.09, + "grad_norm": 0.11720992112248244, + "learning_rate": 0.000990957357892006, + "loss": 1.5321, + "step": 827 + }, + { + "epoch": 0.09, + "grad_norm": 0.10864424543494355, + "learning_rate": 0.0009909243654710514, + "loss": 1.4025, + "step": 828 + }, + { + "epoch": 0.09, + "grad_norm": 0.1117918912175246, + "learning_rate": 0.000990891313523825, + "loss": 1.692, + "step": 829 + }, + { + "epoch": 0.09, + "grad_norm": 0.1313258688345413, + "learning_rate": 0.0009908582020543345, + "loss": 1.5812, + "step": 830 + }, + { + "epoch": 0.09, + "grad_norm": 0.11780429486456513, + "learning_rate": 0.0009908250310665947, + "loss": 1.556, + "step": 831 + }, + { + "epoch": 0.09, + "grad_norm": 0.11012726750810582, + "learning_rate": 0.0009907918005646276, + "loss": 1.4705, + "step": 832 + }, + { + "epoch": 0.09, + "grad_norm": 0.10699556897583314, + "learning_rate": 0.000990758510552463, + "loss": 1.4487, + "step": 833 + }, + { + "epoch": 0.09, + "grad_norm": 0.1012229677196335, + "learning_rate": 0.0009907251610341368, + "loss": 1.5403, + "step": 834 + }, + { + "epoch": 0.09, + "grad_norm": 0.1236371026793818, + "learning_rate": 0.0009906917520136932, + "loss": 1.4242, + "step": 835 + }, + { + "epoch": 0.09, + "grad_norm": 0.12682607602434304, + "learning_rate": 0.000990658283495183, + "loss": 1.4958, + "step": 836 + }, + { + "epoch": 0.09, + "grad_norm": 0.1035656380844131, + "learning_rate": 0.0009906247554826643, + "loss": 1.4714, + "step": 837 + }, + { + "epoch": 0.09, + "grad_norm": 0.10153729327718629, + "learning_rate": 0.0009905911679802027, + "loss": 1.4798, + "step": 838 + }, + { + "epoch": 0.09, + "grad_norm": 0.09357476745648424, + "learning_rate": 0.0009905575209918705, + "loss": 1.5151, + "step": 839 + }, + { + "epoch": 0.09, + "grad_norm": 0.10110221465858242, + "learning_rate": 0.0009905238145217477, + "loss": 1.5721, + "step": 840 + }, + { + "epoch": 0.09, + "grad_norm": 0.09581386231874159, + "learning_rate": 0.0009904900485739213, + "loss": 1.5697, + "step": 841 + }, + { + "epoch": 0.09, + "grad_norm": 0.11136596229104052, + "learning_rate": 0.0009904562231524857, + "loss": 1.5124, + "step": 842 + }, + { + "epoch": 0.09, + "grad_norm": 0.10563905529316776, + "learning_rate": 0.0009904223382615417, + "loss": 1.5153, + "step": 843 + }, + { + "epoch": 0.09, + "grad_norm": 0.105291268018836, + "learning_rate": 0.000990388393905199, + "loss": 1.5121, + "step": 844 + }, + { + "epoch": 0.09, + "grad_norm": 0.13132650708524377, + "learning_rate": 0.0009903543900875726, + "loss": 1.565, + "step": 845 + }, + { + "epoch": 0.09, + "grad_norm": 0.11176225233742272, + "learning_rate": 0.000990320326812786, + "loss": 1.5904, + "step": 846 + }, + { + "epoch": 0.09, + "grad_norm": 0.09906334204646819, + "learning_rate": 0.0009902862040849694, + "loss": 1.434, + "step": 847 + }, + { + "epoch": 0.09, + "grad_norm": 0.10512336498042481, + "learning_rate": 0.0009902520219082602, + "loss": 1.4771, + "step": 848 + }, + { + "epoch": 0.09, + "grad_norm": 0.10922470793825259, + "learning_rate": 0.0009902177802868033, + "loss": 1.5708, + "step": 849 + }, + { + "epoch": 0.09, + "grad_norm": 0.1013422099157301, + "learning_rate": 0.0009901834792247503, + "loss": 1.543, + "step": 850 + }, + { + "epoch": 0.09, + "grad_norm": 0.11685308384489977, + "learning_rate": 0.0009901491187262609, + "loss": 1.5184, + "step": 851 + }, + { + "epoch": 0.09, + "grad_norm": 0.1068500340383022, + "learning_rate": 0.0009901146987955007, + "loss": 1.3863, + "step": 852 + }, + { + "epoch": 0.09, + "grad_norm": 0.10535661820687749, + "learning_rate": 0.0009900802194366437, + "loss": 1.4796, + "step": 853 + }, + { + "epoch": 0.09, + "grad_norm": 0.11783230529707762, + "learning_rate": 0.0009900456806538707, + "loss": 1.3684, + "step": 854 + }, + { + "epoch": 0.09, + "grad_norm": 0.11384519631304521, + "learning_rate": 0.0009900110824513691, + "loss": 1.4558, + "step": 855 + }, + { + "epoch": 0.09, + "grad_norm": 0.10264058083146971, + "learning_rate": 0.0009899764248333348, + "loss": 1.434, + "step": 856 + }, + { + "epoch": 0.09, + "grad_norm": 0.11519712314754389, + "learning_rate": 0.0009899417078039696, + "loss": 1.4387, + "step": 857 + }, + { + "epoch": 0.09, + "grad_norm": 0.1123372880638399, + "learning_rate": 0.0009899069313674832, + "loss": 1.4141, + "step": 858 + }, + { + "epoch": 0.09, + "grad_norm": 0.1097586324100307, + "learning_rate": 0.0009898720955280925, + "loss": 1.5496, + "step": 859 + }, + { + "epoch": 0.09, + "grad_norm": 0.11048300266047464, + "learning_rate": 0.0009898372002900213, + "loss": 1.3956, + "step": 860 + }, + { + "epoch": 0.09, + "grad_norm": 0.10667135360093603, + "learning_rate": 0.000989802245657501, + "loss": 1.4431, + "step": 861 + }, + { + "epoch": 0.09, + "grad_norm": 0.10169688375776761, + "learning_rate": 0.0009897672316347696, + "loss": 1.4944, + "step": 862 + }, + { + "epoch": 0.09, + "grad_norm": 0.10805112813479695, + "learning_rate": 0.000989732158226073, + "loss": 1.5502, + "step": 863 + }, + { + "epoch": 0.09, + "grad_norm": 0.11383346267574627, + "learning_rate": 0.0009896970254356637, + "loss": 1.5222, + "step": 864 + }, + { + "epoch": 0.09, + "grad_norm": 0.10437239928845843, + "learning_rate": 0.000989661833267802, + "loss": 1.557, + "step": 865 + }, + { + "epoch": 0.09, + "grad_norm": 0.12880169356158996, + "learning_rate": 0.0009896265817267548, + "loss": 1.6166, + "step": 866 + }, + { + "epoch": 0.09, + "grad_norm": 0.12070472393705185, + "learning_rate": 0.0009895912708167967, + "loss": 1.5124, + "step": 867 + }, + { + "epoch": 0.09, + "grad_norm": 0.12068593858069836, + "learning_rate": 0.000989555900542209, + "loss": 1.3678, + "step": 868 + }, + { + "epoch": 0.09, + "grad_norm": 0.11386637597424645, + "learning_rate": 0.0009895204709072806, + "loss": 1.3299, + "step": 869 + }, + { + "epoch": 0.09, + "grad_norm": 0.12175046980633066, + "learning_rate": 0.0009894849819163075, + "loss": 1.5199, + "step": 870 + }, + { + "epoch": 0.09, + "grad_norm": 0.10357307243540072, + "learning_rate": 0.000989449433573593, + "loss": 1.4439, + "step": 871 + }, + { + "epoch": 0.09, + "grad_norm": 0.11587763922735016, + "learning_rate": 0.000989413825883447, + "loss": 1.424, + "step": 872 + }, + { + "epoch": 0.09, + "grad_norm": 0.10361221368654966, + "learning_rate": 0.0009893781588501875, + "loss": 1.5112, + "step": 873 + }, + { + "epoch": 0.09, + "grad_norm": 0.1307499271286858, + "learning_rate": 0.000989342432478139, + "loss": 1.467, + "step": 874 + }, + { + "epoch": 0.09, + "grad_norm": 0.10556346561030903, + "learning_rate": 0.0009893066467716336, + "loss": 1.6212, + "step": 875 + }, + { + "epoch": 0.09, + "grad_norm": 0.13185572299133208, + "learning_rate": 0.0009892708017350104, + "loss": 1.4886, + "step": 876 + }, + { + "epoch": 0.09, + "grad_norm": 0.11593000129396568, + "learning_rate": 0.0009892348973726157, + "loss": 1.4242, + "step": 877 + }, + { + "epoch": 0.09, + "grad_norm": 0.10263973471205634, + "learning_rate": 0.0009891989336888033, + "loss": 1.4018, + "step": 878 + }, + { + "epoch": 0.09, + "grad_norm": 0.09388484545466252, + "learning_rate": 0.0009891629106879333, + "loss": 1.5406, + "step": 879 + }, + { + "epoch": 0.09, + "grad_norm": 0.10579398130719597, + "learning_rate": 0.0009891268283743742, + "loss": 1.4365, + "step": 880 + }, + { + "epoch": 0.09, + "grad_norm": 0.10205073561963973, + "learning_rate": 0.0009890906867525008, + "loss": 1.36, + "step": 881 + }, + { + "epoch": 0.09, + "grad_norm": 0.0993925966084655, + "learning_rate": 0.0009890544858266953, + "loss": 1.5417, + "step": 882 + }, + { + "epoch": 0.09, + "grad_norm": 0.12462405391860587, + "learning_rate": 0.0009890182256013476, + "loss": 1.4766, + "step": 883 + }, + { + "epoch": 0.1, + "grad_norm": 0.11871289446812809, + "learning_rate": 0.0009889819060808541, + "loss": 1.3472, + "step": 884 + }, + { + "epoch": 0.1, + "grad_norm": 0.10788692034328204, + "learning_rate": 0.0009889455272696186, + "loss": 1.5064, + "step": 885 + }, + { + "epoch": 0.1, + "grad_norm": 0.10830663754630337, + "learning_rate": 0.0009889090891720524, + "loss": 1.4562, + "step": 886 + }, + { + "epoch": 0.1, + "grad_norm": 0.1020377985895941, + "learning_rate": 0.0009888725917925735, + "loss": 1.4743, + "step": 887 + }, + { + "epoch": 0.1, + "grad_norm": 0.11402004313995222, + "learning_rate": 0.0009888360351356076, + "loss": 1.5885, + "step": 888 + }, + { + "epoch": 0.1, + "grad_norm": 0.10404694253312369, + "learning_rate": 0.0009887994192055872, + "loss": 1.4231, + "step": 889 + }, + { + "epoch": 0.1, + "grad_norm": 0.11522811129082161, + "learning_rate": 0.0009887627440069518, + "loss": 1.4904, + "step": 890 + }, + { + "epoch": 0.1, + "grad_norm": 0.11120179814086706, + "learning_rate": 0.0009887260095441488, + "loss": 1.4243, + "step": 891 + }, + { + "epoch": 0.1, + "grad_norm": 0.11751114935366559, + "learning_rate": 0.0009886892158216323, + "loss": 1.4856, + "step": 892 + }, + { + "epoch": 0.1, + "grad_norm": 0.09387632888638782, + "learning_rate": 0.0009886523628438635, + "loss": 1.4743, + "step": 893 + }, + { + "epoch": 0.1, + "grad_norm": 0.1089212935769675, + "learning_rate": 0.0009886154506153113, + "loss": 1.4551, + "step": 894 + }, + { + "epoch": 0.1, + "grad_norm": 0.10504625831674352, + "learning_rate": 0.000988578479140451, + "loss": 1.5762, + "step": 895 + }, + { + "epoch": 0.1, + "grad_norm": 0.11992942665130872, + "learning_rate": 0.0009885414484237657, + "loss": 1.5388, + "step": 896 + }, + { + "epoch": 0.1, + "grad_norm": 0.12013797851445274, + "learning_rate": 0.0009885043584697457, + "loss": 1.3869, + "step": 897 + }, + { + "epoch": 0.1, + "grad_norm": 0.1055384732184131, + "learning_rate": 0.000988467209282888, + "loss": 1.4898, + "step": 898 + }, + { + "epoch": 0.1, + "grad_norm": 0.10227730583423617, + "learning_rate": 0.000988430000867697, + "loss": 1.4475, + "step": 899 + }, + { + "epoch": 0.1, + "grad_norm": 0.11746558731715007, + "learning_rate": 0.0009883927332286846, + "loss": 1.5964, + "step": 900 + }, + { + "epoch": 0.1, + "grad_norm": 0.11176430850331567, + "learning_rate": 0.0009883554063703697, + "loss": 1.4568, + "step": 901 + }, + { + "epoch": 0.1, + "grad_norm": 0.10878283844133003, + "learning_rate": 0.0009883180202972781, + "loss": 1.5068, + "step": 902 + }, + { + "epoch": 0.1, + "grad_norm": 0.1241270556094801, + "learning_rate": 0.0009882805750139432, + "loss": 1.5213, + "step": 903 + }, + { + "epoch": 0.1, + "grad_norm": 0.11725153365662314, + "learning_rate": 0.000988243070524905, + "loss": 1.5395, + "step": 904 + }, + { + "epoch": 0.1, + "grad_norm": 0.11545711343200074, + "learning_rate": 0.0009882055068347114, + "loss": 1.5103, + "step": 905 + }, + { + "epoch": 0.1, + "grad_norm": 0.10455006262284446, + "learning_rate": 0.000988167883947917, + "loss": 1.4557, + "step": 906 + }, + { + "epoch": 0.1, + "grad_norm": 0.10220852893232839, + "learning_rate": 0.0009881302018690833, + "loss": 1.4858, + "step": 907 + }, + { + "epoch": 0.1, + "grad_norm": 0.0935560395575124, + "learning_rate": 0.0009880924606027802, + "loss": 1.5, + "step": 908 + }, + { + "epoch": 0.1, + "grad_norm": 0.10430753292676732, + "learning_rate": 0.0009880546601535834, + "loss": 1.4719, + "step": 909 + }, + { + "epoch": 0.1, + "grad_norm": 0.10808182563791133, + "learning_rate": 0.0009880168005260766, + "loss": 1.4541, + "step": 910 + }, + { + "epoch": 0.1, + "grad_norm": 0.1250933395522429, + "learning_rate": 0.0009879788817248503, + "loss": 1.5065, + "step": 911 + }, + { + "epoch": 0.1, + "grad_norm": 0.1072916390151232, + "learning_rate": 0.000987940903754502, + "loss": 1.456, + "step": 912 + }, + { + "epoch": 0.1, + "grad_norm": 0.09340424157858447, + "learning_rate": 0.0009879028666196373, + "loss": 1.5287, + "step": 913 + }, + { + "epoch": 0.1, + "grad_norm": 0.11313097745359296, + "learning_rate": 0.0009878647703248677, + "loss": 1.5945, + "step": 914 + }, + { + "epoch": 0.1, + "grad_norm": 0.09626876557166207, + "learning_rate": 0.0009878266148748128, + "loss": 1.5226, + "step": 915 + }, + { + "epoch": 0.1, + "grad_norm": 0.1136594247718088, + "learning_rate": 0.000987788400274099, + "loss": 1.3699, + "step": 916 + }, + { + "epoch": 0.1, + "grad_norm": 0.09857785705146646, + "learning_rate": 0.0009877501265273603, + "loss": 1.4187, + "step": 917 + }, + { + "epoch": 0.1, + "grad_norm": 0.09942441719629294, + "learning_rate": 0.000987711793639237, + "loss": 1.5443, + "step": 918 + }, + { + "epoch": 0.1, + "grad_norm": 0.11513364964116303, + "learning_rate": 0.0009876734016143773, + "loss": 1.5316, + "step": 919 + }, + { + "epoch": 0.1, + "grad_norm": 0.10223246559180174, + "learning_rate": 0.0009876349504574365, + "loss": 1.5269, + "step": 920 + }, + { + "epoch": 0.1, + "grad_norm": 0.09292632355039271, + "learning_rate": 0.000987596440173077, + "loss": 1.3696, + "step": 921 + }, + { + "epoch": 0.1, + "grad_norm": 0.11279813746379812, + "learning_rate": 0.0009875578707659676, + "loss": 1.4942, + "step": 922 + }, + { + "epoch": 0.1, + "grad_norm": 0.09788112736253952, + "learning_rate": 0.0009875192422407859, + "loss": 1.418, + "step": 923 + }, + { + "epoch": 0.1, + "grad_norm": 0.09913455232850231, + "learning_rate": 0.0009874805546022153, + "loss": 1.5017, + "step": 924 + }, + { + "epoch": 0.1, + "grad_norm": 0.10018718019093535, + "learning_rate": 0.0009874418078549467, + "loss": 1.4764, + "step": 925 + }, + { + "epoch": 0.1, + "grad_norm": 0.10335561243043875, + "learning_rate": 0.0009874030020036787, + "loss": 1.4936, + "step": 926 + }, + { + "epoch": 0.1, + "grad_norm": 0.1014320149701531, + "learning_rate": 0.0009873641370531162, + "loss": 1.4366, + "step": 927 + }, + { + "epoch": 0.1, + "grad_norm": 0.11382615640806051, + "learning_rate": 0.0009873252130079718, + "loss": 1.527, + "step": 928 + }, + { + "epoch": 0.1, + "grad_norm": 0.09703283161987435, + "learning_rate": 0.0009872862298729653, + "loss": 1.3459, + "step": 929 + }, + { + "epoch": 0.1, + "grad_norm": 0.12078726730680549, + "learning_rate": 0.0009872471876528235, + "loss": 1.4748, + "step": 930 + }, + { + "epoch": 0.1, + "grad_norm": 0.13779708131618348, + "learning_rate": 0.0009872080863522806, + "loss": 1.4424, + "step": 931 + }, + { + "epoch": 0.1, + "grad_norm": 0.09662842113778519, + "learning_rate": 0.0009871689259760771, + "loss": 1.506, + "step": 932 + }, + { + "epoch": 0.1, + "grad_norm": 0.10470466535314994, + "learning_rate": 0.0009871297065289623, + "loss": 1.4008, + "step": 933 + }, + { + "epoch": 0.1, + "grad_norm": 0.09825370182641988, + "learning_rate": 0.000987090428015691, + "loss": 1.457, + "step": 934 + }, + { + "epoch": 0.1, + "grad_norm": 0.10128986233646643, + "learning_rate": 0.000987051090441026, + "loss": 1.5307, + "step": 935 + }, + { + "epoch": 0.1, + "grad_norm": 0.10257667805231513, + "learning_rate": 0.0009870116938097374, + "loss": 1.5253, + "step": 936 + }, + { + "epoch": 0.1, + "grad_norm": 0.13212406497778306, + "learning_rate": 0.0009869722381266016, + "loss": 1.4041, + "step": 937 + }, + { + "epoch": 0.1, + "grad_norm": 0.09740066939177527, + "learning_rate": 0.0009869327233964032, + "loss": 1.4791, + "step": 938 + }, + { + "epoch": 0.1, + "grad_norm": 0.09993232471086905, + "learning_rate": 0.0009868931496239334, + "loss": 1.3653, + "step": 939 + }, + { + "epoch": 0.1, + "grad_norm": 0.10008462495414401, + "learning_rate": 0.0009868535168139907, + "loss": 1.3846, + "step": 940 + }, + { + "epoch": 0.1, + "grad_norm": 0.13685743246388624, + "learning_rate": 0.0009868138249713805, + "loss": 1.4776, + "step": 941 + }, + { + "epoch": 0.1, + "grad_norm": 0.09142847020327265, + "learning_rate": 0.0009867740741009159, + "loss": 1.3617, + "step": 942 + }, + { + "epoch": 0.1, + "grad_norm": 0.11429730776068008, + "learning_rate": 0.0009867342642074165, + "loss": 1.4035, + "step": 943 + }, + { + "epoch": 0.1, + "grad_norm": 0.12908152202529422, + "learning_rate": 0.0009866943952957096, + "loss": 1.4352, + "step": 944 + }, + { + "epoch": 0.1, + "grad_norm": 0.12166464155376931, + "learning_rate": 0.0009866544673706294, + "loss": 1.3091, + "step": 945 + }, + { + "epoch": 0.1, + "grad_norm": 0.11394285539064337, + "learning_rate": 0.0009866144804370172, + "loss": 1.418, + "step": 946 + }, + { + "epoch": 0.1, + "grad_norm": 0.11662230091559467, + "learning_rate": 0.0009865744344997216, + "loss": 1.4271, + "step": 947 + }, + { + "epoch": 0.1, + "grad_norm": 0.10758530149575866, + "learning_rate": 0.0009865343295635985, + "loss": 1.4356, + "step": 948 + }, + { + "epoch": 0.1, + "grad_norm": 0.10608808920552162, + "learning_rate": 0.0009864941656335105, + "loss": 1.4282, + "step": 949 + }, + { + "epoch": 0.1, + "grad_norm": 0.10309934585985372, + "learning_rate": 0.0009864539427143278, + "loss": 1.514, + "step": 950 + }, + { + "epoch": 0.1, + "grad_norm": 0.10917924921838347, + "learning_rate": 0.0009864136608109272, + "loss": 1.4121, + "step": 951 + }, + { + "epoch": 0.1, + "grad_norm": 0.11296901752973143, + "learning_rate": 0.0009863733199281938, + "loss": 1.4354, + "step": 952 + }, + { + "epoch": 0.1, + "grad_norm": 0.12866370183838677, + "learning_rate": 0.0009863329200710182, + "loss": 1.5468, + "step": 953 + }, + { + "epoch": 0.1, + "grad_norm": 0.1286966396951818, + "learning_rate": 0.0009862924612442994, + "loss": 1.4508, + "step": 954 + }, + { + "epoch": 0.1, + "grad_norm": 0.13960278705237253, + "learning_rate": 0.0009862519434529434, + "loss": 1.538, + "step": 955 + }, + { + "epoch": 0.1, + "grad_norm": 0.11652432188490669, + "learning_rate": 0.0009862113667018627, + "loss": 1.5395, + "step": 956 + }, + { + "epoch": 0.1, + "grad_norm": 0.10072364916856659, + "learning_rate": 0.0009861707309959777, + "loss": 1.4169, + "step": 957 + }, + { + "epoch": 0.1, + "grad_norm": 0.12757051256843824, + "learning_rate": 0.0009861300363402153, + "loss": 1.4389, + "step": 958 + }, + { + "epoch": 0.1, + "grad_norm": 0.111629121012831, + "learning_rate": 0.0009860892827395103, + "loss": 1.598, + "step": 959 + }, + { + "epoch": 0.1, + "grad_norm": 0.12919976312893156, + "learning_rate": 0.0009860484701988037, + "loss": 1.4479, + "step": 960 + }, + { + "epoch": 0.1, + "grad_norm": 0.11279528892079407, + "learning_rate": 0.0009860075987230446, + "loss": 1.5771, + "step": 961 + }, + { + "epoch": 0.1, + "grad_norm": 0.1278424802958141, + "learning_rate": 0.0009859666683171885, + "loss": 1.5236, + "step": 962 + }, + { + "epoch": 0.1, + "grad_norm": 0.11501125985783359, + "learning_rate": 0.0009859256789861986, + "loss": 1.4961, + "step": 963 + }, + { + "epoch": 0.1, + "grad_norm": 0.09565466997074817, + "learning_rate": 0.000985884630735045, + "loss": 1.4487, + "step": 964 + }, + { + "epoch": 0.1, + "grad_norm": 0.09876986226137081, + "learning_rate": 0.0009858435235687044, + "loss": 1.617, + "step": 965 + }, + { + "epoch": 0.1, + "grad_norm": 0.10299888552267045, + "learning_rate": 0.000985802357492162, + "loss": 1.4978, + "step": 966 + }, + { + "epoch": 0.1, + "grad_norm": 0.11351964792472877, + "learning_rate": 0.0009857611325104088, + "loss": 1.4704, + "step": 967 + }, + { + "epoch": 0.1, + "grad_norm": 0.11014860613660067, + "learning_rate": 0.0009857198486284435, + "loss": 1.5225, + "step": 968 + }, + { + "epoch": 0.1, + "grad_norm": 0.10712646048648043, + "learning_rate": 0.0009856785058512721, + "loss": 1.5689, + "step": 969 + }, + { + "epoch": 0.1, + "grad_norm": 0.12399232842174956, + "learning_rate": 0.0009856371041839075, + "loss": 1.5708, + "step": 970 + }, + { + "epoch": 0.1, + "grad_norm": 0.09681065676026035, + "learning_rate": 0.00098559564363137, + "loss": 1.6005, + "step": 971 + }, + { + "epoch": 0.1, + "grad_norm": 0.10178011038545998, + "learning_rate": 0.0009855541241986863, + "loss": 1.512, + "step": 972 + }, + { + "epoch": 0.1, + "grad_norm": 0.12471617013219558, + "learning_rate": 0.000985512545890891, + "loss": 1.6852, + "step": 973 + }, + { + "epoch": 0.1, + "grad_norm": 0.10686030336674802, + "learning_rate": 0.000985470908713026, + "loss": 1.4567, + "step": 974 + }, + { + "epoch": 0.1, + "grad_norm": 0.10697882274944155, + "learning_rate": 0.0009854292126701397, + "loss": 1.5434, + "step": 975 + }, + { + "epoch": 0.1, + "grad_norm": 0.10325976456318522, + "learning_rate": 0.0009853874577672875, + "loss": 1.4225, + "step": 976 + }, + { + "epoch": 0.11, + "grad_norm": 0.09506887103679255, + "learning_rate": 0.0009853456440095327, + "loss": 1.4139, + "step": 977 + }, + { + "epoch": 0.11, + "grad_norm": 0.09578804705306913, + "learning_rate": 0.0009853037714019454, + "loss": 1.4377, + "step": 978 + }, + { + "epoch": 0.11, + "grad_norm": 0.09398835151976992, + "learning_rate": 0.000985261839949603, + "loss": 1.5171, + "step": 979 + }, + { + "epoch": 0.11, + "grad_norm": 0.1143179413310966, + "learning_rate": 0.0009852198496575894, + "loss": 1.5511, + "step": 980 + }, + { + "epoch": 0.11, + "grad_norm": 0.10145389270154191, + "learning_rate": 0.000985177800530996, + "loss": 1.4236, + "step": 981 + }, + { + "epoch": 0.11, + "grad_norm": 0.08855066022940801, + "learning_rate": 0.0009851356925749217, + "loss": 1.428, + "step": 982 + }, + { + "epoch": 0.11, + "grad_norm": 0.10133105738476883, + "learning_rate": 0.0009850935257944722, + "loss": 1.3685, + "step": 983 + }, + { + "epoch": 0.11, + "grad_norm": 0.10335410310098274, + "learning_rate": 0.0009850513001947604, + "loss": 1.464, + "step": 984 + }, + { + "epoch": 0.11, + "grad_norm": 0.09331951297898144, + "learning_rate": 0.0009850090157809061, + "loss": 1.4766, + "step": 985 + }, + { + "epoch": 0.11, + "grad_norm": 0.0851802762504621, + "learning_rate": 0.0009849666725580367, + "loss": 1.6014, + "step": 986 + }, + { + "epoch": 0.11, + "grad_norm": 0.09555168236454595, + "learning_rate": 0.0009849242705312863, + "loss": 1.4576, + "step": 987 + }, + { + "epoch": 0.11, + "grad_norm": 0.0921061434635937, + "learning_rate": 0.000984881809705796, + "loss": 1.4993, + "step": 988 + }, + { + "epoch": 0.11, + "grad_norm": 0.08929323937431682, + "learning_rate": 0.000984839290086715, + "loss": 1.5275, + "step": 989 + }, + { + "epoch": 0.11, + "grad_norm": 0.08698430865853823, + "learning_rate": 0.0009847967116791985, + "loss": 1.4085, + "step": 990 + }, + { + "epoch": 0.11, + "grad_norm": 0.1106918708421856, + "learning_rate": 0.0009847540744884094, + "loss": 1.4351, + "step": 991 + }, + { + "epoch": 0.11, + "grad_norm": 0.1198492958950366, + "learning_rate": 0.0009847113785195175, + "loss": 1.5471, + "step": 992 + }, + { + "epoch": 0.11, + "grad_norm": 0.09043846175343777, + "learning_rate": 0.0009846686237776998, + "loss": 1.3394, + "step": 993 + }, + { + "epoch": 0.11, + "grad_norm": 0.09081924089189229, + "learning_rate": 0.0009846258102681406, + "loss": 1.4382, + "step": 994 + }, + { + "epoch": 0.11, + "grad_norm": 0.09804043302232386, + "learning_rate": 0.0009845829379960312, + "loss": 1.3546, + "step": 995 + }, + { + "epoch": 0.11, + "grad_norm": 0.10160497776628699, + "learning_rate": 0.00098454000696657, + "loss": 1.425, + "step": 996 + }, + { + "epoch": 0.11, + "grad_norm": 0.09855611576341153, + "learning_rate": 0.0009844970171849624, + "loss": 1.3847, + "step": 997 + }, + { + "epoch": 0.11, + "grad_norm": 0.08069941782676368, + "learning_rate": 0.000984453968656421, + "loss": 1.4353, + "step": 998 + }, + { + "epoch": 0.11, + "grad_norm": 0.10779230448674895, + "learning_rate": 0.0009844108613861662, + "loss": 1.5072, + "step": 999 + }, + { + "epoch": 0.11, + "grad_norm": 0.10385625952467904, + "learning_rate": 0.000984367695379424, + "loss": 1.4946, + "step": 1000 + }, + { + "epoch": 0.11, + "grad_norm": 0.10086289488325921, + "learning_rate": 0.0009843244706414292, + "loss": 1.4217, + "step": 1001 + }, + { + "epoch": 0.11, + "grad_norm": 0.1119121824309659, + "learning_rate": 0.0009842811871774225, + "loss": 1.4483, + "step": 1002 + }, + { + "epoch": 0.11, + "grad_norm": 0.1080440169974913, + "learning_rate": 0.000984237844992652, + "loss": 1.4309, + "step": 1003 + }, + { + "epoch": 0.11, + "grad_norm": 0.10987133844288485, + "learning_rate": 0.0009841944440923736, + "loss": 1.6328, + "step": 1004 + }, + { + "epoch": 0.11, + "grad_norm": 0.10625514927826236, + "learning_rate": 0.0009841509844818496, + "loss": 1.4141, + "step": 1005 + }, + { + "epoch": 0.11, + "grad_norm": 0.10159122308468017, + "learning_rate": 0.0009841074661663496, + "loss": 1.4201, + "step": 1006 + }, + { + "epoch": 0.11, + "grad_norm": 0.10218291648959085, + "learning_rate": 0.0009840638891511504, + "loss": 1.4711, + "step": 1007 + }, + { + "epoch": 0.11, + "grad_norm": 0.09931046150217804, + "learning_rate": 0.0009840202534415357, + "loss": 1.5503, + "step": 1008 + }, + { + "epoch": 0.11, + "grad_norm": 0.10118813706201002, + "learning_rate": 0.0009839765590427968, + "loss": 1.4787, + "step": 1009 + }, + { + "epoch": 0.11, + "grad_norm": 0.11350892060283965, + "learning_rate": 0.0009839328059602316, + "loss": 1.4651, + "step": 1010 + }, + { + "epoch": 0.11, + "grad_norm": 0.10192194476484162, + "learning_rate": 0.000983888994199145, + "loss": 1.3939, + "step": 1011 + }, + { + "epoch": 0.11, + "grad_norm": 0.09806497870255045, + "learning_rate": 0.0009838451237648498, + "loss": 1.544, + "step": 1012 + }, + { + "epoch": 0.11, + "grad_norm": 0.1076394976182956, + "learning_rate": 0.0009838011946626652, + "loss": 1.4081, + "step": 1013 + }, + { + "epoch": 0.11, + "grad_norm": 0.10085057657545801, + "learning_rate": 0.0009837572068979179, + "loss": 1.4804, + "step": 1014 + }, + { + "epoch": 0.11, + "grad_norm": 0.10371985397156486, + "learning_rate": 0.0009837131604759416, + "loss": 1.6341, + "step": 1015 + }, + { + "epoch": 0.11, + "grad_norm": 0.11205728480425982, + "learning_rate": 0.0009836690554020768, + "loss": 1.4994, + "step": 1016 + }, + { + "epoch": 0.11, + "grad_norm": 0.0986560095523421, + "learning_rate": 0.0009836248916816716, + "loss": 1.3913, + "step": 1017 + }, + { + "epoch": 0.11, + "grad_norm": 0.11832071985582042, + "learning_rate": 0.000983580669320081, + "loss": 1.4542, + "step": 1018 + }, + { + "epoch": 0.11, + "grad_norm": 0.11660566131058599, + "learning_rate": 0.0009835363883226673, + "loss": 1.3678, + "step": 1019 + }, + { + "epoch": 0.11, + "grad_norm": 0.11717444386866575, + "learning_rate": 0.0009834920486947994, + "loss": 1.5154, + "step": 1020 + }, + { + "epoch": 0.11, + "grad_norm": 0.11973506149009075, + "learning_rate": 0.0009834476504418535, + "loss": 1.5199, + "step": 1021 + }, + { + "epoch": 0.11, + "grad_norm": 0.12117972897901545, + "learning_rate": 0.0009834031935692135, + "loss": 1.5541, + "step": 1022 + }, + { + "epoch": 0.11, + "grad_norm": 0.09979961498083265, + "learning_rate": 0.0009833586780822697, + "loss": 1.4248, + "step": 1023 + }, + { + "epoch": 0.11, + "grad_norm": 0.08245349674048885, + "learning_rate": 0.0009833141039864198, + "loss": 1.3094, + "step": 1024 + }, + { + "epoch": 0.11, + "grad_norm": 0.09588735228913653, + "learning_rate": 0.0009832694712870688, + "loss": 1.4229, + "step": 1025 + }, + { + "epoch": 0.11, + "grad_norm": 0.09828741670587042, + "learning_rate": 0.000983224779989628, + "loss": 1.569, + "step": 1026 + }, + { + "epoch": 0.11, + "grad_norm": 0.11780202591373197, + "learning_rate": 0.0009831800300995166, + "loss": 1.418, + "step": 1027 + }, + { + "epoch": 0.11, + "grad_norm": 0.09290873549896511, + "learning_rate": 0.000983135221622161, + "loss": 1.4123, + "step": 1028 + }, + { + "epoch": 0.11, + "grad_norm": 0.10994217265314823, + "learning_rate": 0.000983090354562994, + "loss": 1.5884, + "step": 1029 + }, + { + "epoch": 0.11, + "grad_norm": 0.11691382359685173, + "learning_rate": 0.0009830454289274562, + "loss": 1.4539, + "step": 1030 + }, + { + "epoch": 0.11, + "grad_norm": 0.1054038050930932, + "learning_rate": 0.0009830004447209948, + "loss": 1.4914, + "step": 1031 + }, + { + "epoch": 0.11, + "grad_norm": 0.093228195566531, + "learning_rate": 0.0009829554019490643, + "loss": 1.6054, + "step": 1032 + }, + { + "epoch": 0.11, + "grad_norm": 0.09753879636453995, + "learning_rate": 0.0009829103006171263, + "loss": 1.5552, + "step": 1033 + }, + { + "epoch": 0.11, + "grad_norm": 0.0990335731096939, + "learning_rate": 0.0009828651407306494, + "loss": 1.4238, + "step": 1034 + }, + { + "epoch": 0.11, + "grad_norm": 0.10536520343116573, + "learning_rate": 0.0009828199222951097, + "loss": 1.5421, + "step": 1035 + }, + { + "epoch": 0.11, + "grad_norm": 0.10416559849628501, + "learning_rate": 0.0009827746453159897, + "loss": 1.5409, + "step": 1036 + }, + { + "epoch": 0.11, + "grad_norm": 0.11005172462656163, + "learning_rate": 0.0009827293097987798, + "loss": 1.4537, + "step": 1037 + }, + { + "epoch": 0.11, + "grad_norm": 0.11247632552430167, + "learning_rate": 0.0009826839157489767, + "loss": 1.5894, + "step": 1038 + }, + { + "epoch": 0.11, + "grad_norm": 0.10756012516652874, + "learning_rate": 0.0009826384631720848, + "loss": 1.3542, + "step": 1039 + }, + { + "epoch": 0.11, + "grad_norm": 0.10824115615017481, + "learning_rate": 0.0009825929520736155, + "loss": 1.4874, + "step": 1040 + }, + { + "epoch": 0.11, + "grad_norm": 0.1300208355066187, + "learning_rate": 0.0009825473824590866, + "loss": 1.4001, + "step": 1041 + }, + { + "epoch": 0.11, + "grad_norm": 0.12730737924436886, + "learning_rate": 0.0009825017543340245, + "loss": 1.344, + "step": 1042 + }, + { + "epoch": 0.11, + "grad_norm": 0.12974938372532713, + "learning_rate": 0.000982456067703961, + "loss": 1.507, + "step": 1043 + }, + { + "epoch": 0.11, + "grad_norm": 0.1035502393897516, + "learning_rate": 0.0009824103225744359, + "loss": 1.5458, + "step": 1044 + }, + { + "epoch": 0.11, + "grad_norm": 0.1141337981032992, + "learning_rate": 0.000982364518950996, + "loss": 1.5196, + "step": 1045 + }, + { + "epoch": 0.11, + "grad_norm": 0.12199297553231467, + "learning_rate": 0.0009823186568391955, + "loss": 1.5455, + "step": 1046 + }, + { + "epoch": 0.11, + "grad_norm": 0.11478738447697275, + "learning_rate": 0.000982272736244595, + "loss": 1.4725, + "step": 1047 + }, + { + "epoch": 0.11, + "grad_norm": 0.1262147429809612, + "learning_rate": 0.0009822267571727623, + "loss": 1.3743, + "step": 1048 + }, + { + "epoch": 0.11, + "grad_norm": 0.10300865953073311, + "learning_rate": 0.000982180719629273, + "loss": 1.3518, + "step": 1049 + }, + { + "epoch": 0.11, + "grad_norm": 0.1337500675482, + "learning_rate": 0.0009821346236197092, + "loss": 1.5529, + "step": 1050 + }, + { + "epoch": 0.11, + "grad_norm": 0.12392587654224337, + "learning_rate": 0.00098208846914966, + "loss": 1.3774, + "step": 1051 + }, + { + "epoch": 0.11, + "grad_norm": 0.15327272162495567, + "learning_rate": 0.000982042256224722, + "loss": 1.5009, + "step": 1052 + }, + { + "epoch": 0.11, + "grad_norm": 0.17463158449167768, + "learning_rate": 0.0009819959848504985, + "loss": 1.3791, + "step": 1053 + }, + { + "epoch": 0.11, + "grad_norm": 0.10993680189395197, + "learning_rate": 0.0009819496550326002, + "loss": 1.3499, + "step": 1054 + }, + { + "epoch": 0.11, + "grad_norm": 0.09452638170222592, + "learning_rate": 0.0009819032667766445, + "loss": 1.3927, + "step": 1055 + }, + { + "epoch": 0.11, + "grad_norm": 0.11780194487896538, + "learning_rate": 0.0009818568200882566, + "loss": 1.5436, + "step": 1056 + }, + { + "epoch": 0.11, + "grad_norm": 0.12564058560924768, + "learning_rate": 0.0009818103149730679, + "loss": 1.4736, + "step": 1057 + }, + { + "epoch": 0.11, + "grad_norm": 0.12738094503349265, + "learning_rate": 0.0009817637514367174, + "loss": 1.5559, + "step": 1058 + }, + { + "epoch": 0.11, + "grad_norm": 0.11203298042668797, + "learning_rate": 0.0009817171294848514, + "loss": 1.5329, + "step": 1059 + }, + { + "epoch": 0.11, + "grad_norm": 0.09762718006430858, + "learning_rate": 0.0009816704491231226, + "loss": 1.5596, + "step": 1060 + }, + { + "epoch": 0.11, + "grad_norm": 0.09700332763521427, + "learning_rate": 0.0009816237103571913, + "loss": 1.4408, + "step": 1061 + }, + { + "epoch": 0.11, + "grad_norm": 0.1141120906538291, + "learning_rate": 0.0009815769131927246, + "loss": 1.5046, + "step": 1062 + }, + { + "epoch": 0.11, + "grad_norm": 0.09951655888040634, + "learning_rate": 0.0009815300576353969, + "loss": 1.3907, + "step": 1063 + }, + { + "epoch": 0.11, + "grad_norm": 0.11335129257836248, + "learning_rate": 0.0009814831436908897, + "loss": 1.46, + "step": 1064 + }, + { + "epoch": 0.11, + "grad_norm": 0.12462823502348326, + "learning_rate": 0.0009814361713648915, + "loss": 1.5528, + "step": 1065 + }, + { + "epoch": 0.11, + "grad_norm": 0.10207058518187534, + "learning_rate": 0.0009813891406630975, + "loss": 1.5344, + "step": 1066 + }, + { + "epoch": 0.11, + "grad_norm": 0.11331609691483102, + "learning_rate": 0.0009813420515912108, + "loss": 1.3399, + "step": 1067 + }, + { + "epoch": 0.11, + "grad_norm": 0.11008646442048181, + "learning_rate": 0.0009812949041549408, + "loss": 1.4344, + "step": 1068 + }, + { + "epoch": 0.11, + "grad_norm": 0.10103273692277252, + "learning_rate": 0.0009812476983600046, + "loss": 1.4093, + "step": 1069 + }, + { + "epoch": 0.12, + "grad_norm": 0.10079447646547779, + "learning_rate": 0.0009812004342121257, + "loss": 1.4924, + "step": 1070 + }, + { + "epoch": 0.12, + "grad_norm": 0.10309180507533766, + "learning_rate": 0.0009811531117170352, + "loss": 1.4854, + "step": 1071 + }, + { + "epoch": 0.12, + "grad_norm": 0.10701374638267942, + "learning_rate": 0.000981105730880471, + "loss": 1.5011, + "step": 1072 + }, + { + "epoch": 0.12, + "grad_norm": 0.10438559121872394, + "learning_rate": 0.0009810582917081786, + "loss": 1.5557, + "step": 1073 + }, + { + "epoch": 0.12, + "grad_norm": 0.08928725239161178, + "learning_rate": 0.0009810107942059096, + "loss": 1.5582, + "step": 1074 + }, + { + "epoch": 0.12, + "grad_norm": 0.10607406200382605, + "learning_rate": 0.0009809632383794237, + "loss": 1.4577, + "step": 1075 + }, + { + "epoch": 0.12, + "grad_norm": 0.10140866770455789, + "learning_rate": 0.0009809156242344868, + "loss": 1.587, + "step": 1076 + }, + { + "epoch": 0.12, + "grad_norm": 0.11324183419584681, + "learning_rate": 0.0009808679517768727, + "loss": 1.3907, + "step": 1077 + }, + { + "epoch": 0.12, + "grad_norm": 0.09985830953044832, + "learning_rate": 0.0009808202210123615, + "loss": 1.5259, + "step": 1078 + }, + { + "epoch": 0.12, + "grad_norm": 0.10726443153559254, + "learning_rate": 0.000980772431946741, + "loss": 1.5433, + "step": 1079 + }, + { + "epoch": 0.12, + "grad_norm": 0.09371546019627704, + "learning_rate": 0.0009807245845858054, + "loss": 1.4472, + "step": 1080 + }, + { + "epoch": 0.12, + "grad_norm": 0.09099857207405426, + "learning_rate": 0.000980676678935357, + "loss": 1.5692, + "step": 1081 + }, + { + "epoch": 0.12, + "grad_norm": 0.1026646049901164, + "learning_rate": 0.000980628715001204, + "loss": 1.4125, + "step": 1082 + }, + { + "epoch": 0.12, + "grad_norm": 0.10668880279133305, + "learning_rate": 0.000980580692789162, + "loss": 1.3309, + "step": 1083 + }, + { + "epoch": 0.12, + "grad_norm": 0.1119693933886867, + "learning_rate": 0.0009805326123050544, + "loss": 1.445, + "step": 1084 + }, + { + "epoch": 0.12, + "grad_norm": 0.11640640857224897, + "learning_rate": 0.000980484473554711, + "loss": 1.4927, + "step": 1085 + }, + { + "epoch": 0.12, + "grad_norm": 0.10229689293466915, + "learning_rate": 0.0009804362765439688, + "loss": 1.4752, + "step": 1086 + }, + { + "epoch": 0.12, + "grad_norm": 0.10514504374029744, + "learning_rate": 0.0009803880212786715, + "loss": 1.6104, + "step": 1087 + }, + { + "epoch": 0.12, + "grad_norm": 0.11079636321731676, + "learning_rate": 0.0009803397077646704, + "loss": 1.4712, + "step": 1088 + }, + { + "epoch": 0.12, + "grad_norm": 0.10082080188781722, + "learning_rate": 0.000980291336007824, + "loss": 1.4789, + "step": 1089 + }, + { + "epoch": 0.12, + "grad_norm": 0.10675297685692006, + "learning_rate": 0.0009802429060139974, + "loss": 1.5433, + "step": 1090 + }, + { + "epoch": 0.12, + "grad_norm": 0.126227264307551, + "learning_rate": 0.0009801944177890624, + "loss": 1.4064, + "step": 1091 + }, + { + "epoch": 0.12, + "grad_norm": 0.09845560036111164, + "learning_rate": 0.000980145871338899, + "loss": 1.4893, + "step": 1092 + }, + { + "epoch": 0.12, + "grad_norm": 0.1273710597482172, + "learning_rate": 0.0009800972666693935, + "loss": 1.5066, + "step": 1093 + }, + { + "epoch": 0.12, + "grad_norm": 0.10197482636295606, + "learning_rate": 0.000980048603786439, + "loss": 1.4119, + "step": 1094 + }, + { + "epoch": 0.12, + "grad_norm": 0.09923180377874398, + "learning_rate": 0.0009799998826959366, + "loss": 1.4974, + "step": 1095 + }, + { + "epoch": 0.12, + "grad_norm": 0.10551711580597985, + "learning_rate": 0.0009799511034037933, + "loss": 1.5412, + "step": 1096 + }, + { + "epoch": 0.12, + "grad_norm": 0.10527622567768133, + "learning_rate": 0.0009799022659159242, + "loss": 1.5318, + "step": 1097 + }, + { + "epoch": 0.12, + "grad_norm": 0.09689921924266803, + "learning_rate": 0.000979853370238251, + "loss": 1.5553, + "step": 1098 + }, + { + "epoch": 0.12, + "grad_norm": 0.09900786363899416, + "learning_rate": 0.0009798044163767023, + "loss": 1.3221, + "step": 1099 + }, + { + "epoch": 0.12, + "grad_norm": 0.11176675418735046, + "learning_rate": 0.0009797554043372138, + "loss": 1.7212, + "step": 1100 + }, + { + "epoch": 0.12, + "grad_norm": 0.10447474360809943, + "learning_rate": 0.000979706334125729, + "loss": 1.4666, + "step": 1101 + }, + { + "epoch": 0.12, + "grad_norm": 0.0931764807312025, + "learning_rate": 0.0009796572057481968, + "loss": 1.5391, + "step": 1102 + }, + { + "epoch": 0.12, + "grad_norm": 0.09105412831210441, + "learning_rate": 0.000979608019210575, + "loss": 1.4852, + "step": 1103 + }, + { + "epoch": 0.12, + "grad_norm": 0.11365343481767007, + "learning_rate": 0.0009795587745188275, + "loss": 1.5142, + "step": 1104 + }, + { + "epoch": 0.12, + "grad_norm": 0.09720993573412151, + "learning_rate": 0.0009795094716789252, + "loss": 1.4296, + "step": 1105 + }, + { + "epoch": 0.12, + "grad_norm": 0.10445259805972958, + "learning_rate": 0.0009794601106968466, + "loss": 1.4978, + "step": 1106 + }, + { + "epoch": 0.12, + "grad_norm": 0.08713210273408448, + "learning_rate": 0.0009794106915785763, + "loss": 1.3949, + "step": 1107 + }, + { + "epoch": 0.12, + "grad_norm": 0.09792767712858018, + "learning_rate": 0.000979361214330107, + "loss": 1.3676, + "step": 1108 + }, + { + "epoch": 0.12, + "grad_norm": 0.10885611133876523, + "learning_rate": 0.0009793116789574379, + "loss": 1.372, + "step": 1109 + }, + { + "epoch": 0.12, + "grad_norm": 0.0952070184084006, + "learning_rate": 0.0009792620854665753, + "loss": 1.475, + "step": 1110 + }, + { + "epoch": 0.12, + "grad_norm": 0.09550091244632411, + "learning_rate": 0.0009792124338635325, + "loss": 1.5107, + "step": 1111 + }, + { + "epoch": 0.12, + "grad_norm": 0.09616490303940983, + "learning_rate": 0.00097916272415433, + "loss": 1.5391, + "step": 1112 + }, + { + "epoch": 0.12, + "grad_norm": 0.1050502168365546, + "learning_rate": 0.0009791129563449952, + "loss": 1.445, + "step": 1113 + }, + { + "epoch": 0.12, + "grad_norm": 0.09273724100814855, + "learning_rate": 0.0009790631304415628, + "loss": 1.4769, + "step": 1114 + }, + { + "epoch": 0.12, + "grad_norm": 0.09836197010370934, + "learning_rate": 0.000979013246450074, + "loss": 1.3792, + "step": 1115 + }, + { + "epoch": 0.12, + "grad_norm": 0.10609141191664713, + "learning_rate": 0.000978963304376578, + "loss": 1.4917, + "step": 1116 + }, + { + "epoch": 0.12, + "grad_norm": 0.08593793619652067, + "learning_rate": 0.00097891330422713, + "loss": 1.4464, + "step": 1117 + }, + { + "epoch": 0.12, + "grad_norm": 0.09539697052300453, + "learning_rate": 0.0009788632460077927, + "loss": 1.399, + "step": 1118 + }, + { + "epoch": 0.12, + "grad_norm": 0.10822664643859031, + "learning_rate": 0.000978813129724636, + "loss": 1.4785, + "step": 1119 + }, + { + "epoch": 0.12, + "grad_norm": 0.10154289276419569, + "learning_rate": 0.0009787629553837367, + "loss": 1.4826, + "step": 1120 + }, + { + "epoch": 0.12, + "grad_norm": 0.10862578787342332, + "learning_rate": 0.0009787127229911783, + "loss": 1.4277, + "step": 1121 + }, + { + "epoch": 0.12, + "grad_norm": 0.10981816955085401, + "learning_rate": 0.000978662432553052, + "loss": 1.3717, + "step": 1122 + }, + { + "epoch": 0.12, + "grad_norm": 0.1023940491468338, + "learning_rate": 0.0009786120840754556, + "loss": 1.6124, + "step": 1123 + }, + { + "epoch": 0.12, + "grad_norm": 0.11702668759170876, + "learning_rate": 0.0009785616775644938, + "loss": 1.4212, + "step": 1124 + }, + { + "epoch": 0.12, + "grad_norm": 0.12158573099045404, + "learning_rate": 0.000978511213026279, + "loss": 1.3692, + "step": 1125 + }, + { + "epoch": 0.12, + "grad_norm": 0.10410236569465127, + "learning_rate": 0.0009784606904669297, + "loss": 1.4622, + "step": 1126 + }, + { + "epoch": 0.12, + "grad_norm": 0.11484842201707575, + "learning_rate": 0.0009784101098925723, + "loss": 1.5727, + "step": 1127 + }, + { + "epoch": 0.12, + "grad_norm": 0.10040496777521027, + "learning_rate": 0.0009783594713093397, + "loss": 1.4933, + "step": 1128 + }, + { + "epoch": 0.12, + "grad_norm": 0.11033351716891086, + "learning_rate": 0.000978308774723372, + "loss": 1.5888, + "step": 1129 + }, + { + "epoch": 0.12, + "grad_norm": 0.12154411199523346, + "learning_rate": 0.0009782580201408164, + "loss": 1.4452, + "step": 1130 + }, + { + "epoch": 0.12, + "grad_norm": 0.11269562719555688, + "learning_rate": 0.0009782072075678271, + "loss": 1.4995, + "step": 1131 + }, + { + "epoch": 0.12, + "grad_norm": 0.101394103359456, + "learning_rate": 0.0009781563370105653, + "loss": 1.5348, + "step": 1132 + }, + { + "epoch": 0.12, + "grad_norm": 0.12337547846107601, + "learning_rate": 0.0009781054084751991, + "loss": 1.5912, + "step": 1133 + }, + { + "epoch": 0.12, + "grad_norm": 0.10895999658008346, + "learning_rate": 0.0009780544219679039, + "loss": 1.5698, + "step": 1134 + }, + { + "epoch": 0.12, + "grad_norm": 0.10875009724116637, + "learning_rate": 0.000978003377494862, + "loss": 1.418, + "step": 1135 + }, + { + "epoch": 0.12, + "grad_norm": 0.08717663220170035, + "learning_rate": 0.0009779522750622625, + "loss": 1.5179, + "step": 1136 + }, + { + "epoch": 0.12, + "grad_norm": 0.0901001609158058, + "learning_rate": 0.0009779011146763019, + "loss": 1.5814, + "step": 1137 + }, + { + "epoch": 0.12, + "grad_norm": 0.10275116736092334, + "learning_rate": 0.0009778498963431837, + "loss": 1.4128, + "step": 1138 + }, + { + "epoch": 0.12, + "grad_norm": 0.09762112462628392, + "learning_rate": 0.000977798620069118, + "loss": 1.4922, + "step": 1139 + }, + { + "epoch": 0.12, + "grad_norm": 0.10060723320049322, + "learning_rate": 0.0009777472858603226, + "loss": 1.5383, + "step": 1140 + }, + { + "epoch": 0.12, + "grad_norm": 0.1164092627557891, + "learning_rate": 0.0009776958937230216, + "loss": 1.5062, + "step": 1141 + }, + { + "epoch": 0.12, + "grad_norm": 0.10534018168294584, + "learning_rate": 0.0009776444436634466, + "loss": 1.4397, + "step": 1142 + }, + { + "epoch": 0.12, + "grad_norm": 0.10459737964269737, + "learning_rate": 0.0009775929356878362, + "loss": 1.4836, + "step": 1143 + }, + { + "epoch": 0.12, + "grad_norm": 0.1129596993044548, + "learning_rate": 0.0009775413698024358, + "loss": 1.5252, + "step": 1144 + }, + { + "epoch": 0.12, + "grad_norm": 0.0977021657766086, + "learning_rate": 0.000977489746013498, + "loss": 1.3986, + "step": 1145 + }, + { + "epoch": 0.12, + "grad_norm": 0.10225167190944157, + "learning_rate": 0.0009774380643272822, + "loss": 1.6317, + "step": 1146 + }, + { + "epoch": 0.12, + "grad_norm": 0.08850304042119721, + "learning_rate": 0.0009773863247500554, + "loss": 1.4967, + "step": 1147 + }, + { + "epoch": 0.12, + "grad_norm": 0.11049263300762383, + "learning_rate": 0.0009773345272880906, + "loss": 1.5815, + "step": 1148 + }, + { + "epoch": 0.12, + "grad_norm": 0.09691390291798195, + "learning_rate": 0.000977282671947669, + "loss": 1.4033, + "step": 1149 + }, + { + "epoch": 0.12, + "grad_norm": 0.10663292694602702, + "learning_rate": 0.000977230758735078, + "loss": 1.5099, + "step": 1150 + }, + { + "epoch": 0.12, + "grad_norm": 0.08971210350781093, + "learning_rate": 0.000977178787656612, + "loss": 1.4351, + "step": 1151 + }, + { + "epoch": 0.12, + "grad_norm": 0.12450168045568234, + "learning_rate": 0.000977126758718573, + "loss": 1.5371, + "step": 1152 + }, + { + "epoch": 0.12, + "grad_norm": 0.0948433794803075, + "learning_rate": 0.0009770746719272696, + "loss": 1.4678, + "step": 1153 + }, + { + "epoch": 0.12, + "grad_norm": 0.10085071740419524, + "learning_rate": 0.0009770225272890177, + "loss": 1.4806, + "step": 1154 + }, + { + "epoch": 0.12, + "grad_norm": 0.4484143809430015, + "learning_rate": 0.0009769703248101397, + "loss": 1.607, + "step": 1155 + }, + { + "epoch": 0.12, + "grad_norm": 0.15922980601920514, + "learning_rate": 0.0009769180644969653, + "loss": 1.5416, + "step": 1156 + }, + { + "epoch": 0.12, + "grad_norm": 0.11107798130456996, + "learning_rate": 0.0009768657463558315, + "loss": 1.6267, + "step": 1157 + }, + { + "epoch": 0.12, + "grad_norm": 0.2946033526128502, + "learning_rate": 0.0009768133703930819, + "loss": 1.4602, + "step": 1158 + }, + { + "epoch": 0.12, + "grad_norm": 0.15370026664096484, + "learning_rate": 0.0009767609366150673, + "loss": 1.5199, + "step": 1159 + }, + { + "epoch": 0.12, + "grad_norm": 0.10375667791821207, + "learning_rate": 0.0009767084450281456, + "loss": 1.4155, + "step": 1160 + }, + { + "epoch": 0.12, + "grad_norm": 0.12207675726090313, + "learning_rate": 0.0009766558956386814, + "loss": 1.3242, + "step": 1161 + }, + { + "epoch": 0.12, + "grad_norm": 0.22605444819633985, + "learning_rate": 0.0009766032884530465, + "loss": 1.3588, + "step": 1162 + }, + { + "epoch": 0.13, + "grad_norm": 0.12489025319914, + "learning_rate": 0.00097655062347762, + "loss": 1.5016, + "step": 1163 + }, + { + "epoch": 0.13, + "grad_norm": 0.11854980300003969, + "learning_rate": 0.0009764979007187873, + "loss": 1.4542, + "step": 1164 + }, + { + "epoch": 0.13, + "grad_norm": 0.26535399019300804, + "learning_rate": 0.0009764451201829414, + "loss": 1.5479, + "step": 1165 + }, + { + "epoch": 0.13, + "grad_norm": 0.4479661893821317, + "learning_rate": 0.0009763922818764822, + "loss": 1.5013, + "step": 1166 + }, + { + "epoch": 0.13, + "grad_norm": 0.17293052163018774, + "learning_rate": 0.0009763393858058164, + "loss": 1.4978, + "step": 1167 + }, + { + "epoch": 0.13, + "grad_norm": 0.12328773343089187, + "learning_rate": 0.0009762864319773579, + "loss": 1.4241, + "step": 1168 + }, + { + "epoch": 0.13, + "grad_norm": 0.10961795118064666, + "learning_rate": 0.0009762334203975276, + "loss": 1.5303, + "step": 1169 + }, + { + "epoch": 0.13, + "grad_norm": 0.1175993556438544, + "learning_rate": 0.0009761803510727531, + "loss": 1.4783, + "step": 1170 + }, + { + "epoch": 0.13, + "grad_norm": 0.11106899286073688, + "learning_rate": 0.0009761272240094695, + "loss": 1.458, + "step": 1171 + }, + { + "epoch": 0.13, + "grad_norm": 0.13369412952321702, + "learning_rate": 0.0009760740392141186, + "loss": 1.4819, + "step": 1172 + }, + { + "epoch": 0.13, + "grad_norm": 0.09580765095849292, + "learning_rate": 0.0009760207966931489, + "loss": 1.4705, + "step": 1173 + }, + { + "epoch": 0.13, + "grad_norm": 0.11617828359160304, + "learning_rate": 0.0009759674964530167, + "loss": 1.4023, + "step": 1174 + }, + { + "epoch": 0.13, + "grad_norm": 0.10434869245175268, + "learning_rate": 0.0009759141385001847, + "loss": 1.5545, + "step": 1175 + }, + { + "epoch": 0.13, + "grad_norm": 0.1199786542618325, + "learning_rate": 0.0009758607228411225, + "loss": 1.5722, + "step": 1176 + }, + { + "epoch": 0.13, + "grad_norm": 0.0964001059293876, + "learning_rate": 0.0009758072494823072, + "loss": 1.5165, + "step": 1177 + }, + { + "epoch": 0.13, + "grad_norm": 0.11184263546413122, + "learning_rate": 0.0009757537184302225, + "loss": 1.4181, + "step": 1178 + }, + { + "epoch": 0.13, + "grad_norm": 0.11404339323309447, + "learning_rate": 0.0009757001296913593, + "loss": 1.5288, + "step": 1179 + }, + { + "epoch": 0.13, + "grad_norm": 0.11956579528844924, + "learning_rate": 0.0009756464832722154, + "loss": 1.4549, + "step": 1180 + }, + { + "epoch": 0.13, + "grad_norm": 0.0978519457773404, + "learning_rate": 0.0009755927791792956, + "loss": 1.3303, + "step": 1181 + }, + { + "epoch": 0.13, + "grad_norm": 0.17000533389215636, + "learning_rate": 0.0009755390174191117, + "loss": 1.4161, + "step": 1182 + }, + { + "epoch": 0.13, + "grad_norm": 0.07996186824950127, + "learning_rate": 0.0009754851979981826, + "loss": 1.3548, + "step": 1183 + }, + { + "epoch": 0.13, + "grad_norm": 0.09044166304385826, + "learning_rate": 0.0009754313209230339, + "loss": 1.5155, + "step": 1184 + }, + { + "epoch": 0.13, + "grad_norm": 0.10621964304981515, + "learning_rate": 0.0009753773862001985, + "loss": 1.6404, + "step": 1185 + }, + { + "epoch": 0.13, + "grad_norm": 0.08254025272480696, + "learning_rate": 0.0009753233938362161, + "loss": 1.4376, + "step": 1186 + }, + { + "epoch": 0.13, + "grad_norm": 0.1106607801453522, + "learning_rate": 0.0009752693438376336, + "loss": 1.498, + "step": 1187 + }, + { + "epoch": 0.13, + "grad_norm": 0.09876265807599625, + "learning_rate": 0.0009752152362110045, + "loss": 1.5122, + "step": 1188 + }, + { + "epoch": 0.13, + "grad_norm": 0.10039646627078934, + "learning_rate": 0.0009751610709628897, + "loss": 1.3371, + "step": 1189 + }, + { + "epoch": 0.13, + "grad_norm": 0.1005714762578938, + "learning_rate": 0.000975106848099857, + "loss": 1.4913, + "step": 1190 + }, + { + "epoch": 0.13, + "grad_norm": 0.09409073411849112, + "learning_rate": 0.0009750525676284811, + "loss": 1.4359, + "step": 1191 + }, + { + "epoch": 0.13, + "grad_norm": 0.1114939909639379, + "learning_rate": 0.0009749982295553436, + "loss": 1.4572, + "step": 1192 + }, + { + "epoch": 0.13, + "grad_norm": 0.11159452971642979, + "learning_rate": 0.0009749438338870331, + "loss": 1.4944, + "step": 1193 + }, + { + "epoch": 0.13, + "grad_norm": 0.10054333256733074, + "learning_rate": 0.0009748893806301455, + "loss": 1.4464, + "step": 1194 + }, + { + "epoch": 0.13, + "grad_norm": 0.09895279648206912, + "learning_rate": 0.0009748348697912832, + "loss": 1.509, + "step": 1195 + }, + { + "epoch": 0.13, + "grad_norm": 0.10061217401043886, + "learning_rate": 0.000974780301377056, + "loss": 1.4162, + "step": 1196 + }, + { + "epoch": 0.13, + "grad_norm": 0.10417574356188845, + "learning_rate": 0.0009747256753940803, + "loss": 1.3959, + "step": 1197 + }, + { + "epoch": 0.13, + "grad_norm": 0.10853524853515546, + "learning_rate": 0.00097467099184898, + "loss": 1.3809, + "step": 1198 + }, + { + "epoch": 0.13, + "grad_norm": 0.10205080178365547, + "learning_rate": 0.0009746162507483854, + "loss": 1.5636, + "step": 1199 + }, + { + "epoch": 0.13, + "grad_norm": 0.11428455096975486, + "learning_rate": 0.0009745614520989341, + "loss": 1.3596, + "step": 1200 + }, + { + "epoch": 0.13, + "grad_norm": 0.10288778226541649, + "learning_rate": 0.0009745065959072708, + "loss": 1.5153, + "step": 1201 + }, + { + "epoch": 0.13, + "grad_norm": 0.13135365823412376, + "learning_rate": 0.0009744516821800469, + "loss": 1.5387, + "step": 1202 + }, + { + "epoch": 0.13, + "grad_norm": 0.10102362449138394, + "learning_rate": 0.0009743967109239206, + "loss": 1.4004, + "step": 1203 + }, + { + "epoch": 0.13, + "grad_norm": 0.13512092208553644, + "learning_rate": 0.0009743416821455577, + "loss": 1.5138, + "step": 1204 + }, + { + "epoch": 0.13, + "grad_norm": 0.1087914935718707, + "learning_rate": 0.0009742865958516307, + "loss": 1.5475, + "step": 1205 + }, + { + "epoch": 0.13, + "grad_norm": 0.11166016102754622, + "learning_rate": 0.0009742314520488187, + "loss": 1.4857, + "step": 1206 + }, + { + "epoch": 0.13, + "grad_norm": 0.10473240664950298, + "learning_rate": 0.0009741762507438083, + "loss": 1.4585, + "step": 1207 + }, + { + "epoch": 0.13, + "grad_norm": 0.09850077005465968, + "learning_rate": 0.0009741209919432928, + "loss": 1.475, + "step": 1208 + }, + { + "epoch": 0.13, + "grad_norm": 0.09199217175728568, + "learning_rate": 0.0009740656756539723, + "loss": 1.5659, + "step": 1209 + }, + { + "epoch": 0.13, + "grad_norm": 0.09683493951065482, + "learning_rate": 0.0009740103018825543, + "loss": 1.4967, + "step": 1210 + }, + { + "epoch": 0.13, + "grad_norm": 0.09121350319498756, + "learning_rate": 0.0009739548706357532, + "loss": 1.3673, + "step": 1211 + }, + { + "epoch": 0.13, + "grad_norm": 0.10048743165141734, + "learning_rate": 0.0009738993819202901, + "loss": 1.5065, + "step": 1212 + }, + { + "epoch": 0.13, + "grad_norm": 0.09127861055952981, + "learning_rate": 0.0009738438357428929, + "loss": 1.2944, + "step": 1213 + }, + { + "epoch": 0.13, + "grad_norm": 0.09363988777597988, + "learning_rate": 0.0009737882321102972, + "loss": 1.4625, + "step": 1214 + }, + { + "epoch": 0.13, + "grad_norm": 0.10479854745256931, + "learning_rate": 0.000973732571029245, + "loss": 1.5235, + "step": 1215 + }, + { + "epoch": 0.13, + "grad_norm": 0.09677667521333407, + "learning_rate": 0.0009736768525064851, + "loss": 1.4624, + "step": 1216 + }, + { + "epoch": 0.13, + "grad_norm": 0.09244710850250462, + "learning_rate": 0.0009736210765487741, + "loss": 1.5458, + "step": 1217 + }, + { + "epoch": 0.13, + "grad_norm": 0.10714498510511104, + "learning_rate": 0.0009735652431628747, + "loss": 1.5138, + "step": 1218 + }, + { + "epoch": 0.13, + "grad_norm": 0.10730277307980761, + "learning_rate": 0.000973509352355557, + "loss": 1.3659, + "step": 1219 + }, + { + "epoch": 0.13, + "grad_norm": 0.11548224356442682, + "learning_rate": 0.0009734534041335977, + "loss": 1.4155, + "step": 1220 + }, + { + "epoch": 0.13, + "grad_norm": 0.10400406071826049, + "learning_rate": 0.000973397398503781, + "loss": 1.4974, + "step": 1221 + }, + { + "epoch": 0.13, + "grad_norm": 0.10990936584302333, + "learning_rate": 0.0009733413354728977, + "loss": 1.455, + "step": 1222 + }, + { + "epoch": 0.13, + "grad_norm": 0.09334736281891305, + "learning_rate": 0.0009732852150477456, + "loss": 1.4218, + "step": 1223 + }, + { + "epoch": 0.13, + "grad_norm": 0.10295858393774611, + "learning_rate": 0.0009732290372351294, + "loss": 1.5732, + "step": 1224 + }, + { + "epoch": 0.13, + "grad_norm": 0.09349978355010372, + "learning_rate": 0.0009731728020418611, + "loss": 1.5234, + "step": 1225 + }, + { + "epoch": 0.13, + "grad_norm": 0.10091993747486702, + "learning_rate": 0.0009731165094747593, + "loss": 1.4055, + "step": 1226 + }, + { + "epoch": 0.13, + "grad_norm": 0.1048721686348737, + "learning_rate": 0.0009730601595406496, + "loss": 1.4747, + "step": 1227 + }, + { + "epoch": 0.13, + "grad_norm": 0.09189084245289222, + "learning_rate": 0.0009730037522463647, + "loss": 1.4501, + "step": 1228 + }, + { + "epoch": 0.13, + "grad_norm": 0.09610235865729654, + "learning_rate": 0.0009729472875987439, + "loss": 1.4709, + "step": 1229 + }, + { + "epoch": 0.13, + "grad_norm": 0.09998948369758241, + "learning_rate": 0.0009728907656046343, + "loss": 1.4859, + "step": 1230 + }, + { + "epoch": 0.13, + "grad_norm": 0.09704555535405772, + "learning_rate": 0.0009728341862708889, + "loss": 1.5897, + "step": 1231 + }, + { + "epoch": 0.13, + "grad_norm": 0.11181733909784186, + "learning_rate": 0.0009727775496043685, + "loss": 1.4797, + "step": 1232 + }, + { + "epoch": 0.13, + "grad_norm": 0.1078281133859294, + "learning_rate": 0.0009727208556119401, + "loss": 1.5068, + "step": 1233 + }, + { + "epoch": 0.13, + "grad_norm": 0.12417520601082405, + "learning_rate": 0.0009726641043004782, + "loss": 1.4862, + "step": 1234 + }, + { + "epoch": 0.13, + "grad_norm": 0.09808494647507954, + "learning_rate": 0.0009726072956768643, + "loss": 1.4989, + "step": 1235 + }, + { + "epoch": 0.13, + "grad_norm": 0.11618660624020795, + "learning_rate": 0.0009725504297479864, + "loss": 1.3851, + "step": 1236 + }, + { + "epoch": 0.13, + "grad_norm": 0.10189474305066351, + "learning_rate": 0.0009724935065207399, + "loss": 1.4341, + "step": 1237 + }, + { + "epoch": 0.13, + "grad_norm": 0.099810201488847, + "learning_rate": 0.0009724365260020267, + "loss": 1.3454, + "step": 1238 + }, + { + "epoch": 0.13, + "grad_norm": 0.10607882500756213, + "learning_rate": 0.0009723794881987559, + "loss": 1.4519, + "step": 1239 + }, + { + "epoch": 0.13, + "grad_norm": 0.10124419381866068, + "learning_rate": 0.0009723223931178438, + "loss": 1.6662, + "step": 1240 + }, + { + "epoch": 0.13, + "grad_norm": 0.08695360420074442, + "learning_rate": 0.0009722652407662129, + "loss": 1.4175, + "step": 1241 + }, + { + "epoch": 0.13, + "grad_norm": 0.09549842078403618, + "learning_rate": 0.0009722080311507937, + "loss": 1.5201, + "step": 1242 + }, + { + "epoch": 0.13, + "grad_norm": 0.08806136818436658, + "learning_rate": 0.0009721507642785226, + "loss": 1.4395, + "step": 1243 + }, + { + "epoch": 0.13, + "grad_norm": 0.09556559977297367, + "learning_rate": 0.0009720934401563437, + "loss": 1.5403, + "step": 1244 + }, + { + "epoch": 0.13, + "grad_norm": 0.08343873926288949, + "learning_rate": 0.0009720360587912075, + "loss": 1.4373, + "step": 1245 + }, + { + "epoch": 0.13, + "grad_norm": 0.09441068917886046, + "learning_rate": 0.0009719786201900719, + "loss": 1.5036, + "step": 1246 + }, + { + "epoch": 0.13, + "grad_norm": 0.09864797097988, + "learning_rate": 0.0009719211243599014, + "loss": 1.3718, + "step": 1247 + }, + { + "epoch": 0.13, + "grad_norm": 0.08882461434600883, + "learning_rate": 0.0009718635713076676, + "loss": 1.473, + "step": 1248 + }, + { + "epoch": 0.13, + "grad_norm": 0.08388976795441185, + "learning_rate": 0.0009718059610403491, + "loss": 1.4691, + "step": 1249 + }, + { + "epoch": 0.13, + "grad_norm": 0.10270330807860907, + "learning_rate": 0.0009717482935649312, + "loss": 1.4669, + "step": 1250 + }, + { + "epoch": 0.13, + "grad_norm": 0.096526175780071, + "learning_rate": 0.0009716905688884063, + "loss": 1.4292, + "step": 1251 + }, + { + "epoch": 0.13, + "grad_norm": 0.07826161807267731, + "learning_rate": 0.0009716327870177739, + "loss": 1.4802, + "step": 1252 + }, + { + "epoch": 0.13, + "grad_norm": 0.07991878427102167, + "learning_rate": 0.00097157494796004, + "loss": 1.4538, + "step": 1253 + }, + { + "epoch": 0.13, + "grad_norm": 0.08512603268481847, + "learning_rate": 0.000971517051722218, + "loss": 1.3659, + "step": 1254 + }, + { + "epoch": 0.13, + "grad_norm": 0.08429100786699598, + "learning_rate": 0.0009714590983113279, + "loss": 1.4721, + "step": 1255 + }, + { + "epoch": 0.14, + "grad_norm": 0.08571040834333599, + "learning_rate": 0.0009714010877343966, + "loss": 1.5399, + "step": 1256 + }, + { + "epoch": 0.14, + "grad_norm": 0.08559073310258722, + "learning_rate": 0.0009713430199984583, + "loss": 1.3743, + "step": 1257 + }, + { + "epoch": 0.14, + "grad_norm": 0.09653287635713186, + "learning_rate": 0.0009712848951105539, + "loss": 1.4327, + "step": 1258 + }, + { + "epoch": 0.14, + "grad_norm": 0.09277545525213768, + "learning_rate": 0.0009712267130777312, + "loss": 1.4792, + "step": 1259 + }, + { + "epoch": 0.14, + "grad_norm": 0.08815163163035124, + "learning_rate": 0.000971168473907045, + "loss": 1.5024, + "step": 1260 + }, + { + "epoch": 0.14, + "grad_norm": 0.08767532362959778, + "learning_rate": 0.0009711101776055569, + "loss": 1.4843, + "step": 1261 + }, + { + "epoch": 0.14, + "grad_norm": 0.09195990998443754, + "learning_rate": 0.0009710518241803356, + "loss": 1.5151, + "step": 1262 + }, + { + "epoch": 0.14, + "grad_norm": 0.09294957831798757, + "learning_rate": 0.0009709934136384568, + "loss": 1.5475, + "step": 1263 + }, + { + "epoch": 0.14, + "grad_norm": 0.09036887036299439, + "learning_rate": 0.0009709349459870027, + "loss": 1.3581, + "step": 1264 + }, + { + "epoch": 0.14, + "grad_norm": 0.10196282506888943, + "learning_rate": 0.0009708764212330629, + "loss": 1.3326, + "step": 1265 + }, + { + "epoch": 0.14, + "grad_norm": 0.10355169189765306, + "learning_rate": 0.0009708178393837336, + "loss": 1.5029, + "step": 1266 + }, + { + "epoch": 0.14, + "grad_norm": 0.09318762477278592, + "learning_rate": 0.0009707592004461182, + "loss": 1.5204, + "step": 1267 + }, + { + "epoch": 0.14, + "grad_norm": 0.08563077439899368, + "learning_rate": 0.0009707005044273267, + "loss": 1.3752, + "step": 1268 + }, + { + "epoch": 0.14, + "grad_norm": 0.08600170949975434, + "learning_rate": 0.0009706417513344764, + "loss": 1.5756, + "step": 1269 + }, + { + "epoch": 0.14, + "grad_norm": 0.08164799944411805, + "learning_rate": 0.0009705829411746911, + "loss": 1.3627, + "step": 1270 + }, + { + "epoch": 0.14, + "grad_norm": 0.07849971938700012, + "learning_rate": 0.000970524073955102, + "loss": 1.3703, + "step": 1271 + }, + { + "epoch": 0.14, + "grad_norm": 0.07646692578878106, + "learning_rate": 0.0009704651496828466, + "loss": 1.4787, + "step": 1272 + }, + { + "epoch": 0.14, + "grad_norm": 0.09464792546642692, + "learning_rate": 0.00097040616836507, + "loss": 1.3937, + "step": 1273 + }, + { + "epoch": 0.14, + "grad_norm": 0.09949300784748961, + "learning_rate": 0.0009703471300089236, + "loss": 1.5593, + "step": 1274 + }, + { + "epoch": 0.14, + "grad_norm": 0.08122148070582941, + "learning_rate": 0.0009702880346215664, + "loss": 1.3546, + "step": 1275 + }, + { + "epoch": 0.14, + "grad_norm": 0.10318702598898441, + "learning_rate": 0.0009702288822101634, + "loss": 1.52, + "step": 1276 + }, + { + "epoch": 0.14, + "grad_norm": 0.11343488674792428, + "learning_rate": 0.0009701696727818874, + "loss": 1.3261, + "step": 1277 + }, + { + "epoch": 0.14, + "grad_norm": 0.10000234811603913, + "learning_rate": 0.0009701104063439177, + "loss": 1.4701, + "step": 1278 + }, + { + "epoch": 0.14, + "grad_norm": 0.08002664638456945, + "learning_rate": 0.0009700510829034404, + "loss": 1.4196, + "step": 1279 + }, + { + "epoch": 0.14, + "grad_norm": 0.08976865563700746, + "learning_rate": 0.0009699917024676488, + "loss": 1.4879, + "step": 1280 + }, + { + "epoch": 0.14, + "grad_norm": 0.09486370384243133, + "learning_rate": 0.0009699322650437432, + "loss": 1.6916, + "step": 1281 + }, + { + "epoch": 0.14, + "grad_norm": 0.09483802520074949, + "learning_rate": 0.00096987277063893, + "loss": 1.4527, + "step": 1282 + }, + { + "epoch": 0.14, + "grad_norm": 0.10092644093150213, + "learning_rate": 0.0009698132192604238, + "loss": 1.4879, + "step": 1283 + }, + { + "epoch": 0.14, + "grad_norm": 0.09844581657806153, + "learning_rate": 0.0009697536109154449, + "loss": 1.4846, + "step": 1284 + }, + { + "epoch": 0.14, + "grad_norm": 0.09448314390473393, + "learning_rate": 0.0009696939456112213, + "loss": 1.318, + "step": 1285 + }, + { + "epoch": 0.14, + "grad_norm": 0.11425995620933611, + "learning_rate": 0.0009696342233549873, + "loss": 1.3915, + "step": 1286 + }, + { + "epoch": 0.14, + "grad_norm": 0.10553751796369051, + "learning_rate": 0.0009695744441539849, + "loss": 1.6516, + "step": 1287 + }, + { + "epoch": 0.14, + "grad_norm": 0.09672534952850492, + "learning_rate": 0.0009695146080154621, + "loss": 1.4494, + "step": 1288 + }, + { + "epoch": 0.14, + "grad_norm": 0.10035478781628406, + "learning_rate": 0.0009694547149466745, + "loss": 1.5295, + "step": 1289 + }, + { + "epoch": 0.14, + "grad_norm": 0.08972138904342805, + "learning_rate": 0.0009693947649548842, + "loss": 1.474, + "step": 1290 + }, + { + "epoch": 0.14, + "grad_norm": 0.08679630306391951, + "learning_rate": 0.0009693347580473604, + "loss": 1.4317, + "step": 1291 + }, + { + "epoch": 0.14, + "grad_norm": 0.10871542085222537, + "learning_rate": 0.0009692746942313792, + "loss": 1.5484, + "step": 1292 + }, + { + "epoch": 0.14, + "grad_norm": 0.08603429220157566, + "learning_rate": 0.0009692145735142235, + "loss": 1.4189, + "step": 1293 + }, + { + "epoch": 0.14, + "grad_norm": 0.094369401781512, + "learning_rate": 0.000969154395903183, + "loss": 1.2444, + "step": 1294 + }, + { + "epoch": 0.14, + "grad_norm": 0.08502135101989325, + "learning_rate": 0.0009690941614055546, + "loss": 1.4072, + "step": 1295 + }, + { + "epoch": 0.14, + "grad_norm": 0.09178718049011819, + "learning_rate": 0.0009690338700286421, + "loss": 1.3757, + "step": 1296 + }, + { + "epoch": 0.14, + "grad_norm": 0.08929961053282694, + "learning_rate": 0.0009689735217797557, + "loss": 1.423, + "step": 1297 + }, + { + "epoch": 0.14, + "grad_norm": 0.0829406241969028, + "learning_rate": 0.0009689131166662131, + "loss": 1.4598, + "step": 1298 + }, + { + "epoch": 0.14, + "grad_norm": 0.09831046552987227, + "learning_rate": 0.0009688526546953384, + "loss": 1.5143, + "step": 1299 + }, + { + "epoch": 0.14, + "grad_norm": 0.10233533693634345, + "learning_rate": 0.0009687921358744629, + "loss": 1.5625, + "step": 1300 + }, + { + "epoch": 0.14, + "grad_norm": 0.12009618350664676, + "learning_rate": 0.0009687315602109248, + "loss": 1.5813, + "step": 1301 + }, + { + "epoch": 0.14, + "grad_norm": 0.09579390968775611, + "learning_rate": 0.0009686709277120691, + "loss": 1.5, + "step": 1302 + }, + { + "epoch": 0.14, + "grad_norm": 0.09347070874904188, + "learning_rate": 0.0009686102383852477, + "loss": 1.4762, + "step": 1303 + }, + { + "epoch": 0.14, + "grad_norm": 0.0974530705160833, + "learning_rate": 0.0009685494922378193, + "loss": 1.4729, + "step": 1304 + }, + { + "epoch": 0.14, + "grad_norm": 0.09780466008302127, + "learning_rate": 0.0009684886892771497, + "loss": 1.5751, + "step": 1305 + }, + { + "epoch": 0.14, + "grad_norm": 0.10446303843416181, + "learning_rate": 0.0009684278295106112, + "loss": 1.521, + "step": 1306 + }, + { + "epoch": 0.14, + "grad_norm": 0.08583740614987581, + "learning_rate": 0.0009683669129455836, + "loss": 1.4352, + "step": 1307 + }, + { + "epoch": 0.14, + "grad_norm": 0.09548449846071512, + "learning_rate": 0.0009683059395894532, + "loss": 1.6236, + "step": 1308 + }, + { + "epoch": 0.14, + "grad_norm": 0.09317317974015707, + "learning_rate": 0.0009682449094496129, + "loss": 1.4718, + "step": 1309 + }, + { + "epoch": 0.14, + "grad_norm": 0.09028344652051286, + "learning_rate": 0.0009681838225334632, + "loss": 1.5197, + "step": 1310 + }, + { + "epoch": 0.14, + "grad_norm": 0.1010454477834947, + "learning_rate": 0.0009681226788484107, + "loss": 1.4355, + "step": 1311 + }, + { + "epoch": 0.14, + "grad_norm": 0.10063424616961393, + "learning_rate": 0.0009680614784018699, + "loss": 1.5216, + "step": 1312 + }, + { + "epoch": 0.14, + "grad_norm": 0.08847368543043868, + "learning_rate": 0.000968000221201261, + "loss": 1.4489, + "step": 1313 + }, + { + "epoch": 0.14, + "grad_norm": 0.08917200687658379, + "learning_rate": 0.0009679389072540118, + "loss": 1.5095, + "step": 1314 + }, + { + "epoch": 0.14, + "grad_norm": 0.08780084899229014, + "learning_rate": 0.0009678775365675569, + "loss": 1.2328, + "step": 1315 + }, + { + "epoch": 0.14, + "grad_norm": 0.09952472380916254, + "learning_rate": 0.0009678161091493377, + "loss": 1.5445, + "step": 1316 + }, + { + "epoch": 0.14, + "grad_norm": 0.0922044875770402, + "learning_rate": 0.0009677546250068024, + "loss": 1.4997, + "step": 1317 + }, + { + "epoch": 0.14, + "grad_norm": 0.08679058782028819, + "learning_rate": 0.0009676930841474063, + "loss": 1.4818, + "step": 1318 + }, + { + "epoch": 0.14, + "grad_norm": 0.09946020728542797, + "learning_rate": 0.0009676314865786113, + "loss": 1.541, + "step": 1319 + }, + { + "epoch": 0.14, + "grad_norm": 0.09924599571927359, + "learning_rate": 0.0009675698323078864, + "loss": 1.4275, + "step": 1320 + }, + { + "epoch": 0.14, + "grad_norm": 0.09736847437990914, + "learning_rate": 0.0009675081213427075, + "loss": 1.3534, + "step": 1321 + }, + { + "epoch": 0.14, + "grad_norm": 0.08981117793178947, + "learning_rate": 0.0009674463536905569, + "loss": 1.5193, + "step": 1322 + }, + { + "epoch": 0.14, + "grad_norm": 0.0892544928899613, + "learning_rate": 0.0009673845293589245, + "loss": 1.4088, + "step": 1323 + }, + { + "epoch": 0.14, + "grad_norm": 0.09692218740693645, + "learning_rate": 0.0009673226483553067, + "loss": 1.4853, + "step": 1324 + }, + { + "epoch": 0.14, + "grad_norm": 0.090931858503463, + "learning_rate": 0.0009672607106872065, + "loss": 1.5478, + "step": 1325 + }, + { + "epoch": 0.14, + "grad_norm": 0.08677645972181737, + "learning_rate": 0.0009671987163621343, + "loss": 1.4351, + "step": 1326 + }, + { + "epoch": 0.14, + "grad_norm": 0.0965811198069088, + "learning_rate": 0.0009671366653876072, + "loss": 1.44, + "step": 1327 + }, + { + "epoch": 0.14, + "grad_norm": 0.10265572360163623, + "learning_rate": 0.0009670745577711489, + "loss": 1.4758, + "step": 1328 + }, + { + "epoch": 0.14, + "grad_norm": 0.10813126200323721, + "learning_rate": 0.0009670123935202901, + "loss": 1.5154, + "step": 1329 + }, + { + "epoch": 0.14, + "grad_norm": 0.0818442543578839, + "learning_rate": 0.0009669501726425686, + "loss": 1.5325, + "step": 1330 + }, + { + "epoch": 0.14, + "grad_norm": 0.08476502614974389, + "learning_rate": 0.0009668878951455289, + "loss": 1.4566, + "step": 1331 + }, + { + "epoch": 0.14, + "grad_norm": 0.0929172330057343, + "learning_rate": 0.0009668255610367222, + "loss": 1.4826, + "step": 1332 + }, + { + "epoch": 0.14, + "grad_norm": 0.11067141747464249, + "learning_rate": 0.0009667631703237069, + "loss": 1.4778, + "step": 1333 + }, + { + "epoch": 0.14, + "grad_norm": 0.09895618231426587, + "learning_rate": 0.000966700723014048, + "loss": 1.4972, + "step": 1334 + }, + { + "epoch": 0.14, + "grad_norm": 0.09258309463650612, + "learning_rate": 0.0009666382191153175, + "loss": 1.5373, + "step": 1335 + }, + { + "epoch": 0.14, + "grad_norm": 0.08478282889023635, + "learning_rate": 0.0009665756586350942, + "loss": 1.3624, + "step": 1336 + }, + { + "epoch": 0.14, + "grad_norm": 0.1029379375379849, + "learning_rate": 0.0009665130415809636, + "loss": 1.5687, + "step": 1337 + }, + { + "epoch": 0.14, + "grad_norm": 0.10999399674084075, + "learning_rate": 0.0009664503679605186, + "loss": 1.4364, + "step": 1338 + }, + { + "epoch": 0.14, + "grad_norm": 0.1002629659584574, + "learning_rate": 0.0009663876377813583, + "loss": 1.4827, + "step": 1339 + }, + { + "epoch": 0.14, + "grad_norm": 0.09189118197800461, + "learning_rate": 0.000966324851051089, + "loss": 1.5273, + "step": 1340 + }, + { + "epoch": 0.14, + "grad_norm": 0.08846946549951583, + "learning_rate": 0.0009662620077773239, + "loss": 1.4333, + "step": 1341 + }, + { + "epoch": 0.14, + "grad_norm": 0.09374605110911702, + "learning_rate": 0.0009661991079676828, + "loss": 1.547, + "step": 1342 + }, + { + "epoch": 0.14, + "grad_norm": 0.08242169059262731, + "learning_rate": 0.0009661361516297928, + "loss": 1.4459, + "step": 1343 + }, + { + "epoch": 0.14, + "grad_norm": 0.09832558372243909, + "learning_rate": 0.0009660731387712873, + "loss": 1.5649, + "step": 1344 + }, + { + "epoch": 0.14, + "grad_norm": 0.10825358627457048, + "learning_rate": 0.0009660100693998071, + "loss": 1.4277, + "step": 1345 + }, + { + "epoch": 0.14, + "grad_norm": 0.09313205123054559, + "learning_rate": 0.0009659469435229992, + "loss": 1.5624, + "step": 1346 + }, + { + "epoch": 0.14, + "grad_norm": 0.07998615164356912, + "learning_rate": 0.0009658837611485181, + "loss": 1.4637, + "step": 1347 + }, + { + "epoch": 0.14, + "grad_norm": 0.10552045428986864, + "learning_rate": 0.0009658205222840249, + "loss": 1.454, + "step": 1348 + }, + { + "epoch": 0.15, + "grad_norm": 0.08907788707383209, + "learning_rate": 0.0009657572269371873, + "loss": 1.5136, + "step": 1349 + }, + { + "epoch": 0.15, + "grad_norm": 0.08617641281827475, + "learning_rate": 0.0009656938751156802, + "loss": 1.4615, + "step": 1350 + }, + { + "epoch": 0.15, + "grad_norm": 0.09921990951476314, + "learning_rate": 0.0009656304668271854, + "loss": 1.4423, + "step": 1351 + }, + { + "epoch": 0.15, + "grad_norm": 0.10024579878753528, + "learning_rate": 0.000965567002079391, + "loss": 1.4746, + "step": 1352 + }, + { + "epoch": 0.15, + "grad_norm": 0.09568464334273936, + "learning_rate": 0.0009655034808799928, + "loss": 1.5115, + "step": 1353 + }, + { + "epoch": 0.15, + "grad_norm": 0.0948152667533343, + "learning_rate": 0.0009654399032366925, + "loss": 1.482, + "step": 1354 + }, + { + "epoch": 0.15, + "grad_norm": 0.09802476493056436, + "learning_rate": 0.0009653762691571995, + "loss": 1.427, + "step": 1355 + }, + { + "epoch": 0.15, + "grad_norm": 0.08709415251464686, + "learning_rate": 0.0009653125786492294, + "loss": 1.4658, + "step": 1356 + }, + { + "epoch": 0.15, + "grad_norm": 0.08686335295193764, + "learning_rate": 0.0009652488317205048, + "loss": 1.3828, + "step": 1357 + }, + { + "epoch": 0.15, + "grad_norm": 0.10936667844927866, + "learning_rate": 0.0009651850283787555, + "loss": 1.6204, + "step": 1358 + }, + { + "epoch": 0.15, + "grad_norm": 0.08318505997719156, + "learning_rate": 0.000965121168631718, + "loss": 1.3626, + "step": 1359 + }, + { + "epoch": 0.15, + "grad_norm": 0.0981715400028448, + "learning_rate": 0.000965057252487135, + "loss": 1.4695, + "step": 1360 + }, + { + "epoch": 0.15, + "grad_norm": 0.0855374725173818, + "learning_rate": 0.000964993279952757, + "loss": 1.4296, + "step": 1361 + }, + { + "epoch": 0.15, + "grad_norm": 0.09090872788305977, + "learning_rate": 0.0009649292510363407, + "loss": 1.4942, + "step": 1362 + }, + { + "epoch": 0.15, + "grad_norm": 0.08903024982584848, + "learning_rate": 0.0009648651657456497, + "loss": 1.53, + "step": 1363 + }, + { + "epoch": 0.15, + "grad_norm": 0.09651472004451829, + "learning_rate": 0.0009648010240884549, + "loss": 1.4141, + "step": 1364 + }, + { + "epoch": 0.15, + "grad_norm": 0.0975116971187096, + "learning_rate": 0.0009647368260725335, + "loss": 1.6606, + "step": 1365 + }, + { + "epoch": 0.15, + "grad_norm": 0.09109281987857183, + "learning_rate": 0.0009646725717056696, + "loss": 1.4704, + "step": 1366 + }, + { + "epoch": 0.15, + "grad_norm": 0.08977013401911238, + "learning_rate": 0.0009646082609956546, + "loss": 1.4401, + "step": 1367 + }, + { + "epoch": 0.15, + "grad_norm": 0.09619001203581924, + "learning_rate": 0.0009645438939502862, + "loss": 1.4719, + "step": 1368 + }, + { + "epoch": 0.15, + "grad_norm": 0.0920101285961825, + "learning_rate": 0.000964479470577369, + "loss": 1.4291, + "step": 1369 + }, + { + "epoch": 0.15, + "grad_norm": 0.10189576673060297, + "learning_rate": 0.0009644149908847148, + "loss": 1.3505, + "step": 1370 + }, + { + "epoch": 0.15, + "grad_norm": 0.10141333070010058, + "learning_rate": 0.0009643504548801418, + "loss": 1.5281, + "step": 1371 + }, + { + "epoch": 0.15, + "grad_norm": 0.10183136911830679, + "learning_rate": 0.0009642858625714753, + "loss": 1.617, + "step": 1372 + }, + { + "epoch": 0.15, + "grad_norm": 0.08938428018230365, + "learning_rate": 0.0009642212139665474, + "loss": 1.5241, + "step": 1373 + }, + { + "epoch": 0.15, + "grad_norm": 0.09867932384400704, + "learning_rate": 0.0009641565090731968, + "loss": 1.4051, + "step": 1374 + }, + { + "epoch": 0.15, + "grad_norm": 0.08685976647015854, + "learning_rate": 0.0009640917478992692, + "loss": 1.4531, + "step": 1375 + }, + { + "epoch": 0.15, + "grad_norm": 0.08388522860676766, + "learning_rate": 0.0009640269304526175, + "loss": 1.4059, + "step": 1376 + }, + { + "epoch": 0.15, + "grad_norm": 0.08755849762519224, + "learning_rate": 0.0009639620567411005, + "loss": 1.4757, + "step": 1377 + }, + { + "epoch": 0.15, + "grad_norm": 0.08967987306203287, + "learning_rate": 0.0009638971267725846, + "loss": 1.4917, + "step": 1378 + }, + { + "epoch": 0.15, + "grad_norm": 0.09337617930404216, + "learning_rate": 0.0009638321405549429, + "loss": 1.4998, + "step": 1379 + }, + { + "epoch": 0.15, + "grad_norm": 0.11438394374416985, + "learning_rate": 0.0009637670980960549, + "loss": 1.3926, + "step": 1380 + }, + { + "epoch": 0.15, + "grad_norm": 0.08596327108077496, + "learning_rate": 0.0009637019994038076, + "loss": 1.4768, + "step": 1381 + }, + { + "epoch": 0.15, + "grad_norm": 0.10081100976304277, + "learning_rate": 0.0009636368444860941, + "loss": 1.4218, + "step": 1382 + }, + { + "epoch": 0.15, + "grad_norm": 0.08628745782550797, + "learning_rate": 0.0009635716333508149, + "loss": 1.5145, + "step": 1383 + }, + { + "epoch": 0.15, + "grad_norm": 0.09973150834603343, + "learning_rate": 0.000963506366005877, + "loss": 1.5565, + "step": 1384 + }, + { + "epoch": 0.15, + "grad_norm": 0.0915294973766297, + "learning_rate": 0.0009634410424591941, + "loss": 1.5018, + "step": 1385 + }, + { + "epoch": 0.15, + "grad_norm": 0.09358282880478652, + "learning_rate": 0.0009633756627186874, + "loss": 1.4947, + "step": 1386 + }, + { + "epoch": 0.15, + "grad_norm": 0.08854602719419898, + "learning_rate": 0.0009633102267922838, + "loss": 1.3823, + "step": 1387 + }, + { + "epoch": 0.15, + "grad_norm": 0.09370252902472906, + "learning_rate": 0.0009632447346879181, + "loss": 1.372, + "step": 1388 + }, + { + "epoch": 0.15, + "grad_norm": 0.07704849046364678, + "learning_rate": 0.0009631791864135313, + "loss": 1.3268, + "step": 1389 + }, + { + "epoch": 0.15, + "grad_norm": 0.09768848364054729, + "learning_rate": 0.0009631135819770711, + "loss": 1.4281, + "step": 1390 + }, + { + "epoch": 0.15, + "grad_norm": 0.10743879271717618, + "learning_rate": 0.0009630479213864927, + "loss": 1.6314, + "step": 1391 + }, + { + "epoch": 0.15, + "grad_norm": 0.11841657156720103, + "learning_rate": 0.0009629822046497573, + "loss": 1.455, + "step": 1392 + }, + { + "epoch": 0.15, + "grad_norm": 0.0871394521914947, + "learning_rate": 0.0009629164317748335, + "loss": 1.4534, + "step": 1393 + }, + { + "epoch": 0.15, + "grad_norm": 0.10329392146592385, + "learning_rate": 0.0009628506027696966, + "loss": 1.4848, + "step": 1394 + }, + { + "epoch": 0.15, + "grad_norm": 0.08307046208995565, + "learning_rate": 0.0009627847176423282, + "loss": 1.4573, + "step": 1395 + }, + { + "epoch": 0.15, + "grad_norm": 0.10441134569877149, + "learning_rate": 0.0009627187764007175, + "loss": 1.4169, + "step": 1396 + }, + { + "epoch": 0.15, + "grad_norm": 0.11076009082556416, + "learning_rate": 0.0009626527790528599, + "loss": 1.4463, + "step": 1397 + }, + { + "epoch": 0.15, + "grad_norm": 0.09972177969927494, + "learning_rate": 0.0009625867256067577, + "loss": 1.4483, + "step": 1398 + }, + { + "epoch": 0.15, + "grad_norm": 0.10232915510331024, + "learning_rate": 0.0009625206160704203, + "loss": 1.4843, + "step": 1399 + }, + { + "epoch": 0.15, + "grad_norm": 0.10033949503613594, + "learning_rate": 0.0009624544504518636, + "loss": 1.4372, + "step": 1400 + }, + { + "epoch": 0.15, + "grad_norm": 0.10470825877339096, + "learning_rate": 0.0009623882287591106, + "loss": 1.4166, + "step": 1401 + }, + { + "epoch": 0.15, + "grad_norm": 0.09177089511218502, + "learning_rate": 0.0009623219510001906, + "loss": 1.1729, + "step": 1402 + }, + { + "epoch": 0.15, + "grad_norm": 0.09485966877207715, + "learning_rate": 0.0009622556171831403, + "loss": 1.5652, + "step": 1403 + }, + { + "epoch": 0.15, + "grad_norm": 0.0908345472809001, + "learning_rate": 0.0009621892273160027, + "loss": 1.4296, + "step": 1404 + }, + { + "epoch": 0.15, + "grad_norm": 0.09258681147903781, + "learning_rate": 0.0009621227814068281, + "loss": 1.3104, + "step": 1405 + }, + { + "epoch": 0.15, + "grad_norm": 0.09872348083382965, + "learning_rate": 0.0009620562794636728, + "loss": 1.4792, + "step": 1406 + }, + { + "epoch": 0.15, + "grad_norm": 0.10549538108369423, + "learning_rate": 0.0009619897214946007, + "loss": 1.4798, + "step": 1407 + }, + { + "epoch": 0.15, + "grad_norm": 0.10762569208689654, + "learning_rate": 0.0009619231075076823, + "loss": 1.5403, + "step": 1408 + }, + { + "epoch": 0.15, + "grad_norm": 0.10595909019966, + "learning_rate": 0.0009618564375109945, + "loss": 1.4209, + "step": 1409 + }, + { + "epoch": 0.15, + "grad_norm": 0.09530047576327778, + "learning_rate": 0.0009617897115126215, + "loss": 1.5453, + "step": 1410 + }, + { + "epoch": 0.15, + "grad_norm": 0.09150305099250461, + "learning_rate": 0.0009617229295206537, + "loss": 1.4043, + "step": 1411 + }, + { + "epoch": 0.15, + "grad_norm": 0.10734559081510649, + "learning_rate": 0.0009616560915431891, + "loss": 1.5386, + "step": 1412 + }, + { + "epoch": 0.15, + "grad_norm": 0.1166936881059637, + "learning_rate": 0.000961589197588332, + "loss": 1.4432, + "step": 1413 + }, + { + "epoch": 0.15, + "grad_norm": 0.09399762085255639, + "learning_rate": 0.000961522247664193, + "loss": 1.3782, + "step": 1414 + }, + { + "epoch": 0.15, + "grad_norm": 0.10108051188315073, + "learning_rate": 0.0009614552417788906, + "loss": 1.5388, + "step": 1415 + }, + { + "epoch": 0.15, + "grad_norm": 0.08425065039992434, + "learning_rate": 0.0009613881799405491, + "loss": 1.3643, + "step": 1416 + }, + { + "epoch": 0.15, + "grad_norm": 0.08774005124641272, + "learning_rate": 0.0009613210621573001, + "loss": 1.4577, + "step": 1417 + }, + { + "epoch": 0.15, + "grad_norm": 0.11048355058020695, + "learning_rate": 0.0009612538884372821, + "loss": 1.4588, + "step": 1418 + }, + { + "epoch": 0.15, + "grad_norm": 0.08641151132612267, + "learning_rate": 0.0009611866587886399, + "loss": 1.5007, + "step": 1419 + }, + { + "epoch": 0.15, + "grad_norm": 0.09339672697113094, + "learning_rate": 0.0009611193732195254, + "loss": 1.4057, + "step": 1420 + }, + { + "epoch": 0.15, + "grad_norm": 0.08382803898484295, + "learning_rate": 0.000961052031738097, + "loss": 1.4925, + "step": 1421 + }, + { + "epoch": 0.15, + "grad_norm": 0.13267703370359948, + "learning_rate": 0.0009609846343525204, + "loss": 1.3566, + "step": 1422 + }, + { + "epoch": 0.15, + "grad_norm": 0.09158613485789965, + "learning_rate": 0.0009609171810709676, + "loss": 1.4777, + "step": 1423 + }, + { + "epoch": 0.15, + "grad_norm": 0.09234863197344007, + "learning_rate": 0.0009608496719016175, + "loss": 1.4714, + "step": 1424 + }, + { + "epoch": 0.15, + "grad_norm": 0.08860250240147277, + "learning_rate": 0.0009607821068526559, + "loss": 1.5249, + "step": 1425 + }, + { + "epoch": 0.15, + "grad_norm": 0.09605535819946916, + "learning_rate": 0.0009607144859322754, + "loss": 1.4138, + "step": 1426 + }, + { + "epoch": 0.15, + "grad_norm": 0.0910599326476764, + "learning_rate": 0.000960646809148675, + "loss": 1.393, + "step": 1427 + }, + { + "epoch": 0.15, + "grad_norm": 0.08997170262147476, + "learning_rate": 0.000960579076510061, + "loss": 1.4954, + "step": 1428 + }, + { + "epoch": 0.15, + "grad_norm": 0.09321262030864544, + "learning_rate": 0.0009605112880246462, + "loss": 1.4509, + "step": 1429 + }, + { + "epoch": 0.15, + "grad_norm": 0.08691768022573092, + "learning_rate": 0.00096044344370065, + "loss": 1.6263, + "step": 1430 + }, + { + "epoch": 0.15, + "grad_norm": 0.09869112519405952, + "learning_rate": 0.0009603755435462989, + "loss": 1.5487, + "step": 1431 + }, + { + "epoch": 0.15, + "grad_norm": 0.0929891550387196, + "learning_rate": 0.000960307587569826, + "loss": 1.5137, + "step": 1432 + }, + { + "epoch": 0.15, + "grad_norm": 0.09569271308352895, + "learning_rate": 0.0009602395757794711, + "loss": 1.4777, + "step": 1433 + }, + { + "epoch": 0.15, + "grad_norm": 0.09755014615911316, + "learning_rate": 0.000960171508183481, + "loss": 1.4682, + "step": 1434 + }, + { + "epoch": 0.15, + "grad_norm": 0.0972474147226531, + "learning_rate": 0.0009601033847901091, + "loss": 1.5142, + "step": 1435 + }, + { + "epoch": 0.15, + "grad_norm": 0.08735796798182946, + "learning_rate": 0.0009600352056076154, + "loss": 1.4992, + "step": 1436 + }, + { + "epoch": 0.15, + "grad_norm": 0.09119723664332861, + "learning_rate": 0.0009599669706442672, + "loss": 1.4638, + "step": 1437 + }, + { + "epoch": 0.15, + "grad_norm": 0.12378601755710915, + "learning_rate": 0.000959898679908338, + "loss": 1.6005, + "step": 1438 + }, + { + "epoch": 0.15, + "grad_norm": 0.10478109012333306, + "learning_rate": 0.0009598303334081085, + "loss": 1.4894, + "step": 1439 + }, + { + "epoch": 0.15, + "grad_norm": 0.23570162511872436, + "learning_rate": 0.0009597619311518657, + "loss": 1.5644, + "step": 1440 + }, + { + "epoch": 0.15, + "grad_norm": 0.10682124284391994, + "learning_rate": 0.0009596934731479036, + "loss": 1.5554, + "step": 1441 + }, + { + "epoch": 0.16, + "grad_norm": 0.09295343956178324, + "learning_rate": 0.0009596249594045232, + "loss": 1.438, + "step": 1442 + }, + { + "epoch": 0.16, + "grad_norm": 0.09713099725191914, + "learning_rate": 0.0009595563899300319, + "loss": 1.4028, + "step": 1443 + }, + { + "epoch": 0.16, + "grad_norm": 0.08720151241947806, + "learning_rate": 0.000959487764732744, + "loss": 1.3532, + "step": 1444 + }, + { + "epoch": 0.16, + "grad_norm": 0.0886950843743924, + "learning_rate": 0.0009594190838209805, + "loss": 1.5788, + "step": 1445 + }, + { + "epoch": 0.16, + "grad_norm": 0.08793516619885679, + "learning_rate": 0.0009593503472030692, + "loss": 1.3038, + "step": 1446 + }, + { + "epoch": 0.16, + "grad_norm": 0.09144261031291229, + "learning_rate": 0.0009592815548873448, + "loss": 1.3458, + "step": 1447 + }, + { + "epoch": 0.16, + "grad_norm": 0.09311955286422781, + "learning_rate": 0.0009592127068821484, + "loss": 1.5537, + "step": 1448 + }, + { + "epoch": 0.16, + "grad_norm": 0.08723657754682848, + "learning_rate": 0.0009591438031958282, + "loss": 1.3372, + "step": 1449 + }, + { + "epoch": 0.16, + "grad_norm": 0.09217854048138731, + "learning_rate": 0.0009590748438367388, + "loss": 1.5481, + "step": 1450 + }, + { + "epoch": 0.16, + "grad_norm": 0.08500728494003494, + "learning_rate": 0.000959005828813242, + "loss": 1.4458, + "step": 1451 + }, + { + "epoch": 0.16, + "grad_norm": 0.0853018457118595, + "learning_rate": 0.0009589367581337061, + "loss": 1.424, + "step": 1452 + }, + { + "epoch": 0.16, + "grad_norm": 0.09868598911419524, + "learning_rate": 0.000958867631806506, + "loss": 1.3293, + "step": 1453 + }, + { + "epoch": 0.16, + "grad_norm": 0.08639367094360562, + "learning_rate": 0.0009587984498400235, + "loss": 1.3447, + "step": 1454 + }, + { + "epoch": 0.16, + "grad_norm": 0.10359812768616228, + "learning_rate": 0.0009587292122426475, + "loss": 1.4611, + "step": 1455 + }, + { + "epoch": 0.16, + "grad_norm": 0.11130702504760993, + "learning_rate": 0.0009586599190227728, + "loss": 1.4713, + "step": 1456 + }, + { + "epoch": 0.16, + "grad_norm": 0.08535204266384723, + "learning_rate": 0.0009585905701888018, + "loss": 1.4174, + "step": 1457 + }, + { + "epoch": 0.16, + "grad_norm": 0.09085953542296663, + "learning_rate": 0.0009585211657491431, + "loss": 1.4496, + "step": 1458 + }, + { + "epoch": 0.16, + "grad_norm": 0.08848742414165559, + "learning_rate": 0.0009584517057122122, + "loss": 1.5021, + "step": 1459 + }, + { + "epoch": 0.16, + "grad_norm": 0.08832297742063362, + "learning_rate": 0.0009583821900864315, + "loss": 1.494, + "step": 1460 + }, + { + "epoch": 0.16, + "grad_norm": 0.08922415214466346, + "learning_rate": 0.0009583126188802302, + "loss": 1.5711, + "step": 1461 + }, + { + "epoch": 0.16, + "grad_norm": 0.09999174702384374, + "learning_rate": 0.0009582429921020436, + "loss": 1.5518, + "step": 1462 + }, + { + "epoch": 0.16, + "grad_norm": 0.09822083586247843, + "learning_rate": 0.0009581733097603145, + "loss": 1.3522, + "step": 1463 + }, + { + "epoch": 0.16, + "grad_norm": 0.10384201828720639, + "learning_rate": 0.0009581035718634919, + "loss": 1.4244, + "step": 1464 + }, + { + "epoch": 0.16, + "grad_norm": 0.09055085355297395, + "learning_rate": 0.0009580337784200319, + "loss": 1.5444, + "step": 1465 + }, + { + "epoch": 0.16, + "grad_norm": 0.10518106325110177, + "learning_rate": 0.0009579639294383973, + "loss": 1.4364, + "step": 1466 + }, + { + "epoch": 0.16, + "grad_norm": 0.0967908735277497, + "learning_rate": 0.0009578940249270573, + "loss": 1.457, + "step": 1467 + }, + { + "epoch": 0.16, + "grad_norm": 0.09207038006235847, + "learning_rate": 0.0009578240648944882, + "loss": 1.4476, + "step": 1468 + }, + { + "epoch": 0.16, + "grad_norm": 0.10043286509181094, + "learning_rate": 0.000957754049349173, + "loss": 1.4658, + "step": 1469 + }, + { + "epoch": 0.16, + "grad_norm": 0.09054729227475117, + "learning_rate": 0.0009576839782996012, + "loss": 1.4334, + "step": 1470 + }, + { + "epoch": 0.16, + "grad_norm": 0.08377663291202349, + "learning_rate": 0.000957613851754269, + "loss": 1.4492, + "step": 1471 + }, + { + "epoch": 0.16, + "grad_norm": 0.11060837805665544, + "learning_rate": 0.0009575436697216797, + "loss": 1.4966, + "step": 1472 + }, + { + "epoch": 0.16, + "grad_norm": 0.09969183136235889, + "learning_rate": 0.0009574734322103431, + "loss": 1.5009, + "step": 1473 + }, + { + "epoch": 0.16, + "grad_norm": 0.09967141930559241, + "learning_rate": 0.0009574031392287757, + "loss": 1.4341, + "step": 1474 + }, + { + "epoch": 0.16, + "grad_norm": 0.09799525072093489, + "learning_rate": 0.0009573327907855006, + "loss": 1.4278, + "step": 1475 + }, + { + "epoch": 0.16, + "grad_norm": 0.08604866662176865, + "learning_rate": 0.0009572623868890481, + "loss": 1.4587, + "step": 1476 + }, + { + "epoch": 0.16, + "grad_norm": 0.09265415277930036, + "learning_rate": 0.0009571919275479548, + "loss": 1.4699, + "step": 1477 + }, + { + "epoch": 0.16, + "grad_norm": 0.10062297583552167, + "learning_rate": 0.0009571214127707639, + "loss": 1.4039, + "step": 1478 + }, + { + "epoch": 0.16, + "grad_norm": 0.09118049223741401, + "learning_rate": 0.000957050842566026, + "loss": 1.4081, + "step": 1479 + }, + { + "epoch": 0.16, + "grad_norm": 0.08501681321065166, + "learning_rate": 0.0009569802169422976, + "loss": 1.4726, + "step": 1480 + }, + { + "epoch": 0.16, + "grad_norm": 0.08422437325947779, + "learning_rate": 0.0009569095359081426, + "loss": 1.4094, + "step": 1481 + }, + { + "epoch": 0.16, + "grad_norm": 0.09366136573080257, + "learning_rate": 0.000956838799472131, + "loss": 1.3966, + "step": 1482 + }, + { + "epoch": 0.16, + "grad_norm": 0.08511044239192846, + "learning_rate": 0.00095676800764284, + "loss": 1.464, + "step": 1483 + }, + { + "epoch": 0.16, + "grad_norm": 0.09087661694984668, + "learning_rate": 0.0009566971604288534, + "loss": 1.4871, + "step": 1484 + }, + { + "epoch": 0.16, + "grad_norm": 0.1192006425086519, + "learning_rate": 0.0009566262578387617, + "loss": 1.404, + "step": 1485 + }, + { + "epoch": 0.16, + "grad_norm": 0.09748627055341871, + "learning_rate": 0.000956555299881162, + "loss": 1.3744, + "step": 1486 + }, + { + "epoch": 0.16, + "grad_norm": 0.09993689583714448, + "learning_rate": 0.000956484286564658, + "loss": 1.5675, + "step": 1487 + }, + { + "epoch": 0.16, + "grad_norm": 0.10586802308275939, + "learning_rate": 0.0009564132178978606, + "loss": 1.4087, + "step": 1488 + }, + { + "epoch": 0.16, + "grad_norm": 0.09472377025775539, + "learning_rate": 0.0009563420938893871, + "loss": 1.4368, + "step": 1489 + }, + { + "epoch": 0.16, + "grad_norm": 0.10012467647677623, + "learning_rate": 0.0009562709145478615, + "loss": 1.4937, + "step": 1490 + }, + { + "epoch": 0.16, + "grad_norm": 0.10303077194874116, + "learning_rate": 0.0009561996798819145, + "loss": 1.4654, + "step": 1491 + }, + { + "epoch": 0.16, + "grad_norm": 0.10113641879804244, + "learning_rate": 0.0009561283899001835, + "loss": 1.5285, + "step": 1492 + }, + { + "epoch": 0.16, + "grad_norm": 0.09589102118147506, + "learning_rate": 0.0009560570446113128, + "loss": 1.4873, + "step": 1493 + }, + { + "epoch": 0.16, + "grad_norm": 0.09126782272024617, + "learning_rate": 0.000955985644023953, + "loss": 1.5029, + "step": 1494 + }, + { + "epoch": 0.16, + "grad_norm": 0.09001747858385097, + "learning_rate": 0.0009559141881467619, + "loss": 1.3982, + "step": 1495 + }, + { + "epoch": 0.16, + "grad_norm": 0.10262554472649192, + "learning_rate": 0.0009558426769884039, + "loss": 1.5575, + "step": 1496 + }, + { + "epoch": 0.16, + "grad_norm": 0.10556088502827525, + "learning_rate": 0.0009557711105575496, + "loss": 1.483, + "step": 1497 + }, + { + "epoch": 0.16, + "grad_norm": 0.08980632344515092, + "learning_rate": 0.0009556994888628769, + "loss": 1.3076, + "step": 1498 + }, + { + "epoch": 0.16, + "grad_norm": 0.090263236114018, + "learning_rate": 0.0009556278119130701, + "loss": 1.3918, + "step": 1499 + }, + { + "epoch": 0.16, + "grad_norm": 0.09906411529902781, + "learning_rate": 0.0009555560797168204, + "loss": 1.5834, + "step": 1500 + }, + { + "epoch": 0.16, + "grad_norm": 0.09241869135981694, + "learning_rate": 0.0009554842922828254, + "loss": 1.5699, + "step": 1501 + }, + { + "epoch": 0.16, + "grad_norm": 0.09344833611604621, + "learning_rate": 0.0009554124496197897, + "loss": 1.3565, + "step": 1502 + }, + { + "epoch": 0.16, + "grad_norm": 0.09286482726655389, + "learning_rate": 0.0009553405517364244, + "loss": 1.4, + "step": 1503 + }, + { + "epoch": 0.16, + "grad_norm": 0.09532973376012242, + "learning_rate": 0.0009552685986414475, + "loss": 1.4654, + "step": 1504 + }, + { + "epoch": 0.16, + "grad_norm": 0.09067680572772746, + "learning_rate": 0.0009551965903435835, + "loss": 1.4848, + "step": 1505 + }, + { + "epoch": 0.16, + "grad_norm": 0.1040664046288145, + "learning_rate": 0.0009551245268515636, + "loss": 1.4488, + "step": 1506 + }, + { + "epoch": 0.16, + "grad_norm": 0.09429619546311979, + "learning_rate": 0.0009550524081741256, + "loss": 1.5326, + "step": 1507 + }, + { + "epoch": 0.16, + "grad_norm": 0.10532072284542995, + "learning_rate": 0.0009549802343200145, + "loss": 1.3992, + "step": 1508 + }, + { + "epoch": 0.16, + "grad_norm": 0.08912863106155763, + "learning_rate": 0.0009549080052979813, + "loss": 1.4448, + "step": 1509 + }, + { + "epoch": 0.16, + "grad_norm": 0.1003030251918555, + "learning_rate": 0.0009548357211167841, + "loss": 1.484, + "step": 1510 + }, + { + "epoch": 0.16, + "grad_norm": 0.11243803152082767, + "learning_rate": 0.000954763381785188, + "loss": 1.4291, + "step": 1511 + }, + { + "epoch": 0.16, + "grad_norm": 0.1001886187096508, + "learning_rate": 0.0009546909873119636, + "loss": 1.4383, + "step": 1512 + }, + { + "epoch": 0.16, + "grad_norm": 0.10231401178866115, + "learning_rate": 0.0009546185377058898, + "loss": 1.5126, + "step": 1513 + }, + { + "epoch": 0.16, + "grad_norm": 0.104553472069901, + "learning_rate": 0.0009545460329757507, + "loss": 1.6627, + "step": 1514 + }, + { + "epoch": 0.16, + "grad_norm": 0.08907543138688107, + "learning_rate": 0.0009544734731303382, + "loss": 1.5044, + "step": 1515 + }, + { + "epoch": 0.16, + "grad_norm": 0.12886965839182934, + "learning_rate": 0.0009544008581784503, + "loss": 1.4528, + "step": 1516 + }, + { + "epoch": 0.16, + "grad_norm": 0.09209918402576728, + "learning_rate": 0.0009543281881288918, + "loss": 1.468, + "step": 1517 + }, + { + "epoch": 0.16, + "grad_norm": 0.09475658742189953, + "learning_rate": 0.0009542554629904741, + "loss": 1.3359, + "step": 1518 + }, + { + "epoch": 0.16, + "grad_norm": 0.08776235492227999, + "learning_rate": 0.0009541826827720155, + "loss": 1.4278, + "step": 1519 + }, + { + "epoch": 0.16, + "grad_norm": 0.0973510550854703, + "learning_rate": 0.0009541098474823408, + "loss": 1.4128, + "step": 1520 + }, + { + "epoch": 0.16, + "grad_norm": 0.08791160995249331, + "learning_rate": 0.0009540369571302815, + "loss": 1.4823, + "step": 1521 + }, + { + "epoch": 0.16, + "grad_norm": 0.1104543848288305, + "learning_rate": 0.0009539640117246759, + "loss": 1.474, + "step": 1522 + }, + { + "epoch": 0.16, + "grad_norm": 0.09377728378127773, + "learning_rate": 0.0009538910112743687, + "loss": 1.5595, + "step": 1523 + }, + { + "epoch": 0.16, + "grad_norm": 0.08352231010091461, + "learning_rate": 0.0009538179557882117, + "loss": 1.4802, + "step": 1524 + }, + { + "epoch": 0.16, + "grad_norm": 0.10346239673786903, + "learning_rate": 0.000953744845275063, + "loss": 1.5015, + "step": 1525 + }, + { + "epoch": 0.16, + "grad_norm": 0.10087785904923345, + "learning_rate": 0.0009536716797437875, + "loss": 1.5315, + "step": 1526 + }, + { + "epoch": 0.16, + "grad_norm": 0.09040805736131872, + "learning_rate": 0.0009535984592032569, + "loss": 1.4685, + "step": 1527 + }, + { + "epoch": 0.16, + "grad_norm": 0.12036277893925264, + "learning_rate": 0.0009535251836623491, + "loss": 1.39, + "step": 1528 + }, + { + "epoch": 0.16, + "grad_norm": 0.09447315081864598, + "learning_rate": 0.0009534518531299494, + "loss": 1.5542, + "step": 1529 + }, + { + "epoch": 0.16, + "grad_norm": 0.09873458361328201, + "learning_rate": 0.0009533784676149492, + "loss": 1.4234, + "step": 1530 + }, + { + "epoch": 0.16, + "grad_norm": 0.09101268943669456, + "learning_rate": 0.0009533050271262467, + "loss": 1.4116, + "step": 1531 + }, + { + "epoch": 0.16, + "grad_norm": 0.08897558088863952, + "learning_rate": 0.0009532315316727469, + "loss": 1.4038, + "step": 1532 + }, + { + "epoch": 0.16, + "grad_norm": 0.11018902277583942, + "learning_rate": 0.0009531579812633615, + "loss": 1.4236, + "step": 1533 + }, + { + "epoch": 0.16, + "grad_norm": 0.09125555762503207, + "learning_rate": 0.0009530843759070085, + "loss": 1.5578, + "step": 1534 + }, + { + "epoch": 0.17, + "grad_norm": 0.09385367834635039, + "learning_rate": 0.0009530107156126129, + "loss": 1.5099, + "step": 1535 + }, + { + "epoch": 0.17, + "grad_norm": 0.10683817721639628, + "learning_rate": 0.0009529370003891062, + "loss": 1.6342, + "step": 1536 + }, + { + "epoch": 0.17, + "grad_norm": 0.08967972247987649, + "learning_rate": 0.0009528632302454268, + "loss": 1.4171, + "step": 1537 + }, + { + "epoch": 0.17, + "grad_norm": 0.09753788211751449, + "learning_rate": 0.0009527894051905194, + "loss": 1.4745, + "step": 1538 + }, + { + "epoch": 0.17, + "grad_norm": 0.09304012816220279, + "learning_rate": 0.0009527155252333357, + "loss": 1.5435, + "step": 1539 + }, + { + "epoch": 0.17, + "grad_norm": 0.09474919727112473, + "learning_rate": 0.0009526415903828337, + "loss": 1.3861, + "step": 1540 + }, + { + "epoch": 0.17, + "grad_norm": 0.07778975111369331, + "learning_rate": 0.0009525676006479784, + "loss": 1.4173, + "step": 1541 + }, + { + "epoch": 0.17, + "grad_norm": 0.09020032560229345, + "learning_rate": 0.0009524935560377414, + "loss": 1.4258, + "step": 1542 + }, + { + "epoch": 0.17, + "grad_norm": 0.09665307246990602, + "learning_rate": 0.0009524194565611006, + "loss": 1.4578, + "step": 1543 + }, + { + "epoch": 0.17, + "grad_norm": 0.09800227849364905, + "learning_rate": 0.000952345302227041, + "loss": 1.367, + "step": 1544 + }, + { + "epoch": 0.17, + "grad_norm": 0.1039422730155937, + "learning_rate": 0.0009522710930445541, + "loss": 1.5948, + "step": 1545 + }, + { + "epoch": 0.17, + "grad_norm": 0.08481742258534228, + "learning_rate": 0.000952196829022638, + "loss": 1.4945, + "step": 1546 + }, + { + "epoch": 0.17, + "grad_norm": 0.08727988396404307, + "learning_rate": 0.0009521225101702973, + "loss": 1.4855, + "step": 1547 + }, + { + "epoch": 0.17, + "grad_norm": 0.10107756346544976, + "learning_rate": 0.0009520481364965435, + "loss": 1.5414, + "step": 1548 + }, + { + "epoch": 0.17, + "grad_norm": 0.08148558962045617, + "learning_rate": 0.0009519737080103948, + "loss": 1.4015, + "step": 1549 + }, + { + "epoch": 0.17, + "grad_norm": 0.09117985257996772, + "learning_rate": 0.0009518992247208758, + "loss": 1.4934, + "step": 1550 + }, + { + "epoch": 0.17, + "grad_norm": 0.10201481363275045, + "learning_rate": 0.000951824686637018, + "loss": 1.4628, + "step": 1551 + }, + { + "epoch": 0.17, + "grad_norm": 0.09022581841205508, + "learning_rate": 0.000951750093767859, + "loss": 1.6013, + "step": 1552 + }, + { + "epoch": 0.17, + "grad_norm": 0.0953335804179374, + "learning_rate": 0.0009516754461224439, + "loss": 1.6041, + "step": 1553 + }, + { + "epoch": 0.17, + "grad_norm": 0.09639577286814874, + "learning_rate": 0.0009516007437098238, + "loss": 1.554, + "step": 1554 + }, + { + "epoch": 0.17, + "grad_norm": 0.09708674309060862, + "learning_rate": 0.0009515259865390564, + "loss": 1.3906, + "step": 1555 + }, + { + "epoch": 0.17, + "grad_norm": 0.09782023489932025, + "learning_rate": 0.0009514511746192068, + "loss": 1.4169, + "step": 1556 + }, + { + "epoch": 0.17, + "grad_norm": 0.08847376964278968, + "learning_rate": 0.0009513763079593456, + "loss": 1.489, + "step": 1557 + }, + { + "epoch": 0.17, + "grad_norm": 0.09418702831874794, + "learning_rate": 0.0009513013865685511, + "loss": 1.4692, + "step": 1558 + }, + { + "epoch": 0.17, + "grad_norm": 0.0839674566660529, + "learning_rate": 0.0009512264104559077, + "loss": 1.4908, + "step": 1559 + }, + { + "epoch": 0.17, + "grad_norm": 0.09666360133659271, + "learning_rate": 0.0009511513796305062, + "loss": 1.519, + "step": 1560 + }, + { + "epoch": 0.17, + "grad_norm": 0.08317259532860914, + "learning_rate": 0.0009510762941014446, + "loss": 1.4297, + "step": 1561 + }, + { + "epoch": 0.17, + "grad_norm": 0.09939011817545634, + "learning_rate": 0.0009510011538778274, + "loss": 1.5525, + "step": 1562 + }, + { + "epoch": 0.17, + "grad_norm": 0.08154865062357083, + "learning_rate": 0.0009509259589687653, + "loss": 1.4989, + "step": 1563 + }, + { + "epoch": 0.17, + "grad_norm": 0.09355040253719421, + "learning_rate": 0.000950850709383376, + "loss": 1.4582, + "step": 1564 + }, + { + "epoch": 0.17, + "grad_norm": 0.09125277909364553, + "learning_rate": 0.0009507754051307841, + "loss": 1.5391, + "step": 1565 + }, + { + "epoch": 0.17, + "grad_norm": 0.089502284335009, + "learning_rate": 0.0009507000462201201, + "loss": 1.543, + "step": 1566 + }, + { + "epoch": 0.17, + "grad_norm": 0.08176070826278141, + "learning_rate": 0.0009506246326605219, + "loss": 1.5341, + "step": 1567 + }, + { + "epoch": 0.17, + "grad_norm": 0.10665943107435298, + "learning_rate": 0.0009505491644611333, + "loss": 1.3918, + "step": 1568 + }, + { + "epoch": 0.17, + "grad_norm": 0.09157527137567606, + "learning_rate": 0.0009504736416311053, + "loss": 1.5549, + "step": 1569 + }, + { + "epoch": 0.17, + "grad_norm": 0.09586127992803163, + "learning_rate": 0.0009503980641795952, + "loss": 1.3776, + "step": 1570 + }, + { + "epoch": 0.17, + "grad_norm": 0.09017513026010152, + "learning_rate": 0.0009503224321157671, + "loss": 1.4205, + "step": 1571 + }, + { + "epoch": 0.17, + "grad_norm": 0.09741079955785971, + "learning_rate": 0.0009502467454487915, + "loss": 1.2827, + "step": 1572 + }, + { + "epoch": 0.17, + "grad_norm": 0.10275736685666167, + "learning_rate": 0.0009501710041878458, + "loss": 1.4762, + "step": 1573 + }, + { + "epoch": 0.17, + "grad_norm": 0.08569182696154201, + "learning_rate": 0.0009500952083421139, + "loss": 1.3994, + "step": 1574 + }, + { + "epoch": 0.17, + "grad_norm": 0.09179001894163673, + "learning_rate": 0.0009500193579207863, + "loss": 1.5314, + "step": 1575 + }, + { + "epoch": 0.17, + "grad_norm": 0.08676494976521502, + "learning_rate": 0.0009499434529330602, + "loss": 1.4571, + "step": 1576 + }, + { + "epoch": 0.17, + "grad_norm": 0.09077194936057632, + "learning_rate": 0.000949867493388139, + "loss": 1.4957, + "step": 1577 + }, + { + "epoch": 0.17, + "grad_norm": 0.10232739854820663, + "learning_rate": 0.0009497914792952333, + "loss": 1.5458, + "step": 1578 + }, + { + "epoch": 0.17, + "grad_norm": 0.10029952697335129, + "learning_rate": 0.0009497154106635604, + "loss": 1.4446, + "step": 1579 + }, + { + "epoch": 0.17, + "grad_norm": 0.1156402004405788, + "learning_rate": 0.0009496392875023432, + "loss": 1.4567, + "step": 1580 + }, + { + "epoch": 0.17, + "grad_norm": 0.09278124851672967, + "learning_rate": 0.0009495631098208124, + "loss": 1.4469, + "step": 1581 + }, + { + "epoch": 0.17, + "grad_norm": 0.10394690523142269, + "learning_rate": 0.0009494868776282046, + "loss": 1.3859, + "step": 1582 + }, + { + "epoch": 0.17, + "grad_norm": 0.09395591900009802, + "learning_rate": 0.0009494105909337633, + "loss": 1.5942, + "step": 1583 + }, + { + "epoch": 0.17, + "grad_norm": 0.10303144085226874, + "learning_rate": 0.0009493342497467385, + "loss": 1.4295, + "step": 1584 + }, + { + "epoch": 0.17, + "grad_norm": 0.1037174632287724, + "learning_rate": 0.000949257854076387, + "loss": 1.4119, + "step": 1585 + }, + { + "epoch": 0.17, + "grad_norm": 0.09553928851204532, + "learning_rate": 0.0009491814039319716, + "loss": 1.3801, + "step": 1586 + }, + { + "epoch": 0.17, + "grad_norm": 0.09651654620253583, + "learning_rate": 0.0009491048993227625, + "loss": 1.4304, + "step": 1587 + }, + { + "epoch": 0.17, + "grad_norm": 0.09455369631618026, + "learning_rate": 0.000949028340258036, + "loss": 1.5054, + "step": 1588 + }, + { + "epoch": 0.17, + "grad_norm": 0.097039379644678, + "learning_rate": 0.0009489517267470753, + "loss": 1.5381, + "step": 1589 + }, + { + "epoch": 0.17, + "grad_norm": 0.09872281104254445, + "learning_rate": 0.00094887505879917, + "loss": 1.4912, + "step": 1590 + }, + { + "epoch": 0.17, + "grad_norm": 0.09155850201559067, + "learning_rate": 0.0009487983364236162, + "loss": 1.5075, + "step": 1591 + }, + { + "epoch": 0.17, + "grad_norm": 0.08864721777779384, + "learning_rate": 0.0009487215596297169, + "loss": 1.5051, + "step": 1592 + }, + { + "epoch": 0.17, + "grad_norm": 0.11145148010181871, + "learning_rate": 0.0009486447284267816, + "loss": 1.5501, + "step": 1593 + }, + { + "epoch": 0.17, + "grad_norm": 0.09424231822009453, + "learning_rate": 0.0009485678428241262, + "loss": 1.4026, + "step": 1594 + }, + { + "epoch": 0.17, + "grad_norm": 0.0936796798826941, + "learning_rate": 0.0009484909028310734, + "loss": 1.4845, + "step": 1595 + }, + { + "epoch": 0.17, + "grad_norm": 0.09572362072102075, + "learning_rate": 0.0009484139084569525, + "loss": 1.5573, + "step": 1596 + }, + { + "epoch": 0.17, + "grad_norm": 0.10489996329436806, + "learning_rate": 0.0009483368597110992, + "loss": 1.3578, + "step": 1597 + }, + { + "epoch": 0.17, + "grad_norm": 0.09287949322524128, + "learning_rate": 0.000948259756602856, + "loss": 1.3336, + "step": 1598 + }, + { + "epoch": 0.17, + "grad_norm": 0.10417263018902385, + "learning_rate": 0.000948182599141572, + "loss": 1.5492, + "step": 1599 + }, + { + "epoch": 0.17, + "grad_norm": 0.09698735516601625, + "learning_rate": 0.0009481053873366027, + "loss": 1.6122, + "step": 1600 + }, + { + "epoch": 0.17, + "grad_norm": 0.10366781086993937, + "learning_rate": 0.0009480281211973103, + "loss": 1.4094, + "step": 1601 + }, + { + "epoch": 0.17, + "grad_norm": 0.08318527269695515, + "learning_rate": 0.0009479508007330638, + "loss": 1.395, + "step": 1602 + }, + { + "epoch": 0.17, + "grad_norm": 0.09637176359968291, + "learning_rate": 0.0009478734259532381, + "loss": 1.4232, + "step": 1603 + }, + { + "epoch": 0.17, + "grad_norm": 0.10117308070962983, + "learning_rate": 0.0009477959968672156, + "loss": 1.4984, + "step": 1604 + }, + { + "epoch": 0.17, + "grad_norm": 0.08695311552966221, + "learning_rate": 0.0009477185134843846, + "loss": 1.6225, + "step": 1605 + }, + { + "epoch": 0.17, + "grad_norm": 0.08627423777317476, + "learning_rate": 0.0009476409758141405, + "loss": 1.5029, + "step": 1606 + }, + { + "epoch": 0.17, + "grad_norm": 0.07656034118664419, + "learning_rate": 0.0009475633838658847, + "loss": 1.3344, + "step": 1607 + }, + { + "epoch": 0.17, + "grad_norm": 0.0928219643216207, + "learning_rate": 0.0009474857376490257, + "loss": 1.4961, + "step": 1608 + }, + { + "epoch": 0.17, + "grad_norm": 0.10412888114786652, + "learning_rate": 0.0009474080371729782, + "loss": 1.3727, + "step": 1609 + }, + { + "epoch": 0.17, + "grad_norm": 0.09068058658406214, + "learning_rate": 0.0009473302824471637, + "loss": 1.5552, + "step": 1610 + }, + { + "epoch": 0.17, + "grad_norm": 0.08843922927390716, + "learning_rate": 0.0009472524734810104, + "loss": 1.5213, + "step": 1611 + }, + { + "epoch": 0.17, + "grad_norm": 0.0915814113209529, + "learning_rate": 0.0009471746102839527, + "loss": 1.5118, + "step": 1612 + }, + { + "epoch": 0.17, + "grad_norm": 0.06836751650965489, + "learning_rate": 0.0009470966928654319, + "loss": 1.4489, + "step": 1613 + }, + { + "epoch": 0.17, + "grad_norm": 0.07747282566984308, + "learning_rate": 0.0009470187212348957, + "loss": 1.4894, + "step": 1614 + }, + { + "epoch": 0.17, + "grad_norm": 0.07132050314074537, + "learning_rate": 0.0009469406954017985, + "loss": 1.5092, + "step": 1615 + }, + { + "epoch": 0.17, + "grad_norm": 0.08277949038549247, + "learning_rate": 0.000946862615375601, + "loss": 1.5643, + "step": 1616 + }, + { + "epoch": 0.17, + "grad_norm": 0.09289209657839693, + "learning_rate": 0.000946784481165771, + "loss": 1.4531, + "step": 1617 + }, + { + "epoch": 0.17, + "grad_norm": 0.10295338032257036, + "learning_rate": 0.0009467062927817822, + "loss": 1.4354, + "step": 1618 + }, + { + "epoch": 0.17, + "grad_norm": 0.09135645998922941, + "learning_rate": 0.0009466280502331154, + "loss": 1.4541, + "step": 1619 + }, + { + "epoch": 0.17, + "grad_norm": 0.11184134475928127, + "learning_rate": 0.0009465497535292579, + "loss": 1.5205, + "step": 1620 + }, + { + "epoch": 0.17, + "grad_norm": 0.09717044875080767, + "learning_rate": 0.0009464714026797031, + "loss": 1.4884, + "step": 1621 + }, + { + "epoch": 0.17, + "grad_norm": 0.09757801259769658, + "learning_rate": 0.0009463929976939515, + "loss": 1.425, + "step": 1622 + }, + { + "epoch": 0.17, + "grad_norm": 0.0922656442309496, + "learning_rate": 0.0009463145385815102, + "loss": 1.4102, + "step": 1623 + }, + { + "epoch": 0.17, + "grad_norm": 0.10056802441985584, + "learning_rate": 0.0009462360253518923, + "loss": 1.5064, + "step": 1624 + }, + { + "epoch": 0.17, + "grad_norm": 0.09757900282309952, + "learning_rate": 0.0009461574580146179, + "loss": 1.4064, + "step": 1625 + }, + { + "epoch": 0.17, + "grad_norm": 0.09223895834044753, + "learning_rate": 0.0009460788365792135, + "loss": 1.4853, + "step": 1626 + }, + { + "epoch": 0.17, + "grad_norm": 0.08517631548088779, + "learning_rate": 0.0009460001610552124, + "loss": 1.3385, + "step": 1627 + }, + { + "epoch": 0.18, + "grad_norm": 0.09644877186160637, + "learning_rate": 0.000945921431452154, + "loss": 1.3694, + "step": 1628 + }, + { + "epoch": 0.18, + "grad_norm": 0.10147088301807176, + "learning_rate": 0.000945842647779585, + "loss": 1.4762, + "step": 1629 + }, + { + "epoch": 0.18, + "grad_norm": 0.10594360644469922, + "learning_rate": 0.0009457638100470577, + "loss": 1.5326, + "step": 1630 + }, + { + "epoch": 0.18, + "grad_norm": 0.0964403054717145, + "learning_rate": 0.0009456849182641317, + "loss": 1.4083, + "step": 1631 + }, + { + "epoch": 0.18, + "grad_norm": 0.08982800262769298, + "learning_rate": 0.000945605972440373, + "loss": 1.3163, + "step": 1632 + }, + { + "epoch": 0.18, + "grad_norm": 0.09752836436861666, + "learning_rate": 0.0009455269725853536, + "loss": 1.3745, + "step": 1633 + }, + { + "epoch": 0.18, + "grad_norm": 0.09479524516483495, + "learning_rate": 0.000945447918708653, + "loss": 1.3966, + "step": 1634 + }, + { + "epoch": 0.18, + "grad_norm": 0.08660628978231161, + "learning_rate": 0.0009453688108198566, + "loss": 1.3603, + "step": 1635 + }, + { + "epoch": 0.18, + "grad_norm": 0.08391899236342358, + "learning_rate": 0.0009452896489285563, + "loss": 1.4881, + "step": 1636 + }, + { + "epoch": 0.18, + "grad_norm": 0.10847952104104855, + "learning_rate": 0.0009452104330443511, + "loss": 1.5492, + "step": 1637 + }, + { + "epoch": 0.18, + "grad_norm": 0.08318537065982123, + "learning_rate": 0.0009451311631768459, + "loss": 1.5448, + "step": 1638 + }, + { + "epoch": 0.18, + "grad_norm": 0.09468223699769081, + "learning_rate": 0.0009450518393356527, + "loss": 1.4535, + "step": 1639 + }, + { + "epoch": 0.18, + "grad_norm": 0.10397951977441247, + "learning_rate": 0.0009449724615303894, + "loss": 1.4966, + "step": 1640 + }, + { + "epoch": 0.18, + "grad_norm": 0.10389824321917117, + "learning_rate": 0.0009448930297706813, + "loss": 1.484, + "step": 1641 + }, + { + "epoch": 0.18, + "grad_norm": 0.08499587782031137, + "learning_rate": 0.0009448135440661595, + "loss": 1.4936, + "step": 1642 + }, + { + "epoch": 0.18, + "grad_norm": 0.08757048555474968, + "learning_rate": 0.0009447340044264619, + "loss": 1.4992, + "step": 1643 + }, + { + "epoch": 0.18, + "grad_norm": 0.09261434742761426, + "learning_rate": 0.0009446544108612331, + "loss": 1.4285, + "step": 1644 + }, + { + "epoch": 0.18, + "grad_norm": 0.08674281372806956, + "learning_rate": 0.0009445747633801241, + "loss": 1.4837, + "step": 1645 + }, + { + "epoch": 0.18, + "grad_norm": 0.08584946897397816, + "learning_rate": 0.0009444950619927924, + "loss": 1.5338, + "step": 1646 + }, + { + "epoch": 0.18, + "grad_norm": 0.11737706230601937, + "learning_rate": 0.0009444153067089019, + "loss": 1.5565, + "step": 1647 + }, + { + "epoch": 0.18, + "grad_norm": 0.09096614220056518, + "learning_rate": 0.0009443354975381234, + "loss": 1.382, + "step": 1648 + }, + { + "epoch": 0.18, + "grad_norm": 0.09032154151221224, + "learning_rate": 0.0009442556344901339, + "loss": 1.3891, + "step": 1649 + }, + { + "epoch": 0.18, + "grad_norm": 0.1027879079616311, + "learning_rate": 0.000944175717574617, + "loss": 1.551, + "step": 1650 + }, + { + "epoch": 0.18, + "grad_norm": 0.08719485311068445, + "learning_rate": 0.0009440957468012632, + "loss": 1.4096, + "step": 1651 + }, + { + "epoch": 0.18, + "grad_norm": 0.09352779878047505, + "learning_rate": 0.0009440157221797692, + "loss": 1.4714, + "step": 1652 + }, + { + "epoch": 0.18, + "grad_norm": 0.10486400709151468, + "learning_rate": 0.0009439356437198379, + "loss": 1.4014, + "step": 1653 + }, + { + "epoch": 0.18, + "grad_norm": 0.08752718991568993, + "learning_rate": 0.0009438555114311795, + "loss": 1.3704, + "step": 1654 + }, + { + "epoch": 0.18, + "grad_norm": 0.11686969419920402, + "learning_rate": 0.00094377532532351, + "loss": 1.3982, + "step": 1655 + }, + { + "epoch": 0.18, + "grad_norm": 0.0973101634579299, + "learning_rate": 0.0009436950854065524, + "loss": 1.5212, + "step": 1656 + }, + { + "epoch": 0.18, + "grad_norm": 0.09899858128911071, + "learning_rate": 0.0009436147916900361, + "loss": 1.5066, + "step": 1657 + }, + { + "epoch": 0.18, + "grad_norm": 0.11649096433856834, + "learning_rate": 0.0009435344441836969, + "loss": 1.4407, + "step": 1658 + }, + { + "epoch": 0.18, + "grad_norm": 0.0953453538897908, + "learning_rate": 0.0009434540428972772, + "loss": 1.4545, + "step": 1659 + }, + { + "epoch": 0.18, + "grad_norm": 0.10133785841493663, + "learning_rate": 0.0009433735878405261, + "loss": 1.4837, + "step": 1660 + }, + { + "epoch": 0.18, + "grad_norm": 0.10015520209464993, + "learning_rate": 0.0009432930790231988, + "loss": 1.5321, + "step": 1661 + }, + { + "epoch": 0.18, + "grad_norm": 0.08948532164322566, + "learning_rate": 0.0009432125164550576, + "loss": 1.4165, + "step": 1662 + }, + { + "epoch": 0.18, + "grad_norm": 0.09658583989362529, + "learning_rate": 0.0009431319001458704, + "loss": 1.4593, + "step": 1663 + }, + { + "epoch": 0.18, + "grad_norm": 0.10145569880361127, + "learning_rate": 0.000943051230105413, + "loss": 1.426, + "step": 1664 + }, + { + "epoch": 0.18, + "grad_norm": 0.09816258060491967, + "learning_rate": 0.0009429705063434664, + "loss": 1.4223, + "step": 1665 + }, + { + "epoch": 0.18, + "grad_norm": 0.09250427904332187, + "learning_rate": 0.0009428897288698188, + "loss": 1.5038, + "step": 1666 + }, + { + "epoch": 0.18, + "grad_norm": 0.0853389109743307, + "learning_rate": 0.0009428088976942646, + "loss": 1.5678, + "step": 1667 + }, + { + "epoch": 0.18, + "grad_norm": 0.07695692774289016, + "learning_rate": 0.0009427280128266049, + "loss": 1.4343, + "step": 1668 + }, + { + "epoch": 0.18, + "grad_norm": 0.10008107955257982, + "learning_rate": 0.0009426470742766476, + "loss": 1.4508, + "step": 1669 + }, + { + "epoch": 0.18, + "grad_norm": 0.08818370379739952, + "learning_rate": 0.0009425660820542062, + "loss": 1.4621, + "step": 1670 + }, + { + "epoch": 0.18, + "grad_norm": 0.0852469680143305, + "learning_rate": 0.0009424850361691017, + "loss": 1.5184, + "step": 1671 + }, + { + "epoch": 0.18, + "grad_norm": 0.081114977796592, + "learning_rate": 0.0009424039366311612, + "loss": 1.4093, + "step": 1672 + }, + { + "epoch": 0.18, + "grad_norm": 0.10402276311434469, + "learning_rate": 0.0009423227834502181, + "loss": 1.5449, + "step": 1673 + }, + { + "epoch": 0.18, + "grad_norm": 0.08138128486557042, + "learning_rate": 0.0009422415766361126, + "loss": 1.4053, + "step": 1674 + }, + { + "epoch": 0.18, + "grad_norm": 0.09250871509940908, + "learning_rate": 0.0009421603161986913, + "loss": 1.4838, + "step": 1675 + }, + { + "epoch": 0.18, + "grad_norm": 0.09799638166091465, + "learning_rate": 0.0009420790021478072, + "loss": 1.3309, + "step": 1676 + }, + { + "epoch": 0.18, + "grad_norm": 0.07799924026862029, + "learning_rate": 0.00094199763449332, + "loss": 1.3402, + "step": 1677 + }, + { + "epoch": 0.18, + "grad_norm": 0.09607736488537287, + "learning_rate": 0.0009419162132450961, + "loss": 1.4766, + "step": 1678 + }, + { + "epoch": 0.18, + "grad_norm": 0.08245856386036492, + "learning_rate": 0.0009418347384130076, + "loss": 1.3557, + "step": 1679 + }, + { + "epoch": 0.18, + "grad_norm": 0.08653876091437132, + "learning_rate": 0.000941753210006934, + "loss": 1.4636, + "step": 1680 + }, + { + "epoch": 0.18, + "grad_norm": 0.09659732334971687, + "learning_rate": 0.0009416716280367606, + "loss": 1.5785, + "step": 1681 + }, + { + "epoch": 0.18, + "grad_norm": 0.09251463831839435, + "learning_rate": 0.0009415899925123795, + "loss": 1.4092, + "step": 1682 + }, + { + "epoch": 0.18, + "grad_norm": 0.0885401579842873, + "learning_rate": 0.0009415083034436895, + "loss": 1.4988, + "step": 1683 + }, + { + "epoch": 0.18, + "grad_norm": 0.08458310827754215, + "learning_rate": 0.0009414265608405956, + "loss": 1.453, + "step": 1684 + }, + { + "epoch": 0.18, + "grad_norm": 0.09539966740345308, + "learning_rate": 0.0009413447647130096, + "loss": 1.5588, + "step": 1685 + }, + { + "epoch": 0.18, + "grad_norm": 0.087828154350039, + "learning_rate": 0.0009412629150708492, + "loss": 1.4878, + "step": 1686 + }, + { + "epoch": 0.18, + "grad_norm": 0.09293970968778822, + "learning_rate": 0.0009411810119240389, + "loss": 1.4289, + "step": 1687 + }, + { + "epoch": 0.18, + "grad_norm": 0.09332098625509935, + "learning_rate": 0.00094109905528251, + "loss": 1.3876, + "step": 1688 + }, + { + "epoch": 0.18, + "grad_norm": 0.09483816513762469, + "learning_rate": 0.0009410170451562001, + "loss": 1.4632, + "step": 1689 + }, + { + "epoch": 0.18, + "grad_norm": 0.0878505960015256, + "learning_rate": 0.000940934981555053, + "loss": 1.4356, + "step": 1690 + }, + { + "epoch": 0.18, + "grad_norm": 0.10200709452803336, + "learning_rate": 0.000940852864489019, + "loss": 1.3956, + "step": 1691 + }, + { + "epoch": 0.18, + "grad_norm": 0.09522477733928604, + "learning_rate": 0.0009407706939680556, + "loss": 1.3232, + "step": 1692 + }, + { + "epoch": 0.18, + "grad_norm": 0.08411835086532109, + "learning_rate": 0.0009406884700021259, + "loss": 1.5153, + "step": 1693 + }, + { + "epoch": 0.18, + "grad_norm": 0.09014720566437971, + "learning_rate": 0.0009406061926012, + "loss": 1.546, + "step": 1694 + }, + { + "epoch": 0.18, + "grad_norm": 0.07605071855611331, + "learning_rate": 0.0009405238617752543, + "loss": 1.4299, + "step": 1695 + }, + { + "epoch": 0.18, + "grad_norm": 0.080623963055023, + "learning_rate": 0.0009404414775342715, + "loss": 1.4253, + "step": 1696 + }, + { + "epoch": 0.18, + "grad_norm": 0.09427266407825832, + "learning_rate": 0.0009403590398882411, + "loss": 1.4959, + "step": 1697 + }, + { + "epoch": 0.18, + "grad_norm": 0.07474659358381536, + "learning_rate": 0.0009402765488471591, + "loss": 1.3768, + "step": 1698 + }, + { + "epoch": 0.18, + "grad_norm": 0.09176750550282939, + "learning_rate": 0.0009401940044210276, + "loss": 1.4923, + "step": 1699 + }, + { + "epoch": 0.18, + "grad_norm": 0.10027187119937282, + "learning_rate": 0.0009401114066198555, + "loss": 1.4703, + "step": 1700 + }, + { + "epoch": 0.18, + "grad_norm": 0.09729007735305473, + "learning_rate": 0.000940028755453658, + "loss": 1.4746, + "step": 1701 + }, + { + "epoch": 0.18, + "grad_norm": 0.08153229489393464, + "learning_rate": 0.000939946050932457, + "loss": 1.4445, + "step": 1702 + }, + { + "epoch": 0.18, + "grad_norm": 0.08757343940935985, + "learning_rate": 0.0009398632930662805, + "loss": 1.5132, + "step": 1703 + }, + { + "epoch": 0.18, + "grad_norm": 0.08901895804515486, + "learning_rate": 0.0009397804818651634, + "loss": 1.2978, + "step": 1704 + }, + { + "epoch": 0.18, + "grad_norm": 0.10619612933936191, + "learning_rate": 0.0009396976173391466, + "loss": 1.507, + "step": 1705 + }, + { + "epoch": 0.18, + "grad_norm": 0.0917857472797286, + "learning_rate": 0.000939614699498278, + "loss": 1.4033, + "step": 1706 + }, + { + "epoch": 0.18, + "grad_norm": 0.1134247857054161, + "learning_rate": 0.0009395317283526113, + "loss": 1.5523, + "step": 1707 + }, + { + "epoch": 0.18, + "grad_norm": 0.09581787178579383, + "learning_rate": 0.0009394487039122072, + "loss": 1.4631, + "step": 1708 + }, + { + "epoch": 0.18, + "grad_norm": 0.09259611299053887, + "learning_rate": 0.0009393656261871328, + "loss": 1.4405, + "step": 1709 + }, + { + "epoch": 0.18, + "grad_norm": 0.09297130715435674, + "learning_rate": 0.0009392824951874617, + "loss": 1.6184, + "step": 1710 + }, + { + "epoch": 0.18, + "grad_norm": 0.08872509563911614, + "learning_rate": 0.0009391993109232735, + "loss": 1.4219, + "step": 1711 + }, + { + "epoch": 0.18, + "grad_norm": 0.08836517965035444, + "learning_rate": 0.0009391160734046547, + "loss": 1.5004, + "step": 1712 + }, + { + "epoch": 0.18, + "grad_norm": 0.09390231142694527, + "learning_rate": 0.000939032782641698, + "loss": 1.4052, + "step": 1713 + }, + { + "epoch": 0.18, + "grad_norm": 0.09998203205571347, + "learning_rate": 0.000938949438644503, + "loss": 1.4534, + "step": 1714 + }, + { + "epoch": 0.18, + "grad_norm": 0.10024244141659862, + "learning_rate": 0.0009388660414231751, + "loss": 1.597, + "step": 1715 + }, + { + "epoch": 0.18, + "grad_norm": 0.09070174363207298, + "learning_rate": 0.0009387825909878269, + "loss": 1.431, + "step": 1716 + }, + { + "epoch": 0.18, + "grad_norm": 0.09080214985361508, + "learning_rate": 0.0009386990873485767, + "loss": 1.4065, + "step": 1717 + }, + { + "epoch": 0.18, + "grad_norm": 0.1004759223908107, + "learning_rate": 0.0009386155305155497, + "loss": 1.4715, + "step": 1718 + }, + { + "epoch": 0.18, + "grad_norm": 0.09826529128818194, + "learning_rate": 0.0009385319204988776, + "loss": 1.436, + "step": 1719 + }, + { + "epoch": 0.18, + "grad_norm": 0.09212018460786485, + "learning_rate": 0.0009384482573086983, + "loss": 1.4487, + "step": 1720 + }, + { + "epoch": 0.19, + "grad_norm": 0.09053519573952286, + "learning_rate": 0.000938364540955156, + "loss": 1.6079, + "step": 1721 + }, + { + "epoch": 0.19, + "grad_norm": 0.09161427273443738, + "learning_rate": 0.0009382807714484021, + "loss": 1.5146, + "step": 1722 + }, + { + "epoch": 0.19, + "grad_norm": 0.09193193868164781, + "learning_rate": 0.0009381969487985935, + "loss": 1.4547, + "step": 1723 + }, + { + "epoch": 0.19, + "grad_norm": 0.08852240861982075, + "learning_rate": 0.0009381130730158943, + "loss": 1.5276, + "step": 1724 + }, + { + "epoch": 0.19, + "grad_norm": 0.0914904802379876, + "learning_rate": 0.0009380291441104747, + "loss": 1.6143, + "step": 1725 + }, + { + "epoch": 0.19, + "grad_norm": 0.08260988756714899, + "learning_rate": 0.000937945162092511, + "loss": 1.4669, + "step": 1726 + }, + { + "epoch": 0.19, + "grad_norm": 0.10181657253275951, + "learning_rate": 0.0009378611269721866, + "loss": 1.4018, + "step": 1727 + }, + { + "epoch": 0.19, + "grad_norm": 0.09565750562612672, + "learning_rate": 0.0009377770387596911, + "loss": 1.5622, + "step": 1728 + }, + { + "epoch": 0.19, + "grad_norm": 0.08427791252695747, + "learning_rate": 0.0009376928974652205, + "loss": 1.494, + "step": 1729 + }, + { + "epoch": 0.19, + "grad_norm": 0.08475458912711241, + "learning_rate": 0.0009376087030989771, + "loss": 1.4584, + "step": 1730 + }, + { + "epoch": 0.19, + "grad_norm": 0.09136780435680898, + "learning_rate": 0.0009375244556711695, + "loss": 1.4727, + "step": 1731 + }, + { + "epoch": 0.19, + "grad_norm": 0.08392899372822892, + "learning_rate": 0.0009374401551920135, + "loss": 1.4106, + "step": 1732 + }, + { + "epoch": 0.19, + "grad_norm": 0.07494760250079167, + "learning_rate": 0.0009373558016717306, + "loss": 1.3448, + "step": 1733 + }, + { + "epoch": 0.19, + "grad_norm": 0.10183183783607534, + "learning_rate": 0.000937271395120549, + "loss": 1.5347, + "step": 1734 + }, + { + "epoch": 0.19, + "grad_norm": 0.09490611937533996, + "learning_rate": 0.0009371869355487031, + "loss": 1.5137, + "step": 1735 + }, + { + "epoch": 0.19, + "grad_norm": 0.09095625419563282, + "learning_rate": 0.0009371024229664341, + "loss": 1.5858, + "step": 1736 + }, + { + "epoch": 0.19, + "grad_norm": 0.09897520105592522, + "learning_rate": 0.0009370178573839894, + "loss": 1.502, + "step": 1737 + }, + { + "epoch": 0.19, + "grad_norm": 0.08026803716653642, + "learning_rate": 0.000936933238811623, + "loss": 1.3948, + "step": 1738 + }, + { + "epoch": 0.19, + "grad_norm": 0.08326463099403736, + "learning_rate": 0.0009368485672595948, + "loss": 1.3566, + "step": 1739 + }, + { + "epoch": 0.19, + "grad_norm": 0.08141481156595314, + "learning_rate": 0.000936763842738172, + "loss": 1.4374, + "step": 1740 + }, + { + "epoch": 0.19, + "grad_norm": 0.0986349039516981, + "learning_rate": 0.0009366790652576274, + "loss": 1.5395, + "step": 1741 + }, + { + "epoch": 0.19, + "grad_norm": 0.10291774398160058, + "learning_rate": 0.0009365942348282405, + "loss": 1.5838, + "step": 1742 + }, + { + "epoch": 0.19, + "grad_norm": 0.10217497765301305, + "learning_rate": 0.0009365093514602978, + "loss": 1.5123, + "step": 1743 + }, + { + "epoch": 0.19, + "grad_norm": 0.08112289391270511, + "learning_rate": 0.0009364244151640913, + "loss": 1.4214, + "step": 1744 + }, + { + "epoch": 0.19, + "grad_norm": 0.08252851448595519, + "learning_rate": 0.0009363394259499197, + "loss": 1.5329, + "step": 1745 + }, + { + "epoch": 0.19, + "grad_norm": 0.10139278527828859, + "learning_rate": 0.0009362543838280884, + "loss": 1.5226, + "step": 1746 + }, + { + "epoch": 0.19, + "grad_norm": 0.09091939187498209, + "learning_rate": 0.0009361692888089092, + "loss": 1.5196, + "step": 1747 + }, + { + "epoch": 0.19, + "grad_norm": 0.08879705035046179, + "learning_rate": 0.0009360841409027001, + "loss": 1.3142, + "step": 1748 + }, + { + "epoch": 0.19, + "grad_norm": 0.1033063834090903, + "learning_rate": 0.0009359989401197852, + "loss": 1.4918, + "step": 1749 + }, + { + "epoch": 0.19, + "grad_norm": 0.08250221798770896, + "learning_rate": 0.000935913686470496, + "loss": 1.4754, + "step": 1750 + }, + { + "epoch": 0.19, + "grad_norm": 0.09952553090757268, + "learning_rate": 0.0009358283799651694, + "loss": 1.5277, + "step": 1751 + }, + { + "epoch": 0.19, + "grad_norm": 0.08770051236655776, + "learning_rate": 0.0009357430206141492, + "loss": 1.5348, + "step": 1752 + }, + { + "epoch": 0.19, + "grad_norm": 0.08456069157036568, + "learning_rate": 0.0009356576084277855, + "loss": 1.594, + "step": 1753 + }, + { + "epoch": 0.19, + "grad_norm": 0.08688679813984553, + "learning_rate": 0.0009355721434164348, + "loss": 1.4007, + "step": 1754 + }, + { + "epoch": 0.19, + "grad_norm": 0.08459334076466057, + "learning_rate": 0.0009354866255904602, + "loss": 1.3808, + "step": 1755 + }, + { + "epoch": 0.19, + "grad_norm": 0.0750770133707275, + "learning_rate": 0.0009354010549602308, + "loss": 1.4116, + "step": 1756 + }, + { + "epoch": 0.19, + "grad_norm": 0.0810999858623209, + "learning_rate": 0.0009353154315361223, + "loss": 1.392, + "step": 1757 + }, + { + "epoch": 0.19, + "grad_norm": 0.07561264712057969, + "learning_rate": 0.0009352297553285172, + "loss": 1.5725, + "step": 1758 + }, + { + "epoch": 0.19, + "grad_norm": 0.07689189356736299, + "learning_rate": 0.0009351440263478036, + "loss": 1.3871, + "step": 1759 + }, + { + "epoch": 0.19, + "grad_norm": 0.07656862328089326, + "learning_rate": 0.0009350582446043767, + "loss": 1.4561, + "step": 1760 + }, + { + "epoch": 0.19, + "grad_norm": 0.07421858530277249, + "learning_rate": 0.0009349724101086379, + "loss": 1.3506, + "step": 1761 + }, + { + "epoch": 0.19, + "grad_norm": 0.09125757959767364, + "learning_rate": 0.0009348865228709947, + "loss": 1.3664, + "step": 1762 + }, + { + "epoch": 0.19, + "grad_norm": 0.08458607011462088, + "learning_rate": 0.0009348005829018612, + "loss": 1.3518, + "step": 1763 + }, + { + "epoch": 0.19, + "grad_norm": 0.07846569043882619, + "learning_rate": 0.0009347145902116582, + "loss": 1.4436, + "step": 1764 + }, + { + "epoch": 0.19, + "grad_norm": 0.07880792008907359, + "learning_rate": 0.0009346285448108124, + "loss": 1.3728, + "step": 1765 + }, + { + "epoch": 0.19, + "grad_norm": 0.08379065827759669, + "learning_rate": 0.0009345424467097572, + "loss": 1.4667, + "step": 1766 + }, + { + "epoch": 0.19, + "grad_norm": 0.07785121808069187, + "learning_rate": 0.0009344562959189321, + "loss": 1.4439, + "step": 1767 + }, + { + "epoch": 0.19, + "grad_norm": 0.07867477807500672, + "learning_rate": 0.0009343700924487835, + "loss": 1.4389, + "step": 1768 + }, + { + "epoch": 0.19, + "grad_norm": 0.0823946804001334, + "learning_rate": 0.0009342838363097635, + "loss": 1.5018, + "step": 1769 + }, + { + "epoch": 0.19, + "grad_norm": 0.08063914021381803, + "learning_rate": 0.0009341975275123313, + "loss": 1.406, + "step": 1770 + }, + { + "epoch": 0.19, + "grad_norm": 0.08547202214736771, + "learning_rate": 0.0009341111660669519, + "loss": 1.328, + "step": 1771 + }, + { + "epoch": 0.19, + "grad_norm": 0.10723639210392595, + "learning_rate": 0.0009340247519840969, + "loss": 1.5986, + "step": 1772 + }, + { + "epoch": 0.19, + "grad_norm": 0.08744751839005566, + "learning_rate": 0.0009339382852742446, + "loss": 1.4943, + "step": 1773 + }, + { + "epoch": 0.19, + "grad_norm": 0.0947836846245832, + "learning_rate": 0.0009338517659478791, + "loss": 1.4841, + "step": 1774 + }, + { + "epoch": 0.19, + "grad_norm": 0.07957752401569529, + "learning_rate": 0.0009337651940154914, + "loss": 1.415, + "step": 1775 + }, + { + "epoch": 0.19, + "grad_norm": 0.09486280553870081, + "learning_rate": 0.0009336785694875785, + "loss": 1.4312, + "step": 1776 + }, + { + "epoch": 0.19, + "grad_norm": 0.0946131450018491, + "learning_rate": 0.0009335918923746438, + "loss": 1.4232, + "step": 1777 + }, + { + "epoch": 0.19, + "grad_norm": 0.0869396458702221, + "learning_rate": 0.0009335051626871973, + "loss": 1.416, + "step": 1778 + }, + { + "epoch": 0.19, + "grad_norm": 0.09052244814090689, + "learning_rate": 0.0009334183804357555, + "loss": 1.4041, + "step": 1779 + }, + { + "epoch": 0.19, + "grad_norm": 0.11469431378434027, + "learning_rate": 0.0009333315456308407, + "loss": 1.4579, + "step": 1780 + }, + { + "epoch": 0.19, + "grad_norm": 0.0906512085860017, + "learning_rate": 0.0009332446582829821, + "loss": 1.5208, + "step": 1781 + }, + { + "epoch": 0.19, + "grad_norm": 0.09416622905854574, + "learning_rate": 0.0009331577184027149, + "loss": 1.3891, + "step": 1782 + }, + { + "epoch": 0.19, + "grad_norm": 0.09651543652014465, + "learning_rate": 0.0009330707260005813, + "loss": 1.6078, + "step": 1783 + }, + { + "epoch": 0.19, + "grad_norm": 0.09579404336719885, + "learning_rate": 0.000932983681087129, + "loss": 1.52, + "step": 1784 + }, + { + "epoch": 0.19, + "grad_norm": 0.09205179631972425, + "learning_rate": 0.0009328965836729127, + "loss": 1.4295, + "step": 1785 + }, + { + "epoch": 0.19, + "grad_norm": 0.10099213256917071, + "learning_rate": 0.0009328094337684932, + "loss": 1.5516, + "step": 1786 + }, + { + "epoch": 0.19, + "grad_norm": 0.08243984686364722, + "learning_rate": 0.0009327222313844376, + "loss": 1.3988, + "step": 1787 + }, + { + "epoch": 0.19, + "grad_norm": 0.10671922892355516, + "learning_rate": 0.0009326349765313199, + "loss": 1.3028, + "step": 1788 + }, + { + "epoch": 0.19, + "grad_norm": 0.10306132239894458, + "learning_rate": 0.0009325476692197197, + "loss": 1.4138, + "step": 1789 + }, + { + "epoch": 0.19, + "grad_norm": 0.09456390248444875, + "learning_rate": 0.0009324603094602232, + "loss": 1.5466, + "step": 1790 + }, + { + "epoch": 0.19, + "grad_norm": 0.08567679617373725, + "learning_rate": 0.0009323728972634234, + "loss": 1.4125, + "step": 1791 + }, + { + "epoch": 0.19, + "grad_norm": 0.0866261102864471, + "learning_rate": 0.0009322854326399192, + "loss": 1.3749, + "step": 1792 + }, + { + "epoch": 0.19, + "grad_norm": 0.10008267779048576, + "learning_rate": 0.000932197915600316, + "loss": 1.4912, + "step": 1793 + }, + { + "epoch": 0.19, + "grad_norm": 0.09628776139314607, + "learning_rate": 0.0009321103461552254, + "loss": 1.5609, + "step": 1794 + }, + { + "epoch": 0.19, + "grad_norm": 0.09887887458347153, + "learning_rate": 0.0009320227243152657, + "loss": 1.5033, + "step": 1795 + }, + { + "epoch": 0.19, + "grad_norm": 0.08765493554091913, + "learning_rate": 0.0009319350500910612, + "loss": 1.4827, + "step": 1796 + }, + { + "epoch": 0.19, + "grad_norm": 0.09325239873922055, + "learning_rate": 0.0009318473234932428, + "loss": 1.3376, + "step": 1797 + }, + { + "epoch": 0.19, + "grad_norm": 0.08425381736541987, + "learning_rate": 0.0009317595445324476, + "loss": 1.4944, + "step": 1798 + }, + { + "epoch": 0.19, + "grad_norm": 0.0966079305940004, + "learning_rate": 0.0009316717132193192, + "loss": 1.5316, + "step": 1799 + }, + { + "epoch": 0.19, + "grad_norm": 0.08757180961711196, + "learning_rate": 0.0009315838295645074, + "loss": 1.4882, + "step": 1800 + }, + { + "epoch": 0.19, + "grad_norm": 0.08662799707706155, + "learning_rate": 0.0009314958935786683, + "loss": 1.46, + "step": 1801 + }, + { + "epoch": 0.19, + "grad_norm": 0.08311178903942941, + "learning_rate": 0.0009314079052724644, + "loss": 1.574, + "step": 1802 + }, + { + "epoch": 0.19, + "grad_norm": 0.09580809932720677, + "learning_rate": 0.0009313198646565648, + "loss": 1.4954, + "step": 1803 + }, + { + "epoch": 0.19, + "grad_norm": 0.09515422211680134, + "learning_rate": 0.0009312317717416448, + "loss": 1.4624, + "step": 1804 + }, + { + "epoch": 0.19, + "grad_norm": 0.09528198931281466, + "learning_rate": 0.0009311436265383856, + "loss": 1.5618, + "step": 1805 + }, + { + "epoch": 0.19, + "grad_norm": 0.1036689450583713, + "learning_rate": 0.0009310554290574753, + "loss": 1.4556, + "step": 1806 + }, + { + "epoch": 0.19, + "grad_norm": 0.08480419301117718, + "learning_rate": 0.0009309671793096082, + "loss": 1.4717, + "step": 1807 + }, + { + "epoch": 0.19, + "grad_norm": 0.08133107911744367, + "learning_rate": 0.0009308788773054848, + "loss": 1.3288, + "step": 1808 + }, + { + "epoch": 0.19, + "grad_norm": 0.08016797148504663, + "learning_rate": 0.0009307905230558121, + "loss": 1.5397, + "step": 1809 + }, + { + "epoch": 0.19, + "grad_norm": 0.0846677701501156, + "learning_rate": 0.0009307021165713033, + "loss": 1.4324, + "step": 1810 + }, + { + "epoch": 0.19, + "grad_norm": 0.07570819301145869, + "learning_rate": 0.000930613657862678, + "loss": 1.5013, + "step": 1811 + }, + { + "epoch": 0.19, + "grad_norm": 0.09220020986485801, + "learning_rate": 0.0009305251469406622, + "loss": 1.3811, + "step": 1812 + }, + { + "epoch": 0.19, + "grad_norm": 0.08711057852439665, + "learning_rate": 0.000930436583815988, + "loss": 1.5304, + "step": 1813 + }, + { + "epoch": 0.2, + "grad_norm": 0.0801584114606825, + "learning_rate": 0.0009303479684993942, + "loss": 1.4914, + "step": 1814 + }, + { + "epoch": 0.2, + "grad_norm": 0.08411107652644935, + "learning_rate": 0.0009302593010016254, + "loss": 1.309, + "step": 1815 + }, + { + "epoch": 0.2, + "grad_norm": 0.08740716260274044, + "learning_rate": 0.0009301705813334331, + "loss": 1.4615, + "step": 1816 + }, + { + "epoch": 0.2, + "grad_norm": 0.08826543969180319, + "learning_rate": 0.000930081809505575, + "loss": 1.3672, + "step": 1817 + }, + { + "epoch": 0.2, + "grad_norm": 0.08663114298057124, + "learning_rate": 0.0009299929855288145, + "loss": 1.3388, + "step": 1818 + }, + { + "epoch": 0.2, + "grad_norm": 0.09701128888327752, + "learning_rate": 0.0009299041094139222, + "loss": 1.5865, + "step": 1819 + }, + { + "epoch": 0.2, + "grad_norm": 0.07635149578067951, + "learning_rate": 0.0009298151811716744, + "loss": 1.5124, + "step": 1820 + }, + { + "epoch": 0.2, + "grad_norm": 0.08728723564923248, + "learning_rate": 0.0009297262008128543, + "loss": 1.5027, + "step": 1821 + }, + { + "epoch": 0.2, + "grad_norm": 0.11155604154795036, + "learning_rate": 0.0009296371683482508, + "loss": 1.3568, + "step": 1822 + }, + { + "epoch": 0.2, + "grad_norm": 0.07719213570012251, + "learning_rate": 0.0009295480837886594, + "loss": 1.4425, + "step": 1823 + }, + { + "epoch": 0.2, + "grad_norm": 0.0911625047888657, + "learning_rate": 0.0009294589471448819, + "loss": 1.4705, + "step": 1824 + }, + { + "epoch": 0.2, + "grad_norm": 0.09613111465408736, + "learning_rate": 0.0009293697584277265, + "loss": 1.4979, + "step": 1825 + }, + { + "epoch": 0.2, + "grad_norm": 0.08512139925092915, + "learning_rate": 0.0009292805176480077, + "loss": 1.3785, + "step": 1826 + }, + { + "epoch": 0.2, + "grad_norm": 0.09262471827262757, + "learning_rate": 0.0009291912248165461, + "loss": 1.4475, + "step": 1827 + }, + { + "epoch": 0.2, + "grad_norm": 0.09367459711998498, + "learning_rate": 0.0009291018799441691, + "loss": 1.4571, + "step": 1828 + }, + { + "epoch": 0.2, + "grad_norm": 0.09291744975591652, + "learning_rate": 0.0009290124830417096, + "loss": 1.6161, + "step": 1829 + }, + { + "epoch": 0.2, + "grad_norm": 0.091514229624702, + "learning_rate": 0.0009289230341200075, + "loss": 1.4075, + "step": 1830 + }, + { + "epoch": 0.2, + "grad_norm": 0.09264404283002275, + "learning_rate": 0.0009288335331899088, + "loss": 1.4796, + "step": 1831 + }, + { + "epoch": 0.2, + "grad_norm": 0.09373072460820865, + "learning_rate": 0.0009287439802622659, + "loss": 1.4221, + "step": 1832 + }, + { + "epoch": 0.2, + "grad_norm": 0.09399036796231679, + "learning_rate": 0.0009286543753479372, + "loss": 1.3825, + "step": 1833 + }, + { + "epoch": 0.2, + "grad_norm": 0.08289694956094026, + "learning_rate": 0.0009285647184577877, + "loss": 1.477, + "step": 1834 + }, + { + "epoch": 0.2, + "grad_norm": 0.08546486711268557, + "learning_rate": 0.0009284750096026887, + "loss": 1.3694, + "step": 1835 + }, + { + "epoch": 0.2, + "grad_norm": 0.09047798016590947, + "learning_rate": 0.0009283852487935174, + "loss": 1.4847, + "step": 1836 + }, + { + "epoch": 0.2, + "grad_norm": 0.10009191077886122, + "learning_rate": 0.0009282954360411577, + "loss": 1.4234, + "step": 1837 + }, + { + "epoch": 0.2, + "grad_norm": 0.0841123623143838, + "learning_rate": 0.0009282055713565001, + "loss": 1.5119, + "step": 1838 + }, + { + "epoch": 0.2, + "grad_norm": 0.0915824218936105, + "learning_rate": 0.0009281156547504408, + "loss": 1.4167, + "step": 1839 + }, + { + "epoch": 0.2, + "grad_norm": 0.08648630759897831, + "learning_rate": 0.0009280256862338822, + "loss": 1.3795, + "step": 1840 + }, + { + "epoch": 0.2, + "grad_norm": 0.08526753456498552, + "learning_rate": 0.0009279356658177336, + "loss": 1.3847, + "step": 1841 + }, + { + "epoch": 0.2, + "grad_norm": 0.09626763852163273, + "learning_rate": 0.0009278455935129102, + "loss": 1.5616, + "step": 1842 + }, + { + "epoch": 0.2, + "grad_norm": 0.08816486594899263, + "learning_rate": 0.0009277554693303337, + "loss": 1.53, + "step": 1843 + }, + { + "epoch": 0.2, + "grad_norm": 0.09226706367835856, + "learning_rate": 0.0009276652932809315, + "loss": 1.4491, + "step": 1844 + }, + { + "epoch": 0.2, + "grad_norm": 0.10737839815995892, + "learning_rate": 0.0009275750653756384, + "loss": 1.3005, + "step": 1845 + }, + { + "epoch": 0.2, + "grad_norm": 0.0827159786371698, + "learning_rate": 0.0009274847856253945, + "loss": 1.3935, + "step": 1846 + }, + { + "epoch": 0.2, + "grad_norm": 0.08930362176980898, + "learning_rate": 0.0009273944540411465, + "loss": 1.4578, + "step": 1847 + }, + { + "epoch": 0.2, + "grad_norm": 0.08293126307134922, + "learning_rate": 0.0009273040706338476, + "loss": 1.4298, + "step": 1848 + }, + { + "epoch": 0.2, + "grad_norm": 0.09167982308662573, + "learning_rate": 0.0009272136354144569, + "loss": 1.4165, + "step": 1849 + }, + { + "epoch": 0.2, + "grad_norm": 0.08142752813431287, + "learning_rate": 0.0009271231483939402, + "loss": 1.413, + "step": 1850 + }, + { + "epoch": 0.2, + "grad_norm": 0.08088682067099412, + "learning_rate": 0.0009270326095832691, + "loss": 1.5194, + "step": 1851 + }, + { + "epoch": 0.2, + "grad_norm": 0.080201882006416, + "learning_rate": 0.0009269420189934219, + "loss": 1.5184, + "step": 1852 + }, + { + "epoch": 0.2, + "grad_norm": 0.08617551581904413, + "learning_rate": 0.000926851376635383, + "loss": 1.3672, + "step": 1853 + }, + { + "epoch": 0.2, + "grad_norm": 0.08943700327583678, + "learning_rate": 0.0009267606825201433, + "loss": 1.4141, + "step": 1854 + }, + { + "epoch": 0.2, + "grad_norm": 0.0788005263395767, + "learning_rate": 0.0009266699366586993, + "loss": 1.4555, + "step": 1855 + }, + { + "epoch": 0.2, + "grad_norm": 0.08619240998799432, + "learning_rate": 0.0009265791390620546, + "loss": 1.4135, + "step": 1856 + }, + { + "epoch": 0.2, + "grad_norm": 0.0801316495282694, + "learning_rate": 0.0009264882897412188, + "loss": 1.4703, + "step": 1857 + }, + { + "epoch": 0.2, + "grad_norm": 0.08894081393129301, + "learning_rate": 0.0009263973887072074, + "loss": 1.5404, + "step": 1858 + }, + { + "epoch": 0.2, + "grad_norm": 0.08415426907162747, + "learning_rate": 0.0009263064359710427, + "loss": 1.3856, + "step": 1859 + }, + { + "epoch": 0.2, + "grad_norm": 0.08494026519042301, + "learning_rate": 0.0009262154315437528, + "loss": 1.3861, + "step": 1860 + }, + { + "epoch": 0.2, + "grad_norm": 0.08459923609704402, + "learning_rate": 0.0009261243754363725, + "loss": 1.4656, + "step": 1861 + }, + { + "epoch": 0.2, + "grad_norm": 0.08646099536983778, + "learning_rate": 0.0009260332676599425, + "loss": 1.4025, + "step": 1862 + }, + { + "epoch": 0.2, + "grad_norm": 0.0833393446485166, + "learning_rate": 0.0009259421082255103, + "loss": 1.313, + "step": 1863 + }, + { + "epoch": 0.2, + "grad_norm": 0.10105293160333415, + "learning_rate": 0.0009258508971441288, + "loss": 1.4142, + "step": 1864 + }, + { + "epoch": 0.2, + "grad_norm": 0.07753588468155745, + "learning_rate": 0.0009257596344268579, + "loss": 1.4151, + "step": 1865 + }, + { + "epoch": 0.2, + "grad_norm": 0.07843557406004462, + "learning_rate": 0.0009256683200847637, + "loss": 1.5059, + "step": 1866 + }, + { + "epoch": 0.2, + "grad_norm": 0.09283600002740316, + "learning_rate": 0.000925576954128918, + "loss": 1.4546, + "step": 1867 + }, + { + "epoch": 0.2, + "grad_norm": 0.08221451122832747, + "learning_rate": 0.0009254855365703995, + "loss": 1.4822, + "step": 1868 + }, + { + "epoch": 0.2, + "grad_norm": 0.09696442404894645, + "learning_rate": 0.0009253940674202929, + "loss": 1.5163, + "step": 1869 + }, + { + "epoch": 0.2, + "grad_norm": 0.09377079226474856, + "learning_rate": 0.000925302546689689, + "loss": 1.5331, + "step": 1870 + }, + { + "epoch": 0.2, + "grad_norm": 0.08495042450853789, + "learning_rate": 0.000925210974389685, + "loss": 1.4877, + "step": 1871 + }, + { + "epoch": 0.2, + "grad_norm": 0.09270980057823298, + "learning_rate": 0.0009251193505313844, + "loss": 1.4065, + "step": 1872 + }, + { + "epoch": 0.2, + "grad_norm": 0.10013493564774836, + "learning_rate": 0.0009250276751258972, + "loss": 1.3734, + "step": 1873 + }, + { + "epoch": 0.2, + "grad_norm": 0.08111322828236464, + "learning_rate": 0.0009249359481843389, + "loss": 1.4621, + "step": 1874 + }, + { + "epoch": 0.2, + "grad_norm": 0.095044532703512, + "learning_rate": 0.0009248441697178318, + "loss": 1.3761, + "step": 1875 + }, + { + "epoch": 0.2, + "grad_norm": 0.09643327975927989, + "learning_rate": 0.0009247523397375047, + "loss": 1.5059, + "step": 1876 + }, + { + "epoch": 0.2, + "grad_norm": 0.09213762752747705, + "learning_rate": 0.000924660458254492, + "loss": 1.4399, + "step": 1877 + }, + { + "epoch": 0.2, + "grad_norm": 0.096760198515979, + "learning_rate": 0.0009245685252799345, + "loss": 1.4854, + "step": 1878 + }, + { + "epoch": 0.2, + "grad_norm": 0.08833643778084667, + "learning_rate": 0.0009244765408249798, + "loss": 1.3744, + "step": 1879 + }, + { + "epoch": 0.2, + "grad_norm": 0.10338132621574833, + "learning_rate": 0.0009243845049007811, + "loss": 1.6115, + "step": 1880 + }, + { + "epoch": 0.2, + "grad_norm": 0.09608860137761568, + "learning_rate": 0.0009242924175184981, + "loss": 1.4643, + "step": 1881 + }, + { + "epoch": 0.2, + "grad_norm": 0.09499464003051669, + "learning_rate": 0.0009242002786892967, + "loss": 1.4931, + "step": 1882 + }, + { + "epoch": 0.2, + "grad_norm": 0.10553838276741273, + "learning_rate": 0.0009241080884243491, + "loss": 1.4613, + "step": 1883 + }, + { + "epoch": 0.2, + "grad_norm": 0.08971818800379484, + "learning_rate": 0.0009240158467348337, + "loss": 1.3961, + "step": 1884 + }, + { + "epoch": 0.2, + "grad_norm": 0.09005172515294688, + "learning_rate": 0.000923923553631935, + "loss": 1.3789, + "step": 1885 + }, + { + "epoch": 0.2, + "grad_norm": 0.09165971227928925, + "learning_rate": 0.000923831209126844, + "loss": 1.5329, + "step": 1886 + }, + { + "epoch": 0.2, + "grad_norm": 0.08771485926445989, + "learning_rate": 0.0009237388132307576, + "loss": 1.3577, + "step": 1887 + }, + { + "epoch": 0.2, + "grad_norm": 0.10489309578393058, + "learning_rate": 0.0009236463659548793, + "loss": 1.4723, + "step": 1888 + }, + { + "epoch": 0.2, + "grad_norm": 0.08272777189492383, + "learning_rate": 0.0009235538673104187, + "loss": 1.3486, + "step": 1889 + }, + { + "epoch": 0.2, + "grad_norm": 0.08734761554544256, + "learning_rate": 0.0009234613173085913, + "loss": 1.3613, + "step": 1890 + }, + { + "epoch": 0.2, + "grad_norm": 0.0913984053355163, + "learning_rate": 0.0009233687159606194, + "loss": 1.3321, + "step": 1891 + }, + { + "epoch": 0.2, + "grad_norm": 0.08114508390947732, + "learning_rate": 0.000923276063277731, + "loss": 1.5438, + "step": 1892 + }, + { + "epoch": 0.2, + "grad_norm": 0.08283596903978362, + "learning_rate": 0.0009231833592711609, + "loss": 1.5123, + "step": 1893 + }, + { + "epoch": 0.2, + "grad_norm": 0.08104943206708809, + "learning_rate": 0.0009230906039521494, + "loss": 1.4906, + "step": 1894 + }, + { + "epoch": 0.2, + "grad_norm": 0.0813732064731777, + "learning_rate": 0.0009229977973319436, + "loss": 1.5399, + "step": 1895 + }, + { + "epoch": 0.2, + "grad_norm": 0.07189004509962865, + "learning_rate": 0.0009229049394217965, + "loss": 1.4665, + "step": 1896 + }, + { + "epoch": 0.2, + "grad_norm": 0.1086799985754889, + "learning_rate": 0.0009228120302329677, + "loss": 1.3363, + "step": 1897 + }, + { + "epoch": 0.2, + "grad_norm": 0.08013314880463421, + "learning_rate": 0.0009227190697767224, + "loss": 1.5356, + "step": 1898 + }, + { + "epoch": 0.2, + "grad_norm": 0.08018669322491434, + "learning_rate": 0.0009226260580643326, + "loss": 1.5013, + "step": 1899 + }, + { + "epoch": 0.2, + "grad_norm": 0.09553634333447207, + "learning_rate": 0.0009225329951070763, + "loss": 1.4245, + "step": 1900 + }, + { + "epoch": 0.2, + "grad_norm": 0.09467093564785904, + "learning_rate": 0.0009224398809162376, + "loss": 1.4783, + "step": 1901 + }, + { + "epoch": 0.2, + "grad_norm": 0.08422828895427777, + "learning_rate": 0.0009223467155031068, + "loss": 1.4271, + "step": 1902 + }, + { + "epoch": 0.2, + "grad_norm": 0.0876539354690251, + "learning_rate": 0.000922253498878981, + "loss": 1.2882, + "step": 1903 + }, + { + "epoch": 0.2, + "grad_norm": 0.09326919056385669, + "learning_rate": 0.0009221602310551625, + "loss": 1.4323, + "step": 1904 + }, + { + "epoch": 0.2, + "grad_norm": 0.093287013902574, + "learning_rate": 0.0009220669120429608, + "loss": 1.476, + "step": 1905 + }, + { + "epoch": 0.2, + "grad_norm": 0.0839791046751122, + "learning_rate": 0.0009219735418536908, + "loss": 1.5382, + "step": 1906 + }, + { + "epoch": 0.2, + "grad_norm": 0.0969289999285875, + "learning_rate": 0.000921880120498674, + "loss": 1.4541, + "step": 1907 + }, + { + "epoch": 0.21, + "grad_norm": 0.10043740648290224, + "learning_rate": 0.0009217866479892383, + "loss": 1.5245, + "step": 1908 + }, + { + "epoch": 0.21, + "grad_norm": 0.09340595530802943, + "learning_rate": 0.0009216931243367173, + "loss": 1.4604, + "step": 1909 + }, + { + "epoch": 0.21, + "grad_norm": 0.09369494142636196, + "learning_rate": 0.0009215995495524512, + "loss": 1.4864, + "step": 1910 + }, + { + "epoch": 0.21, + "grad_norm": 0.09596598542059517, + "learning_rate": 0.0009215059236477863, + "loss": 1.3945, + "step": 1911 + }, + { + "epoch": 0.21, + "grad_norm": 0.09961180356748962, + "learning_rate": 0.000921412246634075, + "loss": 1.4319, + "step": 1912 + }, + { + "epoch": 0.21, + "grad_norm": 0.09037393165223366, + "learning_rate": 0.000921318518522676, + "loss": 1.5074, + "step": 1913 + }, + { + "epoch": 0.21, + "grad_norm": 0.08625277482368682, + "learning_rate": 0.000921224739324954, + "loss": 1.3883, + "step": 1914 + }, + { + "epoch": 0.21, + "grad_norm": 0.09714422223326019, + "learning_rate": 0.0009211309090522802, + "loss": 1.5628, + "step": 1915 + }, + { + "epoch": 0.21, + "grad_norm": 0.0909470499364441, + "learning_rate": 0.0009210370277160319, + "loss": 1.5049, + "step": 1916 + }, + { + "epoch": 0.21, + "grad_norm": 0.09368988434281326, + "learning_rate": 0.0009209430953275923, + "loss": 1.5492, + "step": 1917 + }, + { + "epoch": 0.21, + "grad_norm": 0.09939051407296741, + "learning_rate": 0.0009208491118983514, + "loss": 1.4222, + "step": 1918 + }, + { + "epoch": 0.21, + "grad_norm": 0.11093468746461634, + "learning_rate": 0.0009207550774397047, + "loss": 1.5089, + "step": 1919 + }, + { + "epoch": 0.21, + "grad_norm": 0.09192166444725798, + "learning_rate": 0.0009206609919630542, + "loss": 1.4475, + "step": 1920 + }, + { + "epoch": 0.21, + "grad_norm": 0.08413223070653192, + "learning_rate": 0.0009205668554798084, + "loss": 1.3386, + "step": 1921 + }, + { + "epoch": 0.21, + "grad_norm": 0.08125748886386505, + "learning_rate": 0.0009204726680013813, + "loss": 1.3634, + "step": 1922 + }, + { + "epoch": 0.21, + "grad_norm": 0.08620903076980944, + "learning_rate": 0.0009203784295391937, + "loss": 1.401, + "step": 1923 + }, + { + "epoch": 0.21, + "grad_norm": 0.10407022815151389, + "learning_rate": 0.0009202841401046722, + "loss": 1.4641, + "step": 1924 + }, + { + "epoch": 0.21, + "grad_norm": 0.09106470624464104, + "learning_rate": 0.0009201897997092496, + "loss": 1.3364, + "step": 1925 + }, + { + "epoch": 0.21, + "grad_norm": 0.0793647200463425, + "learning_rate": 0.0009200954083643654, + "loss": 1.3409, + "step": 1926 + }, + { + "epoch": 0.21, + "grad_norm": 0.11996672867870423, + "learning_rate": 0.0009200009660814645, + "loss": 1.4471, + "step": 1927 + }, + { + "epoch": 0.21, + "grad_norm": 0.07598152126075328, + "learning_rate": 0.0009199064728719988, + "loss": 1.3633, + "step": 1928 + }, + { + "epoch": 0.21, + "grad_norm": 0.09030240320487572, + "learning_rate": 0.0009198119287474254, + "loss": 1.4453, + "step": 1929 + }, + { + "epoch": 0.21, + "grad_norm": 0.09808317088255411, + "learning_rate": 0.0009197173337192082, + "loss": 1.4955, + "step": 1930 + }, + { + "epoch": 0.21, + "grad_norm": 0.09680838880881662, + "learning_rate": 0.0009196226877988174, + "loss": 1.5606, + "step": 1931 + }, + { + "epoch": 0.21, + "grad_norm": 0.08010659787429597, + "learning_rate": 0.0009195279909977293, + "loss": 1.3131, + "step": 1932 + }, + { + "epoch": 0.21, + "grad_norm": 0.09152502912300714, + "learning_rate": 0.0009194332433274256, + "loss": 1.5883, + "step": 1933 + }, + { + "epoch": 0.21, + "grad_norm": 0.09644413439022238, + "learning_rate": 0.0009193384447993954, + "loss": 1.4096, + "step": 1934 + }, + { + "epoch": 0.21, + "grad_norm": 0.07964388791093453, + "learning_rate": 0.0009192435954251328, + "loss": 1.3457, + "step": 1935 + }, + { + "epoch": 0.21, + "grad_norm": 0.08985149308714338, + "learning_rate": 0.0009191486952161392, + "loss": 1.4786, + "step": 1936 + }, + { + "epoch": 0.21, + "grad_norm": 0.08504235679832424, + "learning_rate": 0.0009190537441839211, + "loss": 1.5413, + "step": 1937 + }, + { + "epoch": 0.21, + "grad_norm": 0.08693547735099826, + "learning_rate": 0.0009189587423399919, + "loss": 1.3525, + "step": 1938 + }, + { + "epoch": 0.21, + "grad_norm": 0.08671648762518341, + "learning_rate": 0.0009188636896958707, + "loss": 1.5021, + "step": 1939 + }, + { + "epoch": 0.21, + "grad_norm": 0.0905281579702194, + "learning_rate": 0.0009187685862630833, + "loss": 1.4043, + "step": 1940 + }, + { + "epoch": 0.21, + "grad_norm": 0.08064542998545053, + "learning_rate": 0.0009186734320531609, + "loss": 1.4366, + "step": 1941 + }, + { + "epoch": 0.21, + "grad_norm": 0.08168312123459194, + "learning_rate": 0.0009185782270776416, + "loss": 1.433, + "step": 1942 + }, + { + "epoch": 0.21, + "grad_norm": 0.07806262864437452, + "learning_rate": 0.0009184829713480691, + "loss": 1.4421, + "step": 1943 + }, + { + "epoch": 0.21, + "grad_norm": 0.09621596143974015, + "learning_rate": 0.0009183876648759937, + "loss": 1.4011, + "step": 1944 + }, + { + "epoch": 0.21, + "grad_norm": 0.08310495330480361, + "learning_rate": 0.0009182923076729714, + "loss": 1.4856, + "step": 1945 + }, + { + "epoch": 0.21, + "grad_norm": 0.09214450573103178, + "learning_rate": 0.0009181968997505649, + "loss": 1.4795, + "step": 1946 + }, + { + "epoch": 0.21, + "grad_norm": 0.09309818499806359, + "learning_rate": 0.0009181014411203425, + "loss": 1.4156, + "step": 1947 + }, + { + "epoch": 0.21, + "grad_norm": 0.09404188405062014, + "learning_rate": 0.0009180059317938789, + "loss": 1.4638, + "step": 1948 + }, + { + "epoch": 0.21, + "grad_norm": 0.08127932701261531, + "learning_rate": 0.000917910371782755, + "loss": 1.4105, + "step": 1949 + }, + { + "epoch": 0.21, + "grad_norm": 0.06988438601558783, + "learning_rate": 0.0009178147610985577, + "loss": 1.4714, + "step": 1950 + }, + { + "epoch": 0.21, + "grad_norm": 0.08197023756718551, + "learning_rate": 0.0009177190997528803, + "loss": 1.4166, + "step": 1951 + }, + { + "epoch": 0.21, + "grad_norm": 0.08041723486575238, + "learning_rate": 0.0009176233877573219, + "loss": 1.4853, + "step": 1952 + }, + { + "epoch": 0.21, + "grad_norm": 0.08324063551372075, + "learning_rate": 0.000917527625123488, + "loss": 1.3482, + "step": 1953 + }, + { + "epoch": 0.21, + "grad_norm": 0.08663822722836335, + "learning_rate": 0.00091743181186299, + "loss": 1.4284, + "step": 1954 + }, + { + "epoch": 0.21, + "grad_norm": 0.08043853843069798, + "learning_rate": 0.000917335947987446, + "loss": 1.5212, + "step": 1955 + }, + { + "epoch": 0.21, + "grad_norm": 0.09342537247056798, + "learning_rate": 0.0009172400335084792, + "loss": 1.5348, + "step": 1956 + }, + { + "epoch": 0.21, + "grad_norm": 0.08823417963705833, + "learning_rate": 0.0009171440684377202, + "loss": 1.4782, + "step": 1957 + }, + { + "epoch": 0.21, + "grad_norm": 0.08640741433278323, + "learning_rate": 0.0009170480527868045, + "loss": 1.3428, + "step": 1958 + }, + { + "epoch": 0.21, + "grad_norm": 0.08641452096333434, + "learning_rate": 0.0009169519865673747, + "loss": 1.4254, + "step": 1959 + }, + { + "epoch": 0.21, + "grad_norm": 0.09594850199633233, + "learning_rate": 0.0009168558697910792, + "loss": 1.493, + "step": 1960 + }, + { + "epoch": 0.21, + "grad_norm": 0.0810561582133138, + "learning_rate": 0.0009167597024695722, + "loss": 1.3539, + "step": 1961 + }, + { + "epoch": 0.21, + "grad_norm": 0.07901539053041998, + "learning_rate": 0.0009166634846145145, + "loss": 1.4775, + "step": 1962 + }, + { + "epoch": 0.21, + "grad_norm": 0.09014507355204333, + "learning_rate": 0.000916567216237573, + "loss": 1.5478, + "step": 1963 + }, + { + "epoch": 0.21, + "grad_norm": 0.08634547230819684, + "learning_rate": 0.0009164708973504204, + "loss": 1.3436, + "step": 1964 + }, + { + "epoch": 0.21, + "grad_norm": 0.09372835977458653, + "learning_rate": 0.0009163745279647355, + "loss": 1.4296, + "step": 1965 + }, + { + "epoch": 0.21, + "grad_norm": 0.08489228357702569, + "learning_rate": 0.0009162781080922038, + "loss": 1.5168, + "step": 1966 + }, + { + "epoch": 0.21, + "grad_norm": 0.08701353461899991, + "learning_rate": 0.0009161816377445162, + "loss": 1.5228, + "step": 1967 + }, + { + "epoch": 0.21, + "grad_norm": 0.08607045368609934, + "learning_rate": 0.0009160851169333704, + "loss": 1.5066, + "step": 1968 + }, + { + "epoch": 0.21, + "grad_norm": 0.07958188338929062, + "learning_rate": 0.0009159885456704695, + "loss": 1.3882, + "step": 1969 + }, + { + "epoch": 0.21, + "grad_norm": 0.09109091693797759, + "learning_rate": 0.0009158919239675235, + "loss": 1.4687, + "step": 1970 + }, + { + "epoch": 0.21, + "grad_norm": 0.08095901944672143, + "learning_rate": 0.0009157952518362478, + "loss": 1.4857, + "step": 1971 + }, + { + "epoch": 0.21, + "grad_norm": 0.0809430968575337, + "learning_rate": 0.0009156985292883645, + "loss": 1.4591, + "step": 1972 + }, + { + "epoch": 0.21, + "grad_norm": 0.09504100751905215, + "learning_rate": 0.0009156017563356013, + "loss": 1.4128, + "step": 1973 + }, + { + "epoch": 0.21, + "grad_norm": 0.09268525098327988, + "learning_rate": 0.0009155049329896923, + "loss": 1.4546, + "step": 1974 + }, + { + "epoch": 0.21, + "grad_norm": 0.09645365774327952, + "learning_rate": 0.0009154080592623777, + "loss": 1.3952, + "step": 1975 + }, + { + "epoch": 0.21, + "grad_norm": 0.08312108781757994, + "learning_rate": 0.000915311135165404, + "loss": 1.3924, + "step": 1976 + }, + { + "epoch": 0.21, + "grad_norm": 0.08589342731975778, + "learning_rate": 0.0009152141607105231, + "loss": 1.4111, + "step": 1977 + }, + { + "epoch": 0.21, + "grad_norm": 0.08478583867251643, + "learning_rate": 0.0009151171359094939, + "loss": 1.4493, + "step": 1978 + }, + { + "epoch": 0.21, + "grad_norm": 0.08228102758097187, + "learning_rate": 0.0009150200607740809, + "loss": 1.4154, + "step": 1979 + }, + { + "epoch": 0.21, + "grad_norm": 0.08876973483667644, + "learning_rate": 0.0009149229353160545, + "loss": 1.4799, + "step": 1980 + }, + { + "epoch": 0.21, + "grad_norm": 0.08620876609407999, + "learning_rate": 0.0009148257595471919, + "loss": 1.4748, + "step": 1981 + }, + { + "epoch": 0.21, + "grad_norm": 0.08926651388447245, + "learning_rate": 0.0009147285334792759, + "loss": 1.5433, + "step": 1982 + }, + { + "epoch": 0.21, + "grad_norm": 0.08686773533514985, + "learning_rate": 0.0009146312571240953, + "loss": 1.4099, + "step": 1983 + }, + { + "epoch": 0.21, + "grad_norm": 0.08120790615360068, + "learning_rate": 0.0009145339304934453, + "loss": 1.5089, + "step": 1984 + }, + { + "epoch": 0.21, + "grad_norm": 0.08751756915323748, + "learning_rate": 0.0009144365535991273, + "loss": 1.5103, + "step": 1985 + }, + { + "epoch": 0.21, + "grad_norm": 0.08258101469539926, + "learning_rate": 0.0009143391264529482, + "loss": 1.4118, + "step": 1986 + }, + { + "epoch": 0.21, + "grad_norm": 0.08068566310207773, + "learning_rate": 0.0009142416490667217, + "loss": 1.4599, + "step": 1987 + }, + { + "epoch": 0.21, + "grad_norm": 0.09637149301141386, + "learning_rate": 0.000914144121452267, + "loss": 1.5734, + "step": 1988 + }, + { + "epoch": 0.21, + "grad_norm": 0.08287725138258284, + "learning_rate": 0.0009140465436214099, + "loss": 1.5485, + "step": 1989 + }, + { + "epoch": 0.21, + "grad_norm": 0.09651842539644462, + "learning_rate": 0.000913948915585982, + "loss": 1.4493, + "step": 1990 + }, + { + "epoch": 0.21, + "grad_norm": 0.08806081162924627, + "learning_rate": 0.0009138512373578209, + "loss": 1.4323, + "step": 1991 + }, + { + "epoch": 0.21, + "grad_norm": 0.07898507253280361, + "learning_rate": 0.0009137535089487705, + "loss": 1.4421, + "step": 1992 + }, + { + "epoch": 0.21, + "grad_norm": 0.08991267965323817, + "learning_rate": 0.0009136557303706808, + "loss": 1.4571, + "step": 1993 + }, + { + "epoch": 0.21, + "grad_norm": 0.08687067556252696, + "learning_rate": 0.0009135579016354077, + "loss": 1.5119, + "step": 1994 + }, + { + "epoch": 0.21, + "grad_norm": 0.09059313463610325, + "learning_rate": 0.0009134600227548132, + "loss": 1.3702, + "step": 1995 + }, + { + "epoch": 0.21, + "grad_norm": 0.08454581493932053, + "learning_rate": 0.0009133620937407656, + "loss": 1.3382, + "step": 1996 + }, + { + "epoch": 0.21, + "grad_norm": 0.0915977469098984, + "learning_rate": 0.0009132641146051391, + "loss": 1.4972, + "step": 1997 + }, + { + "epoch": 0.21, + "grad_norm": 0.07616506753971135, + "learning_rate": 0.0009131660853598138, + "loss": 1.4425, + "step": 1998 + }, + { + "epoch": 0.21, + "grad_norm": 0.0853799975536159, + "learning_rate": 0.0009130680060166763, + "loss": 1.4682, + "step": 1999 + }, + { + "epoch": 0.21, + "grad_norm": 0.08997371980517725, + "learning_rate": 0.0009129698765876191, + "loss": 1.4905, + "step": 2000 + }, + { + "epoch": 0.22, + "grad_norm": 0.0893355318664062, + "learning_rate": 0.0009128716970845406, + "loss": 1.4148, + "step": 2001 + }, + { + "epoch": 0.22, + "grad_norm": 0.0984766048951835, + "learning_rate": 0.0009127734675193454, + "loss": 1.4577, + "step": 2002 + }, + { + "epoch": 0.22, + "grad_norm": 0.08685169232348386, + "learning_rate": 0.0009126751879039441, + "loss": 1.326, + "step": 2003 + }, + { + "epoch": 0.22, + "grad_norm": 0.08337716785047515, + "learning_rate": 0.0009125768582502539, + "loss": 1.457, + "step": 2004 + }, + { + "epoch": 0.22, + "grad_norm": 0.08013750991226204, + "learning_rate": 0.0009124784785701969, + "loss": 1.2967, + "step": 2005 + }, + { + "epoch": 0.22, + "grad_norm": 0.08594967067757286, + "learning_rate": 0.0009123800488757026, + "loss": 1.5688, + "step": 2006 + }, + { + "epoch": 0.22, + "grad_norm": 0.0777155592804685, + "learning_rate": 0.0009122815691787055, + "loss": 1.4152, + "step": 2007 + }, + { + "epoch": 0.22, + "grad_norm": 0.10406910057370224, + "learning_rate": 0.000912183039491147, + "loss": 1.614, + "step": 2008 + }, + { + "epoch": 0.22, + "grad_norm": 0.08189508655131475, + "learning_rate": 0.0009120844598249737, + "loss": 1.447, + "step": 2009 + }, + { + "epoch": 0.22, + "grad_norm": 0.07712372854960725, + "learning_rate": 0.0009119858301921391, + "loss": 1.3347, + "step": 2010 + }, + { + "epoch": 0.22, + "grad_norm": 0.08994235194617171, + "learning_rate": 0.0009118871506046024, + "loss": 1.5152, + "step": 2011 + }, + { + "epoch": 0.22, + "grad_norm": 0.07797437990098123, + "learning_rate": 0.0009117884210743286, + "loss": 1.4365, + "step": 2012 + }, + { + "epoch": 0.22, + "grad_norm": 0.08655098692860116, + "learning_rate": 0.0009116896416132889, + "loss": 1.4759, + "step": 2013 + }, + { + "epoch": 0.22, + "grad_norm": 0.07745160520338816, + "learning_rate": 0.000911590812233461, + "loss": 1.3671, + "step": 2014 + }, + { + "epoch": 0.22, + "grad_norm": 0.10139052107795801, + "learning_rate": 0.0009114919329468282, + "loss": 1.4782, + "step": 2015 + }, + { + "epoch": 0.22, + "grad_norm": 0.09445383935773374, + "learning_rate": 0.00091139300376538, + "loss": 1.5765, + "step": 2016 + }, + { + "epoch": 0.22, + "grad_norm": 0.0893852361489357, + "learning_rate": 0.0009112940247011116, + "loss": 1.4059, + "step": 2017 + }, + { + "epoch": 0.22, + "grad_norm": 0.08429572837560623, + "learning_rate": 0.0009111949957660248, + "loss": 1.3523, + "step": 2018 + }, + { + "epoch": 0.22, + "grad_norm": 0.08670828198710072, + "learning_rate": 0.0009110959169721271, + "loss": 1.3836, + "step": 2019 + }, + { + "epoch": 0.22, + "grad_norm": 0.09593977547060985, + "learning_rate": 0.0009109967883314323, + "loss": 1.499, + "step": 2020 + }, + { + "epoch": 0.22, + "grad_norm": 0.0893023259917977, + "learning_rate": 0.00091089760985596, + "loss": 1.4865, + "step": 2021 + }, + { + "epoch": 0.22, + "grad_norm": 0.08920793652767317, + "learning_rate": 0.0009107983815577359, + "loss": 1.5379, + "step": 2022 + }, + { + "epoch": 0.22, + "grad_norm": 0.10048196567007192, + "learning_rate": 0.0009106991034487917, + "loss": 1.5585, + "step": 2023 + }, + { + "epoch": 0.22, + "grad_norm": 0.09363391762575825, + "learning_rate": 0.0009105997755411655, + "loss": 1.5651, + "step": 2024 + }, + { + "epoch": 0.22, + "grad_norm": 0.10401149059455012, + "learning_rate": 0.0009105003978469009, + "loss": 1.4563, + "step": 2025 + }, + { + "epoch": 0.22, + "grad_norm": 0.08535302742501824, + "learning_rate": 0.0009104009703780478, + "loss": 1.4701, + "step": 2026 + }, + { + "epoch": 0.22, + "grad_norm": 0.08868145003485764, + "learning_rate": 0.0009103014931466623, + "loss": 1.4583, + "step": 2027 + }, + { + "epoch": 0.22, + "grad_norm": 0.0879030351354358, + "learning_rate": 0.000910201966164806, + "loss": 1.3511, + "step": 2028 + }, + { + "epoch": 0.22, + "grad_norm": 0.10253950527640521, + "learning_rate": 0.0009101023894445472, + "loss": 1.5729, + "step": 2029 + }, + { + "epoch": 0.22, + "grad_norm": 0.08542785241162401, + "learning_rate": 0.0009100027629979599, + "loss": 1.4357, + "step": 2030 + }, + { + "epoch": 0.22, + "grad_norm": 0.08579083004639929, + "learning_rate": 0.0009099030868371241, + "loss": 1.4863, + "step": 2031 + }, + { + "epoch": 0.22, + "grad_norm": 0.09758998052839936, + "learning_rate": 0.0009098033609741259, + "loss": 1.5492, + "step": 2032 + }, + { + "epoch": 0.22, + "grad_norm": 0.07844186539869633, + "learning_rate": 0.0009097035854210573, + "loss": 1.5178, + "step": 2033 + }, + { + "epoch": 0.22, + "grad_norm": 0.08847073531592889, + "learning_rate": 0.0009096037601900166, + "loss": 1.474, + "step": 2034 + }, + { + "epoch": 0.22, + "grad_norm": 0.08055118937273592, + "learning_rate": 0.0009095038852931077, + "loss": 1.3124, + "step": 2035 + }, + { + "epoch": 0.22, + "grad_norm": 0.08954927397429793, + "learning_rate": 0.000909403960742441, + "loss": 1.3911, + "step": 2036 + }, + { + "epoch": 0.22, + "grad_norm": 0.08871609293202781, + "learning_rate": 0.0009093039865501327, + "loss": 1.4989, + "step": 2037 + }, + { + "epoch": 0.22, + "grad_norm": 0.08331819035983692, + "learning_rate": 0.0009092039627283049, + "loss": 1.5124, + "step": 2038 + }, + { + "epoch": 0.22, + "grad_norm": 0.08617807710536765, + "learning_rate": 0.0009091038892890859, + "loss": 1.3432, + "step": 2039 + }, + { + "epoch": 0.22, + "grad_norm": 0.08013091876615193, + "learning_rate": 0.0009090037662446099, + "loss": 1.4968, + "step": 2040 + }, + { + "epoch": 0.22, + "grad_norm": 0.08088959403354601, + "learning_rate": 0.0009089035936070171, + "loss": 1.2544, + "step": 2041 + }, + { + "epoch": 0.22, + "grad_norm": 0.07511260388633456, + "learning_rate": 0.0009088033713884541, + "loss": 1.5305, + "step": 2042 + }, + { + "epoch": 0.22, + "grad_norm": 0.07857443163504174, + "learning_rate": 0.0009087030996010728, + "loss": 1.4741, + "step": 2043 + }, + { + "epoch": 0.22, + "grad_norm": 0.08773673280492814, + "learning_rate": 0.0009086027782570316, + "loss": 1.4905, + "step": 2044 + }, + { + "epoch": 0.22, + "grad_norm": 0.08617794607094241, + "learning_rate": 0.0009085024073684951, + "loss": 1.5055, + "step": 2045 + }, + { + "epoch": 0.22, + "grad_norm": 0.0949855047620933, + "learning_rate": 0.0009084019869476332, + "loss": 1.5818, + "step": 2046 + }, + { + "epoch": 0.22, + "grad_norm": 0.09609447475098014, + "learning_rate": 0.0009083015170066224, + "loss": 1.4573, + "step": 2047 + }, + { + "epoch": 0.22, + "grad_norm": 0.08109159950480951, + "learning_rate": 0.0009082009975576451, + "loss": 1.3369, + "step": 2048 + }, + { + "epoch": 0.22, + "grad_norm": 0.08835922201145058, + "learning_rate": 0.0009081004286128895, + "loss": 1.3451, + "step": 2049 + }, + { + "epoch": 0.22, + "grad_norm": 0.08859070142962146, + "learning_rate": 0.0009079998101845501, + "loss": 1.3428, + "step": 2050 + }, + { + "epoch": 0.22, + "grad_norm": 0.088189800018709, + "learning_rate": 0.000907899142284827, + "loss": 1.5434, + "step": 2051 + }, + { + "epoch": 0.22, + "grad_norm": 0.08644531320243522, + "learning_rate": 0.0009077984249259268, + "loss": 1.4702, + "step": 2052 + }, + { + "epoch": 0.22, + "grad_norm": 0.07668741919840479, + "learning_rate": 0.0009076976581200615, + "loss": 1.3968, + "step": 2053 + }, + { + "epoch": 0.22, + "grad_norm": 0.09581406401109188, + "learning_rate": 0.0009075968418794498, + "loss": 1.4724, + "step": 2054 + }, + { + "epoch": 0.22, + "grad_norm": 0.09246875649499241, + "learning_rate": 0.0009074959762163157, + "loss": 1.4314, + "step": 2055 + }, + { + "epoch": 0.22, + "grad_norm": 0.07492645924876302, + "learning_rate": 0.0009073950611428897, + "loss": 1.2898, + "step": 2056 + }, + { + "epoch": 0.22, + "grad_norm": 0.06928869437837035, + "learning_rate": 0.000907294096671408, + "loss": 1.3936, + "step": 2057 + }, + { + "epoch": 0.22, + "grad_norm": 0.0832862559597914, + "learning_rate": 0.0009071930828141128, + "loss": 1.4466, + "step": 2058 + }, + { + "epoch": 0.22, + "grad_norm": 0.08848451000796444, + "learning_rate": 0.0009070920195832527, + "loss": 1.4536, + "step": 2059 + }, + { + "epoch": 0.22, + "grad_norm": 0.09506876450203977, + "learning_rate": 0.0009069909069910816, + "loss": 1.342, + "step": 2060 + }, + { + "epoch": 0.22, + "grad_norm": 0.08569941811729877, + "learning_rate": 0.0009068897450498602, + "loss": 1.4359, + "step": 2061 + }, + { + "epoch": 0.22, + "grad_norm": 0.08488207204085854, + "learning_rate": 0.0009067885337718543, + "loss": 1.5106, + "step": 2062 + }, + { + "epoch": 0.22, + "grad_norm": 0.08690685472711807, + "learning_rate": 0.0009066872731693361, + "loss": 1.3516, + "step": 2063 + }, + { + "epoch": 0.22, + "grad_norm": 0.08252793754556853, + "learning_rate": 0.0009065859632545841, + "loss": 1.496, + "step": 2064 + }, + { + "epoch": 0.22, + "grad_norm": 0.09116501152269689, + "learning_rate": 0.0009064846040398822, + "loss": 1.3323, + "step": 2065 + }, + { + "epoch": 0.22, + "grad_norm": 0.08508703996689969, + "learning_rate": 0.0009063831955375209, + "loss": 1.4011, + "step": 2066 + }, + { + "epoch": 0.22, + "grad_norm": 0.08245437493180188, + "learning_rate": 0.0009062817377597961, + "loss": 1.5624, + "step": 2067 + }, + { + "epoch": 0.22, + "grad_norm": 0.08352745314965869, + "learning_rate": 0.0009061802307190098, + "loss": 1.4388, + "step": 2068 + }, + { + "epoch": 0.22, + "grad_norm": 0.08597478680811273, + "learning_rate": 0.0009060786744274703, + "loss": 1.4461, + "step": 2069 + }, + { + "epoch": 0.22, + "grad_norm": 0.08312171106043391, + "learning_rate": 0.0009059770688974915, + "loss": 1.4689, + "step": 2070 + }, + { + "epoch": 0.22, + "grad_norm": 0.09194463520309476, + "learning_rate": 0.0009058754141413935, + "loss": 1.4197, + "step": 2071 + }, + { + "epoch": 0.22, + "grad_norm": 0.08150972185382112, + "learning_rate": 0.0009057737101715024, + "loss": 1.4696, + "step": 2072 + }, + { + "epoch": 0.22, + "grad_norm": 0.0853710443666307, + "learning_rate": 0.0009056719570001498, + "loss": 1.5233, + "step": 2073 + }, + { + "epoch": 0.22, + "grad_norm": 0.07523008570520828, + "learning_rate": 0.000905570154639674, + "loss": 1.5505, + "step": 2074 + }, + { + "epoch": 0.22, + "grad_norm": 0.0847559640472571, + "learning_rate": 0.0009054683031024187, + "loss": 1.5916, + "step": 2075 + }, + { + "epoch": 0.22, + "grad_norm": 0.08104363762554731, + "learning_rate": 0.0009053664024007337, + "loss": 1.4587, + "step": 2076 + }, + { + "epoch": 0.22, + "grad_norm": 0.07962733245896239, + "learning_rate": 0.0009052644525469751, + "loss": 1.3431, + "step": 2077 + }, + { + "epoch": 0.22, + "grad_norm": 0.08353363693288063, + "learning_rate": 0.0009051624535535044, + "loss": 1.4799, + "step": 2078 + }, + { + "epoch": 0.22, + "grad_norm": 0.08825166774195803, + "learning_rate": 0.0009050604054326893, + "loss": 1.3508, + "step": 2079 + }, + { + "epoch": 0.22, + "grad_norm": 0.07694662079941654, + "learning_rate": 0.0009049583081969037, + "loss": 1.3975, + "step": 2080 + }, + { + "epoch": 0.22, + "grad_norm": 0.08547425993073962, + "learning_rate": 0.0009048561618585269, + "loss": 1.4588, + "step": 2081 + }, + { + "epoch": 0.22, + "grad_norm": 0.07890146312971011, + "learning_rate": 0.000904753966429945, + "loss": 1.5806, + "step": 2082 + }, + { + "epoch": 0.22, + "grad_norm": 0.0895410367058908, + "learning_rate": 0.0009046517219235492, + "loss": 1.428, + "step": 2083 + }, + { + "epoch": 0.22, + "grad_norm": 0.07457533928827222, + "learning_rate": 0.000904549428351737, + "loss": 1.4553, + "step": 2084 + }, + { + "epoch": 0.22, + "grad_norm": 0.08100017328753376, + "learning_rate": 0.0009044470857269121, + "loss": 1.3618, + "step": 2085 + }, + { + "epoch": 0.22, + "grad_norm": 0.07129459326988942, + "learning_rate": 0.0009043446940614835, + "loss": 1.4317, + "step": 2086 + }, + { + "epoch": 0.22, + "grad_norm": 0.07500535174736596, + "learning_rate": 0.0009042422533678667, + "loss": 1.4146, + "step": 2087 + }, + { + "epoch": 0.22, + "grad_norm": 0.08011378389844677, + "learning_rate": 0.0009041397636584831, + "loss": 1.3876, + "step": 2088 + }, + { + "epoch": 0.22, + "grad_norm": 0.07943960209069717, + "learning_rate": 0.00090403722494576, + "loss": 1.428, + "step": 2089 + }, + { + "epoch": 0.22, + "grad_norm": 0.07076112195478479, + "learning_rate": 0.0009039346372421304, + "loss": 1.3577, + "step": 2090 + }, + { + "epoch": 0.22, + "grad_norm": 0.09256678327668848, + "learning_rate": 0.0009038320005600336, + "loss": 1.4306, + "step": 2091 + }, + { + "epoch": 0.22, + "grad_norm": 0.07157860379515876, + "learning_rate": 0.0009037293149119144, + "loss": 1.4065, + "step": 2092 + }, + { + "epoch": 0.22, + "grad_norm": 0.09035990774616685, + "learning_rate": 0.0009036265803102237, + "loss": 1.3061, + "step": 2093 + }, + { + "epoch": 0.23, + "grad_norm": 0.09038176456247499, + "learning_rate": 0.0009035237967674188, + "loss": 1.4361, + "step": 2094 + }, + { + "epoch": 0.23, + "grad_norm": 0.10021661807035723, + "learning_rate": 0.0009034209642959624, + "loss": 1.3928, + "step": 2095 + }, + { + "epoch": 0.23, + "grad_norm": 0.08500099881896636, + "learning_rate": 0.0009033180829083232, + "loss": 1.2741, + "step": 2096 + }, + { + "epoch": 0.23, + "grad_norm": 0.09099960063293876, + "learning_rate": 0.0009032151526169761, + "loss": 1.3831, + "step": 2097 + }, + { + "epoch": 0.23, + "grad_norm": 0.09257103776160713, + "learning_rate": 0.0009031121734344016, + "loss": 1.3861, + "step": 2098 + }, + { + "epoch": 0.23, + "grad_norm": 0.09067821153143035, + "learning_rate": 0.0009030091453730862, + "loss": 1.5433, + "step": 2099 + }, + { + "epoch": 0.23, + "grad_norm": 0.10061981038222514, + "learning_rate": 0.0009029060684455228, + "loss": 1.6066, + "step": 2100 + }, + { + "epoch": 0.23, + "grad_norm": 0.08322155025349774, + "learning_rate": 0.0009028029426642095, + "loss": 1.44, + "step": 2101 + }, + { + "epoch": 0.23, + "grad_norm": 0.08632501692592569, + "learning_rate": 0.0009026997680416505, + "loss": 1.5127, + "step": 2102 + }, + { + "epoch": 0.23, + "grad_norm": 0.10646380692549054, + "learning_rate": 0.0009025965445903565, + "loss": 1.5835, + "step": 2103 + }, + { + "epoch": 0.23, + "grad_norm": 0.0778664557671461, + "learning_rate": 0.0009024932723228436, + "loss": 1.4187, + "step": 2104 + }, + { + "epoch": 0.23, + "grad_norm": 0.08218957602622107, + "learning_rate": 0.0009023899512516336, + "loss": 1.322, + "step": 2105 + }, + { + "epoch": 0.23, + "grad_norm": 0.068440123958451, + "learning_rate": 0.0009022865813892549, + "loss": 1.4493, + "step": 2106 + }, + { + "epoch": 0.23, + "grad_norm": 0.07736229830673369, + "learning_rate": 0.0009021831627482413, + "loss": 1.3777, + "step": 2107 + }, + { + "epoch": 0.23, + "grad_norm": 0.07927886201598712, + "learning_rate": 0.0009020796953411327, + "loss": 1.4454, + "step": 2108 + }, + { + "epoch": 0.23, + "grad_norm": 0.07994526360761076, + "learning_rate": 0.0009019761791804748, + "loss": 1.3731, + "step": 2109 + }, + { + "epoch": 0.23, + "grad_norm": 0.07833679220856182, + "learning_rate": 0.0009018726142788194, + "loss": 1.3188, + "step": 2110 + }, + { + "epoch": 0.23, + "grad_norm": 0.07289754206616753, + "learning_rate": 0.000901769000648724, + "loss": 1.4212, + "step": 2111 + }, + { + "epoch": 0.23, + "grad_norm": 0.07449956522325085, + "learning_rate": 0.0009016653383027522, + "loss": 1.4958, + "step": 2112 + }, + { + "epoch": 0.23, + "grad_norm": 0.07840040658702752, + "learning_rate": 0.0009015616272534734, + "loss": 1.3861, + "step": 2113 + }, + { + "epoch": 0.23, + "grad_norm": 0.07392483160096817, + "learning_rate": 0.0009014578675134628, + "loss": 1.4716, + "step": 2114 + }, + { + "epoch": 0.23, + "grad_norm": 0.06578630614556928, + "learning_rate": 0.0009013540590953017, + "loss": 1.5169, + "step": 2115 + }, + { + "epoch": 0.23, + "grad_norm": 0.07803948013204608, + "learning_rate": 0.0009012502020115776, + "loss": 1.5624, + "step": 2116 + }, + { + "epoch": 0.23, + "grad_norm": 0.076649100119888, + "learning_rate": 0.0009011462962748829, + "loss": 1.5829, + "step": 2117 + }, + { + "epoch": 0.23, + "grad_norm": 0.07625429267408237, + "learning_rate": 0.0009010423418978168, + "loss": 1.4575, + "step": 2118 + }, + { + "epoch": 0.23, + "grad_norm": 0.07959475385111861, + "learning_rate": 0.0009009383388929842, + "loss": 1.4098, + "step": 2119 + }, + { + "epoch": 0.23, + "grad_norm": 0.07880013779769067, + "learning_rate": 0.0009008342872729957, + "loss": 1.4709, + "step": 2120 + }, + { + "epoch": 0.23, + "grad_norm": 0.08272735817424194, + "learning_rate": 0.0009007301870504681, + "loss": 1.4538, + "step": 2121 + }, + { + "epoch": 0.23, + "grad_norm": 0.07552018367416652, + "learning_rate": 0.0009006260382380238, + "loss": 1.3958, + "step": 2122 + }, + { + "epoch": 0.23, + "grad_norm": 0.0773802621109197, + "learning_rate": 0.0009005218408482911, + "loss": 1.6479, + "step": 2123 + }, + { + "epoch": 0.23, + "grad_norm": 0.0903702500400604, + "learning_rate": 0.0009004175948939044, + "loss": 1.4688, + "step": 2124 + }, + { + "epoch": 0.23, + "grad_norm": 0.08143156259179056, + "learning_rate": 0.000900313300387504, + "loss": 1.3508, + "step": 2125 + }, + { + "epoch": 0.23, + "grad_norm": 0.08345652362054003, + "learning_rate": 0.0009002089573417356, + "loss": 1.4798, + "step": 2126 + }, + { + "epoch": 0.23, + "grad_norm": 0.08097286863224183, + "learning_rate": 0.0009001045657692517, + "loss": 1.4088, + "step": 2127 + }, + { + "epoch": 0.23, + "grad_norm": 0.07816712018885351, + "learning_rate": 0.0009000001256827095, + "loss": 1.464, + "step": 2128 + }, + { + "epoch": 0.23, + "grad_norm": 0.07995949220911897, + "learning_rate": 0.0008998956370947733, + "loss": 1.4584, + "step": 2129 + }, + { + "epoch": 0.23, + "grad_norm": 0.08389083649029062, + "learning_rate": 0.0008997911000181122, + "loss": 1.4115, + "step": 2130 + }, + { + "epoch": 0.23, + "grad_norm": 0.08706037657202785, + "learning_rate": 0.0008996865144654023, + "loss": 1.4264, + "step": 2131 + }, + { + "epoch": 0.23, + "grad_norm": 0.08739980235747145, + "learning_rate": 0.0008995818804493243, + "loss": 1.4701, + "step": 2132 + }, + { + "epoch": 0.23, + "grad_norm": 0.09140265561641635, + "learning_rate": 0.0008994771979825658, + "loss": 1.4218, + "step": 2133 + }, + { + "epoch": 0.23, + "grad_norm": 0.07375929912993871, + "learning_rate": 0.0008993724670778198, + "loss": 1.4068, + "step": 2134 + }, + { + "epoch": 0.23, + "grad_norm": 0.07953888777292227, + "learning_rate": 0.0008992676877477854, + "loss": 1.444, + "step": 2135 + }, + { + "epoch": 0.23, + "grad_norm": 0.10877764802904497, + "learning_rate": 0.0008991628600051673, + "loss": 1.4785, + "step": 2136 + }, + { + "epoch": 0.23, + "grad_norm": 0.07748952894124081, + "learning_rate": 0.0008990579838626764, + "loss": 1.4316, + "step": 2137 + }, + { + "epoch": 0.23, + "grad_norm": 0.08527565495180588, + "learning_rate": 0.0008989530593330291, + "loss": 1.6002, + "step": 2138 + }, + { + "epoch": 0.23, + "grad_norm": 0.08335708177946245, + "learning_rate": 0.0008988480864289481, + "loss": 1.3452, + "step": 2139 + }, + { + "epoch": 0.23, + "grad_norm": 0.09759801081695489, + "learning_rate": 0.0008987430651631613, + "loss": 1.4563, + "step": 2140 + }, + { + "epoch": 0.23, + "grad_norm": 0.07840457899332089, + "learning_rate": 0.0008986379955484036, + "loss": 1.4884, + "step": 2141 + }, + { + "epoch": 0.23, + "grad_norm": 0.0876443419021953, + "learning_rate": 0.0008985328775974142, + "loss": 1.5695, + "step": 2142 + }, + { + "epoch": 0.23, + "grad_norm": 0.08807857999714565, + "learning_rate": 0.0008984277113229397, + "loss": 1.5175, + "step": 2143 + }, + { + "epoch": 0.23, + "grad_norm": 0.10090005538461481, + "learning_rate": 0.0008983224967377315, + "loss": 1.435, + "step": 2144 + }, + { + "epoch": 0.23, + "grad_norm": 0.09627909221834423, + "learning_rate": 0.0008982172338545474, + "loss": 1.5002, + "step": 2145 + }, + { + "epoch": 0.23, + "grad_norm": 0.0789701966323783, + "learning_rate": 0.0008981119226861508, + "loss": 1.6117, + "step": 2146 + }, + { + "epoch": 0.23, + "grad_norm": 0.0862375515699889, + "learning_rate": 0.0008980065632453111, + "loss": 1.5657, + "step": 2147 + }, + { + "epoch": 0.23, + "grad_norm": 0.07179033638664258, + "learning_rate": 0.0008979011555448035, + "loss": 1.4099, + "step": 2148 + }, + { + "epoch": 0.23, + "grad_norm": 0.07114245906288379, + "learning_rate": 0.0008977956995974089, + "loss": 1.4637, + "step": 2149 + }, + { + "epoch": 0.23, + "grad_norm": 0.07414337930145709, + "learning_rate": 0.0008976901954159144, + "loss": 1.4752, + "step": 2150 + }, + { + "epoch": 0.23, + "grad_norm": 0.07132830095164981, + "learning_rate": 0.0008975846430131127, + "loss": 1.4056, + "step": 2151 + }, + { + "epoch": 0.23, + "grad_norm": 0.0719025939859958, + "learning_rate": 0.0008974790424018022, + "loss": 1.4565, + "step": 2152 + }, + { + "epoch": 0.23, + "grad_norm": 0.07354577248780032, + "learning_rate": 0.0008973733935947877, + "loss": 1.3983, + "step": 2153 + }, + { + "epoch": 0.23, + "grad_norm": 0.0787391972157109, + "learning_rate": 0.0008972676966048789, + "loss": 1.3842, + "step": 2154 + }, + { + "epoch": 0.23, + "grad_norm": 0.07914358659288476, + "learning_rate": 0.0008971619514448927, + "loss": 1.4485, + "step": 2155 + }, + { + "epoch": 0.23, + "grad_norm": 0.09372633769515623, + "learning_rate": 0.0008970561581276505, + "loss": 1.4679, + "step": 2156 + }, + { + "epoch": 0.23, + "grad_norm": 0.07041013581411883, + "learning_rate": 0.0008969503166659803, + "loss": 1.3479, + "step": 2157 + }, + { + "epoch": 0.23, + "grad_norm": 0.08613639135343044, + "learning_rate": 0.0008968444270727157, + "loss": 1.4191, + "step": 2158 + }, + { + "epoch": 0.23, + "grad_norm": 0.08240147616175394, + "learning_rate": 0.0008967384893606962, + "loss": 1.4829, + "step": 2159 + }, + { + "epoch": 0.23, + "grad_norm": 0.08323090670400539, + "learning_rate": 0.0008966325035427669, + "loss": 1.4426, + "step": 2160 + }, + { + "epoch": 0.23, + "grad_norm": 0.0830480211709058, + "learning_rate": 0.0008965264696317795, + "loss": 1.4769, + "step": 2161 + }, + { + "epoch": 0.23, + "grad_norm": 0.08928546482396188, + "learning_rate": 0.0008964203876405903, + "loss": 1.3255, + "step": 2162 + }, + { + "epoch": 0.23, + "grad_norm": 0.0822658434232799, + "learning_rate": 0.0008963142575820626, + "loss": 1.4945, + "step": 2163 + }, + { + "epoch": 0.23, + "grad_norm": 0.08491975646767387, + "learning_rate": 0.0008962080794690648, + "loss": 1.5602, + "step": 2164 + }, + { + "epoch": 0.23, + "grad_norm": 0.08414408557119114, + "learning_rate": 0.0008961018533144716, + "loss": 1.4698, + "step": 2165 + }, + { + "epoch": 0.23, + "grad_norm": 0.0954940468086645, + "learning_rate": 0.000895995579131163, + "loss": 1.3421, + "step": 2166 + }, + { + "epoch": 0.23, + "grad_norm": 0.08659721199046269, + "learning_rate": 0.0008958892569320251, + "loss": 1.446, + "step": 2167 + }, + { + "epoch": 0.23, + "grad_norm": 0.08211464984956957, + "learning_rate": 0.00089578288672995, + "loss": 1.4428, + "step": 2168 + }, + { + "epoch": 0.23, + "grad_norm": 0.08730211184796505, + "learning_rate": 0.0008956764685378356, + "loss": 1.5298, + "step": 2169 + }, + { + "epoch": 0.23, + "grad_norm": 0.08028511736236937, + "learning_rate": 0.0008955700023685851, + "loss": 1.401, + "step": 2170 + }, + { + "epoch": 0.23, + "grad_norm": 0.08607184275244723, + "learning_rate": 0.000895463488235108, + "loss": 1.2803, + "step": 2171 + }, + { + "epoch": 0.23, + "grad_norm": 0.08878050364927839, + "learning_rate": 0.0008953569261503198, + "loss": 1.3938, + "step": 2172 + }, + { + "epoch": 0.23, + "grad_norm": 0.09230301699295128, + "learning_rate": 0.0008952503161271413, + "loss": 1.5243, + "step": 2173 + }, + { + "epoch": 0.23, + "grad_norm": 0.09530871727738145, + "learning_rate": 0.000895143658178499, + "loss": 1.4443, + "step": 2174 + }, + { + "epoch": 0.23, + "grad_norm": 0.09034274785163224, + "learning_rate": 0.0008950369523173263, + "loss": 1.3993, + "step": 2175 + }, + { + "epoch": 0.23, + "grad_norm": 0.08126098450813693, + "learning_rate": 0.000894930198556561, + "loss": 1.3772, + "step": 2176 + }, + { + "epoch": 0.23, + "grad_norm": 0.09846129233960661, + "learning_rate": 0.0008948233969091477, + "loss": 1.4249, + "step": 2177 + }, + { + "epoch": 0.23, + "grad_norm": 0.09346589113210531, + "learning_rate": 0.0008947165473880363, + "loss": 1.2923, + "step": 2178 + }, + { + "epoch": 0.23, + "grad_norm": 0.07884971976680567, + "learning_rate": 0.0008946096500061828, + "loss": 1.5027, + "step": 2179 + }, + { + "epoch": 0.23, + "grad_norm": 0.0815049356861313, + "learning_rate": 0.0008945027047765488, + "loss": 1.5911, + "step": 2180 + }, + { + "epoch": 0.23, + "grad_norm": 0.08053472358349446, + "learning_rate": 0.0008943957117121017, + "loss": 1.3357, + "step": 2181 + }, + { + "epoch": 0.23, + "grad_norm": 0.07124111934527916, + "learning_rate": 0.0008942886708258148, + "loss": 1.485, + "step": 2182 + }, + { + "epoch": 0.23, + "grad_norm": 0.08208081331893921, + "learning_rate": 0.0008941815821306674, + "loss": 1.3862, + "step": 2183 + }, + { + "epoch": 0.23, + "grad_norm": 0.08992051131410057, + "learning_rate": 0.0008940744456396442, + "loss": 1.4123, + "step": 2184 + }, + { + "epoch": 0.23, + "grad_norm": 0.08292039307031486, + "learning_rate": 0.0008939672613657359, + "loss": 1.4377, + "step": 2185 + }, + { + "epoch": 0.23, + "grad_norm": 0.07751252629536556, + "learning_rate": 0.000893860029321939, + "loss": 1.3624, + "step": 2186 + }, + { + "epoch": 0.24, + "grad_norm": 0.07378227106710288, + "learning_rate": 0.0008937527495212555, + "loss": 1.5427, + "step": 2187 + }, + { + "epoch": 0.24, + "grad_norm": 0.08362754852649955, + "learning_rate": 0.0008936454219766938, + "loss": 1.3911, + "step": 2188 + }, + { + "epoch": 0.24, + "grad_norm": 0.08561863317364699, + "learning_rate": 0.0008935380467012675, + "loss": 1.4485, + "step": 2189 + }, + { + "epoch": 0.24, + "grad_norm": 0.08022086430199703, + "learning_rate": 0.0008934306237079963, + "loss": 1.4691, + "step": 2190 + }, + { + "epoch": 0.24, + "grad_norm": 0.10481201673736351, + "learning_rate": 0.0008933231530099058, + "loss": 1.4008, + "step": 2191 + }, + { + "epoch": 0.24, + "grad_norm": 0.07735777171508976, + "learning_rate": 0.0008932156346200268, + "loss": 1.4511, + "step": 2192 + }, + { + "epoch": 0.24, + "grad_norm": 0.07372539389115546, + "learning_rate": 0.0008931080685513966, + "loss": 1.5533, + "step": 2193 + }, + { + "epoch": 0.24, + "grad_norm": 0.08699225540912171, + "learning_rate": 0.0008930004548170577, + "loss": 1.5613, + "step": 2194 + }, + { + "epoch": 0.24, + "grad_norm": 0.08015320764450956, + "learning_rate": 0.0008928927934300588, + "loss": 1.4748, + "step": 2195 + }, + { + "epoch": 0.24, + "grad_norm": 0.08026380063427321, + "learning_rate": 0.0008927850844034544, + "loss": 1.5011, + "step": 2196 + }, + { + "epoch": 0.24, + "grad_norm": 0.08569620404860831, + "learning_rate": 0.0008926773277503041, + "loss": 1.4672, + "step": 2197 + }, + { + "epoch": 0.24, + "grad_norm": 0.08527228975419182, + "learning_rate": 0.0008925695234836742, + "loss": 1.4531, + "step": 2198 + }, + { + "epoch": 0.24, + "grad_norm": 0.08123604466644242, + "learning_rate": 0.0008924616716166363, + "loss": 1.4952, + "step": 2199 + }, + { + "epoch": 0.24, + "grad_norm": 0.08556822527789404, + "learning_rate": 0.0008923537721622674, + "loss": 1.5915, + "step": 2200 + }, + { + "epoch": 0.24, + "grad_norm": 0.09081981002504544, + "learning_rate": 0.0008922458251336511, + "loss": 1.5223, + "step": 2201 + }, + { + "epoch": 0.24, + "grad_norm": 0.08036843164515005, + "learning_rate": 0.0008921378305438763, + "loss": 1.415, + "step": 2202 + }, + { + "epoch": 0.24, + "grad_norm": 0.08294936113634996, + "learning_rate": 0.0008920297884060376, + "loss": 1.3729, + "step": 2203 + }, + { + "epoch": 0.24, + "grad_norm": 0.09402295203612535, + "learning_rate": 0.0008919216987332356, + "loss": 1.3907, + "step": 2204 + }, + { + "epoch": 0.24, + "grad_norm": 0.08251725989381521, + "learning_rate": 0.0008918135615385763, + "loss": 1.3759, + "step": 2205 + }, + { + "epoch": 0.24, + "grad_norm": 0.07518200632745241, + "learning_rate": 0.0008917053768351719, + "loss": 1.3763, + "step": 2206 + }, + { + "epoch": 0.24, + "grad_norm": 0.07912416047033925, + "learning_rate": 0.0008915971446361404, + "loss": 1.4037, + "step": 2207 + }, + { + "epoch": 0.24, + "grad_norm": 0.08064078885414537, + "learning_rate": 0.0008914888649546048, + "loss": 1.5326, + "step": 2208 + }, + { + "epoch": 0.24, + "grad_norm": 0.07408548298786542, + "learning_rate": 0.0008913805378036948, + "loss": 1.5165, + "step": 2209 + }, + { + "epoch": 0.24, + "grad_norm": 0.08872131280264428, + "learning_rate": 0.0008912721631965453, + "loss": 1.4678, + "step": 2210 + }, + { + "epoch": 0.24, + "grad_norm": 0.08249217488465536, + "learning_rate": 0.0008911637411462969, + "loss": 1.3006, + "step": 2211 + }, + { + "epoch": 0.24, + "grad_norm": 0.08866808668942394, + "learning_rate": 0.0008910552716660965, + "loss": 1.4458, + "step": 2212 + }, + { + "epoch": 0.24, + "grad_norm": 0.07822108916047592, + "learning_rate": 0.0008909467547690962, + "loss": 1.5045, + "step": 2213 + }, + { + "epoch": 0.24, + "grad_norm": 0.07968046873758618, + "learning_rate": 0.0008908381904684542, + "loss": 1.3571, + "step": 2214 + }, + { + "epoch": 0.24, + "grad_norm": 0.08621063906251096, + "learning_rate": 0.0008907295787773339, + "loss": 1.4686, + "step": 2215 + }, + { + "epoch": 0.24, + "grad_norm": 0.09541846290904246, + "learning_rate": 0.0008906209197089054, + "loss": 1.4994, + "step": 2216 + }, + { + "epoch": 0.24, + "grad_norm": 0.08307421917053688, + "learning_rate": 0.0008905122132763437, + "loss": 1.5249, + "step": 2217 + }, + { + "epoch": 0.24, + "grad_norm": 0.07788724633936456, + "learning_rate": 0.0008904034594928296, + "loss": 1.4023, + "step": 2218 + }, + { + "epoch": 0.24, + "grad_norm": 0.0816366690950349, + "learning_rate": 0.0008902946583715503, + "loss": 1.43, + "step": 2219 + }, + { + "epoch": 0.24, + "grad_norm": 0.09982940037534067, + "learning_rate": 0.0008901858099256981, + "loss": 1.4255, + "step": 2220 + }, + { + "epoch": 0.24, + "grad_norm": 0.08232288093656423, + "learning_rate": 0.0008900769141684712, + "loss": 1.3594, + "step": 2221 + }, + { + "epoch": 0.24, + "grad_norm": 0.08357733265593364, + "learning_rate": 0.0008899679711130737, + "loss": 1.4477, + "step": 2222 + }, + { + "epoch": 0.24, + "grad_norm": 0.08849502280367522, + "learning_rate": 0.0008898589807727153, + "loss": 1.3596, + "step": 2223 + }, + { + "epoch": 0.24, + "grad_norm": 0.07174272902783962, + "learning_rate": 0.0008897499431606116, + "loss": 1.4807, + "step": 2224 + }, + { + "epoch": 0.24, + "grad_norm": 0.08742620366231663, + "learning_rate": 0.0008896408582899833, + "loss": 1.4376, + "step": 2225 + }, + { + "epoch": 0.24, + "grad_norm": 0.07675971108920426, + "learning_rate": 0.0008895317261740579, + "loss": 1.4947, + "step": 2226 + }, + { + "epoch": 0.24, + "grad_norm": 0.07584998544369173, + "learning_rate": 0.0008894225468260675, + "loss": 1.5446, + "step": 2227 + }, + { + "epoch": 0.24, + "grad_norm": 0.08675223323782724, + "learning_rate": 0.000889313320259251, + "loss": 1.5474, + "step": 2228 + }, + { + "epoch": 0.24, + "grad_norm": 0.07847232947461194, + "learning_rate": 0.000889204046486852, + "loss": 1.3825, + "step": 2229 + }, + { + "epoch": 0.24, + "grad_norm": 0.08410254193952434, + "learning_rate": 0.0008890947255221209, + "loss": 1.4741, + "step": 2230 + }, + { + "epoch": 0.24, + "grad_norm": 0.08534168834441479, + "learning_rate": 0.0008889853573783127, + "loss": 1.359, + "step": 2231 + }, + { + "epoch": 0.24, + "grad_norm": 0.07198918151582337, + "learning_rate": 0.0008888759420686889, + "loss": 1.3831, + "step": 2232 + }, + { + "epoch": 0.24, + "grad_norm": 0.07677775674736524, + "learning_rate": 0.0008887664796065165, + "loss": 1.3148, + "step": 2233 + }, + { + "epoch": 0.24, + "grad_norm": 0.08207106759233733, + "learning_rate": 0.0008886569700050682, + "loss": 1.468, + "step": 2234 + }, + { + "epoch": 0.24, + "grad_norm": 0.07396556465325442, + "learning_rate": 0.0008885474132776224, + "loss": 1.3863, + "step": 2235 + }, + { + "epoch": 0.24, + "grad_norm": 0.08104484567295037, + "learning_rate": 0.0008884378094374632, + "loss": 1.469, + "step": 2236 + }, + { + "epoch": 0.24, + "grad_norm": 0.07786764881894506, + "learning_rate": 0.0008883281584978804, + "loss": 1.5114, + "step": 2237 + }, + { + "epoch": 0.24, + "grad_norm": 0.07771523721695527, + "learning_rate": 0.0008882184604721697, + "loss": 1.5415, + "step": 2238 + }, + { + "epoch": 0.24, + "grad_norm": 0.08084233888893828, + "learning_rate": 0.000888108715373632, + "loss": 1.5652, + "step": 2239 + }, + { + "epoch": 0.24, + "grad_norm": 0.07311662878971846, + "learning_rate": 0.0008879989232155748, + "loss": 1.4416, + "step": 2240 + }, + { + "epoch": 0.24, + "grad_norm": 0.07765587834196362, + "learning_rate": 0.0008878890840113105, + "loss": 1.3599, + "step": 2241 + }, + { + "epoch": 0.24, + "grad_norm": 0.07468966830819058, + "learning_rate": 0.0008877791977741575, + "loss": 1.4502, + "step": 2242 + }, + { + "epoch": 0.24, + "grad_norm": 0.0810774288676914, + "learning_rate": 0.0008876692645174399, + "loss": 1.5496, + "step": 2243 + }, + { + "epoch": 0.24, + "grad_norm": 0.07578306736684104, + "learning_rate": 0.0008875592842544875, + "loss": 1.4876, + "step": 2244 + }, + { + "epoch": 0.24, + "grad_norm": 0.07308565897661695, + "learning_rate": 0.0008874492569986357, + "loss": 1.4319, + "step": 2245 + }, + { + "epoch": 0.24, + "grad_norm": 0.08661688941721231, + "learning_rate": 0.0008873391827632258, + "loss": 1.3202, + "step": 2246 + }, + { + "epoch": 0.24, + "grad_norm": 0.08669507642175672, + "learning_rate": 0.0008872290615616046, + "loss": 1.5107, + "step": 2247 + }, + { + "epoch": 0.24, + "grad_norm": 0.08289264655816533, + "learning_rate": 0.0008871188934071246, + "loss": 1.3426, + "step": 2248 + }, + { + "epoch": 0.24, + "grad_norm": 0.08315127821419833, + "learning_rate": 0.0008870086783131444, + "loss": 1.3794, + "step": 2249 + }, + { + "epoch": 0.24, + "grad_norm": 0.08843733636930243, + "learning_rate": 0.0008868984162930275, + "loss": 1.4677, + "step": 2250 + }, + { + "epoch": 0.24, + "grad_norm": 0.09172413612371813, + "learning_rate": 0.0008867881073601439, + "loss": 1.5372, + "step": 2251 + }, + { + "epoch": 0.24, + "grad_norm": 0.08078669573488897, + "learning_rate": 0.0008866777515278688, + "loss": 1.4151, + "step": 2252 + }, + { + "epoch": 0.24, + "grad_norm": 0.08090166791860437, + "learning_rate": 0.0008865673488095832, + "loss": 1.5728, + "step": 2253 + }, + { + "epoch": 0.24, + "grad_norm": 0.08000330075098684, + "learning_rate": 0.0008864568992186739, + "loss": 1.3872, + "step": 2254 + }, + { + "epoch": 0.24, + "grad_norm": 0.0756398228234597, + "learning_rate": 0.0008863464027685332, + "loss": 1.2759, + "step": 2255 + }, + { + "epoch": 0.24, + "grad_norm": 0.09501349605436056, + "learning_rate": 0.0008862358594725595, + "loss": 1.6111, + "step": 2256 + }, + { + "epoch": 0.24, + "grad_norm": 0.08869060983281359, + "learning_rate": 0.0008861252693441559, + "loss": 1.5307, + "step": 2257 + }, + { + "epoch": 0.24, + "grad_norm": 0.08322655242755857, + "learning_rate": 0.0008860146323967324, + "loss": 1.3653, + "step": 2258 + }, + { + "epoch": 0.24, + "grad_norm": 0.09070182705927224, + "learning_rate": 0.0008859039486437039, + "loss": 1.465, + "step": 2259 + }, + { + "epoch": 0.24, + "grad_norm": 0.08490542675613597, + "learning_rate": 0.0008857932180984914, + "loss": 1.3364, + "step": 2260 + }, + { + "epoch": 0.24, + "grad_norm": 0.08572743355151981, + "learning_rate": 0.000885682440774521, + "loss": 1.39, + "step": 2261 + }, + { + "epoch": 0.24, + "grad_norm": 0.08391700720736694, + "learning_rate": 0.000885571616685225, + "loss": 1.3427, + "step": 2262 + }, + { + "epoch": 0.24, + "grad_norm": 0.07350389296018711, + "learning_rate": 0.0008854607458440412, + "loss": 1.3976, + "step": 2263 + }, + { + "epoch": 0.24, + "grad_norm": 0.0914364941499127, + "learning_rate": 0.000885349828264413, + "loss": 1.4906, + "step": 2264 + }, + { + "epoch": 0.24, + "grad_norm": 0.08023243376640941, + "learning_rate": 0.0008852388639597897, + "loss": 1.3761, + "step": 2265 + }, + { + "epoch": 0.24, + "grad_norm": 0.07893137372515271, + "learning_rate": 0.0008851278529436261, + "loss": 1.5595, + "step": 2266 + }, + { + "epoch": 0.24, + "grad_norm": 0.07561619587902403, + "learning_rate": 0.0008850167952293825, + "loss": 1.476, + "step": 2267 + }, + { + "epoch": 0.24, + "grad_norm": 0.08126009628490852, + "learning_rate": 0.0008849056908305252, + "loss": 1.5766, + "step": 2268 + }, + { + "epoch": 0.24, + "grad_norm": 0.0779068568614784, + "learning_rate": 0.0008847945397605258, + "loss": 1.4817, + "step": 2269 + }, + { + "epoch": 0.24, + "grad_norm": 0.07932082446412245, + "learning_rate": 0.0008846833420328619, + "loss": 1.373, + "step": 2270 + }, + { + "epoch": 0.24, + "grad_norm": 0.08156960018218874, + "learning_rate": 0.0008845720976610168, + "loss": 1.4695, + "step": 2271 + }, + { + "epoch": 0.24, + "grad_norm": 0.07735279185733433, + "learning_rate": 0.0008844608066584787, + "loss": 1.3753, + "step": 2272 + }, + { + "epoch": 0.24, + "grad_norm": 0.07733770886342596, + "learning_rate": 0.0008843494690387426, + "loss": 1.4613, + "step": 2273 + }, + { + "epoch": 0.24, + "grad_norm": 0.07890085671660646, + "learning_rate": 0.0008842380848153082, + "loss": 1.3932, + "step": 2274 + }, + { + "epoch": 0.24, + "grad_norm": 0.07782994223247774, + "learning_rate": 0.0008841266540016813, + "loss": 1.4201, + "step": 2275 + }, + { + "epoch": 0.24, + "grad_norm": 0.0957307653994958, + "learning_rate": 0.0008840151766113735, + "loss": 1.3713, + "step": 2276 + }, + { + "epoch": 0.24, + "grad_norm": 0.08676129615166901, + "learning_rate": 0.0008839036526579014, + "loss": 1.4288, + "step": 2277 + }, + { + "epoch": 0.24, + "grad_norm": 0.08086343688253304, + "learning_rate": 0.000883792082154788, + "loss": 1.453, + "step": 2278 + }, + { + "epoch": 0.24, + "grad_norm": 0.07603031386284381, + "learning_rate": 0.0008836804651155617, + "loss": 1.3063, + "step": 2279 + }, + { + "epoch": 0.25, + "grad_norm": 0.08419241615367228, + "learning_rate": 0.0008835688015537559, + "loss": 1.3828, + "step": 2280 + }, + { + "epoch": 0.25, + "grad_norm": 0.08829666758411504, + "learning_rate": 0.0008834570914829108, + "loss": 1.4098, + "step": 2281 + }, + { + "epoch": 0.25, + "grad_norm": 0.08693258350713497, + "learning_rate": 0.0008833453349165714, + "loss": 1.3494, + "step": 2282 + }, + { + "epoch": 0.25, + "grad_norm": 0.09007341118626776, + "learning_rate": 0.0008832335318682883, + "loss": 1.4282, + "step": 2283 + }, + { + "epoch": 0.25, + "grad_norm": 0.09274387264986217, + "learning_rate": 0.0008831216823516185, + "loss": 1.4668, + "step": 2284 + }, + { + "epoch": 0.25, + "grad_norm": 0.09646572654831478, + "learning_rate": 0.0008830097863801238, + "loss": 1.5571, + "step": 2285 + }, + { + "epoch": 0.25, + "grad_norm": 0.08083030222951221, + "learning_rate": 0.000882897843967372, + "loss": 1.4943, + "step": 2286 + }, + { + "epoch": 0.25, + "grad_norm": 0.08450865254218719, + "learning_rate": 0.0008827858551269368, + "loss": 1.3344, + "step": 2287 + }, + { + "epoch": 0.25, + "grad_norm": 0.08067151928811613, + "learning_rate": 0.0008826738198723967, + "loss": 1.5423, + "step": 2288 + }, + { + "epoch": 0.25, + "grad_norm": 0.08614124183323224, + "learning_rate": 0.0008825617382173369, + "loss": 1.4565, + "step": 2289 + }, + { + "epoch": 0.25, + "grad_norm": 0.07814908550447222, + "learning_rate": 0.0008824496101753473, + "loss": 1.459, + "step": 2290 + }, + { + "epoch": 0.25, + "grad_norm": 0.08058843744933337, + "learning_rate": 0.0008823374357600241, + "loss": 1.5628, + "step": 2291 + }, + { + "epoch": 0.25, + "grad_norm": 0.08686365181873504, + "learning_rate": 0.0008822252149849686, + "loss": 1.4197, + "step": 2292 + }, + { + "epoch": 0.25, + "grad_norm": 0.07392644100827063, + "learning_rate": 0.000882112947863788, + "loss": 1.3753, + "step": 2293 + }, + { + "epoch": 0.25, + "grad_norm": 0.08686559375776513, + "learning_rate": 0.0008820006344100953, + "loss": 1.5226, + "step": 2294 + }, + { + "epoch": 0.25, + "grad_norm": 0.0891901138257802, + "learning_rate": 0.0008818882746375085, + "loss": 1.516, + "step": 2295 + }, + { + "epoch": 0.25, + "grad_norm": 0.09554588650132052, + "learning_rate": 0.0008817758685596519, + "loss": 1.4844, + "step": 2296 + }, + { + "epoch": 0.25, + "grad_norm": 0.07631449918448031, + "learning_rate": 0.0008816634161901552, + "loss": 1.3256, + "step": 2297 + }, + { + "epoch": 0.25, + "grad_norm": 0.08089428852812466, + "learning_rate": 0.0008815509175426534, + "loss": 1.3853, + "step": 2298 + }, + { + "epoch": 0.25, + "grad_norm": 0.07814014705209726, + "learning_rate": 0.0008814383726307876, + "loss": 1.473, + "step": 2299 + }, + { + "epoch": 0.25, + "grad_norm": 0.07405353182252997, + "learning_rate": 0.0008813257814682038, + "loss": 1.5659, + "step": 2300 + }, + { + "epoch": 0.25, + "grad_norm": 0.07250750021173145, + "learning_rate": 0.0008812131440685544, + "loss": 1.4467, + "step": 2301 + }, + { + "epoch": 0.25, + "grad_norm": 0.0717452676620836, + "learning_rate": 0.0008811004604454973, + "loss": 1.4328, + "step": 2302 + }, + { + "epoch": 0.25, + "grad_norm": 0.07509376431463823, + "learning_rate": 0.0008809877306126953, + "loss": 1.4588, + "step": 2303 + }, + { + "epoch": 0.25, + "grad_norm": 0.09604822190961418, + "learning_rate": 0.0008808749545838176, + "loss": 1.4542, + "step": 2304 + }, + { + "epoch": 0.25, + "grad_norm": 0.08803782587082482, + "learning_rate": 0.0008807621323725386, + "loss": 1.4286, + "step": 2305 + }, + { + "epoch": 0.25, + "grad_norm": 0.07727137861921227, + "learning_rate": 0.0008806492639925383, + "loss": 1.5215, + "step": 2306 + }, + { + "epoch": 0.25, + "grad_norm": 0.08376997789258082, + "learning_rate": 0.0008805363494575024, + "loss": 1.3851, + "step": 2307 + }, + { + "epoch": 0.25, + "grad_norm": 0.0819358725338881, + "learning_rate": 0.0008804233887811223, + "loss": 1.4812, + "step": 2308 + }, + { + "epoch": 0.25, + "grad_norm": 0.0792579194627983, + "learning_rate": 0.0008803103819770947, + "loss": 1.5521, + "step": 2309 + }, + { + "epoch": 0.25, + "grad_norm": 0.0915688602232919, + "learning_rate": 0.0008801973290591223, + "loss": 1.4422, + "step": 2310 + }, + { + "epoch": 0.25, + "grad_norm": 0.09290993712517474, + "learning_rate": 0.0008800842300409129, + "loss": 1.4499, + "step": 2311 + }, + { + "epoch": 0.25, + "grad_norm": 0.07260739372011728, + "learning_rate": 0.0008799710849361803, + "loss": 1.3542, + "step": 2312 + }, + { + "epoch": 0.25, + "grad_norm": 0.07889876448184573, + "learning_rate": 0.0008798578937586436, + "loss": 1.5093, + "step": 2313 + }, + { + "epoch": 0.25, + "grad_norm": 0.082364590057529, + "learning_rate": 0.0008797446565220278, + "loss": 1.3858, + "step": 2314 + }, + { + "epoch": 0.25, + "grad_norm": 0.07912789012616007, + "learning_rate": 0.0008796313732400634, + "loss": 1.4733, + "step": 2315 + }, + { + "epoch": 0.25, + "grad_norm": 0.08590092205671555, + "learning_rate": 0.000879518043926486, + "loss": 1.5081, + "step": 2316 + }, + { + "epoch": 0.25, + "grad_norm": 0.07329552774435327, + "learning_rate": 0.0008794046685950373, + "loss": 1.3782, + "step": 2317 + }, + { + "epoch": 0.25, + "grad_norm": 0.07799343415492194, + "learning_rate": 0.0008792912472594647, + "loss": 1.3976, + "step": 2318 + }, + { + "epoch": 0.25, + "grad_norm": 0.08369021131091983, + "learning_rate": 0.0008791777799335205, + "loss": 1.5131, + "step": 2319 + }, + { + "epoch": 0.25, + "grad_norm": 0.08178300989609327, + "learning_rate": 0.0008790642666309637, + "loss": 1.5082, + "step": 2320 + }, + { + "epoch": 0.25, + "grad_norm": 0.08988780905708967, + "learning_rate": 0.0008789507073655574, + "loss": 1.3528, + "step": 2321 + }, + { + "epoch": 0.25, + "grad_norm": 0.08569905270305181, + "learning_rate": 0.0008788371021510713, + "loss": 1.5646, + "step": 2322 + }, + { + "epoch": 0.25, + "grad_norm": 0.08054189742708023, + "learning_rate": 0.0008787234510012807, + "loss": 1.3814, + "step": 2323 + }, + { + "epoch": 0.25, + "grad_norm": 0.08068380039302207, + "learning_rate": 0.000878609753929966, + "loss": 1.4276, + "step": 2324 + }, + { + "epoch": 0.25, + "grad_norm": 0.08722938708128018, + "learning_rate": 0.0008784960109509133, + "loss": 1.4702, + "step": 2325 + }, + { + "epoch": 0.25, + "grad_norm": 0.08364451612686977, + "learning_rate": 0.0008783822220779145, + "loss": 1.4256, + "step": 2326 + }, + { + "epoch": 0.25, + "grad_norm": 0.0754569829554669, + "learning_rate": 0.0008782683873247667, + "loss": 1.4047, + "step": 2327 + }, + { + "epoch": 0.25, + "grad_norm": 0.0885670001296458, + "learning_rate": 0.0008781545067052729, + "loss": 1.4172, + "step": 2328 + }, + { + "epoch": 0.25, + "grad_norm": 0.08249488016564695, + "learning_rate": 0.0008780405802332415, + "loss": 1.4541, + "step": 2329 + }, + { + "epoch": 0.25, + "grad_norm": 0.08681960813369974, + "learning_rate": 0.0008779266079224863, + "loss": 1.3966, + "step": 2330 + }, + { + "epoch": 0.25, + "grad_norm": 0.09003709379265666, + "learning_rate": 0.0008778125897868272, + "loss": 1.5273, + "step": 2331 + }, + { + "epoch": 0.25, + "grad_norm": 0.08457069138096256, + "learning_rate": 0.0008776985258400889, + "loss": 1.5136, + "step": 2332 + }, + { + "epoch": 0.25, + "grad_norm": 0.08639359952616797, + "learning_rate": 0.0008775844160961023, + "loss": 1.3153, + "step": 2333 + }, + { + "epoch": 0.25, + "grad_norm": 0.08436050301536181, + "learning_rate": 0.0008774702605687035, + "loss": 1.4928, + "step": 2334 + }, + { + "epoch": 0.25, + "grad_norm": 0.07796128523951937, + "learning_rate": 0.0008773560592717343, + "loss": 1.4667, + "step": 2335 + }, + { + "epoch": 0.25, + "grad_norm": 0.08239972146783438, + "learning_rate": 0.0008772418122190418, + "loss": 1.4009, + "step": 2336 + }, + { + "epoch": 0.25, + "grad_norm": 0.08335490168042457, + "learning_rate": 0.0008771275194244792, + "loss": 1.4612, + "step": 2337 + }, + { + "epoch": 0.25, + "grad_norm": 0.0857843475703198, + "learning_rate": 0.0008770131809019046, + "loss": 1.4773, + "step": 2338 + }, + { + "epoch": 0.25, + "grad_norm": 0.08868483682877011, + "learning_rate": 0.0008768987966651822, + "loss": 1.4839, + "step": 2339 + }, + { + "epoch": 0.25, + "grad_norm": 0.07590656353342753, + "learning_rate": 0.0008767843667281812, + "loss": 1.3742, + "step": 2340 + }, + { + "epoch": 0.25, + "grad_norm": 0.08180431782754888, + "learning_rate": 0.0008766698911047768, + "loss": 1.4264, + "step": 2341 + }, + { + "epoch": 0.25, + "grad_norm": 0.0827417682256729, + "learning_rate": 0.0008765553698088496, + "loss": 1.3259, + "step": 2342 + }, + { + "epoch": 0.25, + "grad_norm": 0.08122944950217337, + "learning_rate": 0.0008764408028542854, + "loss": 1.4616, + "step": 2343 + }, + { + "epoch": 0.25, + "grad_norm": 0.07970569960246655, + "learning_rate": 0.0008763261902549762, + "loss": 1.4272, + "step": 2344 + }, + { + "epoch": 0.25, + "grad_norm": 0.0831541958515407, + "learning_rate": 0.0008762115320248192, + "loss": 1.3979, + "step": 2345 + }, + { + "epoch": 0.25, + "grad_norm": 0.06820614193604783, + "learning_rate": 0.0008760968281777167, + "loss": 1.4121, + "step": 2346 + }, + { + "epoch": 0.25, + "grad_norm": 0.08231977544674345, + "learning_rate": 0.0008759820787275773, + "loss": 1.3942, + "step": 2347 + }, + { + "epoch": 0.25, + "grad_norm": 0.07261045445751502, + "learning_rate": 0.0008758672836883146, + "loss": 1.4361, + "step": 2348 + }, + { + "epoch": 0.25, + "grad_norm": 0.07834877807278877, + "learning_rate": 0.0008757524430738479, + "loss": 1.5091, + "step": 2349 + }, + { + "epoch": 0.25, + "grad_norm": 0.08087675977722167, + "learning_rate": 0.0008756375568981023, + "loss": 1.5176, + "step": 2350 + }, + { + "epoch": 0.25, + "grad_norm": 0.06467515099636481, + "learning_rate": 0.0008755226251750077, + "loss": 1.3714, + "step": 2351 + }, + { + "epoch": 0.25, + "grad_norm": 0.07426388522565618, + "learning_rate": 0.0008754076479185001, + "loss": 1.518, + "step": 2352 + }, + { + "epoch": 0.25, + "grad_norm": 0.09416405071741757, + "learning_rate": 0.000875292625142521, + "loss": 1.5035, + "step": 2353 + }, + { + "epoch": 0.25, + "grad_norm": 0.08717906392381507, + "learning_rate": 0.0008751775568610175, + "loss": 1.5043, + "step": 2354 + }, + { + "epoch": 0.25, + "grad_norm": 0.07416649126779296, + "learning_rate": 0.0008750624430879416, + "loss": 1.5297, + "step": 2355 + }, + { + "epoch": 0.25, + "grad_norm": 0.07998119133842944, + "learning_rate": 0.0008749472838372514, + "loss": 1.3105, + "step": 2356 + }, + { + "epoch": 0.25, + "grad_norm": 0.081112375757246, + "learning_rate": 0.0008748320791229106, + "loss": 1.7473, + "step": 2357 + }, + { + "epoch": 0.25, + "grad_norm": 0.08904481652658924, + "learning_rate": 0.0008747168289588879, + "loss": 1.4059, + "step": 2358 + }, + { + "epoch": 0.25, + "grad_norm": 0.08234605951979214, + "learning_rate": 0.0008746015333591578, + "loss": 1.5228, + "step": 2359 + }, + { + "epoch": 0.25, + "grad_norm": 0.08955036857950854, + "learning_rate": 0.0008744861923377001, + "loss": 1.5783, + "step": 2360 + }, + { + "epoch": 0.25, + "grad_norm": 0.0831734267365419, + "learning_rate": 0.0008743708059085008, + "loss": 1.63, + "step": 2361 + }, + { + "epoch": 0.25, + "grad_norm": 0.08897953739111732, + "learning_rate": 0.0008742553740855505, + "loss": 1.3316, + "step": 2362 + }, + { + "epoch": 0.25, + "grad_norm": 0.0982441775876888, + "learning_rate": 0.000874139896882846, + "loss": 1.4574, + "step": 2363 + }, + { + "epoch": 0.25, + "grad_norm": 0.08749307153723353, + "learning_rate": 0.000874024374314389, + "loss": 1.4225, + "step": 2364 + }, + { + "epoch": 0.25, + "grad_norm": 0.08421424247218816, + "learning_rate": 0.0008739088063941874, + "loss": 1.4224, + "step": 2365 + }, + { + "epoch": 0.25, + "grad_norm": 0.09632762229707822, + "learning_rate": 0.0008737931931362536, + "loss": 1.4607, + "step": 2366 + }, + { + "epoch": 0.25, + "grad_norm": 0.08000551688554444, + "learning_rate": 0.0008736775345546066, + "loss": 1.5745, + "step": 2367 + }, + { + "epoch": 0.25, + "grad_norm": 0.08320750171271744, + "learning_rate": 0.0008735618306632704, + "loss": 1.409, + "step": 2368 + }, + { + "epoch": 0.25, + "grad_norm": 0.07967367321090332, + "learning_rate": 0.0008734460814762743, + "loss": 1.4925, + "step": 2369 + }, + { + "epoch": 0.25, + "grad_norm": 0.07739999356529328, + "learning_rate": 0.0008733302870076534, + "loss": 1.5329, + "step": 2370 + }, + { + "epoch": 0.25, + "grad_norm": 0.0739284834248217, + "learning_rate": 0.000873214447271448, + "loss": 1.4682, + "step": 2371 + }, + { + "epoch": 0.25, + "grad_norm": 0.08430494075436981, + "learning_rate": 0.0008730985622817043, + "loss": 1.5422, + "step": 2372 + }, + { + "epoch": 0.26, + "grad_norm": 0.07628909438924128, + "learning_rate": 0.0008729826320524736, + "loss": 1.3732, + "step": 2373 + }, + { + "epoch": 0.26, + "grad_norm": 0.08018562323859652, + "learning_rate": 0.0008728666565978129, + "loss": 1.3432, + "step": 2374 + }, + { + "epoch": 0.26, + "grad_norm": 0.07884604930066987, + "learning_rate": 0.0008727506359317847, + "loss": 1.4342, + "step": 2375 + }, + { + "epoch": 0.26, + "grad_norm": 0.07219376890047094, + "learning_rate": 0.0008726345700684568, + "loss": 1.5017, + "step": 2376 + }, + { + "epoch": 0.26, + "grad_norm": 0.07842816450456827, + "learning_rate": 0.0008725184590219026, + "loss": 1.3689, + "step": 2377 + }, + { + "epoch": 0.26, + "grad_norm": 0.08355168533148163, + "learning_rate": 0.000872402302806201, + "loss": 1.5077, + "step": 2378 + }, + { + "epoch": 0.26, + "grad_norm": 0.07483659212043353, + "learning_rate": 0.0008722861014354363, + "loss": 1.4426, + "step": 2379 + }, + { + "epoch": 0.26, + "grad_norm": 0.08414078995361736, + "learning_rate": 0.0008721698549236982, + "loss": 1.5652, + "step": 2380 + }, + { + "epoch": 0.26, + "grad_norm": 0.07285448936483976, + "learning_rate": 0.0008720535632850823, + "loss": 1.4901, + "step": 2381 + }, + { + "epoch": 0.26, + "grad_norm": 0.07770740691795891, + "learning_rate": 0.0008719372265336892, + "loss": 1.4684, + "step": 2382 + }, + { + "epoch": 0.26, + "grad_norm": 0.09889862035074047, + "learning_rate": 0.0008718208446836251, + "loss": 1.4999, + "step": 2383 + }, + { + "epoch": 0.26, + "grad_norm": 0.07299850297868295, + "learning_rate": 0.0008717044177490017, + "loss": 1.3581, + "step": 2384 + }, + { + "epoch": 0.26, + "grad_norm": 0.07882407017544525, + "learning_rate": 0.0008715879457439362, + "loss": 1.3601, + "step": 2385 + }, + { + "epoch": 0.26, + "grad_norm": 0.08112163967024738, + "learning_rate": 0.0008714714286825511, + "loss": 1.3724, + "step": 2386 + }, + { + "epoch": 0.26, + "grad_norm": 0.09481491559845347, + "learning_rate": 0.0008713548665789748, + "loss": 1.6153, + "step": 2387 + }, + { + "epoch": 0.26, + "grad_norm": 0.07735816148190601, + "learning_rate": 0.0008712382594473404, + "loss": 1.3866, + "step": 2388 + }, + { + "epoch": 0.26, + "grad_norm": 0.08755764137667417, + "learning_rate": 0.0008711216073017875, + "loss": 1.4494, + "step": 2389 + }, + { + "epoch": 0.26, + "grad_norm": 0.08229766736555363, + "learning_rate": 0.0008710049101564601, + "loss": 1.4153, + "step": 2390 + }, + { + "epoch": 0.26, + "grad_norm": 0.0842704737549601, + "learning_rate": 0.0008708881680255083, + "loss": 1.4117, + "step": 2391 + }, + { + "epoch": 0.26, + "grad_norm": 0.08135189581146728, + "learning_rate": 0.0008707713809230875, + "loss": 1.5633, + "step": 2392 + }, + { + "epoch": 0.26, + "grad_norm": 0.0820669553456813, + "learning_rate": 0.0008706545488633586, + "loss": 1.4688, + "step": 2393 + }, + { + "epoch": 0.26, + "grad_norm": 0.07755666930453603, + "learning_rate": 0.0008705376718604877, + "loss": 1.4559, + "step": 2394 + }, + { + "epoch": 0.26, + "grad_norm": 0.0788686142056009, + "learning_rate": 0.0008704207499286467, + "loss": 1.4704, + "step": 2395 + }, + { + "epoch": 0.26, + "grad_norm": 0.08433337387934393, + "learning_rate": 0.0008703037830820127, + "loss": 1.4576, + "step": 2396 + }, + { + "epoch": 0.26, + "grad_norm": 0.07928307629309808, + "learning_rate": 0.0008701867713347684, + "loss": 1.4808, + "step": 2397 + }, + { + "epoch": 0.26, + "grad_norm": 0.083526767428123, + "learning_rate": 0.0008700697147011018, + "loss": 1.4463, + "step": 2398 + }, + { + "epoch": 0.26, + "grad_norm": 0.0837424016622009, + "learning_rate": 0.0008699526131952064, + "loss": 1.4302, + "step": 2399 + }, + { + "epoch": 0.26, + "grad_norm": 0.09432955877287996, + "learning_rate": 0.0008698354668312815, + "loss": 1.5715, + "step": 2400 + }, + { + "epoch": 0.26, + "grad_norm": 0.08495084110646854, + "learning_rate": 0.0008697182756235311, + "loss": 1.45, + "step": 2401 + }, + { + "epoch": 0.26, + "grad_norm": 0.07433196345469577, + "learning_rate": 0.0008696010395861651, + "loss": 1.4465, + "step": 2402 + }, + { + "epoch": 0.26, + "grad_norm": 0.07958498308048166, + "learning_rate": 0.0008694837587333988, + "loss": 1.4197, + "step": 2403 + }, + { + "epoch": 0.26, + "grad_norm": 0.07695594597259321, + "learning_rate": 0.000869366433079453, + "loss": 1.3394, + "step": 2404 + }, + { + "epoch": 0.26, + "grad_norm": 0.08917274057254963, + "learning_rate": 0.0008692490626385538, + "loss": 1.5033, + "step": 2405 + }, + { + "epoch": 0.26, + "grad_norm": 0.07292600891886494, + "learning_rate": 0.0008691316474249329, + "loss": 1.446, + "step": 2406 + }, + { + "epoch": 0.26, + "grad_norm": 0.07778910311002209, + "learning_rate": 0.000869014187452827, + "loss": 1.5116, + "step": 2407 + }, + { + "epoch": 0.26, + "grad_norm": 0.0758528324629491, + "learning_rate": 0.0008688966827364788, + "loss": 1.4107, + "step": 2408 + }, + { + "epoch": 0.26, + "grad_norm": 0.07499271508937456, + "learning_rate": 0.000868779133290136, + "loss": 1.417, + "step": 2409 + }, + { + "epoch": 0.26, + "grad_norm": 0.09554549660511087, + "learning_rate": 0.0008686615391280518, + "loss": 1.4752, + "step": 2410 + }, + { + "epoch": 0.26, + "grad_norm": 0.07605546223030824, + "learning_rate": 0.0008685439002644851, + "loss": 1.4271, + "step": 2411 + }, + { + "epoch": 0.26, + "grad_norm": 0.087182582522994, + "learning_rate": 0.0008684262167136998, + "loss": 1.4356, + "step": 2412 + }, + { + "epoch": 0.26, + "grad_norm": 0.07107863576670509, + "learning_rate": 0.0008683084884899656, + "loss": 1.3961, + "step": 2413 + }, + { + "epoch": 0.26, + "grad_norm": 0.08099026302786344, + "learning_rate": 0.0008681907156075577, + "loss": 1.5513, + "step": 2414 + }, + { + "epoch": 0.26, + "grad_norm": 0.09136275098009797, + "learning_rate": 0.0008680728980807559, + "loss": 1.5117, + "step": 2415 + }, + { + "epoch": 0.26, + "grad_norm": 0.09153738020255894, + "learning_rate": 0.0008679550359238464, + "loss": 1.391, + "step": 2416 + }, + { + "epoch": 0.26, + "grad_norm": 0.08700276520341825, + "learning_rate": 0.0008678371291511202, + "loss": 1.4262, + "step": 2417 + }, + { + "epoch": 0.26, + "grad_norm": 0.0845217493208463, + "learning_rate": 0.0008677191777768739, + "loss": 1.3337, + "step": 2418 + }, + { + "epoch": 0.26, + "grad_norm": 0.07806179350815017, + "learning_rate": 0.0008676011818154097, + "loss": 1.4307, + "step": 2419 + }, + { + "epoch": 0.26, + "grad_norm": 0.08075539501899112, + "learning_rate": 0.0008674831412810349, + "loss": 1.3418, + "step": 2420 + }, + { + "epoch": 0.26, + "grad_norm": 0.08976538106363947, + "learning_rate": 0.0008673650561880622, + "loss": 1.433, + "step": 2421 + }, + { + "epoch": 0.26, + "grad_norm": 0.08419710209291338, + "learning_rate": 0.0008672469265508099, + "loss": 1.3991, + "step": 2422 + }, + { + "epoch": 0.26, + "grad_norm": 0.07548679020927829, + "learning_rate": 0.0008671287523836018, + "loss": 1.3889, + "step": 2423 + }, + { + "epoch": 0.26, + "grad_norm": 0.07968383779478568, + "learning_rate": 0.0008670105337007667, + "loss": 1.4236, + "step": 2424 + }, + { + "epoch": 0.26, + "grad_norm": 0.07402167066513765, + "learning_rate": 0.0008668922705166391, + "loss": 1.3511, + "step": 2425 + }, + { + "epoch": 0.26, + "grad_norm": 0.07791837249202221, + "learning_rate": 0.0008667739628455591, + "loss": 1.6978, + "step": 2426 + }, + { + "epoch": 0.26, + "grad_norm": 0.07326641874026783, + "learning_rate": 0.0008666556107018713, + "loss": 1.3809, + "step": 2427 + }, + { + "epoch": 0.26, + "grad_norm": 0.08249221016536444, + "learning_rate": 0.0008665372140999268, + "loss": 1.4819, + "step": 2428 + }, + { + "epoch": 0.26, + "grad_norm": 0.08468100454038449, + "learning_rate": 0.0008664187730540813, + "loss": 1.3896, + "step": 2429 + }, + { + "epoch": 0.26, + "grad_norm": 0.07235421408348185, + "learning_rate": 0.0008663002875786965, + "loss": 1.3804, + "step": 2430 + }, + { + "epoch": 0.26, + "grad_norm": 0.07356618584967646, + "learning_rate": 0.0008661817576881391, + "loss": 1.4013, + "step": 2431 + }, + { + "epoch": 0.26, + "grad_norm": 0.08129543864427304, + "learning_rate": 0.0008660631833967809, + "loss": 1.4612, + "step": 2432 + }, + { + "epoch": 0.26, + "grad_norm": 0.07326509117595839, + "learning_rate": 0.0008659445647189999, + "loss": 1.4255, + "step": 2433 + }, + { + "epoch": 0.26, + "grad_norm": 0.07271571725202336, + "learning_rate": 0.0008658259016691786, + "loss": 1.3477, + "step": 2434 + }, + { + "epoch": 0.26, + "grad_norm": 0.07563391254650749, + "learning_rate": 0.0008657071942617056, + "loss": 1.5405, + "step": 2435 + }, + { + "epoch": 0.26, + "grad_norm": 0.07088489797268319, + "learning_rate": 0.0008655884425109747, + "loss": 1.4278, + "step": 2436 + }, + { + "epoch": 0.26, + "grad_norm": 0.07552308628678947, + "learning_rate": 0.0008654696464313846, + "loss": 1.4229, + "step": 2437 + }, + { + "epoch": 0.26, + "grad_norm": 0.08332445254415345, + "learning_rate": 0.0008653508060373399, + "loss": 1.4936, + "step": 2438 + }, + { + "epoch": 0.26, + "grad_norm": 0.07147667803000843, + "learning_rate": 0.0008652319213432504, + "loss": 1.4365, + "step": 2439 + }, + { + "epoch": 0.26, + "grad_norm": 0.07498735151078668, + "learning_rate": 0.0008651129923635314, + "loss": 1.4489, + "step": 2440 + }, + { + "epoch": 0.26, + "grad_norm": 0.08279578112131421, + "learning_rate": 0.0008649940191126033, + "loss": 1.4801, + "step": 2441 + }, + { + "epoch": 0.26, + "grad_norm": 0.07408268276283134, + "learning_rate": 0.0008648750016048921, + "loss": 1.3708, + "step": 2442 + }, + { + "epoch": 0.26, + "grad_norm": 0.08279177095879077, + "learning_rate": 0.000864755939854829, + "loss": 1.4605, + "step": 2443 + }, + { + "epoch": 0.26, + "grad_norm": 0.07600695074651367, + "learning_rate": 0.0008646368338768506, + "loss": 1.3836, + "step": 2444 + }, + { + "epoch": 0.26, + "grad_norm": 0.08935962208751888, + "learning_rate": 0.0008645176836853992, + "loss": 1.434, + "step": 2445 + }, + { + "epoch": 0.26, + "grad_norm": 0.07414169923498534, + "learning_rate": 0.0008643984892949217, + "loss": 1.4505, + "step": 2446 + }, + { + "epoch": 0.26, + "grad_norm": 0.088469364176592, + "learning_rate": 0.0008642792507198713, + "loss": 1.3818, + "step": 2447 + }, + { + "epoch": 0.26, + "grad_norm": 0.0826174631745449, + "learning_rate": 0.0008641599679747059, + "loss": 1.5778, + "step": 2448 + }, + { + "epoch": 0.26, + "grad_norm": 0.08789837591114688, + "learning_rate": 0.0008640406410738888, + "loss": 1.3421, + "step": 2449 + }, + { + "epoch": 0.26, + "grad_norm": 0.07804401790078475, + "learning_rate": 0.000863921270031889, + "loss": 1.5028, + "step": 2450 + }, + { + "epoch": 0.26, + "grad_norm": 0.07428320796104862, + "learning_rate": 0.0008638018548631805, + "loss": 1.4064, + "step": 2451 + }, + { + "epoch": 0.26, + "grad_norm": 0.07560885338288993, + "learning_rate": 0.000863682395582243, + "loss": 1.3384, + "step": 2452 + }, + { + "epoch": 0.26, + "grad_norm": 0.10187961286723403, + "learning_rate": 0.0008635628922035613, + "loss": 1.5161, + "step": 2453 + }, + { + "epoch": 0.26, + "grad_norm": 0.09026702268572244, + "learning_rate": 0.0008634433447416253, + "loss": 1.4465, + "step": 2454 + }, + { + "epoch": 0.26, + "grad_norm": 0.09035052032958152, + "learning_rate": 0.000863323753210931, + "loss": 1.441, + "step": 2455 + }, + { + "epoch": 0.26, + "grad_norm": 0.0813296004930123, + "learning_rate": 0.0008632041176259788, + "loss": 1.4335, + "step": 2456 + }, + { + "epoch": 0.26, + "grad_norm": 0.08422711412673596, + "learning_rate": 0.0008630844380012754, + "loss": 1.4123, + "step": 2457 + }, + { + "epoch": 0.26, + "grad_norm": 0.07926370932506693, + "learning_rate": 0.0008629647143513321, + "loss": 1.3936, + "step": 2458 + }, + { + "epoch": 0.26, + "grad_norm": 0.07404514764363598, + "learning_rate": 0.0008628449466906658, + "loss": 1.3632, + "step": 2459 + }, + { + "epoch": 0.26, + "grad_norm": 0.07673003267922554, + "learning_rate": 0.0008627251350337988, + "loss": 1.4799, + "step": 2460 + }, + { + "epoch": 0.26, + "grad_norm": 0.07558809531135767, + "learning_rate": 0.0008626052793952588, + "loss": 1.3333, + "step": 2461 + }, + { + "epoch": 0.26, + "grad_norm": 0.07736495424535647, + "learning_rate": 0.0008624853797895784, + "loss": 1.4643, + "step": 2462 + }, + { + "epoch": 0.26, + "grad_norm": 0.0781190229156017, + "learning_rate": 0.0008623654362312961, + "loss": 1.4538, + "step": 2463 + }, + { + "epoch": 0.26, + "grad_norm": 0.08576676357567804, + "learning_rate": 0.0008622454487349554, + "loss": 1.4785, + "step": 2464 + }, + { + "epoch": 0.26, + "grad_norm": 0.07382184959413687, + "learning_rate": 0.0008621254173151052, + "loss": 1.4344, + "step": 2465 + }, + { + "epoch": 0.27, + "grad_norm": 0.08142733655694544, + "learning_rate": 0.0008620053419862997, + "loss": 1.4177, + "step": 2466 + }, + { + "epoch": 0.27, + "grad_norm": 0.0752615574922961, + "learning_rate": 0.0008618852227630985, + "loss": 1.4311, + "step": 2467 + }, + { + "epoch": 0.27, + "grad_norm": 0.09972312920589078, + "learning_rate": 0.0008617650596600665, + "loss": 1.4834, + "step": 2468 + }, + { + "epoch": 0.27, + "grad_norm": 0.08568112647402251, + "learning_rate": 0.0008616448526917736, + "loss": 1.4549, + "step": 2469 + }, + { + "epoch": 0.27, + "grad_norm": 0.0780979765408123, + "learning_rate": 0.0008615246018727956, + "loss": 1.4037, + "step": 2470 + }, + { + "epoch": 0.27, + "grad_norm": 0.08294238148339915, + "learning_rate": 0.0008614043072177135, + "loss": 1.4814, + "step": 2471 + }, + { + "epoch": 0.27, + "grad_norm": 0.09715355593692114, + "learning_rate": 0.000861283968741113, + "loss": 1.5874, + "step": 2472 + }, + { + "epoch": 0.27, + "grad_norm": 0.07782506887676875, + "learning_rate": 0.0008611635864575857, + "loss": 1.381, + "step": 2473 + }, + { + "epoch": 0.27, + "grad_norm": 0.0906288348611541, + "learning_rate": 0.0008610431603817285, + "loss": 1.3718, + "step": 2474 + }, + { + "epoch": 0.27, + "grad_norm": 0.07870778480964968, + "learning_rate": 0.0008609226905281433, + "loss": 1.4466, + "step": 2475 + }, + { + "epoch": 0.27, + "grad_norm": 0.08602254355096633, + "learning_rate": 0.0008608021769114378, + "loss": 1.4097, + "step": 2476 + }, + { + "epoch": 0.27, + "grad_norm": 0.0854785814244712, + "learning_rate": 0.0008606816195462243, + "loss": 1.3345, + "step": 2477 + }, + { + "epoch": 0.27, + "grad_norm": 0.08923277482885959, + "learning_rate": 0.000860561018447121, + "loss": 1.4728, + "step": 2478 + }, + { + "epoch": 0.27, + "grad_norm": 0.09441553256745243, + "learning_rate": 0.0008604403736287512, + "loss": 1.4979, + "step": 2479 + }, + { + "epoch": 0.27, + "grad_norm": 0.09169254563073491, + "learning_rate": 0.0008603196851057434, + "loss": 1.4615, + "step": 2480 + }, + { + "epoch": 0.27, + "grad_norm": 0.09397528694611802, + "learning_rate": 0.0008601989528927317, + "loss": 1.3988, + "step": 2481 + }, + { + "epoch": 0.27, + "grad_norm": 0.08594017810333246, + "learning_rate": 0.0008600781770043551, + "loss": 1.434, + "step": 2482 + }, + { + "epoch": 0.27, + "grad_norm": 0.0857484672914396, + "learning_rate": 0.0008599573574552582, + "loss": 1.4412, + "step": 2483 + }, + { + "epoch": 0.27, + "grad_norm": 0.09556263713315377, + "learning_rate": 0.0008598364942600906, + "loss": 1.5596, + "step": 2484 + }, + { + "epoch": 0.27, + "grad_norm": 0.096404618723962, + "learning_rate": 0.0008597155874335076, + "loss": 1.4049, + "step": 2485 + }, + { + "epoch": 0.27, + "grad_norm": 0.08729667242752806, + "learning_rate": 0.0008595946369901696, + "loss": 1.4342, + "step": 2486 + }, + { + "epoch": 0.27, + "grad_norm": 0.08010480744400537, + "learning_rate": 0.0008594736429447421, + "loss": 1.4179, + "step": 2487 + }, + { + "epoch": 0.27, + "grad_norm": 0.07866884276153974, + "learning_rate": 0.0008593526053118961, + "loss": 1.5207, + "step": 2488 + }, + { + "epoch": 0.27, + "grad_norm": 0.08333403772066363, + "learning_rate": 0.0008592315241063077, + "loss": 1.4315, + "step": 2489 + }, + { + "epoch": 0.27, + "grad_norm": 0.08300905086491174, + "learning_rate": 0.0008591103993426588, + "loss": 1.3496, + "step": 2490 + }, + { + "epoch": 0.27, + "grad_norm": 0.0820231745923072, + "learning_rate": 0.0008589892310356357, + "loss": 1.4858, + "step": 2491 + }, + { + "epoch": 0.27, + "grad_norm": 0.09302760868607428, + "learning_rate": 0.0008588680191999308, + "loss": 1.4802, + "step": 2492 + }, + { + "epoch": 0.27, + "grad_norm": 0.08616521316743678, + "learning_rate": 0.0008587467638502413, + "loss": 1.4069, + "step": 2493 + }, + { + "epoch": 0.27, + "grad_norm": 0.08199246375158696, + "learning_rate": 0.0008586254650012699, + "loss": 1.4902, + "step": 2494 + }, + { + "epoch": 0.27, + "grad_norm": 0.08134043261911114, + "learning_rate": 0.0008585041226677247, + "loss": 1.4154, + "step": 2495 + }, + { + "epoch": 0.27, + "grad_norm": 0.07958788079491519, + "learning_rate": 0.0008583827368643185, + "loss": 1.4978, + "step": 2496 + }, + { + "epoch": 0.27, + "grad_norm": 0.07389846036126062, + "learning_rate": 0.0008582613076057699, + "loss": 1.4757, + "step": 2497 + }, + { + "epoch": 0.27, + "grad_norm": 0.07637567948544043, + "learning_rate": 0.0008581398349068028, + "loss": 1.3928, + "step": 2498 + }, + { + "epoch": 0.27, + "grad_norm": 0.07228880909881819, + "learning_rate": 0.0008580183187821459, + "loss": 1.496, + "step": 2499 + }, + { + "epoch": 0.27, + "grad_norm": 0.07744674862621322, + "learning_rate": 0.0008578967592465335, + "loss": 1.409, + "step": 2500 + }, + { + "epoch": 0.27, + "grad_norm": 0.07110313278941842, + "learning_rate": 0.0008577751563147054, + "loss": 1.4029, + "step": 2501 + }, + { + "epoch": 0.27, + "grad_norm": 0.0699785168285317, + "learning_rate": 0.000857653510001406, + "loss": 1.365, + "step": 2502 + }, + { + "epoch": 0.27, + "grad_norm": 0.07645102695193777, + "learning_rate": 0.0008575318203213855, + "loss": 1.3709, + "step": 2503 + }, + { + "epoch": 0.27, + "grad_norm": 0.09179887617243095, + "learning_rate": 0.0008574100872893992, + "loss": 1.6029, + "step": 2504 + }, + { + "epoch": 0.27, + "grad_norm": 0.08142107716187588, + "learning_rate": 0.0008572883109202077, + "loss": 1.4783, + "step": 2505 + }, + { + "epoch": 0.27, + "grad_norm": 0.06824772265618469, + "learning_rate": 0.0008571664912285766, + "loss": 1.3282, + "step": 2506 + }, + { + "epoch": 0.27, + "grad_norm": 0.07416330481288567, + "learning_rate": 0.0008570446282292773, + "loss": 1.439, + "step": 2507 + }, + { + "epoch": 0.27, + "grad_norm": 0.07513943521999795, + "learning_rate": 0.0008569227219370856, + "loss": 1.4978, + "step": 2508 + }, + { + "epoch": 0.27, + "grad_norm": 0.07361208809988612, + "learning_rate": 0.0008568007723667837, + "loss": 1.6253, + "step": 2509 + }, + { + "epoch": 0.27, + "grad_norm": 0.08213288675158097, + "learning_rate": 0.0008566787795331579, + "loss": 1.4, + "step": 2510 + }, + { + "epoch": 0.27, + "grad_norm": 0.07926099951067463, + "learning_rate": 0.0008565567434510004, + "loss": 1.4767, + "step": 2511 + }, + { + "epoch": 0.27, + "grad_norm": 0.07761340571003827, + "learning_rate": 0.0008564346641351087, + "loss": 1.3751, + "step": 2512 + }, + { + "epoch": 0.27, + "grad_norm": 0.07060565203198378, + "learning_rate": 0.0008563125416002849, + "loss": 1.4009, + "step": 2513 + }, + { + "epoch": 0.27, + "grad_norm": 0.07997901091917972, + "learning_rate": 0.0008561903758613372, + "loss": 1.5378, + "step": 2514 + }, + { + "epoch": 0.27, + "grad_norm": 0.08769052562520391, + "learning_rate": 0.0008560681669330783, + "loss": 1.3733, + "step": 2515 + }, + { + "epoch": 0.27, + "grad_norm": 0.07068911164600497, + "learning_rate": 0.0008559459148303268, + "loss": 1.4424, + "step": 2516 + }, + { + "epoch": 0.27, + "grad_norm": 0.06882590599628988, + "learning_rate": 0.0008558236195679059, + "loss": 1.4711, + "step": 2517 + }, + { + "epoch": 0.27, + "grad_norm": 0.07503865848801855, + "learning_rate": 0.0008557012811606444, + "loss": 1.5057, + "step": 2518 + }, + { + "epoch": 0.27, + "grad_norm": 0.07369198331548805, + "learning_rate": 0.0008555788996233764, + "loss": 1.3597, + "step": 2519 + }, + { + "epoch": 0.27, + "grad_norm": 0.06915275711769295, + "learning_rate": 0.0008554564749709408, + "loss": 1.5348, + "step": 2520 + }, + { + "epoch": 0.27, + "grad_norm": 0.08524842375728962, + "learning_rate": 0.0008553340072181822, + "loss": 1.2874, + "step": 2521 + }, + { + "epoch": 0.27, + "grad_norm": 0.08088460709388172, + "learning_rate": 0.0008552114963799502, + "loss": 1.4587, + "step": 2522 + }, + { + "epoch": 0.27, + "grad_norm": 0.07020129575793528, + "learning_rate": 0.0008550889424710997, + "loss": 1.494, + "step": 2523 + }, + { + "epoch": 0.27, + "grad_norm": 0.07286098460203513, + "learning_rate": 0.0008549663455064907, + "loss": 1.4065, + "step": 2524 + }, + { + "epoch": 0.27, + "grad_norm": 0.07224682131553291, + "learning_rate": 0.0008548437055009886, + "loss": 1.3713, + "step": 2525 + }, + { + "epoch": 0.27, + "grad_norm": 0.07626714388252791, + "learning_rate": 0.0008547210224694639, + "loss": 1.4525, + "step": 2526 + }, + { + "epoch": 0.27, + "grad_norm": 0.08518727889962867, + "learning_rate": 0.0008545982964267922, + "loss": 1.3893, + "step": 2527 + }, + { + "epoch": 0.27, + "grad_norm": 0.08097460248924065, + "learning_rate": 0.0008544755273878546, + "loss": 1.3658, + "step": 2528 + }, + { + "epoch": 0.27, + "grad_norm": 0.07863853475459574, + "learning_rate": 0.0008543527153675374, + "loss": 1.5665, + "step": 2529 + }, + { + "epoch": 0.27, + "grad_norm": 0.07424216152167827, + "learning_rate": 0.0008542298603807317, + "loss": 1.3939, + "step": 2530 + }, + { + "epoch": 0.27, + "grad_norm": 0.0711106261983796, + "learning_rate": 0.0008541069624423343, + "loss": 1.3313, + "step": 2531 + }, + { + "epoch": 0.27, + "grad_norm": 0.0786625769759333, + "learning_rate": 0.0008539840215672467, + "loss": 1.4449, + "step": 2532 + }, + { + "epoch": 0.27, + "grad_norm": 0.07540816364950761, + "learning_rate": 0.0008538610377703764, + "loss": 1.5184, + "step": 2533 + }, + { + "epoch": 0.27, + "grad_norm": 0.07406561827332242, + "learning_rate": 0.0008537380110666351, + "loss": 1.4328, + "step": 2534 + }, + { + "epoch": 0.27, + "grad_norm": 0.07747251536991673, + "learning_rate": 0.0008536149414709404, + "loss": 1.4349, + "step": 2535 + }, + { + "epoch": 0.27, + "grad_norm": 0.07339786673715235, + "learning_rate": 0.0008534918289982152, + "loss": 1.3211, + "step": 2536 + }, + { + "epoch": 0.27, + "grad_norm": 0.07446062835008409, + "learning_rate": 0.0008533686736633868, + "loss": 1.396, + "step": 2537 + }, + { + "epoch": 0.27, + "grad_norm": 0.08408711492254073, + "learning_rate": 0.0008532454754813886, + "loss": 1.4873, + "step": 2538 + }, + { + "epoch": 0.27, + "grad_norm": 0.06694151801394763, + "learning_rate": 0.0008531222344671588, + "loss": 1.5546, + "step": 2539 + }, + { + "epoch": 0.27, + "grad_norm": 0.10107174239142706, + "learning_rate": 0.0008529989506356406, + "loss": 1.4372, + "step": 2540 + }, + { + "epoch": 0.27, + "grad_norm": 0.07716872380123085, + "learning_rate": 0.0008528756240017825, + "loss": 1.3469, + "step": 2541 + }, + { + "epoch": 0.27, + "grad_norm": 0.1048404291852175, + "learning_rate": 0.0008527522545805385, + "loss": 1.4357, + "step": 2542 + }, + { + "epoch": 0.27, + "grad_norm": 0.06980478409748192, + "learning_rate": 0.0008526288423868675, + "loss": 1.3616, + "step": 2543 + }, + { + "epoch": 0.27, + "grad_norm": 0.07265651507995816, + "learning_rate": 0.0008525053874357338, + "loss": 1.4286, + "step": 2544 + }, + { + "epoch": 0.27, + "grad_norm": 0.07424307098433014, + "learning_rate": 0.0008523818897421065, + "loss": 1.4647, + "step": 2545 + }, + { + "epoch": 0.27, + "grad_norm": 0.07441297327581105, + "learning_rate": 0.0008522583493209603, + "loss": 1.4718, + "step": 2546 + }, + { + "epoch": 0.27, + "grad_norm": 0.07910137117621747, + "learning_rate": 0.0008521347661872748, + "loss": 1.5926, + "step": 2547 + }, + { + "epoch": 0.27, + "grad_norm": 0.07998697396490068, + "learning_rate": 0.0008520111403560349, + "loss": 1.3263, + "step": 2548 + }, + { + "epoch": 0.27, + "grad_norm": 0.0724968400988003, + "learning_rate": 0.0008518874718422307, + "loss": 1.3337, + "step": 2549 + }, + { + "epoch": 0.27, + "grad_norm": 0.0798632264900906, + "learning_rate": 0.0008517637606608573, + "loss": 1.4341, + "step": 2550 + }, + { + "epoch": 0.27, + "grad_norm": 0.07801198011377168, + "learning_rate": 0.0008516400068269152, + "loss": 1.5612, + "step": 2551 + }, + { + "epoch": 0.27, + "grad_norm": 0.08604863144633705, + "learning_rate": 0.00085151621035541, + "loss": 1.4635, + "step": 2552 + }, + { + "epoch": 0.27, + "grad_norm": 0.07636576667059033, + "learning_rate": 0.0008513923712613525, + "loss": 1.5159, + "step": 2553 + }, + { + "epoch": 0.27, + "grad_norm": 0.10516077127557029, + "learning_rate": 0.0008512684895597586, + "loss": 1.4889, + "step": 2554 + }, + { + "epoch": 0.27, + "grad_norm": 0.07562060657657407, + "learning_rate": 0.0008511445652656494, + "loss": 1.4407, + "step": 2555 + }, + { + "epoch": 0.27, + "grad_norm": 0.07763797531070722, + "learning_rate": 0.0008510205983940507, + "loss": 1.5135, + "step": 2556 + }, + { + "epoch": 0.27, + "grad_norm": 0.07375876092295945, + "learning_rate": 0.0008508965889599947, + "loss": 1.4025, + "step": 2557 + }, + { + "epoch": 0.27, + "grad_norm": 0.07563084794765446, + "learning_rate": 0.0008507725369785173, + "loss": 1.3534, + "step": 2558 + }, + { + "epoch": 0.28, + "grad_norm": 0.07692755695568271, + "learning_rate": 0.0008506484424646606, + "loss": 1.4164, + "step": 2559 + }, + { + "epoch": 0.28, + "grad_norm": 0.0808219714314826, + "learning_rate": 0.0008505243054334713, + "loss": 1.4273, + "step": 2560 + }, + { + "epoch": 0.28, + "grad_norm": 0.08065608841873591, + "learning_rate": 0.0008504001259000016, + "loss": 1.4571, + "step": 2561 + }, + { + "epoch": 0.28, + "grad_norm": 0.08063335526960716, + "learning_rate": 0.0008502759038793087, + "loss": 1.3944, + "step": 2562 + }, + { + "epoch": 0.28, + "grad_norm": 0.09265606382023213, + "learning_rate": 0.0008501516393864548, + "loss": 1.5381, + "step": 2563 + }, + { + "epoch": 0.28, + "grad_norm": 0.08276084737543599, + "learning_rate": 0.0008500273324365073, + "loss": 1.4669, + "step": 2564 + }, + { + "epoch": 0.28, + "grad_norm": 0.08159564795043535, + "learning_rate": 0.0008499029830445393, + "loss": 1.4467, + "step": 2565 + }, + { + "epoch": 0.28, + "grad_norm": 0.08220391890474721, + "learning_rate": 0.0008497785912256282, + "loss": 1.3983, + "step": 2566 + }, + { + "epoch": 0.28, + "grad_norm": 0.07864749205521121, + "learning_rate": 0.000849654156994857, + "loss": 1.538, + "step": 2567 + }, + { + "epoch": 0.28, + "grad_norm": 0.08184544063144142, + "learning_rate": 0.0008495296803673138, + "loss": 1.4729, + "step": 2568 + }, + { + "epoch": 0.28, + "grad_norm": 0.08469200046218774, + "learning_rate": 0.0008494051613580918, + "loss": 1.402, + "step": 2569 + }, + { + "epoch": 0.28, + "grad_norm": 0.07779454288092019, + "learning_rate": 0.0008492805999822897, + "loss": 1.4113, + "step": 2570 + }, + { + "epoch": 0.28, + "grad_norm": 0.08809251512257028, + "learning_rate": 0.0008491559962550104, + "loss": 1.4379, + "step": 2571 + }, + { + "epoch": 0.28, + "grad_norm": 0.08325662907420392, + "learning_rate": 0.0008490313501913629, + "loss": 1.3634, + "step": 2572 + }, + { + "epoch": 0.28, + "grad_norm": 0.08503557494839924, + "learning_rate": 0.0008489066618064609, + "loss": 1.482, + "step": 2573 + }, + { + "epoch": 0.28, + "grad_norm": 0.08179961293993213, + "learning_rate": 0.0008487819311154233, + "loss": 1.3169, + "step": 2574 + }, + { + "epoch": 0.28, + "grad_norm": 0.07767794072518784, + "learning_rate": 0.0008486571581333742, + "loss": 1.4895, + "step": 2575 + }, + { + "epoch": 0.28, + "grad_norm": 0.07652551678602128, + "learning_rate": 0.0008485323428754426, + "loss": 1.4348, + "step": 2576 + }, + { + "epoch": 0.28, + "grad_norm": 0.07414488818852104, + "learning_rate": 0.0008484074853567629, + "loss": 1.395, + "step": 2577 + }, + { + "epoch": 0.28, + "grad_norm": 0.08519549165590558, + "learning_rate": 0.0008482825855924743, + "loss": 1.4378, + "step": 2578 + }, + { + "epoch": 0.28, + "grad_norm": 0.08324947940809344, + "learning_rate": 0.0008481576435977217, + "loss": 1.3522, + "step": 2579 + }, + { + "epoch": 0.28, + "grad_norm": 0.06811742710699656, + "learning_rate": 0.0008480326593876544, + "loss": 1.3793, + "step": 2580 + }, + { + "epoch": 0.28, + "grad_norm": 0.0812754167751017, + "learning_rate": 0.0008479076329774274, + "loss": 1.4694, + "step": 2581 + }, + { + "epoch": 0.28, + "grad_norm": 0.07372976296447055, + "learning_rate": 0.0008477825643822003, + "loss": 1.5546, + "step": 2582 + }, + { + "epoch": 0.28, + "grad_norm": 0.08190879452155753, + "learning_rate": 0.0008476574536171385, + "loss": 1.3917, + "step": 2583 + }, + { + "epoch": 0.28, + "grad_norm": 0.09192577655834207, + "learning_rate": 0.0008475323006974116, + "loss": 1.5549, + "step": 2584 + }, + { + "epoch": 0.28, + "grad_norm": 0.07686509843963789, + "learning_rate": 0.0008474071056381953, + "loss": 1.4178, + "step": 2585 + }, + { + "epoch": 0.28, + "grad_norm": 0.0905695216161556, + "learning_rate": 0.0008472818684546697, + "loss": 1.4021, + "step": 2586 + }, + { + "epoch": 0.28, + "grad_norm": 0.07791879082984891, + "learning_rate": 0.0008471565891620203, + "loss": 1.5132, + "step": 2587 + }, + { + "epoch": 0.28, + "grad_norm": 0.07101522886996688, + "learning_rate": 0.0008470312677754377, + "loss": 1.4741, + "step": 2588 + }, + { + "epoch": 0.28, + "grad_norm": 0.08034711963413725, + "learning_rate": 0.0008469059043101175, + "loss": 1.4286, + "step": 2589 + }, + { + "epoch": 0.28, + "grad_norm": 0.07548169925316262, + "learning_rate": 0.0008467804987812603, + "loss": 1.3944, + "step": 2590 + }, + { + "epoch": 0.28, + "grad_norm": 0.07519177174592058, + "learning_rate": 0.0008466550512040722, + "loss": 1.4444, + "step": 2591 + }, + { + "epoch": 0.28, + "grad_norm": 0.09383438703269041, + "learning_rate": 0.0008465295615937641, + "loss": 1.4118, + "step": 2592 + }, + { + "epoch": 0.28, + "grad_norm": 0.08051542055976478, + "learning_rate": 0.0008464040299655518, + "loss": 1.4393, + "step": 2593 + }, + { + "epoch": 0.28, + "grad_norm": 0.08233997317534307, + "learning_rate": 0.0008462784563346567, + "loss": 1.401, + "step": 2594 + }, + { + "epoch": 0.28, + "grad_norm": 0.08144426653897043, + "learning_rate": 0.000846152840716305, + "loss": 1.4978, + "step": 2595 + }, + { + "epoch": 0.28, + "grad_norm": 0.09101065818594047, + "learning_rate": 0.000846027183125728, + "loss": 1.4853, + "step": 2596 + }, + { + "epoch": 0.28, + "grad_norm": 0.08392381563918748, + "learning_rate": 0.0008459014835781621, + "loss": 1.3675, + "step": 2597 + }, + { + "epoch": 0.28, + "grad_norm": 0.0800643944017374, + "learning_rate": 0.0008457757420888488, + "loss": 1.4346, + "step": 2598 + }, + { + "epoch": 0.28, + "grad_norm": 0.07982629773993684, + "learning_rate": 0.0008456499586730346, + "loss": 1.3716, + "step": 2599 + }, + { + "epoch": 0.28, + "grad_norm": 0.0933833278071163, + "learning_rate": 0.0008455241333459715, + "loss": 1.4097, + "step": 2600 + }, + { + "epoch": 0.28, + "grad_norm": 0.09264156229360837, + "learning_rate": 0.0008453982661229158, + "loss": 1.4631, + "step": 2601 + }, + { + "epoch": 0.28, + "grad_norm": 0.09061343077073224, + "learning_rate": 0.0008452723570191297, + "loss": 1.5063, + "step": 2602 + }, + { + "epoch": 0.28, + "grad_norm": 0.07478352133945661, + "learning_rate": 0.0008451464060498799, + "loss": 1.2421, + "step": 2603 + }, + { + "epoch": 0.28, + "grad_norm": 0.08105557941037968, + "learning_rate": 0.0008450204132304386, + "loss": 1.4022, + "step": 2604 + }, + { + "epoch": 0.28, + "grad_norm": 0.08207495978087463, + "learning_rate": 0.0008448943785760826, + "loss": 1.3719, + "step": 2605 + }, + { + "epoch": 0.28, + "grad_norm": 0.08368550950846189, + "learning_rate": 0.0008447683021020942, + "loss": 1.5827, + "step": 2606 + }, + { + "epoch": 0.28, + "grad_norm": 0.07688854298372594, + "learning_rate": 0.0008446421838237605, + "loss": 1.4621, + "step": 2607 + }, + { + "epoch": 0.28, + "grad_norm": 0.09196343428155447, + "learning_rate": 0.0008445160237563741, + "loss": 1.5451, + "step": 2608 + }, + { + "epoch": 0.28, + "grad_norm": 0.07903413930876052, + "learning_rate": 0.0008443898219152319, + "loss": 1.4598, + "step": 2609 + }, + { + "epoch": 0.28, + "grad_norm": 0.07076947983689727, + "learning_rate": 0.0008442635783156365, + "loss": 1.3268, + "step": 2610 + }, + { + "epoch": 0.28, + "grad_norm": 0.07636266741339315, + "learning_rate": 0.0008441372929728954, + "loss": 1.5277, + "step": 2611 + }, + { + "epoch": 0.28, + "grad_norm": 0.07600285349782061, + "learning_rate": 0.0008440109659023211, + "loss": 1.4652, + "step": 2612 + }, + { + "epoch": 0.28, + "grad_norm": 0.07201022852211071, + "learning_rate": 0.0008438845971192313, + "loss": 1.4532, + "step": 2613 + }, + { + "epoch": 0.28, + "grad_norm": 0.07285683346924836, + "learning_rate": 0.0008437581866389483, + "loss": 1.4461, + "step": 2614 + }, + { + "epoch": 0.28, + "grad_norm": 0.07768183722711053, + "learning_rate": 0.0008436317344768003, + "loss": 1.585, + "step": 2615 + }, + { + "epoch": 0.28, + "grad_norm": 0.08838640887202365, + "learning_rate": 0.0008435052406481196, + "loss": 1.4553, + "step": 2616 + }, + { + "epoch": 0.28, + "grad_norm": 0.08074211986383892, + "learning_rate": 0.0008433787051682443, + "loss": 1.5233, + "step": 2617 + }, + { + "epoch": 0.28, + "grad_norm": 0.07530166795895882, + "learning_rate": 0.0008432521280525174, + "loss": 1.543, + "step": 2618 + }, + { + "epoch": 0.28, + "grad_norm": 0.07064169571659298, + "learning_rate": 0.0008431255093162864, + "loss": 1.5052, + "step": 2619 + }, + { + "epoch": 0.28, + "grad_norm": 0.07175033898234041, + "learning_rate": 0.0008429988489749046, + "loss": 1.5151, + "step": 2620 + }, + { + "epoch": 0.28, + "grad_norm": 0.07320433910283142, + "learning_rate": 0.0008428721470437296, + "loss": 1.3737, + "step": 2621 + }, + { + "epoch": 0.28, + "grad_norm": 0.07414426014352578, + "learning_rate": 0.0008427454035381249, + "loss": 1.5339, + "step": 2622 + }, + { + "epoch": 0.28, + "grad_norm": 0.09143642921759464, + "learning_rate": 0.0008426186184734585, + "loss": 1.4326, + "step": 2623 + }, + { + "epoch": 0.28, + "grad_norm": 0.0845165786223017, + "learning_rate": 0.0008424917918651031, + "loss": 1.4812, + "step": 2624 + }, + { + "epoch": 0.28, + "grad_norm": 0.0759915730522228, + "learning_rate": 0.0008423649237284376, + "loss": 1.4493, + "step": 2625 + }, + { + "epoch": 0.28, + "grad_norm": 0.0745678505883287, + "learning_rate": 0.0008422380140788445, + "loss": 1.36, + "step": 2626 + }, + { + "epoch": 0.28, + "grad_norm": 0.08146179178868668, + "learning_rate": 0.0008421110629317123, + "loss": 1.4266, + "step": 2627 + }, + { + "epoch": 0.28, + "grad_norm": 0.07628880445326346, + "learning_rate": 0.0008419840703024344, + "loss": 1.401, + "step": 2628 + }, + { + "epoch": 0.28, + "grad_norm": 0.07686954348813545, + "learning_rate": 0.0008418570362064091, + "loss": 1.5006, + "step": 2629 + }, + { + "epoch": 0.28, + "grad_norm": 0.07513837237629467, + "learning_rate": 0.0008417299606590393, + "loss": 1.4112, + "step": 2630 + }, + { + "epoch": 0.28, + "grad_norm": 0.07439513015706296, + "learning_rate": 0.0008416028436757339, + "loss": 1.3948, + "step": 2631 + }, + { + "epoch": 0.28, + "grad_norm": 0.07113589119232142, + "learning_rate": 0.0008414756852719059, + "loss": 1.3521, + "step": 2632 + }, + { + "epoch": 0.28, + "grad_norm": 0.0746111940741349, + "learning_rate": 0.0008413484854629739, + "loss": 1.5365, + "step": 2633 + }, + { + "epoch": 0.28, + "grad_norm": 0.08053469900948855, + "learning_rate": 0.0008412212442643611, + "loss": 1.317, + "step": 2634 + }, + { + "epoch": 0.28, + "grad_norm": 0.07311809517893839, + "learning_rate": 0.0008410939616914961, + "loss": 1.3688, + "step": 2635 + }, + { + "epoch": 0.28, + "grad_norm": 0.08879393398483074, + "learning_rate": 0.0008409666377598124, + "loss": 1.4062, + "step": 2636 + }, + { + "epoch": 0.28, + "grad_norm": 0.06956310232444521, + "learning_rate": 0.0008408392724847482, + "loss": 1.3274, + "step": 2637 + }, + { + "epoch": 0.28, + "grad_norm": 0.07211476766646407, + "learning_rate": 0.0008407118658817474, + "loss": 1.4828, + "step": 2638 + }, + { + "epoch": 0.28, + "grad_norm": 0.07084769306180853, + "learning_rate": 0.0008405844179662581, + "loss": 1.46, + "step": 2639 + }, + { + "epoch": 0.28, + "grad_norm": 0.08089545995057705, + "learning_rate": 0.000840456928753734, + "loss": 1.4635, + "step": 2640 + }, + { + "epoch": 0.28, + "grad_norm": 0.09611046115627091, + "learning_rate": 0.0008403293982596336, + "loss": 1.505, + "step": 2641 + }, + { + "epoch": 0.28, + "grad_norm": 0.07674825520262066, + "learning_rate": 0.0008402018264994203, + "loss": 1.4323, + "step": 2642 + }, + { + "epoch": 0.28, + "grad_norm": 0.0753083169510489, + "learning_rate": 0.0008400742134885627, + "loss": 1.4169, + "step": 2643 + }, + { + "epoch": 0.28, + "grad_norm": 0.07514595139372099, + "learning_rate": 0.0008399465592425342, + "loss": 1.3978, + "step": 2644 + }, + { + "epoch": 0.28, + "grad_norm": 0.06846109808760739, + "learning_rate": 0.0008398188637768136, + "loss": 1.3855, + "step": 2645 + }, + { + "epoch": 0.28, + "grad_norm": 0.08015966915623766, + "learning_rate": 0.0008396911271068841, + "loss": 1.4843, + "step": 2646 + }, + { + "epoch": 0.28, + "grad_norm": 0.07651693334730639, + "learning_rate": 0.0008395633492482344, + "loss": 1.4069, + "step": 2647 + }, + { + "epoch": 0.28, + "grad_norm": 0.08363488440862517, + "learning_rate": 0.0008394355302163578, + "loss": 1.5263, + "step": 2648 + }, + { + "epoch": 0.28, + "grad_norm": 0.0910487532185436, + "learning_rate": 0.0008393076700267532, + "loss": 1.5277, + "step": 2649 + }, + { + "epoch": 0.28, + "grad_norm": 0.08272526983826728, + "learning_rate": 0.0008391797686949237, + "loss": 1.4217, + "step": 2650 + }, + { + "epoch": 0.28, + "grad_norm": 0.08459734947024608, + "learning_rate": 0.000839051826236378, + "loss": 1.3437, + "step": 2651 + }, + { + "epoch": 0.29, + "grad_norm": 0.07350458824423961, + "learning_rate": 0.0008389238426666294, + "loss": 1.4984, + "step": 2652 + }, + { + "epoch": 0.29, + "grad_norm": 0.07125971137650254, + "learning_rate": 0.0008387958180011964, + "loss": 1.329, + "step": 2653 + }, + { + "epoch": 0.29, + "grad_norm": 0.0819044410255831, + "learning_rate": 0.0008386677522556025, + "loss": 1.372, + "step": 2654 + }, + { + "epoch": 0.29, + "grad_norm": 0.07611520811269741, + "learning_rate": 0.0008385396454453762, + "loss": 1.6093, + "step": 2655 + }, + { + "epoch": 0.29, + "grad_norm": 0.06776836737705744, + "learning_rate": 0.0008384114975860507, + "loss": 1.4628, + "step": 2656 + }, + { + "epoch": 0.29, + "grad_norm": 0.07221637155692598, + "learning_rate": 0.0008382833086931642, + "loss": 1.2861, + "step": 2657 + }, + { + "epoch": 0.29, + "grad_norm": 0.07581032939812374, + "learning_rate": 0.0008381550787822605, + "loss": 1.588, + "step": 2658 + }, + { + "epoch": 0.29, + "grad_norm": 0.0767775626653725, + "learning_rate": 0.0008380268078688877, + "loss": 1.4529, + "step": 2659 + }, + { + "epoch": 0.29, + "grad_norm": 0.07200170746516522, + "learning_rate": 0.0008378984959685991, + "loss": 1.4092, + "step": 2660 + }, + { + "epoch": 0.29, + "grad_norm": 0.07172741580740591, + "learning_rate": 0.0008377701430969528, + "loss": 1.4825, + "step": 2661 + }, + { + "epoch": 0.29, + "grad_norm": 0.06814074124177844, + "learning_rate": 0.0008376417492695123, + "loss": 1.4343, + "step": 2662 + }, + { + "epoch": 0.29, + "grad_norm": 0.08848005244759329, + "learning_rate": 0.0008375133145018457, + "loss": 1.5059, + "step": 2663 + }, + { + "epoch": 0.29, + "grad_norm": 0.07264186405227538, + "learning_rate": 0.000837384838809526, + "loss": 1.279, + "step": 2664 + }, + { + "epoch": 0.29, + "grad_norm": 0.07900937324347093, + "learning_rate": 0.0008372563222081316, + "loss": 1.4473, + "step": 2665 + }, + { + "epoch": 0.29, + "grad_norm": 0.08847583018877739, + "learning_rate": 0.0008371277647132453, + "loss": 1.5201, + "step": 2666 + }, + { + "epoch": 0.29, + "grad_norm": 0.07547179396139557, + "learning_rate": 0.0008369991663404555, + "loss": 1.332, + "step": 2667 + }, + { + "epoch": 0.29, + "grad_norm": 0.07603778101136066, + "learning_rate": 0.0008368705271053547, + "loss": 1.2928, + "step": 2668 + }, + { + "epoch": 0.29, + "grad_norm": 0.06737944518535066, + "learning_rate": 0.0008367418470235413, + "loss": 1.3386, + "step": 2669 + }, + { + "epoch": 0.29, + "grad_norm": 0.09022510747259038, + "learning_rate": 0.0008366131261106179, + "loss": 1.6164, + "step": 2670 + }, + { + "epoch": 0.29, + "grad_norm": 0.07686893921895585, + "learning_rate": 0.0008364843643821927, + "loss": 1.4379, + "step": 2671 + }, + { + "epoch": 0.29, + "grad_norm": 0.07283643227924404, + "learning_rate": 0.000836355561853878, + "loss": 1.4125, + "step": 2672 + }, + { + "epoch": 0.29, + "grad_norm": 0.07205487947966797, + "learning_rate": 0.0008362267185412919, + "loss": 1.4272, + "step": 2673 + }, + { + "epoch": 0.29, + "grad_norm": 0.07591904501548323, + "learning_rate": 0.0008360978344600572, + "loss": 1.3735, + "step": 2674 + }, + { + "epoch": 0.29, + "grad_norm": 0.0840464697876995, + "learning_rate": 0.0008359689096258011, + "loss": 1.4788, + "step": 2675 + }, + { + "epoch": 0.29, + "grad_norm": 0.07417724728902096, + "learning_rate": 0.0008358399440541567, + "loss": 1.4503, + "step": 2676 + }, + { + "epoch": 0.29, + "grad_norm": 0.07269091407325959, + "learning_rate": 0.0008357109377607611, + "loss": 1.4989, + "step": 2677 + }, + { + "epoch": 0.29, + "grad_norm": 0.08418319284393984, + "learning_rate": 0.0008355818907612569, + "loss": 1.557, + "step": 2678 + }, + { + "epoch": 0.29, + "grad_norm": 0.07474993738977974, + "learning_rate": 0.0008354528030712915, + "loss": 1.4532, + "step": 2679 + }, + { + "epoch": 0.29, + "grad_norm": 0.07500813692751027, + "learning_rate": 0.0008353236747065174, + "loss": 1.634, + "step": 2680 + }, + { + "epoch": 0.29, + "grad_norm": 0.0829101197991892, + "learning_rate": 0.0008351945056825917, + "loss": 1.3884, + "step": 2681 + }, + { + "epoch": 0.29, + "grad_norm": 0.07835371691603794, + "learning_rate": 0.0008350652960151765, + "loss": 1.4707, + "step": 2682 + }, + { + "epoch": 0.29, + "grad_norm": 0.08405599478984883, + "learning_rate": 0.0008349360457199391, + "loss": 1.4975, + "step": 2683 + }, + { + "epoch": 0.29, + "grad_norm": 0.07659122988719483, + "learning_rate": 0.0008348067548125514, + "loss": 1.2843, + "step": 2684 + }, + { + "epoch": 0.29, + "grad_norm": 0.07639967954015899, + "learning_rate": 0.0008346774233086904, + "loss": 1.4632, + "step": 2685 + }, + { + "epoch": 0.29, + "grad_norm": 0.08634703761278015, + "learning_rate": 0.000834548051224038, + "loss": 1.4512, + "step": 2686 + }, + { + "epoch": 0.29, + "grad_norm": 0.07601354417275835, + "learning_rate": 0.0008344186385742811, + "loss": 1.455, + "step": 2687 + }, + { + "epoch": 0.29, + "grad_norm": 0.07770416940481581, + "learning_rate": 0.0008342891853751114, + "loss": 1.4682, + "step": 2688 + }, + { + "epoch": 0.29, + "grad_norm": 0.0738525816231597, + "learning_rate": 0.0008341596916422254, + "loss": 1.459, + "step": 2689 + }, + { + "epoch": 0.29, + "grad_norm": 0.08505835511210237, + "learning_rate": 0.0008340301573913249, + "loss": 1.567, + "step": 2690 + }, + { + "epoch": 0.29, + "grad_norm": 0.07871515325468663, + "learning_rate": 0.0008339005826381161, + "loss": 1.3747, + "step": 2691 + }, + { + "epoch": 0.29, + "grad_norm": 0.07020707472104386, + "learning_rate": 0.0008337709673983106, + "loss": 1.4348, + "step": 2692 + }, + { + "epoch": 0.29, + "grad_norm": 0.07336686807263738, + "learning_rate": 0.0008336413116876245, + "loss": 1.3568, + "step": 2693 + }, + { + "epoch": 0.29, + "grad_norm": 0.07164094008232932, + "learning_rate": 0.0008335116155217793, + "loss": 1.4443, + "step": 2694 + }, + { + "epoch": 0.29, + "grad_norm": 0.06686213937741443, + "learning_rate": 0.0008333818789165008, + "loss": 1.4464, + "step": 2695 + }, + { + "epoch": 0.29, + "grad_norm": 0.07568116953259008, + "learning_rate": 0.00083325210188752, + "loss": 1.3801, + "step": 2696 + }, + { + "epoch": 0.29, + "grad_norm": 0.07516356051721163, + "learning_rate": 0.0008331222844505733, + "loss": 1.2997, + "step": 2697 + }, + { + "epoch": 0.29, + "grad_norm": 0.06746999225222641, + "learning_rate": 0.000832992426621401, + "loss": 1.4718, + "step": 2698 + }, + { + "epoch": 0.29, + "grad_norm": 0.07642008615200645, + "learning_rate": 0.000832862528415749, + "loss": 1.4935, + "step": 2699 + }, + { + "epoch": 0.29, + "grad_norm": 0.0784700225660804, + "learning_rate": 0.0008327325898493677, + "loss": 1.5099, + "step": 2700 + }, + { + "epoch": 0.29, + "grad_norm": 0.07330172661899201, + "learning_rate": 0.000832602610938013, + "loss": 1.5399, + "step": 2701 + }, + { + "epoch": 0.29, + "grad_norm": 0.0717009174794634, + "learning_rate": 0.000832472591697445, + "loss": 1.4603, + "step": 2702 + }, + { + "epoch": 0.29, + "grad_norm": 0.07255700608309591, + "learning_rate": 0.0008323425321434291, + "loss": 1.3573, + "step": 2703 + }, + { + "epoch": 0.29, + "grad_norm": 0.07581764931155754, + "learning_rate": 0.0008322124322917353, + "loss": 1.4361, + "step": 2704 + }, + { + "epoch": 0.29, + "grad_norm": 0.07880780191271873, + "learning_rate": 0.0008320822921581388, + "loss": 1.4721, + "step": 2705 + }, + { + "epoch": 0.29, + "grad_norm": 0.07603032304986217, + "learning_rate": 0.0008319521117584194, + "loss": 1.2977, + "step": 2706 + }, + { + "epoch": 0.29, + "grad_norm": 0.06423068223877285, + "learning_rate": 0.0008318218911083623, + "loss": 1.4188, + "step": 2707 + }, + { + "epoch": 0.29, + "grad_norm": 0.08181975268370313, + "learning_rate": 0.0008316916302237568, + "loss": 1.5171, + "step": 2708 + }, + { + "epoch": 0.29, + "grad_norm": 0.0788819316295538, + "learning_rate": 0.0008315613291203976, + "loss": 1.4912, + "step": 2709 + }, + { + "epoch": 0.29, + "grad_norm": 0.07534497094282523, + "learning_rate": 0.0008314309878140842, + "loss": 1.4276, + "step": 2710 + }, + { + "epoch": 0.29, + "grad_norm": 0.07599726403491437, + "learning_rate": 0.000831300606320621, + "loss": 1.3766, + "step": 2711 + }, + { + "epoch": 0.29, + "grad_norm": 0.08399386546384041, + "learning_rate": 0.0008311701846558171, + "loss": 1.4363, + "step": 2712 + }, + { + "epoch": 0.29, + "grad_norm": 0.07194774780906135, + "learning_rate": 0.0008310397228354864, + "loss": 1.359, + "step": 2713 + }, + { + "epoch": 0.29, + "grad_norm": 0.07170651795131391, + "learning_rate": 0.0008309092208754483, + "loss": 1.4945, + "step": 2714 + }, + { + "epoch": 0.29, + "grad_norm": 0.09471088333737213, + "learning_rate": 0.0008307786787915261, + "loss": 1.6064, + "step": 2715 + }, + { + "epoch": 0.29, + "grad_norm": 0.07001458324123662, + "learning_rate": 0.0008306480965995489, + "loss": 1.477, + "step": 2716 + }, + { + "epoch": 0.29, + "grad_norm": 0.07331420219072145, + "learning_rate": 0.0008305174743153499, + "loss": 1.4956, + "step": 2717 + }, + { + "epoch": 0.29, + "grad_norm": 0.08307822550827987, + "learning_rate": 0.000830386811954768, + "loss": 1.4908, + "step": 2718 + }, + { + "epoch": 0.29, + "grad_norm": 0.10553037644370117, + "learning_rate": 0.0008302561095336459, + "loss": 1.502, + "step": 2719 + }, + { + "epoch": 0.29, + "grad_norm": 0.07753527659679234, + "learning_rate": 0.0008301253670678319, + "loss": 1.3897, + "step": 2720 + }, + { + "epoch": 0.29, + "grad_norm": 0.07645511236420256, + "learning_rate": 0.0008299945845731792, + "loss": 1.4402, + "step": 2721 + }, + { + "epoch": 0.29, + "grad_norm": 0.07836695249123142, + "learning_rate": 0.0008298637620655453, + "loss": 1.3848, + "step": 2722 + }, + { + "epoch": 0.29, + "grad_norm": 0.08857471945388738, + "learning_rate": 0.0008297328995607932, + "loss": 1.3904, + "step": 2723 + }, + { + "epoch": 0.29, + "grad_norm": 0.08127929407000808, + "learning_rate": 0.0008296019970747902, + "loss": 1.2742, + "step": 2724 + }, + { + "epoch": 0.29, + "grad_norm": 0.07769020055838141, + "learning_rate": 0.0008294710546234086, + "loss": 1.2634, + "step": 2725 + }, + { + "epoch": 0.29, + "grad_norm": 0.07820145025470278, + "learning_rate": 0.0008293400722225259, + "loss": 1.4207, + "step": 2726 + }, + { + "epoch": 0.29, + "grad_norm": 0.08332673123651665, + "learning_rate": 0.0008292090498880241, + "loss": 1.4028, + "step": 2727 + }, + { + "epoch": 0.29, + "grad_norm": 0.08469613645080842, + "learning_rate": 0.0008290779876357899, + "loss": 1.507, + "step": 2728 + }, + { + "epoch": 0.29, + "grad_norm": 0.07603799229175152, + "learning_rate": 0.0008289468854817153, + "loss": 1.4011, + "step": 2729 + }, + { + "epoch": 0.29, + "grad_norm": 0.07966344010849837, + "learning_rate": 0.0008288157434416967, + "loss": 1.4921, + "step": 2730 + }, + { + "epoch": 0.29, + "grad_norm": 0.07132344853654753, + "learning_rate": 0.0008286845615316356, + "loss": 1.3162, + "step": 2731 + }, + { + "epoch": 0.29, + "grad_norm": 0.07104434671857382, + "learning_rate": 0.0008285533397674382, + "loss": 1.373, + "step": 2732 + }, + { + "epoch": 0.29, + "grad_norm": 0.08032338161910683, + "learning_rate": 0.0008284220781650158, + "loss": 1.4925, + "step": 2733 + }, + { + "epoch": 0.29, + "grad_norm": 0.0877688716941064, + "learning_rate": 0.000828290776740284, + "loss": 1.4924, + "step": 2734 + }, + { + "epoch": 0.29, + "grad_norm": 0.07186097832409206, + "learning_rate": 0.0008281594355091641, + "loss": 1.3546, + "step": 2735 + }, + { + "epoch": 0.29, + "grad_norm": 0.07622048007118681, + "learning_rate": 0.0008280280544875811, + "loss": 1.4452, + "step": 2736 + }, + { + "epoch": 0.29, + "grad_norm": 0.07849727387690847, + "learning_rate": 0.0008278966336914655, + "loss": 1.4423, + "step": 2737 + }, + { + "epoch": 0.29, + "grad_norm": 0.07222521947527058, + "learning_rate": 0.0008277651731367528, + "loss": 1.4843, + "step": 2738 + }, + { + "epoch": 0.29, + "grad_norm": 0.07099999895203911, + "learning_rate": 0.0008276336728393828, + "loss": 1.4156, + "step": 2739 + }, + { + "epoch": 0.29, + "grad_norm": 0.06822141452088458, + "learning_rate": 0.0008275021328153006, + "loss": 1.3963, + "step": 2740 + }, + { + "epoch": 0.29, + "grad_norm": 0.0740043671559136, + "learning_rate": 0.0008273705530804554, + "loss": 1.4871, + "step": 2741 + }, + { + "epoch": 0.29, + "grad_norm": 0.07094511989097507, + "learning_rate": 0.0008272389336508022, + "loss": 1.2901, + "step": 2742 + }, + { + "epoch": 0.29, + "grad_norm": 0.07948023468384699, + "learning_rate": 0.0008271072745423, + "loss": 1.4011, + "step": 2743 + }, + { + "epoch": 0.29, + "grad_norm": 0.08280207297295716, + "learning_rate": 0.0008269755757709132, + "loss": 1.5201, + "step": 2744 + }, + { + "epoch": 0.3, + "grad_norm": 0.088306395939248, + "learning_rate": 0.0008268438373526106, + "loss": 1.517, + "step": 2745 + }, + { + "epoch": 0.3, + "grad_norm": 0.07326544074639216, + "learning_rate": 0.0008267120593033659, + "loss": 1.4476, + "step": 2746 + }, + { + "epoch": 0.3, + "grad_norm": 0.08336236489635436, + "learning_rate": 0.0008265802416391577, + "loss": 1.2915, + "step": 2747 + }, + { + "epoch": 0.3, + "grad_norm": 0.08857755307752961, + "learning_rate": 0.0008264483843759691, + "loss": 1.4161, + "step": 2748 + }, + { + "epoch": 0.3, + "grad_norm": 0.08720921111079, + "learning_rate": 0.0008263164875297887, + "loss": 1.3664, + "step": 2749 + }, + { + "epoch": 0.3, + "grad_norm": 0.0849530419155793, + "learning_rate": 0.0008261845511166092, + "loss": 1.4203, + "step": 2750 + }, + { + "epoch": 0.3, + "grad_norm": 0.08352948658753696, + "learning_rate": 0.0008260525751524282, + "loss": 1.4433, + "step": 2751 + }, + { + "epoch": 0.3, + "grad_norm": 0.07117468986953664, + "learning_rate": 0.0008259205596532484, + "loss": 1.4106, + "step": 2752 + }, + { + "epoch": 0.3, + "grad_norm": 0.09710061791847775, + "learning_rate": 0.0008257885046350773, + "loss": 1.3963, + "step": 2753 + }, + { + "epoch": 0.3, + "grad_norm": 0.08773828788389773, + "learning_rate": 0.0008256564101139266, + "loss": 1.4722, + "step": 2754 + }, + { + "epoch": 0.3, + "grad_norm": 0.0782548391737564, + "learning_rate": 0.0008255242761058135, + "loss": 1.3094, + "step": 2755 + }, + { + "epoch": 0.3, + "grad_norm": 0.08430634648229657, + "learning_rate": 0.0008253921026267599, + "loss": 1.6252, + "step": 2756 + }, + { + "epoch": 0.3, + "grad_norm": 0.08229486931029277, + "learning_rate": 0.0008252598896927918, + "loss": 1.4528, + "step": 2757 + }, + { + "epoch": 0.3, + "grad_norm": 0.08570349190519552, + "learning_rate": 0.0008251276373199408, + "loss": 1.445, + "step": 2758 + }, + { + "epoch": 0.3, + "grad_norm": 0.0823944896586127, + "learning_rate": 0.0008249953455242429, + "loss": 1.4586, + "step": 2759 + }, + { + "epoch": 0.3, + "grad_norm": 0.078848461846731, + "learning_rate": 0.000824863014321739, + "loss": 1.3897, + "step": 2760 + }, + { + "epoch": 0.3, + "grad_norm": 0.07616890999329672, + "learning_rate": 0.0008247306437284747, + "loss": 1.3357, + "step": 2761 + }, + { + "epoch": 0.3, + "grad_norm": 0.07291692672694469, + "learning_rate": 0.0008245982337605003, + "loss": 1.4341, + "step": 2762 + }, + { + "epoch": 0.3, + "grad_norm": 0.0799698621170182, + "learning_rate": 0.0008244657844338708, + "loss": 1.4776, + "step": 2763 + }, + { + "epoch": 0.3, + "grad_norm": 0.07410913925115305, + "learning_rate": 0.0008243332957646464, + "loss": 1.4272, + "step": 2764 + }, + { + "epoch": 0.3, + "grad_norm": 0.0725370622239953, + "learning_rate": 0.0008242007677688918, + "loss": 1.3854, + "step": 2765 + }, + { + "epoch": 0.3, + "grad_norm": 0.07919036331246422, + "learning_rate": 0.0008240682004626765, + "loss": 1.4331, + "step": 2766 + }, + { + "epoch": 0.3, + "grad_norm": 0.07625402550594935, + "learning_rate": 0.0008239355938620745, + "loss": 1.4987, + "step": 2767 + }, + { + "epoch": 0.3, + "grad_norm": 0.07087029054031294, + "learning_rate": 0.0008238029479831652, + "loss": 1.5275, + "step": 2768 + }, + { + "epoch": 0.3, + "grad_norm": 0.09117193245742936, + "learning_rate": 0.0008236702628420319, + "loss": 1.5647, + "step": 2769 + }, + { + "epoch": 0.3, + "grad_norm": 0.07650468483821572, + "learning_rate": 0.0008235375384547635, + "loss": 1.4506, + "step": 2770 + }, + { + "epoch": 0.3, + "grad_norm": 0.09424438605028439, + "learning_rate": 0.000823404774837453, + "loss": 1.4702, + "step": 2771 + }, + { + "epoch": 0.3, + "grad_norm": 0.07393962832466713, + "learning_rate": 0.0008232719720061987, + "loss": 1.3698, + "step": 2772 + }, + { + "epoch": 0.3, + "grad_norm": 0.07453913303088434, + "learning_rate": 0.0008231391299771034, + "loss": 1.3223, + "step": 2773 + }, + { + "epoch": 0.3, + "grad_norm": 0.08888193416857225, + "learning_rate": 0.0008230062487662745, + "loss": 1.5193, + "step": 2774 + }, + { + "epoch": 0.3, + "grad_norm": 0.08072388459301796, + "learning_rate": 0.0008228733283898243, + "loss": 1.4469, + "step": 2775 + }, + { + "epoch": 0.3, + "grad_norm": 0.08710297536435438, + "learning_rate": 0.00082274036886387, + "loss": 1.4754, + "step": 2776 + }, + { + "epoch": 0.3, + "grad_norm": 0.08359284400448978, + "learning_rate": 0.0008226073702045333, + "loss": 1.5685, + "step": 2777 + }, + { + "epoch": 0.3, + "grad_norm": 0.08749874887891788, + "learning_rate": 0.0008224743324279407, + "loss": 1.4436, + "step": 2778 + }, + { + "epoch": 0.3, + "grad_norm": 0.07760028764078293, + "learning_rate": 0.0008223412555502236, + "loss": 1.5412, + "step": 2779 + }, + { + "epoch": 0.3, + "grad_norm": 0.08406358133209071, + "learning_rate": 0.000822208139587518, + "loss": 1.5003, + "step": 2780 + }, + { + "epoch": 0.3, + "grad_norm": 0.08354272711327344, + "learning_rate": 0.0008220749845559648, + "loss": 1.4887, + "step": 2781 + }, + { + "epoch": 0.3, + "grad_norm": 0.07861669011751445, + "learning_rate": 0.0008219417904717091, + "loss": 1.3924, + "step": 2782 + }, + { + "epoch": 0.3, + "grad_norm": 0.06729249498959294, + "learning_rate": 0.0008218085573509016, + "loss": 1.4451, + "step": 2783 + }, + { + "epoch": 0.3, + "grad_norm": 0.07110046725712708, + "learning_rate": 0.0008216752852096969, + "loss": 1.5831, + "step": 2784 + }, + { + "epoch": 0.3, + "grad_norm": 0.07728581028416445, + "learning_rate": 0.0008215419740642549, + "loss": 1.4519, + "step": 2785 + }, + { + "epoch": 0.3, + "grad_norm": 0.07445156605390331, + "learning_rate": 0.0008214086239307401, + "loss": 1.4988, + "step": 2786 + }, + { + "epoch": 0.3, + "grad_norm": 0.0826710728733664, + "learning_rate": 0.0008212752348253216, + "loss": 1.3996, + "step": 2787 + }, + { + "epoch": 0.3, + "grad_norm": 0.10887963866407403, + "learning_rate": 0.0008211418067641734, + "loss": 1.4847, + "step": 2788 + }, + { + "epoch": 0.3, + "grad_norm": 0.08905891287841133, + "learning_rate": 0.0008210083397634738, + "loss": 1.4369, + "step": 2789 + }, + { + "epoch": 0.3, + "grad_norm": 0.08396342066020057, + "learning_rate": 0.0008208748338394064, + "loss": 1.2997, + "step": 2790 + }, + { + "epoch": 0.3, + "grad_norm": 0.08154082009916738, + "learning_rate": 0.000820741289008159, + "loss": 1.5037, + "step": 2791 + }, + { + "epoch": 0.3, + "grad_norm": 0.0800821125968288, + "learning_rate": 0.0008206077052859246, + "loss": 1.5425, + "step": 2792 + }, + { + "epoch": 0.3, + "grad_norm": 0.07327743953485256, + "learning_rate": 0.0008204740826889008, + "loss": 1.4202, + "step": 2793 + }, + { + "epoch": 0.3, + "grad_norm": 0.06753263368726979, + "learning_rate": 0.0008203404212332897, + "loss": 1.4971, + "step": 2794 + }, + { + "epoch": 0.3, + "grad_norm": 0.06862134633875801, + "learning_rate": 0.0008202067209352979, + "loss": 1.3466, + "step": 2795 + }, + { + "epoch": 0.3, + "grad_norm": 0.06855119036723545, + "learning_rate": 0.0008200729818111372, + "loss": 1.466, + "step": 2796 + }, + { + "epoch": 0.3, + "grad_norm": 0.08170769665034927, + "learning_rate": 0.0008199392038770242, + "loss": 1.4341, + "step": 2797 + }, + { + "epoch": 0.3, + "grad_norm": 0.06737320855709193, + "learning_rate": 0.0008198053871491797, + "loss": 1.6141, + "step": 2798 + }, + { + "epoch": 0.3, + "grad_norm": 0.0659153505087909, + "learning_rate": 0.0008196715316438294, + "loss": 1.4589, + "step": 2799 + }, + { + "epoch": 0.3, + "grad_norm": 0.08287915047683615, + "learning_rate": 0.0008195376373772039, + "loss": 1.4108, + "step": 2800 + }, + { + "epoch": 0.3, + "grad_norm": 0.08024241150674424, + "learning_rate": 0.0008194037043655382, + "loss": 1.2903, + "step": 2801 + }, + { + "epoch": 0.3, + "grad_norm": 0.07159138451861921, + "learning_rate": 0.0008192697326250722, + "loss": 1.4705, + "step": 2802 + }, + { + "epoch": 0.3, + "grad_norm": 0.07311652052094464, + "learning_rate": 0.0008191357221720506, + "loss": 1.4327, + "step": 2803 + }, + { + "epoch": 0.3, + "grad_norm": 0.0764745512460257, + "learning_rate": 0.0008190016730227224, + "loss": 1.5245, + "step": 2804 + }, + { + "epoch": 0.3, + "grad_norm": 0.07772607729105947, + "learning_rate": 0.0008188675851933414, + "loss": 1.5384, + "step": 2805 + }, + { + "epoch": 0.3, + "grad_norm": 0.06669285839551875, + "learning_rate": 0.0008187334587001664, + "loss": 1.3745, + "step": 2806 + }, + { + "epoch": 0.3, + "grad_norm": 0.08076034380977436, + "learning_rate": 0.0008185992935594607, + "loss": 1.4328, + "step": 2807 + }, + { + "epoch": 0.3, + "grad_norm": 0.08093218565950956, + "learning_rate": 0.0008184650897874923, + "loss": 1.4824, + "step": 2808 + }, + { + "epoch": 0.3, + "grad_norm": 0.07044639355943785, + "learning_rate": 0.000818330847400534, + "loss": 1.4994, + "step": 2809 + }, + { + "epoch": 0.3, + "grad_norm": 0.07088399140907917, + "learning_rate": 0.0008181965664148628, + "loss": 1.4263, + "step": 2810 + }, + { + "epoch": 0.3, + "grad_norm": 0.07363030280128995, + "learning_rate": 0.0008180622468467611, + "loss": 1.2771, + "step": 2811 + }, + { + "epoch": 0.3, + "grad_norm": 0.08144611471453442, + "learning_rate": 0.0008179278887125152, + "loss": 1.4055, + "step": 2812 + }, + { + "epoch": 0.3, + "grad_norm": 0.07736857460180284, + "learning_rate": 0.000817793492028417, + "loss": 1.4156, + "step": 2813 + }, + { + "epoch": 0.3, + "grad_norm": 0.06555042851554732, + "learning_rate": 0.0008176590568107622, + "loss": 1.4743, + "step": 2814 + }, + { + "epoch": 0.3, + "grad_norm": 0.07479045299224463, + "learning_rate": 0.0008175245830758515, + "loss": 1.49, + "step": 2815 + }, + { + "epoch": 0.3, + "grad_norm": 0.0841811461554626, + "learning_rate": 0.0008173900708399906, + "loss": 1.4209, + "step": 2816 + }, + { + "epoch": 0.3, + "grad_norm": 0.07004572589189338, + "learning_rate": 0.0008172555201194894, + "loss": 1.5112, + "step": 2817 + }, + { + "epoch": 0.3, + "grad_norm": 0.07560307310975573, + "learning_rate": 0.0008171209309306625, + "loss": 1.4413, + "step": 2818 + }, + { + "epoch": 0.3, + "grad_norm": 0.07308794549332547, + "learning_rate": 0.0008169863032898296, + "loss": 1.3943, + "step": 2819 + }, + { + "epoch": 0.3, + "grad_norm": 0.07782205167088278, + "learning_rate": 0.0008168516372133145, + "loss": 1.4607, + "step": 2820 + }, + { + "epoch": 0.3, + "grad_norm": 0.08067353235727805, + "learning_rate": 0.0008167169327174459, + "loss": 1.485, + "step": 2821 + }, + { + "epoch": 0.3, + "grad_norm": 0.07605076294811133, + "learning_rate": 0.0008165821898185576, + "loss": 1.3621, + "step": 2822 + }, + { + "epoch": 0.3, + "grad_norm": 0.07507805847597698, + "learning_rate": 0.0008164474085329872, + "loss": 1.4791, + "step": 2823 + }, + { + "epoch": 0.3, + "grad_norm": 0.08074805704595682, + "learning_rate": 0.0008163125888770776, + "loss": 1.4916, + "step": 2824 + }, + { + "epoch": 0.3, + "grad_norm": 0.08492739058974759, + "learning_rate": 0.0008161777308671762, + "loss": 1.529, + "step": 2825 + }, + { + "epoch": 0.3, + "grad_norm": 0.06727666112619586, + "learning_rate": 0.0008160428345196347, + "loss": 1.3156, + "step": 2826 + }, + { + "epoch": 0.3, + "grad_norm": 0.0688311847976553, + "learning_rate": 0.00081590789985081, + "loss": 1.5281, + "step": 2827 + }, + { + "epoch": 0.3, + "grad_norm": 0.09836022791746445, + "learning_rate": 0.0008157729268770635, + "loss": 1.4562, + "step": 2828 + }, + { + "epoch": 0.3, + "grad_norm": 0.06422326070659888, + "learning_rate": 0.0008156379156147608, + "loss": 1.3688, + "step": 2829 + }, + { + "epoch": 0.3, + "grad_norm": 0.07043174146263807, + "learning_rate": 0.0008155028660802728, + "loss": 1.5884, + "step": 2830 + }, + { + "epoch": 0.3, + "grad_norm": 0.083964177686042, + "learning_rate": 0.0008153677782899745, + "loss": 1.5427, + "step": 2831 + }, + { + "epoch": 0.3, + "grad_norm": 0.06899335085260903, + "learning_rate": 0.0008152326522602458, + "loss": 1.4434, + "step": 2832 + }, + { + "epoch": 0.3, + "grad_norm": 0.08238764103901412, + "learning_rate": 0.0008150974880074713, + "loss": 1.5017, + "step": 2833 + }, + { + "epoch": 0.3, + "grad_norm": 0.07391969606608612, + "learning_rate": 0.0008149622855480401, + "loss": 1.3963, + "step": 2834 + }, + { + "epoch": 0.3, + "grad_norm": 0.08318721513910766, + "learning_rate": 0.000814827044898346, + "loss": 1.5438, + "step": 2835 + }, + { + "epoch": 0.3, + "grad_norm": 0.08554886321365836, + "learning_rate": 0.0008146917660747872, + "loss": 1.4779, + "step": 2836 + }, + { + "epoch": 0.3, + "grad_norm": 0.07698488554302305, + "learning_rate": 0.0008145564490937668, + "loss": 1.4578, + "step": 2837 + }, + { + "epoch": 0.31, + "grad_norm": 0.07037277585204749, + "learning_rate": 0.0008144210939716927, + "loss": 1.4635, + "step": 2838 + }, + { + "epoch": 0.31, + "grad_norm": 0.07461934381938339, + "learning_rate": 0.0008142857007249768, + "loss": 1.4102, + "step": 2839 + }, + { + "epoch": 0.31, + "grad_norm": 0.08392078326405981, + "learning_rate": 0.0008141502693700363, + "loss": 1.4993, + "step": 2840 + }, + { + "epoch": 0.31, + "grad_norm": 0.06748677350552709, + "learning_rate": 0.0008140147999232925, + "loss": 1.5196, + "step": 2841 + }, + { + "epoch": 0.31, + "grad_norm": 0.07883616211063142, + "learning_rate": 0.0008138792924011717, + "loss": 1.2979, + "step": 2842 + }, + { + "epoch": 0.31, + "grad_norm": 0.06773196082861774, + "learning_rate": 0.0008137437468201047, + "loss": 1.3348, + "step": 2843 + }, + { + "epoch": 0.31, + "grad_norm": 0.06988800909616495, + "learning_rate": 0.0008136081631965267, + "loss": 1.4625, + "step": 2844 + }, + { + "epoch": 0.31, + "grad_norm": 0.07477987235145722, + "learning_rate": 0.0008134725415468775, + "loss": 1.4027, + "step": 2845 + }, + { + "epoch": 0.31, + "grad_norm": 0.07010726064819425, + "learning_rate": 0.0008133368818876022, + "loss": 1.4468, + "step": 2846 + }, + { + "epoch": 0.31, + "grad_norm": 0.08261004610399639, + "learning_rate": 0.0008132011842351496, + "loss": 1.3365, + "step": 2847 + }, + { + "epoch": 0.31, + "grad_norm": 0.08736930521087224, + "learning_rate": 0.0008130654486059737, + "loss": 1.566, + "step": 2848 + }, + { + "epoch": 0.31, + "grad_norm": 0.06642310720948336, + "learning_rate": 0.0008129296750165329, + "loss": 1.4061, + "step": 2849 + }, + { + "epoch": 0.31, + "grad_norm": 0.08933889777362511, + "learning_rate": 0.0008127938634832901, + "loss": 1.3926, + "step": 2850 + }, + { + "epoch": 0.31, + "grad_norm": 0.0724623698609669, + "learning_rate": 0.0008126580140227131, + "loss": 1.4211, + "step": 2851 + }, + { + "epoch": 0.31, + "grad_norm": 0.07930876972688328, + "learning_rate": 0.0008125221266512739, + "loss": 1.4018, + "step": 2852 + }, + { + "epoch": 0.31, + "grad_norm": 0.07774062520927012, + "learning_rate": 0.0008123862013854495, + "loss": 1.3741, + "step": 2853 + }, + { + "epoch": 0.31, + "grad_norm": 0.07060780534531708, + "learning_rate": 0.0008122502382417211, + "loss": 1.507, + "step": 2854 + }, + { + "epoch": 0.31, + "grad_norm": 0.07428692679133919, + "learning_rate": 0.0008121142372365749, + "loss": 1.4531, + "step": 2855 + }, + { + "epoch": 0.31, + "grad_norm": 0.07165081198741169, + "learning_rate": 0.0008119781983865013, + "loss": 1.3349, + "step": 2856 + }, + { + "epoch": 0.31, + "grad_norm": 0.07103201332670216, + "learning_rate": 0.0008118421217079958, + "loss": 1.4967, + "step": 2857 + }, + { + "epoch": 0.31, + "grad_norm": 0.06745289271534266, + "learning_rate": 0.0008117060072175578, + "loss": 1.3217, + "step": 2858 + }, + { + "epoch": 0.31, + "grad_norm": 0.07565056480064823, + "learning_rate": 0.0008115698549316919, + "loss": 1.3472, + "step": 2859 + }, + { + "epoch": 0.31, + "grad_norm": 0.08917104470742244, + "learning_rate": 0.0008114336648669068, + "loss": 1.4251, + "step": 2860 + }, + { + "epoch": 0.31, + "grad_norm": 0.07999502935604388, + "learning_rate": 0.0008112974370397163, + "loss": 1.4453, + "step": 2861 + }, + { + "epoch": 0.31, + "grad_norm": 0.07927721882772928, + "learning_rate": 0.0008111611714666382, + "loss": 1.4793, + "step": 2862 + }, + { + "epoch": 0.31, + "grad_norm": 0.07378233339106233, + "learning_rate": 0.0008110248681641956, + "loss": 1.3303, + "step": 2863 + }, + { + "epoch": 0.31, + "grad_norm": 0.0716702577172704, + "learning_rate": 0.0008108885271489153, + "loss": 1.39, + "step": 2864 + }, + { + "epoch": 0.31, + "grad_norm": 0.07223233377589355, + "learning_rate": 0.0008107521484373292, + "loss": 1.433, + "step": 2865 + }, + { + "epoch": 0.31, + "grad_norm": 0.07618471336989437, + "learning_rate": 0.000810615732045974, + "loss": 1.4637, + "step": 2866 + }, + { + "epoch": 0.31, + "grad_norm": 0.07505002942237572, + "learning_rate": 0.0008104792779913903, + "loss": 1.5149, + "step": 2867 + }, + { + "epoch": 0.31, + "grad_norm": 0.09599552930508799, + "learning_rate": 0.0008103427862901238, + "loss": 1.453, + "step": 2868 + }, + { + "epoch": 0.31, + "grad_norm": 0.08413359365818811, + "learning_rate": 0.0008102062569587244, + "loss": 1.299, + "step": 2869 + }, + { + "epoch": 0.31, + "grad_norm": 0.08334424197676094, + "learning_rate": 0.0008100696900137469, + "loss": 1.5173, + "step": 2870 + }, + { + "epoch": 0.31, + "grad_norm": 0.0681229530596204, + "learning_rate": 0.0008099330854717508, + "loss": 1.4892, + "step": 2871 + }, + { + "epoch": 0.31, + "grad_norm": 0.0826144462608642, + "learning_rate": 0.0008097964433492993, + "loss": 1.5718, + "step": 2872 + }, + { + "epoch": 0.31, + "grad_norm": 0.07647809806526784, + "learning_rate": 0.0008096597636629612, + "loss": 1.3691, + "step": 2873 + }, + { + "epoch": 0.31, + "grad_norm": 0.07280620512949906, + "learning_rate": 0.0008095230464293091, + "loss": 1.416, + "step": 2874 + }, + { + "epoch": 0.31, + "grad_norm": 0.0881966046359378, + "learning_rate": 0.0008093862916649207, + "loss": 1.4359, + "step": 2875 + }, + { + "epoch": 0.31, + "grad_norm": 0.07054285637471477, + "learning_rate": 0.0008092494993863775, + "loss": 1.4334, + "step": 2876 + }, + { + "epoch": 0.31, + "grad_norm": 0.07684187702267566, + "learning_rate": 0.0008091126696102665, + "loss": 1.3943, + "step": 2877 + }, + { + "epoch": 0.31, + "grad_norm": 0.06921697707144545, + "learning_rate": 0.0008089758023531788, + "loss": 1.2889, + "step": 2878 + }, + { + "epoch": 0.31, + "grad_norm": 0.0741571622600297, + "learning_rate": 0.0008088388976317096, + "loss": 1.4221, + "step": 2879 + }, + { + "epoch": 0.31, + "grad_norm": 0.0770392792484649, + "learning_rate": 0.0008087019554624595, + "loss": 1.3828, + "step": 2880 + }, + { + "epoch": 0.31, + "grad_norm": 0.07399574456270709, + "learning_rate": 0.000808564975862033, + "loss": 1.5514, + "step": 2881 + }, + { + "epoch": 0.31, + "grad_norm": 0.08204482880103793, + "learning_rate": 0.0008084279588470393, + "loss": 1.4972, + "step": 2882 + }, + { + "epoch": 0.31, + "grad_norm": 0.07952716032297752, + "learning_rate": 0.0008082909044340924, + "loss": 1.4671, + "step": 2883 + }, + { + "epoch": 0.31, + "grad_norm": 0.08374489001130474, + "learning_rate": 0.0008081538126398105, + "loss": 1.4455, + "step": 2884 + }, + { + "epoch": 0.31, + "grad_norm": 0.06973746802826421, + "learning_rate": 0.0008080166834808165, + "loss": 1.5941, + "step": 2885 + }, + { + "epoch": 0.31, + "grad_norm": 0.07006537617171173, + "learning_rate": 0.0008078795169737376, + "loss": 1.4264, + "step": 2886 + }, + { + "epoch": 0.31, + "grad_norm": 0.08481181694594125, + "learning_rate": 0.0008077423131352059, + "loss": 1.3789, + "step": 2887 + }, + { + "epoch": 0.31, + "grad_norm": 0.09125579107905064, + "learning_rate": 0.0008076050719818577, + "loss": 1.4581, + "step": 2888 + }, + { + "epoch": 0.31, + "grad_norm": 0.07959044783895391, + "learning_rate": 0.0008074677935303342, + "loss": 1.3779, + "step": 2889 + }, + { + "epoch": 0.31, + "grad_norm": 0.0794410279675721, + "learning_rate": 0.0008073304777972807, + "loss": 1.4972, + "step": 2890 + }, + { + "epoch": 0.31, + "grad_norm": 0.07737642113539102, + "learning_rate": 0.0008071931247993472, + "loss": 1.378, + "step": 2891 + }, + { + "epoch": 0.31, + "grad_norm": 0.07837475551562528, + "learning_rate": 0.0008070557345531881, + "loss": 1.4374, + "step": 2892 + }, + { + "epoch": 0.31, + "grad_norm": 0.0753630649514313, + "learning_rate": 0.0008069183070754628, + "loss": 1.4086, + "step": 2893 + }, + { + "epoch": 0.31, + "grad_norm": 0.0896110774192796, + "learning_rate": 0.0008067808423828347, + "loss": 1.4894, + "step": 2894 + }, + { + "epoch": 0.31, + "grad_norm": 0.0825189232098451, + "learning_rate": 0.0008066433404919718, + "loss": 1.5712, + "step": 2895 + }, + { + "epoch": 0.31, + "grad_norm": 0.08237890387777524, + "learning_rate": 0.0008065058014195465, + "loss": 1.3692, + "step": 2896 + }, + { + "epoch": 0.31, + "grad_norm": 0.0742473531492679, + "learning_rate": 0.0008063682251822363, + "loss": 1.3707, + "step": 2897 + }, + { + "epoch": 0.31, + "grad_norm": 0.07624379455220504, + "learning_rate": 0.0008062306117967225, + "loss": 1.3805, + "step": 2898 + }, + { + "epoch": 0.31, + "grad_norm": 0.0806083050065474, + "learning_rate": 0.0008060929612796914, + "loss": 1.3719, + "step": 2899 + }, + { + "epoch": 0.31, + "grad_norm": 0.07716589591308537, + "learning_rate": 0.0008059552736478333, + "loss": 1.3963, + "step": 2900 + }, + { + "epoch": 0.31, + "grad_norm": 0.09220293711869569, + "learning_rate": 0.0008058175489178436, + "loss": 1.4691, + "step": 2901 + }, + { + "epoch": 0.31, + "grad_norm": 0.07788974238950698, + "learning_rate": 0.0008056797871064216, + "loss": 1.3894, + "step": 2902 + }, + { + "epoch": 0.31, + "grad_norm": 0.10235235836865986, + "learning_rate": 0.0008055419882302719, + "loss": 1.5451, + "step": 2903 + }, + { + "epoch": 0.31, + "grad_norm": 0.07739987925558103, + "learning_rate": 0.0008054041523061026, + "loss": 1.3031, + "step": 2904 + }, + { + "epoch": 0.31, + "grad_norm": 0.09149966036697384, + "learning_rate": 0.000805266279350627, + "loss": 1.4051, + "step": 2905 + }, + { + "epoch": 0.31, + "grad_norm": 0.0873485165013668, + "learning_rate": 0.0008051283693805625, + "loss": 1.3646, + "step": 2906 + }, + { + "epoch": 0.31, + "grad_norm": 0.08345322596031957, + "learning_rate": 0.0008049904224126312, + "loss": 1.4502, + "step": 2907 + }, + { + "epoch": 0.31, + "grad_norm": 0.08088555125381701, + "learning_rate": 0.0008048524384635598, + "loss": 1.3885, + "step": 2908 + }, + { + "epoch": 0.31, + "grad_norm": 0.08576366605222421, + "learning_rate": 0.0008047144175500794, + "loss": 1.452, + "step": 2909 + }, + { + "epoch": 0.31, + "grad_norm": 0.07420507574850772, + "learning_rate": 0.0008045763596889253, + "loss": 1.4566, + "step": 2910 + }, + { + "epoch": 0.31, + "grad_norm": 0.0704001420236665, + "learning_rate": 0.0008044382648968374, + "loss": 1.3832, + "step": 2911 + }, + { + "epoch": 0.31, + "grad_norm": 0.07461222698093337, + "learning_rate": 0.0008043001331905604, + "loss": 1.3567, + "step": 2912 + }, + { + "epoch": 0.31, + "grad_norm": 0.08177185770423774, + "learning_rate": 0.0008041619645868433, + "loss": 1.4646, + "step": 2913 + }, + { + "epoch": 0.31, + "grad_norm": 0.07573707444529373, + "learning_rate": 0.0008040237591024393, + "loss": 1.457, + "step": 2914 + }, + { + "epoch": 0.31, + "grad_norm": 0.10223308961472137, + "learning_rate": 0.0008038855167541064, + "loss": 1.4352, + "step": 2915 + }, + { + "epoch": 0.31, + "grad_norm": 0.08263870527187396, + "learning_rate": 0.0008037472375586067, + "loss": 1.2946, + "step": 2916 + }, + { + "epoch": 0.31, + "grad_norm": 0.08385596883014022, + "learning_rate": 0.0008036089215327076, + "loss": 1.3963, + "step": 2917 + }, + { + "epoch": 0.31, + "grad_norm": 0.0765236562605998, + "learning_rate": 0.00080347056869318, + "loss": 1.4262, + "step": 2918 + }, + { + "epoch": 0.31, + "grad_norm": 0.08764321552227011, + "learning_rate": 0.0008033321790567996, + "loss": 1.5388, + "step": 2919 + }, + { + "epoch": 0.31, + "grad_norm": 0.07062198820275128, + "learning_rate": 0.0008031937526403469, + "loss": 1.3124, + "step": 2920 + }, + { + "epoch": 0.31, + "grad_norm": 0.07193907290002885, + "learning_rate": 0.0008030552894606063, + "loss": 1.4052, + "step": 2921 + }, + { + "epoch": 0.31, + "grad_norm": 0.08627791606011874, + "learning_rate": 0.0008029167895343671, + "loss": 1.494, + "step": 2922 + }, + { + "epoch": 0.31, + "grad_norm": 0.07689630376058164, + "learning_rate": 0.0008027782528784228, + "loss": 1.5321, + "step": 2923 + }, + { + "epoch": 0.31, + "grad_norm": 0.07838402069101297, + "learning_rate": 0.0008026396795095716, + "loss": 1.4008, + "step": 2924 + }, + { + "epoch": 0.31, + "grad_norm": 0.07890700473282818, + "learning_rate": 0.000802501069444616, + "loss": 1.4139, + "step": 2925 + }, + { + "epoch": 0.31, + "grad_norm": 0.07602723921168046, + "learning_rate": 0.0008023624227003626, + "loss": 1.5312, + "step": 2926 + }, + { + "epoch": 0.31, + "grad_norm": 0.09021190956086227, + "learning_rate": 0.0008022237392936231, + "loss": 1.4854, + "step": 2927 + }, + { + "epoch": 0.31, + "grad_norm": 0.0772411567904348, + "learning_rate": 0.0008020850192412135, + "loss": 1.3465, + "step": 2928 + }, + { + "epoch": 0.31, + "grad_norm": 0.07531806908392204, + "learning_rate": 0.0008019462625599536, + "loss": 1.493, + "step": 2929 + }, + { + "epoch": 0.31, + "grad_norm": 0.07284679179896028, + "learning_rate": 0.0008018074692666686, + "loss": 1.4981, + "step": 2930 + }, + { + "epoch": 0.32, + "grad_norm": 0.06456369250701985, + "learning_rate": 0.0008016686393781874, + "loss": 1.487, + "step": 2931 + }, + { + "epoch": 0.32, + "grad_norm": 0.06991722508371778, + "learning_rate": 0.0008015297729113436, + "loss": 1.4999, + "step": 2932 + }, + { + "epoch": 0.32, + "grad_norm": 0.08148887404697132, + "learning_rate": 0.0008013908698829752, + "loss": 1.3856, + "step": 2933 + }, + { + "epoch": 0.32, + "grad_norm": 0.08756760737824867, + "learning_rate": 0.000801251930309925, + "loss": 1.5644, + "step": 2934 + }, + { + "epoch": 0.32, + "grad_norm": 0.06420419497450133, + "learning_rate": 0.0008011129542090397, + "loss": 1.3941, + "step": 2935 + }, + { + "epoch": 0.32, + "grad_norm": 0.0796356391212337, + "learning_rate": 0.0008009739415971704, + "loss": 1.4468, + "step": 2936 + }, + { + "epoch": 0.32, + "grad_norm": 0.07887732694508226, + "learning_rate": 0.0008008348924911732, + "loss": 1.5729, + "step": 2937 + }, + { + "epoch": 0.32, + "grad_norm": 0.07020543228364666, + "learning_rate": 0.0008006958069079081, + "loss": 1.3993, + "step": 2938 + }, + { + "epoch": 0.32, + "grad_norm": 0.0671047003370231, + "learning_rate": 0.0008005566848642399, + "loss": 1.3402, + "step": 2939 + }, + { + "epoch": 0.32, + "grad_norm": 0.07054047003240703, + "learning_rate": 0.0008004175263770372, + "loss": 1.4655, + "step": 2940 + }, + { + "epoch": 0.32, + "grad_norm": 0.07740737220087766, + "learning_rate": 0.0008002783314631738, + "loss": 1.4764, + "step": 2941 + }, + { + "epoch": 0.32, + "grad_norm": 0.0728072252685772, + "learning_rate": 0.0008001391001395277, + "loss": 1.357, + "step": 2942 + }, + { + "epoch": 0.32, + "grad_norm": 0.07554078318188316, + "learning_rate": 0.0007999998324229809, + "loss": 1.4548, + "step": 2943 + }, + { + "epoch": 0.32, + "grad_norm": 0.08128226314789191, + "learning_rate": 0.0007998605283304201, + "loss": 1.5187, + "step": 2944 + }, + { + "epoch": 0.32, + "grad_norm": 0.07245701659717234, + "learning_rate": 0.0007997211878787365, + "loss": 1.4871, + "step": 2945 + }, + { + "epoch": 0.32, + "grad_norm": 0.08310930694834401, + "learning_rate": 0.0007995818110848256, + "loss": 1.437, + "step": 2946 + }, + { + "epoch": 0.32, + "grad_norm": 0.08043969912327549, + "learning_rate": 0.0007994423979655872, + "loss": 1.4586, + "step": 2947 + }, + { + "epoch": 0.32, + "grad_norm": 0.07315357331256875, + "learning_rate": 0.0007993029485379257, + "loss": 1.3868, + "step": 2948 + }, + { + "epoch": 0.32, + "grad_norm": 0.08625764575142641, + "learning_rate": 0.0007991634628187499, + "loss": 1.5691, + "step": 2949 + }, + { + "epoch": 0.32, + "grad_norm": 0.06859030969791598, + "learning_rate": 0.0007990239408249729, + "loss": 1.4585, + "step": 2950 + }, + { + "epoch": 0.32, + "grad_norm": 0.08095069758093375, + "learning_rate": 0.0007988843825735121, + "loss": 1.4382, + "step": 2951 + }, + { + "epoch": 0.32, + "grad_norm": 0.08455585505928823, + "learning_rate": 0.0007987447880812895, + "loss": 1.4736, + "step": 2952 + }, + { + "epoch": 0.32, + "grad_norm": 0.07840102281047322, + "learning_rate": 0.0007986051573652315, + "loss": 1.3839, + "step": 2953 + }, + { + "epoch": 0.32, + "grad_norm": 0.0781115296791198, + "learning_rate": 0.0007984654904422685, + "loss": 1.5133, + "step": 2954 + }, + { + "epoch": 0.32, + "grad_norm": 0.0743966495861294, + "learning_rate": 0.0007983257873293362, + "loss": 1.2965, + "step": 2955 + }, + { + "epoch": 0.32, + "grad_norm": 0.0751376147423777, + "learning_rate": 0.0007981860480433733, + "loss": 1.4109, + "step": 2956 + }, + { + "epoch": 0.32, + "grad_norm": 0.08181895305123754, + "learning_rate": 0.0007980462726013245, + "loss": 1.4691, + "step": 2957 + }, + { + "epoch": 0.32, + "grad_norm": 0.08562272032257762, + "learning_rate": 0.0007979064610201372, + "loss": 1.3148, + "step": 2958 + }, + { + "epoch": 0.32, + "grad_norm": 0.07781075518252807, + "learning_rate": 0.0007977666133167647, + "loss": 1.3035, + "step": 2959 + }, + { + "epoch": 0.32, + "grad_norm": 0.09447045090218623, + "learning_rate": 0.0007976267295081636, + "loss": 1.3363, + "step": 2960 + }, + { + "epoch": 0.32, + "grad_norm": 0.08987328702071136, + "learning_rate": 0.0007974868096112957, + "loss": 1.3668, + "step": 2961 + }, + { + "epoch": 0.32, + "grad_norm": 0.07679392751255011, + "learning_rate": 0.0007973468536431266, + "loss": 1.2937, + "step": 2962 + }, + { + "epoch": 0.32, + "grad_norm": 0.07944557506892269, + "learning_rate": 0.000797206861620626, + "loss": 1.376, + "step": 2963 + }, + { + "epoch": 0.32, + "grad_norm": 0.0879601918515955, + "learning_rate": 0.0007970668335607692, + "loss": 1.4672, + "step": 2964 + }, + { + "epoch": 0.32, + "grad_norm": 0.0875774791959148, + "learning_rate": 0.0007969267694805344, + "loss": 1.7008, + "step": 2965 + }, + { + "epoch": 0.32, + "grad_norm": 0.07462856123424448, + "learning_rate": 0.0007967866693969053, + "loss": 1.4685, + "step": 2966 + }, + { + "epoch": 0.32, + "grad_norm": 0.07553293553208955, + "learning_rate": 0.0007966465333268692, + "loss": 1.4977, + "step": 2967 + }, + { + "epoch": 0.32, + "grad_norm": 0.07498275141260913, + "learning_rate": 0.0007965063612874184, + "loss": 1.4594, + "step": 2968 + }, + { + "epoch": 0.32, + "grad_norm": 0.08857786850312585, + "learning_rate": 0.0007963661532955491, + "loss": 1.3205, + "step": 2969 + }, + { + "epoch": 0.32, + "grad_norm": 0.0781016657672057, + "learning_rate": 0.0007962259093682618, + "loss": 1.5386, + "step": 2970 + }, + { + "epoch": 0.32, + "grad_norm": 0.06611466290116325, + "learning_rate": 0.0007960856295225618, + "loss": 1.412, + "step": 2971 + }, + { + "epoch": 0.32, + "grad_norm": 0.05980479298650208, + "learning_rate": 0.0007959453137754586, + "loss": 1.2784, + "step": 2972 + }, + { + "epoch": 0.32, + "grad_norm": 0.08170204436084891, + "learning_rate": 0.0007958049621439658, + "loss": 1.3268, + "step": 2973 + }, + { + "epoch": 0.32, + "grad_norm": 0.07901322029342102, + "learning_rate": 0.0007956645746451014, + "loss": 1.4302, + "step": 2974 + }, + { + "epoch": 0.32, + "grad_norm": 0.07307847800484599, + "learning_rate": 0.0007955241512958881, + "loss": 1.3754, + "step": 2975 + }, + { + "epoch": 0.32, + "grad_norm": 0.06798109224771459, + "learning_rate": 0.0007953836921133526, + "loss": 1.4458, + "step": 2976 + }, + { + "epoch": 0.32, + "grad_norm": 0.07147885396357032, + "learning_rate": 0.000795243197114526, + "loss": 1.4256, + "step": 2977 + }, + { + "epoch": 0.32, + "grad_norm": 0.08764899736377428, + "learning_rate": 0.0007951026663164441, + "loss": 1.4224, + "step": 2978 + }, + { + "epoch": 0.32, + "grad_norm": 0.0789920953072474, + "learning_rate": 0.0007949620997361464, + "loss": 1.527, + "step": 2979 + }, + { + "epoch": 0.32, + "grad_norm": 0.060860731754923306, + "learning_rate": 0.0007948214973906773, + "loss": 1.3997, + "step": 2980 + }, + { + "epoch": 0.32, + "grad_norm": 0.0708414939041876, + "learning_rate": 0.0007946808592970851, + "loss": 1.3102, + "step": 2981 + }, + { + "epoch": 0.32, + "grad_norm": 0.07718632769563154, + "learning_rate": 0.0007945401854724231, + "loss": 1.2831, + "step": 2982 + }, + { + "epoch": 0.32, + "grad_norm": 0.08056455667858022, + "learning_rate": 0.0007943994759337478, + "loss": 1.3903, + "step": 2983 + }, + { + "epoch": 0.32, + "grad_norm": 0.07101624686847008, + "learning_rate": 0.0007942587306981213, + "loss": 1.4499, + "step": 2984 + }, + { + "epoch": 0.32, + "grad_norm": 0.06746511771239394, + "learning_rate": 0.0007941179497826092, + "loss": 1.5146, + "step": 2985 + }, + { + "epoch": 0.32, + "grad_norm": 0.07859460093405395, + "learning_rate": 0.0007939771332042817, + "loss": 1.5021, + "step": 2986 + }, + { + "epoch": 0.32, + "grad_norm": 0.08026433583333177, + "learning_rate": 0.0007938362809802133, + "loss": 1.5269, + "step": 2987 + }, + { + "epoch": 0.32, + "grad_norm": 0.0804769915693462, + "learning_rate": 0.0007936953931274827, + "loss": 1.4494, + "step": 2988 + }, + { + "epoch": 0.32, + "grad_norm": 0.07631714582546169, + "learning_rate": 0.0007935544696631734, + "loss": 1.4709, + "step": 2989 + }, + { + "epoch": 0.32, + "grad_norm": 0.07816795866767308, + "learning_rate": 0.0007934135106043725, + "loss": 1.4538, + "step": 2990 + }, + { + "epoch": 0.32, + "grad_norm": 0.07403291790022538, + "learning_rate": 0.000793272515968172, + "loss": 1.4281, + "step": 2991 + }, + { + "epoch": 0.32, + "grad_norm": 0.07624985603510445, + "learning_rate": 0.0007931314857716676, + "loss": 1.5291, + "step": 2992 + }, + { + "epoch": 0.32, + "grad_norm": 0.0742004553934557, + "learning_rate": 0.0007929904200319602, + "loss": 1.4579, + "step": 2993 + }, + { + "epoch": 0.32, + "grad_norm": 0.08297138817412927, + "learning_rate": 0.0007928493187661543, + "loss": 1.478, + "step": 2994 + }, + { + "epoch": 0.32, + "grad_norm": 0.07212656405189363, + "learning_rate": 0.0007927081819913589, + "loss": 1.3083, + "step": 2995 + }, + { + "epoch": 0.32, + "grad_norm": 0.07193865619974371, + "learning_rate": 0.0007925670097246871, + "loss": 1.4224, + "step": 2996 + }, + { + "epoch": 0.32, + "grad_norm": 0.08509412142880574, + "learning_rate": 0.0007924258019832569, + "loss": 1.4168, + "step": 2997 + }, + { + "epoch": 0.32, + "grad_norm": 0.07326614306013109, + "learning_rate": 0.00079228455878419, + "loss": 1.4902, + "step": 2998 + }, + { + "epoch": 0.32, + "grad_norm": 0.08526895309448661, + "learning_rate": 0.0007921432801446127, + "loss": 1.3673, + "step": 2999 + }, + { + "epoch": 0.32, + "grad_norm": 0.06863352180449873, + "learning_rate": 0.0007920019660816555, + "loss": 1.347, + "step": 3000 + }, + { + "epoch": 0.32, + "grad_norm": 0.07099094175406699, + "learning_rate": 0.0007918606166124534, + "loss": 1.4413, + "step": 3001 + }, + { + "epoch": 0.32, + "grad_norm": 0.07331999460617844, + "learning_rate": 0.000791719231754145, + "loss": 1.3459, + "step": 3002 + }, + { + "epoch": 0.32, + "grad_norm": 0.06742208138232067, + "learning_rate": 0.0007915778115238743, + "loss": 1.3944, + "step": 3003 + }, + { + "epoch": 0.32, + "grad_norm": 0.07564835035452432, + "learning_rate": 0.0007914363559387887, + "loss": 1.4459, + "step": 3004 + }, + { + "epoch": 0.32, + "grad_norm": 0.0775774753702783, + "learning_rate": 0.0007912948650160404, + "loss": 1.6455, + "step": 3005 + }, + { + "epoch": 0.32, + "grad_norm": 0.06744949006518554, + "learning_rate": 0.0007911533387727852, + "loss": 1.377, + "step": 3006 + }, + { + "epoch": 0.32, + "grad_norm": 0.07458417426910349, + "learning_rate": 0.0007910117772261839, + "loss": 1.3564, + "step": 3007 + }, + { + "epoch": 0.32, + "grad_norm": 0.06923492807546411, + "learning_rate": 0.0007908701803934013, + "loss": 1.4969, + "step": 3008 + }, + { + "epoch": 0.32, + "grad_norm": 0.0741115394912985, + "learning_rate": 0.0007907285482916067, + "loss": 1.5018, + "step": 3009 + }, + { + "epoch": 0.32, + "grad_norm": 0.07936484472761873, + "learning_rate": 0.0007905868809379734, + "loss": 1.5695, + "step": 3010 + }, + { + "epoch": 0.32, + "grad_norm": 0.07847917265702553, + "learning_rate": 0.0007904451783496789, + "loss": 1.3488, + "step": 3011 + }, + { + "epoch": 0.32, + "grad_norm": 0.07361634529449362, + "learning_rate": 0.0007903034405439051, + "loss": 1.3476, + "step": 3012 + }, + { + "epoch": 0.32, + "grad_norm": 0.06785097870413606, + "learning_rate": 0.0007901616675378383, + "loss": 1.4224, + "step": 3013 + }, + { + "epoch": 0.32, + "grad_norm": 0.08175453562213789, + "learning_rate": 0.0007900198593486691, + "loss": 1.4744, + "step": 3014 + }, + { + "epoch": 0.32, + "grad_norm": 0.07547507633226375, + "learning_rate": 0.0007898780159935921, + "loss": 1.4698, + "step": 3015 + }, + { + "epoch": 0.32, + "grad_norm": 0.0743726151849371, + "learning_rate": 0.0007897361374898062, + "loss": 1.4479, + "step": 3016 + }, + { + "epoch": 0.32, + "grad_norm": 0.08480416967435063, + "learning_rate": 0.0007895942238545148, + "loss": 1.385, + "step": 3017 + }, + { + "epoch": 0.32, + "grad_norm": 0.07394595425252229, + "learning_rate": 0.0007894522751049253, + "loss": 1.4534, + "step": 3018 + }, + { + "epoch": 0.32, + "grad_norm": 0.0714067033877472, + "learning_rate": 0.0007893102912582497, + "loss": 1.4485, + "step": 3019 + }, + { + "epoch": 0.32, + "grad_norm": 0.07582505910196573, + "learning_rate": 0.000789168272331704, + "loss": 1.4941, + "step": 3020 + }, + { + "epoch": 0.32, + "grad_norm": 0.0731420165751429, + "learning_rate": 0.0007890262183425081, + "loss": 1.4112, + "step": 3021 + }, + { + "epoch": 0.32, + "grad_norm": 0.09305550460370213, + "learning_rate": 0.0007888841293078868, + "loss": 1.4552, + "step": 3022 + }, + { + "epoch": 0.32, + "grad_norm": 0.06702964079143485, + "learning_rate": 0.0007887420052450689, + "loss": 1.4836, + "step": 3023 + }, + { + "epoch": 0.33, + "grad_norm": 0.07074665516379978, + "learning_rate": 0.0007885998461712875, + "loss": 1.4678, + "step": 3024 + }, + { + "epoch": 0.33, + "grad_norm": 0.07983639768866699, + "learning_rate": 0.0007884576521037798, + "loss": 1.4624, + "step": 3025 + }, + { + "epoch": 0.33, + "grad_norm": 0.07033851096059493, + "learning_rate": 0.0007883154230597873, + "loss": 1.3853, + "step": 3026 + }, + { + "epoch": 0.33, + "grad_norm": 0.08476954997112443, + "learning_rate": 0.0007881731590565559, + "loss": 1.4094, + "step": 3027 + }, + { + "epoch": 0.33, + "grad_norm": 0.07706177346613348, + "learning_rate": 0.0007880308601113352, + "loss": 1.4241, + "step": 3028 + }, + { + "epoch": 0.33, + "grad_norm": 0.08278153657107577, + "learning_rate": 0.0007878885262413799, + "loss": 1.3777, + "step": 3029 + }, + { + "epoch": 0.33, + "grad_norm": 0.08990975215370425, + "learning_rate": 0.0007877461574639484, + "loss": 1.5666, + "step": 3030 + }, + { + "epoch": 0.33, + "grad_norm": 0.08601960681426747, + "learning_rate": 0.0007876037537963033, + "loss": 1.3981, + "step": 3031 + }, + { + "epoch": 0.33, + "grad_norm": 0.07788237507460331, + "learning_rate": 0.0007874613152557113, + "loss": 1.3502, + "step": 3032 + }, + { + "epoch": 0.33, + "grad_norm": 0.104418842475275, + "learning_rate": 0.0007873188418594438, + "loss": 1.5997, + "step": 3033 + }, + { + "epoch": 0.33, + "grad_norm": 0.08013070374938921, + "learning_rate": 0.0007871763336247764, + "loss": 1.481, + "step": 3034 + }, + { + "epoch": 0.33, + "grad_norm": 0.07381930904128295, + "learning_rate": 0.0007870337905689881, + "loss": 1.3511, + "step": 3035 + }, + { + "epoch": 0.33, + "grad_norm": 0.08219086071312165, + "learning_rate": 0.0007868912127093638, + "loss": 1.3737, + "step": 3036 + }, + { + "epoch": 0.33, + "grad_norm": 0.08179829364428115, + "learning_rate": 0.0007867486000631903, + "loss": 1.595, + "step": 3037 + }, + { + "epoch": 0.33, + "grad_norm": 0.09105328320067167, + "learning_rate": 0.0007866059526477607, + "loss": 1.5167, + "step": 3038 + }, + { + "epoch": 0.33, + "grad_norm": 0.07385748224030242, + "learning_rate": 0.0007864632704803711, + "loss": 1.3882, + "step": 3039 + }, + { + "epoch": 0.33, + "grad_norm": 0.08146182608657129, + "learning_rate": 0.0007863205535783227, + "loss": 1.4446, + "step": 3040 + }, + { + "epoch": 0.33, + "grad_norm": 0.08225645645470177, + "learning_rate": 0.0007861778019589198, + "loss": 1.3429, + "step": 3041 + }, + { + "epoch": 0.33, + "grad_norm": 0.07002468941388816, + "learning_rate": 0.000786035015639472, + "loss": 1.4645, + "step": 3042 + }, + { + "epoch": 0.33, + "grad_norm": 0.08573645543248615, + "learning_rate": 0.0007858921946372922, + "loss": 1.352, + "step": 3043 + }, + { + "epoch": 0.33, + "grad_norm": 0.07785179379080004, + "learning_rate": 0.0007857493389696983, + "loss": 1.4217, + "step": 3044 + }, + { + "epoch": 0.33, + "grad_norm": 0.06999357815245667, + "learning_rate": 0.0007856064486540119, + "loss": 1.5484, + "step": 3045 + }, + { + "epoch": 0.33, + "grad_norm": 0.07645852761959619, + "learning_rate": 0.0007854635237075591, + "loss": 1.382, + "step": 3046 + }, + { + "epoch": 0.33, + "grad_norm": 0.07006480211273053, + "learning_rate": 0.0007853205641476698, + "loss": 1.409, + "step": 3047 + }, + { + "epoch": 0.33, + "grad_norm": 0.07233610453690911, + "learning_rate": 0.0007851775699916786, + "loss": 1.5152, + "step": 3048 + }, + { + "epoch": 0.33, + "grad_norm": 0.06938021129476968, + "learning_rate": 0.0007850345412569236, + "loss": 1.4692, + "step": 3049 + }, + { + "epoch": 0.33, + "grad_norm": 0.07696153387851845, + "learning_rate": 0.0007848914779607479, + "loss": 1.3746, + "step": 3050 + }, + { + "epoch": 0.33, + "grad_norm": 0.07222688770788331, + "learning_rate": 0.0007847483801204984, + "loss": 1.3984, + "step": 3051 + }, + { + "epoch": 0.33, + "grad_norm": 0.07217459083677298, + "learning_rate": 0.0007846052477535262, + "loss": 1.4673, + "step": 3052 + }, + { + "epoch": 0.33, + "grad_norm": 0.07665892010803753, + "learning_rate": 0.0007844620808771864, + "loss": 1.2931, + "step": 3053 + }, + { + "epoch": 0.33, + "grad_norm": 0.0753925094097213, + "learning_rate": 0.0007843188795088386, + "loss": 1.5428, + "step": 3054 + }, + { + "epoch": 0.33, + "grad_norm": 0.10489383050028697, + "learning_rate": 0.0007841756436658464, + "loss": 1.3772, + "step": 3055 + }, + { + "epoch": 0.33, + "grad_norm": 0.07367833059294322, + "learning_rate": 0.0007840323733655779, + "loss": 1.4949, + "step": 3056 + }, + { + "epoch": 0.33, + "grad_norm": 0.0720997223768572, + "learning_rate": 0.0007838890686254048, + "loss": 1.3626, + "step": 3057 + }, + { + "epoch": 0.33, + "grad_norm": 0.07082156336721733, + "learning_rate": 0.0007837457294627035, + "loss": 1.4854, + "step": 3058 + }, + { + "epoch": 0.33, + "grad_norm": 0.06935942782492194, + "learning_rate": 0.0007836023558948542, + "loss": 1.4135, + "step": 3059 + }, + { + "epoch": 0.33, + "grad_norm": 0.08042147923845204, + "learning_rate": 0.0007834589479392415, + "loss": 1.2829, + "step": 3060 + }, + { + "epoch": 0.33, + "grad_norm": 0.07681957831308812, + "learning_rate": 0.0007833155056132543, + "loss": 1.5114, + "step": 3061 + }, + { + "epoch": 0.33, + "grad_norm": 0.06971247023737136, + "learning_rate": 0.0007831720289342851, + "loss": 1.2874, + "step": 3062 + }, + { + "epoch": 0.33, + "grad_norm": 0.07685493671422122, + "learning_rate": 0.0007830285179197313, + "loss": 1.5248, + "step": 3063 + }, + { + "epoch": 0.33, + "grad_norm": 0.08086964730881499, + "learning_rate": 0.0007828849725869939, + "loss": 1.575, + "step": 3064 + }, + { + "epoch": 0.33, + "grad_norm": 0.07938799977856249, + "learning_rate": 0.0007827413929534783, + "loss": 1.4236, + "step": 3065 + }, + { + "epoch": 0.33, + "grad_norm": 0.07732912826079967, + "learning_rate": 0.000782597779036594, + "loss": 1.5576, + "step": 3066 + }, + { + "epoch": 0.33, + "grad_norm": 0.07555603999638347, + "learning_rate": 0.0007824541308537547, + "loss": 1.437, + "step": 3067 + }, + { + "epoch": 0.33, + "grad_norm": 0.0723582025951719, + "learning_rate": 0.0007823104484223784, + "loss": 1.3081, + "step": 3068 + }, + { + "epoch": 0.33, + "grad_norm": 0.07712330288020602, + "learning_rate": 0.0007821667317598871, + "loss": 1.4435, + "step": 3069 + }, + { + "epoch": 0.33, + "grad_norm": 0.07022497073587027, + "learning_rate": 0.0007820229808837065, + "loss": 1.4977, + "step": 3070 + }, + { + "epoch": 0.33, + "grad_norm": 0.08805211648596689, + "learning_rate": 0.0007818791958112672, + "loss": 1.4112, + "step": 3071 + }, + { + "epoch": 0.33, + "grad_norm": 0.09148966969586016, + "learning_rate": 0.0007817353765600038, + "loss": 1.4515, + "step": 3072 + }, + { + "epoch": 0.33, + "grad_norm": 0.06831467563512097, + "learning_rate": 0.0007815915231473547, + "loss": 1.4484, + "step": 3073 + }, + { + "epoch": 0.33, + "grad_norm": 0.07058078689286881, + "learning_rate": 0.0007814476355907625, + "loss": 1.5195, + "step": 3074 + }, + { + "epoch": 0.33, + "grad_norm": 0.07712659647369673, + "learning_rate": 0.0007813037139076742, + "loss": 1.5531, + "step": 3075 + }, + { + "epoch": 0.33, + "grad_norm": 0.07372955770119158, + "learning_rate": 0.0007811597581155407, + "loss": 1.2755, + "step": 3076 + }, + { + "epoch": 0.33, + "grad_norm": 0.06877276645781746, + "learning_rate": 0.0007810157682318174, + "loss": 1.4825, + "step": 3077 + }, + { + "epoch": 0.33, + "grad_norm": 0.07132173224006651, + "learning_rate": 0.0007808717442739634, + "loss": 1.4652, + "step": 3078 + }, + { + "epoch": 0.33, + "grad_norm": 0.07358607939108996, + "learning_rate": 0.000780727686259442, + "loss": 1.4582, + "step": 3079 + }, + { + "epoch": 0.33, + "grad_norm": 0.07282302440663897, + "learning_rate": 0.0007805835942057209, + "loss": 1.4773, + "step": 3080 + }, + { + "epoch": 0.33, + "grad_norm": 0.08644424392136274, + "learning_rate": 0.0007804394681302716, + "loss": 1.3965, + "step": 3081 + }, + { + "epoch": 0.33, + "grad_norm": 0.07859450169273975, + "learning_rate": 0.0007802953080505699, + "loss": 1.5057, + "step": 3082 + }, + { + "epoch": 0.33, + "grad_norm": 0.07110005546604672, + "learning_rate": 0.000780151113984096, + "loss": 1.4402, + "step": 3083 + }, + { + "epoch": 0.33, + "grad_norm": 0.06966222966144436, + "learning_rate": 0.0007800068859483335, + "loss": 1.4002, + "step": 3084 + }, + { + "epoch": 0.33, + "grad_norm": 0.07523096465564282, + "learning_rate": 0.0007798626239607709, + "loss": 1.4039, + "step": 3085 + }, + { + "epoch": 0.33, + "grad_norm": 0.08023929078514579, + "learning_rate": 0.0007797183280389002, + "loss": 1.3024, + "step": 3086 + }, + { + "epoch": 0.33, + "grad_norm": 0.06607105968539327, + "learning_rate": 0.0007795739982002179, + "loss": 1.4149, + "step": 3087 + }, + { + "epoch": 0.33, + "grad_norm": 0.07990908371456909, + "learning_rate": 0.0007794296344622245, + "loss": 1.5067, + "step": 3088 + }, + { + "epoch": 0.33, + "grad_norm": 0.07774648613968008, + "learning_rate": 0.0007792852368424246, + "loss": 1.3315, + "step": 3089 + }, + { + "epoch": 0.33, + "grad_norm": 0.06540989487465651, + "learning_rate": 0.0007791408053583269, + "loss": 1.3436, + "step": 3090 + }, + { + "epoch": 0.33, + "grad_norm": 0.07598791466657227, + "learning_rate": 0.0007789963400274443, + "loss": 1.4636, + "step": 3091 + }, + { + "epoch": 0.33, + "grad_norm": 0.07035326107353247, + "learning_rate": 0.0007788518408672934, + "loss": 1.4288, + "step": 3092 + }, + { + "epoch": 0.33, + "grad_norm": 0.07360792080801745, + "learning_rate": 0.0007787073078953955, + "loss": 1.4191, + "step": 3093 + }, + { + "epoch": 0.33, + "grad_norm": 0.08166605972997906, + "learning_rate": 0.0007785627411292757, + "loss": 1.4531, + "step": 3094 + }, + { + "epoch": 0.33, + "grad_norm": 0.08293183860962706, + "learning_rate": 0.0007784181405864633, + "loss": 1.4735, + "step": 3095 + }, + { + "epoch": 0.33, + "grad_norm": 0.07091869532476382, + "learning_rate": 0.0007782735062844914, + "loss": 1.5312, + "step": 3096 + }, + { + "epoch": 0.33, + "grad_norm": 0.07628603322010782, + "learning_rate": 0.0007781288382408975, + "loss": 1.4945, + "step": 3097 + }, + { + "epoch": 0.33, + "grad_norm": 0.07645251008994298, + "learning_rate": 0.0007779841364732231, + "loss": 1.5767, + "step": 3098 + }, + { + "epoch": 0.33, + "grad_norm": 0.07069182150804638, + "learning_rate": 0.0007778394009990138, + "loss": 1.3911, + "step": 3099 + }, + { + "epoch": 0.33, + "grad_norm": 0.06384757627433746, + "learning_rate": 0.0007776946318358193, + "loss": 1.4573, + "step": 3100 + }, + { + "epoch": 0.33, + "grad_norm": 0.06905081379306961, + "learning_rate": 0.0007775498290011934, + "loss": 1.4385, + "step": 3101 + }, + { + "epoch": 0.33, + "grad_norm": 0.06830814690017548, + "learning_rate": 0.0007774049925126938, + "loss": 1.3747, + "step": 3102 + }, + { + "epoch": 0.33, + "grad_norm": 0.07537641175106087, + "learning_rate": 0.0007772601223878825, + "loss": 1.4257, + "step": 3103 + }, + { + "epoch": 0.33, + "grad_norm": 0.06944422593512861, + "learning_rate": 0.0007771152186443254, + "loss": 1.3538, + "step": 3104 + }, + { + "epoch": 0.33, + "grad_norm": 0.06475440686857296, + "learning_rate": 0.0007769702812995929, + "loss": 1.4781, + "step": 3105 + }, + { + "epoch": 0.33, + "grad_norm": 0.06935888065280987, + "learning_rate": 0.0007768253103712588, + "loss": 1.5099, + "step": 3106 + }, + { + "epoch": 0.33, + "grad_norm": 0.0958797906744289, + "learning_rate": 0.0007766803058769015, + "loss": 1.423, + "step": 3107 + }, + { + "epoch": 0.33, + "grad_norm": 0.07001471875357002, + "learning_rate": 0.0007765352678341031, + "loss": 1.3799, + "step": 3108 + }, + { + "epoch": 0.33, + "grad_norm": 0.06829461879326418, + "learning_rate": 0.0007763901962604502, + "loss": 1.4027, + "step": 3109 + }, + { + "epoch": 0.33, + "grad_norm": 0.07110317044671406, + "learning_rate": 0.0007762450911735333, + "loss": 1.3645, + "step": 3110 + }, + { + "epoch": 0.33, + "grad_norm": 0.06932317253596268, + "learning_rate": 0.0007760999525909465, + "loss": 1.3177, + "step": 3111 + }, + { + "epoch": 0.33, + "grad_norm": 0.07623931912800835, + "learning_rate": 0.0007759547805302887, + "loss": 1.5691, + "step": 3112 + }, + { + "epoch": 0.33, + "grad_norm": 0.07738409838265192, + "learning_rate": 0.0007758095750091624, + "loss": 1.376, + "step": 3113 + }, + { + "epoch": 0.33, + "grad_norm": 0.07384334853814246, + "learning_rate": 0.0007756643360451742, + "loss": 1.4941, + "step": 3114 + }, + { + "epoch": 0.33, + "grad_norm": 0.07783517274774741, + "learning_rate": 0.0007755190636559349, + "loss": 1.5442, + "step": 3115 + }, + { + "epoch": 0.33, + "grad_norm": 0.08431183660573531, + "learning_rate": 0.0007753737578590594, + "loss": 1.3991, + "step": 3116 + }, + { + "epoch": 0.34, + "grad_norm": 0.07425496823855175, + "learning_rate": 0.0007752284186721664, + "loss": 1.4384, + "step": 3117 + }, + { + "epoch": 0.34, + "grad_norm": 0.07768753925552281, + "learning_rate": 0.0007750830461128786, + "loss": 1.5313, + "step": 3118 + }, + { + "epoch": 0.34, + "grad_norm": 0.0742130465574765, + "learning_rate": 0.0007749376401988232, + "loss": 1.4509, + "step": 3119 + }, + { + "epoch": 0.34, + "grad_norm": 0.07106119474212085, + "learning_rate": 0.0007747922009476311, + "loss": 1.4476, + "step": 3120 + }, + { + "epoch": 0.34, + "grad_norm": 0.08026943237402184, + "learning_rate": 0.0007746467283769373, + "loss": 1.4237, + "step": 3121 + }, + { + "epoch": 0.34, + "grad_norm": 0.07251319491821662, + "learning_rate": 0.0007745012225043808, + "loss": 1.4031, + "step": 3122 + }, + { + "epoch": 0.34, + "grad_norm": 0.0838468737335798, + "learning_rate": 0.0007743556833476046, + "loss": 1.3595, + "step": 3123 + }, + { + "epoch": 0.34, + "grad_norm": 0.07248789364297477, + "learning_rate": 0.0007742101109242561, + "loss": 1.3831, + "step": 3124 + }, + { + "epoch": 0.34, + "grad_norm": 0.06538596756504367, + "learning_rate": 0.0007740645052519863, + "loss": 1.4328, + "step": 3125 + }, + { + "epoch": 0.34, + "grad_norm": 0.07321164447995468, + "learning_rate": 0.0007739188663484504, + "loss": 1.4778, + "step": 3126 + }, + { + "epoch": 0.34, + "grad_norm": 0.06658563944433109, + "learning_rate": 0.0007737731942313075, + "loss": 1.4636, + "step": 3127 + }, + { + "epoch": 0.34, + "grad_norm": 0.07882492086563908, + "learning_rate": 0.000773627488918221, + "loss": 1.4752, + "step": 3128 + }, + { + "epoch": 0.34, + "grad_norm": 0.07703285975834412, + "learning_rate": 0.0007734817504268581, + "loss": 1.4747, + "step": 3129 + }, + { + "epoch": 0.34, + "grad_norm": 0.07459758239690902, + "learning_rate": 0.0007733359787748903, + "loss": 1.3904, + "step": 3130 + }, + { + "epoch": 0.34, + "grad_norm": 0.07884448125379544, + "learning_rate": 0.0007731901739799926, + "loss": 1.4475, + "step": 3131 + }, + { + "epoch": 0.34, + "grad_norm": 0.07877035915224383, + "learning_rate": 0.0007730443360598446, + "loss": 1.2817, + "step": 3132 + }, + { + "epoch": 0.34, + "grad_norm": 0.07954104659452825, + "learning_rate": 0.0007728984650321294, + "loss": 1.3287, + "step": 3133 + }, + { + "epoch": 0.34, + "grad_norm": 0.07618254195620103, + "learning_rate": 0.0007727525609145345, + "loss": 1.3799, + "step": 3134 + }, + { + "epoch": 0.34, + "grad_norm": 0.07133598285920512, + "learning_rate": 0.0007726066237247513, + "loss": 1.4582, + "step": 3135 + }, + { + "epoch": 0.34, + "grad_norm": 0.08030773599573748, + "learning_rate": 0.000772460653480475, + "loss": 1.3284, + "step": 3136 + }, + { + "epoch": 0.34, + "grad_norm": 0.07601094209678182, + "learning_rate": 0.0007723146501994053, + "loss": 1.4454, + "step": 3137 + }, + { + "epoch": 0.34, + "grad_norm": 0.07237544124895776, + "learning_rate": 0.0007721686138992456, + "loss": 1.4829, + "step": 3138 + }, + { + "epoch": 0.34, + "grad_norm": 0.07307223788740541, + "learning_rate": 0.0007720225445977029, + "loss": 1.5171, + "step": 3139 + }, + { + "epoch": 0.34, + "grad_norm": 0.07334412315927522, + "learning_rate": 0.000771876442312489, + "loss": 1.3808, + "step": 3140 + }, + { + "epoch": 0.34, + "grad_norm": 0.07045811403441511, + "learning_rate": 0.0007717303070613192, + "loss": 1.5331, + "step": 3141 + }, + { + "epoch": 0.34, + "grad_norm": 0.06922844829063676, + "learning_rate": 0.0007715841388619129, + "loss": 1.3979, + "step": 3142 + }, + { + "epoch": 0.34, + "grad_norm": 0.07247091145004228, + "learning_rate": 0.0007714379377319933, + "loss": 1.3762, + "step": 3143 + }, + { + "epoch": 0.34, + "grad_norm": 0.08291167413964351, + "learning_rate": 0.0007712917036892881, + "loss": 1.5631, + "step": 3144 + }, + { + "epoch": 0.34, + "grad_norm": 0.07776849587307075, + "learning_rate": 0.0007711454367515284, + "loss": 1.4031, + "step": 3145 + }, + { + "epoch": 0.34, + "grad_norm": 0.0775914855157828, + "learning_rate": 0.0007709991369364498, + "loss": 1.2737, + "step": 3146 + }, + { + "epoch": 0.34, + "grad_norm": 0.07038223844706518, + "learning_rate": 0.0007708528042617915, + "loss": 1.3721, + "step": 3147 + }, + { + "epoch": 0.34, + "grad_norm": 0.06373412075036435, + "learning_rate": 0.0007707064387452972, + "loss": 1.4981, + "step": 3148 + }, + { + "epoch": 0.34, + "grad_norm": 0.07680543115273483, + "learning_rate": 0.0007705600404047135, + "loss": 1.5106, + "step": 3149 + }, + { + "epoch": 0.34, + "grad_norm": 0.07667822761501376, + "learning_rate": 0.0007704136092577925, + "loss": 1.4488, + "step": 3150 + }, + { + "epoch": 0.34, + "grad_norm": 0.0650475144264375, + "learning_rate": 0.000770267145322289, + "loss": 1.5037, + "step": 3151 + }, + { + "epoch": 0.34, + "grad_norm": 0.07429507685088853, + "learning_rate": 0.0007701206486159622, + "loss": 1.2532, + "step": 3152 + }, + { + "epoch": 0.34, + "grad_norm": 0.06955586145840957, + "learning_rate": 0.0007699741191565758, + "loss": 1.4294, + "step": 3153 + }, + { + "epoch": 0.34, + "grad_norm": 0.08122438166751517, + "learning_rate": 0.0007698275569618965, + "loss": 1.4821, + "step": 3154 + }, + { + "epoch": 0.34, + "grad_norm": 0.08451174509670904, + "learning_rate": 0.0007696809620496958, + "loss": 1.4029, + "step": 3155 + }, + { + "epoch": 0.34, + "grad_norm": 0.08009083722558016, + "learning_rate": 0.0007695343344377485, + "loss": 1.4404, + "step": 3156 + }, + { + "epoch": 0.34, + "grad_norm": 0.07499676196986182, + "learning_rate": 0.0007693876741438341, + "loss": 1.383, + "step": 3157 + }, + { + "epoch": 0.34, + "grad_norm": 0.0725984705093447, + "learning_rate": 0.0007692409811857356, + "loss": 1.5022, + "step": 3158 + }, + { + "epoch": 0.34, + "grad_norm": 0.07422957294380073, + "learning_rate": 0.0007690942555812397, + "loss": 1.3875, + "step": 3159 + }, + { + "epoch": 0.34, + "grad_norm": 0.06613298070529067, + "learning_rate": 0.0007689474973481377, + "loss": 1.4326, + "step": 3160 + }, + { + "epoch": 0.34, + "grad_norm": 0.08919731981533549, + "learning_rate": 0.0007688007065042245, + "loss": 1.4545, + "step": 3161 + }, + { + "epoch": 0.34, + "grad_norm": 0.07398471196751458, + "learning_rate": 0.0007686538830672988, + "loss": 1.4074, + "step": 3162 + }, + { + "epoch": 0.34, + "grad_norm": 0.08089708897669948, + "learning_rate": 0.0007685070270551638, + "loss": 1.452, + "step": 3163 + }, + { + "epoch": 0.34, + "grad_norm": 0.0789578097029579, + "learning_rate": 0.000768360138485626, + "loss": 1.4309, + "step": 3164 + }, + { + "epoch": 0.34, + "grad_norm": 0.07689564974172196, + "learning_rate": 0.000768213217376496, + "loss": 1.4699, + "step": 3165 + }, + { + "epoch": 0.34, + "grad_norm": 0.07618594956580693, + "learning_rate": 0.0007680662637455889, + "loss": 1.3925, + "step": 3166 + }, + { + "epoch": 0.34, + "grad_norm": 0.07046309543070939, + "learning_rate": 0.0007679192776107232, + "loss": 1.4711, + "step": 3167 + }, + { + "epoch": 0.34, + "grad_norm": 0.06065773332639102, + "learning_rate": 0.0007677722589897214, + "loss": 1.4663, + "step": 3168 + }, + { + "epoch": 0.34, + "grad_norm": 0.06448888328620603, + "learning_rate": 0.0007676252079004101, + "loss": 1.2777, + "step": 3169 + }, + { + "epoch": 0.34, + "grad_norm": 0.06919677462771456, + "learning_rate": 0.0007674781243606197, + "loss": 1.4472, + "step": 3170 + }, + { + "epoch": 0.34, + "grad_norm": 0.07706927662533691, + "learning_rate": 0.0007673310083881844, + "loss": 1.4423, + "step": 3171 + }, + { + "epoch": 0.34, + "grad_norm": 0.07029417746284516, + "learning_rate": 0.0007671838600009429, + "loss": 1.5247, + "step": 3172 + }, + { + "epoch": 0.34, + "grad_norm": 0.07006293784244212, + "learning_rate": 0.0007670366792167371, + "loss": 1.3477, + "step": 3173 + }, + { + "epoch": 0.34, + "grad_norm": 0.0665223617685133, + "learning_rate": 0.0007668894660534135, + "loss": 1.3575, + "step": 3174 + }, + { + "epoch": 0.34, + "grad_norm": 0.07626888473910773, + "learning_rate": 0.0007667422205288219, + "loss": 1.4666, + "step": 3175 + }, + { + "epoch": 0.34, + "grad_norm": 0.0680382115354333, + "learning_rate": 0.0007665949426608164, + "loss": 1.4727, + "step": 3176 + }, + { + "epoch": 0.34, + "grad_norm": 0.0718190962160836, + "learning_rate": 0.0007664476324672552, + "loss": 1.5404, + "step": 3177 + }, + { + "epoch": 0.34, + "grad_norm": 0.0793876214439945, + "learning_rate": 0.0007663002899659999, + "loss": 1.477, + "step": 3178 + }, + { + "epoch": 0.34, + "grad_norm": 0.07195150887665572, + "learning_rate": 0.0007661529151749163, + "loss": 1.4443, + "step": 3179 + }, + { + "epoch": 0.34, + "grad_norm": 0.07746803954967403, + "learning_rate": 0.0007660055081118742, + "loss": 1.5019, + "step": 3180 + }, + { + "epoch": 0.34, + "grad_norm": 0.07756374186181363, + "learning_rate": 0.0007658580687947473, + "loss": 1.5107, + "step": 3181 + }, + { + "epoch": 0.34, + "grad_norm": 0.084088607358134, + "learning_rate": 0.0007657105972414128, + "loss": 1.35, + "step": 3182 + }, + { + "epoch": 0.34, + "grad_norm": 0.07775278682089769, + "learning_rate": 0.0007655630934697524, + "loss": 1.4803, + "step": 3183 + }, + { + "epoch": 0.34, + "grad_norm": 0.07602799563639731, + "learning_rate": 0.0007654155574976515, + "loss": 1.5302, + "step": 3184 + }, + { + "epoch": 0.34, + "grad_norm": 0.09250492590276259, + "learning_rate": 0.0007652679893429993, + "loss": 1.5617, + "step": 3185 + }, + { + "epoch": 0.34, + "grad_norm": 0.07556500064817434, + "learning_rate": 0.0007651203890236886, + "loss": 1.3106, + "step": 3186 + }, + { + "epoch": 0.34, + "grad_norm": 0.0823390739699376, + "learning_rate": 0.0007649727565576168, + "loss": 1.399, + "step": 3187 + }, + { + "epoch": 0.34, + "grad_norm": 0.08353330942823384, + "learning_rate": 0.0007648250919626849, + "loss": 1.5233, + "step": 3188 + }, + { + "epoch": 0.34, + "grad_norm": 0.07649777003122347, + "learning_rate": 0.0007646773952567975, + "loss": 1.4492, + "step": 3189 + }, + { + "epoch": 0.34, + "grad_norm": 0.07280004721530052, + "learning_rate": 0.0007645296664578635, + "loss": 1.3165, + "step": 3190 + }, + { + "epoch": 0.34, + "grad_norm": 0.09933336670129464, + "learning_rate": 0.0007643819055837955, + "loss": 1.3888, + "step": 3191 + }, + { + "epoch": 0.34, + "grad_norm": 0.07779644484036152, + "learning_rate": 0.0007642341126525099, + "loss": 1.3763, + "step": 3192 + }, + { + "epoch": 0.34, + "grad_norm": 0.0683306880772141, + "learning_rate": 0.0007640862876819271, + "loss": 1.3403, + "step": 3193 + }, + { + "epoch": 0.34, + "grad_norm": 0.07300035128576285, + "learning_rate": 0.0007639384306899716, + "loss": 1.4203, + "step": 3194 + }, + { + "epoch": 0.34, + "grad_norm": 0.07191331871436503, + "learning_rate": 0.0007637905416945716, + "loss": 1.3708, + "step": 3195 + }, + { + "epoch": 0.34, + "grad_norm": 0.06748665275475779, + "learning_rate": 0.0007636426207136587, + "loss": 1.3958, + "step": 3196 + }, + { + "epoch": 0.34, + "grad_norm": 0.06979899017349758, + "learning_rate": 0.0007634946677651693, + "loss": 1.5443, + "step": 3197 + }, + { + "epoch": 0.34, + "grad_norm": 0.0665352354043053, + "learning_rate": 0.000763346682867043, + "loss": 1.4987, + "step": 3198 + }, + { + "epoch": 0.34, + "grad_norm": 0.08238063923115757, + "learning_rate": 0.0007631986660372234, + "loss": 1.4835, + "step": 3199 + }, + { + "epoch": 0.34, + "grad_norm": 0.06784751022670603, + "learning_rate": 0.0007630506172936583, + "loss": 1.5603, + "step": 3200 + }, + { + "epoch": 0.34, + "grad_norm": 0.06156458689570208, + "learning_rate": 0.000762902536654299, + "loss": 1.4053, + "step": 3201 + }, + { + "epoch": 0.34, + "grad_norm": 0.07874911875458783, + "learning_rate": 0.0007627544241371005, + "loss": 1.4591, + "step": 3202 + }, + { + "epoch": 0.34, + "grad_norm": 0.06958839969062831, + "learning_rate": 0.0007626062797600225, + "loss": 1.43, + "step": 3203 + }, + { + "epoch": 0.34, + "grad_norm": 0.07551242792555159, + "learning_rate": 0.0007624581035410277, + "loss": 1.539, + "step": 3204 + }, + { + "epoch": 0.34, + "grad_norm": 0.0802738339351694, + "learning_rate": 0.000762309895498083, + "loss": 1.411, + "step": 3205 + }, + { + "epoch": 0.34, + "grad_norm": 0.07610686443993148, + "learning_rate": 0.0007621616556491591, + "loss": 1.331, + "step": 3206 + }, + { + "epoch": 0.34, + "grad_norm": 0.07973442227199408, + "learning_rate": 0.0007620133840122305, + "loss": 1.3919, + "step": 3207 + }, + { + "epoch": 0.34, + "grad_norm": 0.06935731716024028, + "learning_rate": 0.0007618650806052761, + "loss": 1.468, + "step": 3208 + }, + { + "epoch": 0.34, + "grad_norm": 0.07140926619478406, + "learning_rate": 0.0007617167454462777, + "loss": 1.3073, + "step": 3209 + }, + { + "epoch": 0.35, + "grad_norm": 0.07628062933111013, + "learning_rate": 0.0007615683785532217, + "loss": 1.4563, + "step": 3210 + }, + { + "epoch": 0.35, + "grad_norm": 0.06641544863858916, + "learning_rate": 0.000761419979944098, + "loss": 1.488, + "step": 3211 + }, + { + "epoch": 0.35, + "grad_norm": 0.07750941731981588, + "learning_rate": 0.0007612715496369006, + "loss": 1.4531, + "step": 3212 + }, + { + "epoch": 0.35, + "grad_norm": 0.06603884034822871, + "learning_rate": 0.0007611230876496269, + "loss": 1.3834, + "step": 3213 + }, + { + "epoch": 0.35, + "grad_norm": 0.07814376352410252, + "learning_rate": 0.0007609745940002785, + "loss": 1.2924, + "step": 3214 + }, + { + "epoch": 0.35, + "grad_norm": 0.06860503690532974, + "learning_rate": 0.0007608260687068611, + "loss": 1.3301, + "step": 3215 + }, + { + "epoch": 0.35, + "grad_norm": 0.07411296420440955, + "learning_rate": 0.0007606775117873836, + "loss": 1.4608, + "step": 3216 + }, + { + "epoch": 0.35, + "grad_norm": 0.0887118333530251, + "learning_rate": 0.0007605289232598591, + "loss": 1.4031, + "step": 3217 + }, + { + "epoch": 0.35, + "grad_norm": 0.0749536165963251, + "learning_rate": 0.0007603803031423045, + "loss": 1.3985, + "step": 3218 + }, + { + "epoch": 0.35, + "grad_norm": 0.08599613367738648, + "learning_rate": 0.0007602316514527404, + "loss": 1.3853, + "step": 3219 + }, + { + "epoch": 0.35, + "grad_norm": 0.06959589644535209, + "learning_rate": 0.0007600829682091915, + "loss": 1.3899, + "step": 3220 + }, + { + "epoch": 0.35, + "grad_norm": 0.08227183337119814, + "learning_rate": 0.0007599342534296861, + "loss": 1.573, + "step": 3221 + }, + { + "epoch": 0.35, + "grad_norm": 0.07457434958035399, + "learning_rate": 0.0007597855071322561, + "loss": 1.4239, + "step": 3222 + }, + { + "epoch": 0.35, + "grad_norm": 0.07725146361782721, + "learning_rate": 0.000759636729334938, + "loss": 1.4454, + "step": 3223 + }, + { + "epoch": 0.35, + "grad_norm": 0.07127251988346796, + "learning_rate": 0.0007594879200557711, + "loss": 1.2921, + "step": 3224 + }, + { + "epoch": 0.35, + "grad_norm": 0.08661988788655628, + "learning_rate": 0.0007593390793127996, + "loss": 1.5255, + "step": 3225 + }, + { + "epoch": 0.35, + "grad_norm": 0.08081542170362471, + "learning_rate": 0.0007591902071240705, + "loss": 1.5518, + "step": 3226 + }, + { + "epoch": 0.35, + "grad_norm": 0.07345692626207229, + "learning_rate": 0.0007590413035076354, + "loss": 1.3901, + "step": 3227 + }, + { + "epoch": 0.35, + "grad_norm": 0.08021573419868515, + "learning_rate": 0.0007588923684815489, + "loss": 1.4691, + "step": 3228 + }, + { + "epoch": 0.35, + "grad_norm": 0.07183007813806935, + "learning_rate": 0.0007587434020638704, + "loss": 1.474, + "step": 3229 + }, + { + "epoch": 0.35, + "grad_norm": 0.09673214943620395, + "learning_rate": 0.0007585944042726622, + "loss": 1.4883, + "step": 3230 + }, + { + "epoch": 0.35, + "grad_norm": 0.07943320951000708, + "learning_rate": 0.0007584453751259911, + "loss": 1.4071, + "step": 3231 + }, + { + "epoch": 0.35, + "grad_norm": 0.07247765308432169, + "learning_rate": 0.0007582963146419273, + "loss": 1.4031, + "step": 3232 + }, + { + "epoch": 0.35, + "grad_norm": 0.0768200140003289, + "learning_rate": 0.0007581472228385447, + "loss": 1.4768, + "step": 3233 + }, + { + "epoch": 0.35, + "grad_norm": 0.086314346381171, + "learning_rate": 0.0007579980997339215, + "loss": 1.5756, + "step": 3234 + }, + { + "epoch": 0.35, + "grad_norm": 0.08993807019188155, + "learning_rate": 0.0007578489453461392, + "loss": 1.4346, + "step": 3235 + }, + { + "epoch": 0.35, + "grad_norm": 0.07458162456361211, + "learning_rate": 0.0007576997596932833, + "loss": 1.4711, + "step": 3236 + }, + { + "epoch": 0.35, + "grad_norm": 0.0715283955335303, + "learning_rate": 0.0007575505427934433, + "loss": 1.4222, + "step": 3237 + }, + { + "epoch": 0.35, + "grad_norm": 0.07396927269800081, + "learning_rate": 0.000757401294664712, + "loss": 1.3881, + "step": 3238 + }, + { + "epoch": 0.35, + "grad_norm": 0.07757426845121163, + "learning_rate": 0.0007572520153251861, + "loss": 1.5357, + "step": 3239 + }, + { + "epoch": 0.35, + "grad_norm": 0.07139276773731634, + "learning_rate": 0.0007571027047929668, + "loss": 1.3491, + "step": 3240 + }, + { + "epoch": 0.35, + "grad_norm": 0.09502708923492989, + "learning_rate": 0.0007569533630861578, + "loss": 1.5269, + "step": 3241 + }, + { + "epoch": 0.35, + "grad_norm": 0.07195223372318488, + "learning_rate": 0.0007568039902228679, + "loss": 1.4009, + "step": 3242 + }, + { + "epoch": 0.35, + "grad_norm": 0.07778683580295905, + "learning_rate": 0.000756654586221209, + "loss": 1.3772, + "step": 3243 + }, + { + "epoch": 0.35, + "grad_norm": 0.06633510179131384, + "learning_rate": 0.0007565051510992964, + "loss": 1.5195, + "step": 3244 + }, + { + "epoch": 0.35, + "grad_norm": 0.07960888862929236, + "learning_rate": 0.00075635568487525, + "loss": 1.4633, + "step": 3245 + }, + { + "epoch": 0.35, + "grad_norm": 0.07572686281344403, + "learning_rate": 0.0007562061875671932, + "loss": 1.4543, + "step": 3246 + }, + { + "epoch": 0.35, + "grad_norm": 0.07078481527058693, + "learning_rate": 0.0007560566591932526, + "loss": 1.4283, + "step": 3247 + }, + { + "epoch": 0.35, + "grad_norm": 0.07721076934087472, + "learning_rate": 0.0007559070997715596, + "loss": 1.4219, + "step": 3248 + }, + { + "epoch": 0.35, + "grad_norm": 0.07709818422571885, + "learning_rate": 0.0007557575093202483, + "loss": 1.4719, + "step": 3249 + }, + { + "epoch": 0.35, + "grad_norm": 0.08369733445571949, + "learning_rate": 0.0007556078878574573, + "loss": 1.3737, + "step": 3250 + }, + { + "epoch": 0.35, + "grad_norm": 0.07127796374107943, + "learning_rate": 0.0007554582354013287, + "loss": 1.4437, + "step": 3251 + }, + { + "epoch": 0.35, + "grad_norm": 0.06956043970136461, + "learning_rate": 0.0007553085519700084, + "loss": 1.4763, + "step": 3252 + }, + { + "epoch": 0.35, + "grad_norm": 0.07552454140482719, + "learning_rate": 0.0007551588375816461, + "loss": 1.4528, + "step": 3253 + }, + { + "epoch": 0.35, + "grad_norm": 0.07703097370477303, + "learning_rate": 0.0007550090922543948, + "loss": 1.432, + "step": 3254 + }, + { + "epoch": 0.35, + "grad_norm": 0.08408237716390983, + "learning_rate": 0.000754859316006412, + "loss": 1.5888, + "step": 3255 + }, + { + "epoch": 0.35, + "grad_norm": 0.07580679587042956, + "learning_rate": 0.0007547095088558585, + "loss": 1.4903, + "step": 3256 + }, + { + "epoch": 0.35, + "grad_norm": 0.07470619755946706, + "learning_rate": 0.0007545596708208991, + "loss": 1.2541, + "step": 3257 + }, + { + "epoch": 0.35, + "grad_norm": 0.07443437030109887, + "learning_rate": 0.0007544098019197018, + "loss": 1.3443, + "step": 3258 + }, + { + "epoch": 0.35, + "grad_norm": 0.07786511617842874, + "learning_rate": 0.000754259902170439, + "loss": 1.4346, + "step": 3259 + }, + { + "epoch": 0.35, + "grad_norm": 0.07049486481909144, + "learning_rate": 0.0007541099715912867, + "loss": 1.4424, + "step": 3260 + }, + { + "epoch": 0.35, + "grad_norm": 0.08884401595716221, + "learning_rate": 0.0007539600102004241, + "loss": 1.4576, + "step": 3261 + }, + { + "epoch": 0.35, + "grad_norm": 0.06963460525741602, + "learning_rate": 0.0007538100180160347, + "loss": 1.5296, + "step": 3262 + }, + { + "epoch": 0.35, + "grad_norm": 0.07647861256873244, + "learning_rate": 0.0007536599950563056, + "loss": 1.4882, + "step": 3263 + }, + { + "epoch": 0.35, + "grad_norm": 0.07582018739217335, + "learning_rate": 0.0007535099413394276, + "loss": 1.3487, + "step": 3264 + }, + { + "epoch": 0.35, + "grad_norm": 0.0677356672438712, + "learning_rate": 0.0007533598568835953, + "loss": 1.3253, + "step": 3265 + }, + { + "epoch": 0.35, + "grad_norm": 0.06655141971882164, + "learning_rate": 0.0007532097417070069, + "loss": 1.3951, + "step": 3266 + }, + { + "epoch": 0.35, + "grad_norm": 0.0672083493114287, + "learning_rate": 0.0007530595958278643, + "loss": 1.4441, + "step": 3267 + }, + { + "epoch": 0.35, + "grad_norm": 0.06966617093307825, + "learning_rate": 0.0007529094192643733, + "loss": 1.4569, + "step": 3268 + }, + { + "epoch": 0.35, + "grad_norm": 0.07642161102040695, + "learning_rate": 0.0007527592120347433, + "loss": 1.4863, + "step": 3269 + }, + { + "epoch": 0.35, + "grad_norm": 0.07535741721980958, + "learning_rate": 0.0007526089741571875, + "loss": 1.5901, + "step": 3270 + }, + { + "epoch": 0.35, + "grad_norm": 0.08060712796678715, + "learning_rate": 0.0007524587056499227, + "loss": 1.4188, + "step": 3271 + }, + { + "epoch": 0.35, + "grad_norm": 0.07216132537348179, + "learning_rate": 0.0007523084065311694, + "loss": 1.4918, + "step": 3272 + }, + { + "epoch": 0.35, + "grad_norm": 0.06697670994819911, + "learning_rate": 0.000752158076819152, + "loss": 1.3135, + "step": 3273 + }, + { + "epoch": 0.35, + "grad_norm": 0.09212419434556492, + "learning_rate": 0.0007520077165320986, + "loss": 1.5355, + "step": 3274 + }, + { + "epoch": 0.35, + "grad_norm": 0.07964494763502676, + "learning_rate": 0.0007518573256882406, + "loss": 1.4749, + "step": 3275 + }, + { + "epoch": 0.35, + "grad_norm": 0.06205358368642162, + "learning_rate": 0.0007517069043058136, + "loss": 1.4527, + "step": 3276 + }, + { + "epoch": 0.35, + "grad_norm": 0.06997752615570801, + "learning_rate": 0.0007515564524030567, + "loss": 1.4434, + "step": 3277 + }, + { + "epoch": 0.35, + "grad_norm": 0.06796929318797444, + "learning_rate": 0.0007514059699982127, + "loss": 1.4697, + "step": 3278 + }, + { + "epoch": 0.35, + "grad_norm": 0.06739848065311337, + "learning_rate": 0.0007512554571095281, + "loss": 1.5002, + "step": 3279 + }, + { + "epoch": 0.35, + "grad_norm": 0.07068450192460699, + "learning_rate": 0.0007511049137552533, + "loss": 1.387, + "step": 3280 + }, + { + "epoch": 0.35, + "grad_norm": 0.06525573630565533, + "learning_rate": 0.0007509543399536418, + "loss": 1.347, + "step": 3281 + }, + { + "epoch": 0.35, + "grad_norm": 0.0718387775523518, + "learning_rate": 0.0007508037357229515, + "loss": 1.5561, + "step": 3282 + }, + { + "epoch": 0.35, + "grad_norm": 0.06705072625049899, + "learning_rate": 0.0007506531010814435, + "loss": 1.4128, + "step": 3283 + }, + { + "epoch": 0.35, + "grad_norm": 0.06879282519736174, + "learning_rate": 0.0007505024360473829, + "loss": 1.481, + "step": 3284 + }, + { + "epoch": 0.35, + "grad_norm": 0.07087927610595694, + "learning_rate": 0.0007503517406390384, + "loss": 1.5064, + "step": 3285 + }, + { + "epoch": 0.35, + "grad_norm": 0.06462073478069816, + "learning_rate": 0.000750201014874682, + "loss": 1.3303, + "step": 3286 + }, + { + "epoch": 0.35, + "grad_norm": 0.07515022238790801, + "learning_rate": 0.0007500502587725903, + "loss": 1.4377, + "step": 3287 + }, + { + "epoch": 0.35, + "grad_norm": 0.06472988386041287, + "learning_rate": 0.0007498994723510427, + "loss": 1.4442, + "step": 3288 + }, + { + "epoch": 0.35, + "grad_norm": 0.07501542260623986, + "learning_rate": 0.0007497486556283222, + "loss": 1.4461, + "step": 3289 + }, + { + "epoch": 0.35, + "grad_norm": 0.07074110541629863, + "learning_rate": 0.0007495978086227164, + "loss": 1.3875, + "step": 3290 + }, + { + "epoch": 0.35, + "grad_norm": 0.08067924371663614, + "learning_rate": 0.0007494469313525158, + "loss": 1.3703, + "step": 3291 + }, + { + "epoch": 0.35, + "grad_norm": 0.07953533766463396, + "learning_rate": 0.0007492960238360147, + "loss": 1.3868, + "step": 3292 + }, + { + "epoch": 0.35, + "grad_norm": 0.08099029033843955, + "learning_rate": 0.0007491450860915114, + "loss": 1.5645, + "step": 3293 + }, + { + "epoch": 0.35, + "grad_norm": 0.06974776206856093, + "learning_rate": 0.0007489941181373075, + "loss": 1.39, + "step": 3294 + }, + { + "epoch": 0.35, + "grad_norm": 0.0663906501580651, + "learning_rate": 0.0007488431199917081, + "loss": 1.3797, + "step": 3295 + }, + { + "epoch": 0.35, + "grad_norm": 0.07187536093113295, + "learning_rate": 0.0007486920916730228, + "loss": 1.4383, + "step": 3296 + }, + { + "epoch": 0.35, + "grad_norm": 0.07034086526155568, + "learning_rate": 0.0007485410331995639, + "loss": 1.4984, + "step": 3297 + }, + { + "epoch": 0.35, + "grad_norm": 0.09952125204166053, + "learning_rate": 0.0007483899445896478, + "loss": 1.387, + "step": 3298 + }, + { + "epoch": 0.35, + "grad_norm": 0.07855678239956934, + "learning_rate": 0.0007482388258615946, + "loss": 1.5485, + "step": 3299 + }, + { + "epoch": 0.35, + "grad_norm": 0.06719366075858256, + "learning_rate": 0.000748087677033728, + "loss": 1.3693, + "step": 3300 + }, + { + "epoch": 0.35, + "grad_norm": 0.07047823572228767, + "learning_rate": 0.0007479364981243753, + "loss": 1.5318, + "step": 3301 + }, + { + "epoch": 0.35, + "grad_norm": 0.07007505720471473, + "learning_rate": 0.0007477852891518675, + "loss": 1.5427, + "step": 3302 + }, + { + "epoch": 0.36, + "grad_norm": 0.07306598130413317, + "learning_rate": 0.000747634050134539, + "loss": 1.4338, + "step": 3303 + }, + { + "epoch": 0.36, + "grad_norm": 0.073346910314022, + "learning_rate": 0.0007474827810907283, + "loss": 1.4186, + "step": 3304 + }, + { + "epoch": 0.36, + "grad_norm": 0.08051731318932802, + "learning_rate": 0.0007473314820387771, + "loss": 1.3899, + "step": 3305 + }, + { + "epoch": 0.36, + "grad_norm": 0.07628081552060612, + "learning_rate": 0.0007471801529970311, + "loss": 1.4434, + "step": 3306 + }, + { + "epoch": 0.36, + "grad_norm": 0.07692904755262349, + "learning_rate": 0.0007470287939838393, + "loss": 1.4179, + "step": 3307 + }, + { + "epoch": 0.36, + "grad_norm": 0.07629253080964134, + "learning_rate": 0.0007468774050175547, + "loss": 1.495, + "step": 3308 + }, + { + "epoch": 0.36, + "grad_norm": 0.07601463459835986, + "learning_rate": 0.0007467259861165335, + "loss": 1.3929, + "step": 3309 + }, + { + "epoch": 0.36, + "grad_norm": 0.07143316010707389, + "learning_rate": 0.0007465745372991359, + "loss": 1.4335, + "step": 3310 + }, + { + "epoch": 0.36, + "grad_norm": 0.06589330547380333, + "learning_rate": 0.0007464230585837257, + "loss": 1.4047, + "step": 3311 + }, + { + "epoch": 0.36, + "grad_norm": 0.07249848843228891, + "learning_rate": 0.0007462715499886701, + "loss": 1.4494, + "step": 3312 + }, + { + "epoch": 0.36, + "grad_norm": 0.07137267226329203, + "learning_rate": 0.00074612001153234, + "loss": 1.2914, + "step": 3313 + }, + { + "epoch": 0.36, + "grad_norm": 0.0871238453782044, + "learning_rate": 0.00074596844323311, + "loss": 1.3637, + "step": 3314 + }, + { + "epoch": 0.36, + "grad_norm": 0.06963991804376052, + "learning_rate": 0.0007458168451093582, + "loss": 1.4503, + "step": 3315 + }, + { + "epoch": 0.36, + "grad_norm": 0.08264139026068265, + "learning_rate": 0.0007456652171794665, + "loss": 1.4632, + "step": 3316 + }, + { + "epoch": 0.36, + "grad_norm": 0.07188881114245928, + "learning_rate": 0.0007455135594618204, + "loss": 1.5282, + "step": 3317 + }, + { + "epoch": 0.36, + "grad_norm": 0.06770134356378205, + "learning_rate": 0.0007453618719748086, + "loss": 1.423, + "step": 3318 + }, + { + "epoch": 0.36, + "grad_norm": 0.07265567228128596, + "learning_rate": 0.0007452101547368241, + "loss": 1.2598, + "step": 3319 + }, + { + "epoch": 0.36, + "grad_norm": 0.08608144387463132, + "learning_rate": 0.0007450584077662628, + "loss": 1.4486, + "step": 3320 + }, + { + "epoch": 0.36, + "grad_norm": 0.08328290542930794, + "learning_rate": 0.0007449066310815249, + "loss": 1.3688, + "step": 3321 + }, + { + "epoch": 0.36, + "grad_norm": 0.07610336074240623, + "learning_rate": 0.0007447548247010137, + "loss": 1.3053, + "step": 3322 + }, + { + "epoch": 0.36, + "grad_norm": 0.07294867434292124, + "learning_rate": 0.000744602988643136, + "loss": 1.4548, + "step": 3323 + }, + { + "epoch": 0.36, + "grad_norm": 0.07649839853752352, + "learning_rate": 0.0007444511229263026, + "loss": 1.3756, + "step": 3324 + }, + { + "epoch": 0.36, + "grad_norm": 0.07738633235915615, + "learning_rate": 0.0007442992275689281, + "loss": 1.4607, + "step": 3325 + }, + { + "epoch": 0.36, + "grad_norm": 0.0820756094066135, + "learning_rate": 0.0007441473025894298, + "loss": 1.3747, + "step": 3326 + }, + { + "epoch": 0.36, + "grad_norm": 0.07086103194112217, + "learning_rate": 0.0007439953480062294, + "loss": 1.3998, + "step": 3327 + }, + { + "epoch": 0.36, + "grad_norm": 0.07017241619996507, + "learning_rate": 0.000743843363837752, + "loss": 1.4109, + "step": 3328 + }, + { + "epoch": 0.36, + "grad_norm": 0.07430812668013119, + "learning_rate": 0.0007436913501024259, + "loss": 1.4276, + "step": 3329 + }, + { + "epoch": 0.36, + "grad_norm": 0.07826138304497253, + "learning_rate": 0.0007435393068186835, + "loss": 1.3875, + "step": 3330 + }, + { + "epoch": 0.36, + "grad_norm": 0.07893852810631888, + "learning_rate": 0.0007433872340049607, + "loss": 1.417, + "step": 3331 + }, + { + "epoch": 0.36, + "grad_norm": 0.0685915410461669, + "learning_rate": 0.0007432351316796964, + "loss": 1.4295, + "step": 3332 + }, + { + "epoch": 0.36, + "grad_norm": 0.07565845102634473, + "learning_rate": 0.0007430829998613342, + "loss": 1.4313, + "step": 3333 + }, + { + "epoch": 0.36, + "grad_norm": 0.07870178690082014, + "learning_rate": 0.0007429308385683199, + "loss": 1.3445, + "step": 3334 + }, + { + "epoch": 0.36, + "grad_norm": 0.06969600272935284, + "learning_rate": 0.0007427786478191041, + "loss": 1.3896, + "step": 3335 + }, + { + "epoch": 0.36, + "grad_norm": 0.07161250210757818, + "learning_rate": 0.0007426264276321401, + "loss": 1.5339, + "step": 3336 + }, + { + "epoch": 0.36, + "grad_norm": 0.06567066213357739, + "learning_rate": 0.0007424741780258855, + "loss": 1.4913, + "step": 3337 + }, + { + "epoch": 0.36, + "grad_norm": 0.07739750277858809, + "learning_rate": 0.0007423218990188008, + "loss": 1.5223, + "step": 3338 + }, + { + "epoch": 0.36, + "grad_norm": 0.07139533267516067, + "learning_rate": 0.0007421695906293504, + "loss": 1.2608, + "step": 3339 + }, + { + "epoch": 0.36, + "grad_norm": 0.07084136654668771, + "learning_rate": 0.0007420172528760022, + "loss": 1.4166, + "step": 3340 + }, + { + "epoch": 0.36, + "grad_norm": 0.07461768305043502, + "learning_rate": 0.0007418648857772279, + "loss": 1.5376, + "step": 3341 + }, + { + "epoch": 0.36, + "grad_norm": 0.07018413349605858, + "learning_rate": 0.0007417124893515022, + "loss": 1.2933, + "step": 3342 + }, + { + "epoch": 0.36, + "grad_norm": 0.07748422851796735, + "learning_rate": 0.0007415600636173039, + "loss": 1.3764, + "step": 3343 + }, + { + "epoch": 0.36, + "grad_norm": 0.06651878463581043, + "learning_rate": 0.0007414076085931152, + "loss": 1.274, + "step": 3344 + }, + { + "epoch": 0.36, + "grad_norm": 0.0720113569348327, + "learning_rate": 0.0007412551242974215, + "loss": 1.4276, + "step": 3345 + }, + { + "epoch": 0.36, + "grad_norm": 0.06866646064645342, + "learning_rate": 0.0007411026107487123, + "loss": 1.5271, + "step": 3346 + }, + { + "epoch": 0.36, + "grad_norm": 0.07366727179489832, + "learning_rate": 0.0007409500679654805, + "loss": 1.4906, + "step": 3347 + }, + { + "epoch": 0.36, + "grad_norm": 0.08059863399610857, + "learning_rate": 0.0007407974959662222, + "loss": 1.3035, + "step": 3348 + }, + { + "epoch": 0.36, + "grad_norm": 0.08704144005558019, + "learning_rate": 0.0007406448947694373, + "loss": 1.4854, + "step": 3349 + }, + { + "epoch": 0.36, + "grad_norm": 0.08134336654874601, + "learning_rate": 0.0007404922643936294, + "loss": 1.3461, + "step": 3350 + }, + { + "epoch": 0.36, + "grad_norm": 0.07515621562261053, + "learning_rate": 0.0007403396048573051, + "loss": 1.4072, + "step": 3351 + }, + { + "epoch": 0.36, + "grad_norm": 0.07475152903248071, + "learning_rate": 0.0007401869161789753, + "loss": 1.3376, + "step": 3352 + }, + { + "epoch": 0.36, + "grad_norm": 0.07219916506306422, + "learning_rate": 0.0007400341983771539, + "loss": 1.5695, + "step": 3353 + }, + { + "epoch": 0.36, + "grad_norm": 0.07830596989301176, + "learning_rate": 0.0007398814514703585, + "loss": 1.5121, + "step": 3354 + }, + { + "epoch": 0.36, + "grad_norm": 0.07554099739341379, + "learning_rate": 0.0007397286754771098, + "loss": 1.3901, + "step": 3355 + }, + { + "epoch": 0.36, + "grad_norm": 0.0806406228367244, + "learning_rate": 0.000739575870415933, + "loss": 1.4044, + "step": 3356 + }, + { + "epoch": 0.36, + "grad_norm": 0.07450558110927982, + "learning_rate": 0.0007394230363053558, + "loss": 1.4013, + "step": 3357 + }, + { + "epoch": 0.36, + "grad_norm": 0.07182394432047186, + "learning_rate": 0.0007392701731639102, + "loss": 1.5534, + "step": 3358 + }, + { + "epoch": 0.36, + "grad_norm": 0.0755364633765582, + "learning_rate": 0.0007391172810101311, + "loss": 1.4123, + "step": 3359 + }, + { + "epoch": 0.36, + "grad_norm": 0.07636094394024896, + "learning_rate": 0.0007389643598625573, + "loss": 1.4783, + "step": 3360 + }, + { + "epoch": 0.36, + "grad_norm": 0.06831486564246876, + "learning_rate": 0.0007388114097397311, + "loss": 1.5466, + "step": 3361 + }, + { + "epoch": 0.36, + "grad_norm": 0.08977748147634984, + "learning_rate": 0.0007386584306601983, + "loss": 1.4046, + "step": 3362 + }, + { + "epoch": 0.36, + "grad_norm": 0.0672322704542431, + "learning_rate": 0.0007385054226425077, + "loss": 1.3721, + "step": 3363 + }, + { + "epoch": 0.36, + "grad_norm": 0.06880509467787907, + "learning_rate": 0.0007383523857052124, + "loss": 1.3872, + "step": 3364 + }, + { + "epoch": 0.36, + "grad_norm": 0.07513708330952967, + "learning_rate": 0.0007381993198668688, + "loss": 1.5438, + "step": 3365 + }, + { + "epoch": 0.36, + "grad_norm": 0.07848633488508752, + "learning_rate": 0.0007380462251460364, + "loss": 1.5115, + "step": 3366 + }, + { + "epoch": 0.36, + "grad_norm": 0.07274279733626927, + "learning_rate": 0.0007378931015612786, + "loss": 1.5028, + "step": 3367 + }, + { + "epoch": 0.36, + "grad_norm": 0.06856498960103916, + "learning_rate": 0.0007377399491311619, + "loss": 1.4003, + "step": 3368 + }, + { + "epoch": 0.36, + "grad_norm": 0.06815056499835738, + "learning_rate": 0.0007375867678742567, + "loss": 1.582, + "step": 3369 + }, + { + "epoch": 0.36, + "grad_norm": 0.07130247554924865, + "learning_rate": 0.0007374335578091371, + "loss": 1.3157, + "step": 3370 + }, + { + "epoch": 0.36, + "grad_norm": 0.0699724644388025, + "learning_rate": 0.0007372803189543798, + "loss": 1.4183, + "step": 3371 + }, + { + "epoch": 0.36, + "grad_norm": 0.07859318932566481, + "learning_rate": 0.0007371270513285659, + "loss": 1.5631, + "step": 3372 + }, + { + "epoch": 0.36, + "grad_norm": 0.07308852607081995, + "learning_rate": 0.0007369737549502796, + "loss": 1.576, + "step": 3373 + }, + { + "epoch": 0.36, + "grad_norm": 0.06767306064322558, + "learning_rate": 0.0007368204298381085, + "loss": 1.4173, + "step": 3374 + }, + { + "epoch": 0.36, + "grad_norm": 0.07610629235626565, + "learning_rate": 0.0007366670760106438, + "loss": 1.3515, + "step": 3375 + }, + { + "epoch": 0.36, + "grad_norm": 0.07255829044756128, + "learning_rate": 0.0007365136934864803, + "loss": 1.3286, + "step": 3376 + }, + { + "epoch": 0.36, + "grad_norm": 0.06113231650355408, + "learning_rate": 0.0007363602822842159, + "loss": 1.4304, + "step": 3377 + }, + { + "epoch": 0.36, + "grad_norm": 0.07358452028485662, + "learning_rate": 0.0007362068424224526, + "loss": 1.3216, + "step": 3378 + }, + { + "epoch": 0.36, + "grad_norm": 0.07125399818046416, + "learning_rate": 0.0007360533739197952, + "loss": 1.4562, + "step": 3379 + }, + { + "epoch": 0.36, + "grad_norm": 0.09144438881226293, + "learning_rate": 0.0007358998767948525, + "loss": 1.478, + "step": 3380 + }, + { + "epoch": 0.36, + "grad_norm": 0.09000168011921686, + "learning_rate": 0.0007357463510662364, + "loss": 1.5342, + "step": 3381 + }, + { + "epoch": 0.36, + "grad_norm": 0.0706911701954772, + "learning_rate": 0.0007355927967525626, + "loss": 1.3949, + "step": 3382 + }, + { + "epoch": 0.36, + "grad_norm": 0.07477870009184143, + "learning_rate": 0.0007354392138724499, + "loss": 1.4481, + "step": 3383 + }, + { + "epoch": 0.36, + "grad_norm": 0.09106274263140833, + "learning_rate": 0.0007352856024445208, + "loss": 1.3723, + "step": 3384 + }, + { + "epoch": 0.36, + "grad_norm": 0.07556958378953439, + "learning_rate": 0.0007351319624874012, + "loss": 1.5089, + "step": 3385 + }, + { + "epoch": 0.36, + "grad_norm": 0.07314230545614694, + "learning_rate": 0.0007349782940197207, + "loss": 1.3267, + "step": 3386 + }, + { + "epoch": 0.36, + "grad_norm": 0.07184969570633701, + "learning_rate": 0.000734824597060112, + "loss": 1.5058, + "step": 3387 + }, + { + "epoch": 0.36, + "grad_norm": 0.0729652621412798, + "learning_rate": 0.000734670871627211, + "loss": 1.5736, + "step": 3388 + }, + { + "epoch": 0.36, + "grad_norm": 0.0669490998865407, + "learning_rate": 0.000734517117739658, + "loss": 1.4002, + "step": 3389 + }, + { + "epoch": 0.36, + "grad_norm": 0.08135789295051453, + "learning_rate": 0.000734363335416096, + "loss": 1.4905, + "step": 3390 + }, + { + "epoch": 0.36, + "grad_norm": 0.07506628391924669, + "learning_rate": 0.0007342095246751717, + "loss": 1.4123, + "step": 3391 + }, + { + "epoch": 0.36, + "grad_norm": 0.07631817394827138, + "learning_rate": 0.000734055685535535, + "loss": 1.4101, + "step": 3392 + }, + { + "epoch": 0.36, + "grad_norm": 0.08451380581854086, + "learning_rate": 0.0007339018180158394, + "loss": 1.4673, + "step": 3393 + }, + { + "epoch": 0.36, + "grad_norm": 0.08132207812962348, + "learning_rate": 0.0007337479221347419, + "loss": 1.5261, + "step": 3394 + }, + { + "epoch": 0.36, + "grad_norm": 0.07423312430722277, + "learning_rate": 0.0007335939979109032, + "loss": 1.4681, + "step": 3395 + }, + { + "epoch": 0.37, + "grad_norm": 0.08032224968211471, + "learning_rate": 0.0007334400453629869, + "loss": 1.518, + "step": 3396 + }, + { + "epoch": 0.37, + "grad_norm": 0.07632194005433686, + "learning_rate": 0.0007332860645096604, + "loss": 1.4886, + "step": 3397 + }, + { + "epoch": 0.37, + "grad_norm": 0.0864928518705132, + "learning_rate": 0.0007331320553695942, + "loss": 1.516, + "step": 3398 + }, + { + "epoch": 0.37, + "grad_norm": 0.07020971282123498, + "learning_rate": 0.0007329780179614624, + "loss": 1.3371, + "step": 3399 + }, + { + "epoch": 0.37, + "grad_norm": 0.06779508227886841, + "learning_rate": 0.000732823952303943, + "loss": 1.4788, + "step": 3400 + }, + { + "epoch": 0.37, + "grad_norm": 0.08093791904028005, + "learning_rate": 0.0007326698584157167, + "loss": 1.5612, + "step": 3401 + }, + { + "epoch": 0.37, + "grad_norm": 0.07297898365770618, + "learning_rate": 0.0007325157363154678, + "loss": 1.4382, + "step": 3402 + }, + { + "epoch": 0.37, + "grad_norm": 0.07359410616916069, + "learning_rate": 0.0007323615860218843, + "loss": 1.4834, + "step": 3403 + }, + { + "epoch": 0.37, + "grad_norm": 0.07731757154460116, + "learning_rate": 0.0007322074075536574, + "loss": 1.5044, + "step": 3404 + }, + { + "epoch": 0.37, + "grad_norm": 0.0669375876227775, + "learning_rate": 0.0007320532009294818, + "loss": 1.5491, + "step": 3405 + }, + { + "epoch": 0.37, + "grad_norm": 0.08319722191595673, + "learning_rate": 0.0007318989661680556, + "loss": 1.5788, + "step": 3406 + }, + { + "epoch": 0.37, + "grad_norm": 0.07349318095347086, + "learning_rate": 0.0007317447032880804, + "loss": 1.4061, + "step": 3407 + }, + { + "epoch": 0.37, + "grad_norm": 0.0815880060028677, + "learning_rate": 0.0007315904123082608, + "loss": 1.4521, + "step": 3408 + }, + { + "epoch": 0.37, + "grad_norm": 0.07833449022531756, + "learning_rate": 0.0007314360932473054, + "loss": 1.301, + "step": 3409 + }, + { + "epoch": 0.37, + "grad_norm": 0.06153440472323062, + "learning_rate": 0.0007312817461239258, + "loss": 1.4043, + "step": 3410 + }, + { + "epoch": 0.37, + "grad_norm": 0.061429343476582014, + "learning_rate": 0.000731127370956837, + "loss": 1.3902, + "step": 3411 + }, + { + "epoch": 0.37, + "grad_norm": 0.07111252029973611, + "learning_rate": 0.0007309729677647579, + "loss": 1.3566, + "step": 3412 + }, + { + "epoch": 0.37, + "grad_norm": 0.07442921283389783, + "learning_rate": 0.00073081853656641, + "loss": 1.4205, + "step": 3413 + }, + { + "epoch": 0.37, + "grad_norm": 0.06875680871458785, + "learning_rate": 0.0007306640773805188, + "loss": 1.5055, + "step": 3414 + }, + { + "epoch": 0.37, + "grad_norm": 0.07011065801353429, + "learning_rate": 0.000730509590225813, + "loss": 1.4903, + "step": 3415 + }, + { + "epoch": 0.37, + "grad_norm": 0.07315229939972583, + "learning_rate": 0.0007303550751210247, + "loss": 1.3406, + "step": 3416 + }, + { + "epoch": 0.37, + "grad_norm": 0.06862713960228749, + "learning_rate": 0.0007302005320848894, + "loss": 1.487, + "step": 3417 + }, + { + "epoch": 0.37, + "grad_norm": 0.06991335457962292, + "learning_rate": 0.0007300459611361461, + "loss": 1.3993, + "step": 3418 + }, + { + "epoch": 0.37, + "grad_norm": 0.07522110372865594, + "learning_rate": 0.0007298913622935366, + "loss": 1.5515, + "step": 3419 + }, + { + "epoch": 0.37, + "grad_norm": 0.08368471518379769, + "learning_rate": 0.0007297367355758071, + "loss": 1.4278, + "step": 3420 + }, + { + "epoch": 0.37, + "grad_norm": 0.07070017111651242, + "learning_rate": 0.0007295820810017064, + "loss": 1.3956, + "step": 3421 + }, + { + "epoch": 0.37, + "grad_norm": 0.07220151835057213, + "learning_rate": 0.0007294273985899867, + "loss": 1.4705, + "step": 3422 + }, + { + "epoch": 0.37, + "grad_norm": 0.06758161171286328, + "learning_rate": 0.0007292726883594042, + "loss": 1.5548, + "step": 3423 + }, + { + "epoch": 0.37, + "grad_norm": 0.08252864849353624, + "learning_rate": 0.0007291179503287178, + "loss": 1.4895, + "step": 3424 + }, + { + "epoch": 0.37, + "grad_norm": 0.07274048956748501, + "learning_rate": 0.0007289631845166897, + "loss": 1.4906, + "step": 3425 + }, + { + "epoch": 0.37, + "grad_norm": 0.0645889905677694, + "learning_rate": 0.0007288083909420865, + "loss": 1.506, + "step": 3426 + }, + { + "epoch": 0.37, + "grad_norm": 0.07054697947372947, + "learning_rate": 0.0007286535696236769, + "loss": 1.3929, + "step": 3427 + }, + { + "epoch": 0.37, + "grad_norm": 0.06771033703226476, + "learning_rate": 0.0007284987205802338, + "loss": 1.3862, + "step": 3428 + }, + { + "epoch": 0.37, + "grad_norm": 0.07230953142493653, + "learning_rate": 0.000728343843830533, + "loss": 1.5308, + "step": 3429 + }, + { + "epoch": 0.37, + "grad_norm": 0.07990829409950602, + "learning_rate": 0.0007281889393933539, + "loss": 1.3315, + "step": 3430 + }, + { + "epoch": 0.37, + "grad_norm": 0.0744965070918347, + "learning_rate": 0.0007280340072874791, + "loss": 1.3883, + "step": 3431 + }, + { + "epoch": 0.37, + "grad_norm": 0.06853594795545971, + "learning_rate": 0.000727879047531695, + "loss": 1.4513, + "step": 3432 + }, + { + "epoch": 0.37, + "grad_norm": 0.07005750508920855, + "learning_rate": 0.0007277240601447907, + "loss": 1.429, + "step": 3433 + }, + { + "epoch": 0.37, + "grad_norm": 0.0746780149046262, + "learning_rate": 0.000727569045145559, + "loss": 1.4085, + "step": 3434 + }, + { + "epoch": 0.37, + "grad_norm": 0.07182195957106077, + "learning_rate": 0.000727414002552796, + "loss": 1.4943, + "step": 3435 + }, + { + "epoch": 0.37, + "grad_norm": 0.0640184881855754, + "learning_rate": 0.0007272589323853012, + "loss": 1.4937, + "step": 3436 + }, + { + "epoch": 0.37, + "grad_norm": 0.07500444842348676, + "learning_rate": 0.0007271038346618774, + "loss": 1.3062, + "step": 3437 + }, + { + "epoch": 0.37, + "grad_norm": 0.07010662383143268, + "learning_rate": 0.0007269487094013306, + "loss": 1.5224, + "step": 3438 + }, + { + "epoch": 0.37, + "grad_norm": 0.06754630061604039, + "learning_rate": 0.0007267935566224704, + "loss": 1.5214, + "step": 3439 + }, + { + "epoch": 0.37, + "grad_norm": 0.07547928011898576, + "learning_rate": 0.0007266383763441097, + "loss": 1.5368, + "step": 3440 + }, + { + "epoch": 0.37, + "grad_norm": 0.06936226935791721, + "learning_rate": 0.0007264831685850645, + "loss": 1.3726, + "step": 3441 + }, + { + "epoch": 0.37, + "grad_norm": 0.07120368855493818, + "learning_rate": 0.0007263279333641541, + "loss": 1.5166, + "step": 3442 + }, + { + "epoch": 0.37, + "grad_norm": 0.06496628037540313, + "learning_rate": 0.0007261726707002016, + "loss": 1.4256, + "step": 3443 + }, + { + "epoch": 0.37, + "grad_norm": 0.07751887752981045, + "learning_rate": 0.0007260173806120331, + "loss": 1.4362, + "step": 3444 + }, + { + "epoch": 0.37, + "grad_norm": 0.07593916993944279, + "learning_rate": 0.0007258620631184781, + "loss": 1.4158, + "step": 3445 + }, + { + "epoch": 0.37, + "grad_norm": 0.06962702489132955, + "learning_rate": 0.0007257067182383692, + "loss": 1.4768, + "step": 3446 + }, + { + "epoch": 0.37, + "grad_norm": 0.07130112366350376, + "learning_rate": 0.0007255513459905425, + "loss": 1.4573, + "step": 3447 + }, + { + "epoch": 0.37, + "grad_norm": 0.07317457257010111, + "learning_rate": 0.0007253959463938375, + "loss": 1.5383, + "step": 3448 + }, + { + "epoch": 0.37, + "grad_norm": 0.07639509895879555, + "learning_rate": 0.0007252405194670972, + "loss": 1.4555, + "step": 3449 + }, + { + "epoch": 0.37, + "grad_norm": 0.08762753103718868, + "learning_rate": 0.0007250850652291671, + "loss": 1.4631, + "step": 3450 + }, + { + "epoch": 0.37, + "grad_norm": 0.07417213080044456, + "learning_rate": 0.000724929583698897, + "loss": 1.4104, + "step": 3451 + }, + { + "epoch": 0.37, + "grad_norm": 0.08182638136439328, + "learning_rate": 0.0007247740748951393, + "loss": 1.4581, + "step": 3452 + }, + { + "epoch": 0.37, + "grad_norm": 0.07652625611553487, + "learning_rate": 0.0007246185388367502, + "loss": 1.495, + "step": 3453 + }, + { + "epoch": 0.37, + "grad_norm": 0.0718289444464132, + "learning_rate": 0.0007244629755425889, + "loss": 1.3931, + "step": 3454 + }, + { + "epoch": 0.37, + "grad_norm": 0.0776318362638091, + "learning_rate": 0.0007243073850315179, + "loss": 1.4296, + "step": 3455 + }, + { + "epoch": 0.37, + "grad_norm": 0.07319195455280723, + "learning_rate": 0.0007241517673224032, + "loss": 1.4762, + "step": 3456 + }, + { + "epoch": 0.37, + "grad_norm": 0.0704550126688549, + "learning_rate": 0.000723996122434114, + "loss": 1.5216, + "step": 3457 + }, + { + "epoch": 0.37, + "grad_norm": 0.0773671225672999, + "learning_rate": 0.0007238404503855227, + "loss": 1.4112, + "step": 3458 + }, + { + "epoch": 0.37, + "grad_norm": 0.09560707561440114, + "learning_rate": 0.000723684751195505, + "loss": 1.4316, + "step": 3459 + }, + { + "epoch": 0.37, + "grad_norm": 0.08176097879461783, + "learning_rate": 0.0007235290248829402, + "loss": 1.4419, + "step": 3460 + }, + { + "epoch": 0.37, + "grad_norm": 0.07671659208485172, + "learning_rate": 0.0007233732714667104, + "loss": 1.5268, + "step": 3461 + }, + { + "epoch": 0.37, + "grad_norm": 0.07096830395444363, + "learning_rate": 0.0007232174909657014, + "loss": 1.4263, + "step": 3462 + }, + { + "epoch": 0.37, + "grad_norm": 0.07937368779759209, + "learning_rate": 0.0007230616833988021, + "loss": 1.4219, + "step": 3463 + }, + { + "epoch": 0.37, + "grad_norm": 0.07043297450143687, + "learning_rate": 0.0007229058487849045, + "loss": 1.5423, + "step": 3464 + }, + { + "epoch": 0.37, + "grad_norm": 0.08169254273194243, + "learning_rate": 0.0007227499871429046, + "loss": 1.3532, + "step": 3465 + }, + { + "epoch": 0.37, + "grad_norm": 0.06726101998135063, + "learning_rate": 0.0007225940984917007, + "loss": 1.5103, + "step": 3466 + }, + { + "epoch": 0.37, + "grad_norm": 0.06720793619103706, + "learning_rate": 0.0007224381828501947, + "loss": 1.5341, + "step": 3467 + }, + { + "epoch": 0.37, + "grad_norm": 0.08637904126374926, + "learning_rate": 0.0007222822402372924, + "loss": 1.4348, + "step": 3468 + }, + { + "epoch": 0.37, + "grad_norm": 0.08033335965656004, + "learning_rate": 0.000722126270671902, + "loss": 1.3226, + "step": 3469 + }, + { + "epoch": 0.37, + "grad_norm": 0.07863747479165964, + "learning_rate": 0.0007219702741729357, + "loss": 1.4313, + "step": 3470 + }, + { + "epoch": 0.37, + "grad_norm": 0.08115522916601269, + "learning_rate": 0.0007218142507593084, + "loss": 1.5195, + "step": 3471 + }, + { + "epoch": 0.37, + "grad_norm": 0.08244015470619837, + "learning_rate": 0.0007216582004499384, + "loss": 1.4175, + "step": 3472 + }, + { + "epoch": 0.37, + "grad_norm": 0.08949002833198955, + "learning_rate": 0.0007215021232637474, + "loss": 1.4375, + "step": 3473 + }, + { + "epoch": 0.37, + "grad_norm": 0.07748904819217635, + "learning_rate": 0.0007213460192196602, + "loss": 1.3619, + "step": 3474 + }, + { + "epoch": 0.37, + "grad_norm": 0.09245287952959458, + "learning_rate": 0.0007211898883366052, + "loss": 1.5071, + "step": 3475 + }, + { + "epoch": 0.37, + "grad_norm": 0.07437346670229164, + "learning_rate": 0.0007210337306335137, + "loss": 1.4178, + "step": 3476 + }, + { + "epoch": 0.37, + "grad_norm": 0.07848104764122867, + "learning_rate": 0.0007208775461293205, + "loss": 1.4574, + "step": 3477 + }, + { + "epoch": 0.37, + "grad_norm": 0.08088266548645225, + "learning_rate": 0.0007207213348429629, + "loss": 1.4889, + "step": 3478 + }, + { + "epoch": 0.37, + "grad_norm": 0.08175567193562198, + "learning_rate": 0.0007205650967933829, + "loss": 1.5308, + "step": 3479 + }, + { + "epoch": 0.37, + "grad_norm": 0.08083881794107464, + "learning_rate": 0.0007204088319995245, + "loss": 1.45, + "step": 3480 + }, + { + "epoch": 0.37, + "grad_norm": 0.06983298049987251, + "learning_rate": 0.0007202525404803352, + "loss": 1.4103, + "step": 3481 + }, + { + "epoch": 0.37, + "grad_norm": 0.08082054975206557, + "learning_rate": 0.0007200962222547662, + "loss": 1.3104, + "step": 3482 + }, + { + "epoch": 0.37, + "grad_norm": 0.07562790021280924, + "learning_rate": 0.0007199398773417713, + "loss": 1.4955, + "step": 3483 + }, + { + "epoch": 0.37, + "grad_norm": 0.06228637321505799, + "learning_rate": 0.000719783505760308, + "loss": 1.5076, + "step": 3484 + }, + { + "epoch": 0.37, + "grad_norm": 0.07524344062108244, + "learning_rate": 0.000719627107529337, + "loss": 1.4938, + "step": 3485 + }, + { + "epoch": 0.37, + "grad_norm": 0.06815009438550329, + "learning_rate": 0.0007194706826678222, + "loss": 1.5052, + "step": 3486 + }, + { + "epoch": 0.37, + "grad_norm": 0.0670706317496325, + "learning_rate": 0.0007193142311947302, + "loss": 1.291, + "step": 3487 + }, + { + "epoch": 0.37, + "grad_norm": 0.07554509041215911, + "learning_rate": 0.0007191577531290318, + "loss": 1.5673, + "step": 3488 + }, + { + "epoch": 0.38, + "grad_norm": 0.06758859386555005, + "learning_rate": 0.0007190012484897002, + "loss": 1.4189, + "step": 3489 + }, + { + "epoch": 0.38, + "grad_norm": 0.0725558291970073, + "learning_rate": 0.0007188447172957121, + "loss": 1.526, + "step": 3490 + }, + { + "epoch": 0.38, + "grad_norm": 0.07233185809808401, + "learning_rate": 0.0007186881595660478, + "loss": 1.4618, + "step": 3491 + }, + { + "epoch": 0.38, + "grad_norm": 0.06533375243840589, + "learning_rate": 0.0007185315753196899, + "loss": 1.4157, + "step": 3492 + }, + { + "epoch": 0.38, + "grad_norm": 0.07160807457158654, + "learning_rate": 0.0007183749645756253, + "loss": 1.5502, + "step": 3493 + }, + { + "epoch": 0.38, + "grad_norm": 0.0695540672338539, + "learning_rate": 0.0007182183273528436, + "loss": 1.3664, + "step": 3494 + }, + { + "epoch": 0.38, + "grad_norm": 0.07960679599217649, + "learning_rate": 0.000718061663670337, + "loss": 1.4714, + "step": 3495 + }, + { + "epoch": 0.38, + "grad_norm": 0.06495251596402488, + "learning_rate": 0.0007179049735471021, + "loss": 1.3737, + "step": 3496 + }, + { + "epoch": 0.38, + "grad_norm": 0.06138238935894003, + "learning_rate": 0.0007177482570021379, + "loss": 1.4541, + "step": 3497 + }, + { + "epoch": 0.38, + "grad_norm": 0.06824731668030849, + "learning_rate": 0.0007175915140544469, + "loss": 1.429, + "step": 3498 + }, + { + "epoch": 0.38, + "grad_norm": 0.07191700712025918, + "learning_rate": 0.0007174347447230346, + "loss": 1.4168, + "step": 3499 + }, + { + "epoch": 0.38, + "grad_norm": 0.06375599026585484, + "learning_rate": 0.0007172779490269099, + "loss": 1.422, + "step": 3500 + }, + { + "epoch": 0.38, + "grad_norm": 0.0727035296847183, + "learning_rate": 0.0007171211269850847, + "loss": 1.4303, + "step": 3501 + }, + { + "epoch": 0.38, + "grad_norm": 0.07744658478094242, + "learning_rate": 0.0007169642786165746, + "loss": 1.5019, + "step": 3502 + }, + { + "epoch": 0.38, + "grad_norm": 0.07138038364485365, + "learning_rate": 0.0007168074039403975, + "loss": 1.3967, + "step": 3503 + }, + { + "epoch": 0.38, + "grad_norm": 0.07553044790967374, + "learning_rate": 0.0007166505029755752, + "loss": 1.3925, + "step": 3504 + }, + { + "epoch": 0.38, + "grad_norm": 0.08417883389077783, + "learning_rate": 0.0007164935757411327, + "loss": 1.4638, + "step": 3505 + }, + { + "epoch": 0.38, + "grad_norm": 0.0700501690822233, + "learning_rate": 0.0007163366222560976, + "loss": 1.4004, + "step": 3506 + }, + { + "epoch": 0.38, + "grad_norm": 0.07009352089361452, + "learning_rate": 0.0007161796425395013, + "loss": 1.4417, + "step": 3507 + }, + { + "epoch": 0.38, + "grad_norm": 0.08437481489426124, + "learning_rate": 0.0007160226366103781, + "loss": 1.3978, + "step": 3508 + }, + { + "epoch": 0.38, + "grad_norm": 0.07281127210582462, + "learning_rate": 0.0007158656044877654, + "loss": 1.4749, + "step": 3509 + }, + { + "epoch": 0.38, + "grad_norm": 0.08738001079910843, + "learning_rate": 0.000715708546190704, + "loss": 1.3974, + "step": 3510 + }, + { + "epoch": 0.38, + "grad_norm": 0.07474336566461999, + "learning_rate": 0.0007155514617382377, + "loss": 1.4416, + "step": 3511 + }, + { + "epoch": 0.38, + "grad_norm": 0.07599929215831347, + "learning_rate": 0.0007153943511494134, + "loss": 1.5088, + "step": 3512 + }, + { + "epoch": 0.38, + "grad_norm": 0.07441053895292685, + "learning_rate": 0.0007152372144432817, + "loss": 1.4735, + "step": 3513 + }, + { + "epoch": 0.38, + "grad_norm": 0.06776065211117123, + "learning_rate": 0.0007150800516388956, + "loss": 1.4267, + "step": 3514 + }, + { + "epoch": 0.38, + "grad_norm": 0.07725836370655137, + "learning_rate": 0.0007149228627553117, + "loss": 1.4947, + "step": 3515 + }, + { + "epoch": 0.38, + "grad_norm": 0.08250426681685036, + "learning_rate": 0.0007147656478115898, + "loss": 1.4598, + "step": 3516 + }, + { + "epoch": 0.38, + "grad_norm": 0.079833072983792, + "learning_rate": 0.0007146084068267928, + "loss": 1.3901, + "step": 3517 + }, + { + "epoch": 0.38, + "grad_norm": 0.07133873587178743, + "learning_rate": 0.0007144511398199865, + "loss": 1.3933, + "step": 3518 + }, + { + "epoch": 0.38, + "grad_norm": 0.06894999027593159, + "learning_rate": 0.00071429384681024, + "loss": 1.4166, + "step": 3519 + }, + { + "epoch": 0.38, + "grad_norm": 0.06676758690851145, + "learning_rate": 0.0007141365278166261, + "loss": 1.3517, + "step": 3520 + }, + { + "epoch": 0.38, + "grad_norm": 0.06575167661410714, + "learning_rate": 0.0007139791828582196, + "loss": 1.4837, + "step": 3521 + }, + { + "epoch": 0.38, + "grad_norm": 0.08017489354969888, + "learning_rate": 0.0007138218119540998, + "loss": 1.4512, + "step": 3522 + }, + { + "epoch": 0.38, + "grad_norm": 0.07228178571734656, + "learning_rate": 0.000713664415123348, + "loss": 1.3205, + "step": 3523 + }, + { + "epoch": 0.38, + "grad_norm": 0.1026294128547175, + "learning_rate": 0.0007135069923850493, + "loss": 1.4643, + "step": 3524 + }, + { + "epoch": 0.38, + "grad_norm": 0.08011786852797782, + "learning_rate": 0.0007133495437582916, + "loss": 1.442, + "step": 3525 + }, + { + "epoch": 0.38, + "grad_norm": 0.07244582841262252, + "learning_rate": 0.0007131920692621663, + "loss": 1.2873, + "step": 3526 + }, + { + "epoch": 0.38, + "grad_norm": 0.06822431369016423, + "learning_rate": 0.0007130345689157676, + "loss": 1.4352, + "step": 3527 + }, + { + "epoch": 0.38, + "grad_norm": 0.07031653395975418, + "learning_rate": 0.000712877042738193, + "loss": 1.3986, + "step": 3528 + }, + { + "epoch": 0.38, + "grad_norm": 0.07397680940283695, + "learning_rate": 0.000712719490748543, + "loss": 1.361, + "step": 3529 + }, + { + "epoch": 0.38, + "grad_norm": 0.07489945680906114, + "learning_rate": 0.0007125619129659214, + "loss": 1.3775, + "step": 3530 + }, + { + "epoch": 0.38, + "grad_norm": 0.06688488591400973, + "learning_rate": 0.0007124043094094352, + "loss": 1.4639, + "step": 3531 + }, + { + "epoch": 0.38, + "grad_norm": 0.0706049908559832, + "learning_rate": 0.0007122466800981939, + "loss": 1.5427, + "step": 3532 + }, + { + "epoch": 0.38, + "grad_norm": 0.0711990066874569, + "learning_rate": 0.0007120890250513111, + "loss": 1.515, + "step": 3533 + }, + { + "epoch": 0.38, + "grad_norm": 0.07309355595657928, + "learning_rate": 0.0007119313442879028, + "loss": 1.5657, + "step": 3534 + }, + { + "epoch": 0.38, + "grad_norm": 0.07203044835573408, + "learning_rate": 0.0007117736378270885, + "loss": 1.3643, + "step": 3535 + }, + { + "epoch": 0.38, + "grad_norm": 0.07921231575470476, + "learning_rate": 0.0007116159056879904, + "loss": 1.3966, + "step": 3536 + }, + { + "epoch": 0.38, + "grad_norm": 0.06516803459785417, + "learning_rate": 0.0007114581478897342, + "loss": 1.3938, + "step": 3537 + }, + { + "epoch": 0.38, + "grad_norm": 0.07084548870515166, + "learning_rate": 0.0007113003644514485, + "loss": 1.4167, + "step": 3538 + }, + { + "epoch": 0.38, + "grad_norm": 0.0754906872344934, + "learning_rate": 0.0007111425553922653, + "loss": 1.4765, + "step": 3539 + }, + { + "epoch": 0.38, + "grad_norm": 0.07277782098342274, + "learning_rate": 0.0007109847207313191, + "loss": 1.4083, + "step": 3540 + }, + { + "epoch": 0.38, + "grad_norm": 0.06913253674908561, + "learning_rate": 0.0007108268604877483, + "loss": 1.4447, + "step": 3541 + }, + { + "epoch": 0.38, + "grad_norm": 0.06659941330711411, + "learning_rate": 0.0007106689746806939, + "loss": 1.4691, + "step": 3542 + }, + { + "epoch": 0.38, + "grad_norm": 0.07232512460358181, + "learning_rate": 0.0007105110633292999, + "loss": 1.5318, + "step": 3543 + }, + { + "epoch": 0.38, + "grad_norm": 0.06723843964590365, + "learning_rate": 0.0007103531264527138, + "loss": 1.4276, + "step": 3544 + }, + { + "epoch": 0.38, + "grad_norm": 0.06852588979292723, + "learning_rate": 0.000710195164070086, + "loss": 1.3008, + "step": 3545 + }, + { + "epoch": 0.38, + "grad_norm": 0.06904335986327417, + "learning_rate": 0.0007100371762005697, + "loss": 1.2505, + "step": 3546 + }, + { + "epoch": 0.38, + "grad_norm": 0.07517395552253944, + "learning_rate": 0.0007098791628633217, + "loss": 1.5369, + "step": 3547 + }, + { + "epoch": 0.38, + "grad_norm": 0.07166106998321647, + "learning_rate": 0.0007097211240775018, + "loss": 1.2608, + "step": 3548 + }, + { + "epoch": 0.38, + "grad_norm": 0.06562667167612078, + "learning_rate": 0.0007095630598622724, + "loss": 1.4766, + "step": 3549 + }, + { + "epoch": 0.38, + "grad_norm": 0.07500007910028895, + "learning_rate": 0.0007094049702367997, + "loss": 1.1845, + "step": 3550 + }, + { + "epoch": 0.38, + "grad_norm": 0.07829807898182478, + "learning_rate": 0.0007092468552202523, + "loss": 1.2858, + "step": 3551 + }, + { + "epoch": 0.38, + "grad_norm": 0.06925524966397749, + "learning_rate": 0.0007090887148318023, + "loss": 1.2332, + "step": 3552 + }, + { + "epoch": 0.38, + "grad_norm": 0.08475475670099092, + "learning_rate": 0.000708930549090625, + "loss": 1.5682, + "step": 3553 + }, + { + "epoch": 0.38, + "grad_norm": 0.07622082830610633, + "learning_rate": 0.0007087723580158983, + "loss": 1.3916, + "step": 3554 + }, + { + "epoch": 0.38, + "grad_norm": 0.0766525857066531, + "learning_rate": 0.0007086141416268033, + "loss": 1.4084, + "step": 3555 + }, + { + "epoch": 0.38, + "grad_norm": 0.08003974204009992, + "learning_rate": 0.0007084558999425245, + "loss": 1.4344, + "step": 3556 + }, + { + "epoch": 0.38, + "grad_norm": 0.06920958308933454, + "learning_rate": 0.0007082976329822491, + "loss": 1.4378, + "step": 3557 + }, + { + "epoch": 0.38, + "grad_norm": 0.07090901454865277, + "learning_rate": 0.0007081393407651675, + "loss": 1.4642, + "step": 3558 + }, + { + "epoch": 0.38, + "grad_norm": 0.07078282523191834, + "learning_rate": 0.0007079810233104734, + "loss": 1.3724, + "step": 3559 + }, + { + "epoch": 0.38, + "grad_norm": 0.06605378041470791, + "learning_rate": 0.0007078226806373631, + "loss": 1.4078, + "step": 3560 + }, + { + "epoch": 0.38, + "grad_norm": 0.06343453006005209, + "learning_rate": 0.0007076643127650366, + "loss": 1.4076, + "step": 3561 + }, + { + "epoch": 0.38, + "grad_norm": 0.07317786253087795, + "learning_rate": 0.0007075059197126961, + "loss": 1.5665, + "step": 3562 + }, + { + "epoch": 0.38, + "grad_norm": 0.0654489556452131, + "learning_rate": 0.0007073475014995472, + "loss": 1.4653, + "step": 3563 + }, + { + "epoch": 0.38, + "grad_norm": 0.07119377427290143, + "learning_rate": 0.0007071890581447992, + "loss": 1.4013, + "step": 3564 + }, + { + "epoch": 0.38, + "grad_norm": 0.06589837136177674, + "learning_rate": 0.0007070305896676634, + "loss": 1.3597, + "step": 3565 + }, + { + "epoch": 0.38, + "grad_norm": 0.07099054244663582, + "learning_rate": 0.0007068720960873552, + "loss": 1.4718, + "step": 3566 + }, + { + "epoch": 0.38, + "grad_norm": 0.06627867168212025, + "learning_rate": 0.0007067135774230919, + "loss": 1.5182, + "step": 3567 + }, + { + "epoch": 0.38, + "grad_norm": 0.06392067010017136, + "learning_rate": 0.0007065550336940947, + "loss": 1.5218, + "step": 3568 + }, + { + "epoch": 0.38, + "grad_norm": 0.06383872794322432, + "learning_rate": 0.0007063964649195874, + "loss": 1.3602, + "step": 3569 + }, + { + "epoch": 0.38, + "grad_norm": 0.07302909210052719, + "learning_rate": 0.0007062378711187973, + "loss": 1.4906, + "step": 3570 + }, + { + "epoch": 0.38, + "grad_norm": 0.07202766521247028, + "learning_rate": 0.0007060792523109544, + "loss": 1.5076, + "step": 3571 + }, + { + "epoch": 0.38, + "grad_norm": 0.06485625650547473, + "learning_rate": 0.0007059206085152918, + "loss": 1.3075, + "step": 3572 + }, + { + "epoch": 0.38, + "grad_norm": 0.0715065717760381, + "learning_rate": 0.0007057619397510453, + "loss": 1.4296, + "step": 3573 + }, + { + "epoch": 0.38, + "grad_norm": 0.08825733453760141, + "learning_rate": 0.0007056032460374541, + "loss": 1.4665, + "step": 3574 + }, + { + "epoch": 0.38, + "grad_norm": 0.0701463361815818, + "learning_rate": 0.0007054445273937609, + "loss": 1.3857, + "step": 3575 + }, + { + "epoch": 0.38, + "grad_norm": 0.07630369200437422, + "learning_rate": 0.0007052857838392104, + "loss": 1.4023, + "step": 3576 + }, + { + "epoch": 0.38, + "grad_norm": 0.06817368849988802, + "learning_rate": 0.0007051270153930506, + "loss": 1.5429, + "step": 3577 + }, + { + "epoch": 0.38, + "grad_norm": 0.07273743830308618, + "learning_rate": 0.0007049682220745332, + "loss": 1.5339, + "step": 3578 + }, + { + "epoch": 0.38, + "grad_norm": 0.06237971196922428, + "learning_rate": 0.0007048094039029122, + "loss": 1.4023, + "step": 3579 + }, + { + "epoch": 0.38, + "grad_norm": 0.07174054000231482, + "learning_rate": 0.0007046505608974447, + "loss": 1.4179, + "step": 3580 + }, + { + "epoch": 0.38, + "grad_norm": 0.07079421703558056, + "learning_rate": 0.0007044916930773915, + "loss": 1.4336, + "step": 3581 + }, + { + "epoch": 0.39, + "grad_norm": 0.06324002319659304, + "learning_rate": 0.0007043328004620154, + "loss": 1.4002, + "step": 3582 + }, + { + "epoch": 0.39, + "grad_norm": 0.07273252464213828, + "learning_rate": 0.0007041738830705827, + "loss": 1.3021, + "step": 3583 + }, + { + "epoch": 0.39, + "grad_norm": 0.0679012855779142, + "learning_rate": 0.0007040149409223628, + "loss": 1.5173, + "step": 3584 + }, + { + "epoch": 0.39, + "grad_norm": 0.06706605104839712, + "learning_rate": 0.0007038559740366281, + "loss": 1.4245, + "step": 3585 + }, + { + "epoch": 0.39, + "grad_norm": 0.07263886553942718, + "learning_rate": 0.0007036969824326535, + "loss": 1.5822, + "step": 3586 + }, + { + "epoch": 0.39, + "grad_norm": 0.06272461509437303, + "learning_rate": 0.0007035379661297179, + "loss": 1.3431, + "step": 3587 + }, + { + "epoch": 0.39, + "grad_norm": 0.07364340877937099, + "learning_rate": 0.0007033789251471019, + "loss": 1.3892, + "step": 3588 + }, + { + "epoch": 0.39, + "grad_norm": 0.07060017034955898, + "learning_rate": 0.0007032198595040901, + "loss": 1.4748, + "step": 3589 + }, + { + "epoch": 0.39, + "grad_norm": 0.06756244433398076, + "learning_rate": 0.00070306076921997, + "loss": 1.3551, + "step": 3590 + }, + { + "epoch": 0.39, + "grad_norm": 0.07358338514614324, + "learning_rate": 0.0007029016543140311, + "loss": 1.407, + "step": 3591 + }, + { + "epoch": 0.39, + "grad_norm": 0.09232691672724472, + "learning_rate": 0.0007027425148055677, + "loss": 1.4698, + "step": 3592 + }, + { + "epoch": 0.39, + "grad_norm": 0.07306567061185729, + "learning_rate": 0.000702583350713875, + "loss": 1.3349, + "step": 3593 + }, + { + "epoch": 0.39, + "grad_norm": 0.07152049501825936, + "learning_rate": 0.0007024241620582527, + "loss": 1.5199, + "step": 3594 + }, + { + "epoch": 0.39, + "grad_norm": 0.07471021321320678, + "learning_rate": 0.0007022649488580029, + "loss": 1.4685, + "step": 3595 + }, + { + "epoch": 0.39, + "grad_norm": 0.07674020487606688, + "learning_rate": 0.0007021057111324307, + "loss": 1.3409, + "step": 3596 + }, + { + "epoch": 0.39, + "grad_norm": 0.08010512353343058, + "learning_rate": 0.0007019464489008443, + "loss": 1.4164, + "step": 3597 + }, + { + "epoch": 0.39, + "grad_norm": 0.07294899590648436, + "learning_rate": 0.0007017871621825549, + "loss": 1.4573, + "step": 3598 + }, + { + "epoch": 0.39, + "grad_norm": 0.0714024285774514, + "learning_rate": 0.0007016278509968761, + "loss": 1.3979, + "step": 3599 + }, + { + "epoch": 0.39, + "grad_norm": 0.0715807774435257, + "learning_rate": 0.0007014685153631255, + "loss": 1.3887, + "step": 3600 + }, + { + "epoch": 0.39, + "grad_norm": 0.07500788006512786, + "learning_rate": 0.0007013091553006227, + "loss": 1.4591, + "step": 3601 + }, + { + "epoch": 0.39, + "grad_norm": 0.06521259328969803, + "learning_rate": 0.0007011497708286909, + "loss": 1.4541, + "step": 3602 + }, + { + "epoch": 0.39, + "grad_norm": 0.0741904167001245, + "learning_rate": 0.000700990361966656, + "loss": 1.4057, + "step": 3603 + }, + { + "epoch": 0.39, + "grad_norm": 0.06183999889715281, + "learning_rate": 0.0007008309287338467, + "loss": 1.3977, + "step": 3604 + }, + { + "epoch": 0.39, + "grad_norm": 0.06681395641341766, + "learning_rate": 0.0007006714711495949, + "loss": 1.5082, + "step": 3605 + }, + { + "epoch": 0.39, + "grad_norm": 0.08653655603945522, + "learning_rate": 0.0007005119892332354, + "loss": 1.4142, + "step": 3606 + }, + { + "epoch": 0.39, + "grad_norm": 0.07028847770806987, + "learning_rate": 0.0007003524830041059, + "loss": 1.4125, + "step": 3607 + }, + { + "epoch": 0.39, + "grad_norm": 0.08717202095751826, + "learning_rate": 0.0007001929524815472, + "loss": 1.4099, + "step": 3608 + }, + { + "epoch": 0.39, + "grad_norm": 0.07358796039969515, + "learning_rate": 0.0007000333976849028, + "loss": 1.4611, + "step": 3609 + }, + { + "epoch": 0.39, + "grad_norm": 0.07321829406105443, + "learning_rate": 0.0006998738186335193, + "loss": 1.2502, + "step": 3610 + }, + { + "epoch": 0.39, + "grad_norm": 0.07212265150735724, + "learning_rate": 0.0006997142153467461, + "loss": 1.5196, + "step": 3611 + }, + { + "epoch": 0.39, + "grad_norm": 0.06711496628227426, + "learning_rate": 0.000699554587843936, + "loss": 1.4971, + "step": 3612 + }, + { + "epoch": 0.39, + "grad_norm": 0.0709609735167094, + "learning_rate": 0.0006993949361444441, + "loss": 1.3627, + "step": 3613 + }, + { + "epoch": 0.39, + "grad_norm": 0.06737833681998814, + "learning_rate": 0.0006992352602676287, + "loss": 1.4435, + "step": 3614 + }, + { + "epoch": 0.39, + "grad_norm": 0.08318606479339613, + "learning_rate": 0.0006990755602328512, + "loss": 1.3223, + "step": 3615 + }, + { + "epoch": 0.39, + "grad_norm": 0.07678376698123786, + "learning_rate": 0.0006989158360594756, + "loss": 1.2798, + "step": 3616 + }, + { + "epoch": 0.39, + "grad_norm": 0.07405997377374739, + "learning_rate": 0.0006987560877668692, + "loss": 1.4287, + "step": 3617 + }, + { + "epoch": 0.39, + "grad_norm": 0.0764351613615117, + "learning_rate": 0.0006985963153744019, + "loss": 1.5333, + "step": 3618 + }, + { + "epoch": 0.39, + "grad_norm": 0.08385446808915825, + "learning_rate": 0.0006984365189014467, + "loss": 1.4834, + "step": 3619 + }, + { + "epoch": 0.39, + "grad_norm": 0.06263563035341106, + "learning_rate": 0.0006982766983673795, + "loss": 1.3152, + "step": 3620 + }, + { + "epoch": 0.39, + "grad_norm": 0.08852794236399607, + "learning_rate": 0.000698116853791579, + "loss": 1.4673, + "step": 3621 + }, + { + "epoch": 0.39, + "grad_norm": 0.06941771637584818, + "learning_rate": 0.000697956985193427, + "loss": 1.4864, + "step": 3622 + }, + { + "epoch": 0.39, + "grad_norm": 0.08562998996198683, + "learning_rate": 0.0006977970925923081, + "loss": 1.4882, + "step": 3623 + }, + { + "epoch": 0.39, + "grad_norm": 0.07531704894846612, + "learning_rate": 0.0006976371760076099, + "loss": 1.4987, + "step": 3624 + }, + { + "epoch": 0.39, + "grad_norm": 0.0801714652638698, + "learning_rate": 0.0006974772354587226, + "loss": 1.4313, + "step": 3625 + }, + { + "epoch": 0.39, + "grad_norm": 0.08020508619169321, + "learning_rate": 0.0006973172709650397, + "loss": 1.4755, + "step": 3626 + }, + { + "epoch": 0.39, + "grad_norm": 0.07296454416739705, + "learning_rate": 0.0006971572825459576, + "loss": 1.4356, + "step": 3627 + }, + { + "epoch": 0.39, + "grad_norm": 0.06951882359819238, + "learning_rate": 0.000696997270220875, + "loss": 1.4227, + "step": 3628 + }, + { + "epoch": 0.39, + "grad_norm": 0.07644784282849344, + "learning_rate": 0.0006968372340091946, + "loss": 1.4035, + "step": 3629 + }, + { + "epoch": 0.39, + "grad_norm": 0.07155938125174738, + "learning_rate": 0.0006966771739303206, + "loss": 1.4814, + "step": 3630 + }, + { + "epoch": 0.39, + "grad_norm": 0.06280959644830418, + "learning_rate": 0.0006965170900036613, + "loss": 1.432, + "step": 3631 + }, + { + "epoch": 0.39, + "grad_norm": 0.06420053763727787, + "learning_rate": 0.0006963569822486276, + "loss": 1.3966, + "step": 3632 + }, + { + "epoch": 0.39, + "grad_norm": 0.06407629379194352, + "learning_rate": 0.0006961968506846327, + "loss": 1.4223, + "step": 3633 + }, + { + "epoch": 0.39, + "grad_norm": 0.07341699164847276, + "learning_rate": 0.0006960366953310931, + "loss": 1.6418, + "step": 3634 + }, + { + "epoch": 0.39, + "grad_norm": 0.07158103196668297, + "learning_rate": 0.0006958765162074287, + "loss": 1.3857, + "step": 3635 + }, + { + "epoch": 0.39, + "grad_norm": 0.06656688307492928, + "learning_rate": 0.0006957163133330611, + "loss": 1.5035, + "step": 3636 + }, + { + "epoch": 0.39, + "grad_norm": 0.07009898988268752, + "learning_rate": 0.0006955560867274159, + "loss": 1.4776, + "step": 3637 + }, + { + "epoch": 0.39, + "grad_norm": 0.06893175987469181, + "learning_rate": 0.0006953958364099208, + "loss": 1.5117, + "step": 3638 + }, + { + "epoch": 0.39, + "grad_norm": 0.07194928575269588, + "learning_rate": 0.0006952355624000072, + "loss": 1.4054, + "step": 3639 + }, + { + "epoch": 0.39, + "grad_norm": 0.07005599291594371, + "learning_rate": 0.0006950752647171086, + "loss": 1.4454, + "step": 3640 + }, + { + "epoch": 0.39, + "grad_norm": 0.06485794286202459, + "learning_rate": 0.0006949149433806614, + "loss": 1.2388, + "step": 3641 + }, + { + "epoch": 0.39, + "grad_norm": 0.07321685193872544, + "learning_rate": 0.0006947545984101053, + "loss": 1.5212, + "step": 3642 + }, + { + "epoch": 0.39, + "grad_norm": 0.06766799514535249, + "learning_rate": 0.000694594229824883, + "loss": 1.4703, + "step": 3643 + }, + { + "epoch": 0.39, + "grad_norm": 0.07674140785209206, + "learning_rate": 0.0006944338376444393, + "loss": 1.4143, + "step": 3644 + }, + { + "epoch": 0.39, + "grad_norm": 0.07113361066488795, + "learning_rate": 0.0006942734218882225, + "loss": 1.5061, + "step": 3645 + }, + { + "epoch": 0.39, + "grad_norm": 0.07408116710866873, + "learning_rate": 0.0006941129825756836, + "loss": 1.3525, + "step": 3646 + }, + { + "epoch": 0.39, + "grad_norm": 0.06834433253859368, + "learning_rate": 0.0006939525197262762, + "loss": 1.3372, + "step": 3647 + }, + { + "epoch": 0.39, + "grad_norm": 0.07240980576151891, + "learning_rate": 0.0006937920333594572, + "loss": 1.3757, + "step": 3648 + }, + { + "epoch": 0.39, + "grad_norm": 0.07630489881087277, + "learning_rate": 0.0006936315234946861, + "loss": 1.4788, + "step": 3649 + }, + { + "epoch": 0.39, + "grad_norm": 0.07956854851302582, + "learning_rate": 0.0006934709901514251, + "loss": 1.4576, + "step": 3650 + }, + { + "epoch": 0.39, + "grad_norm": 0.0670356740719808, + "learning_rate": 0.0006933104333491398, + "loss": 1.5165, + "step": 3651 + }, + { + "epoch": 0.39, + "grad_norm": 0.07588722767607452, + "learning_rate": 0.0006931498531072977, + "loss": 1.3835, + "step": 3652 + }, + { + "epoch": 0.39, + "grad_norm": 0.09334811732319619, + "learning_rate": 0.0006929892494453703, + "loss": 1.4464, + "step": 3653 + }, + { + "epoch": 0.39, + "grad_norm": 0.061988691124392446, + "learning_rate": 0.0006928286223828309, + "loss": 1.3886, + "step": 3654 + }, + { + "epoch": 0.39, + "grad_norm": 0.07921738004769585, + "learning_rate": 0.0006926679719391562, + "loss": 1.4586, + "step": 3655 + }, + { + "epoch": 0.39, + "grad_norm": 0.08326077624240993, + "learning_rate": 0.0006925072981338259, + "loss": 1.3614, + "step": 3656 + }, + { + "epoch": 0.39, + "grad_norm": 0.08352543499375753, + "learning_rate": 0.0006923466009863218, + "loss": 1.4544, + "step": 3657 + }, + { + "epoch": 0.39, + "grad_norm": 0.06887853383184417, + "learning_rate": 0.0006921858805161294, + "loss": 1.3777, + "step": 3658 + }, + { + "epoch": 0.39, + "grad_norm": 0.06923628331383389, + "learning_rate": 0.0006920251367427361, + "loss": 1.5486, + "step": 3659 + }, + { + "epoch": 0.39, + "grad_norm": 0.06519640964192866, + "learning_rate": 0.0006918643696856333, + "loss": 1.4187, + "step": 3660 + }, + { + "epoch": 0.39, + "grad_norm": 0.07237657442273564, + "learning_rate": 0.0006917035793643141, + "loss": 1.5045, + "step": 3661 + }, + { + "epoch": 0.39, + "grad_norm": 0.0693432290313142, + "learning_rate": 0.0006915427657982751, + "loss": 1.3163, + "step": 3662 + }, + { + "epoch": 0.39, + "grad_norm": 0.06416290954447852, + "learning_rate": 0.0006913819290070153, + "loss": 1.4348, + "step": 3663 + }, + { + "epoch": 0.39, + "grad_norm": 0.07147056356269492, + "learning_rate": 0.0006912210690100369, + "loss": 1.4821, + "step": 3664 + }, + { + "epoch": 0.39, + "grad_norm": 0.07969102834155034, + "learning_rate": 0.0006910601858268444, + "loss": 1.453, + "step": 3665 + }, + { + "epoch": 0.39, + "grad_norm": 0.13718934471197883, + "learning_rate": 0.000690899279476946, + "loss": 1.4078, + "step": 3666 + }, + { + "epoch": 0.39, + "grad_norm": 0.08114721434640836, + "learning_rate": 0.0006907383499798516, + "loss": 1.4907, + "step": 3667 + }, + { + "epoch": 0.39, + "grad_norm": 0.07115178902771321, + "learning_rate": 0.0006905773973550748, + "loss": 1.4556, + "step": 3668 + }, + { + "epoch": 0.39, + "grad_norm": 0.07942590621430361, + "learning_rate": 0.0006904164216221314, + "loss": 1.4596, + "step": 3669 + }, + { + "epoch": 0.39, + "grad_norm": 0.07289576701035694, + "learning_rate": 0.0006902554228005405, + "loss": 1.4612, + "step": 3670 + }, + { + "epoch": 0.39, + "grad_norm": 0.06528821995816773, + "learning_rate": 0.0006900944009098238, + "loss": 1.3585, + "step": 3671 + }, + { + "epoch": 0.39, + "grad_norm": 0.07616160702579455, + "learning_rate": 0.0006899333559695056, + "loss": 1.3554, + "step": 3672 + }, + { + "epoch": 0.39, + "grad_norm": 0.07769584578503212, + "learning_rate": 0.0006897722879991131, + "loss": 1.4175, + "step": 3673 + }, + { + "epoch": 0.39, + "grad_norm": 0.07083227509463093, + "learning_rate": 0.0006896111970181764, + "loss": 1.4129, + "step": 3674 + }, + { + "epoch": 0.4, + "grad_norm": 0.06563564523573756, + "learning_rate": 0.0006894500830462285, + "loss": 1.3298, + "step": 3675 + }, + { + "epoch": 0.4, + "grad_norm": 0.0704221954251129, + "learning_rate": 0.0006892889461028047, + "loss": 1.4314, + "step": 3676 + }, + { + "epoch": 0.4, + "grad_norm": 0.07517734381017364, + "learning_rate": 0.0006891277862074439, + "loss": 1.3131, + "step": 3677 + }, + { + "epoch": 0.4, + "grad_norm": 0.07709956431855618, + "learning_rate": 0.0006889666033796869, + "loss": 1.4983, + "step": 3678 + }, + { + "epoch": 0.4, + "grad_norm": 0.06641564320298735, + "learning_rate": 0.0006888053976390776, + "loss": 1.4659, + "step": 3679 + }, + { + "epoch": 0.4, + "grad_norm": 0.07326313513970828, + "learning_rate": 0.000688644169005163, + "loss": 1.4576, + "step": 3680 + }, + { + "epoch": 0.4, + "grad_norm": 0.06526229926022548, + "learning_rate": 0.0006884829174974927, + "loss": 1.5378, + "step": 3681 + }, + { + "epoch": 0.4, + "grad_norm": 0.06510469605310103, + "learning_rate": 0.0006883216431356187, + "loss": 1.2951, + "step": 3682 + }, + { + "epoch": 0.4, + "grad_norm": 0.06975087735626676, + "learning_rate": 0.0006881603459390964, + "loss": 1.535, + "step": 3683 + }, + { + "epoch": 0.4, + "grad_norm": 0.07054312371923935, + "learning_rate": 0.0006879990259274832, + "loss": 1.6018, + "step": 3684 + }, + { + "epoch": 0.4, + "grad_norm": 0.06413316601711448, + "learning_rate": 0.0006878376831203401, + "loss": 1.4806, + "step": 3685 + }, + { + "epoch": 0.4, + "grad_norm": 0.06207448118614186, + "learning_rate": 0.0006876763175372305, + "loss": 1.4473, + "step": 3686 + }, + { + "epoch": 0.4, + "grad_norm": 0.07947569981904305, + "learning_rate": 0.00068751492919772, + "loss": 1.2827, + "step": 3687 + }, + { + "epoch": 0.4, + "grad_norm": 0.0667703783383479, + "learning_rate": 0.0006873535181213784, + "loss": 1.4303, + "step": 3688 + }, + { + "epoch": 0.4, + "grad_norm": 0.06814443823594021, + "learning_rate": 0.0006871920843277764, + "loss": 1.3534, + "step": 3689 + }, + { + "epoch": 0.4, + "grad_norm": 0.06858806164536586, + "learning_rate": 0.000687030627836489, + "loss": 1.4704, + "step": 3690 + }, + { + "epoch": 0.4, + "grad_norm": 0.07516155321113553, + "learning_rate": 0.0006868691486670932, + "loss": 1.4027, + "step": 3691 + }, + { + "epoch": 0.4, + "grad_norm": 0.06523038718461018, + "learning_rate": 0.0006867076468391688, + "loss": 1.4835, + "step": 3692 + }, + { + "epoch": 0.4, + "grad_norm": 0.0641694726958727, + "learning_rate": 0.0006865461223722986, + "loss": 1.4408, + "step": 3693 + }, + { + "epoch": 0.4, + "grad_norm": 0.07188965078622174, + "learning_rate": 0.0006863845752860679, + "loss": 1.509, + "step": 3694 + }, + { + "epoch": 0.4, + "grad_norm": 0.06917751289317574, + "learning_rate": 0.0006862230056000648, + "loss": 1.4961, + "step": 3695 + }, + { + "epoch": 0.4, + "grad_norm": 0.06763332815454019, + "learning_rate": 0.0006860614133338804, + "loss": 1.3402, + "step": 3696 + }, + { + "epoch": 0.4, + "grad_norm": 0.06762636955642931, + "learning_rate": 0.0006858997985071081, + "loss": 1.5605, + "step": 3697 + }, + { + "epoch": 0.4, + "grad_norm": 0.06856232217515264, + "learning_rate": 0.0006857381611393445, + "loss": 1.4544, + "step": 3698 + }, + { + "epoch": 0.4, + "grad_norm": 0.06275521544299778, + "learning_rate": 0.0006855765012501883, + "loss": 1.4577, + "step": 3699 + }, + { + "epoch": 0.4, + "grad_norm": 0.06899956083233556, + "learning_rate": 0.0006854148188592418, + "loss": 1.4359, + "step": 3700 + }, + { + "epoch": 0.4, + "grad_norm": 0.06995023762873773, + "learning_rate": 0.000685253113986109, + "loss": 1.4413, + "step": 3701 + }, + { + "epoch": 0.4, + "grad_norm": 0.06988588357833313, + "learning_rate": 0.0006850913866503977, + "loss": 1.3826, + "step": 3702 + }, + { + "epoch": 0.4, + "grad_norm": 0.06786538775904234, + "learning_rate": 0.0006849296368717176, + "loss": 1.4923, + "step": 3703 + }, + { + "epoch": 0.4, + "grad_norm": 0.07654915353921299, + "learning_rate": 0.0006847678646696813, + "loss": 1.3739, + "step": 3704 + }, + { + "epoch": 0.4, + "grad_norm": 0.0627728148075214, + "learning_rate": 0.0006846060700639046, + "loss": 1.3529, + "step": 3705 + }, + { + "epoch": 0.4, + "grad_norm": 0.06829333606625117, + "learning_rate": 0.0006844442530740055, + "loss": 1.5439, + "step": 3706 + }, + { + "epoch": 0.4, + "grad_norm": 0.06632683279100844, + "learning_rate": 0.0006842824137196046, + "loss": 1.3775, + "step": 3707 + }, + { + "epoch": 0.4, + "grad_norm": 0.0709408213687562, + "learning_rate": 0.000684120552020326, + "loss": 1.2745, + "step": 3708 + }, + { + "epoch": 0.4, + "grad_norm": 0.07214752580069766, + "learning_rate": 0.0006839586679957956, + "loss": 1.5218, + "step": 3709 + }, + { + "epoch": 0.4, + "grad_norm": 0.07159420982548895, + "learning_rate": 0.0006837967616656425, + "loss": 1.4378, + "step": 3710 + }, + { + "epoch": 0.4, + "grad_norm": 0.07620806323467304, + "learning_rate": 0.0006836348330494984, + "loss": 1.4767, + "step": 3711 + }, + { + "epoch": 0.4, + "grad_norm": 0.0712872441268927, + "learning_rate": 0.0006834728821669977, + "loss": 1.3746, + "step": 3712 + }, + { + "epoch": 0.4, + "grad_norm": 0.06739384795782043, + "learning_rate": 0.0006833109090377775, + "loss": 1.3444, + "step": 3713 + }, + { + "epoch": 0.4, + "grad_norm": 0.07259481271825156, + "learning_rate": 0.0006831489136814777, + "loss": 1.4627, + "step": 3714 + }, + { + "epoch": 0.4, + "grad_norm": 0.08300629502199651, + "learning_rate": 0.0006829868961177406, + "loss": 1.3774, + "step": 3715 + }, + { + "epoch": 0.4, + "grad_norm": 0.06620793761290668, + "learning_rate": 0.0006828248563662116, + "loss": 1.441, + "step": 3716 + }, + { + "epoch": 0.4, + "grad_norm": 0.06753191484298786, + "learning_rate": 0.0006826627944465383, + "loss": 1.2897, + "step": 3717 + }, + { + "epoch": 0.4, + "grad_norm": 0.06968868045330964, + "learning_rate": 0.0006825007103783716, + "loss": 1.3708, + "step": 3718 + }, + { + "epoch": 0.4, + "grad_norm": 0.0664248081179177, + "learning_rate": 0.0006823386041813647, + "loss": 1.3146, + "step": 3719 + }, + { + "epoch": 0.4, + "grad_norm": 0.06767190375786564, + "learning_rate": 0.0006821764758751732, + "loss": 1.404, + "step": 3720 + }, + { + "epoch": 0.4, + "grad_norm": 0.07605015217317455, + "learning_rate": 0.0006820143254794559, + "loss": 1.5113, + "step": 3721 + }, + { + "epoch": 0.4, + "grad_norm": 0.06485099522202713, + "learning_rate": 0.0006818521530138743, + "loss": 1.3003, + "step": 3722 + }, + { + "epoch": 0.4, + "grad_norm": 0.07531942615179421, + "learning_rate": 0.0006816899584980922, + "loss": 1.4695, + "step": 3723 + }, + { + "epoch": 0.4, + "grad_norm": 0.09489560161012517, + "learning_rate": 0.000681527741951776, + "loss": 1.3409, + "step": 3724 + }, + { + "epoch": 0.4, + "grad_norm": 0.061931818640721746, + "learning_rate": 0.0006813655033945956, + "loss": 1.4266, + "step": 3725 + }, + { + "epoch": 0.4, + "grad_norm": 0.08377945929406046, + "learning_rate": 0.0006812032428462225, + "loss": 1.4634, + "step": 3726 + }, + { + "epoch": 0.4, + "grad_norm": 0.08635568739760313, + "learning_rate": 0.0006810409603263314, + "loss": 1.5708, + "step": 3727 + }, + { + "epoch": 0.4, + "grad_norm": 0.07301256105687044, + "learning_rate": 0.0006808786558546, + "loss": 1.4444, + "step": 3728 + }, + { + "epoch": 0.4, + "grad_norm": 0.07858240960791367, + "learning_rate": 0.0006807163294507078, + "loss": 1.4104, + "step": 3729 + }, + { + "epoch": 0.4, + "grad_norm": 0.069090300634783, + "learning_rate": 0.0006805539811343376, + "loss": 1.3712, + "step": 3730 + }, + { + "epoch": 0.4, + "grad_norm": 0.07759023420372646, + "learning_rate": 0.0006803916109251748, + "loss": 1.3338, + "step": 3731 + }, + { + "epoch": 0.4, + "grad_norm": 0.07414115909337814, + "learning_rate": 0.0006802292188429072, + "loss": 1.5009, + "step": 3732 + }, + { + "epoch": 0.4, + "grad_norm": 0.0818109136073175, + "learning_rate": 0.0006800668049072256, + "loss": 1.3017, + "step": 3733 + }, + { + "epoch": 0.4, + "grad_norm": 0.07277609899484332, + "learning_rate": 0.000679904369137823, + "loss": 1.4741, + "step": 3734 + }, + { + "epoch": 0.4, + "grad_norm": 0.06986648440783053, + "learning_rate": 0.0006797419115543954, + "loss": 1.3859, + "step": 3735 + }, + { + "epoch": 0.4, + "grad_norm": 0.08023992915741134, + "learning_rate": 0.0006795794321766415, + "loss": 1.5204, + "step": 3736 + }, + { + "epoch": 0.4, + "grad_norm": 0.07276280799914031, + "learning_rate": 0.0006794169310242624, + "loss": 1.4427, + "step": 3737 + }, + { + "epoch": 0.4, + "grad_norm": 0.07170170736813007, + "learning_rate": 0.0006792544081169616, + "loss": 1.4303, + "step": 3738 + }, + { + "epoch": 0.4, + "grad_norm": 0.06361279357416362, + "learning_rate": 0.000679091863474446, + "loss": 1.5411, + "step": 3739 + }, + { + "epoch": 0.4, + "grad_norm": 0.07108682039307701, + "learning_rate": 0.0006789292971164244, + "loss": 1.2778, + "step": 3740 + }, + { + "epoch": 0.4, + "grad_norm": 0.07580525556322194, + "learning_rate": 0.0006787667090626089, + "loss": 1.3907, + "step": 3741 + }, + { + "epoch": 0.4, + "grad_norm": 0.06995561131507061, + "learning_rate": 0.0006786040993327135, + "loss": 1.4253, + "step": 3742 + }, + { + "epoch": 0.4, + "grad_norm": 0.07492744663626666, + "learning_rate": 0.0006784414679464552, + "loss": 1.4251, + "step": 3743 + }, + { + "epoch": 0.4, + "grad_norm": 0.06882834924705727, + "learning_rate": 0.0006782788149235538, + "loss": 1.4079, + "step": 3744 + }, + { + "epoch": 0.4, + "grad_norm": 0.07609185882270524, + "learning_rate": 0.0006781161402837316, + "loss": 1.5241, + "step": 3745 + }, + { + "epoch": 0.4, + "grad_norm": 0.0919088950874994, + "learning_rate": 0.0006779534440467133, + "loss": 1.4305, + "step": 3746 + }, + { + "epoch": 0.4, + "grad_norm": 0.07213172396040231, + "learning_rate": 0.0006777907262322262, + "loss": 1.3415, + "step": 3747 + }, + { + "epoch": 0.4, + "grad_norm": 0.06770604290970164, + "learning_rate": 0.0006776279868600008, + "loss": 1.4458, + "step": 3748 + }, + { + "epoch": 0.4, + "grad_norm": 0.07331521526131282, + "learning_rate": 0.0006774652259497696, + "loss": 1.4466, + "step": 3749 + }, + { + "epoch": 0.4, + "grad_norm": 0.06631965109609582, + "learning_rate": 0.0006773024435212678, + "loss": 1.45, + "step": 3750 + }, + { + "epoch": 0.4, + "grad_norm": 0.06771060610435131, + "learning_rate": 0.0006771396395942338, + "loss": 1.3031, + "step": 3751 + }, + { + "epoch": 0.4, + "grad_norm": 0.0725160661843529, + "learning_rate": 0.0006769768141884074, + "loss": 1.5257, + "step": 3752 + }, + { + "epoch": 0.4, + "grad_norm": 0.0793938270762842, + "learning_rate": 0.0006768139673235323, + "loss": 1.5339, + "step": 3753 + }, + { + "epoch": 0.4, + "grad_norm": 0.07151774981622631, + "learning_rate": 0.0006766510990193541, + "loss": 1.3757, + "step": 3754 + }, + { + "epoch": 0.4, + "grad_norm": 0.06547987137883667, + "learning_rate": 0.0006764882092956211, + "loss": 1.3952, + "step": 3755 + }, + { + "epoch": 0.4, + "grad_norm": 0.07314158842433856, + "learning_rate": 0.0006763252981720844, + "loss": 1.5099, + "step": 3756 + }, + { + "epoch": 0.4, + "grad_norm": 0.07145510078013781, + "learning_rate": 0.0006761623656684973, + "loss": 1.3377, + "step": 3757 + }, + { + "epoch": 0.4, + "grad_norm": 0.08644202316815978, + "learning_rate": 0.0006759994118046161, + "loss": 1.414, + "step": 3758 + }, + { + "epoch": 0.4, + "grad_norm": 0.07282147693961499, + "learning_rate": 0.0006758364366001994, + "loss": 1.4244, + "step": 3759 + }, + { + "epoch": 0.4, + "grad_norm": 0.07554452717292655, + "learning_rate": 0.0006756734400750087, + "loss": 1.3676, + "step": 3760 + }, + { + "epoch": 0.4, + "grad_norm": 0.0666807039375581, + "learning_rate": 0.0006755104222488076, + "loss": 1.4265, + "step": 3761 + }, + { + "epoch": 0.4, + "grad_norm": 0.09012510595806483, + "learning_rate": 0.0006753473831413628, + "loss": 1.3391, + "step": 3762 + }, + { + "epoch": 0.4, + "grad_norm": 0.07835217913269821, + "learning_rate": 0.0006751843227724432, + "loss": 1.5525, + "step": 3763 + }, + { + "epoch": 0.4, + "grad_norm": 0.06987950663692771, + "learning_rate": 0.0006750212411618206, + "loss": 1.3493, + "step": 3764 + }, + { + "epoch": 0.4, + "grad_norm": 0.06996075079577234, + "learning_rate": 0.000674858138329269, + "loss": 1.457, + "step": 3765 + }, + { + "epoch": 0.4, + "grad_norm": 0.07173072264270693, + "learning_rate": 0.0006746950142945653, + "loss": 1.326, + "step": 3766 + }, + { + "epoch": 0.4, + "grad_norm": 0.06810115938826602, + "learning_rate": 0.0006745318690774891, + "loss": 1.2359, + "step": 3767 + }, + { + "epoch": 0.41, + "grad_norm": 0.07757166620232256, + "learning_rate": 0.0006743687026978219, + "loss": 1.4865, + "step": 3768 + }, + { + "epoch": 0.41, + "grad_norm": 0.08521148854272749, + "learning_rate": 0.0006742055151753483, + "loss": 1.5518, + "step": 3769 + }, + { + "epoch": 0.41, + "grad_norm": 0.07341229077115634, + "learning_rate": 0.0006740423065298556, + "loss": 1.5728, + "step": 3770 + }, + { + "epoch": 0.41, + "grad_norm": 0.07251757762711362, + "learning_rate": 0.0006738790767811329, + "loss": 1.4438, + "step": 3771 + }, + { + "epoch": 0.41, + "grad_norm": 0.0754765370031017, + "learning_rate": 0.0006737158259489729, + "loss": 1.4546, + "step": 3772 + }, + { + "epoch": 0.41, + "grad_norm": 0.06987041998936272, + "learning_rate": 0.0006735525540531702, + "loss": 1.4971, + "step": 3773 + }, + { + "epoch": 0.41, + "grad_norm": 0.07846064142843433, + "learning_rate": 0.0006733892611135217, + "loss": 1.3602, + "step": 3774 + }, + { + "epoch": 0.41, + "grad_norm": 0.08199997856494941, + "learning_rate": 0.0006732259471498278, + "loss": 1.5195, + "step": 3775 + }, + { + "epoch": 0.41, + "grad_norm": 0.06705852406335323, + "learning_rate": 0.0006730626121818906, + "loss": 1.3127, + "step": 3776 + }, + { + "epoch": 0.41, + "grad_norm": 0.07146928164080467, + "learning_rate": 0.0006728992562295148, + "loss": 1.4246, + "step": 3777 + }, + { + "epoch": 0.41, + "grad_norm": 0.07467077035958926, + "learning_rate": 0.0006727358793125084, + "loss": 1.3603, + "step": 3778 + }, + { + "epoch": 0.41, + "grad_norm": 0.06747803921749279, + "learning_rate": 0.0006725724814506809, + "loss": 1.401, + "step": 3779 + }, + { + "epoch": 0.41, + "grad_norm": 0.07455585726671034, + "learning_rate": 0.0006724090626638451, + "loss": 1.3864, + "step": 3780 + }, + { + "epoch": 0.41, + "grad_norm": 0.06935011523991552, + "learning_rate": 0.0006722456229718162, + "loss": 1.3373, + "step": 3781 + }, + { + "epoch": 0.41, + "grad_norm": 0.07001330043357815, + "learning_rate": 0.0006720821623944117, + "loss": 1.3813, + "step": 3782 + }, + { + "epoch": 0.41, + "grad_norm": 0.07006791244257965, + "learning_rate": 0.0006719186809514516, + "loss": 1.4948, + "step": 3783 + }, + { + "epoch": 0.41, + "grad_norm": 0.08041409052706847, + "learning_rate": 0.000671755178662759, + "loss": 1.4941, + "step": 3784 + }, + { + "epoch": 0.41, + "grad_norm": 0.07903064459799436, + "learning_rate": 0.0006715916555481585, + "loss": 1.3258, + "step": 3785 + }, + { + "epoch": 0.41, + "grad_norm": 0.07790220948556671, + "learning_rate": 0.0006714281116274783, + "loss": 1.3577, + "step": 3786 + }, + { + "epoch": 0.41, + "grad_norm": 0.0683831054506245, + "learning_rate": 0.0006712645469205488, + "loss": 1.3955, + "step": 3787 + }, + { + "epoch": 0.41, + "grad_norm": 0.07375602721280128, + "learning_rate": 0.0006711009614472022, + "loss": 1.344, + "step": 3788 + }, + { + "epoch": 0.41, + "grad_norm": 0.0753083239073712, + "learning_rate": 0.0006709373552272744, + "loss": 1.3137, + "step": 3789 + }, + { + "epoch": 0.41, + "grad_norm": 0.08058980255270123, + "learning_rate": 0.0006707737282806029, + "loss": 1.4757, + "step": 3790 + }, + { + "epoch": 0.41, + "grad_norm": 0.07188458605519603, + "learning_rate": 0.000670610080627028, + "loss": 1.4421, + "step": 3791 + }, + { + "epoch": 0.41, + "grad_norm": 0.0745797053981257, + "learning_rate": 0.0006704464122863928, + "loss": 1.385, + "step": 3792 + }, + { + "epoch": 0.41, + "grad_norm": 0.0809561549922701, + "learning_rate": 0.0006702827232785425, + "loss": 1.5045, + "step": 3793 + }, + { + "epoch": 0.41, + "grad_norm": 0.07043268111949744, + "learning_rate": 0.0006701190136233249, + "loss": 1.4208, + "step": 3794 + }, + { + "epoch": 0.41, + "grad_norm": 0.06464522265885646, + "learning_rate": 0.0006699552833405903, + "loss": 1.4705, + "step": 3795 + }, + { + "epoch": 0.41, + "grad_norm": 0.06898157473720405, + "learning_rate": 0.0006697915324501918, + "loss": 1.3685, + "step": 3796 + }, + { + "epoch": 0.41, + "grad_norm": 0.06720721473990293, + "learning_rate": 0.0006696277609719845, + "loss": 1.2577, + "step": 3797 + }, + { + "epoch": 0.41, + "grad_norm": 0.069477039304262, + "learning_rate": 0.0006694639689258265, + "loss": 1.395, + "step": 3798 + }, + { + "epoch": 0.41, + "grad_norm": 0.067690922335574, + "learning_rate": 0.0006693001563315782, + "loss": 1.4064, + "step": 3799 + }, + { + "epoch": 0.41, + "grad_norm": 0.07169248794058057, + "learning_rate": 0.0006691363232091019, + "loss": 1.483, + "step": 3800 + }, + { + "epoch": 0.41, + "grad_norm": 0.07878728528871834, + "learning_rate": 0.0006689724695782635, + "loss": 1.5265, + "step": 3801 + }, + { + "epoch": 0.41, + "grad_norm": 0.07027898095463854, + "learning_rate": 0.0006688085954589307, + "loss": 1.4565, + "step": 3802 + }, + { + "epoch": 0.41, + "grad_norm": 0.0671644222122429, + "learning_rate": 0.0006686447008709735, + "loss": 1.3886, + "step": 3803 + }, + { + "epoch": 0.41, + "grad_norm": 0.10803177504795355, + "learning_rate": 0.0006684807858342652, + "loss": 1.4621, + "step": 3804 + }, + { + "epoch": 0.41, + "grad_norm": 0.06890699521351634, + "learning_rate": 0.0006683168503686806, + "loss": 1.3289, + "step": 3805 + }, + { + "epoch": 0.41, + "grad_norm": 0.06272061534057344, + "learning_rate": 0.0006681528944940977, + "loss": 1.4191, + "step": 3806 + }, + { + "epoch": 0.41, + "grad_norm": 0.06759035500761486, + "learning_rate": 0.0006679889182303966, + "loss": 1.4501, + "step": 3807 + }, + { + "epoch": 0.41, + "grad_norm": 0.07934872841569579, + "learning_rate": 0.00066782492159746, + "loss": 1.3587, + "step": 3808 + }, + { + "epoch": 0.41, + "grad_norm": 0.07583621080731881, + "learning_rate": 0.0006676609046151732, + "loss": 1.2725, + "step": 3809 + }, + { + "epoch": 0.41, + "grad_norm": 0.08046256366810259, + "learning_rate": 0.0006674968673034235, + "loss": 1.3842, + "step": 3810 + }, + { + "epoch": 0.41, + "grad_norm": 0.08949644141491607, + "learning_rate": 0.0006673328096821012, + "loss": 1.3413, + "step": 3811 + }, + { + "epoch": 0.41, + "grad_norm": 0.07477384583849257, + "learning_rate": 0.0006671687317710989, + "loss": 1.4916, + "step": 3812 + }, + { + "epoch": 0.41, + "grad_norm": 0.07549342598106959, + "learning_rate": 0.0006670046335903116, + "loss": 1.3989, + "step": 3813 + }, + { + "epoch": 0.41, + "grad_norm": 0.07054448270490725, + "learning_rate": 0.0006668405151596367, + "loss": 1.3175, + "step": 3814 + }, + { + "epoch": 0.41, + "grad_norm": 0.06726660936523654, + "learning_rate": 0.0006666763764989742, + "loss": 1.5434, + "step": 3815 + }, + { + "epoch": 0.41, + "grad_norm": 0.07343583098757128, + "learning_rate": 0.0006665122176282264, + "loss": 1.4393, + "step": 3816 + }, + { + "epoch": 0.41, + "grad_norm": 0.07954843590714228, + "learning_rate": 0.000666348038567298, + "loss": 1.4805, + "step": 3817 + }, + { + "epoch": 0.41, + "grad_norm": 0.0749554275646494, + "learning_rate": 0.0006661838393360966, + "loss": 1.3784, + "step": 3818 + }, + { + "epoch": 0.41, + "grad_norm": 0.07420331079114773, + "learning_rate": 0.0006660196199545317, + "loss": 1.3676, + "step": 3819 + }, + { + "epoch": 0.41, + "grad_norm": 0.06857050240511989, + "learning_rate": 0.0006658553804425156, + "loss": 1.4668, + "step": 3820 + }, + { + "epoch": 0.41, + "grad_norm": 0.07062870193427842, + "learning_rate": 0.0006656911208199627, + "loss": 1.4805, + "step": 3821 + }, + { + "epoch": 0.41, + "grad_norm": 0.07851558591594957, + "learning_rate": 0.0006655268411067903, + "loss": 1.4744, + "step": 3822 + }, + { + "epoch": 0.41, + "grad_norm": 0.07572325313412566, + "learning_rate": 0.0006653625413229177, + "loss": 1.4193, + "step": 3823 + }, + { + "epoch": 0.41, + "grad_norm": 0.07822957044039873, + "learning_rate": 0.000665198221488267, + "loss": 1.4386, + "step": 3824 + }, + { + "epoch": 0.41, + "grad_norm": 0.07604473764552287, + "learning_rate": 0.0006650338816227622, + "loss": 1.3252, + "step": 3825 + }, + { + "epoch": 0.41, + "grad_norm": 0.07524178279695189, + "learning_rate": 0.0006648695217463304, + "loss": 1.3825, + "step": 3826 + }, + { + "epoch": 0.41, + "grad_norm": 0.08116651197896221, + "learning_rate": 0.0006647051418789007, + "loss": 1.3457, + "step": 3827 + }, + { + "epoch": 0.41, + "grad_norm": 0.07209272417387406, + "learning_rate": 0.0006645407420404047, + "loss": 1.3494, + "step": 3828 + }, + { + "epoch": 0.41, + "grad_norm": 0.07602965698815438, + "learning_rate": 0.0006643763222507765, + "loss": 1.4471, + "step": 3829 + }, + { + "epoch": 0.41, + "grad_norm": 0.06920958645347326, + "learning_rate": 0.0006642118825299526, + "loss": 1.3819, + "step": 3830 + }, + { + "epoch": 0.41, + "grad_norm": 0.06868419959482276, + "learning_rate": 0.0006640474228978716, + "loss": 1.3569, + "step": 3831 + }, + { + "epoch": 0.41, + "grad_norm": 0.07442565049284244, + "learning_rate": 0.0006638829433744753, + "loss": 1.4242, + "step": 3832 + }, + { + "epoch": 0.41, + "grad_norm": 0.07469968744098102, + "learning_rate": 0.0006637184439797069, + "loss": 1.3941, + "step": 3833 + }, + { + "epoch": 0.41, + "grad_norm": 0.06954189557918407, + "learning_rate": 0.0006635539247335128, + "loss": 1.3249, + "step": 3834 + }, + { + "epoch": 0.41, + "grad_norm": 0.0788039300618776, + "learning_rate": 0.0006633893856558415, + "loss": 1.5244, + "step": 3835 + }, + { + "epoch": 0.41, + "grad_norm": 0.07227186871954694, + "learning_rate": 0.000663224826766644, + "loss": 1.4414, + "step": 3836 + }, + { + "epoch": 0.41, + "grad_norm": 0.07364965898774103, + "learning_rate": 0.0006630602480858734, + "loss": 1.4668, + "step": 3837 + }, + { + "epoch": 0.41, + "grad_norm": 0.0710617632092743, + "learning_rate": 0.0006628956496334856, + "loss": 1.4955, + "step": 3838 + }, + { + "epoch": 0.41, + "grad_norm": 0.07483232033855866, + "learning_rate": 0.0006627310314294385, + "loss": 1.4276, + "step": 3839 + }, + { + "epoch": 0.41, + "grad_norm": 0.07028946813885235, + "learning_rate": 0.0006625663934936932, + "loss": 1.4192, + "step": 3840 + }, + { + "epoch": 0.41, + "grad_norm": 0.06907195642106281, + "learning_rate": 0.0006624017358462122, + "loss": 1.509, + "step": 3841 + }, + { + "epoch": 0.41, + "grad_norm": 0.08082576245434056, + "learning_rate": 0.0006622370585069604, + "loss": 1.4527, + "step": 3842 + }, + { + "epoch": 0.41, + "grad_norm": 0.07246371293472104, + "learning_rate": 0.0006620723614959063, + "loss": 1.4888, + "step": 3843 + }, + { + "epoch": 0.41, + "grad_norm": 0.07731335402711258, + "learning_rate": 0.0006619076448330197, + "loss": 1.3546, + "step": 3844 + }, + { + "epoch": 0.41, + "grad_norm": 0.09375234814524461, + "learning_rate": 0.0006617429085382727, + "loss": 1.5487, + "step": 3845 + }, + { + "epoch": 0.41, + "grad_norm": 0.08419642597319515, + "learning_rate": 0.0006615781526316406, + "loss": 1.3322, + "step": 3846 + }, + { + "epoch": 0.41, + "grad_norm": 0.07497704397986803, + "learning_rate": 0.0006614133771331006, + "loss": 1.3227, + "step": 3847 + }, + { + "epoch": 0.41, + "grad_norm": 0.06673807622226641, + "learning_rate": 0.0006612485820626317, + "loss": 1.4786, + "step": 3848 + }, + { + "epoch": 0.41, + "grad_norm": 0.06545079125780927, + "learning_rate": 0.0006610837674402167, + "loss": 1.4818, + "step": 3849 + }, + { + "epoch": 0.41, + "grad_norm": 0.07666246902718891, + "learning_rate": 0.0006609189332858394, + "loss": 1.4476, + "step": 3850 + }, + { + "epoch": 0.41, + "grad_norm": 0.07178457217246913, + "learning_rate": 0.0006607540796194866, + "loss": 1.4296, + "step": 3851 + }, + { + "epoch": 0.41, + "grad_norm": 0.07540630488822511, + "learning_rate": 0.0006605892064611477, + "loss": 1.3576, + "step": 3852 + }, + { + "epoch": 0.41, + "grad_norm": 0.06905586130396371, + "learning_rate": 0.0006604243138308137, + "loss": 1.4058, + "step": 3853 + }, + { + "epoch": 0.41, + "grad_norm": 0.0762169441747839, + "learning_rate": 0.0006602594017484785, + "loss": 1.4798, + "step": 3854 + }, + { + "epoch": 0.41, + "grad_norm": 0.07945689113659538, + "learning_rate": 0.0006600944702341385, + "loss": 1.4323, + "step": 3855 + }, + { + "epoch": 0.41, + "grad_norm": 0.06695641836147707, + "learning_rate": 0.0006599295193077922, + "loss": 1.514, + "step": 3856 + }, + { + "epoch": 0.41, + "grad_norm": 0.06348489376560794, + "learning_rate": 0.0006597645489894399, + "loss": 1.3441, + "step": 3857 + }, + { + "epoch": 0.41, + "grad_norm": 0.07358283615988383, + "learning_rate": 0.0006595995592990855, + "loss": 1.3343, + "step": 3858 + }, + { + "epoch": 0.41, + "grad_norm": 0.06725547686839599, + "learning_rate": 0.0006594345502567342, + "loss": 1.4087, + "step": 3859 + }, + { + "epoch": 0.41, + "grad_norm": 0.08059962050387262, + "learning_rate": 0.0006592695218823942, + "loss": 1.4242, + "step": 3860 + }, + { + "epoch": 0.42, + "grad_norm": 0.07206294434314071, + "learning_rate": 0.0006591044741960754, + "loss": 1.3445, + "step": 3861 + }, + { + "epoch": 0.42, + "grad_norm": 0.06549271529864518, + "learning_rate": 0.0006589394072177907, + "loss": 1.3822, + "step": 3862 + }, + { + "epoch": 0.42, + "grad_norm": 0.07173780240938447, + "learning_rate": 0.0006587743209675549, + "loss": 1.5295, + "step": 3863 + }, + { + "epoch": 0.42, + "grad_norm": 0.07004691337394642, + "learning_rate": 0.0006586092154653854, + "loss": 1.4876, + "step": 3864 + }, + { + "epoch": 0.42, + "grad_norm": 0.07900190551261445, + "learning_rate": 0.0006584440907313016, + "loss": 1.4928, + "step": 3865 + }, + { + "epoch": 0.42, + "grad_norm": 0.07316898820714804, + "learning_rate": 0.0006582789467853256, + "loss": 1.3457, + "step": 3866 + }, + { + "epoch": 0.42, + "grad_norm": 0.06359569667254837, + "learning_rate": 0.0006581137836474816, + "loss": 1.3676, + "step": 3867 + }, + { + "epoch": 0.42, + "grad_norm": 0.06782660467001123, + "learning_rate": 0.0006579486013377963, + "loss": 1.3953, + "step": 3868 + }, + { + "epoch": 0.42, + "grad_norm": 0.06696194015947708, + "learning_rate": 0.0006577833998762985, + "loss": 1.4628, + "step": 3869 + }, + { + "epoch": 0.42, + "grad_norm": 0.0720934944292986, + "learning_rate": 0.0006576181792830193, + "loss": 1.395, + "step": 3870 + }, + { + "epoch": 0.42, + "grad_norm": 0.05904143097036177, + "learning_rate": 0.0006574529395779928, + "loss": 1.2883, + "step": 3871 + }, + { + "epoch": 0.42, + "grad_norm": 0.07144924842031333, + "learning_rate": 0.0006572876807812545, + "loss": 1.3083, + "step": 3872 + }, + { + "epoch": 0.42, + "grad_norm": 0.07096535038028348, + "learning_rate": 0.0006571224029128425, + "loss": 1.4658, + "step": 3873 + }, + { + "epoch": 0.42, + "grad_norm": 0.06858119737322739, + "learning_rate": 0.0006569571059927977, + "loss": 1.397, + "step": 3874 + }, + { + "epoch": 0.42, + "grad_norm": 0.07465588152502295, + "learning_rate": 0.0006567917900411625, + "loss": 1.4628, + "step": 3875 + }, + { + "epoch": 0.42, + "grad_norm": 0.07150645537132062, + "learning_rate": 0.0006566264550779821, + "loss": 1.3837, + "step": 3876 + }, + { + "epoch": 0.42, + "grad_norm": 0.06856592343026248, + "learning_rate": 0.0006564611011233043, + "loss": 1.3578, + "step": 3877 + }, + { + "epoch": 0.42, + "grad_norm": 0.06183930235911227, + "learning_rate": 0.0006562957281971785, + "loss": 1.3457, + "step": 3878 + }, + { + "epoch": 0.42, + "grad_norm": 0.06662531178283607, + "learning_rate": 0.0006561303363196568, + "loss": 1.357, + "step": 3879 + }, + { + "epoch": 0.42, + "grad_norm": 0.07518583399000245, + "learning_rate": 0.0006559649255107935, + "loss": 1.4672, + "step": 3880 + }, + { + "epoch": 0.42, + "grad_norm": 0.0693085282173378, + "learning_rate": 0.0006557994957906455, + "loss": 1.3528, + "step": 3881 + }, + { + "epoch": 0.42, + "grad_norm": 0.0728523880755816, + "learning_rate": 0.0006556340471792712, + "loss": 1.4816, + "step": 3882 + }, + { + "epoch": 0.42, + "grad_norm": 0.06637118795337808, + "learning_rate": 0.0006554685796967323, + "loss": 1.4812, + "step": 3883 + }, + { + "epoch": 0.42, + "grad_norm": 0.06861984917136406, + "learning_rate": 0.0006553030933630921, + "loss": 1.3983, + "step": 3884 + }, + { + "epoch": 0.42, + "grad_norm": 0.06529551777213016, + "learning_rate": 0.0006551375881984165, + "loss": 1.4422, + "step": 3885 + }, + { + "epoch": 0.42, + "grad_norm": 0.06858208005798797, + "learning_rate": 0.0006549720642227735, + "loss": 1.4688, + "step": 3886 + }, + { + "epoch": 0.42, + "grad_norm": 0.07143616441393273, + "learning_rate": 0.0006548065214562332, + "loss": 1.3626, + "step": 3887 + }, + { + "epoch": 0.42, + "grad_norm": 0.07404672141993965, + "learning_rate": 0.0006546409599188686, + "loss": 1.347, + "step": 3888 + }, + { + "epoch": 0.42, + "grad_norm": 0.06567975995952656, + "learning_rate": 0.0006544753796307547, + "loss": 1.3886, + "step": 3889 + }, + { + "epoch": 0.42, + "grad_norm": 0.06827963964218622, + "learning_rate": 0.0006543097806119682, + "loss": 1.3641, + "step": 3890 + }, + { + "epoch": 0.42, + "grad_norm": 0.06502890786809211, + "learning_rate": 0.000654144162882589, + "loss": 1.5318, + "step": 3891 + }, + { + "epoch": 0.42, + "grad_norm": 0.09549961420573892, + "learning_rate": 0.0006539785264626985, + "loss": 1.5141, + "step": 3892 + }, + { + "epoch": 0.42, + "grad_norm": 0.0696505156579795, + "learning_rate": 0.0006538128713723808, + "loss": 1.4501, + "step": 3893 + }, + { + "epoch": 0.42, + "grad_norm": 0.07241761729384924, + "learning_rate": 0.0006536471976317223, + "loss": 1.4871, + "step": 3894 + }, + { + "epoch": 0.42, + "grad_norm": 0.07391803138958859, + "learning_rate": 0.0006534815052608114, + "loss": 1.4029, + "step": 3895 + }, + { + "epoch": 0.42, + "grad_norm": 0.06593299094089043, + "learning_rate": 0.0006533157942797387, + "loss": 1.5064, + "step": 3896 + }, + { + "epoch": 0.42, + "grad_norm": 0.07575361796363428, + "learning_rate": 0.0006531500647085974, + "loss": 1.3223, + "step": 3897 + }, + { + "epoch": 0.42, + "grad_norm": 0.07197208892800534, + "learning_rate": 0.0006529843165674828, + "loss": 1.3693, + "step": 3898 + }, + { + "epoch": 0.42, + "grad_norm": 0.0776799924076783, + "learning_rate": 0.0006528185498764924, + "loss": 1.4277, + "step": 3899 + }, + { + "epoch": 0.42, + "grad_norm": 0.07989342835825704, + "learning_rate": 0.0006526527646557261, + "loss": 1.4741, + "step": 3900 + }, + { + "epoch": 0.42, + "grad_norm": 0.07733258169149501, + "learning_rate": 0.0006524869609252856, + "loss": 1.4633, + "step": 3901 + }, + { + "epoch": 0.42, + "grad_norm": 0.07556221365420827, + "learning_rate": 0.0006523211387052755, + "loss": 1.4267, + "step": 3902 + }, + { + "epoch": 0.42, + "grad_norm": 0.07166604005031749, + "learning_rate": 0.0006521552980158023, + "loss": 1.4879, + "step": 3903 + }, + { + "epoch": 0.42, + "grad_norm": 0.07005460949807596, + "learning_rate": 0.0006519894388769744, + "loss": 1.4087, + "step": 3904 + }, + { + "epoch": 0.42, + "grad_norm": 0.08364807345793063, + "learning_rate": 0.0006518235613089034, + "loss": 1.461, + "step": 3905 + }, + { + "epoch": 0.42, + "grad_norm": 0.07413803750026045, + "learning_rate": 0.0006516576653317019, + "loss": 1.3398, + "step": 3906 + }, + { + "epoch": 0.42, + "grad_norm": 0.07896706543787335, + "learning_rate": 0.0006514917509654857, + "loss": 1.3738, + "step": 3907 + }, + { + "epoch": 0.42, + "grad_norm": 0.0638598561453725, + "learning_rate": 0.0006513258182303724, + "loss": 1.4045, + "step": 3908 + }, + { + "epoch": 0.42, + "grad_norm": 0.06952205296431396, + "learning_rate": 0.000651159867146482, + "loss": 1.42, + "step": 3909 + }, + { + "epoch": 0.42, + "grad_norm": 0.06892081357631127, + "learning_rate": 0.0006509938977339366, + "loss": 1.4329, + "step": 3910 + }, + { + "epoch": 0.42, + "grad_norm": 0.06937126649817552, + "learning_rate": 0.0006508279100128605, + "loss": 1.3472, + "step": 3911 + }, + { + "epoch": 0.42, + "grad_norm": 0.06116336612759358, + "learning_rate": 0.0006506619040033804, + "loss": 1.4974, + "step": 3912 + }, + { + "epoch": 0.42, + "grad_norm": 0.06792960629230754, + "learning_rate": 0.0006504958797256249, + "loss": 1.5023, + "step": 3913 + }, + { + "epoch": 0.42, + "grad_norm": 0.07305324582122641, + "learning_rate": 0.0006503298371997252, + "loss": 1.3766, + "step": 3914 + }, + { + "epoch": 0.42, + "grad_norm": 0.06663241510812803, + "learning_rate": 0.0006501637764458145, + "loss": 1.3254, + "step": 3915 + }, + { + "epoch": 0.42, + "grad_norm": 0.0649843862468896, + "learning_rate": 0.0006499976974840281, + "loss": 1.4063, + "step": 3916 + }, + { + "epoch": 0.42, + "grad_norm": 0.06951734575941149, + "learning_rate": 0.0006498316003345039, + "loss": 1.3195, + "step": 3917 + }, + { + "epoch": 0.42, + "grad_norm": 0.062361862591796616, + "learning_rate": 0.0006496654850173815, + "loss": 1.3639, + "step": 3918 + }, + { + "epoch": 0.42, + "grad_norm": 0.07616360631714768, + "learning_rate": 0.0006494993515528031, + "loss": 1.516, + "step": 3919 + }, + { + "epoch": 0.42, + "grad_norm": 0.06141123114164976, + "learning_rate": 0.0006493331999609132, + "loss": 1.4371, + "step": 3920 + }, + { + "epoch": 0.42, + "grad_norm": 0.06802623499257793, + "learning_rate": 0.0006491670302618576, + "loss": 1.4329, + "step": 3921 + }, + { + "epoch": 0.42, + "grad_norm": 0.0772677461754373, + "learning_rate": 0.0006490008424757855, + "loss": 1.5025, + "step": 3922 + }, + { + "epoch": 0.42, + "grad_norm": 0.0743215556309416, + "learning_rate": 0.0006488346366228475, + "loss": 1.3276, + "step": 3923 + }, + { + "epoch": 0.42, + "grad_norm": 0.07205238649167922, + "learning_rate": 0.0006486684127231967, + "loss": 1.4187, + "step": 3924 + }, + { + "epoch": 0.42, + "grad_norm": 0.0643706136749596, + "learning_rate": 0.0006485021707969882, + "loss": 1.42, + "step": 3925 + }, + { + "epoch": 0.42, + "grad_norm": 0.06063894800539872, + "learning_rate": 0.0006483359108643798, + "loss": 1.3124, + "step": 3926 + }, + { + "epoch": 0.42, + "grad_norm": 0.06558860865385341, + "learning_rate": 0.0006481696329455307, + "loss": 1.5174, + "step": 3927 + }, + { + "epoch": 0.42, + "grad_norm": 0.07291103363275614, + "learning_rate": 0.0006480033370606027, + "loss": 1.5637, + "step": 3928 + }, + { + "epoch": 0.42, + "grad_norm": 0.06763422148856137, + "learning_rate": 0.0006478370232297599, + "loss": 1.5378, + "step": 3929 + }, + { + "epoch": 0.42, + "grad_norm": 0.06303247852882345, + "learning_rate": 0.0006476706914731683, + "loss": 1.3136, + "step": 3930 + }, + { + "epoch": 0.42, + "grad_norm": 0.08152561088657095, + "learning_rate": 0.0006475043418109965, + "loss": 1.4433, + "step": 3931 + }, + { + "epoch": 0.42, + "grad_norm": 0.06088011791680538, + "learning_rate": 0.0006473379742634144, + "loss": 1.3877, + "step": 3932 + }, + { + "epoch": 0.42, + "grad_norm": 0.07318502818351857, + "learning_rate": 0.0006471715888505951, + "loss": 1.4242, + "step": 3933 + }, + { + "epoch": 0.42, + "grad_norm": 0.06649810034662097, + "learning_rate": 0.0006470051855927134, + "loss": 1.4482, + "step": 3934 + }, + { + "epoch": 0.42, + "grad_norm": 0.07058262099194323, + "learning_rate": 0.000646838764509946, + "loss": 1.4439, + "step": 3935 + }, + { + "epoch": 0.42, + "grad_norm": 0.06514689463950991, + "learning_rate": 0.0006466723256224723, + "loss": 1.4394, + "step": 3936 + }, + { + "epoch": 0.42, + "grad_norm": 0.08531249688658517, + "learning_rate": 0.0006465058689504733, + "loss": 1.4133, + "step": 3937 + }, + { + "epoch": 0.42, + "grad_norm": 0.09075806648354802, + "learning_rate": 0.0006463393945141327, + "loss": 1.4447, + "step": 3938 + }, + { + "epoch": 0.42, + "grad_norm": 0.06925448902528702, + "learning_rate": 0.0006461729023336361, + "loss": 1.4942, + "step": 3939 + }, + { + "epoch": 0.42, + "grad_norm": 0.08057013127356427, + "learning_rate": 0.0006460063924291709, + "loss": 1.4033, + "step": 3940 + }, + { + "epoch": 0.42, + "grad_norm": 0.0744006256177114, + "learning_rate": 0.0006458398648209274, + "loss": 1.4624, + "step": 3941 + }, + { + "epoch": 0.42, + "grad_norm": 0.08683211100878094, + "learning_rate": 0.0006456733195290976, + "loss": 1.3895, + "step": 3942 + }, + { + "epoch": 0.42, + "grad_norm": 0.07420107511822488, + "learning_rate": 0.0006455067565738755, + "loss": 1.4124, + "step": 3943 + }, + { + "epoch": 0.42, + "grad_norm": 0.08515285186593229, + "learning_rate": 0.0006453401759754574, + "loss": 1.3782, + "step": 3944 + }, + { + "epoch": 0.42, + "grad_norm": 0.07716068943132816, + "learning_rate": 0.0006451735777540421, + "loss": 1.4545, + "step": 3945 + }, + { + "epoch": 0.42, + "grad_norm": 0.06785250313095319, + "learning_rate": 0.0006450069619298299, + "loss": 1.4342, + "step": 3946 + }, + { + "epoch": 0.42, + "grad_norm": 0.08320823203786909, + "learning_rate": 0.0006448403285230238, + "loss": 1.3588, + "step": 3947 + }, + { + "epoch": 0.42, + "grad_norm": 0.06851001898167915, + "learning_rate": 0.0006446736775538284, + "loss": 1.4696, + "step": 3948 + }, + { + "epoch": 0.42, + "grad_norm": 0.07424712344584676, + "learning_rate": 0.0006445070090424507, + "loss": 1.368, + "step": 3949 + }, + { + "epoch": 0.42, + "grad_norm": 0.08196924625145434, + "learning_rate": 0.0006443403230091001, + "loss": 1.5129, + "step": 3950 + }, + { + "epoch": 0.42, + "grad_norm": 0.07066837376848595, + "learning_rate": 0.0006441736194739878, + "loss": 1.4107, + "step": 3951 + }, + { + "epoch": 0.42, + "grad_norm": 0.07186081308313993, + "learning_rate": 0.0006440068984573271, + "loss": 1.3675, + "step": 3952 + }, + { + "epoch": 0.42, + "grad_norm": 0.08857135342229146, + "learning_rate": 0.0006438401599793333, + "loss": 1.3819, + "step": 3953 + }, + { + "epoch": 0.43, + "grad_norm": 0.07697014336064355, + "learning_rate": 0.0006436734040602244, + "loss": 1.4648, + "step": 3954 + }, + { + "epoch": 0.43, + "grad_norm": 0.0697777333276736, + "learning_rate": 0.0006435066307202197, + "loss": 1.5152, + "step": 3955 + }, + { + "epoch": 0.43, + "grad_norm": 0.07935233015115574, + "learning_rate": 0.0006433398399795415, + "loss": 1.3593, + "step": 3956 + }, + { + "epoch": 0.43, + "grad_norm": 0.07245156010257515, + "learning_rate": 0.0006431730318584135, + "loss": 1.5383, + "step": 3957 + }, + { + "epoch": 0.43, + "grad_norm": 0.06912143950929635, + "learning_rate": 0.0006430062063770618, + "loss": 1.3255, + "step": 3958 + }, + { + "epoch": 0.43, + "grad_norm": 0.06871716581567094, + "learning_rate": 0.0006428393635557146, + "loss": 1.377, + "step": 3959 + }, + { + "epoch": 0.43, + "grad_norm": 0.07122351216156715, + "learning_rate": 0.000642672503414602, + "loss": 1.4079, + "step": 3960 + }, + { + "epoch": 0.43, + "grad_norm": 0.07245998086323546, + "learning_rate": 0.0006425056259739566, + "loss": 1.5169, + "step": 3961 + }, + { + "epoch": 0.43, + "grad_norm": 0.07059710490678917, + "learning_rate": 0.0006423387312540126, + "loss": 1.3649, + "step": 3962 + }, + { + "epoch": 0.43, + "grad_norm": 0.07049724857677844, + "learning_rate": 0.0006421718192750069, + "loss": 1.451, + "step": 3963 + }, + { + "epoch": 0.43, + "grad_norm": 0.06736834343072567, + "learning_rate": 0.000642004890057178, + "loss": 1.4212, + "step": 3964 + }, + { + "epoch": 0.43, + "grad_norm": 0.07409561524105795, + "learning_rate": 0.0006418379436207664, + "loss": 1.5065, + "step": 3965 + }, + { + "epoch": 0.43, + "grad_norm": 0.07636154384442553, + "learning_rate": 0.0006416709799860152, + "loss": 1.3027, + "step": 3966 + }, + { + "epoch": 0.43, + "grad_norm": 0.07598801731212228, + "learning_rate": 0.0006415039991731694, + "loss": 1.4313, + "step": 3967 + }, + { + "epoch": 0.43, + "grad_norm": 0.07390579715074876, + "learning_rate": 0.0006413370012024759, + "loss": 1.3851, + "step": 3968 + }, + { + "epoch": 0.43, + "grad_norm": 0.0717562024777435, + "learning_rate": 0.0006411699860941835, + "loss": 1.474, + "step": 3969 + }, + { + "epoch": 0.43, + "grad_norm": 0.06909040510305756, + "learning_rate": 0.0006410029538685438, + "loss": 1.4227, + "step": 3970 + }, + { + "epoch": 0.43, + "grad_norm": 0.06699697627532278, + "learning_rate": 0.0006408359045458099, + "loss": 1.5094, + "step": 3971 + }, + { + "epoch": 0.43, + "grad_norm": 0.07224536624675192, + "learning_rate": 0.0006406688381462367, + "loss": 1.4341, + "step": 3972 + }, + { + "epoch": 0.43, + "grad_norm": 0.07865011939575806, + "learning_rate": 0.0006405017546900822, + "loss": 1.4284, + "step": 3973 + }, + { + "epoch": 0.43, + "grad_norm": 0.07174267385677884, + "learning_rate": 0.0006403346541976056, + "loss": 1.4537, + "step": 3974 + }, + { + "epoch": 0.43, + "grad_norm": 0.06841466489928297, + "learning_rate": 0.0006401675366890682, + "loss": 1.4555, + "step": 3975 + }, + { + "epoch": 0.43, + "grad_norm": 0.07168977043503581, + "learning_rate": 0.0006400004021847338, + "loss": 1.456, + "step": 3976 + }, + { + "epoch": 0.43, + "grad_norm": 0.08316765009925761, + "learning_rate": 0.000639833250704868, + "loss": 1.5264, + "step": 3977 + }, + { + "epoch": 0.43, + "grad_norm": 0.06598310254941155, + "learning_rate": 0.0006396660822697383, + "loss": 1.2632, + "step": 3978 + }, + { + "epoch": 0.43, + "grad_norm": 0.06861447351985102, + "learning_rate": 0.000639498896899615, + "loss": 1.3635, + "step": 3979 + }, + { + "epoch": 0.43, + "grad_norm": 0.06990865790262073, + "learning_rate": 0.0006393316946147692, + "loss": 1.3088, + "step": 3980 + }, + { + "epoch": 0.43, + "grad_norm": 0.06726246864470845, + "learning_rate": 0.0006391644754354751, + "loss": 1.4502, + "step": 3981 + }, + { + "epoch": 0.43, + "grad_norm": 0.07210289911724266, + "learning_rate": 0.0006389972393820087, + "loss": 1.4364, + "step": 3982 + }, + { + "epoch": 0.43, + "grad_norm": 0.0708583088350892, + "learning_rate": 0.0006388299864746477, + "loss": 1.4931, + "step": 3983 + }, + { + "epoch": 0.43, + "grad_norm": 0.06838091953553109, + "learning_rate": 0.0006386627167336724, + "loss": 1.3996, + "step": 3984 + }, + { + "epoch": 0.43, + "grad_norm": 0.07333635634297311, + "learning_rate": 0.0006384954301793647, + "loss": 1.424, + "step": 3985 + }, + { + "epoch": 0.43, + "grad_norm": 0.06392704323596084, + "learning_rate": 0.0006383281268320083, + "loss": 1.4805, + "step": 3986 + }, + { + "epoch": 0.43, + "grad_norm": 0.07239214592386142, + "learning_rate": 0.0006381608067118898, + "loss": 1.4503, + "step": 3987 + }, + { + "epoch": 0.43, + "grad_norm": 0.08377596451570307, + "learning_rate": 0.0006379934698392972, + "loss": 1.4575, + "step": 3988 + }, + { + "epoch": 0.43, + "grad_norm": 0.06512746205390593, + "learning_rate": 0.0006378261162345207, + "loss": 1.4052, + "step": 3989 + }, + { + "epoch": 0.43, + "grad_norm": 0.06566868198185125, + "learning_rate": 0.0006376587459178525, + "loss": 1.4508, + "step": 3990 + }, + { + "epoch": 0.43, + "grad_norm": 0.07464547519704835, + "learning_rate": 0.0006374913589095865, + "loss": 1.3952, + "step": 3991 + }, + { + "epoch": 0.43, + "grad_norm": 0.07711648505702222, + "learning_rate": 0.0006373239552300194, + "loss": 1.3761, + "step": 3992 + }, + { + "epoch": 0.43, + "grad_norm": 0.06850586335147511, + "learning_rate": 0.0006371565348994492, + "loss": 1.4297, + "step": 3993 + }, + { + "epoch": 0.43, + "grad_norm": 0.07199508377082078, + "learning_rate": 0.0006369890979381765, + "loss": 1.5062, + "step": 3994 + }, + { + "epoch": 0.43, + "grad_norm": 0.06794921663514014, + "learning_rate": 0.0006368216443665033, + "loss": 1.3346, + "step": 3995 + }, + { + "epoch": 0.43, + "grad_norm": 0.06926739185641112, + "learning_rate": 0.000636654174204734, + "loss": 1.48, + "step": 3996 + }, + { + "epoch": 0.43, + "grad_norm": 0.06888232643275677, + "learning_rate": 0.0006364866874731749, + "loss": 1.2917, + "step": 3997 + }, + { + "epoch": 0.43, + "grad_norm": 0.07729538672205612, + "learning_rate": 0.0006363191841921344, + "loss": 1.4689, + "step": 3998 + }, + { + "epoch": 0.43, + "grad_norm": 0.0666543134995083, + "learning_rate": 0.0006361516643819229, + "loss": 1.3915, + "step": 3999 + }, + { + "epoch": 0.43, + "grad_norm": 0.08126873599265884, + "learning_rate": 0.0006359841280628529, + "loss": 1.4609, + "step": 4000 + }, + { + "epoch": 0.43, + "grad_norm": 0.0778814602771884, + "learning_rate": 0.0006358165752552383, + "loss": 1.5199, + "step": 4001 + }, + { + "epoch": 0.43, + "grad_norm": 0.06526420166226415, + "learning_rate": 0.0006356490059793959, + "loss": 1.4386, + "step": 4002 + }, + { + "epoch": 0.43, + "grad_norm": 0.07223692530289383, + "learning_rate": 0.0006354814202556437, + "loss": 1.3302, + "step": 4003 + }, + { + "epoch": 0.43, + "grad_norm": 0.06565906223942776, + "learning_rate": 0.0006353138181043024, + "loss": 1.4046, + "step": 4004 + }, + { + "epoch": 0.43, + "grad_norm": 0.06969485520321976, + "learning_rate": 0.000635146199545694, + "loss": 1.4483, + "step": 4005 + }, + { + "epoch": 0.43, + "grad_norm": 0.0769829877285532, + "learning_rate": 0.000634978564600143, + "loss": 1.5446, + "step": 4006 + }, + { + "epoch": 0.43, + "grad_norm": 0.06600204951040782, + "learning_rate": 0.0006348109132879758, + "loss": 1.4368, + "step": 4007 + }, + { + "epoch": 0.43, + "grad_norm": 0.06277465263963822, + "learning_rate": 0.0006346432456295206, + "loss": 1.4953, + "step": 4008 + }, + { + "epoch": 0.43, + "grad_norm": 0.070869092214546, + "learning_rate": 0.0006344755616451075, + "loss": 1.3439, + "step": 4009 + }, + { + "epoch": 0.43, + "grad_norm": 0.0851992383775605, + "learning_rate": 0.0006343078613550692, + "loss": 1.2783, + "step": 4010 + }, + { + "epoch": 0.43, + "grad_norm": 0.06873785217748689, + "learning_rate": 0.0006341401447797395, + "loss": 1.4184, + "step": 4011 + }, + { + "epoch": 0.43, + "grad_norm": 0.08105491442820943, + "learning_rate": 0.0006339724119394548, + "loss": 1.4541, + "step": 4012 + }, + { + "epoch": 0.43, + "grad_norm": 0.09058309410357024, + "learning_rate": 0.0006338046628545533, + "loss": 1.3389, + "step": 4013 + }, + { + "epoch": 0.43, + "grad_norm": 0.07765474200223683, + "learning_rate": 0.0006336368975453751, + "loss": 1.3905, + "step": 4014 + }, + { + "epoch": 0.43, + "grad_norm": 0.08691514341346585, + "learning_rate": 0.0006334691160322625, + "loss": 1.3661, + "step": 4015 + }, + { + "epoch": 0.43, + "grad_norm": 0.07644850905371951, + "learning_rate": 0.0006333013183355594, + "loss": 1.4375, + "step": 4016 + }, + { + "epoch": 0.43, + "grad_norm": 0.06857380019815038, + "learning_rate": 0.0006331335044756118, + "loss": 1.3521, + "step": 4017 + }, + { + "epoch": 0.43, + "grad_norm": 0.08305252497824059, + "learning_rate": 0.0006329656744727679, + "loss": 1.3916, + "step": 4018 + }, + { + "epoch": 0.43, + "grad_norm": 0.08124314368644833, + "learning_rate": 0.0006327978283473775, + "loss": 1.3291, + "step": 4019 + }, + { + "epoch": 0.43, + "grad_norm": 0.0705597696963439, + "learning_rate": 0.0006326299661197925, + "loss": 1.3706, + "step": 4020 + }, + { + "epoch": 0.43, + "grad_norm": 0.07269778983693065, + "learning_rate": 0.000632462087810367, + "loss": 1.3945, + "step": 4021 + }, + { + "epoch": 0.43, + "grad_norm": 0.07146559700538933, + "learning_rate": 0.0006322941934394568, + "loss": 1.533, + "step": 4022 + }, + { + "epoch": 0.43, + "grad_norm": 0.08857351618785715, + "learning_rate": 0.0006321262830274192, + "loss": 1.4558, + "step": 4023 + }, + { + "epoch": 0.43, + "grad_norm": 0.07938377494020217, + "learning_rate": 0.0006319583565946147, + "loss": 1.3931, + "step": 4024 + }, + { + "epoch": 0.43, + "grad_norm": 0.07280102266618659, + "learning_rate": 0.0006317904141614043, + "loss": 1.3522, + "step": 4025 + }, + { + "epoch": 0.43, + "grad_norm": 0.07857482129301427, + "learning_rate": 0.0006316224557481518, + "loss": 1.4833, + "step": 4026 + }, + { + "epoch": 0.43, + "grad_norm": 0.06592901984933465, + "learning_rate": 0.0006314544813752229, + "loss": 1.4067, + "step": 4027 + }, + { + "epoch": 0.43, + "grad_norm": 0.07367040385418791, + "learning_rate": 0.0006312864910629848, + "loss": 1.4827, + "step": 4028 + }, + { + "epoch": 0.43, + "grad_norm": 0.0735499496321778, + "learning_rate": 0.0006311184848318071, + "loss": 1.4299, + "step": 4029 + }, + { + "epoch": 0.43, + "grad_norm": 0.07313825624311018, + "learning_rate": 0.0006309504627020611, + "loss": 1.4739, + "step": 4030 + }, + { + "epoch": 0.43, + "grad_norm": 0.06941465489720243, + "learning_rate": 0.0006307824246941199, + "loss": 1.4239, + "step": 4031 + }, + { + "epoch": 0.43, + "grad_norm": 0.07456618633064974, + "learning_rate": 0.0006306143708283592, + "loss": 1.3066, + "step": 4032 + }, + { + "epoch": 0.43, + "grad_norm": 0.0651709204419574, + "learning_rate": 0.0006304463011251554, + "loss": 1.3473, + "step": 4033 + }, + { + "epoch": 0.43, + "grad_norm": 0.06531237023428524, + "learning_rate": 0.000630278215604888, + "loss": 1.5217, + "step": 4034 + }, + { + "epoch": 0.43, + "grad_norm": 0.08111952291273701, + "learning_rate": 0.0006301101142879378, + "loss": 1.302, + "step": 4035 + }, + { + "epoch": 0.43, + "grad_norm": 0.07363733510463709, + "learning_rate": 0.0006299419971946876, + "loss": 1.446, + "step": 4036 + }, + { + "epoch": 0.43, + "grad_norm": 0.07311494980959608, + "learning_rate": 0.0006297738643455224, + "loss": 1.4033, + "step": 4037 + }, + { + "epoch": 0.43, + "grad_norm": 0.06715207919719457, + "learning_rate": 0.0006296057157608287, + "loss": 1.5365, + "step": 4038 + }, + { + "epoch": 0.43, + "grad_norm": 0.06925573041694051, + "learning_rate": 0.0006294375514609951, + "loss": 1.3796, + "step": 4039 + }, + { + "epoch": 0.43, + "grad_norm": 0.0819579049167436, + "learning_rate": 0.0006292693714664122, + "loss": 1.416, + "step": 4040 + }, + { + "epoch": 0.43, + "grad_norm": 0.06194321911570573, + "learning_rate": 0.0006291011757974722, + "loss": 1.4414, + "step": 4041 + }, + { + "epoch": 0.43, + "grad_norm": 0.07743332118321378, + "learning_rate": 0.0006289329644745698, + "loss": 1.4206, + "step": 4042 + }, + { + "epoch": 0.43, + "grad_norm": 0.07286834756737166, + "learning_rate": 0.0006287647375181009, + "loss": 1.4865, + "step": 4043 + }, + { + "epoch": 0.43, + "grad_norm": 0.06525179714561152, + "learning_rate": 0.0006285964949484637, + "loss": 1.2784, + "step": 4044 + }, + { + "epoch": 0.43, + "grad_norm": 0.07796288896846361, + "learning_rate": 0.0006284282367860579, + "loss": 1.4092, + "step": 4045 + }, + { + "epoch": 0.43, + "grad_norm": 0.07380537860259485, + "learning_rate": 0.0006282599630512858, + "loss": 1.5595, + "step": 4046 + }, + { + "epoch": 0.44, + "grad_norm": 0.07290283264188771, + "learning_rate": 0.000628091673764551, + "loss": 1.4883, + "step": 4047 + }, + { + "epoch": 0.44, + "grad_norm": 0.07557073453800928, + "learning_rate": 0.0006279233689462591, + "loss": 1.3976, + "step": 4048 + }, + { + "epoch": 0.44, + "grad_norm": 0.07190307484813235, + "learning_rate": 0.0006277550486168177, + "loss": 1.4121, + "step": 4049 + }, + { + "epoch": 0.44, + "grad_norm": 0.07124234491193457, + "learning_rate": 0.0006275867127966363, + "loss": 1.3455, + "step": 4050 + }, + { + "epoch": 0.44, + "grad_norm": 0.07187076898229122, + "learning_rate": 0.000627418361506126, + "loss": 1.3166, + "step": 4051 + }, + { + "epoch": 0.44, + "grad_norm": 0.06811580777354202, + "learning_rate": 0.0006272499947657002, + "loss": 1.5047, + "step": 4052 + }, + { + "epoch": 0.44, + "grad_norm": 0.07519583887683878, + "learning_rate": 0.000627081612595774, + "loss": 1.4374, + "step": 4053 + }, + { + "epoch": 0.44, + "grad_norm": 0.0658215338355946, + "learning_rate": 0.0006269132150167638, + "loss": 1.4198, + "step": 4054 + }, + { + "epoch": 0.44, + "grad_norm": 0.06636046398423283, + "learning_rate": 0.0006267448020490889, + "loss": 1.5282, + "step": 4055 + }, + { + "epoch": 0.44, + "grad_norm": 0.0756700595936872, + "learning_rate": 0.0006265763737131698, + "loss": 1.4572, + "step": 4056 + }, + { + "epoch": 0.44, + "grad_norm": 0.09031156746298717, + "learning_rate": 0.000626407930029429, + "loss": 1.4109, + "step": 4057 + }, + { + "epoch": 0.44, + "grad_norm": 0.07865278513159098, + "learning_rate": 0.0006262394710182909, + "loss": 1.3605, + "step": 4058 + }, + { + "epoch": 0.44, + "grad_norm": 0.06988033127417924, + "learning_rate": 0.0006260709967001816, + "loss": 1.4146, + "step": 4059 + }, + { + "epoch": 0.44, + "grad_norm": 0.07532677831427176, + "learning_rate": 0.0006259025070955295, + "loss": 1.4004, + "step": 4060 + }, + { + "epoch": 0.44, + "grad_norm": 0.07109168814063885, + "learning_rate": 0.0006257340022247643, + "loss": 1.5781, + "step": 4061 + }, + { + "epoch": 0.44, + "grad_norm": 0.06307341769528307, + "learning_rate": 0.0006255654821083178, + "loss": 1.3837, + "step": 4062 + }, + { + "epoch": 0.44, + "grad_norm": 0.0657165372516695, + "learning_rate": 0.0006253969467666238, + "loss": 1.4479, + "step": 4063 + }, + { + "epoch": 0.44, + "grad_norm": 0.0767516467964421, + "learning_rate": 0.0006252283962201177, + "loss": 1.3942, + "step": 4064 + }, + { + "epoch": 0.44, + "grad_norm": 0.07047450174961416, + "learning_rate": 0.0006250598304892368, + "loss": 1.4335, + "step": 4065 + }, + { + "epoch": 0.44, + "grad_norm": 0.0726381170938812, + "learning_rate": 0.0006248912495944203, + "loss": 1.4966, + "step": 4066 + }, + { + "epoch": 0.44, + "grad_norm": 0.07488327336903305, + "learning_rate": 0.0006247226535561092, + "loss": 1.4666, + "step": 4067 + }, + { + "epoch": 0.44, + "grad_norm": 0.06558892902830106, + "learning_rate": 0.0006245540423947462, + "loss": 1.4455, + "step": 4068 + }, + { + "epoch": 0.44, + "grad_norm": 0.07251446239916863, + "learning_rate": 0.0006243854161307765, + "loss": 1.5224, + "step": 4069 + }, + { + "epoch": 0.44, + "grad_norm": 0.08316498630023138, + "learning_rate": 0.0006242167747846461, + "loss": 1.5017, + "step": 4070 + }, + { + "epoch": 0.44, + "grad_norm": 0.07975250717728095, + "learning_rate": 0.0006240481183768036, + "loss": 1.3195, + "step": 4071 + }, + { + "epoch": 0.44, + "grad_norm": 0.07204884155011891, + "learning_rate": 0.000623879446927699, + "loss": 1.4155, + "step": 4072 + }, + { + "epoch": 0.44, + "grad_norm": 0.07334104065864457, + "learning_rate": 0.0006237107604577843, + "loss": 1.3981, + "step": 4073 + }, + { + "epoch": 0.44, + "grad_norm": 0.07842758263483142, + "learning_rate": 0.0006235420589875136, + "loss": 1.4745, + "step": 4074 + }, + { + "epoch": 0.44, + "grad_norm": 0.0705062668239952, + "learning_rate": 0.0006233733425373422, + "loss": 1.4089, + "step": 4075 + }, + { + "epoch": 0.44, + "grad_norm": 0.07756434672164429, + "learning_rate": 0.0006232046111277277, + "loss": 1.5183, + "step": 4076 + }, + { + "epoch": 0.44, + "grad_norm": 0.07135757491952958, + "learning_rate": 0.0006230358647791294, + "loss": 1.5568, + "step": 4077 + }, + { + "epoch": 0.44, + "grad_norm": 0.08626731930825417, + "learning_rate": 0.0006228671035120082, + "loss": 1.3864, + "step": 4078 + }, + { + "epoch": 0.44, + "grad_norm": 0.07264549223156583, + "learning_rate": 0.0006226983273468273, + "loss": 1.4313, + "step": 4079 + }, + { + "epoch": 0.44, + "grad_norm": 0.06987639541786198, + "learning_rate": 0.0006225295363040511, + "loss": 1.361, + "step": 4080 + }, + { + "epoch": 0.44, + "grad_norm": 0.06641572646235297, + "learning_rate": 0.0006223607304041462, + "loss": 1.5163, + "step": 4081 + }, + { + "epoch": 0.44, + "grad_norm": 0.06436519422690486, + "learning_rate": 0.0006221919096675808, + "loss": 1.4704, + "step": 4082 + }, + { + "epoch": 0.44, + "grad_norm": 0.07388014001773682, + "learning_rate": 0.0006220230741148253, + "loss": 1.5568, + "step": 4083 + }, + { + "epoch": 0.44, + "grad_norm": 0.06842339021724836, + "learning_rate": 0.0006218542237663513, + "loss": 1.4529, + "step": 4084 + }, + { + "epoch": 0.44, + "grad_norm": 0.07737167494281082, + "learning_rate": 0.0006216853586426325, + "loss": 1.4532, + "step": 4085 + }, + { + "epoch": 0.44, + "grad_norm": 0.08744856475360263, + "learning_rate": 0.0006215164787641446, + "loss": 1.3995, + "step": 4086 + }, + { + "epoch": 0.44, + "grad_norm": 0.08103564337406777, + "learning_rate": 0.0006213475841513646, + "loss": 1.2311, + "step": 4087 + }, + { + "epoch": 0.44, + "grad_norm": 0.07470751509790251, + "learning_rate": 0.0006211786748247716, + "loss": 1.4645, + "step": 4088 + }, + { + "epoch": 0.44, + "grad_norm": 0.06984095772926356, + "learning_rate": 0.0006210097508048466, + "loss": 1.3588, + "step": 4089 + }, + { + "epoch": 0.44, + "grad_norm": 0.07684632265692232, + "learning_rate": 0.0006208408121120723, + "loss": 1.432, + "step": 4090 + }, + { + "epoch": 0.44, + "grad_norm": 0.0648317073790488, + "learning_rate": 0.0006206718587669326, + "loss": 1.3795, + "step": 4091 + }, + { + "epoch": 0.44, + "grad_norm": 0.06610090338490285, + "learning_rate": 0.0006205028907899143, + "loss": 1.4399, + "step": 4092 + }, + { + "epoch": 0.44, + "grad_norm": 0.06986369171000673, + "learning_rate": 0.0006203339082015048, + "loss": 1.3131, + "step": 4093 + }, + { + "epoch": 0.44, + "grad_norm": 0.06672274588600151, + "learning_rate": 0.0006201649110221943, + "loss": 1.4506, + "step": 4094 + }, + { + "epoch": 0.44, + "grad_norm": 0.07641605904306757, + "learning_rate": 0.000619995899272474, + "loss": 1.4011, + "step": 4095 + }, + { + "epoch": 0.44, + "grad_norm": 0.07299058614515767, + "learning_rate": 0.0006198268729728371, + "loss": 1.3871, + "step": 4096 + }, + { + "epoch": 0.44, + "grad_norm": 0.06910804786567874, + "learning_rate": 0.0006196578321437789, + "loss": 1.2873, + "step": 4097 + }, + { + "epoch": 0.44, + "grad_norm": 0.0663791199619923, + "learning_rate": 0.000619488776805796, + "loss": 1.4373, + "step": 4098 + }, + { + "epoch": 0.44, + "grad_norm": 0.06725867211121205, + "learning_rate": 0.0006193197069793869, + "loss": 1.4147, + "step": 4099 + }, + { + "epoch": 0.44, + "grad_norm": 0.06213689170540425, + "learning_rate": 0.000619150622685052, + "loss": 1.4147, + "step": 4100 + }, + { + "epoch": 0.44, + "grad_norm": 0.058435811328203476, + "learning_rate": 0.0006189815239432935, + "loss": 1.4477, + "step": 4101 + }, + { + "epoch": 0.44, + "grad_norm": 0.06735167267567684, + "learning_rate": 0.0006188124107746148, + "loss": 1.3274, + "step": 4102 + }, + { + "epoch": 0.44, + "grad_norm": 0.07419176458112457, + "learning_rate": 0.0006186432831995218, + "loss": 1.3537, + "step": 4103 + }, + { + "epoch": 0.44, + "grad_norm": 0.07341471855142019, + "learning_rate": 0.0006184741412385217, + "loss": 1.5122, + "step": 4104 + }, + { + "epoch": 0.44, + "grad_norm": 0.06830118546681271, + "learning_rate": 0.0006183049849121233, + "loss": 1.3571, + "step": 4105 + }, + { + "epoch": 0.44, + "grad_norm": 0.0636175844946524, + "learning_rate": 0.000618135814240838, + "loss": 1.3388, + "step": 4106 + }, + { + "epoch": 0.44, + "grad_norm": 0.06988407450517839, + "learning_rate": 0.0006179666292451775, + "loss": 1.4889, + "step": 4107 + }, + { + "epoch": 0.44, + "grad_norm": 0.06755098834408893, + "learning_rate": 0.0006177974299456568, + "loss": 1.4158, + "step": 4108 + }, + { + "epoch": 0.44, + "grad_norm": 0.07359010079927847, + "learning_rate": 0.0006176282163627917, + "loss": 1.4536, + "step": 4109 + }, + { + "epoch": 0.44, + "grad_norm": 0.08025899963759743, + "learning_rate": 0.0006174589885170995, + "loss": 1.5464, + "step": 4110 + }, + { + "epoch": 0.44, + "grad_norm": 0.08035685133786531, + "learning_rate": 0.0006172897464291004, + "loss": 1.3692, + "step": 4111 + }, + { + "epoch": 0.44, + "grad_norm": 0.06498891842009204, + "learning_rate": 0.000617120490119315, + "loss": 1.5264, + "step": 4112 + }, + { + "epoch": 0.44, + "grad_norm": 0.06996794393706789, + "learning_rate": 0.0006169512196082663, + "loss": 1.484, + "step": 4113 + }, + { + "epoch": 0.44, + "grad_norm": 0.07656932570480635, + "learning_rate": 0.0006167819349164791, + "loss": 1.4086, + "step": 4114 + }, + { + "epoch": 0.44, + "grad_norm": 0.07442145054353973, + "learning_rate": 0.0006166126360644797, + "loss": 1.3669, + "step": 4115 + }, + { + "epoch": 0.44, + "grad_norm": 0.06996384412019334, + "learning_rate": 0.0006164433230727962, + "loss": 1.4832, + "step": 4116 + }, + { + "epoch": 0.44, + "grad_norm": 0.07644256155086539, + "learning_rate": 0.0006162739959619583, + "loss": 1.4555, + "step": 4117 + }, + { + "epoch": 0.44, + "grad_norm": 0.06486478925259731, + "learning_rate": 0.0006161046547524976, + "loss": 1.486, + "step": 4118 + }, + { + "epoch": 0.44, + "grad_norm": 0.07282495629633286, + "learning_rate": 0.0006159352994649469, + "loss": 1.4068, + "step": 4119 + }, + { + "epoch": 0.44, + "grad_norm": 0.0756036520808425, + "learning_rate": 0.0006157659301198418, + "loss": 1.5446, + "step": 4120 + }, + { + "epoch": 0.44, + "grad_norm": 0.07928837539192085, + "learning_rate": 0.0006155965467377186, + "loss": 1.3833, + "step": 4121 + }, + { + "epoch": 0.44, + "grad_norm": 0.07648852141287911, + "learning_rate": 0.0006154271493391155, + "loss": 1.4019, + "step": 4122 + }, + { + "epoch": 0.44, + "grad_norm": 0.08129914315194162, + "learning_rate": 0.0006152577379445725, + "loss": 1.4999, + "step": 4123 + }, + { + "epoch": 0.44, + "grad_norm": 0.06435353991840793, + "learning_rate": 0.0006150883125746314, + "loss": 1.429, + "step": 4124 + }, + { + "epoch": 0.44, + "grad_norm": 0.07344269169809117, + "learning_rate": 0.0006149188732498357, + "loss": 1.4553, + "step": 4125 + }, + { + "epoch": 0.44, + "grad_norm": 0.06969141501711071, + "learning_rate": 0.0006147494199907305, + "loss": 1.3447, + "step": 4126 + }, + { + "epoch": 0.44, + "grad_norm": 0.07913855361793129, + "learning_rate": 0.0006145799528178625, + "loss": 1.4756, + "step": 4127 + }, + { + "epoch": 0.44, + "grad_norm": 0.07453073475167775, + "learning_rate": 0.0006144104717517802, + "loss": 1.5231, + "step": 4128 + }, + { + "epoch": 0.44, + "grad_norm": 0.07847315579126765, + "learning_rate": 0.0006142409768130338, + "loss": 1.5112, + "step": 4129 + }, + { + "epoch": 0.44, + "grad_norm": 0.06889484864529667, + "learning_rate": 0.0006140714680221749, + "loss": 1.3361, + "step": 4130 + }, + { + "epoch": 0.44, + "grad_norm": 0.07564516476238581, + "learning_rate": 0.0006139019453997575, + "loss": 1.3954, + "step": 4131 + }, + { + "epoch": 0.44, + "grad_norm": 0.06694427175894813, + "learning_rate": 0.0006137324089663365, + "loss": 1.4801, + "step": 4132 + }, + { + "epoch": 0.44, + "grad_norm": 0.06993820649694431, + "learning_rate": 0.0006135628587424687, + "loss": 1.3924, + "step": 4133 + }, + { + "epoch": 0.44, + "grad_norm": 0.06788981217297224, + "learning_rate": 0.0006133932947487129, + "loss": 1.3172, + "step": 4134 + }, + { + "epoch": 0.44, + "grad_norm": 0.07762909048019521, + "learning_rate": 0.0006132237170056291, + "loss": 1.3408, + "step": 4135 + }, + { + "epoch": 0.44, + "grad_norm": 0.07190854220257663, + "learning_rate": 0.0006130541255337791, + "loss": 1.3979, + "step": 4136 + }, + { + "epoch": 0.44, + "grad_norm": 0.07586277240340727, + "learning_rate": 0.0006128845203537269, + "loss": 1.3706, + "step": 4137 + }, + { + "epoch": 0.44, + "grad_norm": 0.08398736072378425, + "learning_rate": 0.0006127149014860374, + "loss": 1.4149, + "step": 4138 + }, + { + "epoch": 0.44, + "grad_norm": 0.07868961218092786, + "learning_rate": 0.0006125452689512774, + "loss": 1.4638, + "step": 4139 + }, + { + "epoch": 0.45, + "grad_norm": 0.06358548319471856, + "learning_rate": 0.0006123756227700155, + "loss": 1.3557, + "step": 4140 + }, + { + "epoch": 0.45, + "grad_norm": 0.06691413699984927, + "learning_rate": 0.000612205962962822, + "loss": 1.4534, + "step": 4141 + }, + { + "epoch": 0.45, + "grad_norm": 0.06411817333259076, + "learning_rate": 0.0006120362895502687, + "loss": 1.4033, + "step": 4142 + }, + { + "epoch": 0.45, + "grad_norm": 0.061015608518283186, + "learning_rate": 0.0006118666025529292, + "loss": 1.4078, + "step": 4143 + }, + { + "epoch": 0.45, + "grad_norm": 0.06242891829313593, + "learning_rate": 0.0006116969019913781, + "loss": 1.6057, + "step": 4144 + }, + { + "epoch": 0.45, + "grad_norm": 0.07128735272434375, + "learning_rate": 0.0006115271878861928, + "loss": 1.5334, + "step": 4145 + }, + { + "epoch": 0.45, + "grad_norm": 0.06366021932361336, + "learning_rate": 0.0006113574602579515, + "loss": 1.376, + "step": 4146 + }, + { + "epoch": 0.45, + "grad_norm": 0.07725408870783299, + "learning_rate": 0.0006111877191272339, + "loss": 1.3857, + "step": 4147 + }, + { + "epoch": 0.45, + "grad_norm": 0.07624293185866599, + "learning_rate": 0.0006110179645146224, + "loss": 1.3786, + "step": 4148 + }, + { + "epoch": 0.45, + "grad_norm": 0.06575853794546838, + "learning_rate": 0.0006108481964407, + "loss": 1.3347, + "step": 4149 + }, + { + "epoch": 0.45, + "grad_norm": 0.07375952009354378, + "learning_rate": 0.0006106784149260513, + "loss": 1.4017, + "step": 4150 + }, + { + "epoch": 0.45, + "grad_norm": 0.07059410446959481, + "learning_rate": 0.0006105086199912635, + "loss": 1.3164, + "step": 4151 + }, + { + "epoch": 0.45, + "grad_norm": 0.07501949153880226, + "learning_rate": 0.0006103388116569244, + "loss": 1.5203, + "step": 4152 + }, + { + "epoch": 0.45, + "grad_norm": 0.06371702168963504, + "learning_rate": 0.000610168989943624, + "loss": 1.4431, + "step": 4153 + }, + { + "epoch": 0.45, + "grad_norm": 0.06411394548266706, + "learning_rate": 0.0006099991548719539, + "loss": 1.5797, + "step": 4154 + }, + { + "epoch": 0.45, + "grad_norm": 0.07495407167224134, + "learning_rate": 0.000609829306462507, + "loss": 1.4326, + "step": 4155 + }, + { + "epoch": 0.45, + "grad_norm": 0.06631718739285854, + "learning_rate": 0.0006096594447358778, + "loss": 1.4092, + "step": 4156 + }, + { + "epoch": 0.45, + "grad_norm": 0.09938044865046552, + "learning_rate": 0.0006094895697126631, + "loss": 1.5385, + "step": 4157 + }, + { + "epoch": 0.45, + "grad_norm": 0.07461912150158716, + "learning_rate": 0.0006093196814134605, + "loss": 1.4419, + "step": 4158 + }, + { + "epoch": 0.45, + "grad_norm": 0.07312130807661461, + "learning_rate": 0.0006091497798588699, + "loss": 1.4032, + "step": 4159 + }, + { + "epoch": 0.45, + "grad_norm": 0.07582986448445779, + "learning_rate": 0.000608979865069492, + "loss": 1.4466, + "step": 4160 + }, + { + "epoch": 0.45, + "grad_norm": 0.07880974884557526, + "learning_rate": 0.0006088099370659297, + "loss": 1.4004, + "step": 4161 + }, + { + "epoch": 0.45, + "grad_norm": 0.07458495001160687, + "learning_rate": 0.0006086399958687875, + "loss": 1.4518, + "step": 4162 + }, + { + "epoch": 0.45, + "grad_norm": 0.06578258899976172, + "learning_rate": 0.0006084700414986712, + "loss": 1.3905, + "step": 4163 + }, + { + "epoch": 0.45, + "grad_norm": 0.06810011832709272, + "learning_rate": 0.0006083000739761884, + "loss": 1.5071, + "step": 4164 + }, + { + "epoch": 0.45, + "grad_norm": 0.07909807518347882, + "learning_rate": 0.0006081300933219485, + "loss": 1.5617, + "step": 4165 + }, + { + "epoch": 0.45, + "grad_norm": 0.06909964402324427, + "learning_rate": 0.0006079600995565617, + "loss": 1.4311, + "step": 4166 + }, + { + "epoch": 0.45, + "grad_norm": 0.07644740121314376, + "learning_rate": 0.0006077900927006408, + "loss": 1.5397, + "step": 4167 + }, + { + "epoch": 0.45, + "grad_norm": 0.07791533648735414, + "learning_rate": 0.0006076200727747994, + "loss": 1.4011, + "step": 4168 + }, + { + "epoch": 0.45, + "grad_norm": 0.07737760669772996, + "learning_rate": 0.0006074500397996533, + "loss": 1.3309, + "step": 4169 + }, + { + "epoch": 0.45, + "grad_norm": 0.06895771259558378, + "learning_rate": 0.0006072799937958195, + "loss": 1.3748, + "step": 4170 + }, + { + "epoch": 0.45, + "grad_norm": 0.07586645523731778, + "learning_rate": 0.0006071099347839164, + "loss": 1.5111, + "step": 4171 + }, + { + "epoch": 0.45, + "grad_norm": 0.07236277970051411, + "learning_rate": 0.0006069398627845645, + "loss": 1.3702, + "step": 4172 + }, + { + "epoch": 0.45, + "grad_norm": 0.07276198083117988, + "learning_rate": 0.0006067697778183856, + "loss": 1.3674, + "step": 4173 + }, + { + "epoch": 0.45, + "grad_norm": 0.06649290069396598, + "learning_rate": 0.0006065996799060031, + "loss": 1.406, + "step": 4174 + }, + { + "epoch": 0.45, + "grad_norm": 0.085528893791016, + "learning_rate": 0.0006064295690680418, + "loss": 1.4379, + "step": 4175 + }, + { + "epoch": 0.45, + "grad_norm": 0.07056553332509218, + "learning_rate": 0.0006062594453251285, + "loss": 1.5733, + "step": 4176 + }, + { + "epoch": 0.45, + "grad_norm": 0.06910344638946603, + "learning_rate": 0.0006060893086978909, + "loss": 1.5036, + "step": 4177 + }, + { + "epoch": 0.45, + "grad_norm": 0.06867054468060886, + "learning_rate": 0.0006059191592069589, + "loss": 1.367, + "step": 4178 + }, + { + "epoch": 0.45, + "grad_norm": 0.07692617680362644, + "learning_rate": 0.0006057489968729638, + "loss": 1.516, + "step": 4179 + }, + { + "epoch": 0.45, + "grad_norm": 0.06656309146573545, + "learning_rate": 0.0006055788217165383, + "loss": 1.4344, + "step": 4180 + }, + { + "epoch": 0.45, + "grad_norm": 0.06873108247286198, + "learning_rate": 0.0006054086337583165, + "loss": 1.4402, + "step": 4181 + }, + { + "epoch": 0.45, + "grad_norm": 0.07460649744446407, + "learning_rate": 0.0006052384330189346, + "loss": 1.4741, + "step": 4182 + }, + { + "epoch": 0.45, + "grad_norm": 0.07849888148581544, + "learning_rate": 0.0006050682195190299, + "loss": 1.3884, + "step": 4183 + }, + { + "epoch": 0.45, + "grad_norm": 0.0652682809816125, + "learning_rate": 0.0006048979932792413, + "loss": 1.3857, + "step": 4184 + }, + { + "epoch": 0.45, + "grad_norm": 0.07701868904645039, + "learning_rate": 0.0006047277543202095, + "loss": 1.4581, + "step": 4185 + }, + { + "epoch": 0.45, + "grad_norm": 0.06849774253433671, + "learning_rate": 0.0006045575026625762, + "loss": 1.3599, + "step": 4186 + }, + { + "epoch": 0.45, + "grad_norm": 0.09158825757368835, + "learning_rate": 0.0006043872383269854, + "loss": 1.3715, + "step": 4187 + }, + { + "epoch": 0.45, + "grad_norm": 0.07279400872351754, + "learning_rate": 0.000604216961334082, + "loss": 1.3506, + "step": 4188 + }, + { + "epoch": 0.45, + "grad_norm": 0.06912852463603696, + "learning_rate": 0.0006040466717045127, + "loss": 1.4299, + "step": 4189 + }, + { + "epoch": 0.45, + "grad_norm": 0.06763599779398562, + "learning_rate": 0.0006038763694589258, + "loss": 1.3763, + "step": 4190 + }, + { + "epoch": 0.45, + "grad_norm": 0.08951066042952609, + "learning_rate": 0.0006037060546179712, + "loss": 1.4625, + "step": 4191 + }, + { + "epoch": 0.45, + "grad_norm": 0.06326259809844488, + "learning_rate": 0.0006035357272022997, + "loss": 1.4229, + "step": 4192 + }, + { + "epoch": 0.45, + "grad_norm": 0.07451116529186678, + "learning_rate": 0.0006033653872325644, + "loss": 1.4435, + "step": 4193 + }, + { + "epoch": 0.45, + "grad_norm": 0.07508265893694721, + "learning_rate": 0.0006031950347294196, + "loss": 1.4549, + "step": 4194 + }, + { + "epoch": 0.45, + "grad_norm": 0.07826032094894797, + "learning_rate": 0.0006030246697135209, + "loss": 1.4891, + "step": 4195 + }, + { + "epoch": 0.45, + "grad_norm": 0.07645274230350402, + "learning_rate": 0.000602854292205526, + "loss": 1.3029, + "step": 4196 + }, + { + "epoch": 0.45, + "grad_norm": 0.06920033529496816, + "learning_rate": 0.0006026839022260935, + "loss": 1.412, + "step": 4197 + }, + { + "epoch": 0.45, + "grad_norm": 0.06704934333095056, + "learning_rate": 0.0006025134997958839, + "loss": 1.2994, + "step": 4198 + }, + { + "epoch": 0.45, + "grad_norm": 0.07423418599780926, + "learning_rate": 0.000602343084935559, + "loss": 1.3095, + "step": 4199 + }, + { + "epoch": 0.45, + "grad_norm": 0.07390960821320829, + "learning_rate": 0.0006021726576657822, + "loss": 1.3535, + "step": 4200 + }, + { + "epoch": 0.45, + "grad_norm": 0.07476816449840105, + "learning_rate": 0.0006020022180072184, + "loss": 1.4619, + "step": 4201 + }, + { + "epoch": 0.45, + "grad_norm": 0.07002103516627521, + "learning_rate": 0.0006018317659805341, + "loss": 1.3879, + "step": 4202 + }, + { + "epoch": 0.45, + "grad_norm": 0.0773920967736904, + "learning_rate": 0.000601661301606397, + "loss": 1.5078, + "step": 4203 + }, + { + "epoch": 0.45, + "grad_norm": 0.07217551586812339, + "learning_rate": 0.0006014908249054767, + "loss": 1.4657, + "step": 4204 + }, + { + "epoch": 0.45, + "grad_norm": 0.07543631118107905, + "learning_rate": 0.000601320335898444, + "loss": 1.4275, + "step": 4205 + }, + { + "epoch": 0.45, + "grad_norm": 0.07223024017477711, + "learning_rate": 0.0006011498346059712, + "loss": 1.3861, + "step": 4206 + }, + { + "epoch": 0.45, + "grad_norm": 0.07472220435048994, + "learning_rate": 0.0006009793210487322, + "loss": 1.4108, + "step": 4207 + }, + { + "epoch": 0.45, + "grad_norm": 0.07639838356520745, + "learning_rate": 0.0006008087952474024, + "loss": 1.4983, + "step": 4208 + }, + { + "epoch": 0.45, + "grad_norm": 0.0701511264724409, + "learning_rate": 0.0006006382572226587, + "loss": 1.3362, + "step": 4209 + }, + { + "epoch": 0.45, + "grad_norm": 0.06770431572735086, + "learning_rate": 0.0006004677069951792, + "loss": 1.4401, + "step": 4210 + }, + { + "epoch": 0.45, + "grad_norm": 0.07739188467731939, + "learning_rate": 0.0006002971445856441, + "loss": 1.4421, + "step": 4211 + }, + { + "epoch": 0.45, + "grad_norm": 0.07539465339428007, + "learning_rate": 0.0006001265700147344, + "loss": 1.5035, + "step": 4212 + }, + { + "epoch": 0.45, + "grad_norm": 0.07078364592406552, + "learning_rate": 0.0005999559833031327, + "loss": 1.3512, + "step": 4213 + }, + { + "epoch": 0.45, + "grad_norm": 0.07865502303660855, + "learning_rate": 0.0005997853844715237, + "loss": 1.4592, + "step": 4214 + }, + { + "epoch": 0.45, + "grad_norm": 0.06829634894502473, + "learning_rate": 0.0005996147735405925, + "loss": 1.3703, + "step": 4215 + }, + { + "epoch": 0.45, + "grad_norm": 0.0781852996100699, + "learning_rate": 0.0005994441505310269, + "loss": 1.3338, + "step": 4216 + }, + { + "epoch": 0.45, + "grad_norm": 0.07062610585794313, + "learning_rate": 0.0005992735154635151, + "loss": 1.4492, + "step": 4217 + }, + { + "epoch": 0.45, + "grad_norm": 0.07909815760336844, + "learning_rate": 0.0005991028683587471, + "loss": 1.3787, + "step": 4218 + }, + { + "epoch": 0.45, + "grad_norm": 0.07063792835909273, + "learning_rate": 0.000598932209237415, + "loss": 1.363, + "step": 4219 + }, + { + "epoch": 0.45, + "grad_norm": 0.08213431406304056, + "learning_rate": 0.0005987615381202112, + "loss": 1.3396, + "step": 4220 + }, + { + "epoch": 0.45, + "grad_norm": 0.07766459822155773, + "learning_rate": 0.0005985908550278305, + "loss": 1.4481, + "step": 4221 + }, + { + "epoch": 0.45, + "grad_norm": 0.07358174371250564, + "learning_rate": 0.0005984201599809689, + "loss": 1.37, + "step": 4222 + }, + { + "epoch": 0.45, + "grad_norm": 0.06590233723419557, + "learning_rate": 0.0005982494530003233, + "loss": 1.3558, + "step": 4223 + }, + { + "epoch": 0.45, + "grad_norm": 0.0682442044026668, + "learning_rate": 0.0005980787341065929, + "loss": 1.4652, + "step": 4224 + }, + { + "epoch": 0.45, + "grad_norm": 0.07891003356492753, + "learning_rate": 0.000597908003320478, + "loss": 1.4018, + "step": 4225 + }, + { + "epoch": 0.45, + "grad_norm": 0.06639971946412793, + "learning_rate": 0.00059773726066268, + "loss": 1.3917, + "step": 4226 + }, + { + "epoch": 0.45, + "grad_norm": 0.07335602496727649, + "learning_rate": 0.0005975665061539022, + "loss": 1.3812, + "step": 4227 + }, + { + "epoch": 0.45, + "grad_norm": 0.07410909291067329, + "learning_rate": 0.0005973957398148493, + "loss": 1.6306, + "step": 4228 + }, + { + "epoch": 0.45, + "grad_norm": 0.06263196785177541, + "learning_rate": 0.000597224961666227, + "loss": 1.4055, + "step": 4229 + }, + { + "epoch": 0.45, + "grad_norm": 0.06865685568182275, + "learning_rate": 0.0005970541717287431, + "loss": 1.4365, + "step": 4230 + }, + { + "epoch": 0.45, + "grad_norm": 0.07007647676666574, + "learning_rate": 0.0005968833700231062, + "loss": 1.564, + "step": 4231 + }, + { + "epoch": 0.45, + "grad_norm": 0.0759327624640143, + "learning_rate": 0.0005967125565700265, + "loss": 1.3756, + "step": 4232 + }, + { + "epoch": 0.46, + "grad_norm": 0.0683659235338103, + "learning_rate": 0.0005965417313902162, + "loss": 1.3516, + "step": 4233 + }, + { + "epoch": 0.46, + "grad_norm": 0.059114031755854154, + "learning_rate": 0.000596370894504388, + "loss": 1.3317, + "step": 4234 + }, + { + "epoch": 0.46, + "grad_norm": 0.06596578435593728, + "learning_rate": 0.0005962000459332566, + "loss": 1.3701, + "step": 4235 + }, + { + "epoch": 0.46, + "grad_norm": 0.056174330967718004, + "learning_rate": 0.000596029185697538, + "loss": 1.4079, + "step": 4236 + }, + { + "epoch": 0.46, + "grad_norm": 0.06556568495064191, + "learning_rate": 0.0005958583138179494, + "loss": 1.3345, + "step": 4237 + }, + { + "epoch": 0.46, + "grad_norm": 0.07692922309310339, + "learning_rate": 0.00059568743031521, + "loss": 1.4097, + "step": 4238 + }, + { + "epoch": 0.46, + "grad_norm": 0.06730920807440371, + "learning_rate": 0.0005955165352100398, + "loss": 1.3266, + "step": 4239 + }, + { + "epoch": 0.46, + "grad_norm": 0.06838790689808874, + "learning_rate": 0.0005953456285231602, + "loss": 1.433, + "step": 4240 + }, + { + "epoch": 0.46, + "grad_norm": 0.06915277142703034, + "learning_rate": 0.0005951747102752946, + "loss": 1.429, + "step": 4241 + }, + { + "epoch": 0.46, + "grad_norm": 0.07277351313611133, + "learning_rate": 0.0005950037804871673, + "loss": 1.5549, + "step": 4242 + }, + { + "epoch": 0.46, + "grad_norm": 0.06650552884193628, + "learning_rate": 0.0005948328391795038, + "loss": 1.3798, + "step": 4243 + }, + { + "epoch": 0.46, + "grad_norm": 0.07550184977976046, + "learning_rate": 0.000594661886373032, + "loss": 1.3489, + "step": 4244 + }, + { + "epoch": 0.46, + "grad_norm": 0.06434529850760629, + "learning_rate": 0.00059449092208848, + "loss": 1.5048, + "step": 4245 + }, + { + "epoch": 0.46, + "grad_norm": 0.06134605286144569, + "learning_rate": 0.0005943199463465779, + "loss": 1.4264, + "step": 4246 + }, + { + "epoch": 0.46, + "grad_norm": 0.07135379478202215, + "learning_rate": 0.000594148959168057, + "loss": 1.4708, + "step": 4247 + }, + { + "epoch": 0.46, + "grad_norm": 0.0657081744632789, + "learning_rate": 0.0005939779605736504, + "loss": 1.4203, + "step": 4248 + }, + { + "epoch": 0.46, + "grad_norm": 0.09721619161803492, + "learning_rate": 0.0005938069505840919, + "loss": 1.5514, + "step": 4249 + }, + { + "epoch": 0.46, + "grad_norm": 0.06960076559949502, + "learning_rate": 0.0005936359292201174, + "loss": 1.4348, + "step": 4250 + }, + { + "epoch": 0.46, + "grad_norm": 0.0681511654307289, + "learning_rate": 0.0005934648965024636, + "loss": 1.2808, + "step": 4251 + }, + { + "epoch": 0.46, + "grad_norm": 0.0665974511558386, + "learning_rate": 0.0005932938524518689, + "loss": 1.548, + "step": 4252 + }, + { + "epoch": 0.46, + "grad_norm": 0.06975298195353812, + "learning_rate": 0.0005931227970890731, + "loss": 1.5667, + "step": 4253 + }, + { + "epoch": 0.46, + "grad_norm": 0.0671332270408465, + "learning_rate": 0.000592951730434817, + "loss": 1.3778, + "step": 4254 + }, + { + "epoch": 0.46, + "grad_norm": 0.062144346033571184, + "learning_rate": 0.0005927806525098431, + "loss": 1.4931, + "step": 4255 + }, + { + "epoch": 0.46, + "grad_norm": 0.06924031944664585, + "learning_rate": 0.0005926095633348952, + "loss": 1.4989, + "step": 4256 + }, + { + "epoch": 0.46, + "grad_norm": 0.06798196988097031, + "learning_rate": 0.0005924384629307184, + "loss": 1.4264, + "step": 4257 + }, + { + "epoch": 0.46, + "grad_norm": 0.07981962984856707, + "learning_rate": 0.0005922673513180596, + "loss": 1.379, + "step": 4258 + }, + { + "epoch": 0.46, + "grad_norm": 0.07231588001633618, + "learning_rate": 0.0005920962285176661, + "loss": 1.3043, + "step": 4259 + }, + { + "epoch": 0.46, + "grad_norm": 0.0742394001366356, + "learning_rate": 0.0005919250945502874, + "loss": 1.4649, + "step": 4260 + }, + { + "epoch": 0.46, + "grad_norm": 0.0699345660413472, + "learning_rate": 0.0005917539494366741, + "loss": 1.3223, + "step": 4261 + }, + { + "epoch": 0.46, + "grad_norm": 0.07213363643829698, + "learning_rate": 0.0005915827931975782, + "loss": 1.3135, + "step": 4262 + }, + { + "epoch": 0.46, + "grad_norm": 0.07930439093632695, + "learning_rate": 0.0005914116258537527, + "loss": 1.423, + "step": 4263 + }, + { + "epoch": 0.46, + "grad_norm": 0.07094944522631236, + "learning_rate": 0.0005912404474259526, + "loss": 1.4871, + "step": 4264 + }, + { + "epoch": 0.46, + "grad_norm": 0.07994927967283738, + "learning_rate": 0.0005910692579349336, + "loss": 1.4013, + "step": 4265 + }, + { + "epoch": 0.46, + "grad_norm": 0.0739574577729969, + "learning_rate": 0.0005908980574014532, + "loss": 1.3338, + "step": 4266 + }, + { + "epoch": 0.46, + "grad_norm": 0.07033576100419545, + "learning_rate": 0.00059072684584627, + "loss": 1.3731, + "step": 4267 + }, + { + "epoch": 0.46, + "grad_norm": 0.0693006408871248, + "learning_rate": 0.0005905556232901437, + "loss": 1.4151, + "step": 4268 + }, + { + "epoch": 0.46, + "grad_norm": 0.0771275790905268, + "learning_rate": 0.000590384389753836, + "loss": 1.3806, + "step": 4269 + }, + { + "epoch": 0.46, + "grad_norm": 0.08802543077110558, + "learning_rate": 0.0005902131452581096, + "loss": 1.4195, + "step": 4270 + }, + { + "epoch": 0.46, + "grad_norm": 0.07815218988772281, + "learning_rate": 0.0005900418898237281, + "loss": 1.3446, + "step": 4271 + }, + { + "epoch": 0.46, + "grad_norm": 0.07661324405037143, + "learning_rate": 0.0005898706234714572, + "loss": 1.5356, + "step": 4272 + }, + { + "epoch": 0.46, + "grad_norm": 0.07920864004503088, + "learning_rate": 0.0005896993462220633, + "loss": 1.3312, + "step": 4273 + }, + { + "epoch": 0.46, + "grad_norm": 0.07351431361264203, + "learning_rate": 0.0005895280580963144, + "loss": 1.3353, + "step": 4274 + }, + { + "epoch": 0.46, + "grad_norm": 0.0829204899421725, + "learning_rate": 0.0005893567591149799, + "loss": 1.4148, + "step": 4275 + }, + { + "epoch": 0.46, + "grad_norm": 0.0731711710914952, + "learning_rate": 0.0005891854492988302, + "loss": 1.4031, + "step": 4276 + }, + { + "epoch": 0.46, + "grad_norm": 0.07697900216587132, + "learning_rate": 0.0005890141286686375, + "loss": 1.4344, + "step": 4277 + }, + { + "epoch": 0.46, + "grad_norm": 0.08406706969300401, + "learning_rate": 0.0005888427972451746, + "loss": 1.4329, + "step": 4278 + }, + { + "epoch": 0.46, + "grad_norm": 0.07000902617829333, + "learning_rate": 0.0005886714550492163, + "loss": 1.5284, + "step": 4279 + }, + { + "epoch": 0.46, + "grad_norm": 0.07299706209596549, + "learning_rate": 0.0005885001021015383, + "loss": 1.5595, + "step": 4280 + }, + { + "epoch": 0.46, + "grad_norm": 0.06511331156240723, + "learning_rate": 0.0005883287384229181, + "loss": 1.4026, + "step": 4281 + }, + { + "epoch": 0.46, + "grad_norm": 0.08615882506688809, + "learning_rate": 0.0005881573640341335, + "loss": 1.3818, + "step": 4282 + }, + { + "epoch": 0.46, + "grad_norm": 0.0665931298491308, + "learning_rate": 0.0005879859789559649, + "loss": 1.489, + "step": 4283 + }, + { + "epoch": 0.46, + "grad_norm": 0.06736534693938794, + "learning_rate": 0.0005878145832091929, + "loss": 1.4695, + "step": 4284 + }, + { + "epoch": 0.46, + "grad_norm": 0.07637517648261326, + "learning_rate": 0.0005876431768145997, + "loss": 1.4606, + "step": 4285 + }, + { + "epoch": 0.46, + "grad_norm": 0.07295821930998996, + "learning_rate": 0.0005874717597929696, + "loss": 1.2971, + "step": 4286 + }, + { + "epoch": 0.46, + "grad_norm": 0.06826614255036653, + "learning_rate": 0.0005873003321650868, + "loss": 1.4135, + "step": 4287 + }, + { + "epoch": 0.46, + "grad_norm": 0.07459899080496685, + "learning_rate": 0.0005871288939517377, + "loss": 1.4705, + "step": 4288 + }, + { + "epoch": 0.46, + "grad_norm": 0.07846593078913912, + "learning_rate": 0.00058695744517371, + "loss": 1.4887, + "step": 4289 + }, + { + "epoch": 0.46, + "grad_norm": 0.07214161019843333, + "learning_rate": 0.0005867859858517922, + "loss": 1.3555, + "step": 4290 + }, + { + "epoch": 0.46, + "grad_norm": 0.06799052813777913, + "learning_rate": 0.0005866145160067745, + "loss": 1.5964, + "step": 4291 + }, + { + "epoch": 0.46, + "grad_norm": 0.06470531737409344, + "learning_rate": 0.000586443035659448, + "loss": 1.4667, + "step": 4292 + }, + { + "epoch": 0.46, + "grad_norm": 0.06948444252266078, + "learning_rate": 0.0005862715448306055, + "loss": 1.4447, + "step": 4293 + }, + { + "epoch": 0.46, + "grad_norm": 0.06420616680388852, + "learning_rate": 0.0005861000435410407, + "loss": 1.4562, + "step": 4294 + }, + { + "epoch": 0.46, + "grad_norm": 0.07606806237403266, + "learning_rate": 0.0005859285318115488, + "loss": 1.4291, + "step": 4295 + }, + { + "epoch": 0.46, + "grad_norm": 0.06970676132945107, + "learning_rate": 0.0005857570096629262, + "loss": 1.5096, + "step": 4296 + }, + { + "epoch": 0.46, + "grad_norm": 0.07218587347413118, + "learning_rate": 0.0005855854771159706, + "loss": 1.4344, + "step": 4297 + }, + { + "epoch": 0.46, + "grad_norm": 0.0869640303145645, + "learning_rate": 0.0005854139341914808, + "loss": 1.4323, + "step": 4298 + }, + { + "epoch": 0.46, + "grad_norm": 0.06296273663509053, + "learning_rate": 0.0005852423809102568, + "loss": 1.3476, + "step": 4299 + }, + { + "epoch": 0.46, + "grad_norm": 0.06801967942797818, + "learning_rate": 0.0005850708172931005, + "loss": 1.4381, + "step": 4300 + }, + { + "epoch": 0.46, + "grad_norm": 0.07313413676647226, + "learning_rate": 0.0005848992433608142, + "loss": 1.4988, + "step": 4301 + }, + { + "epoch": 0.46, + "grad_norm": 0.07855268328588744, + "learning_rate": 0.0005847276591342021, + "loss": 1.3393, + "step": 4302 + }, + { + "epoch": 0.46, + "grad_norm": 0.07208810569830114, + "learning_rate": 0.0005845560646340691, + "loss": 1.4611, + "step": 4303 + }, + { + "epoch": 0.46, + "grad_norm": 0.07485263168250995, + "learning_rate": 0.0005843844598812218, + "loss": 1.394, + "step": 4304 + }, + { + "epoch": 0.46, + "grad_norm": 0.06973519268508456, + "learning_rate": 0.0005842128448964676, + "loss": 1.3173, + "step": 4305 + }, + { + "epoch": 0.46, + "grad_norm": 0.06435508387831129, + "learning_rate": 0.000584041219700616, + "loss": 1.4103, + "step": 4306 + }, + { + "epoch": 0.46, + "grad_norm": 0.0719765280377028, + "learning_rate": 0.0005838695843144766, + "loss": 1.3714, + "step": 4307 + }, + { + "epoch": 0.46, + "grad_norm": 0.08143997254921763, + "learning_rate": 0.000583697938758861, + "loss": 1.5099, + "step": 4308 + }, + { + "epoch": 0.46, + "grad_norm": 0.06992837500062335, + "learning_rate": 0.0005835262830545816, + "loss": 1.3621, + "step": 4309 + }, + { + "epoch": 0.46, + "grad_norm": 0.0667326641216811, + "learning_rate": 0.0005833546172224526, + "loss": 1.3966, + "step": 4310 + }, + { + "epoch": 0.46, + "grad_norm": 0.07025203654887667, + "learning_rate": 0.0005831829412832888, + "loss": 1.3464, + "step": 4311 + }, + { + "epoch": 0.46, + "grad_norm": 0.07092819341479226, + "learning_rate": 0.0005830112552579067, + "loss": 1.4383, + "step": 4312 + }, + { + "epoch": 0.46, + "grad_norm": 0.06969893660218596, + "learning_rate": 0.0005828395591671238, + "loss": 1.3822, + "step": 4313 + }, + { + "epoch": 0.46, + "grad_norm": 0.07472487467182264, + "learning_rate": 0.0005826678530317585, + "loss": 1.3483, + "step": 4314 + }, + { + "epoch": 0.46, + "grad_norm": 0.08045792356361967, + "learning_rate": 0.0005824961368726311, + "loss": 1.2365, + "step": 4315 + }, + { + "epoch": 0.46, + "grad_norm": 0.07539767278084446, + "learning_rate": 0.0005823244107105627, + "loss": 1.4309, + "step": 4316 + }, + { + "epoch": 0.46, + "grad_norm": 0.06989912203654702, + "learning_rate": 0.0005821526745663758, + "loss": 1.3415, + "step": 4317 + }, + { + "epoch": 0.46, + "grad_norm": 0.07123258473151052, + "learning_rate": 0.000581980928460894, + "loss": 1.3155, + "step": 4318 + }, + { + "epoch": 0.46, + "grad_norm": 0.07321426972149589, + "learning_rate": 0.0005818091724149417, + "loss": 1.5035, + "step": 4319 + }, + { + "epoch": 0.46, + "grad_norm": 0.06861095456012398, + "learning_rate": 0.0005816374064493453, + "loss": 1.4533, + "step": 4320 + }, + { + "epoch": 0.46, + "grad_norm": 0.062160527409850376, + "learning_rate": 0.000581465630584932, + "loss": 1.5065, + "step": 4321 + }, + { + "epoch": 0.46, + "grad_norm": 0.08033920097956411, + "learning_rate": 0.0005812938448425299, + "loss": 1.4659, + "step": 4322 + }, + { + "epoch": 0.46, + "grad_norm": 0.06743837135831846, + "learning_rate": 0.0005811220492429692, + "loss": 1.4259, + "step": 4323 + }, + { + "epoch": 0.46, + "grad_norm": 0.07239672168722948, + "learning_rate": 0.0005809502438070801, + "loss": 1.3405, + "step": 4324 + }, + { + "epoch": 0.46, + "grad_norm": 0.07073635590191434, + "learning_rate": 0.000580778428555695, + "loss": 1.3189, + "step": 4325 + }, + { + "epoch": 0.47, + "grad_norm": 0.07614227248503913, + "learning_rate": 0.0005806066035096468, + "loss": 1.3165, + "step": 4326 + }, + { + "epoch": 0.47, + "grad_norm": 0.06700790849180441, + "learning_rate": 0.0005804347686897702, + "loss": 1.3281, + "step": 4327 + }, + { + "epoch": 0.47, + "grad_norm": 0.06791159995686306, + "learning_rate": 0.0005802629241169005, + "loss": 1.5144, + "step": 4328 + }, + { + "epoch": 0.47, + "grad_norm": 0.06581161360888663, + "learning_rate": 0.0005800910698118745, + "loss": 1.6056, + "step": 4329 + }, + { + "epoch": 0.47, + "grad_norm": 0.07091361131731538, + "learning_rate": 0.0005799192057955303, + "loss": 1.3676, + "step": 4330 + }, + { + "epoch": 0.47, + "grad_norm": 0.06892622529489716, + "learning_rate": 0.0005797473320887068, + "loss": 1.3971, + "step": 4331 + }, + { + "epoch": 0.47, + "grad_norm": 0.06963628707063342, + "learning_rate": 0.0005795754487122444, + "loss": 1.4121, + "step": 4332 + }, + { + "epoch": 0.47, + "grad_norm": 0.07610530573128409, + "learning_rate": 0.0005794035556869843, + "loss": 1.4506, + "step": 4333 + }, + { + "epoch": 0.47, + "grad_norm": 0.06066084045830262, + "learning_rate": 0.0005792316530337696, + "loss": 1.5658, + "step": 4334 + }, + { + "epoch": 0.47, + "grad_norm": 0.0625241482545445, + "learning_rate": 0.0005790597407734437, + "loss": 1.3852, + "step": 4335 + }, + { + "epoch": 0.47, + "grad_norm": 0.07533639979979685, + "learning_rate": 0.0005788878189268516, + "loss": 1.4158, + "step": 4336 + }, + { + "epoch": 0.47, + "grad_norm": 0.063951064948048, + "learning_rate": 0.0005787158875148397, + "loss": 1.6017, + "step": 4337 + }, + { + "epoch": 0.47, + "grad_norm": 0.07185074481823407, + "learning_rate": 0.0005785439465582548, + "loss": 1.3789, + "step": 4338 + }, + { + "epoch": 0.47, + "grad_norm": 0.067738052717109, + "learning_rate": 0.0005783719960779458, + "loss": 1.2404, + "step": 4339 + }, + { + "epoch": 0.47, + "grad_norm": 0.06693252733909952, + "learning_rate": 0.000578200036094762, + "loss": 1.4805, + "step": 4340 + }, + { + "epoch": 0.47, + "grad_norm": 0.0707406767223583, + "learning_rate": 0.0005780280666295543, + "loss": 1.3044, + "step": 4341 + }, + { + "epoch": 0.47, + "grad_norm": 0.07245024813674909, + "learning_rate": 0.0005778560877031743, + "loss": 1.4834, + "step": 4342 + }, + { + "epoch": 0.47, + "grad_norm": 0.07076343834104014, + "learning_rate": 0.0005776840993364754, + "loss": 1.4862, + "step": 4343 + }, + { + "epoch": 0.47, + "grad_norm": 0.0771692706601467, + "learning_rate": 0.0005775121015503117, + "loss": 1.5121, + "step": 4344 + }, + { + "epoch": 0.47, + "grad_norm": 0.07575869372023414, + "learning_rate": 0.0005773400943655385, + "loss": 1.4231, + "step": 4345 + }, + { + "epoch": 0.47, + "grad_norm": 0.08331985864502259, + "learning_rate": 0.0005771680778030122, + "loss": 1.4944, + "step": 4346 + }, + { + "epoch": 0.47, + "grad_norm": 0.0978213404908466, + "learning_rate": 0.0005769960518835902, + "loss": 1.385, + "step": 4347 + }, + { + "epoch": 0.47, + "grad_norm": 0.07030500514431032, + "learning_rate": 0.0005768240166281317, + "loss": 1.384, + "step": 4348 + }, + { + "epoch": 0.47, + "grad_norm": 0.0731870634540761, + "learning_rate": 0.0005766519720574963, + "loss": 1.5135, + "step": 4349 + }, + { + "epoch": 0.47, + "grad_norm": 0.07700046097610479, + "learning_rate": 0.0005764799181925449, + "loss": 1.468, + "step": 4350 + }, + { + "epoch": 0.47, + "grad_norm": 0.0820989017585397, + "learning_rate": 0.0005763078550541399, + "loss": 1.4766, + "step": 4351 + }, + { + "epoch": 0.47, + "grad_norm": 0.06727729877440962, + "learning_rate": 0.0005761357826631443, + "loss": 1.4807, + "step": 4352 + }, + { + "epoch": 0.47, + "grad_norm": 0.0710779421387419, + "learning_rate": 0.0005759637010404226, + "loss": 1.4129, + "step": 4353 + }, + { + "epoch": 0.47, + "grad_norm": 0.0792314675429011, + "learning_rate": 0.0005757916102068402, + "loss": 1.5648, + "step": 4354 + }, + { + "epoch": 0.47, + "grad_norm": 0.0694154213276982, + "learning_rate": 0.000575619510183264, + "loss": 1.3538, + "step": 4355 + }, + { + "epoch": 0.47, + "grad_norm": 0.0635064723999925, + "learning_rate": 0.0005754474009905613, + "loss": 1.3959, + "step": 4356 + }, + { + "epoch": 0.47, + "grad_norm": 0.06970467537810597, + "learning_rate": 0.0005752752826496012, + "loss": 1.4013, + "step": 4357 + }, + { + "epoch": 0.47, + "grad_norm": 0.07260049255850706, + "learning_rate": 0.0005751031551812536, + "loss": 1.3132, + "step": 4358 + }, + { + "epoch": 0.47, + "grad_norm": 0.06955971587200312, + "learning_rate": 0.0005749310186063892, + "loss": 1.3798, + "step": 4359 + }, + { + "epoch": 0.47, + "grad_norm": 0.0679171340858405, + "learning_rate": 0.0005747588729458809, + "loss": 1.526, + "step": 4360 + }, + { + "epoch": 0.47, + "grad_norm": 0.0673009342150572, + "learning_rate": 0.0005745867182206012, + "loss": 1.4129, + "step": 4361 + }, + { + "epoch": 0.47, + "grad_norm": 0.07326363249587493, + "learning_rate": 0.000574414554451425, + "loss": 1.4206, + "step": 4362 + }, + { + "epoch": 0.47, + "grad_norm": 0.06410449649673688, + "learning_rate": 0.0005742423816592274, + "loss": 1.3288, + "step": 4363 + }, + { + "epoch": 0.47, + "grad_norm": 0.06985286145175268, + "learning_rate": 0.0005740701998648849, + "loss": 1.3431, + "step": 4364 + }, + { + "epoch": 0.47, + "grad_norm": 0.07041595467515156, + "learning_rate": 0.0005738980090892757, + "loss": 1.4546, + "step": 4365 + }, + { + "epoch": 0.47, + "grad_norm": 0.07074849648123872, + "learning_rate": 0.0005737258093532781, + "loss": 1.4942, + "step": 4366 + }, + { + "epoch": 0.47, + "grad_norm": 0.06451965099343698, + "learning_rate": 0.0005735536006777716, + "loss": 1.4386, + "step": 4367 + }, + { + "epoch": 0.47, + "grad_norm": 0.07518999046838014, + "learning_rate": 0.0005733813830836376, + "loss": 1.5297, + "step": 4368 + }, + { + "epoch": 0.47, + "grad_norm": 0.06507127189100723, + "learning_rate": 0.000573209156591758, + "loss": 1.4292, + "step": 4369 + }, + { + "epoch": 0.47, + "grad_norm": 0.061908429397981614, + "learning_rate": 0.0005730369212230157, + "loss": 1.4905, + "step": 4370 + }, + { + "epoch": 0.47, + "grad_norm": 0.07486397178116465, + "learning_rate": 0.0005728646769982951, + "loss": 1.3875, + "step": 4371 + }, + { + "epoch": 0.47, + "grad_norm": 0.06995650744403155, + "learning_rate": 0.0005726924239384809, + "loss": 1.3832, + "step": 4372 + }, + { + "epoch": 0.47, + "grad_norm": 0.06642041686352557, + "learning_rate": 0.0005725201620644598, + "loss": 1.3663, + "step": 4373 + }, + { + "epoch": 0.47, + "grad_norm": 0.06838526746301918, + "learning_rate": 0.0005723478913971191, + "loss": 1.4206, + "step": 4374 + }, + { + "epoch": 0.47, + "grad_norm": 0.06994726686310161, + "learning_rate": 0.0005721756119573471, + "loss": 1.3621, + "step": 4375 + }, + { + "epoch": 0.47, + "grad_norm": 0.07335027824922698, + "learning_rate": 0.0005720033237660334, + "loss": 1.38, + "step": 4376 + }, + { + "epoch": 0.47, + "grad_norm": 0.07132320370362646, + "learning_rate": 0.0005718310268440683, + "loss": 1.5475, + "step": 4377 + }, + { + "epoch": 0.47, + "grad_norm": 0.07136276677595751, + "learning_rate": 0.0005716587212123435, + "loss": 1.4212, + "step": 4378 + }, + { + "epoch": 0.47, + "grad_norm": 0.06487403418814532, + "learning_rate": 0.0005714864068917519, + "loss": 1.4789, + "step": 4379 + }, + { + "epoch": 0.47, + "grad_norm": 0.0772161895483952, + "learning_rate": 0.0005713140839031871, + "loss": 1.4534, + "step": 4380 + }, + { + "epoch": 0.47, + "grad_norm": 0.06981128959729192, + "learning_rate": 0.0005711417522675437, + "loss": 1.4698, + "step": 4381 + }, + { + "epoch": 0.47, + "grad_norm": 0.07765049782121027, + "learning_rate": 0.0005709694120057174, + "loss": 1.3064, + "step": 4382 + }, + { + "epoch": 0.47, + "grad_norm": 0.07241987185501046, + "learning_rate": 0.0005707970631386054, + "loss": 1.4083, + "step": 4383 + }, + { + "epoch": 0.47, + "grad_norm": 0.07081369496009181, + "learning_rate": 0.0005706247056871053, + "loss": 1.4168, + "step": 4384 + }, + { + "epoch": 0.47, + "grad_norm": 0.06703084002299411, + "learning_rate": 0.0005704523396721162, + "loss": 1.4372, + "step": 4385 + }, + { + "epoch": 0.47, + "grad_norm": 0.08033733458464856, + "learning_rate": 0.000570279965114538, + "loss": 1.2662, + "step": 4386 + }, + { + "epoch": 0.47, + "grad_norm": 0.0781908026380043, + "learning_rate": 0.0005701075820352718, + "loss": 1.4461, + "step": 4387 + }, + { + "epoch": 0.47, + "grad_norm": 0.07027921368888441, + "learning_rate": 0.0005699351904552196, + "loss": 1.38, + "step": 4388 + }, + { + "epoch": 0.47, + "grad_norm": 0.07984943706970755, + "learning_rate": 0.0005697627903952844, + "loss": 1.3059, + "step": 4389 + }, + { + "epoch": 0.47, + "grad_norm": 0.07262529776501059, + "learning_rate": 0.0005695903818763703, + "loss": 1.439, + "step": 4390 + }, + { + "epoch": 0.47, + "grad_norm": 0.06658673915041431, + "learning_rate": 0.0005694179649193826, + "loss": 1.2767, + "step": 4391 + }, + { + "epoch": 0.47, + "grad_norm": 0.07230322011342759, + "learning_rate": 0.0005692455395452272, + "loss": 1.6068, + "step": 4392 + }, + { + "epoch": 0.47, + "grad_norm": 0.07247693991750412, + "learning_rate": 0.0005690731057748116, + "loss": 1.4641, + "step": 4393 + }, + { + "epoch": 0.47, + "grad_norm": 0.08824859279901634, + "learning_rate": 0.0005689006636290436, + "loss": 1.397, + "step": 4394 + }, + { + "epoch": 0.47, + "grad_norm": 0.07435843692754146, + "learning_rate": 0.0005687282131288326, + "loss": 1.4981, + "step": 4395 + }, + { + "epoch": 0.47, + "grad_norm": 0.06866404893479804, + "learning_rate": 0.000568555754295089, + "loss": 1.3269, + "step": 4396 + }, + { + "epoch": 0.47, + "grad_norm": 0.06631490616956406, + "learning_rate": 0.0005683832871487237, + "loss": 1.3666, + "step": 4397 + }, + { + "epoch": 0.47, + "grad_norm": 0.07461689282658565, + "learning_rate": 0.0005682108117106492, + "loss": 1.442, + "step": 4398 + }, + { + "epoch": 0.47, + "grad_norm": 0.07629678720903356, + "learning_rate": 0.0005680383280017785, + "loss": 1.4704, + "step": 4399 + }, + { + "epoch": 0.47, + "grad_norm": 0.10030040676241504, + "learning_rate": 0.000567865836043026, + "loss": 1.4009, + "step": 4400 + }, + { + "epoch": 0.47, + "grad_norm": 0.07009548150800028, + "learning_rate": 0.0005676933358553068, + "loss": 1.3269, + "step": 4401 + }, + { + "epoch": 0.47, + "grad_norm": 0.07800720543403114, + "learning_rate": 0.0005675208274595376, + "loss": 1.4252, + "step": 4402 + }, + { + "epoch": 0.47, + "grad_norm": 0.06738892904108221, + "learning_rate": 0.0005673483108766348, + "loss": 1.3973, + "step": 4403 + }, + { + "epoch": 0.47, + "grad_norm": 0.06857122139693983, + "learning_rate": 0.0005671757861275175, + "loss": 1.3929, + "step": 4404 + }, + { + "epoch": 0.47, + "grad_norm": 0.07166471776508838, + "learning_rate": 0.0005670032532331045, + "loss": 1.4055, + "step": 4405 + }, + { + "epoch": 0.47, + "grad_norm": 0.06517833141153938, + "learning_rate": 0.0005668307122143159, + "loss": 1.4336, + "step": 4406 + }, + { + "epoch": 0.47, + "grad_norm": 0.07899339662584212, + "learning_rate": 0.0005666581630920731, + "loss": 1.428, + "step": 4407 + }, + { + "epoch": 0.47, + "grad_norm": 0.0858859644599195, + "learning_rate": 0.0005664856058872984, + "loss": 1.3167, + "step": 4408 + }, + { + "epoch": 0.47, + "grad_norm": 0.06573428062527853, + "learning_rate": 0.0005663130406209144, + "loss": 1.3983, + "step": 4409 + }, + { + "epoch": 0.47, + "grad_norm": 0.07532366412662411, + "learning_rate": 0.000566140467313846, + "loss": 1.5081, + "step": 4410 + }, + { + "epoch": 0.47, + "grad_norm": 0.07402368057331848, + "learning_rate": 0.0005659678859870177, + "loss": 1.2766, + "step": 4411 + }, + { + "epoch": 0.47, + "grad_norm": 0.07084658555164398, + "learning_rate": 0.0005657952966613558, + "loss": 1.4809, + "step": 4412 + }, + { + "epoch": 0.47, + "grad_norm": 0.07293649298192152, + "learning_rate": 0.0005656226993577874, + "loss": 1.3181, + "step": 4413 + }, + { + "epoch": 0.47, + "grad_norm": 0.07343140003558507, + "learning_rate": 0.0005654500940972404, + "loss": 1.3817, + "step": 4414 + }, + { + "epoch": 0.47, + "grad_norm": 0.06442382898080595, + "learning_rate": 0.000565277480900644, + "loss": 1.565, + "step": 4415 + }, + { + "epoch": 0.47, + "grad_norm": 0.0758026294438705, + "learning_rate": 0.0005651048597889277, + "loss": 1.5896, + "step": 4416 + }, + { + "epoch": 0.47, + "grad_norm": 0.07908556991375093, + "learning_rate": 0.000564932230783023, + "loss": 1.2737, + "step": 4417 + }, + { + "epoch": 0.47, + "grad_norm": 0.06293706327093158, + "learning_rate": 0.0005647595939038615, + "loss": 1.4938, + "step": 4418 + }, + { + "epoch": 0.48, + "grad_norm": 0.06813466656281142, + "learning_rate": 0.0005645869491723757, + "loss": 1.4309, + "step": 4419 + }, + { + "epoch": 0.48, + "grad_norm": 0.07377659147668462, + "learning_rate": 0.0005644142966094997, + "loss": 1.4835, + "step": 4420 + }, + { + "epoch": 0.48, + "grad_norm": 0.07262404263783771, + "learning_rate": 0.0005642416362361683, + "loss": 1.4194, + "step": 4421 + }, + { + "epoch": 0.48, + "grad_norm": 0.06906843043045555, + "learning_rate": 0.0005640689680733171, + "loss": 1.4486, + "step": 4422 + }, + { + "epoch": 0.48, + "grad_norm": 0.07374356166217189, + "learning_rate": 0.0005638962921418824, + "loss": 1.4561, + "step": 4423 + }, + { + "epoch": 0.48, + "grad_norm": 0.07256179926993496, + "learning_rate": 0.0005637236084628022, + "loss": 1.4977, + "step": 4424 + }, + { + "epoch": 0.48, + "grad_norm": 0.09589845080482406, + "learning_rate": 0.0005635509170570148, + "loss": 1.4129, + "step": 4425 + }, + { + "epoch": 0.48, + "grad_norm": 0.074427280353466, + "learning_rate": 0.0005633782179454594, + "loss": 1.366, + "step": 4426 + }, + { + "epoch": 0.48, + "grad_norm": 0.06692816156957947, + "learning_rate": 0.0005632055111490767, + "loss": 1.3508, + "step": 4427 + }, + { + "epoch": 0.48, + "grad_norm": 0.07199780349642708, + "learning_rate": 0.0005630327966888079, + "loss": 1.4069, + "step": 4428 + }, + { + "epoch": 0.48, + "grad_norm": 0.0683222669972921, + "learning_rate": 0.0005628600745855952, + "loss": 1.5312, + "step": 4429 + }, + { + "epoch": 0.48, + "grad_norm": 0.07072452845567084, + "learning_rate": 0.0005626873448603817, + "loss": 1.2589, + "step": 4430 + }, + { + "epoch": 0.48, + "grad_norm": 0.07226687346488332, + "learning_rate": 0.0005625146075341115, + "loss": 1.5328, + "step": 4431 + }, + { + "epoch": 0.48, + "grad_norm": 0.06829504835227422, + "learning_rate": 0.0005623418626277296, + "loss": 1.5685, + "step": 4432 + }, + { + "epoch": 0.48, + "grad_norm": 0.06920654184104114, + "learning_rate": 0.000562169110162182, + "loss": 1.3811, + "step": 4433 + }, + { + "epoch": 0.48, + "grad_norm": 0.07101655341266463, + "learning_rate": 0.0005619963501584154, + "loss": 1.6274, + "step": 4434 + }, + { + "epoch": 0.48, + "grad_norm": 0.0678822950684123, + "learning_rate": 0.0005618235826373776, + "loss": 1.4166, + "step": 4435 + }, + { + "epoch": 0.48, + "grad_norm": 0.06572661972423154, + "learning_rate": 0.0005616508076200174, + "loss": 1.4391, + "step": 4436 + }, + { + "epoch": 0.48, + "grad_norm": 0.06820610609290627, + "learning_rate": 0.0005614780251272842, + "loss": 1.5358, + "step": 4437 + }, + { + "epoch": 0.48, + "grad_norm": 0.06550275046899529, + "learning_rate": 0.0005613052351801283, + "loss": 1.4511, + "step": 4438 + }, + { + "epoch": 0.48, + "grad_norm": 0.08188025593494892, + "learning_rate": 0.0005611324377995016, + "loss": 1.4931, + "step": 4439 + }, + { + "epoch": 0.48, + "grad_norm": 0.06581901611273966, + "learning_rate": 0.0005609596330063558, + "loss": 1.3614, + "step": 4440 + }, + { + "epoch": 0.48, + "grad_norm": 0.06954853363417197, + "learning_rate": 0.0005607868208216445, + "loss": 1.3153, + "step": 4441 + }, + { + "epoch": 0.48, + "grad_norm": 0.07447233452866256, + "learning_rate": 0.0005606140012663214, + "loss": 1.5368, + "step": 4442 + }, + { + "epoch": 0.48, + "grad_norm": 0.06884509785274705, + "learning_rate": 0.0005604411743613418, + "loss": 1.51, + "step": 4443 + }, + { + "epoch": 0.48, + "grad_norm": 0.07476522479204314, + "learning_rate": 0.0005602683401276614, + "loss": 1.3806, + "step": 4444 + }, + { + "epoch": 0.48, + "grad_norm": 0.07323636031812013, + "learning_rate": 0.0005600954985862373, + "loss": 1.315, + "step": 4445 + }, + { + "epoch": 0.48, + "grad_norm": 0.06912970850725218, + "learning_rate": 0.0005599226497580264, + "loss": 1.4559, + "step": 4446 + }, + { + "epoch": 0.48, + "grad_norm": 0.0730926337836783, + "learning_rate": 0.0005597497936639878, + "loss": 1.4216, + "step": 4447 + }, + { + "epoch": 0.48, + "grad_norm": 0.07234954230583959, + "learning_rate": 0.0005595769303250809, + "loss": 1.4416, + "step": 4448 + }, + { + "epoch": 0.48, + "grad_norm": 0.0746131568968767, + "learning_rate": 0.0005594040597622653, + "loss": 1.3221, + "step": 4449 + }, + { + "epoch": 0.48, + "grad_norm": 0.08148817831391064, + "learning_rate": 0.0005592311819965032, + "loss": 1.4065, + "step": 4450 + }, + { + "epoch": 0.48, + "grad_norm": 0.07166491026144559, + "learning_rate": 0.0005590582970487558, + "loss": 1.345, + "step": 4451 + }, + { + "epoch": 0.48, + "grad_norm": 0.07930429551915798, + "learning_rate": 0.0005588854049399864, + "loss": 1.4026, + "step": 4452 + }, + { + "epoch": 0.48, + "grad_norm": 0.06973923917211951, + "learning_rate": 0.0005587125056911586, + "loss": 1.3695, + "step": 4453 + }, + { + "epoch": 0.48, + "grad_norm": 0.07736874764036672, + "learning_rate": 0.000558539599323237, + "loss": 1.3889, + "step": 4454 + }, + { + "epoch": 0.48, + "grad_norm": 0.07395876425197255, + "learning_rate": 0.0005583666858571873, + "loss": 1.4323, + "step": 4455 + }, + { + "epoch": 0.48, + "grad_norm": 0.06956108570224902, + "learning_rate": 0.0005581937653139757, + "loss": 1.3301, + "step": 4456 + }, + { + "epoch": 0.48, + "grad_norm": 0.070340521017195, + "learning_rate": 0.0005580208377145693, + "loss": 1.4881, + "step": 4457 + }, + { + "epoch": 0.48, + "grad_norm": 0.07142190688929934, + "learning_rate": 0.0005578479030799362, + "loss": 1.3902, + "step": 4458 + }, + { + "epoch": 0.48, + "grad_norm": 0.07158868756773161, + "learning_rate": 0.0005576749614310456, + "loss": 1.4741, + "step": 4459 + }, + { + "epoch": 0.48, + "grad_norm": 0.06838868568645787, + "learning_rate": 0.0005575020127888672, + "loss": 1.4185, + "step": 4460 + }, + { + "epoch": 0.48, + "grad_norm": 0.07183543922807431, + "learning_rate": 0.0005573290571743714, + "loss": 1.4307, + "step": 4461 + }, + { + "epoch": 0.48, + "grad_norm": 0.06986236197336322, + "learning_rate": 0.0005571560946085298, + "loss": 1.4594, + "step": 4462 + }, + { + "epoch": 0.48, + "grad_norm": 0.06696960142160002, + "learning_rate": 0.0005569831251123145, + "loss": 1.3465, + "step": 4463 + }, + { + "epoch": 0.48, + "grad_norm": 0.07233568368627842, + "learning_rate": 0.0005568101487066991, + "loss": 1.307, + "step": 4464 + }, + { + "epoch": 0.48, + "grad_norm": 0.0775921186659931, + "learning_rate": 0.0005566371654126572, + "loss": 1.439, + "step": 4465 + }, + { + "epoch": 0.48, + "grad_norm": 0.07908711625273704, + "learning_rate": 0.0005564641752511636, + "loss": 1.3838, + "step": 4466 + }, + { + "epoch": 0.48, + "grad_norm": 0.07270285235982457, + "learning_rate": 0.0005562911782431943, + "loss": 1.4833, + "step": 4467 + }, + { + "epoch": 0.48, + "grad_norm": 0.07069593509582667, + "learning_rate": 0.0005561181744097255, + "loss": 1.4951, + "step": 4468 + }, + { + "epoch": 0.48, + "grad_norm": 0.07302606485108576, + "learning_rate": 0.0005559451637717345, + "loss": 1.376, + "step": 4469 + }, + { + "epoch": 0.48, + "grad_norm": 0.06534902828368297, + "learning_rate": 0.0005557721463501997, + "loss": 1.395, + "step": 4470 + }, + { + "epoch": 0.48, + "grad_norm": 0.06380982214192189, + "learning_rate": 0.0005555991221660998, + "loss": 1.3312, + "step": 4471 + }, + { + "epoch": 0.48, + "grad_norm": 0.06465927979215724, + "learning_rate": 0.0005554260912404146, + "loss": 1.3872, + "step": 4472 + }, + { + "epoch": 0.48, + "grad_norm": 0.07193715282214726, + "learning_rate": 0.0005552530535941248, + "loss": 1.3708, + "step": 4473 + }, + { + "epoch": 0.48, + "grad_norm": 0.06858753038473571, + "learning_rate": 0.0005550800092482117, + "loss": 1.4944, + "step": 4474 + }, + { + "epoch": 0.48, + "grad_norm": 0.06901279556324783, + "learning_rate": 0.0005549069582236576, + "loss": 1.3406, + "step": 4475 + }, + { + "epoch": 0.48, + "grad_norm": 0.08512978246349323, + "learning_rate": 0.0005547339005414456, + "loss": 1.5978, + "step": 4476 + }, + { + "epoch": 0.48, + "grad_norm": 0.07861647095932271, + "learning_rate": 0.0005545608362225594, + "loss": 1.4021, + "step": 4477 + }, + { + "epoch": 0.48, + "grad_norm": 0.06934085764688186, + "learning_rate": 0.0005543877652879837, + "loss": 1.3235, + "step": 4478 + }, + { + "epoch": 0.48, + "grad_norm": 0.07068395348115039, + "learning_rate": 0.0005542146877587041, + "loss": 1.4409, + "step": 4479 + }, + { + "epoch": 0.48, + "grad_norm": 0.07537414137102522, + "learning_rate": 0.0005540416036557064, + "loss": 1.4742, + "step": 4480 + }, + { + "epoch": 0.48, + "grad_norm": 0.06280318869662363, + "learning_rate": 0.0005538685129999782, + "loss": 1.4793, + "step": 4481 + }, + { + "epoch": 0.48, + "grad_norm": 0.06802012013643947, + "learning_rate": 0.000553695415812507, + "loss": 1.4218, + "step": 4482 + }, + { + "epoch": 0.48, + "grad_norm": 0.07184833062274965, + "learning_rate": 0.0005535223121142817, + "loss": 1.5405, + "step": 4483 + }, + { + "epoch": 0.48, + "grad_norm": 0.07048066216163036, + "learning_rate": 0.0005533492019262913, + "loss": 1.5284, + "step": 4484 + }, + { + "epoch": 0.48, + "grad_norm": 0.07208127756365079, + "learning_rate": 0.0005531760852695263, + "loss": 1.4083, + "step": 4485 + }, + { + "epoch": 0.48, + "grad_norm": 0.06955811505268919, + "learning_rate": 0.0005530029621649777, + "loss": 1.3813, + "step": 4486 + }, + { + "epoch": 0.48, + "grad_norm": 0.07251905452428771, + "learning_rate": 0.0005528298326336372, + "loss": 1.47, + "step": 4487 + }, + { + "epoch": 0.48, + "grad_norm": 0.0660393445159999, + "learning_rate": 0.0005526566966964972, + "loss": 1.4468, + "step": 4488 + }, + { + "epoch": 0.48, + "grad_norm": 0.07447153416480336, + "learning_rate": 0.0005524835543745515, + "loss": 1.2929, + "step": 4489 + }, + { + "epoch": 0.48, + "grad_norm": 0.07285995323219417, + "learning_rate": 0.0005523104056887936, + "loss": 1.3571, + "step": 4490 + }, + { + "epoch": 0.48, + "grad_norm": 0.07066125440052619, + "learning_rate": 0.0005521372506602187, + "loss": 1.3609, + "step": 4491 + }, + { + "epoch": 0.48, + "grad_norm": 0.06989235987621864, + "learning_rate": 0.0005519640893098227, + "loss": 1.4226, + "step": 4492 + }, + { + "epoch": 0.48, + "grad_norm": 0.06912670079475867, + "learning_rate": 0.0005517909216586015, + "loss": 1.4072, + "step": 4493 + }, + { + "epoch": 0.48, + "grad_norm": 0.08376149611071462, + "learning_rate": 0.0005516177477275523, + "loss": 1.4395, + "step": 4494 + }, + { + "epoch": 0.48, + "grad_norm": 0.07540252701541866, + "learning_rate": 0.0005514445675376735, + "loss": 1.3781, + "step": 4495 + }, + { + "epoch": 0.48, + "grad_norm": 0.07338801331012032, + "learning_rate": 0.0005512713811099636, + "loss": 1.4511, + "step": 4496 + }, + { + "epoch": 0.48, + "grad_norm": 0.08017833620848996, + "learning_rate": 0.0005510981884654217, + "loss": 1.3373, + "step": 4497 + }, + { + "epoch": 0.48, + "grad_norm": 0.07695481291241366, + "learning_rate": 0.0005509249896250485, + "loss": 1.3973, + "step": 4498 + }, + { + "epoch": 0.48, + "grad_norm": 0.07799573575606975, + "learning_rate": 0.0005507517846098447, + "loss": 1.3026, + "step": 4499 + }, + { + "epoch": 0.48, + "grad_norm": 0.06890484703699848, + "learning_rate": 0.0005505785734408121, + "loss": 1.4129, + "step": 4500 + }, + { + "epoch": 0.48, + "grad_norm": 0.07170266528926741, + "learning_rate": 0.0005504053561389531, + "loss": 1.4825, + "step": 4501 + }, + { + "epoch": 0.48, + "grad_norm": 0.08265522375488303, + "learning_rate": 0.0005502321327252708, + "loss": 1.413, + "step": 4502 + }, + { + "epoch": 0.48, + "grad_norm": 0.06619748791951723, + "learning_rate": 0.0005500589032207692, + "loss": 1.375, + "step": 4503 + }, + { + "epoch": 0.48, + "grad_norm": 0.07303920274619637, + "learning_rate": 0.000549885667646453, + "loss": 1.4716, + "step": 4504 + }, + { + "epoch": 0.48, + "grad_norm": 0.08679445748778027, + "learning_rate": 0.0005497124260233277, + "loss": 1.5006, + "step": 4505 + }, + { + "epoch": 0.48, + "grad_norm": 0.06787411422320562, + "learning_rate": 0.0005495391783723994, + "loss": 1.4461, + "step": 4506 + }, + { + "epoch": 0.48, + "grad_norm": 0.0813816825415281, + "learning_rate": 0.0005493659247146749, + "loss": 1.6357, + "step": 4507 + }, + { + "epoch": 0.48, + "grad_norm": 0.07138574670659505, + "learning_rate": 0.0005491926650711619, + "loss": 1.4653, + "step": 4508 + }, + { + "epoch": 0.48, + "grad_norm": 0.06750733162437504, + "learning_rate": 0.0005490193994628685, + "loss": 1.4337, + "step": 4509 + }, + { + "epoch": 0.48, + "grad_norm": 0.0987904441000795, + "learning_rate": 0.000548846127910804, + "loss": 1.2916, + "step": 4510 + }, + { + "epoch": 0.48, + "grad_norm": 0.07504186207018469, + "learning_rate": 0.000548672850435978, + "loss": 1.3991, + "step": 4511 + }, + { + "epoch": 0.49, + "grad_norm": 0.08576301042708652, + "learning_rate": 0.0005484995670594012, + "loss": 1.5069, + "step": 4512 + }, + { + "epoch": 0.49, + "grad_norm": 0.07276167204073009, + "learning_rate": 0.0005483262778020846, + "loss": 1.4622, + "step": 4513 + }, + { + "epoch": 0.49, + "grad_norm": 0.08992303325112609, + "learning_rate": 0.0005481529826850403, + "loss": 1.3602, + "step": 4514 + }, + { + "epoch": 0.49, + "grad_norm": 0.07875033982616096, + "learning_rate": 0.000547979681729281, + "loss": 1.585, + "step": 4515 + }, + { + "epoch": 0.49, + "grad_norm": 0.07207386003591792, + "learning_rate": 0.0005478063749558195, + "loss": 1.4695, + "step": 4516 + }, + { + "epoch": 0.49, + "grad_norm": 0.0719171163854882, + "learning_rate": 0.0005476330623856705, + "loss": 1.2742, + "step": 4517 + }, + { + "epoch": 0.49, + "grad_norm": 0.07184078857477177, + "learning_rate": 0.0005474597440398483, + "loss": 1.4122, + "step": 4518 + }, + { + "epoch": 0.49, + "grad_norm": 0.06633376011230552, + "learning_rate": 0.0005472864199393687, + "loss": 1.308, + "step": 4519 + }, + { + "epoch": 0.49, + "grad_norm": 0.07621419629579644, + "learning_rate": 0.0005471130901052476, + "loss": 1.5846, + "step": 4520 + }, + { + "epoch": 0.49, + "grad_norm": 0.06731631619116417, + "learning_rate": 0.0005469397545585019, + "loss": 1.3949, + "step": 4521 + }, + { + "epoch": 0.49, + "grad_norm": 0.07302884182715914, + "learning_rate": 0.000546766413320149, + "loss": 1.4557, + "step": 4522 + }, + { + "epoch": 0.49, + "grad_norm": 0.06445838444376298, + "learning_rate": 0.0005465930664112073, + "loss": 1.4074, + "step": 4523 + }, + { + "epoch": 0.49, + "grad_norm": 0.06288037927464722, + "learning_rate": 0.000546419713852696, + "loss": 1.3382, + "step": 4524 + }, + { + "epoch": 0.49, + "grad_norm": 0.07343137466979509, + "learning_rate": 0.000546246355665634, + "loss": 1.4763, + "step": 4525 + }, + { + "epoch": 0.49, + "grad_norm": 0.09605837351043238, + "learning_rate": 0.000546072991871042, + "loss": 1.4698, + "step": 4526 + }, + { + "epoch": 0.49, + "grad_norm": 0.07278503081830355, + "learning_rate": 0.0005458996224899409, + "loss": 1.4603, + "step": 4527 + }, + { + "epoch": 0.49, + "grad_norm": 0.06792443909696488, + "learning_rate": 0.0005457262475433523, + "loss": 1.3286, + "step": 4528 + }, + { + "epoch": 0.49, + "grad_norm": 0.07308926990410777, + "learning_rate": 0.0005455528670522987, + "loss": 1.3996, + "step": 4529 + }, + { + "epoch": 0.49, + "grad_norm": 0.07682204161835285, + "learning_rate": 0.0005453794810378028, + "loss": 1.367, + "step": 4530 + }, + { + "epoch": 0.49, + "grad_norm": 0.06996761782406571, + "learning_rate": 0.0005452060895208883, + "loss": 1.3614, + "step": 4531 + }, + { + "epoch": 0.49, + "grad_norm": 0.0725382634877934, + "learning_rate": 0.0005450326925225798, + "loss": 1.3161, + "step": 4532 + }, + { + "epoch": 0.49, + "grad_norm": 0.06988327489990985, + "learning_rate": 0.000544859290063902, + "loss": 1.4512, + "step": 4533 + }, + { + "epoch": 0.49, + "grad_norm": 0.08344302247555796, + "learning_rate": 0.0005446858821658805, + "loss": 1.4152, + "step": 4534 + }, + { + "epoch": 0.49, + "grad_norm": 0.07425148455363119, + "learning_rate": 0.000544512468849542, + "loss": 1.3646, + "step": 4535 + }, + { + "epoch": 0.49, + "grad_norm": 0.06737391959124295, + "learning_rate": 0.0005443390501359129, + "loss": 1.3049, + "step": 4536 + }, + { + "epoch": 0.49, + "grad_norm": 0.06954957142687951, + "learning_rate": 0.0005441656260460212, + "loss": 1.366, + "step": 4537 + }, + { + "epoch": 0.49, + "grad_norm": 0.07154336570704602, + "learning_rate": 0.0005439921966008953, + "loss": 1.5324, + "step": 4538 + }, + { + "epoch": 0.49, + "grad_norm": 0.08572842758012887, + "learning_rate": 0.0005438187618215636, + "loss": 1.4391, + "step": 4539 + }, + { + "epoch": 0.49, + "grad_norm": 0.06743260458188859, + "learning_rate": 0.0005436453217290562, + "loss": 1.3816, + "step": 4540 + }, + { + "epoch": 0.49, + "grad_norm": 0.07155234240297914, + "learning_rate": 0.000543471876344403, + "loss": 1.4001, + "step": 4541 + }, + { + "epoch": 0.49, + "grad_norm": 0.07525337751704271, + "learning_rate": 0.0005432984256886347, + "loss": 1.5117, + "step": 4542 + }, + { + "epoch": 0.49, + "grad_norm": 0.07358363432443406, + "learning_rate": 0.0005431249697827832, + "loss": 1.335, + "step": 4543 + }, + { + "epoch": 0.49, + "grad_norm": 0.07454610224380064, + "learning_rate": 0.0005429515086478805, + "loss": 1.3017, + "step": 4544 + }, + { + "epoch": 0.49, + "grad_norm": 0.07046647316226497, + "learning_rate": 0.0005427780423049593, + "loss": 1.3929, + "step": 4545 + }, + { + "epoch": 0.49, + "grad_norm": 0.0766116705717234, + "learning_rate": 0.0005426045707750529, + "loss": 1.4339, + "step": 4546 + }, + { + "epoch": 0.49, + "grad_norm": 0.09166538682860578, + "learning_rate": 0.0005424310940791953, + "loss": 1.4755, + "step": 4547 + }, + { + "epoch": 0.49, + "grad_norm": 0.07278076098680313, + "learning_rate": 0.0005422576122384215, + "loss": 1.5623, + "step": 4548 + }, + { + "epoch": 0.49, + "grad_norm": 0.06308207348750028, + "learning_rate": 0.0005420841252737664, + "loss": 1.3334, + "step": 4549 + }, + { + "epoch": 0.49, + "grad_norm": 0.07808545423036595, + "learning_rate": 0.0005419106332062661, + "loss": 1.5023, + "step": 4550 + }, + { + "epoch": 0.49, + "grad_norm": 0.07914617981438024, + "learning_rate": 0.000541737136056957, + "loss": 1.4335, + "step": 4551 + }, + { + "epoch": 0.49, + "grad_norm": 0.07101119935426858, + "learning_rate": 0.0005415636338468762, + "loss": 1.3554, + "step": 4552 + }, + { + "epoch": 0.49, + "grad_norm": 0.07221781995588977, + "learning_rate": 0.0005413901265970616, + "loss": 1.3392, + "step": 4553 + }, + { + "epoch": 0.49, + "grad_norm": 0.06859955402895859, + "learning_rate": 0.0005412166143285514, + "loss": 1.3689, + "step": 4554 + }, + { + "epoch": 0.49, + "grad_norm": 0.06889021096031589, + "learning_rate": 0.0005410430970623847, + "loss": 1.4025, + "step": 4555 + }, + { + "epoch": 0.49, + "grad_norm": 0.07536620068094364, + "learning_rate": 0.0005408695748196009, + "loss": 1.3711, + "step": 4556 + }, + { + "epoch": 0.49, + "grad_norm": 0.07286775325340998, + "learning_rate": 0.0005406960476212403, + "loss": 1.4687, + "step": 4557 + }, + { + "epoch": 0.49, + "grad_norm": 0.07270129112592764, + "learning_rate": 0.0005405225154883435, + "loss": 1.5237, + "step": 4558 + }, + { + "epoch": 0.49, + "grad_norm": 0.07454308740712048, + "learning_rate": 0.0005403489784419521, + "loss": 1.5203, + "step": 4559 + }, + { + "epoch": 0.49, + "grad_norm": 0.07490105304575796, + "learning_rate": 0.000540175436503108, + "loss": 1.3618, + "step": 4560 + }, + { + "epoch": 0.49, + "grad_norm": 0.07055456412536586, + "learning_rate": 0.0005400018896928537, + "loss": 1.3952, + "step": 4561 + }, + { + "epoch": 0.49, + "grad_norm": 0.07493601484219946, + "learning_rate": 0.0005398283380322323, + "loss": 1.4886, + "step": 4562 + }, + { + "epoch": 0.49, + "grad_norm": 0.07994133908501737, + "learning_rate": 0.0005396547815422877, + "loss": 1.3913, + "step": 4563 + }, + { + "epoch": 0.49, + "grad_norm": 0.07205633839504125, + "learning_rate": 0.000539481220244064, + "loss": 1.4247, + "step": 4564 + }, + { + "epoch": 0.49, + "grad_norm": 0.08296967780452945, + "learning_rate": 0.0005393076541586062, + "loss": 1.4473, + "step": 4565 + }, + { + "epoch": 0.49, + "grad_norm": 0.06787442123709032, + "learning_rate": 0.0005391340833069601, + "loss": 1.3086, + "step": 4566 + }, + { + "epoch": 0.49, + "grad_norm": 0.08012266054574767, + "learning_rate": 0.0005389605077101712, + "loss": 1.4119, + "step": 4567 + }, + { + "epoch": 0.49, + "grad_norm": 0.08161258295766774, + "learning_rate": 0.0005387869273892865, + "loss": 1.5046, + "step": 4568 + }, + { + "epoch": 0.49, + "grad_norm": 0.07775070711351714, + "learning_rate": 0.0005386133423653532, + "loss": 1.3888, + "step": 4569 + }, + { + "epoch": 0.49, + "grad_norm": 0.06586472603954369, + "learning_rate": 0.0005384397526594189, + "loss": 1.3614, + "step": 4570 + }, + { + "epoch": 0.49, + "grad_norm": 0.08135731486330744, + "learning_rate": 0.0005382661582925322, + "loss": 1.3754, + "step": 4571 + }, + { + "epoch": 0.49, + "grad_norm": 0.06833444123583289, + "learning_rate": 0.000538092559285742, + "loss": 1.5292, + "step": 4572 + }, + { + "epoch": 0.49, + "grad_norm": 0.07780963232580691, + "learning_rate": 0.0005379189556600974, + "loss": 1.3701, + "step": 4573 + }, + { + "epoch": 0.49, + "grad_norm": 0.06877199878994798, + "learning_rate": 0.0005377453474366489, + "loss": 1.3532, + "step": 4574 + }, + { + "epoch": 0.49, + "grad_norm": 0.06300244430878904, + "learning_rate": 0.0005375717346364468, + "loss": 1.3087, + "step": 4575 + }, + { + "epoch": 0.49, + "grad_norm": 0.08631255658908871, + "learning_rate": 0.0005373981172805421, + "loss": 1.5267, + "step": 4576 + }, + { + "epoch": 0.49, + "grad_norm": 0.0741591794846563, + "learning_rate": 0.0005372244953899872, + "loss": 1.4152, + "step": 4577 + }, + { + "epoch": 0.49, + "grad_norm": 0.07917543188100162, + "learning_rate": 0.0005370508689858336, + "loss": 1.5721, + "step": 4578 + }, + { + "epoch": 0.49, + "grad_norm": 0.07534740443199708, + "learning_rate": 0.0005368772380891345, + "loss": 1.3377, + "step": 4579 + }, + { + "epoch": 0.49, + "grad_norm": 0.06688936328416421, + "learning_rate": 0.0005367036027209431, + "loss": 1.4143, + "step": 4580 + }, + { + "epoch": 0.49, + "grad_norm": 0.06971105161281534, + "learning_rate": 0.0005365299629023133, + "loss": 1.4915, + "step": 4581 + }, + { + "epoch": 0.49, + "grad_norm": 0.07018375199107363, + "learning_rate": 0.0005363563186542997, + "loss": 1.3651, + "step": 4582 + }, + { + "epoch": 0.49, + "grad_norm": 0.0783407287552422, + "learning_rate": 0.000536182669997957, + "loss": 1.4741, + "step": 4583 + }, + { + "epoch": 0.49, + "grad_norm": 0.06865204082546987, + "learning_rate": 0.0005360090169543409, + "loss": 1.5494, + "step": 4584 + }, + { + "epoch": 0.49, + "grad_norm": 0.07492596982291116, + "learning_rate": 0.0005358353595445074, + "loss": 1.4175, + "step": 4585 + }, + { + "epoch": 0.49, + "grad_norm": 0.06726307354329226, + "learning_rate": 0.0005356616977895129, + "loss": 1.4331, + "step": 4586 + }, + { + "epoch": 0.49, + "grad_norm": 0.09002694081447887, + "learning_rate": 0.0005354880317104144, + "loss": 1.4617, + "step": 4587 + }, + { + "epoch": 0.49, + "grad_norm": 0.069512338457349, + "learning_rate": 0.0005353143613282702, + "loss": 1.4518, + "step": 4588 + }, + { + "epoch": 0.49, + "grad_norm": 0.07455424036661215, + "learning_rate": 0.0005351406866641377, + "loss": 1.5088, + "step": 4589 + }, + { + "epoch": 0.49, + "grad_norm": 0.07415562850466256, + "learning_rate": 0.0005349670077390757, + "loss": 1.3662, + "step": 4590 + }, + { + "epoch": 0.49, + "grad_norm": 0.0715278258922769, + "learning_rate": 0.0005347933245741435, + "loss": 1.3138, + "step": 4591 + }, + { + "epoch": 0.49, + "grad_norm": 0.07669012583344414, + "learning_rate": 0.0005346196371904009, + "loss": 1.488, + "step": 4592 + }, + { + "epoch": 0.49, + "grad_norm": 0.06974715535155264, + "learning_rate": 0.0005344459456089078, + "loss": 1.5462, + "step": 4593 + }, + { + "epoch": 0.49, + "grad_norm": 0.07338763101109938, + "learning_rate": 0.0005342722498507251, + "loss": 1.485, + "step": 4594 + }, + { + "epoch": 0.49, + "grad_norm": 0.0694636156603943, + "learning_rate": 0.0005340985499369137, + "loss": 1.3388, + "step": 4595 + }, + { + "epoch": 0.49, + "grad_norm": 0.0705713385482358, + "learning_rate": 0.0005339248458885359, + "loss": 1.4198, + "step": 4596 + }, + { + "epoch": 0.49, + "grad_norm": 0.08029313414307719, + "learning_rate": 0.0005337511377266535, + "loss": 1.6143, + "step": 4597 + }, + { + "epoch": 0.49, + "grad_norm": 0.06277611025631606, + "learning_rate": 0.0005335774254723293, + "loss": 1.3245, + "step": 4598 + }, + { + "epoch": 0.49, + "grad_norm": 0.07068327245009445, + "learning_rate": 0.0005334037091466264, + "loss": 1.4243, + "step": 4599 + }, + { + "epoch": 0.49, + "grad_norm": 0.07234677948360466, + "learning_rate": 0.0005332299887706087, + "loss": 1.4454, + "step": 4600 + }, + { + "epoch": 0.49, + "grad_norm": 0.07374854687243138, + "learning_rate": 0.0005330562643653401, + "loss": 1.484, + "step": 4601 + }, + { + "epoch": 0.49, + "grad_norm": 0.08347732349695075, + "learning_rate": 0.0005328825359518858, + "loss": 1.4723, + "step": 4602 + }, + { + "epoch": 0.49, + "grad_norm": 0.06287568215439522, + "learning_rate": 0.0005327088035513105, + "loss": 1.4508, + "step": 4603 + }, + { + "epoch": 0.49, + "grad_norm": 0.07172576192681832, + "learning_rate": 0.00053253506718468, + "loss": 1.3933, + "step": 4604 + }, + { + "epoch": 0.5, + "grad_norm": 0.07005732728591357, + "learning_rate": 0.0005323613268730605, + "loss": 1.4256, + "step": 4605 + }, + { + "epoch": 0.5, + "grad_norm": 0.0654226164057184, + "learning_rate": 0.0005321875826375185, + "loss": 1.464, + "step": 4606 + }, + { + "epoch": 0.5, + "grad_norm": 0.06238379236280632, + "learning_rate": 0.000532013834499121, + "loss": 1.4955, + "step": 4607 + }, + { + "epoch": 0.5, + "grad_norm": 0.06840929269433478, + "learning_rate": 0.0005318400824789359, + "loss": 1.4424, + "step": 4608 + }, + { + "epoch": 0.5, + "grad_norm": 0.07194276160370738, + "learning_rate": 0.0005316663265980309, + "loss": 1.2864, + "step": 4609 + }, + { + "epoch": 0.5, + "grad_norm": 0.05734857788180746, + "learning_rate": 0.0005314925668774747, + "loss": 1.3987, + "step": 4610 + }, + { + "epoch": 0.5, + "grad_norm": 0.06876114472500308, + "learning_rate": 0.0005313188033383363, + "loss": 1.4162, + "step": 4611 + }, + { + "epoch": 0.5, + "grad_norm": 0.06978076302819285, + "learning_rate": 0.0005311450360016847, + "loss": 1.4863, + "step": 4612 + }, + { + "epoch": 0.5, + "grad_norm": 0.061183894124280415, + "learning_rate": 0.0005309712648885904, + "loss": 1.3364, + "step": 4613 + }, + { + "epoch": 0.5, + "grad_norm": 0.07043102493486708, + "learning_rate": 0.0005307974900201235, + "loss": 1.5243, + "step": 4614 + }, + { + "epoch": 0.5, + "grad_norm": 0.06643923473201709, + "learning_rate": 0.0005306237114173545, + "loss": 1.5084, + "step": 4615 + }, + { + "epoch": 0.5, + "grad_norm": 0.07447149586592648, + "learning_rate": 0.0005304499291013551, + "loss": 1.3401, + "step": 4616 + }, + { + "epoch": 0.5, + "grad_norm": 0.06198491339411245, + "learning_rate": 0.0005302761430931968, + "loss": 1.3923, + "step": 4617 + }, + { + "epoch": 0.5, + "grad_norm": 0.07738412696592357, + "learning_rate": 0.0005301023534139516, + "loss": 1.3069, + "step": 4618 + }, + { + "epoch": 0.5, + "grad_norm": 0.08466989307130728, + "learning_rate": 0.0005299285600846926, + "loss": 1.4284, + "step": 4619 + }, + { + "epoch": 0.5, + "grad_norm": 0.06723599229505743, + "learning_rate": 0.0005297547631264922, + "loss": 1.347, + "step": 4620 + }, + { + "epoch": 0.5, + "grad_norm": 0.07153350664015816, + "learning_rate": 0.0005295809625604244, + "loss": 1.4633, + "step": 4621 + }, + { + "epoch": 0.5, + "grad_norm": 0.07407785833339331, + "learning_rate": 0.0005294071584075629, + "loss": 1.3834, + "step": 4622 + }, + { + "epoch": 0.5, + "grad_norm": 0.0764897207373361, + "learning_rate": 0.0005292333506889819, + "loss": 1.3923, + "step": 4623 + }, + { + "epoch": 0.5, + "grad_norm": 0.0747599974968833, + "learning_rate": 0.0005290595394257564, + "loss": 1.3799, + "step": 4624 + }, + { + "epoch": 0.5, + "grad_norm": 0.07394829288250857, + "learning_rate": 0.0005288857246389617, + "loss": 1.357, + "step": 4625 + }, + { + "epoch": 0.5, + "grad_norm": 0.07255830568972636, + "learning_rate": 0.000528711906349673, + "loss": 1.3896, + "step": 4626 + }, + { + "epoch": 0.5, + "grad_norm": 0.08099083798541638, + "learning_rate": 0.0005285380845789669, + "loss": 1.4059, + "step": 4627 + }, + { + "epoch": 0.5, + "grad_norm": 0.07223762794677366, + "learning_rate": 0.0005283642593479197, + "loss": 1.4438, + "step": 4628 + }, + { + "epoch": 0.5, + "grad_norm": 0.07054943655122418, + "learning_rate": 0.0005281904306776082, + "loss": 1.3298, + "step": 4629 + }, + { + "epoch": 0.5, + "grad_norm": 0.07290095631200455, + "learning_rate": 0.0005280165985891098, + "loss": 1.4432, + "step": 4630 + }, + { + "epoch": 0.5, + "grad_norm": 0.06882239361849153, + "learning_rate": 0.0005278427631035022, + "loss": 1.3861, + "step": 4631 + }, + { + "epoch": 0.5, + "grad_norm": 0.0774646267226411, + "learning_rate": 0.0005276689242418635, + "loss": 1.3935, + "step": 4632 + }, + { + "epoch": 0.5, + "grad_norm": 0.07159465617312386, + "learning_rate": 0.0005274950820252725, + "loss": 1.4455, + "step": 4633 + }, + { + "epoch": 0.5, + "grad_norm": 0.07941785759449224, + "learning_rate": 0.000527321236474808, + "loss": 1.457, + "step": 4634 + }, + { + "epoch": 0.5, + "grad_norm": 0.08133325764192359, + "learning_rate": 0.0005271473876115494, + "loss": 1.4672, + "step": 4635 + }, + { + "epoch": 0.5, + "grad_norm": 0.07566371625930349, + "learning_rate": 0.0005269735354565763, + "loss": 1.4521, + "step": 4636 + }, + { + "epoch": 0.5, + "grad_norm": 0.07032893323259092, + "learning_rate": 0.0005267996800309692, + "loss": 1.5184, + "step": 4637 + }, + { + "epoch": 0.5, + "grad_norm": 0.07951209008943733, + "learning_rate": 0.0005266258213558084, + "loss": 1.5179, + "step": 4638 + }, + { + "epoch": 0.5, + "grad_norm": 0.0868690888705973, + "learning_rate": 0.0005264519594521751, + "loss": 1.4653, + "step": 4639 + }, + { + "epoch": 0.5, + "grad_norm": 0.07875135983511622, + "learning_rate": 0.0005262780943411504, + "loss": 1.4295, + "step": 4640 + }, + { + "epoch": 0.5, + "grad_norm": 0.07812734317245881, + "learning_rate": 0.0005261042260438163, + "loss": 1.3016, + "step": 4641 + }, + { + "epoch": 0.5, + "grad_norm": 0.07367453442907974, + "learning_rate": 0.0005259303545812546, + "loss": 1.4366, + "step": 4642 + }, + { + "epoch": 0.5, + "grad_norm": 0.07696299129620354, + "learning_rate": 0.000525756479974548, + "loss": 1.4112, + "step": 4643 + }, + { + "epoch": 0.5, + "grad_norm": 0.0726616742047467, + "learning_rate": 0.0005255826022447796, + "loss": 1.5034, + "step": 4644 + }, + { + "epoch": 0.5, + "grad_norm": 0.07007504723694576, + "learning_rate": 0.0005254087214130324, + "loss": 1.5622, + "step": 4645 + }, + { + "epoch": 0.5, + "grad_norm": 0.09152270269567035, + "learning_rate": 0.0005252348375003902, + "loss": 1.4707, + "step": 4646 + }, + { + "epoch": 0.5, + "grad_norm": 0.07062202199553434, + "learning_rate": 0.0005250609505279369, + "loss": 1.4475, + "step": 4647 + }, + { + "epoch": 0.5, + "grad_norm": 0.07991628604235958, + "learning_rate": 0.0005248870605167569, + "loss": 1.5251, + "step": 4648 + }, + { + "epoch": 0.5, + "grad_norm": 0.07321059260782213, + "learning_rate": 0.000524713167487935, + "loss": 1.3062, + "step": 4649 + }, + { + "epoch": 0.5, + "grad_norm": 0.06430883856415529, + "learning_rate": 0.0005245392714625564, + "loss": 1.4268, + "step": 4650 + }, + { + "epoch": 0.5, + "grad_norm": 0.07432342350797429, + "learning_rate": 0.0005243653724617067, + "loss": 1.3743, + "step": 4651 + }, + { + "epoch": 0.5, + "grad_norm": 0.07084662663170704, + "learning_rate": 0.0005241914705064713, + "loss": 1.3945, + "step": 4652 + }, + { + "epoch": 0.5, + "grad_norm": 0.07007986233491623, + "learning_rate": 0.0005240175656179368, + "loss": 1.4225, + "step": 4653 + }, + { + "epoch": 0.5, + "grad_norm": 0.09277118002504049, + "learning_rate": 0.0005238436578171898, + "loss": 1.4031, + "step": 4654 + }, + { + "epoch": 0.5, + "grad_norm": 0.07412570293831583, + "learning_rate": 0.0005236697471253167, + "loss": 1.4623, + "step": 4655 + }, + { + "epoch": 0.5, + "grad_norm": 0.0703204816023259, + "learning_rate": 0.0005234958335634057, + "loss": 1.2797, + "step": 4656 + }, + { + "epoch": 0.5, + "grad_norm": 0.06764260000870075, + "learning_rate": 0.0005233219171525436, + "loss": 1.5262, + "step": 4657 + }, + { + "epoch": 0.5, + "grad_norm": 0.06908150282828376, + "learning_rate": 0.0005231479979138186, + "loss": 1.5008, + "step": 4658 + }, + { + "epoch": 0.5, + "grad_norm": 0.06594842101397937, + "learning_rate": 0.0005229740758683192, + "loss": 1.5086, + "step": 4659 + }, + { + "epoch": 0.5, + "grad_norm": 0.07234559595933199, + "learning_rate": 0.0005228001510371337, + "loss": 1.3922, + "step": 4660 + }, + { + "epoch": 0.5, + "grad_norm": 0.07615374538885955, + "learning_rate": 0.0005226262234413514, + "loss": 1.3848, + "step": 4661 + }, + { + "epoch": 0.5, + "grad_norm": 0.07029248323561711, + "learning_rate": 0.0005224522931020616, + "loss": 1.445, + "step": 4662 + }, + { + "epoch": 0.5, + "grad_norm": 0.0714406791924088, + "learning_rate": 0.0005222783600403536, + "loss": 1.5777, + "step": 4663 + }, + { + "epoch": 0.5, + "grad_norm": 0.0748912481646773, + "learning_rate": 0.0005221044242773177, + "loss": 1.2854, + "step": 4664 + }, + { + "epoch": 0.5, + "grad_norm": 0.06734632544256124, + "learning_rate": 0.0005219304858340443, + "loss": 1.5012, + "step": 4665 + }, + { + "epoch": 0.5, + "grad_norm": 0.06856504138644735, + "learning_rate": 0.0005217565447316238, + "loss": 1.4233, + "step": 4666 + }, + { + "epoch": 0.5, + "grad_norm": 0.07015175624730764, + "learning_rate": 0.0005215826009911474, + "loss": 1.5598, + "step": 4667 + }, + { + "epoch": 0.5, + "grad_norm": 0.07662387766996255, + "learning_rate": 0.0005214086546337061, + "loss": 1.4279, + "step": 4668 + }, + { + "epoch": 0.5, + "grad_norm": 0.07579601306263993, + "learning_rate": 0.0005212347056803916, + "loss": 1.4488, + "step": 4669 + }, + { + "epoch": 0.5, + "grad_norm": 0.08407512048240309, + "learning_rate": 0.0005210607541522958, + "loss": 1.4203, + "step": 4670 + }, + { + "epoch": 0.5, + "grad_norm": 0.06957229687866279, + "learning_rate": 0.000520886800070511, + "loss": 1.3616, + "step": 4671 + }, + { + "epoch": 0.5, + "grad_norm": 0.06557597435831303, + "learning_rate": 0.0005207128434561297, + "loss": 1.2343, + "step": 4672 + }, + { + "epoch": 0.5, + "grad_norm": 0.10138332283543867, + "learning_rate": 0.0005205388843302446, + "loss": 1.3596, + "step": 4673 + }, + { + "epoch": 0.5, + "grad_norm": 0.059370486462788453, + "learning_rate": 0.0005203649227139491, + "loss": 1.4954, + "step": 4674 + }, + { + "epoch": 0.5, + "grad_norm": 0.06702999161881795, + "learning_rate": 0.0005201909586283365, + "loss": 1.419, + "step": 4675 + }, + { + "epoch": 0.5, + "grad_norm": 0.07669954138606098, + "learning_rate": 0.0005200169920945005, + "loss": 1.344, + "step": 4676 + }, + { + "epoch": 0.5, + "grad_norm": 0.07665471343138372, + "learning_rate": 0.0005198430231335352, + "loss": 1.3518, + "step": 4677 + }, + { + "epoch": 0.5, + "grad_norm": 0.07485280276094071, + "learning_rate": 0.000519669051766535, + "loss": 1.3616, + "step": 4678 + }, + { + "epoch": 0.5, + "grad_norm": 0.08512190117286093, + "learning_rate": 0.0005194950780145945, + "loss": 1.4598, + "step": 4679 + }, + { + "epoch": 0.5, + "grad_norm": 0.07047653270509795, + "learning_rate": 0.0005193211018988084, + "loss": 1.3529, + "step": 4680 + }, + { + "epoch": 0.5, + "grad_norm": 0.06881206519124212, + "learning_rate": 0.0005191471234402723, + "loss": 1.4288, + "step": 4681 + }, + { + "epoch": 0.5, + "grad_norm": 0.0804348080365886, + "learning_rate": 0.0005189731426600813, + "loss": 1.3915, + "step": 4682 + }, + { + "epoch": 0.5, + "grad_norm": 0.06573587273377841, + "learning_rate": 0.0005187991595793314, + "loss": 1.4739, + "step": 4683 + }, + { + "epoch": 0.5, + "grad_norm": 0.06985488464953007, + "learning_rate": 0.0005186251742191187, + "loss": 1.4218, + "step": 4684 + }, + { + "epoch": 0.5, + "grad_norm": 0.08365828393922413, + "learning_rate": 0.0005184511866005392, + "loss": 1.5171, + "step": 4685 + }, + { + "epoch": 0.5, + "grad_norm": 0.06799405095670837, + "learning_rate": 0.0005182771967446899, + "loss": 1.4046, + "step": 4686 + }, + { + "epoch": 0.5, + "grad_norm": 0.07745680866308548, + "learning_rate": 0.0005181032046726674, + "loss": 1.3601, + "step": 4687 + }, + { + "epoch": 0.5, + "grad_norm": 0.07401868558341498, + "learning_rate": 0.0005179292104055689, + "loss": 1.408, + "step": 4688 + }, + { + "epoch": 0.5, + "grad_norm": 0.08185325765888811, + "learning_rate": 0.0005177552139644919, + "loss": 1.3529, + "step": 4689 + }, + { + "epoch": 0.5, + "grad_norm": 0.06555219557774031, + "learning_rate": 0.0005175812153705339, + "loss": 1.3481, + "step": 4690 + }, + { + "epoch": 0.5, + "grad_norm": 0.07034421079868959, + "learning_rate": 0.000517407214644793, + "loss": 1.4121, + "step": 4691 + }, + { + "epoch": 0.5, + "grad_norm": 0.06276669263952558, + "learning_rate": 0.0005172332118083673, + "loss": 1.3234, + "step": 4692 + }, + { + "epoch": 0.5, + "grad_norm": 0.07723946949855989, + "learning_rate": 0.0005170592068823553, + "loss": 1.5377, + "step": 4693 + }, + { + "epoch": 0.5, + "grad_norm": 0.08084112493527873, + "learning_rate": 0.0005168851998878555, + "loss": 1.3488, + "step": 4694 + }, + { + "epoch": 0.5, + "grad_norm": 0.0727018555758983, + "learning_rate": 0.0005167111908459672, + "loss": 1.3023, + "step": 4695 + }, + { + "epoch": 0.5, + "grad_norm": 0.07233243830653298, + "learning_rate": 0.0005165371797777894, + "loss": 1.4445, + "step": 4696 + }, + { + "epoch": 0.5, + "grad_norm": 0.07291339507881155, + "learning_rate": 0.0005163631667044213, + "loss": 1.4742, + "step": 4697 + }, + { + "epoch": 0.51, + "grad_norm": 0.07670065393263528, + "learning_rate": 0.000516189151646963, + "loss": 1.4396, + "step": 4698 + }, + { + "epoch": 0.51, + "grad_norm": 0.07358001838784738, + "learning_rate": 0.0005160151346265142, + "loss": 1.409, + "step": 4699 + }, + { + "epoch": 0.51, + "grad_norm": 0.08157498764671632, + "learning_rate": 0.0005158411156641751, + "loss": 1.3561, + "step": 4700 + }, + { + "epoch": 0.51, + "grad_norm": 0.07501124171108461, + "learning_rate": 0.0005156670947810462, + "loss": 1.4614, + "step": 4701 + }, + { + "epoch": 0.51, + "grad_norm": 0.06975465507140635, + "learning_rate": 0.000515493071998228, + "loss": 1.3463, + "step": 4702 + }, + { + "epoch": 0.51, + "grad_norm": 0.07219506375827546, + "learning_rate": 0.0005153190473368213, + "loss": 1.4587, + "step": 4703 + }, + { + "epoch": 0.51, + "grad_norm": 0.07223521378851384, + "learning_rate": 0.0005151450208179276, + "loss": 1.3898, + "step": 4704 + }, + { + "epoch": 0.51, + "grad_norm": 0.08370766965759803, + "learning_rate": 0.0005149709924626476, + "loss": 1.3624, + "step": 4705 + }, + { + "epoch": 0.51, + "grad_norm": 0.08079188900750653, + "learning_rate": 0.0005147969622920832, + "loss": 1.3803, + "step": 4706 + }, + { + "epoch": 0.51, + "grad_norm": 0.07792965487429286, + "learning_rate": 0.0005146229303273363, + "loss": 1.4281, + "step": 4707 + }, + { + "epoch": 0.51, + "grad_norm": 0.08238838852280057, + "learning_rate": 0.0005144488965895084, + "loss": 1.4642, + "step": 4708 + }, + { + "epoch": 0.51, + "grad_norm": 0.0669417006150072, + "learning_rate": 0.0005142748610997023, + "loss": 1.484, + "step": 4709 + }, + { + "epoch": 0.51, + "grad_norm": 0.07877637474591272, + "learning_rate": 0.0005141008238790199, + "loss": 1.3851, + "step": 4710 + }, + { + "epoch": 0.51, + "grad_norm": 0.07109987453919168, + "learning_rate": 0.0005139267849485639, + "loss": 1.3506, + "step": 4711 + }, + { + "epoch": 0.51, + "grad_norm": 0.07645449263811037, + "learning_rate": 0.0005137527443294374, + "loss": 1.4377, + "step": 4712 + }, + { + "epoch": 0.51, + "grad_norm": 0.06840270491375017, + "learning_rate": 0.0005135787020427432, + "loss": 1.3547, + "step": 4713 + }, + { + "epoch": 0.51, + "grad_norm": 0.07120138812630702, + "learning_rate": 0.0005134046581095844, + "loss": 1.4399, + "step": 4714 + }, + { + "epoch": 0.51, + "grad_norm": 0.07151681302142116, + "learning_rate": 0.0005132306125510648, + "loss": 1.2813, + "step": 4715 + }, + { + "epoch": 0.51, + "grad_norm": 0.06240065323117308, + "learning_rate": 0.0005130565653882877, + "loss": 1.3654, + "step": 4716 + }, + { + "epoch": 0.51, + "grad_norm": 0.06194617156336227, + "learning_rate": 0.0005128825166423569, + "loss": 1.5099, + "step": 4717 + }, + { + "epoch": 0.51, + "grad_norm": 0.059304667134885136, + "learning_rate": 0.0005127084663343769, + "loss": 1.4544, + "step": 4718 + }, + { + "epoch": 0.51, + "grad_norm": 0.06483159155925523, + "learning_rate": 0.0005125344144854513, + "loss": 1.5357, + "step": 4719 + }, + { + "epoch": 0.51, + "grad_norm": 0.06246551571893033, + "learning_rate": 0.0005123603611166848, + "loss": 1.5015, + "step": 4720 + }, + { + "epoch": 0.51, + "grad_norm": 0.061595172916569754, + "learning_rate": 0.0005121863062491818, + "loss": 1.3058, + "step": 4721 + }, + { + "epoch": 0.51, + "grad_norm": 0.06544023845066463, + "learning_rate": 0.0005120122499040473, + "loss": 1.392, + "step": 4722 + }, + { + "epoch": 0.51, + "grad_norm": 0.0759664433784072, + "learning_rate": 0.0005118381921023859, + "loss": 1.4097, + "step": 4723 + }, + { + "epoch": 0.51, + "grad_norm": 0.07274017908537568, + "learning_rate": 0.0005116641328653031, + "loss": 1.5345, + "step": 4724 + }, + { + "epoch": 0.51, + "grad_norm": 0.06619000110896354, + "learning_rate": 0.0005114900722139039, + "loss": 1.4603, + "step": 4725 + }, + { + "epoch": 0.51, + "grad_norm": 0.07000980439320045, + "learning_rate": 0.0005113160101692938, + "loss": 1.4563, + "step": 4726 + }, + { + "epoch": 0.51, + "grad_norm": 0.0685154283152642, + "learning_rate": 0.0005111419467525786, + "loss": 1.4495, + "step": 4727 + }, + { + "epoch": 0.51, + "grad_norm": 0.07039172540427986, + "learning_rate": 0.0005109678819848637, + "loss": 1.403, + "step": 4728 + }, + { + "epoch": 0.51, + "grad_norm": 0.06839571480775747, + "learning_rate": 0.0005107938158872554, + "loss": 1.4476, + "step": 4729 + }, + { + "epoch": 0.51, + "grad_norm": 0.06564357301738481, + "learning_rate": 0.0005106197484808598, + "loss": 1.368, + "step": 4730 + }, + { + "epoch": 0.51, + "grad_norm": 0.06865827489358711, + "learning_rate": 0.0005104456797867831, + "loss": 1.3423, + "step": 4731 + }, + { + "epoch": 0.51, + "grad_norm": 0.07352389360835015, + "learning_rate": 0.0005102716098261315, + "loss": 1.4653, + "step": 4732 + }, + { + "epoch": 0.51, + "grad_norm": 0.07081825517552962, + "learning_rate": 0.0005100975386200119, + "loss": 1.3664, + "step": 4733 + }, + { + "epoch": 0.51, + "grad_norm": 0.07545040261023422, + "learning_rate": 0.000509923466189531, + "loss": 1.5089, + "step": 4734 + }, + { + "epoch": 0.51, + "grad_norm": 0.07863333546994716, + "learning_rate": 0.0005097493925557956, + "loss": 1.5012, + "step": 4735 + }, + { + "epoch": 0.51, + "grad_norm": 0.06897139505039224, + "learning_rate": 0.0005095753177399127, + "loss": 1.4103, + "step": 4736 + }, + { + "epoch": 0.51, + "grad_norm": 0.0709893551702056, + "learning_rate": 0.0005094012417629895, + "loss": 1.5152, + "step": 4737 + }, + { + "epoch": 0.51, + "grad_norm": 0.07522261575946469, + "learning_rate": 0.0005092271646461334, + "loss": 1.3445, + "step": 4738 + }, + { + "epoch": 0.51, + "grad_norm": 0.07113884668637967, + "learning_rate": 0.0005090530864104517, + "loss": 1.3651, + "step": 4739 + }, + { + "epoch": 0.51, + "grad_norm": 0.07430397727410497, + "learning_rate": 0.000508879007077052, + "loss": 1.5024, + "step": 4740 + }, + { + "epoch": 0.51, + "grad_norm": 0.06937378871027804, + "learning_rate": 0.0005087049266670424, + "loss": 1.4918, + "step": 4741 + }, + { + "epoch": 0.51, + "grad_norm": 0.07196532177141281, + "learning_rate": 0.0005085308452015301, + "loss": 1.5521, + "step": 4742 + }, + { + "epoch": 0.51, + "grad_norm": 0.07511206297719938, + "learning_rate": 0.0005083567627016235, + "loss": 1.5095, + "step": 4743 + }, + { + "epoch": 0.51, + "grad_norm": 0.06208055254053003, + "learning_rate": 0.0005081826791884307, + "loss": 1.4788, + "step": 4744 + }, + { + "epoch": 0.51, + "grad_norm": 0.06275328428534259, + "learning_rate": 0.0005080085946830596, + "loss": 1.2569, + "step": 4745 + }, + { + "epoch": 0.51, + "grad_norm": 0.07479166475912431, + "learning_rate": 0.0005078345092066191, + "loss": 1.3658, + "step": 4746 + }, + { + "epoch": 0.51, + "grad_norm": 0.06983578026027626, + "learning_rate": 0.0005076604227802171, + "loss": 1.506, + "step": 4747 + }, + { + "epoch": 0.51, + "grad_norm": 0.059381228868428844, + "learning_rate": 0.0005074863354249625, + "loss": 1.5236, + "step": 4748 + }, + { + "epoch": 0.51, + "grad_norm": 0.07064049104101132, + "learning_rate": 0.000507312247161964, + "loss": 1.2982, + "step": 4749 + }, + { + "epoch": 0.51, + "grad_norm": 0.06537241706728257, + "learning_rate": 0.0005071381580123302, + "loss": 1.3971, + "step": 4750 + }, + { + "epoch": 0.51, + "grad_norm": 0.0705342768829638, + "learning_rate": 0.0005069640679971702, + "loss": 1.556, + "step": 4751 + }, + { + "epoch": 0.51, + "grad_norm": 0.07234045693835783, + "learning_rate": 0.000506789977137593, + "loss": 1.4512, + "step": 4752 + }, + { + "epoch": 0.51, + "grad_norm": 0.07044135313910474, + "learning_rate": 0.0005066158854547075, + "loss": 1.4174, + "step": 4753 + }, + { + "epoch": 0.51, + "grad_norm": 0.07391834794874395, + "learning_rate": 0.0005064417929696232, + "loss": 1.378, + "step": 4754 + }, + { + "epoch": 0.51, + "grad_norm": 0.0717476739675131, + "learning_rate": 0.0005062676997034493, + "loss": 1.3867, + "step": 4755 + }, + { + "epoch": 0.51, + "grad_norm": 0.067547995646433, + "learning_rate": 0.0005060936056772951, + "loss": 1.5415, + "step": 4756 + }, + { + "epoch": 0.51, + "grad_norm": 0.0709070460555177, + "learning_rate": 0.0005059195109122705, + "loss": 1.4631, + "step": 4757 + }, + { + "epoch": 0.51, + "grad_norm": 0.0732869766299112, + "learning_rate": 0.0005057454154294846, + "loss": 1.4731, + "step": 4758 + }, + { + "epoch": 0.51, + "grad_norm": 0.07631628853739582, + "learning_rate": 0.0005055713192500472, + "loss": 1.3617, + "step": 4759 + }, + { + "epoch": 0.51, + "grad_norm": 0.07557827019800746, + "learning_rate": 0.0005053972223950682, + "loss": 1.3886, + "step": 4760 + }, + { + "epoch": 0.51, + "grad_norm": 0.07541011499079218, + "learning_rate": 0.0005052231248856573, + "loss": 1.4401, + "step": 4761 + }, + { + "epoch": 0.51, + "grad_norm": 0.06878411120055702, + "learning_rate": 0.0005050490267429246, + "loss": 1.4652, + "step": 4762 + }, + { + "epoch": 0.51, + "grad_norm": 0.06739762564942098, + "learning_rate": 0.00050487492798798, + "loss": 1.5507, + "step": 4763 + }, + { + "epoch": 0.51, + "grad_norm": 0.08015595700952176, + "learning_rate": 0.0005047008286419336, + "loss": 1.3467, + "step": 4764 + }, + { + "epoch": 0.51, + "grad_norm": 0.07001966186468692, + "learning_rate": 0.0005045267287258953, + "loss": 1.4121, + "step": 4765 + }, + { + "epoch": 0.51, + "grad_norm": 0.07030643248207329, + "learning_rate": 0.0005043526282609757, + "loss": 1.4879, + "step": 4766 + }, + { + "epoch": 0.51, + "grad_norm": 0.08827989390479461, + "learning_rate": 0.0005041785272682849, + "loss": 1.3802, + "step": 4767 + }, + { + "epoch": 0.51, + "grad_norm": 0.06827504201270179, + "learning_rate": 0.0005040044257689333, + "loss": 1.2448, + "step": 4768 + }, + { + "epoch": 0.51, + "grad_norm": 0.07915638860324421, + "learning_rate": 0.0005038303237840314, + "loss": 1.4055, + "step": 4769 + }, + { + "epoch": 0.51, + "grad_norm": 0.06918765384054784, + "learning_rate": 0.0005036562213346893, + "loss": 1.538, + "step": 4770 + }, + { + "epoch": 0.51, + "grad_norm": 0.08004483251964818, + "learning_rate": 0.0005034821184420179, + "loss": 1.279, + "step": 4771 + }, + { + "epoch": 0.51, + "grad_norm": 0.06912086940038036, + "learning_rate": 0.0005033080151271276, + "loss": 1.3917, + "step": 4772 + }, + { + "epoch": 0.51, + "grad_norm": 0.0714885845925155, + "learning_rate": 0.000503133911411129, + "loss": 1.394, + "step": 4773 + }, + { + "epoch": 0.51, + "grad_norm": 0.06714730640721422, + "learning_rate": 0.0005029598073151329, + "loss": 1.4416, + "step": 4774 + }, + { + "epoch": 0.51, + "grad_norm": 0.07827813518251049, + "learning_rate": 0.00050278570286025, + "loss": 1.3962, + "step": 4775 + }, + { + "epoch": 0.51, + "grad_norm": 0.08034014575300731, + "learning_rate": 0.0005026115980675908, + "loss": 1.3307, + "step": 4776 + }, + { + "epoch": 0.51, + "grad_norm": 0.0765192032903928, + "learning_rate": 0.0005024374929582664, + "loss": 1.4161, + "step": 4777 + }, + { + "epoch": 0.51, + "grad_norm": 0.06608261104152828, + "learning_rate": 0.0005022633875533879, + "loss": 1.2704, + "step": 4778 + }, + { + "epoch": 0.51, + "grad_norm": 0.07850238429821357, + "learning_rate": 0.0005020892818740656, + "loss": 1.4342, + "step": 4779 + }, + { + "epoch": 0.51, + "grad_norm": 0.06403022438627028, + "learning_rate": 0.0005019151759414107, + "loss": 1.4186, + "step": 4780 + }, + { + "epoch": 0.51, + "grad_norm": 0.06799933698291283, + "learning_rate": 0.0005017410697765342, + "loss": 1.362, + "step": 4781 + }, + { + "epoch": 0.51, + "grad_norm": 0.0665150830459829, + "learning_rate": 0.0005015669634005467, + "loss": 1.4916, + "step": 4782 + }, + { + "epoch": 0.51, + "grad_norm": 0.07236361062725231, + "learning_rate": 0.0005013928568345597, + "loss": 1.2738, + "step": 4783 + }, + { + "epoch": 0.51, + "grad_norm": 0.06861181480465103, + "learning_rate": 0.000501218750099684, + "loss": 1.3728, + "step": 4784 + }, + { + "epoch": 0.51, + "grad_norm": 0.06525807506942764, + "learning_rate": 0.0005010446432170306, + "loss": 1.5133, + "step": 4785 + }, + { + "epoch": 0.51, + "grad_norm": 0.06543208409660094, + "learning_rate": 0.0005008705362077108, + "loss": 1.3821, + "step": 4786 + }, + { + "epoch": 0.51, + "grad_norm": 0.07321867883420735, + "learning_rate": 0.0005006964290928351, + "loss": 1.3791, + "step": 4787 + }, + { + "epoch": 0.51, + "grad_norm": 0.07359931820206309, + "learning_rate": 0.0005005223218935152, + "loss": 1.3444, + "step": 4788 + }, + { + "epoch": 0.51, + "grad_norm": 0.07192584249908926, + "learning_rate": 0.0005003482146308621, + "loss": 1.3597, + "step": 4789 + }, + { + "epoch": 0.51, + "grad_norm": 0.06602677971933513, + "learning_rate": 0.0005001741073259866, + "loss": 1.5547, + "step": 4790 + }, + { + "epoch": 0.52, + "grad_norm": 0.07961813739728019, + "learning_rate": 0.0005, + "loss": 1.4441, + "step": 4791 + }, + { + "epoch": 0.52, + "grad_norm": 0.07216239496007772, + "learning_rate": 0.0004998258926740136, + "loss": 1.3991, + "step": 4792 + }, + { + "epoch": 0.52, + "grad_norm": 0.06996627259186475, + "learning_rate": 0.0004996517853691379, + "loss": 1.498, + "step": 4793 + }, + { + "epoch": 0.52, + "grad_norm": 0.06657604028460089, + "learning_rate": 0.0004994776781064847, + "loss": 1.3081, + "step": 4794 + }, + { + "epoch": 0.52, + "grad_norm": 0.06303048383434355, + "learning_rate": 0.0004993035709071648, + "loss": 1.2912, + "step": 4795 + }, + { + "epoch": 0.52, + "grad_norm": 0.06521095894949946, + "learning_rate": 0.0004991294637922893, + "loss": 1.3, + "step": 4796 + }, + { + "epoch": 0.52, + "grad_norm": 0.07399255311468003, + "learning_rate": 0.0004989553567829695, + "loss": 1.4204, + "step": 4797 + }, + { + "epoch": 0.52, + "grad_norm": 0.08417173627975091, + "learning_rate": 0.000498781249900316, + "loss": 1.514, + "step": 4798 + }, + { + "epoch": 0.52, + "grad_norm": 0.07708032383741255, + "learning_rate": 0.0004986071431654404, + "loss": 1.4721, + "step": 4799 + }, + { + "epoch": 0.52, + "grad_norm": 0.06307820528877853, + "learning_rate": 0.0004984330365994535, + "loss": 1.373, + "step": 4800 + }, + { + "epoch": 0.52, + "grad_norm": 0.06111579146187217, + "learning_rate": 0.000498258930223466, + "loss": 1.3943, + "step": 4801 + }, + { + "epoch": 0.52, + "grad_norm": 0.06554523449766973, + "learning_rate": 0.0004980848240585895, + "loss": 1.3626, + "step": 4802 + }, + { + "epoch": 0.52, + "grad_norm": 0.06604543741412207, + "learning_rate": 0.0004979107181259345, + "loss": 1.4796, + "step": 4803 + }, + { + "epoch": 0.52, + "grad_norm": 0.06646732193706745, + "learning_rate": 0.0004977366124466121, + "loss": 1.337, + "step": 4804 + }, + { + "epoch": 0.52, + "grad_norm": 0.07501224583003234, + "learning_rate": 0.0004975625070417335, + "loss": 1.4135, + "step": 4805 + }, + { + "epoch": 0.52, + "grad_norm": 0.0737723782461909, + "learning_rate": 0.0004973884019324092, + "loss": 1.4717, + "step": 4806 + }, + { + "epoch": 0.52, + "grad_norm": 0.07017623757888976, + "learning_rate": 0.0004972142971397503, + "loss": 1.2908, + "step": 4807 + }, + { + "epoch": 0.52, + "grad_norm": 0.08090698115220855, + "learning_rate": 0.0004970401926848673, + "loss": 1.3435, + "step": 4808 + }, + { + "epoch": 0.52, + "grad_norm": 0.07428142762072636, + "learning_rate": 0.0004968660885888712, + "loss": 1.3949, + "step": 4809 + }, + { + "epoch": 0.52, + "grad_norm": 0.06726515819501672, + "learning_rate": 0.0004966919848728726, + "loss": 1.5103, + "step": 4810 + }, + { + "epoch": 0.52, + "grad_norm": 0.07380195335443142, + "learning_rate": 0.0004965178815579822, + "loss": 1.3751, + "step": 4811 + }, + { + "epoch": 0.52, + "grad_norm": 0.07761558508982735, + "learning_rate": 0.0004963437786653108, + "loss": 1.3809, + "step": 4812 + }, + { + "epoch": 0.52, + "grad_norm": 0.07763719108725761, + "learning_rate": 0.0004961696762159687, + "loss": 1.4223, + "step": 4813 + }, + { + "epoch": 0.52, + "grad_norm": 0.07189756416959095, + "learning_rate": 0.0004959955742310667, + "loss": 1.3861, + "step": 4814 + }, + { + "epoch": 0.52, + "grad_norm": 0.08204004830906156, + "learning_rate": 0.0004958214727317151, + "loss": 1.3729, + "step": 4815 + }, + { + "epoch": 0.52, + "grad_norm": 0.07748801457158774, + "learning_rate": 0.0004956473717390242, + "loss": 1.4368, + "step": 4816 + }, + { + "epoch": 0.52, + "grad_norm": 0.07114876371367315, + "learning_rate": 0.0004954732712741046, + "loss": 1.3806, + "step": 4817 + }, + { + "epoch": 0.52, + "grad_norm": 0.07188339735039269, + "learning_rate": 0.0004952991713580667, + "loss": 1.2483, + "step": 4818 + }, + { + "epoch": 0.52, + "grad_norm": 0.07930257872447899, + "learning_rate": 0.0004951250720120203, + "loss": 1.3644, + "step": 4819 + }, + { + "epoch": 0.52, + "grad_norm": 0.08348873172896026, + "learning_rate": 0.0004949509732570756, + "loss": 1.3957, + "step": 4820 + }, + { + "epoch": 0.52, + "grad_norm": 0.08389458458452004, + "learning_rate": 0.0004947768751143428, + "loss": 1.4093, + "step": 4821 + }, + { + "epoch": 0.52, + "grad_norm": 0.06612652000902616, + "learning_rate": 0.000494602777604932, + "loss": 1.3784, + "step": 4822 + }, + { + "epoch": 0.52, + "grad_norm": 0.0741760050819813, + "learning_rate": 0.0004944286807499529, + "loss": 1.4181, + "step": 4823 + }, + { + "epoch": 0.52, + "grad_norm": 0.07490795828156847, + "learning_rate": 0.0004942545845705155, + "loss": 1.3859, + "step": 4824 + }, + { + "epoch": 0.52, + "grad_norm": 0.07099564390158039, + "learning_rate": 0.0004940804890877297, + "loss": 1.437, + "step": 4825 + }, + { + "epoch": 0.52, + "grad_norm": 0.07139635992893732, + "learning_rate": 0.0004939063943227048, + "loss": 1.2719, + "step": 4826 + }, + { + "epoch": 0.52, + "grad_norm": 0.07376969080532272, + "learning_rate": 0.0004937323002965506, + "loss": 1.4981, + "step": 4827 + }, + { + "epoch": 0.52, + "grad_norm": 0.07045853122954072, + "learning_rate": 0.0004935582070303767, + "loss": 1.3566, + "step": 4828 + }, + { + "epoch": 0.52, + "grad_norm": 0.0717218632271678, + "learning_rate": 0.0004933841145452926, + "loss": 1.2817, + "step": 4829 + }, + { + "epoch": 0.52, + "grad_norm": 0.06912439103196728, + "learning_rate": 0.0004932100228624072, + "loss": 1.3627, + "step": 4830 + }, + { + "epoch": 0.52, + "grad_norm": 0.06397379656010241, + "learning_rate": 0.0004930359320028299, + "loss": 1.3331, + "step": 4831 + }, + { + "epoch": 0.52, + "grad_norm": 0.072065307732877, + "learning_rate": 0.0004928618419876698, + "loss": 1.4236, + "step": 4832 + }, + { + "epoch": 0.52, + "grad_norm": 0.06265824186043788, + "learning_rate": 0.0004926877528380362, + "loss": 1.4034, + "step": 4833 + }, + { + "epoch": 0.52, + "grad_norm": 0.07470555319601731, + "learning_rate": 0.0004925136645750376, + "loss": 1.3959, + "step": 4834 + }, + { + "epoch": 0.52, + "grad_norm": 0.06716092261155211, + "learning_rate": 0.0004923395772197829, + "loss": 1.3977, + "step": 4835 + }, + { + "epoch": 0.52, + "grad_norm": 0.07475858200018855, + "learning_rate": 0.000492165490793381, + "loss": 1.3418, + "step": 4836 + }, + { + "epoch": 0.52, + "grad_norm": 0.07327276783550837, + "learning_rate": 0.0004919914053169404, + "loss": 1.5526, + "step": 4837 + }, + { + "epoch": 0.52, + "grad_norm": 0.06446215898606858, + "learning_rate": 0.0004918173208115694, + "loss": 1.4026, + "step": 4838 + }, + { + "epoch": 0.52, + "grad_norm": 0.06850668413178283, + "learning_rate": 0.0004916432372983767, + "loss": 1.4541, + "step": 4839 + }, + { + "epoch": 0.52, + "grad_norm": 0.07883512917030402, + "learning_rate": 0.0004914691547984701, + "loss": 1.4152, + "step": 4840 + }, + { + "epoch": 0.52, + "grad_norm": 0.06626183570546103, + "learning_rate": 0.0004912950733329579, + "loss": 1.5067, + "step": 4841 + }, + { + "epoch": 0.52, + "grad_norm": 0.06982230220511904, + "learning_rate": 0.000491120992922948, + "loss": 1.4825, + "step": 4842 + }, + { + "epoch": 0.52, + "grad_norm": 0.06640439609436137, + "learning_rate": 0.0004909469135895484, + "loss": 1.5727, + "step": 4843 + }, + { + "epoch": 0.52, + "grad_norm": 0.07230302368111746, + "learning_rate": 0.0004907728353538667, + "loss": 1.3462, + "step": 4844 + }, + { + "epoch": 0.52, + "grad_norm": 0.07980176876442475, + "learning_rate": 0.0004905987582370106, + "loss": 1.2809, + "step": 4845 + }, + { + "epoch": 0.52, + "grad_norm": 0.07376211478806756, + "learning_rate": 0.0004904246822600874, + "loss": 1.3073, + "step": 4846 + }, + { + "epoch": 0.52, + "grad_norm": 0.08760449272236642, + "learning_rate": 0.0004902506074442044, + "loss": 1.4225, + "step": 4847 + }, + { + "epoch": 0.52, + "grad_norm": 0.06840610524758446, + "learning_rate": 0.000490076533810469, + "loss": 1.2817, + "step": 4848 + }, + { + "epoch": 0.52, + "grad_norm": 0.07559520805663945, + "learning_rate": 0.0004899024613799881, + "loss": 1.5285, + "step": 4849 + }, + { + "epoch": 0.52, + "grad_norm": 0.06808732086358632, + "learning_rate": 0.0004897283901738686, + "loss": 1.3411, + "step": 4850 + }, + { + "epoch": 0.52, + "grad_norm": 0.06805750834540934, + "learning_rate": 0.0004895543202132172, + "loss": 1.431, + "step": 4851 + }, + { + "epoch": 0.52, + "grad_norm": 0.08061022722716565, + "learning_rate": 0.0004893802515191403, + "loss": 1.4008, + "step": 4852 + }, + { + "epoch": 0.52, + "grad_norm": 0.07093877908970345, + "learning_rate": 0.0004892061841127446, + "loss": 1.3412, + "step": 4853 + }, + { + "epoch": 0.52, + "grad_norm": 0.07059607839069051, + "learning_rate": 0.0004890321180151364, + "loss": 1.3494, + "step": 4854 + }, + { + "epoch": 0.52, + "grad_norm": 0.07822255808458586, + "learning_rate": 0.0004888580532474216, + "loss": 1.5409, + "step": 4855 + }, + { + "epoch": 0.52, + "grad_norm": 0.07227079108820397, + "learning_rate": 0.0004886839898307062, + "loss": 1.473, + "step": 4856 + }, + { + "epoch": 0.52, + "grad_norm": 0.07038880965726567, + "learning_rate": 0.0004885099277860961, + "loss": 1.3489, + "step": 4857 + }, + { + "epoch": 0.52, + "grad_norm": 0.06991878687842776, + "learning_rate": 0.0004883358671346968, + "loss": 1.4336, + "step": 4858 + }, + { + "epoch": 0.52, + "grad_norm": 0.06717727331715036, + "learning_rate": 0.000488161807897614, + "loss": 1.4411, + "step": 4859 + }, + { + "epoch": 0.52, + "grad_norm": 0.0774671198594342, + "learning_rate": 0.00048798775009595285, + "loss": 1.4623, + "step": 4860 + }, + { + "epoch": 0.52, + "grad_norm": 0.06980051958932858, + "learning_rate": 0.0004878136937508183, + "loss": 1.3775, + "step": 4861 + }, + { + "epoch": 0.52, + "grad_norm": 0.06641487458265112, + "learning_rate": 0.00048763963888331544, + "loss": 1.4088, + "step": 4862 + }, + { + "epoch": 0.52, + "grad_norm": 0.076824429835218, + "learning_rate": 0.00048746558551454876, + "loss": 1.3507, + "step": 4863 + }, + { + "epoch": 0.52, + "grad_norm": 0.06712314946460401, + "learning_rate": 0.00048729153366562324, + "loss": 1.364, + "step": 4864 + }, + { + "epoch": 0.52, + "grad_norm": 0.06919590987860398, + "learning_rate": 0.00048711748335764305, + "loss": 1.4172, + "step": 4865 + }, + { + "epoch": 0.52, + "grad_norm": 0.07129027417864583, + "learning_rate": 0.00048694343461171233, + "loss": 1.5116, + "step": 4866 + }, + { + "epoch": 0.52, + "grad_norm": 0.0732730577898096, + "learning_rate": 0.0004867693874489353, + "loss": 1.504, + "step": 4867 + }, + { + "epoch": 0.52, + "grad_norm": 0.06935474063557696, + "learning_rate": 0.0004865953418904156, + "loss": 1.5817, + "step": 4868 + }, + { + "epoch": 0.52, + "grad_norm": 0.06946055369077489, + "learning_rate": 0.0004864212979572569, + "loss": 1.5422, + "step": 4869 + }, + { + "epoch": 0.52, + "grad_norm": 0.08287847765182309, + "learning_rate": 0.0004862472556705626, + "loss": 1.4387, + "step": 4870 + }, + { + "epoch": 0.52, + "grad_norm": 0.06424473267777256, + "learning_rate": 0.00048607321505143614, + "loss": 1.3416, + "step": 4871 + }, + { + "epoch": 0.52, + "grad_norm": 0.07389225955640676, + "learning_rate": 0.0004858991761209803, + "loss": 1.3793, + "step": 4872 + }, + { + "epoch": 0.52, + "grad_norm": 0.07360082163492014, + "learning_rate": 0.0004857251389002979, + "loss": 1.344, + "step": 4873 + }, + { + "epoch": 0.52, + "grad_norm": 0.07449052954350506, + "learning_rate": 0.0004855511034104916, + "loss": 1.3882, + "step": 4874 + }, + { + "epoch": 0.52, + "grad_norm": 0.08073253698261196, + "learning_rate": 0.0004853770696726638, + "loss": 1.3821, + "step": 4875 + }, + { + "epoch": 0.52, + "grad_norm": 0.07201609348875974, + "learning_rate": 0.0004852030377079168, + "loss": 1.3864, + "step": 4876 + }, + { + "epoch": 0.52, + "grad_norm": 0.07729040284480578, + "learning_rate": 0.00048502900753735246, + "loss": 1.3482, + "step": 4877 + }, + { + "epoch": 0.52, + "grad_norm": 0.08204100045657077, + "learning_rate": 0.0004848549791820725, + "loss": 1.3878, + "step": 4878 + }, + { + "epoch": 0.52, + "grad_norm": 0.07915089902414485, + "learning_rate": 0.0004846809526631786, + "loss": 1.4156, + "step": 4879 + }, + { + "epoch": 0.52, + "grad_norm": 0.06668874663977116, + "learning_rate": 0.00048450692800177205, + "loss": 1.4023, + "step": 4880 + }, + { + "epoch": 0.52, + "grad_norm": 0.07506899782913949, + "learning_rate": 0.00048433290521895375, + "loss": 1.5963, + "step": 4881 + }, + { + "epoch": 0.52, + "grad_norm": 0.08009982251249653, + "learning_rate": 0.000484158884335825, + "loss": 1.5212, + "step": 4882 + }, + { + "epoch": 0.52, + "grad_norm": 0.07433691781861437, + "learning_rate": 0.00048398486537348583, + "loss": 1.4625, + "step": 4883 + }, + { + "epoch": 0.53, + "grad_norm": 0.07517421414037016, + "learning_rate": 0.0004838108483530371, + "loss": 1.3501, + "step": 4884 + }, + { + "epoch": 0.53, + "grad_norm": 0.07718074248971674, + "learning_rate": 0.00048363683329557877, + "loss": 1.449, + "step": 4885 + }, + { + "epoch": 0.53, + "grad_norm": 0.06772550140183944, + "learning_rate": 0.0004834628202222107, + "loss": 1.4537, + "step": 4886 + }, + { + "epoch": 0.53, + "grad_norm": 0.08030132699081302, + "learning_rate": 0.00048328880915403285, + "loss": 1.3553, + "step": 4887 + }, + { + "epoch": 0.53, + "grad_norm": 0.08022333563213116, + "learning_rate": 0.0004831148001121445, + "loss": 1.3818, + "step": 4888 + }, + { + "epoch": 0.53, + "grad_norm": 0.08425144353421923, + "learning_rate": 0.0004829407931176447, + "loss": 1.4234, + "step": 4889 + }, + { + "epoch": 0.53, + "grad_norm": 0.07772699541497993, + "learning_rate": 0.00048276678819163265, + "loss": 1.275, + "step": 4890 + }, + { + "epoch": 0.53, + "grad_norm": 0.07129243130758527, + "learning_rate": 0.00048259278535520703, + "loss": 1.3798, + "step": 4891 + }, + { + "epoch": 0.53, + "grad_norm": 0.08053342693305993, + "learning_rate": 0.0004824187846294662, + "loss": 1.5514, + "step": 4892 + }, + { + "epoch": 0.53, + "grad_norm": 0.08309886930824148, + "learning_rate": 0.00048224478603550833, + "loss": 1.538, + "step": 4893 + }, + { + "epoch": 0.53, + "grad_norm": 0.06876084759596014, + "learning_rate": 0.0004820707895944312, + "loss": 1.4856, + "step": 4894 + }, + { + "epoch": 0.53, + "grad_norm": 0.07354818557342893, + "learning_rate": 0.00048189679532733274, + "loss": 1.3469, + "step": 4895 + }, + { + "epoch": 0.53, + "grad_norm": 0.07530483097981298, + "learning_rate": 0.00048172280325531027, + "loss": 1.3601, + "step": 4896 + }, + { + "epoch": 0.53, + "grad_norm": 0.077507644997754, + "learning_rate": 0.0004815488133994608, + "loss": 1.4887, + "step": 4897 + }, + { + "epoch": 0.53, + "grad_norm": 0.07757860383810837, + "learning_rate": 0.0004813748257808814, + "loss": 1.4837, + "step": 4898 + }, + { + "epoch": 0.53, + "grad_norm": 0.0844479797681231, + "learning_rate": 0.00048120084042066865, + "loss": 1.4884, + "step": 4899 + }, + { + "epoch": 0.53, + "grad_norm": 0.09641157890597221, + "learning_rate": 0.0004810268573399187, + "loss": 1.4827, + "step": 4900 + }, + { + "epoch": 0.53, + "grad_norm": 0.07859998643895252, + "learning_rate": 0.0004808528765597278, + "loss": 1.4992, + "step": 4901 + }, + { + "epoch": 0.53, + "grad_norm": 0.07276275280157116, + "learning_rate": 0.00048067889810119157, + "loss": 1.3915, + "step": 4902 + }, + { + "epoch": 0.53, + "grad_norm": 0.08248731446994635, + "learning_rate": 0.00048050492198540575, + "loss": 1.5134, + "step": 4903 + }, + { + "epoch": 0.53, + "grad_norm": 0.07469307310941618, + "learning_rate": 0.00048033094823346517, + "loss": 1.429, + "step": 4904 + }, + { + "epoch": 0.53, + "grad_norm": 0.06984611564695324, + "learning_rate": 0.00048015697686646486, + "loss": 1.3318, + "step": 4905 + }, + { + "epoch": 0.53, + "grad_norm": 0.07699548220008161, + "learning_rate": 0.00047998300790549957, + "loss": 1.4534, + "step": 4906 + }, + { + "epoch": 0.53, + "grad_norm": 0.07689484583591935, + "learning_rate": 0.0004798090413716636, + "loss": 1.458, + "step": 4907 + }, + { + "epoch": 0.53, + "grad_norm": 0.0694871817732402, + "learning_rate": 0.00047963507728605105, + "loss": 1.4913, + "step": 4908 + }, + { + "epoch": 0.53, + "grad_norm": 0.07682175823997765, + "learning_rate": 0.00047946111566975544, + "loss": 1.4395, + "step": 4909 + }, + { + "epoch": 0.53, + "grad_norm": 0.07852039121044253, + "learning_rate": 0.00047928715654387043, + "loss": 1.3262, + "step": 4910 + }, + { + "epoch": 0.53, + "grad_norm": 0.07051721959880726, + "learning_rate": 0.0004791131999294891, + "loss": 1.5198, + "step": 4911 + }, + { + "epoch": 0.53, + "grad_norm": 0.0830403180149037, + "learning_rate": 0.00047893924584770423, + "loss": 1.4268, + "step": 4912 + }, + { + "epoch": 0.53, + "grad_norm": 0.07212490399033414, + "learning_rate": 0.0004787652943196087, + "loss": 1.3805, + "step": 4913 + }, + { + "epoch": 0.53, + "grad_norm": 0.07701948410209183, + "learning_rate": 0.0004785913453662941, + "loss": 1.4523, + "step": 4914 + }, + { + "epoch": 0.53, + "grad_norm": 0.08254302412735906, + "learning_rate": 0.00047841739900885284, + "loss": 1.3042, + "step": 4915 + }, + { + "epoch": 0.53, + "grad_norm": 0.06816403715521224, + "learning_rate": 0.0004782434552683763, + "loss": 1.4307, + "step": 4916 + }, + { + "epoch": 0.53, + "grad_norm": 0.07860013899326017, + "learning_rate": 0.0004780695141659557, + "loss": 1.6049, + "step": 4917 + }, + { + "epoch": 0.53, + "grad_norm": 0.06599668818079388, + "learning_rate": 0.0004778955757226823, + "loss": 1.4359, + "step": 4918 + }, + { + "epoch": 0.53, + "grad_norm": 0.0784233713723611, + "learning_rate": 0.0004777216399596465, + "loss": 1.2344, + "step": 4919 + }, + { + "epoch": 0.53, + "grad_norm": 0.0788183292534905, + "learning_rate": 0.0004775477068979385, + "loss": 1.416, + "step": 4920 + }, + { + "epoch": 0.53, + "grad_norm": 0.07366391835272944, + "learning_rate": 0.00047737377655864867, + "loss": 1.3873, + "step": 4921 + }, + { + "epoch": 0.53, + "grad_norm": 0.0644438910372059, + "learning_rate": 0.00047719984896286635, + "loss": 1.2997, + "step": 4922 + }, + { + "epoch": 0.53, + "grad_norm": 0.0819510510639513, + "learning_rate": 0.0004770259241316809, + "loss": 1.3432, + "step": 4923 + }, + { + "epoch": 0.53, + "grad_norm": 0.07460646311584986, + "learning_rate": 0.00047685200208618164, + "loss": 1.4545, + "step": 4924 + }, + { + "epoch": 0.53, + "grad_norm": 0.07142513170538935, + "learning_rate": 0.00047667808284745656, + "loss": 1.456, + "step": 4925 + }, + { + "epoch": 0.53, + "grad_norm": 0.0638211680555277, + "learning_rate": 0.0004765041664365945, + "loss": 1.384, + "step": 4926 + }, + { + "epoch": 0.53, + "grad_norm": 0.06456251777135609, + "learning_rate": 0.00047633025287468323, + "loss": 1.3933, + "step": 4927 + }, + { + "epoch": 0.53, + "grad_norm": 0.07219441230096289, + "learning_rate": 0.00047615634218281034, + "loss": 1.4249, + "step": 4928 + }, + { + "epoch": 0.53, + "grad_norm": 0.06770038061326396, + "learning_rate": 0.0004759824343820632, + "loss": 1.3451, + "step": 4929 + }, + { + "epoch": 0.53, + "grad_norm": 0.06401901008264176, + "learning_rate": 0.00047580852949352876, + "loss": 1.3593, + "step": 4930 + }, + { + "epoch": 0.53, + "grad_norm": 0.06572917995129061, + "learning_rate": 0.0004756346275382934, + "loss": 1.4683, + "step": 4931 + }, + { + "epoch": 0.53, + "grad_norm": 0.06690009643906077, + "learning_rate": 0.00047546072853744357, + "loss": 1.4288, + "step": 4932 + }, + { + "epoch": 0.53, + "grad_norm": 0.06554492012605238, + "learning_rate": 0.00047528683251206493, + "loss": 1.5643, + "step": 4933 + }, + { + "epoch": 0.53, + "grad_norm": 0.0632941403255531, + "learning_rate": 0.00047511293948324324, + "loss": 1.2482, + "step": 4934 + }, + { + "epoch": 0.53, + "grad_norm": 0.06731580775226245, + "learning_rate": 0.0004749390494720633, + "loss": 1.4638, + "step": 4935 + }, + { + "epoch": 0.53, + "grad_norm": 0.07153188163916746, + "learning_rate": 0.00047476516249960994, + "loss": 1.4326, + "step": 4936 + }, + { + "epoch": 0.53, + "grad_norm": 0.05895315648337665, + "learning_rate": 0.00047459127858696763, + "loss": 1.519, + "step": 4937 + }, + { + "epoch": 0.53, + "grad_norm": 0.07150295093078468, + "learning_rate": 0.00047441739775522045, + "loss": 1.3454, + "step": 4938 + }, + { + "epoch": 0.53, + "grad_norm": 0.06787792092572041, + "learning_rate": 0.000474243520025452, + "loss": 1.3111, + "step": 4939 + }, + { + "epoch": 0.53, + "grad_norm": 0.06984567395943, + "learning_rate": 0.00047406964541874544, + "loss": 1.4355, + "step": 4940 + }, + { + "epoch": 0.53, + "grad_norm": 0.07572599576818485, + "learning_rate": 0.00047389577395618387, + "loss": 1.5107, + "step": 4941 + }, + { + "epoch": 0.53, + "grad_norm": 0.08604265411994148, + "learning_rate": 0.0004737219056588497, + "loss": 1.4844, + "step": 4942 + }, + { + "epoch": 0.53, + "grad_norm": 0.06685170487349368, + "learning_rate": 0.00047354804054782493, + "loss": 1.3661, + "step": 4943 + }, + { + "epoch": 0.53, + "grad_norm": 0.07421917272121109, + "learning_rate": 0.0004733741786441916, + "loss": 1.3045, + "step": 4944 + }, + { + "epoch": 0.53, + "grad_norm": 0.08034262065898659, + "learning_rate": 0.00047320031996903094, + "loss": 1.3913, + "step": 4945 + }, + { + "epoch": 0.53, + "grad_norm": 0.07111550377539654, + "learning_rate": 0.0004730264645434238, + "loss": 1.4248, + "step": 4946 + }, + { + "epoch": 0.53, + "grad_norm": 0.07320459036647926, + "learning_rate": 0.0004728526123884508, + "loss": 1.378, + "step": 4947 + }, + { + "epoch": 0.53, + "grad_norm": 0.08675173001306088, + "learning_rate": 0.0004726787635251921, + "loss": 1.3621, + "step": 4948 + }, + { + "epoch": 0.53, + "grad_norm": 0.07183641238182674, + "learning_rate": 0.00047250491797472754, + "loss": 1.438, + "step": 4949 + }, + { + "epoch": 0.53, + "grad_norm": 0.0757586168872166, + "learning_rate": 0.00047233107575813657, + "loss": 1.5881, + "step": 4950 + }, + { + "epoch": 0.53, + "grad_norm": 0.07513930507667875, + "learning_rate": 0.0004721572368964979, + "loss": 1.5212, + "step": 4951 + }, + { + "epoch": 0.53, + "grad_norm": 0.08132533337284426, + "learning_rate": 0.0004719834014108903, + "loss": 1.4006, + "step": 4952 + }, + { + "epoch": 0.53, + "grad_norm": 0.07089111386192162, + "learning_rate": 0.00047180956932239186, + "loss": 1.3653, + "step": 4953 + }, + { + "epoch": 0.53, + "grad_norm": 0.07935963141630312, + "learning_rate": 0.00047163574065208034, + "loss": 1.4387, + "step": 4954 + }, + { + "epoch": 0.53, + "grad_norm": 0.07808586869134204, + "learning_rate": 0.000471461915421033, + "loss": 1.4203, + "step": 4955 + }, + { + "epoch": 0.53, + "grad_norm": 0.07325513089570637, + "learning_rate": 0.00047128809365032707, + "loss": 1.4985, + "step": 4956 + }, + { + "epoch": 0.53, + "grad_norm": 0.08026244304016605, + "learning_rate": 0.0004711142753610385, + "loss": 1.3556, + "step": 4957 + }, + { + "epoch": 0.53, + "grad_norm": 0.07566667413783172, + "learning_rate": 0.0004709404605742437, + "loss": 1.3653, + "step": 4958 + }, + { + "epoch": 0.53, + "grad_norm": 0.07429766224344789, + "learning_rate": 0.0004707666493110182, + "loss": 1.509, + "step": 4959 + }, + { + "epoch": 0.53, + "grad_norm": 0.0794655393764005, + "learning_rate": 0.00047059284159243727, + "loss": 1.4117, + "step": 4960 + }, + { + "epoch": 0.53, + "grad_norm": 0.07529539563266788, + "learning_rate": 0.0004704190374395757, + "loss": 1.3844, + "step": 4961 + }, + { + "epoch": 0.53, + "grad_norm": 0.07016820804619064, + "learning_rate": 0.00047024523687350773, + "loss": 1.4303, + "step": 4962 + }, + { + "epoch": 0.53, + "grad_norm": 0.07816173017864488, + "learning_rate": 0.0004700714399153075, + "loss": 1.3392, + "step": 4963 + }, + { + "epoch": 0.53, + "grad_norm": 0.06755985966842121, + "learning_rate": 0.0004698976465860483, + "loss": 1.4734, + "step": 4964 + }, + { + "epoch": 0.53, + "grad_norm": 0.07271241598715444, + "learning_rate": 0.0004697238569068033, + "loss": 1.3556, + "step": 4965 + }, + { + "epoch": 0.53, + "grad_norm": 0.06993775956506902, + "learning_rate": 0.0004695500708986451, + "loss": 1.5511, + "step": 4966 + }, + { + "epoch": 0.53, + "grad_norm": 0.07252382373357674, + "learning_rate": 0.00046937628858264555, + "loss": 1.4274, + "step": 4967 + }, + { + "epoch": 0.53, + "grad_norm": 0.08483848022811791, + "learning_rate": 0.0004692025099798767, + "loss": 1.4444, + "step": 4968 + }, + { + "epoch": 0.53, + "grad_norm": 0.0715220291276646, + "learning_rate": 0.0004690287351114097, + "loss": 1.3493, + "step": 4969 + }, + { + "epoch": 0.53, + "grad_norm": 0.07550992035600171, + "learning_rate": 0.00046885496399831536, + "loss": 1.3109, + "step": 4970 + }, + { + "epoch": 0.53, + "grad_norm": 0.06715757190667862, + "learning_rate": 0.0004686811966616639, + "loss": 1.4589, + "step": 4971 + }, + { + "epoch": 0.53, + "grad_norm": 0.08206633402247311, + "learning_rate": 0.00046850743312252537, + "loss": 1.332, + "step": 4972 + }, + { + "epoch": 0.53, + "grad_norm": 0.0812947283676224, + "learning_rate": 0.00046833367340196915, + "loss": 1.3324, + "step": 4973 + }, + { + "epoch": 0.53, + "grad_norm": 0.08377563101034731, + "learning_rate": 0.0004681599175210641, + "loss": 1.4677, + "step": 4974 + }, + { + "epoch": 0.53, + "grad_norm": 0.07932030491381303, + "learning_rate": 0.000467986165500879, + "loss": 1.3165, + "step": 4975 + }, + { + "epoch": 0.53, + "grad_norm": 0.07764813651091716, + "learning_rate": 0.0004678124173624816, + "loss": 1.4508, + "step": 4976 + }, + { + "epoch": 0.54, + "grad_norm": 0.06502997929922044, + "learning_rate": 0.00046763867312693975, + "loss": 1.4183, + "step": 4977 + }, + { + "epoch": 0.54, + "grad_norm": 0.08892043190200546, + "learning_rate": 0.0004674649328153202, + "loss": 1.3739, + "step": 4978 + }, + { + "epoch": 0.54, + "grad_norm": 0.07284899072991152, + "learning_rate": 0.0004672911964486896, + "loss": 1.2622, + "step": 4979 + }, + { + "epoch": 0.54, + "grad_norm": 0.07680367290052509, + "learning_rate": 0.00046711746404811435, + "loss": 1.5053, + "step": 4980 + }, + { + "epoch": 0.54, + "grad_norm": 0.07623392032659518, + "learning_rate": 0.0004669437356346599, + "loss": 1.3632, + "step": 4981 + }, + { + "epoch": 0.54, + "grad_norm": 0.07287035493877488, + "learning_rate": 0.0004667700112293913, + "loss": 1.4439, + "step": 4982 + }, + { + "epoch": 0.54, + "grad_norm": 0.07694402909619838, + "learning_rate": 0.0004665962908533736, + "loss": 1.2747, + "step": 4983 + }, + { + "epoch": 0.54, + "grad_norm": 0.06519847555927114, + "learning_rate": 0.00046642257452767085, + "loss": 1.3187, + "step": 4984 + }, + { + "epoch": 0.54, + "grad_norm": 0.0753700653266978, + "learning_rate": 0.00046624886227334653, + "loss": 1.4432, + "step": 4985 + }, + { + "epoch": 0.54, + "grad_norm": 0.06877063017007487, + "learning_rate": 0.0004660751541114641, + "loss": 1.4588, + "step": 4986 + }, + { + "epoch": 0.54, + "grad_norm": 0.06914165470466452, + "learning_rate": 0.00046590145006308626, + "loss": 1.3748, + "step": 4987 + }, + { + "epoch": 0.54, + "grad_norm": 0.07154339906462596, + "learning_rate": 0.0004657277501492751, + "loss": 1.3536, + "step": 4988 + }, + { + "epoch": 0.54, + "grad_norm": 0.061994530518439867, + "learning_rate": 0.0004655540543910924, + "loss": 1.3233, + "step": 4989 + }, + { + "epoch": 0.54, + "grad_norm": 0.07749942945527018, + "learning_rate": 0.0004653803628095992, + "loss": 1.4189, + "step": 4990 + }, + { + "epoch": 0.54, + "grad_norm": 0.06924609370873705, + "learning_rate": 0.00046520667542585654, + "loss": 1.363, + "step": 4991 + }, + { + "epoch": 0.54, + "grad_norm": 0.0765738814212289, + "learning_rate": 0.0004650329922609244, + "loss": 1.3572, + "step": 4992 + }, + { + "epoch": 0.54, + "grad_norm": 0.07121048885445819, + "learning_rate": 0.0004648593133358624, + "loss": 1.4481, + "step": 4993 + }, + { + "epoch": 0.54, + "grad_norm": 0.06260534252699797, + "learning_rate": 0.0004646856386717299, + "loss": 1.4498, + "step": 4994 + }, + { + "epoch": 0.54, + "grad_norm": 0.06744698219437496, + "learning_rate": 0.0004645119682895855, + "loss": 1.4486, + "step": 4995 + }, + { + "epoch": 0.54, + "grad_norm": 0.07100221736713873, + "learning_rate": 0.0004643383022104871, + "loss": 1.4091, + "step": 4996 + }, + { + "epoch": 0.54, + "grad_norm": 0.06257697855926746, + "learning_rate": 0.00046416464045549266, + "loss": 1.3761, + "step": 4997 + }, + { + "epoch": 0.54, + "grad_norm": 0.0644591791026162, + "learning_rate": 0.0004639909830456592, + "loss": 1.3716, + "step": 4998 + }, + { + "epoch": 0.54, + "grad_norm": 0.06888699844748095, + "learning_rate": 0.0004638173300020431, + "loss": 1.4956, + "step": 4999 + }, + { + "epoch": 0.54, + "grad_norm": 0.06980960282827073, + "learning_rate": 0.0004636436813457005, + "loss": 1.3363, + "step": 5000 + }, + { + "epoch": 0.54, + "grad_norm": 0.06117394784086422, + "learning_rate": 0.0004634700370976867, + "loss": 1.5449, + "step": 5001 + }, + { + "epoch": 0.54, + "grad_norm": 0.07078852629655827, + "learning_rate": 0.00046329639727905696, + "loss": 1.3434, + "step": 5002 + }, + { + "epoch": 0.54, + "grad_norm": 0.06698154268968336, + "learning_rate": 0.00046312276191086567, + "loss": 1.4607, + "step": 5003 + }, + { + "epoch": 0.54, + "grad_norm": 0.06296613954642143, + "learning_rate": 0.0004629491310141665, + "loss": 1.391, + "step": 5004 + }, + { + "epoch": 0.54, + "grad_norm": 0.07509008854360427, + "learning_rate": 0.00046277550461001297, + "loss": 1.4132, + "step": 5005 + }, + { + "epoch": 0.54, + "grad_norm": 0.06856227972641615, + "learning_rate": 0.00046260188271945784, + "loss": 1.3535, + "step": 5006 + }, + { + "epoch": 0.54, + "grad_norm": 0.07113060912731874, + "learning_rate": 0.0004624282653635534, + "loss": 1.4244, + "step": 5007 + }, + { + "epoch": 0.54, + "grad_norm": 0.08050206389441429, + "learning_rate": 0.00046225465256335117, + "loss": 1.4556, + "step": 5008 + }, + { + "epoch": 0.54, + "grad_norm": 0.06897136804433396, + "learning_rate": 0.0004620810443399028, + "loss": 1.3154, + "step": 5009 + }, + { + "epoch": 0.54, + "grad_norm": 0.0757574536258104, + "learning_rate": 0.0004619074407142582, + "loss": 1.5634, + "step": 5010 + }, + { + "epoch": 0.54, + "grad_norm": 0.07923391536996084, + "learning_rate": 0.0004617338417074679, + "loss": 1.4741, + "step": 5011 + }, + { + "epoch": 0.54, + "grad_norm": 0.08502196581005937, + "learning_rate": 0.00046156024734058114, + "loss": 1.4186, + "step": 5012 + }, + { + "epoch": 0.54, + "grad_norm": 0.07460264161817834, + "learning_rate": 0.0004613866576346468, + "loss": 1.3466, + "step": 5013 + }, + { + "epoch": 0.54, + "grad_norm": 0.07965479727859294, + "learning_rate": 0.0004612130726107135, + "loss": 1.5325, + "step": 5014 + }, + { + "epoch": 0.54, + "grad_norm": 0.07123399184880777, + "learning_rate": 0.0004610394922898289, + "loss": 1.36, + "step": 5015 + }, + { + "epoch": 0.54, + "grad_norm": 0.07470424341472832, + "learning_rate": 0.00046086591669303997, + "loss": 1.2811, + "step": 5016 + }, + { + "epoch": 0.54, + "grad_norm": 0.06729237043022075, + "learning_rate": 0.0004606923458413937, + "loss": 1.5231, + "step": 5017 + }, + { + "epoch": 0.54, + "grad_norm": 0.07790469758464379, + "learning_rate": 0.000460518779755936, + "loss": 1.317, + "step": 5018 + }, + { + "epoch": 0.54, + "grad_norm": 0.0712544564620146, + "learning_rate": 0.00046034521845771256, + "loss": 1.4068, + "step": 5019 + }, + { + "epoch": 0.54, + "grad_norm": 0.08929737083626135, + "learning_rate": 0.00046017166196776787, + "loss": 1.3364, + "step": 5020 + }, + { + "epoch": 0.54, + "grad_norm": 0.06544812497801969, + "learning_rate": 0.00045999811030714643, + "loss": 1.4243, + "step": 5021 + }, + { + "epoch": 0.54, + "grad_norm": 0.060540401781816446, + "learning_rate": 0.0004598245634968921, + "loss": 1.3251, + "step": 5022 + }, + { + "epoch": 0.54, + "grad_norm": 0.0753615240710179, + "learning_rate": 0.000459651021558048, + "loss": 1.3198, + "step": 5023 + }, + { + "epoch": 0.54, + "grad_norm": 0.08191977033070631, + "learning_rate": 0.0004594774845116565, + "loss": 1.3843, + "step": 5024 + }, + { + "epoch": 0.54, + "grad_norm": 0.07716411871829355, + "learning_rate": 0.00045930395237875983, + "loss": 1.2769, + "step": 5025 + }, + { + "epoch": 0.54, + "grad_norm": 0.07458412119990497, + "learning_rate": 0.0004591304251803992, + "loss": 1.6653, + "step": 5026 + }, + { + "epoch": 0.54, + "grad_norm": 0.06024873879839457, + "learning_rate": 0.0004589569029376153, + "loss": 1.2628, + "step": 5027 + }, + { + "epoch": 0.54, + "grad_norm": 0.07220559994143401, + "learning_rate": 0.00045878338567144854, + "loss": 1.4473, + "step": 5028 + }, + { + "epoch": 0.54, + "grad_norm": 0.08935952893918574, + "learning_rate": 0.0004586098734029384, + "loss": 1.4616, + "step": 5029 + }, + { + "epoch": 0.54, + "grad_norm": 0.07031227068731213, + "learning_rate": 0.0004584363661531239, + "loss": 1.3846, + "step": 5030 + }, + { + "epoch": 0.54, + "grad_norm": 0.07215927891814075, + "learning_rate": 0.00045826286394304316, + "loss": 1.4024, + "step": 5031 + }, + { + "epoch": 0.54, + "grad_norm": 0.06502965527596266, + "learning_rate": 0.00045808936679373396, + "loss": 1.4927, + "step": 5032 + }, + { + "epoch": 0.54, + "grad_norm": 0.07239411343302822, + "learning_rate": 0.00045791587472623365, + "loss": 1.3573, + "step": 5033 + }, + { + "epoch": 0.54, + "grad_norm": 0.06741578912286643, + "learning_rate": 0.0004577423877615786, + "loss": 1.4525, + "step": 5034 + }, + { + "epoch": 0.54, + "grad_norm": 0.08016804263999956, + "learning_rate": 0.00045756890592080473, + "loss": 1.6092, + "step": 5035 + }, + { + "epoch": 0.54, + "grad_norm": 0.08362373973885047, + "learning_rate": 0.0004573954292249471, + "loss": 1.3283, + "step": 5036 + }, + { + "epoch": 0.54, + "grad_norm": 0.07866004625026597, + "learning_rate": 0.00045722195769504084, + "loss": 1.4516, + "step": 5037 + }, + { + "epoch": 0.54, + "grad_norm": 0.08537438521785196, + "learning_rate": 0.0004570484913521196, + "loss": 1.4975, + "step": 5038 + }, + { + "epoch": 0.54, + "grad_norm": 0.07595030335723768, + "learning_rate": 0.00045687503021721673, + "loss": 1.4266, + "step": 5039 + }, + { + "epoch": 0.54, + "grad_norm": 0.07736407471090755, + "learning_rate": 0.00045670157431136545, + "loss": 1.4982, + "step": 5040 + }, + { + "epoch": 0.54, + "grad_norm": 0.0772172433915742, + "learning_rate": 0.00045652812365559725, + "loss": 1.4539, + "step": 5041 + }, + { + "epoch": 0.54, + "grad_norm": 0.06913636508981376, + "learning_rate": 0.00045635467827094404, + "loss": 1.3728, + "step": 5042 + }, + { + "epoch": 0.54, + "grad_norm": 0.07308174202558122, + "learning_rate": 0.00045618123817843656, + "loss": 1.2952, + "step": 5043 + }, + { + "epoch": 0.54, + "grad_norm": 0.07119469079079288, + "learning_rate": 0.00045600780339910487, + "loss": 1.4836, + "step": 5044 + }, + { + "epoch": 0.54, + "grad_norm": 0.07229097508085704, + "learning_rate": 0.0004558343739539788, + "loss": 1.3501, + "step": 5045 + }, + { + "epoch": 0.54, + "grad_norm": 0.07533651801310617, + "learning_rate": 0.00045566094986408716, + "loss": 1.506, + "step": 5046 + }, + { + "epoch": 0.54, + "grad_norm": 0.07069151684931094, + "learning_rate": 0.0004554875311504581, + "loss": 1.421, + "step": 5047 + }, + { + "epoch": 0.54, + "grad_norm": 0.07104602711266829, + "learning_rate": 0.0004553141178341195, + "loss": 1.302, + "step": 5048 + }, + { + "epoch": 0.54, + "grad_norm": 0.0678684179203935, + "learning_rate": 0.00045514070993609806, + "loss": 1.2245, + "step": 5049 + }, + { + "epoch": 0.54, + "grad_norm": 0.06362739487382921, + "learning_rate": 0.00045496730747742023, + "loss": 1.4206, + "step": 5050 + }, + { + "epoch": 0.54, + "grad_norm": 0.07856895251337907, + "learning_rate": 0.00045479391047911186, + "loss": 1.4577, + "step": 5051 + }, + { + "epoch": 0.54, + "grad_norm": 0.07390180709879325, + "learning_rate": 0.00045462051896219736, + "loss": 1.2967, + "step": 5052 + }, + { + "epoch": 0.54, + "grad_norm": 0.06240181363019398, + "learning_rate": 0.0004544471329477015, + "loss": 1.3616, + "step": 5053 + }, + { + "epoch": 0.54, + "grad_norm": 0.0769563634912301, + "learning_rate": 0.0004542737524566478, + "loss": 1.3491, + "step": 5054 + }, + { + "epoch": 0.54, + "grad_norm": 0.0772523406670056, + "learning_rate": 0.00045410037751005916, + "loss": 1.5153, + "step": 5055 + }, + { + "epoch": 0.54, + "grad_norm": 0.08634109712464239, + "learning_rate": 0.0004539270081289581, + "loss": 1.4924, + "step": 5056 + }, + { + "epoch": 0.54, + "grad_norm": 0.07431762471246574, + "learning_rate": 0.0004537536443343662, + "loss": 1.3931, + "step": 5057 + }, + { + "epoch": 0.54, + "grad_norm": 0.0733649512462882, + "learning_rate": 0.0004535802861473042, + "loss": 1.3972, + "step": 5058 + }, + { + "epoch": 0.54, + "grad_norm": 0.07901391203531107, + "learning_rate": 0.0004534069335887926, + "loss": 1.4817, + "step": 5059 + }, + { + "epoch": 0.54, + "grad_norm": 0.07381995246703327, + "learning_rate": 0.000453233586679851, + "loss": 1.3819, + "step": 5060 + }, + { + "epoch": 0.54, + "grad_norm": 0.07247418114870308, + "learning_rate": 0.0004530602454414982, + "loss": 1.3994, + "step": 5061 + }, + { + "epoch": 0.54, + "grad_norm": 0.08511342964700407, + "learning_rate": 0.00045288690989475264, + "loss": 1.5179, + "step": 5062 + }, + { + "epoch": 0.54, + "grad_norm": 0.07305640500166688, + "learning_rate": 0.0004527135800606314, + "loss": 1.352, + "step": 5063 + }, + { + "epoch": 0.54, + "grad_norm": 0.07150911228214982, + "learning_rate": 0.0004525402559601517, + "loss": 1.4765, + "step": 5064 + }, + { + "epoch": 0.54, + "grad_norm": 0.06880615891115242, + "learning_rate": 0.0004523669376143296, + "loss": 1.383, + "step": 5065 + }, + { + "epoch": 0.54, + "grad_norm": 0.07549278756591758, + "learning_rate": 0.00045219362504418057, + "loss": 1.4114, + "step": 5066 + }, + { + "epoch": 0.54, + "grad_norm": 0.08469046092663751, + "learning_rate": 0.00045202031827071916, + "loss": 1.458, + "step": 5067 + }, + { + "epoch": 0.54, + "grad_norm": 0.06922994658984552, + "learning_rate": 0.0004518470173149597, + "loss": 1.4044, + "step": 5068 + }, + { + "epoch": 0.54, + "grad_norm": 0.07163348102127615, + "learning_rate": 0.00045167372219791544, + "loss": 1.3636, + "step": 5069 + }, + { + "epoch": 0.55, + "grad_norm": 0.07201028025488586, + "learning_rate": 0.00045150043294059876, + "loss": 1.4034, + "step": 5070 + }, + { + "epoch": 0.55, + "grad_norm": 0.07383927628148199, + "learning_rate": 0.000451327149564022, + "loss": 1.4078, + "step": 5071 + }, + { + "epoch": 0.55, + "grad_norm": 0.07201378668042176, + "learning_rate": 0.00045115387208919625, + "loss": 1.4128, + "step": 5072 + }, + { + "epoch": 0.55, + "grad_norm": 0.07129619592093052, + "learning_rate": 0.0004509806005371317, + "loss": 1.458, + "step": 5073 + }, + { + "epoch": 0.55, + "grad_norm": 0.07247629019072475, + "learning_rate": 0.0004508073349288384, + "loss": 1.3639, + "step": 5074 + }, + { + "epoch": 0.55, + "grad_norm": 0.1810663332686453, + "learning_rate": 0.0004506340752853252, + "loss": 1.4445, + "step": 5075 + }, + { + "epoch": 0.55, + "grad_norm": 0.07918478567312066, + "learning_rate": 0.0004504608216276007, + "loss": 1.3372, + "step": 5076 + }, + { + "epoch": 0.55, + "grad_norm": 0.07803524195785769, + "learning_rate": 0.0004502875739766724, + "loss": 1.4658, + "step": 5077 + }, + { + "epoch": 0.55, + "grad_norm": 0.07553645875301666, + "learning_rate": 0.00045011433235354697, + "loss": 1.372, + "step": 5078 + }, + { + "epoch": 0.55, + "grad_norm": 0.06978572187693603, + "learning_rate": 0.0004499410967792308, + "loss": 1.4196, + "step": 5079 + }, + { + "epoch": 0.55, + "grad_norm": 0.06894296278745332, + "learning_rate": 0.00044976786727472936, + "loss": 1.3758, + "step": 5080 + }, + { + "epoch": 0.55, + "grad_norm": 0.07302702532691431, + "learning_rate": 0.000449594643861047, + "loss": 1.3609, + "step": 5081 + }, + { + "epoch": 0.55, + "grad_norm": 0.0809007009229191, + "learning_rate": 0.00044942142655918796, + "loss": 1.3801, + "step": 5082 + }, + { + "epoch": 0.55, + "grad_norm": 0.07892839586023646, + "learning_rate": 0.0004492482153901554, + "loss": 1.4028, + "step": 5083 + }, + { + "epoch": 0.55, + "grad_norm": 0.06926018562785509, + "learning_rate": 0.00044907501037495155, + "loss": 1.4091, + "step": 5084 + }, + { + "epoch": 0.55, + "grad_norm": 0.07193791827021002, + "learning_rate": 0.0004489018115345784, + "loss": 1.2789, + "step": 5085 + }, + { + "epoch": 0.55, + "grad_norm": 0.07040347068886654, + "learning_rate": 0.0004487286188900365, + "loss": 1.3098, + "step": 5086 + }, + { + "epoch": 0.55, + "grad_norm": 0.07605857094439195, + "learning_rate": 0.00044855543246232653, + "loss": 1.3199, + "step": 5087 + }, + { + "epoch": 0.55, + "grad_norm": 0.07217955296294648, + "learning_rate": 0.00044838225227244766, + "loss": 1.4811, + "step": 5088 + }, + { + "epoch": 0.55, + "grad_norm": 0.07890617853340841, + "learning_rate": 0.0004482090783413986, + "loss": 1.3277, + "step": 5089 + }, + { + "epoch": 0.55, + "grad_norm": 0.0720778390267614, + "learning_rate": 0.00044803591069017746, + "loss": 1.5122, + "step": 5090 + }, + { + "epoch": 0.55, + "grad_norm": 0.07516660854615728, + "learning_rate": 0.0004478627493397813, + "loss": 1.3591, + "step": 5091 + }, + { + "epoch": 0.55, + "grad_norm": 0.07776960659986273, + "learning_rate": 0.0004476895943112064, + "loss": 1.504, + "step": 5092 + }, + { + "epoch": 0.55, + "grad_norm": 0.06708427653195059, + "learning_rate": 0.0004475164456254488, + "loss": 1.3771, + "step": 5093 + }, + { + "epoch": 0.55, + "grad_norm": 0.0806630681787127, + "learning_rate": 0.0004473433033035028, + "loss": 1.4429, + "step": 5094 + }, + { + "epoch": 0.55, + "grad_norm": 0.06961576483572854, + "learning_rate": 0.00044717016736636295, + "loss": 1.3978, + "step": 5095 + }, + { + "epoch": 0.55, + "grad_norm": 0.07131127279311827, + "learning_rate": 0.0004469970378350224, + "loss": 1.3899, + "step": 5096 + }, + { + "epoch": 0.55, + "grad_norm": 0.06955600958029026, + "learning_rate": 0.00044682391473047366, + "loss": 1.4512, + "step": 5097 + }, + { + "epoch": 0.55, + "grad_norm": 0.06367222085226946, + "learning_rate": 0.0004466507980737087, + "loss": 1.3487, + "step": 5098 + }, + { + "epoch": 0.55, + "grad_norm": 0.08530236840783734, + "learning_rate": 0.0004464776878857184, + "loss": 1.3804, + "step": 5099 + }, + { + "epoch": 0.55, + "grad_norm": 0.07515745169686079, + "learning_rate": 0.000446304584187493, + "loss": 1.409, + "step": 5100 + }, + { + "epoch": 0.55, + "grad_norm": 0.059871998178720805, + "learning_rate": 0.0004461314870000217, + "loss": 1.3639, + "step": 5101 + }, + { + "epoch": 0.55, + "grad_norm": 0.06668678886268802, + "learning_rate": 0.0004459583963442935, + "loss": 1.4565, + "step": 5102 + }, + { + "epoch": 0.55, + "grad_norm": 0.06941757215145417, + "learning_rate": 0.000445785312241296, + "loss": 1.4375, + "step": 5103 + }, + { + "epoch": 0.55, + "grad_norm": 0.0787820736556593, + "learning_rate": 0.0004456122347120164, + "loss": 1.4982, + "step": 5104 + }, + { + "epoch": 0.55, + "grad_norm": 0.06498468989445924, + "learning_rate": 0.0004454391637774408, + "loss": 1.4231, + "step": 5105 + }, + { + "epoch": 0.55, + "grad_norm": 0.07333559120436196, + "learning_rate": 0.0004452660994585545, + "loss": 1.4282, + "step": 5106 + }, + { + "epoch": 0.55, + "grad_norm": 0.06590759319312026, + "learning_rate": 0.00044509304177634245, + "loss": 1.4366, + "step": 5107 + }, + { + "epoch": 0.55, + "grad_norm": 0.07684068243714061, + "learning_rate": 0.00044491999075178844, + "loss": 1.4612, + "step": 5108 + }, + { + "epoch": 0.55, + "grad_norm": 0.06923333908354005, + "learning_rate": 0.0004447469464058753, + "loss": 1.2806, + "step": 5109 + }, + { + "epoch": 0.55, + "grad_norm": 0.07182215488774789, + "learning_rate": 0.00044457390875958546, + "loss": 1.6128, + "step": 5110 + }, + { + "epoch": 0.55, + "grad_norm": 0.06531612126963693, + "learning_rate": 0.0004444008778339003, + "loss": 1.5235, + "step": 5111 + }, + { + "epoch": 0.55, + "grad_norm": 0.06562690937863525, + "learning_rate": 0.0004442278536498003, + "loss": 1.4009, + "step": 5112 + }, + { + "epoch": 0.55, + "grad_norm": 0.06630177290727778, + "learning_rate": 0.00044405483622826544, + "loss": 1.3705, + "step": 5113 + }, + { + "epoch": 0.55, + "grad_norm": 0.07884292129674725, + "learning_rate": 0.0004438818255902746, + "loss": 1.4984, + "step": 5114 + }, + { + "epoch": 0.55, + "grad_norm": 0.07161075616132401, + "learning_rate": 0.00044370882175680585, + "loss": 1.4684, + "step": 5115 + }, + { + "epoch": 0.55, + "grad_norm": 0.07517122372057895, + "learning_rate": 0.00044353582474883645, + "loss": 1.3819, + "step": 5116 + }, + { + "epoch": 0.55, + "grad_norm": 0.07602287354464461, + "learning_rate": 0.0004433628345873429, + "loss": 1.3158, + "step": 5117 + }, + { + "epoch": 0.55, + "grad_norm": 0.0617182529028056, + "learning_rate": 0.000443189851293301, + "loss": 1.4243, + "step": 5118 + }, + { + "epoch": 0.55, + "grad_norm": 0.07441545210439454, + "learning_rate": 0.0004430168748876855, + "loss": 1.3834, + "step": 5119 + }, + { + "epoch": 0.55, + "grad_norm": 0.06804643338631677, + "learning_rate": 0.00044284390539147024, + "loss": 1.4772, + "step": 5120 + }, + { + "epoch": 0.55, + "grad_norm": 0.06762244884350038, + "learning_rate": 0.00044267094282562865, + "loss": 1.3884, + "step": 5121 + }, + { + "epoch": 0.55, + "grad_norm": 0.07151377791730641, + "learning_rate": 0.00044249798721113286, + "loss": 1.3676, + "step": 5122 + }, + { + "epoch": 0.55, + "grad_norm": 0.06859534272623603, + "learning_rate": 0.0004423250385689542, + "loss": 1.4373, + "step": 5123 + }, + { + "epoch": 0.55, + "grad_norm": 0.0777754907921359, + "learning_rate": 0.0004421520969200636, + "loss": 1.41, + "step": 5124 + }, + { + "epoch": 0.55, + "grad_norm": 0.06955694265720186, + "learning_rate": 0.0004419791622854308, + "loss": 1.5175, + "step": 5125 + }, + { + "epoch": 0.55, + "grad_norm": 0.06921231387512972, + "learning_rate": 0.00044180623468602457, + "loss": 1.3898, + "step": 5126 + }, + { + "epoch": 0.55, + "grad_norm": 0.0718189778864757, + "learning_rate": 0.00044163331414281287, + "loss": 1.3719, + "step": 5127 + }, + { + "epoch": 0.55, + "grad_norm": 0.07074598625511523, + "learning_rate": 0.000441460400676763, + "loss": 1.5005, + "step": 5128 + }, + { + "epoch": 0.55, + "grad_norm": 0.06483544761867031, + "learning_rate": 0.00044128749430884153, + "loss": 1.5393, + "step": 5129 + }, + { + "epoch": 0.55, + "grad_norm": 0.06688587191843826, + "learning_rate": 0.00044111459506001373, + "loss": 1.3699, + "step": 5130 + }, + { + "epoch": 0.55, + "grad_norm": 0.07058261521922415, + "learning_rate": 0.00044094170295124423, + "loss": 1.3599, + "step": 5131 + }, + { + "epoch": 0.55, + "grad_norm": 0.08377792244328604, + "learning_rate": 0.0004407688180034968, + "loss": 1.5726, + "step": 5132 + }, + { + "epoch": 0.55, + "grad_norm": 0.08170757981847657, + "learning_rate": 0.0004405959402377345, + "loss": 1.4028, + "step": 5133 + }, + { + "epoch": 0.55, + "grad_norm": 0.07094933529123704, + "learning_rate": 0.00044042306967491927, + "loss": 1.4782, + "step": 5134 + }, + { + "epoch": 0.55, + "grad_norm": 0.07833739407165695, + "learning_rate": 0.0004402502063360121, + "loss": 1.439, + "step": 5135 + }, + { + "epoch": 0.55, + "grad_norm": 0.07612811567792596, + "learning_rate": 0.00044007735024197375, + "loss": 1.4726, + "step": 5136 + }, + { + "epoch": 0.55, + "grad_norm": 0.07233376447979718, + "learning_rate": 0.0004399045014137629, + "loss": 1.484, + "step": 5137 + }, + { + "epoch": 0.55, + "grad_norm": 0.07426330129609558, + "learning_rate": 0.00043973165987233853, + "loss": 1.4103, + "step": 5138 + }, + { + "epoch": 0.55, + "grad_norm": 0.0703597646707655, + "learning_rate": 0.00043955882563865824, + "loss": 1.406, + "step": 5139 + }, + { + "epoch": 0.55, + "grad_norm": 0.0747228033894531, + "learning_rate": 0.0004393859987336786, + "loss": 1.3999, + "step": 5140 + }, + { + "epoch": 0.55, + "grad_norm": 0.06645446667750779, + "learning_rate": 0.0004392131791783556, + "loss": 1.4284, + "step": 5141 + }, + { + "epoch": 0.55, + "grad_norm": 0.06742755209718568, + "learning_rate": 0.0004390403669936443, + "loss": 1.5311, + "step": 5142 + }, + { + "epoch": 0.55, + "grad_norm": 0.06452897694766252, + "learning_rate": 0.0004388675622004985, + "loss": 1.5793, + "step": 5143 + }, + { + "epoch": 0.55, + "grad_norm": 0.07123286746462744, + "learning_rate": 0.00043869476481987166, + "loss": 1.3647, + "step": 5144 + }, + { + "epoch": 0.55, + "grad_norm": 0.080404488096446, + "learning_rate": 0.00043852197487271596, + "loss": 1.3003, + "step": 5145 + }, + { + "epoch": 0.55, + "grad_norm": 0.06632696267396859, + "learning_rate": 0.00043834919237998275, + "loss": 1.3213, + "step": 5146 + }, + { + "epoch": 0.55, + "grad_norm": 0.072257797841883, + "learning_rate": 0.0004381764173626225, + "loss": 1.4167, + "step": 5147 + }, + { + "epoch": 0.55, + "grad_norm": 0.07509748873673347, + "learning_rate": 0.0004380036498415847, + "loss": 1.3869, + "step": 5148 + }, + { + "epoch": 0.55, + "grad_norm": 0.0757740088264892, + "learning_rate": 0.0004378308898378181, + "loss": 1.5651, + "step": 5149 + }, + { + "epoch": 0.55, + "grad_norm": 0.08298039825124719, + "learning_rate": 0.0004376581373722705, + "loss": 1.5168, + "step": 5150 + }, + { + "epoch": 0.55, + "grad_norm": 0.06581738179640959, + "learning_rate": 0.0004374853924658886, + "loss": 1.4759, + "step": 5151 + }, + { + "epoch": 0.55, + "grad_norm": 0.07360615892941527, + "learning_rate": 0.00043731265513961837, + "loss": 1.4205, + "step": 5152 + }, + { + "epoch": 0.55, + "grad_norm": 0.0761582288989777, + "learning_rate": 0.00043713992541440495, + "loss": 1.4479, + "step": 5153 + }, + { + "epoch": 0.55, + "grad_norm": 0.07168122590048633, + "learning_rate": 0.0004369672033111921, + "loss": 1.2982, + "step": 5154 + }, + { + "epoch": 0.55, + "grad_norm": 0.06899343353227715, + "learning_rate": 0.0004367944888509233, + "loss": 1.4615, + "step": 5155 + }, + { + "epoch": 0.55, + "grad_norm": 0.0851060919458039, + "learning_rate": 0.00043662178205454064, + "loss": 1.416, + "step": 5156 + }, + { + "epoch": 0.55, + "grad_norm": 0.07129039991770322, + "learning_rate": 0.0004364490829429855, + "loss": 1.4104, + "step": 5157 + }, + { + "epoch": 0.55, + "grad_norm": 0.07358697165011492, + "learning_rate": 0.000436276391537198, + "loss": 1.4107, + "step": 5158 + }, + { + "epoch": 0.55, + "grad_norm": 0.06873354857206569, + "learning_rate": 0.0004361037078581176, + "loss": 1.4866, + "step": 5159 + }, + { + "epoch": 0.55, + "grad_norm": 0.06921469555044571, + "learning_rate": 0.00043593103192668306, + "loss": 1.2982, + "step": 5160 + }, + { + "epoch": 0.55, + "grad_norm": 0.08636862001391342, + "learning_rate": 0.00043575836376383173, + "loss": 1.3917, + "step": 5161 + }, + { + "epoch": 0.55, + "grad_norm": 0.0737680093296688, + "learning_rate": 0.0004355857033905003, + "loss": 1.3104, + "step": 5162 + }, + { + "epoch": 0.56, + "grad_norm": 0.07065380230712737, + "learning_rate": 0.0004354130508276243, + "loss": 1.412, + "step": 5163 + }, + { + "epoch": 0.56, + "grad_norm": 0.0777158993181712, + "learning_rate": 0.0004352404060961387, + "loss": 1.368, + "step": 5164 + }, + { + "epoch": 0.56, + "grad_norm": 0.07163302597716577, + "learning_rate": 0.00043506776921697703, + "loss": 1.3967, + "step": 5165 + }, + { + "epoch": 0.56, + "grad_norm": 0.07395361060929702, + "learning_rate": 0.0004348951402110721, + "loss": 1.4889, + "step": 5166 + }, + { + "epoch": 0.56, + "grad_norm": 0.07668834064910103, + "learning_rate": 0.0004347225190993563, + "loss": 1.3945, + "step": 5167 + }, + { + "epoch": 0.56, + "grad_norm": 0.07379835994602744, + "learning_rate": 0.00043454990590275966, + "loss": 1.2941, + "step": 5168 + }, + { + "epoch": 0.56, + "grad_norm": 0.07353437475217274, + "learning_rate": 0.00043437730064221274, + "loss": 1.4592, + "step": 5169 + }, + { + "epoch": 0.56, + "grad_norm": 0.07180642897850834, + "learning_rate": 0.00043420470333864437, + "loss": 1.5081, + "step": 5170 + }, + { + "epoch": 0.56, + "grad_norm": 0.07480628068562017, + "learning_rate": 0.0004340321140129824, + "loss": 1.4688, + "step": 5171 + }, + { + "epoch": 0.56, + "grad_norm": 0.08326786901526848, + "learning_rate": 0.0004338595326861542, + "loss": 1.3402, + "step": 5172 + }, + { + "epoch": 0.56, + "grad_norm": 0.0776630677788923, + "learning_rate": 0.00043368695937908564, + "loss": 1.396, + "step": 5173 + }, + { + "epoch": 0.56, + "grad_norm": 0.0858497428936398, + "learning_rate": 0.00043351439411270175, + "loss": 1.6149, + "step": 5174 + }, + { + "epoch": 0.56, + "grad_norm": 0.07842818825426176, + "learning_rate": 0.00043334183690792687, + "loss": 1.3343, + "step": 5175 + }, + { + "epoch": 0.56, + "grad_norm": 0.07189766698764516, + "learning_rate": 0.00043316928778568413, + "loss": 1.4638, + "step": 5176 + }, + { + "epoch": 0.56, + "grad_norm": 0.08069092707873271, + "learning_rate": 0.0004329967467668955, + "loss": 1.6077, + "step": 5177 + }, + { + "epoch": 0.56, + "grad_norm": 0.06730626948761761, + "learning_rate": 0.00043282421387248266, + "loss": 1.4001, + "step": 5178 + }, + { + "epoch": 0.56, + "grad_norm": 0.09400270596405078, + "learning_rate": 0.0004326516891233652, + "loss": 1.4182, + "step": 5179 + }, + { + "epoch": 0.56, + "grad_norm": 0.07152190135429132, + "learning_rate": 0.00043247917254046265, + "loss": 1.4702, + "step": 5180 + }, + { + "epoch": 0.56, + "grad_norm": 0.07839367829975089, + "learning_rate": 0.0004323066641446932, + "loss": 1.3874, + "step": 5181 + }, + { + "epoch": 0.56, + "grad_norm": 0.08025327287685355, + "learning_rate": 0.00043213416395697406, + "loss": 1.4811, + "step": 5182 + }, + { + "epoch": 0.56, + "grad_norm": 0.0675459205855955, + "learning_rate": 0.0004319616719982216, + "loss": 1.5319, + "step": 5183 + }, + { + "epoch": 0.56, + "grad_norm": 0.08674747999573565, + "learning_rate": 0.00043178918828935093, + "loss": 1.3552, + "step": 5184 + }, + { + "epoch": 0.56, + "grad_norm": 0.06783046466875754, + "learning_rate": 0.0004316167128512763, + "loss": 1.4993, + "step": 5185 + }, + { + "epoch": 0.56, + "grad_norm": 0.0766712737981568, + "learning_rate": 0.000431444245704911, + "loss": 1.3712, + "step": 5186 + }, + { + "epoch": 0.56, + "grad_norm": 0.07620524476335197, + "learning_rate": 0.00043127178687116734, + "loss": 1.2, + "step": 5187 + }, + { + "epoch": 0.56, + "grad_norm": 0.0698091025429192, + "learning_rate": 0.0004310993363709563, + "loss": 1.451, + "step": 5188 + }, + { + "epoch": 0.56, + "grad_norm": 0.07871163624673415, + "learning_rate": 0.0004309268942251887, + "loss": 1.4297, + "step": 5189 + }, + { + "epoch": 0.56, + "grad_norm": 0.07820467834592452, + "learning_rate": 0.0004307544604547728, + "loss": 1.3821, + "step": 5190 + }, + { + "epoch": 0.56, + "grad_norm": 0.07426071313960025, + "learning_rate": 0.00043058203508061755, + "loss": 1.5387, + "step": 5191 + }, + { + "epoch": 0.56, + "grad_norm": 0.07152198924937515, + "learning_rate": 0.00043040961812362984, + "loss": 1.4379, + "step": 5192 + }, + { + "epoch": 0.56, + "grad_norm": 0.07643513013877934, + "learning_rate": 0.00043023720960471567, + "loss": 1.3828, + "step": 5193 + }, + { + "epoch": 0.56, + "grad_norm": 0.07377944276619808, + "learning_rate": 0.0004300648095447805, + "loss": 1.3577, + "step": 5194 + }, + { + "epoch": 0.56, + "grad_norm": 0.07677807667832921, + "learning_rate": 0.0004298924179647283, + "loss": 1.4854, + "step": 5195 + }, + { + "epoch": 0.56, + "grad_norm": 0.07254987146980628, + "learning_rate": 0.00042972003488546206, + "loss": 1.3919, + "step": 5196 + }, + { + "epoch": 0.56, + "grad_norm": 0.07606417273762252, + "learning_rate": 0.00042954766032788386, + "loss": 1.4481, + "step": 5197 + }, + { + "epoch": 0.56, + "grad_norm": 0.06574538650509663, + "learning_rate": 0.00042937529431289476, + "loss": 1.4083, + "step": 5198 + }, + { + "epoch": 0.56, + "grad_norm": 0.08056330530383395, + "learning_rate": 0.00042920293686139483, + "loss": 1.3452, + "step": 5199 + }, + { + "epoch": 0.56, + "grad_norm": 0.07254927775620691, + "learning_rate": 0.00042903058799428277, + "loss": 1.354, + "step": 5200 + }, + { + "epoch": 0.56, + "grad_norm": 0.08042786998418212, + "learning_rate": 0.0004288582477324566, + "loss": 1.4644, + "step": 5201 + }, + { + "epoch": 0.56, + "grad_norm": 0.07551326119414072, + "learning_rate": 0.0004286859160968131, + "loss": 1.4498, + "step": 5202 + }, + { + "epoch": 0.56, + "grad_norm": 0.0716710195185741, + "learning_rate": 0.0004285135931082481, + "loss": 1.3936, + "step": 5203 + }, + { + "epoch": 0.56, + "grad_norm": 0.07607372161962338, + "learning_rate": 0.0004283412787876565, + "loss": 1.4397, + "step": 5204 + }, + { + "epoch": 0.56, + "grad_norm": 0.0767839216278687, + "learning_rate": 0.0004281689731559318, + "loss": 1.5554, + "step": 5205 + }, + { + "epoch": 0.56, + "grad_norm": 0.08170307596491168, + "learning_rate": 0.00042799667623396676, + "loss": 1.4555, + "step": 5206 + }, + { + "epoch": 0.56, + "grad_norm": 0.07293635890780277, + "learning_rate": 0.000427824388042653, + "loss": 1.5309, + "step": 5207 + }, + { + "epoch": 0.56, + "grad_norm": 0.06361479379989111, + "learning_rate": 0.00042765210860288097, + "loss": 1.3286, + "step": 5208 + }, + { + "epoch": 0.56, + "grad_norm": 0.06701607782127969, + "learning_rate": 0.0004274798379355402, + "loss": 1.3905, + "step": 5209 + }, + { + "epoch": 0.56, + "grad_norm": 0.08632887914379973, + "learning_rate": 0.00042730757606151927, + "loss": 1.4785, + "step": 5210 + }, + { + "epoch": 0.56, + "grad_norm": 0.06647493250126521, + "learning_rate": 0.0004271353230017052, + "loss": 1.2555, + "step": 5211 + }, + { + "epoch": 0.56, + "grad_norm": 0.06693961197696197, + "learning_rate": 0.0004269630787769845, + "loss": 1.4008, + "step": 5212 + }, + { + "epoch": 0.56, + "grad_norm": 0.0692409003412069, + "learning_rate": 0.0004267908434082421, + "loss": 1.4041, + "step": 5213 + }, + { + "epoch": 0.56, + "grad_norm": 0.06737205948239505, + "learning_rate": 0.0004266186169163624, + "loss": 1.3695, + "step": 5214 + }, + { + "epoch": 0.56, + "grad_norm": 0.06829870460828749, + "learning_rate": 0.0004264463993222285, + "loss": 1.3776, + "step": 5215 + }, + { + "epoch": 0.56, + "grad_norm": 0.06568111411168952, + "learning_rate": 0.0004262741906467221, + "loss": 1.5051, + "step": 5216 + }, + { + "epoch": 0.56, + "grad_norm": 0.07099711966396452, + "learning_rate": 0.0004261019909107243, + "loss": 1.4215, + "step": 5217 + }, + { + "epoch": 0.56, + "grad_norm": 0.06926480274268498, + "learning_rate": 0.000425929800135115, + "loss": 1.4874, + "step": 5218 + }, + { + "epoch": 0.56, + "grad_norm": 0.07346867907033706, + "learning_rate": 0.0004257576183407726, + "loss": 1.4229, + "step": 5219 + }, + { + "epoch": 0.56, + "grad_norm": 0.06602439589330132, + "learning_rate": 0.0004255854455485753, + "loss": 1.5083, + "step": 5220 + }, + { + "epoch": 0.56, + "grad_norm": 0.08114794312950216, + "learning_rate": 0.0004254132817793989, + "loss": 1.3613, + "step": 5221 + }, + { + "epoch": 0.56, + "grad_norm": 0.07671248645303533, + "learning_rate": 0.0004252411270541193, + "loss": 1.5036, + "step": 5222 + }, + { + "epoch": 0.56, + "grad_norm": 0.07326874251575916, + "learning_rate": 0.0004250689813936108, + "loss": 1.4248, + "step": 5223 + }, + { + "epoch": 0.56, + "grad_norm": 0.06407015535943295, + "learning_rate": 0.00042489684481874655, + "loss": 1.3891, + "step": 5224 + }, + { + "epoch": 0.56, + "grad_norm": 0.06590779720525618, + "learning_rate": 0.00042472471735039894, + "loss": 1.2699, + "step": 5225 + }, + { + "epoch": 0.56, + "grad_norm": 0.07001647041392903, + "learning_rate": 0.0004245525990094388, + "loss": 1.4988, + "step": 5226 + }, + { + "epoch": 0.56, + "grad_norm": 0.06950346761352938, + "learning_rate": 0.00042438048981673613, + "loss": 1.5613, + "step": 5227 + }, + { + "epoch": 0.56, + "grad_norm": 0.0681780359829946, + "learning_rate": 0.00042420838979315975, + "loss": 1.3695, + "step": 5228 + }, + { + "epoch": 0.56, + "grad_norm": 0.07072671929945357, + "learning_rate": 0.0004240362989595774, + "loss": 1.3276, + "step": 5229 + }, + { + "epoch": 0.56, + "grad_norm": 0.07027533413197294, + "learning_rate": 0.00042386421733685574, + "loss": 1.414, + "step": 5230 + }, + { + "epoch": 0.56, + "grad_norm": 0.07273910038980894, + "learning_rate": 0.00042369214494586025, + "loss": 1.3791, + "step": 5231 + }, + { + "epoch": 0.56, + "grad_norm": 0.07673259671575565, + "learning_rate": 0.00042352008180745527, + "loss": 1.4342, + "step": 5232 + }, + { + "epoch": 0.56, + "grad_norm": 0.08629006705556348, + "learning_rate": 0.0004233480279425039, + "loss": 1.4249, + "step": 5233 + }, + { + "epoch": 0.56, + "grad_norm": 0.07021380822979276, + "learning_rate": 0.00042317598337186845, + "loss": 1.3425, + "step": 5234 + }, + { + "epoch": 0.56, + "grad_norm": 0.0696246874174527, + "learning_rate": 0.0004230039481164099, + "loss": 1.396, + "step": 5235 + }, + { + "epoch": 0.56, + "grad_norm": 0.0771553120990635, + "learning_rate": 0.00042283192219698797, + "loss": 1.3683, + "step": 5236 + }, + { + "epoch": 0.56, + "grad_norm": 0.06880356851494933, + "learning_rate": 0.00042265990563446166, + "loss": 1.3027, + "step": 5237 + }, + { + "epoch": 0.56, + "grad_norm": 0.07126039016589009, + "learning_rate": 0.0004224878984496884, + "loss": 1.4568, + "step": 5238 + }, + { + "epoch": 0.56, + "grad_norm": 0.07841400172673146, + "learning_rate": 0.00042231590066352454, + "loss": 1.353, + "step": 5239 + }, + { + "epoch": 0.56, + "grad_norm": 0.0691154552365852, + "learning_rate": 0.00042214391229682564, + "loss": 1.3609, + "step": 5240 + }, + { + "epoch": 0.56, + "grad_norm": 0.08822877788777062, + "learning_rate": 0.0004219719333704458, + "loss": 1.5357, + "step": 5241 + }, + { + "epoch": 0.56, + "grad_norm": 0.07321921465130389, + "learning_rate": 0.00042179996390523817, + "loss": 1.4246, + "step": 5242 + }, + { + "epoch": 0.56, + "grad_norm": 0.07663401686520896, + "learning_rate": 0.0004216280039220544, + "loss": 1.4507, + "step": 5243 + }, + { + "epoch": 0.56, + "grad_norm": 0.06902686680633362, + "learning_rate": 0.00042145605344174524, + "loss": 1.3971, + "step": 5244 + }, + { + "epoch": 0.56, + "grad_norm": 0.06877425014001001, + "learning_rate": 0.0004212841124851605, + "loss": 1.4818, + "step": 5245 + }, + { + "epoch": 0.56, + "grad_norm": 0.0796041285678449, + "learning_rate": 0.00042111218107314846, + "loss": 1.4862, + "step": 5246 + }, + { + "epoch": 0.56, + "grad_norm": 0.07232671407705622, + "learning_rate": 0.00042094025922655636, + "loss": 1.493, + "step": 5247 + }, + { + "epoch": 0.56, + "grad_norm": 0.06841978145994511, + "learning_rate": 0.0004207683469662305, + "loss": 1.61, + "step": 5248 + }, + { + "epoch": 0.56, + "grad_norm": 0.08341542608065328, + "learning_rate": 0.0004205964443130157, + "loss": 1.4736, + "step": 5249 + }, + { + "epoch": 0.56, + "grad_norm": 0.08253205503423554, + "learning_rate": 0.0004204245512877557, + "loss": 1.4058, + "step": 5250 + }, + { + "epoch": 0.56, + "grad_norm": 0.07622013336203287, + "learning_rate": 0.00042025266791129325, + "loss": 1.3469, + "step": 5251 + }, + { + "epoch": 0.56, + "grad_norm": 0.07590685646308382, + "learning_rate": 0.00042008079420446985, + "loss": 1.4038, + "step": 5252 + }, + { + "epoch": 0.56, + "grad_norm": 0.06613348327522989, + "learning_rate": 0.0004199089301881256, + "loss": 1.2766, + "step": 5253 + }, + { + "epoch": 0.56, + "grad_norm": 0.06636202703584451, + "learning_rate": 0.0004197370758830997, + "loss": 1.4253, + "step": 5254 + }, + { + "epoch": 0.56, + "grad_norm": 0.08224651639354623, + "learning_rate": 0.0004195652313102299, + "loss": 1.3336, + "step": 5255 + }, + { + "epoch": 0.57, + "grad_norm": 0.07657486580957353, + "learning_rate": 0.00041939339649035325, + "loss": 1.4397, + "step": 5256 + }, + { + "epoch": 0.57, + "grad_norm": 0.07238429465194317, + "learning_rate": 0.0004192215714443052, + "loss": 1.3522, + "step": 5257 + }, + { + "epoch": 0.57, + "grad_norm": 0.07108040706690585, + "learning_rate": 0.00041904975619292003, + "loss": 1.4461, + "step": 5258 + }, + { + "epoch": 0.57, + "grad_norm": 0.07753429464243641, + "learning_rate": 0.00041887795075703095, + "loss": 1.4958, + "step": 5259 + }, + { + "epoch": 0.57, + "grad_norm": 0.0755302483106443, + "learning_rate": 0.00041870615515747007, + "loss": 1.4597, + "step": 5260 + }, + { + "epoch": 0.57, + "grad_norm": 0.06787790634763187, + "learning_rate": 0.00041853436941506817, + "loss": 1.3877, + "step": 5261 + }, + { + "epoch": 0.57, + "grad_norm": 0.0661392999583233, + "learning_rate": 0.00041836259355065473, + "loss": 1.3623, + "step": 5262 + }, + { + "epoch": 0.57, + "grad_norm": 0.07417584421347115, + "learning_rate": 0.00041819082758505857, + "loss": 1.4985, + "step": 5263 + }, + { + "epoch": 0.57, + "grad_norm": 0.0684843530224598, + "learning_rate": 0.00041801907153910623, + "loss": 1.3694, + "step": 5264 + }, + { + "epoch": 0.57, + "grad_norm": 0.0726643145095135, + "learning_rate": 0.0004178473254336242, + "loss": 1.4311, + "step": 5265 + }, + { + "epoch": 0.57, + "grad_norm": 0.07855844880744575, + "learning_rate": 0.00041767558928943734, + "loss": 1.5054, + "step": 5266 + }, + { + "epoch": 0.57, + "grad_norm": 0.06463843675977816, + "learning_rate": 0.0004175038631273689, + "loss": 1.4252, + "step": 5267 + }, + { + "epoch": 0.57, + "grad_norm": 0.07200443421609361, + "learning_rate": 0.0004173321469682415, + "loss": 1.5673, + "step": 5268 + }, + { + "epoch": 0.57, + "grad_norm": 0.07157206049559349, + "learning_rate": 0.0004171604408328764, + "loss": 1.4307, + "step": 5269 + }, + { + "epoch": 0.57, + "grad_norm": 0.07888986817590185, + "learning_rate": 0.00041698874474209327, + "loss": 1.355, + "step": 5270 + }, + { + "epoch": 0.57, + "grad_norm": 0.08408446086913257, + "learning_rate": 0.0004168170587167111, + "loss": 1.5721, + "step": 5271 + }, + { + "epoch": 0.57, + "grad_norm": 0.07150037519993824, + "learning_rate": 0.0004166453827775474, + "loss": 1.4548, + "step": 5272 + }, + { + "epoch": 0.57, + "grad_norm": 0.0723367901238978, + "learning_rate": 0.00041647371694541845, + "loss": 1.3577, + "step": 5273 + }, + { + "epoch": 0.57, + "grad_norm": 0.0668665999900835, + "learning_rate": 0.00041630206124113923, + "loss": 1.3156, + "step": 5274 + }, + { + "epoch": 0.57, + "grad_norm": 0.07303698085136547, + "learning_rate": 0.0004161304156855235, + "loss": 1.3796, + "step": 5275 + }, + { + "epoch": 0.57, + "grad_norm": 0.07114217269771465, + "learning_rate": 0.00041595878029938415, + "loss": 1.4391, + "step": 5276 + }, + { + "epoch": 0.57, + "grad_norm": 0.07038735670023649, + "learning_rate": 0.0004157871551035324, + "loss": 1.3433, + "step": 5277 + }, + { + "epoch": 0.57, + "grad_norm": 0.0765105156773969, + "learning_rate": 0.0004156155401187783, + "loss": 1.3778, + "step": 5278 + }, + { + "epoch": 0.57, + "grad_norm": 0.06795495327597158, + "learning_rate": 0.00041544393536593096, + "loss": 1.3867, + "step": 5279 + }, + { + "epoch": 0.57, + "grad_norm": 0.07040727125995247, + "learning_rate": 0.000415272340865798, + "loss": 1.419, + "step": 5280 + }, + { + "epoch": 0.57, + "grad_norm": 0.07715273193285668, + "learning_rate": 0.0004151007566391857, + "loss": 1.3902, + "step": 5281 + }, + { + "epoch": 0.57, + "grad_norm": 0.07115238949389142, + "learning_rate": 0.0004149291827068995, + "loss": 1.3634, + "step": 5282 + }, + { + "epoch": 0.57, + "grad_norm": 0.06990145449700264, + "learning_rate": 0.00041475761908974315, + "loss": 1.3727, + "step": 5283 + }, + { + "epoch": 0.57, + "grad_norm": 0.07417513717261448, + "learning_rate": 0.00041458606580851943, + "loss": 1.3928, + "step": 5284 + }, + { + "epoch": 0.57, + "grad_norm": 0.07342873182353658, + "learning_rate": 0.0004144145228840296, + "loss": 1.3938, + "step": 5285 + }, + { + "epoch": 0.57, + "grad_norm": 0.07729370889778275, + "learning_rate": 0.00041424299033707384, + "loss": 1.3694, + "step": 5286 + }, + { + "epoch": 0.57, + "grad_norm": 0.07807411980809978, + "learning_rate": 0.00041407146818845124, + "loss": 1.4838, + "step": 5287 + }, + { + "epoch": 0.57, + "grad_norm": 0.07769125995931969, + "learning_rate": 0.0004138999564589594, + "loss": 1.3846, + "step": 5288 + }, + { + "epoch": 0.57, + "grad_norm": 0.07203153523933757, + "learning_rate": 0.00041372845516939456, + "loss": 1.4529, + "step": 5289 + }, + { + "epoch": 0.57, + "grad_norm": 0.0852466033918914, + "learning_rate": 0.000413556964340552, + "loss": 1.435, + "step": 5290 + }, + { + "epoch": 0.57, + "grad_norm": 0.07876953151957321, + "learning_rate": 0.0004133854839932256, + "loss": 1.3949, + "step": 5291 + }, + { + "epoch": 0.57, + "grad_norm": 0.07922352701188885, + "learning_rate": 0.00041321401414820785, + "loss": 1.3702, + "step": 5292 + }, + { + "epoch": 0.57, + "grad_norm": 0.07789017134336326, + "learning_rate": 0.00041304255482628997, + "loss": 1.399, + "step": 5293 + }, + { + "epoch": 0.57, + "grad_norm": 0.07120922513834405, + "learning_rate": 0.00041287110604826233, + "loss": 1.2416, + "step": 5294 + }, + { + "epoch": 0.57, + "grad_norm": 0.09864215287276547, + "learning_rate": 0.0004126996678349133, + "loss": 1.4479, + "step": 5295 + }, + { + "epoch": 0.57, + "grad_norm": 0.07540022268253116, + "learning_rate": 0.0004125282402070306, + "loss": 1.5072, + "step": 5296 + }, + { + "epoch": 0.57, + "grad_norm": 0.07117404624697435, + "learning_rate": 0.0004123568231854003, + "loss": 1.4258, + "step": 5297 + }, + { + "epoch": 0.57, + "grad_norm": 0.07632072817859777, + "learning_rate": 0.00041218541679080724, + "loss": 1.3239, + "step": 5298 + }, + { + "epoch": 0.57, + "grad_norm": 0.07545200184891443, + "learning_rate": 0.00041201402104403516, + "loss": 1.4645, + "step": 5299 + }, + { + "epoch": 0.57, + "grad_norm": 0.0755586931749317, + "learning_rate": 0.00041184263596586644, + "loss": 1.3638, + "step": 5300 + }, + { + "epoch": 0.57, + "grad_norm": 0.07534667284843548, + "learning_rate": 0.00041167126157708194, + "loss": 1.2639, + "step": 5301 + }, + { + "epoch": 0.57, + "grad_norm": 0.07371800282548627, + "learning_rate": 0.0004114998978984616, + "loss": 1.3628, + "step": 5302 + }, + { + "epoch": 0.57, + "grad_norm": 0.06979442007264192, + "learning_rate": 0.0004113285449507837, + "loss": 1.5124, + "step": 5303 + }, + { + "epoch": 0.57, + "grad_norm": 0.07518472070533627, + "learning_rate": 0.00041115720275482535, + "loss": 1.3375, + "step": 5304 + }, + { + "epoch": 0.57, + "grad_norm": 0.07145575246092586, + "learning_rate": 0.0004109858713313628, + "loss": 1.268, + "step": 5305 + }, + { + "epoch": 0.57, + "grad_norm": 0.07227095780800502, + "learning_rate": 0.0004108145507011698, + "loss": 1.4299, + "step": 5306 + }, + { + "epoch": 0.57, + "grad_norm": 0.06962460632342404, + "learning_rate": 0.0004106432408850202, + "loss": 1.3067, + "step": 5307 + }, + { + "epoch": 0.57, + "grad_norm": 0.06967858569287429, + "learning_rate": 0.0004104719419036856, + "loss": 1.4634, + "step": 5308 + }, + { + "epoch": 0.57, + "grad_norm": 0.07452803560588936, + "learning_rate": 0.00041030065377793673, + "loss": 1.2936, + "step": 5309 + }, + { + "epoch": 0.57, + "grad_norm": 0.07198736708156497, + "learning_rate": 0.0004101293765285429, + "loss": 1.399, + "step": 5310 + }, + { + "epoch": 0.57, + "grad_norm": 0.07088761102492468, + "learning_rate": 0.00040995811017627195, + "loss": 1.5214, + "step": 5311 + }, + { + "epoch": 0.57, + "grad_norm": 0.08037863075518545, + "learning_rate": 0.0004097868547418905, + "loss": 1.538, + "step": 5312 + }, + { + "epoch": 0.57, + "grad_norm": 0.06702118564346261, + "learning_rate": 0.00040961561024616393, + "loss": 1.4452, + "step": 5313 + }, + { + "epoch": 0.57, + "grad_norm": 0.07138907507596554, + "learning_rate": 0.00040944437670985636, + "loss": 1.4375, + "step": 5314 + }, + { + "epoch": 0.57, + "grad_norm": 0.0713287767088554, + "learning_rate": 0.00040927315415373013, + "loss": 1.3658, + "step": 5315 + }, + { + "epoch": 0.57, + "grad_norm": 0.07246596365203058, + "learning_rate": 0.00040910194259854705, + "loss": 1.3858, + "step": 5316 + }, + { + "epoch": 0.57, + "grad_norm": 0.0638642019848533, + "learning_rate": 0.00040893074206506646, + "loss": 1.449, + "step": 5317 + }, + { + "epoch": 0.57, + "grad_norm": 0.07467248996983068, + "learning_rate": 0.0004087595525740475, + "loss": 1.45, + "step": 5318 + }, + { + "epoch": 0.57, + "grad_norm": 0.06856809835411172, + "learning_rate": 0.0004085883741462474, + "loss": 1.4768, + "step": 5319 + }, + { + "epoch": 0.57, + "grad_norm": 0.07499617753743705, + "learning_rate": 0.00040841720680242187, + "loss": 1.4109, + "step": 5320 + }, + { + "epoch": 0.57, + "grad_norm": 0.07075707467749452, + "learning_rate": 0.0004082460505633259, + "loss": 1.5193, + "step": 5321 + }, + { + "epoch": 0.57, + "grad_norm": 0.08615689422162827, + "learning_rate": 0.0004080749054497126, + "loss": 1.3668, + "step": 5322 + }, + { + "epoch": 0.57, + "grad_norm": 0.08010696554699227, + "learning_rate": 0.00040790377148233406, + "loss": 1.5512, + "step": 5323 + }, + { + "epoch": 0.57, + "grad_norm": 0.06867799500138569, + "learning_rate": 0.00040773264868194047, + "loss": 1.5605, + "step": 5324 + }, + { + "epoch": 0.57, + "grad_norm": 0.07358887873840544, + "learning_rate": 0.0004075615370692815, + "loss": 1.3062, + "step": 5325 + }, + { + "epoch": 0.57, + "grad_norm": 0.07190535444709516, + "learning_rate": 0.0004073904366651049, + "loss": 1.5042, + "step": 5326 + }, + { + "epoch": 0.57, + "grad_norm": 0.07395900114719453, + "learning_rate": 0.0004072193474901571, + "loss": 1.4081, + "step": 5327 + }, + { + "epoch": 0.57, + "grad_norm": 0.07036662502059775, + "learning_rate": 0.0004070482695651832, + "loss": 1.4774, + "step": 5328 + }, + { + "epoch": 0.57, + "grad_norm": 0.0837011412853149, + "learning_rate": 0.000406877202910927, + "loss": 1.4898, + "step": 5329 + }, + { + "epoch": 0.57, + "grad_norm": 0.06325539977822217, + "learning_rate": 0.00040670614754813105, + "loss": 1.3729, + "step": 5330 + }, + { + "epoch": 0.57, + "grad_norm": 0.06890633970155305, + "learning_rate": 0.0004065351034975364, + "loss": 1.3829, + "step": 5331 + }, + { + "epoch": 0.57, + "grad_norm": 0.06839743814713234, + "learning_rate": 0.0004063640707798826, + "loss": 1.5331, + "step": 5332 + }, + { + "epoch": 0.57, + "grad_norm": 0.07358862957787274, + "learning_rate": 0.00040619304941590806, + "loss": 1.3844, + "step": 5333 + }, + { + "epoch": 0.57, + "grad_norm": 0.08022228572519535, + "learning_rate": 0.00040602203942634974, + "loss": 1.4749, + "step": 5334 + }, + { + "epoch": 0.57, + "grad_norm": 0.07053418115513409, + "learning_rate": 0.00040585104083194296, + "loss": 1.5135, + "step": 5335 + }, + { + "epoch": 0.57, + "grad_norm": 0.07580591745380308, + "learning_rate": 0.0004056800536534222, + "loss": 1.476, + "step": 5336 + }, + { + "epoch": 0.57, + "grad_norm": 0.0740733400567423, + "learning_rate": 0.0004055090779115202, + "loss": 1.4435, + "step": 5337 + }, + { + "epoch": 0.57, + "grad_norm": 0.07623581006510774, + "learning_rate": 0.0004053381136269682, + "loss": 1.3887, + "step": 5338 + }, + { + "epoch": 0.57, + "grad_norm": 0.07726090719594393, + "learning_rate": 0.0004051671608204962, + "loss": 1.3204, + "step": 5339 + }, + { + "epoch": 0.57, + "grad_norm": 0.07643636380776819, + "learning_rate": 0.00040499621951283285, + "loss": 1.5598, + "step": 5340 + }, + { + "epoch": 0.57, + "grad_norm": 0.06916237569138638, + "learning_rate": 0.0004048252897247054, + "loss": 1.2734, + "step": 5341 + }, + { + "epoch": 0.57, + "grad_norm": 0.07440005449676713, + "learning_rate": 0.00040465437147683985, + "loss": 1.477, + "step": 5342 + }, + { + "epoch": 0.57, + "grad_norm": 0.0768591188002704, + "learning_rate": 0.0004044834647899603, + "loss": 1.4269, + "step": 5343 + }, + { + "epoch": 0.57, + "grad_norm": 0.07706826003310004, + "learning_rate": 0.00040431256968479, + "loss": 1.3852, + "step": 5344 + }, + { + "epoch": 0.57, + "grad_norm": 0.07365496429342674, + "learning_rate": 0.00040414168618205053, + "loss": 1.5324, + "step": 5345 + }, + { + "epoch": 0.57, + "grad_norm": 0.06961222207303024, + "learning_rate": 0.000403970814302462, + "loss": 1.4392, + "step": 5346 + }, + { + "epoch": 0.57, + "grad_norm": 0.07663822923497113, + "learning_rate": 0.0004037999540667436, + "loss": 1.3213, + "step": 5347 + }, + { + "epoch": 0.57, + "grad_norm": 0.08041052042443397, + "learning_rate": 0.00040362910549561216, + "loss": 1.4161, + "step": 5348 + }, + { + "epoch": 0.58, + "grad_norm": 0.07464715250262671, + "learning_rate": 0.000403458268609784, + "loss": 1.5022, + "step": 5349 + }, + { + "epoch": 0.58, + "grad_norm": 0.06798489895346704, + "learning_rate": 0.0004032874434299735, + "loss": 1.329, + "step": 5350 + }, + { + "epoch": 0.58, + "grad_norm": 0.07330991474639785, + "learning_rate": 0.0004031166299768939, + "loss": 1.4582, + "step": 5351 + }, + { + "epoch": 0.58, + "grad_norm": 0.07128785978243318, + "learning_rate": 0.000402945828271257, + "loss": 1.3826, + "step": 5352 + }, + { + "epoch": 0.58, + "grad_norm": 0.07672704144785629, + "learning_rate": 0.000402775038333773, + "loss": 1.3935, + "step": 5353 + }, + { + "epoch": 0.58, + "grad_norm": 0.06971968315018112, + "learning_rate": 0.00040260426018515086, + "loss": 1.3436, + "step": 5354 + }, + { + "epoch": 0.58, + "grad_norm": 0.08551247843667656, + "learning_rate": 0.0004024334938460978, + "loss": 1.5431, + "step": 5355 + }, + { + "epoch": 0.58, + "grad_norm": 0.06496244821036254, + "learning_rate": 0.00040226273933732006, + "loss": 1.4034, + "step": 5356 + }, + { + "epoch": 0.58, + "grad_norm": 0.06880854364756596, + "learning_rate": 0.0004020919966795221, + "loss": 1.3686, + "step": 5357 + }, + { + "epoch": 0.58, + "grad_norm": 0.061631643039155914, + "learning_rate": 0.0004019212658934073, + "loss": 1.4865, + "step": 5358 + }, + { + "epoch": 0.58, + "grad_norm": 0.0876656022645856, + "learning_rate": 0.0004017505469996769, + "loss": 1.4607, + "step": 5359 + }, + { + "epoch": 0.58, + "grad_norm": 0.07698134695084392, + "learning_rate": 0.0004015798400190314, + "loss": 1.3628, + "step": 5360 + }, + { + "epoch": 0.58, + "grad_norm": 0.07046999280271365, + "learning_rate": 0.0004014091449721696, + "loss": 1.3769, + "step": 5361 + }, + { + "epoch": 0.58, + "grad_norm": 0.07410553795969785, + "learning_rate": 0.00040123846187978897, + "loss": 1.337, + "step": 5362 + }, + { + "epoch": 0.58, + "grad_norm": 0.07474813390239679, + "learning_rate": 0.00040106779076258515, + "loss": 1.3158, + "step": 5363 + }, + { + "epoch": 0.58, + "grad_norm": 0.07356400677418831, + "learning_rate": 0.00040089713164125285, + "loss": 1.4531, + "step": 5364 + }, + { + "epoch": 0.58, + "grad_norm": 0.08282254384195048, + "learning_rate": 0.00040072648453648506, + "loss": 1.4346, + "step": 5365 + }, + { + "epoch": 0.58, + "grad_norm": 0.08105180775455986, + "learning_rate": 0.00040055584946897317, + "loss": 1.5177, + "step": 5366 + }, + { + "epoch": 0.58, + "grad_norm": 0.06655886062838566, + "learning_rate": 0.00040038522645940744, + "loss": 1.3714, + "step": 5367 + }, + { + "epoch": 0.58, + "grad_norm": 0.0762914733486254, + "learning_rate": 0.0004002146155284764, + "loss": 1.355, + "step": 5368 + }, + { + "epoch": 0.58, + "grad_norm": 0.07308623399730424, + "learning_rate": 0.0004000440166968674, + "loss": 1.3991, + "step": 5369 + }, + { + "epoch": 0.58, + "grad_norm": 0.07344200850911864, + "learning_rate": 0.0003998734299852659, + "loss": 1.4345, + "step": 5370 + }, + { + "epoch": 0.58, + "grad_norm": 0.07950989021718434, + "learning_rate": 0.00039970285541435604, + "loss": 1.2364, + "step": 5371 + }, + { + "epoch": 0.58, + "grad_norm": 0.06865838341036416, + "learning_rate": 0.0003995322930048208, + "loss": 1.3799, + "step": 5372 + }, + { + "epoch": 0.58, + "grad_norm": 0.08467264426498419, + "learning_rate": 0.0003993617427773415, + "loss": 1.4482, + "step": 5373 + }, + { + "epoch": 0.58, + "grad_norm": 0.07733714287405516, + "learning_rate": 0.0003991912047525976, + "loss": 1.4416, + "step": 5374 + }, + { + "epoch": 0.58, + "grad_norm": 0.11476822294757576, + "learning_rate": 0.0003990206789512679, + "loss": 1.4573, + "step": 5375 + }, + { + "epoch": 0.58, + "grad_norm": 0.07610655086519266, + "learning_rate": 0.00039885016539402895, + "loss": 1.3066, + "step": 5376 + }, + { + "epoch": 0.58, + "grad_norm": 0.07716286349323345, + "learning_rate": 0.0003986796641015561, + "loss": 1.3914, + "step": 5377 + }, + { + "epoch": 0.58, + "grad_norm": 0.08078118508279136, + "learning_rate": 0.0003985091750945233, + "loss": 1.3834, + "step": 5378 + }, + { + "epoch": 0.58, + "grad_norm": 0.08079701422701666, + "learning_rate": 0.0003983386983936031, + "loss": 1.3173, + "step": 5379 + }, + { + "epoch": 0.58, + "grad_norm": 0.07174180247292715, + "learning_rate": 0.0003981682340194661, + "loss": 1.6485, + "step": 5380 + }, + { + "epoch": 0.58, + "grad_norm": 0.08102967123260112, + "learning_rate": 0.00039799778199278176, + "loss": 1.4069, + "step": 5381 + }, + { + "epoch": 0.58, + "grad_norm": 0.08146881326489869, + "learning_rate": 0.0003978273423342179, + "loss": 1.3601, + "step": 5382 + }, + { + "epoch": 0.58, + "grad_norm": 0.07227396353091131, + "learning_rate": 0.00039765691506444113, + "loss": 1.5108, + "step": 5383 + }, + { + "epoch": 0.58, + "grad_norm": 0.07519482559814357, + "learning_rate": 0.00039748650020411625, + "loss": 1.3298, + "step": 5384 + }, + { + "epoch": 0.58, + "grad_norm": 0.06780383496089704, + "learning_rate": 0.00039731609777390647, + "loss": 1.4425, + "step": 5385 + }, + { + "epoch": 0.58, + "grad_norm": 0.06902960397709067, + "learning_rate": 0.000397145707794474, + "loss": 1.3888, + "step": 5386 + }, + { + "epoch": 0.58, + "grad_norm": 0.06986637026932582, + "learning_rate": 0.0003969753302864791, + "loss": 1.3675, + "step": 5387 + }, + { + "epoch": 0.58, + "grad_norm": 0.0691980178959223, + "learning_rate": 0.00039680496527058054, + "loss": 1.3784, + "step": 5388 + }, + { + "epoch": 0.58, + "grad_norm": 0.08156045851765133, + "learning_rate": 0.00039663461276743557, + "loss": 1.4175, + "step": 5389 + }, + { + "epoch": 0.58, + "grad_norm": 0.0704137566422376, + "learning_rate": 0.00039646427279770035, + "loss": 1.4708, + "step": 5390 + }, + { + "epoch": 0.58, + "grad_norm": 0.06468213372241707, + "learning_rate": 0.000396293945382029, + "loss": 1.3275, + "step": 5391 + }, + { + "epoch": 0.58, + "grad_norm": 0.07230454038232008, + "learning_rate": 0.00039612363054107423, + "loss": 1.4686, + "step": 5392 + }, + { + "epoch": 0.58, + "grad_norm": 0.06080188424261516, + "learning_rate": 0.00039595332829548736, + "loss": 1.3506, + "step": 5393 + }, + { + "epoch": 0.58, + "grad_norm": 0.08481811022519306, + "learning_rate": 0.00039578303866591804, + "loss": 1.4125, + "step": 5394 + }, + { + "epoch": 0.58, + "grad_norm": 0.07164820257648974, + "learning_rate": 0.00039561276167301475, + "loss": 1.398, + "step": 5395 + }, + { + "epoch": 0.58, + "grad_norm": 0.07295058056378592, + "learning_rate": 0.0003954424973374239, + "loss": 1.4414, + "step": 5396 + }, + { + "epoch": 0.58, + "grad_norm": 0.07617537325382101, + "learning_rate": 0.0003952722456797906, + "loss": 1.5086, + "step": 5397 + }, + { + "epoch": 0.58, + "grad_norm": 0.0723477940325076, + "learning_rate": 0.0003951020067207587, + "loss": 1.4446, + "step": 5398 + }, + { + "epoch": 0.58, + "grad_norm": 0.08723825363550239, + "learning_rate": 0.0003949317804809701, + "loss": 1.3639, + "step": 5399 + }, + { + "epoch": 0.58, + "grad_norm": 0.06936075965065762, + "learning_rate": 0.00039476156698106554, + "loss": 1.3929, + "step": 5400 + }, + { + "epoch": 0.58, + "grad_norm": 0.07934707933846326, + "learning_rate": 0.0003945913662416836, + "loss": 1.3018, + "step": 5401 + }, + { + "epoch": 0.58, + "grad_norm": 0.07516365654439944, + "learning_rate": 0.0003944211782834618, + "loss": 1.4766, + "step": 5402 + }, + { + "epoch": 0.58, + "grad_norm": 0.07511497051173129, + "learning_rate": 0.00039425100312703626, + "loss": 1.446, + "step": 5403 + }, + { + "epoch": 0.58, + "grad_norm": 0.07399958481633905, + "learning_rate": 0.00039408084079304114, + "loss": 1.4446, + "step": 5404 + }, + { + "epoch": 0.58, + "grad_norm": 0.07322395813105882, + "learning_rate": 0.0003939106913021091, + "loss": 1.4424, + "step": 5405 + }, + { + "epoch": 0.58, + "grad_norm": 0.08006397128046831, + "learning_rate": 0.00039374055467487163, + "loss": 1.4953, + "step": 5406 + }, + { + "epoch": 0.58, + "grad_norm": 0.08113213797120367, + "learning_rate": 0.00039357043093195824, + "loss": 1.4554, + "step": 5407 + }, + { + "epoch": 0.58, + "grad_norm": 0.08034824832299563, + "learning_rate": 0.0003934003200939969, + "loss": 1.2838, + "step": 5408 + }, + { + "epoch": 0.58, + "grad_norm": 0.07582701660283922, + "learning_rate": 0.00039323022218161436, + "loss": 1.3473, + "step": 5409 + }, + { + "epoch": 0.58, + "grad_norm": 0.07262946774124066, + "learning_rate": 0.0003930601372154355, + "loss": 1.4852, + "step": 5410 + }, + { + "epoch": 0.58, + "grad_norm": 0.0816848756450506, + "learning_rate": 0.00039289006521608374, + "loss": 1.3998, + "step": 5411 + }, + { + "epoch": 0.58, + "grad_norm": 0.08026040007883284, + "learning_rate": 0.00039272000620418077, + "loss": 1.3904, + "step": 5412 + }, + { + "epoch": 0.58, + "grad_norm": 0.08005161253565878, + "learning_rate": 0.0003925499602003467, + "loss": 1.3568, + "step": 5413 + }, + { + "epoch": 0.58, + "grad_norm": 0.08971571870062349, + "learning_rate": 0.00039237992722520064, + "loss": 1.3311, + "step": 5414 + }, + { + "epoch": 0.58, + "grad_norm": 0.0753830626442861, + "learning_rate": 0.00039220990729935937, + "loss": 1.4081, + "step": 5415 + }, + { + "epoch": 0.58, + "grad_norm": 0.08683335516423724, + "learning_rate": 0.0003920399004434383, + "loss": 1.4205, + "step": 5416 + }, + { + "epoch": 0.58, + "grad_norm": 0.08114337234622578, + "learning_rate": 0.0003918699066780517, + "loss": 1.4885, + "step": 5417 + }, + { + "epoch": 0.58, + "grad_norm": 0.07527136672140505, + "learning_rate": 0.0003916999260238115, + "loss": 1.3366, + "step": 5418 + }, + { + "epoch": 0.58, + "grad_norm": 0.07529646089819533, + "learning_rate": 0.0003915299585013288, + "loss": 1.4195, + "step": 5419 + }, + { + "epoch": 0.58, + "grad_norm": 0.08565027017623841, + "learning_rate": 0.00039136000413121246, + "loss": 1.4588, + "step": 5420 + }, + { + "epoch": 0.58, + "grad_norm": 0.07864361897315884, + "learning_rate": 0.00039119006293407025, + "loss": 1.4351, + "step": 5421 + }, + { + "epoch": 0.58, + "grad_norm": 0.07915941892313927, + "learning_rate": 0.0003910201349305081, + "loss": 1.3672, + "step": 5422 + }, + { + "epoch": 0.58, + "grad_norm": 0.07156264135526873, + "learning_rate": 0.0003908502201411303, + "loss": 1.4315, + "step": 5423 + }, + { + "epoch": 0.58, + "grad_norm": 0.07618419764564519, + "learning_rate": 0.0003906803185865395, + "loss": 1.5126, + "step": 5424 + }, + { + "epoch": 0.58, + "grad_norm": 0.0814665018963305, + "learning_rate": 0.0003905104302873369, + "loss": 1.467, + "step": 5425 + }, + { + "epoch": 0.58, + "grad_norm": 0.06795989374856158, + "learning_rate": 0.0003903405552641222, + "loss": 1.4223, + "step": 5426 + }, + { + "epoch": 0.58, + "grad_norm": 0.07673542464248548, + "learning_rate": 0.00039017069353749324, + "loss": 1.3443, + "step": 5427 + }, + { + "epoch": 0.58, + "grad_norm": 0.07803117959879066, + "learning_rate": 0.00039000084512804615, + "loss": 1.4296, + "step": 5428 + }, + { + "epoch": 0.58, + "grad_norm": 0.07549394873728246, + "learning_rate": 0.000389831010056376, + "loss": 1.4681, + "step": 5429 + }, + { + "epoch": 0.58, + "grad_norm": 0.06315628829607588, + "learning_rate": 0.00038966118834307564, + "loss": 1.4331, + "step": 5430 + }, + { + "epoch": 0.58, + "grad_norm": 0.07803465268274827, + "learning_rate": 0.0003894913800087365, + "loss": 1.3954, + "step": 5431 + }, + { + "epoch": 0.58, + "grad_norm": 0.0775945281904031, + "learning_rate": 0.00038932158507394885, + "loss": 1.5557, + "step": 5432 + }, + { + "epoch": 0.58, + "grad_norm": 0.07374023221278247, + "learning_rate": 0.0003891518035593002, + "loss": 1.4375, + "step": 5433 + }, + { + "epoch": 0.58, + "grad_norm": 0.06745322580856865, + "learning_rate": 0.00038898203548537767, + "loss": 1.4085, + "step": 5434 + }, + { + "epoch": 0.58, + "grad_norm": 0.07268110095101987, + "learning_rate": 0.0003888122808727661, + "loss": 1.2798, + "step": 5435 + }, + { + "epoch": 0.58, + "grad_norm": 0.06780737778028098, + "learning_rate": 0.00038864253974204866, + "loss": 1.42, + "step": 5436 + }, + { + "epoch": 0.58, + "grad_norm": 0.06232751201032706, + "learning_rate": 0.0003884728121138073, + "loss": 1.368, + "step": 5437 + }, + { + "epoch": 0.58, + "grad_norm": 0.07425964477371508, + "learning_rate": 0.00038830309800862194, + "loss": 1.3845, + "step": 5438 + }, + { + "epoch": 0.58, + "grad_norm": 0.0696927104312303, + "learning_rate": 0.00038813339744707095, + "loss": 1.5026, + "step": 5439 + }, + { + "epoch": 0.58, + "grad_norm": 0.08829232052578292, + "learning_rate": 0.0003879637104497313, + "loss": 1.459, + "step": 5440 + }, + { + "epoch": 0.58, + "grad_norm": 0.0800197842374922, + "learning_rate": 0.00038779403703717797, + "loss": 1.3374, + "step": 5441 + }, + { + "epoch": 0.59, + "grad_norm": 0.08461196112841203, + "learning_rate": 0.0003876243772299843, + "loss": 1.4803, + "step": 5442 + }, + { + "epoch": 0.59, + "grad_norm": 0.07591094969169548, + "learning_rate": 0.00038745473104872277, + "loss": 1.4347, + "step": 5443 + }, + { + "epoch": 0.59, + "grad_norm": 0.06951993132870146, + "learning_rate": 0.00038728509851396274, + "loss": 1.3894, + "step": 5444 + }, + { + "epoch": 0.59, + "grad_norm": 0.0709955709325333, + "learning_rate": 0.0003871154796462732, + "loss": 1.4263, + "step": 5445 + }, + { + "epoch": 0.59, + "grad_norm": 0.07217695702237546, + "learning_rate": 0.00038694587446622093, + "loss": 1.4771, + "step": 5446 + }, + { + "epoch": 0.59, + "grad_norm": 0.07624851858056204, + "learning_rate": 0.000386776282994371, + "loss": 1.5054, + "step": 5447 + }, + { + "epoch": 0.59, + "grad_norm": 0.0805471238267498, + "learning_rate": 0.0003866067052512872, + "loss": 1.4343, + "step": 5448 + }, + { + "epoch": 0.59, + "grad_norm": 0.07193135020200961, + "learning_rate": 0.0003864371412575314, + "loss": 1.4548, + "step": 5449 + }, + { + "epoch": 0.59, + "grad_norm": 0.07357020438933615, + "learning_rate": 0.00038626759103366367, + "loss": 1.4873, + "step": 5450 + }, + { + "epoch": 0.59, + "grad_norm": 0.08159699049650707, + "learning_rate": 0.0003860980546002425, + "loss": 1.4111, + "step": 5451 + }, + { + "epoch": 0.59, + "grad_norm": 0.08174122049659675, + "learning_rate": 0.00038592853197782503, + "loss": 1.3474, + "step": 5452 + }, + { + "epoch": 0.59, + "grad_norm": 0.07962770380661251, + "learning_rate": 0.0003857590231869664, + "loss": 1.3244, + "step": 5453 + }, + { + "epoch": 0.59, + "grad_norm": 0.07448555257556383, + "learning_rate": 0.00038558952824821994, + "loss": 1.4534, + "step": 5454 + }, + { + "epoch": 0.59, + "grad_norm": 0.07277064642875133, + "learning_rate": 0.0003854200471821376, + "loss": 1.4508, + "step": 5455 + }, + { + "epoch": 0.59, + "grad_norm": 0.08151762819863291, + "learning_rate": 0.0003852505800092695, + "loss": 1.3559, + "step": 5456 + }, + { + "epoch": 0.59, + "grad_norm": 0.08049679696768172, + "learning_rate": 0.0003850811267501643, + "loss": 1.3082, + "step": 5457 + }, + { + "epoch": 0.59, + "grad_norm": 0.06983411242009606, + "learning_rate": 0.0003849116874253686, + "loss": 1.4051, + "step": 5458 + }, + { + "epoch": 0.59, + "grad_norm": 0.07159626918035832, + "learning_rate": 0.0003847422620554276, + "loss": 1.4298, + "step": 5459 + }, + { + "epoch": 0.59, + "grad_norm": 0.08503770511535662, + "learning_rate": 0.00038457285066088463, + "loss": 1.3354, + "step": 5460 + }, + { + "epoch": 0.59, + "grad_norm": 0.0790162568405594, + "learning_rate": 0.00038440345326228156, + "loss": 1.3853, + "step": 5461 + }, + { + "epoch": 0.59, + "grad_norm": 0.0782423315532537, + "learning_rate": 0.0003842340698801581, + "loss": 1.4141, + "step": 5462 + }, + { + "epoch": 0.59, + "grad_norm": 0.07828919322816909, + "learning_rate": 0.0003840647005350529, + "loss": 1.3817, + "step": 5463 + }, + { + "epoch": 0.59, + "grad_norm": 0.07866537991857481, + "learning_rate": 0.00038389534524750266, + "loss": 1.2861, + "step": 5464 + }, + { + "epoch": 0.59, + "grad_norm": 0.07442515151685897, + "learning_rate": 0.00038372600403804185, + "loss": 1.4331, + "step": 5465 + }, + { + "epoch": 0.59, + "grad_norm": 0.08202431853846086, + "learning_rate": 0.000383556676927204, + "loss": 1.3602, + "step": 5466 + }, + { + "epoch": 0.59, + "grad_norm": 0.08207556685486656, + "learning_rate": 0.0003833873639355203, + "loss": 1.4111, + "step": 5467 + }, + { + "epoch": 0.59, + "grad_norm": 0.07216498981129676, + "learning_rate": 0.000383218065083521, + "loss": 1.3615, + "step": 5468 + }, + { + "epoch": 0.59, + "grad_norm": 0.07135005849767023, + "learning_rate": 0.0003830487803917338, + "loss": 1.4001, + "step": 5469 + }, + { + "epoch": 0.59, + "grad_norm": 0.07557193610406127, + "learning_rate": 0.00038287950988068507, + "loss": 1.4354, + "step": 5470 + }, + { + "epoch": 0.59, + "grad_norm": 0.07651272647178654, + "learning_rate": 0.0003827102535708997, + "loss": 1.4316, + "step": 5471 + }, + { + "epoch": 0.59, + "grad_norm": 0.07205282353825856, + "learning_rate": 0.00038254101148290043, + "loss": 1.6366, + "step": 5472 + }, + { + "epoch": 0.59, + "grad_norm": 0.08323470409611829, + "learning_rate": 0.00038237178363720835, + "loss": 1.413, + "step": 5473 + }, + { + "epoch": 0.59, + "grad_norm": 0.06608968374186841, + "learning_rate": 0.00038220257005434314, + "loss": 1.3479, + "step": 5474 + }, + { + "epoch": 0.59, + "grad_norm": 0.08621171799268833, + "learning_rate": 0.00038203337075482245, + "loss": 1.4297, + "step": 5475 + }, + { + "epoch": 0.59, + "grad_norm": 0.06968222782720418, + "learning_rate": 0.00038186418575916227, + "loss": 1.3863, + "step": 5476 + }, + { + "epoch": 0.59, + "grad_norm": 0.08547720995611516, + "learning_rate": 0.0003816950150878768, + "loss": 1.4868, + "step": 5477 + }, + { + "epoch": 0.59, + "grad_norm": 0.07746098530633695, + "learning_rate": 0.0003815258587614785, + "loss": 1.3578, + "step": 5478 + }, + { + "epoch": 0.59, + "grad_norm": 0.07851996940602321, + "learning_rate": 0.00038135671680047836, + "loss": 1.3269, + "step": 5479 + }, + { + "epoch": 0.59, + "grad_norm": 0.0671031884146776, + "learning_rate": 0.00038118758922538534, + "loss": 1.3328, + "step": 5480 + }, + { + "epoch": 0.59, + "grad_norm": 0.07086563839326887, + "learning_rate": 0.0003810184760567066, + "loss": 1.3801, + "step": 5481 + }, + { + "epoch": 0.59, + "grad_norm": 0.06829227406937718, + "learning_rate": 0.00038084937731494795, + "loss": 1.3466, + "step": 5482 + }, + { + "epoch": 0.59, + "grad_norm": 0.07644260647676682, + "learning_rate": 0.0003806802930206131, + "loss": 1.463, + "step": 5483 + }, + { + "epoch": 0.59, + "grad_norm": 0.07327912352959912, + "learning_rate": 0.00038051122319420405, + "loss": 1.5011, + "step": 5484 + }, + { + "epoch": 0.59, + "grad_norm": 0.07474131903712519, + "learning_rate": 0.00038034216785622126, + "loss": 1.4398, + "step": 5485 + }, + { + "epoch": 0.59, + "grad_norm": 0.06789310016224517, + "learning_rate": 0.0003801731270271629, + "loss": 1.4834, + "step": 5486 + }, + { + "epoch": 0.59, + "grad_norm": 0.07304893429551268, + "learning_rate": 0.0003800041007275261, + "loss": 1.4962, + "step": 5487 + }, + { + "epoch": 0.59, + "grad_norm": 0.06520647255990622, + "learning_rate": 0.00037983508897780583, + "loss": 1.3565, + "step": 5488 + }, + { + "epoch": 0.59, + "grad_norm": 0.06924010423700398, + "learning_rate": 0.00037966609179849526, + "loss": 1.3497, + "step": 5489 + }, + { + "epoch": 0.59, + "grad_norm": 0.07253469655081643, + "learning_rate": 0.0003794971092100858, + "loss": 1.363, + "step": 5490 + }, + { + "epoch": 0.59, + "grad_norm": 0.06834466384222009, + "learning_rate": 0.00037932814123306735, + "loss": 1.4292, + "step": 5491 + }, + { + "epoch": 0.59, + "grad_norm": 0.07204827925530317, + "learning_rate": 0.00037915918788792793, + "loss": 1.5018, + "step": 5492 + }, + { + "epoch": 0.59, + "grad_norm": 0.07344716634947808, + "learning_rate": 0.00037899024919515334, + "loss": 1.38, + "step": 5493 + }, + { + "epoch": 0.59, + "grad_norm": 0.07802721400721604, + "learning_rate": 0.0003788213251752284, + "loss": 1.4881, + "step": 5494 + }, + { + "epoch": 0.59, + "grad_norm": 0.07188704680601309, + "learning_rate": 0.00037865241584863547, + "loss": 1.3633, + "step": 5495 + }, + { + "epoch": 0.59, + "grad_norm": 0.06597149494787036, + "learning_rate": 0.0003784835212358556, + "loss": 1.4876, + "step": 5496 + }, + { + "epoch": 0.59, + "grad_norm": 0.07149421228145474, + "learning_rate": 0.00037831464135736764, + "loss": 1.4034, + "step": 5497 + }, + { + "epoch": 0.59, + "grad_norm": 0.07525825575415172, + "learning_rate": 0.00037814577623364884, + "loss": 1.2948, + "step": 5498 + }, + { + "epoch": 0.59, + "grad_norm": 0.08479029685348394, + "learning_rate": 0.00037797692588517487, + "loss": 1.4546, + "step": 5499 + }, + { + "epoch": 0.59, + "grad_norm": 0.07051587577136423, + "learning_rate": 0.00037780809033241926, + "loss": 1.5106, + "step": 5500 + }, + { + "epoch": 0.59, + "grad_norm": 0.07380593539742977, + "learning_rate": 0.00037763926959585387, + "loss": 1.3892, + "step": 5501 + }, + { + "epoch": 0.59, + "grad_norm": 0.06881571512378142, + "learning_rate": 0.000377470463695949, + "loss": 1.4178, + "step": 5502 + }, + { + "epoch": 0.59, + "grad_norm": 0.07028405289575519, + "learning_rate": 0.0003773016726531728, + "loss": 1.3714, + "step": 5503 + }, + { + "epoch": 0.59, + "grad_norm": 0.0666545859612911, + "learning_rate": 0.00037713289648799177, + "loss": 1.4037, + "step": 5504 + }, + { + "epoch": 0.59, + "grad_norm": 0.06767247145129425, + "learning_rate": 0.00037696413522087067, + "loss": 1.2454, + "step": 5505 + }, + { + "epoch": 0.59, + "grad_norm": 0.07342524302568901, + "learning_rate": 0.00037679538887227246, + "loss": 1.3988, + "step": 5506 + }, + { + "epoch": 0.59, + "grad_norm": 0.07264381277887294, + "learning_rate": 0.000376626657462658, + "loss": 1.5017, + "step": 5507 + }, + { + "epoch": 0.59, + "grad_norm": 0.06619640197119608, + "learning_rate": 0.00037645794101248665, + "loss": 1.3296, + "step": 5508 + }, + { + "epoch": 0.59, + "grad_norm": 0.08066585644997017, + "learning_rate": 0.00037628923954221574, + "loss": 1.4821, + "step": 5509 + }, + { + "epoch": 0.59, + "grad_norm": 0.0779357570182373, + "learning_rate": 0.00037612055307230114, + "loss": 1.3817, + "step": 5510 + }, + { + "epoch": 0.59, + "grad_norm": 0.08662510505643409, + "learning_rate": 0.0003759518816231966, + "loss": 1.2358, + "step": 5511 + }, + { + "epoch": 0.59, + "grad_norm": 0.06984981321199445, + "learning_rate": 0.0003757832252153539, + "loss": 1.2271, + "step": 5512 + }, + { + "epoch": 0.59, + "grad_norm": 0.07931974426531138, + "learning_rate": 0.00037561458386922356, + "loss": 1.4736, + "step": 5513 + }, + { + "epoch": 0.59, + "grad_norm": 0.08119674665971423, + "learning_rate": 0.0003754459576052537, + "loss": 1.399, + "step": 5514 + }, + { + "epoch": 0.59, + "grad_norm": 0.07910822175099083, + "learning_rate": 0.0003752773464438909, + "loss": 1.2959, + "step": 5515 + }, + { + "epoch": 0.59, + "grad_norm": 0.07694259855361689, + "learning_rate": 0.0003751087504055797, + "loss": 1.4247, + "step": 5516 + }, + { + "epoch": 0.59, + "grad_norm": 0.07256522449985622, + "learning_rate": 0.0003749401695107634, + "loss": 1.397, + "step": 5517 + }, + { + "epoch": 0.59, + "grad_norm": 0.08607831710188796, + "learning_rate": 0.00037477160377988246, + "loss": 1.5007, + "step": 5518 + }, + { + "epoch": 0.59, + "grad_norm": 0.07415689538573572, + "learning_rate": 0.0003746030532333763, + "loss": 1.4612, + "step": 5519 + }, + { + "epoch": 0.59, + "grad_norm": 0.07758296899328197, + "learning_rate": 0.0003744345178916823, + "loss": 1.3624, + "step": 5520 + }, + { + "epoch": 0.59, + "grad_norm": 0.08082820232713231, + "learning_rate": 0.00037426599777523573, + "loss": 1.4306, + "step": 5521 + }, + { + "epoch": 0.59, + "grad_norm": 0.07400004575923684, + "learning_rate": 0.0003740974929044706, + "loss": 1.3545, + "step": 5522 + }, + { + "epoch": 0.59, + "grad_norm": 0.07599643071172713, + "learning_rate": 0.00037392900329981843, + "loss": 1.3584, + "step": 5523 + }, + { + "epoch": 0.59, + "grad_norm": 0.07895934453748053, + "learning_rate": 0.00037376052898170913, + "loss": 1.4622, + "step": 5524 + }, + { + "epoch": 0.59, + "grad_norm": 0.0668384332444495, + "learning_rate": 0.00037359206997057107, + "loss": 1.5848, + "step": 5525 + }, + { + "epoch": 0.59, + "grad_norm": 0.07121712503947927, + "learning_rate": 0.0003734236262868303, + "loss": 1.3717, + "step": 5526 + }, + { + "epoch": 0.59, + "grad_norm": 0.0732117144452143, + "learning_rate": 0.00037325519795091127, + "loss": 1.5614, + "step": 5527 + }, + { + "epoch": 0.59, + "grad_norm": 0.08251413632425897, + "learning_rate": 0.00037308678498323647, + "loss": 1.3482, + "step": 5528 + }, + { + "epoch": 0.59, + "grad_norm": 0.07988610447393804, + "learning_rate": 0.00037291838740422623, + "loss": 1.441, + "step": 5529 + }, + { + "epoch": 0.59, + "grad_norm": 0.07343160375426805, + "learning_rate": 0.0003727500052342999, + "loss": 1.4126, + "step": 5530 + }, + { + "epoch": 0.59, + "grad_norm": 0.0750280771624314, + "learning_rate": 0.00037258163849387407, + "loss": 1.3603, + "step": 5531 + }, + { + "epoch": 0.59, + "grad_norm": 0.0699588962331154, + "learning_rate": 0.0003724132872033638, + "loss": 1.4122, + "step": 5532 + }, + { + "epoch": 0.59, + "grad_norm": 0.08618571728366133, + "learning_rate": 0.0003722449513831823, + "loss": 1.3199, + "step": 5533 + }, + { + "epoch": 0.59, + "grad_norm": 0.0752992863299164, + "learning_rate": 0.00037207663105374094, + "loss": 1.3892, + "step": 5534 + }, + { + "epoch": 0.6, + "grad_norm": 0.0756774846041659, + "learning_rate": 0.00037190832623544903, + "loss": 1.4252, + "step": 5535 + }, + { + "epoch": 0.6, + "grad_norm": 0.0755846484269021, + "learning_rate": 0.0003717400369487142, + "loss": 1.4436, + "step": 5536 + }, + { + "epoch": 0.6, + "grad_norm": 0.07183736040402938, + "learning_rate": 0.0003715717632139421, + "loss": 1.2719, + "step": 5537 + }, + { + "epoch": 0.6, + "grad_norm": 0.07633084282336201, + "learning_rate": 0.0003714035050515366, + "loss": 1.4026, + "step": 5538 + }, + { + "epoch": 0.6, + "grad_norm": 0.07675837417693088, + "learning_rate": 0.0003712352624818993, + "loss": 1.2591, + "step": 5539 + }, + { + "epoch": 0.6, + "grad_norm": 0.07676462023153384, + "learning_rate": 0.00037106703552543024, + "loss": 1.3136, + "step": 5540 + }, + { + "epoch": 0.6, + "grad_norm": 0.07689170885756248, + "learning_rate": 0.0003708988242025277, + "loss": 1.3616, + "step": 5541 + }, + { + "epoch": 0.6, + "grad_norm": 0.07474625872774958, + "learning_rate": 0.0003707306285335879, + "loss": 1.4878, + "step": 5542 + }, + { + "epoch": 0.6, + "grad_norm": 0.08136785858544164, + "learning_rate": 0.0003705624485390049, + "loss": 1.3011, + "step": 5543 + }, + { + "epoch": 0.6, + "grad_norm": 0.08019714407726865, + "learning_rate": 0.0003703942842391714, + "loss": 1.4349, + "step": 5544 + }, + { + "epoch": 0.6, + "grad_norm": 0.06931800845539068, + "learning_rate": 0.0003702261356544777, + "loss": 1.336, + "step": 5545 + }, + { + "epoch": 0.6, + "grad_norm": 0.06865776028434824, + "learning_rate": 0.00037005800280531245, + "loss": 1.3829, + "step": 5546 + }, + { + "epoch": 0.6, + "grad_norm": 0.07110683293273809, + "learning_rate": 0.0003698898857120622, + "loss": 1.3449, + "step": 5547 + }, + { + "epoch": 0.6, + "grad_norm": 0.06752517336614956, + "learning_rate": 0.00036972178439511206, + "loss": 1.4429, + "step": 5548 + }, + { + "epoch": 0.6, + "grad_norm": 0.07369625011178438, + "learning_rate": 0.00036955369887484477, + "loss": 1.3135, + "step": 5549 + }, + { + "epoch": 0.6, + "grad_norm": 0.08217739104019585, + "learning_rate": 0.00036938562917164107, + "loss": 1.44, + "step": 5550 + }, + { + "epoch": 0.6, + "grad_norm": 0.0679176740185262, + "learning_rate": 0.00036921757530588016, + "loss": 1.4109, + "step": 5551 + }, + { + "epoch": 0.6, + "grad_norm": 0.08183989458372486, + "learning_rate": 0.000369049537297939, + "loss": 1.5463, + "step": 5552 + }, + { + "epoch": 0.6, + "grad_norm": 0.08068697173407725, + "learning_rate": 0.00036888151516819304, + "loss": 1.3758, + "step": 5553 + }, + { + "epoch": 0.6, + "grad_norm": 0.0916066824594436, + "learning_rate": 0.0003687135089370153, + "loss": 1.4083, + "step": 5554 + }, + { + "epoch": 0.6, + "grad_norm": 0.07203913084540307, + "learning_rate": 0.0003685455186247772, + "loss": 1.3663, + "step": 5555 + }, + { + "epoch": 0.6, + "grad_norm": 0.07810221503711978, + "learning_rate": 0.0003683775442518482, + "loss": 1.337, + "step": 5556 + }, + { + "epoch": 0.6, + "grad_norm": 0.07378044550397307, + "learning_rate": 0.0003682095858385958, + "loss": 1.4417, + "step": 5557 + }, + { + "epoch": 0.6, + "grad_norm": 0.07662962879711044, + "learning_rate": 0.0003680416434053854, + "loss": 1.4428, + "step": 5558 + }, + { + "epoch": 0.6, + "grad_norm": 0.07391624067869233, + "learning_rate": 0.00036787371697258087, + "loss": 1.5021, + "step": 5559 + }, + { + "epoch": 0.6, + "grad_norm": 0.08018611758276063, + "learning_rate": 0.0003677058065605434, + "loss": 1.4745, + "step": 5560 + }, + { + "epoch": 0.6, + "grad_norm": 0.07141145070323125, + "learning_rate": 0.00036753791218963305, + "loss": 1.5691, + "step": 5561 + }, + { + "epoch": 0.6, + "grad_norm": 0.07200974918384846, + "learning_rate": 0.00036737003388020756, + "loss": 1.3799, + "step": 5562 + }, + { + "epoch": 0.6, + "grad_norm": 0.07893684170114153, + "learning_rate": 0.0003672021716526226, + "loss": 1.4226, + "step": 5563 + }, + { + "epoch": 0.6, + "grad_norm": 0.08330290597126691, + "learning_rate": 0.0003670343255272322, + "loss": 1.556, + "step": 5564 + }, + { + "epoch": 0.6, + "grad_norm": 0.07058247997252769, + "learning_rate": 0.00036686649552438827, + "loss": 1.4039, + "step": 5565 + }, + { + "epoch": 0.6, + "grad_norm": 0.07577260612687944, + "learning_rate": 0.00036669868166444065, + "loss": 1.3556, + "step": 5566 + }, + { + "epoch": 0.6, + "grad_norm": 0.07442078259679677, + "learning_rate": 0.0003665308839677375, + "loss": 1.4111, + "step": 5567 + }, + { + "epoch": 0.6, + "grad_norm": 0.09062648799718757, + "learning_rate": 0.00036636310245462484, + "loss": 1.3685, + "step": 5568 + }, + { + "epoch": 0.6, + "grad_norm": 0.07404072406825808, + "learning_rate": 0.00036619533714544664, + "loss": 1.2866, + "step": 5569 + }, + { + "epoch": 0.6, + "grad_norm": 0.07341209572066199, + "learning_rate": 0.00036602758806054535, + "loss": 1.3663, + "step": 5570 + }, + { + "epoch": 0.6, + "grad_norm": 0.08367994768627389, + "learning_rate": 0.00036585985522026067, + "loss": 1.3816, + "step": 5571 + }, + { + "epoch": 0.6, + "grad_norm": 0.06888392014168349, + "learning_rate": 0.000365692138644931, + "loss": 1.4769, + "step": 5572 + }, + { + "epoch": 0.6, + "grad_norm": 0.07446670991451924, + "learning_rate": 0.0003655244383548926, + "loss": 1.4686, + "step": 5573 + }, + { + "epoch": 0.6, + "grad_norm": 0.07197126237743647, + "learning_rate": 0.00036535675437047955, + "loss": 1.3757, + "step": 5574 + }, + { + "epoch": 0.6, + "grad_norm": 0.07647613896270093, + "learning_rate": 0.00036518908671202426, + "loss": 1.5409, + "step": 5575 + }, + { + "epoch": 0.6, + "grad_norm": 0.06842253526012758, + "learning_rate": 0.00036502143539985705, + "loss": 1.3703, + "step": 5576 + }, + { + "epoch": 0.6, + "grad_norm": 0.07301037980837188, + "learning_rate": 0.00036485380045430597, + "loss": 1.4469, + "step": 5577 + }, + { + "epoch": 0.6, + "grad_norm": 0.07007991133973272, + "learning_rate": 0.0003646861818956977, + "loss": 1.4505, + "step": 5578 + }, + { + "epoch": 0.6, + "grad_norm": 0.06574887026402759, + "learning_rate": 0.0003645185797443563, + "loss": 1.2006, + "step": 5579 + }, + { + "epoch": 0.6, + "grad_norm": 0.07588016531671705, + "learning_rate": 0.0003643509940206043, + "loss": 1.3754, + "step": 5580 + }, + { + "epoch": 0.6, + "grad_norm": 0.06515398225680037, + "learning_rate": 0.00036418342474476184, + "loss": 1.368, + "step": 5581 + }, + { + "epoch": 0.6, + "grad_norm": 0.07544085970434565, + "learning_rate": 0.00036401587193714724, + "loss": 1.5863, + "step": 5582 + }, + { + "epoch": 0.6, + "grad_norm": 0.07153631958666246, + "learning_rate": 0.00036384833561807706, + "loss": 1.4028, + "step": 5583 + }, + { + "epoch": 0.6, + "grad_norm": 0.06872766857481105, + "learning_rate": 0.0003636808158078656, + "loss": 1.3447, + "step": 5584 + }, + { + "epoch": 0.6, + "grad_norm": 0.08987840989416099, + "learning_rate": 0.00036351331252682515, + "loss": 1.4119, + "step": 5585 + }, + { + "epoch": 0.6, + "grad_norm": 0.06816450340215846, + "learning_rate": 0.0003633458257952661, + "loss": 1.4812, + "step": 5586 + }, + { + "epoch": 0.6, + "grad_norm": 0.0737825306184352, + "learning_rate": 0.0003631783556334968, + "loss": 1.4151, + "step": 5587 + }, + { + "epoch": 0.6, + "grad_norm": 0.07504967730605744, + "learning_rate": 0.00036301090206182366, + "loss": 1.4566, + "step": 5588 + }, + { + "epoch": 0.6, + "grad_norm": 0.08464044928871382, + "learning_rate": 0.00036284346510055064, + "loss": 1.3867, + "step": 5589 + }, + { + "epoch": 0.6, + "grad_norm": 0.07468777452003433, + "learning_rate": 0.0003626760447699806, + "loss": 1.5272, + "step": 5590 + }, + { + "epoch": 0.6, + "grad_norm": 0.07626542283005774, + "learning_rate": 0.0003625086410904136, + "loss": 1.3815, + "step": 5591 + }, + { + "epoch": 0.6, + "grad_norm": 0.06636087185478588, + "learning_rate": 0.0003623412540821478, + "loss": 1.2893, + "step": 5592 + }, + { + "epoch": 0.6, + "grad_norm": 0.06748116009133115, + "learning_rate": 0.0003621738837654795, + "loss": 1.4307, + "step": 5593 + }, + { + "epoch": 0.6, + "grad_norm": 0.08014808265191209, + "learning_rate": 0.00036200653016070283, + "loss": 1.2923, + "step": 5594 + }, + { + "epoch": 0.6, + "grad_norm": 0.07630050936118145, + "learning_rate": 0.0003618391932881102, + "loss": 1.5177, + "step": 5595 + }, + { + "epoch": 0.6, + "grad_norm": 0.08900857906661022, + "learning_rate": 0.0003616718731679918, + "loss": 1.421, + "step": 5596 + }, + { + "epoch": 0.6, + "grad_norm": 0.07500532846595925, + "learning_rate": 0.0003615045698206355, + "loss": 1.3891, + "step": 5597 + }, + { + "epoch": 0.6, + "grad_norm": 0.07810709867424619, + "learning_rate": 0.00036133728326632764, + "loss": 1.4612, + "step": 5598 + }, + { + "epoch": 0.6, + "grad_norm": 0.06859701862319084, + "learning_rate": 0.0003611700135253523, + "loss": 1.3761, + "step": 5599 + }, + { + "epoch": 0.6, + "grad_norm": 0.07414205880381541, + "learning_rate": 0.0003610027606179913, + "loss": 1.4306, + "step": 5600 + }, + { + "epoch": 0.6, + "grad_norm": 0.06934922386528766, + "learning_rate": 0.00036083552456452484, + "loss": 1.463, + "step": 5601 + }, + { + "epoch": 0.6, + "grad_norm": 0.0737091137736752, + "learning_rate": 0.00036066830538523096, + "loss": 1.3899, + "step": 5602 + }, + { + "epoch": 0.6, + "grad_norm": 0.07854348024840235, + "learning_rate": 0.0003605011031003853, + "loss": 1.348, + "step": 5603 + }, + { + "epoch": 0.6, + "grad_norm": 0.08047178520802252, + "learning_rate": 0.0003603339177302618, + "loss": 1.5244, + "step": 5604 + }, + { + "epoch": 0.6, + "grad_norm": 0.07429033202974966, + "learning_rate": 0.00036016674929513216, + "loss": 1.4901, + "step": 5605 + }, + { + "epoch": 0.6, + "grad_norm": 0.07099137795592182, + "learning_rate": 0.00035999959781526637, + "loss": 1.4711, + "step": 5606 + }, + { + "epoch": 0.6, + "grad_norm": 0.07344959139200635, + "learning_rate": 0.00035983246331093197, + "loss": 1.4571, + "step": 5607 + }, + { + "epoch": 0.6, + "grad_norm": 0.08400809429915737, + "learning_rate": 0.00035966534580239454, + "loss": 1.2612, + "step": 5608 + }, + { + "epoch": 0.6, + "grad_norm": 0.08538467421828809, + "learning_rate": 0.0003594982453099178, + "loss": 1.3764, + "step": 5609 + }, + { + "epoch": 0.6, + "grad_norm": 0.07206064549666952, + "learning_rate": 0.00035933116185376325, + "loss": 1.4262, + "step": 5610 + }, + { + "epoch": 0.6, + "grad_norm": 0.07958752096332246, + "learning_rate": 0.00035916409545419027, + "loss": 1.3995, + "step": 5611 + }, + { + "epoch": 0.6, + "grad_norm": 0.07919682179856065, + "learning_rate": 0.00035899704613145635, + "loss": 1.5493, + "step": 5612 + }, + { + "epoch": 0.6, + "grad_norm": 0.07986030378083334, + "learning_rate": 0.00035883001390581647, + "loss": 1.4705, + "step": 5613 + }, + { + "epoch": 0.6, + "grad_norm": 0.07799866116922917, + "learning_rate": 0.0003586629987975243, + "loss": 1.4285, + "step": 5614 + }, + { + "epoch": 0.6, + "grad_norm": 0.07339482852672152, + "learning_rate": 0.00035849600082683066, + "loss": 1.4108, + "step": 5615 + }, + { + "epoch": 0.6, + "grad_norm": 0.07719796136377283, + "learning_rate": 0.00035832902001398483, + "loss": 1.5008, + "step": 5616 + }, + { + "epoch": 0.6, + "grad_norm": 0.08039852916277493, + "learning_rate": 0.0003581620563792336, + "loss": 1.4794, + "step": 5617 + }, + { + "epoch": 0.6, + "grad_norm": 0.08287709546388787, + "learning_rate": 0.00035799510994282216, + "loss": 1.3035, + "step": 5618 + }, + { + "epoch": 0.6, + "grad_norm": 0.07496917924553367, + "learning_rate": 0.0003578281807249931, + "loss": 1.4986, + "step": 5619 + }, + { + "epoch": 0.6, + "grad_norm": 0.06504525578999058, + "learning_rate": 0.0003576612687459873, + "loss": 1.6326, + "step": 5620 + }, + { + "epoch": 0.6, + "grad_norm": 0.07543080011759193, + "learning_rate": 0.00035749437402604346, + "loss": 1.4127, + "step": 5621 + }, + { + "epoch": 0.6, + "grad_norm": 0.06900592630400436, + "learning_rate": 0.00035732749658539797, + "loss": 1.4264, + "step": 5622 + }, + { + "epoch": 0.6, + "grad_norm": 0.0689469569944877, + "learning_rate": 0.00035716063644428565, + "loss": 1.2965, + "step": 5623 + }, + { + "epoch": 0.6, + "grad_norm": 0.07147758440559308, + "learning_rate": 0.00035699379362293836, + "loss": 1.3985, + "step": 5624 + }, + { + "epoch": 0.6, + "grad_norm": 0.07056189553778609, + "learning_rate": 0.00035682696814158657, + "loss": 1.333, + "step": 5625 + }, + { + "epoch": 0.6, + "grad_norm": 0.07538426411309675, + "learning_rate": 0.00035666016002045854, + "loss": 1.3421, + "step": 5626 + }, + { + "epoch": 0.6, + "grad_norm": 0.06937029736035574, + "learning_rate": 0.0003564933692797803, + "loss": 1.4397, + "step": 5627 + }, + { + "epoch": 0.6, + "grad_norm": 0.06838536960364613, + "learning_rate": 0.0003563265959397757, + "loss": 1.4764, + "step": 5628 + }, + { + "epoch": 0.61, + "grad_norm": 0.06858569045988618, + "learning_rate": 0.0003561598400206667, + "loss": 1.3769, + "step": 5629 + }, + { + "epoch": 0.61, + "grad_norm": 0.0821419094374213, + "learning_rate": 0.00035599310154267307, + "loss": 1.4455, + "step": 5630 + }, + { + "epoch": 0.61, + "grad_norm": 0.07978858517243766, + "learning_rate": 0.00035582638052601223, + "loss": 1.4309, + "step": 5631 + }, + { + "epoch": 0.61, + "grad_norm": 0.07710280608060323, + "learning_rate": 0.00035565967699089984, + "loss": 1.412, + "step": 5632 + }, + { + "epoch": 0.61, + "grad_norm": 0.07006371462775733, + "learning_rate": 0.00035549299095754937, + "loss": 1.3961, + "step": 5633 + }, + { + "epoch": 0.61, + "grad_norm": 0.07673372395440631, + "learning_rate": 0.0003553263224461718, + "loss": 1.2666, + "step": 5634 + }, + { + "epoch": 0.61, + "grad_norm": 0.07617606707799483, + "learning_rate": 0.00035515967147697647, + "loss": 1.3182, + "step": 5635 + }, + { + "epoch": 0.61, + "grad_norm": 0.07299159334036456, + "learning_rate": 0.0003549930380701701, + "loss": 1.3535, + "step": 5636 + }, + { + "epoch": 0.61, + "grad_norm": 0.07548869479560834, + "learning_rate": 0.000354826422245958, + "loss": 1.3972, + "step": 5637 + }, + { + "epoch": 0.61, + "grad_norm": 0.07528600549226311, + "learning_rate": 0.00035465982402454267, + "loss": 1.3542, + "step": 5638 + }, + { + "epoch": 0.61, + "grad_norm": 0.0761655413610927, + "learning_rate": 0.0003544932434261246, + "loss": 1.5579, + "step": 5639 + }, + { + "epoch": 0.61, + "grad_norm": 0.07328638809355852, + "learning_rate": 0.0003543266804709026, + "loss": 1.4611, + "step": 5640 + }, + { + "epoch": 0.61, + "grad_norm": 0.0684025976932677, + "learning_rate": 0.00035416013517907264, + "loss": 1.3723, + "step": 5641 + }, + { + "epoch": 0.61, + "grad_norm": 0.0709444446488586, + "learning_rate": 0.00035399360757082913, + "loss": 1.3982, + "step": 5642 + }, + { + "epoch": 0.61, + "grad_norm": 0.07902829727346929, + "learning_rate": 0.000353827097666364, + "loss": 1.3444, + "step": 5643 + }, + { + "epoch": 0.61, + "grad_norm": 0.07138351900963495, + "learning_rate": 0.00035366060548586744, + "loss": 1.3536, + "step": 5644 + }, + { + "epoch": 0.61, + "grad_norm": 0.06664446570362624, + "learning_rate": 0.00035349413104952685, + "loss": 1.4987, + "step": 5645 + }, + { + "epoch": 0.61, + "grad_norm": 0.06565668811270635, + "learning_rate": 0.0003533276743775279, + "loss": 1.3504, + "step": 5646 + }, + { + "epoch": 0.61, + "grad_norm": 0.06867503022652852, + "learning_rate": 0.00035316123549005416, + "loss": 1.3162, + "step": 5647 + }, + { + "epoch": 0.61, + "grad_norm": 0.08049531458815742, + "learning_rate": 0.0003529948144072867, + "loss": 1.3726, + "step": 5648 + }, + { + "epoch": 0.61, + "grad_norm": 0.06401788263416966, + "learning_rate": 0.0003528284111494049, + "loss": 1.3049, + "step": 5649 + }, + { + "epoch": 0.61, + "grad_norm": 0.08055355952171174, + "learning_rate": 0.00035266202573658567, + "loss": 1.5669, + "step": 5650 + }, + { + "epoch": 0.61, + "grad_norm": 0.07453158758644808, + "learning_rate": 0.00035249565818900366, + "loss": 1.4813, + "step": 5651 + }, + { + "epoch": 0.61, + "grad_norm": 0.07887708489653913, + "learning_rate": 0.00035232930852683165, + "loss": 1.4236, + "step": 5652 + }, + { + "epoch": 0.61, + "grad_norm": 0.07109618467705087, + "learning_rate": 0.0003521629767702401, + "loss": 1.2988, + "step": 5653 + }, + { + "epoch": 0.61, + "grad_norm": 0.06710743003825625, + "learning_rate": 0.00035199666293939726, + "loss": 1.4079, + "step": 5654 + }, + { + "epoch": 0.61, + "grad_norm": 0.07155522294010239, + "learning_rate": 0.0003518303670544696, + "loss": 1.4105, + "step": 5655 + }, + { + "epoch": 0.61, + "grad_norm": 0.07196979769357402, + "learning_rate": 0.00035166408913562034, + "loss": 1.4136, + "step": 5656 + }, + { + "epoch": 0.61, + "grad_norm": 0.07745496675046756, + "learning_rate": 0.00035149782920301174, + "loss": 1.4632, + "step": 5657 + }, + { + "epoch": 0.61, + "grad_norm": 0.07447542178913955, + "learning_rate": 0.0003513315872768035, + "loss": 1.4284, + "step": 5658 + }, + { + "epoch": 0.61, + "grad_norm": 0.07836930328423564, + "learning_rate": 0.00035116536337715255, + "loss": 1.4194, + "step": 5659 + }, + { + "epoch": 0.61, + "grad_norm": 0.08966749202790163, + "learning_rate": 0.00035099915752421465, + "loss": 1.4606, + "step": 5660 + }, + { + "epoch": 0.61, + "grad_norm": 0.06903241945632177, + "learning_rate": 0.0003508329697381425, + "loss": 1.4138, + "step": 5661 + }, + { + "epoch": 0.61, + "grad_norm": 0.08028060766204789, + "learning_rate": 0.0003506668000390869, + "loss": 1.4581, + "step": 5662 + }, + { + "epoch": 0.61, + "grad_norm": 0.07138617804983953, + "learning_rate": 0.00035050064844719677, + "loss": 1.3281, + "step": 5663 + }, + { + "epoch": 0.61, + "grad_norm": 0.09922730576053841, + "learning_rate": 0.0003503345149826185, + "loss": 1.343, + "step": 5664 + }, + { + "epoch": 0.61, + "grad_norm": 0.07817632658073502, + "learning_rate": 0.00035016839966549627, + "loss": 1.5559, + "step": 5665 + }, + { + "epoch": 0.61, + "grad_norm": 0.07861593935008956, + "learning_rate": 0.000350002302515972, + "loss": 1.3796, + "step": 5666 + }, + { + "epoch": 0.61, + "grad_norm": 0.07251164645828607, + "learning_rate": 0.0003498362235541856, + "loss": 1.4127, + "step": 5667 + }, + { + "epoch": 0.61, + "grad_norm": 0.06707603220109654, + "learning_rate": 0.0003496701628002749, + "loss": 1.3143, + "step": 5668 + }, + { + "epoch": 0.61, + "grad_norm": 0.06988053951437045, + "learning_rate": 0.00034950412027437525, + "loss": 1.4305, + "step": 5669 + }, + { + "epoch": 0.61, + "grad_norm": 0.07867145829080913, + "learning_rate": 0.0003493380959966197, + "loss": 1.2806, + "step": 5670 + }, + { + "epoch": 0.61, + "grad_norm": 0.07429734651699726, + "learning_rate": 0.00034917208998713956, + "loss": 1.4044, + "step": 5671 + }, + { + "epoch": 0.61, + "grad_norm": 0.07446972862489869, + "learning_rate": 0.0003490061022660634, + "loss": 1.4015, + "step": 5672 + }, + { + "epoch": 0.61, + "grad_norm": 0.07585596482806031, + "learning_rate": 0.00034884013285351796, + "loss": 1.4996, + "step": 5673 + }, + { + "epoch": 0.61, + "grad_norm": 0.0765452008993846, + "learning_rate": 0.0003486741817696275, + "loss": 1.3428, + "step": 5674 + }, + { + "epoch": 0.61, + "grad_norm": 0.07743629189736943, + "learning_rate": 0.0003485082490345143, + "loss": 1.3471, + "step": 5675 + }, + { + "epoch": 0.61, + "grad_norm": 0.07962414669280672, + "learning_rate": 0.00034834233466829825, + "loss": 1.5078, + "step": 5676 + }, + { + "epoch": 0.61, + "grad_norm": 0.06823095793714402, + "learning_rate": 0.0003481764386910968, + "loss": 1.2538, + "step": 5677 + }, + { + "epoch": 0.61, + "grad_norm": 0.07430885125530548, + "learning_rate": 0.00034801056112302554, + "loss": 1.5397, + "step": 5678 + }, + { + "epoch": 0.61, + "grad_norm": 0.07967890542328465, + "learning_rate": 0.0003478447019841978, + "loss": 1.4244, + "step": 5679 + }, + { + "epoch": 0.61, + "grad_norm": 0.08165222831567905, + "learning_rate": 0.00034767886129472453, + "loss": 1.4542, + "step": 5680 + }, + { + "epoch": 0.61, + "grad_norm": 0.08037753942606386, + "learning_rate": 0.0003475130390747144, + "loss": 1.5213, + "step": 5681 + }, + { + "epoch": 0.61, + "grad_norm": 0.07534567859413022, + "learning_rate": 0.000347347235344274, + "loss": 1.3967, + "step": 5682 + }, + { + "epoch": 0.61, + "grad_norm": 0.08780878813456622, + "learning_rate": 0.0003471814501235076, + "loss": 1.2945, + "step": 5683 + }, + { + "epoch": 0.61, + "grad_norm": 0.07526528057162522, + "learning_rate": 0.00034701568343251723, + "loss": 1.4243, + "step": 5684 + }, + { + "epoch": 0.61, + "grad_norm": 0.0877269611831134, + "learning_rate": 0.0003468499352914026, + "loss": 1.2562, + "step": 5685 + }, + { + "epoch": 0.61, + "grad_norm": 0.07211662248051867, + "learning_rate": 0.00034668420572026156, + "loss": 1.3204, + "step": 5686 + }, + { + "epoch": 0.61, + "grad_norm": 0.0785050832257113, + "learning_rate": 0.00034651849473918883, + "loss": 1.3089, + "step": 5687 + }, + { + "epoch": 0.61, + "grad_norm": 0.0733152327279244, + "learning_rate": 0.00034635280236827785, + "loss": 1.4598, + "step": 5688 + }, + { + "epoch": 0.61, + "grad_norm": 0.07122292618843426, + "learning_rate": 0.0003461871286276194, + "loss": 1.4084, + "step": 5689 + }, + { + "epoch": 0.61, + "grad_norm": 0.07521042733808453, + "learning_rate": 0.0003460214735373016, + "loss": 1.2403, + "step": 5690 + }, + { + "epoch": 0.61, + "grad_norm": 0.08669744108413878, + "learning_rate": 0.00034585583711741114, + "loss": 1.3522, + "step": 5691 + }, + { + "epoch": 0.61, + "grad_norm": 0.07735992970438209, + "learning_rate": 0.00034569021938803184, + "loss": 1.3652, + "step": 5692 + }, + { + "epoch": 0.61, + "grad_norm": 0.08321736056147003, + "learning_rate": 0.0003455246203692454, + "loss": 1.4371, + "step": 5693 + }, + { + "epoch": 0.61, + "grad_norm": 0.07046318299439341, + "learning_rate": 0.0003453590400811313, + "loss": 1.2962, + "step": 5694 + }, + { + "epoch": 0.61, + "grad_norm": 0.08303814502094009, + "learning_rate": 0.0003451934785437668, + "loss": 1.3947, + "step": 5695 + }, + { + "epoch": 0.61, + "grad_norm": 0.0778596641434968, + "learning_rate": 0.00034502793577722657, + "loss": 1.4846, + "step": 5696 + }, + { + "epoch": 0.61, + "grad_norm": 0.07305095025031742, + "learning_rate": 0.00034486241180158375, + "loss": 1.4616, + "step": 5697 + }, + { + "epoch": 0.61, + "grad_norm": 0.07711619596739502, + "learning_rate": 0.00034469690663690793, + "loss": 1.244, + "step": 5698 + }, + { + "epoch": 0.61, + "grad_norm": 0.08701736627740074, + "learning_rate": 0.0003445314203032678, + "loss": 1.5445, + "step": 5699 + }, + { + "epoch": 0.61, + "grad_norm": 0.08825112294005741, + "learning_rate": 0.0003443659528207289, + "loss": 1.2481, + "step": 5700 + }, + { + "epoch": 0.61, + "grad_norm": 0.08846932338421327, + "learning_rate": 0.0003442005042093547, + "loss": 1.3274, + "step": 5701 + }, + { + "epoch": 0.61, + "grad_norm": 0.07205817936514314, + "learning_rate": 0.00034403507448920655, + "loss": 1.433, + "step": 5702 + }, + { + "epoch": 0.61, + "grad_norm": 0.07069231060608115, + "learning_rate": 0.00034386966368034333, + "loss": 1.2406, + "step": 5703 + }, + { + "epoch": 0.61, + "grad_norm": 0.07557644180661727, + "learning_rate": 0.0003437042718028215, + "loss": 1.4779, + "step": 5704 + }, + { + "epoch": 0.61, + "grad_norm": 0.07339080355677123, + "learning_rate": 0.00034353889887669574, + "loss": 1.4528, + "step": 5705 + }, + { + "epoch": 0.61, + "grad_norm": 0.06957718608936084, + "learning_rate": 0.00034337354492201784, + "loss": 1.5388, + "step": 5706 + }, + { + "epoch": 0.61, + "grad_norm": 0.07314609581938167, + "learning_rate": 0.0003432082099588377, + "loss": 1.349, + "step": 5707 + }, + { + "epoch": 0.61, + "grad_norm": 0.0818507302919073, + "learning_rate": 0.0003430428940072026, + "loss": 1.3628, + "step": 5708 + }, + { + "epoch": 0.61, + "grad_norm": 0.0648690866111131, + "learning_rate": 0.0003428775970871575, + "loss": 1.4247, + "step": 5709 + }, + { + "epoch": 0.61, + "grad_norm": 0.06938785217931223, + "learning_rate": 0.0003427123192187456, + "loss": 1.5578, + "step": 5710 + }, + { + "epoch": 0.61, + "grad_norm": 0.07535029207051505, + "learning_rate": 0.00034254706042200725, + "loss": 1.3668, + "step": 5711 + }, + { + "epoch": 0.61, + "grad_norm": 0.07028672868584891, + "learning_rate": 0.00034238182071698065, + "loss": 1.4355, + "step": 5712 + }, + { + "epoch": 0.61, + "grad_norm": 0.08299347809832416, + "learning_rate": 0.0003422166001237016, + "loss": 1.4188, + "step": 5713 + }, + { + "epoch": 0.61, + "grad_norm": 0.06277772175495448, + "learning_rate": 0.0003420513986622038, + "loss": 1.2739, + "step": 5714 + }, + { + "epoch": 0.61, + "grad_norm": 0.06737514638829561, + "learning_rate": 0.0003418862163525185, + "loss": 1.3447, + "step": 5715 + }, + { + "epoch": 0.61, + "grad_norm": 0.07349604640587126, + "learning_rate": 0.0003417210532146744, + "loss": 1.3473, + "step": 5716 + }, + { + "epoch": 0.61, + "grad_norm": 0.0861956852391659, + "learning_rate": 0.00034155590926869837, + "loss": 1.1675, + "step": 5717 + }, + { + "epoch": 0.61, + "grad_norm": 0.06819916582304868, + "learning_rate": 0.0003413907845346147, + "loss": 1.418, + "step": 5718 + }, + { + "epoch": 0.61, + "grad_norm": 0.07113808982321314, + "learning_rate": 0.0003412256790324452, + "loss": 1.3437, + "step": 5719 + }, + { + "epoch": 0.61, + "grad_norm": 0.08113355490818232, + "learning_rate": 0.00034106059278220935, + "loss": 1.3868, + "step": 5720 + }, + { + "epoch": 0.61, + "grad_norm": 0.09610388012345854, + "learning_rate": 0.00034089552580392456, + "loss": 1.4229, + "step": 5721 + }, + { + "epoch": 0.62, + "grad_norm": 0.07795794477824468, + "learning_rate": 0.00034073047811760586, + "loss": 1.4547, + "step": 5722 + }, + { + "epoch": 0.62, + "grad_norm": 0.09967082596439263, + "learning_rate": 0.0003405654497432658, + "loss": 1.4848, + "step": 5723 + }, + { + "epoch": 0.62, + "grad_norm": 0.06998698314030954, + "learning_rate": 0.0003404004407009145, + "loss": 1.4644, + "step": 5724 + }, + { + "epoch": 0.62, + "grad_norm": 0.08424361964264927, + "learning_rate": 0.0003402354510105601, + "loss": 1.468, + "step": 5725 + }, + { + "epoch": 0.62, + "grad_norm": 0.07588249803140369, + "learning_rate": 0.00034007048069220803, + "loss": 1.2903, + "step": 5726 + }, + { + "epoch": 0.62, + "grad_norm": 0.0718749317137767, + "learning_rate": 0.00033990552976586143, + "loss": 1.4871, + "step": 5727 + }, + { + "epoch": 0.62, + "grad_norm": 0.06527144687140929, + "learning_rate": 0.0003397405982515214, + "loss": 1.3258, + "step": 5728 + }, + { + "epoch": 0.62, + "grad_norm": 0.08734474868450842, + "learning_rate": 0.0003395756861691864, + "loss": 1.3422, + "step": 5729 + }, + { + "epoch": 0.62, + "grad_norm": 0.07129891373227273, + "learning_rate": 0.0003394107935388525, + "loss": 1.5485, + "step": 5730 + }, + { + "epoch": 0.62, + "grad_norm": 0.0688040837256214, + "learning_rate": 0.00033924592038051346, + "loss": 1.4304, + "step": 5731 + }, + { + "epoch": 0.62, + "grad_norm": 0.07444968184588249, + "learning_rate": 0.0003390810667141606, + "loss": 1.3824, + "step": 5732 + }, + { + "epoch": 0.62, + "grad_norm": 0.07877965685992819, + "learning_rate": 0.0003389162325597834, + "loss": 1.4078, + "step": 5733 + }, + { + "epoch": 0.62, + "grad_norm": 0.07229117315627312, + "learning_rate": 0.0003387514179373683, + "loss": 1.3603, + "step": 5734 + }, + { + "epoch": 0.62, + "grad_norm": 0.07622358969159525, + "learning_rate": 0.00033858662286689954, + "loss": 1.4646, + "step": 5735 + }, + { + "epoch": 0.62, + "grad_norm": 0.07074157195742369, + "learning_rate": 0.0003384218473683594, + "loss": 1.4831, + "step": 5736 + }, + { + "epoch": 0.62, + "grad_norm": 0.07407245287057423, + "learning_rate": 0.0003382570914617273, + "loss": 1.519, + "step": 5737 + }, + { + "epoch": 0.62, + "grad_norm": 0.07466035597162812, + "learning_rate": 0.0003380923551669804, + "loss": 1.228, + "step": 5738 + }, + { + "epoch": 0.62, + "grad_norm": 0.0683061545829768, + "learning_rate": 0.0003379276385040938, + "loss": 1.4295, + "step": 5739 + }, + { + "epoch": 0.62, + "grad_norm": 0.07171623879287245, + "learning_rate": 0.00033776294149303953, + "loss": 1.5231, + "step": 5740 + }, + { + "epoch": 0.62, + "grad_norm": 0.07017306983889705, + "learning_rate": 0.0003375982641537881, + "loss": 1.2489, + "step": 5741 + }, + { + "epoch": 0.62, + "grad_norm": 0.06908949184279577, + "learning_rate": 0.0003374336065063069, + "loss": 1.5955, + "step": 5742 + }, + { + "epoch": 0.62, + "grad_norm": 0.07588952795946573, + "learning_rate": 0.0003372689685705614, + "loss": 1.5576, + "step": 5743 + }, + { + "epoch": 0.62, + "grad_norm": 0.0807730430692002, + "learning_rate": 0.00033710435036651446, + "loss": 1.422, + "step": 5744 + }, + { + "epoch": 0.62, + "grad_norm": 0.07615707250575719, + "learning_rate": 0.0003369397519141267, + "loss": 1.4909, + "step": 5745 + }, + { + "epoch": 0.62, + "grad_norm": 0.07415125469436308, + "learning_rate": 0.00033677517323335614, + "loss": 1.4658, + "step": 5746 + }, + { + "epoch": 0.62, + "grad_norm": 0.07714066587006227, + "learning_rate": 0.00033661061434415844, + "loss": 1.423, + "step": 5747 + }, + { + "epoch": 0.62, + "grad_norm": 0.06975556296096408, + "learning_rate": 0.00033644607526648717, + "loss": 1.4794, + "step": 5748 + }, + { + "epoch": 0.62, + "grad_norm": 0.07700941531030474, + "learning_rate": 0.0003362815560202931, + "loss": 1.4553, + "step": 5749 + }, + { + "epoch": 0.62, + "grad_norm": 0.07152268833698286, + "learning_rate": 0.00033611705662552494, + "loss": 1.4017, + "step": 5750 + }, + { + "epoch": 0.62, + "grad_norm": 0.07145733194757649, + "learning_rate": 0.0003359525771021285, + "loss": 1.3974, + "step": 5751 + }, + { + "epoch": 0.62, + "grad_norm": 0.07285528831823655, + "learning_rate": 0.0003357881174700476, + "loss": 1.3365, + "step": 5752 + }, + { + "epoch": 0.62, + "grad_norm": 0.07203425116451612, + "learning_rate": 0.0003356236777492236, + "loss": 1.4425, + "step": 5753 + }, + { + "epoch": 0.62, + "grad_norm": 0.07216624969608204, + "learning_rate": 0.00033545925795959544, + "loss": 1.4025, + "step": 5754 + }, + { + "epoch": 0.62, + "grad_norm": 0.07809662601678845, + "learning_rate": 0.00033529485812109933, + "loss": 1.4063, + "step": 5755 + }, + { + "epoch": 0.62, + "grad_norm": 0.07661840340526746, + "learning_rate": 0.0003351304782536697, + "loss": 1.4225, + "step": 5756 + }, + { + "epoch": 0.62, + "grad_norm": 0.0756729672262736, + "learning_rate": 0.0003349661183772379, + "loss": 1.3925, + "step": 5757 + }, + { + "epoch": 0.62, + "grad_norm": 0.0788412033149621, + "learning_rate": 0.0003348017785117331, + "loss": 1.5799, + "step": 5758 + }, + { + "epoch": 0.62, + "grad_norm": 0.07957797898161738, + "learning_rate": 0.0003346374586770823, + "loss": 1.3948, + "step": 5759 + }, + { + "epoch": 0.62, + "grad_norm": 0.07042844223390973, + "learning_rate": 0.00033447315889320985, + "loss": 1.2962, + "step": 5760 + }, + { + "epoch": 0.62, + "grad_norm": 0.07623292390015404, + "learning_rate": 0.0003343088791800374, + "loss": 1.4195, + "step": 5761 + }, + { + "epoch": 0.62, + "grad_norm": 0.07479667738963493, + "learning_rate": 0.0003341446195574846, + "loss": 1.5432, + "step": 5762 + }, + { + "epoch": 0.62, + "grad_norm": 0.09130577475740195, + "learning_rate": 0.0003339803800454684, + "loss": 1.3322, + "step": 5763 + }, + { + "epoch": 0.62, + "grad_norm": 0.0830672023590636, + "learning_rate": 0.00033381616066390353, + "loss": 1.3991, + "step": 5764 + }, + { + "epoch": 0.62, + "grad_norm": 0.07167158802173361, + "learning_rate": 0.00033365196143270203, + "loss": 1.5648, + "step": 5765 + }, + { + "epoch": 0.62, + "grad_norm": 0.08324039636311231, + "learning_rate": 0.0003334877823717737, + "loss": 1.1953, + "step": 5766 + }, + { + "epoch": 0.62, + "grad_norm": 0.07364194494627035, + "learning_rate": 0.0003333236235010259, + "loss": 1.3725, + "step": 5767 + }, + { + "epoch": 0.62, + "grad_norm": 0.08352686220297945, + "learning_rate": 0.0003331594848403634, + "loss": 1.2813, + "step": 5768 + }, + { + "epoch": 0.62, + "grad_norm": 0.0782586913324151, + "learning_rate": 0.0003329953664096884, + "loss": 1.4421, + "step": 5769 + }, + { + "epoch": 0.62, + "grad_norm": 0.07344286447573398, + "learning_rate": 0.00033283126822890107, + "loss": 1.4148, + "step": 5770 + }, + { + "epoch": 0.62, + "grad_norm": 0.07640927867015894, + "learning_rate": 0.00033266719031789895, + "loss": 1.3939, + "step": 5771 + }, + { + "epoch": 0.62, + "grad_norm": 0.07444837850894832, + "learning_rate": 0.0003325031326965767, + "loss": 1.344, + "step": 5772 + }, + { + "epoch": 0.62, + "grad_norm": 0.07453338538015851, + "learning_rate": 0.0003323390953848271, + "loss": 1.5043, + "step": 5773 + }, + { + "epoch": 0.62, + "grad_norm": 0.07563824140788851, + "learning_rate": 0.0003321750784025401, + "loss": 1.4378, + "step": 5774 + }, + { + "epoch": 0.62, + "grad_norm": 0.07733704126648222, + "learning_rate": 0.00033201108176960346, + "loss": 1.3634, + "step": 5775 + }, + { + "epoch": 0.62, + "grad_norm": 0.07477323086087968, + "learning_rate": 0.00033184710550590243, + "loss": 1.3377, + "step": 5776 + }, + { + "epoch": 0.62, + "grad_norm": 0.0673227413865871, + "learning_rate": 0.00033168314963131953, + "loss": 1.3578, + "step": 5777 + }, + { + "epoch": 0.62, + "grad_norm": 0.07476933458770825, + "learning_rate": 0.00033151921416573485, + "loss": 1.3904, + "step": 5778 + }, + { + "epoch": 0.62, + "grad_norm": 0.06899281005764235, + "learning_rate": 0.0003313552991290264, + "loss": 1.3827, + "step": 5779 + }, + { + "epoch": 0.62, + "grad_norm": 0.07602161019541753, + "learning_rate": 0.0003311914045410694, + "loss": 1.4342, + "step": 5780 + }, + { + "epoch": 0.62, + "grad_norm": 0.06935786602316016, + "learning_rate": 0.00033102753042173644, + "loss": 1.2842, + "step": 5781 + }, + { + "epoch": 0.62, + "grad_norm": 0.07221675602626097, + "learning_rate": 0.00033086367679089826, + "loss": 1.3586, + "step": 5782 + }, + { + "epoch": 0.62, + "grad_norm": 0.08951430196528462, + "learning_rate": 0.00033069984366842207, + "loss": 1.3451, + "step": 5783 + }, + { + "epoch": 0.62, + "grad_norm": 0.06737689928701268, + "learning_rate": 0.00033053603107417365, + "loss": 1.3806, + "step": 5784 + }, + { + "epoch": 0.62, + "grad_norm": 0.0926594824874406, + "learning_rate": 0.0003303722390280156, + "loss": 1.3205, + "step": 5785 + }, + { + "epoch": 0.62, + "grad_norm": 0.06461257789980361, + "learning_rate": 0.00033020846754980824, + "loss": 1.3715, + "step": 5786 + }, + { + "epoch": 0.62, + "grad_norm": 0.08945684857302136, + "learning_rate": 0.0003300447166594097, + "loss": 1.4229, + "step": 5787 + }, + { + "epoch": 0.62, + "grad_norm": 0.06326462043703998, + "learning_rate": 0.0003298809863766752, + "loss": 1.3268, + "step": 5788 + }, + { + "epoch": 0.62, + "grad_norm": 0.07313020967684405, + "learning_rate": 0.0003297172767214576, + "loss": 1.4367, + "step": 5789 + }, + { + "epoch": 0.62, + "grad_norm": 0.07901312968141494, + "learning_rate": 0.0003295535877136072, + "loss": 1.3237, + "step": 5790 + }, + { + "epoch": 0.62, + "grad_norm": 0.07933674541674322, + "learning_rate": 0.00032938991937297193, + "loss": 1.3319, + "step": 5791 + }, + { + "epoch": 0.62, + "grad_norm": 0.06740982353930541, + "learning_rate": 0.0003292262717193972, + "loss": 1.3509, + "step": 5792 + }, + { + "epoch": 0.62, + "grad_norm": 0.07754239914727032, + "learning_rate": 0.00032906264477272575, + "loss": 1.3317, + "step": 5793 + }, + { + "epoch": 0.62, + "grad_norm": 0.07521240547901578, + "learning_rate": 0.0003288990385527978, + "loss": 1.4327, + "step": 5794 + }, + { + "epoch": 0.62, + "grad_norm": 0.0807145346435453, + "learning_rate": 0.00032873545307945143, + "loss": 1.4121, + "step": 5795 + }, + { + "epoch": 0.62, + "grad_norm": 0.0873071269093502, + "learning_rate": 0.0003285718883725217, + "loss": 1.346, + "step": 5796 + }, + { + "epoch": 0.62, + "grad_norm": 0.07939894137269075, + "learning_rate": 0.00032840834445184154, + "loss": 1.4587, + "step": 5797 + }, + { + "epoch": 0.62, + "grad_norm": 0.08248088922264714, + "learning_rate": 0.0003282448213372412, + "loss": 1.3405, + "step": 5798 + }, + { + "epoch": 0.62, + "grad_norm": 0.08439168880904907, + "learning_rate": 0.0003280813190485484, + "loss": 1.3494, + "step": 5799 + }, + { + "epoch": 0.62, + "grad_norm": 0.08123391440953981, + "learning_rate": 0.00032791783760558835, + "loss": 1.4026, + "step": 5800 + }, + { + "epoch": 0.62, + "grad_norm": 0.07908771534976788, + "learning_rate": 0.00032775437702818377, + "loss": 1.5283, + "step": 5801 + }, + { + "epoch": 0.62, + "grad_norm": 0.07532811513448483, + "learning_rate": 0.0003275909373361548, + "loss": 1.4235, + "step": 5802 + }, + { + "epoch": 0.62, + "grad_norm": 0.07166058099901026, + "learning_rate": 0.0003274275185493192, + "loss": 1.4699, + "step": 5803 + }, + { + "epoch": 0.62, + "grad_norm": 0.08297468674472518, + "learning_rate": 0.0003272641206874918, + "loss": 1.4871, + "step": 5804 + }, + { + "epoch": 0.62, + "grad_norm": 0.07243487395107884, + "learning_rate": 0.0003271007437704852, + "loss": 1.3383, + "step": 5805 + }, + { + "epoch": 0.62, + "grad_norm": 0.0805152716343893, + "learning_rate": 0.00032693738781810957, + "loss": 1.482, + "step": 5806 + }, + { + "epoch": 0.62, + "grad_norm": 0.09221296792088401, + "learning_rate": 0.00032677405285017226, + "loss": 1.4894, + "step": 5807 + }, + { + "epoch": 0.62, + "grad_norm": 0.07764307331973927, + "learning_rate": 0.0003266107388864783, + "loss": 1.4943, + "step": 5808 + }, + { + "epoch": 0.62, + "grad_norm": 0.0771348222959278, + "learning_rate": 0.0003264474459468299, + "loss": 1.395, + "step": 5809 + }, + { + "epoch": 0.62, + "grad_norm": 0.07434205741667296, + "learning_rate": 0.00032628417405102704, + "loss": 1.3975, + "step": 5810 + }, + { + "epoch": 0.62, + "grad_norm": 0.08459239119791959, + "learning_rate": 0.0003261209232188671, + "loss": 1.4852, + "step": 5811 + }, + { + "epoch": 0.62, + "grad_norm": 0.08710954280161412, + "learning_rate": 0.00032595769347014446, + "loss": 1.2992, + "step": 5812 + }, + { + "epoch": 0.62, + "grad_norm": 0.07702098501175848, + "learning_rate": 0.0003257944848246519, + "loss": 1.384, + "step": 5813 + }, + { + "epoch": 0.62, + "grad_norm": 0.0729400817671885, + "learning_rate": 0.00032563129730217826, + "loss": 1.3524, + "step": 5814 + }, + { + "epoch": 0.63, + "grad_norm": 0.07949284921131701, + "learning_rate": 0.0003254681309225111, + "loss": 1.4447, + "step": 5815 + }, + { + "epoch": 0.63, + "grad_norm": 0.08413069338126254, + "learning_rate": 0.00032530498570543476, + "loss": 1.5163, + "step": 5816 + }, + { + "epoch": 0.63, + "grad_norm": 0.07414571013437149, + "learning_rate": 0.000325141861670731, + "loss": 1.2736, + "step": 5817 + }, + { + "epoch": 0.63, + "grad_norm": 0.0709935359677738, + "learning_rate": 0.0003249787588381795, + "loss": 1.4867, + "step": 5818 + }, + { + "epoch": 0.63, + "grad_norm": 0.08271727344452846, + "learning_rate": 0.0003248156772275569, + "loss": 1.4364, + "step": 5819 + }, + { + "epoch": 0.63, + "grad_norm": 0.07447507008238005, + "learning_rate": 0.00032465261685863723, + "loss": 1.3646, + "step": 5820 + }, + { + "epoch": 0.63, + "grad_norm": 0.07074186981804943, + "learning_rate": 0.0003244895777511925, + "loss": 1.2683, + "step": 5821 + }, + { + "epoch": 0.63, + "grad_norm": 0.08099285892882745, + "learning_rate": 0.0003243265599249914, + "loss": 1.3554, + "step": 5822 + }, + { + "epoch": 0.63, + "grad_norm": 0.0766455672893839, + "learning_rate": 0.00032416356339980053, + "loss": 1.3961, + "step": 5823 + }, + { + "epoch": 0.63, + "grad_norm": 0.07939556315878743, + "learning_rate": 0.00032400058819538407, + "loss": 1.3909, + "step": 5824 + }, + { + "epoch": 0.63, + "grad_norm": 0.07740960550571895, + "learning_rate": 0.00032383763433150274, + "loss": 1.5367, + "step": 5825 + }, + { + "epoch": 0.63, + "grad_norm": 0.07881442246987015, + "learning_rate": 0.0003236747018279157, + "loss": 1.6059, + "step": 5826 + }, + { + "epoch": 0.63, + "grad_norm": 0.07457194940604828, + "learning_rate": 0.000323511790704379, + "loss": 1.4712, + "step": 5827 + }, + { + "epoch": 0.63, + "grad_norm": 0.07818352176339892, + "learning_rate": 0.00032334890098064593, + "loss": 1.3353, + "step": 5828 + }, + { + "epoch": 0.63, + "grad_norm": 0.07577655989104184, + "learning_rate": 0.00032318603267646775, + "loss": 1.4101, + "step": 5829 + }, + { + "epoch": 0.63, + "grad_norm": 0.08063222118709179, + "learning_rate": 0.0003230231858115927, + "loss": 1.4651, + "step": 5830 + }, + { + "epoch": 0.63, + "grad_norm": 0.07871062903426618, + "learning_rate": 0.0003228603604057664, + "loss": 1.54, + "step": 5831 + }, + { + "epoch": 0.63, + "grad_norm": 0.07753991143364851, + "learning_rate": 0.00032269755647873217, + "loss": 1.4145, + "step": 5832 + }, + { + "epoch": 0.63, + "grad_norm": 0.07769271192950013, + "learning_rate": 0.00032253477405023046, + "loss": 1.2744, + "step": 5833 + }, + { + "epoch": 0.63, + "grad_norm": 0.07149459574934523, + "learning_rate": 0.00032237201313999926, + "loss": 1.4112, + "step": 5834 + }, + { + "epoch": 0.63, + "grad_norm": 0.06967019720018568, + "learning_rate": 0.0003222092737677739, + "loss": 1.4218, + "step": 5835 + }, + { + "epoch": 0.63, + "grad_norm": 0.07366431502138024, + "learning_rate": 0.0003220465559532869, + "loss": 1.3923, + "step": 5836 + }, + { + "epoch": 0.63, + "grad_norm": 0.0774173907387302, + "learning_rate": 0.0003218838597162685, + "loss": 1.3827, + "step": 5837 + }, + { + "epoch": 0.63, + "grad_norm": 0.09568525653553758, + "learning_rate": 0.0003217211850764462, + "loss": 1.4002, + "step": 5838 + }, + { + "epoch": 0.63, + "grad_norm": 0.06820415005290484, + "learning_rate": 0.0003215585320535449, + "loss": 1.5259, + "step": 5839 + }, + { + "epoch": 0.63, + "grad_norm": 0.07099253896133205, + "learning_rate": 0.0003213959006672866, + "loss": 1.4814, + "step": 5840 + }, + { + "epoch": 0.63, + "grad_norm": 0.10048590704557143, + "learning_rate": 0.0003212332909373912, + "loss": 1.3643, + "step": 5841 + }, + { + "epoch": 0.63, + "grad_norm": 0.0868215430127107, + "learning_rate": 0.0003210707028835755, + "loss": 1.2828, + "step": 5842 + }, + { + "epoch": 0.63, + "grad_norm": 0.07111722341191054, + "learning_rate": 0.00032090813652555395, + "loss": 1.2882, + "step": 5843 + }, + { + "epoch": 0.63, + "grad_norm": 0.072419215989912, + "learning_rate": 0.00032074559188303834, + "loss": 1.4059, + "step": 5844 + }, + { + "epoch": 0.63, + "grad_norm": 0.07626815207305306, + "learning_rate": 0.00032058306897573787, + "loss": 1.3845, + "step": 5845 + }, + { + "epoch": 0.63, + "grad_norm": 0.07593411084596641, + "learning_rate": 0.0003204205678233586, + "loss": 1.2355, + "step": 5846 + }, + { + "epoch": 0.63, + "grad_norm": 0.07671474226352402, + "learning_rate": 0.00032025808844560465, + "loss": 1.2905, + "step": 5847 + }, + { + "epoch": 0.63, + "grad_norm": 0.07194390806509542, + "learning_rate": 0.000320095630862177, + "loss": 1.3861, + "step": 5848 + }, + { + "epoch": 0.63, + "grad_norm": 0.0882906222537737, + "learning_rate": 0.0003199331950927745, + "loss": 1.4485, + "step": 5849 + }, + { + "epoch": 0.63, + "grad_norm": 0.07800170587048255, + "learning_rate": 0.0003197707811570928, + "loss": 1.5706, + "step": 5850 + }, + { + "epoch": 0.63, + "grad_norm": 0.0798895187757382, + "learning_rate": 0.0003196083890748252, + "loss": 1.3286, + "step": 5851 + }, + { + "epoch": 0.63, + "grad_norm": 0.08321227249490748, + "learning_rate": 0.0003194460188656624, + "loss": 1.3371, + "step": 5852 + }, + { + "epoch": 0.63, + "grad_norm": 0.07770505707512657, + "learning_rate": 0.0003192836705492923, + "loss": 1.5892, + "step": 5853 + }, + { + "epoch": 0.63, + "grad_norm": 0.08257359787550428, + "learning_rate": 0.00031912134414540007, + "loss": 1.4664, + "step": 5854 + }, + { + "epoch": 0.63, + "grad_norm": 0.07573547661484602, + "learning_rate": 0.00031895903967366844, + "loss": 1.3355, + "step": 5855 + }, + { + "epoch": 0.63, + "grad_norm": 0.07977638149685118, + "learning_rate": 0.00031879675715377765, + "loss": 1.5056, + "step": 5856 + }, + { + "epoch": 0.63, + "grad_norm": 0.0700197292003201, + "learning_rate": 0.00031863449660540457, + "loss": 1.4674, + "step": 5857 + }, + { + "epoch": 0.63, + "grad_norm": 0.07645749377104584, + "learning_rate": 0.00031847225804822405, + "loss": 1.4547, + "step": 5858 + }, + { + "epoch": 0.63, + "grad_norm": 0.07689507595670635, + "learning_rate": 0.00031831004150190796, + "loss": 1.4252, + "step": 5859 + }, + { + "epoch": 0.63, + "grad_norm": 0.06998294349242434, + "learning_rate": 0.0003181478469861259, + "loss": 1.4104, + "step": 5860 + }, + { + "epoch": 0.63, + "grad_norm": 0.07176385236960926, + "learning_rate": 0.00031798567452054414, + "loss": 1.4554, + "step": 5861 + }, + { + "epoch": 0.63, + "grad_norm": 0.07609553056940516, + "learning_rate": 0.0003178235241248269, + "loss": 1.3244, + "step": 5862 + }, + { + "epoch": 0.63, + "grad_norm": 0.07696676290756585, + "learning_rate": 0.0003176613958186355, + "loss": 1.3908, + "step": 5863 + }, + { + "epoch": 0.63, + "grad_norm": 0.08404653627393327, + "learning_rate": 0.00031749928962162844, + "loss": 1.4277, + "step": 5864 + }, + { + "epoch": 0.63, + "grad_norm": 0.07363258110080857, + "learning_rate": 0.00031733720555346157, + "loss": 1.5204, + "step": 5865 + }, + { + "epoch": 0.63, + "grad_norm": 0.07463614990723165, + "learning_rate": 0.0003171751436337886, + "loss": 1.4818, + "step": 5866 + }, + { + "epoch": 0.63, + "grad_norm": 0.07347277265972793, + "learning_rate": 0.00031701310388225945, + "loss": 1.5231, + "step": 5867 + }, + { + "epoch": 0.63, + "grad_norm": 0.07243901069441412, + "learning_rate": 0.0003168510863185224, + "loss": 1.368, + "step": 5868 + }, + { + "epoch": 0.63, + "grad_norm": 0.07148451498935104, + "learning_rate": 0.00031668909096222255, + "loss": 1.4055, + "step": 5869 + }, + { + "epoch": 0.63, + "grad_norm": 0.07683946813743962, + "learning_rate": 0.0003165271178330023, + "loss": 1.304, + "step": 5870 + }, + { + "epoch": 0.63, + "grad_norm": 0.08296014033921659, + "learning_rate": 0.00031636516695050164, + "loss": 1.3915, + "step": 5871 + }, + { + "epoch": 0.63, + "grad_norm": 0.07000354267739249, + "learning_rate": 0.0003162032383343576, + "loss": 1.3169, + "step": 5872 + }, + { + "epoch": 0.63, + "grad_norm": 0.07802272002326763, + "learning_rate": 0.0003160413320042045, + "loss": 1.47, + "step": 5873 + }, + { + "epoch": 0.63, + "grad_norm": 0.0732789511602166, + "learning_rate": 0.000315879447979674, + "loss": 1.3223, + "step": 5874 + }, + { + "epoch": 0.63, + "grad_norm": 0.07027730292619676, + "learning_rate": 0.0003157175862803953, + "loss": 1.3835, + "step": 5875 + }, + { + "epoch": 0.63, + "grad_norm": 0.07682569968171925, + "learning_rate": 0.0003155557469259946, + "loss": 1.359, + "step": 5876 + }, + { + "epoch": 0.63, + "grad_norm": 0.08350094563625435, + "learning_rate": 0.00031539392993609554, + "loss": 1.3508, + "step": 5877 + }, + { + "epoch": 0.63, + "grad_norm": 0.08295722149013737, + "learning_rate": 0.0003152321353303188, + "loss": 1.4293, + "step": 5878 + }, + { + "epoch": 0.63, + "grad_norm": 0.07254010025148988, + "learning_rate": 0.0003150703631282826, + "loss": 1.3245, + "step": 5879 + }, + { + "epoch": 0.63, + "grad_norm": 0.07751813189871887, + "learning_rate": 0.00031490861334960247, + "loss": 1.2882, + "step": 5880 + }, + { + "epoch": 0.63, + "grad_norm": 0.07379579097767187, + "learning_rate": 0.00031474688601389113, + "loss": 1.5361, + "step": 5881 + }, + { + "epoch": 0.63, + "grad_norm": 0.06670928281533853, + "learning_rate": 0.00031458518114075836, + "loss": 1.3023, + "step": 5882 + }, + { + "epoch": 0.63, + "grad_norm": 0.06800302796922687, + "learning_rate": 0.00031442349874981167, + "loss": 1.4567, + "step": 5883 + }, + { + "epoch": 0.63, + "grad_norm": 0.07874237147537874, + "learning_rate": 0.0003142618388606556, + "loss": 1.3324, + "step": 5884 + }, + { + "epoch": 0.63, + "grad_norm": 0.07217062766502823, + "learning_rate": 0.0003141002014928918, + "loss": 1.4494, + "step": 5885 + }, + { + "epoch": 0.63, + "grad_norm": 0.07671074428154255, + "learning_rate": 0.0003139385866661196, + "loss": 1.4268, + "step": 5886 + }, + { + "epoch": 0.63, + "grad_norm": 0.08152511966680674, + "learning_rate": 0.0003137769943999352, + "loss": 1.3887, + "step": 5887 + }, + { + "epoch": 0.63, + "grad_norm": 0.0857558783134707, + "learning_rate": 0.00031361542471393226, + "loss": 1.373, + "step": 5888 + }, + { + "epoch": 0.63, + "grad_norm": 0.08265072365117745, + "learning_rate": 0.00031345387762770163, + "loss": 1.4538, + "step": 5889 + }, + { + "epoch": 0.63, + "grad_norm": 0.08161194391076494, + "learning_rate": 0.00031329235316083126, + "loss": 1.3585, + "step": 5890 + }, + { + "epoch": 0.63, + "grad_norm": 0.07593695011204729, + "learning_rate": 0.00031313085133290695, + "loss": 1.4407, + "step": 5891 + }, + { + "epoch": 0.63, + "grad_norm": 0.07210716065409313, + "learning_rate": 0.00031296937216351113, + "loss": 1.4562, + "step": 5892 + }, + { + "epoch": 0.63, + "grad_norm": 0.0850661492304202, + "learning_rate": 0.0003128079156722236, + "loss": 1.572, + "step": 5893 + }, + { + "epoch": 0.63, + "grad_norm": 0.09330568226061443, + "learning_rate": 0.0003126464818786218, + "loss": 1.3642, + "step": 5894 + }, + { + "epoch": 0.63, + "grad_norm": 0.0737289442763714, + "learning_rate": 0.0003124850708022799, + "loss": 1.3718, + "step": 5895 + }, + { + "epoch": 0.63, + "grad_norm": 0.08218771057410153, + "learning_rate": 0.00031232368246276955, + "loss": 1.4054, + "step": 5896 + }, + { + "epoch": 0.63, + "grad_norm": 0.07967506567629058, + "learning_rate": 0.00031216231687965977, + "loss": 1.4413, + "step": 5897 + }, + { + "epoch": 0.63, + "grad_norm": 0.079496248186044, + "learning_rate": 0.00031200097407251687, + "loss": 1.5249, + "step": 5898 + }, + { + "epoch": 0.63, + "grad_norm": 0.07693598790618682, + "learning_rate": 0.0003118396540609038, + "loss": 1.4197, + "step": 5899 + }, + { + "epoch": 0.63, + "grad_norm": 0.08543054581020419, + "learning_rate": 0.0003116783568643814, + "loss": 1.3692, + "step": 5900 + }, + { + "epoch": 0.63, + "grad_norm": 0.0774464397249776, + "learning_rate": 0.00031151708250250735, + "loss": 1.349, + "step": 5901 + }, + { + "epoch": 0.63, + "grad_norm": 0.06921752872907357, + "learning_rate": 0.000311355830994837, + "loss": 1.4459, + "step": 5902 + }, + { + "epoch": 0.63, + "grad_norm": 0.07514722249298769, + "learning_rate": 0.00031119460236092247, + "loss": 1.4358, + "step": 5903 + }, + { + "epoch": 0.63, + "grad_norm": 0.08242771723457627, + "learning_rate": 0.00031103339662031325, + "loss": 1.3992, + "step": 5904 + }, + { + "epoch": 0.63, + "grad_norm": 0.0757734027343892, + "learning_rate": 0.00031087221379255616, + "loss": 1.3503, + "step": 5905 + }, + { + "epoch": 0.63, + "grad_norm": 0.07672806483686527, + "learning_rate": 0.0003107110538971952, + "loss": 1.4053, + "step": 5906 + }, + { + "epoch": 0.63, + "grad_norm": 0.08680615262647434, + "learning_rate": 0.00031054991695377156, + "loss": 1.5851, + "step": 5907 + }, + { + "epoch": 0.64, + "grad_norm": 0.08822808648220977, + "learning_rate": 0.0003103888029818235, + "loss": 1.4323, + "step": 5908 + }, + { + "epoch": 0.64, + "grad_norm": 0.07106740585924771, + "learning_rate": 0.00031022771200088706, + "loss": 1.4376, + "step": 5909 + }, + { + "epoch": 0.64, + "grad_norm": 0.07779044400772091, + "learning_rate": 0.0003100666440304946, + "loss": 1.3308, + "step": 5910 + }, + { + "epoch": 0.64, + "grad_norm": 0.07194910372531178, + "learning_rate": 0.00030990559909017636, + "loss": 1.4664, + "step": 5911 + }, + { + "epoch": 0.64, + "grad_norm": 0.07745473453560854, + "learning_rate": 0.00030974457719945954, + "loss": 1.37, + "step": 5912 + }, + { + "epoch": 0.64, + "grad_norm": 0.0743549948835792, + "learning_rate": 0.00030958357837786854, + "loss": 1.5214, + "step": 5913 + }, + { + "epoch": 0.64, + "grad_norm": 0.07378607983212841, + "learning_rate": 0.00030942260264492534, + "loss": 1.59, + "step": 5914 + }, + { + "epoch": 0.64, + "grad_norm": 0.08354354717365467, + "learning_rate": 0.0003092616500201485, + "loss": 1.3002, + "step": 5915 + }, + { + "epoch": 0.64, + "grad_norm": 0.0759387023706967, + "learning_rate": 0.0003091007205230541, + "loss": 1.3571, + "step": 5916 + }, + { + "epoch": 0.64, + "grad_norm": 0.06915812880321831, + "learning_rate": 0.00030893981417315553, + "loss": 1.4155, + "step": 5917 + }, + { + "epoch": 0.64, + "grad_norm": 0.08602623259978609, + "learning_rate": 0.00030877893098996324, + "loss": 1.2942, + "step": 5918 + }, + { + "epoch": 0.64, + "grad_norm": 0.07686753605428095, + "learning_rate": 0.0003086180709929849, + "loss": 1.2931, + "step": 5919 + }, + { + "epoch": 0.64, + "grad_norm": 0.0743510939165553, + "learning_rate": 0.0003084572342017251, + "loss": 1.3802, + "step": 5920 + }, + { + "epoch": 0.64, + "grad_norm": 0.08302234915061892, + "learning_rate": 0.00030829642063568595, + "loss": 1.4475, + "step": 5921 + }, + { + "epoch": 0.64, + "grad_norm": 0.08139252633012196, + "learning_rate": 0.00030813563031436674, + "loss": 1.3602, + "step": 5922 + }, + { + "epoch": 0.64, + "grad_norm": 0.07148548640627475, + "learning_rate": 0.0003079748632572639, + "loss": 1.3305, + "step": 5923 + }, + { + "epoch": 0.64, + "grad_norm": 0.0792567664336374, + "learning_rate": 0.0003078141194838707, + "loss": 1.3468, + "step": 5924 + }, + { + "epoch": 0.64, + "grad_norm": 0.07455553638782846, + "learning_rate": 0.00030765339901367824, + "loss": 1.4425, + "step": 5925 + }, + { + "epoch": 0.64, + "grad_norm": 0.0762782607168731, + "learning_rate": 0.00030749270186617426, + "loss": 1.4555, + "step": 5926 + }, + { + "epoch": 0.64, + "grad_norm": 0.07274576676044348, + "learning_rate": 0.0003073320280608437, + "loss": 1.4174, + "step": 5927 + }, + { + "epoch": 0.64, + "grad_norm": 0.0684112547807106, + "learning_rate": 0.00030717137761716916, + "loss": 1.539, + "step": 5928 + }, + { + "epoch": 0.64, + "grad_norm": 0.07220831738070271, + "learning_rate": 0.0003070107505546298, + "loss": 1.2818, + "step": 5929 + }, + { + "epoch": 0.64, + "grad_norm": 0.0973184931653835, + "learning_rate": 0.00030685014689270243, + "loss": 1.4565, + "step": 5930 + }, + { + "epoch": 0.64, + "grad_norm": 0.07731877758281028, + "learning_rate": 0.0003066895666508605, + "loss": 1.3536, + "step": 5931 + }, + { + "epoch": 0.64, + "grad_norm": 0.07415858374192567, + "learning_rate": 0.0003065290098485749, + "loss": 1.4543, + "step": 5932 + }, + { + "epoch": 0.64, + "grad_norm": 0.07258684303449475, + "learning_rate": 0.000306368476505314, + "loss": 1.3418, + "step": 5933 + }, + { + "epoch": 0.64, + "grad_norm": 0.0692859825650207, + "learning_rate": 0.0003062079666405429, + "loss": 1.4166, + "step": 5934 + }, + { + "epoch": 0.64, + "grad_norm": 0.07029306454398922, + "learning_rate": 0.00030604748027372394, + "loss": 1.4104, + "step": 5935 + }, + { + "epoch": 0.64, + "grad_norm": 0.0733708023287787, + "learning_rate": 0.0003058870174243165, + "loss": 1.368, + "step": 5936 + }, + { + "epoch": 0.64, + "grad_norm": 0.07324475363368584, + "learning_rate": 0.00030572657811177756, + "loss": 1.5348, + "step": 5937 + }, + { + "epoch": 0.64, + "grad_norm": 0.07459659298410051, + "learning_rate": 0.0003055661623555608, + "loss": 1.5159, + "step": 5938 + }, + { + "epoch": 0.64, + "grad_norm": 0.07507360349738781, + "learning_rate": 0.000305405770175117, + "loss": 1.4729, + "step": 5939 + }, + { + "epoch": 0.64, + "grad_norm": 0.0633251934623615, + "learning_rate": 0.0003052454015898948, + "loss": 1.3679, + "step": 5940 + }, + { + "epoch": 0.64, + "grad_norm": 0.0793842984745501, + "learning_rate": 0.00030508505661933874, + "loss": 1.3561, + "step": 5941 + }, + { + "epoch": 0.64, + "grad_norm": 0.06819289917335115, + "learning_rate": 0.0003049247352828917, + "loss": 1.4199, + "step": 5942 + }, + { + "epoch": 0.64, + "grad_norm": 0.06660602563715051, + "learning_rate": 0.00030476443759999293, + "loss": 1.4692, + "step": 5943 + }, + { + "epoch": 0.64, + "grad_norm": 0.0742100503490787, + "learning_rate": 0.00030460416359007913, + "loss": 1.3488, + "step": 5944 + }, + { + "epoch": 0.64, + "grad_norm": 0.07296538978034156, + "learning_rate": 0.00030444391327258424, + "loss": 1.4411, + "step": 5945 + }, + { + "epoch": 0.64, + "grad_norm": 0.07363651934129214, + "learning_rate": 0.00030428368666693905, + "loss": 1.4599, + "step": 5946 + }, + { + "epoch": 0.64, + "grad_norm": 0.08639448348123929, + "learning_rate": 0.0003041234837925715, + "loss": 1.3221, + "step": 5947 + }, + { + "epoch": 0.64, + "grad_norm": 0.0759787051112038, + "learning_rate": 0.0003039633046689069, + "loss": 1.4504, + "step": 5948 + }, + { + "epoch": 0.64, + "grad_norm": 0.08473623915638286, + "learning_rate": 0.0003038031493153675, + "loss": 1.4055, + "step": 5949 + }, + { + "epoch": 0.64, + "grad_norm": 0.09560174827623097, + "learning_rate": 0.00030364301775137245, + "loss": 1.4658, + "step": 5950 + }, + { + "epoch": 0.64, + "grad_norm": 0.06961875392765221, + "learning_rate": 0.00030348290999633875, + "loss": 1.3609, + "step": 5951 + }, + { + "epoch": 0.64, + "grad_norm": 0.0714538707164538, + "learning_rate": 0.0003033228260696795, + "loss": 1.3151, + "step": 5952 + }, + { + "epoch": 0.64, + "grad_norm": 0.07808335058336303, + "learning_rate": 0.00030316276599080566, + "loss": 1.4946, + "step": 5953 + }, + { + "epoch": 0.64, + "grad_norm": 0.08282668322888084, + "learning_rate": 0.0003030027297791251, + "loss": 1.3759, + "step": 5954 + }, + { + "epoch": 0.64, + "grad_norm": 0.06800510962454319, + "learning_rate": 0.00030284271745404257, + "loss": 1.4049, + "step": 5955 + }, + { + "epoch": 0.64, + "grad_norm": 0.07475879845301409, + "learning_rate": 0.00030268272903496036, + "loss": 1.4277, + "step": 5956 + }, + { + "epoch": 0.64, + "grad_norm": 0.07642167347690558, + "learning_rate": 0.0003025227645412775, + "loss": 1.3954, + "step": 5957 + }, + { + "epoch": 0.64, + "grad_norm": 0.07643834144422731, + "learning_rate": 0.0003023628239923902, + "loss": 1.3185, + "step": 5958 + }, + { + "epoch": 0.64, + "grad_norm": 0.07685637721938932, + "learning_rate": 0.0003022029074076919, + "loss": 1.4371, + "step": 5959 + }, + { + "epoch": 0.64, + "grad_norm": 0.07909995683759365, + "learning_rate": 0.000302043014806573, + "loss": 1.4715, + "step": 5960 + }, + { + "epoch": 0.64, + "grad_norm": 0.07349604482190411, + "learning_rate": 0.00030188314620842094, + "loss": 1.3706, + "step": 5961 + }, + { + "epoch": 0.64, + "grad_norm": 0.07233447503471593, + "learning_rate": 0.0003017233016326207, + "loss": 1.3405, + "step": 5962 + }, + { + "epoch": 0.64, + "grad_norm": 0.0720093425925684, + "learning_rate": 0.0003015634810985534, + "loss": 1.5037, + "step": 5963 + }, + { + "epoch": 0.64, + "grad_norm": 0.0726984904284892, + "learning_rate": 0.0003014036846255982, + "loss": 1.1959, + "step": 5964 + }, + { + "epoch": 0.64, + "grad_norm": 0.07403128407492846, + "learning_rate": 0.0003012439122331309, + "loss": 1.4965, + "step": 5965 + }, + { + "epoch": 0.64, + "grad_norm": 0.07709803247439713, + "learning_rate": 0.0003010841639405244, + "loss": 1.5168, + "step": 5966 + }, + { + "epoch": 0.64, + "grad_norm": 0.07602607776353369, + "learning_rate": 0.0003009244397671489, + "loss": 1.5096, + "step": 5967 + }, + { + "epoch": 0.64, + "grad_norm": 0.07978329522099706, + "learning_rate": 0.0003007647397323714, + "loss": 1.5163, + "step": 5968 + }, + { + "epoch": 0.64, + "grad_norm": 0.08888699106540596, + "learning_rate": 0.00030060506385555595, + "loss": 1.479, + "step": 5969 + }, + { + "epoch": 0.64, + "grad_norm": 0.07168143621723064, + "learning_rate": 0.000300445412156064, + "loss": 1.3272, + "step": 5970 + }, + { + "epoch": 0.64, + "grad_norm": 0.07329373929619436, + "learning_rate": 0.0003002857846532538, + "loss": 1.3277, + "step": 5971 + }, + { + "epoch": 0.64, + "grad_norm": 0.07654288914163611, + "learning_rate": 0.00030012618136648087, + "loss": 1.3359, + "step": 5972 + }, + { + "epoch": 0.64, + "grad_norm": 0.07354952702168815, + "learning_rate": 0.0002999666023150974, + "loss": 1.4215, + "step": 5973 + }, + { + "epoch": 0.64, + "grad_norm": 0.06871219507940302, + "learning_rate": 0.000299807047518453, + "loss": 1.3821, + "step": 5974 + }, + { + "epoch": 0.64, + "grad_norm": 0.06497905023784828, + "learning_rate": 0.00029964751699589423, + "loss": 1.403, + "step": 5975 + }, + { + "epoch": 0.64, + "grad_norm": 0.07652963092713426, + "learning_rate": 0.00029948801076676475, + "loss": 1.3289, + "step": 5976 + }, + { + "epoch": 0.64, + "grad_norm": 0.071199491483903, + "learning_rate": 0.00029932852885040527, + "loss": 1.3271, + "step": 5977 + }, + { + "epoch": 0.64, + "grad_norm": 0.0745442586868254, + "learning_rate": 0.0002991690712661534, + "loss": 1.4917, + "step": 5978 + }, + { + "epoch": 0.64, + "grad_norm": 0.07328338974119428, + "learning_rate": 0.00029900963803334413, + "loss": 1.5249, + "step": 5979 + }, + { + "epoch": 0.64, + "grad_norm": 0.07679011633383984, + "learning_rate": 0.0002988502291713091, + "loss": 1.4964, + "step": 5980 + }, + { + "epoch": 0.64, + "grad_norm": 0.06510401298584284, + "learning_rate": 0.0002986908446993772, + "loss": 1.4489, + "step": 5981 + }, + { + "epoch": 0.64, + "grad_norm": 0.07088074772151598, + "learning_rate": 0.0002985314846368745, + "loss": 1.4252, + "step": 5982 + }, + { + "epoch": 0.64, + "grad_norm": 0.08126290418323827, + "learning_rate": 0.00029837214900312395, + "loss": 1.3525, + "step": 5983 + }, + { + "epoch": 0.64, + "grad_norm": 0.06975498384895287, + "learning_rate": 0.00029821283781744537, + "loss": 1.5507, + "step": 5984 + }, + { + "epoch": 0.64, + "grad_norm": 0.07945659664328315, + "learning_rate": 0.0002980535510991558, + "loss": 1.3528, + "step": 5985 + }, + { + "epoch": 0.64, + "grad_norm": 0.07123058023593087, + "learning_rate": 0.0002978942888675693, + "loss": 1.3405, + "step": 5986 + }, + { + "epoch": 0.64, + "grad_norm": 0.07698558265674868, + "learning_rate": 0.0002977350511419972, + "loss": 1.432, + "step": 5987 + }, + { + "epoch": 0.64, + "grad_norm": 0.06750769980073211, + "learning_rate": 0.0002975758379417474, + "loss": 1.5532, + "step": 5988 + }, + { + "epoch": 0.64, + "grad_norm": 0.07374373847083314, + "learning_rate": 0.00029741664928612503, + "loss": 1.3748, + "step": 5989 + }, + { + "epoch": 0.64, + "grad_norm": 0.06784693456068619, + "learning_rate": 0.00029725748519443247, + "loss": 1.3238, + "step": 5990 + }, + { + "epoch": 0.64, + "grad_norm": 0.0844336706395431, + "learning_rate": 0.0002970983456859687, + "loss": 1.3847, + "step": 5991 + }, + { + "epoch": 0.64, + "grad_norm": 0.07079001459931357, + "learning_rate": 0.00029693923078003004, + "loss": 1.2528, + "step": 5992 + }, + { + "epoch": 0.64, + "grad_norm": 0.07149040354390375, + "learning_rate": 0.00029678014049590995, + "loss": 1.3724, + "step": 5993 + }, + { + "epoch": 0.64, + "grad_norm": 0.06824266996087566, + "learning_rate": 0.00029662107485289814, + "loss": 1.4637, + "step": 5994 + }, + { + "epoch": 0.64, + "grad_norm": 0.0774125335188962, + "learning_rate": 0.0002964620338702823, + "loss": 1.38, + "step": 5995 + }, + { + "epoch": 0.64, + "grad_norm": 0.07150515374202804, + "learning_rate": 0.00029630301756734643, + "loss": 1.524, + "step": 5996 + }, + { + "epoch": 0.64, + "grad_norm": 0.07467047959655457, + "learning_rate": 0.000296144025963372, + "loss": 1.3389, + "step": 5997 + }, + { + "epoch": 0.64, + "grad_norm": 0.07005420069909853, + "learning_rate": 0.00029598505907763717, + "loss": 1.2697, + "step": 5998 + }, + { + "epoch": 0.64, + "grad_norm": 0.07941624953171082, + "learning_rate": 0.00029582611692941733, + "loss": 1.5686, + "step": 5999 + }, + { + "epoch": 0.64, + "grad_norm": 0.08267345828706943, + "learning_rate": 0.0002956671995379847, + "loss": 1.4291, + "step": 6000 + }, + { + "epoch": 0.65, + "grad_norm": 0.09815036580786829, + "learning_rate": 0.0002955083069226086, + "loss": 1.3402, + "step": 6001 + }, + { + "epoch": 0.65, + "grad_norm": 0.07314143721806532, + "learning_rate": 0.0002953494391025552, + "loss": 1.3167, + "step": 6002 + }, + { + "epoch": 0.65, + "grad_norm": 0.07718320633480043, + "learning_rate": 0.0002951905960970879, + "loss": 1.3187, + "step": 6003 + }, + { + "epoch": 0.65, + "grad_norm": 0.07022848258771092, + "learning_rate": 0.000295031777925467, + "loss": 1.388, + "step": 6004 + }, + { + "epoch": 0.65, + "grad_norm": 0.0880795025536751, + "learning_rate": 0.0002948729846069496, + "loss": 1.3403, + "step": 6005 + }, + { + "epoch": 0.65, + "grad_norm": 0.0701067624869438, + "learning_rate": 0.00029471421616078985, + "loss": 1.2425, + "step": 6006 + }, + { + "epoch": 0.65, + "grad_norm": 0.09666753220487535, + "learning_rate": 0.0002945554726062392, + "loss": 1.4792, + "step": 6007 + }, + { + "epoch": 0.65, + "grad_norm": 0.08171114169072571, + "learning_rate": 0.0002943967539625458, + "loss": 1.5026, + "step": 6008 + }, + { + "epoch": 0.65, + "grad_norm": 0.11522314251288446, + "learning_rate": 0.0002942380602489547, + "loss": 1.5163, + "step": 6009 + }, + { + "epoch": 0.65, + "grad_norm": 0.0769067408158239, + "learning_rate": 0.0002940793914847083, + "loss": 1.3969, + "step": 6010 + }, + { + "epoch": 0.65, + "grad_norm": 0.07605757451663893, + "learning_rate": 0.0002939207476890456, + "loss": 1.3257, + "step": 6011 + }, + { + "epoch": 0.65, + "grad_norm": 0.08573115295190666, + "learning_rate": 0.00029376212888120257, + "loss": 1.4362, + "step": 6012 + }, + { + "epoch": 0.65, + "grad_norm": 0.09276575884914588, + "learning_rate": 0.00029360353508041257, + "loss": 1.503, + "step": 6013 + }, + { + "epoch": 0.65, + "grad_norm": 0.07993298408856686, + "learning_rate": 0.00029344496630590543, + "loss": 1.4253, + "step": 6014 + }, + { + "epoch": 0.65, + "grad_norm": 0.08552281071719056, + "learning_rate": 0.00029328642257690833, + "loss": 1.3724, + "step": 6015 + }, + { + "epoch": 0.65, + "grad_norm": 0.07851945938289058, + "learning_rate": 0.0002931279039126451, + "loss": 1.314, + "step": 6016 + }, + { + "epoch": 0.65, + "grad_norm": 0.07843057357746579, + "learning_rate": 0.0002929694103323366, + "loss": 1.475, + "step": 6017 + }, + { + "epoch": 0.65, + "grad_norm": 0.08650642474253452, + "learning_rate": 0.00029281094185520094, + "loss": 1.4325, + "step": 6018 + }, + { + "epoch": 0.65, + "grad_norm": 0.07755758493084858, + "learning_rate": 0.00029265249850045285, + "loss": 1.4469, + "step": 6019 + }, + { + "epoch": 0.65, + "grad_norm": 0.08462709644270011, + "learning_rate": 0.00029249408028730406, + "loss": 1.4838, + "step": 6020 + }, + { + "epoch": 0.65, + "grad_norm": 0.08276090223466218, + "learning_rate": 0.0002923356872349635, + "loss": 1.4411, + "step": 6021 + }, + { + "epoch": 0.65, + "grad_norm": 0.08773444524959222, + "learning_rate": 0.00029217731936263683, + "loss": 1.3605, + "step": 6022 + }, + { + "epoch": 0.65, + "grad_norm": 0.08218699242748785, + "learning_rate": 0.0002920189766895265, + "loss": 1.3351, + "step": 6023 + }, + { + "epoch": 0.65, + "grad_norm": 0.07040731094288417, + "learning_rate": 0.00029186065923483243, + "loss": 1.3763, + "step": 6024 + }, + { + "epoch": 0.65, + "grad_norm": 0.0740188467934851, + "learning_rate": 0.0002917023670177511, + "loss": 1.3715, + "step": 6025 + }, + { + "epoch": 0.65, + "grad_norm": 0.07362205859410367, + "learning_rate": 0.00029154410005747584, + "loss": 1.3693, + "step": 6026 + }, + { + "epoch": 0.65, + "grad_norm": 0.07984353221296256, + "learning_rate": 0.0002913858583731969, + "loss": 1.4406, + "step": 6027 + }, + { + "epoch": 0.65, + "grad_norm": 0.07519072202814801, + "learning_rate": 0.0002912276419841019, + "loss": 1.4988, + "step": 6028 + }, + { + "epoch": 0.65, + "grad_norm": 0.07631248401472579, + "learning_rate": 0.0002910694509093752, + "loss": 1.4252, + "step": 6029 + }, + { + "epoch": 0.65, + "grad_norm": 0.08027426610180229, + "learning_rate": 0.00029091128516819763, + "loss": 1.2927, + "step": 6030 + }, + { + "epoch": 0.65, + "grad_norm": 0.10664126095564695, + "learning_rate": 0.00029075314477974764, + "loss": 1.3158, + "step": 6031 + }, + { + "epoch": 0.65, + "grad_norm": 0.07485328411133679, + "learning_rate": 0.0002905950297632004, + "loss": 1.3941, + "step": 6032 + }, + { + "epoch": 0.65, + "grad_norm": 0.08601281945619063, + "learning_rate": 0.0002904369401377275, + "loss": 1.4172, + "step": 6033 + }, + { + "epoch": 0.65, + "grad_norm": 0.07099566029910874, + "learning_rate": 0.00029027887592249815, + "loss": 1.3107, + "step": 6034 + }, + { + "epoch": 0.65, + "grad_norm": 0.07243682422839058, + "learning_rate": 0.00029012083713667833, + "loss": 1.3992, + "step": 6035 + }, + { + "epoch": 0.65, + "grad_norm": 0.07818980658292189, + "learning_rate": 0.0002899628237994305, + "loss": 1.3995, + "step": 6036 + }, + { + "epoch": 0.65, + "grad_norm": 0.07345641595125825, + "learning_rate": 0.00028980483592991427, + "loss": 1.4564, + "step": 6037 + }, + { + "epoch": 0.65, + "grad_norm": 0.07378658076318882, + "learning_rate": 0.0002896468735472863, + "loss": 1.3621, + "step": 6038 + }, + { + "epoch": 0.65, + "grad_norm": 0.07730154495877888, + "learning_rate": 0.0002894889366707002, + "loss": 1.3741, + "step": 6039 + }, + { + "epoch": 0.65, + "grad_norm": 0.07867744297535843, + "learning_rate": 0.0002893310253193062, + "loss": 1.4205, + "step": 6040 + }, + { + "epoch": 0.65, + "grad_norm": 0.0806104655157845, + "learning_rate": 0.0002891731395122516, + "loss": 1.3632, + "step": 6041 + }, + { + "epoch": 0.65, + "grad_norm": 0.09214380580505434, + "learning_rate": 0.0002890152792686809, + "loss": 1.7369, + "step": 6042 + }, + { + "epoch": 0.65, + "grad_norm": 0.09003660892734845, + "learning_rate": 0.0002888574446077348, + "loss": 1.3157, + "step": 6043 + }, + { + "epoch": 0.65, + "grad_norm": 0.08107202269847597, + "learning_rate": 0.0002886996355485514, + "loss": 1.3031, + "step": 6044 + }, + { + "epoch": 0.65, + "grad_norm": 0.07863112211660843, + "learning_rate": 0.0002885418521102658, + "loss": 1.4732, + "step": 6045 + }, + { + "epoch": 0.65, + "grad_norm": 0.08105230040654554, + "learning_rate": 0.00028838409431200974, + "loss": 1.2731, + "step": 6046 + }, + { + "epoch": 0.65, + "grad_norm": 0.08080853361750891, + "learning_rate": 0.0002882263621729116, + "loss": 1.4831, + "step": 6047 + }, + { + "epoch": 0.65, + "grad_norm": 0.06849354451778983, + "learning_rate": 0.0002880686557120973, + "loss": 1.3394, + "step": 6048 + }, + { + "epoch": 0.65, + "grad_norm": 0.0843146464274864, + "learning_rate": 0.00028791097494868895, + "loss": 1.3068, + "step": 6049 + }, + { + "epoch": 0.65, + "grad_norm": 0.07860656896213726, + "learning_rate": 0.0002877533199018061, + "loss": 1.4533, + "step": 6050 + }, + { + "epoch": 0.65, + "grad_norm": 0.07718407963474753, + "learning_rate": 0.00028759569059056507, + "loss": 1.4552, + "step": 6051 + }, + { + "epoch": 0.65, + "grad_norm": 0.07045024717197383, + "learning_rate": 0.00028743808703407867, + "loss": 1.4094, + "step": 6052 + }, + { + "epoch": 0.65, + "grad_norm": 0.08392898263167461, + "learning_rate": 0.000287280509251457, + "loss": 1.4199, + "step": 6053 + }, + { + "epoch": 0.65, + "grad_norm": 0.07135753984364539, + "learning_rate": 0.00028712295726180715, + "loss": 1.3966, + "step": 6054 + }, + { + "epoch": 0.65, + "grad_norm": 0.07971585134038697, + "learning_rate": 0.00028696543108423247, + "loss": 1.3936, + "step": 6055 + }, + { + "epoch": 0.65, + "grad_norm": 0.07298002690430236, + "learning_rate": 0.0002868079307378336, + "loss": 1.4275, + "step": 6056 + }, + { + "epoch": 0.65, + "grad_norm": 0.07606117292151965, + "learning_rate": 0.0002866504562417086, + "loss": 1.4009, + "step": 6057 + }, + { + "epoch": 0.65, + "grad_norm": 0.07673914296478794, + "learning_rate": 0.0002864930076149509, + "loss": 1.4826, + "step": 6058 + }, + { + "epoch": 0.65, + "grad_norm": 0.07935188856224078, + "learning_rate": 0.00028633558487665213, + "loss": 1.3802, + "step": 6059 + }, + { + "epoch": 0.65, + "grad_norm": 0.07771790506095733, + "learning_rate": 0.00028617818804590046, + "loss": 1.3848, + "step": 6060 + }, + { + "epoch": 0.65, + "grad_norm": 0.08481704582222165, + "learning_rate": 0.0002860208171417804, + "loss": 1.4468, + "step": 6061 + }, + { + "epoch": 0.65, + "grad_norm": 0.08220689565548146, + "learning_rate": 0.0002858634721833741, + "loss": 1.4044, + "step": 6062 + }, + { + "epoch": 0.65, + "grad_norm": 0.08349805126907142, + "learning_rate": 0.00028570615318976013, + "loss": 1.4339, + "step": 6063 + }, + { + "epoch": 0.65, + "grad_norm": 0.0782694576272315, + "learning_rate": 0.0002855488601800137, + "loss": 1.377, + "step": 6064 + }, + { + "epoch": 0.65, + "grad_norm": 0.07431275992276852, + "learning_rate": 0.0002853915931732073, + "loss": 1.3656, + "step": 6065 + }, + { + "epoch": 0.65, + "grad_norm": 0.09453173639235407, + "learning_rate": 0.00028523435218841026, + "loss": 1.3901, + "step": 6066 + }, + { + "epoch": 0.65, + "grad_norm": 0.07031269040346856, + "learning_rate": 0.00028507713724468844, + "loss": 1.4042, + "step": 6067 + }, + { + "epoch": 0.65, + "grad_norm": 0.0882233685502084, + "learning_rate": 0.0002849199483611045, + "loss": 1.3748, + "step": 6068 + }, + { + "epoch": 0.65, + "grad_norm": 0.07604641853251956, + "learning_rate": 0.0002847627855567183, + "loss": 1.36, + "step": 6069 + }, + { + "epoch": 0.65, + "grad_norm": 0.08338095020434803, + "learning_rate": 0.0002846056488505866, + "loss": 1.4305, + "step": 6070 + }, + { + "epoch": 0.65, + "grad_norm": 0.07695158760839388, + "learning_rate": 0.0002844485382617624, + "loss": 1.3133, + "step": 6071 + }, + { + "epoch": 0.65, + "grad_norm": 0.07082354574245085, + "learning_rate": 0.000284291453809296, + "loss": 1.3982, + "step": 6072 + }, + { + "epoch": 0.65, + "grad_norm": 0.07190282632214344, + "learning_rate": 0.0002841343955122347, + "loss": 1.3965, + "step": 6073 + }, + { + "epoch": 0.65, + "grad_norm": 0.08003261627937998, + "learning_rate": 0.00028397736338962187, + "loss": 1.4751, + "step": 6074 + }, + { + "epoch": 0.65, + "grad_norm": 0.07217113093699555, + "learning_rate": 0.0002838203574604986, + "loss": 1.4419, + "step": 6075 + }, + { + "epoch": 0.65, + "grad_norm": 0.09568634177982653, + "learning_rate": 0.00028366337774390237, + "loss": 1.3784, + "step": 6076 + }, + { + "epoch": 0.65, + "grad_norm": 0.08143677906916136, + "learning_rate": 0.0002835064242588673, + "loss": 1.3997, + "step": 6077 + }, + { + "epoch": 0.65, + "grad_norm": 0.06714182759584644, + "learning_rate": 0.00028334949702442483, + "loss": 1.3418, + "step": 6078 + }, + { + "epoch": 0.65, + "grad_norm": 0.07817525588333529, + "learning_rate": 0.0002831925960596027, + "loss": 1.4515, + "step": 6079 + }, + { + "epoch": 0.65, + "grad_norm": 0.07766148184272453, + "learning_rate": 0.0002830357213834256, + "loss": 1.4837, + "step": 6080 + }, + { + "epoch": 0.65, + "grad_norm": 0.08646731671686747, + "learning_rate": 0.0002828788730149152, + "loss": 1.4054, + "step": 6081 + }, + { + "epoch": 0.65, + "grad_norm": 0.08325901309195903, + "learning_rate": 0.0002827220509730903, + "loss": 1.3141, + "step": 6082 + }, + { + "epoch": 0.65, + "grad_norm": 0.07714246322339714, + "learning_rate": 0.0002825652552769655, + "loss": 1.3832, + "step": 6083 + }, + { + "epoch": 0.65, + "grad_norm": 0.08684031433009541, + "learning_rate": 0.0002824084859455531, + "loss": 1.3973, + "step": 6084 + }, + { + "epoch": 0.65, + "grad_norm": 0.06920637783187342, + "learning_rate": 0.0002822517429978622, + "loss": 1.3132, + "step": 6085 + }, + { + "epoch": 0.65, + "grad_norm": 0.07746410524265271, + "learning_rate": 0.0002820950264528979, + "loss": 1.3978, + "step": 6086 + }, + { + "epoch": 0.65, + "grad_norm": 0.07727541915447587, + "learning_rate": 0.00028193833632966293, + "loss": 1.3852, + "step": 6087 + }, + { + "epoch": 0.65, + "grad_norm": 0.07721133474886398, + "learning_rate": 0.0002817816726471566, + "loss": 1.4555, + "step": 6088 + }, + { + "epoch": 0.65, + "grad_norm": 0.07974176592718676, + "learning_rate": 0.0002816250354243748, + "loss": 1.4801, + "step": 6089 + }, + { + "epoch": 0.65, + "grad_norm": 0.06787223694888275, + "learning_rate": 0.00028146842468031007, + "loss": 1.4043, + "step": 6090 + }, + { + "epoch": 0.65, + "grad_norm": 0.08216216293404127, + "learning_rate": 0.00028131184043395254, + "loss": 1.3816, + "step": 6091 + }, + { + "epoch": 0.65, + "grad_norm": 0.07311183455847245, + "learning_rate": 0.000281155282704288, + "loss": 1.5151, + "step": 6092 + }, + { + "epoch": 0.65, + "grad_norm": 0.08228569645390815, + "learning_rate": 0.00028099875151029987, + "loss": 1.4022, + "step": 6093 + }, + { + "epoch": 0.66, + "grad_norm": 0.08468156268757646, + "learning_rate": 0.0002808422468709684, + "loss": 1.3635, + "step": 6094 + }, + { + "epoch": 0.66, + "grad_norm": 0.084115782220857, + "learning_rate": 0.00028068576880526985, + "loss": 1.5106, + "step": 6095 + }, + { + "epoch": 0.66, + "grad_norm": 0.0939884815741877, + "learning_rate": 0.0002805293173321779, + "loss": 1.4746, + "step": 6096 + }, + { + "epoch": 0.66, + "grad_norm": 0.08867327787743164, + "learning_rate": 0.00028037289247066305, + "loss": 1.3817, + "step": 6097 + }, + { + "epoch": 0.66, + "grad_norm": 0.0810061217356107, + "learning_rate": 0.000280216494239692, + "loss": 1.3271, + "step": 6098 + }, + { + "epoch": 0.66, + "grad_norm": 0.08379496101255957, + "learning_rate": 0.0002800601226582289, + "loss": 1.3737, + "step": 6099 + }, + { + "epoch": 0.66, + "grad_norm": 0.08031308779705486, + "learning_rate": 0.00027990377774523396, + "loss": 1.5601, + "step": 6100 + }, + { + "epoch": 0.66, + "grad_norm": 0.07753119254590604, + "learning_rate": 0.00027974745951966494, + "loss": 1.3183, + "step": 6101 + }, + { + "epoch": 0.66, + "grad_norm": 0.08531571995244437, + "learning_rate": 0.00027959116800047566, + "loss": 1.3538, + "step": 6102 + }, + { + "epoch": 0.66, + "grad_norm": 0.0776622313440044, + "learning_rate": 0.00027943490320661704, + "loss": 1.4598, + "step": 6103 + }, + { + "epoch": 0.66, + "grad_norm": 0.09492248497676697, + "learning_rate": 0.000279278665157037, + "loss": 1.3709, + "step": 6104 + }, + { + "epoch": 0.66, + "grad_norm": 0.08248818947863316, + "learning_rate": 0.0002791224538706797, + "loss": 1.3884, + "step": 6105 + }, + { + "epoch": 0.66, + "grad_norm": 0.07468677694976233, + "learning_rate": 0.0002789662693664862, + "loss": 1.3133, + "step": 6106 + }, + { + "epoch": 0.66, + "grad_norm": 0.08673493539274889, + "learning_rate": 0.00027881011166339483, + "loss": 1.3215, + "step": 6107 + }, + { + "epoch": 0.66, + "grad_norm": 0.08082725029580298, + "learning_rate": 0.00027865398078033977, + "loss": 1.4312, + "step": 6108 + }, + { + "epoch": 0.66, + "grad_norm": 0.08379905097619719, + "learning_rate": 0.0002784978767362528, + "loss": 1.419, + "step": 6109 + }, + { + "epoch": 0.66, + "grad_norm": 0.07778141120838956, + "learning_rate": 0.0002783417995500619, + "loss": 1.4632, + "step": 6110 + }, + { + "epoch": 0.66, + "grad_norm": 0.07634657455999405, + "learning_rate": 0.00027818574924069186, + "loss": 1.3575, + "step": 6111 + }, + { + "epoch": 0.66, + "grad_norm": 0.08029962788812359, + "learning_rate": 0.0002780297258270643, + "loss": 1.3785, + "step": 6112 + }, + { + "epoch": 0.66, + "grad_norm": 0.08130136262534324, + "learning_rate": 0.00027787372932809806, + "loss": 1.5693, + "step": 6113 + }, + { + "epoch": 0.66, + "grad_norm": 0.0788973332314687, + "learning_rate": 0.0002777177597627077, + "loss": 1.3804, + "step": 6114 + }, + { + "epoch": 0.66, + "grad_norm": 0.09548515561076097, + "learning_rate": 0.00027756181714980524, + "loss": 1.3901, + "step": 6115 + }, + { + "epoch": 0.66, + "grad_norm": 0.07356430507555382, + "learning_rate": 0.0002774059015082996, + "loss": 1.3691, + "step": 6116 + }, + { + "epoch": 0.66, + "grad_norm": 0.0895079629964563, + "learning_rate": 0.0002772500128570955, + "loss": 1.3651, + "step": 6117 + }, + { + "epoch": 0.66, + "grad_norm": 0.0756789159867376, + "learning_rate": 0.00027709415121509527, + "loss": 1.3808, + "step": 6118 + }, + { + "epoch": 0.66, + "grad_norm": 0.08174272585535419, + "learning_rate": 0.00027693831660119796, + "loss": 1.4582, + "step": 6119 + }, + { + "epoch": 0.66, + "grad_norm": 0.07347513506484865, + "learning_rate": 0.00027678250903429876, + "loss": 1.3427, + "step": 6120 + }, + { + "epoch": 0.66, + "grad_norm": 0.07628590655891802, + "learning_rate": 0.0002766267285332896, + "loss": 1.3316, + "step": 6121 + }, + { + "epoch": 0.66, + "grad_norm": 0.0805266209828289, + "learning_rate": 0.00027647097511706, + "loss": 1.4766, + "step": 6122 + }, + { + "epoch": 0.66, + "grad_norm": 0.07804202650616353, + "learning_rate": 0.0002763152488044951, + "loss": 1.3473, + "step": 6123 + }, + { + "epoch": 0.66, + "grad_norm": 0.07563813059212712, + "learning_rate": 0.0002761595496144773, + "loss": 1.4365, + "step": 6124 + }, + { + "epoch": 0.66, + "grad_norm": 0.07205259007008297, + "learning_rate": 0.00027600387756588615, + "loss": 1.4753, + "step": 6125 + }, + { + "epoch": 0.66, + "grad_norm": 0.0768814719100409, + "learning_rate": 0.00027584823267759684, + "loss": 1.2608, + "step": 6126 + }, + { + "epoch": 0.66, + "grad_norm": 0.07656294739845501, + "learning_rate": 0.000275692614968482, + "loss": 1.3841, + "step": 6127 + }, + { + "epoch": 0.66, + "grad_norm": 0.0785779592286055, + "learning_rate": 0.00027553702445741126, + "loss": 1.5065, + "step": 6128 + }, + { + "epoch": 0.66, + "grad_norm": 0.07947757326516629, + "learning_rate": 0.0002753814611632498, + "loss": 1.4559, + "step": 6129 + }, + { + "epoch": 0.66, + "grad_norm": 0.07434673630299542, + "learning_rate": 0.0002752259251048606, + "loss": 1.3781, + "step": 6130 + }, + { + "epoch": 0.66, + "grad_norm": 0.08122447339583906, + "learning_rate": 0.00027507041630110305, + "loss": 1.5396, + "step": 6131 + }, + { + "epoch": 0.66, + "grad_norm": 0.07766472263427851, + "learning_rate": 0.0002749149347708331, + "loss": 1.3898, + "step": 6132 + }, + { + "epoch": 0.66, + "grad_norm": 0.0875456038166925, + "learning_rate": 0.000274759480532903, + "loss": 1.3486, + "step": 6133 + }, + { + "epoch": 0.66, + "grad_norm": 0.08330838816033348, + "learning_rate": 0.00027460405360616244, + "loss": 1.37, + "step": 6134 + }, + { + "epoch": 0.66, + "grad_norm": 0.07743418564873557, + "learning_rate": 0.00027444865400945765, + "loss": 1.2946, + "step": 6135 + }, + { + "epoch": 0.66, + "grad_norm": 0.07452191764881552, + "learning_rate": 0.0002742932817616309, + "loss": 1.4033, + "step": 6136 + }, + { + "epoch": 0.66, + "grad_norm": 0.08104030355111651, + "learning_rate": 0.0002741379368815219, + "loss": 1.3705, + "step": 6137 + }, + { + "epoch": 0.66, + "grad_norm": 0.08512209537619303, + "learning_rate": 0.0002739826193879669, + "loss": 1.4029, + "step": 6138 + }, + { + "epoch": 0.66, + "grad_norm": 0.08401661215344652, + "learning_rate": 0.0002738273292997983, + "loss": 1.3075, + "step": 6139 + }, + { + "epoch": 0.66, + "grad_norm": 0.08496479321642669, + "learning_rate": 0.00027367206663584574, + "loss": 1.412, + "step": 6140 + }, + { + "epoch": 0.66, + "grad_norm": 0.07883745123661512, + "learning_rate": 0.00027351683141493567, + "loss": 1.3398, + "step": 6141 + }, + { + "epoch": 0.66, + "grad_norm": 0.07946198965375008, + "learning_rate": 0.00027336162365589045, + "loss": 1.3921, + "step": 6142 + }, + { + "epoch": 0.66, + "grad_norm": 0.06745687764508318, + "learning_rate": 0.00027320644337752964, + "loss": 1.4423, + "step": 6143 + }, + { + "epoch": 0.66, + "grad_norm": 0.08892866120135398, + "learning_rate": 0.00027305129059866954, + "loss": 1.2575, + "step": 6144 + }, + { + "epoch": 0.66, + "grad_norm": 0.06875335376318778, + "learning_rate": 0.00027289616533812276, + "loss": 1.4766, + "step": 6145 + }, + { + "epoch": 0.66, + "grad_norm": 0.08030815203424742, + "learning_rate": 0.00027274106761469884, + "loss": 1.3601, + "step": 6146 + }, + { + "epoch": 0.66, + "grad_norm": 0.07478088113386552, + "learning_rate": 0.00027258599744720414, + "loss": 1.4551, + "step": 6147 + }, + { + "epoch": 0.66, + "grad_norm": 0.07733691783191772, + "learning_rate": 0.000272430954854441, + "loss": 1.4606, + "step": 6148 + }, + { + "epoch": 0.66, + "grad_norm": 0.07488938900948164, + "learning_rate": 0.0002722759398552093, + "loss": 1.315, + "step": 6149 + }, + { + "epoch": 0.66, + "grad_norm": 0.07264299787776964, + "learning_rate": 0.00027212095246830506, + "loss": 1.4117, + "step": 6150 + }, + { + "epoch": 0.66, + "grad_norm": 0.08146802376307108, + "learning_rate": 0.00027196599271252074, + "loss": 1.3426, + "step": 6151 + }, + { + "epoch": 0.66, + "grad_norm": 0.07965692665262414, + "learning_rate": 0.00027181106060664617, + "loss": 1.5066, + "step": 6152 + }, + { + "epoch": 0.66, + "grad_norm": 0.08155946793072419, + "learning_rate": 0.0002716561561694673, + "loss": 1.3994, + "step": 6153 + }, + { + "epoch": 0.66, + "grad_norm": 0.07316359754028685, + "learning_rate": 0.00027150127941976635, + "loss": 1.395, + "step": 6154 + }, + { + "epoch": 0.66, + "grad_norm": 0.08202388565748747, + "learning_rate": 0.0002713464303763231, + "loss": 1.3386, + "step": 6155 + }, + { + "epoch": 0.66, + "grad_norm": 0.07804125380153318, + "learning_rate": 0.0002711916090579137, + "loss": 1.4255, + "step": 6156 + }, + { + "epoch": 0.66, + "grad_norm": 0.07175148851830035, + "learning_rate": 0.00027103681548331027, + "loss": 1.3847, + "step": 6157 + }, + { + "epoch": 0.66, + "grad_norm": 0.07387603354527797, + "learning_rate": 0.00027088204967128235, + "loss": 1.4942, + "step": 6158 + }, + { + "epoch": 0.66, + "grad_norm": 0.06952552748421588, + "learning_rate": 0.00027072731164059594, + "loss": 1.4904, + "step": 6159 + }, + { + "epoch": 0.66, + "grad_norm": 0.07546195447701311, + "learning_rate": 0.00027057260141001327, + "loss": 1.3508, + "step": 6160 + }, + { + "epoch": 0.66, + "grad_norm": 0.07874233894984566, + "learning_rate": 0.0002704179189982936, + "loss": 1.3431, + "step": 6161 + }, + { + "epoch": 0.66, + "grad_norm": 0.07561417364598048, + "learning_rate": 0.00027026326442419296, + "loss": 1.3988, + "step": 6162 + }, + { + "epoch": 0.66, + "grad_norm": 0.06969834031693835, + "learning_rate": 0.00027010863770646356, + "loss": 1.4481, + "step": 6163 + }, + { + "epoch": 0.66, + "grad_norm": 0.06957347570850005, + "learning_rate": 0.0002699540388638542, + "loss": 1.5234, + "step": 6164 + }, + { + "epoch": 0.66, + "grad_norm": 0.08021592576713846, + "learning_rate": 0.0002697994679151106, + "loss": 1.4236, + "step": 6165 + }, + { + "epoch": 0.66, + "grad_norm": 0.07249343492297182, + "learning_rate": 0.0002696449248789754, + "loss": 1.4235, + "step": 6166 + }, + { + "epoch": 0.66, + "grad_norm": 0.08347570438059976, + "learning_rate": 0.000269490409774187, + "loss": 1.4639, + "step": 6167 + }, + { + "epoch": 0.66, + "grad_norm": 0.06813592066830779, + "learning_rate": 0.0002693359226194812, + "loss": 1.4131, + "step": 6168 + }, + { + "epoch": 0.66, + "grad_norm": 0.088520289977426, + "learning_rate": 0.0002691814634335902, + "loss": 1.4653, + "step": 6169 + }, + { + "epoch": 0.66, + "grad_norm": 0.07927725480495933, + "learning_rate": 0.00026902703223524217, + "loss": 1.3032, + "step": 6170 + }, + { + "epoch": 0.66, + "grad_norm": 0.07476030815511266, + "learning_rate": 0.0002688726290431629, + "loss": 1.4893, + "step": 6171 + }, + { + "epoch": 0.66, + "grad_norm": 0.07592562154301802, + "learning_rate": 0.0002687182538760743, + "loss": 1.547, + "step": 6172 + }, + { + "epoch": 0.66, + "grad_norm": 0.07741661330328563, + "learning_rate": 0.0002685639067526948, + "loss": 1.432, + "step": 6173 + }, + { + "epoch": 0.66, + "grad_norm": 0.07284139646621345, + "learning_rate": 0.0002684095876917393, + "loss": 1.4776, + "step": 6174 + }, + { + "epoch": 0.66, + "grad_norm": 0.08675181858083143, + "learning_rate": 0.00026825529671191986, + "loss": 1.3249, + "step": 6175 + }, + { + "epoch": 0.66, + "grad_norm": 0.08411753433275078, + "learning_rate": 0.0002681010338319444, + "loss": 1.4295, + "step": 6176 + }, + { + "epoch": 0.66, + "grad_norm": 0.07837071521163565, + "learning_rate": 0.00026794679907051817, + "loss": 1.4625, + "step": 6177 + }, + { + "epoch": 0.66, + "grad_norm": 0.07019930824849117, + "learning_rate": 0.0002677925924463427, + "loss": 1.3814, + "step": 6178 + }, + { + "epoch": 0.66, + "grad_norm": 0.07544575050760528, + "learning_rate": 0.00026763841397811573, + "loss": 1.6822, + "step": 6179 + }, + { + "epoch": 0.66, + "grad_norm": 0.0711653064704804, + "learning_rate": 0.0002674842636845322, + "loss": 1.2985, + "step": 6180 + }, + { + "epoch": 0.66, + "grad_norm": 0.08226687927820503, + "learning_rate": 0.0002673301415842835, + "loss": 1.2929, + "step": 6181 + }, + { + "epoch": 0.66, + "grad_norm": 0.08082723301222589, + "learning_rate": 0.00026717604769605695, + "loss": 1.4702, + "step": 6182 + }, + { + "epoch": 0.66, + "grad_norm": 0.0771809400828876, + "learning_rate": 0.0002670219820385374, + "loss": 1.4699, + "step": 6183 + }, + { + "epoch": 0.66, + "grad_norm": 0.07712962428203328, + "learning_rate": 0.0002668679446304061, + "loss": 1.3207, + "step": 6184 + }, + { + "epoch": 0.66, + "grad_norm": 0.07212775681925493, + "learning_rate": 0.00026671393549033983, + "loss": 1.4173, + "step": 6185 + }, + { + "epoch": 0.66, + "grad_norm": 0.08585221084325963, + "learning_rate": 0.0002665599546370131, + "loss": 1.4432, + "step": 6186 + }, + { + "epoch": 0.67, + "grad_norm": 0.07161579555511431, + "learning_rate": 0.0002664060020890969, + "loss": 1.3093, + "step": 6187 + }, + { + "epoch": 0.67, + "grad_norm": 0.07166138838974309, + "learning_rate": 0.00026625207786525805, + "loss": 1.3763, + "step": 6188 + }, + { + "epoch": 0.67, + "grad_norm": 0.08179686758556896, + "learning_rate": 0.0002660981819841607, + "loss": 1.3632, + "step": 6189 + }, + { + "epoch": 0.67, + "grad_norm": 0.07687496958224152, + "learning_rate": 0.00026594431446446526, + "loss": 1.2778, + "step": 6190 + }, + { + "epoch": 0.67, + "grad_norm": 0.08429720606973103, + "learning_rate": 0.00026579047532482845, + "loss": 1.4235, + "step": 6191 + }, + { + "epoch": 0.67, + "grad_norm": 0.07874039330715418, + "learning_rate": 0.00026563666458390394, + "loss": 1.468, + "step": 6192 + }, + { + "epoch": 0.67, + "grad_norm": 0.0849153270188568, + "learning_rate": 0.00026548288226034203, + "loss": 1.3426, + "step": 6193 + }, + { + "epoch": 0.67, + "grad_norm": 0.07719925522577803, + "learning_rate": 0.00026532912837278893, + "loss": 1.3349, + "step": 6194 + }, + { + "epoch": 0.67, + "grad_norm": 0.07244606222045334, + "learning_rate": 0.0002651754029398883, + "loss": 1.4874, + "step": 6195 + }, + { + "epoch": 0.67, + "grad_norm": 0.07435186661080587, + "learning_rate": 0.0002650217059802794, + "loss": 1.4572, + "step": 6196 + }, + { + "epoch": 0.67, + "grad_norm": 0.07718559468867621, + "learning_rate": 0.0002648680375125989, + "loss": 1.4856, + "step": 6197 + }, + { + "epoch": 0.67, + "grad_norm": 0.08202214417553845, + "learning_rate": 0.0002647143975554793, + "loss": 1.3457, + "step": 6198 + }, + { + "epoch": 0.67, + "grad_norm": 0.0760250326747222, + "learning_rate": 0.0002645607861275502, + "loss": 1.439, + "step": 6199 + }, + { + "epoch": 0.67, + "grad_norm": 0.07603471814525921, + "learning_rate": 0.00026440720324743763, + "loss": 1.3743, + "step": 6200 + }, + { + "epoch": 0.67, + "grad_norm": 0.0828742228552649, + "learning_rate": 0.0002642536489337636, + "loss": 1.4984, + "step": 6201 + }, + { + "epoch": 0.67, + "grad_norm": 0.08352744199976005, + "learning_rate": 0.00026410012320514744, + "loss": 1.5592, + "step": 6202 + }, + { + "epoch": 0.67, + "grad_norm": 0.07460991618324651, + "learning_rate": 0.0002639466260802048, + "loss": 1.4728, + "step": 6203 + }, + { + "epoch": 0.67, + "grad_norm": 0.08349369977006431, + "learning_rate": 0.0002637931575775474, + "loss": 1.3287, + "step": 6204 + }, + { + "epoch": 0.67, + "grad_norm": 0.08921844889478096, + "learning_rate": 0.00026363971771578413, + "loss": 1.4241, + "step": 6205 + }, + { + "epoch": 0.67, + "grad_norm": 0.07896415452446431, + "learning_rate": 0.00026348630651352, + "loss": 1.3908, + "step": 6206 + }, + { + "epoch": 0.67, + "grad_norm": 0.08815108621019546, + "learning_rate": 0.00026333292398935635, + "loss": 1.4188, + "step": 6207 + }, + { + "epoch": 0.67, + "grad_norm": 0.0785662676339109, + "learning_rate": 0.00026317957016189155, + "loss": 1.4875, + "step": 6208 + }, + { + "epoch": 0.67, + "grad_norm": 0.07435226352357563, + "learning_rate": 0.0002630262450497205, + "loss": 1.4019, + "step": 6209 + }, + { + "epoch": 0.67, + "grad_norm": 0.07869236534132566, + "learning_rate": 0.0002628729486714341, + "loss": 1.5194, + "step": 6210 + }, + { + "epoch": 0.67, + "grad_norm": 0.08111495196318634, + "learning_rate": 0.0002627196810456201, + "loss": 1.4171, + "step": 6211 + }, + { + "epoch": 0.67, + "grad_norm": 0.0752417116249629, + "learning_rate": 0.000262566442190863, + "loss": 1.1826, + "step": 6212 + }, + { + "epoch": 0.67, + "grad_norm": 0.07090451927739674, + "learning_rate": 0.0002624132321257432, + "loss": 1.4379, + "step": 6213 + }, + { + "epoch": 0.67, + "grad_norm": 0.072209257295845, + "learning_rate": 0.00026226005086883806, + "loss": 1.2822, + "step": 6214 + }, + { + "epoch": 0.67, + "grad_norm": 0.0775073866558998, + "learning_rate": 0.0002621068984387216, + "loss": 1.6329, + "step": 6215 + }, + { + "epoch": 0.67, + "grad_norm": 0.07276027526755828, + "learning_rate": 0.00026195377485396375, + "loss": 1.4094, + "step": 6216 + }, + { + "epoch": 0.67, + "grad_norm": 0.07660187872068078, + "learning_rate": 0.0002618006801331313, + "loss": 1.4048, + "step": 6217 + }, + { + "epoch": 0.67, + "grad_norm": 0.07561214597012043, + "learning_rate": 0.00026164761429478767, + "loss": 1.3556, + "step": 6218 + }, + { + "epoch": 0.67, + "grad_norm": 0.07608383237940829, + "learning_rate": 0.00026149457735749235, + "loss": 1.4997, + "step": 6219 + }, + { + "epoch": 0.67, + "grad_norm": 0.08117197065650657, + "learning_rate": 0.00026134156933980187, + "loss": 1.5299, + "step": 6220 + }, + { + "epoch": 0.67, + "grad_norm": 0.07667811374754589, + "learning_rate": 0.000261188590260269, + "loss": 1.5114, + "step": 6221 + }, + { + "epoch": 0.67, + "grad_norm": 0.07236230591516954, + "learning_rate": 0.0002610356401374427, + "loss": 1.4631, + "step": 6222 + }, + { + "epoch": 0.67, + "grad_norm": 0.0729862517775749, + "learning_rate": 0.0002608827189898688, + "loss": 1.4073, + "step": 6223 + }, + { + "epoch": 0.67, + "grad_norm": 0.08487372745622469, + "learning_rate": 0.0002607298268360899, + "loss": 1.5601, + "step": 6224 + }, + { + "epoch": 0.67, + "grad_norm": 0.0895758449211697, + "learning_rate": 0.0002605769636946441, + "loss": 1.2656, + "step": 6225 + }, + { + "epoch": 0.67, + "grad_norm": 0.07973668237947351, + "learning_rate": 0.00026042412958406715, + "loss": 1.2544, + "step": 6226 + }, + { + "epoch": 0.67, + "grad_norm": 0.07568869419754443, + "learning_rate": 0.00026027132452289013, + "loss": 1.4048, + "step": 6227 + }, + { + "epoch": 0.67, + "grad_norm": 0.0744278379703313, + "learning_rate": 0.0002601185485296418, + "loss": 1.4452, + "step": 6228 + }, + { + "epoch": 0.67, + "grad_norm": 0.07385615820726067, + "learning_rate": 0.00025996580162284614, + "loss": 1.3883, + "step": 6229 + }, + { + "epoch": 0.67, + "grad_norm": 0.07289747818726597, + "learning_rate": 0.0002598130838210246, + "loss": 1.5465, + "step": 6230 + }, + { + "epoch": 0.67, + "grad_norm": 0.07867781806652333, + "learning_rate": 0.0002596603951426949, + "loss": 1.4572, + "step": 6231 + }, + { + "epoch": 0.67, + "grad_norm": 0.07007763050297192, + "learning_rate": 0.00025950773560637076, + "loss": 1.5115, + "step": 6232 + }, + { + "epoch": 0.67, + "grad_norm": 0.0781571173215587, + "learning_rate": 0.00025935510523056267, + "loss": 1.2784, + "step": 6233 + }, + { + "epoch": 0.67, + "grad_norm": 0.08274854533054618, + "learning_rate": 0.00025920250403377787, + "loss": 1.4113, + "step": 6234 + }, + { + "epoch": 0.67, + "grad_norm": 0.0769569700994162, + "learning_rate": 0.0002590499320345195, + "loss": 1.4623, + "step": 6235 + }, + { + "epoch": 0.67, + "grad_norm": 0.07322203756357455, + "learning_rate": 0.0002588973892512875, + "loss": 1.3969, + "step": 6236 + }, + { + "epoch": 0.67, + "grad_norm": 0.07285908482504222, + "learning_rate": 0.00025874487570257864, + "loss": 1.3235, + "step": 6237 + }, + { + "epoch": 0.67, + "grad_norm": 0.0824245182399957, + "learning_rate": 0.0002585923914068851, + "loss": 1.3926, + "step": 6238 + }, + { + "epoch": 0.67, + "grad_norm": 0.08538154374456672, + "learning_rate": 0.00025843993638269616, + "loss": 1.3852, + "step": 6239 + }, + { + "epoch": 0.67, + "grad_norm": 0.08303180796543792, + "learning_rate": 0.000258287510648498, + "loss": 1.291, + "step": 6240 + }, + { + "epoch": 0.67, + "grad_norm": 0.0744008611060297, + "learning_rate": 0.00025813511422277224, + "loss": 1.4407, + "step": 6241 + }, + { + "epoch": 0.67, + "grad_norm": 0.07491379607406161, + "learning_rate": 0.0002579827471239978, + "loss": 1.5728, + "step": 6242 + }, + { + "epoch": 0.67, + "grad_norm": 0.07224802257116374, + "learning_rate": 0.0002578304093706497, + "loss": 1.4437, + "step": 6243 + }, + { + "epoch": 0.67, + "grad_norm": 0.08108850715756206, + "learning_rate": 0.00025767810098119927, + "loss": 1.527, + "step": 6244 + }, + { + "epoch": 0.67, + "grad_norm": 0.07409277926113383, + "learning_rate": 0.00025752582197411446, + "loss": 1.5389, + "step": 6245 + }, + { + "epoch": 0.67, + "grad_norm": 0.08538872621157978, + "learning_rate": 0.00025737357236785984, + "loss": 1.3672, + "step": 6246 + }, + { + "epoch": 0.67, + "grad_norm": 0.08357544942655332, + "learning_rate": 0.0002572213521808959, + "loss": 1.5069, + "step": 6247 + }, + { + "epoch": 0.67, + "grad_norm": 0.07945973711646714, + "learning_rate": 0.0002570691614316802, + "loss": 1.368, + "step": 6248 + }, + { + "epoch": 0.67, + "grad_norm": 0.08676847576563627, + "learning_rate": 0.00025691700013866616, + "loss": 1.3731, + "step": 6249 + }, + { + "epoch": 0.67, + "grad_norm": 0.08102362106338495, + "learning_rate": 0.00025676486832030366, + "loss": 1.4159, + "step": 6250 + }, + { + "epoch": 0.67, + "grad_norm": 0.08972372454953455, + "learning_rate": 0.00025661276599503945, + "loss": 1.3462, + "step": 6251 + }, + { + "epoch": 0.67, + "grad_norm": 0.08439968518474841, + "learning_rate": 0.0002564606931813166, + "loss": 1.3473, + "step": 6252 + }, + { + "epoch": 0.67, + "grad_norm": 0.07589797298359928, + "learning_rate": 0.0002563086498975742, + "loss": 1.4392, + "step": 6253 + }, + { + "epoch": 0.67, + "grad_norm": 0.07431464211955395, + "learning_rate": 0.00025615663616224805, + "loss": 1.3391, + "step": 6254 + }, + { + "epoch": 0.67, + "grad_norm": 0.07807770877253503, + "learning_rate": 0.00025600465199377066, + "loss": 1.3193, + "step": 6255 + }, + { + "epoch": 0.67, + "grad_norm": 0.07428484878269796, + "learning_rate": 0.00025585269741057026, + "loss": 1.4478, + "step": 6256 + }, + { + "epoch": 0.67, + "grad_norm": 0.07081123718795393, + "learning_rate": 0.00025570077243107194, + "loss": 1.4212, + "step": 6257 + }, + { + "epoch": 0.67, + "grad_norm": 0.08312259339625878, + "learning_rate": 0.00025554887707369733, + "loss": 1.3672, + "step": 6258 + }, + { + "epoch": 0.67, + "grad_norm": 0.09350706453268828, + "learning_rate": 0.0002553970113568642, + "loss": 1.5117, + "step": 6259 + }, + { + "epoch": 0.67, + "grad_norm": 0.07624662839361142, + "learning_rate": 0.0002552451752989865, + "loss": 1.4194, + "step": 6260 + }, + { + "epoch": 0.67, + "grad_norm": 0.08381231697762678, + "learning_rate": 0.0002550933689184751, + "loss": 1.4533, + "step": 6261 + }, + { + "epoch": 0.67, + "grad_norm": 0.0744415792079469, + "learning_rate": 0.00025494159223373727, + "loss": 1.4775, + "step": 6262 + }, + { + "epoch": 0.67, + "grad_norm": 0.07773631383404493, + "learning_rate": 0.00025478984526317597, + "loss": 1.3141, + "step": 6263 + }, + { + "epoch": 0.67, + "grad_norm": 0.07366855260516596, + "learning_rate": 0.0002546381280251913, + "loss": 1.4199, + "step": 6264 + }, + { + "epoch": 0.67, + "grad_norm": 0.0801259194771756, + "learning_rate": 0.0002544864405381797, + "loss": 1.2572, + "step": 6265 + }, + { + "epoch": 0.67, + "grad_norm": 0.07759226269944282, + "learning_rate": 0.0002543347828205335, + "loss": 1.3919, + "step": 6266 + }, + { + "epoch": 0.67, + "grad_norm": 0.07901655886822204, + "learning_rate": 0.0002541831548906417, + "loss": 1.4355, + "step": 6267 + }, + { + "epoch": 0.67, + "grad_norm": 0.08200143516361498, + "learning_rate": 0.0002540315567668901, + "loss": 1.3433, + "step": 6268 + }, + { + "epoch": 0.67, + "grad_norm": 0.0860479874628173, + "learning_rate": 0.00025387998846766014, + "loss": 1.3361, + "step": 6269 + }, + { + "epoch": 0.67, + "grad_norm": 0.0750332981935864, + "learning_rate": 0.00025372845001132995, + "loss": 1.39, + "step": 6270 + }, + { + "epoch": 0.67, + "grad_norm": 0.08091321341472489, + "learning_rate": 0.0002535769414162744, + "loss": 1.3943, + "step": 6271 + }, + { + "epoch": 0.67, + "grad_norm": 0.08131512343994637, + "learning_rate": 0.0002534254627008641, + "loss": 1.3901, + "step": 6272 + }, + { + "epoch": 0.67, + "grad_norm": 0.08340487879640397, + "learning_rate": 0.0002532740138834665, + "loss": 1.4814, + "step": 6273 + }, + { + "epoch": 0.67, + "grad_norm": 0.0823885510183785, + "learning_rate": 0.00025312259498244547, + "loss": 1.4846, + "step": 6274 + }, + { + "epoch": 0.67, + "grad_norm": 0.08451185254200341, + "learning_rate": 0.00025297120601616073, + "loss": 1.2977, + "step": 6275 + }, + { + "epoch": 0.67, + "grad_norm": 0.08059700760409108, + "learning_rate": 0.0002528198470029689, + "loss": 1.3714, + "step": 6276 + }, + { + "epoch": 0.67, + "grad_norm": 0.08329022693965066, + "learning_rate": 0.00025266851796122296, + "loss": 1.3995, + "step": 6277 + }, + { + "epoch": 0.67, + "grad_norm": 0.07771378342062599, + "learning_rate": 0.0002525172189092717, + "loss": 1.4348, + "step": 6278 + }, + { + "epoch": 0.67, + "grad_norm": 0.08460287024642418, + "learning_rate": 0.00025236594986546113, + "loss": 1.4236, + "step": 6279 + }, + { + "epoch": 0.68, + "grad_norm": 0.07704392579140935, + "learning_rate": 0.00025221471084813275, + "loss": 1.3786, + "step": 6280 + }, + { + "epoch": 0.68, + "grad_norm": 0.08446989165175656, + "learning_rate": 0.0002520635018756248, + "loss": 1.3692, + "step": 6281 + }, + { + "epoch": 0.68, + "grad_norm": 0.08354359803524838, + "learning_rate": 0.00025191232296627197, + "loss": 1.569, + "step": 6282 + }, + { + "epoch": 0.68, + "grad_norm": 0.08001104967843899, + "learning_rate": 0.0002517611741384055, + "loss": 1.3756, + "step": 6283 + }, + { + "epoch": 0.68, + "grad_norm": 0.0799008608461087, + "learning_rate": 0.0002516100554103523, + "loss": 1.4743, + "step": 6284 + }, + { + "epoch": 0.68, + "grad_norm": 0.08895399103852052, + "learning_rate": 0.0002514589668004362, + "loss": 1.4824, + "step": 6285 + }, + { + "epoch": 0.68, + "grad_norm": 0.08389049233996097, + "learning_rate": 0.00025130790832697737, + "loss": 1.4368, + "step": 6286 + }, + { + "epoch": 0.68, + "grad_norm": 0.08167409917869137, + "learning_rate": 0.0002511568800082919, + "loss": 1.4084, + "step": 6287 + }, + { + "epoch": 0.68, + "grad_norm": 0.0820738335616238, + "learning_rate": 0.00025100588186269257, + "loss": 1.272, + "step": 6288 + }, + { + "epoch": 0.68, + "grad_norm": 0.07262380780001264, + "learning_rate": 0.0002508549139084887, + "loss": 1.5007, + "step": 6289 + }, + { + "epoch": 0.68, + "grad_norm": 0.08250263142633718, + "learning_rate": 0.00025070397616398543, + "loss": 1.291, + "step": 6290 + }, + { + "epoch": 0.68, + "grad_norm": 0.07324180126088345, + "learning_rate": 0.00025055306864748433, + "loss": 1.3554, + "step": 6291 + }, + { + "epoch": 0.68, + "grad_norm": 0.07491530886846616, + "learning_rate": 0.0002504021913772836, + "loss": 1.3406, + "step": 6292 + }, + { + "epoch": 0.68, + "grad_norm": 0.07998014625838289, + "learning_rate": 0.00025025134437167793, + "loss": 1.522, + "step": 6293 + }, + { + "epoch": 0.68, + "grad_norm": 0.08076320367760288, + "learning_rate": 0.0002501005276489575, + "loss": 1.3689, + "step": 6294 + }, + { + "epoch": 0.68, + "grad_norm": 0.0781089542719641, + "learning_rate": 0.00024994974122740965, + "loss": 1.4426, + "step": 6295 + }, + { + "epoch": 0.68, + "grad_norm": 0.07999448507874525, + "learning_rate": 0.00024979898512531793, + "loss": 1.4671, + "step": 6296 + }, + { + "epoch": 0.68, + "grad_norm": 0.07443936163831828, + "learning_rate": 0.00024964825936096163, + "loss": 1.4709, + "step": 6297 + }, + { + "epoch": 0.68, + "grad_norm": 0.07752927506986254, + "learning_rate": 0.00024949756395261703, + "loss": 1.4224, + "step": 6298 + }, + { + "epoch": 0.68, + "grad_norm": 0.07297892064768649, + "learning_rate": 0.00024934689891855657, + "loss": 1.4106, + "step": 6299 + }, + { + "epoch": 0.68, + "grad_norm": 0.0705016410192572, + "learning_rate": 0.00024919626427704874, + "loss": 1.4373, + "step": 6300 + }, + { + "epoch": 0.68, + "grad_norm": 0.06998077427911564, + "learning_rate": 0.0002490456600463583, + "loss": 1.3348, + "step": 6301 + }, + { + "epoch": 0.68, + "grad_norm": 0.08936338441734996, + "learning_rate": 0.0002488950862447469, + "loss": 1.4434, + "step": 6302 + }, + { + "epoch": 0.68, + "grad_norm": 0.08152473768427583, + "learning_rate": 0.00024874454289047196, + "loss": 1.3254, + "step": 6303 + }, + { + "epoch": 0.68, + "grad_norm": 0.0811328676071586, + "learning_rate": 0.0002485940300017873, + "loss": 1.2318, + "step": 6304 + }, + { + "epoch": 0.68, + "grad_norm": 0.07536323950532108, + "learning_rate": 0.00024844354759694337, + "loss": 1.3747, + "step": 6305 + }, + { + "epoch": 0.68, + "grad_norm": 0.08189574613290967, + "learning_rate": 0.0002482930956941865, + "loss": 1.3824, + "step": 6306 + }, + { + "epoch": 0.68, + "grad_norm": 0.08466969026863551, + "learning_rate": 0.0002481426743117594, + "loss": 1.4511, + "step": 6307 + }, + { + "epoch": 0.68, + "grad_norm": 0.07700482866848125, + "learning_rate": 0.00024799228346790154, + "loss": 1.3756, + "step": 6308 + }, + { + "epoch": 0.68, + "grad_norm": 0.07394423034126002, + "learning_rate": 0.00024784192318084796, + "loss": 1.3997, + "step": 6309 + }, + { + "epoch": 0.68, + "grad_norm": 0.08163148393060057, + "learning_rate": 0.0002476915934688305, + "loss": 1.3569, + "step": 6310 + }, + { + "epoch": 0.68, + "grad_norm": 0.08245747676419808, + "learning_rate": 0.00024754129435007756, + "loss": 1.4508, + "step": 6311 + }, + { + "epoch": 0.68, + "grad_norm": 0.08316884689132627, + "learning_rate": 0.00024739102584281266, + "loss": 1.5506, + "step": 6312 + }, + { + "epoch": 0.68, + "grad_norm": 0.08981091998302579, + "learning_rate": 0.0002472407879652567, + "loss": 1.3847, + "step": 6313 + }, + { + "epoch": 0.68, + "grad_norm": 0.07069410558333868, + "learning_rate": 0.00024709058073562684, + "loss": 1.3207, + "step": 6314 + }, + { + "epoch": 0.68, + "grad_norm": 0.07217099140842033, + "learning_rate": 0.0002469404041721358, + "loss": 1.2995, + "step": 6315 + }, + { + "epoch": 0.68, + "grad_norm": 0.0819390025189319, + "learning_rate": 0.0002467902582929931, + "loss": 1.3276, + "step": 6316 + }, + { + "epoch": 0.68, + "grad_norm": 0.09764517482898454, + "learning_rate": 0.0002466401431164048, + "loss": 1.3504, + "step": 6317 + }, + { + "epoch": 0.68, + "grad_norm": 0.08663621031922736, + "learning_rate": 0.0002464900586605724, + "loss": 1.4582, + "step": 6318 + }, + { + "epoch": 0.68, + "grad_norm": 0.07135499230948997, + "learning_rate": 0.0002463400049436944, + "loss": 1.3456, + "step": 6319 + }, + { + "epoch": 0.68, + "grad_norm": 0.07806358456612215, + "learning_rate": 0.0002461899819839654, + "loss": 1.4174, + "step": 6320 + }, + { + "epoch": 0.68, + "grad_norm": 0.08049798140545715, + "learning_rate": 0.000246039989799576, + "loss": 1.3761, + "step": 6321 + }, + { + "epoch": 0.68, + "grad_norm": 0.08031384734125688, + "learning_rate": 0.00024589002840871355, + "loss": 1.374, + "step": 6322 + }, + { + "epoch": 0.68, + "grad_norm": 0.07909678000250613, + "learning_rate": 0.00024574009782956096, + "loss": 1.6276, + "step": 6323 + }, + { + "epoch": 0.68, + "grad_norm": 0.07584212527403288, + "learning_rate": 0.0002455901980802983, + "loss": 1.428, + "step": 6324 + }, + { + "epoch": 0.68, + "grad_norm": 0.08462804630783761, + "learning_rate": 0.000245440329179101, + "loss": 1.3807, + "step": 6325 + }, + { + "epoch": 0.68, + "grad_norm": 0.09435281069530776, + "learning_rate": 0.0002452904911441414, + "loss": 1.3654, + "step": 6326 + }, + { + "epoch": 0.68, + "grad_norm": 0.08026059661730983, + "learning_rate": 0.0002451406839935881, + "loss": 1.2393, + "step": 6327 + }, + { + "epoch": 0.68, + "grad_norm": 0.09463728059653936, + "learning_rate": 0.00024499090774560524, + "loss": 1.3196, + "step": 6328 + }, + { + "epoch": 0.68, + "grad_norm": 0.08561772489183278, + "learning_rate": 0.00024484116241835403, + "loss": 1.4476, + "step": 6329 + }, + { + "epoch": 0.68, + "grad_norm": 0.08344976913598492, + "learning_rate": 0.00024469144802999164, + "loss": 1.3839, + "step": 6330 + }, + { + "epoch": 0.68, + "grad_norm": 0.0855239337159454, + "learning_rate": 0.0002445417645986713, + "loss": 1.5262, + "step": 6331 + }, + { + "epoch": 0.68, + "grad_norm": 0.08926849614318069, + "learning_rate": 0.00024439211214254277, + "loss": 1.3539, + "step": 6332 + }, + { + "epoch": 0.68, + "grad_norm": 0.08822555963908468, + "learning_rate": 0.0002442424906797519, + "loss": 1.3936, + "step": 6333 + }, + { + "epoch": 0.68, + "grad_norm": 0.08578391466405638, + "learning_rate": 0.0002440929002284406, + "loss": 1.4229, + "step": 6334 + }, + { + "epoch": 0.68, + "grad_norm": 0.086223295049479, + "learning_rate": 0.00024394334080674734, + "loss": 1.3489, + "step": 6335 + }, + { + "epoch": 0.68, + "grad_norm": 0.10149756749252606, + "learning_rate": 0.00024379381243280703, + "loss": 1.5394, + "step": 6336 + }, + { + "epoch": 0.68, + "grad_norm": 0.08212011780136454, + "learning_rate": 0.00024364431512475, + "loss": 1.4554, + "step": 6337 + }, + { + "epoch": 0.68, + "grad_norm": 0.09010571016682901, + "learning_rate": 0.00024349484890070355, + "loss": 1.4116, + "step": 6338 + }, + { + "epoch": 0.68, + "grad_norm": 0.10287841772251785, + "learning_rate": 0.00024334541377879116, + "loss": 1.4102, + "step": 6339 + }, + { + "epoch": 0.68, + "grad_norm": 0.08646469129338254, + "learning_rate": 0.00024319600977713203, + "loss": 1.3129, + "step": 6340 + }, + { + "epoch": 0.68, + "grad_norm": 0.07477706387936801, + "learning_rate": 0.00024304663691384205, + "loss": 1.3683, + "step": 6341 + }, + { + "epoch": 0.68, + "grad_norm": 0.0742561087884464, + "learning_rate": 0.00024289729520703335, + "loss": 1.4076, + "step": 6342 + }, + { + "epoch": 0.68, + "grad_norm": 0.0727554495538966, + "learning_rate": 0.00024274798467481396, + "loss": 1.391, + "step": 6343 + }, + { + "epoch": 0.68, + "grad_norm": 0.09096214796504422, + "learning_rate": 0.00024259870533528815, + "loss": 1.3449, + "step": 6344 + }, + { + "epoch": 0.68, + "grad_norm": 0.07170963499794096, + "learning_rate": 0.00024244945720655693, + "loss": 1.3732, + "step": 6345 + }, + { + "epoch": 0.68, + "grad_norm": 0.07006967689524883, + "learning_rate": 0.0002423002403067167, + "loss": 1.5636, + "step": 6346 + }, + { + "epoch": 0.68, + "grad_norm": 0.07720905952192304, + "learning_rate": 0.00024215105465386078, + "loss": 1.2022, + "step": 6347 + }, + { + "epoch": 0.68, + "grad_norm": 0.07092099425334585, + "learning_rate": 0.0002420019002660786, + "loss": 1.3869, + "step": 6348 + }, + { + "epoch": 0.68, + "grad_norm": 0.07731471823323678, + "learning_rate": 0.0002418527771614553, + "loss": 1.375, + "step": 6349 + }, + { + "epoch": 0.68, + "grad_norm": 0.0670229260877233, + "learning_rate": 0.00024170368535807274, + "loss": 1.3853, + "step": 6350 + }, + { + "epoch": 0.68, + "grad_norm": 0.07526592555668116, + "learning_rate": 0.00024155462487400898, + "loss": 1.4127, + "step": 6351 + }, + { + "epoch": 0.68, + "grad_norm": 0.0824513831130822, + "learning_rate": 0.00024140559572733778, + "loss": 1.4122, + "step": 6352 + }, + { + "epoch": 0.68, + "grad_norm": 0.06022553289796199, + "learning_rate": 0.0002412565979361298, + "loss": 1.4234, + "step": 6353 + }, + { + "epoch": 0.68, + "grad_norm": 0.06856769172839336, + "learning_rate": 0.00024110763151845112, + "loss": 1.3135, + "step": 6354 + }, + { + "epoch": 0.68, + "grad_norm": 0.07706925472267376, + "learning_rate": 0.00024095869649236491, + "loss": 1.295, + "step": 6355 + }, + { + "epoch": 0.68, + "grad_norm": 0.08128807781499314, + "learning_rate": 0.00024080979287592952, + "loss": 1.4854, + "step": 6356 + }, + { + "epoch": 0.68, + "grad_norm": 0.07997548107576101, + "learning_rate": 0.0002406609206872004, + "loss": 1.4402, + "step": 6357 + }, + { + "epoch": 0.68, + "grad_norm": 0.0812986548596618, + "learning_rate": 0.00024051207994422885, + "loss": 1.3249, + "step": 6358 + }, + { + "epoch": 0.68, + "grad_norm": 0.07668872546913742, + "learning_rate": 0.00024036327066506203, + "loss": 1.4075, + "step": 6359 + }, + { + "epoch": 0.68, + "grad_norm": 0.07576029951246802, + "learning_rate": 0.00024021449286774378, + "loss": 1.3767, + "step": 6360 + }, + { + "epoch": 0.68, + "grad_norm": 0.0705989781571333, + "learning_rate": 0.00024006574657031406, + "loss": 1.4834, + "step": 6361 + }, + { + "epoch": 0.68, + "grad_norm": 0.08410816163192895, + "learning_rate": 0.00023991703179080848, + "loss": 1.3311, + "step": 6362 + }, + { + "epoch": 0.68, + "grad_norm": 0.0744909882298907, + "learning_rate": 0.00023976834854725943, + "loss": 1.4475, + "step": 6363 + }, + { + "epoch": 0.68, + "grad_norm": 0.07417287239926322, + "learning_rate": 0.00023961969685769568, + "loss": 1.4801, + "step": 6364 + }, + { + "epoch": 0.68, + "grad_norm": 0.08508771858692672, + "learning_rate": 0.00023947107674014096, + "loss": 1.3167, + "step": 6365 + }, + { + "epoch": 0.68, + "grad_norm": 0.08704235220539557, + "learning_rate": 0.00023932248821261637, + "loss": 1.4221, + "step": 6366 + }, + { + "epoch": 0.68, + "grad_norm": 0.06998956050843806, + "learning_rate": 0.00023917393129313896, + "loss": 1.4133, + "step": 6367 + }, + { + "epoch": 0.68, + "grad_norm": 0.07324472581915419, + "learning_rate": 0.00023902540599972144, + "loss": 1.378, + "step": 6368 + }, + { + "epoch": 0.68, + "grad_norm": 0.07294277583373579, + "learning_rate": 0.00023887691235037313, + "loss": 1.4382, + "step": 6369 + }, + { + "epoch": 0.68, + "grad_norm": 0.08534875004191578, + "learning_rate": 0.0002387284503630996, + "loss": 1.4318, + "step": 6370 + }, + { + "epoch": 0.68, + "grad_norm": 0.07220491552722837, + "learning_rate": 0.00023858002005590202, + "loss": 1.4859, + "step": 6371 + }, + { + "epoch": 0.68, + "grad_norm": 0.08203133231113481, + "learning_rate": 0.00023843162144677828, + "loss": 1.4108, + "step": 6372 + }, + { + "epoch": 0.69, + "grad_norm": 0.07399688753489576, + "learning_rate": 0.00023828325455372236, + "loss": 1.4796, + "step": 6373 + }, + { + "epoch": 0.69, + "grad_norm": 0.08070783051161995, + "learning_rate": 0.00023813491939472393, + "loss": 1.344, + "step": 6374 + }, + { + "epoch": 0.69, + "grad_norm": 0.08277727679169404, + "learning_rate": 0.0002379866159877695, + "loss": 1.2994, + "step": 6375 + }, + { + "epoch": 0.69, + "grad_norm": 0.07044262154113139, + "learning_rate": 0.00023783834435084117, + "loss": 1.3152, + "step": 6376 + }, + { + "epoch": 0.69, + "grad_norm": 0.07933013616591182, + "learning_rate": 0.0002376901045019172, + "loss": 1.4583, + "step": 6377 + }, + { + "epoch": 0.69, + "grad_norm": 0.08308805766312702, + "learning_rate": 0.0002375418964589724, + "loss": 1.2326, + "step": 6378 + }, + { + "epoch": 0.69, + "grad_norm": 0.08249281840241653, + "learning_rate": 0.00023739372023997763, + "loss": 1.4402, + "step": 6379 + }, + { + "epoch": 0.69, + "grad_norm": 0.09484665425518647, + "learning_rate": 0.00023724557586289946, + "loss": 1.1794, + "step": 6380 + }, + { + "epoch": 0.69, + "grad_norm": 0.0739011722621357, + "learning_rate": 0.0002370974633457011, + "loss": 1.3369, + "step": 6381 + }, + { + "epoch": 0.69, + "grad_norm": 0.07765413859608447, + "learning_rate": 0.00023694938270634188, + "loss": 1.3658, + "step": 6382 + }, + { + "epoch": 0.69, + "grad_norm": 0.08727919704600484, + "learning_rate": 0.0002368013339627766, + "loss": 1.3588, + "step": 6383 + }, + { + "epoch": 0.69, + "grad_norm": 0.07282288372917742, + "learning_rate": 0.00023665331713295702, + "loss": 1.3357, + "step": 6384 + }, + { + "epoch": 0.69, + "grad_norm": 0.07995318342105713, + "learning_rate": 0.00023650533223483085, + "loss": 1.4342, + "step": 6385 + }, + { + "epoch": 0.69, + "grad_norm": 0.07350102385403665, + "learning_rate": 0.00023635737928634148, + "loss": 1.4507, + "step": 6386 + }, + { + "epoch": 0.69, + "grad_norm": 0.07565174355512315, + "learning_rate": 0.0002362094583054286, + "loss": 1.4482, + "step": 6387 + }, + { + "epoch": 0.69, + "grad_norm": 0.07992476149419737, + "learning_rate": 0.00023606156931002838, + "loss": 1.4335, + "step": 6388 + }, + { + "epoch": 0.69, + "grad_norm": 0.08573538817338362, + "learning_rate": 0.00023591371231807296, + "loss": 1.397, + "step": 6389 + }, + { + "epoch": 0.69, + "grad_norm": 0.08571710420595538, + "learning_rate": 0.00023576588734749022, + "loss": 1.4763, + "step": 6390 + }, + { + "epoch": 0.69, + "grad_norm": 0.0731916157652849, + "learning_rate": 0.00023561809441620458, + "loss": 1.3818, + "step": 6391 + }, + { + "epoch": 0.69, + "grad_norm": 0.08771184646456208, + "learning_rate": 0.00023547033354213658, + "loss": 1.3579, + "step": 6392 + }, + { + "epoch": 0.69, + "grad_norm": 0.077798234525272, + "learning_rate": 0.0002353226047432025, + "loss": 1.3273, + "step": 6393 + }, + { + "epoch": 0.69, + "grad_norm": 0.07815607317243291, + "learning_rate": 0.00023517490803731507, + "loss": 1.4548, + "step": 6394 + }, + { + "epoch": 0.69, + "grad_norm": 0.08263717397214161, + "learning_rate": 0.00023502724344238318, + "loss": 1.4002, + "step": 6395 + }, + { + "epoch": 0.69, + "grad_norm": 0.08569366911481649, + "learning_rate": 0.00023487961097631155, + "loss": 1.3903, + "step": 6396 + }, + { + "epoch": 0.69, + "grad_norm": 0.07121023055466971, + "learning_rate": 0.0002347320106570009, + "loss": 1.3606, + "step": 6397 + }, + { + "epoch": 0.69, + "grad_norm": 0.08475099857112675, + "learning_rate": 0.00023458444250234868, + "loss": 1.461, + "step": 6398 + }, + { + "epoch": 0.69, + "grad_norm": 0.08412237390244266, + "learning_rate": 0.00023443690653024763, + "loss": 1.3305, + "step": 6399 + }, + { + "epoch": 0.69, + "grad_norm": 0.08095037113122881, + "learning_rate": 0.0002342894027585872, + "loss": 1.2791, + "step": 6400 + }, + { + "epoch": 0.69, + "grad_norm": 0.06950633340592942, + "learning_rate": 0.0002341419312052529, + "loss": 1.2907, + "step": 6401 + }, + { + "epoch": 0.69, + "grad_norm": 0.08264516966426638, + "learning_rate": 0.00023399449188812584, + "loss": 1.2311, + "step": 6402 + }, + { + "epoch": 0.69, + "grad_norm": 0.08734564167313208, + "learning_rate": 0.00023384708482508364, + "loss": 1.4628, + "step": 6403 + }, + { + "epoch": 0.69, + "grad_norm": 0.08726129525601546, + "learning_rate": 0.0002336997100340002, + "loss": 1.3995, + "step": 6404 + }, + { + "epoch": 0.69, + "grad_norm": 0.07588282327103045, + "learning_rate": 0.00023355236753274484, + "loss": 1.4151, + "step": 6405 + }, + { + "epoch": 0.69, + "grad_norm": 0.08370642493906598, + "learning_rate": 0.00023340505733918365, + "loss": 1.4016, + "step": 6406 + }, + { + "epoch": 0.69, + "grad_norm": 0.08108362721094939, + "learning_rate": 0.0002332577794711783, + "loss": 1.598, + "step": 6407 + }, + { + "epoch": 0.69, + "grad_norm": 0.08135817340718471, + "learning_rate": 0.0002331105339465867, + "loss": 1.2881, + "step": 6408 + }, + { + "epoch": 0.69, + "grad_norm": 0.09012388270227195, + "learning_rate": 0.00023296332078326292, + "loss": 1.3892, + "step": 6409 + }, + { + "epoch": 0.69, + "grad_norm": 0.07793560241657278, + "learning_rate": 0.00023281613999905732, + "loss": 1.4816, + "step": 6410 + }, + { + "epoch": 0.69, + "grad_norm": 0.08867833354510514, + "learning_rate": 0.00023266899161181564, + "loss": 1.4143, + "step": 6411 + }, + { + "epoch": 0.69, + "grad_norm": 0.07688612344754879, + "learning_rate": 0.00023252187563938044, + "loss": 1.4396, + "step": 6412 + }, + { + "epoch": 0.69, + "grad_norm": 0.07695243857532634, + "learning_rate": 0.00023237479209959006, + "loss": 1.4831, + "step": 6413 + }, + { + "epoch": 0.69, + "grad_norm": 0.10597533406334113, + "learning_rate": 0.0002322277410102786, + "loss": 1.3586, + "step": 6414 + }, + { + "epoch": 0.69, + "grad_norm": 0.08486673254253517, + "learning_rate": 0.00023208072238927675, + "loss": 1.4086, + "step": 6415 + }, + { + "epoch": 0.69, + "grad_norm": 0.09352301771338838, + "learning_rate": 0.00023193373625441112, + "loss": 1.3706, + "step": 6416 + }, + { + "epoch": 0.69, + "grad_norm": 0.09267212329917103, + "learning_rate": 0.0002317867826235041, + "loss": 1.4183, + "step": 6417 + }, + { + "epoch": 0.69, + "grad_norm": 0.08024382166748611, + "learning_rate": 0.00023163986151437423, + "loss": 1.4594, + "step": 6418 + }, + { + "epoch": 0.69, + "grad_norm": 0.07312178808860083, + "learning_rate": 0.0002314929729448363, + "loss": 1.3003, + "step": 6419 + }, + { + "epoch": 0.69, + "grad_norm": 0.1003750783018416, + "learning_rate": 0.0002313461169327013, + "loss": 1.3099, + "step": 6420 + }, + { + "epoch": 0.69, + "grad_norm": 0.08298443226564493, + "learning_rate": 0.00023119929349577555, + "loss": 1.4246, + "step": 6421 + }, + { + "epoch": 0.69, + "grad_norm": 0.08834680875376852, + "learning_rate": 0.00023105250265186223, + "loss": 1.5498, + "step": 6422 + }, + { + "epoch": 0.69, + "grad_norm": 0.08112147320182733, + "learning_rate": 0.00023090574441876033, + "loss": 1.531, + "step": 6423 + }, + { + "epoch": 0.69, + "grad_norm": 0.08887189548723178, + "learning_rate": 0.00023075901881426447, + "loss": 1.4392, + "step": 6424 + }, + { + "epoch": 0.69, + "grad_norm": 0.07775527371255965, + "learning_rate": 0.00023061232585616577, + "loss": 1.2903, + "step": 6425 + }, + { + "epoch": 0.69, + "grad_norm": 0.07631985491448418, + "learning_rate": 0.00023046566556225145, + "loss": 1.366, + "step": 6426 + }, + { + "epoch": 0.69, + "grad_norm": 0.07884367654719883, + "learning_rate": 0.0002303190379503043, + "loss": 1.4406, + "step": 6427 + }, + { + "epoch": 0.69, + "grad_norm": 0.07647140083236327, + "learning_rate": 0.00023017244303810363, + "loss": 1.5203, + "step": 6428 + }, + { + "epoch": 0.69, + "grad_norm": 0.0755226169127557, + "learning_rate": 0.0002300258808434245, + "loss": 1.4164, + "step": 6429 + }, + { + "epoch": 0.69, + "grad_norm": 0.07622607302032058, + "learning_rate": 0.00022987935138403786, + "loss": 1.4841, + "step": 6430 + }, + { + "epoch": 0.69, + "grad_norm": 0.0824646838031537, + "learning_rate": 0.00022973285467771116, + "loss": 1.5279, + "step": 6431 + }, + { + "epoch": 0.69, + "grad_norm": 0.0886875422328421, + "learning_rate": 0.0002295863907422077, + "loss": 1.545, + "step": 6432 + }, + { + "epoch": 0.69, + "grad_norm": 0.08848464683625346, + "learning_rate": 0.00022943995959528652, + "loss": 1.3529, + "step": 6433 + }, + { + "epoch": 0.69, + "grad_norm": 0.08132183333693678, + "learning_rate": 0.00022929356125470297, + "loss": 1.36, + "step": 6434 + }, + { + "epoch": 0.69, + "grad_norm": 0.0879109504827401, + "learning_rate": 0.0002291471957382085, + "loss": 1.4193, + "step": 6435 + }, + { + "epoch": 0.69, + "grad_norm": 0.07435601021309238, + "learning_rate": 0.00022900086306355022, + "loss": 1.4843, + "step": 6436 + }, + { + "epoch": 0.69, + "grad_norm": 0.07947785083610934, + "learning_rate": 0.00022885456324847153, + "loss": 1.4676, + "step": 6437 + }, + { + "epoch": 0.69, + "grad_norm": 0.08881617486248049, + "learning_rate": 0.00022870829631071204, + "loss": 1.385, + "step": 6438 + }, + { + "epoch": 0.69, + "grad_norm": 0.08314651513325921, + "learning_rate": 0.00022856206226800686, + "loss": 1.4345, + "step": 6439 + }, + { + "epoch": 0.69, + "grad_norm": 0.08039152643497237, + "learning_rate": 0.00022841586113808726, + "loss": 1.4642, + "step": 6440 + }, + { + "epoch": 0.69, + "grad_norm": 0.08254667264623641, + "learning_rate": 0.00022826969293868098, + "loss": 1.3568, + "step": 6441 + }, + { + "epoch": 0.69, + "grad_norm": 0.0850751659769357, + "learning_rate": 0.00022812355768751102, + "loss": 1.4961, + "step": 6442 + }, + { + "epoch": 0.69, + "grad_norm": 0.07569920317359677, + "learning_rate": 0.00022797745540229704, + "loss": 1.3281, + "step": 6443 + }, + { + "epoch": 0.69, + "grad_norm": 0.08527834837940627, + "learning_rate": 0.00022783138610075454, + "loss": 1.5263, + "step": 6444 + }, + { + "epoch": 0.69, + "grad_norm": 0.09203558938704211, + "learning_rate": 0.00022768534980059464, + "loss": 1.3848, + "step": 6445 + }, + { + "epoch": 0.69, + "grad_norm": 0.08002778832140865, + "learning_rate": 0.00022753934651952484, + "loss": 1.4289, + "step": 6446 + }, + { + "epoch": 0.69, + "grad_norm": 0.07691283557052742, + "learning_rate": 0.0002273933762752488, + "loss": 1.4839, + "step": 6447 + }, + { + "epoch": 0.69, + "grad_norm": 0.08555435524639886, + "learning_rate": 0.00022724743908546552, + "loss": 1.3288, + "step": 6448 + }, + { + "epoch": 0.69, + "grad_norm": 0.07359411407844321, + "learning_rate": 0.00022710153496787074, + "loss": 1.3386, + "step": 6449 + }, + { + "epoch": 0.69, + "grad_norm": 0.08535186262318026, + "learning_rate": 0.0002269556639401555, + "loss": 1.3845, + "step": 6450 + }, + { + "epoch": 0.69, + "grad_norm": 0.08081806643440836, + "learning_rate": 0.00022680982602000748, + "loss": 1.3093, + "step": 6451 + }, + { + "epoch": 0.69, + "grad_norm": 0.08042330242595397, + "learning_rate": 0.00022666402122510976, + "loss": 1.2788, + "step": 6452 + }, + { + "epoch": 0.69, + "grad_norm": 0.0770073548322996, + "learning_rate": 0.00022651824957314176, + "loss": 1.3039, + "step": 6453 + }, + { + "epoch": 0.69, + "grad_norm": 0.07566086212385112, + "learning_rate": 0.00022637251108177902, + "loss": 1.4038, + "step": 6454 + }, + { + "epoch": 0.69, + "grad_norm": 0.09810806066509348, + "learning_rate": 0.0002262268057686925, + "loss": 1.393, + "step": 6455 + }, + { + "epoch": 0.69, + "grad_norm": 0.07640142270989021, + "learning_rate": 0.0002260811336515496, + "loss": 1.5287, + "step": 6456 + }, + { + "epoch": 0.69, + "grad_norm": 0.08288360938472895, + "learning_rate": 0.00022593549474801377, + "loss": 1.3465, + "step": 6457 + }, + { + "epoch": 0.69, + "grad_norm": 0.08416800605991567, + "learning_rate": 0.00022578988907574388, + "loss": 1.375, + "step": 6458 + }, + { + "epoch": 0.69, + "grad_norm": 0.08118995759959549, + "learning_rate": 0.00022564431665239544, + "loss": 1.4158, + "step": 6459 + }, + { + "epoch": 0.69, + "grad_norm": 0.09327858003512501, + "learning_rate": 0.00022549877749561943, + "loss": 1.4409, + "step": 6460 + }, + { + "epoch": 0.69, + "grad_norm": 0.08104347386981663, + "learning_rate": 0.00022535327162306285, + "loss": 1.3011, + "step": 6461 + }, + { + "epoch": 0.69, + "grad_norm": 0.08079496316961529, + "learning_rate": 0.00022520779905236892, + "loss": 1.3472, + "step": 6462 + }, + { + "epoch": 0.69, + "grad_norm": 0.09930812169108252, + "learning_rate": 0.00022506235980117697, + "loss": 1.3556, + "step": 6463 + }, + { + "epoch": 0.69, + "grad_norm": 0.081827074933239, + "learning_rate": 0.00022491695388712147, + "loss": 1.4348, + "step": 6464 + }, + { + "epoch": 0.69, + "grad_norm": 0.08327321510981332, + "learning_rate": 0.0002247715813278337, + "loss": 1.2857, + "step": 6465 + }, + { + "epoch": 0.7, + "grad_norm": 0.07800325727916337, + "learning_rate": 0.00022462624214094075, + "loss": 1.4488, + "step": 6466 + }, + { + "epoch": 0.7, + "grad_norm": 0.09429196943285079, + "learning_rate": 0.00022448093634406507, + "loss": 1.483, + "step": 6467 + }, + { + "epoch": 0.7, + "grad_norm": 0.09258873648328031, + "learning_rate": 0.00022433566395482573, + "loss": 1.3856, + "step": 6468 + }, + { + "epoch": 0.7, + "grad_norm": 0.08195780122541721, + "learning_rate": 0.0002241904249908377, + "loss": 1.288, + "step": 6469 + }, + { + "epoch": 0.7, + "grad_norm": 0.07988131648227399, + "learning_rate": 0.0002240452194697115, + "loss": 1.4128, + "step": 6470 + }, + { + "epoch": 0.7, + "grad_norm": 0.0894159323508057, + "learning_rate": 0.00022390004740905362, + "loss": 1.5256, + "step": 6471 + }, + { + "epoch": 0.7, + "grad_norm": 0.08880130190892369, + "learning_rate": 0.000223754908826467, + "loss": 1.2916, + "step": 6472 + }, + { + "epoch": 0.7, + "grad_norm": 0.09106245882443116, + "learning_rate": 0.00022360980373954987, + "loss": 1.4184, + "step": 6473 + }, + { + "epoch": 0.7, + "grad_norm": 0.08358404758333761, + "learning_rate": 0.0002234647321658969, + "loss": 1.4161, + "step": 6474 + }, + { + "epoch": 0.7, + "grad_norm": 0.08990776461075271, + "learning_rate": 0.00022331969412309877, + "loss": 1.3321, + "step": 6475 + }, + { + "epoch": 0.7, + "grad_norm": 0.08028040754728914, + "learning_rate": 0.00022317468962874132, + "loss": 1.3376, + "step": 6476 + }, + { + "epoch": 0.7, + "grad_norm": 0.09033607654277113, + "learning_rate": 0.00022302971870040718, + "loss": 1.4686, + "step": 6477 + }, + { + "epoch": 0.7, + "grad_norm": 0.09633659162771828, + "learning_rate": 0.00022288478135567465, + "loss": 1.4402, + "step": 6478 + }, + { + "epoch": 0.7, + "grad_norm": 0.07054694812027362, + "learning_rate": 0.00022273987761211756, + "loss": 1.4753, + "step": 6479 + }, + { + "epoch": 0.7, + "grad_norm": 0.07971226310945649, + "learning_rate": 0.00022259500748730637, + "loss": 1.4936, + "step": 6480 + }, + { + "epoch": 0.7, + "grad_norm": 0.08270479882148428, + "learning_rate": 0.00022245017099880665, + "loss": 1.3579, + "step": 6481 + }, + { + "epoch": 0.7, + "grad_norm": 0.0943209601689459, + "learning_rate": 0.0002223053681641808, + "loss": 1.4191, + "step": 6482 + }, + { + "epoch": 0.7, + "grad_norm": 0.08834209518549241, + "learning_rate": 0.00022216059900098624, + "loss": 1.445, + "step": 6483 + }, + { + "epoch": 0.7, + "grad_norm": 0.08388454303429033, + "learning_rate": 0.00022201586352677688, + "loss": 1.4445, + "step": 6484 + }, + { + "epoch": 0.7, + "grad_norm": 0.07909092713902374, + "learning_rate": 0.0002218711617591026, + "loss": 1.3958, + "step": 6485 + }, + { + "epoch": 0.7, + "grad_norm": 0.06978073299876707, + "learning_rate": 0.00022172649371550863, + "loss": 1.4058, + "step": 6486 + }, + { + "epoch": 0.7, + "grad_norm": 0.08679179635542227, + "learning_rate": 0.0002215818594135367, + "loss": 1.3989, + "step": 6487 + }, + { + "epoch": 0.7, + "grad_norm": 0.07993985077411585, + "learning_rate": 0.0002214372588707243, + "loss": 1.4374, + "step": 6488 + }, + { + "epoch": 0.7, + "grad_norm": 0.07427798279179676, + "learning_rate": 0.00022129269210460444, + "loss": 1.4993, + "step": 6489 + }, + { + "epoch": 0.7, + "grad_norm": 0.08155449054888644, + "learning_rate": 0.00022114815913270653, + "loss": 1.4904, + "step": 6490 + }, + { + "epoch": 0.7, + "grad_norm": 0.07970364164040267, + "learning_rate": 0.00022100365997255601, + "loss": 1.2852, + "step": 6491 + }, + { + "epoch": 0.7, + "grad_norm": 0.08682486023092911, + "learning_rate": 0.00022085919464167326, + "loss": 1.6039, + "step": 6492 + }, + { + "epoch": 0.7, + "grad_norm": 0.07331684440190417, + "learning_rate": 0.00022071476315757544, + "loss": 1.3076, + "step": 6493 + }, + { + "epoch": 0.7, + "grad_norm": 0.0757239710866081, + "learning_rate": 0.00022057036553777565, + "loss": 1.3685, + "step": 6494 + }, + { + "epoch": 0.7, + "grad_norm": 0.07585935488051394, + "learning_rate": 0.00022042600179978216, + "loss": 1.4274, + "step": 6495 + }, + { + "epoch": 0.7, + "grad_norm": 0.08444732219970214, + "learning_rate": 0.00022028167196109981, + "loss": 1.4211, + "step": 6496 + }, + { + "epoch": 0.7, + "grad_norm": 0.07518887569695178, + "learning_rate": 0.0002201373760392293, + "loss": 1.3419, + "step": 6497 + }, + { + "epoch": 0.7, + "grad_norm": 0.07301302504211518, + "learning_rate": 0.0002199931140516665, + "loss": 1.5196, + "step": 6498 + }, + { + "epoch": 0.7, + "grad_norm": 0.07598122555463341, + "learning_rate": 0.00021984888601590404, + "loss": 1.4913, + "step": 6499 + }, + { + "epoch": 0.7, + "grad_norm": 0.07442679390168067, + "learning_rate": 0.0002197046919494301, + "loss": 1.4397, + "step": 6500 + }, + { + "epoch": 0.7, + "grad_norm": 0.10135433982914047, + "learning_rate": 0.0002195605318697284, + "loss": 1.3109, + "step": 6501 + }, + { + "epoch": 0.7, + "grad_norm": 0.07208256394451946, + "learning_rate": 0.00021941640579427928, + "loss": 1.4048, + "step": 6502 + }, + { + "epoch": 0.7, + "grad_norm": 0.0787725168679611, + "learning_rate": 0.00021927231374055823, + "loss": 1.4708, + "step": 6503 + }, + { + "epoch": 0.7, + "grad_norm": 0.08301111523284674, + "learning_rate": 0.00021912825572603678, + "loss": 1.2329, + "step": 6504 + }, + { + "epoch": 0.7, + "grad_norm": 0.08558231206577362, + "learning_rate": 0.00021898423176818266, + "loss": 1.5087, + "step": 6505 + }, + { + "epoch": 0.7, + "grad_norm": 0.08352462729960797, + "learning_rate": 0.0002188402418844594, + "loss": 1.3946, + "step": 6506 + }, + { + "epoch": 0.7, + "grad_norm": 0.07363711061252484, + "learning_rate": 0.00021869628609232596, + "loss": 1.3682, + "step": 6507 + }, + { + "epoch": 0.7, + "grad_norm": 0.0805971289346462, + "learning_rate": 0.0002185523644092376, + "loss": 1.3311, + "step": 6508 + }, + { + "epoch": 0.7, + "grad_norm": 0.07032069350557803, + "learning_rate": 0.00021840847685264555, + "loss": 1.3778, + "step": 6509 + }, + { + "epoch": 0.7, + "grad_norm": 0.0744760329733795, + "learning_rate": 0.00021826462343999627, + "loss": 1.4149, + "step": 6510 + }, + { + "epoch": 0.7, + "grad_norm": 0.09067490689524893, + "learning_rate": 0.0002181208041887327, + "loss": 1.442, + "step": 6511 + }, + { + "epoch": 0.7, + "grad_norm": 0.0837501444944338, + "learning_rate": 0.0002179770191162936, + "loss": 1.4889, + "step": 6512 + }, + { + "epoch": 0.7, + "grad_norm": 0.08007588034484431, + "learning_rate": 0.00021783326824011324, + "loss": 1.4214, + "step": 6513 + }, + { + "epoch": 0.7, + "grad_norm": 0.0799023651814438, + "learning_rate": 0.00021768955157762165, + "loss": 1.4904, + "step": 6514 + }, + { + "epoch": 0.7, + "grad_norm": 0.09065512534636933, + "learning_rate": 0.00021754586914624524, + "loss": 1.4548, + "step": 6515 + }, + { + "epoch": 0.7, + "grad_norm": 0.08477293649906326, + "learning_rate": 0.0002174022209634061, + "loss": 1.4235, + "step": 6516 + }, + { + "epoch": 0.7, + "grad_norm": 0.0779585839119935, + "learning_rate": 0.0002172586070465218, + "loss": 1.3522, + "step": 6517 + }, + { + "epoch": 0.7, + "grad_norm": 0.07615724676838401, + "learning_rate": 0.0002171150274130061, + "loss": 1.4947, + "step": 6518 + }, + { + "epoch": 0.7, + "grad_norm": 0.07558094107947531, + "learning_rate": 0.0002169714820802688, + "loss": 1.5059, + "step": 6519 + }, + { + "epoch": 0.7, + "grad_norm": 0.09027457729867946, + "learning_rate": 0.0002168279710657149, + "loss": 1.4962, + "step": 6520 + }, + { + "epoch": 0.7, + "grad_norm": 0.07694453291924619, + "learning_rate": 0.0002166844943867457, + "loss": 1.5161, + "step": 6521 + }, + { + "epoch": 0.7, + "grad_norm": 0.08320596409149107, + "learning_rate": 0.00021654105206075848, + "loss": 1.3651, + "step": 6522 + }, + { + "epoch": 0.7, + "grad_norm": 0.09290290757932727, + "learning_rate": 0.0002163976441051459, + "loss": 1.3206, + "step": 6523 + }, + { + "epoch": 0.7, + "grad_norm": 0.07738617365116893, + "learning_rate": 0.00021625427053729656, + "loss": 1.3863, + "step": 6524 + }, + { + "epoch": 0.7, + "grad_norm": 0.08379385652143403, + "learning_rate": 0.0002161109313745953, + "loss": 1.3935, + "step": 6525 + }, + { + "epoch": 0.7, + "grad_norm": 0.08494754800851972, + "learning_rate": 0.00021596762663442215, + "loss": 1.4045, + "step": 6526 + }, + { + "epoch": 0.7, + "grad_norm": 0.08370244101996513, + "learning_rate": 0.0002158243563341535, + "loss": 1.3563, + "step": 6527 + }, + { + "epoch": 0.7, + "grad_norm": 0.08251734877414564, + "learning_rate": 0.00021568112049116152, + "loss": 1.1581, + "step": 6528 + }, + { + "epoch": 0.7, + "grad_norm": 0.07399029848646638, + "learning_rate": 0.00021553791912281369, + "loss": 1.2488, + "step": 6529 + }, + { + "epoch": 0.7, + "grad_norm": 0.08615102440250381, + "learning_rate": 0.00021539475224647382, + "loss": 1.3296, + "step": 6530 + }, + { + "epoch": 0.7, + "grad_norm": 0.08760288742801585, + "learning_rate": 0.00021525161987950164, + "loss": 1.4621, + "step": 6531 + }, + { + "epoch": 0.7, + "grad_norm": 0.08548065697127558, + "learning_rate": 0.00021510852203925206, + "loss": 1.4295, + "step": 6532 + }, + { + "epoch": 0.7, + "grad_norm": 0.08830861858417012, + "learning_rate": 0.0002149654587430765, + "loss": 1.3621, + "step": 6533 + }, + { + "epoch": 0.7, + "grad_norm": 0.07901395402583786, + "learning_rate": 0.00021482243000832158, + "loss": 1.3396, + "step": 6534 + }, + { + "epoch": 0.7, + "grad_norm": 0.07521402912545687, + "learning_rate": 0.00021467943585233036, + "loss": 1.4412, + "step": 6535 + }, + { + "epoch": 0.7, + "grad_norm": 0.0757548604259532, + "learning_rate": 0.000214536476292441, + "loss": 1.35, + "step": 6536 + }, + { + "epoch": 0.7, + "grad_norm": 0.08152821401632664, + "learning_rate": 0.0002143935513459882, + "loss": 1.4062, + "step": 6537 + }, + { + "epoch": 0.7, + "grad_norm": 0.09936944029433495, + "learning_rate": 0.0002142506610303017, + "loss": 1.4571, + "step": 6538 + }, + { + "epoch": 0.7, + "grad_norm": 0.07181111096163625, + "learning_rate": 0.00021410780536270779, + "loss": 1.3778, + "step": 6539 + }, + { + "epoch": 0.7, + "grad_norm": 0.07568757215933702, + "learning_rate": 0.00021396498436052826, + "loss": 1.3805, + "step": 6540 + }, + { + "epoch": 0.7, + "grad_norm": 0.08252191822278969, + "learning_rate": 0.0002138221980410802, + "loss": 1.4755, + "step": 6541 + }, + { + "epoch": 0.7, + "grad_norm": 0.0965810583333691, + "learning_rate": 0.00021367944642167736, + "loss": 1.4406, + "step": 6542 + }, + { + "epoch": 0.7, + "grad_norm": 0.08218602214053825, + "learning_rate": 0.00021353672951962888, + "loss": 1.3727, + "step": 6543 + }, + { + "epoch": 0.7, + "grad_norm": 0.07903607477438594, + "learning_rate": 0.0002133940473522395, + "loss": 1.493, + "step": 6544 + }, + { + "epoch": 0.7, + "grad_norm": 0.09434836139967438, + "learning_rate": 0.00021325139993680982, + "loss": 1.489, + "step": 6545 + }, + { + "epoch": 0.7, + "grad_norm": 0.09415498817850494, + "learning_rate": 0.00021310878729063643, + "loss": 1.4413, + "step": 6546 + }, + { + "epoch": 0.7, + "grad_norm": 0.0837639102330669, + "learning_rate": 0.00021296620943101185, + "loss": 1.4946, + "step": 6547 + }, + { + "epoch": 0.7, + "grad_norm": 0.08977086282659386, + "learning_rate": 0.00021282366637522378, + "loss": 1.3019, + "step": 6548 + }, + { + "epoch": 0.7, + "grad_norm": 0.0778087682911158, + "learning_rate": 0.00021268115814055617, + "loss": 1.333, + "step": 6549 + }, + { + "epoch": 0.7, + "grad_norm": 0.0750368541827475, + "learning_rate": 0.0002125386847442889, + "loss": 1.5137, + "step": 6550 + }, + { + "epoch": 0.7, + "grad_norm": 0.07990480483989432, + "learning_rate": 0.00021239624620369692, + "loss": 1.4183, + "step": 6551 + }, + { + "epoch": 0.7, + "grad_norm": 0.08691069242823043, + "learning_rate": 0.00021225384253605156, + "loss": 1.3846, + "step": 6552 + }, + { + "epoch": 0.7, + "grad_norm": 0.08922151650826089, + "learning_rate": 0.00021211147375862005, + "loss": 1.5065, + "step": 6553 + }, + { + "epoch": 0.7, + "grad_norm": 0.08532353831500455, + "learning_rate": 0.00021196913988866467, + "loss": 1.2723, + "step": 6554 + }, + { + "epoch": 0.7, + "grad_norm": 0.08102466561548363, + "learning_rate": 0.00021182684094344422, + "loss": 1.3856, + "step": 6555 + }, + { + "epoch": 0.7, + "grad_norm": 0.07457371890788876, + "learning_rate": 0.00021168457694021282, + "loss": 1.3585, + "step": 6556 + }, + { + "epoch": 0.7, + "grad_norm": 0.07968114957329238, + "learning_rate": 0.00021154234789622024, + "loss": 1.4162, + "step": 6557 + }, + { + "epoch": 0.7, + "grad_norm": 0.08118810789452972, + "learning_rate": 0.00021140015382871248, + "loss": 1.4492, + "step": 6558 + }, + { + "epoch": 0.71, + "grad_norm": 0.0803685591148541, + "learning_rate": 0.00021125799475493114, + "loss": 1.3808, + "step": 6559 + }, + { + "epoch": 0.71, + "grad_norm": 0.08168756775809427, + "learning_rate": 0.00021111587069211324, + "loss": 1.306, + "step": 6560 + }, + { + "epoch": 0.71, + "grad_norm": 0.08662437531664363, + "learning_rate": 0.00021097378165749192, + "loss": 1.3639, + "step": 6561 + }, + { + "epoch": 0.71, + "grad_norm": 0.07911911522419161, + "learning_rate": 0.00021083172766829623, + "loss": 1.4021, + "step": 6562 + }, + { + "epoch": 0.71, + "grad_norm": 0.08142696269384453, + "learning_rate": 0.00021068970874175025, + "loss": 1.3587, + "step": 6563 + }, + { + "epoch": 0.71, + "grad_norm": 0.08014034479548356, + "learning_rate": 0.00021054772489507452, + "loss": 1.3174, + "step": 6564 + }, + { + "epoch": 0.71, + "grad_norm": 0.08382442401757927, + "learning_rate": 0.00021040577614548522, + "loss": 1.4604, + "step": 6565 + }, + { + "epoch": 0.71, + "grad_norm": 0.07901248508081173, + "learning_rate": 0.0002102638625101939, + "loss": 1.4022, + "step": 6566 + }, + { + "epoch": 0.71, + "grad_norm": 0.08222010138581641, + "learning_rate": 0.00021012198400640803, + "loss": 1.4083, + "step": 6567 + }, + { + "epoch": 0.71, + "grad_norm": 0.08063227244132427, + "learning_rate": 0.0002099801406513311, + "loss": 1.3365, + "step": 6568 + }, + { + "epoch": 0.71, + "grad_norm": 0.08005206480715796, + "learning_rate": 0.00020983833246216178, + "loss": 1.421, + "step": 6569 + }, + { + "epoch": 0.71, + "grad_norm": 0.0734211309304561, + "learning_rate": 0.00020969655945609494, + "loss": 1.299, + "step": 6570 + }, + { + "epoch": 0.71, + "grad_norm": 0.07864468033355397, + "learning_rate": 0.00020955482165032136, + "loss": 1.4178, + "step": 6571 + }, + { + "epoch": 0.71, + "grad_norm": 0.07951495019839461, + "learning_rate": 0.0002094131190620267, + "loss": 1.2936, + "step": 6572 + }, + { + "epoch": 0.71, + "grad_norm": 0.0797927878475767, + "learning_rate": 0.00020927145170839323, + "loss": 1.4509, + "step": 6573 + }, + { + "epoch": 0.71, + "grad_norm": 0.07822452330414903, + "learning_rate": 0.00020912981960659872, + "loss": 1.4728, + "step": 6574 + }, + { + "epoch": 0.71, + "grad_norm": 0.09082590234638262, + "learning_rate": 0.00020898822277381612, + "loss": 1.57, + "step": 6575 + }, + { + "epoch": 0.71, + "grad_norm": 0.07487868864298572, + "learning_rate": 0.00020884666122721502, + "loss": 1.4102, + "step": 6576 + }, + { + "epoch": 0.71, + "grad_norm": 0.08487261642892924, + "learning_rate": 0.00020870513498395978, + "loss": 1.3731, + "step": 6577 + }, + { + "epoch": 0.71, + "grad_norm": 0.07665208224280295, + "learning_rate": 0.00020856364406121136, + "loss": 1.4139, + "step": 6578 + }, + { + "epoch": 0.71, + "grad_norm": 0.0736129009710678, + "learning_rate": 0.0002084221884761257, + "loss": 1.4054, + "step": 6579 + }, + { + "epoch": 0.71, + "grad_norm": 0.08456473210977278, + "learning_rate": 0.00020828076824585485, + "loss": 1.2447, + "step": 6580 + }, + { + "epoch": 0.71, + "grad_norm": 0.07277321479264692, + "learning_rate": 0.0002081393833875468, + "loss": 1.5035, + "step": 6581 + }, + { + "epoch": 0.71, + "grad_norm": 0.08135520280681077, + "learning_rate": 0.00020799803391834443, + "loss": 1.3139, + "step": 6582 + }, + { + "epoch": 0.71, + "grad_norm": 0.08097736134436409, + "learning_rate": 0.00020785671985538724, + "loss": 1.3363, + "step": 6583 + }, + { + "epoch": 0.71, + "grad_norm": 0.08425630719100732, + "learning_rate": 0.00020771544121581003, + "loss": 1.4049, + "step": 6584 + }, + { + "epoch": 0.71, + "grad_norm": 0.08781705002009797, + "learning_rate": 0.0002075741980167431, + "loss": 1.4498, + "step": 6585 + }, + { + "epoch": 0.71, + "grad_norm": 0.07521461031735634, + "learning_rate": 0.000207432990275313, + "loss": 1.4064, + "step": 6586 + }, + { + "epoch": 0.71, + "grad_norm": 0.08087613506815886, + "learning_rate": 0.00020729181800864145, + "loss": 1.4177, + "step": 6587 + }, + { + "epoch": 0.71, + "grad_norm": 0.08082117128976071, + "learning_rate": 0.00020715068123384588, + "loss": 1.4106, + "step": 6588 + }, + { + "epoch": 0.71, + "grad_norm": 0.0746232658644306, + "learning_rate": 0.00020700957996803982, + "loss": 1.4639, + "step": 6589 + }, + { + "epoch": 0.71, + "grad_norm": 0.07719550175977072, + "learning_rate": 0.00020686851422833247, + "loss": 1.4654, + "step": 6590 + }, + { + "epoch": 0.71, + "grad_norm": 0.08352707823350032, + "learning_rate": 0.00020672748403182818, + "loss": 1.4165, + "step": 6591 + }, + { + "epoch": 0.71, + "grad_norm": 0.08250365019800493, + "learning_rate": 0.00020658648939562752, + "loss": 1.4926, + "step": 6592 + }, + { + "epoch": 0.71, + "grad_norm": 0.08343810025009954, + "learning_rate": 0.00020644553033682673, + "loss": 1.4649, + "step": 6593 + }, + { + "epoch": 0.71, + "grad_norm": 0.08558698583672085, + "learning_rate": 0.00020630460687251723, + "loss": 1.4754, + "step": 6594 + }, + { + "epoch": 0.71, + "grad_norm": 0.0862044285372647, + "learning_rate": 0.0002061637190197867, + "loss": 1.4983, + "step": 6595 + }, + { + "epoch": 0.71, + "grad_norm": 0.08385121801130181, + "learning_rate": 0.0002060228667957184, + "loss": 1.4094, + "step": 6596 + }, + { + "epoch": 0.71, + "grad_norm": 0.08458614443447766, + "learning_rate": 0.000205882050217391, + "loss": 1.4314, + "step": 6597 + }, + { + "epoch": 0.71, + "grad_norm": 0.0940062094037127, + "learning_rate": 0.00020574126930187882, + "loss": 1.3113, + "step": 6598 + }, + { + "epoch": 0.71, + "grad_norm": 0.08044626858810763, + "learning_rate": 0.00020560052406625235, + "loss": 1.2947, + "step": 6599 + }, + { + "epoch": 0.71, + "grad_norm": 0.10262807774306441, + "learning_rate": 0.00020545981452757718, + "loss": 1.4088, + "step": 6600 + }, + { + "epoch": 0.71, + "grad_norm": 0.08041627362609009, + "learning_rate": 0.00020531914070291485, + "loss": 1.3947, + "step": 6601 + }, + { + "epoch": 0.71, + "grad_norm": 0.08017300590515987, + "learning_rate": 0.00020517850260932285, + "loss": 1.3487, + "step": 6602 + }, + { + "epoch": 0.71, + "grad_norm": 0.09120867994933178, + "learning_rate": 0.00020503790026385365, + "loss": 1.3817, + "step": 6603 + }, + { + "epoch": 0.71, + "grad_norm": 0.08010760497374898, + "learning_rate": 0.00020489733368355588, + "loss": 1.4757, + "step": 6604 + }, + { + "epoch": 0.71, + "grad_norm": 0.07122969441559135, + "learning_rate": 0.00020475680288547398, + "loss": 1.2778, + "step": 6605 + }, + { + "epoch": 0.71, + "grad_norm": 0.08403794664807253, + "learning_rate": 0.00020461630788664743, + "loss": 1.5009, + "step": 6606 + }, + { + "epoch": 0.71, + "grad_norm": 0.08299400551352507, + "learning_rate": 0.0002044758487041119, + "loss": 1.3146, + "step": 6607 + }, + { + "epoch": 0.71, + "grad_norm": 0.07976714523904847, + "learning_rate": 0.0002043354253548987, + "loss": 1.4699, + "step": 6608 + }, + { + "epoch": 0.71, + "grad_norm": 0.08026151620792248, + "learning_rate": 0.00020419503785603445, + "loss": 1.3158, + "step": 6609 + }, + { + "epoch": 0.71, + "grad_norm": 0.08797413778999093, + "learning_rate": 0.00020405468622454155, + "loss": 1.3516, + "step": 6610 + }, + { + "epoch": 0.71, + "grad_norm": 0.07741596819135142, + "learning_rate": 0.00020391437047743817, + "loss": 1.4267, + "step": 6611 + }, + { + "epoch": 0.71, + "grad_norm": 0.0867998747324431, + "learning_rate": 0.0002037740906317383, + "loss": 1.3826, + "step": 6612 + }, + { + "epoch": 0.71, + "grad_norm": 0.09645908165108294, + "learning_rate": 0.000203633846704451, + "loss": 1.4439, + "step": 6613 + }, + { + "epoch": 0.71, + "grad_norm": 0.07676929477174242, + "learning_rate": 0.0002034936387125816, + "loss": 1.3473, + "step": 6614 + }, + { + "epoch": 0.71, + "grad_norm": 0.08455784449172907, + "learning_rate": 0.0002033534666731308, + "loss": 1.2772, + "step": 6615 + }, + { + "epoch": 0.71, + "grad_norm": 0.08107012147918617, + "learning_rate": 0.00020321333060309478, + "loss": 1.286, + "step": 6616 + }, + { + "epoch": 0.71, + "grad_norm": 0.08464678556549356, + "learning_rate": 0.00020307323051946553, + "loss": 1.4273, + "step": 6617 + }, + { + "epoch": 0.71, + "grad_norm": 0.08124259630730062, + "learning_rate": 0.0002029331664392311, + "loss": 1.2747, + "step": 6618 + }, + { + "epoch": 0.71, + "grad_norm": 0.09432309785978557, + "learning_rate": 0.0002027931383793741, + "loss": 1.4427, + "step": 6619 + }, + { + "epoch": 0.71, + "grad_norm": 0.07536259226364693, + "learning_rate": 0.0002026531463568736, + "loss": 1.3773, + "step": 6620 + }, + { + "epoch": 0.71, + "grad_norm": 0.10086324922011485, + "learning_rate": 0.00020251319038870448, + "loss": 1.5472, + "step": 6621 + }, + { + "epoch": 0.71, + "grad_norm": 0.09653867084761454, + "learning_rate": 0.0002023732704918364, + "loss": 1.5599, + "step": 6622 + }, + { + "epoch": 0.71, + "grad_norm": 0.084699031110335, + "learning_rate": 0.00020223338668323533, + "loss": 1.4765, + "step": 6623 + }, + { + "epoch": 0.71, + "grad_norm": 0.07554415347563273, + "learning_rate": 0.0002020935389798629, + "loss": 1.4258, + "step": 6624 + }, + { + "epoch": 0.71, + "grad_norm": 0.07818526536382409, + "learning_rate": 0.00020195372739867569, + "loss": 1.3709, + "step": 6625 + }, + { + "epoch": 0.71, + "grad_norm": 0.08850803610170471, + "learning_rate": 0.00020181395195662655, + "loss": 1.5076, + "step": 6626 + }, + { + "epoch": 0.71, + "grad_norm": 0.0876713653675245, + "learning_rate": 0.00020167421267066392, + "loss": 1.3318, + "step": 6627 + }, + { + "epoch": 0.71, + "grad_norm": 0.08688217151281227, + "learning_rate": 0.0002015345095577314, + "loss": 1.4213, + "step": 6628 + }, + { + "epoch": 0.71, + "grad_norm": 0.08836116928923096, + "learning_rate": 0.00020139484263476866, + "loss": 1.2419, + "step": 6629 + }, + { + "epoch": 0.71, + "grad_norm": 0.0895517491670006, + "learning_rate": 0.00020125521191871054, + "loss": 1.433, + "step": 6630 + }, + { + "epoch": 0.71, + "grad_norm": 0.08227120369012335, + "learning_rate": 0.00020111561742648803, + "loss": 1.4532, + "step": 6631 + }, + { + "epoch": 0.71, + "grad_norm": 0.08709107947772152, + "learning_rate": 0.0002009760591750272, + "loss": 1.4795, + "step": 6632 + }, + { + "epoch": 0.71, + "grad_norm": 0.08139579162806306, + "learning_rate": 0.00020083653718125027, + "loss": 1.4581, + "step": 6633 + }, + { + "epoch": 0.71, + "grad_norm": 0.08475824957944199, + "learning_rate": 0.0002006970514620744, + "loss": 1.4174, + "step": 6634 + }, + { + "epoch": 0.71, + "grad_norm": 0.08621342413318868, + "learning_rate": 0.00020055760203441288, + "loss": 1.4104, + "step": 6635 + }, + { + "epoch": 0.71, + "grad_norm": 0.08288494862554391, + "learning_rate": 0.0002004181889151746, + "loss": 1.3434, + "step": 6636 + }, + { + "epoch": 0.71, + "grad_norm": 0.08305143444288314, + "learning_rate": 0.0002002788121212636, + "loss": 1.3572, + "step": 6637 + }, + { + "epoch": 0.71, + "grad_norm": 0.07601786290155091, + "learning_rate": 0.00020013947166957992, + "loss": 1.3279, + "step": 6638 + }, + { + "epoch": 0.71, + "grad_norm": 0.07653362087750513, + "learning_rate": 0.00020000016757701922, + "loss": 1.4184, + "step": 6639 + }, + { + "epoch": 0.71, + "grad_norm": 0.07932924099857178, + "learning_rate": 0.00019986089986047246, + "loss": 1.4887, + "step": 6640 + }, + { + "epoch": 0.71, + "grad_norm": 0.07901853454265233, + "learning_rate": 0.00019972166853682617, + "loss": 1.4399, + "step": 6641 + }, + { + "epoch": 0.71, + "grad_norm": 0.06553771047838028, + "learning_rate": 0.00019958247362296278, + "loss": 1.3047, + "step": 6642 + }, + { + "epoch": 0.71, + "grad_norm": 0.07917463053808547, + "learning_rate": 0.00019944331513576036, + "loss": 1.5311, + "step": 6643 + }, + { + "epoch": 0.71, + "grad_norm": 0.0745452152182659, + "learning_rate": 0.00019930419309209196, + "loss": 1.4733, + "step": 6644 + }, + { + "epoch": 0.71, + "grad_norm": 0.07259087329080621, + "learning_rate": 0.00019916510750882683, + "loss": 1.3976, + "step": 6645 + }, + { + "epoch": 0.71, + "grad_norm": 0.08174348493303095, + "learning_rate": 0.00019902605840282968, + "loss": 1.3222, + "step": 6646 + }, + { + "epoch": 0.71, + "grad_norm": 0.07719419124593137, + "learning_rate": 0.0001988870457909604, + "loss": 1.4798, + "step": 6647 + }, + { + "epoch": 0.71, + "grad_norm": 0.09086005817838735, + "learning_rate": 0.00019874806969007492, + "loss": 1.4071, + "step": 6648 + }, + { + "epoch": 0.71, + "grad_norm": 0.07637012035806491, + "learning_rate": 0.00019860913011702475, + "loss": 1.4735, + "step": 6649 + }, + { + "epoch": 0.71, + "grad_norm": 0.08008763937136829, + "learning_rate": 0.0001984702270886566, + "loss": 1.4013, + "step": 6650 + }, + { + "epoch": 0.71, + "grad_norm": 0.07633349271039644, + "learning_rate": 0.0001983313606218128, + "loss": 1.4385, + "step": 6651 + }, + { + "epoch": 0.72, + "grad_norm": 0.0922809276923969, + "learning_rate": 0.00019819253073333166, + "loss": 1.5445, + "step": 6652 + }, + { + "epoch": 0.72, + "grad_norm": 0.08152360957588603, + "learning_rate": 0.0001980537374400465, + "loss": 1.3338, + "step": 6653 + }, + { + "epoch": 0.72, + "grad_norm": 0.0858820904568059, + "learning_rate": 0.00019791498075878662, + "loss": 1.2776, + "step": 6654 + }, + { + "epoch": 0.72, + "grad_norm": 0.07880393375507283, + "learning_rate": 0.000197776260706377, + "loss": 1.3785, + "step": 6655 + }, + { + "epoch": 0.72, + "grad_norm": 0.07789571614399886, + "learning_rate": 0.00019763757729963745, + "loss": 1.2272, + "step": 6656 + }, + { + "epoch": 0.72, + "grad_norm": 0.08571963301433981, + "learning_rate": 0.0001974989305553841, + "loss": 1.3977, + "step": 6657 + }, + { + "epoch": 0.72, + "grad_norm": 0.08812834195849592, + "learning_rate": 0.00019736032049042846, + "loss": 1.3836, + "step": 6658 + }, + { + "epoch": 0.72, + "grad_norm": 0.08479265010733626, + "learning_rate": 0.00019722174712157714, + "loss": 1.4593, + "step": 6659 + }, + { + "epoch": 0.72, + "grad_norm": 0.08609566303687616, + "learning_rate": 0.00019708321046563298, + "loss": 1.2265, + "step": 6660 + }, + { + "epoch": 0.72, + "grad_norm": 0.0765427963656903, + "learning_rate": 0.0001969447105393937, + "loss": 1.3816, + "step": 6661 + }, + { + "epoch": 0.72, + "grad_norm": 0.07992363192285609, + "learning_rate": 0.00019680624735965324, + "loss": 1.3869, + "step": 6662 + }, + { + "epoch": 0.72, + "grad_norm": 0.08216831699566847, + "learning_rate": 0.00019666782094320042, + "loss": 1.3774, + "step": 6663 + }, + { + "epoch": 0.72, + "grad_norm": 0.08439452976795346, + "learning_rate": 0.00019652943130682015, + "loss": 1.385, + "step": 6664 + }, + { + "epoch": 0.72, + "grad_norm": 0.08737819487197597, + "learning_rate": 0.00019639107846729242, + "loss": 1.5652, + "step": 6665 + }, + { + "epoch": 0.72, + "grad_norm": 0.08022829112347864, + "learning_rate": 0.00019625276244139317, + "loss": 1.3677, + "step": 6666 + }, + { + "epoch": 0.72, + "grad_norm": 0.09083698232520176, + "learning_rate": 0.00019611448324589377, + "loss": 1.3274, + "step": 6667 + }, + { + "epoch": 0.72, + "grad_norm": 0.09574330590908853, + "learning_rate": 0.00019597624089756076, + "loss": 1.3743, + "step": 6668 + }, + { + "epoch": 0.72, + "grad_norm": 0.07621982941746845, + "learning_rate": 0.0001958380354131567, + "loss": 1.4548, + "step": 6669 + }, + { + "epoch": 0.72, + "grad_norm": 0.08335567959546465, + "learning_rate": 0.00019569986680943957, + "loss": 1.381, + "step": 6670 + }, + { + "epoch": 0.72, + "grad_norm": 0.08064847869524633, + "learning_rate": 0.00019556173510316271, + "loss": 1.4698, + "step": 6671 + }, + { + "epoch": 0.72, + "grad_norm": 0.08671777975446042, + "learning_rate": 0.00019542364031107485, + "loss": 1.3249, + "step": 6672 + }, + { + "epoch": 0.72, + "grad_norm": 0.0753294442038951, + "learning_rate": 0.0001952855824499206, + "loss": 1.3026, + "step": 6673 + }, + { + "epoch": 0.72, + "grad_norm": 0.0810641059229283, + "learning_rate": 0.00019514756153644027, + "loss": 1.4282, + "step": 6674 + }, + { + "epoch": 0.72, + "grad_norm": 0.08289829139487187, + "learning_rate": 0.0001950095775873688, + "loss": 1.4603, + "step": 6675 + }, + { + "epoch": 0.72, + "grad_norm": 0.09110856841308793, + "learning_rate": 0.00019487163061943758, + "loss": 1.3966, + "step": 6676 + }, + { + "epoch": 0.72, + "grad_norm": 0.08034250256655166, + "learning_rate": 0.00019473372064937323, + "loss": 1.4295, + "step": 6677 + }, + { + "epoch": 0.72, + "grad_norm": 0.08049880862150381, + "learning_rate": 0.0001945958476938975, + "loss": 1.426, + "step": 6678 + }, + { + "epoch": 0.72, + "grad_norm": 0.08222994978626211, + "learning_rate": 0.0001944580117697281, + "loss": 1.3235, + "step": 6679 + }, + { + "epoch": 0.72, + "grad_norm": 0.09559967814473865, + "learning_rate": 0.00019432021289357833, + "loss": 1.4478, + "step": 6680 + }, + { + "epoch": 0.72, + "grad_norm": 0.08036374699327789, + "learning_rate": 0.00019418245108215637, + "loss": 1.422, + "step": 6681 + }, + { + "epoch": 0.72, + "grad_norm": 0.08060853174720173, + "learning_rate": 0.00019404472635216674, + "loss": 1.4803, + "step": 6682 + }, + { + "epoch": 0.72, + "grad_norm": 0.08451954416817149, + "learning_rate": 0.00019390703872030886, + "loss": 1.3489, + "step": 6683 + }, + { + "epoch": 0.72, + "grad_norm": 0.0786560087602854, + "learning_rate": 0.0001937693882032776, + "loss": 1.4111, + "step": 6684 + }, + { + "epoch": 0.72, + "grad_norm": 0.09707523238495702, + "learning_rate": 0.00019363177481776373, + "loss": 1.5404, + "step": 6685 + }, + { + "epoch": 0.72, + "grad_norm": 0.08754859574303579, + "learning_rate": 0.0001934941985804536, + "loss": 1.3307, + "step": 6686 + }, + { + "epoch": 0.72, + "grad_norm": 0.07550925047472697, + "learning_rate": 0.0001933566595080284, + "loss": 1.3903, + "step": 6687 + }, + { + "epoch": 0.72, + "grad_norm": 0.09006506803931079, + "learning_rate": 0.00019321915761716534, + "loss": 1.5876, + "step": 6688 + }, + { + "epoch": 0.72, + "grad_norm": 0.08481310995440604, + "learning_rate": 0.00019308169292453725, + "loss": 1.4452, + "step": 6689 + }, + { + "epoch": 0.72, + "grad_norm": 0.08923095224891851, + "learning_rate": 0.00019294426544681182, + "loss": 1.384, + "step": 6690 + }, + { + "epoch": 0.72, + "grad_norm": 0.07069536368701973, + "learning_rate": 0.00019280687520065282, + "loss": 1.3548, + "step": 6691 + }, + { + "epoch": 0.72, + "grad_norm": 0.07602239730273473, + "learning_rate": 0.00019266952220271937, + "loss": 1.3895, + "step": 6692 + }, + { + "epoch": 0.72, + "grad_norm": 0.07398666072791947, + "learning_rate": 0.00019253220646966597, + "loss": 1.2796, + "step": 6693 + }, + { + "epoch": 0.72, + "grad_norm": 0.08773848956447536, + "learning_rate": 0.0001923949280181423, + "loss": 1.4594, + "step": 6694 + }, + { + "epoch": 0.72, + "grad_norm": 0.07650409822997449, + "learning_rate": 0.00019225768686479428, + "loss": 1.2462, + "step": 6695 + }, + { + "epoch": 0.72, + "grad_norm": 0.07586459619579579, + "learning_rate": 0.0001921204830262625, + "loss": 1.4254, + "step": 6696 + }, + { + "epoch": 0.72, + "grad_norm": 0.08147345080849903, + "learning_rate": 0.0001919833165191836, + "loss": 1.4551, + "step": 6697 + }, + { + "epoch": 0.72, + "grad_norm": 0.07772700515033043, + "learning_rate": 0.0001918461873601896, + "loss": 1.2775, + "step": 6698 + }, + { + "epoch": 0.72, + "grad_norm": 0.07878468764229728, + "learning_rate": 0.0001917090955659076, + "loss": 1.4245, + "step": 6699 + }, + { + "epoch": 0.72, + "grad_norm": 0.0856986791366906, + "learning_rate": 0.0001915720411529606, + "loss": 1.4059, + "step": 6700 + }, + { + "epoch": 0.72, + "grad_norm": 0.08232357102950424, + "learning_rate": 0.0001914350241379671, + "loss": 1.3602, + "step": 6701 + }, + { + "epoch": 0.72, + "grad_norm": 0.08037236283367581, + "learning_rate": 0.00019129804453754052, + "loss": 1.3657, + "step": 6702 + }, + { + "epoch": 0.72, + "grad_norm": 0.08306951753793287, + "learning_rate": 0.0001911611023682905, + "loss": 1.2233, + "step": 6703 + }, + { + "epoch": 0.72, + "grad_norm": 0.0731465844221435, + "learning_rate": 0.00019102419764682133, + "loss": 1.4167, + "step": 6704 + }, + { + "epoch": 0.72, + "grad_norm": 0.07859228132072026, + "learning_rate": 0.0001908873303897336, + "loss": 1.3531, + "step": 6705 + }, + { + "epoch": 0.72, + "grad_norm": 0.07670616178704094, + "learning_rate": 0.00019075050061362252, + "loss": 1.4396, + "step": 6706 + }, + { + "epoch": 0.72, + "grad_norm": 0.07452515984590537, + "learning_rate": 0.00019061370833507946, + "loss": 1.4069, + "step": 6707 + }, + { + "epoch": 0.72, + "grad_norm": 0.09091438656192163, + "learning_rate": 0.000190476953570691, + "loss": 1.357, + "step": 6708 + }, + { + "epoch": 0.72, + "grad_norm": 0.0885594352489358, + "learning_rate": 0.00019034023633703883, + "loss": 1.4248, + "step": 6709 + }, + { + "epoch": 0.72, + "grad_norm": 0.07470022744007065, + "learning_rate": 0.0001902035566507006, + "loss": 1.4364, + "step": 6710 + }, + { + "epoch": 0.72, + "grad_norm": 0.07785651744361902, + "learning_rate": 0.00019006691452824932, + "loss": 1.4123, + "step": 6711 + }, + { + "epoch": 0.72, + "grad_norm": 0.08260984881365113, + "learning_rate": 0.00018993030998625294, + "loss": 1.4369, + "step": 6712 + }, + { + "epoch": 0.72, + "grad_norm": 0.08785121882037714, + "learning_rate": 0.00018979374304127567, + "loss": 1.4798, + "step": 6713 + }, + { + "epoch": 0.72, + "grad_norm": 0.08974738369160999, + "learning_rate": 0.00018965721370987649, + "loss": 1.3725, + "step": 6714 + }, + { + "epoch": 0.72, + "grad_norm": 0.07439597985191031, + "learning_rate": 0.00018952072200860987, + "loss": 1.3802, + "step": 6715 + }, + { + "epoch": 0.72, + "grad_norm": 0.07989917071160745, + "learning_rate": 0.00018938426795402614, + "loss": 1.4656, + "step": 6716 + }, + { + "epoch": 0.72, + "grad_norm": 0.07990885134710889, + "learning_rate": 0.00018924785156267088, + "loss": 1.3247, + "step": 6717 + }, + { + "epoch": 0.72, + "grad_norm": 0.0920282425858599, + "learning_rate": 0.0001891114728510848, + "loss": 1.5226, + "step": 6718 + }, + { + "epoch": 0.72, + "grad_norm": 0.08098133988211124, + "learning_rate": 0.00018897513183580445, + "loss": 1.3289, + "step": 6719 + }, + { + "epoch": 0.72, + "grad_norm": 0.08236995237832075, + "learning_rate": 0.00018883882853336183, + "loss": 1.4082, + "step": 6720 + }, + { + "epoch": 0.72, + "grad_norm": 0.08109450367713297, + "learning_rate": 0.00018870256296028376, + "loss": 1.3274, + "step": 6721 + }, + { + "epoch": 0.72, + "grad_norm": 0.08854925614558622, + "learning_rate": 0.00018856633513309313, + "loss": 1.4633, + "step": 6722 + }, + { + "epoch": 0.72, + "grad_norm": 0.08776537527339491, + "learning_rate": 0.00018843014506830823, + "loss": 1.4247, + "step": 6723 + }, + { + "epoch": 0.72, + "grad_norm": 0.08403123751330256, + "learning_rate": 0.0001882939927824424, + "loss": 1.3254, + "step": 6724 + }, + { + "epoch": 0.72, + "grad_norm": 0.07484992936906583, + "learning_rate": 0.00018815787829200436, + "loss": 1.4866, + "step": 6725 + }, + { + "epoch": 0.72, + "grad_norm": 0.08493873152531263, + "learning_rate": 0.0001880218016134987, + "loss": 1.2764, + "step": 6726 + }, + { + "epoch": 0.72, + "grad_norm": 0.08016073948899083, + "learning_rate": 0.00018788576276342528, + "loss": 1.5368, + "step": 6727 + }, + { + "epoch": 0.72, + "grad_norm": 0.08021313491786318, + "learning_rate": 0.00018774976175827895, + "loss": 1.4752, + "step": 6728 + }, + { + "epoch": 0.72, + "grad_norm": 0.08468752473482426, + "learning_rate": 0.00018761379861455076, + "loss": 1.4151, + "step": 6729 + }, + { + "epoch": 0.72, + "grad_norm": 0.07521467300223297, + "learning_rate": 0.00018747787334872618, + "loss": 1.4668, + "step": 6730 + }, + { + "epoch": 0.72, + "grad_norm": 0.07685387563084291, + "learning_rate": 0.00018734198597728698, + "loss": 1.4331, + "step": 6731 + }, + { + "epoch": 0.72, + "grad_norm": 0.07505611286286995, + "learning_rate": 0.00018720613651670997, + "loss": 1.4281, + "step": 6732 + }, + { + "epoch": 0.72, + "grad_norm": 0.07502769080720458, + "learning_rate": 0.0001870703249834671, + "loss": 1.3482, + "step": 6733 + }, + { + "epoch": 0.72, + "grad_norm": 0.07203431096058605, + "learning_rate": 0.0001869345513940262, + "loss": 1.3121, + "step": 6734 + }, + { + "epoch": 0.72, + "grad_norm": 0.07909327996419319, + "learning_rate": 0.00018679881576485043, + "loss": 1.4207, + "step": 6735 + }, + { + "epoch": 0.72, + "grad_norm": 0.09281906210695522, + "learning_rate": 0.00018666311811239795, + "loss": 1.3021, + "step": 6736 + }, + { + "epoch": 0.72, + "grad_norm": 0.10364312066748778, + "learning_rate": 0.00018652745845312252, + "loss": 1.3919, + "step": 6737 + }, + { + "epoch": 0.72, + "grad_norm": 0.07723214415961016, + "learning_rate": 0.00018639183680347344, + "loss": 1.4411, + "step": 6738 + }, + { + "epoch": 0.72, + "grad_norm": 0.07938462403198333, + "learning_rate": 0.00018625625317989547, + "loss": 1.5324, + "step": 6739 + }, + { + "epoch": 0.72, + "grad_norm": 0.0817394915072993, + "learning_rate": 0.00018612070759882826, + "loss": 1.4159, + "step": 6740 + }, + { + "epoch": 0.72, + "grad_norm": 0.07546117695110775, + "learning_rate": 0.00018598520007670743, + "loss": 1.4377, + "step": 6741 + }, + { + "epoch": 0.72, + "grad_norm": 0.07981149695058505, + "learning_rate": 0.00018584973062996379, + "loss": 1.3169, + "step": 6742 + }, + { + "epoch": 0.72, + "grad_norm": 0.09914212403092261, + "learning_rate": 0.0001857142992750232, + "loss": 1.3235, + "step": 6743 + }, + { + "epoch": 0.72, + "grad_norm": 0.08197515988292878, + "learning_rate": 0.0001855789060283073, + "loss": 1.4592, + "step": 6744 + }, + { + "epoch": 0.73, + "grad_norm": 0.10267448225101261, + "learning_rate": 0.00018544355090623337, + "loss": 1.3162, + "step": 6745 + }, + { + "epoch": 0.73, + "grad_norm": 0.07850174588448537, + "learning_rate": 0.00018530823392521302, + "loss": 1.3222, + "step": 6746 + }, + { + "epoch": 0.73, + "grad_norm": 0.08075899888732171, + "learning_rate": 0.00018517295510165417, + "loss": 1.4816, + "step": 6747 + }, + { + "epoch": 0.73, + "grad_norm": 0.08032012562669273, + "learning_rate": 0.00018503771445196006, + "loss": 1.3729, + "step": 6748 + }, + { + "epoch": 0.73, + "grad_norm": 0.07918002511033893, + "learning_rate": 0.00018490251199252878, + "loss": 1.267, + "step": 6749 + }, + { + "epoch": 0.73, + "grad_norm": 0.07957204447098715, + "learning_rate": 0.0001847673477397542, + "loss": 1.4327, + "step": 6750 + }, + { + "epoch": 0.73, + "grad_norm": 0.07659661414310863, + "learning_rate": 0.00018463222171002564, + "loss": 1.4241, + "step": 6751 + }, + { + "epoch": 0.73, + "grad_norm": 0.08910841479830099, + "learning_rate": 0.00018449713391972732, + "loss": 1.4116, + "step": 6752 + }, + { + "epoch": 0.73, + "grad_norm": 0.0708037129156938, + "learning_rate": 0.00018436208438523915, + "loss": 1.3844, + "step": 6753 + }, + { + "epoch": 0.73, + "grad_norm": 0.080526008014301, + "learning_rate": 0.00018422707312293663, + "loss": 1.4397, + "step": 6754 + }, + { + "epoch": 0.73, + "grad_norm": 0.09423939921649865, + "learning_rate": 0.00018409210014918992, + "loss": 1.3331, + "step": 6755 + }, + { + "epoch": 0.73, + "grad_norm": 0.08920464006608342, + "learning_rate": 0.0001839571654803654, + "loss": 1.43, + "step": 6756 + }, + { + "epoch": 0.73, + "grad_norm": 0.08378333881364608, + "learning_rate": 0.0001838222691328239, + "loss": 1.4928, + "step": 6757 + }, + { + "epoch": 0.73, + "grad_norm": 0.07805182934549852, + "learning_rate": 0.00018368741112292252, + "loss": 1.3854, + "step": 6758 + }, + { + "epoch": 0.73, + "grad_norm": 0.0854097806654915, + "learning_rate": 0.0001835525914670128, + "loss": 1.3806, + "step": 6759 + }, + { + "epoch": 0.73, + "grad_norm": 0.08040018273693553, + "learning_rate": 0.00018341781018144253, + "loss": 1.3825, + "step": 6760 + }, + { + "epoch": 0.73, + "grad_norm": 0.0936813686907315, + "learning_rate": 0.00018328306728255405, + "loss": 1.5222, + "step": 6761 + }, + { + "epoch": 0.73, + "grad_norm": 0.07344171050358542, + "learning_rate": 0.00018314836278668557, + "loss": 1.3782, + "step": 6762 + }, + { + "epoch": 0.73, + "grad_norm": 0.07684534780565848, + "learning_rate": 0.00018301369671017058, + "loss": 1.4264, + "step": 6763 + }, + { + "epoch": 0.73, + "grad_norm": 0.07826953273917311, + "learning_rate": 0.00018287906906933754, + "loss": 1.3789, + "step": 6764 + }, + { + "epoch": 0.73, + "grad_norm": 0.08928722109970684, + "learning_rate": 0.00018274447988051064, + "loss": 1.3112, + "step": 6765 + }, + { + "epoch": 0.73, + "grad_norm": 0.08674179911378348, + "learning_rate": 0.00018260992916000946, + "loss": 1.4021, + "step": 6766 + }, + { + "epoch": 0.73, + "grad_norm": 0.08089963246558474, + "learning_rate": 0.00018247541692414858, + "loss": 1.489, + "step": 6767 + }, + { + "epoch": 0.73, + "grad_norm": 0.09059576891894354, + "learning_rate": 0.00018234094318923794, + "loss": 1.2786, + "step": 6768 + }, + { + "epoch": 0.73, + "grad_norm": 0.0848582732695057, + "learning_rate": 0.000182206507971583, + "loss": 1.3481, + "step": 6769 + }, + { + "epoch": 0.73, + "grad_norm": 0.07842148758676074, + "learning_rate": 0.0001820721112874848, + "loss": 1.3778, + "step": 6770 + }, + { + "epoch": 0.73, + "grad_norm": 0.08192393685903324, + "learning_rate": 0.000181937753153239, + "loss": 1.431, + "step": 6771 + }, + { + "epoch": 0.73, + "grad_norm": 0.08794094605478588, + "learning_rate": 0.00018180343358513713, + "loss": 1.5262, + "step": 6772 + }, + { + "epoch": 0.73, + "grad_norm": 0.07989683360465649, + "learning_rate": 0.00018166915259946616, + "loss": 1.4531, + "step": 6773 + }, + { + "epoch": 0.73, + "grad_norm": 0.07895430276687004, + "learning_rate": 0.00018153491021250762, + "loss": 1.3966, + "step": 6774 + }, + { + "epoch": 0.73, + "grad_norm": 0.07984445806504739, + "learning_rate": 0.0001814007064405392, + "loss": 1.3677, + "step": 6775 + }, + { + "epoch": 0.73, + "grad_norm": 0.08715921957027024, + "learning_rate": 0.00018126654129983367, + "loss": 1.3904, + "step": 6776 + }, + { + "epoch": 0.73, + "grad_norm": 0.0923525775887198, + "learning_rate": 0.00018113241480665883, + "loss": 1.4055, + "step": 6777 + }, + { + "epoch": 0.73, + "grad_norm": 0.08056941965998873, + "learning_rate": 0.00018099832697727786, + "loss": 1.3418, + "step": 6778 + }, + { + "epoch": 0.73, + "grad_norm": 0.07920305560868664, + "learning_rate": 0.00018086427782794962, + "loss": 1.3279, + "step": 6779 + }, + { + "epoch": 0.73, + "grad_norm": 0.07615284040014661, + "learning_rate": 0.00018073026737492782, + "loss": 1.3265, + "step": 6780 + }, + { + "epoch": 0.73, + "grad_norm": 0.07508902500142603, + "learning_rate": 0.00018059629563446173, + "loss": 1.4519, + "step": 6781 + }, + { + "epoch": 0.73, + "grad_norm": 0.09785702332279825, + "learning_rate": 0.00018046236262279615, + "loss": 1.4866, + "step": 6782 + }, + { + "epoch": 0.73, + "grad_norm": 0.09289516193260337, + "learning_rate": 0.00018032846835617055, + "loss": 1.1696, + "step": 6783 + }, + { + "epoch": 0.73, + "grad_norm": 0.09300543988745133, + "learning_rate": 0.00018019461285082023, + "loss": 1.4885, + "step": 6784 + }, + { + "epoch": 0.73, + "grad_norm": 0.0857508580574828, + "learning_rate": 0.00018006079612297582, + "loss": 1.3232, + "step": 6785 + }, + { + "epoch": 0.73, + "grad_norm": 0.07557541283565075, + "learning_rate": 0.0001799270181888627, + "loss": 1.3176, + "step": 6786 + }, + { + "epoch": 0.73, + "grad_norm": 0.0937229554586484, + "learning_rate": 0.00017979327906470204, + "loss": 1.4221, + "step": 6787 + }, + { + "epoch": 0.73, + "grad_norm": 0.08422026801156707, + "learning_rate": 0.00017965957876671046, + "loss": 1.3148, + "step": 6788 + }, + { + "epoch": 0.73, + "grad_norm": 0.0870791699339773, + "learning_rate": 0.0001795259173110993, + "loss": 1.1682, + "step": 6789 + }, + { + "epoch": 0.73, + "grad_norm": 0.08035752064799632, + "learning_rate": 0.0001793922947140753, + "loss": 1.381, + "step": 6790 + }, + { + "epoch": 0.73, + "grad_norm": 0.07755396439121148, + "learning_rate": 0.0001792587109918411, + "loss": 1.3554, + "step": 6791 + }, + { + "epoch": 0.73, + "grad_norm": 0.07669787633054774, + "learning_rate": 0.00017912516616059378, + "loss": 1.347, + "step": 6792 + }, + { + "epoch": 0.73, + "grad_norm": 0.0838443620302552, + "learning_rate": 0.00017899166023652624, + "loss": 1.3293, + "step": 6793 + }, + { + "epoch": 0.73, + "grad_norm": 0.08594325030118602, + "learning_rate": 0.0001788581932358268, + "loss": 1.5661, + "step": 6794 + }, + { + "epoch": 0.73, + "grad_norm": 0.0741443300137945, + "learning_rate": 0.0001787247651746784, + "loss": 1.3332, + "step": 6795 + }, + { + "epoch": 0.73, + "grad_norm": 0.08201836586996483, + "learning_rate": 0.0001785913760692598, + "loss": 1.4351, + "step": 6796 + }, + { + "epoch": 0.73, + "grad_norm": 0.08571123279116978, + "learning_rate": 0.0001784580259357451, + "loss": 1.3921, + "step": 6797 + }, + { + "epoch": 0.73, + "grad_norm": 0.08420236015565297, + "learning_rate": 0.00017832471479030328, + "loss": 1.4287, + "step": 6798 + }, + { + "epoch": 0.73, + "grad_norm": 0.08673883035937369, + "learning_rate": 0.0001781914426490986, + "loss": 1.3989, + "step": 6799 + }, + { + "epoch": 0.73, + "grad_norm": 0.08328479946706482, + "learning_rate": 0.00017805820952829094, + "loss": 1.3287, + "step": 6800 + }, + { + "epoch": 0.73, + "grad_norm": 0.0911264911348588, + "learning_rate": 0.00017792501544403546, + "loss": 1.2763, + "step": 6801 + }, + { + "epoch": 0.73, + "grad_norm": 0.07562969503527785, + "learning_rate": 0.00017779186041248202, + "loss": 1.3719, + "step": 6802 + }, + { + "epoch": 0.73, + "grad_norm": 0.08436317460320578, + "learning_rate": 0.00017765874444977637, + "loss": 1.5092, + "step": 6803 + }, + { + "epoch": 0.73, + "grad_norm": 0.08201803089533123, + "learning_rate": 0.00017752566757205934, + "loss": 1.3735, + "step": 6804 + }, + { + "epoch": 0.73, + "grad_norm": 0.08212245968024402, + "learning_rate": 0.0001773926297954667, + "loss": 1.437, + "step": 6805 + }, + { + "epoch": 0.73, + "grad_norm": 0.08936482521224416, + "learning_rate": 0.00017725963113612996, + "loss": 1.3278, + "step": 6806 + }, + { + "epoch": 0.73, + "grad_norm": 0.08005483146062434, + "learning_rate": 0.0001771266716101757, + "loss": 1.5125, + "step": 6807 + }, + { + "epoch": 0.73, + "grad_norm": 0.08323718244792362, + "learning_rate": 0.00017699375123372553, + "loss": 1.3158, + "step": 6808 + }, + { + "epoch": 0.73, + "grad_norm": 0.08374696265235664, + "learning_rate": 0.0001768608700228967, + "loss": 1.4074, + "step": 6809 + }, + { + "epoch": 0.73, + "grad_norm": 0.08846882203118381, + "learning_rate": 0.0001767280279938014, + "loss": 1.465, + "step": 6810 + }, + { + "epoch": 0.73, + "grad_norm": 0.08056026493132028, + "learning_rate": 0.00017659522516254707, + "loss": 1.3597, + "step": 6811 + }, + { + "epoch": 0.73, + "grad_norm": 0.09657117745998345, + "learning_rate": 0.0001764624615452366, + "loss": 1.4254, + "step": 6812 + }, + { + "epoch": 0.73, + "grad_norm": 0.08434230471203434, + "learning_rate": 0.00017632973715796824, + "loss": 1.3593, + "step": 6813 + }, + { + "epoch": 0.73, + "grad_norm": 0.07279166739385243, + "learning_rate": 0.00017619705201683494, + "loss": 1.5091, + "step": 6814 + }, + { + "epoch": 0.73, + "grad_norm": 0.08449448457832491, + "learning_rate": 0.00017606440613792546, + "loss": 1.4328, + "step": 6815 + }, + { + "epoch": 0.73, + "grad_norm": 0.0848450788789903, + "learning_rate": 0.00017593179953732363, + "loss": 1.4954, + "step": 6816 + }, + { + "epoch": 0.73, + "grad_norm": 0.08389551707213994, + "learning_rate": 0.00017579923223110815, + "loss": 1.5587, + "step": 6817 + }, + { + "epoch": 0.73, + "grad_norm": 0.08551329662455956, + "learning_rate": 0.00017566670423535346, + "loss": 1.4431, + "step": 6818 + }, + { + "epoch": 0.73, + "grad_norm": 0.0845378310088562, + "learning_rate": 0.00017553421556612924, + "loss": 1.4607, + "step": 6819 + }, + { + "epoch": 0.73, + "grad_norm": 0.08248996628222881, + "learning_rate": 0.0001754017662395, + "loss": 1.35, + "step": 6820 + }, + { + "epoch": 0.73, + "grad_norm": 0.09292398961034305, + "learning_rate": 0.00017526935627152542, + "loss": 1.354, + "step": 6821 + }, + { + "epoch": 0.73, + "grad_norm": 0.09073753070746698, + "learning_rate": 0.00017513698567826096, + "loss": 1.3038, + "step": 6822 + }, + { + "epoch": 0.73, + "grad_norm": 0.09641388034069218, + "learning_rate": 0.0001750046544757571, + "loss": 1.3099, + "step": 6823 + }, + { + "epoch": 0.73, + "grad_norm": 0.08378939811932765, + "learning_rate": 0.00017487236268005918, + "loss": 1.4058, + "step": 6824 + }, + { + "epoch": 0.73, + "grad_norm": 0.08011359281231487, + "learning_rate": 0.00017474011030720832, + "loss": 1.3095, + "step": 6825 + }, + { + "epoch": 0.73, + "grad_norm": 0.07936773141110776, + "learning_rate": 0.00017460789737324024, + "loss": 1.4565, + "step": 6826 + }, + { + "epoch": 0.73, + "grad_norm": 0.08898734259408628, + "learning_rate": 0.00017447572389418643, + "loss": 1.4878, + "step": 6827 + }, + { + "epoch": 0.73, + "grad_norm": 0.08102931349353243, + "learning_rate": 0.0001743435898860735, + "loss": 1.3488, + "step": 6828 + }, + { + "epoch": 0.73, + "grad_norm": 0.08001499042331985, + "learning_rate": 0.00017421149536492282, + "loss": 1.4098, + "step": 6829 + }, + { + "epoch": 0.73, + "grad_norm": 0.08157457664209204, + "learning_rate": 0.0001740794403467517, + "loss": 1.3322, + "step": 6830 + }, + { + "epoch": 0.73, + "grad_norm": 0.07722036111521809, + "learning_rate": 0.00017394742484757187, + "loss": 1.3963, + "step": 6831 + }, + { + "epoch": 0.73, + "grad_norm": 0.08352673258439874, + "learning_rate": 0.000173815448883391, + "loss": 1.2971, + "step": 6832 + }, + { + "epoch": 0.73, + "grad_norm": 0.07859349149790394, + "learning_rate": 0.00017368351247021136, + "loss": 1.4829, + "step": 6833 + }, + { + "epoch": 0.73, + "grad_norm": 0.07105629459606275, + "learning_rate": 0.00017355161562403076, + "loss": 1.304, + "step": 6834 + }, + { + "epoch": 0.73, + "grad_norm": 0.08610324427789169, + "learning_rate": 0.00017341975836084245, + "loss": 1.2876, + "step": 6835 + }, + { + "epoch": 0.73, + "grad_norm": 0.0754237529009342, + "learning_rate": 0.0001732879406966341, + "loss": 1.3283, + "step": 6836 + }, + { + "epoch": 0.73, + "grad_norm": 0.07922581197977313, + "learning_rate": 0.0001731561626473893, + "loss": 1.3629, + "step": 6837 + }, + { + "epoch": 0.74, + "grad_norm": 0.08558328737606986, + "learning_rate": 0.00017302442422908676, + "loss": 1.5443, + "step": 6838 + }, + { + "epoch": 0.74, + "grad_norm": 0.07826006509172885, + "learning_rate": 0.00017289272545769986, + "loss": 1.3043, + "step": 6839 + }, + { + "epoch": 0.74, + "grad_norm": 0.09022868404772783, + "learning_rate": 0.00017276106634919774, + "loss": 1.4351, + "step": 6840 + }, + { + "epoch": 0.74, + "grad_norm": 0.09189036230143136, + "learning_rate": 0.0001726294469195448, + "loss": 1.2811, + "step": 6841 + }, + { + "epoch": 0.74, + "grad_norm": 0.08050200243446126, + "learning_rate": 0.00017249786718469967, + "loss": 1.5167, + "step": 6842 + }, + { + "epoch": 0.74, + "grad_norm": 0.07399654613968369, + "learning_rate": 0.00017236632716061728, + "loss": 1.4766, + "step": 6843 + }, + { + "epoch": 0.74, + "grad_norm": 0.0806654913246944, + "learning_rate": 0.00017223482686324736, + "loss": 1.4564, + "step": 6844 + }, + { + "epoch": 0.74, + "grad_norm": 0.09296418958185575, + "learning_rate": 0.0001721033663085345, + "loss": 1.4995, + "step": 6845 + }, + { + "epoch": 0.74, + "grad_norm": 0.08405262508188, + "learning_rate": 0.00017197194551241897, + "loss": 1.4281, + "step": 6846 + }, + { + "epoch": 0.74, + "grad_norm": 0.09050133536256577, + "learning_rate": 0.00017184056449083603, + "loss": 1.452, + "step": 6847 + }, + { + "epoch": 0.74, + "grad_norm": 0.10443701802852487, + "learning_rate": 0.00017170922325971584, + "loss": 1.5564, + "step": 6848 + }, + { + "epoch": 0.74, + "grad_norm": 0.08562641911936063, + "learning_rate": 0.00017157792183498412, + "loss": 1.2908, + "step": 6849 + }, + { + "epoch": 0.74, + "grad_norm": 0.07636600524567706, + "learning_rate": 0.00017144666023256178, + "loss": 1.3667, + "step": 6850 + }, + { + "epoch": 0.74, + "grad_norm": 0.08478261331322622, + "learning_rate": 0.00017131543846836457, + "loss": 1.3854, + "step": 6851 + }, + { + "epoch": 0.74, + "grad_norm": 0.08098833976518655, + "learning_rate": 0.00017118425655830344, + "loss": 1.5091, + "step": 6852 + }, + { + "epoch": 0.74, + "grad_norm": 0.08677585467877516, + "learning_rate": 0.0001710531145182848, + "loss": 1.5628, + "step": 6853 + }, + { + "epoch": 0.74, + "grad_norm": 0.07471384846368484, + "learning_rate": 0.0001709220123642103, + "loss": 1.4041, + "step": 6854 + }, + { + "epoch": 0.74, + "grad_norm": 0.09541745514219427, + "learning_rate": 0.00017079095011197608, + "loss": 1.4971, + "step": 6855 + }, + { + "epoch": 0.74, + "grad_norm": 0.08623794035116948, + "learning_rate": 0.0001706599277774743, + "loss": 1.3316, + "step": 6856 + }, + { + "epoch": 0.74, + "grad_norm": 0.08054776192726237, + "learning_rate": 0.00017052894537659147, + "loss": 1.43, + "step": 6857 + }, + { + "epoch": 0.74, + "grad_norm": 0.07521154256399776, + "learning_rate": 0.00017039800292520995, + "loss": 1.3866, + "step": 6858 + }, + { + "epoch": 0.74, + "grad_norm": 0.09193438771336371, + "learning_rate": 0.00017026710043920702, + "loss": 1.3083, + "step": 6859 + }, + { + "epoch": 0.74, + "grad_norm": 0.08977963139511945, + "learning_rate": 0.0001701362379344547, + "loss": 1.3806, + "step": 6860 + }, + { + "epoch": 0.74, + "grad_norm": 0.07698085395458001, + "learning_rate": 0.00017000541542682086, + "loss": 1.4348, + "step": 6861 + }, + { + "epoch": 0.74, + "grad_norm": 0.08198947779570069, + "learning_rate": 0.00016987463293216814, + "loss": 1.4138, + "step": 6862 + }, + { + "epoch": 0.74, + "grad_norm": 0.07855413193058092, + "learning_rate": 0.0001697438904663543, + "loss": 1.4153, + "step": 6863 + }, + { + "epoch": 0.74, + "grad_norm": 0.08950865101868916, + "learning_rate": 0.00016961318804523218, + "loss": 1.5416, + "step": 6864 + }, + { + "epoch": 0.74, + "grad_norm": 0.0831439080639515, + "learning_rate": 0.00016948252568465, + "loss": 1.4049, + "step": 6865 + }, + { + "epoch": 0.74, + "grad_norm": 0.08513958963759437, + "learning_rate": 0.0001693519034004512, + "loss": 1.4206, + "step": 6866 + }, + { + "epoch": 0.74, + "grad_norm": 0.07840923965012238, + "learning_rate": 0.00016922132120847388, + "loss": 1.3569, + "step": 6867 + }, + { + "epoch": 0.74, + "grad_norm": 0.07532709899874017, + "learning_rate": 0.0001690907791245518, + "loss": 1.2623, + "step": 6868 + }, + { + "epoch": 0.74, + "grad_norm": 0.08043137997388435, + "learning_rate": 0.00016896027716451363, + "loss": 1.3917, + "step": 6869 + }, + { + "epoch": 0.74, + "grad_norm": 0.07859670126561226, + "learning_rate": 0.00016882981534418302, + "loss": 1.495, + "step": 6870 + }, + { + "epoch": 0.74, + "grad_norm": 0.08174506145645243, + "learning_rate": 0.00016869939367937896, + "loss": 1.447, + "step": 6871 + }, + { + "epoch": 0.74, + "grad_norm": 0.07206520273087716, + "learning_rate": 0.00016856901218591596, + "loss": 1.5492, + "step": 6872 + }, + { + "epoch": 0.74, + "grad_norm": 0.08019987669464876, + "learning_rate": 0.00016843867087960252, + "loss": 1.4361, + "step": 6873 + }, + { + "epoch": 0.74, + "grad_norm": 0.07809580316950983, + "learning_rate": 0.00016830836977624325, + "loss": 1.2727, + "step": 6874 + }, + { + "epoch": 0.74, + "grad_norm": 0.07459830181974891, + "learning_rate": 0.00016817810889163787, + "loss": 1.425, + "step": 6875 + }, + { + "epoch": 0.74, + "grad_norm": 0.08806181104637921, + "learning_rate": 0.00016804788824158057, + "loss": 1.5296, + "step": 6876 + }, + { + "epoch": 0.74, + "grad_norm": 0.07800614211582017, + "learning_rate": 0.00016791770784186128, + "loss": 1.5628, + "step": 6877 + }, + { + "epoch": 0.74, + "grad_norm": 0.08281656842314608, + "learning_rate": 0.00016778756770826486, + "loss": 1.3046, + "step": 6878 + }, + { + "epoch": 0.74, + "grad_norm": 0.08666547575815937, + "learning_rate": 0.00016765746785657104, + "loss": 1.4894, + "step": 6879 + }, + { + "epoch": 0.74, + "grad_norm": 0.07603211449722727, + "learning_rate": 0.00016752740830255504, + "loss": 1.4852, + "step": 6880 + }, + { + "epoch": 0.74, + "grad_norm": 0.08116015501105417, + "learning_rate": 0.0001673973890619871, + "loss": 1.5455, + "step": 6881 + }, + { + "epoch": 0.74, + "grad_norm": 0.08366331426875613, + "learning_rate": 0.00016726741015063223, + "loss": 1.4234, + "step": 6882 + }, + { + "epoch": 0.74, + "grad_norm": 0.09604489598470241, + "learning_rate": 0.00016713747158425118, + "loss": 1.3777, + "step": 6883 + }, + { + "epoch": 0.74, + "grad_norm": 0.08394195627592127, + "learning_rate": 0.00016700757337859907, + "loss": 1.4061, + "step": 6884 + }, + { + "epoch": 0.74, + "grad_norm": 0.08751800146513225, + "learning_rate": 0.00016687771554942688, + "loss": 1.4041, + "step": 6885 + }, + { + "epoch": 0.74, + "grad_norm": 0.08575353248678774, + "learning_rate": 0.00016674789811247992, + "loss": 1.3437, + "step": 6886 + }, + { + "epoch": 0.74, + "grad_norm": 0.08504085235306681, + "learning_rate": 0.0001666181210834994, + "loss": 1.4633, + "step": 6887 + }, + { + "epoch": 0.74, + "grad_norm": 0.09564690373909444, + "learning_rate": 0.00016648838447822084, + "loss": 1.4409, + "step": 6888 + }, + { + "epoch": 0.74, + "grad_norm": 0.08571014076959972, + "learning_rate": 0.0001663586883123755, + "loss": 1.3986, + "step": 6889 + }, + { + "epoch": 0.74, + "grad_norm": 0.07663223113335221, + "learning_rate": 0.00016622903260168955, + "loss": 1.4524, + "step": 6890 + }, + { + "epoch": 0.74, + "grad_norm": 0.07946817775922309, + "learning_rate": 0.00016609941736188394, + "loss": 1.3571, + "step": 6891 + }, + { + "epoch": 0.74, + "grad_norm": 0.08637460712979757, + "learning_rate": 0.00016596984260867516, + "loss": 1.4576, + "step": 6892 + }, + { + "epoch": 0.74, + "grad_norm": 0.07903291672994443, + "learning_rate": 0.00016584030835777464, + "loss": 1.4884, + "step": 6893 + }, + { + "epoch": 0.74, + "grad_norm": 0.07466860169220521, + "learning_rate": 0.00016571081462488874, + "loss": 1.5506, + "step": 6894 + }, + { + "epoch": 0.74, + "grad_norm": 0.06936360532492965, + "learning_rate": 0.0001655813614257189, + "loss": 1.3753, + "step": 6895 + }, + { + "epoch": 0.74, + "grad_norm": 0.09451161532387511, + "learning_rate": 0.00016545194877596193, + "loss": 1.5499, + "step": 6896 + }, + { + "epoch": 0.74, + "grad_norm": 0.08750817216023275, + "learning_rate": 0.00016532257669130967, + "loss": 1.4725, + "step": 6897 + }, + { + "epoch": 0.74, + "grad_norm": 0.08644183462372003, + "learning_rate": 0.00016519324518744866, + "loss": 1.4153, + "step": 6898 + }, + { + "epoch": 0.74, + "grad_norm": 0.07856152367805538, + "learning_rate": 0.00016506395428006088, + "loss": 1.4271, + "step": 6899 + }, + { + "epoch": 0.74, + "grad_norm": 0.08313932519241506, + "learning_rate": 0.00016493470398482352, + "loss": 1.4276, + "step": 6900 + }, + { + "epoch": 0.74, + "grad_norm": 0.07758330661107049, + "learning_rate": 0.00016480549431740831, + "loss": 1.4259, + "step": 6901 + }, + { + "epoch": 0.74, + "grad_norm": 0.07890737430673259, + "learning_rate": 0.00016467632529348247, + "loss": 1.5194, + "step": 6902 + }, + { + "epoch": 0.74, + "grad_norm": 0.09159297058951059, + "learning_rate": 0.0001645471969287084, + "loss": 1.487, + "step": 6903 + }, + { + "epoch": 0.74, + "grad_norm": 0.09770573554325404, + "learning_rate": 0.00016441810923874318, + "loss": 1.4068, + "step": 6904 + }, + { + "epoch": 0.74, + "grad_norm": 0.09916837391473986, + "learning_rate": 0.00016428906223923902, + "loss": 1.3563, + "step": 6905 + }, + { + "epoch": 0.74, + "grad_norm": 0.08312448828582723, + "learning_rate": 0.00016416005594584355, + "loss": 1.4346, + "step": 6906 + }, + { + "epoch": 0.74, + "grad_norm": 0.0777976180163894, + "learning_rate": 0.00016403109037419893, + "loss": 1.3848, + "step": 6907 + }, + { + "epoch": 0.74, + "grad_norm": 0.07890686457460733, + "learning_rate": 0.00016390216553994292, + "loss": 1.3671, + "step": 6908 + }, + { + "epoch": 0.74, + "grad_norm": 0.07973944756595773, + "learning_rate": 0.00016377328145870823, + "loss": 1.3963, + "step": 6909 + }, + { + "epoch": 0.74, + "grad_norm": 0.07806363228733787, + "learning_rate": 0.00016364443814612207, + "loss": 1.3893, + "step": 6910 + }, + { + "epoch": 0.74, + "grad_norm": 0.07209223731006845, + "learning_rate": 0.00016351563561780742, + "loss": 1.4204, + "step": 6911 + }, + { + "epoch": 0.74, + "grad_norm": 0.07909441403477985, + "learning_rate": 0.00016338687388938217, + "loss": 1.3775, + "step": 6912 + }, + { + "epoch": 0.74, + "grad_norm": 0.07972099827853533, + "learning_rate": 0.00016325815297645873, + "loss": 1.5274, + "step": 6913 + }, + { + "epoch": 0.74, + "grad_norm": 0.08052821554655319, + "learning_rate": 0.00016312947289464518, + "loss": 1.5595, + "step": 6914 + }, + { + "epoch": 0.74, + "grad_norm": 0.08023420829022035, + "learning_rate": 0.0001630008336595446, + "loss": 1.3303, + "step": 6915 + }, + { + "epoch": 0.74, + "grad_norm": 0.09799506539759276, + "learning_rate": 0.00016287223528675476, + "loss": 1.2331, + "step": 6916 + }, + { + "epoch": 0.74, + "grad_norm": 0.08491064095612154, + "learning_rate": 0.00016274367779186844, + "loss": 1.3602, + "step": 6917 + }, + { + "epoch": 0.74, + "grad_norm": 0.08807116482937358, + "learning_rate": 0.00016261516119047393, + "loss": 1.4781, + "step": 6918 + }, + { + "epoch": 0.74, + "grad_norm": 0.10870127860147914, + "learning_rate": 0.00016248668549815443, + "loss": 1.5202, + "step": 6919 + }, + { + "epoch": 0.74, + "grad_norm": 0.10562154840968066, + "learning_rate": 0.0001623582507304877, + "loss": 1.5252, + "step": 6920 + }, + { + "epoch": 0.74, + "grad_norm": 0.08298775907519637, + "learning_rate": 0.0001622298569030473, + "loss": 1.4743, + "step": 6921 + }, + { + "epoch": 0.74, + "grad_norm": 0.08203093210794524, + "learning_rate": 0.000162101504031401, + "loss": 1.4706, + "step": 6922 + }, + { + "epoch": 0.74, + "grad_norm": 0.08380641907542187, + "learning_rate": 0.00016197319213111233, + "loss": 1.335, + "step": 6923 + }, + { + "epoch": 0.74, + "grad_norm": 0.08283572450475699, + "learning_rate": 0.0001618449212177396, + "loss": 1.3608, + "step": 6924 + }, + { + "epoch": 0.74, + "grad_norm": 0.09123974004311679, + "learning_rate": 0.00016171669130683592, + "loss": 1.305, + "step": 6925 + }, + { + "epoch": 0.74, + "grad_norm": 0.09077853862321274, + "learning_rate": 0.00016158850241394958, + "loss": 1.4741, + "step": 6926 + }, + { + "epoch": 0.74, + "grad_norm": 0.08266347964611022, + "learning_rate": 0.00016146035455462393, + "loss": 1.4007, + "step": 6927 + }, + { + "epoch": 0.74, + "grad_norm": 0.07728062995794181, + "learning_rate": 0.0001613322477443976, + "loss": 1.3946, + "step": 6928 + }, + { + "epoch": 0.74, + "grad_norm": 0.09516214966689458, + "learning_rate": 0.00016120418199880367, + "loss": 1.4611, + "step": 6929 + }, + { + "epoch": 0.74, + "grad_norm": 0.08415646166248951, + "learning_rate": 0.0001610761573333706, + "loss": 1.3602, + "step": 6930 + }, + { + "epoch": 0.75, + "grad_norm": 0.08683899922320816, + "learning_rate": 0.00016094817376362215, + "loss": 1.4251, + "step": 6931 + }, + { + "epoch": 0.75, + "grad_norm": 0.08047309008486432, + "learning_rate": 0.00016082023130507627, + "loss": 1.3802, + "step": 6932 + }, + { + "epoch": 0.75, + "grad_norm": 0.08534330070542163, + "learning_rate": 0.0001606923299732468, + "loss": 1.347, + "step": 6933 + }, + { + "epoch": 0.75, + "grad_norm": 0.08466164032891992, + "learning_rate": 0.00016056446978364214, + "loss": 1.3549, + "step": 6934 + }, + { + "epoch": 0.75, + "grad_norm": 0.10321861631811807, + "learning_rate": 0.00016043665075176562, + "loss": 1.4485, + "step": 6935 + }, + { + "epoch": 0.75, + "grad_norm": 0.0927153279870653, + "learning_rate": 0.000160308872893116, + "loss": 1.3688, + "step": 6936 + }, + { + "epoch": 0.75, + "grad_norm": 0.09696757316712741, + "learning_rate": 0.00016018113622318664, + "loss": 1.3439, + "step": 6937 + }, + { + "epoch": 0.75, + "grad_norm": 0.07852402945748814, + "learning_rate": 0.00016005344075746585, + "loss": 1.4648, + "step": 6938 + }, + { + "epoch": 0.75, + "grad_norm": 0.07922286942853338, + "learning_rate": 0.0001599257865114374, + "loss": 1.233, + "step": 6939 + }, + { + "epoch": 0.75, + "grad_norm": 0.0913938351396083, + "learning_rate": 0.0001597981735005799, + "loss": 1.4028, + "step": 6940 + }, + { + "epoch": 0.75, + "grad_norm": 0.08728248486685027, + "learning_rate": 0.0001596706017403665, + "loss": 1.3228, + "step": 6941 + }, + { + "epoch": 0.75, + "grad_norm": 0.08034521864864075, + "learning_rate": 0.000159543071246266, + "loss": 1.3307, + "step": 6942 + }, + { + "epoch": 0.75, + "grad_norm": 0.08437406293251533, + "learning_rate": 0.00015941558203374197, + "loss": 1.3317, + "step": 6943 + }, + { + "epoch": 0.75, + "grad_norm": 0.07271689555275264, + "learning_rate": 0.00015928813411825266, + "loss": 1.388, + "step": 6944 + }, + { + "epoch": 0.75, + "grad_norm": 0.09175529197602372, + "learning_rate": 0.00015916072751525167, + "loss": 1.5004, + "step": 6945 + }, + { + "epoch": 0.75, + "grad_norm": 0.08084840838382006, + "learning_rate": 0.0001590333622401877, + "loss": 1.3225, + "step": 6946 + }, + { + "epoch": 0.75, + "grad_norm": 0.0839105076082594, + "learning_rate": 0.00015890603830850402, + "loss": 1.4301, + "step": 6947 + }, + { + "epoch": 0.75, + "grad_norm": 0.10145040433206401, + "learning_rate": 0.000158778755735639, + "loss": 1.4455, + "step": 6948 + }, + { + "epoch": 0.75, + "grad_norm": 0.07785525394316473, + "learning_rate": 0.0001586515145370262, + "loss": 1.3215, + "step": 6949 + }, + { + "epoch": 0.75, + "grad_norm": 0.07590586417847282, + "learning_rate": 0.00015852431472809426, + "loss": 1.4979, + "step": 6950 + }, + { + "epoch": 0.75, + "grad_norm": 0.07980851983434818, + "learning_rate": 0.0001583971563242662, + "loss": 1.3844, + "step": 6951 + }, + { + "epoch": 0.75, + "grad_norm": 0.07922936787588304, + "learning_rate": 0.0001582700393409608, + "loss": 1.4856, + "step": 6952 + }, + { + "epoch": 0.75, + "grad_norm": 0.08281630274525655, + "learning_rate": 0.00015814296379359106, + "loss": 1.3173, + "step": 6953 + }, + { + "epoch": 0.75, + "grad_norm": 0.08092482775318731, + "learning_rate": 0.00015801592969756555, + "loss": 1.3588, + "step": 6954 + }, + { + "epoch": 0.75, + "grad_norm": 0.07710543321208346, + "learning_rate": 0.00015788893706828773, + "loss": 1.4724, + "step": 6955 + }, + { + "epoch": 0.75, + "grad_norm": 0.07537722490115532, + "learning_rate": 0.00015776198592115553, + "loss": 1.4065, + "step": 6956 + }, + { + "epoch": 0.75, + "grad_norm": 0.07354587379308565, + "learning_rate": 0.00015763507627156265, + "loss": 1.4229, + "step": 6957 + }, + { + "epoch": 0.75, + "grad_norm": 0.08239350486018063, + "learning_rate": 0.00015750820813489685, + "loss": 1.3505, + "step": 6958 + }, + { + "epoch": 0.75, + "grad_norm": 0.08386481585345223, + "learning_rate": 0.00015738138152654175, + "loss": 1.3849, + "step": 6959 + }, + { + "epoch": 0.75, + "grad_norm": 0.09620737533851652, + "learning_rate": 0.00015725459646187518, + "loss": 1.3966, + "step": 6960 + }, + { + "epoch": 0.75, + "grad_norm": 0.08601988560556079, + "learning_rate": 0.00015712785295627035, + "loss": 1.4484, + "step": 6961 + }, + { + "epoch": 0.75, + "grad_norm": 0.07897105164507949, + "learning_rate": 0.0001570011510250956, + "loss": 1.4333, + "step": 6962 + }, + { + "epoch": 0.75, + "grad_norm": 0.07358390406887073, + "learning_rate": 0.00015687449068371367, + "loss": 1.429, + "step": 6963 + }, + { + "epoch": 0.75, + "grad_norm": 0.0924152537075702, + "learning_rate": 0.00015674787194748264, + "loss": 1.3773, + "step": 6964 + }, + { + "epoch": 0.75, + "grad_norm": 0.08221188593201989, + "learning_rate": 0.00015662129483175568, + "loss": 1.3499, + "step": 6965 + }, + { + "epoch": 0.75, + "grad_norm": 0.08749139899956103, + "learning_rate": 0.00015649475935188033, + "loss": 1.4698, + "step": 6966 + }, + { + "epoch": 0.75, + "grad_norm": 0.08393889404422417, + "learning_rate": 0.00015636826552319972, + "loss": 1.4097, + "step": 6967 + }, + { + "epoch": 0.75, + "grad_norm": 0.07418914646855072, + "learning_rate": 0.00015624181336105187, + "loss": 1.2507, + "step": 6968 + }, + { + "epoch": 0.75, + "grad_norm": 0.08998017862926071, + "learning_rate": 0.0001561154028807689, + "loss": 1.4847, + "step": 6969 + }, + { + "epoch": 0.75, + "grad_norm": 0.07646907923162406, + "learning_rate": 0.00015598903409767896, + "loss": 1.3316, + "step": 6970 + }, + { + "epoch": 0.75, + "grad_norm": 0.10029053480803479, + "learning_rate": 0.00015586270702710474, + "loss": 1.5359, + "step": 6971 + }, + { + "epoch": 0.75, + "grad_norm": 0.08308434884812765, + "learning_rate": 0.00015573642168436358, + "loss": 1.504, + "step": 6972 + }, + { + "epoch": 0.75, + "grad_norm": 0.10198312095249795, + "learning_rate": 0.00015561017808476813, + "loss": 1.4175, + "step": 6973 + }, + { + "epoch": 0.75, + "grad_norm": 0.0819281330153054, + "learning_rate": 0.00015548397624362605, + "loss": 1.371, + "step": 6974 + }, + { + "epoch": 0.75, + "grad_norm": 0.08491787970373423, + "learning_rate": 0.0001553578161762394, + "loss": 1.4177, + "step": 6975 + }, + { + "epoch": 0.75, + "grad_norm": 0.08401926419336586, + "learning_rate": 0.00015523169789790576, + "loss": 1.3111, + "step": 6976 + }, + { + "epoch": 0.75, + "grad_norm": 0.08999804666215713, + "learning_rate": 0.00015510562142391742, + "loss": 1.3743, + "step": 6977 + }, + { + "epoch": 0.75, + "grad_norm": 0.08523286268070919, + "learning_rate": 0.0001549795867695616, + "loss": 1.4312, + "step": 6978 + }, + { + "epoch": 0.75, + "grad_norm": 0.08185701371966983, + "learning_rate": 0.00015485359395012011, + "loss": 1.3798, + "step": 6979 + }, + { + "epoch": 0.75, + "grad_norm": 0.0701710856775418, + "learning_rate": 0.00015472764298087027, + "loss": 1.4199, + "step": 6980 + }, + { + "epoch": 0.75, + "grad_norm": 0.08314221104473174, + "learning_rate": 0.00015460173387708427, + "loss": 1.3538, + "step": 6981 + }, + { + "epoch": 0.75, + "grad_norm": 0.08420556211767127, + "learning_rate": 0.00015447586665402857, + "loss": 1.391, + "step": 6982 + }, + { + "epoch": 0.75, + "grad_norm": 0.07528567612802567, + "learning_rate": 0.00015435004132696546, + "loss": 1.4486, + "step": 6983 + }, + { + "epoch": 0.75, + "grad_norm": 0.09387052135404927, + "learning_rate": 0.0001542242579111513, + "loss": 1.3717, + "step": 6984 + }, + { + "epoch": 0.75, + "grad_norm": 0.07264079271114803, + "learning_rate": 0.0001540985164218379, + "loss": 1.4776, + "step": 6985 + }, + { + "epoch": 0.75, + "grad_norm": 0.0687592679653029, + "learning_rate": 0.0001539728168742721, + "loss": 1.4902, + "step": 6986 + }, + { + "epoch": 0.75, + "grad_norm": 0.08720354161857613, + "learning_rate": 0.00015384715928369502, + "loss": 1.45, + "step": 6987 + }, + { + "epoch": 0.75, + "grad_norm": 0.08356950973283157, + "learning_rate": 0.00015372154366534324, + "loss": 1.4589, + "step": 6988 + }, + { + "epoch": 0.75, + "grad_norm": 0.08640405818330486, + "learning_rate": 0.00015359597003444824, + "loss": 1.4614, + "step": 6989 + }, + { + "epoch": 0.75, + "grad_norm": 0.07591193807593741, + "learning_rate": 0.00015347043840623615, + "loss": 1.3445, + "step": 6990 + }, + { + "epoch": 0.75, + "grad_norm": 0.07509872422742482, + "learning_rate": 0.00015334494879592787, + "loss": 1.5149, + "step": 6991 + }, + { + "epoch": 0.75, + "grad_norm": 0.08250599953346692, + "learning_rate": 0.00015321950121873967, + "loss": 1.3254, + "step": 6992 + }, + { + "epoch": 0.75, + "grad_norm": 0.07975103285921857, + "learning_rate": 0.00015309409568988263, + "loss": 1.3806, + "step": 6993 + }, + { + "epoch": 0.75, + "grad_norm": 0.08522385169066489, + "learning_rate": 0.0001529687322245623, + "loss": 1.3766, + "step": 6994 + }, + { + "epoch": 0.75, + "grad_norm": 0.08078623532738036, + "learning_rate": 0.0001528434108379796, + "loss": 1.3841, + "step": 6995 + }, + { + "epoch": 0.75, + "grad_norm": 0.09287538197642288, + "learning_rate": 0.00015271813154533033, + "loss": 1.397, + "step": 6996 + }, + { + "epoch": 0.75, + "grad_norm": 0.07227532782369564, + "learning_rate": 0.00015259289436180467, + "loss": 1.3578, + "step": 6997 + }, + { + "epoch": 0.75, + "grad_norm": 0.08275454676238209, + "learning_rate": 0.0001524676993025883, + "loss": 1.4225, + "step": 6998 + }, + { + "epoch": 0.75, + "grad_norm": 0.0787200646752553, + "learning_rate": 0.00015234254638286183, + "loss": 1.433, + "step": 6999 + }, + { + "epoch": 0.75, + "grad_norm": 0.08559670118586739, + "learning_rate": 0.00015221743561779987, + "loss": 1.4289, + "step": 7000 + }, + { + "epoch": 0.75, + "grad_norm": 0.08685366803895994, + "learning_rate": 0.00015209236702257278, + "loss": 1.4789, + "step": 7001 + }, + { + "epoch": 0.75, + "grad_norm": 0.08398812238516672, + "learning_rate": 0.0001519673406123458, + "loss": 1.3169, + "step": 7002 + }, + { + "epoch": 0.75, + "grad_norm": 0.07951981075675485, + "learning_rate": 0.00015184235640227845, + "loss": 1.46, + "step": 7003 + }, + { + "epoch": 0.75, + "grad_norm": 0.07797865721296189, + "learning_rate": 0.00015171741440752568, + "loss": 1.5121, + "step": 7004 + }, + { + "epoch": 0.75, + "grad_norm": 0.08033960308925113, + "learning_rate": 0.00015159251464323732, + "loss": 1.4325, + "step": 7005 + }, + { + "epoch": 0.75, + "grad_norm": 0.07822679950394763, + "learning_rate": 0.00015146765712455745, + "loss": 1.4887, + "step": 7006 + }, + { + "epoch": 0.75, + "grad_norm": 0.07221658273320822, + "learning_rate": 0.00015134284186662584, + "loss": 1.5121, + "step": 7007 + }, + { + "epoch": 0.75, + "grad_norm": 0.0851519454515554, + "learning_rate": 0.00015121806888457673, + "loss": 1.4916, + "step": 7008 + }, + { + "epoch": 0.75, + "grad_norm": 0.07553393733953745, + "learning_rate": 0.00015109333819353905, + "loss": 1.3452, + "step": 7009 + }, + { + "epoch": 0.75, + "grad_norm": 0.08710936906626386, + "learning_rate": 0.00015096864980863718, + "loss": 1.3347, + "step": 7010 + }, + { + "epoch": 0.75, + "grad_norm": 0.08725477793257659, + "learning_rate": 0.00015084400374498964, + "loss": 1.412, + "step": 7011 + }, + { + "epoch": 0.75, + "grad_norm": 0.08923549232165805, + "learning_rate": 0.0001507194000177105, + "loss": 1.2951, + "step": 7012 + }, + { + "epoch": 0.75, + "grad_norm": 0.0796244829660177, + "learning_rate": 0.00015059483864190816, + "loss": 1.2852, + "step": 7013 + }, + { + "epoch": 0.75, + "grad_norm": 0.08487704673509294, + "learning_rate": 0.00015047031963268614, + "loss": 1.4351, + "step": 7014 + }, + { + "epoch": 0.75, + "grad_norm": 0.08299755706265552, + "learning_rate": 0.0001503458430051431, + "loss": 1.2572, + "step": 7015 + }, + { + "epoch": 0.75, + "grad_norm": 0.09293429207949322, + "learning_rate": 0.00015022140877437185, + "loss": 1.4802, + "step": 7016 + }, + { + "epoch": 0.75, + "grad_norm": 0.07406152365690835, + "learning_rate": 0.0001500970169554608, + "loss": 1.3775, + "step": 7017 + }, + { + "epoch": 0.75, + "grad_norm": 0.08114305241044793, + "learning_rate": 0.00014997266756349264, + "loss": 1.3368, + "step": 7018 + }, + { + "epoch": 0.75, + "grad_norm": 0.07629530714726956, + "learning_rate": 0.00014984836061354524, + "loss": 1.4962, + "step": 7019 + }, + { + "epoch": 0.75, + "grad_norm": 0.08980752121203324, + "learning_rate": 0.00014972409612069138, + "loss": 1.3905, + "step": 7020 + }, + { + "epoch": 0.75, + "grad_norm": 0.08575581179496382, + "learning_rate": 0.00014959987409999853, + "loss": 1.3924, + "step": 7021 + }, + { + "epoch": 0.75, + "grad_norm": 0.08198613603708774, + "learning_rate": 0.00014947569456652876, + "loss": 1.339, + "step": 7022 + }, + { + "epoch": 0.75, + "grad_norm": 0.09454705124999215, + "learning_rate": 0.00014935155753533947, + "loss": 1.4268, + "step": 7023 + }, + { + "epoch": 0.76, + "grad_norm": 0.09143985910869612, + "learning_rate": 0.00014922746302148282, + "loss": 1.5904, + "step": 7024 + }, + { + "epoch": 0.76, + "grad_norm": 0.08156496168263828, + "learning_rate": 0.00014910341104000546, + "loss": 1.4352, + "step": 7025 + }, + { + "epoch": 0.76, + "grad_norm": 0.08359413915614876, + "learning_rate": 0.00014897940160594925, + "loss": 1.4386, + "step": 7026 + }, + { + "epoch": 0.76, + "grad_norm": 0.08953163494531653, + "learning_rate": 0.00014885543473435088, + "loss": 1.4843, + "step": 7027 + }, + { + "epoch": 0.76, + "grad_norm": 0.07495695660289363, + "learning_rate": 0.00014873151044024146, + "loss": 1.2414, + "step": 7028 + }, + { + "epoch": 0.76, + "grad_norm": 0.10177289279918775, + "learning_rate": 0.00014860762873864742, + "loss": 1.4096, + "step": 7029 + }, + { + "epoch": 0.76, + "grad_norm": 0.07368892484521711, + "learning_rate": 0.00014848378964458999, + "loss": 1.3566, + "step": 7030 + }, + { + "epoch": 0.76, + "grad_norm": 0.08689439609994665, + "learning_rate": 0.0001483599931730849, + "loss": 1.4542, + "step": 7031 + }, + { + "epoch": 0.76, + "grad_norm": 0.08443024292684892, + "learning_rate": 0.00014823623933914276, + "loss": 1.3873, + "step": 7032 + }, + { + "epoch": 0.76, + "grad_norm": 0.08822916976981438, + "learning_rate": 0.00014811252815776955, + "loss": 1.5633, + "step": 7033 + }, + { + "epoch": 0.76, + "grad_norm": 0.0887025471292702, + "learning_rate": 0.0001479888596439652, + "loss": 1.4117, + "step": 7034 + }, + { + "epoch": 0.76, + "grad_norm": 0.08246689872933267, + "learning_rate": 0.0001478652338127252, + "loss": 1.4453, + "step": 7035 + }, + { + "epoch": 0.76, + "grad_norm": 0.07093731196022528, + "learning_rate": 0.00014774165067903982, + "loss": 1.3772, + "step": 7036 + }, + { + "epoch": 0.76, + "grad_norm": 0.08363424689449472, + "learning_rate": 0.00014761811025789352, + "loss": 1.327, + "step": 7037 + }, + { + "epoch": 0.76, + "grad_norm": 0.07842351016801305, + "learning_rate": 0.00014749461256426615, + "loss": 1.3652, + "step": 7038 + }, + { + "epoch": 0.76, + "grad_norm": 0.0824205239126465, + "learning_rate": 0.00014737115761313246, + "loss": 1.4238, + "step": 7039 + }, + { + "epoch": 0.76, + "grad_norm": 0.08687263476544752, + "learning_rate": 0.00014724774541946146, + "loss": 1.3724, + "step": 7040 + }, + { + "epoch": 0.76, + "grad_norm": 0.0920673100075393, + "learning_rate": 0.00014712437599821742, + "loss": 1.2506, + "step": 7041 + }, + { + "epoch": 0.76, + "grad_norm": 0.09222049188447305, + "learning_rate": 0.00014700104936435953, + "loss": 1.4085, + "step": 7042 + }, + { + "epoch": 0.76, + "grad_norm": 0.07736454871275107, + "learning_rate": 0.00014687776553284137, + "loss": 1.4622, + "step": 7043 + }, + { + "epoch": 0.76, + "grad_norm": 0.08255409541057177, + "learning_rate": 0.00014675452451861138, + "loss": 1.5405, + "step": 7044 + }, + { + "epoch": 0.76, + "grad_norm": 0.08612416866292971, + "learning_rate": 0.00014663132633661313, + "loss": 1.417, + "step": 7045 + }, + { + "epoch": 0.76, + "grad_norm": 0.09432241903711268, + "learning_rate": 0.00014650817100178492, + "loss": 1.4246, + "step": 7046 + }, + { + "epoch": 0.76, + "grad_norm": 0.11601460726127812, + "learning_rate": 0.00014638505852905954, + "loss": 1.3404, + "step": 7047 + }, + { + "epoch": 0.76, + "grad_norm": 0.0966656462751512, + "learning_rate": 0.00014626198893336506, + "loss": 1.461, + "step": 7048 + }, + { + "epoch": 0.76, + "grad_norm": 0.07949808600283965, + "learning_rate": 0.00014613896222962375, + "loss": 1.3932, + "step": 7049 + }, + { + "epoch": 0.76, + "grad_norm": 0.09250961738605959, + "learning_rate": 0.00014601597843275327, + "loss": 1.4166, + "step": 7050 + }, + { + "epoch": 0.76, + "grad_norm": 0.07636058338566189, + "learning_rate": 0.00014589303755766587, + "loss": 1.4146, + "step": 7051 + }, + { + "epoch": 0.76, + "grad_norm": 0.08105055959370543, + "learning_rate": 0.0001457701396192685, + "loss": 1.3308, + "step": 7052 + }, + { + "epoch": 0.76, + "grad_norm": 0.08548635220894996, + "learning_rate": 0.00014564728463246275, + "loss": 1.3975, + "step": 7053 + }, + { + "epoch": 0.76, + "grad_norm": 0.07965255750151616, + "learning_rate": 0.00014552447261214534, + "loss": 1.4911, + "step": 7054 + }, + { + "epoch": 0.76, + "grad_norm": 0.08537985574986948, + "learning_rate": 0.00014540170357320786, + "loss": 1.4963, + "step": 7055 + }, + { + "epoch": 0.76, + "grad_norm": 0.09078007237282826, + "learning_rate": 0.0001452789775305362, + "loss": 1.4235, + "step": 7056 + }, + { + "epoch": 0.76, + "grad_norm": 0.08402632342143757, + "learning_rate": 0.0001451562944990114, + "loss": 1.4207, + "step": 7057 + }, + { + "epoch": 0.76, + "grad_norm": 0.08301706262546693, + "learning_rate": 0.00014503365449350936, + "loss": 1.2685, + "step": 7058 + }, + { + "epoch": 0.76, + "grad_norm": 0.08203445439664891, + "learning_rate": 0.00014491105752890033, + "loss": 1.3933, + "step": 7059 + }, + { + "epoch": 0.76, + "grad_norm": 0.09063623215892826, + "learning_rate": 0.00014478850362004974, + "loss": 1.486, + "step": 7060 + }, + { + "epoch": 0.76, + "grad_norm": 0.09475208951182909, + "learning_rate": 0.00014466599278181787, + "loss": 1.4044, + "step": 7061 + }, + { + "epoch": 0.76, + "grad_norm": 0.09177737333236773, + "learning_rate": 0.0001445435250290592, + "loss": 1.4223, + "step": 7062 + }, + { + "epoch": 0.76, + "grad_norm": 0.09652109094250604, + "learning_rate": 0.00014442110037662375, + "loss": 1.4139, + "step": 7063 + }, + { + "epoch": 0.76, + "grad_norm": 0.09218566946256986, + "learning_rate": 0.00014429871883935575, + "loss": 1.2943, + "step": 7064 + }, + { + "epoch": 0.76, + "grad_norm": 0.08224360335861525, + "learning_rate": 0.0001441763804320942, + "loss": 1.4995, + "step": 7065 + }, + { + "epoch": 0.76, + "grad_norm": 0.08621259301000005, + "learning_rate": 0.00014405408516967328, + "loss": 1.381, + "step": 7066 + }, + { + "epoch": 0.76, + "grad_norm": 0.08733699795943922, + "learning_rate": 0.00014393183306692176, + "loss": 1.4354, + "step": 7067 + }, + { + "epoch": 0.76, + "grad_norm": 0.0859822893466842, + "learning_rate": 0.00014380962413866288, + "loss": 1.4137, + "step": 7068 + }, + { + "epoch": 0.76, + "grad_norm": 0.08013451149466999, + "learning_rate": 0.00014368745839971509, + "loss": 1.2975, + "step": 7069 + }, + { + "epoch": 0.76, + "grad_norm": 0.07670849214756494, + "learning_rate": 0.00014356533586489152, + "loss": 1.4086, + "step": 7070 + }, + { + "epoch": 0.76, + "grad_norm": 0.08379391904018833, + "learning_rate": 0.00014344325654899964, + "loss": 1.5488, + "step": 7071 + }, + { + "epoch": 0.76, + "grad_norm": 0.08446287225279143, + "learning_rate": 0.0001433212204668421, + "loss": 1.3633, + "step": 7072 + }, + { + "epoch": 0.76, + "grad_norm": 0.08655495607730566, + "learning_rate": 0.00014319922763321642, + "loss": 1.4289, + "step": 7073 + }, + { + "epoch": 0.76, + "grad_norm": 0.09076922400021534, + "learning_rate": 0.0001430772780629145, + "loss": 1.3839, + "step": 7074 + }, + { + "epoch": 0.76, + "grad_norm": 0.07944001534849933, + "learning_rate": 0.00014295537177072288, + "loss": 1.3809, + "step": 7075 + }, + { + "epoch": 0.76, + "grad_norm": 0.09329551688513704, + "learning_rate": 0.00014283350877142343, + "loss": 1.3861, + "step": 7076 + }, + { + "epoch": 0.76, + "grad_norm": 0.07042640493257381, + "learning_rate": 0.00014271168907979248, + "loss": 1.3829, + "step": 7077 + }, + { + "epoch": 0.76, + "grad_norm": 0.07897438905434498, + "learning_rate": 0.00014258991271060085, + "loss": 1.4558, + "step": 7078 + }, + { + "epoch": 0.76, + "grad_norm": 0.09126203160664402, + "learning_rate": 0.00014246817967861463, + "loss": 1.3902, + "step": 7079 + }, + { + "epoch": 0.76, + "grad_norm": 0.09630423913710359, + "learning_rate": 0.00014234648999859412, + "loss": 1.3629, + "step": 7080 + }, + { + "epoch": 0.76, + "grad_norm": 0.08349720636262284, + "learning_rate": 0.0001422248436852947, + "loss": 1.3694, + "step": 7081 + }, + { + "epoch": 0.76, + "grad_norm": 0.08293949990701834, + "learning_rate": 0.00014210324075346654, + "loss": 1.3749, + "step": 7082 + }, + { + "epoch": 0.76, + "grad_norm": 0.07570143074146496, + "learning_rate": 0.00014198168121785416, + "loss": 1.4489, + "step": 7083 + }, + { + "epoch": 0.76, + "grad_norm": 0.08376068026895099, + "learning_rate": 0.0001418601650931974, + "loss": 1.3698, + "step": 7084 + }, + { + "epoch": 0.76, + "grad_norm": 0.08051201255768421, + "learning_rate": 0.0001417386923942301, + "loss": 1.3801, + "step": 7085 + }, + { + "epoch": 0.76, + "grad_norm": 0.08361829214712761, + "learning_rate": 0.00014161726313568162, + "loss": 1.3365, + "step": 7086 + }, + { + "epoch": 0.76, + "grad_norm": 0.07539311701974652, + "learning_rate": 0.00014149587733227543, + "loss": 1.5533, + "step": 7087 + }, + { + "epoch": 0.76, + "grad_norm": 0.07636646791153252, + "learning_rate": 0.00014137453499873, + "loss": 1.3016, + "step": 7088 + }, + { + "epoch": 0.76, + "grad_norm": 0.07726830815322598, + "learning_rate": 0.00014125323614975878, + "loss": 1.4055, + "step": 7089 + }, + { + "epoch": 0.76, + "grad_norm": 0.07852283896570228, + "learning_rate": 0.00014113198080006927, + "loss": 1.3658, + "step": 7090 + }, + { + "epoch": 0.76, + "grad_norm": 0.08455046614695923, + "learning_rate": 0.00014101076896436428, + "loss": 1.4775, + "step": 7091 + }, + { + "epoch": 0.76, + "grad_norm": 0.0870258879015629, + "learning_rate": 0.00014088960065734136, + "loss": 1.4282, + "step": 7092 + }, + { + "epoch": 0.76, + "grad_norm": 0.08600593545883693, + "learning_rate": 0.00014076847589369223, + "loss": 1.4818, + "step": 7093 + }, + { + "epoch": 0.76, + "grad_norm": 0.0742503862653404, + "learning_rate": 0.00014064739468810388, + "loss": 1.4268, + "step": 7094 + }, + { + "epoch": 0.76, + "grad_norm": 0.08703341486513806, + "learning_rate": 0.00014052635705525814, + "loss": 1.3743, + "step": 7095 + }, + { + "epoch": 0.76, + "grad_norm": 0.0916770967788742, + "learning_rate": 0.00014040536300983052, + "loss": 1.4698, + "step": 7096 + }, + { + "epoch": 0.76, + "grad_norm": 0.0784670687340215, + "learning_rate": 0.00014028441256649238, + "loss": 1.4073, + "step": 7097 + }, + { + "epoch": 0.76, + "grad_norm": 0.08774542735944259, + "learning_rate": 0.00014016350573990948, + "loss": 1.4194, + "step": 7098 + }, + { + "epoch": 0.76, + "grad_norm": 0.09906949969586148, + "learning_rate": 0.0001400426425447419, + "loss": 1.3304, + "step": 7099 + }, + { + "epoch": 0.76, + "grad_norm": 0.09401233175045126, + "learning_rate": 0.00013992182299564493, + "loss": 1.4606, + "step": 7100 + }, + { + "epoch": 0.76, + "grad_norm": 0.07694095237545967, + "learning_rate": 0.00013980104710726844, + "loss": 1.4595, + "step": 7101 + }, + { + "epoch": 0.76, + "grad_norm": 0.1054928297352277, + "learning_rate": 0.00013968031489425658, + "loss": 1.3676, + "step": 7102 + }, + { + "epoch": 0.76, + "grad_norm": 0.08347109928024152, + "learning_rate": 0.0001395596263712488, + "loss": 1.3819, + "step": 7103 + }, + { + "epoch": 0.76, + "grad_norm": 0.07712119034782984, + "learning_rate": 0.00013943898155287904, + "loss": 1.3646, + "step": 7104 + }, + { + "epoch": 0.76, + "grad_norm": 0.08194542905625728, + "learning_rate": 0.00013931838045377586, + "loss": 1.4872, + "step": 7105 + }, + { + "epoch": 0.76, + "grad_norm": 0.08432223869123434, + "learning_rate": 0.00013919782308856232, + "loss": 1.4337, + "step": 7106 + }, + { + "epoch": 0.76, + "grad_norm": 0.09573328871170617, + "learning_rate": 0.00013907730947185665, + "loss": 1.4155, + "step": 7107 + }, + { + "epoch": 0.76, + "grad_norm": 0.08869671452675124, + "learning_rate": 0.00013895683961827167, + "loss": 1.3511, + "step": 7108 + }, + { + "epoch": 0.76, + "grad_norm": 0.07279819542175914, + "learning_rate": 0.00013883641354241438, + "loss": 1.4694, + "step": 7109 + }, + { + "epoch": 0.76, + "grad_norm": 0.07300922537855549, + "learning_rate": 0.00013871603125888704, + "loss": 1.3933, + "step": 7110 + }, + { + "epoch": 0.76, + "grad_norm": 0.0893723766297854, + "learning_rate": 0.00013859569278228668, + "loss": 1.2866, + "step": 7111 + }, + { + "epoch": 0.76, + "grad_norm": 0.07385807095451379, + "learning_rate": 0.00013847539812720435, + "loss": 1.3117, + "step": 7112 + }, + { + "epoch": 0.76, + "grad_norm": 0.08154523440885406, + "learning_rate": 0.00013835514730822646, + "loss": 1.4402, + "step": 7113 + }, + { + "epoch": 0.76, + "grad_norm": 0.07631693608060588, + "learning_rate": 0.00013823494033993362, + "loss": 1.2876, + "step": 7114 + }, + { + "epoch": 0.76, + "grad_norm": 0.07501813680429896, + "learning_rate": 0.00013811477723690147, + "loss": 1.5077, + "step": 7115 + }, + { + "epoch": 0.76, + "grad_norm": 0.09582028831990151, + "learning_rate": 0.0001379946580137003, + "loss": 1.1807, + "step": 7116 + }, + { + "epoch": 0.77, + "grad_norm": 0.08195412649055485, + "learning_rate": 0.0001378745826848949, + "loss": 1.3206, + "step": 7117 + }, + { + "epoch": 0.77, + "grad_norm": 0.08577541487377897, + "learning_rate": 0.00013775455126504465, + "loss": 1.4594, + "step": 7118 + }, + { + "epoch": 0.77, + "grad_norm": 0.08293162734543053, + "learning_rate": 0.00013763456376870387, + "loss": 1.3694, + "step": 7119 + }, + { + "epoch": 0.77, + "grad_norm": 0.08782077768957836, + "learning_rate": 0.00013751462021042166, + "loss": 1.5043, + "step": 7120 + }, + { + "epoch": 0.77, + "grad_norm": 0.07999103741806815, + "learning_rate": 0.0001373947206047413, + "loss": 1.292, + "step": 7121 + }, + { + "epoch": 0.77, + "grad_norm": 0.08470840165153588, + "learning_rate": 0.00013727486496620112, + "loss": 1.376, + "step": 7122 + }, + { + "epoch": 0.77, + "grad_norm": 0.08698455156845768, + "learning_rate": 0.00013715505330933427, + "loss": 1.4579, + "step": 7123 + }, + { + "epoch": 0.77, + "grad_norm": 0.08819929465794887, + "learning_rate": 0.00013703528564866792, + "loss": 1.3595, + "step": 7124 + }, + { + "epoch": 0.77, + "grad_norm": 0.0860304979498625, + "learning_rate": 0.0001369155619987245, + "loss": 1.3969, + "step": 7125 + }, + { + "epoch": 0.77, + "grad_norm": 0.08037149086224887, + "learning_rate": 0.0001367958823740213, + "loss": 1.5538, + "step": 7126 + }, + { + "epoch": 0.77, + "grad_norm": 0.07933503079166238, + "learning_rate": 0.0001366762467890692, + "loss": 1.3866, + "step": 7127 + }, + { + "epoch": 0.77, + "grad_norm": 0.0927014586146838, + "learning_rate": 0.00013655665525837474, + "loss": 1.2759, + "step": 7128 + }, + { + "epoch": 0.77, + "grad_norm": 0.08431397953477916, + "learning_rate": 0.00013643710779643892, + "loss": 1.4357, + "step": 7129 + }, + { + "epoch": 0.77, + "grad_norm": 0.09579874268870016, + "learning_rate": 0.00013631760441775703, + "loss": 1.4489, + "step": 7130 + }, + { + "epoch": 0.77, + "grad_norm": 0.08347123456391652, + "learning_rate": 0.00013619814513681945, + "loss": 1.3502, + "step": 7131 + }, + { + "epoch": 0.77, + "grad_norm": 0.08037094478894655, + "learning_rate": 0.0001360787299681111, + "loss": 1.4554, + "step": 7132 + }, + { + "epoch": 0.77, + "grad_norm": 0.08713009992952041, + "learning_rate": 0.00013595935892611122, + "loss": 1.3501, + "step": 7133 + }, + { + "epoch": 0.77, + "grad_norm": 0.079963043140441, + "learning_rate": 0.00013584003202529415, + "loss": 1.3216, + "step": 7134 + }, + { + "epoch": 0.77, + "grad_norm": 0.08654644726903905, + "learning_rate": 0.00013572074928012878, + "loss": 1.3006, + "step": 7135 + }, + { + "epoch": 0.77, + "grad_norm": 0.08684088272894674, + "learning_rate": 0.00013560151070507825, + "loss": 1.4253, + "step": 7136 + }, + { + "epoch": 0.77, + "grad_norm": 0.08305476767947234, + "learning_rate": 0.00013548231631460095, + "loss": 1.3423, + "step": 7137 + }, + { + "epoch": 0.77, + "grad_norm": 0.07840906222655904, + "learning_rate": 0.00013536316612314936, + "loss": 1.316, + "step": 7138 + }, + { + "epoch": 0.77, + "grad_norm": 0.08868368034508109, + "learning_rate": 0.00013524406014517115, + "loss": 1.4035, + "step": 7139 + }, + { + "epoch": 0.77, + "grad_norm": 0.0825493183209432, + "learning_rate": 0.00013512499839510794, + "loss": 1.4327, + "step": 7140 + }, + { + "epoch": 0.77, + "grad_norm": 0.07142852180236557, + "learning_rate": 0.00013500598088739664, + "loss": 1.2882, + "step": 7141 + }, + { + "epoch": 0.77, + "grad_norm": 0.07839573055539671, + "learning_rate": 0.00013488700763646862, + "loss": 1.3449, + "step": 7142 + }, + { + "epoch": 0.77, + "grad_norm": 0.0762490819632461, + "learning_rate": 0.0001347680786567495, + "loss": 1.446, + "step": 7143 + }, + { + "epoch": 0.77, + "grad_norm": 0.08253038492998804, + "learning_rate": 0.00013464919396266017, + "loss": 1.3892, + "step": 7144 + }, + { + "epoch": 0.77, + "grad_norm": 0.08080981648970774, + "learning_rate": 0.00013453035356861544, + "loss": 1.4137, + "step": 7145 + }, + { + "epoch": 0.77, + "grad_norm": 0.07895680489359269, + "learning_rate": 0.00013441155748902535, + "loss": 1.3648, + "step": 7146 + }, + { + "epoch": 0.77, + "grad_norm": 0.09131009564719973, + "learning_rate": 0.00013429280573829438, + "loss": 1.3353, + "step": 7147 + }, + { + "epoch": 0.77, + "grad_norm": 0.08148140496357519, + "learning_rate": 0.00013417409833082155, + "loss": 1.3986, + "step": 7148 + }, + { + "epoch": 0.77, + "grad_norm": 0.09545610989391617, + "learning_rate": 0.0001340554352810003, + "loss": 1.5625, + "step": 7149 + }, + { + "epoch": 0.77, + "grad_norm": 0.09783883668758428, + "learning_rate": 0.00013393681660321915, + "loss": 1.439, + "step": 7150 + }, + { + "epoch": 0.77, + "grad_norm": 0.07799550589752935, + "learning_rate": 0.00013381824231186113, + "loss": 1.4207, + "step": 7151 + }, + { + "epoch": 0.77, + "grad_norm": 0.0795862542261532, + "learning_rate": 0.00013369971242130352, + "loss": 1.404, + "step": 7152 + }, + { + "epoch": 0.77, + "grad_norm": 0.08548654694614052, + "learning_rate": 0.00013358122694591862, + "loss": 1.3907, + "step": 7153 + }, + { + "epoch": 0.77, + "grad_norm": 0.08134899653490377, + "learning_rate": 0.00013346278590007334, + "loss": 1.3212, + "step": 7154 + }, + { + "epoch": 0.77, + "grad_norm": 0.08104358016743386, + "learning_rate": 0.0001333443892981287, + "loss": 1.4092, + "step": 7155 + }, + { + "epoch": 0.77, + "grad_norm": 0.08339197096891865, + "learning_rate": 0.00013322603715444097, + "loss": 1.4359, + "step": 7156 + }, + { + "epoch": 0.77, + "grad_norm": 0.07981284471191327, + "learning_rate": 0.00013310772948336085, + "loss": 1.512, + "step": 7157 + }, + { + "epoch": 0.77, + "grad_norm": 0.07761084452308815, + "learning_rate": 0.00013298946629923335, + "loss": 1.3671, + "step": 7158 + }, + { + "epoch": 0.77, + "grad_norm": 0.0854348960861969, + "learning_rate": 0.00013287124761639823, + "loss": 1.494, + "step": 7159 + }, + { + "epoch": 0.77, + "grad_norm": 0.09161357129893245, + "learning_rate": 0.0001327530734491902, + "loss": 1.4476, + "step": 7160 + }, + { + "epoch": 0.77, + "grad_norm": 0.08312868545947096, + "learning_rate": 0.0001326349438119379, + "loss": 1.2806, + "step": 7161 + }, + { + "epoch": 0.77, + "grad_norm": 0.09452434467284239, + "learning_rate": 0.0001325168587189652, + "loss": 1.5183, + "step": 7162 + }, + { + "epoch": 0.77, + "grad_norm": 0.08587057449972695, + "learning_rate": 0.00013239881818459043, + "loss": 1.4184, + "step": 7163 + }, + { + "epoch": 0.77, + "grad_norm": 0.08793194779558704, + "learning_rate": 0.0001322808222231261, + "loss": 1.386, + "step": 7164 + }, + { + "epoch": 0.77, + "grad_norm": 0.08722167548904788, + "learning_rate": 0.00013216287084887984, + "loss": 1.3514, + "step": 7165 + }, + { + "epoch": 0.77, + "grad_norm": 0.08853779388704727, + "learning_rate": 0.00013204496407615373, + "loss": 1.4281, + "step": 7166 + }, + { + "epoch": 0.77, + "grad_norm": 0.09532120705919452, + "learning_rate": 0.00013192710191924412, + "loss": 1.4111, + "step": 7167 + }, + { + "epoch": 0.77, + "grad_norm": 0.09657694916039453, + "learning_rate": 0.00013180928439244233, + "loss": 1.3998, + "step": 7168 + }, + { + "epoch": 0.77, + "grad_norm": 0.07677970663766127, + "learning_rate": 0.00013169151151003433, + "loss": 1.2778, + "step": 7169 + }, + { + "epoch": 0.77, + "grad_norm": 0.09092227790364336, + "learning_rate": 0.00013157378328630025, + "loss": 1.4506, + "step": 7170 + }, + { + "epoch": 0.77, + "grad_norm": 0.09163414959090961, + "learning_rate": 0.00013145609973551502, + "loss": 1.4888, + "step": 7171 + }, + { + "epoch": 0.77, + "grad_norm": 0.089725966578204, + "learning_rate": 0.00013133846087194824, + "loss": 1.4996, + "step": 7172 + }, + { + "epoch": 0.77, + "grad_norm": 0.08845121892261157, + "learning_rate": 0.0001312208667098642, + "loss": 1.4485, + "step": 7173 + }, + { + "epoch": 0.77, + "grad_norm": 0.08463233673164806, + "learning_rate": 0.00013110331726352132, + "loss": 1.4001, + "step": 7174 + }, + { + "epoch": 0.77, + "grad_norm": 0.08616388370330381, + "learning_rate": 0.00013098581254717312, + "loss": 1.4804, + "step": 7175 + }, + { + "epoch": 0.77, + "grad_norm": 0.08580067427689526, + "learning_rate": 0.00013086835257506717, + "loss": 1.4569, + "step": 7176 + }, + { + "epoch": 0.77, + "grad_norm": 0.08164815485831177, + "learning_rate": 0.00013075093736144612, + "loss": 1.4586, + "step": 7177 + }, + { + "epoch": 0.77, + "grad_norm": 0.09324023988053676, + "learning_rate": 0.000130633566920547, + "loss": 1.4063, + "step": 7178 + }, + { + "epoch": 0.77, + "grad_norm": 0.08436554667047103, + "learning_rate": 0.00013051624126660132, + "loss": 1.4858, + "step": 7179 + }, + { + "epoch": 0.77, + "grad_norm": 0.08268273314661022, + "learning_rate": 0.00013039896041383504, + "loss": 1.4565, + "step": 7180 + }, + { + "epoch": 0.77, + "grad_norm": 0.08402105295671775, + "learning_rate": 0.000130281724376469, + "loss": 1.4243, + "step": 7181 + }, + { + "epoch": 0.77, + "grad_norm": 0.0714282284349483, + "learning_rate": 0.00013016453316871867, + "loss": 1.3911, + "step": 7182 + }, + { + "epoch": 0.77, + "grad_norm": 0.08425509199398158, + "learning_rate": 0.00013004738680479354, + "loss": 1.3734, + "step": 7183 + }, + { + "epoch": 0.77, + "grad_norm": 0.09272718405902941, + "learning_rate": 0.00012993028529889816, + "loss": 1.4259, + "step": 7184 + }, + { + "epoch": 0.77, + "grad_norm": 0.0790776375510767, + "learning_rate": 0.00012981322866523171, + "loss": 1.3094, + "step": 7185 + }, + { + "epoch": 0.77, + "grad_norm": 0.07733944744848811, + "learning_rate": 0.00012969621691798734, + "loss": 1.3611, + "step": 7186 + }, + { + "epoch": 0.77, + "grad_norm": 0.07745412806827585, + "learning_rate": 0.0001295792500713533, + "loss": 1.2361, + "step": 7187 + }, + { + "epoch": 0.77, + "grad_norm": 0.08395765539333197, + "learning_rate": 0.00012946232813951236, + "loss": 1.4544, + "step": 7188 + }, + { + "epoch": 0.77, + "grad_norm": 0.08265912082983201, + "learning_rate": 0.00012934545113664142, + "loss": 1.4492, + "step": 7189 + }, + { + "epoch": 0.77, + "grad_norm": 0.08664702529416787, + "learning_rate": 0.00012922861907691257, + "loss": 1.3912, + "step": 7190 + }, + { + "epoch": 0.77, + "grad_norm": 0.08828986408607989, + "learning_rate": 0.00012911183197449183, + "loss": 1.3833, + "step": 7191 + }, + { + "epoch": 0.77, + "grad_norm": 0.0783447810072099, + "learning_rate": 0.00012899508984354002, + "loss": 1.5175, + "step": 7192 + }, + { + "epoch": 0.77, + "grad_norm": 0.09375213580877549, + "learning_rate": 0.00012887839269821262, + "loss": 1.3619, + "step": 7193 + }, + { + "epoch": 0.77, + "grad_norm": 0.09220600644539853, + "learning_rate": 0.00012876174055265966, + "loss": 1.4331, + "step": 7194 + }, + { + "epoch": 0.77, + "grad_norm": 0.07795700921437147, + "learning_rate": 0.00012864513342102535, + "loss": 1.4297, + "step": 7195 + }, + { + "epoch": 0.77, + "grad_norm": 0.08299473491200346, + "learning_rate": 0.00012852857131744888, + "loss": 1.469, + "step": 7196 + }, + { + "epoch": 0.77, + "grad_norm": 0.09896839999352933, + "learning_rate": 0.00012841205425606395, + "loss": 1.3864, + "step": 7197 + }, + { + "epoch": 0.77, + "grad_norm": 0.0817748274050442, + "learning_rate": 0.00012829558225099834, + "loss": 1.4143, + "step": 7198 + }, + { + "epoch": 0.77, + "grad_norm": 0.12473542368934352, + "learning_rate": 0.0001281791553163749, + "loss": 1.3604, + "step": 7199 + }, + { + "epoch": 0.77, + "grad_norm": 0.09029268877199315, + "learning_rate": 0.00012806277346631085, + "loss": 1.4519, + "step": 7200 + }, + { + "epoch": 0.77, + "grad_norm": 0.09018864980005668, + "learning_rate": 0.0001279464367149178, + "loss": 1.3292, + "step": 7201 + }, + { + "epoch": 0.77, + "grad_norm": 0.07386791132315604, + "learning_rate": 0.00012783014507630175, + "loss": 1.5217, + "step": 7202 + }, + { + "epoch": 0.77, + "grad_norm": 0.08689566481818276, + "learning_rate": 0.00012771389856456371, + "loss": 1.5659, + "step": 7203 + }, + { + "epoch": 0.77, + "grad_norm": 0.07796769099863722, + "learning_rate": 0.0001275976971937991, + "loss": 1.3743, + "step": 7204 + }, + { + "epoch": 0.77, + "grad_norm": 0.08347089844634027, + "learning_rate": 0.00012748154097809745, + "loss": 1.403, + "step": 7205 + }, + { + "epoch": 0.77, + "grad_norm": 0.08194139361623523, + "learning_rate": 0.00012736542993154316, + "loss": 1.5831, + "step": 7206 + }, + { + "epoch": 0.77, + "grad_norm": 0.08814386072544067, + "learning_rate": 0.00012724936406821537, + "loss": 1.4637, + "step": 7207 + }, + { + "epoch": 0.77, + "grad_norm": 0.09305664303495668, + "learning_rate": 0.000127133343402187, + "loss": 1.3209, + "step": 7208 + }, + { + "epoch": 0.77, + "grad_norm": 0.09063551416325358, + "learning_rate": 0.00012701736794752644, + "loss": 1.3946, + "step": 7209 + }, + { + "epoch": 0.78, + "grad_norm": 0.09015549472437932, + "learning_rate": 0.0001269014377182957, + "loss": 1.3488, + "step": 7210 + }, + { + "epoch": 0.78, + "grad_norm": 0.0804346799482095, + "learning_rate": 0.00012678555272855208, + "loss": 1.4834, + "step": 7211 + }, + { + "epoch": 0.78, + "grad_norm": 0.07872396579005872, + "learning_rate": 0.00012666971299234668, + "loss": 1.3482, + "step": 7212 + }, + { + "epoch": 0.78, + "grad_norm": 0.08578567070288019, + "learning_rate": 0.00012655391852372584, + "loss": 1.3363, + "step": 7213 + }, + { + "epoch": 0.78, + "grad_norm": 0.08786717737868875, + "learning_rate": 0.00012643816933672968, + "loss": 1.3873, + "step": 7214 + }, + { + "epoch": 0.78, + "grad_norm": 0.10304717691086623, + "learning_rate": 0.00012632246544539333, + "loss": 1.4498, + "step": 7215 + }, + { + "epoch": 0.78, + "grad_norm": 0.08021184395395602, + "learning_rate": 0.00012620680686374646, + "loss": 1.3443, + "step": 7216 + }, + { + "epoch": 0.78, + "grad_norm": 0.10865624373995214, + "learning_rate": 0.00012609119360581277, + "loss": 1.4498, + "step": 7217 + }, + { + "epoch": 0.78, + "grad_norm": 0.07778178613046131, + "learning_rate": 0.00012597562568561095, + "loss": 1.5317, + "step": 7218 + }, + { + "epoch": 0.78, + "grad_norm": 0.07738530338405038, + "learning_rate": 0.00012586010311715408, + "loss": 1.4558, + "step": 7219 + }, + { + "epoch": 0.78, + "grad_norm": 0.08931510175039695, + "learning_rate": 0.0001257446259144494, + "loss": 1.503, + "step": 7220 + }, + { + "epoch": 0.78, + "grad_norm": 0.09255034325625278, + "learning_rate": 0.00012562919409149915, + "loss": 1.3232, + "step": 7221 + }, + { + "epoch": 0.78, + "grad_norm": 0.079332226562438, + "learning_rate": 0.00012551380766230003, + "loss": 1.4021, + "step": 7222 + }, + { + "epoch": 0.78, + "grad_norm": 0.08365566359520343, + "learning_rate": 0.00012539846664084248, + "loss": 1.4243, + "step": 7223 + }, + { + "epoch": 0.78, + "grad_norm": 0.09729578487033033, + "learning_rate": 0.00012528317104111225, + "loss": 1.3776, + "step": 7224 + }, + { + "epoch": 0.78, + "grad_norm": 0.08689847549229128, + "learning_rate": 0.00012516792087708962, + "loss": 1.4569, + "step": 7225 + }, + { + "epoch": 0.78, + "grad_norm": 0.08986204287640165, + "learning_rate": 0.0001250527161627486, + "loss": 1.3315, + "step": 7226 + }, + { + "epoch": 0.78, + "grad_norm": 0.08902530175684555, + "learning_rate": 0.00012493755691205845, + "loss": 1.3933, + "step": 7227 + }, + { + "epoch": 0.78, + "grad_norm": 0.07783856035461831, + "learning_rate": 0.00012482244313898267, + "loss": 1.3352, + "step": 7228 + }, + { + "epoch": 0.78, + "grad_norm": 0.07579194127448878, + "learning_rate": 0.0001247073748574789, + "loss": 1.3036, + "step": 7229 + }, + { + "epoch": 0.78, + "grad_norm": 0.07552103458438016, + "learning_rate": 0.00012459235208149984, + "loss": 1.4446, + "step": 7230 + }, + { + "epoch": 0.78, + "grad_norm": 0.07717100398872971, + "learning_rate": 0.00012447737482499245, + "loss": 1.4675, + "step": 7231 + }, + { + "epoch": 0.78, + "grad_norm": 0.08179823386986428, + "learning_rate": 0.000124362443101898, + "loss": 1.2272, + "step": 7232 + }, + { + "epoch": 0.78, + "grad_norm": 0.08450782049483559, + "learning_rate": 0.00012424755692615213, + "loss": 1.4358, + "step": 7233 + }, + { + "epoch": 0.78, + "grad_norm": 0.08644532397598073, + "learning_rate": 0.00012413271631168545, + "loss": 1.4029, + "step": 7234 + }, + { + "epoch": 0.78, + "grad_norm": 0.07850539908313922, + "learning_rate": 0.00012401792127242285, + "loss": 1.2961, + "step": 7235 + }, + { + "epoch": 0.78, + "grad_norm": 0.0896043126589486, + "learning_rate": 0.00012390317182228334, + "loss": 1.405, + "step": 7236 + }, + { + "epoch": 0.78, + "grad_norm": 0.07587555720819515, + "learning_rate": 0.0001237884679751809, + "loss": 1.4865, + "step": 7237 + }, + { + "epoch": 0.78, + "grad_norm": 0.08624531471226798, + "learning_rate": 0.00012367380974502383, + "loss": 1.4715, + "step": 7238 + }, + { + "epoch": 0.78, + "grad_norm": 0.08904974913087878, + "learning_rate": 0.00012355919714571458, + "loss": 1.5391, + "step": 7239 + }, + { + "epoch": 0.78, + "grad_norm": 0.09677429940240297, + "learning_rate": 0.0001234446301911506, + "loss": 1.3389, + "step": 7240 + }, + { + "epoch": 0.78, + "grad_norm": 0.09073539193834754, + "learning_rate": 0.00012333010889522327, + "loss": 1.4164, + "step": 7241 + }, + { + "epoch": 0.78, + "grad_norm": 0.08293798819734584, + "learning_rate": 0.00012321563327181883, + "loss": 1.3885, + "step": 7242 + }, + { + "epoch": 0.78, + "grad_norm": 0.07925689642566201, + "learning_rate": 0.00012310120333481795, + "loss": 1.3638, + "step": 7243 + }, + { + "epoch": 0.78, + "grad_norm": 0.0848046515098787, + "learning_rate": 0.0001229868190980955, + "loss": 1.3759, + "step": 7244 + }, + { + "epoch": 0.78, + "grad_norm": 0.09296734522311166, + "learning_rate": 0.00012287248057552092, + "loss": 1.3025, + "step": 7245 + }, + { + "epoch": 0.78, + "grad_norm": 0.08184586378841201, + "learning_rate": 0.0001227581877809582, + "loss": 1.4481, + "step": 7246 + }, + { + "epoch": 0.78, + "grad_norm": 0.07882572428683679, + "learning_rate": 0.0001226439407282659, + "loss": 1.3656, + "step": 7247 + }, + { + "epoch": 0.78, + "grad_norm": 0.09208007004167677, + "learning_rate": 0.0001225297394312966, + "loss": 1.4169, + "step": 7248 + }, + { + "epoch": 0.78, + "grad_norm": 0.08227884585316046, + "learning_rate": 0.00012241558390389769, + "loss": 1.4895, + "step": 7249 + }, + { + "epoch": 0.78, + "grad_norm": 0.0897848075866568, + "learning_rate": 0.00012230147415991116, + "loss": 1.4633, + "step": 7250 + }, + { + "epoch": 0.78, + "grad_norm": 0.07979829089445617, + "learning_rate": 0.0001221874102131728, + "loss": 1.39, + "step": 7251 + }, + { + "epoch": 0.78, + "grad_norm": 0.08325287183551135, + "learning_rate": 0.00012207339207751354, + "loss": 1.3288, + "step": 7252 + }, + { + "epoch": 0.78, + "grad_norm": 0.08921350734918522, + "learning_rate": 0.00012195941976675867, + "loss": 1.3689, + "step": 7253 + }, + { + "epoch": 0.78, + "grad_norm": 0.09011140635333832, + "learning_rate": 0.00012184549329472717, + "loss": 1.4063, + "step": 7254 + }, + { + "epoch": 0.78, + "grad_norm": 0.08625806008476061, + "learning_rate": 0.0001217316126752333, + "loss": 1.445, + "step": 7255 + }, + { + "epoch": 0.78, + "grad_norm": 0.07562931560165986, + "learning_rate": 0.00012161777792208562, + "loss": 1.5065, + "step": 7256 + }, + { + "epoch": 0.78, + "grad_norm": 0.08207305626274926, + "learning_rate": 0.00012150398904908672, + "loss": 1.2579, + "step": 7257 + }, + { + "epoch": 0.78, + "grad_norm": 0.08553307505785233, + "learning_rate": 0.00012139024607003402, + "loss": 1.3923, + "step": 7258 + }, + { + "epoch": 0.78, + "grad_norm": 0.07733318986471323, + "learning_rate": 0.00012127654899871936, + "loss": 1.439, + "step": 7259 + }, + { + "epoch": 0.78, + "grad_norm": 0.09377725522902389, + "learning_rate": 0.0001211628978489287, + "loss": 1.348, + "step": 7260 + }, + { + "epoch": 0.78, + "grad_norm": 0.09011338811288713, + "learning_rate": 0.00012104929263444269, + "loss": 1.4032, + "step": 7261 + }, + { + "epoch": 0.78, + "grad_norm": 0.09418809745410185, + "learning_rate": 0.00012093573336903651, + "loss": 1.3855, + "step": 7262 + }, + { + "epoch": 0.78, + "grad_norm": 0.09075476347236389, + "learning_rate": 0.00012082222006647942, + "loss": 1.4853, + "step": 7263 + }, + { + "epoch": 0.78, + "grad_norm": 0.08586052880258192, + "learning_rate": 0.0001207087527405355, + "loss": 1.4392, + "step": 7264 + }, + { + "epoch": 0.78, + "grad_norm": 0.09258361143240627, + "learning_rate": 0.00012059533140496276, + "loss": 1.4554, + "step": 7265 + }, + { + "epoch": 0.78, + "grad_norm": 0.09197353044257664, + "learning_rate": 0.0001204819560735142, + "loss": 1.4223, + "step": 7266 + }, + { + "epoch": 0.78, + "grad_norm": 0.08122398178814307, + "learning_rate": 0.00012036862675993677, + "loss": 1.4581, + "step": 7267 + }, + { + "epoch": 0.78, + "grad_norm": 0.08569140445620367, + "learning_rate": 0.00012025534347797212, + "loss": 1.557, + "step": 7268 + }, + { + "epoch": 0.78, + "grad_norm": 0.08442336639975227, + "learning_rate": 0.00012014210624135641, + "loss": 1.3068, + "step": 7269 + }, + { + "epoch": 0.78, + "grad_norm": 0.09359124396229755, + "learning_rate": 0.00012002891506381974, + "loss": 1.4421, + "step": 7270 + }, + { + "epoch": 0.78, + "grad_norm": 0.09664875074993715, + "learning_rate": 0.0001199157699590872, + "loss": 1.51, + "step": 7271 + }, + { + "epoch": 0.78, + "grad_norm": 0.08256153160798978, + "learning_rate": 0.00011980267094087777, + "loss": 1.4581, + "step": 7272 + }, + { + "epoch": 0.78, + "grad_norm": 0.08356266546750146, + "learning_rate": 0.00011968961802290523, + "loss": 1.3983, + "step": 7273 + }, + { + "epoch": 0.78, + "grad_norm": 0.07715962500880093, + "learning_rate": 0.0001195766112188778, + "loss": 1.3396, + "step": 7274 + }, + { + "epoch": 0.78, + "grad_norm": 0.07775746783057781, + "learning_rate": 0.00011946365054249775, + "loss": 1.41, + "step": 7275 + }, + { + "epoch": 0.78, + "grad_norm": 0.08050408481614375, + "learning_rate": 0.00011935073600746182, + "loss": 1.4984, + "step": 7276 + }, + { + "epoch": 0.78, + "grad_norm": 0.08051706956456427, + "learning_rate": 0.00011923786762746148, + "loss": 1.3049, + "step": 7277 + }, + { + "epoch": 0.78, + "grad_norm": 0.09187755184619545, + "learning_rate": 0.00011912504541618252, + "loss": 1.3603, + "step": 7278 + }, + { + "epoch": 0.78, + "grad_norm": 0.0945150395456591, + "learning_rate": 0.00011901226938730469, + "loss": 1.5084, + "step": 7279 + }, + { + "epoch": 0.78, + "grad_norm": 0.07896225931399797, + "learning_rate": 0.00011889953955450272, + "loss": 1.5249, + "step": 7280 + }, + { + "epoch": 0.78, + "grad_norm": 0.07849848551878082, + "learning_rate": 0.00011878685593144556, + "loss": 1.4698, + "step": 7281 + }, + { + "epoch": 0.78, + "grad_norm": 0.08101879037358578, + "learning_rate": 0.00011867421853179622, + "loss": 1.3817, + "step": 7282 + }, + { + "epoch": 0.78, + "grad_norm": 0.10727296608575486, + "learning_rate": 0.0001185616273692125, + "loss": 1.2219, + "step": 7283 + }, + { + "epoch": 0.78, + "grad_norm": 0.08454663697420552, + "learning_rate": 0.00011844908245734659, + "loss": 1.396, + "step": 7284 + }, + { + "epoch": 0.78, + "grad_norm": 0.09072240043086682, + "learning_rate": 0.00011833658380984491, + "loss": 1.2478, + "step": 7285 + }, + { + "epoch": 0.78, + "grad_norm": 0.08550914434220919, + "learning_rate": 0.00011822413144034805, + "loss": 1.4036, + "step": 7286 + }, + { + "epoch": 0.78, + "grad_norm": 0.08168062993549902, + "learning_rate": 0.0001181117253624916, + "loss": 1.3707, + "step": 7287 + }, + { + "epoch": 0.78, + "grad_norm": 0.08925027465919966, + "learning_rate": 0.00011799936558990482, + "loss": 1.3877, + "step": 7288 + }, + { + "epoch": 0.78, + "grad_norm": 0.09254528526528016, + "learning_rate": 0.00011788705213621198, + "loss": 1.4509, + "step": 7289 + }, + { + "epoch": 0.78, + "grad_norm": 0.11415106012704615, + "learning_rate": 0.00011777478501503152, + "loss": 1.4539, + "step": 7290 + }, + { + "epoch": 0.78, + "grad_norm": 0.0838403721688183, + "learning_rate": 0.00011766256423997602, + "loss": 1.2205, + "step": 7291 + }, + { + "epoch": 0.78, + "grad_norm": 0.07991379938968937, + "learning_rate": 0.00011755038982465266, + "loss": 1.4549, + "step": 7292 + }, + { + "epoch": 0.78, + "grad_norm": 0.08065051687823484, + "learning_rate": 0.0001174382617826632, + "loss": 1.3445, + "step": 7293 + }, + { + "epoch": 0.78, + "grad_norm": 0.08678497683129696, + "learning_rate": 0.00011732618012760327, + "loss": 1.492, + "step": 7294 + }, + { + "epoch": 0.78, + "grad_norm": 0.08293945639486879, + "learning_rate": 0.00011721414487306326, + "loss": 1.3688, + "step": 7295 + }, + { + "epoch": 0.78, + "grad_norm": 0.08227444119004665, + "learning_rate": 0.00011710215603262797, + "loss": 1.3773, + "step": 7296 + }, + { + "epoch": 0.78, + "grad_norm": 0.08266583007232876, + "learning_rate": 0.00011699021361987633, + "loss": 1.3837, + "step": 7297 + }, + { + "epoch": 0.78, + "grad_norm": 0.09906936808598413, + "learning_rate": 0.00011687831764838158, + "loss": 1.429, + "step": 7298 + }, + { + "epoch": 0.78, + "grad_norm": 0.08195428558569683, + "learning_rate": 0.00011676646813171166, + "loss": 1.4555, + "step": 7299 + }, + { + "epoch": 0.78, + "grad_norm": 0.08268397919617312, + "learning_rate": 0.00011665466508342876, + "loss": 1.358, + "step": 7300 + }, + { + "epoch": 0.78, + "grad_norm": 0.08430853926059426, + "learning_rate": 0.0001165429085170892, + "loss": 1.3842, + "step": 7301 + }, + { + "epoch": 0.78, + "grad_norm": 0.09381193980846146, + "learning_rate": 0.000116431198446244, + "loss": 1.365, + "step": 7302 + }, + { + "epoch": 0.79, + "grad_norm": 0.08280165821756859, + "learning_rate": 0.00011631953488443847, + "loss": 1.3367, + "step": 7303 + }, + { + "epoch": 0.79, + "grad_norm": 0.09674682116615263, + "learning_rate": 0.00011620791784521195, + "loss": 1.5064, + "step": 7304 + }, + { + "epoch": 0.79, + "grad_norm": 0.0895578939531404, + "learning_rate": 0.00011609634734209867, + "loss": 1.3573, + "step": 7305 + }, + { + "epoch": 0.79, + "grad_norm": 0.09195433336912709, + "learning_rate": 0.00011598482338862676, + "loss": 1.316, + "step": 7306 + }, + { + "epoch": 0.79, + "grad_norm": 0.0850041748062091, + "learning_rate": 0.00011587334599831877, + "loss": 1.3207, + "step": 7307 + }, + { + "epoch": 0.79, + "grad_norm": 0.09066164990814109, + "learning_rate": 0.00011576191518469192, + "loss": 1.4068, + "step": 7308 + }, + { + "epoch": 0.79, + "grad_norm": 0.08737458422241964, + "learning_rate": 0.0001156505309612576, + "loss": 1.3028, + "step": 7309 + }, + { + "epoch": 0.79, + "grad_norm": 0.0818355985921247, + "learning_rate": 0.00011553919334152135, + "loss": 1.4178, + "step": 7310 + }, + { + "epoch": 0.79, + "grad_norm": 0.09412720167056654, + "learning_rate": 0.00011542790233898331, + "loss": 1.4519, + "step": 7311 + }, + { + "epoch": 0.79, + "grad_norm": 0.07815617669426021, + "learning_rate": 0.00011531665796713814, + "loss": 1.4109, + "step": 7312 + }, + { + "epoch": 0.79, + "grad_norm": 0.08461422716107501, + "learning_rate": 0.0001152054602394742, + "loss": 1.4821, + "step": 7313 + }, + { + "epoch": 0.79, + "grad_norm": 0.09249266123132792, + "learning_rate": 0.00011509430916947483, + "loss": 1.4866, + "step": 7314 + }, + { + "epoch": 0.79, + "grad_norm": 0.08558705248643976, + "learning_rate": 0.00011498320477061758, + "loss": 1.462, + "step": 7315 + }, + { + "epoch": 0.79, + "grad_norm": 0.08300356854080107, + "learning_rate": 0.00011487214705637395, + "loss": 1.2952, + "step": 7316 + }, + { + "epoch": 0.79, + "grad_norm": 0.08004935332127502, + "learning_rate": 0.00011476113604021038, + "loss": 1.3786, + "step": 7317 + }, + { + "epoch": 0.79, + "grad_norm": 0.07554824940526937, + "learning_rate": 0.00011465017173558717, + "loss": 1.4424, + "step": 7318 + }, + { + "epoch": 0.79, + "grad_norm": 0.08169386017284468, + "learning_rate": 0.00011453925415595901, + "loss": 1.4019, + "step": 7319 + }, + { + "epoch": 0.79, + "grad_norm": 0.08875579848412374, + "learning_rate": 0.00011442838331477513, + "loss": 1.4455, + "step": 7320 + }, + { + "epoch": 0.79, + "grad_norm": 0.08377514812003137, + "learning_rate": 0.00011431755922547921, + "loss": 1.3962, + "step": 7321 + }, + { + "epoch": 0.79, + "grad_norm": 0.08782221330386418, + "learning_rate": 0.00011420678190150879, + "loss": 1.4341, + "step": 7322 + }, + { + "epoch": 0.79, + "grad_norm": 0.07954298875210211, + "learning_rate": 0.00011409605135629603, + "loss": 1.5656, + "step": 7323 + }, + { + "epoch": 0.79, + "grad_norm": 0.0961529478173863, + "learning_rate": 0.00011398536760326761, + "loss": 1.3828, + "step": 7324 + }, + { + "epoch": 0.79, + "grad_norm": 0.0792818090967388, + "learning_rate": 0.00011387473065584403, + "loss": 1.3704, + "step": 7325 + }, + { + "epoch": 0.79, + "grad_norm": 0.09990077427687467, + "learning_rate": 0.00011376414052744055, + "loss": 1.4352, + "step": 7326 + }, + { + "epoch": 0.79, + "grad_norm": 0.09129573736611461, + "learning_rate": 0.00011365359723146673, + "loss": 1.3839, + "step": 7327 + }, + { + "epoch": 0.79, + "grad_norm": 0.08876545291340723, + "learning_rate": 0.00011354310078132618, + "loss": 1.2871, + "step": 7328 + }, + { + "epoch": 0.79, + "grad_norm": 0.0826376515378444, + "learning_rate": 0.00011343265119041685, + "loss": 1.4464, + "step": 7329 + }, + { + "epoch": 0.79, + "grad_norm": 0.08897338013882695, + "learning_rate": 0.00011332224847213124, + "loss": 1.4832, + "step": 7330 + }, + { + "epoch": 0.79, + "grad_norm": 0.08554350555655972, + "learning_rate": 0.00011321189263985621, + "loss": 1.3448, + "step": 7331 + }, + { + "epoch": 0.79, + "grad_norm": 0.08963501695769718, + "learning_rate": 0.00011310158370697254, + "loss": 1.4981, + "step": 7332 + }, + { + "epoch": 0.79, + "grad_norm": 0.08891139005332287, + "learning_rate": 0.00011299132168685566, + "loss": 1.5208, + "step": 7333 + }, + { + "epoch": 0.79, + "grad_norm": 0.07985887227024256, + "learning_rate": 0.00011288110659287543, + "loss": 1.2565, + "step": 7334 + }, + { + "epoch": 0.79, + "grad_norm": 0.08249018502823845, + "learning_rate": 0.00011277093843839548, + "loss": 1.3601, + "step": 7335 + }, + { + "epoch": 0.79, + "grad_norm": 0.0882205268492248, + "learning_rate": 0.00011266081723677434, + "loss": 1.343, + "step": 7336 + }, + { + "epoch": 0.79, + "grad_norm": 0.08568573371670497, + "learning_rate": 0.00011255074300136437, + "loss": 1.3917, + "step": 7337 + }, + { + "epoch": 0.79, + "grad_norm": 0.07816830269456879, + "learning_rate": 0.00011244071574551268, + "loss": 1.3174, + "step": 7338 + }, + { + "epoch": 0.79, + "grad_norm": 0.08957362122918865, + "learning_rate": 0.00011233073548256018, + "loss": 1.354, + "step": 7339 + }, + { + "epoch": 0.79, + "grad_norm": 0.08878251511658643, + "learning_rate": 0.00011222080222584263, + "loss": 1.3677, + "step": 7340 + }, + { + "epoch": 0.79, + "grad_norm": 0.08375678858047989, + "learning_rate": 0.00011211091598868956, + "loss": 1.4903, + "step": 7341 + }, + { + "epoch": 0.79, + "grad_norm": 0.0887016822378597, + "learning_rate": 0.00011200107678442517, + "loss": 1.2804, + "step": 7342 + }, + { + "epoch": 0.79, + "grad_norm": 0.0869215161095677, + "learning_rate": 0.000111891284626368, + "loss": 1.3929, + "step": 7343 + }, + { + "epoch": 0.79, + "grad_norm": 0.08806012815888258, + "learning_rate": 0.00011178153952783043, + "loss": 1.3918, + "step": 7344 + }, + { + "epoch": 0.79, + "grad_norm": 0.08770599045346396, + "learning_rate": 0.00011167184150211962, + "loss": 1.3323, + "step": 7345 + }, + { + "epoch": 0.79, + "grad_norm": 0.08611854804897985, + "learning_rate": 0.00011156219056253691, + "loss": 1.3703, + "step": 7346 + }, + { + "epoch": 0.79, + "grad_norm": 0.08968488237086665, + "learning_rate": 0.00011145258672237762, + "loss": 1.4883, + "step": 7347 + }, + { + "epoch": 0.79, + "grad_norm": 0.09790986626447239, + "learning_rate": 0.00011134302999493173, + "loss": 1.3519, + "step": 7348 + }, + { + "epoch": 0.79, + "grad_norm": 0.08092278640901887, + "learning_rate": 0.0001112335203934836, + "loss": 1.278, + "step": 7349 + }, + { + "epoch": 0.79, + "grad_norm": 0.07777105411353137, + "learning_rate": 0.00011112405793131114, + "loss": 1.3237, + "step": 7350 + }, + { + "epoch": 0.79, + "grad_norm": 0.09088764719653418, + "learning_rate": 0.00011101464262168731, + "loss": 1.4631, + "step": 7351 + }, + { + "epoch": 0.79, + "grad_norm": 0.08000448230902753, + "learning_rate": 0.00011090527447787924, + "loss": 1.3473, + "step": 7352 + }, + { + "epoch": 0.79, + "grad_norm": 0.08583186983779009, + "learning_rate": 0.00011079595351314791, + "loss": 1.4034, + "step": 7353 + }, + { + "epoch": 0.79, + "grad_norm": 0.0960634586034175, + "learning_rate": 0.00011068667974074903, + "loss": 1.4432, + "step": 7354 + }, + { + "epoch": 0.79, + "grad_norm": 0.08076190348815085, + "learning_rate": 0.00011057745317393252, + "loss": 1.334, + "step": 7355 + }, + { + "epoch": 0.79, + "grad_norm": 0.09802013313693342, + "learning_rate": 0.00011046827382594227, + "loss": 1.3139, + "step": 7356 + }, + { + "epoch": 0.79, + "grad_norm": 0.08870505251066106, + "learning_rate": 0.00011035914171001665, + "loss": 1.369, + "step": 7357 + }, + { + "epoch": 0.79, + "grad_norm": 0.10760046353869862, + "learning_rate": 0.0001102500568393886, + "loss": 1.4748, + "step": 7358 + }, + { + "epoch": 0.79, + "grad_norm": 0.08482909523650954, + "learning_rate": 0.0001101410192272848, + "loss": 1.309, + "step": 7359 + }, + { + "epoch": 0.79, + "grad_norm": 0.08967836852723367, + "learning_rate": 0.00011003202888692632, + "loss": 1.3521, + "step": 7360 + }, + { + "epoch": 0.79, + "grad_norm": 0.08971932596246342, + "learning_rate": 0.00010992308583152877, + "loss": 1.3055, + "step": 7361 + }, + { + "epoch": 0.79, + "grad_norm": 0.08965578968467429, + "learning_rate": 0.00010981419007430199, + "loss": 1.496, + "step": 7362 + }, + { + "epoch": 0.79, + "grad_norm": 0.08924314469159167, + "learning_rate": 0.00010970534162844975, + "loss": 1.495, + "step": 7363 + }, + { + "epoch": 0.79, + "grad_norm": 0.08709222636542864, + "learning_rate": 0.00010959654050717032, + "loss": 1.2642, + "step": 7364 + }, + { + "epoch": 0.79, + "grad_norm": 0.08930747249209584, + "learning_rate": 0.00010948778672365644, + "loss": 1.3918, + "step": 7365 + }, + { + "epoch": 0.79, + "grad_norm": 0.08647523289589773, + "learning_rate": 0.00010937908029109461, + "loss": 1.4539, + "step": 7366 + }, + { + "epoch": 0.79, + "grad_norm": 0.08336240679126437, + "learning_rate": 0.00010927042122266611, + "loss": 1.3636, + "step": 7367 + }, + { + "epoch": 0.79, + "grad_norm": 0.07905964491305716, + "learning_rate": 0.00010916180953154592, + "loss": 1.3862, + "step": 7368 + }, + { + "epoch": 0.79, + "grad_norm": 0.09007504611985878, + "learning_rate": 0.00010905324523090377, + "loss": 1.4469, + "step": 7369 + }, + { + "epoch": 0.79, + "grad_norm": 0.08221245517814316, + "learning_rate": 0.00010894472833390357, + "loss": 1.4572, + "step": 7370 + }, + { + "epoch": 0.79, + "grad_norm": 0.08384087412285039, + "learning_rate": 0.00010883625885370319, + "loss": 1.5368, + "step": 7371 + }, + { + "epoch": 0.79, + "grad_norm": 0.09317832358790074, + "learning_rate": 0.00010872783680345489, + "loss": 1.4228, + "step": 7372 + }, + { + "epoch": 0.79, + "grad_norm": 0.10753007879502102, + "learning_rate": 0.00010861946219630525, + "loss": 1.3968, + "step": 7373 + }, + { + "epoch": 0.79, + "grad_norm": 0.09197775161563887, + "learning_rate": 0.00010851113504539528, + "loss": 1.3543, + "step": 7374 + }, + { + "epoch": 0.79, + "grad_norm": 0.08071650914079001, + "learning_rate": 0.00010840285536385968, + "loss": 1.421, + "step": 7375 + }, + { + "epoch": 0.79, + "grad_norm": 0.09167372528944248, + "learning_rate": 0.00010829462316482797, + "loss": 1.5266, + "step": 7376 + }, + { + "epoch": 0.79, + "grad_norm": 0.09098539935181792, + "learning_rate": 0.00010818643846142373, + "loss": 1.3709, + "step": 7377 + }, + { + "epoch": 0.79, + "grad_norm": 0.08429483207935262, + "learning_rate": 0.00010807830126676443, + "loss": 1.3496, + "step": 7378 + }, + { + "epoch": 0.79, + "grad_norm": 0.08474915241120738, + "learning_rate": 0.0001079702115939623, + "loss": 1.3521, + "step": 7379 + }, + { + "epoch": 0.79, + "grad_norm": 0.08776848704159057, + "learning_rate": 0.0001078621694561237, + "loss": 1.3723, + "step": 7380 + }, + { + "epoch": 0.79, + "grad_norm": 0.08454882765125812, + "learning_rate": 0.00010775417486634893, + "loss": 1.4085, + "step": 7381 + }, + { + "epoch": 0.79, + "grad_norm": 0.08768259170723725, + "learning_rate": 0.00010764622783773259, + "loss": 1.5293, + "step": 7382 + }, + { + "epoch": 0.79, + "grad_norm": 0.08057887240979167, + "learning_rate": 0.00010753832838336397, + "loss": 1.4617, + "step": 7383 + }, + { + "epoch": 0.79, + "grad_norm": 0.08991989764932008, + "learning_rate": 0.00010743047651632588, + "loss": 1.359, + "step": 7384 + }, + { + "epoch": 0.79, + "grad_norm": 0.08712173949360097, + "learning_rate": 0.00010732267224969588, + "loss": 1.3348, + "step": 7385 + }, + { + "epoch": 0.79, + "grad_norm": 0.09148073926024225, + "learning_rate": 0.00010721491559654579, + "loss": 1.3145, + "step": 7386 + }, + { + "epoch": 0.79, + "grad_norm": 0.09115660354014754, + "learning_rate": 0.00010710720656994117, + "loss": 1.3405, + "step": 7387 + }, + { + "epoch": 0.79, + "grad_norm": 0.09285950683691585, + "learning_rate": 0.00010699954518294224, + "loss": 1.351, + "step": 7388 + }, + { + "epoch": 0.79, + "grad_norm": 0.08496458358175875, + "learning_rate": 0.00010689193144860354, + "loss": 1.3887, + "step": 7389 + }, + { + "epoch": 0.79, + "grad_norm": 0.08814196543992106, + "learning_rate": 0.00010678436537997321, + "loss": 1.3929, + "step": 7390 + }, + { + "epoch": 0.79, + "grad_norm": 0.08392941262208066, + "learning_rate": 0.00010667684699009439, + "loss": 1.4186, + "step": 7391 + }, + { + "epoch": 0.79, + "grad_norm": 0.0825169692685744, + "learning_rate": 0.00010656937629200369, + "loss": 1.5325, + "step": 7392 + }, + { + "epoch": 0.79, + "grad_norm": 0.0969018358700212, + "learning_rate": 0.00010646195329873259, + "loss": 1.5803, + "step": 7393 + }, + { + "epoch": 0.79, + "grad_norm": 0.08785792908215653, + "learning_rate": 0.00010635457802330628, + "loss": 1.3293, + "step": 7394 + }, + { + "epoch": 0.79, + "grad_norm": 0.08970546446133174, + "learning_rate": 0.0001062472504787445, + "loss": 1.4592, + "step": 7395 + }, + { + "epoch": 0.8, + "grad_norm": 0.09689624451528016, + "learning_rate": 0.0001061399706780612, + "loss": 1.3814, + "step": 7396 + }, + { + "epoch": 0.8, + "grad_norm": 0.08661583703872869, + "learning_rate": 0.00010603273863426411, + "loss": 1.4196, + "step": 7397 + }, + { + "epoch": 0.8, + "grad_norm": 0.08672305211358776, + "learning_rate": 0.00010592555436035573, + "loss": 1.4574, + "step": 7398 + }, + { + "epoch": 0.8, + "grad_norm": 0.07671987734255735, + "learning_rate": 0.00010581841786933261, + "loss": 1.471, + "step": 7399 + }, + { + "epoch": 0.8, + "grad_norm": 0.07563795205528116, + "learning_rate": 0.00010571132917418508, + "loss": 1.4617, + "step": 7400 + }, + { + "epoch": 0.8, + "grad_norm": 0.09722906137813156, + "learning_rate": 0.00010560428828789837, + "loss": 1.2705, + "step": 7401 + }, + { + "epoch": 0.8, + "grad_norm": 0.09142567018690823, + "learning_rate": 0.00010549729522345142, + "loss": 1.4441, + "step": 7402 + }, + { + "epoch": 0.8, + "grad_norm": 0.09043229441252876, + "learning_rate": 0.0001053903499938173, + "loss": 1.4887, + "step": 7403 + }, + { + "epoch": 0.8, + "grad_norm": 0.08808126569357126, + "learning_rate": 0.0001052834526119637, + "loss": 1.5165, + "step": 7404 + }, + { + "epoch": 0.8, + "grad_norm": 0.10853347121607891, + "learning_rate": 0.00010517660309085242, + "loss": 1.3782, + "step": 7405 + }, + { + "epoch": 0.8, + "grad_norm": 0.08195414140573754, + "learning_rate": 0.00010506980144343898, + "loss": 1.3016, + "step": 7406 + }, + { + "epoch": 0.8, + "grad_norm": 0.09994610269025854, + "learning_rate": 0.00010496304768267373, + "loss": 1.3852, + "step": 7407 + }, + { + "epoch": 0.8, + "grad_norm": 0.09477202194767692, + "learning_rate": 0.00010485634182150089, + "loss": 1.3728, + "step": 7408 + }, + { + "epoch": 0.8, + "grad_norm": 0.08841337933183288, + "learning_rate": 0.00010474968387285882, + "loss": 1.3434, + "step": 7409 + }, + { + "epoch": 0.8, + "grad_norm": 0.0959761214605188, + "learning_rate": 0.00010464307384968019, + "loss": 1.5121, + "step": 7410 + }, + { + "epoch": 0.8, + "grad_norm": 0.08672942952818297, + "learning_rate": 0.00010453651176489198, + "loss": 1.4939, + "step": 7411 + }, + { + "epoch": 0.8, + "grad_norm": 0.08689856792950482, + "learning_rate": 0.00010442999763141509, + "loss": 1.4176, + "step": 7412 + }, + { + "epoch": 0.8, + "grad_norm": 0.08847913538326554, + "learning_rate": 0.00010432353146216456, + "loss": 1.3859, + "step": 7413 + }, + { + "epoch": 0.8, + "grad_norm": 0.08940969838357604, + "learning_rate": 0.00010421711327005013, + "loss": 1.3339, + "step": 7414 + }, + { + "epoch": 0.8, + "grad_norm": 0.08584386698958232, + "learning_rate": 0.00010411074306797502, + "loss": 1.2584, + "step": 7415 + }, + { + "epoch": 0.8, + "grad_norm": 0.09041104984604122, + "learning_rate": 0.00010400442086883715, + "loss": 1.5023, + "step": 7416 + }, + { + "epoch": 0.8, + "grad_norm": 0.08328881943694692, + "learning_rate": 0.0001038981466855286, + "loss": 1.2562, + "step": 7417 + }, + { + "epoch": 0.8, + "grad_norm": 0.07626411694127093, + "learning_rate": 0.00010379192053093522, + "loss": 1.3377, + "step": 7418 + }, + { + "epoch": 0.8, + "grad_norm": 0.08720457355500504, + "learning_rate": 0.00010368574241793738, + "loss": 1.364, + "step": 7419 + }, + { + "epoch": 0.8, + "grad_norm": 0.07654734453604108, + "learning_rate": 0.00010357961235940977, + "loss": 1.3639, + "step": 7420 + }, + { + "epoch": 0.8, + "grad_norm": 0.08657449519324938, + "learning_rate": 0.0001034735303682206, + "loss": 1.3308, + "step": 7421 + }, + { + "epoch": 0.8, + "grad_norm": 0.08250237320381912, + "learning_rate": 0.00010336749645723298, + "loss": 1.4285, + "step": 7422 + }, + { + "epoch": 0.8, + "grad_norm": 0.08605554787724745, + "learning_rate": 0.00010326151063930395, + "loss": 1.3979, + "step": 7423 + }, + { + "epoch": 0.8, + "grad_norm": 0.09436166424350222, + "learning_rate": 0.0001031555729272845, + "loss": 1.4827, + "step": 7424 + }, + { + "epoch": 0.8, + "grad_norm": 0.0873621789720717, + "learning_rate": 0.00010304968333401982, + "loss": 1.4897, + "step": 7425 + }, + { + "epoch": 0.8, + "grad_norm": 0.08852806729006432, + "learning_rate": 0.00010294384187234951, + "loss": 1.4744, + "step": 7426 + }, + { + "epoch": 0.8, + "grad_norm": 0.09802979552309042, + "learning_rate": 0.00010283804855510742, + "loss": 1.3337, + "step": 7427 + }, + { + "epoch": 0.8, + "grad_norm": 0.08249683463009905, + "learning_rate": 0.00010273230339512102, + "loss": 1.3253, + "step": 7428 + }, + { + "epoch": 0.8, + "grad_norm": 0.08737069761111654, + "learning_rate": 0.00010262660640521242, + "loss": 1.4135, + "step": 7429 + }, + { + "epoch": 0.8, + "grad_norm": 0.07672002832446094, + "learning_rate": 0.00010252095759819785, + "loss": 1.3387, + "step": 7430 + }, + { + "epoch": 0.8, + "grad_norm": 0.08146901689082577, + "learning_rate": 0.00010241535698688743, + "loss": 1.4587, + "step": 7431 + }, + { + "epoch": 0.8, + "grad_norm": 0.08357617514349239, + "learning_rate": 0.00010230980458408573, + "loss": 1.3892, + "step": 7432 + }, + { + "epoch": 0.8, + "grad_norm": 0.09688593678688902, + "learning_rate": 0.00010220430040259116, + "loss": 1.4337, + "step": 7433 + }, + { + "epoch": 0.8, + "grad_norm": 0.0949961436992429, + "learning_rate": 0.00010209884445519668, + "loss": 1.479, + "step": 7434 + }, + { + "epoch": 0.8, + "grad_norm": 0.09217958997274633, + "learning_rate": 0.00010199343675468897, + "loss": 1.4274, + "step": 7435 + }, + { + "epoch": 0.8, + "grad_norm": 0.0927875344960079, + "learning_rate": 0.0001018880773138493, + "loss": 1.4548, + "step": 7436 + }, + { + "epoch": 0.8, + "grad_norm": 0.10240453114593467, + "learning_rate": 0.00010178276614545267, + "loss": 1.3366, + "step": 7437 + }, + { + "epoch": 0.8, + "grad_norm": 0.07930711688295726, + "learning_rate": 0.00010167750326226848, + "loss": 1.3864, + "step": 7438 + }, + { + "epoch": 0.8, + "grad_norm": 0.08988242509021517, + "learning_rate": 0.00010157228867706041, + "loss": 1.3858, + "step": 7439 + }, + { + "epoch": 0.8, + "grad_norm": 0.08546466142893103, + "learning_rate": 0.00010146712240258577, + "loss": 1.337, + "step": 7440 + }, + { + "epoch": 0.8, + "grad_norm": 0.07476124167921687, + "learning_rate": 0.00010136200445159649, + "loss": 1.3619, + "step": 7441 + }, + { + "epoch": 0.8, + "grad_norm": 0.08503840192574158, + "learning_rate": 0.00010125693483683862, + "loss": 1.5063, + "step": 7442 + }, + { + "epoch": 0.8, + "grad_norm": 0.08121488722572119, + "learning_rate": 0.00010115191357105192, + "loss": 1.3646, + "step": 7443 + }, + { + "epoch": 0.8, + "grad_norm": 0.08638910819398816, + "learning_rate": 0.00010104694066697089, + "loss": 1.4189, + "step": 7444 + }, + { + "epoch": 0.8, + "grad_norm": 0.07556675074773149, + "learning_rate": 0.00010094201613732373, + "loss": 1.4163, + "step": 7445 + }, + { + "epoch": 0.8, + "grad_norm": 0.09294527602966572, + "learning_rate": 0.00010083713999483269, + "loss": 1.3499, + "step": 7446 + }, + { + "epoch": 0.8, + "grad_norm": 0.09451147367174063, + "learning_rate": 0.00010073231225221457, + "loss": 1.3585, + "step": 7447 + }, + { + "epoch": 0.8, + "grad_norm": 0.09048973763769977, + "learning_rate": 0.00010062753292218025, + "loss": 1.4001, + "step": 7448 + }, + { + "epoch": 0.8, + "grad_norm": 0.09350501774740308, + "learning_rate": 0.00010052280201743425, + "loss": 1.376, + "step": 7449 + }, + { + "epoch": 0.8, + "grad_norm": 0.08746862295951763, + "learning_rate": 0.0001004181195506757, + "loss": 1.4711, + "step": 7450 + }, + { + "epoch": 0.8, + "grad_norm": 0.08986402754420884, + "learning_rate": 0.00010031348553459785, + "loss": 1.5504, + "step": 7451 + }, + { + "epoch": 0.8, + "grad_norm": 0.08757788063869364, + "learning_rate": 0.00010020889998188771, + "loss": 1.3495, + "step": 7452 + }, + { + "epoch": 0.8, + "grad_norm": 0.0786814505136955, + "learning_rate": 0.00010010436290522673, + "loss": 1.3102, + "step": 7453 + }, + { + "epoch": 0.8, + "grad_norm": 0.08204954091010624, + "learning_rate": 9.999987431729051e-05, + "loss": 1.6017, + "step": 7454 + }, + { + "epoch": 0.8, + "grad_norm": 0.08699570930256927, + "learning_rate": 9.989543423074854e-05, + "loss": 1.4445, + "step": 7455 + }, + { + "epoch": 0.8, + "grad_norm": 0.08730952432960949, + "learning_rate": 9.979104265826438e-05, + "loss": 1.409, + "step": 7456 + }, + { + "epoch": 0.8, + "grad_norm": 0.08541075113090651, + "learning_rate": 9.96866996124961e-05, + "loss": 1.3488, + "step": 7457 + }, + { + "epoch": 0.8, + "grad_norm": 0.09227061283218439, + "learning_rate": 9.958240510609568e-05, + "loss": 1.3261, + "step": 7458 + }, + { + "epoch": 0.8, + "grad_norm": 0.07891602228455197, + "learning_rate": 9.947815915170894e-05, + "loss": 1.4237, + "step": 7459 + }, + { + "epoch": 0.8, + "grad_norm": 0.10392044416795326, + "learning_rate": 9.937396176197621e-05, + "loss": 1.373, + "step": 7460 + }, + { + "epoch": 0.8, + "grad_norm": 0.08909632543444393, + "learning_rate": 9.926981294953191e-05, + "loss": 1.4048, + "step": 7461 + }, + { + "epoch": 0.8, + "grad_norm": 0.08711148193189107, + "learning_rate": 9.91657127270042e-05, + "loss": 1.301, + "step": 7462 + }, + { + "epoch": 0.8, + "grad_norm": 0.09677062189294994, + "learning_rate": 9.906166110701587e-05, + "loss": 1.4211, + "step": 7463 + }, + { + "epoch": 0.8, + "grad_norm": 0.08012618245248758, + "learning_rate": 9.895765810218322e-05, + "loss": 1.3605, + "step": 7464 + }, + { + "epoch": 0.8, + "grad_norm": 0.07881596378929731, + "learning_rate": 9.885370372511727e-05, + "loss": 1.2357, + "step": 7465 + }, + { + "epoch": 0.8, + "grad_norm": 0.09099731322497107, + "learning_rate": 9.874979798842255e-05, + "loss": 1.3286, + "step": 7466 + }, + { + "epoch": 0.8, + "grad_norm": 0.08318317206545245, + "learning_rate": 9.86459409046983e-05, + "loss": 1.3099, + "step": 7467 + }, + { + "epoch": 0.8, + "grad_norm": 0.08834905278247804, + "learning_rate": 9.854213248653721e-05, + "loss": 1.3114, + "step": 7468 + }, + { + "epoch": 0.8, + "grad_norm": 0.08740069078768421, + "learning_rate": 9.843837274652667e-05, + "loss": 1.5023, + "step": 7469 + }, + { + "epoch": 0.8, + "grad_norm": 0.08485296530187857, + "learning_rate": 9.833466169724792e-05, + "loss": 1.3516, + "step": 7470 + }, + { + "epoch": 0.8, + "grad_norm": 0.09309838507104544, + "learning_rate": 9.823099935127605e-05, + "loss": 1.4752, + "step": 7471 + }, + { + "epoch": 0.8, + "grad_norm": 0.08699771252470198, + "learning_rate": 9.812738572118063e-05, + "loss": 1.467, + "step": 7472 + }, + { + "epoch": 0.8, + "grad_norm": 0.08672188155001476, + "learning_rate": 9.802382081952527e-05, + "loss": 1.3471, + "step": 7473 + }, + { + "epoch": 0.8, + "grad_norm": 0.0881050276317137, + "learning_rate": 9.792030465886736e-05, + "loss": 1.3859, + "step": 7474 + }, + { + "epoch": 0.8, + "grad_norm": 0.0867622769936228, + "learning_rate": 9.781683725175866e-05, + "loss": 1.542, + "step": 7475 + }, + { + "epoch": 0.8, + "grad_norm": 0.08473741654209976, + "learning_rate": 9.771341861074523e-05, + "loss": 1.3428, + "step": 7476 + }, + { + "epoch": 0.8, + "grad_norm": 0.1022488836419732, + "learning_rate": 9.761004874836644e-05, + "loss": 1.3613, + "step": 7477 + }, + { + "epoch": 0.8, + "grad_norm": 0.09639605063432713, + "learning_rate": 9.750672767715651e-05, + "loss": 1.4877, + "step": 7478 + }, + { + "epoch": 0.8, + "grad_norm": 0.09102505887751455, + "learning_rate": 9.740345540964357e-05, + "loss": 1.3897, + "step": 7479 + }, + { + "epoch": 0.8, + "grad_norm": 0.09344953204952636, + "learning_rate": 9.730023195834947e-05, + "loss": 1.5112, + "step": 7480 + }, + { + "epoch": 0.8, + "grad_norm": 0.09225774918138216, + "learning_rate": 9.71970573357906e-05, + "loss": 1.2466, + "step": 7481 + }, + { + "epoch": 0.8, + "grad_norm": 0.08174545024210766, + "learning_rate": 9.709393155447732e-05, + "loss": 1.4278, + "step": 7482 + }, + { + "epoch": 0.8, + "grad_norm": 0.08691964908390155, + "learning_rate": 9.699085462691376e-05, + "loss": 1.3324, + "step": 7483 + }, + { + "epoch": 0.8, + "grad_norm": 0.08499423352944359, + "learning_rate": 9.688782656559842e-05, + "loss": 1.3487, + "step": 7484 + }, + { + "epoch": 0.8, + "grad_norm": 0.1086986268584967, + "learning_rate": 9.6784847383024e-05, + "loss": 1.3336, + "step": 7485 + }, + { + "epoch": 0.8, + "grad_norm": 0.10390079226716396, + "learning_rate": 9.66819170916769e-05, + "loss": 1.4299, + "step": 7486 + }, + { + "epoch": 0.8, + "grad_norm": 0.08179398408151457, + "learning_rate": 9.657903570403769e-05, + "loss": 1.3214, + "step": 7487 + }, + { + "epoch": 0.8, + "grad_norm": 0.09271074862383373, + "learning_rate": 9.647620323258121e-05, + "loss": 1.3402, + "step": 7488 + }, + { + "epoch": 0.81, + "grad_norm": 0.09774757883056379, + "learning_rate": 9.637341968977636e-05, + "loss": 1.4131, + "step": 7489 + }, + { + "epoch": 0.81, + "grad_norm": 0.07832701611547217, + "learning_rate": 9.627068508808579e-05, + "loss": 1.3784, + "step": 7490 + }, + { + "epoch": 0.81, + "grad_norm": 0.1041386239978518, + "learning_rate": 9.616799943996651e-05, + "loss": 1.295, + "step": 7491 + }, + { + "epoch": 0.81, + "grad_norm": 0.08705483512520137, + "learning_rate": 9.606536275786965e-05, + "loss": 1.3471, + "step": 7492 + }, + { + "epoch": 0.81, + "grad_norm": 0.08664530820976736, + "learning_rate": 9.596277505423994e-05, + "loss": 1.4664, + "step": 7493 + }, + { + "epoch": 0.81, + "grad_norm": 0.10276841331323949, + "learning_rate": 9.586023634151674e-05, + "loss": 1.4516, + "step": 7494 + }, + { + "epoch": 0.81, + "grad_norm": 0.09129386033307563, + "learning_rate": 9.575774663213327e-05, + "loss": 1.4629, + "step": 7495 + }, + { + "epoch": 0.81, + "grad_norm": 0.09433687443593726, + "learning_rate": 9.565530593851656e-05, + "loss": 1.478, + "step": 7496 + }, + { + "epoch": 0.81, + "grad_norm": 0.08781641900491051, + "learning_rate": 9.55529142730881e-05, + "loss": 1.2935, + "step": 7497 + }, + { + "epoch": 0.81, + "grad_norm": 0.10812828620908439, + "learning_rate": 9.545057164826315e-05, + "loss": 1.4357, + "step": 7498 + }, + { + "epoch": 0.81, + "grad_norm": 0.10200044996981032, + "learning_rate": 9.534827807645091e-05, + "loss": 1.229, + "step": 7499 + }, + { + "epoch": 0.81, + "grad_norm": 0.09240236105764514, + "learning_rate": 9.524603357005501e-05, + "loss": 1.4634, + "step": 7500 + }, + { + "epoch": 0.81, + "grad_norm": 0.09454120611142644, + "learning_rate": 9.514383814147309e-05, + "loss": 1.4214, + "step": 7501 + }, + { + "epoch": 0.81, + "grad_norm": 0.09548820233824558, + "learning_rate": 9.50416918030964e-05, + "loss": 1.3984, + "step": 7502 + }, + { + "epoch": 0.81, + "grad_norm": 0.07630747179285953, + "learning_rate": 9.493959456731072e-05, + "loss": 1.4153, + "step": 7503 + }, + { + "epoch": 0.81, + "grad_norm": 0.0819944969222753, + "learning_rate": 9.483754644649573e-05, + "loss": 1.2477, + "step": 7504 + }, + { + "epoch": 0.81, + "grad_norm": 0.08982939320379192, + "learning_rate": 9.473554745302492e-05, + "loss": 1.4237, + "step": 7505 + }, + { + "epoch": 0.81, + "grad_norm": 0.0941844203191723, + "learning_rate": 9.463359759926615e-05, + "loss": 1.405, + "step": 7506 + }, + { + "epoch": 0.81, + "grad_norm": 0.09325934826499595, + "learning_rate": 9.453169689758134e-05, + "loss": 1.5425, + "step": 7507 + }, + { + "epoch": 0.81, + "grad_norm": 0.09148103195006312, + "learning_rate": 9.442984536032612e-05, + "loss": 1.4171, + "step": 7508 + }, + { + "epoch": 0.81, + "grad_norm": 0.09186090103298966, + "learning_rate": 9.43280429998502e-05, + "loss": 1.3543, + "step": 7509 + }, + { + "epoch": 0.81, + "grad_norm": 0.08035881723942448, + "learning_rate": 9.422628982849785e-05, + "loss": 1.4375, + "step": 7510 + }, + { + "epoch": 0.81, + "grad_norm": 0.09344417265322892, + "learning_rate": 9.412458585860656e-05, + "loss": 1.4238, + "step": 7511 + }, + { + "epoch": 0.81, + "grad_norm": 0.0960705210553254, + "learning_rate": 9.402293110250853e-05, + "loss": 1.5747, + "step": 7512 + }, + { + "epoch": 0.81, + "grad_norm": 0.1033151773139312, + "learning_rate": 9.392132557252986e-05, + "loss": 1.3517, + "step": 7513 + }, + { + "epoch": 0.81, + "grad_norm": 0.09385641709122178, + "learning_rate": 9.381976928099029e-05, + "loss": 1.3943, + "step": 7514 + }, + { + "epoch": 0.81, + "grad_norm": 0.0779487282752614, + "learning_rate": 9.371826224020397e-05, + "loss": 1.4504, + "step": 7515 + }, + { + "epoch": 0.81, + "grad_norm": 0.081718534217225, + "learning_rate": 9.361680446247922e-05, + "loss": 1.3773, + "step": 7516 + }, + { + "epoch": 0.81, + "grad_norm": 0.07810061118795375, + "learning_rate": 9.351539596011776e-05, + "loss": 1.336, + "step": 7517 + }, + { + "epoch": 0.81, + "grad_norm": 0.08383963624184462, + "learning_rate": 9.341403674541605e-05, + "loss": 1.4401, + "step": 7518 + }, + { + "epoch": 0.81, + "grad_norm": 0.08943783242052575, + "learning_rate": 9.331272683066399e-05, + "loss": 1.401, + "step": 7519 + }, + { + "epoch": 0.81, + "grad_norm": 0.08459384581972972, + "learning_rate": 9.321146622814597e-05, + "loss": 1.6098, + "step": 7520 + }, + { + "epoch": 0.81, + "grad_norm": 0.08989778220774186, + "learning_rate": 9.311025495013997e-05, + "loss": 1.4655, + "step": 7521 + }, + { + "epoch": 0.81, + "grad_norm": 0.08350938557351134, + "learning_rate": 9.30090930089183e-05, + "loss": 1.4446, + "step": 7522 + }, + { + "epoch": 0.81, + "grad_norm": 0.09453361809735741, + "learning_rate": 9.290798041674736e-05, + "loss": 1.3595, + "step": 7523 + }, + { + "epoch": 0.81, + "grad_norm": 0.08402181500544711, + "learning_rate": 9.280691718588713e-05, + "loss": 1.4212, + "step": 7524 + }, + { + "epoch": 0.81, + "grad_norm": 0.08162461265250069, + "learning_rate": 9.270590332859202e-05, + "loss": 1.4145, + "step": 7525 + }, + { + "epoch": 0.81, + "grad_norm": 0.07636153927084101, + "learning_rate": 9.260493885711035e-05, + "loss": 1.3644, + "step": 7526 + }, + { + "epoch": 0.81, + "grad_norm": 0.08638875647764933, + "learning_rate": 9.250402378368427e-05, + "loss": 1.5009, + "step": 7527 + }, + { + "epoch": 0.81, + "grad_norm": 0.0784996066786888, + "learning_rate": 9.240315812055028e-05, + "loss": 1.3155, + "step": 7528 + }, + { + "epoch": 0.81, + "grad_norm": 0.09020209891163362, + "learning_rate": 9.230234187993858e-05, + "loss": 1.3745, + "step": 7529 + }, + { + "epoch": 0.81, + "grad_norm": 0.07728842140450685, + "learning_rate": 9.220157507407334e-05, + "loss": 1.473, + "step": 7530 + }, + { + "epoch": 0.81, + "grad_norm": 0.09999693012740897, + "learning_rate": 9.210085771517296e-05, + "loss": 1.3881, + "step": 7531 + }, + { + "epoch": 0.81, + "grad_norm": 0.07475659418648158, + "learning_rate": 9.200018981544999e-05, + "loss": 1.4635, + "step": 7532 + }, + { + "epoch": 0.81, + "grad_norm": 0.08610114317683623, + "learning_rate": 9.189957138711053e-05, + "loss": 1.3655, + "step": 7533 + }, + { + "epoch": 0.81, + "grad_norm": 0.08629125966598605, + "learning_rate": 9.179900244235489e-05, + "loss": 1.4349, + "step": 7534 + }, + { + "epoch": 0.81, + "grad_norm": 0.08839588239342566, + "learning_rate": 9.169848299337764e-05, + "loss": 1.3432, + "step": 7535 + }, + { + "epoch": 0.81, + "grad_norm": 0.09113204431680144, + "learning_rate": 9.159801305236687e-05, + "loss": 1.4708, + "step": 7536 + }, + { + "epoch": 0.81, + "grad_norm": 0.08841340566615136, + "learning_rate": 9.149759263150493e-05, + "loss": 1.3131, + "step": 7537 + }, + { + "epoch": 0.81, + "grad_norm": 0.09119900971580401, + "learning_rate": 9.139722174296838e-05, + "loss": 1.4045, + "step": 7538 + }, + { + "epoch": 0.81, + "grad_norm": 0.08828684717343677, + "learning_rate": 9.129690039892735e-05, + "loss": 1.3373, + "step": 7539 + }, + { + "epoch": 0.81, + "grad_norm": 0.09411904346839678, + "learning_rate": 9.119662861154598e-05, + "loss": 1.4342, + "step": 7540 + }, + { + "epoch": 0.81, + "grad_norm": 0.09128939712399425, + "learning_rate": 9.109640639298294e-05, + "loss": 1.3869, + "step": 7541 + }, + { + "epoch": 0.81, + "grad_norm": 0.0786323938770777, + "learning_rate": 9.099623375539017e-05, + "loss": 1.3981, + "step": 7542 + }, + { + "epoch": 0.81, + "grad_norm": 0.08550467735039376, + "learning_rate": 9.089611071091414e-05, + "loss": 1.2798, + "step": 7543 + }, + { + "epoch": 0.81, + "grad_norm": 0.0956798289127353, + "learning_rate": 9.079603727169521e-05, + "loss": 1.4181, + "step": 7544 + }, + { + "epoch": 0.81, + "grad_norm": 0.09370505228997768, + "learning_rate": 9.069601344986733e-05, + "loss": 1.4377, + "step": 7545 + }, + { + "epoch": 0.81, + "grad_norm": 0.09234717974288324, + "learning_rate": 9.059603925755894e-05, + "loss": 1.308, + "step": 7546 + }, + { + "epoch": 0.81, + "grad_norm": 0.07823660374355815, + "learning_rate": 9.049611470689234e-05, + "loss": 1.5184, + "step": 7547 + }, + { + "epoch": 0.81, + "grad_norm": 0.08104361176940131, + "learning_rate": 9.039623980998346e-05, + "loss": 1.4144, + "step": 7548 + }, + { + "epoch": 0.81, + "grad_norm": 0.08313514255907395, + "learning_rate": 9.029641457894266e-05, + "loss": 1.558, + "step": 7549 + }, + { + "epoch": 0.81, + "grad_norm": 0.08348448293389067, + "learning_rate": 9.019663902587416e-05, + "loss": 1.4278, + "step": 7550 + }, + { + "epoch": 0.81, + "grad_norm": 0.09062830049572031, + "learning_rate": 9.009691316287599e-05, + "loss": 1.2734, + "step": 7551 + }, + { + "epoch": 0.81, + "grad_norm": 0.09077517992882414, + "learning_rate": 8.99972370020401e-05, + "loss": 1.3934, + "step": 7552 + }, + { + "epoch": 0.81, + "grad_norm": 0.08704980419159235, + "learning_rate": 8.989761055545276e-05, + "loss": 1.4072, + "step": 7553 + }, + { + "epoch": 0.81, + "grad_norm": 0.09613577933462986, + "learning_rate": 8.979803383519408e-05, + "loss": 1.2797, + "step": 7554 + }, + { + "epoch": 0.81, + "grad_norm": 0.08722762807612891, + "learning_rate": 8.969850685333786e-05, + "loss": 1.4651, + "step": 7555 + }, + { + "epoch": 0.81, + "grad_norm": 0.08697344594357496, + "learning_rate": 8.959902962195221e-05, + "loss": 1.2314, + "step": 7556 + }, + { + "epoch": 0.81, + "grad_norm": 0.08732520594131644, + "learning_rate": 8.949960215309921e-05, + "loss": 1.3147, + "step": 7557 + }, + { + "epoch": 0.81, + "grad_norm": 0.08586624155147374, + "learning_rate": 8.94002244588345e-05, + "loss": 1.297, + "step": 7558 + }, + { + "epoch": 0.81, + "grad_norm": 0.0888351209656821, + "learning_rate": 8.930089655120832e-05, + "loss": 1.4383, + "step": 7559 + }, + { + "epoch": 0.81, + "grad_norm": 0.08601421622050333, + "learning_rate": 8.920161844226416e-05, + "loss": 1.454, + "step": 7560 + }, + { + "epoch": 0.81, + "grad_norm": 0.09618707225930943, + "learning_rate": 8.910239014404015e-05, + "loss": 1.4776, + "step": 7561 + }, + { + "epoch": 0.81, + "grad_norm": 0.08193952890730456, + "learning_rate": 8.900321166856773e-05, + "loss": 1.5756, + "step": 7562 + }, + { + "epoch": 0.81, + "grad_norm": 0.08770806903706792, + "learning_rate": 8.8904083027873e-05, + "loss": 1.4326, + "step": 7563 + }, + { + "epoch": 0.81, + "grad_norm": 0.084513768626814, + "learning_rate": 8.88050042339753e-05, + "loss": 1.4, + "step": 7564 + }, + { + "epoch": 0.81, + "grad_norm": 0.09441648259709004, + "learning_rate": 8.870597529888847e-05, + "loss": 1.4048, + "step": 7565 + }, + { + "epoch": 0.81, + "grad_norm": 0.09472993962668384, + "learning_rate": 8.86069962346202e-05, + "loss": 1.4668, + "step": 7566 + }, + { + "epoch": 0.81, + "grad_norm": 0.08358205795408, + "learning_rate": 8.850806705317183e-05, + "loss": 1.4674, + "step": 7567 + }, + { + "epoch": 0.81, + "grad_norm": 0.09200422224464193, + "learning_rate": 8.840918776653889e-05, + "loss": 1.3779, + "step": 7568 + }, + { + "epoch": 0.81, + "grad_norm": 0.07949303886065458, + "learning_rate": 8.831035838671108e-05, + "loss": 1.3128, + "step": 7569 + }, + { + "epoch": 0.81, + "grad_norm": 0.09294930528202666, + "learning_rate": 8.82115789256715e-05, + "loss": 1.4005, + "step": 7570 + }, + { + "epoch": 0.81, + "grad_norm": 0.09418408164473525, + "learning_rate": 8.811284939539771e-05, + "loss": 1.3714, + "step": 7571 + }, + { + "epoch": 0.81, + "grad_norm": 0.08533122362657032, + "learning_rate": 8.801416980786098e-05, + "loss": 1.3943, + "step": 7572 + }, + { + "epoch": 0.81, + "grad_norm": 0.08479975478034155, + "learning_rate": 8.791554017502635e-05, + "loss": 1.4126, + "step": 7573 + }, + { + "epoch": 0.81, + "grad_norm": 0.08617405006978586, + "learning_rate": 8.781696050885313e-05, + "loss": 1.2658, + "step": 7574 + }, + { + "epoch": 0.81, + "grad_norm": 0.08868157737014712, + "learning_rate": 8.771843082129455e-05, + "loss": 1.3586, + "step": 7575 + }, + { + "epoch": 0.81, + "grad_norm": 0.09198408870723747, + "learning_rate": 8.761995112429749e-05, + "loss": 1.3592, + "step": 7576 + }, + { + "epoch": 0.81, + "grad_norm": 0.09868921187630147, + "learning_rate": 8.752152142980308e-05, + "loss": 1.339, + "step": 7577 + }, + { + "epoch": 0.81, + "grad_norm": 0.0973691738128529, + "learning_rate": 8.74231417497463e-05, + "loss": 1.299, + "step": 7578 + }, + { + "epoch": 0.81, + "grad_norm": 0.09481197124866866, + "learning_rate": 8.732481209605587e-05, + "loss": 1.3499, + "step": 7579 + }, + { + "epoch": 0.81, + "grad_norm": 0.10127492341278146, + "learning_rate": 8.722653248065466e-05, + "loss": 1.4342, + "step": 7580 + }, + { + "epoch": 0.81, + "grad_norm": 0.0879516972178395, + "learning_rate": 8.712830291545953e-05, + "loss": 1.3871, + "step": 7581 + }, + { + "epoch": 0.82, + "grad_norm": 0.08331806460834346, + "learning_rate": 8.703012341238109e-05, + "loss": 1.3958, + "step": 7582 + }, + { + "epoch": 0.82, + "grad_norm": 0.09854127039810365, + "learning_rate": 8.69319939833238e-05, + "loss": 1.4561, + "step": 7583 + }, + { + "epoch": 0.82, + "grad_norm": 0.08925024465869981, + "learning_rate": 8.683391464018625e-05, + "loss": 1.4291, + "step": 7584 + }, + { + "epoch": 0.82, + "grad_norm": 0.09092135858413694, + "learning_rate": 8.673588539486116e-05, + "loss": 1.4605, + "step": 7585 + }, + { + "epoch": 0.82, + "grad_norm": 0.08755962376538613, + "learning_rate": 8.663790625923452e-05, + "loss": 1.502, + "step": 7586 + }, + { + "epoch": 0.82, + "grad_norm": 0.08256713069946808, + "learning_rate": 8.653997724518681e-05, + "loss": 1.4174, + "step": 7587 + }, + { + "epoch": 0.82, + "grad_norm": 0.09290712035854404, + "learning_rate": 8.644209836459244e-05, + "loss": 1.4073, + "step": 7588 + }, + { + "epoch": 0.82, + "grad_norm": 0.0882259607720163, + "learning_rate": 8.634426962931924e-05, + "loss": 1.3124, + "step": 7589 + }, + { + "epoch": 0.82, + "grad_norm": 0.08602143518147524, + "learning_rate": 8.62464910512294e-05, + "loss": 1.3498, + "step": 7590 + }, + { + "epoch": 0.82, + "grad_norm": 0.08533184505192618, + "learning_rate": 8.614876264217913e-05, + "loss": 1.4247, + "step": 7591 + }, + { + "epoch": 0.82, + "grad_norm": 0.08913080708015374, + "learning_rate": 8.60510844140181e-05, + "loss": 1.4991, + "step": 7592 + }, + { + "epoch": 0.82, + "grad_norm": 0.09800999230765522, + "learning_rate": 8.595345637859009e-05, + "loss": 1.3599, + "step": 7593 + }, + { + "epoch": 0.82, + "grad_norm": 0.08656762358628595, + "learning_rate": 8.585587854773308e-05, + "loss": 1.3821, + "step": 7594 + }, + { + "epoch": 0.82, + "grad_norm": 0.07657721840844864, + "learning_rate": 8.575835093327844e-05, + "loss": 1.4007, + "step": 7595 + }, + { + "epoch": 0.82, + "grad_norm": 0.08925546459390414, + "learning_rate": 8.566087354705182e-05, + "loss": 1.4172, + "step": 7596 + }, + { + "epoch": 0.82, + "grad_norm": 0.09137356710770007, + "learning_rate": 8.556344640087288e-05, + "loss": 1.4559, + "step": 7597 + }, + { + "epoch": 0.82, + "grad_norm": 0.0977920327352257, + "learning_rate": 8.546606950655467e-05, + "loss": 1.2935, + "step": 7598 + }, + { + "epoch": 0.82, + "grad_norm": 0.09795370695417718, + "learning_rate": 8.536874287590468e-05, + "loss": 1.3887, + "step": 7599 + }, + { + "epoch": 0.82, + "grad_norm": 0.09176662763448601, + "learning_rate": 8.527146652072421e-05, + "loss": 1.5284, + "step": 7600 + }, + { + "epoch": 0.82, + "grad_norm": 0.08372935816698124, + "learning_rate": 8.517424045280808e-05, + "loss": 1.3019, + "step": 7601 + }, + { + "epoch": 0.82, + "grad_norm": 0.08514073546626989, + "learning_rate": 8.507706468394544e-05, + "loss": 1.3655, + "step": 7602 + }, + { + "epoch": 0.82, + "grad_norm": 0.09309189623499317, + "learning_rate": 8.497993922591934e-05, + "loss": 1.3931, + "step": 7603 + }, + { + "epoch": 0.82, + "grad_norm": 0.10140416177523041, + "learning_rate": 8.488286409050622e-05, + "loss": 1.3656, + "step": 7604 + }, + { + "epoch": 0.82, + "grad_norm": 0.09603451991030429, + "learning_rate": 8.478583928947692e-05, + "loss": 1.5697, + "step": 7605 + }, + { + "epoch": 0.82, + "grad_norm": 0.0977670160586023, + "learning_rate": 8.46888648345962e-05, + "loss": 1.5035, + "step": 7606 + }, + { + "epoch": 0.82, + "grad_norm": 0.08551839072309805, + "learning_rate": 8.459194073762227e-05, + "loss": 1.4294, + "step": 7607 + }, + { + "epoch": 0.82, + "grad_norm": 0.08037591290539492, + "learning_rate": 8.449506701030774e-05, + "loss": 1.5657, + "step": 7608 + }, + { + "epoch": 0.82, + "grad_norm": 0.09590980900355255, + "learning_rate": 8.439824366439885e-05, + "loss": 1.2933, + "step": 7609 + }, + { + "epoch": 0.82, + "grad_norm": 0.08713666532241826, + "learning_rate": 8.430147071163558e-05, + "loss": 1.4531, + "step": 7610 + }, + { + "epoch": 0.82, + "grad_norm": 0.08992186550468441, + "learning_rate": 8.420474816375212e-05, + "loss": 1.4069, + "step": 7611 + }, + { + "epoch": 0.82, + "grad_norm": 0.08338253904896663, + "learning_rate": 8.410807603247655e-05, + "loss": 1.496, + "step": 7612 + }, + { + "epoch": 0.82, + "grad_norm": 0.09154435310147199, + "learning_rate": 8.401145432953045e-05, + "loss": 1.4009, + "step": 7613 + }, + { + "epoch": 0.82, + "grad_norm": 0.11182926553803102, + "learning_rate": 8.391488306662976e-05, + "loss": 1.4552, + "step": 7614 + }, + { + "epoch": 0.82, + "grad_norm": 0.0862731974075149, + "learning_rate": 8.381836225548383e-05, + "loss": 1.2472, + "step": 7615 + }, + { + "epoch": 0.82, + "grad_norm": 0.08422895721651501, + "learning_rate": 8.372189190779639e-05, + "loss": 1.439, + "step": 7616 + }, + { + "epoch": 0.82, + "grad_norm": 0.09190535096534928, + "learning_rate": 8.362547203526455e-05, + "loss": 1.3751, + "step": 7617 + }, + { + "epoch": 0.82, + "grad_norm": 0.08301888840775173, + "learning_rate": 8.352910264957969e-05, + "loss": 1.351, + "step": 7618 + }, + { + "epoch": 0.82, + "grad_norm": 0.09395473446856016, + "learning_rate": 8.343278376242708e-05, + "loss": 1.426, + "step": 7619 + }, + { + "epoch": 0.82, + "grad_norm": 0.09327444144541883, + "learning_rate": 8.333651538548542e-05, + "loss": 1.4021, + "step": 7620 + }, + { + "epoch": 0.82, + "grad_norm": 0.09002012453941054, + "learning_rate": 8.324029753042773e-05, + "loss": 1.4536, + "step": 7621 + }, + { + "epoch": 0.82, + "grad_norm": 0.11071663575181982, + "learning_rate": 8.314413020892086e-05, + "loss": 1.4495, + "step": 7622 + }, + { + "epoch": 0.82, + "grad_norm": 0.0961113671788749, + "learning_rate": 8.304801343262525e-05, + "loss": 1.4342, + "step": 7623 + }, + { + "epoch": 0.82, + "grad_norm": 0.09501998230845707, + "learning_rate": 8.295194721319555e-05, + "loss": 1.2294, + "step": 7624 + }, + { + "epoch": 0.82, + "grad_norm": 0.08577388799800784, + "learning_rate": 8.285593156228005e-05, + "loss": 1.4448, + "step": 7625 + }, + { + "epoch": 0.82, + "grad_norm": 0.08585778313120833, + "learning_rate": 8.275996649152085e-05, + "loss": 1.4738, + "step": 7626 + }, + { + "epoch": 0.82, + "grad_norm": 0.09073679170236165, + "learning_rate": 8.266405201255411e-05, + "loss": 1.3423, + "step": 7627 + }, + { + "epoch": 0.82, + "grad_norm": 0.08366332122612558, + "learning_rate": 8.256818813701e-05, + "loss": 1.3645, + "step": 7628 + }, + { + "epoch": 0.82, + "grad_norm": 0.1010036314167468, + "learning_rate": 8.247237487651204e-05, + "loss": 1.2581, + "step": 7629 + }, + { + "epoch": 0.82, + "grad_norm": 0.08561960263147947, + "learning_rate": 8.237661224267806e-05, + "loss": 1.3947, + "step": 7630 + }, + { + "epoch": 0.82, + "grad_norm": 0.08579164829357397, + "learning_rate": 8.228090024711976e-05, + "loss": 1.585, + "step": 7631 + }, + { + "epoch": 0.82, + "grad_norm": 0.07982612565350453, + "learning_rate": 8.218523890144225e-05, + "loss": 1.3644, + "step": 7632 + }, + { + "epoch": 0.82, + "grad_norm": 0.09154428443060891, + "learning_rate": 8.208962821724497e-05, + "loss": 1.4524, + "step": 7633 + }, + { + "epoch": 0.82, + "grad_norm": 0.10977900239432357, + "learning_rate": 8.199406820612116e-05, + "loss": 1.4807, + "step": 7634 + }, + { + "epoch": 0.82, + "grad_norm": 0.087843255682426, + "learning_rate": 8.189855887965763e-05, + "loss": 1.3473, + "step": 7635 + }, + { + "epoch": 0.82, + "grad_norm": 0.0944862488161355, + "learning_rate": 8.180310024943515e-05, + "loss": 1.4448, + "step": 7636 + }, + { + "epoch": 0.82, + "grad_norm": 0.08936470015321014, + "learning_rate": 8.170769232702868e-05, + "loss": 1.4059, + "step": 7637 + }, + { + "epoch": 0.82, + "grad_norm": 0.0942866875546708, + "learning_rate": 8.16123351240064e-05, + "loss": 1.46, + "step": 7638 + }, + { + "epoch": 0.82, + "grad_norm": 0.09433441619949548, + "learning_rate": 8.15170286519309e-05, + "loss": 1.3914, + "step": 7639 + }, + { + "epoch": 0.82, + "grad_norm": 0.08536000362791818, + "learning_rate": 8.142177292235853e-05, + "loss": 1.4205, + "step": 7640 + }, + { + "epoch": 0.82, + "grad_norm": 0.1131150345766374, + "learning_rate": 8.132656794683912e-05, + "loss": 1.3373, + "step": 7641 + }, + { + "epoch": 0.82, + "grad_norm": 0.09912843864926806, + "learning_rate": 8.123141373691678e-05, + "loss": 1.4338, + "step": 7642 + }, + { + "epoch": 0.82, + "grad_norm": 0.09231463345028104, + "learning_rate": 8.113631030412932e-05, + "loss": 1.4434, + "step": 7643 + }, + { + "epoch": 0.82, + "grad_norm": 0.09127964871657997, + "learning_rate": 8.104125766000814e-05, + "loss": 1.4054, + "step": 7644 + }, + { + "epoch": 0.82, + "grad_norm": 0.09058060725166724, + "learning_rate": 8.094625581607901e-05, + "loss": 1.3209, + "step": 7645 + }, + { + "epoch": 0.82, + "grad_norm": 0.09059867682124947, + "learning_rate": 8.085130478386088e-05, + "loss": 1.4915, + "step": 7646 + }, + { + "epoch": 0.82, + "grad_norm": 0.08947413283209918, + "learning_rate": 8.075640457486722e-05, + "loss": 1.2977, + "step": 7647 + }, + { + "epoch": 0.82, + "grad_norm": 0.08459648939665679, + "learning_rate": 8.066155520060475e-05, + "loss": 1.2952, + "step": 7648 + }, + { + "epoch": 0.82, + "grad_norm": 0.08603049389030244, + "learning_rate": 8.05667566725744e-05, + "loss": 1.378, + "step": 7649 + }, + { + "epoch": 0.82, + "grad_norm": 0.0853412982084942, + "learning_rate": 8.047200900227092e-05, + "loss": 1.3794, + "step": 7650 + }, + { + "epoch": 0.82, + "grad_norm": 0.10203544145492183, + "learning_rate": 8.037731220118256e-05, + "loss": 1.4407, + "step": 7651 + }, + { + "epoch": 0.82, + "grad_norm": 0.09023196890129039, + "learning_rate": 8.028266628079173e-05, + "loss": 1.4527, + "step": 7652 + }, + { + "epoch": 0.82, + "grad_norm": 0.0808947281733063, + "learning_rate": 8.018807125257476e-05, + "loss": 1.4643, + "step": 7653 + }, + { + "epoch": 0.82, + "grad_norm": 0.09435279095402682, + "learning_rate": 8.009352712800128e-05, + "loss": 1.4356, + "step": 7654 + }, + { + "epoch": 0.82, + "grad_norm": 0.08363094023395468, + "learning_rate": 7.999903391853547e-05, + "loss": 1.4149, + "step": 7655 + }, + { + "epoch": 0.82, + "grad_norm": 0.09201333593543766, + "learning_rate": 7.990459163563474e-05, + "loss": 1.4506, + "step": 7656 + }, + { + "epoch": 0.82, + "grad_norm": 0.08267699262829642, + "learning_rate": 7.981020029075043e-05, + "loss": 1.3957, + "step": 7657 + }, + { + "epoch": 0.82, + "grad_norm": 0.09365508246569157, + "learning_rate": 7.971585989532792e-05, + "loss": 1.2511, + "step": 7658 + }, + { + "epoch": 0.82, + "grad_norm": 0.08658708993034162, + "learning_rate": 7.962157046080648e-05, + "loss": 1.4408, + "step": 7659 + }, + { + "epoch": 0.82, + "grad_norm": 0.09438322216968405, + "learning_rate": 7.952733199861878e-05, + "loss": 1.439, + "step": 7660 + }, + { + "epoch": 0.82, + "grad_norm": 0.0874301144366475, + "learning_rate": 7.943314452019168e-05, + "loss": 1.4711, + "step": 7661 + }, + { + "epoch": 0.82, + "grad_norm": 0.08256679231136793, + "learning_rate": 7.933900803694576e-05, + "loss": 1.4718, + "step": 7662 + }, + { + "epoch": 0.82, + "grad_norm": 0.08325944584328684, + "learning_rate": 7.92449225602953e-05, + "loss": 1.3529, + "step": 7663 + }, + { + "epoch": 0.82, + "grad_norm": 0.08691118535626666, + "learning_rate": 7.915088810164855e-05, + "loss": 1.4396, + "step": 7664 + }, + { + "epoch": 0.82, + "grad_norm": 0.08063120692409934, + "learning_rate": 7.905690467240762e-05, + "loss": 1.5015, + "step": 7665 + }, + { + "epoch": 0.82, + "grad_norm": 0.080837406638755, + "learning_rate": 7.89629722839682e-05, + "loss": 1.2455, + "step": 7666 + }, + { + "epoch": 0.82, + "grad_norm": 0.08510788864340407, + "learning_rate": 7.886909094771982e-05, + "loss": 1.3956, + "step": 7667 + }, + { + "epoch": 0.82, + "grad_norm": 0.08059477667817433, + "learning_rate": 7.877526067504614e-05, + "loss": 1.4848, + "step": 7668 + }, + { + "epoch": 0.82, + "grad_norm": 0.08384953530723206, + "learning_rate": 7.868148147732413e-05, + "loss": 1.3094, + "step": 7669 + }, + { + "epoch": 0.82, + "grad_norm": 0.08851835803646642, + "learning_rate": 7.858775336592505e-05, + "loss": 1.3807, + "step": 7670 + }, + { + "epoch": 0.82, + "grad_norm": 0.09268354025660072, + "learning_rate": 7.849407635221379e-05, + "loss": 1.4575, + "step": 7671 + }, + { + "epoch": 0.82, + "grad_norm": 0.07953833744699162, + "learning_rate": 7.840045044754879e-05, + "loss": 1.4784, + "step": 7672 + }, + { + "epoch": 0.82, + "grad_norm": 0.09061781935977607, + "learning_rate": 7.830687566328265e-05, + "loss": 1.493, + "step": 7673 + }, + { + "epoch": 0.82, + "grad_norm": 0.09061908733465755, + "learning_rate": 7.82133520107618e-05, + "loss": 1.2913, + "step": 7674 + }, + { + "epoch": 0.83, + "grad_norm": 0.08607373087290072, + "learning_rate": 7.811987950132599e-05, + "loss": 1.2954, + "step": 7675 + }, + { + "epoch": 0.83, + "grad_norm": 0.09028855772629082, + "learning_rate": 7.802645814630921e-05, + "loss": 1.3465, + "step": 7676 + }, + { + "epoch": 0.83, + "grad_norm": 0.08628410645460714, + "learning_rate": 7.793308795703929e-05, + "loss": 1.3811, + "step": 7677 + }, + { + "epoch": 0.83, + "grad_norm": 0.09710158064190848, + "learning_rate": 7.783976894483752e-05, + "loss": 1.4639, + "step": 7678 + }, + { + "epoch": 0.83, + "grad_norm": 0.08846974972943605, + "learning_rate": 7.774650112101911e-05, + "loss": 1.3841, + "step": 7679 + }, + { + "epoch": 0.83, + "grad_norm": 0.07809311245800762, + "learning_rate": 7.765328449689312e-05, + "loss": 1.5403, + "step": 7680 + }, + { + "epoch": 0.83, + "grad_norm": 0.08931552275490337, + "learning_rate": 7.756011908376254e-05, + "loss": 1.5256, + "step": 7681 + }, + { + "epoch": 0.83, + "grad_norm": 0.0891463968433547, + "learning_rate": 7.74670048929238e-05, + "loss": 1.3094, + "step": 7682 + }, + { + "epoch": 0.83, + "grad_norm": 0.08977675661275883, + "learning_rate": 7.737394193566744e-05, + "loss": 1.4532, + "step": 7683 + }, + { + "epoch": 0.83, + "grad_norm": 0.08825867071922235, + "learning_rate": 7.72809302232777e-05, + "loss": 1.3373, + "step": 7684 + }, + { + "epoch": 0.83, + "grad_norm": 0.09372314922944172, + "learning_rate": 7.71879697670324e-05, + "loss": 1.3349, + "step": 7685 + }, + { + "epoch": 0.83, + "grad_norm": 0.09768865167003908, + "learning_rate": 7.709506057820343e-05, + "loss": 1.4697, + "step": 7686 + }, + { + "epoch": 0.83, + "grad_norm": 0.08895694737402585, + "learning_rate": 7.700220266805647e-05, + "loss": 1.3731, + "step": 7687 + }, + { + "epoch": 0.83, + "grad_norm": 0.09179791236859439, + "learning_rate": 7.690939604785069e-05, + "loss": 1.4724, + "step": 7688 + }, + { + "epoch": 0.83, + "grad_norm": 0.0897147299222315, + "learning_rate": 7.681664072883914e-05, + "loss": 1.4539, + "step": 7689 + }, + { + "epoch": 0.83, + "grad_norm": 0.09394049537942313, + "learning_rate": 7.672393672226901e-05, + "loss": 1.2418, + "step": 7690 + }, + { + "epoch": 0.83, + "grad_norm": 0.08350359871804075, + "learning_rate": 7.663128403938063e-05, + "loss": 1.5139, + "step": 7691 + }, + { + "epoch": 0.83, + "grad_norm": 0.08732287161011305, + "learning_rate": 7.653868269140869e-05, + "loss": 1.306, + "step": 7692 + }, + { + "epoch": 0.83, + "grad_norm": 0.0896616196238533, + "learning_rate": 7.644613268958144e-05, + "loss": 1.3701, + "step": 7693 + }, + { + "epoch": 0.83, + "grad_norm": 0.08764433658911641, + "learning_rate": 7.635363404512069e-05, + "loss": 1.4636, + "step": 7694 + }, + { + "epoch": 0.83, + "grad_norm": 0.1030645066169904, + "learning_rate": 7.626118676924237e-05, + "loss": 1.2745, + "step": 7695 + }, + { + "epoch": 0.83, + "grad_norm": 0.09333495190715421, + "learning_rate": 7.616879087315614e-05, + "loss": 1.3622, + "step": 7696 + }, + { + "epoch": 0.83, + "grad_norm": 0.10195538862682457, + "learning_rate": 7.607644636806505e-05, + "loss": 1.4147, + "step": 7697 + }, + { + "epoch": 0.83, + "grad_norm": 0.08923282440460135, + "learning_rate": 7.598415326516638e-05, + "loss": 1.4762, + "step": 7698 + }, + { + "epoch": 0.83, + "grad_norm": 0.08931760167995045, + "learning_rate": 7.5891911575651e-05, + "loss": 1.3004, + "step": 7699 + }, + { + "epoch": 0.83, + "grad_norm": 0.08938475556208395, + "learning_rate": 7.579972131070334e-05, + "loss": 1.2865, + "step": 7700 + }, + { + "epoch": 0.83, + "grad_norm": 0.07974469882035637, + "learning_rate": 7.570758248150194e-05, + "loss": 1.32, + "step": 7701 + }, + { + "epoch": 0.83, + "grad_norm": 0.11124565390940556, + "learning_rate": 7.5615495099219e-05, + "loss": 1.3306, + "step": 7702 + }, + { + "epoch": 0.83, + "grad_norm": 0.09465556704118003, + "learning_rate": 7.552345917502023e-05, + "loss": 1.4847, + "step": 7703 + }, + { + "epoch": 0.83, + "grad_norm": 0.08633690644820705, + "learning_rate": 7.543147472006545e-05, + "loss": 1.3595, + "step": 7704 + }, + { + "epoch": 0.83, + "grad_norm": 0.07694845369969425, + "learning_rate": 7.533954174550817e-05, + "loss": 1.4188, + "step": 7705 + }, + { + "epoch": 0.83, + "grad_norm": 0.0888851522271176, + "learning_rate": 7.524766026249536e-05, + "loss": 1.4088, + "step": 7706 + }, + { + "epoch": 0.83, + "grad_norm": 0.08604335226277512, + "learning_rate": 7.51558302821681e-05, + "loss": 1.3517, + "step": 7707 + }, + { + "epoch": 0.83, + "grad_norm": 0.0916267922007084, + "learning_rate": 7.506405181566123e-05, + "loss": 1.507, + "step": 7708 + }, + { + "epoch": 0.83, + "grad_norm": 0.0858819386378702, + "learning_rate": 7.497232487410299e-05, + "loss": 1.4484, + "step": 7709 + }, + { + "epoch": 0.83, + "grad_norm": 0.08443971498772733, + "learning_rate": 7.488064946861555e-05, + "loss": 1.3468, + "step": 7710 + }, + { + "epoch": 0.83, + "grad_norm": 0.08820651296219477, + "learning_rate": 7.478902561031503e-05, + "loss": 1.3766, + "step": 7711 + }, + { + "epoch": 0.83, + "grad_norm": 0.08569244916818314, + "learning_rate": 7.469745331031114e-05, + "loss": 1.4528, + "step": 7712 + }, + { + "epoch": 0.83, + "grad_norm": 0.09377223486281296, + "learning_rate": 7.460593257970716e-05, + "loss": 1.426, + "step": 7713 + }, + { + "epoch": 0.83, + "grad_norm": 0.09446187498251976, + "learning_rate": 7.451446342960044e-05, + "loss": 1.3474, + "step": 7714 + }, + { + "epoch": 0.83, + "grad_norm": 0.08327545046965788, + "learning_rate": 7.4423045871082e-05, + "loss": 1.3567, + "step": 7715 + }, + { + "epoch": 0.83, + "grad_norm": 0.08832129555517432, + "learning_rate": 7.433167991523631e-05, + "loss": 1.4224, + "step": 7716 + }, + { + "epoch": 0.83, + "grad_norm": 0.08522053430819553, + "learning_rate": 7.424036557314196e-05, + "loss": 1.5376, + "step": 7717 + }, + { + "epoch": 0.83, + "grad_norm": 0.09278789708741045, + "learning_rate": 7.414910285587117e-05, + "loss": 1.4763, + "step": 7718 + }, + { + "epoch": 0.83, + "grad_norm": 0.09264503427204793, + "learning_rate": 7.405789177448985e-05, + "loss": 1.2921, + "step": 7719 + }, + { + "epoch": 0.83, + "grad_norm": 0.0856871779296312, + "learning_rate": 7.39667323400574e-05, + "loss": 1.3306, + "step": 7720 + }, + { + "epoch": 0.83, + "grad_norm": 0.08823520611675048, + "learning_rate": 7.387562456362762e-05, + "loss": 1.345, + "step": 7721 + }, + { + "epoch": 0.83, + "grad_norm": 0.08796941351682677, + "learning_rate": 7.378456845624725e-05, + "loss": 1.5243, + "step": 7722 + }, + { + "epoch": 0.83, + "grad_norm": 0.08979097920858628, + "learning_rate": 7.369356402895738e-05, + "loss": 1.3536, + "step": 7723 + }, + { + "epoch": 0.83, + "grad_norm": 0.08436809874166219, + "learning_rate": 7.360261129279272e-05, + "loss": 1.3238, + "step": 7724 + }, + { + "epoch": 0.83, + "grad_norm": 0.10352532323355235, + "learning_rate": 7.351171025878128e-05, + "loss": 1.4435, + "step": 7725 + }, + { + "epoch": 0.83, + "grad_norm": 0.07711372998127677, + "learning_rate": 7.342086093794531e-05, + "loss": 1.4107, + "step": 7726 + }, + { + "epoch": 0.83, + "grad_norm": 0.09640807472724792, + "learning_rate": 7.333006334130077e-05, + "loss": 1.3328, + "step": 7727 + }, + { + "epoch": 0.83, + "grad_norm": 0.08914874349546113, + "learning_rate": 7.32393174798568e-05, + "loss": 1.3379, + "step": 7728 + }, + { + "epoch": 0.83, + "grad_norm": 0.08496885016196233, + "learning_rate": 7.314862336461692e-05, + "loss": 1.3343, + "step": 7729 + }, + { + "epoch": 0.83, + "grad_norm": 0.07735334418737376, + "learning_rate": 7.305798100657823e-05, + "loss": 1.4266, + "step": 7730 + }, + { + "epoch": 0.83, + "grad_norm": 0.08938125918239577, + "learning_rate": 7.2967390416731e-05, + "loss": 1.4782, + "step": 7731 + }, + { + "epoch": 0.83, + "grad_norm": 0.09278179808639771, + "learning_rate": 7.287685160605995e-05, + "loss": 1.3485, + "step": 7732 + }, + { + "epoch": 0.83, + "grad_norm": 0.08403083852248348, + "learning_rate": 7.278636458554322e-05, + "loss": 1.4589, + "step": 7733 + }, + { + "epoch": 0.83, + "grad_norm": 0.09068179591128482, + "learning_rate": 7.26959293661525e-05, + "loss": 1.5567, + "step": 7734 + }, + { + "epoch": 0.83, + "grad_norm": 0.0762056740331929, + "learning_rate": 7.26055459588535e-05, + "loss": 1.2854, + "step": 7735 + }, + { + "epoch": 0.83, + "grad_norm": 0.07528963922741068, + "learning_rate": 7.251521437460562e-05, + "loss": 1.3909, + "step": 7736 + }, + { + "epoch": 0.83, + "grad_norm": 0.0942168001290263, + "learning_rate": 7.242493462436161e-05, + "loss": 1.2895, + "step": 7737 + }, + { + "epoch": 0.83, + "grad_norm": 0.10240321129164565, + "learning_rate": 7.233470671906844e-05, + "loss": 1.2489, + "step": 7738 + }, + { + "epoch": 0.83, + "grad_norm": 0.08504284002213647, + "learning_rate": 7.224453066966652e-05, + "loss": 1.3601, + "step": 7739 + }, + { + "epoch": 0.83, + "grad_norm": 0.09587235474791764, + "learning_rate": 7.215440648708982e-05, + "loss": 1.4656, + "step": 7740 + }, + { + "epoch": 0.83, + "grad_norm": 0.08536226344078945, + "learning_rate": 7.206433418226649e-05, + "loss": 1.4795, + "step": 7741 + }, + { + "epoch": 0.83, + "grad_norm": 0.09140896762573765, + "learning_rate": 7.197431376611785e-05, + "loss": 1.4325, + "step": 7742 + }, + { + "epoch": 0.83, + "grad_norm": 0.09015041988015379, + "learning_rate": 7.188434524955939e-05, + "loss": 1.4109, + "step": 7743 + }, + { + "epoch": 0.83, + "grad_norm": 0.08271750495957778, + "learning_rate": 7.179442864349988e-05, + "loss": 1.4846, + "step": 7744 + }, + { + "epoch": 0.83, + "grad_norm": 0.08829217021420534, + "learning_rate": 7.170456395884217e-05, + "loss": 1.4766, + "step": 7745 + }, + { + "epoch": 0.83, + "grad_norm": 0.08833891796417223, + "learning_rate": 7.161475120648276e-05, + "loss": 1.3419, + "step": 7746 + }, + { + "epoch": 0.83, + "grad_norm": 0.08900580755952446, + "learning_rate": 7.152499039731153e-05, + "loss": 1.5181, + "step": 7747 + }, + { + "epoch": 0.83, + "grad_norm": 0.0860664905985196, + "learning_rate": 7.143528154221235e-05, + "loss": 1.4304, + "step": 7748 + }, + { + "epoch": 0.83, + "grad_norm": 0.08884608101931617, + "learning_rate": 7.13456246520629e-05, + "loss": 1.4872, + "step": 7749 + }, + { + "epoch": 0.83, + "grad_norm": 0.08909893472267622, + "learning_rate": 7.125601973773416e-05, + "loss": 1.3473, + "step": 7750 + }, + { + "epoch": 0.83, + "grad_norm": 0.08642569234811122, + "learning_rate": 7.11664668100912e-05, + "loss": 1.5026, + "step": 7751 + }, + { + "epoch": 0.83, + "grad_norm": 0.0867109188311294, + "learning_rate": 7.107696587999263e-05, + "loss": 1.3332, + "step": 7752 + }, + { + "epoch": 0.83, + "grad_norm": 0.07775756140298237, + "learning_rate": 7.098751695829048e-05, + "loss": 1.2844, + "step": 7753 + }, + { + "epoch": 0.83, + "grad_norm": 0.08664325787505547, + "learning_rate": 7.089812005583096e-05, + "loss": 1.4412, + "step": 7754 + }, + { + "epoch": 0.83, + "grad_norm": 0.08714207866053697, + "learning_rate": 7.080877518345385e-05, + "loss": 1.5046, + "step": 7755 + }, + { + "epoch": 0.83, + "grad_norm": 0.09008286495715372, + "learning_rate": 7.071948235199228e-05, + "loss": 1.4347, + "step": 7756 + }, + { + "epoch": 0.83, + "grad_norm": 0.09963396647707887, + "learning_rate": 7.063024157227344e-05, + "loss": 1.411, + "step": 7757 + }, + { + "epoch": 0.83, + "grad_norm": 0.08125783843505932, + "learning_rate": 7.054105285511814e-05, + "loss": 1.4923, + "step": 7758 + }, + { + "epoch": 0.83, + "grad_norm": 0.09825938535932652, + "learning_rate": 7.045191621134068e-05, + "loss": 1.4293, + "step": 7759 + }, + { + "epoch": 0.83, + "grad_norm": 0.08176546166160557, + "learning_rate": 7.036283165174923e-05, + "loss": 1.4709, + "step": 7760 + }, + { + "epoch": 0.83, + "grad_norm": 0.07799311174622484, + "learning_rate": 7.027379918714577e-05, + "loss": 1.3462, + "step": 7761 + }, + { + "epoch": 0.83, + "grad_norm": 0.08215288669420732, + "learning_rate": 7.018481882832561e-05, + "loss": 1.3547, + "step": 7762 + }, + { + "epoch": 0.83, + "grad_norm": 0.0971949111065358, + "learning_rate": 7.00958905860779e-05, + "loss": 1.3067, + "step": 7763 + }, + { + "epoch": 0.83, + "grad_norm": 0.0810382090903955, + "learning_rate": 7.000701447118563e-05, + "loss": 1.4154, + "step": 7764 + }, + { + "epoch": 0.83, + "grad_norm": 0.08624335475689453, + "learning_rate": 6.991819049442516e-05, + "loss": 1.4382, + "step": 7765 + }, + { + "epoch": 0.83, + "grad_norm": 0.0923274614359002, + "learning_rate": 6.982941866656684e-05, + "loss": 1.4152, + "step": 7766 + }, + { + "epoch": 0.83, + "grad_norm": 0.08995040104069521, + "learning_rate": 6.974069899837465e-05, + "loss": 1.5865, + "step": 7767 + }, + { + "epoch": 0.84, + "grad_norm": 0.0899147874974617, + "learning_rate": 6.965203150060589e-05, + "loss": 1.5105, + "step": 7768 + }, + { + "epoch": 0.84, + "grad_norm": 0.0835013673745431, + "learning_rate": 6.956341618401196e-05, + "loss": 1.3654, + "step": 7769 + }, + { + "epoch": 0.84, + "grad_norm": 0.09350769325031699, + "learning_rate": 6.947485305933787e-05, + "loss": 1.3866, + "step": 7770 + }, + { + "epoch": 0.84, + "grad_norm": 0.07761428841872348, + "learning_rate": 6.938634213732198e-05, + "loss": 1.3952, + "step": 7771 + }, + { + "epoch": 0.84, + "grad_norm": 0.10393913360882924, + "learning_rate": 6.929788342869676e-05, + "loss": 1.4183, + "step": 7772 + }, + { + "epoch": 0.84, + "grad_norm": 0.09154838170195079, + "learning_rate": 6.920947694418789e-05, + "loss": 1.2753, + "step": 7773 + }, + { + "epoch": 0.84, + "grad_norm": 0.09637707494862725, + "learning_rate": 6.912112269451526e-05, + "loss": 1.3974, + "step": 7774 + }, + { + "epoch": 0.84, + "grad_norm": 0.09508845670840485, + "learning_rate": 6.903282069039185e-05, + "loss": 1.4752, + "step": 7775 + }, + { + "epoch": 0.84, + "grad_norm": 0.11155591005942683, + "learning_rate": 6.894457094252471e-05, + "loss": 1.3607, + "step": 7776 + }, + { + "epoch": 0.84, + "grad_norm": 0.10317484976555591, + "learning_rate": 6.885637346161449e-05, + "loss": 1.3526, + "step": 7777 + }, + { + "epoch": 0.84, + "grad_norm": 0.08389106407279763, + "learning_rate": 6.876822825835527e-05, + "loss": 1.2543, + "step": 7778 + }, + { + "epoch": 0.84, + "grad_norm": 0.09115262148193395, + "learning_rate": 6.86801353434351e-05, + "loss": 1.422, + "step": 7779 + }, + { + "epoch": 0.84, + "grad_norm": 0.0919312396699346, + "learning_rate": 6.859209472753559e-05, + "loss": 1.3834, + "step": 7780 + }, + { + "epoch": 0.84, + "grad_norm": 0.0771367133563575, + "learning_rate": 6.850410642133176e-05, + "loss": 1.3485, + "step": 7781 + }, + { + "epoch": 0.84, + "grad_norm": 0.08852606976685708, + "learning_rate": 6.841617043549259e-05, + "loss": 1.337, + "step": 7782 + }, + { + "epoch": 0.84, + "grad_norm": 0.09033201320751114, + "learning_rate": 6.832828678068092e-05, + "loss": 1.4328, + "step": 7783 + }, + { + "epoch": 0.84, + "grad_norm": 0.08586344860911486, + "learning_rate": 6.824045546755242e-05, + "loss": 1.5029, + "step": 7784 + }, + { + "epoch": 0.84, + "grad_norm": 0.09016545117776988, + "learning_rate": 6.815267650675721e-05, + "loss": 1.5326, + "step": 7785 + }, + { + "epoch": 0.84, + "grad_norm": 0.09318420049557376, + "learning_rate": 6.80649499089389e-05, + "loss": 1.3503, + "step": 7786 + }, + { + "epoch": 0.84, + "grad_norm": 0.09915799008577442, + "learning_rate": 6.797727568473444e-05, + "loss": 1.4098, + "step": 7787 + }, + { + "epoch": 0.84, + "grad_norm": 0.0998684888213455, + "learning_rate": 6.788965384477463e-05, + "loss": 1.3458, + "step": 7788 + }, + { + "epoch": 0.84, + "grad_norm": 0.09354345098880852, + "learning_rate": 6.780208439968417e-05, + "loss": 1.4528, + "step": 7789 + }, + { + "epoch": 0.84, + "grad_norm": 0.10346740823288775, + "learning_rate": 6.771456736008086e-05, + "loss": 1.4361, + "step": 7790 + }, + { + "epoch": 0.84, + "grad_norm": 0.08885901539295292, + "learning_rate": 6.762710273657658e-05, + "loss": 1.389, + "step": 7791 + }, + { + "epoch": 0.84, + "grad_norm": 0.09811897324623484, + "learning_rate": 6.753969053977683e-05, + "loss": 1.4192, + "step": 7792 + }, + { + "epoch": 0.84, + "grad_norm": 0.08673617423121921, + "learning_rate": 6.745233078028041e-05, + "loss": 1.2513, + "step": 7793 + }, + { + "epoch": 0.84, + "grad_norm": 0.0912880836237642, + "learning_rate": 6.736502346868018e-05, + "loss": 1.2771, + "step": 7794 + }, + { + "epoch": 0.84, + "grad_norm": 0.08520826836546021, + "learning_rate": 6.727776861556239e-05, + "loss": 1.2978, + "step": 7795 + }, + { + "epoch": 0.84, + "grad_norm": 0.09253343356505377, + "learning_rate": 6.719056623150687e-05, + "loss": 1.4179, + "step": 7796 + }, + { + "epoch": 0.84, + "grad_norm": 0.09727907298470896, + "learning_rate": 6.710341632708733e-05, + "loss": 1.2541, + "step": 7797 + }, + { + "epoch": 0.84, + "grad_norm": 0.10455922957410266, + "learning_rate": 6.701631891287108e-05, + "loss": 1.4294, + "step": 7798 + }, + { + "epoch": 0.84, + "grad_norm": 0.09565500343844599, + "learning_rate": 6.692927399941878e-05, + "loss": 1.3228, + "step": 7799 + }, + { + "epoch": 0.84, + "grad_norm": 0.09670216486916945, + "learning_rate": 6.684228159728501e-05, + "loss": 1.337, + "step": 7800 + }, + { + "epoch": 0.84, + "grad_norm": 0.09031993166332863, + "learning_rate": 6.675534171701802e-05, + "loss": 1.4904, + "step": 7801 + }, + { + "epoch": 0.84, + "grad_norm": 0.1106867657427879, + "learning_rate": 6.666845436915941e-05, + "loss": 1.4883, + "step": 7802 + }, + { + "epoch": 0.84, + "grad_norm": 0.08750518151251388, + "learning_rate": 6.658161956424457e-05, + "loss": 1.5734, + "step": 7803 + }, + { + "epoch": 0.84, + "grad_norm": 0.09277619307591722, + "learning_rate": 6.64948373128027e-05, + "loss": 1.4289, + "step": 7804 + }, + { + "epoch": 0.84, + "grad_norm": 0.09255864978715336, + "learning_rate": 6.640810762535637e-05, + "loss": 1.4721, + "step": 7805 + }, + { + "epoch": 0.84, + "grad_norm": 0.08307761174818809, + "learning_rate": 6.632143051242168e-05, + "loss": 1.3225, + "step": 7806 + }, + { + "epoch": 0.84, + "grad_norm": 0.08957267637082313, + "learning_rate": 6.623480598450866e-05, + "loss": 1.262, + "step": 7807 + }, + { + "epoch": 0.84, + "grad_norm": 0.10565226536610126, + "learning_rate": 6.61482340521209e-05, + "loss": 1.3611, + "step": 7808 + }, + { + "epoch": 0.84, + "grad_norm": 0.09842970799267935, + "learning_rate": 6.606171472575539e-05, + "loss": 1.3758, + "step": 7809 + }, + { + "epoch": 0.84, + "grad_norm": 0.09138816156285817, + "learning_rate": 6.597524801590299e-05, + "loss": 1.3514, + "step": 7810 + }, + { + "epoch": 0.84, + "grad_norm": 0.08304644485072452, + "learning_rate": 6.588883393304818e-05, + "loss": 1.4504, + "step": 7811 + }, + { + "epoch": 0.84, + "grad_norm": 0.08179421892153053, + "learning_rate": 6.580247248766875e-05, + "loss": 1.4556, + "step": 7812 + }, + { + "epoch": 0.84, + "grad_norm": 0.0905917159466832, + "learning_rate": 6.571616369023642e-05, + "loss": 1.343, + "step": 7813 + }, + { + "epoch": 0.84, + "grad_norm": 0.08782451845628722, + "learning_rate": 6.562990755121661e-05, + "loss": 1.4379, + "step": 7814 + }, + { + "epoch": 0.84, + "grad_norm": 0.08681317029442595, + "learning_rate": 6.554370408106796e-05, + "loss": 1.3353, + "step": 7815 + }, + { + "epoch": 0.84, + "grad_norm": 0.09018112284327645, + "learning_rate": 6.545755329024295e-05, + "loss": 1.4735, + "step": 7816 + }, + { + "epoch": 0.84, + "grad_norm": 0.08596032825622608, + "learning_rate": 6.537145518918774e-05, + "loss": 1.4837, + "step": 7817 + }, + { + "epoch": 0.84, + "grad_norm": 0.0842586560156261, + "learning_rate": 6.528540978834191e-05, + "loss": 1.3523, + "step": 7818 + }, + { + "epoch": 0.84, + "grad_norm": 0.09000204358620255, + "learning_rate": 6.519941709813881e-05, + "loss": 1.3803, + "step": 7819 + }, + { + "epoch": 0.84, + "grad_norm": 0.09290775564210622, + "learning_rate": 6.511347712900545e-05, + "loss": 1.375, + "step": 7820 + }, + { + "epoch": 0.84, + "grad_norm": 0.09291821777416866, + "learning_rate": 6.502758989136221e-05, + "loss": 1.3519, + "step": 7821 + }, + { + "epoch": 0.84, + "grad_norm": 0.10079947970251664, + "learning_rate": 6.494175539562325e-05, + "loss": 1.4085, + "step": 7822 + }, + { + "epoch": 0.84, + "grad_norm": 0.08932330902539856, + "learning_rate": 6.485597365219648e-05, + "loss": 1.4194, + "step": 7823 + }, + { + "epoch": 0.84, + "grad_norm": 0.08012412657276556, + "learning_rate": 6.477024467148291e-05, + "loss": 1.4567, + "step": 7824 + }, + { + "epoch": 0.84, + "grad_norm": 0.08880866528917783, + "learning_rate": 6.468456846387777e-05, + "loss": 1.3814, + "step": 7825 + }, + { + "epoch": 0.84, + "grad_norm": 0.08893574046009894, + "learning_rate": 6.459894503976943e-05, + "loss": 1.3846, + "step": 7826 + }, + { + "epoch": 0.84, + "grad_norm": 0.08956078096009155, + "learning_rate": 6.451337440953998e-05, + "loss": 1.3439, + "step": 7827 + }, + { + "epoch": 0.84, + "grad_norm": 0.10529417583896636, + "learning_rate": 6.442785658356525e-05, + "loss": 1.4773, + "step": 7828 + }, + { + "epoch": 0.84, + "grad_norm": 0.09718365997826772, + "learning_rate": 6.434239157221466e-05, + "loss": 1.3271, + "step": 7829 + }, + { + "epoch": 0.84, + "grad_norm": 0.08864347101686207, + "learning_rate": 6.42569793858509e-05, + "loss": 1.3864, + "step": 7830 + }, + { + "epoch": 0.84, + "grad_norm": 0.09400814744697399, + "learning_rate": 6.417162003483063e-05, + "loss": 1.5692, + "step": 7831 + }, + { + "epoch": 0.84, + "grad_norm": 0.08719216814926357, + "learning_rate": 6.408631352950406e-05, + "loss": 1.3842, + "step": 7832 + }, + { + "epoch": 0.84, + "grad_norm": 0.07677562030215503, + "learning_rate": 6.40010598802147e-05, + "loss": 1.3368, + "step": 7833 + }, + { + "epoch": 0.84, + "grad_norm": 0.07755566060205164, + "learning_rate": 6.391585909729997e-05, + "loss": 1.4292, + "step": 7834 + }, + { + "epoch": 0.84, + "grad_norm": 0.08532107826874637, + "learning_rate": 6.383071119109079e-05, + "loss": 1.4152, + "step": 7835 + }, + { + "epoch": 0.84, + "grad_norm": 0.0802070942760828, + "learning_rate": 6.374561617191166e-05, + "loss": 1.3902, + "step": 7836 + }, + { + "epoch": 0.84, + "grad_norm": 0.07747124635800962, + "learning_rate": 6.366057405008042e-05, + "loss": 1.1828, + "step": 7837 + }, + { + "epoch": 0.84, + "grad_norm": 0.09197460885363941, + "learning_rate": 6.357558483590887e-05, + "loss": 1.2937, + "step": 7838 + }, + { + "epoch": 0.84, + "grad_norm": 0.08686641796307093, + "learning_rate": 6.349064853970232e-05, + "loss": 1.4106, + "step": 7839 + }, + { + "epoch": 0.84, + "grad_norm": 0.09014370721732225, + "learning_rate": 6.340576517175939e-05, + "loss": 1.4875, + "step": 7840 + }, + { + "epoch": 0.84, + "grad_norm": 0.08256481623816217, + "learning_rate": 6.332093474237265e-05, + "loss": 1.3485, + "step": 7841 + }, + { + "epoch": 0.84, + "grad_norm": 0.08533178161354073, + "learning_rate": 6.323615726182813e-05, + "loss": 1.4926, + "step": 7842 + }, + { + "epoch": 0.84, + "grad_norm": 0.09430168667379107, + "learning_rate": 6.315143274040519e-05, + "loss": 1.4384, + "step": 7843 + }, + { + "epoch": 0.84, + "grad_norm": 0.11100609742165347, + "learning_rate": 6.306676118837707e-05, + "loss": 1.365, + "step": 7844 + }, + { + "epoch": 0.84, + "grad_norm": 0.08113856887278259, + "learning_rate": 6.29821426160106e-05, + "loss": 1.3599, + "step": 7845 + }, + { + "epoch": 0.84, + "grad_norm": 0.0783171748843241, + "learning_rate": 6.289757703356597e-05, + "loss": 1.3204, + "step": 7846 + }, + { + "epoch": 0.84, + "grad_norm": 0.10432054223441194, + "learning_rate": 6.281306445129697e-05, + "loss": 1.4898, + "step": 7847 + }, + { + "epoch": 0.84, + "grad_norm": 0.08632182749912745, + "learning_rate": 6.27286048794512e-05, + "loss": 1.3361, + "step": 7848 + }, + { + "epoch": 0.84, + "grad_norm": 0.09476579758108704, + "learning_rate": 6.264419832826945e-05, + "loss": 1.425, + "step": 7849 + }, + { + "epoch": 0.84, + "grad_norm": 0.08215841019664528, + "learning_rate": 6.255984480798649e-05, + "loss": 1.4258, + "step": 7850 + }, + { + "epoch": 0.84, + "grad_norm": 0.08563993560179116, + "learning_rate": 6.247554432883052e-05, + "loss": 1.3825, + "step": 7851 + }, + { + "epoch": 0.84, + "grad_norm": 0.08548292176570288, + "learning_rate": 6.239129690102307e-05, + "loss": 1.3595, + "step": 7852 + }, + { + "epoch": 0.84, + "grad_norm": 0.07864817887270605, + "learning_rate": 6.230710253477956e-05, + "loss": 1.4026, + "step": 7853 + }, + { + "epoch": 0.84, + "grad_norm": 0.07912513593576152, + "learning_rate": 6.222296124030891e-05, + "loss": 1.4491, + "step": 7854 + }, + { + "epoch": 0.84, + "grad_norm": 0.0775400683256103, + "learning_rate": 6.213887302781335e-05, + "loss": 1.3045, + "step": 7855 + }, + { + "epoch": 0.84, + "grad_norm": 0.08471158998301948, + "learning_rate": 6.2054837907489e-05, + "loss": 1.3708, + "step": 7856 + }, + { + "epoch": 0.84, + "grad_norm": 0.07866617830977912, + "learning_rate": 6.197085588952556e-05, + "loss": 1.379, + "step": 7857 + }, + { + "epoch": 0.84, + "grad_norm": 0.08669780094760313, + "learning_rate": 6.188692698410575e-05, + "loss": 1.2851, + "step": 7858 + }, + { + "epoch": 0.84, + "grad_norm": 0.09045076299727618, + "learning_rate": 6.180305120140644e-05, + "loss": 1.3331, + "step": 7859 + }, + { + "epoch": 0.84, + "grad_norm": 0.0852228463127037, + "learning_rate": 6.1719228551598e-05, + "loss": 1.4925, + "step": 7860 + }, + { + "epoch": 0.85, + "grad_norm": 0.08271086957993047, + "learning_rate": 6.163545904484397e-05, + "loss": 1.3317, + "step": 7861 + }, + { + "epoch": 0.85, + "grad_norm": 0.07976940004119311, + "learning_rate": 6.155174269130182e-05, + "loss": 1.37, + "step": 7862 + }, + { + "epoch": 0.85, + "grad_norm": 0.07844337767876314, + "learning_rate": 6.146807950112248e-05, + "loss": 1.5441, + "step": 7863 + }, + { + "epoch": 0.85, + "grad_norm": 0.08755021665953842, + "learning_rate": 6.13844694844503e-05, + "loss": 1.3235, + "step": 7864 + }, + { + "epoch": 0.85, + "grad_norm": 0.11060847383411719, + "learning_rate": 6.130091265142329e-05, + "loss": 1.424, + "step": 7865 + }, + { + "epoch": 0.85, + "grad_norm": 0.0794293051223916, + "learning_rate": 6.12174090121732e-05, + "loss": 1.4387, + "step": 7866 + }, + { + "epoch": 0.85, + "grad_norm": 0.08712838702078128, + "learning_rate": 6.113395857682485e-05, + "loss": 1.3756, + "step": 7867 + }, + { + "epoch": 0.85, + "grad_norm": 0.09304835093342179, + "learning_rate": 6.105056135549714e-05, + "loss": 1.4444, + "step": 7868 + }, + { + "epoch": 0.85, + "grad_norm": 0.08206255704216997, + "learning_rate": 6.096721735830202e-05, + "loss": 1.4388, + "step": 7869 + }, + { + "epoch": 0.85, + "grad_norm": 0.0805377403471389, + "learning_rate": 6.0883926595345475e-05, + "loss": 1.3054, + "step": 7870 + }, + { + "epoch": 0.85, + "grad_norm": 0.09047074750357005, + "learning_rate": 6.080068907672659e-05, + "loss": 1.3322, + "step": 7871 + }, + { + "epoch": 0.85, + "grad_norm": 0.10712564001781039, + "learning_rate": 6.071750481253835e-05, + "loss": 1.4827, + "step": 7872 + }, + { + "epoch": 0.85, + "grad_norm": 0.08000036424175803, + "learning_rate": 6.0634373812867185e-05, + "loss": 1.4809, + "step": 7873 + }, + { + "epoch": 0.85, + "grad_norm": 0.08355315667819273, + "learning_rate": 6.055129608779275e-05, + "loss": 1.3978, + "step": 7874 + }, + { + "epoch": 0.85, + "grad_norm": 0.0897445519805741, + "learning_rate": 6.046827164738872e-05, + "loss": 1.4598, + "step": 7875 + }, + { + "epoch": 0.85, + "grad_norm": 0.08593164499884852, + "learning_rate": 6.03853005017222e-05, + "loss": 1.2729, + "step": 7876 + }, + { + "epoch": 0.85, + "grad_norm": 0.09280406295835326, + "learning_rate": 6.0302382660853416e-05, + "loss": 1.3568, + "step": 7877 + }, + { + "epoch": 0.85, + "grad_norm": 0.09287445817382776, + "learning_rate": 6.0219518134836726e-05, + "loss": 1.3667, + "step": 7878 + }, + { + "epoch": 0.85, + "grad_norm": 0.08942747992632392, + "learning_rate": 6.0136706933719586e-05, + "loss": 1.3309, + "step": 7879 + }, + { + "epoch": 0.85, + "grad_norm": 0.09044881308433124, + "learning_rate": 6.0053949067543077e-05, + "loss": 1.333, + "step": 7880 + }, + { + "epoch": 0.85, + "grad_norm": 0.08218485919758337, + "learning_rate": 5.997124454634201e-05, + "loss": 1.5086, + "step": 7881 + }, + { + "epoch": 0.85, + "grad_norm": 0.08150958759929276, + "learning_rate": 5.9888593380144574e-05, + "loss": 1.4831, + "step": 7882 + }, + { + "epoch": 0.85, + "grad_norm": 0.09238224876183417, + "learning_rate": 5.9805995578972426e-05, + "loss": 1.3159, + "step": 7883 + }, + { + "epoch": 0.85, + "grad_norm": 0.08132944733289485, + "learning_rate": 5.972345115284095e-05, + "loss": 1.4739, + "step": 7884 + }, + { + "epoch": 0.85, + "grad_norm": 0.09200855967945543, + "learning_rate": 5.96409601117589e-05, + "loss": 1.3772, + "step": 7885 + }, + { + "epoch": 0.85, + "grad_norm": 0.10121485467365761, + "learning_rate": 5.955852246572851e-05, + "loss": 1.317, + "step": 7886 + }, + { + "epoch": 0.85, + "grad_norm": 0.08068215984602292, + "learning_rate": 5.9476138224745734e-05, + "loss": 1.3589, + "step": 7887 + }, + { + "epoch": 0.85, + "grad_norm": 0.08866123553653318, + "learning_rate": 5.939380739880002e-05, + "loss": 1.4508, + "step": 7888 + }, + { + "epoch": 0.85, + "grad_norm": 0.08589646053077514, + "learning_rate": 5.9311529997874095e-05, + "loss": 1.3438, + "step": 7889 + }, + { + "epoch": 0.85, + "grad_norm": 0.09725805400582763, + "learning_rate": 5.922930603194437e-05, + "loss": 1.4084, + "step": 7890 + }, + { + "epoch": 0.85, + "grad_norm": 0.09455544461619558, + "learning_rate": 5.9147135510981e-05, + "loss": 1.4086, + "step": 7891 + }, + { + "epoch": 0.85, + "grad_norm": 0.10467580678385532, + "learning_rate": 5.906501844494716e-05, + "loss": 1.3407, + "step": 7892 + }, + { + "epoch": 0.85, + "grad_norm": 0.07795243427227122, + "learning_rate": 5.898295484379995e-05, + "loss": 1.3612, + "step": 7893 + }, + { + "epoch": 0.85, + "grad_norm": 0.1038394375264948, + "learning_rate": 5.890094471749002e-05, + "loss": 1.4229, + "step": 7894 + }, + { + "epoch": 0.85, + "grad_norm": 0.09720640404759297, + "learning_rate": 5.881898807596114e-05, + "loss": 1.3402, + "step": 7895 + }, + { + "epoch": 0.85, + "grad_norm": 0.07723043481387958, + "learning_rate": 5.873708492915092e-05, + "loss": 1.4209, + "step": 7896 + }, + { + "epoch": 0.85, + "grad_norm": 0.09067326993878297, + "learning_rate": 5.8655235286990513e-05, + "loss": 1.4408, + "step": 7897 + }, + { + "epoch": 0.85, + "grad_norm": 0.09774190841467567, + "learning_rate": 5.857343915940433e-05, + "loss": 1.452, + "step": 7898 + }, + { + "epoch": 0.85, + "grad_norm": 0.09164211233774362, + "learning_rate": 5.8491696556310535e-05, + "loss": 1.4819, + "step": 7899 + }, + { + "epoch": 0.85, + "grad_norm": 0.08938979443232763, + "learning_rate": 5.841000748762054e-05, + "loss": 1.2762, + "step": 7900 + }, + { + "epoch": 0.85, + "grad_norm": 0.08276635984258755, + "learning_rate": 5.832837196323964e-05, + "loss": 1.4339, + "step": 7901 + }, + { + "epoch": 0.85, + "grad_norm": 0.10140530865368988, + "learning_rate": 5.824678999306621e-05, + "loss": 1.4236, + "step": 7902 + }, + { + "epoch": 0.85, + "grad_norm": 0.08952137652557425, + "learning_rate": 5.816526158699243e-05, + "loss": 1.3607, + "step": 7903 + }, + { + "epoch": 0.85, + "grad_norm": 0.08598514612393836, + "learning_rate": 5.8083786754904e-05, + "loss": 1.3682, + "step": 7904 + }, + { + "epoch": 0.85, + "grad_norm": 0.08180567616052078, + "learning_rate": 5.800236550667992e-05, + "loss": 1.3125, + "step": 7905 + }, + { + "epoch": 0.85, + "grad_norm": 0.08795388382397808, + "learning_rate": 5.7920997852192735e-05, + "loss": 1.44, + "step": 7906 + }, + { + "epoch": 0.85, + "grad_norm": 0.07932353663760587, + "learning_rate": 5.7839683801308775e-05, + "loss": 1.3103, + "step": 7907 + }, + { + "epoch": 0.85, + "grad_norm": 0.08941791390098239, + "learning_rate": 5.775842336388742e-05, + "loss": 1.2789, + "step": 7908 + }, + { + "epoch": 0.85, + "grad_norm": 0.0943055727960341, + "learning_rate": 5.767721654978186e-05, + "loss": 1.4448, + "step": 7909 + }, + { + "epoch": 0.85, + "grad_norm": 0.08605694259527719, + "learning_rate": 5.759606336883893e-05, + "loss": 1.3103, + "step": 7910 + }, + { + "epoch": 0.85, + "grad_norm": 0.09528400159248888, + "learning_rate": 5.7514963830898324e-05, + "loss": 1.4157, + "step": 7911 + }, + { + "epoch": 0.85, + "grad_norm": 0.0838022572347359, + "learning_rate": 5.7433917945793776e-05, + "loss": 1.3387, + "step": 7912 + }, + { + "epoch": 0.85, + "grad_norm": 0.11424160907650843, + "learning_rate": 5.73529257233526e-05, + "loss": 1.3327, + "step": 7913 + }, + { + "epoch": 0.85, + "grad_norm": 0.09560274070572669, + "learning_rate": 5.72719871733951e-05, + "loss": 1.4464, + "step": 7914 + }, + { + "epoch": 0.85, + "grad_norm": 0.08509483829820473, + "learning_rate": 5.719110230573543e-05, + "loss": 1.3786, + "step": 7915 + }, + { + "epoch": 0.85, + "grad_norm": 0.10596939753078925, + "learning_rate": 5.7110271130181356e-05, + "loss": 1.523, + "step": 7916 + }, + { + "epoch": 0.85, + "grad_norm": 0.09238716068882286, + "learning_rate": 5.7029493656533646e-05, + "loss": 1.3509, + "step": 7917 + }, + { + "epoch": 0.85, + "grad_norm": 0.09918986128963173, + "learning_rate": 5.694876989458697e-05, + "loss": 1.3125, + "step": 7918 + }, + { + "epoch": 0.85, + "grad_norm": 0.09547700248487656, + "learning_rate": 5.6868099854129493e-05, + "loss": 1.4866, + "step": 7919 + }, + { + "epoch": 0.85, + "grad_norm": 0.10238214475114867, + "learning_rate": 5.678748354494251e-05, + "loss": 1.511, + "step": 7920 + }, + { + "epoch": 0.85, + "grad_norm": 0.09238523148440415, + "learning_rate": 5.6706920976801246e-05, + "loss": 1.4352, + "step": 7921 + }, + { + "epoch": 0.85, + "grad_norm": 0.08533477612675754, + "learning_rate": 5.662641215947406e-05, + "loss": 1.4433, + "step": 7922 + }, + { + "epoch": 0.85, + "grad_norm": 0.0923241212621198, + "learning_rate": 5.654595710272287e-05, + "loss": 1.4694, + "step": 7923 + }, + { + "epoch": 0.85, + "grad_norm": 0.098711665661809, + "learning_rate": 5.6465555816303196e-05, + "loss": 1.3371, + "step": 7924 + }, + { + "epoch": 0.85, + "grad_norm": 0.09113188080489218, + "learning_rate": 5.638520830996402e-05, + "loss": 1.5073, + "step": 7925 + }, + { + "epoch": 0.85, + "grad_norm": 0.09294883936559913, + "learning_rate": 5.630491459344766e-05, + "loss": 1.4731, + "step": 7926 + }, + { + "epoch": 0.85, + "grad_norm": 0.0958511718237879, + "learning_rate": 5.622467467649006e-05, + "loss": 1.5133, + "step": 7927 + }, + { + "epoch": 0.85, + "grad_norm": 0.09150286778743173, + "learning_rate": 5.614448856882065e-05, + "loss": 1.3905, + "step": 7928 + }, + { + "epoch": 0.85, + "grad_norm": 0.09388354098673922, + "learning_rate": 5.60643562801621e-05, + "loss": 1.433, + "step": 7929 + }, + { + "epoch": 0.85, + "grad_norm": 0.1002396530061073, + "learning_rate": 5.5984277820230855e-05, + "loss": 1.3662, + "step": 7930 + }, + { + "epoch": 0.85, + "grad_norm": 0.10274461624460365, + "learning_rate": 5.5904253198736824e-05, + "loss": 1.5483, + "step": 7931 + }, + { + "epoch": 0.85, + "grad_norm": 0.08530544037612882, + "learning_rate": 5.582428242538307e-05, + "loss": 1.467, + "step": 7932 + }, + { + "epoch": 0.85, + "grad_norm": 0.09574392115798785, + "learning_rate": 5.574436550986628e-05, + "loss": 1.3484, + "step": 7933 + }, + { + "epoch": 0.85, + "grad_norm": 0.07777008187231625, + "learning_rate": 5.5664502461876706e-05, + "loss": 1.287, + "step": 7934 + }, + { + "epoch": 0.85, + "grad_norm": 0.09424269991976277, + "learning_rate": 5.558469329109822e-05, + "loss": 1.4066, + "step": 7935 + }, + { + "epoch": 0.85, + "grad_norm": 0.08766616531253907, + "learning_rate": 5.550493800720774e-05, + "loss": 1.2889, + "step": 7936 + }, + { + "epoch": 0.85, + "grad_norm": 0.08486592252459829, + "learning_rate": 5.5425236619875866e-05, + "loss": 1.3519, + "step": 7937 + }, + { + "epoch": 0.85, + "grad_norm": 0.08143108081202038, + "learning_rate": 5.5345589138766864e-05, + "loss": 1.4174, + "step": 7938 + }, + { + "epoch": 0.85, + "grad_norm": 0.08634015059967805, + "learning_rate": 5.5265995573538066e-05, + "loss": 1.3629, + "step": 7939 + }, + { + "epoch": 0.85, + "grad_norm": 0.09101367306765057, + "learning_rate": 5.5186455933840476e-05, + "loss": 1.3984, + "step": 7940 + }, + { + "epoch": 0.85, + "grad_norm": 0.08940959291798563, + "learning_rate": 5.5106970229318765e-05, + "loss": 1.3313, + "step": 7941 + }, + { + "epoch": 0.85, + "grad_norm": 0.08414515779509103, + "learning_rate": 5.502753846961067e-05, + "loss": 1.387, + "step": 7942 + }, + { + "epoch": 0.85, + "grad_norm": 0.08772764724912127, + "learning_rate": 5.494816066434749e-05, + "loss": 1.4057, + "step": 7943 + }, + { + "epoch": 0.85, + "grad_norm": 0.08757481261207309, + "learning_rate": 5.4868836823154246e-05, + "loss": 1.2524, + "step": 7944 + }, + { + "epoch": 0.85, + "grad_norm": 0.09430417292165912, + "learning_rate": 5.478956695564902e-05, + "loss": 1.4384, + "step": 7945 + }, + { + "epoch": 0.85, + "grad_norm": 0.0910526631063559, + "learning_rate": 5.471035107144373e-05, + "loss": 1.4247, + "step": 7946 + }, + { + "epoch": 0.85, + "grad_norm": 0.09675309496202968, + "learning_rate": 5.4631189180143596e-05, + "loss": 1.4936, + "step": 7947 + }, + { + "epoch": 0.85, + "grad_norm": 0.07853317251477367, + "learning_rate": 5.4552081291347035e-05, + "loss": 1.4164, + "step": 7948 + }, + { + "epoch": 0.85, + "grad_norm": 0.09942725188704955, + "learning_rate": 5.4473027414646384e-05, + "loss": 1.5146, + "step": 7949 + }, + { + "epoch": 0.85, + "grad_norm": 0.08892933014210838, + "learning_rate": 5.439402755962719e-05, + "loss": 1.4987, + "step": 7950 + }, + { + "epoch": 0.85, + "grad_norm": 0.1075372531467069, + "learning_rate": 5.43150817358683e-05, + "loss": 1.3922, + "step": 7951 + }, + { + "epoch": 0.85, + "grad_norm": 0.09669611531183489, + "learning_rate": 5.423618995294238e-05, + "loss": 1.2791, + "step": 7952 + }, + { + "epoch": 0.85, + "grad_norm": 0.08838658799987743, + "learning_rate": 5.415735222041518e-05, + "loss": 1.2887, + "step": 7953 + }, + { + "epoch": 0.86, + "grad_norm": 0.08859874586580094, + "learning_rate": 5.407856854784598e-05, + "loss": 1.4271, + "step": 7954 + }, + { + "epoch": 0.86, + "grad_norm": 0.09346223291532334, + "learning_rate": 5.3999838944787713e-05, + "loss": 1.3762, + "step": 7955 + }, + { + "epoch": 0.86, + "grad_norm": 0.08402497812656093, + "learning_rate": 5.392116342078662e-05, + "loss": 1.4116, + "step": 7956 + }, + { + "epoch": 0.86, + "grad_norm": 0.08561235386678215, + "learning_rate": 5.3842541985382244e-05, + "loss": 1.3021, + "step": 7957 + }, + { + "epoch": 0.86, + "grad_norm": 0.08016569594538953, + "learning_rate": 5.3763974648107786e-05, + "loss": 1.3236, + "step": 7958 + }, + { + "epoch": 0.86, + "grad_norm": 0.08856641914868289, + "learning_rate": 5.36854614184899e-05, + "loss": 1.556, + "step": 7959 + }, + { + "epoch": 0.86, + "grad_norm": 0.09532590526560956, + "learning_rate": 5.360700230604842e-05, + "loss": 1.408, + "step": 7960 + }, + { + "epoch": 0.86, + "grad_norm": 0.08393650575881223, + "learning_rate": 5.3528597320296855e-05, + "loss": 1.2279, + "step": 7961 + }, + { + "epoch": 0.86, + "grad_norm": 0.08808394205662512, + "learning_rate": 5.3450246470742204e-05, + "loss": 1.3444, + "step": 7962 + }, + { + "epoch": 0.86, + "grad_norm": 0.08187518003995406, + "learning_rate": 5.337194976688464e-05, + "loss": 1.4492, + "step": 7963 + }, + { + "epoch": 0.86, + "grad_norm": 0.09074287291952292, + "learning_rate": 5.3293707218217805e-05, + "loss": 1.3961, + "step": 7964 + }, + { + "epoch": 0.86, + "grad_norm": 0.09922091033979089, + "learning_rate": 5.3215518834229036e-05, + "loss": 1.4014, + "step": 7965 + }, + { + "epoch": 0.86, + "grad_norm": 0.09558377560461183, + "learning_rate": 5.313738462439899e-05, + "loss": 1.4919, + "step": 7966 + }, + { + "epoch": 0.86, + "grad_norm": 0.07975403401656565, + "learning_rate": 5.3059304598201576e-05, + "loss": 1.374, + "step": 7967 + }, + { + "epoch": 0.86, + "grad_norm": 0.09718667621815387, + "learning_rate": 5.298127876510428e-05, + "loss": 1.4082, + "step": 7968 + }, + { + "epoch": 0.86, + "grad_norm": 0.0912549963123792, + "learning_rate": 5.290330713456809e-05, + "loss": 1.4846, + "step": 7969 + }, + { + "epoch": 0.86, + "grad_norm": 0.08168961849298514, + "learning_rate": 5.282538971604728e-05, + "loss": 1.366, + "step": 7970 + }, + { + "epoch": 0.86, + "grad_norm": 0.08900406927437467, + "learning_rate": 5.274752651898956e-05, + "loss": 1.3303, + "step": 7971 + }, + { + "epoch": 0.86, + "grad_norm": 0.09610658260467948, + "learning_rate": 5.266971755283628e-05, + "loss": 1.5749, + "step": 7972 + }, + { + "epoch": 0.86, + "grad_norm": 0.08865058880800436, + "learning_rate": 5.259196282702178e-05, + "loss": 1.3821, + "step": 7973 + }, + { + "epoch": 0.86, + "grad_norm": 0.09517903021205283, + "learning_rate": 5.251426235097439e-05, + "loss": 1.4055, + "step": 7974 + }, + { + "epoch": 0.86, + "grad_norm": 0.08493156034832718, + "learning_rate": 5.243661613411543e-05, + "loss": 1.3444, + "step": 7975 + }, + { + "epoch": 0.86, + "grad_norm": 0.0838612191297965, + "learning_rate": 5.2359024185859585e-05, + "loss": 1.4417, + "step": 7976 + }, + { + "epoch": 0.86, + "grad_norm": 0.09421372340720958, + "learning_rate": 5.2281486515615375e-05, + "loss": 1.4135, + "step": 7977 + }, + { + "epoch": 0.86, + "grad_norm": 0.09153333076851533, + "learning_rate": 5.220400313278451e-05, + "loss": 1.4349, + "step": 7978 + }, + { + "epoch": 0.86, + "grad_norm": 0.09161799711452127, + "learning_rate": 5.212657404676191e-05, + "loss": 1.4369, + "step": 7979 + }, + { + "epoch": 0.86, + "grad_norm": 0.08816016774243185, + "learning_rate": 5.2049199266936355e-05, + "loss": 1.3312, + "step": 7980 + }, + { + "epoch": 0.86, + "grad_norm": 0.07536067522168258, + "learning_rate": 5.197187880268972e-05, + "loss": 1.4213, + "step": 7981 + }, + { + "epoch": 0.86, + "grad_norm": 0.0926666875343112, + "learning_rate": 5.1894612663397345e-05, + "loss": 1.3562, + "step": 7982 + }, + { + "epoch": 0.86, + "grad_norm": 0.09777436980649994, + "learning_rate": 5.1817400858427956e-05, + "loss": 1.4365, + "step": 7983 + }, + { + "epoch": 0.86, + "grad_norm": 0.0811904250377712, + "learning_rate": 5.1740243397144e-05, + "loss": 1.4611, + "step": 7984 + }, + { + "epoch": 0.86, + "grad_norm": 0.08800265668953564, + "learning_rate": 5.16631402889009e-05, + "loss": 1.3533, + "step": 7985 + }, + { + "epoch": 0.86, + "grad_norm": 0.0904440816356616, + "learning_rate": 5.158609154304761e-05, + "loss": 1.4972, + "step": 7986 + }, + { + "epoch": 0.86, + "grad_norm": 0.08685858702675321, + "learning_rate": 5.150909716892671e-05, + "loss": 1.3354, + "step": 7987 + }, + { + "epoch": 0.86, + "grad_norm": 0.09362833005578333, + "learning_rate": 5.143215717587385e-05, + "loss": 1.3281, + "step": 7988 + }, + { + "epoch": 0.86, + "grad_norm": 0.07966025286356111, + "learning_rate": 5.13552715732184e-05, + "loss": 1.5392, + "step": 7989 + }, + { + "epoch": 0.86, + "grad_norm": 0.08725214418209264, + "learning_rate": 5.127844037028312e-05, + "loss": 1.4684, + "step": 7990 + }, + { + "epoch": 0.86, + "grad_norm": 0.09737399318771994, + "learning_rate": 5.120166357638378e-05, + "loss": 1.4776, + "step": 7991 + }, + { + "epoch": 0.86, + "grad_norm": 0.08768025693285349, + "learning_rate": 5.112494120083e-05, + "loss": 1.3234, + "step": 7992 + }, + { + "epoch": 0.86, + "grad_norm": 0.08323505095792275, + "learning_rate": 5.104827325292466e-05, + "loss": 1.5446, + "step": 7993 + }, + { + "epoch": 0.86, + "grad_norm": 0.09839799519704864, + "learning_rate": 5.0971659741963883e-05, + "loss": 1.3734, + "step": 7994 + }, + { + "epoch": 0.86, + "grad_norm": 0.0963408838494438, + "learning_rate": 5.0895100677237515e-05, + "loss": 1.374, + "step": 7995 + }, + { + "epoch": 0.86, + "grad_norm": 0.10273257116177338, + "learning_rate": 5.081859606802841e-05, + "loss": 1.3963, + "step": 7996 + }, + { + "epoch": 0.86, + "grad_norm": 0.09690958321962514, + "learning_rate": 5.074214592361315e-05, + "loss": 1.4474, + "step": 7997 + }, + { + "epoch": 0.86, + "grad_norm": 0.09055382203489995, + "learning_rate": 5.066575025326148e-05, + "loss": 1.2824, + "step": 7998 + }, + { + "epoch": 0.86, + "grad_norm": 0.09214018733037252, + "learning_rate": 5.05894090662366e-05, + "loss": 1.3172, + "step": 7999 + }, + { + "epoch": 0.86, + "grad_norm": 0.08020416377684451, + "learning_rate": 5.05131223717954e-05, + "loss": 1.4465, + "step": 8000 + }, + { + "epoch": 0.86, + "grad_norm": 0.10471508281926799, + "learning_rate": 5.043689017918757e-05, + "loss": 1.3915, + "step": 8001 + }, + { + "epoch": 0.86, + "grad_norm": 0.10116000957392209, + "learning_rate": 5.0360712497656725e-05, + "loss": 1.2644, + "step": 8002 + }, + { + "epoch": 0.86, + "grad_norm": 0.09559909241116268, + "learning_rate": 5.028458933643976e-05, + "loss": 1.3338, + "step": 8003 + }, + { + "epoch": 0.86, + "grad_norm": 0.09002643649763474, + "learning_rate": 5.020852070476656e-05, + "loss": 1.4668, + "step": 8004 + }, + { + "epoch": 0.86, + "grad_norm": 0.09528584400679911, + "learning_rate": 5.013250661186103e-05, + "loss": 1.33, + "step": 8005 + }, + { + "epoch": 0.86, + "grad_norm": 0.11374398792014954, + "learning_rate": 5.005654706694002e-05, + "loss": 1.3475, + "step": 8006 + }, + { + "epoch": 0.86, + "grad_norm": 0.09495489508659871, + "learning_rate": 4.9980642079213764e-05, + "loss": 1.4521, + "step": 8007 + }, + { + "epoch": 0.86, + "grad_norm": 0.09302443756883642, + "learning_rate": 4.9904791657886085e-05, + "loss": 1.4601, + "step": 8008 + }, + { + "epoch": 0.86, + "grad_norm": 0.09287663041146321, + "learning_rate": 4.982899581215428e-05, + "loss": 1.3386, + "step": 8009 + }, + { + "epoch": 0.86, + "grad_norm": 0.08839803430724054, + "learning_rate": 4.975325455120855e-05, + "loss": 1.4648, + "step": 8010 + }, + { + "epoch": 0.86, + "grad_norm": 0.09007925416026948, + "learning_rate": 4.9677567884233e-05, + "loss": 1.5249, + "step": 8011 + }, + { + "epoch": 0.86, + "grad_norm": 0.1024334249851111, + "learning_rate": 4.96019358204049e-05, + "loss": 1.3792, + "step": 8012 + }, + { + "epoch": 0.86, + "grad_norm": 0.09493434008827761, + "learning_rate": 4.952635836889474e-05, + "loss": 1.4692, + "step": 8013 + }, + { + "epoch": 0.86, + "grad_norm": 0.10177317419169603, + "learning_rate": 4.94508355388667e-05, + "loss": 1.3567, + "step": 8014 + }, + { + "epoch": 0.86, + "grad_norm": 0.10359395185727091, + "learning_rate": 4.937536733947817e-05, + "loss": 1.4361, + "step": 8015 + }, + { + "epoch": 0.86, + "grad_norm": 0.1014764345449319, + "learning_rate": 4.9299953779879894e-05, + "loss": 1.4492, + "step": 8016 + }, + { + "epoch": 0.86, + "grad_norm": 0.09141219987822603, + "learning_rate": 4.9224594869215934e-05, + "loss": 1.3856, + "step": 8017 + }, + { + "epoch": 0.86, + "grad_norm": 0.10701698487475056, + "learning_rate": 4.914929061662399e-05, + "loss": 1.3098, + "step": 8018 + }, + { + "epoch": 0.86, + "grad_norm": 0.08421051966495124, + "learning_rate": 4.9074041031234804e-05, + "loss": 1.2876, + "step": 8019 + }, + { + "epoch": 0.86, + "grad_norm": 0.10280735588357584, + "learning_rate": 4.89988461221727e-05, + "loss": 1.2543, + "step": 8020 + }, + { + "epoch": 0.86, + "grad_norm": 0.0871700644770041, + "learning_rate": 4.8923705898555496e-05, + "loss": 1.4419, + "step": 8021 + }, + { + "epoch": 0.86, + "grad_norm": 0.09208405244887743, + "learning_rate": 4.884862036949389e-05, + "loss": 1.4242, + "step": 8022 + }, + { + "epoch": 0.86, + "grad_norm": 0.09519872895834376, + "learning_rate": 4.877358954409245e-05, + "loss": 1.4272, + "step": 8023 + }, + { + "epoch": 0.86, + "grad_norm": 0.08160045804385208, + "learning_rate": 4.8698613431448934e-05, + "loss": 1.442, + "step": 8024 + }, + { + "epoch": 0.86, + "grad_norm": 0.08917162958217983, + "learning_rate": 4.862369204065437e-05, + "loss": 1.4528, + "step": 8025 + }, + { + "epoch": 0.86, + "grad_norm": 0.09584081971903959, + "learning_rate": 4.8548825380793304e-05, + "loss": 1.4295, + "step": 8026 + }, + { + "epoch": 0.86, + "grad_norm": 0.08745885865269687, + "learning_rate": 4.84740134609436e-05, + "loss": 1.4092, + "step": 8027 + }, + { + "epoch": 0.86, + "grad_norm": 0.08685155962456509, + "learning_rate": 4.839925629017639e-05, + "loss": 1.3759, + "step": 8028 + }, + { + "epoch": 0.86, + "grad_norm": 0.10410625722723307, + "learning_rate": 4.8324553877556186e-05, + "loss": 1.4921, + "step": 8029 + }, + { + "epoch": 0.86, + "grad_norm": 0.09053463510410303, + "learning_rate": 4.8249906232140984e-05, + "loss": 1.3488, + "step": 8030 + }, + { + "epoch": 0.86, + "grad_norm": 0.08590497883548834, + "learning_rate": 4.8175313362982195e-05, + "loss": 1.4228, + "step": 8031 + }, + { + "epoch": 0.86, + "grad_norm": 0.10821065446154124, + "learning_rate": 4.810077527912421e-05, + "loss": 1.3144, + "step": 8032 + }, + { + "epoch": 0.86, + "grad_norm": 0.10917917244641281, + "learning_rate": 4.8026291989605174e-05, + "loss": 1.3937, + "step": 8033 + }, + { + "epoch": 0.86, + "grad_norm": 0.09423165223915081, + "learning_rate": 4.7951863503456485e-05, + "loss": 1.3646, + "step": 8034 + }, + { + "epoch": 0.86, + "grad_norm": 0.088316397400894, + "learning_rate": 4.787748982970275e-05, + "loss": 1.3682, + "step": 8035 + }, + { + "epoch": 0.86, + "grad_norm": 0.10998794053022858, + "learning_rate": 4.780317097736203e-05, + "loss": 1.3058, + "step": 8036 + }, + { + "epoch": 0.86, + "grad_norm": 0.09088548506112062, + "learning_rate": 4.7728906955446015e-05, + "loss": 1.4925, + "step": 8037 + }, + { + "epoch": 0.86, + "grad_norm": 0.08992202733297892, + "learning_rate": 4.765469777295906e-05, + "loss": 1.5035, + "step": 8038 + }, + { + "epoch": 0.86, + "grad_norm": 0.08358952526085513, + "learning_rate": 4.7580543438899446e-05, + "loss": 1.3281, + "step": 8039 + }, + { + "epoch": 0.86, + "grad_norm": 0.08568139006226679, + "learning_rate": 4.750644396225873e-05, + "loss": 1.5197, + "step": 8040 + }, + { + "epoch": 0.86, + "grad_norm": 0.08993280314740992, + "learning_rate": 4.743239935202165e-05, + "loss": 1.3513, + "step": 8041 + }, + { + "epoch": 0.86, + "grad_norm": 0.09889419546611622, + "learning_rate": 4.7358409617166307e-05, + "loss": 1.3172, + "step": 8042 + }, + { + "epoch": 0.86, + "grad_norm": 0.09847518037940564, + "learning_rate": 4.728447476666442e-05, + "loss": 1.3748, + "step": 8043 + }, + { + "epoch": 0.86, + "grad_norm": 0.09646547794616295, + "learning_rate": 4.7210594809480645e-05, + "loss": 1.4414, + "step": 8044 + }, + { + "epoch": 0.86, + "grad_norm": 0.08558448303419687, + "learning_rate": 4.71367697545732e-05, + "loss": 1.3872, + "step": 8045 + }, + { + "epoch": 0.86, + "grad_norm": 0.08697043289257039, + "learning_rate": 4.706299961089383e-05, + "loss": 1.338, + "step": 8046 + }, + { + "epoch": 0.87, + "grad_norm": 0.08883717181982263, + "learning_rate": 4.698928438738714e-05, + "loss": 1.4657, + "step": 8047 + }, + { + "epoch": 0.87, + "grad_norm": 0.09612420925192143, + "learning_rate": 4.691562409299161e-05, + "loss": 1.3923, + "step": 8048 + }, + { + "epoch": 0.87, + "grad_norm": 0.0866917810947192, + "learning_rate": 4.684201873663868e-05, + "loss": 1.5458, + "step": 8049 + }, + { + "epoch": 0.87, + "grad_norm": 0.10058692936269363, + "learning_rate": 4.676846832725312e-05, + "loss": 1.294, + "step": 8050 + }, + { + "epoch": 0.87, + "grad_norm": 0.09684992742639126, + "learning_rate": 4.6694972873753296e-05, + "loss": 1.4402, + "step": 8051 + }, + { + "epoch": 0.87, + "grad_norm": 0.11670958899864131, + "learning_rate": 4.66215323850509e-05, + "loss": 1.3937, + "step": 8052 + }, + { + "epoch": 0.87, + "grad_norm": 0.08558904787337474, + "learning_rate": 4.6548146870050656e-05, + "loss": 1.4114, + "step": 8053 + }, + { + "epoch": 0.87, + "grad_norm": 0.08350040748116296, + "learning_rate": 4.647481633765088e-05, + "loss": 1.2515, + "step": 8054 + }, + { + "epoch": 0.87, + "grad_norm": 0.08270056183524263, + "learning_rate": 4.640154079674325e-05, + "loss": 1.3878, + "step": 8055 + }, + { + "epoch": 0.87, + "grad_norm": 0.12140269918508494, + "learning_rate": 4.632832025621253e-05, + "loss": 1.369, + "step": 8056 + }, + { + "epoch": 0.87, + "grad_norm": 0.0967211345492668, + "learning_rate": 4.625515472493697e-05, + "loss": 1.4863, + "step": 8057 + }, + { + "epoch": 0.87, + "grad_norm": 0.09594188315943698, + "learning_rate": 4.6182044211788343e-05, + "loss": 1.3982, + "step": 8058 + }, + { + "epoch": 0.87, + "grad_norm": 0.10442851702577187, + "learning_rate": 4.610898872563135e-05, + "loss": 1.4728, + "step": 8059 + }, + { + "epoch": 0.87, + "grad_norm": 0.09089308048752692, + "learning_rate": 4.6035988275324236e-05, + "loss": 1.4235, + "step": 8060 + }, + { + "epoch": 0.87, + "grad_norm": 0.10029383253191723, + "learning_rate": 4.596304286971853e-05, + "loss": 1.4527, + "step": 8061 + }, + { + "epoch": 0.87, + "grad_norm": 0.09245414374315496, + "learning_rate": 4.5890152517659326e-05, + "loss": 1.4476, + "step": 8062 + }, + { + "epoch": 0.87, + "grad_norm": 0.08749737182144246, + "learning_rate": 4.58173172279846e-05, + "loss": 1.4544, + "step": 8063 + }, + { + "epoch": 0.87, + "grad_norm": 0.08128681609621158, + "learning_rate": 4.574453700952591e-05, + "loss": 1.315, + "step": 8064 + }, + { + "epoch": 0.87, + "grad_norm": 0.0977477408035849, + "learning_rate": 4.56718118711083e-05, + "loss": 1.3868, + "step": 8065 + }, + { + "epoch": 0.87, + "grad_norm": 0.08346194411051658, + "learning_rate": 4.559914182154967e-05, + "loss": 1.3092, + "step": 8066 + }, + { + "epoch": 0.87, + "grad_norm": 0.08313207581438073, + "learning_rate": 4.5526526869661686e-05, + "loss": 1.3348, + "step": 8067 + }, + { + "epoch": 0.87, + "grad_norm": 0.09434227073134552, + "learning_rate": 4.545396702424926e-05, + "loss": 1.2725, + "step": 8068 + }, + { + "epoch": 0.87, + "grad_norm": 0.09172296973103769, + "learning_rate": 4.538146229411033e-05, + "loss": 1.5323, + "step": 8069 + }, + { + "epoch": 0.87, + "grad_norm": 0.09822663625867462, + "learning_rate": 4.5309012688036334e-05, + "loss": 1.4093, + "step": 8070 + }, + { + "epoch": 0.87, + "grad_norm": 0.0931555383209421, + "learning_rate": 4.5236618214812234e-05, + "loss": 1.307, + "step": 8071 + }, + { + "epoch": 0.87, + "grad_norm": 0.09686713999156278, + "learning_rate": 4.5164278883215834e-05, + "loss": 1.4408, + "step": 8072 + }, + { + "epoch": 0.87, + "grad_norm": 0.09891740183537313, + "learning_rate": 4.509199470201869e-05, + "loss": 1.3064, + "step": 8073 + }, + { + "epoch": 0.87, + "grad_norm": 0.08416754685939414, + "learning_rate": 4.501976567998561e-05, + "loss": 1.2797, + "step": 8074 + }, + { + "epoch": 0.87, + "grad_norm": 0.09197596824111855, + "learning_rate": 4.494759182587438e-05, + "loss": 1.4559, + "step": 8075 + }, + { + "epoch": 0.87, + "grad_norm": 0.09727193551485228, + "learning_rate": 4.487547314843643e-05, + "loss": 1.466, + "step": 8076 + }, + { + "epoch": 0.87, + "grad_norm": 0.09239804210902398, + "learning_rate": 4.480340965641655e-05, + "loss": 1.4517, + "step": 8077 + }, + { + "epoch": 0.87, + "grad_norm": 0.09649305439725554, + "learning_rate": 4.473140135855247e-05, + "loss": 1.4156, + "step": 8078 + }, + { + "epoch": 0.87, + "grad_norm": 0.09236065359889135, + "learning_rate": 4.4659448263575544e-05, + "loss": 1.4829, + "step": 8079 + }, + { + "epoch": 0.87, + "grad_norm": 0.0994382561845817, + "learning_rate": 4.4587550380210284e-05, + "loss": 1.3684, + "step": 8080 + }, + { + "epoch": 0.87, + "grad_norm": 0.09209726648563313, + "learning_rate": 4.451570771717467e-05, + "loss": 1.3426, + "step": 8081 + }, + { + "epoch": 0.87, + "grad_norm": 0.09843478293680565, + "learning_rate": 4.444392028317967e-05, + "loss": 1.3716, + "step": 8082 + }, + { + "epoch": 0.87, + "grad_norm": 0.10283688142537882, + "learning_rate": 4.437218808693e-05, + "loss": 1.3521, + "step": 8083 + }, + { + "epoch": 0.87, + "grad_norm": 0.09645222990375216, + "learning_rate": 4.4300511137123186e-05, + "loss": 1.4416, + "step": 8084 + }, + { + "epoch": 0.87, + "grad_norm": 0.08737555151807626, + "learning_rate": 4.422888944245046e-05, + "loss": 1.3301, + "step": 8085 + }, + { + "epoch": 0.87, + "grad_norm": 0.09666379041694352, + "learning_rate": 4.415732301159625e-05, + "loss": 1.5381, + "step": 8086 + }, + { + "epoch": 0.87, + "grad_norm": 0.09304119796433967, + "learning_rate": 4.408581185323807e-05, + "loss": 1.4706, + "step": 8087 + }, + { + "epoch": 0.87, + "grad_norm": 0.11082626671337686, + "learning_rate": 4.401435597604697e-05, + "loss": 1.3994, + "step": 8088 + }, + { + "epoch": 0.87, + "grad_norm": 0.08799149367663399, + "learning_rate": 4.394295538868731e-05, + "loss": 1.5273, + "step": 8089 + }, + { + "epoch": 0.87, + "grad_norm": 0.09010079273368712, + "learning_rate": 4.387161009981661e-05, + "loss": 1.3633, + "step": 8090 + }, + { + "epoch": 0.87, + "grad_norm": 0.08354802566216252, + "learning_rate": 4.3800320118085605e-05, + "loss": 1.4994, + "step": 8091 + }, + { + "epoch": 0.87, + "grad_norm": 0.08402331275744471, + "learning_rate": 4.3729085452138496e-05, + "loss": 1.376, + "step": 8092 + }, + { + "epoch": 0.87, + "grad_norm": 0.10391735563408136, + "learning_rate": 4.365790611061293e-05, + "loss": 1.4819, + "step": 8093 + }, + { + "epoch": 0.87, + "grad_norm": 0.09007821896954256, + "learning_rate": 4.3586782102139344e-05, + "loss": 1.3756, + "step": 8094 + }, + { + "epoch": 0.87, + "grad_norm": 0.08903422612703692, + "learning_rate": 4.3515713435342e-05, + "loss": 1.4106, + "step": 8095 + }, + { + "epoch": 0.87, + "grad_norm": 0.09313008870552218, + "learning_rate": 4.344470011883817e-05, + "loss": 1.402, + "step": 8096 + }, + { + "epoch": 0.87, + "grad_norm": 0.08543654211563571, + "learning_rate": 4.33737421612384e-05, + "loss": 1.2531, + "step": 8097 + }, + { + "epoch": 0.87, + "grad_norm": 0.09067470275876136, + "learning_rate": 4.330283957114656e-05, + "loss": 1.3704, + "step": 8098 + }, + { + "epoch": 0.87, + "grad_norm": 0.08563274293716956, + "learning_rate": 4.323199235716002e-05, + "loss": 1.3759, + "step": 8099 + }, + { + "epoch": 0.87, + "grad_norm": 0.08905782530462407, + "learning_rate": 4.316120052786904e-05, + "loss": 1.3192, + "step": 8100 + }, + { + "epoch": 0.87, + "grad_norm": 0.09857674223026314, + "learning_rate": 4.309046409185757e-05, + "loss": 1.3845, + "step": 8101 + }, + { + "epoch": 0.87, + "grad_norm": 0.08694920472324444, + "learning_rate": 4.301978305770249e-05, + "loss": 1.4144, + "step": 8102 + }, + { + "epoch": 0.87, + "grad_norm": 0.08527994623935795, + "learning_rate": 4.294915743397409e-05, + "loss": 1.4158, + "step": 8103 + }, + { + "epoch": 0.87, + "grad_norm": 0.09263598147216909, + "learning_rate": 4.287858722923604e-05, + "loss": 1.5114, + "step": 8104 + }, + { + "epoch": 0.87, + "grad_norm": 0.09202117731423061, + "learning_rate": 4.2808072452045374e-05, + "loss": 1.429, + "step": 8105 + }, + { + "epoch": 0.87, + "grad_norm": 0.08638915131619937, + "learning_rate": 4.273761311095192e-05, + "loss": 1.4321, + "step": 8106 + }, + { + "epoch": 0.87, + "grad_norm": 0.09522134929831542, + "learning_rate": 4.2667209214499346e-05, + "loss": 1.3787, + "step": 8107 + }, + { + "epoch": 0.87, + "grad_norm": 0.10469829873713113, + "learning_rate": 4.259686077122443e-05, + "loss": 1.3612, + "step": 8108 + }, + { + "epoch": 0.87, + "grad_norm": 0.0855733834471551, + "learning_rate": 4.2526567789656965e-05, + "loss": 1.4306, + "step": 8109 + }, + { + "epoch": 0.87, + "grad_norm": 0.09614067627908375, + "learning_rate": 4.2456330278320245e-05, + "loss": 1.4499, + "step": 8110 + }, + { + "epoch": 0.87, + "grad_norm": 0.08390308759988215, + "learning_rate": 4.2386148245731e-05, + "loss": 1.3943, + "step": 8111 + }, + { + "epoch": 0.87, + "grad_norm": 0.09902179964016455, + "learning_rate": 4.2316021700398944e-05, + "loss": 1.4578, + "step": 8112 + }, + { + "epoch": 0.87, + "grad_norm": 0.09883714745917478, + "learning_rate": 4.224595065082703e-05, + "loss": 1.4541, + "step": 8113 + }, + { + "epoch": 0.87, + "grad_norm": 0.08727422039676933, + "learning_rate": 4.217593510551176e-05, + "loss": 1.4615, + "step": 8114 + }, + { + "epoch": 0.87, + "grad_norm": 0.09334590367762645, + "learning_rate": 4.210597507294267e-05, + "loss": 1.4209, + "step": 8115 + }, + { + "epoch": 0.87, + "grad_norm": 0.09456309243317293, + "learning_rate": 4.203607056160269e-05, + "loss": 1.3918, + "step": 8116 + }, + { + "epoch": 0.87, + "grad_norm": 0.09896588270561626, + "learning_rate": 4.19662215799681e-05, + "loss": 1.4424, + "step": 8117 + }, + { + "epoch": 0.87, + "grad_norm": 0.08847654465807804, + "learning_rate": 4.1896428136508125e-05, + "loss": 1.3337, + "step": 8118 + }, + { + "epoch": 0.87, + "grad_norm": 0.11081100920646882, + "learning_rate": 4.182669023968561e-05, + "loss": 1.4485, + "step": 8119 + }, + { + "epoch": 0.87, + "grad_norm": 0.09416340783753362, + "learning_rate": 4.17570078979565e-05, + "loss": 1.414, + "step": 8120 + }, + { + "epoch": 0.87, + "grad_norm": 0.09604121115206532, + "learning_rate": 4.168738111976989e-05, + "loss": 1.4759, + "step": 8121 + }, + { + "epoch": 0.87, + "grad_norm": 0.08494053518936755, + "learning_rate": 4.161780991356845e-05, + "loss": 1.2911, + "step": 8122 + }, + { + "epoch": 0.87, + "grad_norm": 0.09232432805630465, + "learning_rate": 4.154829428778778e-05, + "loss": 1.4698, + "step": 8123 + }, + { + "epoch": 0.87, + "grad_norm": 0.08560962063353268, + "learning_rate": 4.147883425085702e-05, + "loss": 1.3934, + "step": 8124 + }, + { + "epoch": 0.87, + "grad_norm": 0.08548953603826323, + "learning_rate": 4.140942981119833e-05, + "loss": 1.3536, + "step": 8125 + }, + { + "epoch": 0.87, + "grad_norm": 0.09186465089721559, + "learning_rate": 4.1340080977227244e-05, + "loss": 1.3503, + "step": 8126 + }, + { + "epoch": 0.87, + "grad_norm": 0.09138641763789578, + "learning_rate": 4.127078775735266e-05, + "loss": 1.4755, + "step": 8127 + }, + { + "epoch": 0.87, + "grad_norm": 0.09084712166052157, + "learning_rate": 4.1201550159976455e-05, + "loss": 1.3593, + "step": 8128 + }, + { + "epoch": 0.87, + "grad_norm": 0.09212969623548035, + "learning_rate": 4.113236819349403e-05, + "loss": 1.5437, + "step": 8129 + }, + { + "epoch": 0.87, + "grad_norm": 0.09237647458246168, + "learning_rate": 4.106324186629396e-05, + "loss": 1.3442, + "step": 8130 + }, + { + "epoch": 0.87, + "grad_norm": 0.09251388456673824, + "learning_rate": 4.0994171186757966e-05, + "loss": 1.586, + "step": 8131 + }, + { + "epoch": 0.87, + "grad_norm": 0.09511611784151426, + "learning_rate": 4.0925156163261256e-05, + "loss": 1.4247, + "step": 8132 + }, + { + "epoch": 0.87, + "grad_norm": 0.09355509883336248, + "learning_rate": 4.085619680417196e-05, + "loss": 1.4298, + "step": 8133 + }, + { + "epoch": 0.87, + "grad_norm": 0.08637365018956814, + "learning_rate": 4.0787293117851674e-05, + "loss": 1.3321, + "step": 8134 + }, + { + "epoch": 0.87, + "grad_norm": 0.08396638217296636, + "learning_rate": 4.071844511265527e-05, + "loss": 1.5422, + "step": 8135 + }, + { + "epoch": 0.87, + "grad_norm": 0.08649007647069878, + "learning_rate": 4.064965279693083e-05, + "loss": 1.3918, + "step": 8136 + }, + { + "epoch": 0.87, + "grad_norm": 0.09236978127469905, + "learning_rate": 4.058091617901949e-05, + "loss": 1.4447, + "step": 8137 + }, + { + "epoch": 0.87, + "grad_norm": 0.08843730477667042, + "learning_rate": 4.0512235267256e-05, + "loss": 1.3427, + "step": 8138 + }, + { + "epoch": 0.87, + "grad_norm": 0.10048906860747468, + "learning_rate": 4.044361006996811e-05, + "loss": 1.3218, + "step": 8139 + }, + { + "epoch": 0.88, + "grad_norm": 0.09575911752372211, + "learning_rate": 4.0375040595476754e-05, + "loss": 1.3955, + "step": 8140 + }, + { + "epoch": 0.88, + "grad_norm": 0.09332874770342339, + "learning_rate": 4.0306526852096296e-05, + "loss": 1.2804, + "step": 8141 + }, + { + "epoch": 0.88, + "grad_norm": 0.089037521052994, + "learning_rate": 4.023806884813436e-05, + "loss": 1.4618, + "step": 8142 + }, + { + "epoch": 0.88, + "grad_norm": 0.08529572187836744, + "learning_rate": 4.016966659189158e-05, + "loss": 1.394, + "step": 8143 + }, + { + "epoch": 0.88, + "grad_norm": 0.09280673609000385, + "learning_rate": 4.010132009166195e-05, + "loss": 1.281, + "step": 8144 + }, + { + "epoch": 0.88, + "grad_norm": 0.08763093597932838, + "learning_rate": 4.0033029355732886e-05, + "loss": 1.344, + "step": 8145 + }, + { + "epoch": 0.88, + "grad_norm": 0.09000825851517992, + "learning_rate": 3.996479439238465e-05, + "loss": 1.4322, + "step": 8146 + }, + { + "epoch": 0.88, + "grad_norm": 0.10024569194799243, + "learning_rate": 3.989661520989102e-05, + "loss": 1.5138, + "step": 8147 + }, + { + "epoch": 0.88, + "grad_norm": 0.0913436778896483, + "learning_rate": 3.982849181651915e-05, + "loss": 1.3464, + "step": 8148 + }, + { + "epoch": 0.88, + "grad_norm": 0.0862936421392638, + "learning_rate": 3.9760424220529004e-05, + "loss": 1.3939, + "step": 8149 + }, + { + "epoch": 0.88, + "grad_norm": 0.10394172085424071, + "learning_rate": 3.969241243017413e-05, + "loss": 1.5113, + "step": 8150 + }, + { + "epoch": 0.88, + "grad_norm": 0.0917900252830298, + "learning_rate": 3.962445645370122e-05, + "loss": 1.3708, + "step": 8151 + }, + { + "epoch": 0.88, + "grad_norm": 0.08331465806789863, + "learning_rate": 3.955655629935007e-05, + "loss": 1.2775, + "step": 8152 + }, + { + "epoch": 0.88, + "grad_norm": 0.0838333563003349, + "learning_rate": 3.948871197535386e-05, + "loss": 1.4261, + "step": 8153 + }, + { + "epoch": 0.88, + "grad_norm": 0.09130774844385221, + "learning_rate": 3.942092348993903e-05, + "loss": 1.384, + "step": 8154 + }, + { + "epoch": 0.88, + "grad_norm": 0.09102809118679295, + "learning_rate": 3.935319085132505e-05, + "loss": 1.4354, + "step": 8155 + }, + { + "epoch": 0.88, + "grad_norm": 0.09312677606946752, + "learning_rate": 3.928551406772468e-05, + "loss": 1.4494, + "step": 8156 + }, + { + "epoch": 0.88, + "grad_norm": 0.08760962335416297, + "learning_rate": 3.921789314734409e-05, + "loss": 1.3796, + "step": 8157 + }, + { + "epoch": 0.88, + "grad_norm": 0.1140094609269149, + "learning_rate": 3.915032809838259e-05, + "loss": 1.502, + "step": 8158 + }, + { + "epoch": 0.88, + "grad_norm": 0.08082863729081557, + "learning_rate": 3.908281892903254e-05, + "loss": 1.4221, + "step": 8159 + }, + { + "epoch": 0.88, + "grad_norm": 0.08881742661475506, + "learning_rate": 3.901536564747965e-05, + "loss": 1.4697, + "step": 8160 + }, + { + "epoch": 0.88, + "grad_norm": 0.09152808811324147, + "learning_rate": 3.8947968261903054e-05, + "loss": 1.3648, + "step": 8161 + }, + { + "epoch": 0.88, + "grad_norm": 0.09256609786251456, + "learning_rate": 3.888062678047472e-05, + "loss": 1.4266, + "step": 8162 + }, + { + "epoch": 0.88, + "grad_norm": 0.07859820908539289, + "learning_rate": 3.88133412113601e-05, + "loss": 1.3956, + "step": 8163 + }, + { + "epoch": 0.88, + "grad_norm": 0.09774279455050242, + "learning_rate": 3.874611156271801e-05, + "loss": 1.4649, + "step": 8164 + }, + { + "epoch": 0.88, + "grad_norm": 0.09097510907480971, + "learning_rate": 3.867893784269988e-05, + "loss": 1.4925, + "step": 8165 + }, + { + "epoch": 0.88, + "grad_norm": 0.08421560686125693, + "learning_rate": 3.861182005945091e-05, + "loss": 1.4309, + "step": 8166 + }, + { + "epoch": 0.88, + "grad_norm": 0.09794270345444765, + "learning_rate": 3.854475822110953e-05, + "loss": 1.3782, + "step": 8167 + }, + { + "epoch": 0.88, + "grad_norm": 0.10010697158639269, + "learning_rate": 3.8477752335807027e-05, + "loss": 1.3843, + "step": 8168 + }, + { + "epoch": 0.88, + "grad_norm": 0.09122969411040488, + "learning_rate": 3.841080241166811e-05, + "loss": 1.4884, + "step": 8169 + }, + { + "epoch": 0.88, + "grad_norm": 0.0879295421905876, + "learning_rate": 3.8343908456810905e-05, + "loss": 1.4185, + "step": 8170 + }, + { + "epoch": 0.88, + "grad_norm": 0.10266989127410892, + "learning_rate": 3.82770704793462e-05, + "loss": 1.4521, + "step": 8171 + }, + { + "epoch": 0.88, + "grad_norm": 0.09692602731626357, + "learning_rate": 3.8210288487378566e-05, + "loss": 1.5678, + "step": 8172 + }, + { + "epoch": 0.88, + "grad_norm": 0.0856255659833879, + "learning_rate": 3.8143562489005525e-05, + "loss": 1.4182, + "step": 8173 + }, + { + "epoch": 0.88, + "grad_norm": 0.08637972321065973, + "learning_rate": 3.8076892492317713e-05, + "loss": 1.4223, + "step": 8174 + }, + { + "epoch": 0.88, + "grad_norm": 0.085387579620603, + "learning_rate": 3.801027850539929e-05, + "loss": 1.3246, + "step": 8175 + }, + { + "epoch": 0.88, + "grad_norm": 0.07966648182833903, + "learning_rate": 3.794372053632722e-05, + "loss": 1.3599, + "step": 8176 + }, + { + "epoch": 0.88, + "grad_norm": 0.0839450905097096, + "learning_rate": 3.7877218593172057e-05, + "loss": 1.3715, + "step": 8177 + }, + { + "epoch": 0.88, + "grad_norm": 0.08710525678299814, + "learning_rate": 3.781077268399724e-05, + "loss": 1.5138, + "step": 8178 + }, + { + "epoch": 0.88, + "grad_norm": 0.0909527791884506, + "learning_rate": 3.7744382816859765e-05, + "loss": 1.2932, + "step": 8179 + }, + { + "epoch": 0.88, + "grad_norm": 0.08725990644653168, + "learning_rate": 3.7678048999809365e-05, + "loss": 1.4722, + "step": 8180 + }, + { + "epoch": 0.88, + "grad_norm": 0.08838118634523105, + "learning_rate": 3.761177124088943e-05, + "loss": 1.3409, + "step": 8181 + }, + { + "epoch": 0.88, + "grad_norm": 0.0817359643411812, + "learning_rate": 3.7545549548136435e-05, + "loss": 1.3198, + "step": 8182 + }, + { + "epoch": 0.88, + "grad_norm": 0.0910296821176027, + "learning_rate": 3.747938392957972e-05, + "loss": 1.406, + "step": 8183 + }, + { + "epoch": 0.88, + "grad_norm": 0.08337072181587418, + "learning_rate": 3.741327439324232e-05, + "loss": 1.2309, + "step": 8184 + }, + { + "epoch": 0.88, + "grad_norm": 0.08964416503075122, + "learning_rate": 3.734722094714027e-05, + "loss": 1.4342, + "step": 8185 + }, + { + "epoch": 0.88, + "grad_norm": 0.08834670597246252, + "learning_rate": 3.7281223599282655e-05, + "loss": 1.5451, + "step": 8186 + }, + { + "epoch": 0.88, + "grad_norm": 0.09156897718666505, + "learning_rate": 3.72152823576718e-05, + "loss": 1.3623, + "step": 8187 + }, + { + "epoch": 0.88, + "grad_norm": 0.0897607277218215, + "learning_rate": 3.714939723030347e-05, + "loss": 1.4662, + "step": 8188 + }, + { + "epoch": 0.88, + "grad_norm": 0.09034869839292647, + "learning_rate": 3.7083568225166454e-05, + "loss": 1.2751, + "step": 8189 + }, + { + "epoch": 0.88, + "grad_norm": 0.09641389241523105, + "learning_rate": 3.701779535024269e-05, + "loss": 1.3636, + "step": 8190 + }, + { + "epoch": 0.88, + "grad_norm": 0.09110058458879819, + "learning_rate": 3.69520786135073e-05, + "loss": 1.415, + "step": 8191 + }, + { + "epoch": 0.88, + "grad_norm": 0.09185325382635562, + "learning_rate": 3.688641802292891e-05, + "loss": 1.4223, + "step": 8192 + }, + { + "epoch": 0.88, + "grad_norm": 0.08397425268962305, + "learning_rate": 3.682081358646883e-05, + "loss": 1.4142, + "step": 8193 + }, + { + "epoch": 0.88, + "grad_norm": 0.07977066022161716, + "learning_rate": 3.67552653120819e-05, + "loss": 1.414, + "step": 8194 + }, + { + "epoch": 0.88, + "grad_norm": 0.07986806002290193, + "learning_rate": 3.668977320771616e-05, + "loss": 1.3467, + "step": 8195 + }, + { + "epoch": 0.88, + "grad_norm": 0.0937341798408163, + "learning_rate": 3.66243372813127e-05, + "loss": 1.3114, + "step": 8196 + }, + { + "epoch": 0.88, + "grad_norm": 0.09905772226978368, + "learning_rate": 3.655895754080579e-05, + "loss": 1.2811, + "step": 8197 + }, + { + "epoch": 0.88, + "grad_norm": 0.0976774300018668, + "learning_rate": 3.649363399412309e-05, + "loss": 1.395, + "step": 8198 + }, + { + "epoch": 0.88, + "grad_norm": 0.09179719331361488, + "learning_rate": 3.6428366649185084e-05, + "loss": 1.3635, + "step": 8199 + }, + { + "epoch": 0.88, + "grad_norm": 0.0824569558000461, + "learning_rate": 3.636315551390584e-05, + "loss": 1.4963, + "step": 8200 + }, + { + "epoch": 0.88, + "grad_norm": 0.08942944820307387, + "learning_rate": 3.6298000596192485e-05, + "loss": 1.4102, + "step": 8201 + }, + { + "epoch": 0.88, + "grad_norm": 0.0899389335487092, + "learning_rate": 3.623290190394507e-05, + "loss": 1.3796, + "step": 8202 + }, + { + "epoch": 0.88, + "grad_norm": 0.0912806741082675, + "learning_rate": 3.616785944505713e-05, + "loss": 1.3977, + "step": 8203 + }, + { + "epoch": 0.88, + "grad_norm": 0.09622140600547263, + "learning_rate": 3.61028732274154e-05, + "loss": 1.3994, + "step": 8204 + }, + { + "epoch": 0.88, + "grad_norm": 0.10230214814948685, + "learning_rate": 3.603794325889953e-05, + "loss": 1.3927, + "step": 8205 + }, + { + "epoch": 0.88, + "grad_norm": 0.08549966685330587, + "learning_rate": 3.597306954738255e-05, + "loss": 1.4888, + "step": 8206 + }, + { + "epoch": 0.88, + "grad_norm": 0.0797934896092907, + "learning_rate": 3.590825210073073e-05, + "loss": 1.3104, + "step": 8207 + }, + { + "epoch": 0.88, + "grad_norm": 0.08264588594248158, + "learning_rate": 3.584349092680328e-05, + "loss": 1.5409, + "step": 8208 + }, + { + "epoch": 0.88, + "grad_norm": 0.08128287757448842, + "learning_rate": 3.577878603345269e-05, + "loss": 1.3391, + "step": 8209 + }, + { + "epoch": 0.88, + "grad_norm": 0.09632212209637667, + "learning_rate": 3.5714137428524754e-05, + "loss": 1.3682, + "step": 8210 + }, + { + "epoch": 0.88, + "grad_norm": 0.08181637197782007, + "learning_rate": 3.564954511985824e-05, + "loss": 1.478, + "step": 8211 + }, + { + "epoch": 0.88, + "grad_norm": 0.08614079720602566, + "learning_rate": 3.5585009115285226e-05, + "loss": 1.2792, + "step": 8212 + }, + { + "epoch": 0.88, + "grad_norm": 0.09936056430723951, + "learning_rate": 3.552052942263101e-05, + "loss": 1.4256, + "step": 8213 + }, + { + "epoch": 0.88, + "grad_norm": 0.08886464083868731, + "learning_rate": 3.545610604971383e-05, + "loss": 1.361, + "step": 8214 + }, + { + "epoch": 0.88, + "grad_norm": 0.09233698701338725, + "learning_rate": 3.5391739004345335e-05, + "loss": 1.4835, + "step": 8215 + }, + { + "epoch": 0.88, + "grad_norm": 0.08301256671081278, + "learning_rate": 3.5327428294330336e-05, + "loss": 1.4588, + "step": 8216 + }, + { + "epoch": 0.88, + "grad_norm": 0.09239690186282472, + "learning_rate": 3.5263173927466584e-05, + "loss": 1.3954, + "step": 8217 + }, + { + "epoch": 0.88, + "grad_norm": 0.09754474472861756, + "learning_rate": 3.5198975911545136e-05, + "loss": 1.3358, + "step": 8218 + }, + { + "epoch": 0.88, + "grad_norm": 0.08706581069856931, + "learning_rate": 3.513483425435021e-05, + "loss": 1.3275, + "step": 8219 + }, + { + "epoch": 0.88, + "grad_norm": 0.09382740937626291, + "learning_rate": 3.507074896365942e-05, + "loss": 1.3643, + "step": 8220 + }, + { + "epoch": 0.88, + "grad_norm": 0.08549684695802111, + "learning_rate": 3.500672004724303e-05, + "loss": 1.3358, + "step": 8221 + }, + { + "epoch": 0.88, + "grad_norm": 0.11311319233591514, + "learning_rate": 3.494274751286497e-05, + "loss": 1.4496, + "step": 8222 + }, + { + "epoch": 0.88, + "grad_norm": 0.08135780208806964, + "learning_rate": 3.4878831368282126e-05, + "loss": 1.2636, + "step": 8223 + }, + { + "epoch": 0.88, + "grad_norm": 0.09552206092091241, + "learning_rate": 3.4814971621244415e-05, + "loss": 1.4177, + "step": 8224 + }, + { + "epoch": 0.88, + "grad_norm": 0.09716090579217876, + "learning_rate": 3.4751168279495095e-05, + "loss": 1.288, + "step": 8225 + }, + { + "epoch": 0.88, + "grad_norm": 0.08658930830747578, + "learning_rate": 3.468742135077069e-05, + "loss": 1.3388, + "step": 8226 + }, + { + "epoch": 0.88, + "grad_norm": 0.09649678229661654, + "learning_rate": 3.462373084280057e-05, + "loss": 1.3767, + "step": 8227 + }, + { + "epoch": 0.88, + "grad_norm": 0.09611441101312547, + "learning_rate": 3.456009676330751e-05, + "loss": 1.4051, + "step": 8228 + }, + { + "epoch": 0.88, + "grad_norm": 0.09029750550796355, + "learning_rate": 3.449651912000734e-05, + "loss": 1.3124, + "step": 8229 + }, + { + "epoch": 0.88, + "grad_norm": 0.10382301198139691, + "learning_rate": 3.4432997920609e-05, + "loss": 1.4823, + "step": 8230 + }, + { + "epoch": 0.88, + "grad_norm": 0.08153783473681955, + "learning_rate": 3.436953317281472e-05, + "loss": 1.4889, + "step": 8231 + }, + { + "epoch": 0.88, + "grad_norm": 0.09764009699745096, + "learning_rate": 3.430612488431989e-05, + "loss": 1.3913, + "step": 8232 + }, + { + "epoch": 0.89, + "grad_norm": 0.08631013956906346, + "learning_rate": 3.4242773062812815e-05, + "loss": 1.4285, + "step": 8233 + }, + { + "epoch": 0.89, + "grad_norm": 0.08769483348443738, + "learning_rate": 3.4179477715975237e-05, + "loss": 1.4148, + "step": 8234 + }, + { + "epoch": 0.89, + "grad_norm": 0.09050904791758085, + "learning_rate": 3.411623885148202e-05, + "loss": 1.2897, + "step": 8235 + }, + { + "epoch": 0.89, + "grad_norm": 0.08907965528334316, + "learning_rate": 3.405305647700085e-05, + "loss": 1.3697, + "step": 8236 + }, + { + "epoch": 0.89, + "grad_norm": 0.08801888670287704, + "learning_rate": 3.398993060019295e-05, + "loss": 1.3306, + "step": 8237 + }, + { + "epoch": 0.89, + "grad_norm": 0.08602038634493521, + "learning_rate": 3.392686122871263e-05, + "loss": 1.4508, + "step": 8238 + }, + { + "epoch": 0.89, + "grad_norm": 0.08606685174750452, + "learning_rate": 3.3863848370207226e-05, + "loss": 1.4797, + "step": 8239 + }, + { + "epoch": 0.89, + "grad_norm": 0.10244754385585404, + "learning_rate": 3.380089203231712e-05, + "loss": 1.4539, + "step": 8240 + }, + { + "epoch": 0.89, + "grad_norm": 0.09051226085948132, + "learning_rate": 3.373799222267615e-05, + "loss": 1.4318, + "step": 8241 + }, + { + "epoch": 0.89, + "grad_norm": 0.10024875889437078, + "learning_rate": 3.3675148948911e-05, + "loss": 1.3614, + "step": 8242 + }, + { + "epoch": 0.89, + "grad_norm": 0.09779553374981109, + "learning_rate": 3.361236221864172e-05, + "loss": 1.272, + "step": 8243 + }, + { + "epoch": 0.89, + "grad_norm": 0.08717187939794446, + "learning_rate": 3.354963203948147e-05, + "loss": 1.423, + "step": 8244 + }, + { + "epoch": 0.89, + "grad_norm": 0.09053686476661967, + "learning_rate": 3.348695841903637e-05, + "loss": 1.3593, + "step": 8245 + }, + { + "epoch": 0.89, + "grad_norm": 0.08330610617134679, + "learning_rate": 3.342434136490585e-05, + "loss": 1.4082, + "step": 8246 + }, + { + "epoch": 0.89, + "grad_norm": 0.08656762493107648, + "learning_rate": 3.3361780884682615e-05, + "loss": 1.3553, + "step": 8247 + }, + { + "epoch": 0.89, + "grad_norm": 0.10053769825211262, + "learning_rate": 3.329927698595203e-05, + "loss": 1.3354, + "step": 8248 + }, + { + "epoch": 0.89, + "grad_norm": 0.10128003625082153, + "learning_rate": 3.3236829676293215e-05, + "loss": 1.3203, + "step": 8249 + }, + { + "epoch": 0.89, + "grad_norm": 0.08681193791874181, + "learning_rate": 3.3174438963277875e-05, + "loss": 1.3869, + "step": 8250 + }, + { + "epoch": 0.89, + "grad_norm": 0.09209269431223971, + "learning_rate": 3.311210485447125e-05, + "loss": 1.3276, + "step": 8251 + }, + { + "epoch": 0.89, + "grad_norm": 0.09032137653302173, + "learning_rate": 3.30498273574315e-05, + "loss": 1.3161, + "step": 8252 + }, + { + "epoch": 0.89, + "grad_norm": 0.08830919203330095, + "learning_rate": 3.298760647970994e-05, + "loss": 1.4686, + "step": 8253 + }, + { + "epoch": 0.89, + "grad_norm": 0.09601488594029912, + "learning_rate": 3.292544222885124e-05, + "loss": 1.4203, + "step": 8254 + }, + { + "epoch": 0.89, + "grad_norm": 0.0936075049034287, + "learning_rate": 3.286333461239288e-05, + "loss": 1.453, + "step": 8255 + }, + { + "epoch": 0.89, + "grad_norm": 0.08604891420228421, + "learning_rate": 3.280128363786561e-05, + "loss": 1.2659, + "step": 8256 + }, + { + "epoch": 0.89, + "grad_norm": 0.08944258325837522, + "learning_rate": 3.273928931279346e-05, + "loss": 1.4166, + "step": 8257 + }, + { + "epoch": 0.89, + "grad_norm": 0.09903855128508778, + "learning_rate": 3.267735164469332e-05, + "loss": 1.2721, + "step": 8258 + }, + { + "epoch": 0.89, + "grad_norm": 0.09662137777758281, + "learning_rate": 3.261547064107551e-05, + "loss": 1.2992, + "step": 8259 + }, + { + "epoch": 0.89, + "grad_norm": 0.08389210761394819, + "learning_rate": 3.255364630944313e-05, + "loss": 1.3329, + "step": 8260 + }, + { + "epoch": 0.89, + "grad_norm": 0.08569831100242886, + "learning_rate": 3.249187865729264e-05, + "loss": 1.3118, + "step": 8261 + }, + { + "epoch": 0.89, + "grad_norm": 0.0990971080529239, + "learning_rate": 3.24301676921136e-05, + "loss": 1.338, + "step": 8262 + }, + { + "epoch": 0.89, + "grad_norm": 0.10539828259931236, + "learning_rate": 3.236851342138874e-05, + "loss": 1.3873, + "step": 8263 + }, + { + "epoch": 0.89, + "grad_norm": 0.09089848271652821, + "learning_rate": 3.2306915852593713e-05, + "loss": 1.3069, + "step": 8264 + }, + { + "epoch": 0.89, + "grad_norm": 0.08731495361645467, + "learning_rate": 3.224537499319757e-05, + "loss": 1.451, + "step": 8265 + }, + { + "epoch": 0.89, + "grad_norm": 0.0963751591762757, + "learning_rate": 3.218389085066237e-05, + "loss": 1.4377, + "step": 8266 + }, + { + "epoch": 0.89, + "grad_norm": 0.08766836169616739, + "learning_rate": 3.2122463432443125e-05, + "loss": 1.3519, + "step": 8267 + }, + { + "epoch": 0.89, + "grad_norm": 0.0944698914392843, + "learning_rate": 3.206109274598817e-05, + "loss": 1.3163, + "step": 8268 + }, + { + "epoch": 0.89, + "grad_norm": 0.08410850904393825, + "learning_rate": 3.199977879873906e-05, + "loss": 1.3856, + "step": 8269 + }, + { + "epoch": 0.89, + "grad_norm": 0.08900858429825506, + "learning_rate": 3.193852159813021e-05, + "loss": 1.4729, + "step": 8270 + }, + { + "epoch": 0.89, + "grad_norm": 0.08340149953121077, + "learning_rate": 3.18773211515892e-05, + "loss": 1.4201, + "step": 8271 + }, + { + "epoch": 0.89, + "grad_norm": 0.08810900737089097, + "learning_rate": 3.181617746653687e-05, + "loss": 1.3035, + "step": 8272 + }, + { + "epoch": 0.89, + "grad_norm": 0.09909487833329844, + "learning_rate": 3.1755090550387165e-05, + "loss": 1.3747, + "step": 8273 + }, + { + "epoch": 0.89, + "grad_norm": 0.09206263771365691, + "learning_rate": 3.169406041054695e-05, + "loss": 1.4318, + "step": 8274 + }, + { + "epoch": 0.89, + "grad_norm": 0.08936295451925626, + "learning_rate": 3.163308705441648e-05, + "loss": 1.354, + "step": 8275 + }, + { + "epoch": 0.89, + "grad_norm": 0.10108582918313291, + "learning_rate": 3.157217048938882e-05, + "loss": 1.3638, + "step": 8276 + }, + { + "epoch": 0.89, + "grad_norm": 0.09810973020134794, + "learning_rate": 3.151131072285041e-05, + "loss": 1.3389, + "step": 8277 + }, + { + "epoch": 0.89, + "grad_norm": 0.09990108657933389, + "learning_rate": 3.145050776218078e-05, + "loss": 1.5351, + "step": 8278 + }, + { + "epoch": 0.89, + "grad_norm": 0.09063262414102946, + "learning_rate": 3.138976161475238e-05, + "loss": 1.4391, + "step": 8279 + }, + { + "epoch": 0.89, + "grad_norm": 0.08380459100263056, + "learning_rate": 3.132907228793086e-05, + "loss": 1.3581, + "step": 8280 + }, + { + "epoch": 0.89, + "grad_norm": 0.08742457245686931, + "learning_rate": 3.126843978907518e-05, + "loss": 1.2955, + "step": 8281 + }, + { + "epoch": 0.89, + "grad_norm": 0.09016049568453172, + "learning_rate": 3.1207864125537165e-05, + "loss": 1.2513, + "step": 8282 + }, + { + "epoch": 0.89, + "grad_norm": 0.08375867746186022, + "learning_rate": 3.1147345304661734e-05, + "loss": 1.3385, + "step": 8283 + }, + { + "epoch": 0.89, + "grad_norm": 0.10576397254106343, + "learning_rate": 3.108688333378701e-05, + "loss": 1.4306, + "step": 8284 + }, + { + "epoch": 0.89, + "grad_norm": 0.10166972259158993, + "learning_rate": 3.10264782202444e-05, + "loss": 1.4401, + "step": 8285 + }, + { + "epoch": 0.89, + "grad_norm": 0.0816827777590841, + "learning_rate": 3.0966129971358005e-05, + "loss": 1.3993, + "step": 8286 + }, + { + "epoch": 0.89, + "grad_norm": 0.10865251945702202, + "learning_rate": 3.0905838594445346e-05, + "loss": 1.4408, + "step": 8287 + }, + { + "epoch": 0.89, + "grad_norm": 0.09242901362551612, + "learning_rate": 3.0845604096817024e-05, + "loss": 1.4811, + "step": 8288 + }, + { + "epoch": 0.89, + "grad_norm": 0.09310275405757654, + "learning_rate": 3.078542648577659e-05, + "loss": 1.427, + "step": 8289 + }, + { + "epoch": 0.89, + "grad_norm": 0.07919899361904145, + "learning_rate": 3.072530576862081e-05, + "loss": 1.3695, + "step": 8290 + }, + { + "epoch": 0.89, + "grad_norm": 0.09071826200447604, + "learning_rate": 3.06652419526397e-05, + "loss": 1.3786, + "step": 8291 + }, + { + "epoch": 0.89, + "grad_norm": 0.09205784757595566, + "learning_rate": 3.060523504511587e-05, + "loss": 1.4101, + "step": 8292 + }, + { + "epoch": 0.89, + "grad_norm": 0.09121367166574848, + "learning_rate": 3.05452850533256e-05, + "loss": 1.4315, + "step": 8293 + }, + { + "epoch": 0.89, + "grad_norm": 0.08840960290934788, + "learning_rate": 3.048539198453798e-05, + "loss": 1.1353, + "step": 8294 + }, + { + "epoch": 0.89, + "grad_norm": 0.10546852151528087, + "learning_rate": 3.0425555846015196e-05, + "loss": 1.3073, + "step": 8295 + }, + { + "epoch": 0.89, + "grad_norm": 0.09881277238387846, + "learning_rate": 3.0365776645012666e-05, + "loss": 1.3881, + "step": 8296 + }, + { + "epoch": 0.89, + "grad_norm": 0.0854003085075494, + "learning_rate": 3.0306054388778814e-05, + "loss": 1.3781, + "step": 8297 + }, + { + "epoch": 0.89, + "grad_norm": 0.09904759952136048, + "learning_rate": 3.0246389084555127e-05, + "loss": 1.5183, + "step": 8298 + }, + { + "epoch": 0.89, + "grad_norm": 0.08909127911291063, + "learning_rate": 3.0186780739576202e-05, + "loss": 1.4428, + "step": 8299 + }, + { + "epoch": 0.89, + "grad_norm": 0.11642320664644766, + "learning_rate": 3.012722936106993e-05, + "loss": 1.3976, + "step": 8300 + }, + { + "epoch": 0.89, + "grad_norm": 0.09675931420968514, + "learning_rate": 3.0067734956256863e-05, + "loss": 1.4935, + "step": 8301 + }, + { + "epoch": 0.89, + "grad_norm": 0.09474977351473926, + "learning_rate": 3.0008297532351182e-05, + "loss": 1.365, + "step": 8302 + }, + { + "epoch": 0.89, + "grad_norm": 0.10272168744723374, + "learning_rate": 2.9948917096559615e-05, + "loss": 1.2695, + "step": 8303 + }, + { + "epoch": 0.89, + "grad_norm": 0.08298955604398278, + "learning_rate": 2.9889593656082404e-05, + "loss": 1.4956, + "step": 8304 + }, + { + "epoch": 0.89, + "grad_norm": 0.09179600755809245, + "learning_rate": 2.983032721811263e-05, + "loss": 1.4941, + "step": 8305 + }, + { + "epoch": 0.89, + "grad_norm": 0.10682099436596237, + "learning_rate": 2.977111778983671e-05, + "loss": 1.3633, + "step": 8306 + }, + { + "epoch": 0.89, + "grad_norm": 0.0867515300642767, + "learning_rate": 2.9711965378433793e-05, + "loss": 1.3357, + "step": 8307 + }, + { + "epoch": 0.89, + "grad_norm": 0.08754850931104234, + "learning_rate": 2.9652869991076413e-05, + "loss": 1.4155, + "step": 8308 + }, + { + "epoch": 0.89, + "grad_norm": 0.09424777427849733, + "learning_rate": 2.9593831634930123e-05, + "loss": 1.3722, + "step": 8309 + }, + { + "epoch": 0.89, + "grad_norm": 0.09294745036970158, + "learning_rate": 2.9534850317153415e-05, + "loss": 1.3322, + "step": 8310 + }, + { + "epoch": 0.89, + "grad_norm": 0.09331836902204226, + "learning_rate": 2.947592604489807e-05, + "loss": 1.3521, + "step": 8311 + }, + { + "epoch": 0.89, + "grad_norm": 0.09313847602316322, + "learning_rate": 2.9417058825308875e-05, + "loss": 1.3364, + "step": 8312 + }, + { + "epoch": 0.89, + "grad_norm": 0.09053115548644763, + "learning_rate": 2.9358248665523667e-05, + "loss": 1.4487, + "step": 8313 + }, + { + "epoch": 0.89, + "grad_norm": 0.09041572250814107, + "learning_rate": 2.9299495572673307e-05, + "loss": 1.4212, + "step": 8314 + }, + { + "epoch": 0.89, + "grad_norm": 0.0904359258699773, + "learning_rate": 2.9240799553881814e-05, + "loss": 1.3782, + "step": 8315 + }, + { + "epoch": 0.89, + "grad_norm": 0.08136115391502427, + "learning_rate": 2.918216061626644e-05, + "loss": 1.415, + "step": 8316 + }, + { + "epoch": 0.89, + "grad_norm": 0.08998781295143174, + "learning_rate": 2.912357876693711e-05, + "loss": 1.3324, + "step": 8317 + }, + { + "epoch": 0.89, + "grad_norm": 0.09399137616736783, + "learning_rate": 2.9065054012997305e-05, + "loss": 1.4648, + "step": 8318 + }, + { + "epoch": 0.89, + "grad_norm": 0.08235405096329945, + "learning_rate": 2.900658636154324e-05, + "loss": 1.3484, + "step": 8319 + }, + { + "epoch": 0.89, + "grad_norm": 0.10489144653783913, + "learning_rate": 2.8948175819664357e-05, + "loss": 1.5477, + "step": 8320 + }, + { + "epoch": 0.89, + "grad_norm": 0.09116817963939151, + "learning_rate": 2.8889822394443043e-05, + "loss": 1.3189, + "step": 8321 + }, + { + "epoch": 0.89, + "grad_norm": 0.09584664025860186, + "learning_rate": 2.883152609295503e-05, + "loss": 1.4975, + "step": 8322 + }, + { + "epoch": 0.89, + "grad_norm": 0.08977636111226732, + "learning_rate": 2.8773286922268883e-05, + "loss": 1.3452, + "step": 8323 + }, + { + "epoch": 0.89, + "grad_norm": 0.07654042445721576, + "learning_rate": 2.871510488944612e-05, + "loss": 1.4143, + "step": 8324 + }, + { + "epoch": 0.89, + "grad_norm": 0.08818203775801058, + "learning_rate": 2.8656980001541765e-05, + "loss": 1.4636, + "step": 8325 + }, + { + "epoch": 0.9, + "grad_norm": 0.08432117009970455, + "learning_rate": 2.859891226560346e-05, + "loss": 1.3565, + "step": 8326 + }, + { + "epoch": 0.9, + "grad_norm": 0.09712799846018604, + "learning_rate": 2.854090168867224e-05, + "loss": 1.4048, + "step": 8327 + }, + { + "epoch": 0.9, + "grad_norm": 0.10385353675645356, + "learning_rate": 2.848294827778214e-05, + "loss": 1.347, + "step": 8328 + }, + { + "epoch": 0.9, + "grad_norm": 0.0959182760982914, + "learning_rate": 2.8425052039959987e-05, + "loss": 1.3536, + "step": 8329 + }, + { + "epoch": 0.9, + "grad_norm": 0.09054068618274394, + "learning_rate": 2.836721298222611e-05, + "loss": 1.3895, + "step": 8330 + }, + { + "epoch": 0.9, + "grad_norm": 0.08613128379624303, + "learning_rate": 2.8309431111593675e-05, + "loss": 1.329, + "step": 8331 + }, + { + "epoch": 0.9, + "grad_norm": 0.09153174477614746, + "learning_rate": 2.8251706435068803e-05, + "loss": 1.3473, + "step": 8332 + }, + { + "epoch": 0.9, + "grad_norm": 0.0895519519633704, + "learning_rate": 2.8194038959650892e-05, + "loss": 1.38, + "step": 8333 + }, + { + "epoch": 0.9, + "grad_norm": 0.09474284361504132, + "learning_rate": 2.813642869233235e-05, + "loss": 1.3456, + "step": 8334 + }, + { + "epoch": 0.9, + "grad_norm": 0.08310663147369579, + "learning_rate": 2.8078875640098646e-05, + "loss": 1.4512, + "step": 8335 + }, + { + "epoch": 0.9, + "grad_norm": 0.09109971946210933, + "learning_rate": 2.802137980992814e-05, + "loss": 1.3312, + "step": 8336 + }, + { + "epoch": 0.9, + "grad_norm": 0.08391879676313042, + "learning_rate": 2.796394120879259e-05, + "loss": 1.3833, + "step": 8337 + }, + { + "epoch": 0.9, + "grad_norm": 0.07946902228691093, + "learning_rate": 2.790655984365642e-05, + "loss": 1.4357, + "step": 8338 + }, + { + "epoch": 0.9, + "grad_norm": 0.10628108602411328, + "learning_rate": 2.7849235721477406e-05, + "loss": 1.3521, + "step": 8339 + }, + { + "epoch": 0.9, + "grad_norm": 0.10121675475351374, + "learning_rate": 2.7791968849206427e-05, + "loss": 1.4077, + "step": 8340 + }, + { + "epoch": 0.9, + "grad_norm": 0.08187516155572779, + "learning_rate": 2.7734759233787045e-05, + "loss": 1.3724, + "step": 8341 + }, + { + "epoch": 0.9, + "grad_norm": 0.0986900611957909, + "learning_rate": 2.7677606882156314e-05, + "loss": 1.369, + "step": 8342 + }, + { + "epoch": 0.9, + "grad_norm": 0.08405140486248891, + "learning_rate": 2.7620511801244143e-05, + "loss": 1.4385, + "step": 8343 + }, + { + "epoch": 0.9, + "grad_norm": 0.09081596189763722, + "learning_rate": 2.7563473997973433e-05, + "loss": 1.3842, + "step": 8344 + }, + { + "epoch": 0.9, + "grad_norm": 0.08228297439170708, + "learning_rate": 2.750649347926021e-05, + "loss": 1.3592, + "step": 8345 + }, + { + "epoch": 0.9, + "grad_norm": 0.08391568513119509, + "learning_rate": 2.7449570252013556e-05, + "loss": 1.3608, + "step": 8346 + }, + { + "epoch": 0.9, + "grad_norm": 0.08954952617668473, + "learning_rate": 2.7392704323135677e-05, + "loss": 1.3169, + "step": 8347 + }, + { + "epoch": 0.9, + "grad_norm": 0.0917650342566164, + "learning_rate": 2.733589569952172e-05, + "loss": 1.3512, + "step": 8348 + }, + { + "epoch": 0.9, + "grad_norm": 0.09179354595681569, + "learning_rate": 2.7279144388059896e-05, + "loss": 1.5651, + "step": 8349 + }, + { + "epoch": 0.9, + "grad_norm": 0.08588776856338311, + "learning_rate": 2.7222450395631592e-05, + "loss": 1.3745, + "step": 8350 + }, + { + "epoch": 0.9, + "grad_norm": 0.0909950628609119, + "learning_rate": 2.7165813729111032e-05, + "loss": 1.3828, + "step": 8351 + }, + { + "epoch": 0.9, + "grad_norm": 0.10197866780298187, + "learning_rate": 2.7109234395365667e-05, + "loss": 1.364, + "step": 8352 + }, + { + "epoch": 0.9, + "grad_norm": 0.0879616622479054, + "learning_rate": 2.7052712401256006e-05, + "loss": 1.4415, + "step": 8353 + }, + { + "epoch": 0.9, + "grad_norm": 0.09835874727160945, + "learning_rate": 2.6996247753635404e-05, + "loss": 1.3447, + "step": 8354 + }, + { + "epoch": 0.9, + "grad_norm": 0.0963073590732697, + "learning_rate": 2.6939840459350496e-05, + "loss": 1.339, + "step": 8355 + }, + { + "epoch": 0.9, + "grad_norm": 0.08902332715993876, + "learning_rate": 2.6883490525240804e-05, + "loss": 1.3539, + "step": 8356 + }, + { + "epoch": 0.9, + "grad_norm": 0.08103360913138252, + "learning_rate": 2.6827197958138928e-05, + "loss": 1.2907, + "step": 8357 + }, + { + "epoch": 0.9, + "grad_norm": 0.09731155956807074, + "learning_rate": 2.677096276487062e-05, + "loss": 1.4633, + "step": 8358 + }, + { + "epoch": 0.9, + "grad_norm": 0.0887681860239562, + "learning_rate": 2.6714784952254544e-05, + "loss": 1.4055, + "step": 8359 + }, + { + "epoch": 0.9, + "grad_norm": 0.08898853045298434, + "learning_rate": 2.6658664527102417e-05, + "loss": 1.258, + "step": 8360 + }, + { + "epoch": 0.9, + "grad_norm": 0.08523520098416243, + "learning_rate": 2.660260149621907e-05, + "loss": 1.3498, + "step": 8361 + }, + { + "epoch": 0.9, + "grad_norm": 0.08728423609774599, + "learning_rate": 2.6546595866402403e-05, + "loss": 1.3685, + "step": 8362 + }, + { + "epoch": 0.9, + "grad_norm": 0.08238661659369678, + "learning_rate": 2.6490647644443143e-05, + "loss": 1.4899, + "step": 8363 + }, + { + "epoch": 0.9, + "grad_norm": 0.09252426038134721, + "learning_rate": 2.6434756837125317e-05, + "loss": 1.5046, + "step": 8364 + }, + { + "epoch": 0.9, + "grad_norm": 0.08778490489889275, + "learning_rate": 2.6378923451225888e-05, + "loss": 1.3537, + "step": 8365 + }, + { + "epoch": 0.9, + "grad_norm": 0.10171854887120627, + "learning_rate": 2.6323147493514833e-05, + "loss": 1.3695, + "step": 8366 + }, + { + "epoch": 0.9, + "grad_norm": 0.09384589952775692, + "learning_rate": 2.6267428970755125e-05, + "loss": 1.3111, + "step": 8367 + }, + { + "epoch": 0.9, + "grad_norm": 0.0795961798737711, + "learning_rate": 2.621176788970281e-05, + "loss": 1.3767, + "step": 8368 + }, + { + "epoch": 0.9, + "grad_norm": 0.10107183568789642, + "learning_rate": 2.6156164257107097e-05, + "loss": 1.2705, + "step": 8369 + }, + { + "epoch": 0.9, + "grad_norm": 0.09333781313598113, + "learning_rate": 2.6100618079710037e-05, + "loss": 1.37, + "step": 8370 + }, + { + "epoch": 0.9, + "grad_norm": 0.08833430413490173, + "learning_rate": 2.6045129364246856e-05, + "loss": 1.4379, + "step": 8371 + }, + { + "epoch": 0.9, + "grad_norm": 0.08838142607452992, + "learning_rate": 2.5989698117445615e-05, + "loss": 1.5307, + "step": 8372 + }, + { + "epoch": 0.9, + "grad_norm": 0.09304373165717744, + "learning_rate": 2.593432434602766e-05, + "loss": 1.4593, + "step": 8373 + }, + { + "epoch": 0.9, + "grad_norm": 0.08477827831395272, + "learning_rate": 2.5879008056707286e-05, + "loss": 1.3841, + "step": 8374 + }, + { + "epoch": 0.9, + "grad_norm": 0.09963160622039322, + "learning_rate": 2.5823749256191687e-05, + "loss": 1.3131, + "step": 8375 + }, + { + "epoch": 0.9, + "grad_norm": 0.09050444438851227, + "learning_rate": 2.5768547951181277e-05, + "loss": 1.4336, + "step": 8376 + }, + { + "epoch": 0.9, + "grad_norm": 0.09180191842894916, + "learning_rate": 2.571340414836931e-05, + "loss": 1.37, + "step": 8377 + }, + { + "epoch": 0.9, + "grad_norm": 0.0813612383349236, + "learning_rate": 2.565831785444228e-05, + "loss": 1.4256, + "step": 8378 + }, + { + "epoch": 0.9, + "grad_norm": 0.10347883114896525, + "learning_rate": 2.5603289076079394e-05, + "loss": 1.4034, + "step": 8379 + }, + { + "epoch": 0.9, + "grad_norm": 0.08899944676178057, + "learning_rate": 2.5548317819953203e-05, + "loss": 1.3462, + "step": 8380 + }, + { + "epoch": 0.9, + "grad_norm": 0.09492130882777211, + "learning_rate": 2.5493404092729267e-05, + "loss": 1.4667, + "step": 8381 + }, + { + "epoch": 0.9, + "grad_norm": 0.08012737818017668, + "learning_rate": 2.5438547901065866e-05, + "loss": 1.3719, + "step": 8382 + }, + { + "epoch": 0.9, + "grad_norm": 0.09302578161515027, + "learning_rate": 2.538374925161463e-05, + "loss": 1.4519, + "step": 8383 + }, + { + "epoch": 0.9, + "grad_norm": 0.08547204393981939, + "learning_rate": 2.5329008151020072e-05, + "loss": 1.5377, + "step": 8384 + }, + { + "epoch": 0.9, + "grad_norm": 0.07712752399790215, + "learning_rate": 2.5274324605919664e-05, + "loss": 1.3414, + "step": 8385 + }, + { + "epoch": 0.9, + "grad_norm": 0.08767049163112096, + "learning_rate": 2.521969862294404e-05, + "loss": 1.4337, + "step": 8386 + }, + { + "epoch": 0.9, + "grad_norm": 0.08594081612171055, + "learning_rate": 2.5165130208716914e-05, + "loss": 1.3469, + "step": 8387 + }, + { + "epoch": 0.9, + "grad_norm": 0.09468859963832922, + "learning_rate": 2.5110619369854594e-05, + "loss": 1.3351, + "step": 8388 + }, + { + "epoch": 0.9, + "grad_norm": 0.09186319861919497, + "learning_rate": 2.505616611296685e-05, + "loss": 1.3704, + "step": 8389 + }, + { + "epoch": 0.9, + "grad_norm": 0.08461677792305342, + "learning_rate": 2.5001770444656457e-05, + "loss": 1.4311, + "step": 8390 + }, + { + "epoch": 0.9, + "grad_norm": 0.1280313908368673, + "learning_rate": 2.494743237151892e-05, + "loss": 1.3177, + "step": 8391 + }, + { + "epoch": 0.9, + "grad_norm": 0.11643983133341222, + "learning_rate": 2.4893151900142906e-05, + "loss": 1.3947, + "step": 8392 + }, + { + "epoch": 0.9, + "grad_norm": 0.10473989822954327, + "learning_rate": 2.4838929037110268e-05, + "loss": 1.5387, + "step": 8393 + }, + { + "epoch": 0.9, + "grad_norm": 0.0843967030128136, + "learning_rate": 2.4784763788995523e-05, + "loss": 1.4826, + "step": 8394 + }, + { + "epoch": 0.9, + "grad_norm": 0.08342362278756797, + "learning_rate": 2.473065616236647e-05, + "loss": 1.3826, + "step": 8395 + }, + { + "epoch": 0.9, + "grad_norm": 0.08230065911414464, + "learning_rate": 2.4676606163783978e-05, + "loss": 1.395, + "step": 8396 + }, + { + "epoch": 0.9, + "grad_norm": 0.08570971292153445, + "learning_rate": 2.462261379980163e-05, + "loss": 1.4211, + "step": 8397 + }, + { + "epoch": 0.9, + "grad_norm": 0.09686187113928392, + "learning_rate": 2.4568679076966194e-05, + "loss": 1.4408, + "step": 8398 + }, + { + "epoch": 0.9, + "grad_norm": 0.0853459619893161, + "learning_rate": 2.4514802001817438e-05, + "loss": 1.4611, + "step": 8399 + }, + { + "epoch": 0.9, + "grad_norm": 0.08748074962675632, + "learning_rate": 2.4460982580888303e-05, + "loss": 1.4843, + "step": 8400 + }, + { + "epoch": 0.9, + "grad_norm": 0.09367239616088394, + "learning_rate": 2.4407220820704402e-05, + "loss": 1.371, + "step": 8401 + }, + { + "epoch": 0.9, + "grad_norm": 0.10033912608729492, + "learning_rate": 2.435351672778463e-05, + "loss": 1.4728, + "step": 8402 + }, + { + "epoch": 0.9, + "grad_norm": 0.08858671836179179, + "learning_rate": 2.4299870308640726e-05, + "loss": 1.4597, + "step": 8403 + }, + { + "epoch": 0.9, + "grad_norm": 0.09235237842112576, + "learning_rate": 2.4246281569777485e-05, + "loss": 1.4919, + "step": 8404 + }, + { + "epoch": 0.9, + "grad_norm": 0.103872125039485, + "learning_rate": 2.4192750517692873e-05, + "loss": 1.4196, + "step": 8405 + }, + { + "epoch": 0.9, + "grad_norm": 0.09659718234578298, + "learning_rate": 2.4139277158877538e-05, + "loss": 1.3337, + "step": 8406 + }, + { + "epoch": 0.9, + "grad_norm": 0.08205692848700044, + "learning_rate": 2.4085861499815398e-05, + "loss": 1.2939, + "step": 8407 + }, + { + "epoch": 0.9, + "grad_norm": 0.10968793439303617, + "learning_rate": 2.4032503546983332e-05, + "loss": 1.4226, + "step": 8408 + }, + { + "epoch": 0.9, + "grad_norm": 0.08408076692975283, + "learning_rate": 2.397920330685116e-05, + "loss": 1.5346, + "step": 8409 + }, + { + "epoch": 0.9, + "grad_norm": 0.08412734327092307, + "learning_rate": 2.392596078588155e-05, + "loss": 1.4113, + "step": 8410 + }, + { + "epoch": 0.9, + "grad_norm": 0.0865364726324679, + "learning_rate": 2.3872775990530504e-05, + "loss": 1.351, + "step": 8411 + }, + { + "epoch": 0.9, + "grad_norm": 0.08882596689561814, + "learning_rate": 2.3819648927246916e-05, + "loss": 1.4098, + "step": 8412 + }, + { + "epoch": 0.9, + "grad_norm": 0.09977260706687908, + "learning_rate": 2.376657960247247e-05, + "loss": 1.4846, + "step": 8413 + }, + { + "epoch": 0.9, + "grad_norm": 0.08832253323200623, + "learning_rate": 2.371356802264202e-05, + "loss": 1.3652, + "step": 8414 + }, + { + "epoch": 0.9, + "grad_norm": 0.09015004280653494, + "learning_rate": 2.3660614194183584e-05, + "loss": 1.5453, + "step": 8415 + }, + { + "epoch": 0.9, + "grad_norm": 0.09993461728963231, + "learning_rate": 2.3607718123517753e-05, + "loss": 1.3951, + "step": 8416 + }, + { + "epoch": 0.9, + "grad_norm": 0.09386030180420961, + "learning_rate": 2.3554879817058504e-05, + "loss": 1.2722, + "step": 8417 + }, + { + "epoch": 0.9, + "grad_norm": 0.08852572128057404, + "learning_rate": 2.3502099281212774e-05, + "loss": 1.3115, + "step": 8418 + }, + { + "epoch": 0.91, + "grad_norm": 0.08987717291858999, + "learning_rate": 2.3449376522380107e-05, + "loss": 1.3226, + "step": 8419 + }, + { + "epoch": 0.91, + "grad_norm": 0.10203851051114089, + "learning_rate": 2.3396711546953442e-05, + "loss": 1.4404, + "step": 8420 + }, + { + "epoch": 0.91, + "grad_norm": 0.09785692101309608, + "learning_rate": 2.3344104361318675e-05, + "loss": 1.2662, + "step": 8421 + }, + { + "epoch": 0.91, + "grad_norm": 0.09629320552610036, + "learning_rate": 2.3291554971854477e-05, + "loss": 1.3061, + "step": 8422 + }, + { + "epoch": 0.91, + "grad_norm": 0.09022413330869145, + "learning_rate": 2.32390633849327e-05, + "loss": 1.3738, + "step": 8423 + }, + { + "epoch": 0.91, + "grad_norm": 0.09209515044576586, + "learning_rate": 2.3186629606918197e-05, + "loss": 1.4061, + "step": 8424 + }, + { + "epoch": 0.91, + "grad_norm": 0.09556573011727089, + "learning_rate": 2.31342536441686e-05, + "loss": 1.3278, + "step": 8425 + }, + { + "epoch": 0.91, + "grad_norm": 0.09352813983137584, + "learning_rate": 2.3081935503034777e-05, + "loss": 1.5637, + "step": 8426 + }, + { + "epoch": 0.91, + "grad_norm": 0.09343439987112688, + "learning_rate": 2.3029675189860544e-05, + "loss": 1.4015, + "step": 8427 + }, + { + "epoch": 0.91, + "grad_norm": 0.10169587181273342, + "learning_rate": 2.297747271098244e-05, + "loss": 1.3474, + "step": 8428 + }, + { + "epoch": 0.91, + "grad_norm": 0.10179498660670094, + "learning_rate": 2.29253280727304e-05, + "loss": 1.4285, + "step": 8429 + }, + { + "epoch": 0.91, + "grad_norm": 0.09330307333224806, + "learning_rate": 2.2873241281427038e-05, + "loss": 1.3877, + "step": 8430 + }, + { + "epoch": 0.91, + "grad_norm": 0.10039279736866197, + "learning_rate": 2.2821212343388075e-05, + "loss": 1.4382, + "step": 8431 + }, + { + "epoch": 0.91, + "grad_norm": 0.10346327438587212, + "learning_rate": 2.2769241264922193e-05, + "loss": 1.3383, + "step": 8432 + }, + { + "epoch": 0.91, + "grad_norm": 0.08989579051513284, + "learning_rate": 2.2717328052331122e-05, + "loss": 1.3829, + "step": 8433 + }, + { + "epoch": 0.91, + "grad_norm": 0.09469033069439596, + "learning_rate": 2.2665472711909385e-05, + "loss": 1.4191, + "step": 8434 + }, + { + "epoch": 0.91, + "grad_norm": 0.08302585181396888, + "learning_rate": 2.2613675249944676e-05, + "loss": 1.552, + "step": 8435 + }, + { + "epoch": 0.91, + "grad_norm": 0.07639788909277988, + "learning_rate": 2.256193567271775e-05, + "loss": 1.5736, + "step": 8436 + }, + { + "epoch": 0.91, + "grad_norm": 0.10245714094055836, + "learning_rate": 2.2510253986502026e-05, + "loss": 1.4083, + "step": 8437 + }, + { + "epoch": 0.91, + "grad_norm": 0.10338545883531776, + "learning_rate": 2.2458630197564222e-05, + "loss": 1.3372, + "step": 8438 + }, + { + "epoch": 0.91, + "grad_norm": 0.10610452397828928, + "learning_rate": 2.2407064312163827e-05, + "loss": 1.4028, + "step": 8439 + }, + { + "epoch": 0.91, + "grad_norm": 0.07244559862137315, + "learning_rate": 2.235555633655345e-05, + "loss": 1.3914, + "step": 8440 + }, + { + "epoch": 0.91, + "grad_norm": 0.09405091787870902, + "learning_rate": 2.230410627697843e-05, + "loss": 1.3263, + "step": 8441 + }, + { + "epoch": 0.91, + "grad_norm": 0.09015438680566427, + "learning_rate": 2.2252714139677444e-05, + "loss": 1.4548, + "step": 8442 + }, + { + "epoch": 0.91, + "grad_norm": 0.08618967083701604, + "learning_rate": 2.2201379930882006e-05, + "loss": 1.3307, + "step": 8443 + }, + { + "epoch": 0.91, + "grad_norm": 0.09062863447448566, + "learning_rate": 2.2150103656816356e-05, + "loss": 1.3327, + "step": 8444 + }, + { + "epoch": 0.91, + "grad_norm": 0.08555145550793038, + "learning_rate": 2.2098885323698027e-05, + "loss": 1.3822, + "step": 8445 + }, + { + "epoch": 0.91, + "grad_norm": 0.10137210592380712, + "learning_rate": 2.2047724937737546e-05, + "loss": 1.5497, + "step": 8446 + }, + { + "epoch": 0.91, + "grad_norm": 0.09072992995784526, + "learning_rate": 2.1996622505138065e-05, + "loss": 1.5256, + "step": 8447 + }, + { + "epoch": 0.91, + "grad_norm": 0.08917769482057666, + "learning_rate": 2.1945578032096015e-05, + "loss": 1.4008, + "step": 8448 + }, + { + "epoch": 0.91, + "grad_norm": 0.08259821066802236, + "learning_rate": 2.1894591524800832e-05, + "loss": 1.3465, + "step": 8449 + }, + { + "epoch": 0.91, + "grad_norm": 0.09017995327516512, + "learning_rate": 2.1843662989434688e-05, + "loss": 1.535, + "step": 8450 + }, + { + "epoch": 0.91, + "grad_norm": 0.09148728765294482, + "learning_rate": 2.179279243217286e-05, + "loss": 1.2695, + "step": 8451 + }, + { + "epoch": 0.91, + "grad_norm": 0.09181262449336586, + "learning_rate": 2.174197985918358e-05, + "loss": 1.3005, + "step": 8452 + }, + { + "epoch": 0.91, + "grad_norm": 0.09799199206773658, + "learning_rate": 2.169122527662798e-05, + "loss": 1.4007, + "step": 8453 + }, + { + "epoch": 0.91, + "grad_norm": 0.08347453430194898, + "learning_rate": 2.1640528690660298e-05, + "loss": 1.3682, + "step": 8454 + }, + { + "epoch": 0.91, + "grad_norm": 0.08692980607633988, + "learning_rate": 2.158989010742779e-05, + "loss": 1.4219, + "step": 8455 + }, + { + "epoch": 0.91, + "grad_norm": 0.08937756989442294, + "learning_rate": 2.1539309533070316e-05, + "loss": 1.3182, + "step": 8456 + }, + { + "epoch": 0.91, + "grad_norm": 0.08836818598437016, + "learning_rate": 2.1488786973721085e-05, + "loss": 1.3727, + "step": 8457 + }, + { + "epoch": 0.91, + "grad_norm": 0.0946721247446865, + "learning_rate": 2.1438322435506196e-05, + "loss": 1.4795, + "step": 8458 + }, + { + "epoch": 0.91, + "grad_norm": 0.10350896930890328, + "learning_rate": 2.1387915924544475e-05, + "loss": 1.3853, + "step": 8459 + }, + { + "epoch": 0.91, + "grad_norm": 0.10058033450733786, + "learning_rate": 2.133756744694798e-05, + "loss": 1.417, + "step": 8460 + }, + { + "epoch": 0.91, + "grad_norm": 0.09827699073225621, + "learning_rate": 2.128727700882166e-05, + "loss": 1.4018, + "step": 8461 + }, + { + "epoch": 0.91, + "grad_norm": 0.09041865873232152, + "learning_rate": 2.1237044616263412e-05, + "loss": 1.424, + "step": 8462 + }, + { + "epoch": 0.91, + "grad_norm": 0.0924230597271768, + "learning_rate": 2.1186870275363977e-05, + "loss": 1.3944, + "step": 8463 + }, + { + "epoch": 0.91, + "grad_norm": 0.0873473916377948, + "learning_rate": 2.1136753992207268e-05, + "loss": 1.3969, + "step": 8464 + }, + { + "epoch": 0.91, + "grad_norm": 0.08325181748499406, + "learning_rate": 2.108669577287009e-05, + "loss": 1.4619, + "step": 8465 + }, + { + "epoch": 0.91, + "grad_norm": 0.0866440412000076, + "learning_rate": 2.103669562342203e-05, + "loss": 1.4301, + "step": 8466 + }, + { + "epoch": 0.91, + "grad_norm": 0.08572274163520682, + "learning_rate": 2.098675354992596e-05, + "loss": 1.347, + "step": 8467 + }, + { + "epoch": 0.91, + "grad_norm": 0.11805279277002773, + "learning_rate": 2.093686955843732e-05, + "loss": 1.283, + "step": 8468 + }, + { + "epoch": 0.91, + "grad_norm": 0.09658742123983585, + "learning_rate": 2.088704365500482e-05, + "loss": 1.2494, + "step": 8469 + }, + { + "epoch": 0.91, + "grad_norm": 0.08822907080986658, + "learning_rate": 2.0837275845670135e-05, + "loss": 1.3896, + "step": 8470 + }, + { + "epoch": 0.91, + "grad_norm": 0.08932206436251997, + "learning_rate": 2.0787566136467705e-05, + "loss": 1.393, + "step": 8471 + }, + { + "epoch": 0.91, + "grad_norm": 0.09641547295931442, + "learning_rate": 2.0737914533424885e-05, + "loss": 1.3631, + "step": 8472 + }, + { + "epoch": 0.91, + "grad_norm": 0.09249433521253747, + "learning_rate": 2.0688321042562186e-05, + "loss": 1.4559, + "step": 8473 + }, + { + "epoch": 0.91, + "grad_norm": 0.08820267488822597, + "learning_rate": 2.0638785669893024e-05, + "loss": 1.4652, + "step": 8474 + }, + { + "epoch": 0.91, + "grad_norm": 0.08816880733726157, + "learning_rate": 2.0589308421423704e-05, + "loss": 1.3932, + "step": 8475 + }, + { + "epoch": 0.91, + "grad_norm": 0.09785846516443422, + "learning_rate": 2.053988930315348e-05, + "loss": 1.3651, + "step": 8476 + }, + { + "epoch": 0.91, + "grad_norm": 0.09819695747342767, + "learning_rate": 2.049052832107473e-05, + "loss": 1.3235, + "step": 8477 + }, + { + "epoch": 0.91, + "grad_norm": 0.09234628217230027, + "learning_rate": 2.0441225481172443e-05, + "loss": 1.4419, + "step": 8478 + }, + { + "epoch": 0.91, + "grad_norm": 0.10311619376021568, + "learning_rate": 2.039198078942489e-05, + "loss": 1.3974, + "step": 8479 + }, + { + "epoch": 0.91, + "grad_norm": 0.09331694810071177, + "learning_rate": 2.0342794251803188e-05, + "loss": 1.4798, + "step": 8480 + }, + { + "epoch": 0.91, + "grad_norm": 0.09020593321897641, + "learning_rate": 2.029366587427123e-05, + "loss": 1.3993, + "step": 8481 + }, + { + "epoch": 0.91, + "grad_norm": 0.09205109315945209, + "learning_rate": 2.0244595662786136e-05, + "loss": 1.4413, + "step": 8482 + }, + { + "epoch": 0.91, + "grad_norm": 0.08248355323819702, + "learning_rate": 2.019558362329782e-05, + "loss": 1.4507, + "step": 8483 + }, + { + "epoch": 0.91, + "grad_norm": 0.09469002579477122, + "learning_rate": 2.014662976174908e-05, + "loss": 1.496, + "step": 8484 + }, + { + "epoch": 0.91, + "grad_norm": 0.09997222647885515, + "learning_rate": 2.0097734084075723e-05, + "loss": 1.3537, + "step": 8485 + }, + { + "epoch": 0.91, + "grad_norm": 0.09214343688886159, + "learning_rate": 2.0048896596206677e-05, + "loss": 1.4185, + "step": 8486 + }, + { + "epoch": 0.91, + "grad_norm": 0.08602997561822767, + "learning_rate": 2.000011730406348e-05, + "loss": 1.3327, + "step": 8487 + }, + { + "epoch": 0.91, + "grad_norm": 0.08411014919679415, + "learning_rate": 1.99513962135609e-05, + "loss": 1.3495, + "step": 8488 + }, + { + "epoch": 0.91, + "grad_norm": 0.09021699901169272, + "learning_rate": 1.9902733330606604e-05, + "loss": 1.4673, + "step": 8489 + }, + { + "epoch": 0.91, + "grad_norm": 0.0894410838431081, + "learning_rate": 1.9854128661100925e-05, + "loss": 1.4469, + "step": 8490 + }, + { + "epoch": 0.91, + "grad_norm": 0.09476249544207571, + "learning_rate": 1.980558221093748e-05, + "loss": 1.4729, + "step": 8491 + }, + { + "epoch": 0.91, + "grad_norm": 0.08738490149073329, + "learning_rate": 1.9757093986002728e-05, + "loss": 1.4347, + "step": 8492 + }, + { + "epoch": 0.91, + "grad_norm": 0.08126371645001917, + "learning_rate": 1.9708663992175968e-05, + "loss": 1.4183, + "step": 8493 + }, + { + "epoch": 0.91, + "grad_norm": 0.09864540184993932, + "learning_rate": 1.9660292235329446e-05, + "loss": 1.4789, + "step": 8494 + }, + { + "epoch": 0.91, + "grad_norm": 0.08836207440306233, + "learning_rate": 1.961197872132847e-05, + "loss": 1.3434, + "step": 8495 + }, + { + "epoch": 0.91, + "grad_norm": 0.08445418161336433, + "learning_rate": 1.95637234560313e-05, + "loss": 1.2764, + "step": 8496 + }, + { + "epoch": 0.91, + "grad_norm": 0.0985059807913652, + "learning_rate": 1.951552644528892e-05, + "loss": 1.455, + "step": 8497 + }, + { + "epoch": 0.91, + "grad_norm": 0.08635710557765165, + "learning_rate": 1.946738769494555e-05, + "loss": 1.4062, + "step": 8498 + }, + { + "epoch": 0.91, + "grad_norm": 0.0912158272982496, + "learning_rate": 1.9419307210837954e-05, + "loss": 1.4382, + "step": 8499 + }, + { + "epoch": 0.91, + "grad_norm": 0.08528304563372603, + "learning_rate": 1.9371284998796147e-05, + "loss": 1.3473, + "step": 8500 + }, + { + "epoch": 0.91, + "grad_norm": 0.09228094156924532, + "learning_rate": 1.9323321064643128e-05, + "loss": 1.5235, + "step": 8501 + }, + { + "epoch": 0.91, + "grad_norm": 0.08233938925848544, + "learning_rate": 1.9275415414194476e-05, + "loss": 1.376, + "step": 8502 + }, + { + "epoch": 0.91, + "grad_norm": 0.08850689703770695, + "learning_rate": 1.922756805325909e-05, + "loss": 1.4039, + "step": 8503 + }, + { + "epoch": 0.91, + "grad_norm": 0.09958912606213574, + "learning_rate": 1.9179778987638508e-05, + "loss": 1.4, + "step": 8504 + }, + { + "epoch": 0.91, + "grad_norm": 0.09810559605964786, + "learning_rate": 1.913204822312742e-05, + "loss": 1.3595, + "step": 8505 + }, + { + "epoch": 0.91, + "grad_norm": 0.09463852301076034, + "learning_rate": 1.9084375765513197e-05, + "loss": 1.4617, + "step": 8506 + }, + { + "epoch": 0.91, + "grad_norm": 0.11013757373202628, + "learning_rate": 1.9036761620576436e-05, + "loss": 1.4929, + "step": 8507 + }, + { + "epoch": 0.91, + "grad_norm": 0.12099079593085553, + "learning_rate": 1.898920579409047e-05, + "loss": 1.4703, + "step": 8508 + }, + { + "epoch": 0.91, + "grad_norm": 0.08528369129827018, + "learning_rate": 1.894170829182157e-05, + "loss": 1.4153, + "step": 8509 + }, + { + "epoch": 0.91, + "grad_norm": 0.08461384742941523, + "learning_rate": 1.889426911952896e-05, + "loss": 1.3631, + "step": 8510 + }, + { + "epoch": 0.91, + "grad_norm": 0.09677086610219493, + "learning_rate": 1.8846888282964937e-05, + "loss": 1.3896, + "step": 8511 + }, + { + "epoch": 0.92, + "grad_norm": 0.09429266082090436, + "learning_rate": 1.8799565787874394e-05, + "loss": 1.3049, + "step": 8512 + }, + { + "epoch": 0.92, + "grad_norm": 0.08097096508478885, + "learning_rate": 1.875230163999553e-05, + "loss": 1.2469, + "step": 8513 + }, + { + "epoch": 0.92, + "grad_norm": 0.09900252527706288, + "learning_rate": 1.870509584505925e-05, + "loss": 1.3851, + "step": 8514 + }, + { + "epoch": 0.92, + "grad_norm": 0.08627629957622081, + "learning_rate": 1.8657948408789262e-05, + "loss": 1.3561, + "step": 8515 + }, + { + "epoch": 0.92, + "grad_norm": 0.10019881440339429, + "learning_rate": 1.8610859336902486e-05, + "loss": 1.3432, + "step": 8516 + }, + { + "epoch": 0.92, + "grad_norm": 0.09158849272181173, + "learning_rate": 1.8563828635108692e-05, + "loss": 1.3539, + "step": 8517 + }, + { + "epoch": 0.92, + "grad_norm": 0.09867557457914743, + "learning_rate": 1.8516856309110375e-05, + "loss": 1.4684, + "step": 8518 + }, + { + "epoch": 0.92, + "grad_norm": 0.08122593100233426, + "learning_rate": 1.8469942364603142e-05, + "loss": 1.4484, + "step": 8519 + }, + { + "epoch": 0.92, + "grad_norm": 0.0842885239004775, + "learning_rate": 1.8423086807275557e-05, + "loss": 1.3915, + "step": 8520 + }, + { + "epoch": 0.92, + "grad_norm": 0.08729296238568429, + "learning_rate": 1.8376289642808854e-05, + "loss": 1.2997, + "step": 8521 + }, + { + "epoch": 0.92, + "grad_norm": 0.08963895604227475, + "learning_rate": 1.8329550876877487e-05, + "loss": 1.5127, + "step": 8522 + }, + { + "epoch": 0.92, + "grad_norm": 0.08888042005595129, + "learning_rate": 1.8282870515148652e-05, + "loss": 1.3661, + "step": 8523 + }, + { + "epoch": 0.92, + "grad_norm": 0.08759784012714594, + "learning_rate": 1.8236248563282542e-05, + "loss": 1.4205, + "step": 8524 + }, + { + "epoch": 0.92, + "grad_norm": 0.09167737995294478, + "learning_rate": 1.8189685026932136e-05, + "loss": 1.4306, + "step": 8525 + }, + { + "epoch": 0.92, + "grad_norm": 0.08403460807889258, + "learning_rate": 1.8143179911743414e-05, + "loss": 1.357, + "step": 8526 + }, + { + "epoch": 0.92, + "grad_norm": 0.08519773533614822, + "learning_rate": 1.8096733223355476e-05, + "loss": 1.3824, + "step": 8527 + }, + { + "epoch": 0.92, + "grad_norm": 0.09879834853603861, + "learning_rate": 1.805034496739988e-05, + "loss": 1.4378, + "step": 8528 + }, + { + "epoch": 0.92, + "grad_norm": 0.09648184208305852, + "learning_rate": 1.8004015149501563e-05, + "loss": 1.3694, + "step": 8529 + }, + { + "epoch": 0.92, + "grad_norm": 0.08839735081030436, + "learning_rate": 1.795774377527809e-05, + "loss": 1.3559, + "step": 8530 + }, + { + "epoch": 0.92, + "grad_norm": 0.08930280050423725, + "learning_rate": 1.7911530850339976e-05, + "loss": 1.3384, + "step": 8531 + }, + { + "epoch": 0.92, + "grad_norm": 0.09363327856588406, + "learning_rate": 1.7865376380290842e-05, + "loss": 1.3748, + "step": 8532 + }, + { + "epoch": 0.92, + "grad_norm": 0.08800254556910388, + "learning_rate": 1.7819280370726944e-05, + "loss": 1.3577, + "step": 8533 + }, + { + "epoch": 0.92, + "grad_norm": 0.09220582337447861, + "learning_rate": 1.7773242827237634e-05, + "loss": 1.3874, + "step": 8534 + }, + { + "epoch": 0.92, + "grad_norm": 0.09027578303790597, + "learning_rate": 1.7727263755405176e-05, + "loss": 1.5059, + "step": 8535 + }, + { + "epoch": 0.92, + "grad_norm": 0.08826799704652526, + "learning_rate": 1.7681343160804608e-05, + "loss": 1.4667, + "step": 8536 + }, + { + "epoch": 0.92, + "grad_norm": 0.0898786378584817, + "learning_rate": 1.7635481049003975e-05, + "loss": 1.4077, + "step": 8537 + }, + { + "epoch": 0.92, + "grad_norm": 0.09251862718568461, + "learning_rate": 1.7589677425564222e-05, + "loss": 1.4512, + "step": 8538 + }, + { + "epoch": 0.92, + "grad_norm": 0.08630638711926578, + "learning_rate": 1.7543932296039232e-05, + "loss": 1.4314, + "step": 8539 + }, + { + "epoch": 0.92, + "grad_norm": 0.08552853445385915, + "learning_rate": 1.749824566597569e-05, + "loss": 1.3791, + "step": 8540 + }, + { + "epoch": 0.92, + "grad_norm": 0.0998202597307536, + "learning_rate": 1.7452617540913264e-05, + "loss": 1.3047, + "step": 8541 + }, + { + "epoch": 0.92, + "grad_norm": 0.08909459675523979, + "learning_rate": 1.7407047926384644e-05, + "loss": 1.3275, + "step": 8542 + }, + { + "epoch": 0.92, + "grad_norm": 0.08463440139479447, + "learning_rate": 1.7361536827915137e-05, + "loss": 1.3971, + "step": 8543 + }, + { + "epoch": 0.92, + "grad_norm": 0.094601329627551, + "learning_rate": 1.7316084251023213e-05, + "loss": 1.395, + "step": 8544 + }, + { + "epoch": 0.92, + "grad_norm": 0.09133703125568057, + "learning_rate": 1.7270690201220242e-05, + "loss": 1.2725, + "step": 8545 + }, + { + "epoch": 0.92, + "grad_norm": 0.11459907690898916, + "learning_rate": 1.7225354684010208e-05, + "loss": 1.395, + "step": 8546 + }, + { + "epoch": 0.92, + "grad_norm": 0.09232465716072903, + "learning_rate": 1.7180077704890274e-05, + "loss": 1.3476, + "step": 8547 + }, + { + "epoch": 0.92, + "grad_norm": 0.09444109804027652, + "learning_rate": 1.7134859269350543e-05, + "loss": 1.3807, + "step": 8548 + }, + { + "epoch": 0.92, + "grad_norm": 0.09289326082336746, + "learning_rate": 1.7089699382873746e-05, + "loss": 1.4472, + "step": 8549 + }, + { + "epoch": 0.92, + "grad_norm": 0.08862007020517378, + "learning_rate": 1.7044598050935724e-05, + "loss": 1.4563, + "step": 8550 + }, + { + "epoch": 0.92, + "grad_norm": 0.0919549410530479, + "learning_rate": 1.6999555279005263e-05, + "loss": 1.4505, + "step": 8551 + }, + { + "epoch": 0.92, + "grad_norm": 0.09517584637890769, + "learning_rate": 1.6954571072543777e-05, + "loss": 1.3477, + "step": 8552 + }, + { + "epoch": 0.92, + "grad_norm": 0.0937890999439233, + "learning_rate": 1.6909645437005905e-05, + "loss": 1.3643, + "step": 8553 + }, + { + "epoch": 0.92, + "grad_norm": 0.08987385955355238, + "learning_rate": 1.686477837783906e-05, + "loss": 1.4256, + "step": 8554 + }, + { + "epoch": 0.92, + "grad_norm": 0.09477203791651483, + "learning_rate": 1.6819969900483345e-05, + "loss": 1.3805, + "step": 8555 + }, + { + "epoch": 0.92, + "grad_norm": 0.09633659162771828, + "learning_rate": 1.6775220010372182e-05, + "loss": 1.4013, + "step": 8556 + }, + { + "epoch": 0.92, + "grad_norm": 0.09878142652040257, + "learning_rate": 1.6730528712931404e-05, + "loss": 1.4194, + "step": 8557 + }, + { + "epoch": 0.92, + "grad_norm": 0.09332498556478858, + "learning_rate": 1.6685896013580226e-05, + "loss": 1.4105, + "step": 8558 + }, + { + "epoch": 0.92, + "grad_norm": 0.08565145674426411, + "learning_rate": 1.6641321917730268e-05, + "loss": 1.4823, + "step": 8559 + }, + { + "epoch": 0.92, + "grad_norm": 0.1172115003327829, + "learning_rate": 1.6596806430786425e-05, + "loss": 1.3143, + "step": 8560 + }, + { + "epoch": 0.92, + "grad_norm": 0.09783235169089964, + "learning_rate": 1.655234955814644e-05, + "loss": 1.4258, + "step": 8561 + }, + { + "epoch": 0.92, + "grad_norm": 0.1005044359265623, + "learning_rate": 1.650795130520072e-05, + "loss": 1.4208, + "step": 8562 + }, + { + "epoch": 0.92, + "grad_norm": 0.08384326929780318, + "learning_rate": 1.646361167733279e-05, + "loss": 1.2791, + "step": 8563 + }, + { + "epoch": 0.92, + "grad_norm": 0.09126815563432995, + "learning_rate": 1.6419330679918855e-05, + "loss": 1.4265, + "step": 8564 + }, + { + "epoch": 0.92, + "grad_norm": 0.0946576406020799, + "learning_rate": 1.637510831832828e-05, + "loss": 1.2326, + "step": 8565 + }, + { + "epoch": 0.92, + "grad_norm": 0.0927423035932863, + "learning_rate": 1.633094459792317e-05, + "loss": 1.2565, + "step": 8566 + }, + { + "epoch": 0.92, + "grad_norm": 0.09451555563163766, + "learning_rate": 1.6286839524058463e-05, + "loss": 1.4538, + "step": 8567 + }, + { + "epoch": 0.92, + "grad_norm": 0.08899655462628817, + "learning_rate": 1.6242793102082043e-05, + "loss": 1.3785, + "step": 8568 + }, + { + "epoch": 0.92, + "grad_norm": 0.09187045471105075, + "learning_rate": 1.6198805337334756e-05, + "loss": 1.3574, + "step": 8569 + }, + { + "epoch": 0.92, + "grad_norm": 0.08860419206793055, + "learning_rate": 1.6154876235150273e-05, + "loss": 1.3431, + "step": 8570 + }, + { + "epoch": 0.92, + "grad_norm": 0.08899383899424537, + "learning_rate": 1.611100580085506e-05, + "loss": 1.5686, + "step": 8571 + }, + { + "epoch": 0.92, + "grad_norm": 0.10070643337059408, + "learning_rate": 1.6067194039768584e-05, + "loss": 1.2775, + "step": 8572 + }, + { + "epoch": 0.92, + "grad_norm": 0.09644479465809108, + "learning_rate": 1.6023440957203262e-05, + "loss": 1.3781, + "step": 8573 + }, + { + "epoch": 0.92, + "grad_norm": 0.07905816945435647, + "learning_rate": 1.5979746558464236e-05, + "loss": 1.2644, + "step": 8574 + }, + { + "epoch": 0.92, + "grad_norm": 0.10817332670827906, + "learning_rate": 1.593611084884955e-05, + "loss": 1.4592, + "step": 8575 + }, + { + "epoch": 0.92, + "grad_norm": 0.08316527046152339, + "learning_rate": 1.5892533833650356e-05, + "loss": 1.305, + "step": 8576 + }, + { + "epoch": 0.92, + "grad_norm": 0.11415884426723807, + "learning_rate": 1.5849015518150377e-05, + "loss": 1.4799, + "step": 8577 + }, + { + "epoch": 0.92, + "grad_norm": 0.083268649141457, + "learning_rate": 1.5805555907626334e-05, + "loss": 1.2927, + "step": 8578 + }, + { + "epoch": 0.92, + "grad_norm": 0.0914170798789987, + "learning_rate": 1.5762155007347956e-05, + "loss": 1.3428, + "step": 8579 + }, + { + "epoch": 0.92, + "grad_norm": 0.10408842767047184, + "learning_rate": 1.57188128225777e-05, + "loss": 1.4778, + "step": 8580 + }, + { + "epoch": 0.92, + "grad_norm": 0.08966895390199751, + "learning_rate": 1.5675529358570916e-05, + "loss": 1.4545, + "step": 8581 + }, + { + "epoch": 0.92, + "grad_norm": 0.09387839113739459, + "learning_rate": 1.5632304620575965e-05, + "loss": 1.3898, + "step": 8582 + }, + { + "epoch": 0.92, + "grad_norm": 0.08271804328233938, + "learning_rate": 1.558913861383393e-05, + "loss": 1.2611, + "step": 8583 + }, + { + "epoch": 0.92, + "grad_norm": 0.09070946609853438, + "learning_rate": 1.554603134357879e-05, + "loss": 1.4368, + "step": 8584 + }, + { + "epoch": 0.92, + "grad_norm": 0.0922460182899588, + "learning_rate": 1.5502982815037648e-05, + "loss": 1.4203, + "step": 8585 + }, + { + "epoch": 0.92, + "grad_norm": 0.09415164483339611, + "learning_rate": 1.545999303343004e-05, + "loss": 1.43, + "step": 8586 + }, + { + "epoch": 0.92, + "grad_norm": 0.07968818770023066, + "learning_rate": 1.5417062003968808e-05, + "loss": 1.3758, + "step": 8587 + }, + { + "epoch": 0.92, + "grad_norm": 0.0973453065604615, + "learning_rate": 1.537418973185939e-05, + "loss": 1.4526, + "step": 8588 + }, + { + "epoch": 0.92, + "grad_norm": 0.09681877333037021, + "learning_rate": 1.53313762223003e-05, + "loss": 1.3711, + "step": 8589 + }, + { + "epoch": 0.92, + "grad_norm": 0.10431149269563894, + "learning_rate": 1.5288621480482657e-05, + "loss": 1.3885, + "step": 8590 + }, + { + "epoch": 0.92, + "grad_norm": 0.086045199743274, + "learning_rate": 1.5245925511590708e-05, + "loss": 1.4141, + "step": 8591 + }, + { + "epoch": 0.92, + "grad_norm": 0.08735933257811349, + "learning_rate": 1.5203288320801589e-05, + "loss": 1.3915, + "step": 8592 + }, + { + "epoch": 0.92, + "grad_norm": 0.09089960974137015, + "learning_rate": 1.5160709913284998e-05, + "loss": 1.3794, + "step": 8593 + }, + { + "epoch": 0.92, + "grad_norm": 0.10346871676245731, + "learning_rate": 1.5118190294203916e-05, + "loss": 1.2933, + "step": 8594 + }, + { + "epoch": 0.92, + "grad_norm": 0.09863144571471517, + "learning_rate": 1.5075729468713828e-05, + "loss": 1.4533, + "step": 8595 + }, + { + "epoch": 0.92, + "grad_norm": 0.08689322248559195, + "learning_rate": 1.5033327441963395e-05, + "loss": 1.4257, + "step": 8596 + }, + { + "epoch": 0.92, + "grad_norm": 0.09689504666913185, + "learning_rate": 1.4990984219093895e-05, + "loss": 1.308, + "step": 8597 + }, + { + "epoch": 0.92, + "grad_norm": 0.1088708301464488, + "learning_rate": 1.4948699805239719e-05, + "loss": 1.3454, + "step": 8598 + }, + { + "epoch": 0.92, + "grad_norm": 0.09341805770506104, + "learning_rate": 1.490647420552782e-05, + "loss": 1.4011, + "step": 8599 + }, + { + "epoch": 0.92, + "grad_norm": 0.09385136498735533, + "learning_rate": 1.4864307425078328e-05, + "loss": 1.4219, + "step": 8600 + }, + { + "epoch": 0.92, + "grad_norm": 0.0821232435993565, + "learning_rate": 1.4822199469004094e-05, + "loss": 1.4294, + "step": 8601 + }, + { + "epoch": 0.92, + "grad_norm": 0.08799735582406523, + "learning_rate": 1.4780150342410814e-05, + "loss": 1.395, + "step": 8602 + }, + { + "epoch": 0.92, + "grad_norm": 0.09669252271897247, + "learning_rate": 1.4738160050397132e-05, + "loss": 1.3807, + "step": 8603 + }, + { + "epoch": 0.92, + "grad_norm": 0.09423337439377573, + "learning_rate": 1.4696228598054473e-05, + "loss": 1.3328, + "step": 8604 + }, + { + "epoch": 0.93, + "grad_norm": 0.087042175661016, + "learning_rate": 1.4654355990467216e-05, + "loss": 1.2981, + "step": 8605 + }, + { + "epoch": 0.93, + "grad_norm": 0.08860134615702277, + "learning_rate": 1.4612542232712522e-05, + "loss": 1.5621, + "step": 8606 + }, + { + "epoch": 0.93, + "grad_norm": 0.08828791523559235, + "learning_rate": 1.4570787329860502e-05, + "loss": 1.3671, + "step": 8607 + }, + { + "epoch": 0.93, + "grad_norm": 0.08608911561738074, + "learning_rate": 1.4529091286973995e-05, + "loss": 1.2861, + "step": 8608 + }, + { + "epoch": 0.93, + "grad_norm": 0.08437715352144702, + "learning_rate": 1.44874541091089e-05, + "loss": 1.3159, + "step": 8609 + }, + { + "epoch": 0.93, + "grad_norm": 0.08595818844939189, + "learning_rate": 1.444587580131379e-05, + "loss": 1.4947, + "step": 8610 + }, + { + "epoch": 0.93, + "grad_norm": 0.11629088640945023, + "learning_rate": 1.4404356368630133e-05, + "loss": 1.3563, + "step": 8611 + }, + { + "epoch": 0.93, + "grad_norm": 0.09872012121874528, + "learning_rate": 1.4362895816092403e-05, + "loss": 1.2898, + "step": 8612 + }, + { + "epoch": 0.93, + "grad_norm": 0.09723751176966317, + "learning_rate": 1.4321494148727854e-05, + "loss": 1.3872, + "step": 8613 + }, + { + "epoch": 0.93, + "grad_norm": 0.08164864248624706, + "learning_rate": 1.4280151371556471e-05, + "loss": 1.3384, + "step": 8614 + }, + { + "epoch": 0.93, + "grad_norm": 0.0825178834649068, + "learning_rate": 1.4238867489591301e-05, + "loss": 1.3563, + "step": 8615 + }, + { + "epoch": 0.93, + "grad_norm": 0.09765191426189007, + "learning_rate": 1.4197642507838115e-05, + "loss": 1.2718, + "step": 8616 + }, + { + "epoch": 0.93, + "grad_norm": 0.09370234390805968, + "learning_rate": 1.4156476431295584e-05, + "loss": 1.3856, + "step": 8617 + }, + { + "epoch": 0.93, + "grad_norm": 0.08484573748097803, + "learning_rate": 1.4115369264955213e-05, + "loss": 1.4273, + "step": 8618 + }, + { + "epoch": 0.93, + "grad_norm": 0.09953841084466902, + "learning_rate": 1.407432101380146e-05, + "loss": 1.3876, + "step": 8619 + }, + { + "epoch": 0.93, + "grad_norm": 0.10160114969677866, + "learning_rate": 1.4033331682811568e-05, + "loss": 1.3629, + "step": 8620 + }, + { + "epoch": 0.93, + "grad_norm": 0.08803516529663949, + "learning_rate": 1.3992401276955502e-05, + "loss": 1.4832, + "step": 8621 + }, + { + "epoch": 0.93, + "grad_norm": 0.09318963980304762, + "learning_rate": 1.3951529801196294e-05, + "loss": 1.4404, + "step": 8622 + }, + { + "epoch": 0.93, + "grad_norm": 0.08645141350772045, + "learning_rate": 1.3910717260489869e-05, + "loss": 1.411, + "step": 8623 + }, + { + "epoch": 0.93, + "grad_norm": 0.0894926625646689, + "learning_rate": 1.3869963659784657e-05, + "loss": 1.3681, + "step": 8624 + }, + { + "epoch": 0.93, + "grad_norm": 0.09056101206876106, + "learning_rate": 1.3829269004022427e-05, + "loss": 1.4809, + "step": 8625 + }, + { + "epoch": 0.93, + "grad_norm": 0.09887299470866391, + "learning_rate": 1.3788633298137287e-05, + "loss": 1.4831, + "step": 8626 + }, + { + "epoch": 0.93, + "grad_norm": 0.13513774910578263, + "learning_rate": 1.3748056547056632e-05, + "loss": 1.5056, + "step": 8627 + }, + { + "epoch": 0.93, + "grad_norm": 0.10581823456619087, + "learning_rate": 1.3707538755700521e-05, + "loss": 1.4962, + "step": 8628 + }, + { + "epoch": 0.93, + "grad_norm": 0.084364216444337, + "learning_rate": 1.3667079928981807e-05, + "loss": 1.3145, + "step": 8629 + }, + { + "epoch": 0.93, + "grad_norm": 0.08748428403689872, + "learning_rate": 1.362668007180634e-05, + "loss": 1.4716, + "step": 8630 + }, + { + "epoch": 0.93, + "grad_norm": 0.0791241119174475, + "learning_rate": 1.3586339189072649e-05, + "loss": 1.2485, + "step": 8631 + }, + { + "epoch": 0.93, + "grad_norm": 0.08759161498800068, + "learning_rate": 1.354605728567232e-05, + "loss": 1.3997, + "step": 8632 + }, + { + "epoch": 0.93, + "grad_norm": 0.09708298148600865, + "learning_rate": 1.35058343664895e-05, + "loss": 1.4377, + "step": 8633 + }, + { + "epoch": 0.93, + "grad_norm": 0.09798526414450459, + "learning_rate": 1.3465670436401512e-05, + "loss": 1.4938, + "step": 8634 + }, + { + "epoch": 0.93, + "grad_norm": 0.10297317798540645, + "learning_rate": 1.3425565500278347e-05, + "loss": 1.377, + "step": 8635 + }, + { + "epoch": 0.93, + "grad_norm": 0.09540758310047924, + "learning_rate": 1.338551956298284e-05, + "loss": 1.4806, + "step": 8636 + }, + { + "epoch": 0.93, + "grad_norm": 0.09294268328813406, + "learning_rate": 1.334553262937066e-05, + "loss": 1.5293, + "step": 8637 + }, + { + "epoch": 0.93, + "grad_norm": 0.09074500638078706, + "learning_rate": 1.3305604704290431e-05, + "loss": 1.5192, + "step": 8638 + }, + { + "epoch": 0.93, + "grad_norm": 0.08647215940428625, + "learning_rate": 1.3265735792583499e-05, + "loss": 1.3124, + "step": 8639 + }, + { + "epoch": 0.93, + "grad_norm": 0.08799218164916595, + "learning_rate": 1.3225925899084169e-05, + "loss": 1.5912, + "step": 8640 + }, + { + "epoch": 0.93, + "grad_norm": 0.09268037997654895, + "learning_rate": 1.3186175028619518e-05, + "loss": 1.4546, + "step": 8641 + }, + { + "epoch": 0.93, + "grad_norm": 0.09057608833206314, + "learning_rate": 1.3146483186009417e-05, + "loss": 1.414, + "step": 8642 + }, + { + "epoch": 0.93, + "grad_norm": 0.09004670263797164, + "learning_rate": 1.3106850376066626e-05, + "loss": 1.3588, + "step": 8643 + }, + { + "epoch": 0.93, + "grad_norm": 0.11473962684426484, + "learning_rate": 1.3067276603596856e-05, + "loss": 1.5842, + "step": 8644 + }, + { + "epoch": 0.93, + "grad_norm": 0.10836472932270103, + "learning_rate": 1.3027761873398436e-05, + "loss": 1.2715, + "step": 8645 + }, + { + "epoch": 0.93, + "grad_norm": 0.09167158169420705, + "learning_rate": 1.2988306190262755e-05, + "loss": 1.3965, + "step": 8646 + }, + { + "epoch": 0.93, + "grad_norm": 0.08991242846948891, + "learning_rate": 1.2948909558974042e-05, + "loss": 1.4294, + "step": 8647 + }, + { + "epoch": 0.93, + "grad_norm": 0.09468638044701046, + "learning_rate": 1.2909571984309032e-05, + "loss": 1.4412, + "step": 8648 + }, + { + "epoch": 0.93, + "grad_norm": 0.08676751377745191, + "learning_rate": 1.2870293471037741e-05, + "loss": 1.4942, + "step": 8649 + }, + { + "epoch": 0.93, + "grad_norm": 0.0916862759987886, + "learning_rate": 1.283107402392275e-05, + "loss": 1.4295, + "step": 8650 + }, + { + "epoch": 0.93, + "grad_norm": 0.09650520392353956, + "learning_rate": 1.2791913647719589e-05, + "loss": 1.3463, + "step": 8651 + }, + { + "epoch": 0.93, + "grad_norm": 0.08393566526589911, + "learning_rate": 1.2752812347176512e-05, + "loss": 1.3341, + "step": 8652 + }, + { + "epoch": 0.93, + "grad_norm": 0.09170932221535763, + "learning_rate": 1.271377012703473e-05, + "loss": 1.4208, + "step": 8653 + }, + { + "epoch": 0.93, + "grad_norm": 0.08959861151947758, + "learning_rate": 1.2674786992028287e-05, + "loss": 1.3821, + "step": 8654 + }, + { + "epoch": 0.93, + "grad_norm": 0.0848972599871631, + "learning_rate": 1.2635862946883957e-05, + "loss": 1.3334, + "step": 8655 + }, + { + "epoch": 0.93, + "grad_norm": 0.0993068814281763, + "learning_rate": 1.2596997996321469e-05, + "loss": 1.2615, + "step": 8656 + }, + { + "epoch": 0.93, + "grad_norm": 0.09646313429632936, + "learning_rate": 1.2558192145053326e-05, + "loss": 1.3858, + "step": 8657 + }, + { + "epoch": 0.93, + "grad_norm": 0.09286391088063424, + "learning_rate": 1.2519445397784769e-05, + "loss": 1.3417, + "step": 8658 + }, + { + "epoch": 0.93, + "grad_norm": 0.08902398492787911, + "learning_rate": 1.2480757759214145e-05, + "loss": 1.498, + "step": 8659 + }, + { + "epoch": 0.93, + "grad_norm": 0.08915222091918729, + "learning_rate": 1.2442129234032373e-05, + "loss": 1.2159, + "step": 8660 + }, + { + "epoch": 0.93, + "grad_norm": 0.10864732924798791, + "learning_rate": 1.2403559826923205e-05, + "loss": 1.448, + "step": 8661 + }, + { + "epoch": 0.93, + "grad_norm": 0.0973476935490254, + "learning_rate": 1.236504954256351e-05, + "loss": 1.3799, + "step": 8662 + }, + { + "epoch": 0.93, + "grad_norm": 0.08686513943637508, + "learning_rate": 1.2326598385622723e-05, + "loss": 1.4464, + "step": 8663 + }, + { + "epoch": 0.93, + "grad_norm": 0.08930018544005622, + "learning_rate": 1.2288206360763055e-05, + "loss": 1.3085, + "step": 8664 + }, + { + "epoch": 0.93, + "grad_norm": 0.08339424642042234, + "learning_rate": 1.2249873472639782e-05, + "loss": 1.4383, + "step": 8665 + }, + { + "epoch": 0.93, + "grad_norm": 0.09666134438716192, + "learning_rate": 1.2211599725900913e-05, + "loss": 1.3808, + "step": 8666 + }, + { + "epoch": 0.93, + "grad_norm": 0.09117203971501421, + "learning_rate": 1.2173385125187175e-05, + "loss": 1.2604, + "step": 8667 + }, + { + "epoch": 0.93, + "grad_norm": 0.08962445862990973, + "learning_rate": 1.213522967513231e-05, + "loss": 1.3797, + "step": 8668 + }, + { + "epoch": 0.93, + "grad_norm": 0.0963226720097203, + "learning_rate": 1.2097133380362835e-05, + "loss": 1.3911, + "step": 8669 + }, + { + "epoch": 0.93, + "grad_norm": 0.08829865843109752, + "learning_rate": 1.2059096245497946e-05, + "loss": 1.5422, + "step": 8670 + }, + { + "epoch": 0.93, + "grad_norm": 0.09592687958075631, + "learning_rate": 1.202111827514979e-05, + "loss": 1.3628, + "step": 8671 + }, + { + "epoch": 0.93, + "grad_norm": 0.07879187528901951, + "learning_rate": 1.1983199473923456e-05, + "loss": 1.5028, + "step": 8672 + }, + { + "epoch": 0.93, + "grad_norm": 0.10046305256324412, + "learning_rate": 1.1945339846416603e-05, + "loss": 1.5293, + "step": 8673 + }, + { + "epoch": 0.93, + "grad_norm": 0.08752446327580514, + "learning_rate": 1.1907539397219835e-05, + "loss": 1.3339, + "step": 8674 + }, + { + "epoch": 0.93, + "grad_norm": 0.09815530338289363, + "learning_rate": 1.1869798130916654e-05, + "loss": 1.3674, + "step": 8675 + }, + { + "epoch": 0.93, + "grad_norm": 0.0921843523158939, + "learning_rate": 1.1832116052083231e-05, + "loss": 1.2522, + "step": 8676 + }, + { + "epoch": 0.93, + "grad_norm": 0.09920247734847988, + "learning_rate": 1.1794493165288745e-05, + "loss": 1.4375, + "step": 8677 + }, + { + "epoch": 0.93, + "grad_norm": 0.10373638226182667, + "learning_rate": 1.1756929475095102e-05, + "loss": 1.3585, + "step": 8678 + }, + { + "epoch": 0.93, + "grad_norm": 0.11289756227251307, + "learning_rate": 1.1719424986056936e-05, + "loss": 1.398, + "step": 8679 + }, + { + "epoch": 0.93, + "grad_norm": 0.08615684017355264, + "learning_rate": 1.168197970272189e-05, + "loss": 1.4001, + "step": 8680 + }, + { + "epoch": 0.93, + "grad_norm": 0.08292246245321147, + "learning_rate": 1.1644593629630274e-05, + "loss": 1.3374, + "step": 8681 + }, + { + "epoch": 0.93, + "grad_norm": 0.09281794835484808, + "learning_rate": 1.1607266771315295e-05, + "loss": 1.4318, + "step": 8682 + }, + { + "epoch": 0.93, + "grad_norm": 0.09380792402917534, + "learning_rate": 1.1569999132302999e-05, + "loss": 1.3245, + "step": 8683 + }, + { + "epoch": 0.93, + "grad_norm": 0.08317011645192439, + "learning_rate": 1.1532790717112162e-05, + "loss": 1.4646, + "step": 8684 + }, + { + "epoch": 0.93, + "grad_norm": 0.10464656535720604, + "learning_rate": 1.1495641530254452e-05, + "loss": 1.4131, + "step": 8685 + }, + { + "epoch": 0.93, + "grad_norm": 0.10344553432386078, + "learning_rate": 1.1458551576234322e-05, + "loss": 1.3669, + "step": 8686 + }, + { + "epoch": 0.93, + "grad_norm": 0.08569223859657236, + "learning_rate": 1.1421520859549062e-05, + "loss": 1.386, + "step": 8687 + }, + { + "epoch": 0.93, + "grad_norm": 0.09514608590411519, + "learning_rate": 1.1384549384688803e-05, + "loss": 1.4228, + "step": 8688 + }, + { + "epoch": 0.93, + "grad_norm": 0.09771515023676958, + "learning_rate": 1.1347637156136459e-05, + "loss": 1.395, + "step": 8689 + }, + { + "epoch": 0.93, + "grad_norm": 0.0772270936421878, + "learning_rate": 1.1310784178367729e-05, + "loss": 1.2789, + "step": 8690 + }, + { + "epoch": 0.93, + "grad_norm": 0.0911160570848233, + "learning_rate": 1.1273990455851203e-05, + "loss": 1.3026, + "step": 8691 + }, + { + "epoch": 0.93, + "grad_norm": 0.0914312510350616, + "learning_rate": 1.12372559930482e-05, + "loss": 1.4928, + "step": 8692 + }, + { + "epoch": 0.93, + "grad_norm": 0.09252646977879138, + "learning_rate": 1.1200580794412995e-05, + "loss": 1.4267, + "step": 8693 + }, + { + "epoch": 0.93, + "grad_norm": 0.09167015810592967, + "learning_rate": 1.1163964864392472e-05, + "loss": 1.3711, + "step": 8694 + }, + { + "epoch": 0.93, + "grad_norm": 0.08638462067121223, + "learning_rate": 1.1127408207426471e-05, + "loss": 1.4772, + "step": 8695 + }, + { + "epoch": 0.93, + "grad_norm": 0.08894168565959135, + "learning_rate": 1.109091082794761e-05, + "loss": 1.4653, + "step": 8696 + }, + { + "epoch": 0.93, + "grad_norm": 0.08528838348096536, + "learning_rate": 1.1054472730381403e-05, + "loss": 1.5519, + "step": 8697 + }, + { + "epoch": 0.94, + "grad_norm": 0.11797144062561185, + "learning_rate": 1.1018093919145988e-05, + "loss": 1.3926, + "step": 8698 + }, + { + "epoch": 0.94, + "grad_norm": 0.09838257455982573, + "learning_rate": 1.0981774398652444e-05, + "loss": 1.3492, + "step": 8699 + }, + { + "epoch": 0.94, + "grad_norm": 0.0912813666237488, + "learning_rate": 1.0945514173304693e-05, + "loss": 1.3605, + "step": 8700 + }, + { + "epoch": 0.94, + "grad_norm": 0.08843196545973088, + "learning_rate": 1.090931324749933e-05, + "loss": 1.3818, + "step": 8701 + }, + { + "epoch": 0.94, + "grad_norm": 0.09639414610841635, + "learning_rate": 1.0873171625625899e-05, + "loss": 1.5414, + "step": 8702 + }, + { + "epoch": 0.94, + "grad_norm": 0.09300844643291478, + "learning_rate": 1.0837089312066728e-05, + "loss": 1.4713, + "step": 8703 + }, + { + "epoch": 0.94, + "grad_norm": 0.0935356886871057, + "learning_rate": 1.0801066311196872e-05, + "loss": 1.3597, + "step": 8704 + }, + { + "epoch": 0.94, + "grad_norm": 0.08624681569480633, + "learning_rate": 1.0765102627384226e-05, + "loss": 1.4299, + "step": 8705 + }, + { + "epoch": 0.94, + "grad_norm": 0.09823549145427711, + "learning_rate": 1.0729198264989582e-05, + "loss": 1.4004, + "step": 8706 + }, + { + "epoch": 0.94, + "grad_norm": 0.08638494814697543, + "learning_rate": 1.0693353228366342e-05, + "loss": 1.486, + "step": 8707 + }, + { + "epoch": 0.94, + "grad_norm": 0.08204185326485428, + "learning_rate": 1.0657567521860977e-05, + "loss": 1.4409, + "step": 8708 + }, + { + "epoch": 0.94, + "grad_norm": 0.09744301403720242, + "learning_rate": 1.0621841149812572e-05, + "loss": 1.3364, + "step": 8709 + }, + { + "epoch": 0.94, + "grad_norm": 0.10000582529959977, + "learning_rate": 1.0586174116552993e-05, + "loss": 1.4559, + "step": 8710 + }, + { + "epoch": 0.94, + "grad_norm": 0.09744949985617589, + "learning_rate": 1.0550566426407115e-05, + "loss": 1.3111, + "step": 8711 + }, + { + "epoch": 0.94, + "grad_norm": 0.08195953839293012, + "learning_rate": 1.0515018083692484e-05, + "loss": 1.443, + "step": 8712 + }, + { + "epoch": 0.94, + "grad_norm": 0.09149345640565289, + "learning_rate": 1.0479529092719375e-05, + "loss": 1.307, + "step": 8713 + }, + { + "epoch": 0.94, + "grad_norm": 0.09314773244510169, + "learning_rate": 1.0444099457791012e-05, + "loss": 1.3025, + "step": 8714 + }, + { + "epoch": 0.94, + "grad_norm": 0.07480106948838042, + "learning_rate": 1.0408729183203403e-05, + "loss": 1.3991, + "step": 8715 + }, + { + "epoch": 0.94, + "grad_norm": 0.0877240164790989, + "learning_rate": 1.0373418273245228e-05, + "loss": 1.3505, + "step": 8716 + }, + { + "epoch": 0.94, + "grad_norm": 0.09580152668794165, + "learning_rate": 1.0338166732198062e-05, + "loss": 1.4759, + "step": 8717 + }, + { + "epoch": 0.94, + "grad_norm": 0.09232784222569064, + "learning_rate": 1.0302974564336265e-05, + "loss": 1.333, + "step": 8718 + }, + { + "epoch": 0.94, + "grad_norm": 0.09517950722743784, + "learning_rate": 1.026784177392709e-05, + "loss": 1.4041, + "step": 8719 + }, + { + "epoch": 0.94, + "grad_norm": 0.0870973350453796, + "learning_rate": 1.023276836523046e-05, + "loss": 1.4549, + "step": 8720 + }, + { + "epoch": 0.94, + "grad_norm": 0.08866760352849476, + "learning_rate": 1.0197754342499199e-05, + "loss": 1.3589, + "step": 8721 + }, + { + "epoch": 0.94, + "grad_norm": 0.09513227232510038, + "learning_rate": 1.0162799709978743e-05, + "loss": 1.3897, + "step": 8722 + }, + { + "epoch": 0.94, + "grad_norm": 0.09371045515382317, + "learning_rate": 1.0127904471907589e-05, + "loss": 1.4099, + "step": 8723 + }, + { + "epoch": 0.94, + "grad_norm": 0.08614066611489142, + "learning_rate": 1.0093068632516855e-05, + "loss": 1.3689, + "step": 8724 + }, + { + "epoch": 0.94, + "grad_norm": 0.10384855286320732, + "learning_rate": 1.0058292196030549e-05, + "loss": 1.4494, + "step": 8725 + }, + { + "epoch": 0.94, + "grad_norm": 0.08381647067968226, + "learning_rate": 1.0023575166665355e-05, + "loss": 1.4584, + "step": 8726 + }, + { + "epoch": 0.94, + "grad_norm": 0.10300897484233143, + "learning_rate": 9.98891754863085e-06, + "loss": 1.3709, + "step": 8727 + }, + { + "epoch": 0.94, + "grad_norm": 0.10475793362232905, + "learning_rate": 9.954319346129503e-06, + "loss": 1.2983, + "step": 8728 + }, + { + "epoch": 0.94, + "grad_norm": 0.08644278946240058, + "learning_rate": 9.919780563356295e-06, + "loss": 1.3854, + "step": 8729 + }, + { + "epoch": 0.94, + "grad_norm": 0.09194633055680919, + "learning_rate": 9.88530120449932e-06, + "loss": 1.4767, + "step": 8730 + }, + { + "epoch": 0.94, + "grad_norm": 0.09169105507664722, + "learning_rate": 9.850881273739231e-06, + "loss": 1.5221, + "step": 8731 + }, + { + "epoch": 0.94, + "grad_norm": 0.0917619755532548, + "learning_rate": 9.816520775249583e-06, + "loss": 1.4147, + "step": 8732 + }, + { + "epoch": 0.94, + "grad_norm": 0.08201196705016786, + "learning_rate": 9.782219713196705e-06, + "loss": 1.329, + "step": 8733 + }, + { + "epoch": 0.94, + "grad_norm": 0.09267725476057265, + "learning_rate": 9.747978091739774e-06, + "loss": 1.4925, + "step": 8734 + }, + { + "epoch": 0.94, + "grad_norm": 0.08967739621306049, + "learning_rate": 9.713795915030577e-06, + "loss": 1.4223, + "step": 8735 + }, + { + "epoch": 0.94, + "grad_norm": 0.08957521459912487, + "learning_rate": 9.679673187214022e-06, + "loss": 1.4221, + "step": 8736 + }, + { + "epoch": 0.94, + "grad_norm": 0.09355925735987494, + "learning_rate": 9.64560991242741e-06, + "loss": 1.5056, + "step": 8737 + }, + { + "epoch": 0.94, + "grad_norm": 0.08319093196250256, + "learning_rate": 9.611606094801052e-06, + "loss": 1.5389, + "step": 8738 + }, + { + "epoch": 0.94, + "grad_norm": 0.07375547083039735, + "learning_rate": 9.577661738458143e-06, + "loss": 1.4192, + "step": 8739 + }, + { + "epoch": 0.94, + "grad_norm": 0.08483871899049837, + "learning_rate": 9.543776847514507e-06, + "loss": 1.5044, + "step": 8740 + }, + { + "epoch": 0.94, + "grad_norm": 0.09976069792945877, + "learning_rate": 9.509951426078745e-06, + "loss": 1.4148, + "step": 8741 + }, + { + "epoch": 0.94, + "grad_norm": 0.08695437000305326, + "learning_rate": 9.476185478252352e-06, + "loss": 1.367, + "step": 8742 + }, + { + "epoch": 0.94, + "grad_norm": 0.09822966313776028, + "learning_rate": 9.442479008129557e-06, + "loss": 1.479, + "step": 8743 + }, + { + "epoch": 0.94, + "grad_norm": 0.0886525612017647, + "learning_rate": 9.40883201979742e-06, + "loss": 1.4471, + "step": 8744 + }, + { + "epoch": 0.94, + "grad_norm": 0.08808052951426312, + "learning_rate": 9.37524451733568e-06, + "loss": 1.2375, + "step": 8745 + }, + { + "epoch": 0.94, + "grad_norm": 0.09043626699549577, + "learning_rate": 9.341716504817021e-06, + "loss": 1.42, + "step": 8746 + }, + { + "epoch": 0.94, + "grad_norm": 0.09434528279859916, + "learning_rate": 9.308247986306862e-06, + "loss": 1.4542, + "step": 8747 + }, + { + "epoch": 0.94, + "grad_norm": 0.09964885817378788, + "learning_rate": 9.274838965863174e-06, + "loss": 1.3686, + "step": 8748 + }, + { + "epoch": 0.94, + "grad_norm": 0.09345635978911934, + "learning_rate": 9.241489447537111e-06, + "loss": 1.4048, + "step": 8749 + }, + { + "epoch": 0.94, + "grad_norm": 0.0984638899196532, + "learning_rate": 9.208199435372377e-06, + "loss": 1.3195, + "step": 8750 + }, + { + "epoch": 0.94, + "grad_norm": 0.08103394530212814, + "learning_rate": 9.174968933405414e-06, + "loss": 1.4755, + "step": 8751 + }, + { + "epoch": 0.94, + "grad_norm": 0.10063030829037273, + "learning_rate": 9.141797945665609e-06, + "loss": 1.3979, + "step": 8752 + }, + { + "epoch": 0.94, + "grad_norm": 0.10028878781779615, + "learning_rate": 9.108686476175133e-06, + "loss": 1.5094, + "step": 8753 + }, + { + "epoch": 0.94, + "grad_norm": 0.1027783191091369, + "learning_rate": 9.075634528948717e-06, + "loss": 1.3391, + "step": 8754 + }, + { + "epoch": 0.94, + "grad_norm": 0.08672950066908128, + "learning_rate": 9.042642107994104e-06, + "loss": 1.397, + "step": 8755 + }, + { + "epoch": 0.94, + "grad_norm": 0.08870879929526027, + "learning_rate": 9.0097092173117e-06, + "loss": 1.364, + "step": 8756 + }, + { + "epoch": 0.94, + "grad_norm": 0.09341416581700006, + "learning_rate": 8.976835860894761e-06, + "loss": 1.3041, + "step": 8757 + }, + { + "epoch": 0.94, + "grad_norm": 0.11196489344792297, + "learning_rate": 8.944022042729317e-06, + "loss": 1.4337, + "step": 8758 + }, + { + "epoch": 0.94, + "grad_norm": 0.10451525028124231, + "learning_rate": 8.91126776679413e-06, + "loss": 1.3766, + "step": 8759 + }, + { + "epoch": 0.94, + "grad_norm": 0.08593762869175031, + "learning_rate": 8.87857303706069e-06, + "loss": 1.4982, + "step": 8760 + }, + { + "epoch": 0.94, + "grad_norm": 0.07904295751893473, + "learning_rate": 8.845937857493491e-06, + "loss": 1.3474, + "step": 8761 + }, + { + "epoch": 0.94, + "grad_norm": 0.08276173777838187, + "learning_rate": 8.813362232049592e-06, + "loss": 1.4948, + "step": 8762 + }, + { + "epoch": 0.94, + "grad_norm": 0.0953114667332252, + "learning_rate": 8.780846164678836e-06, + "loss": 1.4323, + "step": 8763 + }, + { + "epoch": 0.94, + "grad_norm": 0.08730452540721823, + "learning_rate": 8.748389659324008e-06, + "loss": 1.4438, + "step": 8764 + }, + { + "epoch": 0.94, + "grad_norm": 0.09130547003069196, + "learning_rate": 8.715992719920574e-06, + "loss": 1.5007, + "step": 8765 + }, + { + "epoch": 0.94, + "grad_norm": 0.10602039868255145, + "learning_rate": 8.683655350396724e-06, + "loss": 1.4545, + "step": 8766 + }, + { + "epoch": 0.94, + "grad_norm": 0.08236441196708506, + "learning_rate": 8.651377554673434e-06, + "loss": 1.4182, + "step": 8767 + }, + { + "epoch": 0.94, + "grad_norm": 0.08527216278876118, + "learning_rate": 8.619159336664683e-06, + "loss": 1.4018, + "step": 8768 + }, + { + "epoch": 0.94, + "grad_norm": 0.08511637576566533, + "learning_rate": 8.587000700276792e-06, + "loss": 1.5263, + "step": 8769 + }, + { + "epoch": 0.94, + "grad_norm": 0.08838260890517773, + "learning_rate": 8.554901649409252e-06, + "loss": 1.4725, + "step": 8770 + }, + { + "epoch": 0.94, + "grad_norm": 0.08090486237576237, + "learning_rate": 8.522862187954172e-06, + "loss": 1.438, + "step": 8771 + }, + { + "epoch": 0.94, + "grad_norm": 0.09265052659328142, + "learning_rate": 8.490882319796389e-06, + "loss": 1.4177, + "step": 8772 + }, + { + "epoch": 0.94, + "grad_norm": 0.0914917526601237, + "learning_rate": 8.458962048813634e-06, + "loss": 1.3132, + "step": 8773 + }, + { + "epoch": 0.94, + "grad_norm": 0.08859690630090684, + "learning_rate": 8.427101378876367e-06, + "loss": 1.6076, + "step": 8774 + }, + { + "epoch": 0.94, + "grad_norm": 0.08901781244111316, + "learning_rate": 8.39530031384772e-06, + "loss": 1.4397, + "step": 8775 + }, + { + "epoch": 0.94, + "grad_norm": 0.10086813132432688, + "learning_rate": 8.36355885758372e-06, + "loss": 1.449, + "step": 8776 + }, + { + "epoch": 0.94, + "grad_norm": 0.09140538883618672, + "learning_rate": 8.331877013933176e-06, + "loss": 1.1962, + "step": 8777 + }, + { + "epoch": 0.94, + "grad_norm": 0.08401749894727048, + "learning_rate": 8.300254786737627e-06, + "loss": 1.4226, + "step": 8778 + }, + { + "epoch": 0.94, + "grad_norm": 0.09632136067271782, + "learning_rate": 8.268692179831228e-06, + "loss": 1.3028, + "step": 8779 + }, + { + "epoch": 0.94, + "grad_norm": 0.08786148939407268, + "learning_rate": 8.237189197041195e-06, + "loss": 1.5339, + "step": 8780 + }, + { + "epoch": 0.94, + "grad_norm": 0.08665394568612342, + "learning_rate": 8.205745842187361e-06, + "loss": 1.4943, + "step": 8781 + }, + { + "epoch": 0.94, + "grad_norm": 0.10345725784687435, + "learning_rate": 8.17436211908229e-06, + "loss": 1.3102, + "step": 8782 + }, + { + "epoch": 0.94, + "grad_norm": 0.08766312295144872, + "learning_rate": 8.14303803153138e-06, + "loss": 1.5278, + "step": 8783 + }, + { + "epoch": 0.94, + "grad_norm": 0.08302435920388034, + "learning_rate": 8.111773583332872e-06, + "loss": 1.3101, + "step": 8784 + }, + { + "epoch": 0.94, + "grad_norm": 0.08086046970243763, + "learning_rate": 8.080568778277509e-06, + "loss": 1.3628, + "step": 8785 + }, + { + "epoch": 0.94, + "grad_norm": 0.07696214120226756, + "learning_rate": 8.049423620149154e-06, + "loss": 1.3877, + "step": 8786 + }, + { + "epoch": 0.94, + "grad_norm": 0.08412889864646508, + "learning_rate": 8.018338112724178e-06, + "loss": 1.3764, + "step": 8787 + }, + { + "epoch": 0.94, + "grad_norm": 0.11455424686467591, + "learning_rate": 7.987312259771839e-06, + "loss": 1.4572, + "step": 8788 + }, + { + "epoch": 0.94, + "grad_norm": 0.09261780785683074, + "learning_rate": 7.956346065054132e-06, + "loss": 1.5225, + "step": 8789 + }, + { + "epoch": 0.94, + "grad_norm": 0.09033658557442632, + "learning_rate": 7.925439532325772e-06, + "loss": 1.4358, + "step": 8790 + }, + { + "epoch": 0.95, + "grad_norm": 0.09358776230596985, + "learning_rate": 7.894592665334265e-06, + "loss": 1.4481, + "step": 8791 + }, + { + "epoch": 0.95, + "grad_norm": 0.10293383897592415, + "learning_rate": 7.863805467820006e-06, + "loss": 1.2969, + "step": 8792 + }, + { + "epoch": 0.95, + "grad_norm": 0.09437810328651862, + "learning_rate": 7.833077943515955e-06, + "loss": 1.3577, + "step": 8793 + }, + { + "epoch": 0.95, + "grad_norm": 0.09550001282233132, + "learning_rate": 7.80241009614796e-06, + "loss": 1.362, + "step": 8794 + }, + { + "epoch": 0.95, + "grad_norm": 0.103722456788033, + "learning_rate": 7.771801929434608e-06, + "loss": 1.5316, + "step": 8795 + }, + { + "epoch": 0.95, + "grad_norm": 0.09672039793209836, + "learning_rate": 7.74125344708726e-06, + "loss": 1.4404, + "step": 8796 + }, + { + "epoch": 0.95, + "grad_norm": 0.11381378663777032, + "learning_rate": 7.71076465281001e-06, + "loss": 1.3989, + "step": 8797 + }, + { + "epoch": 0.95, + "grad_norm": 0.0943974351742112, + "learning_rate": 7.680335550299678e-06, + "loss": 1.5955, + "step": 8798 + }, + { + "epoch": 0.95, + "grad_norm": 0.08554720301543454, + "learning_rate": 7.649966143245979e-06, + "loss": 1.3918, + "step": 8799 + }, + { + "epoch": 0.95, + "grad_norm": 0.10710528712328721, + "learning_rate": 7.619656435331301e-06, + "loss": 1.494, + "step": 8800 + }, + { + "epoch": 0.95, + "grad_norm": 0.09189467724009155, + "learning_rate": 7.589406430230705e-06, + "loss": 1.3021, + "step": 8801 + }, + { + "epoch": 0.95, + "grad_norm": 0.10057691654011505, + "learning_rate": 7.559216131612256e-06, + "loss": 1.3472, + "step": 8802 + }, + { + "epoch": 0.95, + "grad_norm": 0.0821028096426198, + "learning_rate": 7.529085543136472e-06, + "loss": 1.4207, + "step": 8803 + }, + { + "epoch": 0.95, + "grad_norm": 0.0782184005263043, + "learning_rate": 7.499014668456872e-06, + "loss": 1.3374, + "step": 8804 + }, + { + "epoch": 0.95, + "grad_norm": 0.0876206795684908, + "learning_rate": 7.469003511219707e-06, + "loss": 1.4071, + "step": 8805 + }, + { + "epoch": 0.95, + "grad_norm": 0.08655313629027807, + "learning_rate": 7.4390520750638455e-06, + "loss": 1.3533, + "step": 8806 + }, + { + "epoch": 0.95, + "grad_norm": 0.0862730449274471, + "learning_rate": 7.409160363621048e-06, + "loss": 1.3673, + "step": 8807 + }, + { + "epoch": 0.95, + "grad_norm": 0.08486986348752033, + "learning_rate": 7.379328380515804e-06, + "loss": 1.3241, + "step": 8808 + }, + { + "epoch": 0.95, + "grad_norm": 0.09760545363166366, + "learning_rate": 7.349556129365276e-06, + "loss": 1.3747, + "step": 8809 + }, + { + "epoch": 0.95, + "grad_norm": 0.10666679386119536, + "learning_rate": 7.31984361377952e-06, + "loss": 1.3719, + "step": 8810 + }, + { + "epoch": 0.95, + "grad_norm": 0.10084244049618121, + "learning_rate": 7.29019083736121e-06, + "loss": 1.3092, + "step": 8811 + }, + { + "epoch": 0.95, + "grad_norm": 0.09202231083032954, + "learning_rate": 7.260597803705971e-06, + "loss": 1.2991, + "step": 8812 + }, + { + "epoch": 0.95, + "grad_norm": 0.10290993988751386, + "learning_rate": 7.23106451640193e-06, + "loss": 1.4151, + "step": 8813 + }, + { + "epoch": 0.95, + "grad_norm": 0.08561006665964292, + "learning_rate": 7.201590979030115e-06, + "loss": 1.4943, + "step": 8814 + }, + { + "epoch": 0.95, + "grad_norm": 0.0865490043036091, + "learning_rate": 7.172177195164386e-06, + "loss": 1.3253, + "step": 8815 + }, + { + "epoch": 0.95, + "grad_norm": 0.09230136981204798, + "learning_rate": 7.1428231683711705e-06, + "loss": 1.3461, + "step": 8816 + }, + { + "epoch": 0.95, + "grad_norm": 0.09945844805144954, + "learning_rate": 7.113528902209787e-06, + "loss": 1.2755, + "step": 8817 + }, + { + "epoch": 0.95, + "grad_norm": 0.09948172641397596, + "learning_rate": 7.084294400232283e-06, + "loss": 1.3729, + "step": 8818 + }, + { + "epoch": 0.95, + "grad_norm": 0.0992359215039714, + "learning_rate": 7.055119665983378e-06, + "loss": 1.4723, + "step": 8819 + }, + { + "epoch": 0.95, + "grad_norm": 0.10929611354225881, + "learning_rate": 7.026004703000688e-06, + "loss": 1.389, + "step": 8820 + }, + { + "epoch": 0.95, + "grad_norm": 0.08294067051563485, + "learning_rate": 6.996949514814499e-06, + "loss": 1.2973, + "step": 8821 + }, + { + "epoch": 0.95, + "grad_norm": 0.11065562311316486, + "learning_rate": 6.967954104947771e-06, + "loss": 1.4557, + "step": 8822 + }, + { + "epoch": 0.95, + "grad_norm": 0.10295369467271562, + "learning_rate": 6.939018476916359e-06, + "loss": 1.3141, + "step": 8823 + }, + { + "epoch": 0.95, + "grad_norm": 0.09378386174057356, + "learning_rate": 6.910142634228789e-06, + "loss": 1.3767, + "step": 8824 + }, + { + "epoch": 0.95, + "grad_norm": 0.09977427325258244, + "learning_rate": 6.8813265803863715e-06, + "loss": 1.5256, + "step": 8825 + }, + { + "epoch": 0.95, + "grad_norm": 0.10057069838924039, + "learning_rate": 6.852570318883145e-06, + "loss": 1.4013, + "step": 8826 + }, + { + "epoch": 0.95, + "grad_norm": 0.09187264561880268, + "learning_rate": 6.8238738532059306e-06, + "loss": 1.4982, + "step": 8827 + }, + { + "epoch": 0.95, + "grad_norm": 0.11291029430421218, + "learning_rate": 6.795237186834169e-06, + "loss": 1.3988, + "step": 8828 + }, + { + "epoch": 0.95, + "grad_norm": 0.08897163991167156, + "learning_rate": 6.766660323240303e-06, + "loss": 1.5356, + "step": 8829 + }, + { + "epoch": 0.95, + "grad_norm": 0.09496027710275812, + "learning_rate": 6.738143265889285e-06, + "loss": 1.4172, + "step": 8830 + }, + { + "epoch": 0.95, + "grad_norm": 0.08272734550929048, + "learning_rate": 6.709686018238958e-06, + "loss": 1.3842, + "step": 8831 + }, + { + "epoch": 0.95, + "grad_norm": 0.102791559328205, + "learning_rate": 6.681288583739786e-06, + "loss": 1.4251, + "step": 8832 + }, + { + "epoch": 0.95, + "grad_norm": 0.108585952617785, + "learning_rate": 6.652950965835181e-06, + "loss": 1.3605, + "step": 8833 + }, + { + "epoch": 0.95, + "grad_norm": 0.10555789963522569, + "learning_rate": 6.624673167961004e-06, + "loss": 1.4318, + "step": 8834 + }, + { + "epoch": 0.95, + "grad_norm": 0.0852695401594645, + "learning_rate": 6.59645519354618e-06, + "loss": 1.3621, + "step": 8835 + }, + { + "epoch": 0.95, + "grad_norm": 0.09853582405605046, + "learning_rate": 6.568297046012195e-06, + "loss": 1.3444, + "step": 8836 + }, + { + "epoch": 0.95, + "grad_norm": 0.08919377889013902, + "learning_rate": 6.540198728773262e-06, + "loss": 1.3925, + "step": 8837 + }, + { + "epoch": 0.95, + "grad_norm": 0.08643002957608505, + "learning_rate": 6.512160245236431e-06, + "loss": 1.3932, + "step": 8838 + }, + { + "epoch": 0.95, + "grad_norm": 0.08706342127575102, + "learning_rate": 6.484181598801541e-06, + "loss": 1.4577, + "step": 8839 + }, + { + "epoch": 0.95, + "grad_norm": 0.0957071682990628, + "learning_rate": 6.4562627928610455e-06, + "loss": 1.3636, + "step": 8840 + }, + { + "epoch": 0.95, + "grad_norm": 0.09110174617138717, + "learning_rate": 6.4284038308001224e-06, + "loss": 1.3166, + "step": 8841 + }, + { + "epoch": 0.95, + "grad_norm": 0.09374793989183004, + "learning_rate": 6.400604715996905e-06, + "loss": 1.3368, + "step": 8842 + }, + { + "epoch": 0.95, + "grad_norm": 0.11342827940082034, + "learning_rate": 6.372865451822085e-06, + "loss": 1.2744, + "step": 8843 + }, + { + "epoch": 0.95, + "grad_norm": 0.09499693290134015, + "learning_rate": 6.345186041639028e-06, + "loss": 1.4215, + "step": 8844 + }, + { + "epoch": 0.95, + "grad_norm": 0.08185468984627872, + "learning_rate": 6.317566488804105e-06, + "loss": 1.4175, + "step": 8845 + }, + { + "epoch": 0.95, + "grad_norm": 0.09568579794257903, + "learning_rate": 6.290006796666248e-06, + "loss": 1.5329, + "step": 8846 + }, + { + "epoch": 0.95, + "grad_norm": 0.08883975464927474, + "learning_rate": 6.262506968567061e-06, + "loss": 1.3307, + "step": 8847 + }, + { + "epoch": 0.95, + "grad_norm": 0.08740844984942242, + "learning_rate": 6.2350670078411555e-06, + "loss": 1.3654, + "step": 8848 + }, + { + "epoch": 0.95, + "grad_norm": 0.09062562990586284, + "learning_rate": 6.207686917815592e-06, + "loss": 1.4515, + "step": 8849 + }, + { + "epoch": 0.95, + "grad_norm": 0.08741639466532415, + "learning_rate": 6.1803667018103805e-06, + "loss": 1.4691, + "step": 8850 + }, + { + "epoch": 0.95, + "grad_norm": 0.10116002798678095, + "learning_rate": 6.15310636313815e-06, + "loss": 1.3397, + "step": 8851 + }, + { + "epoch": 0.95, + "grad_norm": 0.09585867618812836, + "learning_rate": 6.125905905104368e-06, + "loss": 1.3576, + "step": 8852 + }, + { + "epoch": 0.95, + "grad_norm": 0.11018206546669015, + "learning_rate": 6.098765331007061e-06, + "loss": 1.3525, + "step": 8853 + }, + { + "epoch": 0.95, + "grad_norm": 0.08725015747575635, + "learning_rate": 6.0716846441372655e-06, + "loss": 1.3746, + "step": 8854 + }, + { + "epoch": 0.95, + "grad_norm": 0.09572859468974974, + "learning_rate": 6.04466384777852e-06, + "loss": 1.4461, + "step": 8855 + }, + { + "epoch": 0.95, + "grad_norm": 0.09604637593854443, + "learning_rate": 6.017702945207149e-06, + "loss": 1.529, + "step": 8856 + }, + { + "epoch": 0.95, + "grad_norm": 0.09035469492567368, + "learning_rate": 5.990801939692314e-06, + "loss": 1.4351, + "step": 8857 + }, + { + "epoch": 0.95, + "grad_norm": 0.0945266969821737, + "learning_rate": 5.963960834495907e-06, + "loss": 1.3995, + "step": 8858 + }, + { + "epoch": 0.95, + "grad_norm": 0.08392027138389029, + "learning_rate": 5.937179632872436e-06, + "loss": 1.3874, + "step": 8859 + }, + { + "epoch": 0.95, + "grad_norm": 0.09700625587597556, + "learning_rate": 5.9104583380691914e-06, + "loss": 1.4869, + "step": 8860 + }, + { + "epoch": 0.95, + "grad_norm": 0.08440155967829839, + "learning_rate": 5.883796953326359e-06, + "loss": 1.3654, + "step": 8861 + }, + { + "epoch": 0.95, + "grad_norm": 0.08299105068275839, + "learning_rate": 5.85719548187652e-06, + "loss": 1.4762, + "step": 8862 + }, + { + "epoch": 0.95, + "grad_norm": 0.09878299275346496, + "learning_rate": 5.8306539269453725e-06, + "loss": 1.4466, + "step": 8863 + }, + { + "epoch": 0.95, + "grad_norm": 0.08433678896475492, + "learning_rate": 5.804172291751064e-06, + "loss": 1.3548, + "step": 8864 + }, + { + "epoch": 0.95, + "grad_norm": 0.09183326720148738, + "learning_rate": 5.777750579504581e-06, + "loss": 1.3366, + "step": 8865 + }, + { + "epoch": 0.95, + "grad_norm": 0.08709653574788266, + "learning_rate": 5.7513887934096954e-06, + "loss": 1.4318, + "step": 8866 + }, + { + "epoch": 0.95, + "grad_norm": 0.09201497815107694, + "learning_rate": 5.725086936662905e-06, + "loss": 1.3497, + "step": 8867 + }, + { + "epoch": 0.95, + "grad_norm": 0.11283563450980807, + "learning_rate": 5.698845012453324e-06, + "loss": 1.4845, + "step": 8868 + }, + { + "epoch": 0.95, + "grad_norm": 0.09315366127124443, + "learning_rate": 5.672663023962854e-06, + "loss": 1.4852, + "step": 8869 + }, + { + "epoch": 0.95, + "grad_norm": 0.08316494710549423, + "learning_rate": 5.646540974366287e-06, + "loss": 1.4266, + "step": 8870 + }, + { + "epoch": 0.95, + "grad_norm": 0.09282115286451564, + "learning_rate": 5.620478866830814e-06, + "loss": 1.4259, + "step": 8871 + }, + { + "epoch": 0.95, + "grad_norm": 0.10159676345240197, + "learning_rate": 5.594476704516738e-06, + "loss": 1.3527, + "step": 8872 + }, + { + "epoch": 0.95, + "grad_norm": 0.09893435119118751, + "learning_rate": 5.5685344905768156e-06, + "loss": 1.5138, + "step": 8873 + }, + { + "epoch": 0.95, + "grad_norm": 0.09329564540974669, + "learning_rate": 5.542652228156697e-06, + "loss": 1.464, + "step": 8874 + }, + { + "epoch": 0.95, + "grad_norm": 0.09485592930638007, + "learning_rate": 5.516829920394595e-06, + "loss": 1.4764, + "step": 8875 + }, + { + "epoch": 0.95, + "grad_norm": 0.09133805600317245, + "learning_rate": 5.4910675704216154e-06, + "loss": 1.5208, + "step": 8876 + }, + { + "epoch": 0.95, + "grad_norm": 0.08495275003072635, + "learning_rate": 5.4653651813615366e-06, + "loss": 1.388, + "step": 8877 + }, + { + "epoch": 0.95, + "grad_norm": 0.08760989309934672, + "learning_rate": 5.439722756330812e-06, + "loss": 1.4567, + "step": 8878 + }, + { + "epoch": 0.95, + "grad_norm": 0.09048100636363733, + "learning_rate": 5.41414029843873e-06, + "loss": 1.4263, + "step": 8879 + }, + { + "epoch": 0.95, + "grad_norm": 0.09197041411794825, + "learning_rate": 5.388617810787255e-06, + "loss": 1.3345, + "step": 8880 + }, + { + "epoch": 0.95, + "grad_norm": 0.10020996050813048, + "learning_rate": 5.3631552964710784e-06, + "loss": 1.4928, + "step": 8881 + }, + { + "epoch": 0.95, + "grad_norm": 0.09509674597731703, + "learning_rate": 5.337752758577563e-06, + "loss": 1.5502, + "step": 8882 + }, + { + "epoch": 0.95, + "grad_norm": 0.09385515688158769, + "learning_rate": 5.312410200186857e-06, + "loss": 1.4702, + "step": 8883 + }, + { + "epoch": 0.96, + "grad_norm": 0.08914837524362412, + "learning_rate": 5.287127624371946e-06, + "loss": 1.4887, + "step": 8884 + }, + { + "epoch": 0.96, + "grad_norm": 0.09664353503901052, + "learning_rate": 5.2619050341982665e-06, + "loss": 1.5038, + "step": 8885 + }, + { + "epoch": 0.96, + "grad_norm": 0.09159510331470239, + "learning_rate": 5.236742432724262e-06, + "loss": 1.5056, + "step": 8886 + }, + { + "epoch": 0.96, + "grad_norm": 0.08356282150014831, + "learning_rate": 5.2116398230009355e-06, + "loss": 1.3525, + "step": 8887 + }, + { + "epoch": 0.96, + "grad_norm": 0.08806156912501846, + "learning_rate": 5.1865972080720195e-06, + "loss": 1.4004, + "step": 8888 + }, + { + "epoch": 0.96, + "grad_norm": 0.09457677127411936, + "learning_rate": 5.16161459097414e-06, + "loss": 1.4904, + "step": 8889 + }, + { + "epoch": 0.96, + "grad_norm": 0.09236016076443747, + "learning_rate": 5.136691974736429e-06, + "loss": 1.4403, + "step": 8890 + }, + { + "epoch": 0.96, + "grad_norm": 0.09057456912244687, + "learning_rate": 5.11182936238086e-06, + "loss": 1.2834, + "step": 8891 + }, + { + "epoch": 0.96, + "grad_norm": 0.09408635409281225, + "learning_rate": 5.087026756922187e-06, + "loss": 1.4948, + "step": 8892 + }, + { + "epoch": 0.96, + "grad_norm": 0.08715396491466379, + "learning_rate": 5.062284161367669e-06, + "loss": 1.2594, + "step": 8893 + }, + { + "epoch": 0.96, + "grad_norm": 0.09798288199490669, + "learning_rate": 5.037601578717521e-06, + "loss": 1.3734, + "step": 8894 + }, + { + "epoch": 0.96, + "grad_norm": 0.09873923966438046, + "learning_rate": 5.01297901196468e-06, + "loss": 1.5297, + "step": 8895 + }, + { + "epoch": 0.96, + "grad_norm": 0.09524621935613473, + "learning_rate": 4.9884164640944806e-06, + "loss": 1.3661, + "step": 8896 + }, + { + "epoch": 0.96, + "grad_norm": 0.08793095352213838, + "learning_rate": 4.963913938085374e-06, + "loss": 1.3534, + "step": 8897 + }, + { + "epoch": 0.96, + "grad_norm": 0.08738144033221784, + "learning_rate": 4.939471436908427e-06, + "loss": 1.4034, + "step": 8898 + }, + { + "epoch": 0.96, + "grad_norm": 0.08983226630169384, + "learning_rate": 4.915088963527214e-06, + "loss": 1.2978, + "step": 8899 + }, + { + "epoch": 0.96, + "grad_norm": 0.09660666651070206, + "learning_rate": 4.890766520898315e-06, + "loss": 1.3553, + "step": 8900 + }, + { + "epoch": 0.96, + "grad_norm": 0.0857590426342619, + "learning_rate": 4.866504111970871e-06, + "loss": 1.4055, + "step": 8901 + }, + { + "epoch": 0.96, + "grad_norm": 0.09299660118242754, + "learning_rate": 4.8423017396868055e-06, + "loss": 1.3946, + "step": 8902 + }, + { + "epoch": 0.96, + "grad_norm": 0.09265501470007696, + "learning_rate": 4.818159406980715e-06, + "loss": 1.424, + "step": 8903 + }, + { + "epoch": 0.96, + "grad_norm": 0.0873024345487018, + "learning_rate": 4.794077116779927e-06, + "loss": 1.2523, + "step": 8904 + }, + { + "epoch": 0.96, + "grad_norm": 0.0941279091102085, + "learning_rate": 4.770054872004548e-06, + "loss": 1.3808, + "step": 8905 + }, + { + "epoch": 0.96, + "grad_norm": 0.10765797937551637, + "learning_rate": 4.74609267556736e-06, + "loss": 1.3262, + "step": 8906 + }, + { + "epoch": 0.96, + "grad_norm": 0.10585056616671595, + "learning_rate": 4.72219053037376e-06, + "loss": 1.2806, + "step": 8907 + }, + { + "epoch": 0.96, + "grad_norm": 0.10284106309103908, + "learning_rate": 4.6983484393220974e-06, + "loss": 1.3896, + "step": 8908 + }, + { + "epoch": 0.96, + "grad_norm": 0.0927126007379314, + "learning_rate": 4.674566405303227e-06, + "loss": 1.3906, + "step": 8909 + }, + { + "epoch": 0.96, + "grad_norm": 0.09260180558774815, + "learning_rate": 4.65084443120084e-06, + "loss": 1.3763, + "step": 8910 + }, + { + "epoch": 0.96, + "grad_norm": 0.09414960587863648, + "learning_rate": 4.627182519891304e-06, + "loss": 1.4103, + "step": 8911 + }, + { + "epoch": 0.96, + "grad_norm": 0.09675803044725435, + "learning_rate": 4.603580674243657e-06, + "loss": 1.3207, + "step": 8912 + }, + { + "epoch": 0.96, + "grad_norm": 0.09529069911101469, + "learning_rate": 4.580038897119776e-06, + "loss": 1.5118, + "step": 8913 + }, + { + "epoch": 0.96, + "grad_norm": 0.07988399140996678, + "learning_rate": 4.5565571913741e-06, + "loss": 1.2666, + "step": 8914 + }, + { + "epoch": 0.96, + "grad_norm": 0.08536402815347709, + "learning_rate": 4.533135559853962e-06, + "loss": 1.4486, + "step": 8915 + }, + { + "epoch": 0.96, + "grad_norm": 0.08185123235807608, + "learning_rate": 4.509774005399314e-06, + "loss": 1.388, + "step": 8916 + }, + { + "epoch": 0.96, + "grad_norm": 0.09593022780045278, + "learning_rate": 4.486472530842723e-06, + "loss": 1.5757, + "step": 8917 + }, + { + "epoch": 0.96, + "grad_norm": 0.08176639351464489, + "learning_rate": 4.463231139009649e-06, + "loss": 1.4924, + "step": 8918 + }, + { + "epoch": 0.96, + "grad_norm": 0.07809646949759338, + "learning_rate": 4.440049832718174e-06, + "loss": 1.3543, + "step": 8919 + }, + { + "epoch": 0.96, + "grad_norm": 0.09367404408882468, + "learning_rate": 4.416928614779103e-06, + "loss": 1.4076, + "step": 8920 + }, + { + "epoch": 0.96, + "grad_norm": 0.08363712680509189, + "learning_rate": 4.3938674879959726e-06, + "loss": 1.364, + "step": 8921 + }, + { + "epoch": 0.96, + "grad_norm": 0.0999487851449133, + "learning_rate": 4.370866455165046e-06, + "loss": 1.3693, + "step": 8922 + }, + { + "epoch": 0.96, + "grad_norm": 0.09668059541295229, + "learning_rate": 4.3479255190752574e-06, + "loss": 1.3714, + "step": 8923 + }, + { + "epoch": 0.96, + "grad_norm": 0.08567231889954466, + "learning_rate": 4.3250446825082166e-06, + "loss": 1.4526, + "step": 8924 + }, + { + "epoch": 0.96, + "grad_norm": 0.08982678310195207, + "learning_rate": 4.302223948238426e-06, + "loss": 1.4021, + "step": 8925 + }, + { + "epoch": 0.96, + "grad_norm": 0.09531371167655645, + "learning_rate": 4.279463319032894e-06, + "loss": 1.321, + "step": 8926 + }, + { + "epoch": 0.96, + "grad_norm": 0.08096601838384176, + "learning_rate": 4.256762797651414e-06, + "loss": 1.2234, + "step": 8927 + }, + { + "epoch": 0.96, + "grad_norm": 0.09513159682892831, + "learning_rate": 4.2341223868465615e-06, + "loss": 1.4223, + "step": 8928 + }, + { + "epoch": 0.96, + "grad_norm": 0.10007520400593102, + "learning_rate": 4.2115420893635295e-06, + "loss": 1.4104, + "step": 8929 + }, + { + "epoch": 0.96, + "grad_norm": 0.07827562474694724, + "learning_rate": 4.189021907940238e-06, + "loss": 1.3046, + "step": 8930 + }, + { + "epoch": 0.96, + "grad_norm": 0.0964464664312945, + "learning_rate": 4.166561845307393e-06, + "loss": 1.4174, + "step": 8931 + }, + { + "epoch": 0.96, + "grad_norm": 0.09301449302503279, + "learning_rate": 4.144161904188315e-06, + "loss": 1.4113, + "step": 8932 + }, + { + "epoch": 0.96, + "grad_norm": 0.09578933773852154, + "learning_rate": 4.1218220872990546e-06, + "loss": 1.3976, + "step": 8933 + }, + { + "epoch": 0.96, + "grad_norm": 0.08666688349596204, + "learning_rate": 4.099542397348444e-06, + "loss": 1.4173, + "step": 8934 + }, + { + "epoch": 0.96, + "grad_norm": 0.07548291457470417, + "learning_rate": 4.077322837037933e-06, + "loss": 1.2473, + "step": 8935 + }, + { + "epoch": 0.96, + "grad_norm": 0.08139822299567276, + "learning_rate": 4.0551634090617575e-06, + "loss": 1.3223, + "step": 8936 + }, + { + "epoch": 0.96, + "grad_norm": 0.08647643102992054, + "learning_rate": 4.033064116106766e-06, + "loss": 1.3527, + "step": 8937 + }, + { + "epoch": 0.96, + "grad_norm": 0.09598127034315897, + "learning_rate": 4.01102496085265e-06, + "loss": 1.4437, + "step": 8938 + }, + { + "epoch": 0.96, + "grad_norm": 0.0811119581027814, + "learning_rate": 3.989045945971659e-06, + "loss": 1.3237, + "step": 8939 + }, + { + "epoch": 0.96, + "grad_norm": 0.10572601439524844, + "learning_rate": 3.967127074128885e-06, + "loss": 1.3066, + "step": 8940 + }, + { + "epoch": 0.96, + "grad_norm": 0.08475692276958388, + "learning_rate": 3.945268347981979e-06, + "loss": 1.2337, + "step": 8941 + }, + { + "epoch": 0.96, + "grad_norm": 0.08313401249782255, + "learning_rate": 3.923469770181543e-06, + "loss": 1.387, + "step": 8942 + }, + { + "epoch": 0.96, + "grad_norm": 0.08968561187112413, + "learning_rate": 3.901731343370574e-06, + "loss": 1.4705, + "step": 8943 + }, + { + "epoch": 0.96, + "grad_norm": 0.08126894441730576, + "learning_rate": 3.880053070184964e-06, + "loss": 1.4054, + "step": 8944 + }, + { + "epoch": 0.96, + "grad_norm": 0.08610883749970781, + "learning_rate": 3.858434953253332e-06, + "loss": 1.4202, + "step": 8945 + }, + { + "epoch": 0.96, + "grad_norm": 0.0860092367603009, + "learning_rate": 3.836876995196914e-06, + "loss": 1.478, + "step": 8946 + }, + { + "epoch": 0.96, + "grad_norm": 0.10382181892590997, + "learning_rate": 3.815379198629732e-06, + "loss": 1.4004, + "step": 8947 + }, + { + "epoch": 0.96, + "grad_norm": 0.08284626907086795, + "learning_rate": 3.793941566158421e-06, + "loss": 1.3705, + "step": 8948 + }, + { + "epoch": 0.96, + "grad_norm": 0.08119746209465538, + "learning_rate": 3.7725641003823476e-06, + "loss": 1.437, + "step": 8949 + }, + { + "epoch": 0.96, + "grad_norm": 0.16574799900803355, + "learning_rate": 3.75124680389366e-06, + "loss": 1.4948, + "step": 8950 + }, + { + "epoch": 0.96, + "grad_norm": 0.08457090061547087, + "learning_rate": 3.7299896792771236e-06, + "loss": 1.3338, + "step": 8951 + }, + { + "epoch": 0.96, + "grad_norm": 0.08507021365850548, + "learning_rate": 3.708792729110233e-06, + "loss": 1.2983, + "step": 8952 + }, + { + "epoch": 0.96, + "grad_norm": 0.09508738545345005, + "learning_rate": 3.687655955963154e-06, + "loss": 1.3152, + "step": 8953 + }, + { + "epoch": 0.96, + "grad_norm": 0.08769842563914661, + "learning_rate": 3.666579362398892e-06, + "loss": 1.3787, + "step": 8954 + }, + { + "epoch": 0.96, + "grad_norm": 0.09713108114963706, + "learning_rate": 3.645562950973014e-06, + "loss": 1.5562, + "step": 8955 + }, + { + "epoch": 0.96, + "grad_norm": 0.09295817353089024, + "learning_rate": 3.624606724233759e-06, + "loss": 1.3034, + "step": 8956 + }, + { + "epoch": 0.96, + "grad_norm": 0.09973446153914593, + "learning_rate": 3.6037106847223168e-06, + "loss": 1.4841, + "step": 8957 + }, + { + "epoch": 0.96, + "grad_norm": 0.08914007872724189, + "learning_rate": 3.5828748349722164e-06, + "loss": 1.4014, + "step": 8958 + }, + { + "epoch": 0.96, + "grad_norm": 0.09706649081960167, + "learning_rate": 3.5620991775099363e-06, + "loss": 1.4964, + "step": 8959 + }, + { + "epoch": 0.96, + "grad_norm": 0.08729316376152077, + "learning_rate": 3.5413837148546847e-06, + "loss": 1.4441, + "step": 8960 + }, + { + "epoch": 0.96, + "grad_norm": 0.07420659828466358, + "learning_rate": 3.520728449518118e-06, + "loss": 1.4565, + "step": 8961 + }, + { + "epoch": 0.96, + "grad_norm": 0.10539629443071484, + "learning_rate": 3.5001333840049e-06, + "loss": 1.4997, + "step": 8962 + }, + { + "epoch": 0.96, + "grad_norm": 0.08586868055644019, + "learning_rate": 3.4795985208121994e-06, + "loss": 1.3828, + "step": 8963 + }, + { + "epoch": 0.96, + "grad_norm": 0.10015038752842724, + "learning_rate": 3.459123862429969e-06, + "loss": 1.3812, + "step": 8964 + }, + { + "epoch": 0.96, + "grad_norm": 0.10194468458724672, + "learning_rate": 3.438709411340779e-06, + "loss": 1.3115, + "step": 8965 + }, + { + "epoch": 0.96, + "grad_norm": 0.09206340021786165, + "learning_rate": 3.4183551700199823e-06, + "loss": 1.3946, + "step": 8966 + }, + { + "epoch": 0.96, + "grad_norm": 0.08262825186118691, + "learning_rate": 3.3980611409356044e-06, + "loss": 1.3658, + "step": 8967 + }, + { + "epoch": 0.96, + "grad_norm": 0.08863719060179712, + "learning_rate": 3.377827326548344e-06, + "loss": 1.5557, + "step": 8968 + }, + { + "epoch": 0.96, + "grad_norm": 0.08816582721504075, + "learning_rate": 3.3576537293116825e-06, + "loss": 1.3183, + "step": 8969 + }, + { + "epoch": 0.96, + "grad_norm": 0.0976051244422402, + "learning_rate": 3.337540351671664e-06, + "loss": 1.375, + "step": 8970 + }, + { + "epoch": 0.96, + "grad_norm": 0.09067808571609655, + "learning_rate": 3.317487196067115e-06, + "loss": 1.4822, + "step": 8971 + }, + { + "epoch": 0.96, + "grad_norm": 0.09566472825698379, + "learning_rate": 3.2974942649295904e-06, + "loss": 1.475, + "step": 8972 + }, + { + "epoch": 0.96, + "grad_norm": 0.08622749394764569, + "learning_rate": 3.277561560683262e-06, + "loss": 1.4134, + "step": 8973 + }, + { + "epoch": 0.96, + "grad_norm": 0.08993542829439416, + "learning_rate": 3.257689085745086e-06, + "loss": 1.3753, + "step": 8974 + }, + { + "epoch": 0.96, + "grad_norm": 0.09586665598997336, + "learning_rate": 3.237876842524634e-06, + "loss": 1.4037, + "step": 8975 + }, + { + "epoch": 0.96, + "grad_norm": 0.08857577090180915, + "learning_rate": 3.2181248334242076e-06, + "loss": 1.3484, + "step": 8976 + }, + { + "epoch": 0.97, + "grad_norm": 0.09287873616748278, + "learning_rate": 3.1984330608387793e-06, + "loss": 1.344, + "step": 8977 + }, + { + "epoch": 0.97, + "grad_norm": 0.09875558892977225, + "learning_rate": 3.1788015271561053e-06, + "loss": 1.4483, + "step": 8978 + }, + { + "epoch": 0.97, + "grad_norm": 0.09041828728366473, + "learning_rate": 3.1592302347565605e-06, + "loss": 1.3532, + "step": 8979 + }, + { + "epoch": 0.97, + "grad_norm": 0.07800414751561797, + "learning_rate": 3.1397191860132456e-06, + "loss": 1.3968, + "step": 8980 + }, + { + "epoch": 0.97, + "grad_norm": 0.08232355971655188, + "learning_rate": 3.1202683832918797e-06, + "loss": 1.3541, + "step": 8981 + }, + { + "epoch": 0.97, + "grad_norm": 0.0951746586827946, + "learning_rate": 3.1008778289509654e-06, + "loss": 1.6037, + "step": 8982 + }, + { + "epoch": 0.97, + "grad_norm": 0.08960082679565905, + "learning_rate": 3.0815475253417325e-06, + "loss": 1.4719, + "step": 8983 + }, + { + "epoch": 0.97, + "grad_norm": 0.11251667581254801, + "learning_rate": 3.0622774748079175e-06, + "loss": 1.3597, + "step": 8984 + }, + { + "epoch": 0.97, + "grad_norm": 0.09527485741233266, + "learning_rate": 3.043067679686262e-06, + "loss": 1.3548, + "step": 8985 + }, + { + "epoch": 0.97, + "grad_norm": 0.0836889698103457, + "learning_rate": 3.0239181423058483e-06, + "loss": 1.4547, + "step": 8986 + }, + { + "epoch": 0.97, + "grad_norm": 0.0796557012099375, + "learning_rate": 3.004828864988707e-06, + "loss": 1.277, + "step": 8987 + }, + { + "epoch": 0.97, + "grad_norm": 0.09149789312059337, + "learning_rate": 2.9857998500494866e-06, + "loss": 1.3898, + "step": 8988 + }, + { + "epoch": 0.97, + "grad_norm": 0.08092008180043826, + "learning_rate": 2.9668310997955083e-06, + "loss": 1.3283, + "step": 8989 + }, + { + "epoch": 0.97, + "grad_norm": 0.09915783393849893, + "learning_rate": 2.9479226165268215e-06, + "loss": 1.4008, + "step": 8990 + }, + { + "epoch": 0.97, + "grad_norm": 0.08577745745175479, + "learning_rate": 2.9290744025360915e-06, + "loss": 1.3352, + "step": 8991 + }, + { + "epoch": 0.97, + "grad_norm": 0.08612397942263612, + "learning_rate": 2.910286460108713e-06, + "loss": 1.4729, + "step": 8992 + }, + { + "epoch": 0.97, + "grad_norm": 0.07870393466324724, + "learning_rate": 2.891558791522864e-06, + "loss": 1.4015, + "step": 8993 + }, + { + "epoch": 0.97, + "grad_norm": 0.08394931460604015, + "learning_rate": 2.872891399049338e-06, + "loss": 1.2502, + "step": 8994 + }, + { + "epoch": 0.97, + "grad_norm": 0.10427009072509341, + "learning_rate": 2.854284284951547e-06, + "loss": 1.428, + "step": 8995 + }, + { + "epoch": 0.97, + "grad_norm": 0.08846241604213603, + "learning_rate": 2.8357374514856872e-06, + "loss": 1.2706, + "step": 8996 + }, + { + "epoch": 0.97, + "grad_norm": 0.09327150463271378, + "learning_rate": 2.817250900900681e-06, + "loss": 1.3801, + "step": 8997 + }, + { + "epoch": 0.97, + "grad_norm": 0.11128758227648324, + "learning_rate": 2.798824635438069e-06, + "loss": 1.3507, + "step": 8998 + }, + { + "epoch": 0.97, + "grad_norm": 0.08923177417576252, + "learning_rate": 2.780458657332008e-06, + "loss": 1.3579, + "step": 8999 + }, + { + "epoch": 0.97, + "grad_norm": 0.08947199378843616, + "learning_rate": 2.762152968809606e-06, + "loss": 1.4248, + "step": 9000 + }, + { + "epoch": 0.97, + "grad_norm": 0.09648030606466207, + "learning_rate": 2.743907572090365e-06, + "loss": 1.4384, + "step": 9001 + }, + { + "epoch": 0.97, + "grad_norm": 0.08206314363811901, + "learning_rate": 2.7257224693866266e-06, + "loss": 1.4017, + "step": 9002 + }, + { + "epoch": 0.97, + "grad_norm": 0.09363077823329279, + "learning_rate": 2.7075976629033495e-06, + "loss": 1.3157, + "step": 9003 + }, + { + "epoch": 0.97, + "grad_norm": 0.09298672001535016, + "learning_rate": 2.689533154838386e-06, + "loss": 1.3577, + "step": 9004 + }, + { + "epoch": 0.97, + "grad_norm": 0.11480343177435744, + "learning_rate": 2.671528947381929e-06, + "loss": 1.5121, + "step": 9005 + }, + { + "epoch": 0.97, + "grad_norm": 0.0905443724363777, + "learning_rate": 2.653585042717177e-06, + "loss": 1.3501, + "step": 9006 + }, + { + "epoch": 0.97, + "grad_norm": 0.0913932632121586, + "learning_rate": 2.635701443019889e-06, + "loss": 1.2952, + "step": 9007 + }, + { + "epoch": 0.97, + "grad_norm": 0.10332073678504636, + "learning_rate": 2.617878150458386e-06, + "loss": 1.4585, + "step": 9008 + }, + { + "epoch": 0.97, + "grad_norm": 0.09795339200836412, + "learning_rate": 2.600115167193995e-06, + "loss": 1.4621, + "step": 9009 + }, + { + "epoch": 0.97, + "grad_norm": 0.08852957559404895, + "learning_rate": 2.582412495380382e-06, + "loss": 1.4335, + "step": 9010 + }, + { + "epoch": 0.97, + "grad_norm": 0.09455078692555385, + "learning_rate": 2.5647701371641075e-06, + "loss": 1.4161, + "step": 9011 + }, + { + "epoch": 0.97, + "grad_norm": 0.09596235942780901, + "learning_rate": 2.547188094684405e-06, + "loss": 1.4151, + "step": 9012 + }, + { + "epoch": 0.97, + "grad_norm": 0.08374721842744182, + "learning_rate": 2.5296663700731247e-06, + "loss": 1.3216, + "step": 9013 + }, + { + "epoch": 0.97, + "grad_norm": 0.09278676414140465, + "learning_rate": 2.5122049654547897e-06, + "loss": 1.456, + "step": 9014 + }, + { + "epoch": 0.97, + "grad_norm": 0.08964255210780839, + "learning_rate": 2.494803882946761e-06, + "loss": 1.3075, + "step": 9015 + }, + { + "epoch": 0.97, + "grad_norm": 0.09620974578886025, + "learning_rate": 2.4774631246589074e-06, + "loss": 1.4034, + "step": 9016 + }, + { + "epoch": 0.97, + "grad_norm": 0.11383840109527507, + "learning_rate": 2.4601826926938242e-06, + "loss": 1.5086, + "step": 9017 + }, + { + "epoch": 0.97, + "grad_norm": 0.09144766054964178, + "learning_rate": 2.442962589146891e-06, + "loss": 1.3065, + "step": 9018 + }, + { + "epoch": 0.97, + "grad_norm": 0.08185228768258924, + "learning_rate": 2.425802816106104e-06, + "loss": 1.3369, + "step": 9019 + }, + { + "epoch": 0.97, + "grad_norm": 0.0840524727285538, + "learning_rate": 2.4087033756521328e-06, + "loss": 1.4063, + "step": 9020 + }, + { + "epoch": 0.97, + "grad_norm": 0.08907504192099573, + "learning_rate": 2.391664269858318e-06, + "loss": 1.3916, + "step": 9021 + }, + { + "epoch": 0.97, + "grad_norm": 0.09370670461297587, + "learning_rate": 2.374685500790785e-06, + "loss": 1.4576, + "step": 9022 + }, + { + "epoch": 0.97, + "grad_norm": 0.08615449445440274, + "learning_rate": 2.3577670705081654e-06, + "loss": 1.4466, + "step": 9023 + }, + { + "epoch": 0.97, + "grad_norm": 0.08779102881285784, + "learning_rate": 2.3409089810618734e-06, + "loss": 1.3638, + "step": 9024 + }, + { + "epoch": 0.97, + "grad_norm": 0.08879645514225343, + "learning_rate": 2.324111234496107e-06, + "loss": 1.477, + "step": 9025 + }, + { + "epoch": 0.97, + "grad_norm": 0.08444548732237621, + "learning_rate": 2.3073738328476255e-06, + "loss": 1.4513, + "step": 9026 + }, + { + "epoch": 0.97, + "grad_norm": 0.09078258966657334, + "learning_rate": 2.2906967781458065e-06, + "loss": 1.5172, + "step": 9027 + }, + { + "epoch": 0.97, + "grad_norm": 0.09488873511558187, + "learning_rate": 2.2740800724129206e-06, + "loss": 1.4049, + "step": 9028 + }, + { + "epoch": 0.97, + "grad_norm": 0.09875341751463869, + "learning_rate": 2.257523717663745e-06, + "loss": 1.4016, + "step": 9029 + }, + { + "epoch": 0.97, + "grad_norm": 0.11312923220297097, + "learning_rate": 2.2410277159057858e-06, + "loss": 1.2807, + "step": 9030 + }, + { + "epoch": 0.97, + "grad_norm": 0.09081658104439949, + "learning_rate": 2.2245920691392753e-06, + "loss": 1.3431, + "step": 9031 + }, + { + "epoch": 0.97, + "grad_norm": 0.09947730055048168, + "learning_rate": 2.208216779357064e-06, + "loss": 1.3545, + "step": 9032 + }, + { + "epoch": 0.97, + "grad_norm": 0.0918733894258067, + "learning_rate": 2.1919018485446753e-06, + "loss": 1.4349, + "step": 9033 + }, + { + "epoch": 0.97, + "grad_norm": 0.09406473803075574, + "learning_rate": 2.175647278680415e-06, + "loss": 1.4259, + "step": 9034 + }, + { + "epoch": 0.97, + "grad_norm": 0.08127694221980779, + "learning_rate": 2.1594530717352068e-06, + "loss": 1.3125, + "step": 9035 + }, + { + "epoch": 0.97, + "grad_norm": 0.09175464617559018, + "learning_rate": 2.1433192296725912e-06, + "loss": 1.3946, + "step": 9036 + }, + { + "epoch": 0.97, + "grad_norm": 0.09641222973699672, + "learning_rate": 2.127245754448892e-06, + "loss": 1.4488, + "step": 9037 + }, + { + "epoch": 0.97, + "grad_norm": 0.08726870720260296, + "learning_rate": 2.111232648013106e-06, + "loss": 1.3105, + "step": 9038 + }, + { + "epoch": 0.97, + "grad_norm": 0.11137866349602377, + "learning_rate": 2.0952799123068466e-06, + "loss": 1.3899, + "step": 9039 + }, + { + "epoch": 0.97, + "grad_norm": 0.08429603496474124, + "learning_rate": 2.0793875492644e-06, + "loss": 1.3003, + "step": 9040 + }, + { + "epoch": 0.97, + "grad_norm": 0.10461075575496355, + "learning_rate": 2.0635555608128354e-06, + "loss": 1.3635, + "step": 9041 + }, + { + "epoch": 0.97, + "grad_norm": 0.09558409348647286, + "learning_rate": 2.04778394887184e-06, + "loss": 1.3797, + "step": 9042 + }, + { + "epoch": 0.97, + "grad_norm": 0.09213081325338919, + "learning_rate": 2.0320727153537165e-06, + "loss": 1.4851, + "step": 9043 + }, + { + "epoch": 0.97, + "grad_norm": 0.10022093578171336, + "learning_rate": 2.016421862163498e-06, + "loss": 1.3951, + "step": 9044 + }, + { + "epoch": 0.97, + "grad_norm": 0.09523644080123797, + "learning_rate": 2.0008313911989986e-06, + "loss": 1.3995, + "step": 9045 + }, + { + "epoch": 0.97, + "grad_norm": 0.09133021078387467, + "learning_rate": 1.9853013043504844e-06, + "loss": 1.4518, + "step": 9046 + }, + { + "epoch": 0.97, + "grad_norm": 0.08688534304999465, + "learning_rate": 1.969831603501171e-06, + "loss": 1.3965, + "step": 9047 + }, + { + "epoch": 0.97, + "grad_norm": 0.08773971157963967, + "learning_rate": 1.954422290526725e-06, + "loss": 1.3931, + "step": 9048 + }, + { + "epoch": 0.97, + "grad_norm": 0.09246961888491326, + "learning_rate": 1.9390733672955406e-06, + "loss": 1.3227, + "step": 9049 + }, + { + "epoch": 0.97, + "grad_norm": 0.10713648382422544, + "learning_rate": 1.9237848356688514e-06, + "loss": 1.4001, + "step": 9050 + }, + { + "epoch": 0.97, + "grad_norm": 0.08792536366905146, + "learning_rate": 1.9085566975003963e-06, + "loss": 1.4077, + "step": 9051 + }, + { + "epoch": 0.97, + "grad_norm": 0.07995517162388631, + "learning_rate": 1.893388954636588e-06, + "loss": 1.5223, + "step": 9052 + }, + { + "epoch": 0.97, + "grad_norm": 0.10201954934078061, + "learning_rate": 1.878281608916621e-06, + "loss": 1.4578, + "step": 9053 + }, + { + "epoch": 0.97, + "grad_norm": 0.08765267905443119, + "learning_rate": 1.8632346621723085e-06, + "loss": 1.4235, + "step": 9054 + }, + { + "epoch": 0.97, + "grad_norm": 0.09134361289135108, + "learning_rate": 1.8482481162280795e-06, + "loss": 1.3734, + "step": 9055 + }, + { + "epoch": 0.97, + "grad_norm": 0.09517979343552131, + "learning_rate": 1.833321972901203e-06, + "loss": 1.4759, + "step": 9056 + }, + { + "epoch": 0.97, + "grad_norm": 0.09392044720740886, + "learning_rate": 1.8184562340014532e-06, + "loss": 1.3799, + "step": 9057 + }, + { + "epoch": 0.97, + "grad_norm": 0.08945528953340533, + "learning_rate": 1.8036509013313884e-06, + "loss": 1.4674, + "step": 9058 + }, + { + "epoch": 0.97, + "grad_norm": 0.09660244996835152, + "learning_rate": 1.7889059766862392e-06, + "loss": 1.4577, + "step": 9059 + }, + { + "epoch": 0.97, + "grad_norm": 0.08955683843704676, + "learning_rate": 1.7742214618537977e-06, + "loss": 1.3246, + "step": 9060 + }, + { + "epoch": 0.97, + "grad_norm": 0.09367894047592056, + "learning_rate": 1.7595973586145842e-06, + "loss": 1.3879, + "step": 9061 + }, + { + "epoch": 0.97, + "grad_norm": 0.09055649217132475, + "learning_rate": 1.7450336687420131e-06, + "loss": 1.3893, + "step": 9062 + }, + { + "epoch": 0.97, + "grad_norm": 0.09954889766272536, + "learning_rate": 1.7305303940017836e-06, + "loss": 1.3906, + "step": 9063 + }, + { + "epoch": 0.97, + "grad_norm": 0.08601609477859878, + "learning_rate": 1.7160875361525441e-06, + "loss": 1.41, + "step": 9064 + }, + { + "epoch": 0.97, + "grad_norm": 0.09150624684421146, + "learning_rate": 1.701705096945505e-06, + "loss": 1.4518, + "step": 9065 + }, + { + "epoch": 0.97, + "grad_norm": 0.08989981788436734, + "learning_rate": 1.6873830781246601e-06, + "loss": 1.4331, + "step": 9066 + }, + { + "epoch": 0.97, + "grad_norm": 0.08875228716445183, + "learning_rate": 1.673121481426565e-06, + "loss": 1.374, + "step": 9067 + }, + { + "epoch": 0.97, + "grad_norm": 0.08737529837010975, + "learning_rate": 1.6589203085804472e-06, + "loss": 1.4335, + "step": 9068 + }, + { + "epoch": 0.97, + "grad_norm": 0.10109421859275902, + "learning_rate": 1.6447795613083183e-06, + "loss": 1.4097, + "step": 9069 + }, + { + "epoch": 0.98, + "grad_norm": 0.09946451803794594, + "learning_rate": 1.6306992413247512e-06, + "loss": 1.3549, + "step": 9070 + }, + { + "epoch": 0.98, + "grad_norm": 0.08896478727278188, + "learning_rate": 1.6166793503370469e-06, + "loss": 1.3679, + "step": 9071 + }, + { + "epoch": 0.98, + "grad_norm": 0.10638462313903883, + "learning_rate": 1.602719890045179e-06, + "loss": 1.3335, + "step": 9072 + }, + { + "epoch": 0.98, + "grad_norm": 0.089120932283575, + "learning_rate": 1.5888208621417376e-06, + "loss": 1.3602, + "step": 9073 + }, + { + "epoch": 0.98, + "grad_norm": 0.1012988002695788, + "learning_rate": 1.5749822683120419e-06, + "loss": 1.3864, + "step": 9074 + }, + { + "epoch": 0.98, + "grad_norm": 0.10437337636041694, + "learning_rate": 1.561204110234138e-06, + "loss": 1.433, + "step": 9075 + }, + { + "epoch": 0.98, + "grad_norm": 0.09601976601444809, + "learning_rate": 1.547486389578523e-06, + "loss": 1.5101, + "step": 9076 + }, + { + "epoch": 0.98, + "grad_norm": 0.09522304132825528, + "learning_rate": 1.5338291080086441e-06, + "loss": 1.3575, + "step": 9077 + }, + { + "epoch": 0.98, + "grad_norm": 0.09616477229666578, + "learning_rate": 1.5202322671805103e-06, + "loss": 1.5156, + "step": 9078 + }, + { + "epoch": 0.98, + "grad_norm": 0.09378474058753226, + "learning_rate": 1.5066958687426914e-06, + "loss": 1.4158, + "step": 9079 + }, + { + "epoch": 0.98, + "grad_norm": 0.08739425978756746, + "learning_rate": 1.493219914336541e-06, + "loss": 1.4265, + "step": 9080 + }, + { + "epoch": 0.98, + "grad_norm": 0.098000838767873, + "learning_rate": 1.4798044055961414e-06, + "loss": 1.3646, + "step": 9081 + }, + { + "epoch": 0.98, + "grad_norm": 0.09244726842890279, + "learning_rate": 1.4664493441480797e-06, + "loss": 1.3537, + "step": 9082 + }, + { + "epoch": 0.98, + "grad_norm": 0.09582741121293425, + "learning_rate": 1.4531547316117832e-06, + "loss": 1.219, + "step": 9083 + }, + { + "epoch": 0.98, + "grad_norm": 0.10720997957465893, + "learning_rate": 1.4399205695991847e-06, + "loss": 1.4641, + "step": 9084 + }, + { + "epoch": 0.98, + "grad_norm": 0.08406612664504096, + "learning_rate": 1.4267468597150558e-06, + "loss": 1.3459, + "step": 9085 + }, + { + "epoch": 0.98, + "grad_norm": 0.09440844125905276, + "learning_rate": 1.4136336035566744e-06, + "loss": 1.5147, + "step": 9086 + }, + { + "epoch": 0.98, + "grad_norm": 0.09102998393092081, + "learning_rate": 1.4005808027141576e-06, + "loss": 1.4601, + "step": 9087 + }, + { + "epoch": 0.98, + "grad_norm": 0.10331120527508215, + "learning_rate": 1.3875884587700727e-06, + "loss": 1.3414, + "step": 9088 + }, + { + "epoch": 0.98, + "grad_norm": 0.09685017624372445, + "learning_rate": 1.3746565732999372e-06, + "loss": 1.4713, + "step": 9089 + }, + { + "epoch": 0.98, + "grad_norm": 0.09074395569266269, + "learning_rate": 1.3617851478716637e-06, + "loss": 1.3245, + "step": 9090 + }, + { + "epoch": 0.98, + "grad_norm": 0.09620486567097974, + "learning_rate": 1.348974184046059e-06, + "loss": 1.366, + "step": 9091 + }, + { + "epoch": 0.98, + "grad_norm": 0.10708772859844796, + "learning_rate": 1.3362236833763808e-06, + "loss": 1.4079, + "step": 9092 + }, + { + "epoch": 0.98, + "grad_norm": 0.0812319976321557, + "learning_rate": 1.3235336474087812e-06, + "loss": 1.3993, + "step": 9093 + }, + { + "epoch": 0.98, + "grad_norm": 0.0775704183224788, + "learning_rate": 1.3109040776819182e-06, + "loss": 1.3545, + "step": 9094 + }, + { + "epoch": 0.98, + "grad_norm": 0.0848197503815719, + "learning_rate": 1.2983349757271778e-06, + "loss": 1.3832, + "step": 9095 + }, + { + "epoch": 0.98, + "grad_norm": 0.08434932724362751, + "learning_rate": 1.285826343068619e-06, + "loss": 1.4432, + "step": 9096 + }, + { + "epoch": 0.98, + "grad_norm": 0.09075226208293427, + "learning_rate": 1.2733781812229729e-06, + "loss": 1.5216, + "step": 9097 + }, + { + "epoch": 0.98, + "grad_norm": 0.1065687767191086, + "learning_rate": 1.260990491699532e-06, + "loss": 1.4738, + "step": 9098 + }, + { + "epoch": 0.98, + "grad_norm": 0.08992583345049047, + "learning_rate": 1.248663276000428e-06, + "loss": 1.4144, + "step": 9099 + }, + { + "epoch": 0.98, + "grad_norm": 0.08435920582959416, + "learning_rate": 1.2363965356204099e-06, + "loss": 1.4694, + "step": 9100 + }, + { + "epoch": 0.98, + "grad_norm": 0.10098857061162941, + "learning_rate": 1.2241902720467324e-06, + "loss": 1.4237, + "step": 9101 + }, + { + "epoch": 0.98, + "grad_norm": 0.09986073903862418, + "learning_rate": 1.2120444867596003e-06, + "loss": 1.4478, + "step": 9102 + }, + { + "epoch": 0.98, + "grad_norm": 0.08558072461227653, + "learning_rate": 1.1999591812316135e-06, + "loss": 1.5613, + "step": 9103 + }, + { + "epoch": 0.98, + "grad_norm": 0.09939211521634202, + "learning_rate": 1.1879343569282109e-06, + "loss": 1.383, + "step": 9104 + }, + { + "epoch": 0.98, + "grad_norm": 0.09749991658975933, + "learning_rate": 1.1759700153073927e-06, + "loss": 1.4085, + "step": 9105 + }, + { + "epoch": 0.98, + "grad_norm": 0.08925710668430854, + "learning_rate": 1.1640661578199986e-06, + "loss": 1.3, + "step": 9106 + }, + { + "epoch": 0.98, + "grad_norm": 0.08628724330928919, + "learning_rate": 1.1522227859092627e-06, + "loss": 1.5635, + "step": 9107 + }, + { + "epoch": 0.98, + "grad_norm": 0.09571820500346655, + "learning_rate": 1.1404399010113696e-06, + "loss": 1.4007, + "step": 9108 + }, + { + "epoch": 0.98, + "grad_norm": 0.08511360335320821, + "learning_rate": 1.1287175045548992e-06, + "loss": 1.3939, + "step": 9109 + }, + { + "epoch": 0.98, + "grad_norm": 0.093149927054585, + "learning_rate": 1.117055597961325e-06, + "loss": 1.3354, + "step": 9110 + }, + { + "epoch": 0.98, + "grad_norm": 0.08879778321147576, + "learning_rate": 1.105454182644683e-06, + "loss": 1.4454, + "step": 9111 + }, + { + "epoch": 0.98, + "grad_norm": 0.09563179301500772, + "learning_rate": 1.0939132600116808e-06, + "loss": 1.4794, + "step": 9112 + }, + { + "epoch": 0.98, + "grad_norm": 0.08586217819702109, + "learning_rate": 1.0824328314616994e-06, + "loss": 1.4134, + "step": 9113 + }, + { + "epoch": 0.98, + "grad_norm": 0.08860707467489773, + "learning_rate": 1.0710128983867362e-06, + "loss": 1.4085, + "step": 9114 + }, + { + "epoch": 0.98, + "grad_norm": 0.09383682611238597, + "learning_rate": 1.0596534621715171e-06, + "loss": 1.4335, + "step": 9115 + }, + { + "epoch": 0.98, + "grad_norm": 0.0889687494892466, + "learning_rate": 1.04835452419344e-06, + "loss": 1.4469, + "step": 9116 + }, + { + "epoch": 0.98, + "grad_norm": 0.09343696027937952, + "learning_rate": 1.037116085822576e-06, + "loss": 1.5305, + "step": 9117 + }, + { + "epoch": 0.98, + "grad_norm": 0.10193965533313602, + "learning_rate": 1.0259381484215013e-06, + "loss": 1.3598, + "step": 9118 + }, + { + "epoch": 0.98, + "grad_norm": 0.08673396632150293, + "learning_rate": 1.0148207133456878e-06, + "loss": 1.3471, + "step": 9119 + }, + { + "epoch": 0.98, + "grad_norm": 0.09320776936813663, + "learning_rate": 1.0037637819431121e-06, + "loss": 1.1593, + "step": 9120 + }, + { + "epoch": 0.98, + "grad_norm": 0.09137352888592001, + "learning_rate": 9.927673555544804e-07, + "loss": 1.315, + "step": 9121 + }, + { + "epoch": 0.98, + "grad_norm": 0.08717919879493985, + "learning_rate": 9.818314355131697e-07, + "loss": 1.4096, + "step": 9122 + }, + { + "epoch": 0.98, + "grad_norm": 0.0928947483788626, + "learning_rate": 9.709560231451198e-07, + "loss": 1.452, + "step": 9123 + }, + { + "epoch": 0.98, + "grad_norm": 0.08813190059284774, + "learning_rate": 9.60141119769109e-07, + "loss": 1.4183, + "step": 9124 + }, + { + "epoch": 0.98, + "grad_norm": 0.08887987124924661, + "learning_rate": 9.493867266964218e-07, + "loss": 1.4121, + "step": 9125 + }, + { + "epoch": 0.98, + "grad_norm": 0.09395453002384421, + "learning_rate": 9.386928452310706e-07, + "loss": 1.426, + "step": 9126 + }, + { + "epoch": 0.98, + "grad_norm": 0.08377161772511613, + "learning_rate": 9.280594766697959e-07, + "loss": 1.4216, + "step": 9127 + }, + { + "epoch": 0.98, + "grad_norm": 0.09163942047825116, + "learning_rate": 9.174866223018441e-07, + "loss": 1.4048, + "step": 9128 + }, + { + "epoch": 0.98, + "grad_norm": 0.08910281911488153, + "learning_rate": 9.069742834092454e-07, + "loss": 1.3467, + "step": 9129 + }, + { + "epoch": 0.98, + "grad_norm": 0.09118313763931776, + "learning_rate": 8.965224612665912e-07, + "loss": 1.5055, + "step": 9130 + }, + { + "epoch": 0.98, + "grad_norm": 0.09895577173828514, + "learning_rate": 8.861311571413122e-07, + "loss": 1.4136, + "step": 9131 + }, + { + "epoch": 0.98, + "grad_norm": 0.09511323789429267, + "learning_rate": 8.75800372293345e-07, + "loss": 1.2273, + "step": 9132 + }, + { + "epoch": 0.98, + "grad_norm": 0.09517490821029329, + "learning_rate": 8.655301079752987e-07, + "loss": 1.3734, + "step": 9133 + }, + { + "epoch": 0.98, + "grad_norm": 0.08708118193105326, + "learning_rate": 8.553203654325103e-07, + "loss": 1.4706, + "step": 9134 + }, + { + "epoch": 0.98, + "grad_norm": 0.09081075474479096, + "learning_rate": 8.451711459029343e-07, + "loss": 1.2992, + "step": 9135 + }, + { + "epoch": 0.98, + "grad_norm": 0.10093740980687556, + "learning_rate": 8.350824506172527e-07, + "loss": 1.3703, + "step": 9136 + }, + { + "epoch": 0.98, + "grad_norm": 0.080606655701831, + "learning_rate": 8.250542807986538e-07, + "loss": 1.4454, + "step": 9137 + }, + { + "epoch": 0.98, + "grad_norm": 0.08759808465816735, + "learning_rate": 8.150866376631649e-07, + "loss": 1.4234, + "step": 9138 + }, + { + "epoch": 0.98, + "grad_norm": 0.09199899630041176, + "learning_rate": 8.051795224193748e-07, + "loss": 1.5712, + "step": 9139 + }, + { + "epoch": 0.98, + "grad_norm": 0.08938543020630885, + "learning_rate": 7.953329362685447e-07, + "loss": 1.4132, + "step": 9140 + }, + { + "epoch": 0.98, + "grad_norm": 0.09203648239831115, + "learning_rate": 7.855468804046084e-07, + "loss": 1.4387, + "step": 9141 + }, + { + "epoch": 0.98, + "grad_norm": 0.0975507858137972, + "learning_rate": 7.758213560141725e-07, + "loss": 1.3312, + "step": 9142 + }, + { + "epoch": 0.98, + "grad_norm": 0.09787399811432872, + "learning_rate": 7.661563642765157e-07, + "loss": 1.3107, + "step": 9143 + }, + { + "epoch": 0.98, + "grad_norm": 0.10078879497252471, + "learning_rate": 7.565519063634785e-07, + "loss": 1.3406, + "step": 9144 + }, + { + "epoch": 0.98, + "grad_norm": 0.09294027961434816, + "learning_rate": 7.470079834396848e-07, + "loss": 1.3681, + "step": 9145 + }, + { + "epoch": 0.98, + "grad_norm": 0.09205734553283813, + "learning_rate": 7.375245966623756e-07, + "loss": 1.3883, + "step": 9146 + }, + { + "epoch": 0.98, + "grad_norm": 0.08730325996375386, + "learning_rate": 7.281017471814089e-07, + "loss": 1.3657, + "step": 9147 + }, + { + "epoch": 0.98, + "grad_norm": 0.08832469615914386, + "learning_rate": 7.187394361393707e-07, + "loss": 1.3283, + "step": 9148 + }, + { + "epoch": 0.98, + "grad_norm": 0.08891798495898455, + "learning_rate": 7.09437664671464e-07, + "loss": 1.4239, + "step": 9149 + }, + { + "epoch": 0.98, + "grad_norm": 0.08509761132201243, + "learning_rate": 7.001964339055645e-07, + "loss": 1.2972, + "step": 9150 + }, + { + "epoch": 0.98, + "grad_norm": 0.09860938568263267, + "learning_rate": 6.910157449621646e-07, + "loss": 1.3948, + "step": 9151 + }, + { + "epoch": 0.98, + "grad_norm": 0.09700345844785578, + "learning_rate": 6.818955989545406e-07, + "loss": 1.4834, + "step": 9152 + }, + { + "epoch": 0.98, + "grad_norm": 0.10282320662162178, + "learning_rate": 6.72835996988419e-07, + "loss": 1.3405, + "step": 9153 + }, + { + "epoch": 0.98, + "grad_norm": 0.09532553767319768, + "learning_rate": 6.638369401624212e-07, + "loss": 1.3601, + "step": 9154 + }, + { + "epoch": 0.98, + "grad_norm": 0.09944816708848722, + "learning_rate": 6.548984295676741e-07, + "loss": 1.3718, + "step": 9155 + }, + { + "epoch": 0.98, + "grad_norm": 0.08797403192670447, + "learning_rate": 6.460204662879776e-07, + "loss": 1.467, + "step": 9156 + }, + { + "epoch": 0.98, + "grad_norm": 0.09782593171572332, + "learning_rate": 6.37203051399804e-07, + "loss": 1.3464, + "step": 9157 + }, + { + "epoch": 0.98, + "grad_norm": 0.09783108557900717, + "learning_rate": 6.284461859723534e-07, + "loss": 1.3776, + "step": 9158 + }, + { + "epoch": 0.98, + "grad_norm": 0.08536515596852107, + "learning_rate": 6.197498710673876e-07, + "loss": 1.3792, + "step": 9159 + }, + { + "epoch": 0.98, + "grad_norm": 0.08102996000876958, + "learning_rate": 6.111141077393967e-07, + "loss": 1.3457, + "step": 9160 + }, + { + "epoch": 0.98, + "grad_norm": 0.08950933513385094, + "learning_rate": 6.025388970354872e-07, + "loss": 1.5071, + "step": 9161 + }, + { + "epoch": 0.98, + "grad_norm": 0.09077163258171603, + "learning_rate": 5.940242399953832e-07, + "loss": 1.5238, + "step": 9162 + }, + { + "epoch": 0.99, + "grad_norm": 0.08107218496059505, + "learning_rate": 5.855701376515921e-07, + "loss": 1.4836, + "step": 9163 + }, + { + "epoch": 0.99, + "grad_norm": 0.09402355006480244, + "learning_rate": 5.771765910291271e-07, + "loss": 1.3273, + "step": 9164 + }, + { + "epoch": 0.99, + "grad_norm": 0.08419332875007346, + "learning_rate": 5.688436011457854e-07, + "loss": 1.2936, + "step": 9165 + }, + { + "epoch": 0.99, + "grad_norm": 0.09582927476497023, + "learning_rate": 5.605711690119808e-07, + "loss": 1.4482, + "step": 9166 + }, + { + "epoch": 0.99, + "grad_norm": 0.08145256942098168, + "learning_rate": 5.523592956307444e-07, + "loss": 1.3393, + "step": 9167 + }, + { + "epoch": 0.99, + "grad_norm": 0.08945519192985447, + "learning_rate": 5.442079819977797e-07, + "loss": 1.4419, + "step": 9168 + }, + { + "epoch": 0.99, + "grad_norm": 0.08389369232847793, + "learning_rate": 5.361172291014627e-07, + "loss": 1.4232, + "step": 9169 + }, + { + "epoch": 0.99, + "grad_norm": 0.09665716756692867, + "learning_rate": 5.280870379228975e-07, + "loss": 1.502, + "step": 9170 + }, + { + "epoch": 0.99, + "grad_norm": 0.09107272005780782, + "learning_rate": 5.201174094356942e-07, + "loss": 1.374, + "step": 9171 + }, + { + "epoch": 0.99, + "grad_norm": 0.08525633295064047, + "learning_rate": 5.122083446062464e-07, + "loss": 1.2464, + "step": 9172 + }, + { + "epoch": 0.99, + "grad_norm": 0.09145092325910216, + "learning_rate": 5.043598443935094e-07, + "loss": 1.3063, + "step": 9173 + }, + { + "epoch": 0.99, + "grad_norm": 0.09121429449434486, + "learning_rate": 4.965719097491661e-07, + "loss": 1.317, + "step": 9174 + }, + { + "epoch": 0.99, + "grad_norm": 0.08106699097565444, + "learning_rate": 4.888445416175724e-07, + "loss": 1.3662, + "step": 9175 + }, + { + "epoch": 0.99, + "grad_norm": 0.09235616001095853, + "learning_rate": 4.8117774093559e-07, + "loss": 1.4448, + "step": 9176 + }, + { + "epoch": 0.99, + "grad_norm": 0.08632917581540081, + "learning_rate": 4.735715086329195e-07, + "loss": 1.4205, + "step": 9177 + }, + { + "epoch": 0.99, + "grad_norm": 0.09459405410772538, + "learning_rate": 4.6602584563187886e-07, + "loss": 1.3787, + "step": 9178 + }, + { + "epoch": 0.99, + "grad_norm": 0.08651932809943559, + "learning_rate": 4.5854075284729177e-07, + "loss": 1.4209, + "step": 9179 + }, + { + "epoch": 0.99, + "grad_norm": 0.08827714040145299, + "learning_rate": 4.511162311868766e-07, + "loss": 1.3239, + "step": 9180 + }, + { + "epoch": 0.99, + "grad_norm": 0.08381943113452264, + "learning_rate": 4.4375228155085766e-07, + "loss": 1.4219, + "step": 9181 + }, + { + "epoch": 0.99, + "grad_norm": 0.08236325719668838, + "learning_rate": 4.3644890483202084e-07, + "loss": 1.3715, + "step": 9182 + }, + { + "epoch": 0.99, + "grad_norm": 0.09243341173187043, + "learning_rate": 4.292061019160465e-07, + "loss": 1.4119, + "step": 9183 + }, + { + "epoch": 0.99, + "grad_norm": 0.10070109950559669, + "learning_rate": 4.2202387368112104e-07, + "loss": 1.472, + "step": 9184 + }, + { + "epoch": 0.99, + "grad_norm": 0.09517079092290456, + "learning_rate": 4.149022209981035e-07, + "loss": 1.3448, + "step": 9185 + }, + { + "epoch": 0.99, + "grad_norm": 0.09084655013556947, + "learning_rate": 4.0784114473052526e-07, + "loss": 1.2689, + "step": 9186 + }, + { + "epoch": 0.99, + "grad_norm": 0.08422060321187015, + "learning_rate": 4.008406457345903e-07, + "loss": 1.3204, + "step": 9187 + }, + { + "epoch": 0.99, + "grad_norm": 0.08559323844255416, + "learning_rate": 3.9390072485906424e-07, + "loss": 1.3425, + "step": 9188 + }, + { + "epoch": 0.99, + "grad_norm": 0.08430437320924258, + "learning_rate": 3.8702138294549605e-07, + "loss": 1.3853, + "step": 9189 + }, + { + "epoch": 0.99, + "grad_norm": 0.09296172134493086, + "learning_rate": 3.8020262082799626e-07, + "loss": 1.3051, + "step": 9190 + }, + { + "epoch": 0.99, + "grad_norm": 0.08668309232549491, + "learning_rate": 3.7344443933340355e-07, + "loss": 1.3997, + "step": 9191 + }, + { + "epoch": 0.99, + "grad_norm": 0.08495748652651433, + "learning_rate": 3.6674683928111795e-07, + "loss": 1.3981, + "step": 9192 + }, + { + "epoch": 0.99, + "grad_norm": 0.11190443758993558, + "learning_rate": 3.6010982148332314e-07, + "loss": 1.4863, + "step": 9193 + }, + { + "epoch": 0.99, + "grad_norm": 0.08562198339929697, + "learning_rate": 3.535333867447088e-07, + "loss": 1.2775, + "step": 9194 + }, + { + "epoch": 0.99, + "grad_norm": 0.08887863740413854, + "learning_rate": 3.4701753586269257e-07, + "loss": 1.3826, + "step": 9195 + }, + { + "epoch": 0.99, + "grad_norm": 0.0911839138816826, + "learning_rate": 3.405622696274202e-07, + "loss": 1.4951, + "step": 9196 + }, + { + "epoch": 0.99, + "grad_norm": 0.10007743747550958, + "learning_rate": 3.341675888215434e-07, + "loss": 1.3479, + "step": 9197 + }, + { + "epoch": 0.99, + "grad_norm": 0.08820855203603718, + "learning_rate": 3.278334942204419e-07, + "loss": 1.3686, + "step": 9198 + }, + { + "epoch": 0.99, + "grad_norm": 0.09489741230917534, + "learning_rate": 3.215599865921126e-07, + "loss": 1.3376, + "step": 9199 + }, + { + "epoch": 0.99, + "grad_norm": 0.09132514257961939, + "learning_rate": 3.1534706669733573e-07, + "loss": 1.4629, + "step": 9200 + }, + { + "epoch": 0.99, + "grad_norm": 0.09654225348968377, + "learning_rate": 3.0919473528939754e-07, + "loss": 1.3531, + "step": 9201 + }, + { + "epoch": 0.99, + "grad_norm": 0.09488757695139786, + "learning_rate": 3.0310299311431256e-07, + "loss": 1.4028, + "step": 9202 + }, + { + "epoch": 0.99, + "grad_norm": 0.10174093481738378, + "learning_rate": 2.9707184091071204e-07, + "loss": 1.4304, + "step": 9203 + }, + { + "epoch": 0.99, + "grad_norm": 0.08821629876713558, + "learning_rate": 2.9110127940984445e-07, + "loss": 1.3028, + "step": 9204 + }, + { + "epoch": 0.99, + "grad_norm": 0.08766097557322772, + "learning_rate": 2.851913093357417e-07, + "loss": 1.4125, + "step": 9205 + }, + { + "epoch": 0.99, + "grad_norm": 0.0901572714979954, + "learning_rate": 2.793419314049972e-07, + "loss": 1.4407, + "step": 9206 + }, + { + "epoch": 0.99, + "grad_norm": 0.08821420180605707, + "learning_rate": 2.73553146326766e-07, + "loss": 1.432, + "step": 9207 + }, + { + "epoch": 0.99, + "grad_norm": 0.08633075490026776, + "learning_rate": 2.6782495480309756e-07, + "loss": 1.35, + "step": 9208 + }, + { + "epoch": 0.99, + "grad_norm": 0.08272348964502109, + "learning_rate": 2.621573575284919e-07, + "loss": 1.3314, + "step": 9209 + }, + { + "epoch": 0.99, + "grad_norm": 0.08738865558697523, + "learning_rate": 2.565503551901216e-07, + "loss": 1.3905, + "step": 9210 + }, + { + "epoch": 0.99, + "grad_norm": 0.08583764484613975, + "learning_rate": 2.510039484679427e-07, + "loss": 1.4012, + "step": 9211 + }, + { + "epoch": 0.99, + "grad_norm": 0.09265865578712432, + "learning_rate": 2.4551813803441734e-07, + "loss": 1.2547, + "step": 9212 + }, + { + "epoch": 0.99, + "grad_norm": 0.07892496874291217, + "learning_rate": 2.400929245546801e-07, + "loss": 1.3251, + "step": 9213 + }, + { + "epoch": 0.99, + "grad_norm": 0.08624414306385945, + "learning_rate": 2.3472830868670469e-07, + "loss": 1.3862, + "step": 9214 + }, + { + "epoch": 0.99, + "grad_norm": 0.09302196091866992, + "learning_rate": 2.2942429108080422e-07, + "loss": 1.2674, + "step": 9215 + }, + { + "epoch": 0.99, + "grad_norm": 0.08821491575497864, + "learning_rate": 2.2418087238018637e-07, + "loss": 1.4064, + "step": 9216 + }, + { + "epoch": 0.99, + "grad_norm": 0.08949986370580101, + "learning_rate": 2.189980532206759e-07, + "loss": 1.4349, + "step": 9217 + }, + { + "epoch": 0.99, + "grad_norm": 0.08278441450527144, + "learning_rate": 2.1387583423060354e-07, + "loss": 1.2998, + "step": 9218 + }, + { + "epoch": 0.99, + "grad_norm": 0.0840801051048805, + "learning_rate": 2.0881421603113904e-07, + "loss": 1.263, + "step": 9219 + }, + { + "epoch": 0.99, + "grad_norm": 0.08056200911342393, + "learning_rate": 2.0381319923595821e-07, + "loss": 1.4156, + "step": 9220 + }, + { + "epoch": 0.99, + "grad_norm": 0.08724772973658486, + "learning_rate": 1.9887278445152036e-07, + "loss": 1.2512, + "step": 9221 + }, + { + "epoch": 0.99, + "grad_norm": 0.09455867642372127, + "learning_rate": 1.9399297227684633e-07, + "loss": 1.4142, + "step": 9222 + }, + { + "epoch": 0.99, + "grad_norm": 0.09343959412492665, + "learning_rate": 1.8917376330357395e-07, + "loss": 1.3958, + "step": 9223 + }, + { + "epoch": 0.99, + "grad_norm": 0.08794548124148231, + "learning_rate": 1.8441515811612464e-07, + "loss": 1.3576, + "step": 9224 + }, + { + "epoch": 0.99, + "grad_norm": 0.09691778279436149, + "learning_rate": 1.797171572914813e-07, + "loss": 1.5349, + "step": 9225 + }, + { + "epoch": 0.99, + "grad_norm": 0.07983284987363899, + "learning_rate": 1.7507976139924385e-07, + "loss": 1.4171, + "step": 9226 + }, + { + "epoch": 0.99, + "grad_norm": 0.09287129310883346, + "learning_rate": 1.7050297100174028e-07, + "loss": 1.3165, + "step": 9227 + }, + { + "epoch": 0.99, + "grad_norm": 0.08646817433793243, + "learning_rate": 1.6598678665397104e-07, + "loss": 1.4993, + "step": 9228 + }, + { + "epoch": 0.99, + "grad_norm": 0.07929013433839835, + "learning_rate": 1.6153120890344265e-07, + "loss": 1.5507, + "step": 9229 + }, + { + "epoch": 0.99, + "grad_norm": 0.08850803873232053, + "learning_rate": 1.5713623829050062e-07, + "loss": 1.4476, + "step": 9230 + }, + { + "epoch": 0.99, + "grad_norm": 0.0888883518336363, + "learning_rate": 1.5280187534794098e-07, + "loss": 1.3417, + "step": 9231 + }, + { + "epoch": 0.99, + "grad_norm": 0.09379458112499682, + "learning_rate": 1.4852812060145438e-07, + "loss": 1.4362, + "step": 9232 + }, + { + "epoch": 0.99, + "grad_norm": 0.08832514165492539, + "learning_rate": 1.4431497456918186e-07, + "loss": 1.4955, + "step": 9233 + }, + { + "epoch": 0.99, + "grad_norm": 0.10270631070515934, + "learning_rate": 1.4016243776193706e-07, + "loss": 1.314, + "step": 9234 + }, + { + "epoch": 0.99, + "grad_norm": 0.08689177286241863, + "learning_rate": 1.360705106833171e-07, + "loss": 1.4187, + "step": 9235 + }, + { + "epoch": 0.99, + "grad_norm": 0.0904049295681052, + "learning_rate": 1.320391938294252e-07, + "loss": 1.4229, + "step": 9236 + }, + { + "epoch": 0.99, + "grad_norm": 0.08826812497826056, + "learning_rate": 1.280684876890925e-07, + "loss": 1.3759, + "step": 9237 + }, + { + "epoch": 0.99, + "grad_norm": 0.09454769152128116, + "learning_rate": 1.2415839274376728e-07, + "loss": 1.4044, + "step": 9238 + }, + { + "epoch": 0.99, + "grad_norm": 0.12649764184311058, + "learning_rate": 1.2030890946757022e-07, + "loss": 1.4513, + "step": 9239 + }, + { + "epoch": 0.99, + "grad_norm": 0.08893899583982746, + "learning_rate": 1.1652003832729464e-07, + "loss": 1.4767, + "step": 9240 + }, + { + "epoch": 0.99, + "grad_norm": 0.09708840021285432, + "learning_rate": 1.1279177978229527e-07, + "loss": 1.2564, + "step": 9241 + }, + { + "epoch": 0.99, + "grad_norm": 0.08160711737594363, + "learning_rate": 1.0912413428471046e-07, + "loss": 1.2285, + "step": 9242 + }, + { + "epoch": 0.99, + "grad_norm": 0.0830542224283073, + "learning_rate": 1.0551710227912903e-07, + "loss": 1.3295, + "step": 9243 + }, + { + "epoch": 0.99, + "grad_norm": 0.10843619771485967, + "learning_rate": 1.0197068420308986e-07, + "loss": 1.4789, + "step": 9244 + }, + { + "epoch": 0.99, + "grad_norm": 0.08399511285269524, + "learning_rate": 9.848488048647131e-08, + "loss": 1.4328, + "step": 9245 + }, + { + "epoch": 0.99, + "grad_norm": 0.08369868710316501, + "learning_rate": 9.505969155193528e-08, + "loss": 1.4427, + "step": 9246 + }, + { + "epoch": 0.99, + "grad_norm": 0.08815787268710067, + "learning_rate": 9.169511781492724e-08, + "loss": 1.284, + "step": 9247 + }, + { + "epoch": 0.99, + "grad_norm": 0.10365706578408941, + "learning_rate": 8.839115968328759e-08, + "loss": 1.4171, + "step": 9248 + }, + { + "epoch": 0.99, + "grad_norm": 0.0868752961016487, + "learning_rate": 8.514781755769585e-08, + "loss": 1.3582, + "step": 9249 + }, + { + "epoch": 0.99, + "grad_norm": 0.09414347762024002, + "learning_rate": 8.1965091831393e-08, + "loss": 1.3758, + "step": 9250 + }, + { + "epoch": 0.99, + "grad_norm": 0.08954872276660882, + "learning_rate": 7.884298289029258e-08, + "loss": 1.3475, + "step": 9251 + }, + { + "epoch": 0.99, + "grad_norm": 0.10030726142452168, + "learning_rate": 7.57814911129806e-08, + "loss": 1.3205, + "step": 9252 + }, + { + "epoch": 0.99, + "grad_norm": 0.09762209771924428, + "learning_rate": 7.278061687066018e-08, + "loss": 1.3829, + "step": 9253 + }, + { + "epoch": 0.99, + "grad_norm": 0.08836513485757565, + "learning_rate": 6.984036052720688e-08, + "loss": 1.3295, + "step": 9254 + }, + { + "epoch": 0.99, + "grad_norm": 0.0871339758958895, + "learning_rate": 6.696072243911333e-08, + "loss": 1.3928, + "step": 9255 + }, + { + "epoch": 1.0, + "grad_norm": 0.09098880914087802, + "learning_rate": 6.414170295560017e-08, + "loss": 1.357, + "step": 9256 + }, + { + "epoch": 1.0, + "grad_norm": 0.09077999286470577, + "learning_rate": 6.138330241839407e-08, + "loss": 1.3801, + "step": 9257 + }, + { + "epoch": 1.0, + "grad_norm": 0.10122837068892225, + "learning_rate": 5.868552116206072e-08, + "loss": 1.3623, + "step": 9258 + }, + { + "epoch": 1.0, + "grad_norm": 0.08517700022337132, + "learning_rate": 5.604835951367182e-08, + "loss": 1.4786, + "step": 9259 + }, + { + "epoch": 1.0, + "grad_norm": 0.08585211456743395, + "learning_rate": 5.3471817792971614e-08, + "loss": 1.4968, + "step": 9260 + }, + { + "epoch": 1.0, + "grad_norm": 0.10044181669808067, + "learning_rate": 5.095589631237685e-08, + "loss": 1.2755, + "step": 9261 + }, + { + "epoch": 1.0, + "grad_norm": 0.08811673378812793, + "learning_rate": 4.8500595376976816e-08, + "loss": 1.2943, + "step": 9262 + }, + { + "epoch": 1.0, + "grad_norm": 0.08452180165981107, + "learning_rate": 4.610591528447783e-08, + "loss": 1.3082, + "step": 9263 + }, + { + "epoch": 1.0, + "grad_norm": 0.09509764207121812, + "learning_rate": 4.3771856325203196e-08, + "loss": 1.3976, + "step": 9264 + }, + { + "epoch": 1.0, + "grad_norm": 0.0905369920640337, + "learning_rate": 4.1498418782259795e-08, + "loss": 1.34, + "step": 9265 + }, + { + "epoch": 1.0, + "grad_norm": 0.08517538334754213, + "learning_rate": 3.928560293120498e-08, + "loss": 1.3423, + "step": 9266 + }, + { + "epoch": 1.0, + "grad_norm": 0.0853860335389513, + "learning_rate": 3.713340904043516e-08, + "loss": 1.4828, + "step": 9267 + }, + { + "epoch": 1.0, + "grad_norm": 0.10931919049201298, + "learning_rate": 3.504183737085276e-08, + "loss": 1.3513, + "step": 9268 + }, + { + "epoch": 1.0, + "grad_norm": 0.08736777021378683, + "learning_rate": 3.301088817608822e-08, + "loss": 1.4117, + "step": 9269 + }, + { + "epoch": 1.0, + "grad_norm": 0.10028110998630704, + "learning_rate": 3.104056170244451e-08, + "loss": 1.4898, + "step": 9270 + }, + { + "epoch": 1.0, + "grad_norm": 0.09773062139045051, + "learning_rate": 2.9130858188730626e-08, + "loss": 1.3318, + "step": 9271 + }, + { + "epoch": 1.0, + "grad_norm": 0.08128181127641702, + "learning_rate": 2.7281777866594582e-08, + "loss": 1.4327, + "step": 9272 + }, + { + "epoch": 1.0, + "grad_norm": 0.0943358489444862, + "learning_rate": 2.5493320960190413e-08, + "loss": 1.3043, + "step": 9273 + }, + { + "epoch": 1.0, + "grad_norm": 0.09785602520380977, + "learning_rate": 2.3765487686400188e-08, + "loss": 1.4111, + "step": 9274 + }, + { + "epoch": 1.0, + "grad_norm": 0.09533372939648309, + "learning_rate": 2.2098278254722993e-08, + "loss": 1.4909, + "step": 9275 + }, + { + "epoch": 1.0, + "grad_norm": 0.0801968457699735, + "learning_rate": 2.0491692867330434e-08, + "loss": 1.4355, + "step": 9276 + }, + { + "epoch": 1.0, + "grad_norm": 0.09429240525483946, + "learning_rate": 1.894573171901115e-08, + "loss": 1.4304, + "step": 9277 + }, + { + "epoch": 1.0, + "grad_norm": 0.10266163176098933, + "learning_rate": 1.746039499717078e-08, + "loss": 1.4492, + "step": 9278 + }, + { + "epoch": 1.0, + "grad_norm": 0.09383290941084661, + "learning_rate": 1.6035682881998527e-08, + "loss": 1.4451, + "step": 9279 + }, + { + "epoch": 1.0, + "grad_norm": 0.080437028926394, + "learning_rate": 1.4671595546245086e-08, + "loss": 1.4012, + "step": 9280 + }, + { + "epoch": 1.0, + "grad_norm": 0.08351402759362497, + "learning_rate": 1.3368133155222672e-08, + "loss": 1.3904, + "step": 9281 + }, + { + "epoch": 1.0, + "grad_norm": 0.091617721380892, + "learning_rate": 1.2125295867027042e-08, + "loss": 1.3431, + "step": 9282 + }, + { + "epoch": 1.0, + "grad_norm": 0.10262114214274354, + "learning_rate": 1.094308383237097e-08, + "loss": 1.5308, + "step": 9283 + }, + { + "epoch": 1.0, + "grad_norm": 0.10109618312260171, + "learning_rate": 9.82149719458425e-09, + "loss": 1.3743, + "step": 9284 + }, + { + "epoch": 1.0, + "grad_norm": 0.09909394437365746, + "learning_rate": 8.760536089724714e-09, + "loss": 1.435, + "step": 9285 + }, + { + "epoch": 1.0, + "grad_norm": 0.08764241854659703, + "learning_rate": 7.760200646300675e-09, + "loss": 1.3543, + "step": 9286 + }, + { + "epoch": 1.0, + "grad_norm": 0.10072587519427252, + "learning_rate": 6.820490985715022e-09, + "loss": 1.3164, + "step": 9287 + }, + { + "epoch": 1.0, + "grad_norm": 0.09365884377453729, + "learning_rate": 5.941407221932149e-09, + "loss": 1.3535, + "step": 9288 + }, + { + "epoch": 1.0, + "grad_norm": 0.09650653930095349, + "learning_rate": 5.122949461422444e-09, + "loss": 1.3542, + "step": 9289 + }, + { + "epoch": 1.0, + "grad_norm": 0.08379342583541467, + "learning_rate": 4.365117803550866e-09, + "loss": 1.4495, + "step": 9290 + }, + { + "epoch": 1.0, + "grad_norm": 0.09115901593403421, + "learning_rate": 3.667912340132862e-09, + "loss": 1.3161, + "step": 9291 + }, + { + "epoch": 1.0, + "grad_norm": 0.10871669726957177, + "learning_rate": 3.031333155767424e-09, + "loss": 1.5187, + "step": 9292 + }, + { + "epoch": 1.0, + "grad_norm": 0.08912092052721272, + "learning_rate": 2.455380327559542e-09, + "loss": 1.3403, + "step": 9293 + }, + { + "epoch": 1.0, + "grad_norm": 0.08373270056508372, + "learning_rate": 1.9400539253977557e-09, + "loss": 1.3308, + "step": 9294 + }, + { + "epoch": 1.0, + "grad_norm": 0.09699578700835747, + "learning_rate": 1.4853540117321095e-09, + "loss": 1.3618, + "step": 9295 + }, + { + "epoch": 1.0, + "grad_norm": 0.08636922925763955, + "learning_rate": 1.0912806417961995e-09, + "loss": 1.3992, + "step": 9296 + }, + { + "epoch": 1.0, + "grad_norm": 0.09516627466784042, + "learning_rate": 7.578338632741044e-10, + "loss": 1.423, + "step": 9297 + }, + { + "epoch": 1.0, + "grad_norm": 0.10331085370003165, + "learning_rate": 4.850137165779422e-10, + "loss": 1.3025, + "step": 9298 + }, + { + "epoch": 1.0, + "grad_norm": 0.08985317351843392, + "learning_rate": 2.728202349033815e-10, + "loss": 1.3703, + "step": 9299 + }, + { + "epoch": 1.0, + "grad_norm": 0.09341887519280219, + "learning_rate": 1.2125344384106285e-10, + "loss": 1.4087, + "step": 9300 + }, + { + "epoch": 1.0, + "grad_norm": 0.08905643836734109, + "learning_rate": 3.031336187619971e-11, + "loss": 1.4227, + "step": 9301 + }, + { + "epoch": 1.0, + "grad_norm": 0.08366057997777612, + "learning_rate": 0.0, + "loss": 1.5995, + "step": 9302 + }, + { + "epoch": 1.0, + "step": 9302, + "total_flos": 1.5268155381671526e+18, + "train_loss": 1.4317849794912534, + "train_runtime": 52932.8416, + "train_samples_per_second": 11.248, + "train_steps_per_second": 0.176 + } + ], + "logging_steps": 1.0, + "max_steps": 9302, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "total_flos": 1.5268155381671526e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}