{ "best_metric": 0.5115005185204882, "best_model_checkpoint": "/m/triton/scratch/elec/puhe/p/palp3/MUCS/indicwav2vec_outputs/pd_warmup_2000/s300_shuff500/checkpoint-1000", "epoch": 1.6, "eval_steps": 1000, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016, "grad_norm": null, "learning_rate": 0.0, "loss": 76.924, "step": 1 }, { "epoch": 0.0032, "grad_norm": null, "learning_rate": 0.0, "loss": 40.9666, "step": 2 }, { "epoch": 0.0048, "grad_norm": 22.267667770385742, "learning_rate": 3e-07, "loss": 40.3514, "step": 3 }, { "epoch": 0.0064, "grad_norm": 11.958697319030762, "learning_rate": 6e-07, "loss": 28.7886, "step": 4 }, { "epoch": 0.008, "grad_norm": 14.436713218688965, "learning_rate": 9e-07, "loss": 33.1337, "step": 5 }, { "epoch": 0.0096, "grad_norm": 15.921396255493164, "learning_rate": 1.2e-06, "loss": 29.2715, "step": 6 }, { "epoch": 0.0112, "grad_norm": 11.616898536682129, "learning_rate": 1.4999999999999998e-06, "loss": 28.6694, "step": 7 }, { "epoch": 0.0128, "grad_norm": 12.15279483795166, "learning_rate": 1.8e-06, "loss": 26.7664, "step": 8 }, { "epoch": 0.0144, "grad_norm": 15.99345874786377, "learning_rate": 2.1e-06, "loss": 27.2963, "step": 9 }, { "epoch": 0.016, "grad_norm": 10.025712966918945, "learning_rate": 2.4e-06, "loss": 22.7932, "step": 10 }, { "epoch": 0.0176, "grad_norm": 9.379335403442383, "learning_rate": 2.6999999999999996e-06, "loss": 20.7226, "step": 11 }, { "epoch": 0.0192, "grad_norm": 12.390824317932129, "learning_rate": 2.9999999999999997e-06, "loss": 27.5995, "step": 12 }, { "epoch": 0.0208, "grad_norm": 10.201970100402832, "learning_rate": 3.2999999999999993e-06, "loss": 23.3013, "step": 13 }, { "epoch": 0.0224, "grad_norm": 11.483911514282227, "learning_rate": 3.6e-06, "loss": 24.6987, "step": 14 }, { "epoch": 0.024, "grad_norm": 9.661028861999512, "learning_rate": 3.899999999999999e-06, "loss": 22.0543, "step": 15 }, { 
"epoch": 0.0256, "grad_norm": 9.904827117919922, "learning_rate": 4.2e-06, "loss": 20.3867, "step": 16 }, { "epoch": 0.0272, "grad_norm": 10.597962379455566, "learning_rate": 4.499999999999999e-06, "loss": 24.6232, "step": 17 }, { "epoch": 0.0288, "grad_norm": 12.7444486618042, "learning_rate": 4.8e-06, "loss": 25.3891, "step": 18 }, { "epoch": 0.0304, "grad_norm": 10.090996742248535, "learning_rate": 5.1e-06, "loss": 21.5661, "step": 19 }, { "epoch": 0.032, "grad_norm": 10.649155616760254, "learning_rate": 5.399999999999999e-06, "loss": 23.0623, "step": 20 }, { "epoch": 0.0336, "grad_norm": 10.286359786987305, "learning_rate": 5.7e-06, "loss": 20.7193, "step": 21 }, { "epoch": 0.0352, "grad_norm": 10.968955993652344, "learning_rate": 5.999999999999999e-06, "loss": 24.0174, "step": 22 }, { "epoch": 0.0368, "grad_norm": 9.74566650390625, "learning_rate": 6.3e-06, "loss": 21.454, "step": 23 }, { "epoch": 0.0384, "grad_norm": 11.162517547607422, "learning_rate": 6.599999999999999e-06, "loss": 23.5266, "step": 24 }, { "epoch": 0.04, "grad_norm": 9.547463417053223, "learning_rate": 6.899999999999999e-06, "loss": 20.214, "step": 25 }, { "epoch": 0.0416, "grad_norm": 10.754602432250977, "learning_rate": 7.2e-06, "loss": 22.6765, "step": 26 }, { "epoch": 0.0432, "grad_norm": 9.74982738494873, "learning_rate": 7.499999999999999e-06, "loss": 20.4099, "step": 27 }, { "epoch": 0.0448, "grad_norm": 10.397897720336914, "learning_rate": 7.799999999999998e-06, "loss": 21.5807, "step": 28 }, { "epoch": 0.0464, "grad_norm": 10.956497192382812, "learning_rate": 8.099999999999999e-06, "loss": 23.4123, "step": 29 }, { "epoch": 0.048, "grad_norm": 10.610095977783203, "learning_rate": 8.4e-06, "loss": 21.6038, "step": 30 }, { "epoch": 0.0496, "grad_norm": 10.559882164001465, "learning_rate": 8.7e-06, "loss": 21.7059, "step": 31 }, { "epoch": 0.0512, "grad_norm": 10.241806983947754, "learning_rate": 8.999999999999999e-06, "loss": 21.5684, "step": 32 }, { "epoch": 0.0528, "grad_norm": 
9.802021980285645, "learning_rate": 9.299999999999999e-06, "loss": 19.0525, "step": 33 }, { "epoch": 0.0544, "grad_norm": 10.859997749328613, "learning_rate": 9.6e-06, "loss": 20.7641, "step": 34 }, { "epoch": 0.056, "grad_norm": 11.377524375915527, "learning_rate": 9.9e-06, "loss": 21.9332, "step": 35 }, { "epoch": 0.0576, "grad_norm": 10.088302612304688, "learning_rate": 1.02e-05, "loss": 19.4306, "step": 36 }, { "epoch": 0.0592, "grad_norm": 12.069904327392578, "learning_rate": 1.05e-05, "loss": 23.6146, "step": 37 }, { "epoch": 0.0608, "grad_norm": 12.05452823638916, "learning_rate": 1.0799999999999998e-05, "loss": 22.8087, "step": 38 }, { "epoch": 0.0624, "grad_norm": 12.891792297363281, "learning_rate": 1.1099999999999999e-05, "loss": 21.6699, "step": 39 }, { "epoch": 0.064, "grad_norm": 10.262922286987305, "learning_rate": 1.14e-05, "loss": 19.0892, "step": 40 }, { "epoch": 0.0656, "grad_norm": 11.921724319458008, "learning_rate": 1.17e-05, "loss": 21.5905, "step": 41 }, { "epoch": 0.0672, "grad_norm": 11.676680564880371, "learning_rate": 1.1999999999999999e-05, "loss": 20.87, "step": 42 }, { "epoch": 0.0688, "grad_norm": 10.26872730255127, "learning_rate": 1.2299999999999999e-05, "loss": 18.0263, "step": 43 }, { "epoch": 0.0704, "grad_norm": 10.563227653503418, "learning_rate": 1.26e-05, "loss": 18.7622, "step": 44 }, { "epoch": 0.072, "grad_norm": 14.507094383239746, "learning_rate": 1.2899999999999998e-05, "loss": 24.0105, "step": 45 }, { "epoch": 0.0736, "grad_norm": 11.43386173248291, "learning_rate": 1.3199999999999997e-05, "loss": 19.3601, "step": 46 }, { "epoch": 0.0752, "grad_norm": 11.63315200805664, "learning_rate": 1.3499999999999998e-05, "loss": 19.0576, "step": 47 }, { "epoch": 0.0768, "grad_norm": 12.388842582702637, "learning_rate": 1.3799999999999998e-05, "loss": 19.2927, "step": 48 }, { "epoch": 0.0784, "grad_norm": 12.819602966308594, "learning_rate": 1.4099999999999999e-05, "loss": 19.936, "step": 49 }, { "epoch": 0.08, "grad_norm": null, 
"learning_rate": 1.4099999999999999e-05, "loss": 17.9793, "step": 50 }, { "epoch": 0.0816, "grad_norm": 65.82632446289062, "learning_rate": 1.44e-05, "loss": 64.1187, "step": 51 }, { "epoch": 0.0832, "grad_norm": 26.94750213623047, "learning_rate": 1.47e-05, "loss": 33.92, "step": 52 }, { "epoch": 0.0848, "grad_norm": 24.70115852355957, "learning_rate": 1.4999999999999999e-05, "loss": 34.5358, "step": 53 }, { "epoch": 0.0864, "grad_norm": 18.708255767822266, "learning_rate": 1.53e-05, "loss": 25.5629, "step": 54 }, { "epoch": 0.088, "grad_norm": 21.256839752197266, "learning_rate": 1.5599999999999996e-05, "loss": 27.8492, "step": 55 }, { "epoch": 0.0896, "grad_norm": 17.251680374145508, "learning_rate": 1.5899999999999997e-05, "loss": 22.1176, "step": 56 }, { "epoch": 0.0912, "grad_norm": 31.902353286743164, "learning_rate": 1.6199999999999997e-05, "loss": 26.8606, "step": 57 }, { "epoch": 0.0928, "grad_norm": 20.44807243347168, "learning_rate": 1.6499999999999998e-05, "loss": 23.1075, "step": 58 }, { "epoch": 0.0944, "grad_norm": 17.042905807495117, "learning_rate": 1.68e-05, "loss": 21.0526, "step": 59 }, { "epoch": 0.096, "grad_norm": 18.13207244873047, "learning_rate": 1.71e-05, "loss": 20.8373, "step": 60 }, { "epoch": 0.0976, "grad_norm": 17.839736938476562, "learning_rate": 1.74e-05, "loss": 21.2095, "step": 61 }, { "epoch": 0.0992, "grad_norm": 18.765409469604492, "learning_rate": 1.7699999999999997e-05, "loss": 21.5782, "step": 62 }, { "epoch": 0.1008, "grad_norm": 18.407758712768555, "learning_rate": 1.7999999999999997e-05, "loss": 21.009, "step": 63 }, { "epoch": 0.1024, "grad_norm": 38.16777038574219, "learning_rate": 1.8299999999999998e-05, "loss": 21.9343, "step": 64 }, { "epoch": 0.104, "grad_norm": 21.490079879760742, "learning_rate": 1.8599999999999998e-05, "loss": 22.2912, "step": 65 }, { "epoch": 0.1056, "grad_norm": 22.932668685913086, "learning_rate": 1.89e-05, "loss": 20.3447, "step": 66 }, { "epoch": 0.1072, "grad_norm": 26.10978126525879, 
"learning_rate": 1.92e-05, "loss": 23.8814, "step": 67 }, { "epoch": 0.1088, "grad_norm": 18.075897216796875, "learning_rate": 1.95e-05, "loss": 18.7612, "step": 68 }, { "epoch": 0.1104, "grad_norm": 18.735963821411133, "learning_rate": 1.98e-05, "loss": 18.8199, "step": 69 }, { "epoch": 0.112, "grad_norm": 22.017709732055664, "learning_rate": 2.01e-05, "loss": 20.6774, "step": 70 }, { "epoch": 0.1136, "grad_norm": 38.08246994018555, "learning_rate": 2.04e-05, "loss": 30.1672, "step": 71 }, { "epoch": 0.1152, "grad_norm": 22.627145767211914, "learning_rate": 2.07e-05, "loss": 20.0791, "step": 72 }, { "epoch": 0.1168, "grad_norm": 30.097496032714844, "learning_rate": 2.1e-05, "loss": 23.1779, "step": 73 }, { "epoch": 0.1184, "grad_norm": 22.679004669189453, "learning_rate": 2.1299999999999996e-05, "loss": 19.4401, "step": 74 }, { "epoch": 0.12, "grad_norm": 24.041168212890625, "learning_rate": 2.1599999999999996e-05, "loss": 19.3537, "step": 75 }, { "epoch": 0.1216, "grad_norm": 26.076839447021484, "learning_rate": 2.1899999999999997e-05, "loss": 20.2714, "step": 76 }, { "epoch": 0.1232, "grad_norm": 25.533342361450195, "learning_rate": 2.2199999999999998e-05, "loss": 19.6379, "step": 77 }, { "epoch": 0.1248, "grad_norm": 23.547039031982422, "learning_rate": 2.2499999999999998e-05, "loss": 18.0205, "step": 78 }, { "epoch": 0.1264, "grad_norm": 23.1020565032959, "learning_rate": 2.28e-05, "loss": 17.6504, "step": 79 }, { "epoch": 0.128, "grad_norm": 28.12115478515625, "learning_rate": 2.31e-05, "loss": 19.9713, "step": 80 }, { "epoch": 0.1296, "grad_norm": 24.276756286621094, "learning_rate": 2.34e-05, "loss": 17.293, "step": 81 }, { "epoch": 0.1312, "grad_norm": 21.40571403503418, "learning_rate": 2.3699999999999997e-05, "loss": 15.7224, "step": 82 }, { "epoch": 0.1328, "grad_norm": 24.049413681030273, "learning_rate": 2.3999999999999997e-05, "loss": 16.4272, "step": 83 }, { "epoch": 0.1344, "grad_norm": 27.002574920654297, "learning_rate": 2.4299999999999998e-05, 
"loss": 17.5712, "step": 84 }, { "epoch": 0.136, "grad_norm": 31.374860763549805, "learning_rate": 2.4599999999999998e-05, "loss": 19.1111, "step": 85 }, { "epoch": 0.1376, "grad_norm": 28.4317684173584, "learning_rate": 2.49e-05, "loss": 17.1805, "step": 86 }, { "epoch": 0.1392, "grad_norm": 32.47872543334961, "learning_rate": 2.52e-05, "loss": 18.6232, "step": 87 }, { "epoch": 0.1408, "grad_norm": 28.26717185974121, "learning_rate": 2.55e-05, "loss": 16.3721, "step": 88 }, { "epoch": 0.1424, "grad_norm": 29.546110153198242, "learning_rate": 2.5799999999999997e-05, "loss": 16.6354, "step": 89 }, { "epoch": 0.144, "grad_norm": 32.13431930541992, "learning_rate": 2.6099999999999997e-05, "loss": 16.9808, "step": 90 }, { "epoch": 0.1456, "grad_norm": 36.368682861328125, "learning_rate": 2.6399999999999995e-05, "loss": 17.5935, "step": 91 }, { "epoch": 0.1472, "grad_norm": 28.789241790771484, "learning_rate": 2.6699999999999995e-05, "loss": 14.6106, "step": 92 }, { "epoch": 0.1488, "grad_norm": 30.914873123168945, "learning_rate": 2.6999999999999996e-05, "loss": 14.5306, "step": 93 }, { "epoch": 0.1504, "grad_norm": 31.848777770996094, "learning_rate": 2.7299999999999996e-05, "loss": 14.7256, "step": 94 }, { "epoch": 0.152, "grad_norm": 43.699851989746094, "learning_rate": 2.7599999999999997e-05, "loss": 16.3519, "step": 95 }, { "epoch": 0.1536, "grad_norm": 39.825836181640625, "learning_rate": 2.7899999999999997e-05, "loss": 15.9264, "step": 96 }, { "epoch": 0.1552, "grad_norm": 32.04133224487305, "learning_rate": 2.8199999999999998e-05, "loss": 13.4326, "step": 97 }, { "epoch": 0.1568, "grad_norm": 39.8133659362793, "learning_rate": 2.8499999999999998e-05, "loss": 14.3427, "step": 98 }, { "epoch": 0.1584, "grad_norm": 36.54108810424805, "learning_rate": 2.88e-05, "loss": 13.5702, "step": 99 }, { "epoch": 0.16, "grad_norm": null, "learning_rate": 2.88e-05, "loss": 14.6284, "step": 100 }, { "epoch": 0.1616, "grad_norm": null, "learning_rate": 2.88e-05, "loss": 25.1519, 
"step": 101 }, { "epoch": 0.1632, "grad_norm": 146.48204040527344, "learning_rate": 2.91e-05, "loss": 26.3907, "step": 102 }, { "epoch": 0.1648, "grad_norm": 56.64327621459961, "learning_rate": 2.94e-05, "loss": 17.1257, "step": 103 }, { "epoch": 0.1664, "grad_norm": 87.11421966552734, "learning_rate": 2.97e-05, "loss": 18.9131, "step": 104 }, { "epoch": 0.168, "grad_norm": 57.33453369140625, "learning_rate": 2.9999999999999997e-05, "loss": 15.569, "step": 105 }, { "epoch": 0.1696, "grad_norm": 46.68961715698242, "learning_rate": 3.0299999999999998e-05, "loss": 13.1879, "step": 106 }, { "epoch": 0.1712, "grad_norm": 59.92051315307617, "learning_rate": 3.06e-05, "loss": 14.6438, "step": 107 }, { "epoch": 0.1728, "grad_norm": 57.27889633178711, "learning_rate": 3.09e-05, "loss": 14.3459, "step": 108 }, { "epoch": 0.1744, "grad_norm": 51.79650115966797, "learning_rate": 3.119999999999999e-05, "loss": 12.7327, "step": 109 }, { "epoch": 0.176, "grad_norm": 45.90155029296875, "learning_rate": 3.149999999999999e-05, "loss": 10.6128, "step": 110 }, { "epoch": 0.1776, "grad_norm": 46.71571350097656, "learning_rate": 3.1799999999999994e-05, "loss": 11.1419, "step": 111 }, { "epoch": 0.1792, "grad_norm": 54.34334182739258, "learning_rate": 3.2099999999999994e-05, "loss": 11.3816, "step": 112 }, { "epoch": 0.1808, "grad_norm": 44.52326965332031, "learning_rate": 3.2399999999999995e-05, "loss": 9.99, "step": 113 }, { "epoch": 0.1824, "grad_norm": 51.66781997680664, "learning_rate": 3.2699999999999995e-05, "loss": 10.7379, "step": 114 }, { "epoch": 0.184, "grad_norm": 44.501441955566406, "learning_rate": 3.2999999999999996e-05, "loss": 9.5815, "step": 115 }, { "epoch": 0.1856, "grad_norm": 42.56653594970703, "learning_rate": 3.3299999999999996e-05, "loss": 9.265, "step": 116 }, { "epoch": 0.1872, "grad_norm": 40.3764762878418, "learning_rate": 3.36e-05, "loss": 8.6506, "step": 117 }, { "epoch": 0.1888, "grad_norm": 40.9335823059082, "learning_rate": 3.39e-05, "loss": 8.5488, 
"step": 118 }, { "epoch": 0.1904, "grad_norm": 42.77170181274414, "learning_rate": 3.42e-05, "loss": 8.5519, "step": 119 }, { "epoch": 0.192, "grad_norm": 39.98923873901367, "learning_rate": 3.45e-05, "loss": 8.0759, "step": 120 }, { "epoch": 0.1936, "grad_norm": 33.6198844909668, "learning_rate": 3.48e-05, "loss": 7.3643, "step": 121 }, { "epoch": 0.1952, "grad_norm": 36.385223388671875, "learning_rate": 3.51e-05, "loss": 7.4889, "step": 122 }, { "epoch": 0.1968, "grad_norm": 34.519630432128906, "learning_rate": 3.539999999999999e-05, "loss": 7.0802, "step": 123 }, { "epoch": 0.1984, "grad_norm": 31.894567489624023, "learning_rate": 3.5699999999999994e-05, "loss": 6.8235, "step": 124 }, { "epoch": 0.2, "grad_norm": 34.02376174926758, "learning_rate": 3.5999999999999994e-05, "loss": 6.9291, "step": 125 }, { "epoch": 0.2016, "grad_norm": 28.912235260009766, "learning_rate": 3.6299999999999995e-05, "loss": 6.4299, "step": 126 }, { "epoch": 0.2032, "grad_norm": 28.84571647644043, "learning_rate": 3.6599999999999995e-05, "loss": 6.3964, "step": 127 }, { "epoch": 0.2048, "grad_norm": 27.383811950683594, "learning_rate": 3.6899999999999996e-05, "loss": 6.2274, "step": 128 }, { "epoch": 0.2064, "grad_norm": 29.78708267211914, "learning_rate": 3.7199999999999996e-05, "loss": 6.0653, "step": 129 }, { "epoch": 0.208, "grad_norm": 28.544218063354492, "learning_rate": 3.75e-05, "loss": 6.2053, "step": 130 }, { "epoch": 0.2096, "grad_norm": 22.004798889160156, "learning_rate": 3.78e-05, "loss": 5.7141, "step": 131 }, { "epoch": 0.2112, "grad_norm": 20.87236785888672, "learning_rate": 3.81e-05, "loss": 5.6288, "step": 132 }, { "epoch": 0.2128, "grad_norm": 21.300033569335938, "learning_rate": 3.84e-05, "loss": 5.5556, "step": 133 }, { "epoch": 0.2144, "grad_norm": 19.316028594970703, "learning_rate": 3.87e-05, "loss": 5.4672, "step": 134 }, { "epoch": 0.216, "grad_norm": 19.701513290405273, "learning_rate": 3.9e-05, "loss": 5.4904, "step": 135 }, { "epoch": 0.2176, "grad_norm": 
13.995134353637695, "learning_rate": 3.93e-05, "loss": 5.1972, "step": 136 }, { "epoch": 0.2192, "grad_norm": 14.500862121582031, "learning_rate": 3.96e-05, "loss": 5.1985, "step": 137 }, { "epoch": 0.2208, "grad_norm": 10.669529914855957, "learning_rate": 3.99e-05, "loss": 5.0034, "step": 138 }, { "epoch": 0.2224, "grad_norm": 8.051897048950195, "learning_rate": 4.02e-05, "loss": 4.9248, "step": 139 }, { "epoch": 0.224, "grad_norm": 11.402167320251465, "learning_rate": 4.05e-05, "loss": 5.0594, "step": 140 }, { "epoch": 0.2256, "grad_norm": 10.03395938873291, "learning_rate": 4.08e-05, "loss": 5.037, "step": 141 }, { "epoch": 0.2272, "grad_norm": 8.426224708557129, "learning_rate": 4.11e-05, "loss": 4.9605, "step": 142 }, { "epoch": 0.2288, "grad_norm": 4.501130104064941, "learning_rate": 4.14e-05, "loss": 4.7972, "step": 143 }, { "epoch": 0.2304, "grad_norm": 4.365025520324707, "learning_rate": 4.17e-05, "loss": 4.8065, "step": 144 }, { "epoch": 0.232, "grad_norm": 4.868807792663574, "learning_rate": 4.2e-05, "loss": 4.8179, "step": 145 }, { "epoch": 0.2336, "grad_norm": 3.3663101196289062, "learning_rate": 4.229999999999999e-05, "loss": 4.7889, "step": 146 }, { "epoch": 0.2352, "grad_norm": 3.2163665294647217, "learning_rate": 4.259999999999999e-05, "loss": 4.702, "step": 147 }, { "epoch": 0.2368, "grad_norm": 3.1945879459381104, "learning_rate": 4.289999999999999e-05, "loss": 4.7035, "step": 148 }, { "epoch": 0.2384, "grad_norm": 4.427632808685303, "learning_rate": 4.319999999999999e-05, "loss": 4.7546, "step": 149 }, { "epoch": 0.24, "grad_norm": 5.550114631652832, "learning_rate": 4.3499999999999993e-05, "loss": 4.9709, "step": 150 }, { "epoch": 0.2416, "grad_norm": 53.31546401977539, "learning_rate": 4.3799999999999994e-05, "loss": 5.9402, "step": 151 }, { "epoch": 0.2432, "grad_norm": 7.296631336212158, "learning_rate": 4.4099999999999995e-05, "loss": 4.5481, "step": 152 }, { "epoch": 0.2448, "grad_norm": 5.802248954772949, "learning_rate": 
4.4399999999999995e-05, "loss": 4.4998, "step": 153 }, { "epoch": 0.2464, "grad_norm": 3.7592978477478027, "learning_rate": 4.4699999999999996e-05, "loss": 4.4536, "step": 154 }, { "epoch": 0.248, "grad_norm": 3.3045403957366943, "learning_rate": 4.4999999999999996e-05, "loss": 4.391, "step": 155 }, { "epoch": 0.2496, "grad_norm": 5.204708099365234, "learning_rate": 4.5299999999999997e-05, "loss": 4.459, "step": 156 }, { "epoch": 0.2512, "grad_norm": 4.284322261810303, "learning_rate": 4.56e-05, "loss": 4.3071, "step": 157 }, { "epoch": 0.2528, "grad_norm": 3.191441059112549, "learning_rate": 4.59e-05, "loss": 4.3388, "step": 158 }, { "epoch": 0.2544, "grad_norm": 3.909501552581787, "learning_rate": 4.62e-05, "loss": 4.2491, "step": 159 }, { "epoch": 0.256, "grad_norm": 3.5245823860168457, "learning_rate": 4.65e-05, "loss": 4.3493, "step": 160 }, { "epoch": 0.2576, "grad_norm": 2.955253839492798, "learning_rate": 4.68e-05, "loss": 4.2599, "step": 161 }, { "epoch": 0.2592, "grad_norm": 3.2109367847442627, "learning_rate": 4.709999999999999e-05, "loss": 4.161, "step": 162 }, { "epoch": 0.2608, "grad_norm": 2.6348652839660645, "learning_rate": 4.7399999999999993e-05, "loss": 4.1815, "step": 163 }, { "epoch": 0.2624, "grad_norm": 2.593337297439575, "learning_rate": 4.7699999999999994e-05, "loss": 4.0801, "step": 164 }, { "epoch": 0.264, "grad_norm": 3.77801251411438, "learning_rate": 4.7999999999999994e-05, "loss": 4.1635, "step": 165 }, { "epoch": 0.2656, "grad_norm": 2.2782046794891357, "learning_rate": 4.8299999999999995e-05, "loss": 4.0901, "step": 166 }, { "epoch": 0.2672, "grad_norm": 1.8724416494369507, "learning_rate": 4.8599999999999995e-05, "loss": 4.0705, "step": 167 }, { "epoch": 0.2688, "grad_norm": 1.7749762535095215, "learning_rate": 4.8899999999999996e-05, "loss": 4.0042, "step": 168 }, { "epoch": 0.2704, "grad_norm": 1.4947004318237305, "learning_rate": 4.9199999999999997e-05, "loss": 4.0104, "step": 169 }, { "epoch": 0.272, "grad_norm": 
3.1837706565856934, "learning_rate": 4.95e-05, "loss": 4.0004, "step": 170 }, { "epoch": 0.2736, "grad_norm": 1.914663553237915, "learning_rate": 4.98e-05, "loss": 3.9682, "step": 171 }, { "epoch": 0.2752, "grad_norm": 1.708173394203186, "learning_rate": 5.01e-05, "loss": 3.9609, "step": 172 }, { "epoch": 0.2768, "grad_norm": 4.2649407386779785, "learning_rate": 5.04e-05, "loss": 3.9917, "step": 173 }, { "epoch": 0.2784, "grad_norm": 1.626754641532898, "learning_rate": 5.07e-05, "loss": 3.9464, "step": 174 }, { "epoch": 0.28, "grad_norm": 1.2504980564117432, "learning_rate": 5.1e-05, "loss": 3.9311, "step": 175 }, { "epoch": 0.2816, "grad_norm": 1.5902968645095825, "learning_rate": 5.13e-05, "loss": 3.9733, "step": 176 }, { "epoch": 0.2832, "grad_norm": 1.160041093826294, "learning_rate": 5.1599999999999994e-05, "loss": 3.9358, "step": 177 }, { "epoch": 0.2848, "grad_norm": 1.2470163106918335, "learning_rate": 5.1899999999999994e-05, "loss": 3.9375, "step": 178 }, { "epoch": 0.2864, "grad_norm": 1.0665581226348877, "learning_rate": 5.2199999999999995e-05, "loss": 3.9701, "step": 179 }, { "epoch": 0.288, "grad_norm": 1.6660319566726685, "learning_rate": 5.2499999999999995e-05, "loss": 3.8957, "step": 180 }, { "epoch": 0.2896, "grad_norm": 1.1934682130813599, "learning_rate": 5.279999999999999e-05, "loss": 3.9527, "step": 181 }, { "epoch": 0.2912, "grad_norm": 2.6113617420196533, "learning_rate": 5.309999999999999e-05, "loss": 3.9198, "step": 182 }, { "epoch": 0.2928, "grad_norm": 1.4902148246765137, "learning_rate": 5.339999999999999e-05, "loss": 3.8525, "step": 183 }, { "epoch": 0.2944, "grad_norm": 1.645369529724121, "learning_rate": 5.369999999999999e-05, "loss": 3.8462, "step": 184 }, { "epoch": 0.296, "grad_norm": 1.7309008836746216, "learning_rate": 5.399999999999999e-05, "loss": 3.9076, "step": 185 }, { "epoch": 0.2976, "grad_norm": 1.9582569599151611, "learning_rate": 5.429999999999999e-05, "loss": 3.9207, "step": 186 }, { "epoch": 0.2992, "grad_norm": 
1.795342206954956, "learning_rate": 5.459999999999999e-05, "loss": 3.9195, "step": 187 }, { "epoch": 0.3008, "grad_norm": 0.6605049967765808, "learning_rate": 5.489999999999999e-05, "loss": 3.8484, "step": 188 }, { "epoch": 0.3024, "grad_norm": 1.3379170894622803, "learning_rate": 5.519999999999999e-05, "loss": 3.8709, "step": 189 }, { "epoch": 0.304, "grad_norm": 2.7950329780578613, "learning_rate": 5.5499999999999994e-05, "loss": 3.8463, "step": 190 }, { "epoch": 0.3056, "grad_norm": 4.116920471191406, "learning_rate": 5.5799999999999994e-05, "loss": 4.0707, "step": 191 }, { "epoch": 0.3072, "grad_norm": 1.450844168663025, "learning_rate": 5.6099999999999995e-05, "loss": 3.9311, "step": 192 }, { "epoch": 0.3088, "grad_norm": 3.2481865882873535, "learning_rate": 5.6399999999999995e-05, "loss": 4.0222, "step": 193 }, { "epoch": 0.3104, "grad_norm": 1.377772331237793, "learning_rate": 5.6699999999999996e-05, "loss": 3.9005, "step": 194 }, { "epoch": 0.312, "grad_norm": 2.0002799034118652, "learning_rate": 5.6999999999999996e-05, "loss": 3.9021, "step": 195 }, { "epoch": 0.3136, "grad_norm": 2.1365325450897217, "learning_rate": 5.73e-05, "loss": 3.8869, "step": 196 }, { "epoch": 0.3152, "grad_norm": 2.5995476245880127, "learning_rate": 5.76e-05, "loss": 3.8563, "step": 197 }, { "epoch": 0.3168, "grad_norm": 3.046848773956299, "learning_rate": 5.79e-05, "loss": 3.9452, "step": 198 }, { "epoch": 0.3184, "grad_norm": 2.9868597984313965, "learning_rate": 5.82e-05, "loss": 3.9164, "step": 199 }, { "epoch": 0.32, "grad_norm": null, "learning_rate": 5.82e-05, "loss": 3.8911, "step": 200 }, { "epoch": 0.3216, "grad_norm": 22.723224639892578, "learning_rate": 5.85e-05, "loss": 4.3327, "step": 201 }, { "epoch": 0.3232, "grad_norm": 14.148520469665527, "learning_rate": 5.88e-05, "loss": 4.0732, "step": 202 }, { "epoch": 0.3248, "grad_norm": 12.10940170288086, "learning_rate": 5.91e-05, "loss": 4.0389, "step": 203 }, { "epoch": 0.3264, "grad_norm": 7.240406513214111, 
"learning_rate": 5.94e-05, "loss": 3.938, "step": 204 }, { "epoch": 0.328, "grad_norm": 4.678879261016846, "learning_rate": 5.97e-05, "loss": 3.911, "step": 205 }, { "epoch": 0.3296, "grad_norm": 2.724951982498169, "learning_rate": 5.9999999999999995e-05, "loss": 3.9436, "step": 206 }, { "epoch": 0.3312, "grad_norm": 4.7506279945373535, "learning_rate": 6.0299999999999995e-05, "loss": 3.8465, "step": 207 }, { "epoch": 0.3328, "grad_norm": 2.6390953063964844, "learning_rate": 6.0599999999999996e-05, "loss": 3.932, "step": 208 }, { "epoch": 0.3344, "grad_norm": 3.661578893661499, "learning_rate": 6.0899999999999996e-05, "loss": 3.8498, "step": 209 }, { "epoch": 0.336, "grad_norm": 2.446004867553711, "learning_rate": 6.12e-05, "loss": 3.9245, "step": 210 }, { "epoch": 0.3376, "grad_norm": 1.197083592414856, "learning_rate": 6.149999999999999e-05, "loss": 4.0049, "step": 211 }, { "epoch": 0.3392, "grad_norm": 3.957880735397339, "learning_rate": 6.18e-05, "loss": 3.8129, "step": 212 }, { "epoch": 0.3408, "grad_norm": 2.243058681488037, "learning_rate": 6.209999999999999e-05, "loss": 3.8419, "step": 213 }, { "epoch": 0.3424, "grad_norm": 0.8457456827163696, "learning_rate": 6.239999999999999e-05, "loss": 3.8015, "step": 214 }, { "epoch": 0.344, "grad_norm": 2.7040092945098877, "learning_rate": 6.269999999999999e-05, "loss": 3.7757, "step": 215 }, { "epoch": 0.3456, "grad_norm": 2.867565155029297, "learning_rate": 6.299999999999999e-05, "loss": 3.748, "step": 216 }, { "epoch": 0.3472, "grad_norm": 9.108602523803711, "learning_rate": 6.33e-05, "loss": 4.0417, "step": 217 }, { "epoch": 0.3488, "grad_norm": 2.7541725635528564, "learning_rate": 6.359999999999999e-05, "loss": 3.7908, "step": 218 }, { "epoch": 0.3504, "grad_norm": 1.1848869323730469, "learning_rate": 6.39e-05, "loss": 3.7637, "step": 219 }, { "epoch": 0.352, "grad_norm": 0.6500396132469177, "learning_rate": 6.419999999999999e-05, "loss": 3.8055, "step": 220 }, { "epoch": 0.3536, "grad_norm": 2.706550359725952, 
"learning_rate": 6.45e-05, "loss": 3.7637, "step": 221 }, { "epoch": 0.3552, "grad_norm": 5.064160346984863, "learning_rate": 6.479999999999999e-05, "loss": 3.7861, "step": 222 }, { "epoch": 0.3568, "grad_norm": 3.20385479927063, "learning_rate": 6.51e-05, "loss": 3.7752, "step": 223 }, { "epoch": 0.3584, "grad_norm": 2.3726119995117188, "learning_rate": 6.539999999999999e-05, "loss": 3.7934, "step": 224 }, { "epoch": 0.36, "grad_norm": 1.985705852508545, "learning_rate": 6.57e-05, "loss": 3.8806, "step": 225 }, { "epoch": 0.3616, "grad_norm": 0.669208288192749, "learning_rate": 6.599999999999999e-05, "loss": 3.7576, "step": 226 }, { "epoch": 0.3632, "grad_norm": 1.7072322368621826, "learning_rate": 6.63e-05, "loss": 3.7382, "step": 227 }, { "epoch": 0.3648, "grad_norm": 2.339816093444824, "learning_rate": 6.659999999999999e-05, "loss": 3.8001, "step": 228 }, { "epoch": 0.3664, "grad_norm": 0.6553944945335388, "learning_rate": 6.69e-05, "loss": 3.7473, "step": 229 }, { "epoch": 0.368, "grad_norm": 1.8117849826812744, "learning_rate": 6.72e-05, "loss": 3.79, "step": 230 }, { "epoch": 0.3696, "grad_norm": 1.0229136943817139, "learning_rate": 6.75e-05, "loss": 3.7968, "step": 231 }, { "epoch": 0.3712, "grad_norm": 1.6037867069244385, "learning_rate": 6.78e-05, "loss": 3.7716, "step": 232 }, { "epoch": 0.3728, "grad_norm": 1.716901183128357, "learning_rate": 6.81e-05, "loss": 3.8464, "step": 233 }, { "epoch": 0.3744, "grad_norm": 9.919891357421875, "learning_rate": 6.84e-05, "loss": 3.8106, "step": 234 }, { "epoch": 0.376, "grad_norm": 1.2543926239013672, "learning_rate": 6.87e-05, "loss": 3.7871, "step": 235 }, { "epoch": 0.3776, "grad_norm": 5.111069202423096, "learning_rate": 6.9e-05, "loss": 3.9449, "step": 236 }, { "epoch": 0.3792, "grad_norm": 0.940678060054779, "learning_rate": 6.93e-05, "loss": 3.784, "step": 237 }, { "epoch": 0.3808, "grad_norm": 0.9248812794685364, "learning_rate": 6.96e-05, "loss": 3.8137, "step": 238 }, { "epoch": 0.3824, "grad_norm": 
0.8821243643760681, "learning_rate": 6.989999999999999e-05, "loss": 3.7626, "step": 239 }, { "epoch": 0.384, "grad_norm": 1.0918103456497192, "learning_rate": 7.02e-05, "loss": 3.7819, "step": 240 }, { "epoch": 0.3856, "grad_norm": 0.6585227251052856, "learning_rate": 7.049999999999999e-05, "loss": 3.7891, "step": 241 }, { "epoch": 0.3872, "grad_norm": 3.0343358516693115, "learning_rate": 7.079999999999999e-05, "loss": 3.7803, "step": 242 }, { "epoch": 0.3888, "grad_norm": 2.1487510204315186, "learning_rate": 7.11e-05, "loss": 3.8404, "step": 243 }, { "epoch": 0.3904, "grad_norm": 1.0203007459640503, "learning_rate": 7.139999999999999e-05, "loss": 3.7602, "step": 244 }, { "epoch": 0.392, "grad_norm": 0.8433353900909424, "learning_rate": 7.17e-05, "loss": 3.7826, "step": 245 }, { "epoch": 0.3936, "grad_norm": 2.8857128620147705, "learning_rate": 7.199999999999999e-05, "loss": 3.7436, "step": 246 }, { "epoch": 0.3952, "grad_norm": 6.611523628234863, "learning_rate": 7.23e-05, "loss": 4.0391, "step": 247 }, { "epoch": 0.3968, "grad_norm": 0.7234116196632385, "learning_rate": 7.259999999999999e-05, "loss": 3.8167, "step": 248 }, { "epoch": 0.3984, "grad_norm": 0.973664402961731, "learning_rate": 7.29e-05, "loss": 3.8963, "step": 249 }, { "epoch": 0.4, "grad_norm": 1.6993762254714966, "learning_rate": 7.319999999999999e-05, "loss": 3.9033, "step": 250 }, { "epoch": 0.4016, "grad_norm": 17.571664810180664, "learning_rate": 7.35e-05, "loss": 4.2596, "step": 251 }, { "epoch": 0.4032, "grad_norm": 11.271060943603516, "learning_rate": 7.379999999999999e-05, "loss": 4.1186, "step": 252 }, { "epoch": 0.4048, "grad_norm": 8.646568298339844, "learning_rate": 7.41e-05, "loss": 4.013, "step": 253 }, { "epoch": 0.4064, "grad_norm": 2.02486252784729, "learning_rate": 7.439999999999999e-05, "loss": 3.8974, "step": 254 }, { "epoch": 0.408, "grad_norm": 3.4109764099121094, "learning_rate": 7.47e-05, "loss": 3.8285, "step": 255 }, { "epoch": 0.4096, "grad_norm": 3.8505735397338867, 
"learning_rate": 7.5e-05, "loss": 3.9306, "step": 256 }, { "epoch": 0.4112, "grad_norm": 7.018677234649658, "learning_rate": 7.529999999999999e-05, "loss": 3.8432, "step": 257 }, { "epoch": 0.4128, "grad_norm": 4.351247310638428, "learning_rate": 7.56e-05, "loss": 3.8534, "step": 258 }, { "epoch": 0.4144, "grad_norm": 5.365427494049072, "learning_rate": 7.589999999999999e-05, "loss": 3.8408, "step": 259 }, { "epoch": 0.416, "grad_norm": 3.984861135482788, "learning_rate": 7.62e-05, "loss": 3.7589, "step": 260 }, { "epoch": 0.4176, "grad_norm": 1.2847763299942017, "learning_rate": 7.649999999999999e-05, "loss": 3.748, "step": 261 }, { "epoch": 0.4192, "grad_norm": 4.559200286865234, "learning_rate": 7.68e-05, "loss": 3.8104, "step": 262 }, { "epoch": 0.4208, "grad_norm": 4.230029106140137, "learning_rate": 7.709999999999999e-05, "loss": 3.7676, "step": 263 }, { "epoch": 0.4224, "grad_norm": 6.13962984085083, "learning_rate": 7.74e-05, "loss": 3.7336, "step": 264 }, { "epoch": 0.424, "grad_norm": 4.625703811645508, "learning_rate": 7.769999999999999e-05, "loss": 3.744, "step": 265 }, { "epoch": 0.4256, "grad_norm": 4.050301551818848, "learning_rate": 7.8e-05, "loss": 3.7662, "step": 266 }, { "epoch": 0.4272, "grad_norm": 3.0125648975372314, "learning_rate": 7.829999999999999e-05, "loss": 3.7469, "step": 267 }, { "epoch": 0.4288, "grad_norm": 0.6710224747657776, "learning_rate": 7.86e-05, "loss": 3.7913, "step": 268 }, { "epoch": 0.4304, "grad_norm": 0.7062709927558899, "learning_rate": 7.89e-05, "loss": 3.7765, "step": 269 }, { "epoch": 0.432, "grad_norm": 5.028995990753174, "learning_rate": 7.92e-05, "loss": 3.7567, "step": 270 }, { "epoch": 0.4336, "grad_norm": 4.44848108291626, "learning_rate": 7.95e-05, "loss": 3.7673, "step": 271 }, { "epoch": 0.4352, "grad_norm": 4.467078685760498, "learning_rate": 7.98e-05, "loss": 3.7462, "step": 272 }, { "epoch": 0.4368, "grad_norm": 3.1866374015808105, "learning_rate": 8.01e-05, "loss": 3.715, "step": 273 }, { "epoch": 
0.4384, "grad_norm": 2.605476140975952, "learning_rate": 8.04e-05, "loss": 3.6984, "step": 274 }, { "epoch": 0.44, "grad_norm": 0.6094714999198914, "learning_rate": 8.07e-05, "loss": 3.7325, "step": 275 }, { "epoch": 0.4416, "grad_norm": 6.599428653717041, "learning_rate": 8.1e-05, "loss": 3.8765, "step": 276 }, { "epoch": 0.4432, "grad_norm": 2.6780223846435547, "learning_rate": 8.13e-05, "loss": 3.7577, "step": 277 }, { "epoch": 0.4448, "grad_norm": 3.663605213165283, "learning_rate": 8.16e-05, "loss": 3.8035, "step": 278 }, { "epoch": 0.4464, "grad_norm": 2.812157392501831, "learning_rate": 8.19e-05, "loss": 3.749, "step": 279 }, { "epoch": 0.448, "grad_norm": 2.2692039012908936, "learning_rate": 8.22e-05, "loss": 3.7149, "step": 280 }, { "epoch": 0.4496, "grad_norm": 1.1938503980636597, "learning_rate": 8.25e-05, "loss": 3.7246, "step": 281 }, { "epoch": 0.4512, "grad_norm": 1.3016897439956665, "learning_rate": 8.28e-05, "loss": 3.6932, "step": 282 }, { "epoch": 0.4528, "grad_norm": 2.0602409839630127, "learning_rate": 8.31e-05, "loss": 3.7896, "step": 283 }, { "epoch": 0.4544, "grad_norm": 1.2453322410583496, "learning_rate": 8.34e-05, "loss": 3.7896, "step": 284 }, { "epoch": 0.456, "grad_norm": 0.7699930667877197, "learning_rate": 8.37e-05, "loss": 3.7406, "step": 285 }, { "epoch": 0.4576, "grad_norm": 0.9949842691421509, "learning_rate": 8.4e-05, "loss": 3.7951, "step": 286 }, { "epoch": 0.4592, "grad_norm": 1.2708395719528198, "learning_rate": 8.43e-05, "loss": 3.7142, "step": 287 }, { "epoch": 0.4608, "grad_norm": 1.6578696966171265, "learning_rate": 8.459999999999998e-05, "loss": 3.7042, "step": 288 }, { "epoch": 0.4624, "grad_norm": 0.9027276635169983, "learning_rate": 8.489999999999999e-05, "loss": 3.6875, "step": 289 }, { "epoch": 0.464, "grad_norm": 1.3110026121139526, "learning_rate": 8.519999999999998e-05, "loss": 3.7219, "step": 290 }, { "epoch": 0.4656, "grad_norm": 0.9840269088745117, "learning_rate": 8.549999999999999e-05, "loss": 3.7555, 
"step": 291 }, { "epoch": 0.4672, "grad_norm": 1.4040346145629883, "learning_rate": 8.579999999999998e-05, "loss": 3.7981, "step": 292 }, { "epoch": 0.4688, "grad_norm": 1.0543975830078125, "learning_rate": 8.609999999999999e-05, "loss": 3.7075, "step": 293 }, { "epoch": 0.4704, "grad_norm": 0.9345111846923828, "learning_rate": 8.639999999999999e-05, "loss": 3.677, "step": 294 }, { "epoch": 0.472, "grad_norm": 1.007042646408081, "learning_rate": 8.669999999999998e-05, "loss": 3.7533, "step": 295 }, { "epoch": 0.4736, "grad_norm": 1.7284626960754395, "learning_rate": 8.699999999999999e-05, "loss": 3.6897, "step": 296 }, { "epoch": 0.4752, "grad_norm": 2.507981538772583, "learning_rate": 8.729999999999998e-05, "loss": 3.8189, "step": 297 }, { "epoch": 0.4768, "grad_norm": 2.26454496383667, "learning_rate": 8.759999999999999e-05, "loss": 3.8095, "step": 298 }, { "epoch": 0.4784, "grad_norm": 1.5712822675704956, "learning_rate": 8.789999999999998e-05, "loss": 3.8321, "step": 299 }, { "epoch": 0.48, "grad_norm": 1.8837485313415527, "learning_rate": 8.819999999999999e-05, "loss": 3.9162, "step": 300 }, { "epoch": 0.4816, "grad_norm": 14.770750999450684, "learning_rate": 8.849999999999998e-05, "loss": 4.0945, "step": 301 }, { "epoch": 0.4832, "grad_norm": 14.976526260375977, "learning_rate": 8.879999999999999e-05, "loss": 4.0664, "step": 302 }, { "epoch": 0.4848, "grad_norm": 9.496882438659668, "learning_rate": 8.909999999999998e-05, "loss": 4.0088, "step": 303 }, { "epoch": 0.4864, "grad_norm": 13.879667282104492, "learning_rate": 8.939999999999999e-05, "loss": 4.1825, "step": 304 }, { "epoch": 0.488, "grad_norm": 6.049519062042236, "learning_rate": 8.969999999999998e-05, "loss": 3.883, "step": 305 }, { "epoch": 0.4896, "grad_norm": 1.975704312324524, "learning_rate": 8.999999999999999e-05, "loss": 3.8272, "step": 306 }, { "epoch": 0.4912, "grad_norm": 5.8130598068237305, "learning_rate": 9.029999999999999e-05, "loss": 3.8001, "step": 307 }, { "epoch": 0.4928, 
"grad_norm": 3.5878612995147705, "learning_rate": 9.059999999999999e-05, "loss": 3.8353, "step": 308 }, { "epoch": 0.4944, "grad_norm": 4.1221513748168945, "learning_rate": 9.089999999999999e-05, "loss": 3.9413, "step": 309 }, { "epoch": 0.496, "grad_norm": 5.489438533782959, "learning_rate": 9.12e-05, "loss": 3.7904, "step": 310 }, { "epoch": 0.4976, "grad_norm": 1.6620556116104126, "learning_rate": 9.149999999999999e-05, "loss": 3.7937, "step": 311 }, { "epoch": 0.4992, "grad_norm": 1.13966703414917, "learning_rate": 9.18e-05, "loss": 3.7336, "step": 312 }, { "epoch": 0.5008, "grad_norm": 1.5242611169815063, "learning_rate": 9.209999999999999e-05, "loss": 3.7161, "step": 313 }, { "epoch": 0.5024, "grad_norm": 3.9798734188079834, "learning_rate": 9.24e-05, "loss": 3.7, "step": 314 }, { "epoch": 0.504, "grad_norm": 7.939405918121338, "learning_rate": 9.269999999999999e-05, "loss": 3.7537, "step": 315 }, { "epoch": 0.5056, "grad_norm": 4.0154709815979, "learning_rate": 9.3e-05, "loss": 3.708, "step": 316 }, { "epoch": 0.5072, "grad_norm": 4.138357639312744, "learning_rate": 9.329999999999999e-05, "loss": 3.7166, "step": 317 }, { "epoch": 0.5088, "grad_norm": 8.64471435546875, "learning_rate": 9.36e-05, "loss": 4.0627, "step": 318 }, { "epoch": 0.5104, "grad_norm": 0.7231702208518982, "learning_rate": 9.389999999999999e-05, "loss": 3.7427, "step": 319 }, { "epoch": 0.512, "grad_norm": 2.371631622314453, "learning_rate": 9.419999999999999e-05, "loss": 3.7203, "step": 320 }, { "epoch": 0.5136, "grad_norm": 4.284900188446045, "learning_rate": 9.449999999999999e-05, "loss": 3.7413, "step": 321 }, { "epoch": 0.5152, "grad_norm": 3.0372443199157715, "learning_rate": 9.479999999999999e-05, "loss": 3.6992, "step": 322 }, { "epoch": 0.5168, "grad_norm": 1.9789845943450928, "learning_rate": 9.51e-05, "loss": 3.7184, "step": 323 }, { "epoch": 0.5184, "grad_norm": 1.624227523803711, "learning_rate": 9.539999999999999e-05, "loss": 3.6749, "step": 324 }, { "epoch": 0.52, 
"grad_norm": 1.5696678161621094, "learning_rate": 9.57e-05, "loss": 3.7581, "step": 325 }, { "epoch": 0.5216, "grad_norm": 2.7740790843963623, "learning_rate": 9.599999999999999e-05, "loss": 3.6773, "step": 326 }, { "epoch": 0.5232, "grad_norm": 2.1769227981567383, "learning_rate": 9.63e-05, "loss": 3.6821, "step": 327 }, { "epoch": 0.5248, "grad_norm": 3.454484224319458, "learning_rate": 9.659999999999999e-05, "loss": 3.6977, "step": 328 }, { "epoch": 0.5264, "grad_norm": 1.035311222076416, "learning_rate": 9.69e-05, "loss": 3.6816, "step": 329 }, { "epoch": 0.528, "grad_norm": 1.0064358711242676, "learning_rate": 9.719999999999999e-05, "loss": 3.7105, "step": 330 }, { "epoch": 0.5296, "grad_norm": 2.302251100540161, "learning_rate": 9.75e-05, "loss": 3.7867, "step": 331 }, { "epoch": 0.5312, "grad_norm": 2.8935694694519043, "learning_rate": 9.779999999999999e-05, "loss": 3.7186, "step": 332 }, { "epoch": 0.5328, "grad_norm": 4.943471908569336, "learning_rate": 9.81e-05, "loss": 3.6946, "step": 333 }, { "epoch": 0.5344, "grad_norm": 2.8398258686065674, "learning_rate": 9.839999999999999e-05, "loss": 3.7091, "step": 334 }, { "epoch": 0.536, "grad_norm": 3.0762977600097656, "learning_rate": 9.87e-05, "loss": 3.7574, "step": 335 }, { "epoch": 0.5376, "grad_norm": 1.8813797235488892, "learning_rate": 9.9e-05, "loss": 3.7118, "step": 336 }, { "epoch": 0.5392, "grad_norm": 0.8849917054176331, "learning_rate": 9.93e-05, "loss": 3.77, "step": 337 }, { "epoch": 0.5408, "grad_norm": 1.4980673789978027, "learning_rate": 9.96e-05, "loss": 3.6983, "step": 338 }, { "epoch": 0.5424, "grad_norm": 1.593652367591858, "learning_rate": 9.99e-05, "loss": 3.6434, "step": 339 }, { "epoch": 0.544, "grad_norm": 1.0899137258529663, "learning_rate": 0.0001002, "loss": 3.6421, "step": 340 }, { "epoch": 0.5456, "grad_norm": 0.6649819016456604, "learning_rate": 0.0001005, "loss": 3.6891, "step": 341 }, { "epoch": 0.5472, "grad_norm": 1.5821571350097656, "learning_rate": 0.0001008, "loss": 
3.794, "step": 342 }, { "epoch": 0.5488, "grad_norm": 2.7996299266815186, "learning_rate": 0.0001011, "loss": 3.6779, "step": 343 }, { "epoch": 0.5504, "grad_norm": 2.5575685501098633, "learning_rate": 0.0001014, "loss": 3.7879, "step": 344 }, { "epoch": 0.552, "grad_norm": 1.2596614360809326, "learning_rate": 0.00010169999999999999, "loss": 3.7208, "step": 345 }, { "epoch": 0.5536, "grad_norm": 0.8037718534469604, "learning_rate": 0.000102, "loss": 3.6811, "step": 346 }, { "epoch": 0.5552, "grad_norm": 1.3349064588546753, "learning_rate": 0.00010229999999999999, "loss": 3.6959, "step": 347 }, { "epoch": 0.5568, "grad_norm": 1.269572138786316, "learning_rate": 0.0001026, "loss": 3.7447, "step": 348 }, { "epoch": 0.5584, "grad_norm": 1.022356629371643, "learning_rate": 0.0001029, "loss": 3.6698, "step": 349 }, { "epoch": 0.56, "grad_norm": 1.5080015659332275, "learning_rate": 0.00010319999999999999, "loss": 3.7888, "step": 350 }, { "epoch": 0.5616, "grad_norm": 39.285152435302734, "learning_rate": 0.00010349999999999998, "loss": 5.0541, "step": 351 }, { "epoch": 0.5632, "grad_norm": 9.576460838317871, "learning_rate": 0.00010379999999999999, "loss": 4.014, "step": 352 }, { "epoch": 0.5648, "grad_norm": 6.356123447418213, "learning_rate": 0.00010409999999999998, "loss": 3.9312, "step": 353 }, { "epoch": 0.5664, "grad_norm": 2.139646291732788, "learning_rate": 0.00010439999999999999, "loss": 3.7221, "step": 354 }, { "epoch": 0.568, "grad_norm": 1.8156242370605469, "learning_rate": 0.00010469999999999998, "loss": 3.8395, "step": 355 }, { "epoch": 0.5696, "grad_norm": 4.785361289978027, "learning_rate": 0.00010499999999999999, "loss": 3.719, "step": 356 }, { "epoch": 0.5712, "grad_norm": 3.600017786026001, "learning_rate": 0.00010529999999999998, "loss": 3.7428, "step": 357 }, { "epoch": 0.5728, "grad_norm": 2.4187095165252686, "learning_rate": 0.00010559999999999998, "loss": 3.6832, "step": 358 }, { "epoch": 0.5744, "grad_norm": 1.5843887329101562, "learning_rate": 
0.00010589999999999999, "loss": 3.6504, "step": 359 }, { "epoch": 0.576, "grad_norm": 1.5045654773712158, "learning_rate": 0.00010619999999999998, "loss": 3.6816, "step": 360 }, { "epoch": 0.5776, "grad_norm": 2.58827543258667, "learning_rate": 0.00010649999999999999, "loss": 3.642, "step": 361 }, { "epoch": 0.5792, "grad_norm": 2.5386884212493896, "learning_rate": 0.00010679999999999998, "loss": 3.6578, "step": 362 }, { "epoch": 0.5808, "grad_norm": 2.9344706535339355, "learning_rate": 0.00010709999999999999, "loss": 3.6694, "step": 363 }, { "epoch": 0.5824, "grad_norm": 2.340221643447876, "learning_rate": 0.00010739999999999998, "loss": 3.6504, "step": 364 }, { "epoch": 0.584, "grad_norm": 2.816999912261963, "learning_rate": 0.00010769999999999999, "loss": 3.6654, "step": 365 }, { "epoch": 0.5856, "grad_norm": 1.5071390867233276, "learning_rate": 0.00010799999999999998, "loss": 3.6406, "step": 366 }, { "epoch": 0.5872, "grad_norm": 0.7593219876289368, "learning_rate": 0.00010829999999999999, "loss": 3.6683, "step": 367 }, { "epoch": 0.5888, "grad_norm": 2.646967887878418, "learning_rate": 0.00010859999999999998, "loss": 3.6844, "step": 368 }, { "epoch": 0.5904, "grad_norm": 2.8628735542297363, "learning_rate": 0.00010889999999999999, "loss": 3.6159, "step": 369 }, { "epoch": 0.592, "grad_norm": 2.4796457290649414, "learning_rate": 0.00010919999999999998, "loss": 3.6951, "step": 370 }, { "epoch": 0.5936, "grad_norm": 1.1962988376617432, "learning_rate": 0.00010949999999999999, "loss": 3.5725, "step": 371 }, { "epoch": 0.5952, "grad_norm": 0.9031145572662354, "learning_rate": 0.00010979999999999999, "loss": 3.5847, "step": 372 }, { "epoch": 0.5968, "grad_norm": 2.3092095851898193, "learning_rate": 0.00011009999999999999, "loss": 3.6181, "step": 373 }, { "epoch": 0.5984, "grad_norm": 1.1398162841796875, "learning_rate": 0.00011039999999999999, "loss": 3.6774, "step": 374 }, { "epoch": 0.6, "grad_norm": 1.0708636045455933, "learning_rate": 0.0001107, "loss": 3.5872, 
"step": 375 }, { "epoch": 0.6016, "grad_norm": 0.873319149017334, "learning_rate": 0.00011099999999999999, "loss": 3.6512, "step": 376 }, { "epoch": 0.6032, "grad_norm": 0.8766927123069763, "learning_rate": 0.0001113, "loss": 3.6434, "step": 377 }, { "epoch": 0.6048, "grad_norm": 0.7983845472335815, "learning_rate": 0.00011159999999999999, "loss": 3.5572, "step": 378 }, { "epoch": 0.6064, "grad_norm": 6.645163059234619, "learning_rate": 0.0001119, "loss": 3.7195, "step": 379 }, { "epoch": 0.608, "grad_norm": 4.845424652099609, "learning_rate": 0.00011219999999999999, "loss": 3.7342, "step": 380 }, { "epoch": 0.6096, "grad_norm": 0.7528507709503174, "learning_rate": 0.0001125, "loss": 3.5633, "step": 381 }, { "epoch": 0.6112, "grad_norm": 0.69414883852005, "learning_rate": 0.00011279999999999999, "loss": 3.6274, "step": 382 }, { "epoch": 0.6128, "grad_norm": 2.0266685485839844, "learning_rate": 0.00011309999999999998, "loss": 3.5491, "step": 383 }, { "epoch": 0.6144, "grad_norm": 0.7420620918273926, "learning_rate": 0.00011339999999999999, "loss": 3.6293, "step": 384 }, { "epoch": 0.616, "grad_norm": 1.1609050035476685, "learning_rate": 0.00011369999999999999, "loss": 3.5036, "step": 385 }, { "epoch": 0.6176, "grad_norm": 1.6696407794952393, "learning_rate": 0.00011399999999999999, "loss": 3.5909, "step": 386 }, { "epoch": 0.6192, "grad_norm": 1.1470685005187988, "learning_rate": 0.00011429999999999999, "loss": 3.6343, "step": 387 }, { "epoch": 0.6208, "grad_norm": 1.3622722625732422, "learning_rate": 0.0001146, "loss": 3.5755, "step": 388 }, { "epoch": 0.6224, "grad_norm": 1.2317267656326294, "learning_rate": 0.00011489999999999999, "loss": 3.5538, "step": 389 }, { "epoch": 0.624, "grad_norm": 1.1414676904678345, "learning_rate": 0.0001152, "loss": 3.5555, "step": 390 }, { "epoch": 0.6256, "grad_norm": 1.998960018157959, "learning_rate": 0.00011549999999999999, "loss": 3.5465, "step": 391 }, { "epoch": 0.6272, "grad_norm": 1.4650264978408813, "learning_rate": 
0.0001158, "loss": 3.5777, "step": 392 }, { "epoch": 0.6288, "grad_norm": 1.5700796842575073, "learning_rate": 0.00011609999999999999, "loss": 3.6034, "step": 393 }, { "epoch": 0.6304, "grad_norm": 2.38299298286438, "learning_rate": 0.0001164, "loss": 3.5582, "step": 394 }, { "epoch": 0.632, "grad_norm": 1.2898112535476685, "learning_rate": 0.00011669999999999999, "loss": 3.4869, "step": 395 }, { "epoch": 0.6336, "grad_norm": 1.2601486444473267, "learning_rate": 0.000117, "loss": 3.5423, "step": 396 }, { "epoch": 0.6352, "grad_norm": 1.907885193824768, "learning_rate": 0.00011729999999999999, "loss": 3.5019, "step": 397 }, { "epoch": 0.6368, "grad_norm": 1.2280569076538086, "learning_rate": 0.0001176, "loss": 3.6299, "step": 398 }, { "epoch": 0.6384, "grad_norm": 2.214331865310669, "learning_rate": 0.00011789999999999999, "loss": 3.5979, "step": 399 }, { "epoch": 0.64, "grad_norm": NaN, "learning_rate": 0.00011789999999999999, "loss": 3.4218, "step": 400 }, { "epoch": 0.6416, "grad_norm": 38.77112579345703, "learning_rate": 0.0001182, "loss": 4.5562, "step": 401 }, { "epoch": 0.6432, "grad_norm": 8.848291397094727, "learning_rate": 0.0001185, "loss": 3.7699, "step": 402 }, { "epoch": 0.6448, "grad_norm": 6.007197856903076, "learning_rate": 0.0001188, "loss": 3.539, "step": 403 }, { "epoch": 0.6464, "grad_norm": 3.0180368423461914, "learning_rate": 0.0001191, "loss": 3.5382, "step": 404 }, { "epoch": 0.648, "grad_norm": 2.071746587753296, "learning_rate": 0.0001194, "loss": 3.5857, "step": 405 }, { "epoch": 0.6496, "grad_norm": 4.427801132202148, "learning_rate": 0.0001197, "loss": 3.526, "step": 406 }, { "epoch": 0.6512, "grad_norm": 5.680927753448486, "learning_rate": 0.00011999999999999999, "loss": 3.4296, "step": 407 }, { "epoch": 0.6528, "grad_norm": 2.7837042808532715, "learning_rate": 0.0001203, "loss": 3.4657, "step": 408 }, { "epoch": 0.6544, "grad_norm": 4.605573654174805, "learning_rate": 0.00012059999999999999, "loss": 3.4279, "step": 409 }, { "epoch": 
0.656, "grad_norm": 1.96554696559906, "learning_rate": 0.0001209, "loss": 3.3618, "step": 410 }, { "epoch": 0.6576, "grad_norm": 5.76222038269043, "learning_rate": 0.00012119999999999999, "loss": 3.4163, "step": 411 }, { "epoch": 0.6592, "grad_norm": 4.640344619750977, "learning_rate": 0.0001215, "loss": 3.4429, "step": 412 }, { "epoch": 0.6608, "grad_norm": 4.301933288574219, "learning_rate": 0.00012179999999999999, "loss": 3.3255, "step": 413 }, { "epoch": 0.6624, "grad_norm": 3.781334638595581, "learning_rate": 0.00012209999999999999, "loss": 3.3261, "step": 414 }, { "epoch": 0.664, "grad_norm": 3.663053035736084, "learning_rate": 0.0001224, "loss": 3.2962, "step": 415 }, { "epoch": 0.6656, "grad_norm": 3.2776567935943604, "learning_rate": 0.00012269999999999997, "loss": 3.333, "step": 416 }, { "epoch": 0.6672, "grad_norm": 1.000927209854126, "learning_rate": 0.00012299999999999998, "loss": 3.2321, "step": 417 }, { "epoch": 0.6688, "grad_norm": 1.561220407485962, "learning_rate": 0.0001233, "loss": 3.3121, "step": 418 }, { "epoch": 0.6704, "grad_norm": 0.8714520931243896, "learning_rate": 0.0001236, "loss": 3.3394, "step": 419 }, { "epoch": 0.672, "grad_norm": 1.1457229852676392, "learning_rate": 0.00012389999999999998, "loss": 3.1645, "step": 420 }, { "epoch": 0.6736, "grad_norm": 2.054020881652832, "learning_rate": 0.00012419999999999998, "loss": 3.2115, "step": 421 }, { "epoch": 0.6752, "grad_norm": 3.8146936893463135, "learning_rate": 0.0001245, "loss": 3.3334, "step": 422 }, { "epoch": 0.6768, "grad_norm": 2.3825631141662598, "learning_rate": 0.00012479999999999997, "loss": 3.2264, "step": 423 }, { "epoch": 0.6784, "grad_norm": 1.282517671585083, "learning_rate": 0.00012509999999999998, "loss": 3.3354, "step": 424 }, { "epoch": 0.68, "grad_norm": 1.5535123348236084, "learning_rate": 0.00012539999999999999, "loss": 3.1819, "step": 425 }, { "epoch": 0.6816, "grad_norm": 1.8400110006332397, "learning_rate": 0.0001257, "loss": 3.1989, "step": 426 }, { "epoch": 
0.6832, "grad_norm": 1.3851298093795776, "learning_rate": 0.00012599999999999997, "loss": 3.2336, "step": 427 }, { "epoch": 0.6848, "grad_norm": 1.884459376335144, "learning_rate": 0.00012629999999999998, "loss": 3.2123, "step": 428 }, { "epoch": 0.6864, "grad_norm": 1.7640012502670288, "learning_rate": 0.0001266, "loss": 3.0558, "step": 429 }, { "epoch": 0.688, "grad_norm": 2.564265489578247, "learning_rate": 0.0001269, "loss": 3.0314, "step": 430 }, { "epoch": 0.6896, "grad_norm": 1.8793052434921265, "learning_rate": 0.00012719999999999997, "loss": 3.0916, "step": 431 }, { "epoch": 0.6912, "grad_norm": 1.3174560070037842, "learning_rate": 0.00012749999999999998, "loss": 3.0977, "step": 432 }, { "epoch": 0.6928, "grad_norm": 0.9135323166847229, "learning_rate": 0.0001278, "loss": 3.1459, "step": 433 }, { "epoch": 0.6944, "grad_norm": 1.05746591091156, "learning_rate": 0.0001281, "loss": 3.1823, "step": 434 }, { "epoch": 0.696, "grad_norm": 1.2425645589828491, "learning_rate": 0.00012839999999999998, "loss": 2.9603, "step": 435 }, { "epoch": 0.6976, "grad_norm": 1.2454054355621338, "learning_rate": 0.00012869999999999998, "loss": 2.9414, "step": 436 }, { "epoch": 0.6992, "grad_norm": 0.9464673399925232, "learning_rate": 0.000129, "loss": 3.1432, "step": 437 }, { "epoch": 0.7008, "grad_norm": 1.5856995582580566, "learning_rate": 0.0001293, "loss": 3.0063, "step": 438 }, { "epoch": 0.7024, "grad_norm": 1.043485403060913, "learning_rate": 0.00012959999999999998, "loss": 2.9146, "step": 439 }, { "epoch": 0.704, "grad_norm": 1.240867257118225, "learning_rate": 0.00012989999999999999, "loss": 2.9979, "step": 440 }, { "epoch": 0.7056, "grad_norm": 1.7289670705795288, "learning_rate": 0.0001302, "loss": 2.8699, "step": 441 }, { "epoch": 0.7072, "grad_norm": 1.728317141532898, "learning_rate": 0.0001305, "loss": 2.7802, "step": 442 }, { "epoch": 0.7088, "grad_norm": 0.960502028465271, "learning_rate": 0.00013079999999999998, "loss": 2.9141, "step": 443 }, { "epoch": 0.7104, 
"grad_norm": 2.093698501586914, "learning_rate": 0.0001311, "loss": 3.1318, "step": 444 }, { "epoch": 0.712, "grad_norm": 1.6515812873840332, "learning_rate": 0.0001314, "loss": 2.9467, "step": 445 }, { "epoch": 0.7136, "grad_norm": 1.4129968881607056, "learning_rate": 0.00013169999999999998, "loss": 2.7909, "step": 446 }, { "epoch": 0.7152, "grad_norm": 1.5885038375854492, "learning_rate": 0.00013199999999999998, "loss": 3.1262, "step": 447 }, { "epoch": 0.7168, "grad_norm": 1.222842812538147, "learning_rate": 0.0001323, "loss": 3.2029, "step": 448 }, { "epoch": 0.7184, "grad_norm": 1.3282477855682373, "learning_rate": 0.0001326, "loss": 3.0013, "step": 449 }, { "epoch": 0.72, "grad_norm": NaN, "learning_rate": 0.0001326, "loss": 3.4352, "step": 450 }, { "epoch": 0.7216, "grad_norm": 27.799081802368164, "learning_rate": 0.00013289999999999998, "loss": 4.3657, "step": 451 }, { "epoch": 0.7232, "grad_norm": 5.403924465179443, "learning_rate": 0.00013319999999999999, "loss": 2.9244, "step": 452 }, { "epoch": 0.7248, "grad_norm": 3.8071448802948, "learning_rate": 0.0001335, "loss": 2.927, "step": 453 }, { "epoch": 0.7264, "grad_norm": 3.504509210586548, "learning_rate": 0.0001338, "loss": 2.9719, "step": 454 }, { "epoch": 0.728, "grad_norm": 3.500847578048706, "learning_rate": 0.00013409999999999998, "loss": 3.0386, "step": 455 }, { "epoch": 0.7296, "grad_norm": 3.5392863750457764, "learning_rate": 0.0001344, "loss": 2.9468, "step": 456 }, { "epoch": 0.7312, "grad_norm": 5.1045732498168945, "learning_rate": 0.0001347, "loss": 2.7885, "step": 457 }, { "epoch": 0.7328, "grad_norm": 6.027789115905762, "learning_rate": 0.000135, "loss": 2.9067, "step": 458 }, { "epoch": 0.7344, "grad_norm": 5.094452381134033, "learning_rate": 0.00013529999999999998, "loss": 2.8379, "step": 459 }, { "epoch": 0.736, "grad_norm": 2.6457953453063965, "learning_rate": 0.0001356, "loss": 2.8325, "step": 460 }, { "epoch": 0.7376, "grad_norm": 1.5734143257141113, "learning_rate": 0.0001359, 
"loss": 2.8004, "step": 461 }, { "epoch": 0.7392, "grad_norm": 2.7408978939056396, "learning_rate": 0.0001362, "loss": 2.5294, "step": 462 }, { "epoch": 0.7408, "grad_norm": 3.2462551593780518, "learning_rate": 0.00013649999999999998, "loss": 2.5774, "step": 463 }, { "epoch": 0.7424, "grad_norm": 5.122827529907227, "learning_rate": 0.0001368, "loss": 2.5412, "step": 464 }, { "epoch": 0.744, "grad_norm": 6.828001976013184, "learning_rate": 0.0001371, "loss": 2.5768, "step": 465 }, { "epoch": 0.7456, "grad_norm": 5.996628761291504, "learning_rate": 0.0001374, "loss": 2.5803, "step": 466 }, { "epoch": 0.7472, "grad_norm": 3.842134714126587, "learning_rate": 0.00013769999999999999, "loss": 2.3747, "step": 467 }, { "epoch": 0.7488, "grad_norm": 1.4524292945861816, "learning_rate": 0.000138, "loss": 2.4186, "step": 468 }, { "epoch": 0.7504, "grad_norm": 1.6084707975387573, "learning_rate": 0.0001383, "loss": 2.3012, "step": 469 }, { "epoch": 0.752, "grad_norm": 2.121351718902588, "learning_rate": 0.0001386, "loss": 2.442, "step": 470 }, { "epoch": 0.7536, "grad_norm": 1.5034464597702026, "learning_rate": 0.0001389, "loss": 2.2728, "step": 471 }, { "epoch": 0.7552, "grad_norm": 1.2867931127548218, "learning_rate": 0.0001392, "loss": 2.3669, "step": 472 }, { "epoch": 0.7568, "grad_norm": 1.8455201387405396, "learning_rate": 0.0001395, "loss": 2.3831, "step": 473 }, { "epoch": 0.7584, "grad_norm": 1.4569259881973267, "learning_rate": 0.00013979999999999998, "loss": 2.6884, "step": 474 }, { "epoch": 0.76, "grad_norm": 1.9550684690475464, "learning_rate": 0.0001401, "loss": 2.4852, "step": 475 }, { "epoch": 0.7616, "grad_norm": 2.876927137374878, "learning_rate": 0.0001404, "loss": 2.5227, "step": 476 }, { "epoch": 0.7632, "grad_norm": 1.2651807069778442, "learning_rate": 0.00014069999999999998, "loss": 2.5994, "step": 477 }, { "epoch": 0.7648, "grad_norm": 1.26189386844635, "learning_rate": 0.00014099999999999998, "loss": 2.2933, "step": 478 }, { "epoch": 0.7664, 
"grad_norm": 1.3137550354003906, "learning_rate": 0.0001413, "loss": 2.3087, "step": 479 }, { "epoch": 0.768, "grad_norm": 1.7220642566680908, "learning_rate": 0.00014159999999999997, "loss": 2.4592, "step": 480 }, { "epoch": 0.7696, "grad_norm": 1.3261381387710571, "learning_rate": 0.00014189999999999998, "loss": 2.0056, "step": 481 }, { "epoch": 0.7712, "grad_norm": 2.571230173110962, "learning_rate": 0.0001422, "loss": 2.2005, "step": 482 }, { "epoch": 0.7728, "grad_norm": 1.9342719316482544, "learning_rate": 0.0001425, "loss": 2.444, "step": 483 }, { "epoch": 0.7744, "grad_norm": 1.9060297012329102, "learning_rate": 0.00014279999999999997, "loss": 2.5657, "step": 484 }, { "epoch": 0.776, "grad_norm": 1.7057262659072876, "learning_rate": 0.00014309999999999998, "loss": 2.2488, "step": 485 }, { "epoch": 0.7776, "grad_norm": 1.5254745483398438, "learning_rate": 0.0001434, "loss": 2.3053, "step": 486 }, { "epoch": 0.7792, "grad_norm": 1.2841426134109497, "learning_rate": 0.00014369999999999997, "loss": 2.7327, "step": 487 }, { "epoch": 0.7808, "grad_norm": 1.2939062118530273, "learning_rate": 0.00014399999999999998, "loss": 2.1748, "step": 488 }, { "epoch": 0.7824, "grad_norm": 1.041858434677124, "learning_rate": 0.00014429999999999998, "loss": 2.2685, "step": 489 }, { "epoch": 0.784, "grad_norm": 1.1529954671859741, "learning_rate": 0.0001446, "loss": 2.6499, "step": 490 }, { "epoch": 0.7856, "grad_norm": 1.2997585535049438, "learning_rate": 0.00014489999999999997, "loss": 2.4287, "step": 491 }, { "epoch": 0.7872, "grad_norm": 1.8214664459228516, "learning_rate": 0.00014519999999999998, "loss": 2.4024, "step": 492 }, { "epoch": 0.7888, "grad_norm": 2.8641598224639893, "learning_rate": 0.00014549999999999999, "loss": 2.3568, "step": 493 }, { "epoch": 0.7904, "grad_norm": 2.793945789337158, "learning_rate": 0.0001458, "loss": 2.635, "step": 494 }, { "epoch": 0.792, "grad_norm": 1.2558726072311401, "learning_rate": 0.00014609999999999997, "loss": 2.4789, "step": 495 
}, { "epoch": 0.7936, "grad_norm": 1.8537378311157227, "learning_rate": 0.00014639999999999998, "loss": 2.0977, "step": 496 }, { "epoch": 0.7952, "grad_norm": 1.3181400299072266, "learning_rate": 0.0001467, "loss": 2.168, "step": 497 }, { "epoch": 0.7968, "grad_norm": 9.861762046813965, "learning_rate": 0.000147, "loss": 3.4399, "step": 498 }, { "epoch": 0.7984, "grad_norm": 2.7572944164276123, "learning_rate": 0.00014729999999999998, "loss": 2.4976, "step": 499 }, { "epoch": 0.8, "grad_norm": 3.072735071182251, "learning_rate": 0.00014759999999999998, "loss": 3.0006, "step": 500 }, { "epoch": 0.8016, "grad_norm": 3.723292350769043, "learning_rate": 0.0001479, "loss": 2.4371, "step": 501 }, { "epoch": 0.8032, "grad_norm": 5.342506408691406, "learning_rate": 0.0001482, "loss": 3.0689, "step": 502 }, { "epoch": 0.8048, "grad_norm": 5.763881683349609, "learning_rate": 0.00014849999999999998, "loss": 2.8854, "step": 503 }, { "epoch": 0.8064, "grad_norm": 1.8335249423980713, "learning_rate": 0.00014879999999999998, "loss": 2.4936, "step": 504 }, { "epoch": 0.808, "grad_norm": 2.8503644466400146, "learning_rate": 0.0001491, "loss": 2.4671, "step": 505 }, { "epoch": 0.8096, "grad_norm": 5.93911600112915, "learning_rate": 0.0001494, "loss": 2.2154, "step": 506 }, { "epoch": 0.8112, "grad_norm": 4.656365871429443, "learning_rate": 0.00014969999999999998, "loss": 2.2564, "step": 507 }, { "epoch": 0.8128, "grad_norm": 4.47904109954834, "learning_rate": 0.00015, "loss": 2.4022, "step": 508 }, { "epoch": 0.8144, "grad_norm": 2.0499017238616943, "learning_rate": 0.0001503, "loss": 1.9681, "step": 509 }, { "epoch": 0.816, "grad_norm": 1.0935138463974, "learning_rate": 0.00015059999999999997, "loss": 2.0941, "step": 510 }, { "epoch": 0.8176, "grad_norm": 2.3944854736328125, "learning_rate": 0.00015089999999999998, "loss": 2.0268, "step": 511 }, { "epoch": 0.8192, "grad_norm": 6.021939277648926, "learning_rate": 0.0001512, "loss": 2.3145, "step": 512 }, { "epoch": 0.8208, 
"grad_norm": 5.291767120361328, "learning_rate": 0.0001515, "loss": 2.038, "step": 513 }, { "epoch": 0.8224, "grad_norm": 4.051759719848633, "learning_rate": 0.00015179999999999998, "loss": 1.8124, "step": 514 }, { "epoch": 0.824, "grad_norm": 6.387513637542725, "learning_rate": 0.00015209999999999998, "loss": 2.114, "step": 515 }, { "epoch": 0.8256, "grad_norm": 3.993975877761841, "learning_rate": 0.0001524, "loss": 1.9412, "step": 516 }, { "epoch": 0.8272, "grad_norm": 2.036212682723999, "learning_rate": 0.0001527, "loss": 1.8678, "step": 517 }, { "epoch": 0.8288, "grad_norm": 1.404420256614685, "learning_rate": 0.00015299999999999998, "loss": 2.1287, "step": 518 }, { "epoch": 0.8304, "grad_norm": 1.0048662424087524, "learning_rate": 0.00015329999999999999, "loss": 1.9134, "step": 519 }, { "epoch": 0.832, "grad_norm": 2.347856044769287, "learning_rate": 0.0001536, "loss": 1.8799, "step": 520 }, { "epoch": 0.8336, "grad_norm": 3.0598201751708984, "learning_rate": 0.0001539, "loss": 1.9441, "step": 521 }, { "epoch": 0.8352, "grad_norm": 2.636126756668091, "learning_rate": 0.00015419999999999998, "loss": 1.7355, "step": 522 }, { "epoch": 0.8368, "grad_norm": 1.8599352836608887, "learning_rate": 0.0001545, "loss": 1.9851, "step": 523 }, { "epoch": 0.8384, "grad_norm": 0.9748109579086304, "learning_rate": 0.0001548, "loss": 1.7774, "step": 524 }, { "epoch": 0.84, "grad_norm": 1.414323091506958, "learning_rate": 0.0001551, "loss": 2.1997, "step": 525 }, { "epoch": 0.8416, "grad_norm": 2.8852648735046387, "learning_rate": 0.00015539999999999998, "loss": 2.0179, "step": 526 }, { "epoch": 0.8432, "grad_norm": 2.0136239528656006, "learning_rate": 0.0001557, "loss": 1.9451, "step": 527 }, { "epoch": 0.8448, "grad_norm": 2.07312273979187, "learning_rate": 0.000156, "loss": 1.7522, "step": 528 }, { "epoch": 0.8464, "grad_norm": 1.4143507480621338, "learning_rate": 0.0001563, "loss": 1.6561, "step": 529 }, { "epoch": 0.848, "grad_norm": 3.017238140106201, "learning_rate": 
0.00015659999999999998, "loss": 1.913, "step": 530 }, { "epoch": 0.8496, "grad_norm": 0.9368352293968201, "learning_rate": 0.0001569, "loss": 1.8592, "step": 531 }, { "epoch": 0.8512, "grad_norm": 1.308072566986084, "learning_rate": 0.0001572, "loss": 2.2341, "step": 532 }, { "epoch": 0.8528, "grad_norm": 2.2798593044281006, "learning_rate": 0.00015749999999999998, "loss": 1.9506, "step": 533 }, { "epoch": 0.8544, "grad_norm": 2.6132118701934814, "learning_rate": 0.0001578, "loss": 1.9343, "step": 534 }, { "epoch": 0.856, "grad_norm": 1.162194848060608, "learning_rate": 0.0001581, "loss": 1.9341, "step": 535 }, { "epoch": 0.8576, "grad_norm": 1.3427730798721313, "learning_rate": 0.0001584, "loss": 1.7395, "step": 536 }, { "epoch": 0.8592, "grad_norm": 2.1670310497283936, "learning_rate": 0.00015869999999999998, "loss": 2.3282, "step": 537 }, { "epoch": 0.8608, "grad_norm": 1.257582187652588, "learning_rate": 0.000159, "loss": 2.09, "step": 538 }, { "epoch": 0.8624, "grad_norm": 1.4573386907577515, "learning_rate": 0.0001593, "loss": 1.8402, "step": 539 }, { "epoch": 0.864, "grad_norm": 1.3384615182876587, "learning_rate": 0.0001596, "loss": 1.7193, "step": 540 }, { "epoch": 0.8656, "grad_norm": 2.220402479171753, "learning_rate": 0.00015989999999999998, "loss": 1.5656, "step": 541 }, { "epoch": 0.8672, "grad_norm": 2.4653773307800293, "learning_rate": 0.0001602, "loss": 2.0628, "step": 542 }, { "epoch": 0.8688, "grad_norm": 1.280678391456604, "learning_rate": 0.0001605, "loss": 1.8363, "step": 543 }, { "epoch": 0.8704, "grad_norm": 2.4655933380126953, "learning_rate": 0.0001608, "loss": 1.7545, "step": 544 }, { "epoch": 0.872, "grad_norm": 1.506415605545044, "learning_rate": 0.00016109999999999999, "loss": 1.8381, "step": 545 }, { "epoch": 0.8736, "grad_norm": 1.1475555896759033, "learning_rate": 0.0001614, "loss": 1.829, "step": 546 }, { "epoch": 0.8752, "grad_norm": 1.4434545040130615, "learning_rate": 0.0001617, "loss": 1.8184, "step": 547 }, { "epoch": 0.8768, 
"grad_norm": 1.8260152339935303, "learning_rate": 0.000162, "loss": 1.9946, "step": 548 }, { "epoch": 0.8784, "grad_norm": 1.8104926347732544, "learning_rate": 0.0001623, "loss": 2.031, "step": 549 }, { "epoch": 0.88, "grad_norm": 2.094877243041992, "learning_rate": 0.0001626, "loss": 2.2711, "step": 550 }, { "epoch": 0.8816, "grad_norm": 23.733247756958008, "learning_rate": 0.0001629, "loss": 4.8174, "step": 551 }, { "epoch": 0.8832, "grad_norm": 12.243576049804688, "learning_rate": 0.0001632, "loss": 3.3335, "step": 552 }, { "epoch": 0.8848, "grad_norm": 3.982137441635132, "learning_rate": 0.0001635, "loss": 2.3234, "step": 553 }, { "epoch": 0.8864, "grad_norm": 2.5422203540802, "learning_rate": 0.0001638, "loss": 2.1768, "step": 554 }, { "epoch": 0.888, "grad_norm": 2.649517059326172, "learning_rate": 0.0001641, "loss": 2.022, "step": 555 }, { "epoch": 0.8896, "grad_norm": 4.723710536956787, "learning_rate": 0.0001644, "loss": 1.8511, "step": 556 }, { "epoch": 0.8912, "grad_norm": 2.3035788536071777, "learning_rate": 0.0001647, "loss": 1.9528, "step": 557 }, { "epoch": 0.8928, "grad_norm": 3.8410518169403076, "learning_rate": 0.000165, "loss": 1.9104, "step": 558 }, { "epoch": 0.8944, "grad_norm": 3.0108225345611572, "learning_rate": 0.0001653, "loss": 1.7834, "step": 559 }, { "epoch": 0.896, "grad_norm": 1.3487671613693237, "learning_rate": 0.0001656, "loss": 1.883, "step": 560 }, { "epoch": 0.8976, "grad_norm": 1.061733365058899, "learning_rate": 0.0001659, "loss": 1.5688, "step": 561 }, { "epoch": 0.8992, "grad_norm": 2.0784027576446533, "learning_rate": 0.0001662, "loss": 1.6914, "step": 562 }, { "epoch": 0.9008, "grad_norm": 6.085043907165527, "learning_rate": 0.0001665, "loss": 2.3407, "step": 563 }, { "epoch": 0.9024, "grad_norm": 1.459148645401001, "learning_rate": 0.0001668, "loss": 1.7104, "step": 564 }, { "epoch": 0.904, "grad_norm": 1.9622076749801636, "learning_rate": 0.0001671, "loss": 1.5955, "step": 565 }, { "epoch": 0.9056, "grad_norm": 
1.2756608724594116, "learning_rate": 0.0001674, "loss": 1.4071, "step": 566 }, { "epoch": 0.9072, "grad_norm": 0.940319299697876, "learning_rate": 0.0001677, "loss": 1.6557, "step": 567 }, { "epoch": 0.9088, "grad_norm": 0.9497667551040649, "learning_rate": 0.000168, "loss": 1.774, "step": 568 }, { "epoch": 0.9104, "grad_norm": 1.1930807828903198, "learning_rate": 0.0001683, "loss": 1.8378, "step": 569 }, { "epoch": 0.912, "grad_norm": 1.7330429553985596, "learning_rate": 0.0001686, "loss": 1.6816, "step": 570 }, { "epoch": 0.9136, "grad_norm": 0.9604584574699402, "learning_rate": 0.00016889999999999996, "loss": 1.6782, "step": 571 }, { "epoch": 0.9152, "grad_norm": 0.9503042101860046, "learning_rate": 0.00016919999999999997, "loss": 1.5947, "step": 572 }, { "epoch": 0.9168, "grad_norm": 1.1088024377822876, "learning_rate": 0.00016949999999999997, "loss": 1.6978, "step": 573 }, { "epoch": 0.9184, "grad_norm": 1.118318796157837, "learning_rate": 0.00016979999999999998, "loss": 1.656, "step": 574 }, { "epoch": 0.92, "grad_norm": 1.5163230895996094, "learning_rate": 0.00017009999999999996, "loss": 1.6588, "step": 575 }, { "epoch": 0.9216, "grad_norm": 1.4612356424331665, "learning_rate": 0.00017039999999999997, "loss": 1.9119, "step": 576 }, { "epoch": 0.9232, "grad_norm": 1.2807903289794922, "learning_rate": 0.00017069999999999998, "loss": 1.4299, "step": 577 }, { "epoch": 0.9248, "grad_norm": 1.049907922744751, "learning_rate": 0.00017099999999999998, "loss": 1.3226, "step": 578 }, { "epoch": 0.9264, "grad_norm": 1.0162078142166138, "learning_rate": 0.00017129999999999996, "loss": 1.8021, "step": 579 }, { "epoch": 0.928, "grad_norm": 1.3673537969589233, "learning_rate": 0.00017159999999999997, "loss": 1.6087, "step": 580 }, { "epoch": 0.9296, "grad_norm": 1.2779172658920288, "learning_rate": 0.00017189999999999998, "loss": 1.6225, "step": 581 }, { "epoch": 0.9312, "grad_norm": 1.2135735750198364, "learning_rate": 0.00017219999999999998, "loss": 1.5889, "step": 582 
}, { "epoch": 0.9328, "grad_norm": 1.45180344581604, "learning_rate": 0.00017249999999999996, "loss": 2.1697, "step": 583 }, { "epoch": 0.9344, "grad_norm": 1.1630367040634155, "learning_rate": 0.00017279999999999997, "loss": 1.79, "step": 584 }, { "epoch": 0.936, "grad_norm": 2.428530693054199, "learning_rate": 0.00017309999999999998, "loss": 1.5455, "step": 585 }, { "epoch": 0.9376, "grad_norm": 1.3975725173950195, "learning_rate": 0.00017339999999999996, "loss": 1.7658, "step": 586 }, { "epoch": 0.9392, "grad_norm": 1.242210865020752, "learning_rate": 0.00017369999999999997, "loss": 1.8039, "step": 587 }, { "epoch": 0.9408, "grad_norm": 1.071577787399292, "learning_rate": 0.00017399999999999997, "loss": 1.7215, "step": 588 }, { "epoch": 0.9424, "grad_norm": 1.208039402961731, "learning_rate": 0.00017429999999999998, "loss": 1.8733, "step": 589 }, { "epoch": 0.944, "grad_norm": 1.5233865976333618, "learning_rate": 0.00017459999999999996, "loss": 1.4408, "step": 590 }, { "epoch": 0.9456, "grad_norm": 1.411783218383789, "learning_rate": 0.00017489999999999997, "loss": 1.8393, "step": 591 }, { "epoch": 0.9472, "grad_norm": 1.629401683807373, "learning_rate": 0.00017519999999999998, "loss": 1.679, "step": 592 }, { "epoch": 0.9488, "grad_norm": 1.487720012664795, "learning_rate": 0.00017549999999999998, "loss": 1.9937, "step": 593 }, { "epoch": 0.9504, "grad_norm": 1.7428632974624634, "learning_rate": 0.00017579999999999996, "loss": 1.8585, "step": 594 }, { "epoch": 0.952, "grad_norm": 1.5290313959121704, "learning_rate": 0.00017609999999999997, "loss": 1.758, "step": 595 }, { "epoch": 0.9536, "grad_norm": 1.4210582971572876, "learning_rate": 0.00017639999999999998, "loss": 1.6403, "step": 596 }, { "epoch": 0.9552, "grad_norm": 1.487386703491211, "learning_rate": 0.00017669999999999999, "loss": 2.0706, "step": 597 }, { "epoch": 0.9568, "grad_norm": 1.789679765701294, "learning_rate": 0.00017699999999999997, "loss": 2.0324, "step": 598 }, { "epoch": 0.9584, 
"grad_norm": 3.552408456802368, "learning_rate": 0.00017729999999999997, "loss": 2.4765, "step": 599 }, { "epoch": 0.96, "grad_norm": 2.6970980167388916, "learning_rate": 0.00017759999999999998, "loss": 2.49, "step": 600 }, { "epoch": 0.9616, "grad_norm": 6.3989667892456055, "learning_rate": 0.0001779, "loss": 2.2124, "step": 601 }, { "epoch": 0.9632, "grad_norm": 3.559483528137207, "learning_rate": 0.00017819999999999997, "loss": 1.9114, "step": 602 }, { "epoch": 0.9648, "grad_norm": 2.688811779022217, "learning_rate": 0.00017849999999999997, "loss": 1.7274, "step": 603 }, { "epoch": 0.9664, "grad_norm": 1.4167048931121826, "learning_rate": 0.00017879999999999998, "loss": 1.5342, "step": 604 }, { "epoch": 0.968, "grad_norm": 1.0234233140945435, "learning_rate": 0.0001791, "loss": 1.6476, "step": 605 }, { "epoch": 0.9696, "grad_norm": 2.3607473373413086, "learning_rate": 0.00017939999999999997, "loss": 1.8034, "step": 606 }, { "epoch": 0.9712, "grad_norm": 1.8193793296813965, "learning_rate": 0.00017969999999999998, "loss": 1.2502, "step": 607 }, { "epoch": 0.9728, "grad_norm": 2.5050389766693115, "learning_rate": 0.00017999999999999998, "loss": 1.7518, "step": 608 }, { "epoch": 0.9744, "grad_norm": 1.852980375289917, "learning_rate": 0.00018029999999999996, "loss": 2.2657, "step": 609 }, { "epoch": 0.976, "grad_norm": 1.1846544742584229, "learning_rate": 0.00018059999999999997, "loss": 1.6213, "step": 610 }, { "epoch": 0.9776, "grad_norm": 1.1806446313858032, "learning_rate": 0.00018089999999999998, "loss": 1.5566, "step": 611 }, { "epoch": 0.9792, "grad_norm": 0.9722961187362671, "learning_rate": 0.00018119999999999999, "loss": 1.4004, "step": 612 }, { "epoch": 0.9808, "grad_norm": 1.2534488439559937, "learning_rate": 0.00018149999999999997, "loss": 1.9613, "step": 613 }, { "epoch": 0.9824, "grad_norm": 1.55427885055542, "learning_rate": 0.00018179999999999997, "loss": 1.3668, "step": 614 }, { "epoch": 0.984, "grad_norm": 1.8559104204177856, "learning_rate": 
0.00018209999999999998, "loss": 1.153, "step": 615 }, { "epoch": 0.9856, "grad_norm": 1.3127942085266113, "learning_rate": 0.0001824, "loss": 1.6635, "step": 616 }, { "epoch": 0.9872, "grad_norm": 1.3206202983856201, "learning_rate": 0.00018269999999999997, "loss": 1.5436, "step": 617 }, { "epoch": 0.9888, "grad_norm": 1.0405744314193726, "learning_rate": 0.00018299999999999998, "loss": 1.6072, "step": 618 }, { "epoch": 0.9904, "grad_norm": 1.1208364963531494, "learning_rate": 0.00018329999999999998, "loss": 1.3522, "step": 619 }, { "epoch": 0.992, "grad_norm": 1.4611485004425049, "learning_rate": 0.0001836, "loss": 1.8288, "step": 620 }, { "epoch": 0.9936, "grad_norm": 2.102464199066162, "learning_rate": 0.00018389999999999997, "loss": 2.2311, "step": 621 }, { "epoch": 0.9952, "grad_norm": 1.3121858835220337, "learning_rate": 0.00018419999999999998, "loss": 1.6955, "step": 622 }, { "epoch": 0.9968, "grad_norm": 1.732784390449524, "learning_rate": 0.00018449999999999999, "loss": 2.066, "step": 623 }, { "epoch": 0.9984, "grad_norm": 1.474577784538269, "learning_rate": 0.0001848, "loss": 1.7517, "step": 624 }, { "epoch": 1.0, "grad_norm": null, "learning_rate": 0.0001848, "loss": 2.8279, "step": 625 }, { "epoch": 1.0016, "grad_norm": 21.10396385192871, "learning_rate": 0.00018509999999999997, "loss": 4.359, "step": 626 }, { "epoch": 1.0032, "grad_norm": 2.5289759635925293, "learning_rate": 0.00018539999999999998, "loss": 1.5803, "step": 627 }, { "epoch": 1.0048, "grad_norm": 17.63152503967285, "learning_rate": 0.0001857, "loss": 4.1665, "step": 628 }, { "epoch": 1.0064, "grad_norm": 1.2565017938613892, "learning_rate": 0.000186, "loss": 1.5211, "step": 629 }, { "epoch": 1.008, "grad_norm": 0.9237573146820068, "learning_rate": 0.00018629999999999997, "loss": 1.6206, "step": 630 }, { "epoch": 1.0096, "grad_norm": 1.304307222366333, "learning_rate": 0.00018659999999999998, "loss": 1.7635, "step": 631 }, { "epoch": 1.0112, "grad_norm": 2.240795850753784, "learning_rate": 
0.0001869, "loss": 1.4183, "step": 632 }, { "epoch": 1.0128, "grad_norm": 1.2945712804794312, "learning_rate": 0.0001872, "loss": 1.8278, "step": 633 }, { "epoch": 1.0144, "grad_norm": 2.4284050464630127, "learning_rate": 0.00018749999999999998, "loss": 1.8362, "step": 634 }, { "epoch": 1.016, "grad_norm": 1.5324746370315552, "learning_rate": 0.00018779999999999998, "loss": 1.3312, "step": 635 }, { "epoch": 1.0176, "grad_norm": 0.9457862973213196, "learning_rate": 0.0001881, "loss": 1.5771, "step": 636 }, { "epoch": 1.0192, "grad_norm": 1.761409878730774, "learning_rate": 0.00018839999999999997, "loss": 1.3939, "step": 637 }, { "epoch": 1.0208, "grad_norm": 2.4509124755859375, "learning_rate": 0.00018869999999999998, "loss": 1.305, "step": 638 }, { "epoch": 1.0224, "grad_norm": 1.434770941734314, "learning_rate": 0.00018899999999999999, "loss": 1.2198, "step": 639 }, { "epoch": 1.024, "grad_norm": 1.683680772781372, "learning_rate": 0.0001893, "loss": 1.4401, "step": 640 }, { "epoch": 1.0256, "grad_norm": 1.468677282333374, "learning_rate": 0.00018959999999999997, "loss": 1.3005, "step": 641 }, { "epoch": 1.0272, "grad_norm": 2.2306525707244873, "learning_rate": 0.00018989999999999998, "loss": 1.6153, "step": 642 }, { "epoch": 1.0288, "grad_norm": 4.796661853790283, "learning_rate": 0.0001902, "loss": 2.2058, "step": 643 }, { "epoch": 1.0304, "grad_norm": 1.139748454093933, "learning_rate": 0.0001905, "loss": 1.3829, "step": 644 }, { "epoch": 1.032, "grad_norm": 1.9971469640731812, "learning_rate": 0.00019079999999999998, "loss": 1.5598, "step": 645 }, { "epoch": 1.0336, "grad_norm": 2.224128007888794, "learning_rate": 0.00019109999999999998, "loss": 1.3654, "step": 646 }, { "epoch": 1.0352, "grad_norm": 2.5159313678741455, "learning_rate": 0.0001914, "loss": 1.6379, "step": 647 }, { "epoch": 1.0368, "grad_norm": 1.9604592323303223, "learning_rate": 0.0001917, "loss": 1.5734, "step": 648 }, { "epoch": 1.0384, "grad_norm": 1.4151877164840698, "learning_rate": 
0.00019199999999999998, "loss": 1.1349, "step": 649 }, { "epoch": 1.04, "grad_norm": 1.21165931224823, "learning_rate": 0.00019229999999999999, "loss": 1.7592, "step": 650 }, { "epoch": 1.0416, "grad_norm": 2.344447135925293, "learning_rate": 0.0001926, "loss": 1.505, "step": 651 }, { "epoch": 1.0432, "grad_norm": 2.5432910919189453, "learning_rate": 0.0001929, "loss": 1.6673, "step": 652 }, { "epoch": 1.0448, "grad_norm": 1.5895689725875854, "learning_rate": 0.00019319999999999998, "loss": 1.6617, "step": 653 }, { "epoch": 1.0464, "grad_norm": 1.7360563278198242, "learning_rate": 0.0001935, "loss": 1.6216, "step": 654 }, { "epoch": 1.048, "grad_norm": 1.3723790645599365, "learning_rate": 0.0001938, "loss": 1.257, "step": 655 }, { "epoch": 1.0496, "grad_norm": 0.8750591278076172, "learning_rate": 0.0001941, "loss": 1.4356, "step": 656 }, { "epoch": 1.0512, "grad_norm": 1.407861590385437, "learning_rate": 0.00019439999999999998, "loss": 1.346, "step": 657 }, { "epoch": 1.0528, "grad_norm": 1.2812424898147583, "learning_rate": 0.0001947, "loss": 1.2363, "step": 658 }, { "epoch": 1.0544, "grad_norm": 1.2920845746994019, "learning_rate": 0.000195, "loss": 1.4353, "step": 659 }, { "epoch": 1.056, "grad_norm": 1.0122877359390259, "learning_rate": 0.00019529999999999998, "loss": 1.5272, "step": 660 }, { "epoch": 1.0576, "grad_norm": 1.0607578754425049, "learning_rate": 0.00019559999999999998, "loss": 1.1926, "step": 661 }, { "epoch": 1.0592, "grad_norm": 1.2849078178405762, "learning_rate": 0.0001959, "loss": 1.361, "step": 662 }, { "epoch": 1.0608, "grad_norm": 2.199488401412964, "learning_rate": 0.0001962, "loss": 1.4892, "step": 663 }, { "epoch": 1.0624, "grad_norm": 1.7300806045532227, "learning_rate": 0.00019649999999999998, "loss": 1.4795, "step": 664 }, { "epoch": 1.064, "grad_norm": 1.210700273513794, "learning_rate": 0.00019679999999999999, "loss": 1.6863, "step": 665 }, { "epoch": 1.0656, "grad_norm": 1.1998845338821411, "learning_rate": 0.0001971, "loss": 
1.3863, "step": 666 }, { "epoch": 1.0672, "grad_norm": 1.5421574115753174, "learning_rate": 0.0001974, "loss": 1.9558, "step": 667 }, { "epoch": 1.0688, "grad_norm": 2.3596279621124268, "learning_rate": 0.00019769999999999998, "loss": 1.3471, "step": 668 }, { "epoch": 1.0704, "grad_norm": 1.3288168907165527, "learning_rate": 0.000198, "loss": 1.3686, "step": 669 }, { "epoch": 1.072, "grad_norm": 1.5977771282196045, "learning_rate": 0.0001983, "loss": 1.6142, "step": 670 }, { "epoch": 1.0735999999999999, "grad_norm": 1.171886682510376, "learning_rate": 0.0001986, "loss": 1.8817, "step": 671 }, { "epoch": 1.0752, "grad_norm": 1.4820473194122314, "learning_rate": 0.00019889999999999998, "loss": 1.502, "step": 672 }, { "epoch": 1.0768, "grad_norm": 1.4286924600601196, "learning_rate": 0.0001992, "loss": 1.9869, "step": 673 }, { "epoch": 1.0784, "grad_norm": 1.496476650238037, "learning_rate": 0.0001995, "loss": 1.5545, "step": 674 }, { "epoch": 1.08, "grad_norm": 11.650896072387695, "learning_rate": 0.0001998, "loss": 3.5297, "step": 675 }, { "epoch": 1.0816, "grad_norm": 10.930564880371094, "learning_rate": 0.00020009999999999998, "loss": 3.5244, "step": 676 }, { "epoch": 1.0832, "grad_norm": 4.526219367980957, "learning_rate": 0.0002004, "loss": 1.948, "step": 677 }, { "epoch": 1.0848, "grad_norm": 1.8217471837997437, "learning_rate": 0.0002007, "loss": 1.6832, "step": 678 }, { "epoch": 1.0864, "grad_norm": 2.5544323921203613, "learning_rate": 0.000201, "loss": 2.3308, "step": 679 }, { "epoch": 1.088, "grad_norm": 2.732450246810913, "learning_rate": 0.0002013, "loss": 1.7663, "step": 680 }, { "epoch": 1.0896, "grad_norm": 4.002326488494873, "learning_rate": 0.0002016, "loss": 1.9597, "step": 681 }, { "epoch": 1.0912, "grad_norm": 2.9579389095306396, "learning_rate": 0.0002019, "loss": 1.6625, "step": 682 }, { "epoch": 1.0928, "grad_norm": 3.6762917041778564, "learning_rate": 0.0002022, "loss": 1.4949, "step": 683 }, { "epoch": 1.0944, "grad_norm": 2.8355441093444824, 
"learning_rate": 0.0002025, "loss": 1.5695, "step": 684 }, { "epoch": 1.096, "grad_norm": 2.894350290298462, "learning_rate": 0.0002028, "loss": 1.5717, "step": 685 }, { "epoch": 1.0976, "grad_norm": 4.992308616638184, "learning_rate": 0.0002031, "loss": 1.7573, "step": 686 }, { "epoch": 1.0992, "grad_norm": 1.175133466720581, "learning_rate": 0.00020339999999999998, "loss": 1.2767, "step": 687 }, { "epoch": 1.1008, "grad_norm": 0.7449688911437988, "learning_rate": 0.0002037, "loss": 1.4346, "step": 688 }, { "epoch": 1.1024, "grad_norm": 2.100440740585327, "learning_rate": 0.000204, "loss": 1.5286, "step": 689 }, { "epoch": 1.104, "grad_norm": 1.06446373462677, "learning_rate": 0.0002043, "loss": 1.4716, "step": 690 }, { "epoch": 1.1056, "grad_norm": 1.1813894510269165, "learning_rate": 0.00020459999999999999, "loss": 1.729, "step": 691 }, { "epoch": 1.1072, "grad_norm": 1.2244285345077515, "learning_rate": 0.0002049, "loss": 1.5456, "step": 692 }, { "epoch": 1.1088, "grad_norm": 1.395920991897583, "learning_rate": 0.0002052, "loss": 1.6253, "step": 693 }, { "epoch": 1.1104, "grad_norm": 0.8973720073699951, "learning_rate": 0.0002055, "loss": 1.3474, "step": 694 }, { "epoch": 1.112, "grad_norm": 0.9351361393928528, "learning_rate": 0.0002058, "loss": 1.5375, "step": 695 }, { "epoch": 1.1136, "grad_norm": 0.9488412737846375, "learning_rate": 0.0002061, "loss": 1.2332, "step": 696 }, { "epoch": 1.1152, "grad_norm": 0.800336480140686, "learning_rate": 0.00020639999999999998, "loss": 1.3265, "step": 697 }, { "epoch": 1.1168, "grad_norm": 1.771794319152832, "learning_rate": 0.00020669999999999996, "loss": 1.3347, "step": 698 }, { "epoch": 1.1184, "grad_norm": 3.4581542015075684, "learning_rate": 0.00020699999999999996, "loss": 1.783, "step": 699 }, { "epoch": 1.12, "grad_norm": 0.837477445602417, "learning_rate": 0.00020729999999999997, "loss": 1.3295, "step": 700 }, { "epoch": 1.1216, "grad_norm": 2.1295042037963867, "learning_rate": 0.00020759999999999998, "loss": 
1.4163, "step": 701 }, { "epoch": 1.1232, "grad_norm": 1.0342674255371094, "learning_rate": 0.00020789999999999996, "loss": 1.2891, "step": 702 }, { "epoch": 1.1248, "grad_norm": 1.1783955097198486, "learning_rate": 0.00020819999999999996, "loss": 1.5386, "step": 703 }, { "epoch": 1.1264, "grad_norm": 2.5988528728485107, "learning_rate": 0.00020849999999999997, "loss": 1.5942, "step": 704 }, { "epoch": 1.1280000000000001, "grad_norm": 1.206281065940857, "learning_rate": 0.00020879999999999998, "loss": 1.4828, "step": 705 }, { "epoch": 1.1296, "grad_norm": 1.60711669921875, "learning_rate": 0.00020909999999999996, "loss": 1.6748, "step": 706 }, { "epoch": 1.1312, "grad_norm": 1.3890515565872192, "learning_rate": 0.00020939999999999997, "loss": 1.4464, "step": 707 }, { "epoch": 1.1328, "grad_norm": 1.4788490533828735, "learning_rate": 0.00020969999999999997, "loss": 1.9039, "step": 708 }, { "epoch": 1.1344, "grad_norm": 0.9197102189064026, "learning_rate": 0.00020999999999999998, "loss": 1.5258, "step": 709 }, { "epoch": 1.1360000000000001, "grad_norm": 3.082664728164673, "learning_rate": 0.00021029999999999996, "loss": 1.6637, "step": 710 }, { "epoch": 1.1376, "grad_norm": 1.3979014158248901, "learning_rate": 0.00021059999999999997, "loss": 1.3499, "step": 711 }, { "epoch": 1.1392, "grad_norm": 2.7370402812957764, "learning_rate": 0.00021089999999999998, "loss": 1.7379, "step": 712 }, { "epoch": 1.1408, "grad_norm": 1.36969792842865, "learning_rate": 0.00021119999999999996, "loss": 1.3235, "step": 713 }, { "epoch": 1.1424, "grad_norm": 1.3009356260299683, "learning_rate": 0.00021149999999999996, "loss": 1.2209, "step": 714 }, { "epoch": 1.144, "grad_norm": 1.0813698768615723, "learning_rate": 0.00021179999999999997, "loss": 1.237, "step": 715 }, { "epoch": 1.1456, "grad_norm": 1.2386032342910767, "learning_rate": 0.00021209999999999998, "loss": 1.5799, "step": 716 }, { "epoch": 1.1472, "grad_norm": 1.7847639322280884, "learning_rate": 0.00021239999999999996, "loss": 
1.852, "step": 717 }, { "epoch": 1.1488, "grad_norm": 1.4111274480819702, "learning_rate": 0.00021269999999999997, "loss": 1.7199, "step": 718 }, { "epoch": 1.1504, "grad_norm": 1.6253108978271484, "learning_rate": 0.00021299999999999997, "loss": 1.6921, "step": 719 }, { "epoch": 1.152, "grad_norm": 1.3691827058792114, "learning_rate": 0.00021329999999999998, "loss": 1.5512, "step": 720 }, { "epoch": 1.1536, "grad_norm": 1.0425063371658325, "learning_rate": 0.00021359999999999996, "loss": 1.6953, "step": 721 }, { "epoch": 1.1552, "grad_norm": 1.5456832647323608, "learning_rate": 0.00021389999999999997, "loss": 1.2235, "step": 722 }, { "epoch": 1.1568, "grad_norm": 5.289543151855469, "learning_rate": 0.00021419999999999998, "loss": 2.3722, "step": 723 }, { "epoch": 1.1584, "grad_norm": 3.060047149658203, "learning_rate": 0.00021449999999999998, "loss": 1.7361, "step": 724 }, { "epoch": 1.16, "grad_norm": 2.2316486835479736, "learning_rate": 0.00021479999999999996, "loss": 2.2835, "step": 725 }, { "epoch": 1.1616, "grad_norm": 11.319620132446289, "learning_rate": 0.00021509999999999997, "loss": 2.8832, "step": 726 }, { "epoch": 1.1632, "grad_norm": 4.8169121742248535, "learning_rate": 0.00021539999999999998, "loss": 2.4756, "step": 727 }, { "epoch": 1.1648, "grad_norm": 4.998867511749268, "learning_rate": 0.00021569999999999998, "loss": 2.5684, "step": 728 }, { "epoch": 1.1663999999999999, "grad_norm": 1.6369566917419434, "learning_rate": 0.00021599999999999996, "loss": 1.9992, "step": 729 }, { "epoch": 1.168, "grad_norm": 4.867010593414307, "learning_rate": 0.00021629999999999997, "loss": 1.6888, "step": 730 }, { "epoch": 1.1696, "grad_norm": 4.665241241455078, "learning_rate": 0.00021659999999999998, "loss": 1.703, "step": 731 }, { "epoch": 1.1712, "grad_norm": 4.923267364501953, "learning_rate": 0.0002169, "loss": 1.8941, "step": 732 }, { "epoch": 1.1728, "grad_norm": 5.301763534545898, "learning_rate": 0.00021719999999999997, "loss": 1.8063, "step": 733 }, { 
"epoch": 1.1743999999999999, "grad_norm": 5.480170726776123, "learning_rate": 0.00021749999999999997, "loss": 1.6896, "step": 734 }, { "epoch": 1.176, "grad_norm": 5.136298656463623, "learning_rate": 0.00021779999999999998, "loss": 1.7705, "step": 735 }, { "epoch": 1.1776, "grad_norm": 2.6885194778442383, "learning_rate": 0.00021809999999999996, "loss": 1.29, "step": 736 }, { "epoch": 1.1792, "grad_norm": 1.7481634616851807, "learning_rate": 0.00021839999999999997, "loss": 1.8537, "step": 737 }, { "epoch": 1.1808, "grad_norm": 2.1533167362213135, "learning_rate": 0.00021869999999999998, "loss": 1.3772, "step": 738 }, { "epoch": 1.1824, "grad_norm": 1.7290595769882202, "learning_rate": 0.00021899999999999998, "loss": 1.2517, "step": 739 }, { "epoch": 1.184, "grad_norm": 5.765242576599121, "learning_rate": 0.00021929999999999996, "loss": 1.9578, "step": 740 }, { "epoch": 1.1856, "grad_norm": 5.171415328979492, "learning_rate": 0.00021959999999999997, "loss": 1.6263, "step": 741 }, { "epoch": 1.1872, "grad_norm": 2.4269332885742188, "learning_rate": 0.00021989999999999998, "loss": 1.5452, "step": 742 }, { "epoch": 1.1888, "grad_norm": 1.2465523481369019, "learning_rate": 0.00022019999999999999, "loss": 1.2282, "step": 743 }, { "epoch": 1.1904, "grad_norm": 1.9669184684753418, "learning_rate": 0.00022049999999999997, "loss": 1.1983, "step": 744 }, { "epoch": 1.192, "grad_norm": 5.07749605178833, "learning_rate": 0.00022079999999999997, "loss": 1.7768, "step": 745 }, { "epoch": 1.1936, "grad_norm": 1.4985103607177734, "learning_rate": 0.00022109999999999998, "loss": 1.504, "step": 746 }, { "epoch": 1.1952, "grad_norm": 0.8555597066879272, "learning_rate": 0.0002214, "loss": 1.2868, "step": 747 }, { "epoch": 1.1968, "grad_norm": 1.0134530067443848, "learning_rate": 0.00022169999999999997, "loss": 1.5408, "step": 748 }, { "epoch": 1.1984, "grad_norm": 2.2932121753692627, "learning_rate": 0.00022199999999999998, "loss": 1.4542, "step": 749 }, { "epoch": 1.2, "grad_norm": 
2.048572063446045, "learning_rate": 0.00022229999999999998, "loss": 1.6303, "step": 750 }, { "epoch": 1.2016, "grad_norm": 1.3398712873458862, "learning_rate": 0.0002226, "loss": 1.6202, "step": 751 }, { "epoch": 1.2032, "grad_norm": 1.2826422452926636, "learning_rate": 0.00022289999999999997, "loss": 1.114, "step": 752 }, { "epoch": 1.2048, "grad_norm": 0.9887292385101318, "learning_rate": 0.00022319999999999998, "loss": 1.2535, "step": 753 }, { "epoch": 1.2064, "grad_norm": 0.8880885243415833, "learning_rate": 0.00022349999999999998, "loss": 1.123, "step": 754 }, { "epoch": 1.208, "grad_norm": 4.056207180023193, "learning_rate": 0.0002238, "loss": 1.7848, "step": 755 }, { "epoch": 1.2096, "grad_norm": 1.0242630243301392, "learning_rate": 0.00022409999999999997, "loss": 1.6084, "step": 756 }, { "epoch": 1.2112, "grad_norm": 1.5216087102890015, "learning_rate": 0.00022439999999999998, "loss": 1.0884, "step": 757 }, { "epoch": 1.2128, "grad_norm": 1.1595895290374756, "learning_rate": 0.0002247, "loss": 1.4366, "step": 758 }, { "epoch": 1.2144, "grad_norm": 1.78994619846344, "learning_rate": 0.000225, "loss": 1.4678, "step": 759 }, { "epoch": 1.216, "grad_norm": 1.5748515129089355, "learning_rate": 0.00022529999999999997, "loss": 1.5742, "step": 760 }, { "epoch": 1.2176, "grad_norm": 1.2527673244476318, "learning_rate": 0.00022559999999999998, "loss": 1.2028, "step": 761 }, { "epoch": 1.2192, "grad_norm": 1.4951261281967163, "learning_rate": 0.0002259, "loss": 1.8887, "step": 762 }, { "epoch": 1.2208, "grad_norm": 1.1303513050079346, "learning_rate": 0.00022619999999999997, "loss": 1.6479, "step": 763 }, { "epoch": 1.2224, "grad_norm": 1.3236031532287598, "learning_rate": 0.00022649999999999998, "loss": 1.703, "step": 764 }, { "epoch": 1.224, "grad_norm": 1.5853848457336426, "learning_rate": 0.00022679999999999998, "loss": 1.8706, "step": 765 }, { "epoch": 1.2256, "grad_norm": 2.0144317150115967, "learning_rate": 0.0002271, "loss": 1.495, "step": 766 }, { "epoch": 
1.2272, "grad_norm": 1.472916841506958, "learning_rate": 0.00022739999999999997, "loss": 1.6167, "step": 767 }, { "epoch": 1.2288000000000001, "grad_norm": 1.3060656785964966, "learning_rate": 0.00022769999999999998, "loss": 1.5432, "step": 768 }, { "epoch": 1.2304, "grad_norm": 1.9118512868881226, "learning_rate": 0.00022799999999999999, "loss": 2.3, "step": 769 }, { "epoch": 1.232, "grad_norm": 1.5411431789398193, "learning_rate": 0.0002283, "loss": 1.3154, "step": 770 }, { "epoch": 1.2336, "grad_norm": 1.2540593147277832, "learning_rate": 0.00022859999999999997, "loss": 1.3445, "step": 771 }, { "epoch": 1.2352, "grad_norm": 1.74718177318573, "learning_rate": 0.00022889999999999998, "loss": 1.7902, "step": 772 }, { "epoch": 1.2368000000000001, "grad_norm": 1.5919808149337769, "learning_rate": 0.0002292, "loss": 1.7406, "step": 773 }, { "epoch": 1.2384, "grad_norm": 2.1802892684936523, "learning_rate": 0.0002295, "loss": 2.3024, "step": 774 }, { "epoch": 1.24, "grad_norm": null, "learning_rate": 0.0002295, "loss": 2.2887, "step": 775 }, { "epoch": 1.2416, "grad_norm": 5.746895790100098, "learning_rate": 0.00022979999999999997, "loss": 2.6305, "step": 776 }, { "epoch": 1.2432, "grad_norm": 5.819034099578857, "learning_rate": 0.00023009999999999998, "loss": 2.3811, "step": 777 }, { "epoch": 1.2448, "grad_norm": 2.528698444366455, "learning_rate": 0.0002304, "loss": 1.9592, "step": 778 }, { "epoch": 1.2464, "grad_norm": 4.070464611053467, "learning_rate": 0.0002307, "loss": 2.035, "step": 779 }, { "epoch": 1.248, "grad_norm": 1.93435800075531, "learning_rate": 0.00023099999999999998, "loss": 1.8043, "step": 780 }, { "epoch": 1.2496, "grad_norm": 3.285830497741699, "learning_rate": 0.00023129999999999998, "loss": 1.7722, "step": 781 }, { "epoch": 1.2511999999999999, "grad_norm": 4.356208324432373, "learning_rate": 0.0002316, "loss": 1.6131, "step": 782 }, { "epoch": 1.2528000000000001, "grad_norm": 5.4774603843688965, "learning_rate": 0.0002319, "loss": 1.5492, "step": 
783 }, { "epoch": 1.2544, "grad_norm": 3.671088218688965, "learning_rate": 0.00023219999999999998, "loss": 1.6548, "step": 784 }, { "epoch": 1.256, "grad_norm": 3.7231082916259766, "learning_rate": 0.00023249999999999999, "loss": 1.3874, "step": 785 }, { "epoch": 1.2576, "grad_norm": 1.782421588897705, "learning_rate": 0.0002328, "loss": 1.3857, "step": 786 }, { "epoch": 1.2591999999999999, "grad_norm": 1.4939918518066406, "learning_rate": 0.00023309999999999997, "loss": 1.7764, "step": 787 }, { "epoch": 1.2608, "grad_norm": 1.064145565032959, "learning_rate": 0.00023339999999999998, "loss": 1.3059, "step": 788 }, { "epoch": 1.2624, "grad_norm": 2.0802013874053955, "learning_rate": 0.0002337, "loss": 1.3497, "step": 789 }, { "epoch": 1.264, "grad_norm": 2.4550795555114746, "learning_rate": 0.000234, "loss": 1.6281, "step": 790 }, { "epoch": 1.2656, "grad_norm": 1.424136996269226, "learning_rate": 0.00023429999999999998, "loss": 1.3037, "step": 791 }, { "epoch": 1.2671999999999999, "grad_norm": 2.6355724334716797, "learning_rate": 0.00023459999999999998, "loss": 1.2327, "step": 792 }, { "epoch": 1.2688, "grad_norm": 1.9551432132720947, "learning_rate": 0.0002349, "loss": 1.1327, "step": 793 }, { "epoch": 1.2704, "grad_norm": 0.920864462852478, "learning_rate": 0.0002352, "loss": 1.329, "step": 794 }, { "epoch": 1.272, "grad_norm": 0.8361489176750183, "learning_rate": 0.00023549999999999998, "loss": 1.4342, "step": 795 }, { "epoch": 1.2736, "grad_norm": 1.4463287591934204, "learning_rate": 0.00023579999999999999, "loss": 1.2214, "step": 796 }, { "epoch": 1.2752, "grad_norm": 0.8743594884872437, "learning_rate": 0.0002361, "loss": 1.126, "step": 797 }, { "epoch": 1.2768, "grad_norm": 0.9425063133239746, "learning_rate": 0.0002364, "loss": 1.4563, "step": 798 }, { "epoch": 1.2784, "grad_norm": 1.1034338474273682, "learning_rate": 0.00023669999999999998, "loss": 1.495, "step": 799 }, { "epoch": 1.28, "grad_norm": 1.235459804534912, "learning_rate": 0.000237, "loss": 
1.2458, "step": 800 }, { "epoch": 1.2816, "grad_norm": 1.0407472848892212, "learning_rate": 0.0002373, "loss": 1.2275, "step": 801 }, { "epoch": 1.2832, "grad_norm": 1.0057398080825806, "learning_rate": 0.0002376, "loss": 1.2064, "step": 802 }, { "epoch": 1.2848, "grad_norm": 1.2582429647445679, "learning_rate": 0.00023789999999999998, "loss": 1.457, "step": 803 }, { "epoch": 1.2864, "grad_norm": 1.2544838190078735, "learning_rate": 0.0002382, "loss": 1.0123, "step": 804 }, { "epoch": 1.288, "grad_norm": 1.267555832862854, "learning_rate": 0.0002385, "loss": 1.3412, "step": 805 }, { "epoch": 1.2896, "grad_norm": 1.585595726966858, "learning_rate": 0.0002388, "loss": 1.3812, "step": 806 }, { "epoch": 1.2912, "grad_norm": 1.1115787029266357, "learning_rate": 0.00023909999999999998, "loss": 1.2368, "step": 807 }, { "epoch": 1.2928, "grad_norm": 1.1539804935455322, "learning_rate": 0.0002394, "loss": 1.0813, "step": 808 }, { "epoch": 1.2944, "grad_norm": 1.2517136335372925, "learning_rate": 0.0002397, "loss": 1.2567, "step": 809 }, { "epoch": 1.296, "grad_norm": 1.0020838975906372, "learning_rate": 0.00023999999999999998, "loss": 1.3813, "step": 810 }, { "epoch": 1.2976, "grad_norm": 1.278122067451477, "learning_rate": 0.00024029999999999999, "loss": 1.4291, "step": 811 }, { "epoch": 1.2992, "grad_norm": 1.1353975534439087, "learning_rate": 0.0002406, "loss": 1.5433, "step": 812 }, { "epoch": 1.3008, "grad_norm": 1.0918465852737427, "learning_rate": 0.0002409, "loss": 1.4993, "step": 813 }, { "epoch": 1.3024, "grad_norm": 1.1691175699234009, "learning_rate": 0.00024119999999999998, "loss": 1.442, "step": 814 }, { "epoch": 1.304, "grad_norm": 2.067641258239746, "learning_rate": 0.0002415, "loss": 1.3307, "step": 815 }, { "epoch": 1.3056, "grad_norm": 1.2151570320129395, "learning_rate": 0.0002418, "loss": 1.3187, "step": 816 }, { "epoch": 1.3072, "grad_norm": 1.3653641939163208, "learning_rate": 0.0002421, "loss": 1.1919, "step": 817 }, { "epoch": 1.3088, "grad_norm": 
1.193217158317566, "learning_rate": 0.00024239999999999998, "loss": 1.5955, "step": 818 }, { "epoch": 1.3104, "grad_norm": 1.2559990882873535, "learning_rate": 0.0002427, "loss": 1.6028, "step": 819 }, { "epoch": 1.312, "grad_norm": 1.5885244607925415, "learning_rate": 0.000243, "loss": 1.4464, "step": 820 }, { "epoch": 1.3136, "grad_norm": 1.5733736753463745, "learning_rate": 0.0002433, "loss": 1.743, "step": 821 }, { "epoch": 1.3152, "grad_norm": 2.05718731880188, "learning_rate": 0.00024359999999999999, "loss": 1.4574, "step": 822 }, { "epoch": 1.3168, "grad_norm": 1.5828289985656738, "learning_rate": 0.00024389999999999997, "loss": 1.9888, "step": 823 }, { "epoch": 1.3184, "grad_norm": 1.8411847352981567, "learning_rate": 0.00024419999999999997, "loss": 1.3748, "step": 824 }, { "epoch": 1.32, "grad_norm": 2.4953489303588867, "learning_rate": 0.0002445, "loss": 2.2596, "step": 825 }, { "epoch": 1.3216, "grad_norm": 22.735031127929688, "learning_rate": 0.0002448, "loss": 5.1826, "step": 826 }, { "epoch": 1.3232, "grad_norm": 2.6146559715270996, "learning_rate": 0.00024509999999999994, "loss": 1.7049, "step": 827 }, { "epoch": 1.3248, "grad_norm": 1.7866498231887817, "learning_rate": 0.00024539999999999995, "loss": 1.6428, "step": 828 }, { "epoch": 1.3264, "grad_norm": 3.011427640914917, "learning_rate": 0.00024569999999999995, "loss": 1.7732, "step": 829 }, { "epoch": 1.328, "grad_norm": 1.7671997547149658, "learning_rate": 0.00024599999999999996, "loss": 1.7, "step": 830 }, { "epoch": 1.3296000000000001, "grad_norm": 3.2919392585754395, "learning_rate": 0.00024629999999999997, "loss": 1.8395, "step": 831 }, { "epoch": 1.3312, "grad_norm": 3.4365289211273193, "learning_rate": 0.0002466, "loss": 1.4656, "step": 832 }, { "epoch": 1.3328, "grad_norm": 2.34206485748291, "learning_rate": 0.0002469, "loss": 1.3553, "step": 833 }, { "epoch": 1.3344, "grad_norm": 1.3108103275299072, "learning_rate": 0.0002472, "loss": 1.6943, "step": 834 }, { "epoch": 1.336, "grad_norm": 
1.1649718284606934, "learning_rate": 0.00024749999999999994, "loss": 1.6788, "step": 835 }, { "epoch": 1.3376000000000001, "grad_norm": 0.8755460977554321, "learning_rate": 0.00024779999999999995, "loss": 1.064, "step": 836 }, { "epoch": 1.3392, "grad_norm": 1.0399974584579468, "learning_rate": 0.00024809999999999996, "loss": 1.1169, "step": 837 }, { "epoch": 1.3408, "grad_norm": 1.590290904045105, "learning_rate": 0.00024839999999999997, "loss": 1.32, "step": 838 }, { "epoch": 1.3424, "grad_norm": 1.9613844156265259, "learning_rate": 0.0002487, "loss": 0.9774, "step": 839 }, { "epoch": 1.3439999999999999, "grad_norm": 1.3067349195480347, "learning_rate": 0.000249, "loss": 1.0947, "step": 840 }, { "epoch": 1.3456000000000001, "grad_norm": 3.495009422302246, "learning_rate": 0.0002493, "loss": 1.4792, "step": 841 }, { "epoch": 1.3472, "grad_norm": 2.386378526687622, "learning_rate": 0.00024959999999999994, "loss": 1.2189, "step": 842 }, { "epoch": 1.3488, "grad_norm": 1.2430686950683594, "learning_rate": 0.00024989999999999995, "loss": 1.383, "step": 843 }, { "epoch": 1.3504, "grad_norm": 1.1015182733535767, "learning_rate": 0.00025019999999999996, "loss": 1.1695, "step": 844 }, { "epoch": 1.3519999999999999, "grad_norm": 1.2849568128585815, "learning_rate": 0.00025049999999999996, "loss": 1.4384, "step": 845 }, { "epoch": 1.3536000000000001, "grad_norm": 0.9367717504501343, "learning_rate": 0.00025079999999999997, "loss": 1.2806, "step": 846 }, { "epoch": 1.3552, "grad_norm": 1.9065661430358887, "learning_rate": 0.0002511, "loss": 1.9186, "step": 847 }, { "epoch": 1.3568, "grad_norm": 1.8349933624267578, "learning_rate": 0.0002514, "loss": 1.2997, "step": 848 }, { "epoch": 1.3584, "grad_norm": 2.411646604537964, "learning_rate": 0.0002517, "loss": 1.7798, "step": 849 }, { "epoch": 1.3599999999999999, "grad_norm": 1.3963836431503296, "learning_rate": 0.00025199999999999995, "loss": 1.2455, "step": 850 }, { "epoch": 1.3616, "grad_norm": 1.7644349336624146, 
"learning_rate": 0.00025229999999999995, "loss": 1.5101, "step": 851 }, { "epoch": 1.3632, "grad_norm": 1.1302613019943237, "learning_rate": 0.00025259999999999996, "loss": 1.3869, "step": 852 }, { "epoch": 1.3648, "grad_norm": 2.062229633331299, "learning_rate": 0.00025289999999999997, "loss": 1.4488, "step": 853 }, { "epoch": 1.3664, "grad_norm": 1.800642967224121, "learning_rate": 0.0002532, "loss": 1.0541, "step": 854 }, { "epoch": 1.3679999999999999, "grad_norm": 3.4561281204223633, "learning_rate": 0.0002535, "loss": 1.4694, "step": 855 }, { "epoch": 1.3696, "grad_norm": 2.443664073944092, "learning_rate": 0.0002538, "loss": 1.5959, "step": 856 }, { "epoch": 1.3712, "grad_norm": 1.2733495235443115, "learning_rate": 0.0002541, "loss": 1.3463, "step": 857 }, { "epoch": 1.3728, "grad_norm": 1.4782954454421997, "learning_rate": 0.00025439999999999995, "loss": 1.3913, "step": 858 }, { "epoch": 1.3744, "grad_norm": 1.355779767036438, "learning_rate": 0.00025469999999999996, "loss": 1.4062, "step": 859 }, { "epoch": 1.376, "grad_norm": 0.991340696811676, "learning_rate": 0.00025499999999999996, "loss": 1.0575, "step": 860 }, { "epoch": 1.3776, "grad_norm": 1.4136706590652466, "learning_rate": 0.00025529999999999997, "loss": 1.4418, "step": 861 }, { "epoch": 1.3792, "grad_norm": 1.5610219240188599, "learning_rate": 0.0002556, "loss": 1.909, "step": 862 }, { "epoch": 1.3808, "grad_norm": 1.4637738466262817, "learning_rate": 0.0002559, "loss": 1.4926, "step": 863 }, { "epoch": 1.3824, "grad_norm": 1.174544095993042, "learning_rate": 0.0002562, "loss": 1.444, "step": 864 }, { "epoch": 1.384, "grad_norm": 1.295507550239563, "learning_rate": 0.00025649999999999995, "loss": 1.211, "step": 865 }, { "epoch": 1.3856, "grad_norm": 1.341482162475586, "learning_rate": 0.00025679999999999995, "loss": 1.5489, "step": 866 }, { "epoch": 1.3872, "grad_norm": 1.5005024671554565, "learning_rate": 0.00025709999999999996, "loss": 1.304, "step": 867 }, { "epoch": 1.3888, "grad_norm": 
1.7614165544509888, "learning_rate": 0.00025739999999999997, "loss": 1.411, "step": 868 }, { "epoch": 1.3904, "grad_norm": 2.1121573448181152, "learning_rate": 0.0002577, "loss": 1.5675, "step": 869 }, { "epoch": 1.392, "grad_norm": 1.4016284942626953, "learning_rate": 0.000258, "loss": 1.2976, "step": 870 }, { "epoch": 1.3936, "grad_norm": 1.4677424430847168, "learning_rate": 0.0002583, "loss": 1.5295, "step": 871 }, { "epoch": 1.3952, "grad_norm": 1.7327654361724854, "learning_rate": 0.0002586, "loss": 1.6782, "step": 872 }, { "epoch": 1.3968, "grad_norm": 1.684560775756836, "learning_rate": 0.00025889999999999995, "loss": 2.1258, "step": 873 }, { "epoch": 1.3984, "grad_norm": 1.1350618600845337, "learning_rate": 0.00025919999999999996, "loss": 1.2016, "step": 874 }, { "epoch": 1.4, "grad_norm": 1.8442115783691406, "learning_rate": 0.00025949999999999997, "loss": 2.2616, "step": 875 }, { "epoch": 1.4016, "grad_norm": 15.330982208251953, "learning_rate": 0.00025979999999999997, "loss": 3.7807, "step": 876 }, { "epoch": 1.4032, "grad_norm": 9.512137413024902, "learning_rate": 0.0002601, "loss": 2.7959, "step": 877 }, { "epoch": 1.4048, "grad_norm": 4.83724308013916, "learning_rate": 0.0002604, "loss": 2.2513, "step": 878 }, { "epoch": 1.4064, "grad_norm": 1.0789581537246704, "learning_rate": 0.0002607, "loss": 1.1761, "step": 879 }, { "epoch": 1.408, "grad_norm": 1.283165693283081, "learning_rate": 0.000261, "loss": 1.3131, "step": 880 }, { "epoch": 1.4096, "grad_norm": 1.3923134803771973, "learning_rate": 0.00026129999999999995, "loss": 1.4793, "step": 881 }, { "epoch": 1.4112, "grad_norm": 1.2186331748962402, "learning_rate": 0.00026159999999999996, "loss": 1.4069, "step": 882 }, { "epoch": 1.4128, "grad_norm": 1.7576051950454712, "learning_rate": 0.00026189999999999997, "loss": 1.6623, "step": 883 }, { "epoch": 1.4144, "grad_norm": 2.6093623638153076, "learning_rate": 0.0002622, "loss": 0.9214, "step": 884 }, { "epoch": 1.416, "grad_norm": 1.0863568782806396, 
"learning_rate": 0.0002625, "loss": 1.4146, "step": 885 }, { "epoch": 1.4176, "grad_norm": 2.363821506500244, "learning_rate": 0.0002628, "loss": 1.8021, "step": 886 }, { "epoch": 1.4192, "grad_norm": 1.1821964979171753, "learning_rate": 0.0002631, "loss": 1.1834, "step": 887 }, { "epoch": 1.4208, "grad_norm": 1.32361900806427, "learning_rate": 0.00026339999999999995, "loss": 1.192, "step": 888 }, { "epoch": 1.4224, "grad_norm": 1.3281641006469727, "learning_rate": 0.00026369999999999996, "loss": 1.0575, "step": 889 }, { "epoch": 1.424, "grad_norm": 1.5585789680480957, "learning_rate": 0.00026399999999999997, "loss": 1.1708, "step": 890 }, { "epoch": 1.4256, "grad_norm": 2.31046462059021, "learning_rate": 0.0002643, "loss": 1.1794, "step": 891 }, { "epoch": 1.4272, "grad_norm": 1.7033979892730713, "learning_rate": 0.0002646, "loss": 1.3221, "step": 892 }, { "epoch": 1.4288, "grad_norm": 2.653367519378662, "learning_rate": 0.0002649, "loss": 1.4937, "step": 893 }, { "epoch": 1.4304000000000001, "grad_norm": 0.9184427261352539, "learning_rate": 0.0002652, "loss": 1.078, "step": 894 }, { "epoch": 1.432, "grad_norm": 0.9819865226745605, "learning_rate": 0.0002655, "loss": 1.186, "step": 895 }, { "epoch": 1.4336, "grad_norm": 1.0578396320343018, "learning_rate": 0.00026579999999999996, "loss": 1.0641, "step": 896 }, { "epoch": 1.4352, "grad_norm": 1.4637776613235474, "learning_rate": 0.00026609999999999996, "loss": 1.373, "step": 897 }, { "epoch": 1.4368, "grad_norm": 0.9520303606987, "learning_rate": 0.00026639999999999997, "loss": 1.1442, "step": 898 }, { "epoch": 1.4384000000000001, "grad_norm": 1.6817363500595093, "learning_rate": 0.0002667, "loss": 1.4782, "step": 899 }, { "epoch": 1.44, "grad_norm": 2.1572883129119873, "learning_rate": 0.000267, "loss": 1.8022, "step": 900 }, { "epoch": 1.4416, "grad_norm": 0.8815500736236572, "learning_rate": 0.0002673, "loss": 1.4329, "step": 901 }, { "epoch": 1.4432, "grad_norm": 1.2165837287902832, "learning_rate": 0.0002676, 
"loss": 1.1709, "step": 902 }, { "epoch": 1.4447999999999999, "grad_norm": 2.121063470840454, "learning_rate": 0.0002679, "loss": 1.3277, "step": 903 }, { "epoch": 1.4464000000000001, "grad_norm": 1.4610421657562256, "learning_rate": 0.00026819999999999996, "loss": 1.5098, "step": 904 }, { "epoch": 1.448, "grad_norm": 2.9947142601013184, "learning_rate": 0.00026849999999999997, "loss": 1.3525, "step": 905 }, { "epoch": 1.4496, "grad_norm": 1.1811401844024658, "learning_rate": 0.0002688, "loss": 1.1726, "step": 906 }, { "epoch": 1.4512, "grad_norm": 1.4365415573120117, "learning_rate": 0.0002691, "loss": 1.2433, "step": 907 }, { "epoch": 1.4527999999999999, "grad_norm": 1.34075927734375, "learning_rate": 0.0002694, "loss": 1.0205, "step": 908 }, { "epoch": 1.4544000000000001, "grad_norm": 2.7684597969055176, "learning_rate": 0.0002697, "loss": 1.0883, "step": 909 }, { "epoch": 1.456, "grad_norm": 1.557430624961853, "learning_rate": 0.00027, "loss": 1.6006, "step": 910 }, { "epoch": 1.4576, "grad_norm": 1.616085410118103, "learning_rate": 0.00027029999999999996, "loss": 1.4207, "step": 911 }, { "epoch": 1.4592, "grad_norm": 5.76104211807251, "learning_rate": 0.00027059999999999996, "loss": 1.6985, "step": 912 }, { "epoch": 1.4607999999999999, "grad_norm": 1.2783349752426147, "learning_rate": 0.00027089999999999997, "loss": 1.1576, "step": 913 }, { "epoch": 1.4624, "grad_norm": 1.2653543949127197, "learning_rate": 0.0002712, "loss": 1.4433, "step": 914 }, { "epoch": 1.464, "grad_norm": 1.2063896656036377, "learning_rate": 0.0002715, "loss": 1.5359, "step": 915 }, { "epoch": 1.4656, "grad_norm": 2.794680118560791, "learning_rate": 0.0002718, "loss": 1.5251, "step": 916 }, { "epoch": 1.4672, "grad_norm": 3.2242326736450195, "learning_rate": 0.0002721, "loss": 1.9892, "step": 917 }, { "epoch": 1.4687999999999999, "grad_norm": 1.8846021890640259, "learning_rate": 0.0002724, "loss": 1.8354, "step": 918 }, { "epoch": 1.4704, "grad_norm": 2.0368640422821045, "learning_rate": 
0.00027269999999999996, "loss": 1.5269, "step": 919 }, { "epoch": 1.472, "grad_norm": 1.5392261743545532, "learning_rate": 0.00027299999999999997, "loss": 1.6935, "step": 920 }, { "epoch": 1.4736, "grad_norm": 1.5438854694366455, "learning_rate": 0.0002733, "loss": 1.7796, "step": 921 }, { "epoch": 1.4752, "grad_norm": 1.796651005744934, "learning_rate": 0.0002736, "loss": 1.3982, "step": 922 }, { "epoch": 1.4768, "grad_norm": 2.2069437503814697, "learning_rate": 0.0002739, "loss": 1.7769, "step": 923 }, { "epoch": 1.4784, "grad_norm": 1.9683245420455933, "learning_rate": 0.0002742, "loss": 1.4909, "step": 924 }, { "epoch": 1.48, "grad_norm": 1.911014199256897, "learning_rate": 0.0002745, "loss": 2.0897, "step": 925 }, { "epoch": 1.4816, "grad_norm": 5.754384994506836, "learning_rate": 0.0002748, "loss": 2.1389, "step": 926 }, { "epoch": 1.4832, "grad_norm": 1.59812331199646, "learning_rate": 0.00027509999999999996, "loss": 1.5023, "step": 927 }, { "epoch": 1.4848, "grad_norm": 3.8509624004364014, "learning_rate": 0.00027539999999999997, "loss": 2.156, "step": 928 }, { "epoch": 1.4864, "grad_norm": 1.6457704305648804, "learning_rate": 0.0002757, "loss": 1.6978, "step": 929 }, { "epoch": 1.488, "grad_norm": 9.261984825134277, "learning_rate": 0.000276, "loss": 3.222, "step": 930 }, { "epoch": 1.4896, "grad_norm": 10.191606521606445, "learning_rate": 0.0002763, "loss": 2.7299, "step": 931 }, { "epoch": 1.4912, "grad_norm": 2.034604072570801, "learning_rate": 0.0002766, "loss": 1.363, "step": 932 }, { "epoch": 1.4928, "grad_norm": 2.7943766117095947, "learning_rate": 0.0002769, "loss": 1.4561, "step": 933 }, { "epoch": 1.4944, "grad_norm": 2.739060640335083, "learning_rate": 0.0002772, "loss": 1.5289, "step": 934 }, { "epoch": 1.496, "grad_norm": 2.6572391986846924, "learning_rate": 0.00027749999999999997, "loss": 1.3459, "step": 935 }, { "epoch": 1.4976, "grad_norm": 2.4692184925079346, "learning_rate": 0.0002778, "loss": 1.409, "step": 936 }, { "epoch": 1.4992, 
"grad_norm": 1.569419264793396, "learning_rate": 0.0002781, "loss": 1.1897, "step": 937 }, { "epoch": 1.5008, "grad_norm": 0.9803001880645752, "learning_rate": 0.0002784, "loss": 1.1971, "step": 938 }, { "epoch": 1.5024, "grad_norm": 1.3759132623672485, "learning_rate": 0.0002787, "loss": 1.1573, "step": 939 }, { "epoch": 1.504, "grad_norm": 1.4470410346984863, "learning_rate": 0.000279, "loss": 1.2441, "step": 940 }, { "epoch": 1.5056, "grad_norm": 1.9103741645812988, "learning_rate": 0.0002793, "loss": 1.2325, "step": 941 }, { "epoch": 1.5072, "grad_norm": 1.5558336973190308, "learning_rate": 0.00027959999999999997, "loss": 1.0402, "step": 942 }, { "epoch": 1.5088, "grad_norm": 2.0115926265716553, "learning_rate": 0.0002799, "loss": 1.2751, "step": 943 }, { "epoch": 1.5104, "grad_norm": 1.6013593673706055, "learning_rate": 0.0002802, "loss": 1.6149, "step": 944 }, { "epoch": 1.512, "grad_norm": 1.3492580652236938, "learning_rate": 0.0002805, "loss": 1.2308, "step": 945 }, { "epoch": 1.5135999999999998, "grad_norm": 1.3978670835494995, "learning_rate": 0.0002808, "loss": 1.1674, "step": 946 }, { "epoch": 1.5152, "grad_norm": 1.257152795791626, "learning_rate": 0.0002811, "loss": 1.2999, "step": 947 }, { "epoch": 1.5168, "grad_norm": 1.3785860538482666, "learning_rate": 0.00028139999999999996, "loss": 1.2184, "step": 948 }, { "epoch": 1.5184, "grad_norm": 2.098989963531494, "learning_rate": 0.00028169999999999996, "loss": 1.4197, "step": 949 }, { "epoch": 1.52, "grad_norm": 1.411068320274353, "learning_rate": 0.00028199999999999997, "loss": 1.3696, "step": 950 }, { "epoch": 1.5215999999999998, "grad_norm": 1.1803005933761597, "learning_rate": 0.0002823, "loss": 1.4752, "step": 951 }, { "epoch": 1.5232, "grad_norm": 1.4621422290802002, "learning_rate": 0.0002826, "loss": 1.2319, "step": 952 }, { "epoch": 1.5248, "grad_norm": 2.065951108932495, "learning_rate": 0.00028289999999999994, "loss": 1.3185, "step": 953 }, { "epoch": 1.5264, "grad_norm": 2.077345371246338, 
"learning_rate": 0.00028319999999999994, "loss": 1.3232, "step": 954 }, { "epoch": 1.528, "grad_norm": 2.0758562088012695, "learning_rate": 0.00028349999999999995, "loss": 1.1082, "step": 955 }, { "epoch": 1.5295999999999998, "grad_norm": 2.724622964859009, "learning_rate": 0.00028379999999999996, "loss": 1.3383, "step": 956 }, { "epoch": 1.5312000000000001, "grad_norm": 1.9979689121246338, "learning_rate": 0.00028409999999999997, "loss": 1.514, "step": 957 }, { "epoch": 1.5328, "grad_norm": 1.9366734027862549, "learning_rate": 0.0002844, "loss": 1.2723, "step": 958 }, { "epoch": 1.5344, "grad_norm": 2.198087215423584, "learning_rate": 0.0002847, "loss": 1.4332, "step": 959 }, { "epoch": 1.536, "grad_norm": 1.5621610879898071, "learning_rate": 0.000285, "loss": 1.3232, "step": 960 }, { "epoch": 1.5375999999999999, "grad_norm": 1.4429649114608765, "learning_rate": 0.00028529999999999994, "loss": 1.292, "step": 961 }, { "epoch": 1.5392000000000001, "grad_norm": 1.7527788877487183, "learning_rate": 0.00028559999999999995, "loss": 1.3802, "step": 962 }, { "epoch": 1.5408, "grad_norm": 2.562932252883911, "learning_rate": 0.00028589999999999996, "loss": 1.5058, "step": 963 }, { "epoch": 1.5424, "grad_norm": 2.0278782844543457, "learning_rate": 0.00028619999999999996, "loss": 1.4053, "step": 964 }, { "epoch": 1.544, "grad_norm": 2.133039712905884, "learning_rate": 0.00028649999999999997, "loss": 1.5985, "step": 965 }, { "epoch": 1.5455999999999999, "grad_norm": 1.7495462894439697, "learning_rate": 0.0002868, "loss": 1.7425, "step": 966 }, { "epoch": 1.5472000000000001, "grad_norm": 1.314456582069397, "learning_rate": 0.0002871, "loss": 1.1744, "step": 967 }, { "epoch": 1.5488, "grad_norm": 1.5634371042251587, "learning_rate": 0.00028739999999999994, "loss": 1.199, "step": 968 }, { "epoch": 1.5504, "grad_norm": 1.5051501989364624, "learning_rate": 0.00028769999999999995, "loss": 1.3761, "step": 969 }, { "epoch": 1.552, "grad_norm": 1.4913947582244873, "learning_rate": 
0.00028799999999999995, "loss": 1.7574, "step": 970 }, { "epoch": 1.5535999999999999, "grad_norm": 2.0032637119293213, "learning_rate": 0.00028829999999999996, "loss": 1.6131, "step": 971 }, { "epoch": 1.5552000000000001, "grad_norm": 1.4599378108978271, "learning_rate": 0.00028859999999999997, "loss": 1.5788, "step": 972 }, { "epoch": 1.5568, "grad_norm": 1.526383638381958, "learning_rate": 0.0002889, "loss": 1.4599, "step": 973 }, { "epoch": 1.5584, "grad_norm": 3.349010705947876, "learning_rate": 0.0002892, "loss": 1.7604, "step": 974 }, { "epoch": 1.56, "grad_norm": 3.1439058780670166, "learning_rate": 0.0002895, "loss": 2.0222, "step": 975 }, { "epoch": 1.5615999999999999, "grad_norm": 5.448111057281494, "learning_rate": 0.00028979999999999994, "loss": 2.2708, "step": 976 }, { "epoch": 1.5632000000000001, "grad_norm": 3.330211877822876, "learning_rate": 0.00029009999999999995, "loss": 2.3857, "step": 977 }, { "epoch": 1.5648, "grad_norm": 1.844307541847229, "learning_rate": 0.00029039999999999996, "loss": 1.8033, "step": 978 }, { "epoch": 1.5664, "grad_norm": 2.17771053314209, "learning_rate": 0.00029069999999999996, "loss": 1.8416, "step": 979 }, { "epoch": 1.568, "grad_norm": 1.889838695526123, "learning_rate": 0.00029099999999999997, "loss": 1.4765, "step": 980 }, { "epoch": 1.5695999999999999, "grad_norm": 1.280713677406311, "learning_rate": 0.0002913, "loss": 1.5771, "step": 981 }, { "epoch": 1.5712000000000002, "grad_norm": 1.2217782735824585, "learning_rate": 0.0002916, "loss": 1.4645, "step": 982 }, { "epoch": 1.5728, "grad_norm": 1.493486762046814, "learning_rate": 0.0002919, "loss": 1.3598, "step": 983 }, { "epoch": 1.5744, "grad_norm": 1.8840752840042114, "learning_rate": 0.00029219999999999995, "loss": 1.5287, "step": 984 }, { "epoch": 1.576, "grad_norm": 1.965975046157837, "learning_rate": 0.00029249999999999995, "loss": 1.2946, "step": 985 }, { "epoch": 1.5776, "grad_norm": 1.5697219371795654, "learning_rate": 0.00029279999999999996, "loss": 
1.0137, "step": 986 }, { "epoch": 1.5792000000000002, "grad_norm": 1.665776014328003, "learning_rate": 0.00029309999999999997, "loss": 1.1297, "step": 987 }, { "epoch": 1.5808, "grad_norm": 1.9357331991195679, "learning_rate": 0.0002934, "loss": 1.1271, "step": 988 }, { "epoch": 1.5824, "grad_norm": 0.922601044178009, "learning_rate": 0.0002937, "loss": 1.3413, "step": 989 }, { "epoch": 1.584, "grad_norm": 1.0412627458572388, "learning_rate": 0.000294, "loss": 1.1785, "step": 990 }, { "epoch": 1.5856, "grad_norm": 1.292492151260376, "learning_rate": 0.00029429999999999994, "loss": 1.4531, "step": 991 }, { "epoch": 1.5872000000000002, "grad_norm": 1.4930530786514282, "learning_rate": 0.00029459999999999995, "loss": 1.4747, "step": 992 }, { "epoch": 1.5888, "grad_norm": 1.398553729057312, "learning_rate": 0.00029489999999999996, "loss": 1.2275, "step": 993 }, { "epoch": 1.5904, "grad_norm": 1.1149609088897705, "learning_rate": 0.00029519999999999997, "loss": 1.3404, "step": 994 }, { "epoch": 1.592, "grad_norm": 2.243300676345825, "learning_rate": 0.00029549999999999997, "loss": 1.4117, "step": 995 }, { "epoch": 1.5936, "grad_norm": 0.9678653478622437, "learning_rate": 0.0002958, "loss": 1.2791, "step": 996 }, { "epoch": 1.5952, "grad_norm": 0.9126372337341309, "learning_rate": 0.0002961, "loss": 1.54, "step": 997 }, { "epoch": 1.5968, "grad_norm": 1.447344422340393, "learning_rate": 0.0002964, "loss": 1.2979, "step": 998 }, { "epoch": 1.5984, "grad_norm": 2.5969924926757812, "learning_rate": 0.00029669999999999995, "loss": 1.6844, "step": 999 }, { "epoch": 1.6, "grad_norm": 1.3566679954528809, "learning_rate": 0.00029699999999999996, "loss": 1.2633, "step": 1000 }, { "epoch": 1.6, "eval_cer": 0.342703815793579, "eval_loss": 1.7065902948379517, "eval_runtime": 159.4221, "eval_samples_per_second": 19.671, "eval_steps_per_second": 1.229, "eval_wer": 0.5115005185204882, "step": 1000 }, { "epoch": 1.6, "step": 1000, "total_flos": 6.203691115248614e+18, "train_loss": 
4.768421524226666, "train_runtime": 2101.2175, "train_samples_per_second": 15.229, "train_steps_per_second": 0.476 } ], "logging_steps": 1.0, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.203691115248614e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }