{
  "best_metric": 0.5115005185204882,
  "best_model_checkpoint": "/m/triton/scratch/elec/puhe/p/palp3/MUCS/indicwav2vec_outputs/pd_warmup_2000/s300_shuff500/checkpoint-1000",
  "epoch": 1.6,
  "eval_steps": 1000,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0016, "grad_norm": NaN, "learning_rate": 0.0, "loss": 76.924, "step": 1},
    {"epoch": 0.0032, "grad_norm": Infinity, "learning_rate": 0.0, "loss": 40.9666, "step": 2},
    {"epoch": 0.0048, "grad_norm": 22.267667770385742, "learning_rate": 3e-07, "loss": 40.3514, "step": 3},
    {"epoch": 0.0064, "grad_norm": 11.958697319030762, "learning_rate": 6e-07, "loss": 28.7886, "step": 4},
    {"epoch": 0.008, "grad_norm": 14.436713218688965, "learning_rate": 9e-07, "loss": 33.1337, "step": 5},
    {"epoch": 0.0096, "grad_norm": 15.921396255493164, "learning_rate": 1.2e-06, "loss": 29.2715, "step": 6},
    {"epoch": 0.0112, "grad_norm": 11.616898536682129, "learning_rate": 1.4999999999999998e-06, "loss": 28.6694, "step": 7},
    {"epoch": 0.0128, "grad_norm": 12.15279483795166, "learning_rate": 1.8e-06, "loss": 26.7664, "step": 8},
    {"epoch": 0.0144, "grad_norm": 15.99345874786377, "learning_rate": 2.1e-06, "loss": 27.2963, "step": 9},
    {"epoch": 0.016, "grad_norm": 10.025712966918945, "learning_rate": 2.4e-06, "loss": 22.7932, "step": 10},
    {"epoch": 0.0176, "grad_norm": 9.379335403442383, "learning_rate": 2.6999999999999996e-06, "loss": 20.7226, "step": 11},
    {"epoch": 0.0192, "grad_norm": 12.390824317932129, "learning_rate": 2.9999999999999997e-06, "loss": 27.5995, "step": 12},
    {"epoch": 0.0208, "grad_norm": 10.201970100402832, "learning_rate": 3.2999999999999993e-06, "loss": 23.3013, "step": 13},
    {"epoch": 0.0224, "grad_norm": 11.483911514282227, "learning_rate": 3.6e-06, "loss": 24.6987, "step": 14},
    {"epoch": 0.024, "grad_norm": 9.661028861999512, "learning_rate": 3.899999999999999e-06, "loss": 22.0543, "step": 15},
    {"epoch": 0.0256, "grad_norm": 9.904827117919922, "learning_rate": 4.2e-06, "loss": 20.3867, "step": 16},
    {"epoch": 0.0272, "grad_norm": 10.597962379455566, "learning_rate": 4.499999999999999e-06, "loss": 24.6232, "step": 17},
    {"epoch": 0.0288, "grad_norm": 12.7444486618042, "learning_rate": 4.8e-06, "loss": 25.3891, "step": 18},
    {"epoch": 0.0304, "grad_norm": 10.090996742248535, "learning_rate": 5.1e-06, "loss": 21.5661, "step": 19},
    {"epoch": 0.032, "grad_norm": 10.649155616760254, "learning_rate": 5.399999999999999e-06, "loss": 23.0623, "step": 20},
    {"epoch": 0.0336, "grad_norm": 10.286359786987305, "learning_rate": 5.7e-06, "loss": 20.7193, "step": 21},
    {"epoch": 0.0352, "grad_norm": 10.968955993652344, "learning_rate": 5.999999999999999e-06, "loss": 24.0174, "step": 22},
    {"epoch": 0.0368, "grad_norm": 9.74566650390625, "learning_rate": 6.3e-06, "loss": 21.454, "step": 23},
    {"epoch": 0.0384, "grad_norm": 11.162517547607422, "learning_rate": 6.599999999999999e-06, "loss": 23.5266, "step": 24},
    {"epoch": 0.04, "grad_norm": 9.547463417053223, "learning_rate": 6.899999999999999e-06, "loss": 20.214, "step": 25},
    {"epoch": 0.0416, "grad_norm": 10.754602432250977, "learning_rate": 7.2e-06, "loss": 22.6765, "step": 26},
    {"epoch": 0.0432, "grad_norm": 9.74982738494873, "learning_rate": 7.499999999999999e-06, "loss": 20.4099, "step": 27},
    {"epoch": 0.0448, "grad_norm": 10.397897720336914, "learning_rate": 7.799999999999998e-06, "loss": 21.5807, "step": 28},
    {"epoch": 0.0464, "grad_norm": 10.956497192382812, "learning_rate": 8.099999999999999e-06, "loss": 23.4123, "step": 29},
    {"epoch": 0.048, "grad_norm": 10.610095977783203, "learning_rate": 8.4e-06, "loss": 21.6038, "step": 30},
    {"epoch": 0.0496, "grad_norm": 10.559882164001465, "learning_rate": 8.7e-06, "loss": 21.7059, "step": 31},
    {"epoch": 0.0512, "grad_norm": 10.241806983947754, "learning_rate": 8.999999999999999e-06, "loss": 21.5684, "step": 32},
    {"epoch": 0.0528, "grad_norm": 9.802021980285645, "learning_rate": 9.299999999999999e-06, "loss": 19.0525, "step": 33},
    {"epoch": 0.0544, "grad_norm": 10.859997749328613, "learning_rate": 9.6e-06, "loss": 20.7641, "step": 34},
    {"epoch": 0.056, "grad_norm": 11.377524375915527, "learning_rate": 9.9e-06, "loss": 21.9332, "step": 35},
    {"epoch": 0.0576, "grad_norm": 10.088302612304688, "learning_rate": 1.02e-05, "loss": 19.4306, "step": 36},
    {"epoch": 0.0592, "grad_norm": 12.069904327392578, "learning_rate": 1.05e-05, "loss": 23.6146, "step": 37},
    {"epoch": 0.0608, "grad_norm": 12.05452823638916, "learning_rate": 1.0799999999999998e-05, "loss": 22.8087, "step": 38},
    {"epoch": 0.0624, "grad_norm": 12.891792297363281, "learning_rate": 1.1099999999999999e-05, "loss": 21.6699, "step": 39},
    {"epoch": 0.064, "grad_norm": 10.262922286987305, "learning_rate": 1.14e-05, "loss": 19.0892, "step": 40},
    {"epoch": 0.0656, "grad_norm": 11.921724319458008, "learning_rate": 1.17e-05, "loss": 21.5905, "step": 41},
    {"epoch": 0.0672, "grad_norm": 11.676680564880371, "learning_rate": 1.1999999999999999e-05, "loss": 20.87, "step": 42},
    {"epoch": 0.0688, "grad_norm": 10.26872730255127, "learning_rate": 1.2299999999999999e-05, "loss": 18.0263, "step": 43},
    {"epoch": 0.0704, "grad_norm": 10.563227653503418, "learning_rate": 1.26e-05, "loss": 18.7622, "step": 44},
    {"epoch": 0.072, "grad_norm": 14.507094383239746, "learning_rate": 1.2899999999999998e-05, "loss": 24.0105, "step": 45},
    {"epoch": 0.0736, "grad_norm": 11.43386173248291, "learning_rate": 1.3199999999999997e-05, "loss": 19.3601, "step": 46},
    {"epoch": 0.0752, "grad_norm": 11.63315200805664, "learning_rate": 1.3499999999999998e-05, "loss": 19.0576, "step": 47},
    {"epoch": 0.0768, "grad_norm": 12.388842582702637, "learning_rate": 1.3799999999999998e-05, "loss": 19.2927, "step": 48},
    {"epoch": 0.0784, "grad_norm": 12.819602966308594, "learning_rate": 1.4099999999999999e-05, "loss": 19.936, "step": 49},
    {"epoch": 0.08, "grad_norm": NaN, "learning_rate": 1.4099999999999999e-05, "loss": 17.9793, "step": 50},
    {"epoch": 0.0816, "grad_norm": 65.82632446289062, "learning_rate": 1.44e-05, "loss": 64.1187, "step": 51},
    {"epoch": 0.0832, "grad_norm": 26.94750213623047, "learning_rate": 1.47e-05, "loss": 33.92, "step": 52},
    {"epoch": 0.0848, "grad_norm": 24.70115852355957, "learning_rate": 1.4999999999999999e-05, "loss": 34.5358, "step": 53},
    {"epoch": 0.0864, "grad_norm": 18.708255767822266, "learning_rate": 1.53e-05, "loss": 25.5629, "step": 54},
    {"epoch": 0.088, "grad_norm": 21.256839752197266, "learning_rate": 1.5599999999999996e-05, "loss": 27.8492, "step": 55},
    {"epoch": 0.0896, "grad_norm": 17.251680374145508, "learning_rate": 1.5899999999999997e-05, "loss": 22.1176, "step": 56},
    {"epoch": 0.0912, "grad_norm": 31.902353286743164, "learning_rate": 1.6199999999999997e-05, "loss": 26.8606, "step": 57},
    {"epoch": 0.0928, "grad_norm": 20.44807243347168, "learning_rate": 1.6499999999999998e-05, "loss": 23.1075, "step": 58},
    {"epoch": 0.0944, "grad_norm": 17.042905807495117, "learning_rate": 1.68e-05, "loss": 21.0526, "step": 59},
    {"epoch": 0.096, "grad_norm": 18.13207244873047, "learning_rate": 1.71e-05, "loss": 20.8373, "step": 60},
    {"epoch": 0.0976, "grad_norm": 17.839736938476562, "learning_rate": 1.74e-05, "loss": 21.2095, "step": 61},
    {"epoch": 0.0992, "grad_norm": 18.765409469604492, "learning_rate": 1.7699999999999997e-05, "loss": 21.5782, "step": 62},
    {"epoch": 0.1008, "grad_norm": 18.407758712768555, "learning_rate": 1.7999999999999997e-05, "loss": 21.009, "step": 63},
    {"epoch": 0.1024, "grad_norm": 38.16777038574219, "learning_rate": 1.8299999999999998e-05, "loss": 21.9343, "step": 64},
    {"epoch": 0.104, "grad_norm": 21.490079879760742, "learning_rate": 1.8599999999999998e-05, "loss": 22.2912, "step": 65},
    {"epoch": 0.1056, "grad_norm": 22.932668685913086, "learning_rate": 1.89e-05, "loss": 20.3447, "step": 66},
    {"epoch": 0.1072, "grad_norm": 26.10978126525879, "learning_rate": 1.92e-05, "loss": 23.8814, "step": 67},
    {"epoch": 0.1088, "grad_norm": 18.075897216796875, "learning_rate": 1.95e-05, "loss": 18.7612, "step": 68},
    {"epoch": 0.1104, "grad_norm": 18.735963821411133, "learning_rate": 1.98e-05, "loss": 18.8199, "step": 69},
    {"epoch": 0.112, "grad_norm": 22.017709732055664, "learning_rate": 2.01e-05, "loss": 20.6774, "step": 70},
    {"epoch": 0.1136, "grad_norm": 38.08246994018555, "learning_rate": 2.04e-05, "loss": 30.1672, "step": 71},
    {"epoch": 0.1152, "grad_norm": 22.627145767211914, "learning_rate": 2.07e-05, "loss": 20.0791, "step": 72},
    {"epoch": 0.1168, "grad_norm": 30.097496032714844, "learning_rate": 2.1e-05, "loss": 23.1779, "step": 73},
    {"epoch": 0.1184, "grad_norm": 22.679004669189453, "learning_rate": 2.1299999999999996e-05, "loss": 19.4401, "step": 74},
    {"epoch": 0.12, "grad_norm": 24.041168212890625, "learning_rate": 2.1599999999999996e-05, "loss": 19.3537, "step": 75},
    {"epoch": 0.1216, "grad_norm": 26.076839447021484, "learning_rate": 2.1899999999999997e-05, "loss": 20.2714, "step": 76},
    {"epoch": 0.1232, "grad_norm": 25.533342361450195, "learning_rate": 2.2199999999999998e-05, "loss": 19.6379, "step": 77},
    {"epoch": 0.1248, "grad_norm": 23.547039031982422, "learning_rate": 2.2499999999999998e-05, "loss": 18.0205, "step": 78},
    {"epoch": 0.1264, "grad_norm": 23.1020565032959, "learning_rate": 2.28e-05, "loss": 17.6504, "step": 79},
    {"epoch": 0.128, "grad_norm": 28.12115478515625, "learning_rate": 2.31e-05, "loss": 19.9713, "step": 80},
    {"epoch": 0.1296, "grad_norm": 24.276756286621094, "learning_rate": 2.34e-05, "loss": 17.293, "step": 81},
    {"epoch": 0.1312, "grad_norm": 21.40571403503418, "learning_rate": 2.3699999999999997e-05, "loss": 15.7224, "step": 82},
    {"epoch": 0.1328, "grad_norm": 24.049413681030273, "learning_rate": 2.3999999999999997e-05, "loss": 16.4272, "step": 83},
    {"epoch": 0.1344, "grad_norm": 27.002574920654297, "learning_rate": 2.4299999999999998e-05, "loss": 17.5712, "step": 84},
    {"epoch": 0.136, "grad_norm": 31.374860763549805, "learning_rate": 2.4599999999999998e-05, "loss": 19.1111, "step": 85},
    {"epoch": 0.1376, "grad_norm": 28.4317684173584, "learning_rate": 2.49e-05, "loss": 17.1805, "step": 86},
    {"epoch": 0.1392, "grad_norm": 32.47872543334961, "learning_rate": 2.52e-05, "loss": 18.6232, "step": 87},
    {"epoch": 0.1408, "grad_norm": 28.26717185974121, "learning_rate": 2.55e-05, "loss": 16.3721, "step": 88},
    {"epoch": 0.1424, "grad_norm": 29.546110153198242, "learning_rate": 2.5799999999999997e-05, "loss": 16.6354, "step": 89},
    {"epoch": 0.144, "grad_norm": 32.13431930541992, "learning_rate": 2.6099999999999997e-05, "loss": 16.9808, "step": 90},
    {"epoch": 0.1456, "grad_norm": 36.368682861328125, "learning_rate": 2.6399999999999995e-05, "loss": 17.5935, "step": 91},
    {"epoch": 0.1472, "grad_norm": 28.789241790771484, "learning_rate": 2.6699999999999995e-05, "loss": 14.6106, "step": 92},
    {"epoch": 0.1488, "grad_norm": 30.914873123168945, "learning_rate": 2.6999999999999996e-05, "loss": 14.5306, "step": 93},
    {"epoch": 0.1504, "grad_norm": 31.848777770996094, "learning_rate": 2.7299999999999996e-05, "loss": 14.7256, "step": 94},
    {"epoch": 0.152, "grad_norm": 43.699851989746094, "learning_rate": 2.7599999999999997e-05, "loss": 16.3519, "step": 95},
    {"epoch": 0.1536, "grad_norm": 39.825836181640625, "learning_rate": 2.7899999999999997e-05, "loss": 15.9264, "step": 96},
    {"epoch": 0.1552, "grad_norm": 32.04133224487305, "learning_rate": 2.8199999999999998e-05, "loss": 13.4326, "step": 97},
    {"epoch": 0.1568, "grad_norm": 39.8133659362793, "learning_rate": 2.8499999999999998e-05, "loss": 14.3427, "step": 98},
    {"epoch": 0.1584, "grad_norm": 36.54108810424805, "learning_rate": 2.88e-05, "loss": 13.5702, "step": 99},
    {"epoch": 0.16, "grad_norm": NaN, "learning_rate": 2.88e-05, "loss": 14.6284, "step": 100},
    {"epoch": 0.1616, "grad_norm": NaN, "learning_rate": 2.88e-05, "loss": 25.1519, "step": 101},
    {"epoch": 0.1632, "grad_norm": 146.48204040527344, "learning_rate": 2.91e-05, "loss": 26.3907, "step": 102},
    {"epoch": 0.1648, "grad_norm": 56.64327621459961, "learning_rate": 2.94e-05, "loss": 17.1257, "step": 103},
    {"epoch": 0.1664, "grad_norm": 87.11421966552734, "learning_rate": 2.97e-05, "loss": 18.9131, "step": 104},
    {"epoch": 0.168, "grad_norm": 57.33453369140625, "learning_rate": 2.9999999999999997e-05, "loss": 15.569, "step": 105},
    {"epoch": 0.1696, "grad_norm": 46.68961715698242, "learning_rate": 3.0299999999999998e-05, "loss": 13.1879, "step": 106},
    {"epoch": 0.1712, "grad_norm": 59.92051315307617, "learning_rate": 3.06e-05, "loss": 14.6438, "step": 107},
    {"epoch": 0.1728, "grad_norm": 57.27889633178711, "learning_rate": 3.09e-05, "loss": 14.3459, "step": 108},
    {"epoch": 0.1744, "grad_norm": 51.79650115966797, "learning_rate": 3.119999999999999e-05, "loss": 12.7327, "step": 109},
    {"epoch": 0.176, "grad_norm": 45.90155029296875, "learning_rate": 3.149999999999999e-05, "loss": 10.6128, "step": 110},
    {"epoch": 0.1776, "grad_norm": 46.71571350097656, "learning_rate": 3.1799999999999994e-05, "loss": 11.1419, "step": 111},
    {"epoch": 0.1792, "grad_norm": 54.34334182739258, "learning_rate": 3.2099999999999994e-05, "loss": 11.3816, "step": 112},
    {"epoch": 0.1808, "grad_norm": 44.52326965332031, "learning_rate": 3.2399999999999995e-05, "loss": 9.99, "step": 113},
    {"epoch": 0.1824, "grad_norm": 51.66781997680664, "learning_rate": 3.2699999999999995e-05, "loss": 10.7379, "step": 114},
    {"epoch": 0.184, "grad_norm": 44.501441955566406, "learning_rate": 3.2999999999999996e-05, "loss": 9.5815, "step": 115},
    {"epoch": 0.1856, "grad_norm": 42.56653594970703, "learning_rate": 3.3299999999999996e-05, "loss": 9.265, "step": 116},
    {"epoch": 0.1872, "grad_norm": 40.3764762878418, "learning_rate": 3.36e-05, "loss": 8.6506, "step": 117},
    {"epoch": 0.1888, "grad_norm": 40.9335823059082, "learning_rate": 3.39e-05, "loss": 8.5488, "step": 118},
    {"epoch": 0.1904, "grad_norm": 42.77170181274414, "learning_rate": 3.42e-05, "loss": 8.5519, "step": 119},
    {"epoch": 0.192, "grad_norm": 39.98923873901367, "learning_rate": 3.45e-05, "loss": 8.0759, "step": 120},
    {"epoch": 0.1936, "grad_norm": 33.6198844909668, "learning_rate": 3.48e-05, "loss": 7.3643, "step": 121},
    {"epoch": 0.1952, "grad_norm": 36.385223388671875, "learning_rate": 3.51e-05, "loss": 7.4889, "step": 122},
    {"epoch": 0.1968, "grad_norm": 34.519630432128906, "learning_rate": 3.539999999999999e-05, "loss": 7.0802, "step": 123},
    {"epoch": 0.1984, "grad_norm": 31.894567489624023, "learning_rate": 3.5699999999999994e-05, "loss": 6.8235, "step": 124},
    {"epoch": 0.2, "grad_norm": 34.02376174926758, "learning_rate": 3.5999999999999994e-05, "loss": 6.9291, "step": 125},
    {"epoch": 0.2016, "grad_norm": 28.912235260009766, "learning_rate": 3.6299999999999995e-05, "loss": 6.4299, "step": 126},
    {"epoch": 0.2032, "grad_norm": 28.84571647644043, "learning_rate": 3.6599999999999995e-05, "loss": 6.3964, "step": 127},
    {"epoch": 0.2048, "grad_norm": 27.383811950683594, "learning_rate": 3.6899999999999996e-05, "loss": 6.2274, "step": 128},
    {"epoch": 0.2064, "grad_norm": 29.78708267211914, "learning_rate": 3.7199999999999996e-05, "loss": 6.0653, "step": 129},
    {"epoch": 0.208, "grad_norm": 28.544218063354492, "learning_rate": 3.75e-05, "loss": 6.2053, "step": 130},
    {"epoch": 0.2096, "grad_norm": 22.004798889160156, "learning_rate": 3.78e-05, "loss": 5.7141, "step": 131},
    {"epoch": 0.2112, "grad_norm": 20.87236785888672, "learning_rate": 3.81e-05, "loss": 5.6288, "step": 132},
    {"epoch": 0.2128, "grad_norm": 21.300033569335938, "learning_rate": 3.84e-05, "loss": 5.5556, "step": 133},
    {"epoch": 0.2144, "grad_norm": 19.316028594970703, "learning_rate": 3.87e-05, "loss": 5.4672, "step": 134},
    {"epoch": 0.216, "grad_norm": 19.701513290405273, "learning_rate": 3.9e-05, "loss": 5.4904, "step": 135},
    {"epoch": 0.2176, "grad_norm": 13.995134353637695, "learning_rate": 3.93e-05, "loss": 5.1972, "step": 136},
    {"epoch": 0.2192, "grad_norm": 14.500862121582031, "learning_rate": 3.96e-05, "loss": 5.1985, "step": 137},
    {"epoch": 0.2208, "grad_norm": 10.669529914855957, "learning_rate": 3.99e-05, "loss": 5.0034, "step": 138},
    {"epoch": 0.2224, "grad_norm": 8.051897048950195, "learning_rate": 4.02e-05, "loss": 4.9248, "step": 139},
    {"epoch": 0.224, "grad_norm": 11.402167320251465, "learning_rate": 4.05e-05, "loss": 5.0594, "step": 140},
    {"epoch": 0.2256, "grad_norm": 10.03395938873291, "learning_rate": 4.08e-05, "loss": 5.037, "step": 141},
    {"epoch": 0.2272, "grad_norm": 8.426224708557129, "learning_rate": 4.11e-05, "loss": 4.9605, "step": 142},
    {"epoch": 0.2288, "grad_norm": 4.501130104064941, "learning_rate": 4.14e-05, "loss": 4.7972, "step": 143},
    {"epoch": 0.2304, "grad_norm": 4.365025520324707, "learning_rate": 4.17e-05, "loss": 4.8065, "step": 144},
    {"epoch": 0.232, "grad_norm": 4.868807792663574, "learning_rate": 4.2e-05, "loss": 4.8179, "step": 145},
    {"epoch": 0.2336, "grad_norm": 3.3663101196289062, "learning_rate": 4.229999999999999e-05, "loss": 4.7889, "step": 146},
    {"epoch": 0.2352, "grad_norm": 3.2163665294647217, "learning_rate": 4.259999999999999e-05, "loss": 4.702, "step": 147},
    {"epoch": 0.2368, "grad_norm": 3.1945879459381104, "learning_rate": 4.289999999999999e-05, "loss": 4.7035, "step": 148},
    {"epoch": 0.2384, "grad_norm": 4.427632808685303, "learning_rate": 4.319999999999999e-05, "loss": 4.7546, "step": 149},
    {"epoch": 0.24, "grad_norm": 5.550114631652832, "learning_rate": 4.3499999999999993e-05, "loss": 4.9709, "step": 150},
    {"epoch": 0.2416, "grad_norm": 53.31546401977539, "learning_rate": 4.3799999999999994e-05, "loss": 5.9402, "step": 151},
    {"epoch": 0.2432, "grad_norm": 7.296631336212158, "learning_rate": 4.4099999999999995e-05, "loss": 4.5481, "step": 152},
    {"epoch": 0.2448, "grad_norm": 5.802248954772949, "learning_rate": 4.4399999999999995e-05, "loss": 4.4998, "step": 153},
    {"epoch": 0.2464, "grad_norm": 3.7592978477478027, "learning_rate": 4.4699999999999996e-05, "loss": 4.4536, "step": 154},
    {"epoch": 0.248, "grad_norm": 3.3045403957366943, "learning_rate": 4.4999999999999996e-05, "loss": 4.391, "step": 155},
    {"epoch": 0.2496, "grad_norm": 5.204708099365234, "learning_rate": 4.5299999999999997e-05, "loss": 4.459, "step": 156},
    {"epoch": 0.2512, "grad_norm": 4.284322261810303, "learning_rate": 4.56e-05, "loss": 4.3071, "step": 157},
    {"epoch": 0.2528, "grad_norm": 3.191441059112549, "learning_rate": 4.59e-05, "loss": 4.3388, "step": 158},
    {"epoch": 0.2544, "grad_norm": 3.909501552581787, "learning_rate": 4.62e-05, "loss": 4.2491, "step": 159},
    {"epoch": 0.256, "grad_norm": 3.5245823860168457, "learning_rate": 4.65e-05, "loss": 4.3493, "step": 160},
    {"epoch": 0.2576, "grad_norm": 2.955253839492798, "learning_rate": 4.68e-05, "loss": 4.2599, "step": 161},
    {"epoch": 0.2592, "grad_norm": 3.2109367847442627, "learning_rate": 4.709999999999999e-05, "loss": 4.161, "step": 162},
    {"epoch": 0.2608, "grad_norm": 2.6348652839660645, "learning_rate": 4.7399999999999993e-05, "loss": 4.1815, "step": 163},
    {"epoch": 0.2624, "grad_norm": 2.593337297439575, "learning_rate": 4.7699999999999994e-05, "loss": 4.0801, "step": 164},
    {"epoch": 0.264, "grad_norm": 3.77801251411438, "learning_rate": 4.7999999999999994e-05, "loss": 4.1635, "step": 165},
    {"epoch": 0.2656, "grad_norm": 2.2782046794891357, "learning_rate": 4.8299999999999995e-05, "loss": 4.0901, "step": 166},
    {"epoch": 0.2672, "grad_norm": 1.8724416494369507, "learning_rate": 4.8599999999999995e-05, "loss": 4.0705, "step": 167},
    {"epoch": 0.2688, "grad_norm": 1.7749762535095215, "learning_rate": 4.8899999999999996e-05, "loss": 4.0042, "step": 168},
    {"epoch": 0.2704, "grad_norm": 1.4947004318237305, "learning_rate": 4.9199999999999997e-05, "loss": 4.0104, "step": 169},
    {"epoch": 0.272, "grad_norm": 3.1837706565856934, "learning_rate": 4.95e-05, "loss": 4.0004, "step": 170},
    {"epoch": 0.2736, "grad_norm": 1.914663553237915, "learning_rate": 4.98e-05, "loss": 3.9682, "step": 171},
    {"epoch": 0.2752, "grad_norm": 1.708173394203186, "learning_rate": 5.01e-05, "loss": 3.9609, "step": 172},
    {"epoch": 0.2768, "grad_norm": 4.2649407386779785, "learning_rate": 5.04e-05, "loss": 3.9917, "step": 173},
    {"epoch": 0.2784, "grad_norm": 1.626754641532898, "learning_rate": 5.07e-05, "loss": 3.9464, "step": 174},
    {"epoch": 0.28, "grad_norm": 1.2504980564117432, "learning_rate": 5.1e-05, "loss": 3.9311, "step": 175},
    {"epoch": 0.2816, "grad_norm": 1.5902968645095825, "learning_rate": 5.13e-05, "loss": 3.9733, "step": 176},
    {"epoch": 0.2832, "grad_norm": 1.160041093826294, "learning_rate": 5.1599999999999994e-05, "loss": 3.9358, "step": 177},
    {"epoch": 0.2848, "grad_norm": 1.2470163106918335, "learning_rate": 5.1899999999999994e-05, "loss": 3.9375, "step": 178},
    {"epoch": 0.2864, "grad_norm": 1.0665581226348877, "learning_rate": 5.2199999999999995e-05, "loss": 3.9701, "step": 179},
    {"epoch": 0.288, "grad_norm": 1.6660319566726685, "learning_rate": 5.2499999999999995e-05, "loss": 3.8957, "step": 180},
    {"epoch": 0.2896, "grad_norm": 1.1934682130813599, "learning_rate": 5.279999999999999e-05, "loss": 3.9527, "step": 181},
    {"epoch": 0.2912, "grad_norm": 2.6113617420196533, "learning_rate": 5.309999999999999e-05, "loss": 3.9198, "step": 182},
    {"epoch": 0.2928, "grad_norm": 1.4902148246765137, "learning_rate": 5.339999999999999e-05, "loss": 3.8525, "step": 183},
    {"epoch": 0.2944, "grad_norm": 1.645369529724121, "learning_rate": 5.369999999999999e-05, "loss": 3.8462, "step": 184},
    {"epoch": 0.296, "grad_norm": 1.7309008836746216, "learning_rate": 5.399999999999999e-05, "loss": 3.9076, "step": 185},
    {"epoch": 0.2976, "grad_norm": 1.9582569599151611, "learning_rate": 5.429999999999999e-05, "loss": 3.9207, "step": 186},
    {"epoch": 0.2992, "grad_norm": 1.795342206954956, "learning_rate": 5.459999999999999e-05, "loss": 3.9195, "step": 187},
    {"epoch": 0.3008, "grad_norm": 0.6605049967765808, "learning_rate": 5.489999999999999e-05, "loss": 3.8484, "step": 188},
    {"epoch": 0.3024, "grad_norm": 1.3379170894622803, "learning_rate": 5.519999999999999e-05, "loss": 3.8709, "step": 189},
    {"epoch": 0.304, "grad_norm": 2.7950329780578613, "learning_rate": 5.5499999999999994e-05, "loss": 3.8463, "step": 190},
    {"epoch": 0.3056, "grad_norm": 4.116920471191406, "learning_rate": 5.5799999999999994e-05, "loss": 4.0707, "step": 191},
    {"epoch": 0.3072, "grad_norm": 1.450844168663025, "learning_rate": 5.6099999999999995e-05, "loss": 3.9311, "step": 192},
    {"epoch": 0.3088, "grad_norm": 3.2481865882873535, "learning_rate": 5.6399999999999995e-05, "loss": 4.0222, "step": 193},
    {"epoch": 0.3104, "grad_norm": 1.377772331237793, "learning_rate": 5.6699999999999996e-05, "loss": 3.9005, "step": 194},
    {"epoch": 0.312, "grad_norm": 2.0002799034118652, "learning_rate": 5.6999999999999996e-05, "loss": 3.9021, "step": 195},
    {"epoch": 0.3136, "grad_norm": 2.1365325450897217, "learning_rate": 5.73e-05, "loss": 3.8869, "step": 196},
    {"epoch": 0.3152, "grad_norm": 2.5995476245880127, "learning_rate": 5.76e-05, "loss": 3.8563, "step": 197},
    {"epoch": 0.3168, "grad_norm": 3.046848773956299, "learning_rate": 5.79e-05, "loss": 3.9452, "step": 198},
    {"epoch": 0.3184, "grad_norm": 2.9868597984313965, "learning_rate": 5.82e-05, "loss": 3.9164, "step": 199},
    {"epoch": 0.32, "grad_norm": NaN, "learning_rate": 5.82e-05, "loss": 3.8911, "step": 200},
    {"epoch": 0.3216, "grad_norm": 22.723224639892578, "learning_rate": 5.85e-05, "loss": 4.3327, "step": 201},
    {"epoch": 0.3232, "grad_norm": 14.148520469665527, "learning_rate": 5.88e-05, "loss": 4.0732, "step": 202},
    {"epoch": 0.3248, "grad_norm": 12.10940170288086, "learning_rate": 5.91e-05, "loss": 4.0389, "step": 203},
    {"epoch": 0.3264, "grad_norm": 7.240406513214111, "learning_rate": 5.94e-05, "loss": 3.938, "step": 204},
    {"epoch": 0.328, "grad_norm": 4.678879261016846, "learning_rate": 5.97e-05, "loss": 3.911, "step": 205},
    {"epoch": 0.3296, "grad_norm": 2.724951982498169, "learning_rate": 5.9999999999999995e-05, "loss": 3.9436, "step": 206},
    {"epoch": 0.3312, "grad_norm": 4.7506279945373535, "learning_rate": 6.0299999999999995e-05, "loss": 3.8465, "step": 207},
    {"epoch": 0.3328, "grad_norm": 2.6390953063964844, "learning_rate": 6.0599999999999996e-05, "loss": 3.932, "step": 208},
    {"epoch": 0.3344, "grad_norm": 3.661578893661499, "learning_rate": 6.0899999999999996e-05, "loss": 3.8498, "step": 209},
    {"epoch": 0.336, "grad_norm": 2.446004867553711, "learning_rate": 6.12e-05, "loss": 3.9245, "step": 210},
    {"epoch": 0.3376, "grad_norm": 1.197083592414856, "learning_rate": 6.149999999999999e-05, "loss": 4.0049, "step": 211},
    {"epoch": 0.3392, "grad_norm": 3.957880735397339, "learning_rate": 6.18e-05, "loss": 3.8129, "step": 212},
    {"epoch": 0.3408, "grad_norm": 2.243058681488037, "learning_rate": 6.209999999999999e-05, "loss": 3.8419, "step": 213},
    {"epoch": 0.3424, "grad_norm": 0.8457456827163696, "learning_rate": 6.239999999999999e-05, "loss": 3.8015, "step": 214},
    {"epoch": 0.344, "grad_norm": 2.7040092945098877, "learning_rate": 6.269999999999999e-05, "loss": 3.7757, "step": 215},
    {"epoch": 0.3456, "grad_norm": 2.867565155029297, "learning_rate": 6.299999999999999e-05, "loss": 3.748, "step": 216},
    {"epoch": 0.3472, "grad_norm": 9.108602523803711, "learning_rate": 6.33e-05, "loss": 4.0417, "step": 217},
    {"epoch": 0.3488, "grad_norm": 2.7541725635528564, "learning_rate": 6.359999999999999e-05, "loss": 3.7908, "step": 218},
    {"epoch": 0.3504, "grad_norm": 1.1848869323730469, "learning_rate": 6.39e-05, "loss": 3.7637, "step": 219},
    {"epoch": 0.352, "grad_norm": 0.6500396132469177, "learning_rate": 6.419999999999999e-05, "loss": 3.8055, "step": 220},
    {"epoch": 0.3536, "grad_norm": 2.706550359725952, "learning_rate": 6.45e-05, "loss": 3.7637, "step": 221},
    {"epoch": 0.3552, "grad_norm": 5.064160346984863, "learning_rate": 6.479999999999999e-05, "loss": 3.7861, "step": 222},
    {"epoch": 0.3568, "grad_norm": 3.20385479927063, "learning_rate": 6.51e-05, "loss": 3.7752, "step": 223},
    {"epoch": 0.3584, "grad_norm": 2.3726119995117188, "learning_rate": 6.539999999999999e-05, "loss": 3.7934, "step": 224},
    {"epoch": 0.36, "grad_norm": 1.985705852508545, "learning_rate": 6.57e-05, "loss": 3.8806, "step": 225},
    {"epoch": 0.3616, "grad_norm": 0.669208288192749, "learning_rate": 6.599999999999999e-05, "loss": 3.7576, "step": 226},
    {"epoch": 0.3632, "grad_norm": 1.7072322368621826, "learning_rate": 6.63e-05, "loss": 3.7382, "step": 227},
    {"epoch": 0.3648, "grad_norm": 2.339816093444824, "learning_rate": 6.659999999999999e-05, "loss": 3.8001, "step": 228},
    {"epoch": 0.3664, "grad_norm": 0.6553944945335388, "learning_rate": 6.69e-05, "loss": 3.7473, "step": 229},
    {"epoch": 0.368, "grad_norm": 1.8117849826812744, "learning_rate": 6.72e-05, "loss": 3.79, "step": 230},
    {"epoch": 0.3696, "grad_norm": 1.0229136943817139, "learning_rate": 6.75e-05, "loss": 3.7968, "step": 231},
    {"epoch": 0.3712, "grad_norm": 1.6037867069244385, "learning_rate": 6.78e-05, "loss": 3.7716, "step": 232},
    {"epoch": 0.3728, "grad_norm": 1.716901183128357, "learning_rate": 6.81e-05, "loss": 3.8464, "step": 233},
    {"epoch": 0.3744, "grad_norm": 9.919891357421875, "learning_rate": 6.84e-05, "loss": 3.8106, "step": 234},
    {"epoch": 0.376, "grad_norm": 1.2543926239013672, "learning_rate": 6.87e-05, "loss": 3.7871, "step": 235},
    {"epoch": 0.3776, "grad_norm": 5.111069202423096, "learning_rate": 6.9e-05, "loss": 3.9449, "step": 236},
    {"epoch": 0.3792, "grad_norm": 0.940678060054779, "learning_rate": 6.93e-05, "loss": 3.784, "step": 237},
    {"epoch": 0.3808, "grad_norm": 0.9248812794685364, "learning_rate": 6.96e-05, "loss": 3.8137, "step": 238},
    {"epoch": 0.3824, "grad_norm": 0.8821243643760681, "learning_rate": 6.989999999999999e-05, "loss": 3.7626, "step": 239},
    {"epoch": 0.384, "grad_norm": 1.0918103456497192, "learning_rate": 7.02e-05, "loss": 3.7819, "step": 240},
    {"epoch": 0.3856, "grad_norm": 0.6585227251052856, "learning_rate": 7.049999999999999e-05, "loss": 3.7891, "step": 241},
    {"epoch": 0.3872, "grad_norm": 3.0343358516693115, "learning_rate": 7.079999999999999e-05, "loss": 3.7803, "step": 242},
    {"epoch": 0.3888, "grad_norm": 2.1487510204315186, "learning_rate": 7.11e-05, "loss": 3.8404, "step": 243},
    {"epoch": 0.3904, "grad_norm": 1.0203007459640503, "learning_rate": 7.139999999999999e-05, "loss": 3.7602, "step": 244},
    {"epoch": 0.392, "grad_norm": 0.8433353900909424, "learning_rate": 7.17e-05, "loss": 3.7826, "step": 245},
    {"epoch": 0.3936, "grad_norm": 2.8857128620147705, "learning_rate": 7.199999999999999e-05, "loss": 3.7436, "step": 246},
    {"epoch": 0.3952, "grad_norm": 6.611523628234863, "learning_rate": 7.23e-05, "loss": 4.0391, "step": 247},
    {"epoch": 0.3968, "grad_norm": 0.7234116196632385, "learning_rate": 7.259999999999999e-05, "loss": 3.8167, "step": 248},
    {"epoch": 0.3984, "grad_norm": 0.973664402961731, "learning_rate": 7.29e-05, "loss": 3.8963, "step": 249},
    {"epoch": 0.4, "grad_norm": 1.6993762254714966, "learning_rate": 7.319999999999999e-05, "loss": 3.9033, "step": 250},
    {"epoch": 0.4016, "grad_norm": 17.571664810180664, "learning_rate": 7.35e-05, "loss": 4.2596, "step": 251},
    {"epoch": 0.4032, "grad_norm": 11.271060943603516, "learning_rate": 7.379999999999999e-05, "loss": 4.1186, "step": 252},
    {"epoch": 0.4048, "grad_norm": 8.646568298339844, "learning_rate": 7.41e-05, "loss": 4.013, "step": 253},
    {"epoch": 0.4064, "grad_norm": 2.02486252784729, "learning_rate": 7.439999999999999e-05, "loss": 3.8974, "step": 254},
    {"epoch": 0.408, "grad_norm": 3.4109764099121094, "learning_rate": 7.47e-05, "loss": 3.8285, "step": 255},
    {"epoch": 0.4096, "grad_norm": 3.8505735397338867, "learning_rate": 7.5e-05, "loss": 3.9306, "step": 256},
    {"epoch": 0.4112, "grad_norm": 7.018677234649658, "learning_rate": 7.529999999999999e-05, "loss": 3.8432, "step": 257},
    {"epoch": 0.4128, "grad_norm": 4.351247310638428, "learning_rate": 7.56e-05, "loss": 3.8534, "step": 258},
    {"epoch": 0.4144, "grad_norm": 5.365427494049072, "learning_rate": 7.589999999999999e-05, "loss": 3.8408, "step": 259},
    {"epoch": 0.416, "grad_norm": 3.984861135482788, "learning_rate": 7.62e-05, "loss": 3.7589, "step": 260},
    {"epoch": 0.4176, "grad_norm": 1.2847763299942017, "learning_rate": 7.649999999999999e-05, "loss": 3.748, "step": 261},
    {"epoch": 0.4192, "grad_norm": 4.559200286865234, "learning_rate": 7.68e-05, "loss": 3.8104, "step": 262},
    {"epoch": 0.4208, "grad_norm": 4.230029106140137, "learning_rate": 7.709999999999999e-05, "loss": 3.7676, "step": 263},
    {"epoch": 0.4224, "grad_norm": 6.13962984085083, "learning_rate": 7.74e-05, "loss": 3.7336, "step": 264},
    {"epoch": 0.424, "grad_norm": 4.625703811645508, "learning_rate": 7.769999999999999e-05, "loss": 3.744, "step": 265},
    {"epoch": 0.4256, "grad_norm": 4.050301551818848, "learning_rate": 7.8e-05, "loss": 3.7662, "step": 266},
    {"epoch": 0.4272, "grad_norm": 3.0125648975372314, "learning_rate": 7.829999999999999e-05, "loss": 3.7469, "step": 267},
    {"epoch": 0.4288, "grad_norm": 0.6710224747657776, "learning_rate": 7.86e-05, "loss": 3.7913, "step": 268},
    {"epoch": 0.4304, "grad_norm": 0.7062709927558899, "learning_rate": 7.89e-05, "loss": 3.7765, "step": 269},
    {"epoch": 0.432, "grad_norm": 5.028995990753174, "learning_rate": 7.92e-05, "loss": 3.7567, "step": 270},
    {"epoch": 0.4336, "grad_norm": 4.44848108291626, "learning_rate": 7.95e-05, "loss": 3.7673, "step": 271},
    {"epoch": 0.4352, "grad_norm": 4.467078685760498, "learning_rate": 7.98e-05, "loss": 3.7462, "step": 272},
    {"epoch": 0.4368, "grad_norm": 3.1866374015808105, "learning_rate": 8.01e-05, "loss": 3.715, "step": 273},
    {"epoch": 0.4384, "grad_norm": 2.605476140975952, "learning_rate": 8.04e-05, "loss": 3.6984, "step": 274},
    {"epoch": 0.44, "grad_norm": 0.6094714999198914, "learning_rate": 8.07e-05, "loss": 3.7325, "step": 275},
    {"epoch": 0.4416, "grad_norm": 6.599428653717041, "learning_rate": 8.1e-05, "loss": 3.8765, "step": 276},
    {"epoch": 0.4432, "grad_norm": 2.6780223846435547, "learning_rate": 8.13e-05, "loss": 3.7577, "step": 277},
    {"epoch": 0.4448, "grad_norm": 3.663605213165283, "learning_rate": 8.16e-05, "loss": 3.8035, "step": 278},
    {"epoch": 0.4464, "grad_norm": 2.812157392501831, "learning_rate": 8.19e-05, "loss": 3.749, "step": 279},
    {"epoch": 0.448, "grad_norm": 2.2692039012908936, "learning_rate": 8.22e-05, "loss": 3.7149, "step": 280},
    {"epoch": 0.4496, "grad_norm": 1.1938503980636597, "learning_rate": 8.25e-05, "loss": 3.7246, "step": 281},
    {"epoch": 0.4512, "grad_norm": 1.3016897439956665, "learning_rate": 8.28e-05, "loss": 3.6932, "step": 282},
    {"epoch": 0.4528, "grad_norm": 2.0602409839630127, "learning_rate": 8.31e-05, "loss": 3.7896, "step": 283},
    {"epoch": 0.4544, "grad_norm": 1.2453322410583496, "learning_rate": 8.34e-05, "loss": 3.7896, "step": 284},
    {"epoch": 0.456, "grad_norm": 0.7699930667877197, "learning_rate": 8.37e-05, "loss": 3.7406, "step": 285},
    {"epoch": 0.4576, "grad_norm": 0.9949842691421509, "learning_rate": 8.4e-05, "loss": 3.7951, "step": 286},
    {"epoch": 0.4592, "grad_norm": 1.2708395719528198, "learning_rate": 8.43e-05, "loss": 3.7142, "step": 287},
    {"epoch": 0.4608, "grad_norm": 1.6578696966171265, "learning_rate": 8.459999999999998e-05, "loss": 3.7042, "step": 288},
    {"epoch": 0.4624, "grad_norm": 0.9027276635169983, "learning_rate": 8.489999999999999e-05, "loss": 3.6875, "step": 289},
    {"epoch": 0.464, "grad_norm": 1.3110026121139526, "learning_rate": 8.519999999999998e-05, "loss": 3.7219, "step": 290},
    {"epoch": 0.4656, "grad_norm": 0.9840269088745117, "learning_rate": 8.549999999999999e-05, "loss": 3.7555, "step": 291},
    {"epoch": 0.4672, "grad_norm": 1.4040346145629883, "learning_rate": 8.579999999999998e-05, "loss": 3.7981, "step": 292},
    {"epoch": 0.4688, "grad_norm": 1.0543975830078125, "learning_rate": 8.609999999999999e-05, "loss": 3.7075, "step": 293},
    {"epoch": 0.4704, "grad_norm": 0.9345111846923828, "learning_rate": 8.639999999999999e-05, "loss": 3.677, "step": 294},
    {"epoch": 0.472, "grad_norm": 1.007042646408081, "learning_rate": 8.669999999999998e-05, "loss": 3.7533, "step": 295},
    {"epoch": 0.4736, "grad_norm": 1.7284626960754395, "learning_rate": 8.699999999999999e-05, "loss": 3.6897, "step": 296},
    {"epoch": 0.4752, "grad_norm": 2.507981538772583, "learning_rate": 8.729999999999998e-05, "loss": 3.8189, "step": 297},
    {"epoch": 0.4768, "grad_norm": 2.26454496383667, "learning_rate": 8.759999999999999e-05, "loss": 3.8095, "step": 298},
    {"epoch": 0.4784, "grad_norm": 1.5712822675704956, "learning_rate": 8.789999999999998e-05, "loss": 3.8321, "step": 299},
    {"epoch": 0.48, "grad_norm": 1.8837485313415527, "learning_rate": 8.819999999999999e-05, "loss": 3.9162, "step": 300},
    {"epoch": 0.4816, "grad_norm": 14.770750999450684, "learning_rate": 8.849999999999998e-05, "loss": 4.0945, "step": 301},
    {"epoch": 0.4832, "grad_norm": 14.976526260375977, "learning_rate": 8.879999999999999e-05, "loss": 4.0664, "step": 302},
    {"epoch": 0.4848, "grad_norm": 9.496882438659668, "learning_rate": 8.909999999999998e-05, "loss": 4.0088, "step": 303},
    {"epoch": 0.4864, "grad_norm": 13.879667282104492, "learning_rate": 8.939999999999999e-05, "loss": 4.1825, "step": 304},
    {"epoch": 0.488, "grad_norm": 6.049519062042236, "learning_rate": 8.969999999999998e-05, "loss": 3.883, "step": 305},
    {"epoch": 0.4896, "grad_norm": 1.975704312324524, "learning_rate": 8.999999999999999e-05, "loss": 3.8272, "step": 306},
    {"epoch": 0.4912, "grad_norm": 5.8130598068237305, "learning_rate": 9.029999999999999e-05, "loss": 3.8001, "step": 307},
    {"epoch": 0.4928, "grad_norm": 3.5878612995147705, "learning_rate": 9.059999999999999e-05, "loss": 3.8353, "step": 308},
    {"epoch": 0.4944, "grad_norm": 4.1221513748168945, "learning_rate": 9.089999999999999e-05, "loss": 3.9413, "step": 309},
    {"epoch": 0.496, "grad_norm": 5.489438533782959, "learning_rate": 9.12e-05, "loss": 3.7904, "step": 310},
    {"epoch": 0.4976, "grad_norm": 1.6620556116104126, "learning_rate": 9.149999999999999e-05, "loss": 3.7937, "step": 311},
    {"epoch": 0.4992, "grad_norm": 1.13966703414917, "learning_rate": 9.18e-05, "loss": 3.7336, "step": 312},
    {"epoch": 0.5008, "grad_norm": 1.5242611169815063, "learning_rate": 9.209999999999999e-05, "loss": 3.7161, "step": 313},
    {"epoch": 0.5024, "grad_norm": 3.9798734188079834, "learning_rate": 9.24e-05, "loss": 3.7, "step": 314},
    {"epoch": 0.504, "grad_norm": 7.939405918121338, "learning_rate": 9.269999999999999e-05, "loss": 3.7537, "step": 315},
    {"epoch": 0.5056, "grad_norm": 4.0154709815979, "learning_rate": 9.3e-05, "loss": 3.708, "step": 316},
    {"epoch": 0.5072, "grad_norm": 4.138357639312744, "learning_rate": 9.329999999999999e-05, "loss": 3.7166, "step": 317},
    {"epoch": 0.5088, "grad_norm": 8.64471435546875, "learning_rate": 9.36e-05, "loss": 4.0627, "step": 318},
    {"epoch": 0.5104, "grad_norm": 0.7231702208518982, "learning_rate": 9.389999999999999e-05, "loss": 3.7427, "step": 319},
    {"epoch": 0.512, "grad_norm": 2.371631622314453, "learning_rate": 9.419999999999999e-05, "loss": 3.7203, "step": 320},
    {"epoch": 0.5136, "grad_norm": 4.284900188446045, "learning_rate": 9.449999999999999e-05, "loss": 3.7413, "step": 321},
    {"epoch": 0.5152, "grad_norm": 3.0372443199157715, "learning_rate": 9.479999999999999e-05, "loss": 3.6992, "step": 322},
    {"epoch": 0.5168, "grad_norm": 1.9789845943450928, "learning_rate": 9.51e-05, "loss": 3.7184, "step": 323},
    {"epoch": 0.5184, "grad_norm": 1.624227523803711, "learning_rate": 9.539999999999999e-05, "loss": 3.6749, "step": 324},
    {"epoch": 0.52, "grad_norm": 1.5696678161621094, "learning_rate": 9.57e-05, "loss": 3.7581, "step": 325},
    {"epoch": 0.5216, "grad_norm": 2.7740790843963623, "learning_rate": 9.599999999999999e-05, "loss": 3.6773, "step": 326},
    {"epoch": 0.5232, "grad_norm": 2.1769227981567383, "learning_rate": 9.63e-05, "loss": 3.6821, "step": 327},
    {"epoch": 0.5248, "grad_norm": 3.454484224319458, "learning_rate": 9.659999999999999e-05, "loss": 3.6977, "step": 328},
    {"epoch": 0.5264, "grad_norm": 1.035311222076416, "learning_rate": 9.69e-05, "loss": 3.6816, "step": 329},
    {"epoch": 0.528, "grad_norm": 1.0064358711242676, "learning_rate": 9.719999999999999e-05, "loss": 3.7105, "step": 330},
    {"epoch": 0.5296, "grad_norm": 2.302251100540161, "learning_rate": 9.75e-05, "loss": 3.7867, "step": 331},
    {"epoch": 0.5312, "grad_norm": 2.8935694694519043, "learning_rate": 9.779999999999999e-05, "loss": 3.7186, "step": 332},
    {"epoch": 0.5328, "grad_norm": 4.943471908569336, "learning_rate": 9.81e-05, "loss": 3.6946, "step": 333},
    {"epoch": 0.5344, "grad_norm": 2.8398258686065674, "learning_rate": 9.839999999999999e-05, "loss": 3.7091, "step": 334},
    {"epoch": 0.536, "grad_norm": 3.0762977600097656, "learning_rate": 9.87e-05, "loss": 3.7574, "step": 335},
    {"epoch": 0.5376, "grad_norm": 1.8813797235488892, "learning_rate": 9.9e-05, "loss": 3.7118, "step": 336},
    {"epoch": 0.5392, "grad_norm": 0.8849917054176331, "learning_rate": 9.93e-05, "loss": 3.77, "step": 337},
    {"epoch": 0.5408, "grad_norm": 1.4980673789978027, "learning_rate": 9.96e-05, "loss": 3.6983, "step": 338},
    {"epoch": 0.5424, "grad_norm": 1.593652367591858, "learning_rate": 9.99e-05, "loss": 3.6434, "step": 339},
    {"epoch": 0.544, "grad_norm": 1.0899137258529663, "learning_rate": 0.0001002, "loss": 3.6421, "step": 340},
    {"epoch": 0.5456, "grad_norm": 0.6649819016456604, "learning_rate": 0.0001005, "loss": 3.6891, "step": 341},
    {"epoch": 0.5472, "grad_norm": 1.5821571350097656, "learning_rate": 0.0001008, "loss": 3.794, "step": 342},
    {"epoch": 0.5488, "grad_norm": 2.7996299266815186, "learning_rate": 0.0001011, "loss": 3.6779, "step": 343},
    {"epoch": 0.5504, "grad_norm": 2.5575685501098633, "learning_rate": 0.0001014, "loss": 3.7879, "step": 344},
    {"epoch": 0.552, "grad_norm": 1.2596614360809326, "learning_rate": 0.00010169999999999999, "loss": 3.7208, "step": 345},
    {"epoch": 0.5536, "grad_norm": 0.8037718534469604, "learning_rate": 0.000102, "loss": 3.6811, "step": 346},
    {"epoch": 0.5552, "grad_norm": 1.3349064588546753, "learning_rate": 0.00010229999999999999, "loss": 3.6959, "step": 347},
    {"epoch": 0.5568, "grad_norm": 1.269572138786316, "learning_rate": 0.0001026, "loss": 3.7447, "step": 348},
    {"epoch": 0.5584, "grad_norm": 1.022356629371643, "learning_rate": 0.0001029, "loss": 3.6698, "step": 349},
    {"epoch": 0.56, "grad_norm": 1.5080015659332275, "learning_rate": 0.00010319999999999999, "loss": 3.7888, "step": 350},
    {"epoch": 0.5616, "grad_norm": 39.285152435302734, "learning_rate": 0.00010349999999999998, "loss": 5.0541, "step": 351},
    {"epoch": 0.5632, "grad_norm": 9.576460838317871, "learning_rate": 0.00010379999999999999, "loss": 4.014, "step": 352},
    {"epoch": 0.5648, "grad_norm": 6.356123447418213, "learning_rate": 0.00010409999999999998, "loss": 3.9312, "step": 353},
    {"epoch": 0.5664, "grad_norm": 2.139646291732788, "learning_rate": 0.00010439999999999999, "loss": 3.7221, "step": 354},
    {"epoch": 0.568, "grad_norm": 1.8156242370605469, "learning_rate": 0.00010469999999999998, "loss": 3.8395, "step": 355},
    {"epoch": 0.5696, "grad_norm": 4.785361289978027, "learning_rate": 0.00010499999999999999, "loss": 3.719, "step": 356},
    {"epoch": 0.5712, "grad_norm": 3.600017786026001, "learning_rate": 0.00010529999999999998, "loss": 3.7428, "step": 357},
    {"epoch": 0.5728, "grad_norm": 2.4187095165252686, "learning_rate": 0.00010559999999999998, "loss": 3.6832, "step": 358},
    {"epoch": 0.5744, "grad_norm": 1.5843887329101562, "learning_rate": 0.00010589999999999999, "loss": 3.6504, "step": 359},
    {"epoch": 0.576, "grad_norm": 1.5045654773712158, "learning_rate": 0.00010619999999999998, "loss": 3.6816, "step": 360},
    {"epoch": 0.5776, "grad_norm": 2.58827543258667, "learning_rate": 0.00010649999999999999, "loss": 3.642, "step": 361},
    {"epoch": 0.5792, "grad_norm": 2.5386884212493896, "learning_rate": 0.00010679999999999998, "loss": 3.6578, "step": 362},
    {"epoch": 0.5808, "grad_norm": 2.9344706535339355, "learning_rate": 0.00010709999999999999, "loss": 3.6694, "step": 363},
    {"epoch": 0.5824, "grad_norm": 2.340221643447876, "learning_rate": 0.00010739999999999998, "loss": 3.6504, "step": 364},
    {"epoch": 0.584, "grad_norm": 2.816999912261963, "learning_rate": 0.00010769999999999999, "loss": 3.6654, "step": 365},
    {"epoch": 0.5856, "grad_norm": 1.5071390867233276, "learning_rate": 0.00010799999999999998, "loss": 3.6406, "step": 366},
    {"epoch": 0.5872, "grad_norm": 0.7593219876289368, "learning_rate": 0.00010829999999999999, "loss": 3.6683, "step": 367},
    {"epoch": 0.5888, "grad_norm": 2.646967887878418, "learning_rate": 0.00010859999999999998, "loss": 3.6844, "step": 368},
    {"epoch": 0.5904, "grad_norm": 2.8628735542297363, "learning_rate": 0.00010889999999999999, "loss": 3.6159, "step": 369},
    {"epoch": 0.592, "grad_norm": 2.4796457290649414, "learning_rate": 0.00010919999999999998, "loss": 3.6951, "step": 370},
    {"epoch": 0.5936, "grad_norm": 1.1962988376617432, "learning_rate": 0.00010949999999999999, "loss": 3.5725, "step": 371},
    {"epoch": 0.5952, "grad_norm": 0.9031145572662354, "learning_rate": 0.00010979999999999999, "loss": 3.5847, "step": 372},
    {"epoch": 0.5968, "grad_norm": 2.3092095851898193, "learning_rate": 0.00011009999999999999, "loss": 3.6181, "step": 373},
    {"epoch": 0.5984, "grad_norm": 1.1398162841796875, "learning_rate": 0.00011039999999999999, "loss": 3.6774, "step": 374},
    {"epoch": 0.6, "grad_norm": 1.0708636045455933, "learning_rate": 0.0001107, "loss": 3.5872, "step": 375},
    {"epoch": 0.6016, "grad_norm": 0.873319149017334, "learning_rate": 0.00011099999999999999, "loss": 3.6512, "step": 376},
    {"epoch": 0.6032, "grad_norm": 0.8766927123069763, "learning_rate": 0.0001113, "loss": 3.6434, "step": 377},
    {"epoch": 0.6048, "grad_norm": 0.7983845472335815, "learning_rate": 0.00011159999999999999, "loss": 3.5572, "step": 378},
    {"epoch": 0.6064, "grad_norm": 6.645163059234619, "learning_rate": 0.0001119, "loss": 3.7195, "step": 379},
    {"epoch": 0.608, "grad_norm": 4.845424652099609, "learning_rate": 0.00011219999999999999, "loss": 3.7342, "step": 380},
    {"epoch": 0.6096, "grad_norm": 0.7528507709503174, "learning_rate": 0.0001125, "loss": 3.5633, "step": 381},
    {"epoch": 0.6112, "grad_norm": 0.69414883852005, "learning_rate": 0.00011279999999999999, "loss": 3.6274, "step": 382},
    {"epoch": 0.6128, "grad_norm": 2.0266685485839844, "learning_rate": 0.00011309999999999998, "loss": 3.5491, "step": 383},
    {"epoch": 0.6144, "grad_norm": 0.7420620918273926, "learning_rate": 0.00011339999999999999, "loss": 3.6293, "step": 384},
    {"epoch": 0.616, "grad_norm": 1.1609050035476685, "learning_rate": 0.00011369999999999999, "loss": 3.5036, "step": 385},
    {"epoch": 0.6176, "grad_norm": 1.6696407794952393, "learning_rate": 0.00011399999999999999, "loss": 3.5909, "step": 386},
    {"epoch": 0.6192, "grad_norm": 1.1470685005187988, "learning_rate": 0.00011429999999999999, "loss": 3.6343, "step": 387},
    {"epoch": 0.6208, "grad_norm": 1.3622722625732422, "learning_rate": 0.0001146, "loss": 3.5755, "step": 388},
    {"epoch": 0.6224, "grad_norm": 1.2317267656326294, "learning_rate": 0.00011489999999999999, "loss": 3.5538, "step": 389},
    {"epoch": 0.624, "grad_norm": 1.1414676904678345, "learning_rate": 0.0001152, "loss": 3.5555, "step": 390},
    {"epoch": 0.6256, "grad_norm": 1.998960018157959, "learning_rate": 0.00011549999999999999, "loss": 3.5465, "step": 391},
    {"epoch": 0.6272, "grad_norm": 1.4650264978408813, "learning_rate": 0.0001158, "loss": 3.5777, "step": 392},
    {"epoch": 0.6288, "grad_norm": 1.5700796842575073, "learning_rate": 0.00011609999999999999, "loss": 3.6034, "step": 393},
    {"epoch": 0.6304, "grad_norm": 2.38299298286438, "learning_rate": 0.0001164, "loss": 3.5582, "step": 394},
    {"epoch": 0.632, "grad_norm": 1.2898112535476685, "learning_rate": 0.00011669999999999999, "loss": 3.4869, "step": 395},
    {"epoch": 0.6336, "grad_norm": 1.2601486444473267, "learning_rate": 0.000117, "loss": 3.5423, "step": 396},
    {"epoch": 0.6352, "grad_norm": 1.907885193824768, "learning_rate": 0.00011729999999999999, "loss": 3.5019, "step": 397},
    {"epoch": 0.6368, "grad_norm": 1.2280569076538086, "learning_rate": 0.0001176, "loss": 3.6299, "step": 398},
    {"epoch": 0.6384, "grad_norm": 2.214331865310669, "learning_rate": 0.00011789999999999999, "loss": 3.5979, "step": 399},
    {"epoch": 0.64, "grad_norm": NaN, "learning_rate": 0.00011789999999999999, "loss": 3.4218, "step": 400},
    {"epoch": 0.6416, "grad_norm": 38.77112579345703, "learning_rate": 0.0001182, "loss": 4.5562, "step": 401},
    {"epoch": 0.6432, "grad_norm": 8.848291397094727, "learning_rate": 0.0001185, "loss": 3.7699, "step": 402},
    {"epoch": 0.6448, "grad_norm": 6.007197856903076, "learning_rate": 0.0001188, "loss": 3.539, "step": 403},
    {"epoch": 0.6464, "grad_norm": 3.0180368423461914, "learning_rate": 0.0001191, "loss": 3.5382, "step": 404},
    {"epoch": 0.648, "grad_norm": 2.071746587753296, "learning_rate": 0.0001194, "loss": 3.5857, "step": 405},
    {"epoch": 0.6496, "grad_norm": 4.427801132202148, "learning_rate": 0.0001197, "loss": 3.526, "step": 406},
    {"epoch": 0.6512, "grad_norm": 5.680927753448486, "learning_rate": 0.00011999999999999999, "loss": 3.4296, "step": 407},
    {"epoch": 0.6528, "grad_norm": 2.7837042808532715, "learning_rate": 0.0001203, "loss": 3.4657, "step": 408},
    {"epoch": 0.6544, "grad_norm": 4.605573654174805, "learning_rate": 0.00012059999999999999, "loss": 3.4279, "step": 409},
    {"epoch": 0.656, "grad_norm": 1.96554696559906, "learning_rate": 0.0001209, "loss": 3.3618, "step": 410},
    {"epoch": 0.6576, "grad_norm": 5.76222038269043, "learning_rate": 0.00012119999999999999, "loss": 3.4163, "step": 411},
    {"epoch": 0.6592, "grad_norm": 4.640344619750977, "learning_rate": 0.0001215, "loss": 3.4429, "step": 412},
    {"epoch": 0.6608, "grad_norm": 4.301933288574219, "learning_rate": 0.00012179999999999999, "loss": 3.3255, "step": 413},
    {"epoch": 0.6624, "grad_norm": 3.781334638595581, "learning_rate": 0.00012209999999999999, "loss": 3.3261, "step": 414},
    {"epoch": 0.664, "grad_norm": 3.663053035736084, "learning_rate": 0.0001224, "loss": 3.2962, "step": 415},
    {"epoch": 0.6656, "grad_norm": 3.2776567935943604, "learning_rate": 0.00012269999999999997, "loss": 3.333, "step": 416},
    {"epoch": 0.6672, "grad_norm": 1.000927209854126, "learning_rate": 0.00012299999999999998,
|
"loss": 3.2321, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.6688, |
|
"grad_norm": 1.561220407485962, |
|
"learning_rate": 0.0001233, |
|
"loss": 3.3121, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.6704, |
|
"grad_norm": 0.8714520931243896, |
|
"learning_rate": 0.0001236, |
|
"loss": 3.3394, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 1.1457229852676392, |
|
"learning_rate": 0.00012389999999999998, |
|
"loss": 3.1645, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6736, |
|
"grad_norm": 2.054020881652832, |
|
"learning_rate": 0.00012419999999999998, |
|
"loss": 3.2115, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.6752, |
|
"grad_norm": 3.8146936893463135, |
|
"learning_rate": 0.0001245, |
|
"loss": 3.3334, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.6768, |
|
"grad_norm": 2.3825631141662598, |
|
"learning_rate": 0.00012479999999999997, |
|
"loss": 3.2264, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.6784, |
|
"grad_norm": 1.282517671585083, |
|
"learning_rate": 0.00012509999999999998, |
|
"loss": 3.3354, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.5535123348236084, |
|
"learning_rate": 0.00012539999999999999, |
|
"loss": 3.1819, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.6816, |
|
"grad_norm": 1.8400110006332397, |
|
"learning_rate": 0.0001257, |
|
"loss": 3.1989, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.6832, |
|
"grad_norm": 1.3851298093795776, |
|
"learning_rate": 0.00012599999999999997, |
|
"loss": 3.2336, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.6848, |
|
"grad_norm": 1.884459376335144, |
|
"learning_rate": 0.00012629999999999998, |
|
"loss": 3.2123, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.6864, |
|
"grad_norm": 1.7640012502670288, |
|
"learning_rate": 0.0001266, |
|
"loss": 3.0558, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 2.564265489578247, |
|
"learning_rate": 0.0001269, |
|
"loss": 3.0314, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6896, |
|
"grad_norm": 1.8793052434921265, |
|
"learning_rate": 0.00012719999999999997, |
|
"loss": 3.0916, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.6912, |
|
"grad_norm": 1.3174560070037842, |
|
"learning_rate": 0.00012749999999999998, |
|
"loss": 3.0977, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.6928, |
|
"grad_norm": 0.9135323166847229, |
|
"learning_rate": 0.0001278, |
|
"loss": 3.1459, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.6944, |
|
"grad_norm": 1.05746591091156, |
|
"learning_rate": 0.0001281, |
|
"loss": 3.1823, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.696, |
|
"grad_norm": 1.2425645589828491, |
|
"learning_rate": 0.00012839999999999998, |
|
"loss": 2.9603, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.6976, |
|
"grad_norm": 1.2454054355621338, |
|
"learning_rate": 0.00012869999999999998, |
|
"loss": 2.9414, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.6992, |
|
"grad_norm": 0.9464673399925232, |
|
"learning_rate": 0.000129, |
|
"loss": 3.1432, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.7008, |
|
"grad_norm": 1.5856995582580566, |
|
"learning_rate": 0.0001293, |
|
"loss": 3.0063, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.7024, |
|
"grad_norm": 1.043485403060913, |
|
"learning_rate": 0.00012959999999999998, |
|
"loss": 2.9146, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 1.240867257118225, |
|
"learning_rate": 0.00012989999999999999, |
|
"loss": 2.9979, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.7056, |
|
"grad_norm": 1.7289670705795288, |
|
"learning_rate": 0.0001302, |
|
"loss": 2.8699, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.7072, |
|
"grad_norm": 1.728317141532898, |
|
"learning_rate": 0.0001305, |
|
"loss": 2.7802, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.7088, |
|
"grad_norm": 0.960502028465271, |
|
"learning_rate": 0.00013079999999999998, |
|
"loss": 2.9141, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.7104, |
|
"grad_norm": 2.093698501586914, |
|
"learning_rate": 0.0001311, |
|
"loss": 3.1318, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.712, |
|
"grad_norm": 1.6515812873840332, |
|
"learning_rate": 0.0001314, |
|
"loss": 2.9467, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.7136, |
|
"grad_norm": 1.4129968881607056, |
|
"learning_rate": 0.00013169999999999998, |
|
"loss": 2.7909, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.7152, |
|
"grad_norm": 1.5885038375854492, |
|
"learning_rate": 0.00013199999999999998, |
|
"loss": 3.1262, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.7168, |
|
"grad_norm": 1.222842812538147, |
|
"learning_rate": 0.0001323, |
|
"loss": 3.2029, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.7184, |
|
"grad_norm": 1.3282477855682373, |
|
"learning_rate": 0.0001326, |
|
"loss": 3.0013, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001326, |
|
"loss": 3.4352, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7216, |
|
"grad_norm": 27.799081802368164, |
|
"learning_rate": 0.00013289999999999998, |
|
"loss": 4.3657, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.7232, |
|
"grad_norm": 5.403924465179443, |
|
"learning_rate": 0.00013319999999999999, |
|
"loss": 2.9244, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.7248, |
|
"grad_norm": 3.8071448802948, |
|
"learning_rate": 0.0001335, |
|
"loss": 2.927, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.7264, |
|
"grad_norm": 3.504509210586548, |
|
"learning_rate": 0.0001338, |
|
"loss": 2.9719, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.728, |
|
"grad_norm": 3.500847578048706, |
|
"learning_rate": 0.00013409999999999998, |
|
"loss": 3.0386, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.7296, |
|
"grad_norm": 3.5392863750457764, |
|
"learning_rate": 0.0001344, |
|
"loss": 2.9468, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.7312, |
|
"grad_norm": 5.1045732498168945, |
|
"learning_rate": 0.0001347, |
|
"loss": 2.7885, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.7328, |
|
"grad_norm": 6.027789115905762, |
|
"learning_rate": 0.000135, |
|
"loss": 2.9067, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.7344, |
|
"grad_norm": 5.094452381134033, |
|
"learning_rate": 0.00013529999999999998, |
|
"loss": 2.8379, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 2.6457953453063965, |
|
"learning_rate": 0.0001356, |
|
"loss": 2.8325, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7376, |
|
"grad_norm": 1.5734143257141113, |
|
"learning_rate": 0.0001359, |
|
"loss": 2.8004, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.7392, |
|
"grad_norm": 2.7408978939056396, |
|
"learning_rate": 0.0001362, |
|
"loss": 2.5294, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.7408, |
|
"grad_norm": 3.2462551593780518, |
|
"learning_rate": 0.00013649999999999998, |
|
"loss": 2.5774, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.7424, |
|
"grad_norm": 5.122827529907227, |
|
"learning_rate": 0.0001368, |
|
"loss": 2.5412, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.744, |
|
"grad_norm": 6.828001976013184, |
|
"learning_rate": 0.0001371, |
|
"loss": 2.5768, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.7456, |
|
"grad_norm": 5.996628761291504, |
|
"learning_rate": 0.0001374, |
|
"loss": 2.5803, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.7472, |
|
"grad_norm": 3.842134714126587, |
|
"learning_rate": 0.00013769999999999999, |
|
"loss": 2.3747, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.7488, |
|
"grad_norm": 1.4524292945861816, |
|
"learning_rate": 0.000138, |
|
"loss": 2.4186, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.7504, |
|
"grad_norm": 1.6084707975387573, |
|
"learning_rate": 0.0001383, |
|
"loss": 2.3012, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 2.121351718902588, |
|
"learning_rate": 0.0001386, |
|
"loss": 2.442, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7536, |
|
"grad_norm": 1.5034464597702026, |
|
"learning_rate": 0.0001389, |
|
"loss": 2.2728, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.7552, |
|
"grad_norm": 1.2867931127548218, |
|
"learning_rate": 0.0001392, |
|
"loss": 2.3669, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.7568, |
|
"grad_norm": 1.8455201387405396, |
|
"learning_rate": 0.0001395, |
|
"loss": 2.3831, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.7584, |
|
"grad_norm": 1.4569259881973267, |
|
"learning_rate": 0.00013979999999999998, |
|
"loss": 2.6884, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.9550684690475464, |
|
"learning_rate": 0.0001401, |
|
"loss": 2.4852, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.7616, |
|
"grad_norm": 2.876927137374878, |
|
"learning_rate": 0.0001404, |
|
"loss": 2.5227, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.7632, |
|
"grad_norm": 1.2651807069778442, |
|
"learning_rate": 0.00014069999999999998, |
|
"loss": 2.5994, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.7648, |
|
"grad_norm": 1.26189386844635, |
|
"learning_rate": 0.00014099999999999998, |
|
"loss": 2.2933, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.7664, |
|
"grad_norm": 1.3137550354003906, |
|
"learning_rate": 0.0001413, |
|
"loss": 2.3087, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 1.7220642566680908, |
|
"learning_rate": 0.00014159999999999997, |
|
"loss": 2.4592, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7696, |
|
"grad_norm": 1.3261381387710571, |
|
"learning_rate": 0.00014189999999999998, |
|
"loss": 2.0056, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.7712, |
|
"grad_norm": 2.571230173110962, |
|
"learning_rate": 0.0001422, |
|
"loss": 2.2005, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.7728, |
|
"grad_norm": 1.9342719316482544, |
|
"learning_rate": 0.0001425, |
|
"loss": 2.444, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.7744, |
|
"grad_norm": 1.9060297012329102, |
|
"learning_rate": 0.00014279999999999997, |
|
"loss": 2.5657, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.776, |
|
"grad_norm": 1.7057262659072876, |
|
"learning_rate": 0.00014309999999999998, |
|
"loss": 2.2488, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.7776, |
|
"grad_norm": 1.5254745483398438, |
|
"learning_rate": 0.0001434, |
|
"loss": 2.3053, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.7792, |
|
"grad_norm": 1.2841426134109497, |
|
"learning_rate": 0.00014369999999999997, |
|
"loss": 2.7327, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.7808, |
|
"grad_norm": 1.2939062118530273, |
|
"learning_rate": 0.00014399999999999998, |
|
"loss": 2.1748, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.7824, |
|
"grad_norm": 1.041858434677124, |
|
"learning_rate": 0.00014429999999999998, |
|
"loss": 2.2685, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 1.1529954671859741, |
|
"learning_rate": 0.0001446, |
|
"loss": 2.6499, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7856, |
|
"grad_norm": 1.2997585535049438, |
|
"learning_rate": 0.00014489999999999997, |
|
"loss": 2.4287, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.7872, |
|
"grad_norm": 1.8214664459228516, |
|
"learning_rate": 0.00014519999999999998, |
|
"loss": 2.4024, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.7888, |
|
"grad_norm": 2.8641598224639893, |
|
"learning_rate": 0.00014549999999999999, |
|
"loss": 2.3568, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.7904, |
|
"grad_norm": 2.793945789337158, |
|
"learning_rate": 0.0001458, |
|
"loss": 2.635, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.792, |
|
"grad_norm": 1.2558726072311401, |
|
"learning_rate": 0.00014609999999999997, |
|
"loss": 2.4789, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.7936, |
|
"grad_norm": 1.8537378311157227, |
|
"learning_rate": 0.00014639999999999998, |
|
"loss": 2.0977, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.7952, |
|
"grad_norm": 1.3181400299072266, |
|
"learning_rate": 0.0001467, |
|
"loss": 2.168, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.7968, |
|
"grad_norm": 9.861762046813965, |
|
"learning_rate": 0.000147, |
|
"loss": 3.4399, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.7984, |
|
"grad_norm": 2.7572944164276123, |
|
"learning_rate": 0.00014729999999999998, |
|
"loss": 2.4976, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.072735071182251, |
|
"learning_rate": 0.00014759999999999998, |
|
"loss": 3.0006, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.8016, |
|
"grad_norm": 3.723292350769043, |
|
"learning_rate": 0.0001479, |
|
"loss": 2.4371, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.8032, |
|
"grad_norm": 5.342506408691406, |
|
"learning_rate": 0.0001482, |
|
"loss": 3.0689, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.8048, |
|
"grad_norm": 5.763881683349609, |
|
"learning_rate": 0.00014849999999999998, |
|
"loss": 2.8854, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.8064, |
|
"grad_norm": 1.8335249423980713, |
|
"learning_rate": 0.00014879999999999998, |
|
"loss": 2.4936, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.808, |
|
"grad_norm": 2.8503644466400146, |
|
"learning_rate": 0.0001491, |
|
"loss": 2.4671, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.8096, |
|
"grad_norm": 5.93911600112915, |
|
"learning_rate": 0.0001494, |
|
"loss": 2.2154, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.8112, |
|
"grad_norm": 4.656365871429443, |
|
"learning_rate": 0.00014969999999999998, |
|
"loss": 2.2564, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.8128, |
|
"grad_norm": 4.47904109954834, |
|
"learning_rate": 0.00015, |
|
"loss": 2.4022, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.8144, |
|
"grad_norm": 2.0499017238616943, |
|
"learning_rate": 0.0001503, |
|
"loss": 1.9681, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 1.0935138463974, |
|
"learning_rate": 0.00015059999999999997, |
|
"loss": 2.0941, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.8176, |
|
"grad_norm": 2.3944854736328125, |
|
"learning_rate": 0.00015089999999999998, |
|
"loss": 2.0268, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.8192, |
|
"grad_norm": 6.021939277648926, |
|
"learning_rate": 0.0001512, |
|
"loss": 2.3145, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.8208, |
|
"grad_norm": 5.291767120361328, |
|
"learning_rate": 0.0001515, |
|
"loss": 2.038, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.8224, |
|
"grad_norm": 4.051759719848633, |
|
"learning_rate": 0.00015179999999999998, |
|
"loss": 1.8124, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.824, |
|
"grad_norm": 6.387513637542725, |
|
"learning_rate": 0.00015209999999999998, |
|
"loss": 2.114, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.8256, |
|
"grad_norm": 3.993975877761841, |
|
"learning_rate": 0.0001524, |
|
"loss": 1.9412, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.8272, |
|
"grad_norm": 2.036212682723999, |
|
"learning_rate": 0.0001527, |
|
"loss": 1.8678, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.8288, |
|
"grad_norm": 1.404420256614685, |
|
"learning_rate": 0.00015299999999999998, |
|
"loss": 2.1287, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.8304, |
|
"grad_norm": 1.0048662424087524, |
|
"learning_rate": 0.00015329999999999999, |
|
"loss": 1.9134, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 2.347856044769287, |
|
"learning_rate": 0.0001536, |
|
"loss": 1.8799, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.8336, |
|
"grad_norm": 3.0598201751708984, |
|
"learning_rate": 0.0001539, |
|
"loss": 1.9441, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.8352, |
|
"grad_norm": 2.636126756668091, |
|
"learning_rate": 0.00015419999999999998, |
|
"loss": 1.7355, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.8368, |
|
"grad_norm": 1.8599352836608887, |
|
"learning_rate": 0.0001545, |
|
"loss": 1.9851, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.8384, |
|
"grad_norm": 0.9748109579086304, |
|
"learning_rate": 0.0001548, |
|
"loss": 1.7774, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.414323091506958, |
|
"learning_rate": 0.0001551, |
|
"loss": 2.1997, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.8416, |
|
"grad_norm": 2.8852648735046387, |
|
"learning_rate": 0.00015539999999999998, |
|
"loss": 2.0179, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.8432, |
|
"grad_norm": 2.0136239528656006, |
|
"learning_rate": 0.0001557, |
|
"loss": 1.9451, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.8448, |
|
"grad_norm": 2.07312273979187, |
|
"learning_rate": 0.000156, |
|
"loss": 1.7522, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.8464, |
|
"grad_norm": 1.4143507480621338, |
|
"learning_rate": 0.0001563, |
|
"loss": 1.6561, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 3.017238140106201, |
|
"learning_rate": 0.00015659999999999998, |
|
"loss": 1.913, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.8496, |
|
"grad_norm": 0.9368352293968201, |
|
"learning_rate": 0.0001569, |
|
"loss": 1.8592, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.8512, |
|
"grad_norm": 1.308072566986084, |
|
"learning_rate": 0.0001572, |
|
"loss": 2.2341, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.8528, |
|
"grad_norm": 2.2798593044281006, |
|
"learning_rate": 0.00015749999999999998, |
|
"loss": 1.9506, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.8544, |
|
"grad_norm": 2.6132118701934814, |
|
"learning_rate": 0.0001578, |
|
"loss": 1.9343, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.856, |
|
"grad_norm": 1.162194848060608, |
|
"learning_rate": 0.0001581, |
|
"loss": 1.9341, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.8576, |
|
"grad_norm": 1.3427730798721313, |
|
"learning_rate": 0.0001584, |
|
"loss": 1.7395, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.8592, |
|
"grad_norm": 2.1670310497283936, |
|
"learning_rate": 0.00015869999999999998, |
|
"loss": 2.3282, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.8608, |
|
"grad_norm": 1.257582187652588, |
|
"learning_rate": 0.000159, |
|
"loss": 2.09, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.8624, |
|
"grad_norm": 1.4573386907577515, |
|
"learning_rate": 0.0001593, |
|
"loss": 1.8402, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 1.3384615182876587, |
|
"learning_rate": 0.0001596, |
|
"loss": 1.7193, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8656, |
|
"grad_norm": 2.220402479171753, |
|
"learning_rate": 0.00015989999999999998, |
|
"loss": 1.5656, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.8672, |
|
"grad_norm": 2.4653773307800293, |
|
"learning_rate": 0.0001602, |
|
"loss": 2.0628, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.8688, |
|
"grad_norm": 1.280678391456604, |
|
"learning_rate": 0.0001605, |
|
"loss": 1.8363, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.8704, |
|
"grad_norm": 2.4655933380126953, |
|
"learning_rate": 0.0001608, |
|
"loss": 1.7545, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.872, |
|
"grad_norm": 1.506415605545044, |
|
"learning_rate": 0.00016109999999999999, |
|
"loss": 1.8381, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.8736, |
|
"grad_norm": 1.1475555896759033, |
|
"learning_rate": 0.0001614, |
|
"loss": 1.829, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.8752, |
|
"grad_norm": 1.4434545040130615, |
|
"learning_rate": 0.0001617, |
|
"loss": 1.8184, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.8768, |
|
"grad_norm": 1.8260152339935303, |
|
"learning_rate": 0.000162, |
|
"loss": 1.9946, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.8784, |
|
"grad_norm": 1.8104926347732544, |
|
"learning_rate": 0.0001623, |
|
"loss": 2.031, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.094877243041992, |
|
"learning_rate": 0.0001626, |
|
"loss": 2.2711, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8816, |
|
"grad_norm": 23.733247756958008, |
|
"learning_rate": 0.0001629, |
|
"loss": 4.8174, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.8832, |
|
"grad_norm": 12.243576049804688, |
|
"learning_rate": 0.0001632, |
|
"loss": 3.3335, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.8848, |
|
"grad_norm": 3.982137441635132, |
|
"learning_rate": 0.0001635, |
|
"loss": 2.3234, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.8864, |
|
"grad_norm": 2.5422203540802, |
|
"learning_rate": 0.0001638, |
|
"loss": 2.1768, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.888, |
|
"grad_norm": 2.649517059326172, |
|
"learning_rate": 0.0001641, |
|
"loss": 2.022, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.8896, |
|
"grad_norm": 4.723710536956787, |
|
"learning_rate": 0.0001644, |
|
"loss": 1.8511, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.8912, |
|
"grad_norm": 2.3035788536071777, |
|
"learning_rate": 0.0001647, |
|
"loss": 1.9528, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.8928, |
|
"grad_norm": 3.8410518169403076, |
|
"learning_rate": 0.000165, |
|
"loss": 1.9104, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.8944, |
|
"grad_norm": 3.0108225345611572, |
|
"learning_rate": 0.0001653, |
|
"loss": 1.7834, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 1.3487671613693237, |
|
"learning_rate": 0.0001656, |
|
"loss": 1.883, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8976, |
|
"grad_norm": 1.061733365058899, |
|
"learning_rate": 0.0001659, |
|
"loss": 1.5688, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.8992, |
|
"grad_norm": 2.0784027576446533, |
|
"learning_rate": 0.0001662, |
|
"loss": 1.6914, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.9008, |
|
"grad_norm": 6.085043907165527, |
|
"learning_rate": 0.0001665, |
|
"loss": 2.3407, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.9024, |
|
"grad_norm": 1.459148645401001, |
|
"learning_rate": 0.0001668, |
|
"loss": 1.7104, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.904, |
|
"grad_norm": 1.9622076749801636, |
|
"learning_rate": 0.0001671, |
|
"loss": 1.5955, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.9056, |
|
"grad_norm": 1.2756608724594116, |
|
"learning_rate": 0.0001674, |
|
"loss": 1.4071, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.9072, |
|
"grad_norm": 0.940319299697876, |
|
"learning_rate": 0.0001677, |
|
"loss": 1.6557, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.9088, |
|
"grad_norm": 0.9497667551040649, |
|
"learning_rate": 0.000168, |
|
"loss": 1.774, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.9104, |
|
"grad_norm": 1.1930807828903198, |
|
"learning_rate": 0.0001683, |
|
"loss": 1.8378, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 1.7330429553985596, |
|
"learning_rate": 0.0001686, |
|
"loss": 1.6816, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.9136, |
|
"grad_norm": 0.9604584574699402, |
|
"learning_rate": 0.00016889999999999996, |
|
"loss": 1.6782, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.9152, |
|
"grad_norm": 0.9503042101860046, |
|
"learning_rate": 0.00016919999999999997, |
|
"loss": 1.5947, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.9168, |
|
"grad_norm": 1.1088024377822876, |
|
"learning_rate": 0.00016949999999999997, |
|
"loss": 1.6978, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.9184, |
|
"grad_norm": 1.118318796157837, |
|
"learning_rate": 0.00016979999999999998, |
|
"loss": 1.656, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.5163230895996094, |
|
"learning_rate": 0.00017009999999999996, |
|
"loss": 1.6588, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.9216, |
|
"grad_norm": 1.4612356424331665, |
|
"learning_rate": 0.00017039999999999997, |
|
"loss": 1.9119, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.9232, |
|
"grad_norm": 1.2807903289794922, |
|
"learning_rate": 0.00017069999999999998, |
|
"loss": 1.4299, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.9248, |
|
"grad_norm": 1.049907922744751, |
|
"learning_rate": 0.00017099999999999998, |
|
"loss": 1.3226, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.9264, |
|
"grad_norm": 1.0162078142166138, |
|
"learning_rate": 0.00017129999999999996, |
|
"loss": 1.8021, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 1.3673537969589233, |
|
"learning_rate": 0.00017159999999999997, |
|
"loss": 1.6087, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.9296, |
|
"grad_norm": 1.2779172658920288, |
|
"learning_rate": 0.00017189999999999998, |
|
"loss": 1.6225, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.9312, |
|
"grad_norm": 1.2135735750198364, |
|
"learning_rate": 0.00017219999999999998, |
|
"loss": 1.5889, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.9328, |
|
"grad_norm": 1.45180344581604, |
|
"learning_rate": 0.00017249999999999996, |
|
"loss": 2.1697, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.9344, |
|
"grad_norm": 1.1630367040634155, |
|
"learning_rate": 0.00017279999999999997, |
|
"loss": 1.79, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.936, |
|
"grad_norm": 2.428530693054199, |
|
"learning_rate": 0.00017309999999999998, |
|
"loss": 1.5455, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.9376, |
|
"grad_norm": 1.3975725173950195, |
|
"learning_rate": 0.00017339999999999996, |
|
"loss": 1.7658, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.9392, |
|
"grad_norm": 1.242210865020752, |
|
"learning_rate": 0.00017369999999999997, |
|
"loss": 1.8039, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.9408, |
|
"grad_norm": 1.071577787399292, |
|
"learning_rate": 0.00017399999999999997, |
|
"loss": 1.7215, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.9424, |
|
"grad_norm": 1.208039402961731, |
|
"learning_rate": 0.00017429999999999998, |
|
"loss": 1.8733, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 1.5233865976333618, |
|
"learning_rate": 0.00017459999999999996, |
|
"loss": 1.4408, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.9456, |
|
"grad_norm": 1.411783218383789, |
|
"learning_rate": 0.00017489999999999997, |
|
"loss": 1.8393, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.9472, |
|
"grad_norm": 1.629401683807373, |
|
"learning_rate": 0.00017519999999999998, |
|
"loss": 1.679, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.9488, |
|
"grad_norm": 1.487720012664795, |
|
"learning_rate": 0.00017549999999999998, |
|
"loss": 1.9937, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.9504, |
|
"grad_norm": 1.7428632974624634, |
|
"learning_rate": 0.00017579999999999996, |
|
"loss": 1.8585, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.952, |
|
"grad_norm": 1.5290313959121704, |
|
"learning_rate": 0.00017609999999999997, |
|
"loss": 1.758, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.9536, |
|
"grad_norm": 1.4210582971572876, |
|
"learning_rate": 0.00017639999999999998, |
|
"loss": 1.6403, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.9552, |
|
"grad_norm": 1.487386703491211, |
|
"learning_rate": 0.00017669999999999999, |
|
"loss": 2.0706, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.9568, |
|
"grad_norm": 1.789679765701294, |
|
"learning_rate": 0.00017699999999999997, |
|
"loss": 2.0324, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.9584, |
|
"grad_norm": 3.552408456802368, |
|
"learning_rate": 0.00017729999999999997, |
|
"loss": 2.4765, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.6970980167388916, |
|
"learning_rate": 0.00017759999999999998, |
|
"loss": 2.49, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9616, |
|
"grad_norm": 6.3989667892456055, |
|
"learning_rate": 0.0001779, |
|
"loss": 2.2124, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.9632, |
|
"grad_norm": 3.559483528137207, |
|
"learning_rate": 0.00017819999999999997, |
|
"loss": 1.9114, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.9648, |
|
"grad_norm": 2.688811779022217, |
|
"learning_rate": 0.00017849999999999997, |
|
"loss": 1.7274, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.9664, |
|
"grad_norm": 1.4167048931121826, |
|
"learning_rate": 0.00017879999999999998, |
|
"loss": 1.5342, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.968, |
|
"grad_norm": 1.0234233140945435, |
|
"learning_rate": 0.0001791, |
|
"loss": 1.6476, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.9696, |
|
"grad_norm": 2.3607473373413086, |
|
"learning_rate": 0.00017939999999999997, |
|
"loss": 1.8034, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.9712, |
|
"grad_norm": 1.8193793296813965, |
|
"learning_rate": 0.00017969999999999998, |
|
"loss": 1.2502, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.9728, |
|
"grad_norm": 2.5050389766693115, |
|
"learning_rate": 0.00017999999999999998, |
|
"loss": 1.7518, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.9744, |
|
"grad_norm": 1.852980375289917, |
|
"learning_rate": 0.00018029999999999996, |
|
"loss": 2.2657, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 1.1846544742584229, |
|
"learning_rate": 0.00018059999999999997, |
|
"loss": 1.6213, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9776, |
|
"grad_norm": 1.1806446313858032, |
|
"learning_rate": 0.00018089999999999998, |
|
"loss": 1.5566, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.9792, |
|
"grad_norm": 0.9722961187362671, |
|
"learning_rate": 0.00018119999999999999, |
|
"loss": 1.4004, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.9808, |
|
"grad_norm": 1.2534488439559937, |
|
"learning_rate": 0.00018149999999999997, |
|
"loss": 1.9613, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.9824, |
|
"grad_norm": 1.55427885055542, |
|
"learning_rate": 0.00018179999999999997, |
|
"loss": 1.3668, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.984, |
|
"grad_norm": 1.8559104204177856, |
|
"learning_rate": 0.00018209999999999998, |
|
"loss": 1.153, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.9856, |
|
"grad_norm": 1.3127942085266113, |
|
"learning_rate": 0.0001824, |
|
"loss": 1.6635, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.9872, |
|
"grad_norm": 1.3206202983856201, |
|
"learning_rate": 0.00018269999999999997, |
|
"loss": 1.5436, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.9888, |
|
"grad_norm": 1.0405744314193726, |
|
"learning_rate": 0.00018299999999999998, |
|
"loss": 1.6072, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.9904, |
|
"grad_norm": 1.1208364963531494, |
|
"learning_rate": 0.00018329999999999998, |
|
"loss": 1.3522, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 1.4611485004425049, |
|
"learning_rate": 0.0001836, |
|
"loss": 1.8288, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9936, |
|
"grad_norm": 2.102464199066162, |
|
"learning_rate": 0.00018389999999999997, |
|
"loss": 2.2311, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.9952, |
|
"grad_norm": 1.3121858835220337, |
|
"learning_rate": 0.00018419999999999998, |
|
"loss": 1.6955, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.9968, |
|
"grad_norm": 1.732784390449524, |
|
"learning_rate": 0.00018449999999999999, |
|
"loss": 2.066, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.9984, |
|
"grad_norm": 1.474577784538269, |
|
"learning_rate": 0.0001848, |
|
"loss": 1.7517, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001848, |
|
"loss": 2.8279, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.0016, |
|
"grad_norm": 21.10396385192871, |
|
"learning_rate": 0.00018509999999999997, |
|
"loss": 4.359, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 1.0032, |
|
"grad_norm": 2.5289759635925293, |
|
"learning_rate": 0.00018539999999999998, |
|
"loss": 1.5803, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 1.0048, |
|
"grad_norm": 17.63152503967285, |
|
"learning_rate": 0.0001857, |
|
"loss": 4.1665, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.0064, |
|
"grad_norm": 1.2565017938613892, |
|
"learning_rate": 0.000186, |
|
"loss": 1.5211, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 1.008, |
|
"grad_norm": 0.9237573146820068, |
|
"learning_rate": 0.00018629999999999997, |
|
"loss": 1.6206, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.0096, |
|
"grad_norm": 1.304307222366333, |
|
"learning_rate": 0.00018659999999999998, |
|
"loss": 1.7635, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 1.0112, |
|
"grad_norm": 2.240795850753784, |
|
"learning_rate": 0.0001869, |
|
"loss": 1.4183, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.0128, |
|
"grad_norm": 1.2945712804794312, |
|
"learning_rate": 0.0001872, |
|
"loss": 1.8278, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 1.0144, |
|
"grad_norm": 2.4284050464630127, |
|
"learning_rate": 0.00018749999999999998, |
|
"loss": 1.8362, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 1.016, |
|
"grad_norm": 1.5324746370315552, |
|
"learning_rate": 0.00018779999999999998, |
|
"loss": 1.3312, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.0176, |
|
"grad_norm": 0.9457862973213196, |
|
"learning_rate": 0.0001881, |
|
"loss": 1.5771, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.0192, |
|
"grad_norm": 1.761409878730774, |
|
"learning_rate": 0.00018839999999999997, |
|
"loss": 1.3939, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 1.0208, |
|
"grad_norm": 2.4509124755859375, |
|
"learning_rate": 0.00018869999999999998, |
|
"loss": 1.305, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 1.0224, |
|
"grad_norm": 1.434770941734314, |
|
"learning_rate": 0.00018899999999999999, |
|
"loss": 1.2198, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 1.683680772781372, |
|
"learning_rate": 0.0001893, |
|
"loss": 1.4401, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.0256, |
|
"grad_norm": 1.468677282333374, |
|
"learning_rate": 0.00018959999999999997, |
|
"loss": 1.3005, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 1.0272, |
|
"grad_norm": 2.2306525707244873, |
|
"learning_rate": 0.00018989999999999998, |
|
"loss": 1.6153, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 1.0288, |
|
"grad_norm": 4.796661853790283, |
|
"learning_rate": 0.0001902, |
|
"loss": 2.2058, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 1.0304, |
|
"grad_norm": 1.139748454093933, |
|
"learning_rate": 0.0001905, |
|
"loss": 1.3829, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.032, |
|
"grad_norm": 1.9971469640731812, |
|
"learning_rate": 0.00019079999999999998, |
|
"loss": 1.5598, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.0336, |
|
"grad_norm": 2.224128007888794, |
|
"learning_rate": 0.00019109999999999998, |
|
"loss": 1.3654, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 1.0352, |
|
"grad_norm": 2.5159313678741455, |
|
"learning_rate": 0.0001914, |
|
"loss": 1.6379, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 1.0368, |
|
"grad_norm": 1.9604592323303223, |
|
"learning_rate": 0.0001917, |
|
"loss": 1.5734, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.0384, |
|
"grad_norm": 1.4151877164840698, |
|
"learning_rate": 0.00019199999999999998, |
|
"loss": 1.1349, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 1.21165931224823, |
|
"learning_rate": 0.00019229999999999999, |
|
"loss": 1.7592, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.0416, |
|
"grad_norm": 2.344447135925293, |
|
"learning_rate": 0.0001926, |
|
"loss": 1.505, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 1.0432, |
|
"grad_norm": 2.5432910919189453, |
|
"learning_rate": 0.0001929, |
|
"loss": 1.6673, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 1.0448, |
|
"grad_norm": 1.5895689725875854, |
|
"learning_rate": 0.00019319999999999998, |
|
"loss": 1.6617, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 1.0464, |
|
"grad_norm": 1.7360563278198242, |
|
"learning_rate": 0.0001935, |
|
"loss": 1.6216, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 1.048, |
|
"grad_norm": 1.3723790645599365, |
|
"learning_rate": 0.0001938, |
|
"loss": 1.257, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.0496, |
|
"grad_norm": 0.8750591278076172, |
|
"learning_rate": 0.0001941, |
|
"loss": 1.4356, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 1.0512, |
|
"grad_norm": 1.407861590385437, |
|
"learning_rate": 0.00019439999999999998, |
|
"loss": 1.346, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 1.0528, |
|
"grad_norm": 1.2812424898147583, |
|
"learning_rate": 0.0001947, |
|
"loss": 1.2363, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 1.0544, |
|
"grad_norm": 1.2920845746994019, |
|
"learning_rate": 0.000195, |
|
"loss": 1.4353, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"grad_norm": 1.0122877359390259, |
|
"learning_rate": 0.00019529999999999998, |
|
"loss": 1.5272, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.0576, |
|
"grad_norm": 1.0607578754425049, |
|
"learning_rate": 0.00019559999999999998, |
|
"loss": 1.1926, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 1.0592, |
|
"grad_norm": 1.2849078178405762, |
|
"learning_rate": 0.0001959, |
|
"loss": 1.361, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 1.0608, |
|
"grad_norm": 2.199488401412964, |
|
"learning_rate": 0.0001962, |
|
"loss": 1.4892, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 1.0624, |
|
"grad_norm": 1.7300806045532227, |
|
"learning_rate": 0.00019649999999999998, |
|
"loss": 1.4795, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.064, |
|
"grad_norm": 1.210700273513794, |
|
"learning_rate": 0.00019679999999999999, |
|
"loss": 1.6863, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.0656, |
|
"grad_norm": 1.1998845338821411, |
|
"learning_rate": 0.0001971, |
|
"loss": 1.3863, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.0672, |
|
"grad_norm": 1.5421574115753174, |
|
"learning_rate": 0.0001974, |
|
"loss": 1.9558, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 1.0688, |
|
"grad_norm": 2.3596279621124268, |
|
"learning_rate": 0.00019769999999999998, |
|
"loss": 1.3471, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.0704, |
|
"grad_norm": 1.3288168907165527, |
|
"learning_rate": 0.000198, |
|
"loss": 1.3686, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 1.072, |
|
"grad_norm": 1.5977771282196045, |
|
"learning_rate": 0.0001983, |
|
"loss": 1.6142, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.0735999999999999, |
|
"grad_norm": 1.171886682510376, |
|
"learning_rate": 0.0001986, |
|
"loss": 1.8817, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 1.0752, |
|
"grad_norm": 1.4820473194122314, |
|
"learning_rate": 0.00019889999999999998, |
|
"loss": 1.502, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.0768, |
|
"grad_norm": 1.4286924600601196, |
|
"learning_rate": 0.0001992, |
|
"loss": 1.9869, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 1.0784, |
|
"grad_norm": 1.496476650238037, |
|
"learning_rate": 0.0001995, |
|
"loss": 1.5545, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 11.650896072387695, |
|
"learning_rate": 0.0001998, |
|
"loss": 3.5297, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.0816, |
|
"grad_norm": 10.930564880371094, |
|
"learning_rate": 0.00020009999999999998, |
|
"loss": 3.5244, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.0832, |
|
"grad_norm": 4.526219367980957, |
|
"learning_rate": 0.0002004, |
|
"loss": 1.948, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 1.0848, |
|
"grad_norm": 1.8217471837997437, |
|
"learning_rate": 0.0002007, |
|
"loss": 1.6832, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 1.0864, |
|
"grad_norm": 2.5544323921203613, |
|
"learning_rate": 0.000201, |
|
"loss": 2.3308, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 2.732450246810913, |
|
"learning_rate": 0.0002013, |
|
"loss": 1.7663, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.0896, |
|
"grad_norm": 4.002326488494873, |
|
"learning_rate": 0.0002016, |
|
"loss": 1.9597, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 1.0912, |
|
"grad_norm": 2.9579389095306396, |
|
"learning_rate": 0.0002019, |
|
"loss": 1.6625, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 1.0928, |
|
"grad_norm": 3.6762917041778564, |
|
"learning_rate": 0.0002022, |
|
"loss": 1.4949, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 1.0944, |
|
"grad_norm": 2.8355441093444824, |
|
"learning_rate": 0.0002025, |
|
"loss": 1.5695, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.096, |
|
"grad_norm": 2.894350290298462, |
|
"learning_rate": 0.0002028, |
|
"loss": 1.5717, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.0976, |
|
"grad_norm": 4.992308616638184, |
|
"learning_rate": 0.0002031, |
|
"loss": 1.7573, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 1.0992, |
|
"grad_norm": 1.175133466720581, |
|
"learning_rate": 0.00020339999999999998, |
|
"loss": 1.2767, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 1.1008, |
|
"grad_norm": 0.7449688911437988, |
|
"learning_rate": 0.0002037, |
|
"loss": 1.4346, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 1.1024, |
|
"grad_norm": 2.100440740585327, |
|
"learning_rate": 0.000204, |
|
"loss": 1.5286, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 1.104, |
|
"grad_norm": 1.06446373462677, |
|
"learning_rate": 0.0002043, |
|
"loss": 1.4716, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.1056, |
|
"grad_norm": 1.1813894510269165, |
|
"learning_rate": 0.00020459999999999999, |
|
"loss": 1.729, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 1.1072, |
|
"grad_norm": 1.2244285345077515, |
|
"learning_rate": 0.0002049, |
|
"loss": 1.5456, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 1.1088, |
|
"grad_norm": 1.395920991897583, |
|
"learning_rate": 0.0002052, |
|
"loss": 1.6253, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 1.1104, |
|
"grad_norm": 0.8973720073699951, |
|
"learning_rate": 0.0002055, |
|
"loss": 1.3474, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 1.112, |
|
"grad_norm": 0.9351361393928528, |
|
"learning_rate": 0.0002058, |
|
"loss": 1.5375, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.1136, |
|
"grad_norm": 0.9488412737846375, |
|
"learning_rate": 0.0002061, |
|
"loss": 1.2332, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 1.1152, |
|
"grad_norm": 0.800336480140686, |
|
"learning_rate": 0.00020639999999999998, |
|
"loss": 1.3265, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 1.1168, |
|
"grad_norm": 1.771794319152832, |
|
"learning_rate": 0.00020669999999999996, |
|
"loss": 1.3347, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 1.1184, |
|
"grad_norm": 3.4581542015075684, |
|
"learning_rate": 0.00020699999999999996, |
|
"loss": 1.783, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.837477445602417, |
|
"learning_rate": 0.00020729999999999997, |
|
"loss": 1.3295, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.1216, |
|
"grad_norm": 2.1295042037963867, |
|
"learning_rate": 0.00020759999999999998, |
|
"loss": 1.4163, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 1.1232, |
|
"grad_norm": 1.0342674255371094, |
|
"learning_rate": 0.00020789999999999996, |
|
"loss": 1.2891, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 1.1248, |
|
"grad_norm": 1.1783955097198486, |
|
"learning_rate": 0.00020819999999999996, |
|
"loss": 1.5386, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 1.1264, |
|
"grad_norm": 2.5988528728485107, |
|
"learning_rate": 0.00020849999999999997, |
|
"loss": 1.5942, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 1.1280000000000001, |
|
"grad_norm": 1.206281065940857, |
|
"learning_rate": 0.00020879999999999998, |
|
"loss": 1.4828, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.1296, |
|
"grad_norm": 1.60711669921875, |
|
"learning_rate": 0.00020909999999999996, |
|
"loss": 1.6748, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 1.1312, |
|
"grad_norm": 1.3890515565872192, |
|
"learning_rate": 0.00020939999999999997, |
|
"loss": 1.4464, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 1.1328, |
|
"grad_norm": 1.4788490533828735, |
|
"learning_rate": 0.00020969999999999997, |
|
"loss": 1.9039, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 1.1344, |
|
"grad_norm": 0.9197102189064026, |
|
"learning_rate": 0.00020999999999999998, |
|
"loss": 1.5258, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 1.1360000000000001, |
|
"grad_norm": 3.082664728164673, |
|
"learning_rate": 0.00021029999999999996, |
|
"loss": 1.6637, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.1376, |
|
"grad_norm": 1.3979014158248901, |
|
"learning_rate": 0.00021059999999999997, |
|
"loss": 1.3499, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 1.1392, |
|
"grad_norm": 2.7370402812957764, |
|
"learning_rate": 0.00021089999999999998, |
|
"loss": 1.7379, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 1.1408, |
|
"grad_norm": 1.36969792842865, |
|
"learning_rate": 0.00021119999999999996, |
|
"loss": 1.3235, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 1.1424, |
|
"grad_norm": 1.3009356260299683, |
|
"learning_rate": 0.00021149999999999996, |
|
"loss": 1.2209, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 1.144, |
|
"grad_norm": 1.0813698768615723, |
|
"learning_rate": 0.00021179999999999997, |
|
"loss": 1.237, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.1456, |
|
"grad_norm": 1.2386032342910767, |
|
"learning_rate": 0.00021209999999999998, |
|
"loss": 1.5799, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.1472, |
|
"grad_norm": 1.7847639322280884, |
|
"learning_rate": 0.00021239999999999996, |
|
"loss": 1.852, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 1.1488, |
|
"grad_norm": 1.4111274480819702, |
|
"learning_rate": 0.00021269999999999997, |
|
"loss": 1.7199, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 1.1504, |
|
"grad_norm": 1.6253108978271484, |
|
"learning_rate": 0.00021299999999999997, |
|
"loss": 1.6921, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 1.152, |
|
"grad_norm": 1.3691827058792114, |
|
"learning_rate": 0.00021329999999999998, |
|
"loss": 1.5512, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.1536, |
|
"grad_norm": 1.0425063371658325, |
|
"learning_rate": 0.00021359999999999996, |
|
"loss": 1.6953, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 1.1552, |
|
"grad_norm": 1.5456832647323608, |
|
"learning_rate": 0.00021389999999999997, |
|
"loss": 1.2235, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 1.1568, |
|
"grad_norm": 5.289543151855469, |
|
"learning_rate": 0.00021419999999999998, |
|
"loss": 2.3722, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 1.1584, |
|
"grad_norm": 3.060047149658203, |
|
"learning_rate": 0.00021449999999999998, |
|
"loss": 1.7361, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 2.2316486835479736, |
|
"learning_rate": 0.00021479999999999996, |
|
"loss": 2.2835, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.1616, |
|
"grad_norm": 11.319620132446289, |
|
"learning_rate": 0.00021509999999999997, |
|
"loss": 2.8832, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 1.1632, |
|
"grad_norm": 4.8169121742248535, |
|
"learning_rate": 0.00021539999999999998, |
|
"loss": 2.4756, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 1.1648, |
|
"grad_norm": 4.998867511749268, |
|
"learning_rate": 0.00021569999999999998, |
|
"loss": 2.5684, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.1663999999999999, |
|
"grad_norm": 1.6369566917419434, |
|
"learning_rate": 0.00021599999999999996, |
|
"loss": 1.9992, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 1.168, |
|
"grad_norm": 4.867010593414307, |
|
"learning_rate": 0.00021629999999999997, |
|
"loss": 1.6888, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.1696, |
|
"grad_norm": 4.665241241455078, |
|
"learning_rate": 0.00021659999999999998, |
|
"loss": 1.703, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 1.1712, |
|
"grad_norm": 4.923267364501953, |
|
"learning_rate": 0.0002169, |
|
"loss": 1.8941, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 1.1728, |
|
"grad_norm": 5.301763534545898, |
|
"learning_rate": 0.00021719999999999997, |
|
"loss": 1.8063, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 1.1743999999999999, |
|
"grad_norm": 5.480170726776123, |
|
"learning_rate": 0.00021749999999999997, |
|
"loss": 1.6896, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 1.176, |
|
"grad_norm": 5.136298656463623, |
|
"learning_rate": 0.00021779999999999998, |
|
"loss": 1.7705, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.1776, |
|
"grad_norm": 2.6885194778442383, |
|
"learning_rate": 0.00021809999999999996, |
|
"loss": 1.29, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 1.1792, |
|
"grad_norm": 1.7481634616851807, |
|
"learning_rate": 0.00021839999999999997, |
|
"loss": 1.8537, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 1.1808, |
|
"grad_norm": 2.1533167362213135, |
|
"learning_rate": 0.00021869999999999998, |
|
"loss": 1.3772, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 1.1824, |
|
"grad_norm": 1.7290595769882202, |
|
"learning_rate": 0.00021899999999999998, |
|
"loss": 1.2517, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 1.184, |
|
"grad_norm": 5.765242576599121, |
|
"learning_rate": 0.00021929999999999996, |
|
"loss": 1.9578, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.1856, |
|
"grad_norm": 5.171415328979492, |
|
"learning_rate": 0.00021959999999999997, |
|
"loss": 1.6263, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 1.1872, |
|
"grad_norm": 2.4269332885742188, |
|
"learning_rate": 0.00021989999999999998, |
|
"loss": 1.5452, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 1.1888, |
|
"grad_norm": 1.2465523481369019, |
|
"learning_rate": 0.00022019999999999999, |
|
"loss": 1.2282, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 1.1904, |
|
"grad_norm": 1.9669184684753418, |
|
"learning_rate": 0.00022049999999999997, |
|
"loss": 1.1983, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 1.192, |
|
"grad_norm": 5.07749605178833, |
|
"learning_rate": 0.00022079999999999997, |
|
"loss": 1.7768, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.1936, |
|
"grad_norm": 1.4985103607177734, |
|
"learning_rate": 0.00022109999999999998, |
|
"loss": 1.504, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 1.1952, |
|
"grad_norm": 0.8555597066879272, |
|
"learning_rate": 0.0002214, |
|
"loss": 1.2868, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 1.1968, |
|
"grad_norm": 1.0134530067443848, |
|
"learning_rate": 0.00022169999999999997, |
|
"loss": 1.5408, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 1.1984, |
|
"grad_norm": 2.2932121753692627, |
|
"learning_rate": 0.00022199999999999998, |
|
"loss": 1.4542, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.048572063446045, |
|
"learning_rate": 0.00022229999999999998, |
|
"loss": 1.6303, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.2016, |
|
"grad_norm": 1.3398712873458862, |
|
"learning_rate": 0.0002226, |
|
"loss": 1.6202, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 1.2032, |
|
"grad_norm": 1.2826422452926636, |
|
"learning_rate": 0.00022289999999999997, |
|
"loss": 1.114, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 1.2048, |
|
"grad_norm": 0.9887292385101318, |
|
"learning_rate": 0.00022319999999999998, |
|
"loss": 1.2535, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 1.2064, |
|
"grad_norm": 0.8880885243415833, |
|
"learning_rate": 0.00022349999999999998, |
|
"loss": 1.123, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 1.208, |
|
"grad_norm": 4.056207180023193, |
|
"learning_rate": 0.0002238, |
|
"loss": 1.7848, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.2096, |
|
"grad_norm": 1.0242630243301392, |
|
"learning_rate": 0.00022409999999999997, |
|
"loss": 1.6084, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 1.2112, |
|
"grad_norm": 1.5216087102890015, |
|
"learning_rate": 0.00022439999999999998, |
|
"loss": 1.0884, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 1.2128, |
|
"grad_norm": 1.1595895290374756, |
|
"learning_rate": 0.0002247, |
|
"loss": 1.4366, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 1.2144, |
|
"grad_norm": 1.78994619846344, |
|
"learning_rate": 0.000225, |
|
"loss": 1.4678, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 1.216, |
|
"grad_norm": 1.5748515129089355, |
|
"learning_rate": 0.00022529999999999997, |
|
"loss": 1.5742, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.2176, |
|
"grad_norm": 1.2527673244476318, |
|
"learning_rate": 0.00022559999999999998, |
|
"loss": 1.2028, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 1.2192, |
|
"grad_norm": 1.4951261281967163, |
|
"learning_rate": 0.0002259, |
|
"loss": 1.8887, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 1.2208, |
|
"grad_norm": 1.1303513050079346, |
|
"learning_rate": 0.00022619999999999997, |
|
"loss": 1.6479, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 1.2224, |
|
"grad_norm": 1.3236031532287598, |
|
"learning_rate": 0.00022649999999999998, |
|
"loss": 1.703, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 1.224, |
|
"grad_norm": 1.5853848457336426, |
|
"learning_rate": 0.00022679999999999998, |
|
"loss": 1.8706, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.2256, |
|
"grad_norm": 2.0144317150115967, |
|
"learning_rate": 0.0002271, |
|
"loss": 1.495, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 1.2272, |
|
"grad_norm": 1.472916841506958, |
|
"learning_rate": 0.00022739999999999997, |
|
"loss": 1.6167, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 1.2288000000000001, |
|
"grad_norm": 1.3060656785964966, |
|
"learning_rate": 0.00022769999999999998, |
|
"loss": 1.5432, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 1.2304, |
|
"grad_norm": 1.9118512868881226, |
|
"learning_rate": 0.00022799999999999999, |
|
"loss": 2.3, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 1.232, |
|
"grad_norm": 1.5411431789398193, |
|
"learning_rate": 0.0002283, |
|
"loss": 1.3154, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.2336, |
|
"grad_norm": 1.2540593147277832, |
|
"learning_rate": 0.00022859999999999997, |
|
"loss": 1.3445, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 1.2352, |
|
"grad_norm": 1.74718177318573, |
|
"learning_rate": 0.00022889999999999998, |
|
"loss": 1.7902, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 1.2368000000000001, |
|
"grad_norm": 1.5919808149337769, |
|
"learning_rate": 0.0002292, |
|
"loss": 1.7406, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 1.2384, |
|
"grad_norm": 2.1802892684936523, |
|
"learning_rate": 0.0002295, |
|
"loss": 2.3024, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002295, |
|
"loss": 2.2887, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.2416, |
|
"grad_norm": 5.746895790100098, |
|
"learning_rate": 0.00022979999999999997, |
|
"loss": 2.6305, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 1.2432, |
|
"grad_norm": 5.819034099578857, |
|
"learning_rate": 0.00023009999999999998, |
|
"loss": 2.3811, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 1.2448, |
|
"grad_norm": 2.528698444366455, |
|
"learning_rate": 0.0002304, |
|
"loss": 1.9592, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 1.2464, |
|
"grad_norm": 4.070464611053467, |
|
"learning_rate": 0.0002307, |
|
"loss": 2.035, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 1.248, |
|
"grad_norm": 1.93435800075531, |
|
"learning_rate": 0.00023099999999999998, |
|
"loss": 1.8043, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.2496, |
|
"grad_norm": 3.285830497741699, |
|
"learning_rate": 0.00023129999999999998, |
|
"loss": 1.7722, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 1.2511999999999999, |
|
"grad_norm": 4.356208324432373, |
|
"learning_rate": 0.0002316, |
|
"loss": 1.6131, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 1.2528000000000001, |
|
"grad_norm": 5.4774603843688965, |
|
"learning_rate": 0.0002319, |
|
"loss": 1.5492, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 1.2544, |
|
"grad_norm": 3.671088218688965, |
|
"learning_rate": 0.00023219999999999998, |
|
"loss": 1.6548, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 1.256, |
|
"grad_norm": 3.7231082916259766, |
|
"learning_rate": 0.00023249999999999999, |
|
"loss": 1.3874, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.2576, |
|
"grad_norm": 1.782421588897705, |
|
"learning_rate": 0.0002328, |
|
"loss": 1.3857, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 1.2591999999999999, |
|
"grad_norm": 1.4939918518066406, |
|
"learning_rate": 0.00023309999999999997, |
|
"loss": 1.7764, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 1.2608, |
|
"grad_norm": 1.064145565032959, |
|
"learning_rate": 0.00023339999999999998, |
|
"loss": 1.3059, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 1.2624, |
|
"grad_norm": 2.0802013874053955, |
|
"learning_rate": 0.0002337, |
|
"loss": 1.3497, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 1.264, |
|
"grad_norm": 2.4550795555114746, |
|
"learning_rate": 0.000234, |
|
"loss": 1.6281, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.2656, |
|
"grad_norm": 1.424136996269226, |
|
"learning_rate": 0.00023429999999999998, |
|
"loss": 1.3037, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 1.2671999999999999, |
|
"grad_norm": 2.6355724334716797, |
|
"learning_rate": 0.00023459999999999998, |
|
"loss": 1.2327, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 1.2688, |
|
"grad_norm": 1.9551432132720947, |
|
"learning_rate": 0.0002349, |
|
"loss": 1.1327, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 1.2704, |
|
"grad_norm": 0.920864462852478, |
|
"learning_rate": 0.0002352, |
|
"loss": 1.329, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 1.272, |
|
"grad_norm": 0.8361489176750183, |
|
"learning_rate": 0.00023549999999999998, |
|
"loss": 1.4342, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.2736, |
|
"grad_norm": 1.4463287591934204, |
|
"learning_rate": 0.00023579999999999999, |
|
"loss": 1.2214, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 1.2752, |
|
"grad_norm": 0.8743594884872437, |
|
"learning_rate": 0.0002361, |
|
"loss": 1.126, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 1.2768, |
|
"grad_norm": 0.9425063133239746, |
|
"learning_rate": 0.0002364, |
|
"loss": 1.4563, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 1.2784, |
|
"grad_norm": 1.1034338474273682, |
|
"learning_rate": 0.00023669999999999998, |
|
"loss": 1.495, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.235459804534912, |
|
"learning_rate": 0.000237, |
|
"loss": 1.2458, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.2816, |
|
"grad_norm": 1.0407472848892212, |
|
"learning_rate": 0.0002373, |
|
"loss": 1.2275, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 1.2832, |
|
"grad_norm": 1.0057398080825806, |
|
"learning_rate": 0.0002376, |
|
"loss": 1.2064, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 1.2848, |
|
"grad_norm": 1.2582429647445679, |
|
"learning_rate": 0.00023789999999999998, |
|
"loss": 1.457, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 1.2864, |
|
"grad_norm": 1.2544838190078735, |
|
"learning_rate": 0.0002382, |
|
"loss": 1.0123, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 1.288, |
|
"grad_norm": 1.267555832862854, |
|
"learning_rate": 0.0002385, |
|
"loss": 1.3412, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.2896, |
|
"grad_norm": 1.585595726966858, |
|
"learning_rate": 0.0002388, |
|
"loss": 1.3812, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 1.2912, |
|
"grad_norm": 1.1115787029266357, |
|
"learning_rate": 0.00023909999999999998, |
|
"loss": 1.2368, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 1.2928, |
|
"grad_norm": 1.1539804935455322, |
|
"learning_rate": 0.0002394, |
|
"loss": 1.0813, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 1.2944, |
|
"grad_norm": 1.2517136335372925, |
|
"learning_rate": 0.0002397, |
|
"loss": 1.2567, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 1.296, |
|
"grad_norm": 1.0020838975906372, |
|
"learning_rate": 0.00023999999999999998, |
|
"loss": 1.3813, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.2976, |
|
"grad_norm": 1.278122067451477, |
|
"learning_rate": 0.00024029999999999999, |
|
"loss": 1.4291, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 1.2992, |
|
"grad_norm": 1.1353975534439087, |
|
"learning_rate": 0.0002406, |
|
"loss": 1.5433, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 1.3008, |
|
"grad_norm": 1.0918465852737427, |
|
"learning_rate": 0.0002409, |
|
"loss": 1.4993, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 1.3024, |
|
"grad_norm": 1.1691175699234009, |
|
"learning_rate": 0.00024119999999999998, |
|
"loss": 1.442, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 1.304, |
|
"grad_norm": 2.067641258239746, |
|
"learning_rate": 0.0002415, |
|
"loss": 1.3307, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.3056, |
|
"grad_norm": 1.2151570320129395, |
|
"learning_rate": 0.0002418, |
|
"loss": 1.3187, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 1.3072, |
|
"grad_norm": 1.3653641939163208, |
|
"learning_rate": 0.0002421, |
|
"loss": 1.1919, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 1.3088, |
|
"grad_norm": 1.193217158317566, |
|
"learning_rate": 0.00024239999999999998, |
|
"loss": 1.5955, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 1.3104, |
|
"grad_norm": 1.2559990882873535, |
|
"learning_rate": 0.0002427, |
|
"loss": 1.6028, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 1.312, |
|
"grad_norm": 1.5885244607925415, |
|
"learning_rate": 0.000243, |
|
"loss": 1.4464, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.3136, |
|
"grad_norm": 1.5733736753463745, |
|
"learning_rate": 0.0002433, |
|
"loss": 1.743, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 1.3152, |
|
"grad_norm": 2.05718731880188, |
|
"learning_rate": 0.00024359999999999999, |
|
"loss": 1.4574, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 1.3168, |
|
"grad_norm": 1.5828289985656738, |
|
"learning_rate": 0.00024389999999999997, |
|
"loss": 1.9888, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 1.3184, |
|
"grad_norm": 1.8411847352981567, |
|
"learning_rate": 0.00024419999999999997, |
|
"loss": 1.3748, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.4953489303588867, |
|
"learning_rate": 0.0002445, |
|
"loss": 2.2596, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.3216, |
|
"grad_norm": 22.735031127929688, |
|
"learning_rate": 0.0002448, |
|
"loss": 5.1826, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 1.3232, |
|
"grad_norm": 2.6146559715270996, |
|
"learning_rate": 0.00024509999999999994, |
|
"loss": 1.7049, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 1.3248, |
|
"grad_norm": 1.7866498231887817, |
|
"learning_rate": 0.00024539999999999995, |
|
"loss": 1.6428, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 1.3264, |
|
"grad_norm": 3.011427640914917, |
|
"learning_rate": 0.00024569999999999995, |
|
"loss": 1.7732, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 1.328, |
|
"grad_norm": 1.7671997547149658, |
|
"learning_rate": 0.00024599999999999996, |
|
"loss": 1.7, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.3296000000000001, |
|
"grad_norm": 3.2919392585754395, |
|
"learning_rate": 0.00024629999999999997, |
|
"loss": 1.8395, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 1.3312, |
|
"grad_norm": 3.4365289211273193, |
|
"learning_rate": 0.0002466, |
|
"loss": 1.4656, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 1.3328, |
|
"grad_norm": 2.34206485748291, |
|
"learning_rate": 0.0002469, |
|
"loss": 1.3553, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 1.3344, |
|
"grad_norm": 1.3108103275299072, |
|
"learning_rate": 0.0002472, |
|
"loss": 1.6943, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 1.336, |
|
"grad_norm": 1.1649718284606934, |
|
"learning_rate": 0.00024749999999999994, |
|
"loss": 1.6788, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.3376000000000001, |
|
"grad_norm": 0.8755460977554321, |
|
"learning_rate": 0.00024779999999999995, |
|
"loss": 1.064, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 1.3392, |
|
"grad_norm": 1.0399974584579468, |
|
"learning_rate": 0.00024809999999999996, |
|
"loss": 1.1169, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 1.3408, |
|
"grad_norm": 1.590290904045105, |
|
"learning_rate": 0.00024839999999999997, |
|
"loss": 1.32, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 1.3424, |
|
"grad_norm": 1.9613844156265259, |
|
"learning_rate": 0.0002487, |
|
"loss": 0.9774, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 1.3439999999999999, |
|
"grad_norm": 1.3067349195480347, |
|
"learning_rate": 0.000249, |
|
"loss": 1.0947, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.3456000000000001, |
|
"grad_norm": 3.495009422302246, |
|
"learning_rate": 0.0002493, |
|
"loss": 1.4792, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 1.3472, |
|
"grad_norm": 2.386378526687622, |
|
"learning_rate": 0.00024959999999999994, |
|
"loss": 1.2189, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 1.3488, |
|
"grad_norm": 1.2430686950683594, |
|
"learning_rate": 0.00024989999999999995, |
|
"loss": 1.383, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 1.3504, |
|
"grad_norm": 1.1015182733535767, |
|
"learning_rate": 0.00025019999999999996, |
|
"loss": 1.1695, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 1.3519999999999999, |
|
"grad_norm": 1.2849568128585815, |
|
"learning_rate": 0.00025049999999999996, |
|
"loss": 1.4384, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.3536000000000001, |
|
"grad_norm": 0.9367717504501343, |
|
"learning_rate": 0.00025079999999999997, |
|
"loss": 1.2806, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 1.3552, |
|
"grad_norm": 1.9065661430358887, |
|
"learning_rate": 0.0002511, |
|
"loss": 1.9186, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 1.3568, |
|
"grad_norm": 1.8349933624267578, |
|
"learning_rate": 0.0002514, |
|
"loss": 1.2997, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 1.3584, |
|
"grad_norm": 2.411646604537964, |
|
"learning_rate": 0.0002517, |
|
"loss": 1.7798, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 1.3963836431503296, |
|
"learning_rate": 0.00025199999999999995, |
|
"loss": 1.2455, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.3616, |
|
"grad_norm": 1.7644349336624146, |
|
"learning_rate": 0.00025229999999999995, |
|
"loss": 1.5101, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 1.3632, |
|
"grad_norm": 1.1302613019943237, |
|
"learning_rate": 0.00025259999999999996, |
|
"loss": 1.3869, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 1.3648, |
|
"grad_norm": 2.062229633331299, |
|
"learning_rate": 0.00025289999999999997, |
|
"loss": 1.4488, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 1.3664, |
|
"grad_norm": 1.800642967224121, |
|
"learning_rate": 0.0002532, |
|
"loss": 1.0541, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 1.3679999999999999, |
|
"grad_norm": 3.4561281204223633, |
|
"learning_rate": 0.0002535, |
|
"loss": 1.4694, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.3696, |
|
"grad_norm": 2.443664073944092, |
|
"learning_rate": 0.0002538, |
|
"loss": 1.5959, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 1.3712, |
|
"grad_norm": 1.2733495235443115, |
|
"learning_rate": 0.0002541, |
|
"loss": 1.3463, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 1.3728, |
|
"grad_norm": 1.4782954454421997, |
|
"learning_rate": 0.00025439999999999995, |
|
"loss": 1.3913, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 1.3744, |
|
"grad_norm": 1.355779767036438, |
|
"learning_rate": 0.00025469999999999996, |
|
"loss": 1.4062, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 1.376, |
|
"grad_norm": 0.991340696811676, |
|
"learning_rate": 0.00025499999999999996, |
|
"loss": 1.0575, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.3776, |
|
"grad_norm": 1.4136706590652466, |
|
"learning_rate": 0.00025529999999999997, |
|
"loss": 1.4418, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 1.3792, |
|
"grad_norm": 1.5610219240188599, |
|
"learning_rate": 0.0002556, |
|
"loss": 1.909, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 1.3808, |
|
"grad_norm": 1.4637738466262817, |
|
"learning_rate": 0.0002559, |
|
"loss": 1.4926, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 1.3824, |
|
"grad_norm": 1.174544095993042, |
|
"learning_rate": 0.0002562, |
|
"loss": 1.444, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 1.384, |
|
"grad_norm": 1.295507550239563, |
|
"learning_rate": 0.00025649999999999995, |
|
"loss": 1.211, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.3856, |
|
"grad_norm": 1.341482162475586, |
|
"learning_rate": 0.00025679999999999995, |
|
"loss": 1.5489, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 1.3872, |
|
"grad_norm": 1.5005024671554565, |
|
"learning_rate": 0.00025709999999999996, |
|
"loss": 1.304, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 1.3888, |
|
"grad_norm": 1.7614165544509888, |
|
"learning_rate": 0.00025739999999999997, |
|
"loss": 1.411, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 1.3904, |
|
"grad_norm": 2.1121573448181152, |
|
"learning_rate": 0.0002577, |
|
"loss": 1.5675, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 1.392, |
|
"grad_norm": 1.4016284942626953, |
|
"learning_rate": 0.000258, |
|
"loss": 1.2976, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.3936, |
|
"grad_norm": 1.4677424430847168, |
|
"learning_rate": 0.0002583, |
|
"loss": 1.5295, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 1.3952, |
|
"grad_norm": 1.7327654361724854, |
|
"learning_rate": 0.0002586, |
|
"loss": 1.6782, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 1.3968, |
|
"grad_norm": 1.684560775756836, |
|
"learning_rate": 0.00025889999999999995, |
|
"loss": 2.1258, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 1.3984, |
|
"grad_norm": 1.1350618600845337, |
|
"learning_rate": 0.00025919999999999996, |
|
"loss": 1.2016, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 1.8442115783691406, |
|
"learning_rate": 0.00025949999999999997, |
|
"loss": 2.2616, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.4016, |
|
"grad_norm": 15.330982208251953, |
|
"learning_rate": 0.00025979999999999997, |
|
"loss": 3.7807, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 1.4032, |
|
"grad_norm": 9.512137413024902, |
|
"learning_rate": 0.0002601, |
|
"loss": 2.7959, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 1.4048, |
|
"grad_norm": 4.83724308013916, |
|
"learning_rate": 0.0002604, |
|
"loss": 2.2513, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 1.4064, |
|
"grad_norm": 1.0789581537246704, |
|
"learning_rate": 0.0002607, |
|
"loss": 1.1761, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 1.408, |
|
"grad_norm": 1.283165693283081, |
|
"learning_rate": 0.000261, |
|
"loss": 1.3131, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.4096, |
|
"grad_norm": 1.3923134803771973, |
|
"learning_rate": 0.00026129999999999995, |
|
"loss": 1.4793, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 1.4112, |
|
"grad_norm": 1.2186331748962402, |
|
"learning_rate": 0.00026159999999999996, |
|
"loss": 1.4069, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 1.4128, |
|
"grad_norm": 1.7576051950454712, |
|
"learning_rate": 0.00026189999999999997, |
|
"loss": 1.6623, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 1.4144, |
|
"grad_norm": 2.6093623638153076, |
|
"learning_rate": 0.0002622, |
|
"loss": 0.9214, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 1.416, |
|
"grad_norm": 1.0863568782806396, |
|
"learning_rate": 0.0002625, |
|
"loss": 1.4146, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.4176, |
|
"grad_norm": 2.363821506500244, |
|
"learning_rate": 0.0002628, |
|
"loss": 1.8021, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 1.4192, |
|
"grad_norm": 1.1821964979171753, |
|
"learning_rate": 0.0002631, |
|
"loss": 1.1834, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 1.4208, |
|
"grad_norm": 1.32361900806427, |
|
"learning_rate": 0.00026339999999999995, |
|
"loss": 1.192, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 1.4224, |
|
"grad_norm": 1.3281641006469727, |
|
"learning_rate": 0.00026369999999999996, |
|
"loss": 1.0575, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 1.424, |
|
"grad_norm": 1.5585789680480957, |
|
"learning_rate": 0.00026399999999999997, |
|
"loss": 1.1708, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.4256, |
|
"grad_norm": 2.31046462059021, |
|
"learning_rate": 0.0002643, |
|
"loss": 1.1794, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 1.4272, |
|
"grad_norm": 1.7033979892730713, |
|
"learning_rate": 0.0002646, |
|
"loss": 1.3221, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 1.4288, |
|
"grad_norm": 2.653367519378662, |
|
"learning_rate": 0.0002649, |
|
"loss": 1.4937, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 1.4304000000000001, |
|
"grad_norm": 0.9184427261352539, |
|
"learning_rate": 0.0002652, |
|
"loss": 1.078, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 1.432, |
|
"grad_norm": 0.9819865226745605, |
|
"learning_rate": 0.0002655, |
|
"loss": 1.186, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.4336, |
|
"grad_norm": 1.0578396320343018, |
|
"learning_rate": 0.00026579999999999996, |
|
"loss": 1.0641, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 1.4352, |
|
"grad_norm": 1.4637776613235474, |
|
"learning_rate": 0.00026609999999999996, |
|
"loss": 1.373, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 1.4368, |
|
"grad_norm": 0.9520303606987, |
|
"learning_rate": 0.00026639999999999997, |
|
"loss": 1.1442, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 1.4384000000000001, |
|
"grad_norm": 1.6817363500595093, |
|
"learning_rate": 0.0002667, |
|
"loss": 1.4782, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.1572883129119873, |
|
"learning_rate": 0.000267, |
|
"loss": 1.8022, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.4416, |
|
"grad_norm": 0.8815500736236572, |
|
"learning_rate": 0.0002673, |
|
"loss": 1.4329, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 1.4432, |
|
"grad_norm": 1.2165837287902832, |
|
"learning_rate": 0.0002676, |
|
"loss": 1.1709, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 1.4447999999999999, |
|
"grad_norm": 2.121063470840454, |
|
"learning_rate": 0.0002679, |
|
"loss": 1.3277, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 1.4464000000000001, |
|
"grad_norm": 1.4610421657562256, |
|
"learning_rate": 0.00026819999999999996, |
|
"loss": 1.5098, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 1.448, |
|
"grad_norm": 2.9947142601013184, |
|
"learning_rate": 0.00026849999999999997, |
|
"loss": 1.3525, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.4496, |
|
"grad_norm": 1.1811401844024658, |
|
"learning_rate": 0.0002688, |
|
"loss": 1.1726, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 1.4512, |
|
"grad_norm": 1.4365415573120117, |
|
"learning_rate": 0.0002691, |
|
"loss": 1.2433, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 1.4527999999999999, |
|
"grad_norm": 1.34075927734375, |
|
"learning_rate": 0.0002694, |
|
"loss": 1.0205, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 1.4544000000000001, |
|
"grad_norm": 2.7684597969055176, |
|
"learning_rate": 0.0002697, |
|
"loss": 1.0883, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 1.456, |
|
"grad_norm": 1.557430624961853, |
|
"learning_rate": 0.00027, |
|
"loss": 1.6006, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.4576, |
|
"grad_norm": 1.616085410118103, |
|
"learning_rate": 0.00027029999999999996, |
|
"loss": 1.4207, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 1.4592, |
|
"grad_norm": 5.76104211807251, |
|
"learning_rate": 0.00027059999999999996, |
|
"loss": 1.6985, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 1.4607999999999999, |
|
"grad_norm": 1.2783349752426147, |
|
"learning_rate": 0.00027089999999999997, |
|
"loss": 1.1576, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 1.4624, |
|
"grad_norm": 1.2653543949127197, |
|
"learning_rate": 0.0002712, |
|
"loss": 1.4433, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 1.464, |
|
"grad_norm": 1.2063896656036377, |
|
"learning_rate": 0.0002715, |
|
"loss": 1.5359, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.4656, |
|
"grad_norm": 2.794680118560791, |
|
"learning_rate": 0.0002718, |
|
"loss": 1.5251, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 1.4672, |
|
"grad_norm": 3.2242326736450195, |
|
"learning_rate": 0.0002721, |
|
"loss": 1.9892, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 1.4687999999999999, |
|
"grad_norm": 1.8846021890640259, |
|
"learning_rate": 0.0002724, |
|
"loss": 1.8354, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 1.4704, |
|
"grad_norm": 2.0368640422821045, |
|
"learning_rate": 0.00027269999999999996, |
|
"loss": 1.5269, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"grad_norm": 1.5392261743545532, |
|
"learning_rate": 0.00027299999999999997, |
|
"loss": 1.6935, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.4736, |
|
"grad_norm": 1.5438854694366455, |
|
"learning_rate": 0.0002733, |
|
"loss": 1.7796, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 1.4752, |
|
"grad_norm": 1.796651005744934, |
|
"learning_rate": 0.0002736, |
|
"loss": 1.3982, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 1.4768, |
|
"grad_norm": 2.2069437503814697, |
|
"learning_rate": 0.0002739, |
|
"loss": 1.7769, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 1.4784, |
|
"grad_norm": 1.9683245420455933, |
|
"learning_rate": 0.0002742, |
|
"loss": 1.4909, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 1.911014199256897, |
|
"learning_rate": 0.0002745, |
|
"loss": 2.0897, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.4816, |
|
"grad_norm": 5.754384994506836, |
|
"learning_rate": 0.0002748, |
|
"loss": 2.1389, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 1.4832, |
|
"grad_norm": 1.59812331199646, |
|
"learning_rate": 0.00027509999999999996, |
|
"loss": 1.5023, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 1.4848, |
|
"grad_norm": 3.8509624004364014, |
|
"learning_rate": 0.00027539999999999997, |
|
"loss": 2.156, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 1.4864, |
|
"grad_norm": 1.6457704305648804, |
|
"learning_rate": 0.0002757, |
|
"loss": 1.6978, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 1.488, |
|
"grad_norm": 9.261984825134277, |
|
"learning_rate": 0.000276, |
|
"loss": 3.222, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.4896, |
|
"grad_norm": 10.191606521606445, |
|
"learning_rate": 0.0002763, |
|
"loss": 2.7299, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 1.4912, |
|
"grad_norm": 2.034604072570801, |
|
"learning_rate": 0.0002766, |
|
"loss": 1.363, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 1.4928, |
|
"grad_norm": 2.7943766117095947, |
|
"learning_rate": 0.0002769, |
|
"loss": 1.4561, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 1.4944, |
|
"grad_norm": 2.739060640335083, |
|
"learning_rate": 0.0002772, |
|
"loss": 1.5289, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 1.496, |
|
"grad_norm": 2.6572391986846924, |
|
"learning_rate": 0.00027749999999999997, |
|
"loss": 1.3459, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.4976, |
|
"grad_norm": 2.4692184925079346, |
|
"learning_rate": 0.0002778, |
|
"loss": 1.409, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 1.4992, |
|
"grad_norm": 1.569419264793396, |
|
"learning_rate": 0.0002781, |
|
"loss": 1.1897, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 1.5008, |
|
"grad_norm": 0.9803001880645752, |
|
"learning_rate": 0.0002784, |
|
"loss": 1.1971, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 1.5024, |
|
"grad_norm": 1.3759132623672485, |
|
"learning_rate": 0.0002787, |
|
"loss": 1.1573, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 1.504, |
|
"grad_norm": 1.4470410346984863, |
|
"learning_rate": 0.000279, |
|
"loss": 1.2441, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.5056, |
|
"grad_norm": 1.9103741645812988, |
|
"learning_rate": 0.0002793, |
|
"loss": 1.2325, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 1.5072, |
|
"grad_norm": 1.5558336973190308, |
|
"learning_rate": 0.00027959999999999997, |
|
"loss": 1.0402, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 1.5088, |
|
"grad_norm": 2.0115926265716553, |
|
"learning_rate": 0.0002799, |
|
"loss": 1.2751, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 1.5104, |
|
"grad_norm": 1.6013593673706055, |
|
"learning_rate": 0.0002802, |
|
"loss": 1.6149, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 1.512, |
|
"grad_norm": 1.3492580652236938, |
|
"learning_rate": 0.0002805, |
|
"loss": 1.2308, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.5135999999999998, |
|
"grad_norm": 1.3978670835494995, |
|
"learning_rate": 0.0002808, |
|
"loss": 1.1674, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 1.5152, |
|
"grad_norm": 1.257152795791626, |
|
"learning_rate": 0.0002811, |
|
"loss": 1.2999, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 1.5168, |
|
"grad_norm": 1.3785860538482666, |
|
"learning_rate": 0.00028139999999999996, |
|
"loss": 1.2184, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 1.5184, |
|
"grad_norm": 2.098989963531494, |
|
"learning_rate": 0.00028169999999999996, |
|
"loss": 1.4197, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.411068320274353, |
|
"learning_rate": 0.00028199999999999997, |
|
"loss": 1.3696, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.5215999999999998, |
|
"grad_norm": 1.1803005933761597, |
|
"learning_rate": 0.0002823, |
|
"loss": 1.4752, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 1.5232, |
|
"grad_norm": 1.4621422290802002, |
|
"learning_rate": 0.0002826, |
|
"loss": 1.2319, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 1.5248, |
|
"grad_norm": 2.065951108932495, |
|
"learning_rate": 0.00028289999999999994, |
|
"loss": 1.3185, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 1.5264, |
|
"grad_norm": 2.077345371246338, |
|
"learning_rate": 0.00028319999999999994, |
|
"loss": 1.3232, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 1.528, |
|
"grad_norm": 2.0758562088012695, |
|
"learning_rate": 0.00028349999999999995, |
|
"loss": 1.1082, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.5295999999999998, |
|
"grad_norm": 2.724622964859009, |
|
"learning_rate": 0.00028379999999999996, |
|
"loss": 1.3383, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 1.5312000000000001, |
|
"grad_norm": 1.9979689121246338, |
|
"learning_rate": 0.00028409999999999997, |
|
"loss": 1.514, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 1.5328, |
|
"grad_norm": 1.9366734027862549, |
|
"learning_rate": 0.0002844, |
|
"loss": 1.2723, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 1.5344, |
|
"grad_norm": 2.198087215423584, |
|
"learning_rate": 0.0002847, |
|
"loss": 1.4332, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 1.536, |
|
"grad_norm": 1.5621610879898071, |
|
"learning_rate": 0.000285, |
|
"loss": 1.3232, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.5375999999999999, |
|
"grad_norm": 1.4429649114608765, |
|
"learning_rate": 0.00028529999999999994, |
|
"loss": 1.292, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 1.5392000000000001, |
|
"grad_norm": 1.7527788877487183, |
|
"learning_rate": 0.00028559999999999995, |
|
"loss": 1.3802, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 1.5408, |
|
"grad_norm": 2.562932252883911, |
|
"learning_rate": 0.00028589999999999996, |
|
"loss": 1.5058, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 1.5424, |
|
"grad_norm": 2.0278782844543457, |
|
"learning_rate": 0.00028619999999999996, |
|
"loss": 1.4053, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 1.544, |
|
"grad_norm": 2.133039712905884, |
|
"learning_rate": 0.00028649999999999997, |
|
"loss": 1.5985, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 1.5455999999999999, |
|
"grad_norm": 1.7495462894439697, |
|
"learning_rate": 0.0002868, |
|
"loss": 1.7425, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 1.5472000000000001, |
|
"grad_norm": 1.314456582069397, |
|
"learning_rate": 0.0002871, |
|
"loss": 1.1744, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 1.5488, |
|
"grad_norm": 1.5634371042251587, |
|
"learning_rate": 0.00028739999999999994, |
|
"loss": 1.199, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 1.5504, |
|
"grad_norm": 1.5051501989364624, |
|
"learning_rate": 0.00028769999999999995, |
|
"loss": 1.3761, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 1.552, |
|
"grad_norm": 1.4913947582244873, |
|
"learning_rate": 0.00028799999999999995, |
|
"loss": 1.7574, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.5535999999999999, |
|
"grad_norm": 2.0032637119293213, |
|
"learning_rate": 0.00028829999999999996, |
|
"loss": 1.6131, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 1.5552000000000001, |
|
"grad_norm": 1.4599378108978271, |
|
"learning_rate": 0.00028859999999999997, |
|
"loss": 1.5788, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 1.5568, |
|
"grad_norm": 1.526383638381958, |
|
"learning_rate": 0.0002889, |
|
"loss": 1.4599, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 1.5584, |
|
"grad_norm": 3.349010705947876, |
|
"learning_rate": 0.0002892, |
|
"loss": 1.7604, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 3.1439058780670166, |
|
"learning_rate": 0.0002895, |
|
"loss": 2.0222, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.5615999999999999, |
|
"grad_norm": 5.448111057281494, |
|
"learning_rate": 0.00028979999999999994, |
|
"loss": 2.2708, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 1.5632000000000001, |
|
"grad_norm": 3.330211877822876, |
|
"learning_rate": 0.00029009999999999995, |
|
"loss": 2.3857, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 1.5648, |
|
"grad_norm": 1.844307541847229, |
|
"learning_rate": 0.00029039999999999996, |
|
"loss": 1.8033, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 1.5664, |
|
"grad_norm": 2.17771053314209, |
|
"learning_rate": 0.00029069999999999996, |
|
"loss": 1.8416, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 1.568, |
|
"grad_norm": 1.889838695526123, |
|
"learning_rate": 0.00029099999999999997, |
|
"loss": 1.4765, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.5695999999999999, |
|
"grad_norm": 1.280713677406311, |
|
"learning_rate": 0.0002913, |
|
"loss": 1.5771, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 1.5712000000000002, |
|
"grad_norm": 1.2217782735824585, |
|
"learning_rate": 0.0002916, |
|
"loss": 1.4645, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 1.5728, |
|
"grad_norm": 1.493486762046814, |
|
"learning_rate": 0.0002919, |
|
"loss": 1.3598, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 1.5744, |
|
"grad_norm": 1.8840752840042114, |
|
"learning_rate": 0.00029219999999999995, |
|
"loss": 1.5287, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 1.576, |
|
"grad_norm": 1.965975046157837, |
|
"learning_rate": 0.00029249999999999995, |
|
"loss": 1.2946, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 1.5776, |
|
"grad_norm": 1.5697219371795654, |
|
"learning_rate": 0.00029279999999999996, |
|
"loss": 1.0137, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 1.5792000000000002, |
|
"grad_norm": 1.665776014328003, |
|
"learning_rate": 0.00029309999999999997, |
|
"loss": 1.1297, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 1.5808, |
|
"grad_norm": 1.9357331991195679, |
|
"learning_rate": 0.0002934, |
|
"loss": 1.1271, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 1.5824, |
|
"grad_norm": 0.922601044178009, |
|
"learning_rate": 0.0002937, |
|
"loss": 1.3413, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 1.584, |
|
"grad_norm": 1.0412627458572388, |
|
"learning_rate": 0.000294, |
|
"loss": 1.1785, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.5856, |
|
"grad_norm": 1.292492151260376, |
|
"learning_rate": 0.00029429999999999994, |
|
"loss": 1.4531, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 1.5872000000000002, |
|
"grad_norm": 1.4930530786514282, |
|
"learning_rate": 0.00029459999999999995, |
|
"loss": 1.4747, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 1.5888, |
|
"grad_norm": 1.398553729057312, |
|
"learning_rate": 0.00029489999999999996, |
|
"loss": 1.2275, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 1.5904, |
|
"grad_norm": 1.1149609088897705, |
|
"learning_rate": 0.00029519999999999997, |
|
"loss": 1.3404, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 1.592, |
|
"grad_norm": 2.243300676345825, |
|
"learning_rate": 0.00029549999999999997, |
|
"loss": 1.4117, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 1.5936, |
|
"grad_norm": 0.9678653478622437, |
|
"learning_rate": 0.0002958, |
|
"loss": 1.2791, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 1.5952, |
|
"grad_norm": 0.9126372337341309, |
|
"learning_rate": 0.0002961, |
|
"loss": 1.54, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 1.5968, |
|
"grad_norm": 1.447344422340393, |
|
"learning_rate": 0.0002964, |
|
"loss": 1.2979, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 1.5984, |
|
"grad_norm": 2.5969924926757812, |
|
"learning_rate": 0.00029669999999999995, |
|
"loss": 1.6844, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.3566679954528809, |
|
"learning_rate": 0.00029699999999999996, |
|
"loss": 1.2633, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_cer": 0.342703815793579, |
|
"eval_loss": 1.7065902948379517, |
|
"eval_runtime": 159.4221, |
|
"eval_samples_per_second": 19.671, |
|
"eval_steps_per_second": 1.229, |
|
"eval_wer": 0.5115005185204882, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"step": 1000, |
|
"total_flos": 6.203691115248614e+18, |
|
"train_loss": 4.768421524226666, |
|
"train_runtime": 2101.2175, |
|
"train_samples_per_second": 15.229, |
|
"train_steps_per_second": 0.476 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.203691115248614e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
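
The object above is the complete trainer state for this checkpoint. As a minimal sketch of how such a log can be consumed — assuming only the Python standard library and a local copy saved as "trainer_state.json" (a hypothetical path, not part of this repository) — the per-step training records in log_history can be separated from the eval and summary records by the keys they carry:

# Minimal sketch: summarising a Trainer state log such as the one above.
# Assumption: the JSON is saved locally as "trainer_state.json" (hypothetical path).
import json
import math

with open("trainer_state.json") as f:
    state = json.load(f)  # Python's json parser accepts the NaN/Infinity literals Trainer writes

history = state["log_history"]
train_logs = [e for e in history if "loss" in e]        # per-step training records
eval_logs = [e for e in history if "eval_loss" in e]    # records written every eval_steps
bad_steps = [e["step"] for e in train_logs
             if not math.isfinite(e.get("grad_norm", 0.0))]  # steps whose grad norm overflowed

print(f"train steps logged: {len(train_logs)}, non-finite grad_norm at steps: {bad_steps}")
print(f"final train loss: {train_logs[-1]['loss']} (step {train_logs[-1]['step']})")
for e in eval_logs:
    print(f"step {e['step']}: eval_wer={e['eval_wer']:.4f}, "
          f"eval_cer={e['eval_cer']:.4f}, eval_loss={e['eval_loss']:.4f}")

Run against this file, a script along these lines would report the eval_wer of 0.5115 and eval_cer of 0.3427 logged at step 1000, and would flag the steps where grad_norm was recorded as NaN or Infinity (e.g. step 775 above).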