diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12327 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 15.0, + "eval_steps": 500, + "global_step": 1755, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008547008547008548, + "grad_norm": 32.62020740979074, + "learning_rate": 4.999995994512315e-06, + "loss": 4.276, + "step": 1 + }, + { + "epoch": 0.017094017094017096, + "grad_norm": 29.072421384539055, + "learning_rate": 4.999983978062096e-06, + "loss": 2.8774, + "step": 2 + }, + { + "epoch": 0.02564102564102564, + "grad_norm": 30.135778249629343, + "learning_rate": 4.999963950687846e-06, + "loss": 3.7408, + "step": 3 + }, + { + "epoch": 0.03418803418803419, + "grad_norm": 24.189256487874317, + "learning_rate": 4.999935912453743e-06, + "loss": 3.1117, + "step": 4 + }, + { + "epoch": 0.042735042735042736, + "grad_norm": 18.836727485689817, + "learning_rate": 4.999899863449631e-06, + "loss": 3.1051, + "step": 5 + }, + { + "epoch": 0.05128205128205128, + "grad_norm": 12.501967391799338, + "learning_rate": 4.999855803791026e-06, + "loss": 2.6904, + "step": 6 + }, + { + "epoch": 0.05982905982905983, + "grad_norm": 9.637791567564221, + "learning_rate": 4.9998037336191115e-06, + "loss": 1.8663, + "step": 7 + }, + { + "epoch": 0.06837606837606838, + "grad_norm": 4.939579902739148, + "learning_rate": 4.9997436531007415e-06, + "loss": 1.7693, + "step": 8 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 2.4003696989542815, + "learning_rate": 4.999675562428437e-06, + "loss": 1.1684, + "step": 9 + }, + { + "epoch": 0.08547008547008547, + "grad_norm": 5.971593419685188, + "learning_rate": 4.999599461820387e-06, + "loss": 2.2486, + "step": 10 + }, + { + "epoch": 0.09401709401709402, + "grad_norm": 3.977027000496355, + "learning_rate": 4.999515351520447e-06, + "loss": 1.7208, + "step": 11 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 2.783958221119982, + "learning_rate": 4.9994232317981405e-06, + "loss": 1.6043, + "step": 12 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 4.538183754473571, + "learning_rate": 4.999323102948655e-06, + "loss": 1.9126, + "step": 13 + }, + { + "epoch": 0.11965811965811966, + "grad_norm": 2.8220664880830966, + "learning_rate": 4.999214965292841e-06, + "loss": 1.5208, + "step": 14 + }, + { + "epoch": 0.1282051282051282, + "grad_norm": 2.9402595332539736, + "learning_rate": 4.999098819177214e-06, + "loss": 1.0275, + "step": 15 + }, + { + "epoch": 0.13675213675213677, + "grad_norm": 36.901371585877804, + "learning_rate": 4.998974664973953e-06, + "loss": 1.081, + "step": 16 + }, + { + "epoch": 0.1452991452991453, + "grad_norm": 2.9647572686177552, + "learning_rate": 4.998842503080894e-06, + "loss": 1.6457, + "step": 17 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 2.1277618530844227, + "learning_rate": 4.998702333921538e-06, + "loss": 1.0708, + "step": 18 + }, + { + "epoch": 0.1623931623931624, + "grad_norm": 2.8870619886137434, + "learning_rate": 4.99855415794504e-06, + "loss": 1.6499, + "step": 19 + }, + { + "epoch": 0.17094017094017094, + "grad_norm": 2.0120452229027626, + "learning_rate": 4.998397975626213e-06, + "loss": 1.5059, + "step": 20 + }, + { + "epoch": 0.1794871794871795, + "grad_norm": 1.9920958843151744, + "learning_rate": 4.998233787465529e-06, + "loss": 1.0625, + "step": 21 + }, + { + "epoch": 0.18803418803418803, + "grad_norm": 2.260535248606528, + "learning_rate": 4.998061593989108e-06, + "loss": 1.3096, + "step": 22 + }, + { + "epoch": 0.19658119658119658, + "grad_norm": 1.8563891675327266, + "learning_rate": 4.997881395748727e-06, + "loss": 1.1027, + "step": 23 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 1.7562642717959271, + "learning_rate": 4.99769319332181e-06, + "loss": 1.1636, + "step": 24 + }, + { + "epoch": 0.21367521367521367, + "grad_norm": 1.809288311360476, + "learning_rate": 4.997496987311431e-06, + "loss": 1.3529, + "step": 25 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.8497055141007057, + "learning_rate": 4.997292778346312e-06, + "loss": 1.2778, + "step": 26 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 1.3305413686651983, + "learning_rate": 4.9970805670808174e-06, + "loss": 1.0894, + "step": 27 + }, + { + "epoch": 0.23931623931623933, + "grad_norm": 2.926773487016974, + "learning_rate": 4.996860354194954e-06, + "loss": 1.7158, + "step": 28 + }, + { + "epoch": 0.24786324786324787, + "grad_norm": 2.1198500878749846, + "learning_rate": 4.996632140394372e-06, + "loss": 1.5559, + "step": 29 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 1.184358048001977, + "learning_rate": 4.996395926410354e-06, + "loss": 0.9109, + "step": 30 + }, + { + "epoch": 0.26495726495726496, + "grad_norm": 1.646041152309647, + "learning_rate": 4.996151712999826e-06, + "loss": 1.1522, + "step": 31 + }, + { + "epoch": 0.27350427350427353, + "grad_norm": 1.368219183766303, + "learning_rate": 4.995899500945341e-06, + "loss": 0.8634, + "step": 32 + }, + { + "epoch": 0.28205128205128205, + "grad_norm": 1.577497651144209, + "learning_rate": 4.995639291055084e-06, + "loss": 1.1489, + "step": 33 + }, + { + "epoch": 0.2905982905982906, + "grad_norm": 1.691998808976061, + "learning_rate": 4.99537108416287e-06, + "loss": 1.2498, + "step": 34 + }, + { + "epoch": 0.29914529914529914, + "grad_norm": 1.4882087466641996, + "learning_rate": 4.995094881128138e-06, + "loss": 1.0567, + "step": 35 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 1.0271930703785612, + "learning_rate": 4.994810682835951e-06, + "loss": 0.8448, + "step": 36 + }, + { + "epoch": 0.3162393162393162, + "grad_norm": 1.1199239553430391, + "learning_rate": 4.99451849019699e-06, + "loss": 0.7009, + "step": 37 + }, + { + "epoch": 0.3247863247863248, + "grad_norm": 1.669850515052373, + "learning_rate": 4.994218304147556e-06, + "loss": 1.0606, + "step": 38 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.5674947016471914, + "learning_rate": 4.993910125649561e-06, + "loss": 1.2569, + "step": 39 + }, + { + "epoch": 0.3418803418803419, + "grad_norm": 1.0864716234200489, + "learning_rate": 4.993593955690529e-06, + "loss": 0.7918, + "step": 40 + }, + { + "epoch": 0.3504273504273504, + "grad_norm": 1.5801532642600238, + "learning_rate": 4.9932697952835925e-06, + "loss": 1.1788, + "step": 41 + }, + { + "epoch": 0.358974358974359, + "grad_norm": 1.5026314328402226, + "learning_rate": 4.992937645467489e-06, + "loss": 1.174, + "step": 42 + }, + { + "epoch": 0.36752136752136755, + "grad_norm": 1.5519979381594173, + "learning_rate": 4.992597507306552e-06, + "loss": 1.3143, + "step": 43 + }, + { + "epoch": 0.37606837606837606, + "grad_norm": 1.3591841494684331, + "learning_rate": 4.992249381890722e-06, + "loss": 1.1037, + "step": 44 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 1.4384122655785836, + "learning_rate": 4.991893270335526e-06, + "loss": 1.2459, + "step": 45 + }, + { + "epoch": 0.39316239316239315, + "grad_norm": 1.3895569778660726, + "learning_rate": 4.9915291737820836e-06, + "loss": 0.9972, + "step": 46 + }, + { + "epoch": 0.4017094017094017, + "grad_norm": 1.4589079891461676, + "learning_rate": 4.991157093397104e-06, + "loss": 1.0955, + "step": 47 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 1.077697915403046, + "learning_rate": 4.990777030372877e-06, + "loss": 0.8011, + "step": 48 + }, + { + "epoch": 0.4188034188034188, + "grad_norm": 1.2910938732132287, + "learning_rate": 4.990388985927273e-06, + "loss": 1.0665, + "step": 49 + }, + { + "epoch": 0.42735042735042733, + "grad_norm": 1.2714720712304037, + "learning_rate": 4.989992961303738e-06, + "loss": 1.0145, + "step": 50 + }, + { + "epoch": 0.4358974358974359, + "grad_norm": 1.2845842411221682, + "learning_rate": 4.989588957771289e-06, + "loss": 0.9378, + "step": 51 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.3419174977624397, + "learning_rate": 4.989176976624511e-06, + "loss": 0.8341, + "step": 52 + }, + { + "epoch": 0.452991452991453, + "grad_norm": 1.0231416966065818, + "learning_rate": 4.988757019183553e-06, + "loss": 0.7548, + "step": 53 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 1.1252549457024466, + "learning_rate": 4.988329086794122e-06, + "loss": 0.8745, + "step": 54 + }, + { + "epoch": 0.4700854700854701, + "grad_norm": 0.9536636897289945, + "learning_rate": 4.9878931808274796e-06, + "loss": 0.6516, + "step": 55 + }, + { + "epoch": 0.47863247863247865, + "grad_norm": 1.6209383242246733, + "learning_rate": 4.98744930268044e-06, + "loss": 0.9706, + "step": 56 + }, + { + "epoch": 0.48717948717948717, + "grad_norm": 1.0731934865207635, + "learning_rate": 4.986997453775361e-06, + "loss": 0.8936, + "step": 57 + }, + { + "epoch": 0.49572649572649574, + "grad_norm": 1.362773877359389, + "learning_rate": 4.986537635560144e-06, + "loss": 0.8172, + "step": 58 + }, + { + "epoch": 0.5042735042735043, + "grad_norm": 4.543126580901282, + "learning_rate": 4.986069849508223e-06, + "loss": 1.0017, + "step": 59 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 1.047286807290104, + "learning_rate": 4.9855940971185705e-06, + "loss": 0.9148, + "step": 60 + }, + { + "epoch": 0.5213675213675214, + "grad_norm": 0.9239545215151804, + "learning_rate": 4.985110379915681e-06, + "loss": 0.803, + "step": 61 + }, + { + "epoch": 0.5299145299145299, + "grad_norm": 1.014258532084436, + "learning_rate": 4.984618699449573e-06, + "loss": 0.8317, + "step": 62 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 1.1970397088754383, + "learning_rate": 4.984119057295783e-06, + "loss": 0.8198, + "step": 63 + }, + { + "epoch": 0.5470085470085471, + "grad_norm": 1.8467307385552865, + "learning_rate": 4.983611455055359e-06, + "loss": 0.7764, + "step": 64 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 1.0137011006880812, + "learning_rate": 4.983095894354858e-06, + "loss": 0.8778, + "step": 65 + }, + { + "epoch": 0.5641025641025641, + "grad_norm": 1.1254196981388234, + "learning_rate": 4.982572376846336e-06, + "loss": 0.7872, + "step": 66 + }, + { + "epoch": 0.5726495726495726, + "grad_norm": 1.0458612262024898, + "learning_rate": 4.982040904207348e-06, + "loss": 1.03, + "step": 67 + }, + { + "epoch": 0.5811965811965812, + "grad_norm": 1.0425261033892632, + "learning_rate": 4.98150147814094e-06, + "loss": 0.9989, + "step": 68 + }, + { + "epoch": 0.5897435897435898, + "grad_norm": 0.8940316221494978, + "learning_rate": 4.980954100375642e-06, + "loss": 0.6748, + "step": 69 + }, + { + "epoch": 0.5982905982905983, + "grad_norm": 1.1615256423288627, + "learning_rate": 4.980398772665468e-06, + "loss": 0.7753, + "step": 70 + }, + { + "epoch": 0.6068376068376068, + "grad_norm": 0.9363884553573214, + "learning_rate": 4.979835496789904e-06, + "loss": 0.8255, + "step": 71 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 1.0064167314698989, + "learning_rate": 4.979264274553906e-06, + "loss": 1.0582, + "step": 72 + }, + { + "epoch": 0.6239316239316239, + "grad_norm": 1.106482001780077, + "learning_rate": 4.97868510778789e-06, + "loss": 0.7719, + "step": 73 + }, + { + "epoch": 0.6324786324786325, + "grad_norm": 0.9957808978272576, + "learning_rate": 4.978097998347737e-06, + "loss": 0.8936, + "step": 74 + }, + { + "epoch": 0.6410256410256411, + "grad_norm": 0.9138226587744296, + "learning_rate": 4.977502948114772e-06, + "loss": 0.7896, + "step": 75 + }, + { + "epoch": 0.6495726495726496, + "grad_norm": 0.9641099744456224, + "learning_rate": 4.9768999589957675e-06, + "loss": 0.7238, + "step": 76 + }, + { + "epoch": 0.6581196581196581, + "grad_norm": 0.965423829764785, + "learning_rate": 4.976289032922937e-06, + "loss": 0.9367, + "step": 77 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.9153110347105345, + "learning_rate": 4.975670171853926e-06, + "loss": 0.9802, + "step": 78 + }, + { + "epoch": 0.6752136752136753, + "grad_norm": 1.0031899781143425, + "learning_rate": 4.975043377771806e-06, + "loss": 0.7416, + "step": 79 + }, + { + "epoch": 0.6837606837606838, + "grad_norm": 0.9678094486698757, + "learning_rate": 4.9744086526850724e-06, + "loss": 0.8766, + "step": 80 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 1.0103884039407407, + "learning_rate": 4.973765998627628e-06, + "loss": 1.0188, + "step": 81 + }, + { + "epoch": 0.7008547008547008, + "grad_norm": 1.0510378757673386, + "learning_rate": 4.97311541765879e-06, + "loss": 0.9294, + "step": 82 + }, + { + "epoch": 0.7094017094017094, + "grad_norm": 1.0911046658822772, + "learning_rate": 4.972456911863273e-06, + "loss": 0.9152, + "step": 83 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 1.0560120280283998, + "learning_rate": 4.971790483351186e-06, + "loss": 0.7679, + "step": 84 + }, + { + "epoch": 0.7264957264957265, + "grad_norm": 0.9437311405916173, + "learning_rate": 4.971116134258026e-06, + "loss": 0.7396, + "step": 85 + }, + { + "epoch": 0.7350427350427351, + "grad_norm": 0.9132375746546699, + "learning_rate": 4.97043386674467e-06, + "loss": 0.9389, + "step": 86 + }, + { + "epoch": 0.7435897435897436, + "grad_norm": 1.081470401037969, + "learning_rate": 4.969743682997372e-06, + "loss": 0.7307, + "step": 87 + }, + { + "epoch": 0.7521367521367521, + "grad_norm": 0.7993087792851001, + "learning_rate": 4.969045585227747e-06, + "loss": 0.7206, + "step": 88 + }, + { + "epoch": 0.7606837606837606, + "grad_norm": 1.0592462938558866, + "learning_rate": 4.968339575672773e-06, + "loss": 0.8962, + "step": 89 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 1.1781322670806287, + "learning_rate": 4.967625656594782e-06, + "loss": 0.8418, + "step": 90 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.8851002604634136, + "learning_rate": 4.966903830281449e-06, + "loss": 0.7403, + "step": 91 + }, + { + "epoch": 0.7863247863247863, + "grad_norm": 1.1731634314905843, + "learning_rate": 4.966174099045784e-06, + "loss": 0.8497, + "step": 92 + }, + { + "epoch": 0.7948717948717948, + "grad_norm": 0.9637038553583984, + "learning_rate": 4.9654364652261345e-06, + "loss": 0.8345, + "step": 93 + }, + { + "epoch": 0.8034188034188035, + "grad_norm": 0.8910531904379403, + "learning_rate": 4.964690931186165e-06, + "loss": 0.8016, + "step": 94 + }, + { + "epoch": 0.811965811965812, + "grad_norm": 1.0055847088527494, + "learning_rate": 4.963937499314857e-06, + "loss": 1.0533, + "step": 95 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 1.0675718725688317, + "learning_rate": 4.963176172026501e-06, + "loss": 0.9226, + "step": 96 + }, + { + "epoch": 0.8290598290598291, + "grad_norm": 0.9609720971979216, + "learning_rate": 4.962406951760687e-06, + "loss": 0.7381, + "step": 97 + }, + { + "epoch": 0.8376068376068376, + "grad_norm": 0.9128632849930912, + "learning_rate": 4.961629840982296e-06, + "loss": 0.5482, + "step": 98 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 0.8600066418600247, + "learning_rate": 4.9608448421814944e-06, + "loss": 0.6207, + "step": 99 + }, + { + "epoch": 0.8547008547008547, + "grad_norm": 1.035150298649069, + "learning_rate": 4.960051957873726e-06, + "loss": 0.8249, + "step": 100 + }, + { + "epoch": 0.8632478632478633, + "grad_norm": 1.1083245611640722, + "learning_rate": 4.959251190599699e-06, + "loss": 1.0021, + "step": 101 + }, + { + "epoch": 0.8717948717948718, + "grad_norm": 0.7667808215315018, + "learning_rate": 4.958442542925385e-06, + "loss": 0.6136, + "step": 102 + }, + { + "epoch": 0.8803418803418803, + "grad_norm": 0.8877096581857563, + "learning_rate": 4.9576260174420085e-06, + "loss": 0.6767, + "step": 103 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 1.1246219207085053, + "learning_rate": 4.956801616766033e-06, + "loss": 0.783, + "step": 104 + }, + { + "epoch": 0.8974358974358975, + "grad_norm": 0.9568379203614392, + "learning_rate": 4.955969343539162e-06, + "loss": 0.8778, + "step": 105 + }, + { + "epoch": 0.905982905982906, + "grad_norm": 0.9605105213476728, + "learning_rate": 4.955129200428323e-06, + "loss": 0.6912, + "step": 106 + }, + { + "epoch": 0.9145299145299145, + "grad_norm": 1.1668396082908756, + "learning_rate": 4.9542811901256615e-06, + "loss": 0.7407, + "step": 107 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.8666294775123025, + "learning_rate": 4.953425315348534e-06, + "loss": 0.6811, + "step": 108 + }, + { + "epoch": 0.9316239316239316, + "grad_norm": 1.0107691793778373, + "learning_rate": 4.952561578839498e-06, + "loss": 0.7776, + "step": 109 + }, + { + "epoch": 0.9401709401709402, + "grad_norm": 1.0252997044098662, + "learning_rate": 4.9516899833663e-06, + "loss": 0.7653, + "step": 110 + }, + { + "epoch": 0.9487179487179487, + "grad_norm": 1.0043988910829953, + "learning_rate": 4.950810531721874e-06, + "loss": 0.7352, + "step": 111 + }, + { + "epoch": 0.9572649572649573, + "grad_norm": 0.9966732259935451, + "learning_rate": 4.949923226724325e-06, + "loss": 0.7956, + "step": 112 + }, + { + "epoch": 0.9658119658119658, + "grad_norm": 1.0002785418115747, + "learning_rate": 4.949028071216926e-06, + "loss": 0.6522, + "step": 113 + }, + { + "epoch": 0.9743589743589743, + "grad_norm": 0.8434616881462327, + "learning_rate": 4.948125068068102e-06, + "loss": 0.5084, + "step": 114 + }, + { + "epoch": 0.9829059829059829, + "grad_norm": 0.9643032365710266, + "learning_rate": 4.94721422017143e-06, + "loss": 0.966, + "step": 115 + }, + { + "epoch": 0.9914529914529915, + "grad_norm": 0.8101745858666015, + "learning_rate": 4.946295530445621e-06, + "loss": 0.7198, + "step": 116 + }, + { + "epoch": 1.0, + "grad_norm": 0.8729701781074534, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.6823, + "step": 117 + }, + { + "epoch": 1.0085470085470085, + "grad_norm": 0.842080592587243, + "learning_rate": 4.94443463730707e-06, + "loss": 0.6881, + "step": 118 + }, + { + "epoch": 1.017094017094017, + "grad_norm": 0.845463199764785, + "learning_rate": 4.943492439857357e-06, + "loss": 0.7705, + "step": 119 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.9180778501256365, + "learning_rate": 4.942542412504543e-06, + "loss": 0.804, + "step": 120 + }, + { + "epoch": 1.0341880341880343, + "grad_norm": 0.9068334850104435, + "learning_rate": 4.9415845582928866e-06, + "loss": 0.5952, + "step": 121 + }, + { + "epoch": 1.0427350427350428, + "grad_norm": 1.0711610169869845, + "learning_rate": 4.940618880291725e-06, + "loss": 1.0312, + "step": 122 + }, + { + "epoch": 1.0512820512820513, + "grad_norm": 0.8660635593426306, + "learning_rate": 4.9396453815954695e-06, + "loss": 0.6006, + "step": 123 + }, + { + "epoch": 1.0598290598290598, + "grad_norm": 0.7295836928618811, + "learning_rate": 4.938664065323588e-06, + "loss": 0.6308, + "step": 124 + }, + { + "epoch": 1.0683760683760684, + "grad_norm": 0.9418396522120512, + "learning_rate": 4.937674934620601e-06, + "loss": 0.7542, + "step": 125 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 1.0198957963558342, + "learning_rate": 4.9366779926560705e-06, + "loss": 0.7289, + "step": 126 + }, + { + "epoch": 1.0854700854700854, + "grad_norm": 0.751898046812773, + "learning_rate": 4.935673242624585e-06, + "loss": 0.5528, + "step": 127 + }, + { + "epoch": 1.0940170940170941, + "grad_norm": 0.792545272551657, + "learning_rate": 4.934660687745758e-06, + "loss": 0.6599, + "step": 128 + }, + { + "epoch": 1.1025641025641026, + "grad_norm": 0.904722485954194, + "learning_rate": 4.93364033126421e-06, + "loss": 0.6349, + "step": 129 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 1.0909141456525318, + "learning_rate": 4.93261217644956e-06, + "loss": 0.8165, + "step": 130 + }, + { + "epoch": 1.1196581196581197, + "grad_norm": 0.8711040377712898, + "learning_rate": 4.931576226596418e-06, + "loss": 0.8699, + "step": 131 + }, + { + "epoch": 1.1282051282051282, + "grad_norm": 0.8633774504272915, + "learning_rate": 4.930532485024372e-06, + "loss": 0.5695, + "step": 132 + }, + { + "epoch": 1.1367521367521367, + "grad_norm": 0.8889574637503499, + "learning_rate": 4.929480955077976e-06, + "loss": 0.5382, + "step": 133 + }, + { + "epoch": 1.1452991452991452, + "grad_norm": 0.9325352651456993, + "learning_rate": 4.928421640126742e-06, + "loss": 0.6109, + "step": 134 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 0.7740567991589352, + "learning_rate": 4.927354543565131e-06, + "loss": 0.6082, + "step": 135 + }, + { + "epoch": 1.1623931623931625, + "grad_norm": 0.8569473937370548, + "learning_rate": 4.926279668812533e-06, + "loss": 0.8508, + "step": 136 + }, + { + "epoch": 1.170940170940171, + "grad_norm": 0.9000831758374732, + "learning_rate": 4.925197019313269e-06, + "loss": 0.6447, + "step": 137 + }, + { + "epoch": 1.1794871794871795, + "grad_norm": 0.8268169038977321, + "learning_rate": 4.9241065985365695e-06, + "loss": 0.6781, + "step": 138 + }, + { + "epoch": 1.188034188034188, + "grad_norm": 0.9296228757358989, + "learning_rate": 4.923008409976568e-06, + "loss": 0.736, + "step": 139 + }, + { + "epoch": 1.1965811965811965, + "grad_norm": 0.921899150937168, + "learning_rate": 4.921902457152289e-06, + "loss": 0.7198, + "step": 140 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 0.9602358855116797, + "learning_rate": 4.920788743607636e-06, + "loss": 0.8336, + "step": 141 + }, + { + "epoch": 1.2136752136752136, + "grad_norm": 0.8896085782433711, + "learning_rate": 4.919667272911383e-06, + "loss": 0.6959, + "step": 142 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.846293805538022, + "learning_rate": 4.91853804865716e-06, + "loss": 0.7555, + "step": 143 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 0.8603018280332412, + "learning_rate": 4.917401074463441e-06, + "loss": 0.7288, + "step": 144 + }, + { + "epoch": 1.2393162393162394, + "grad_norm": 0.9498713513811322, + "learning_rate": 4.916256353973535e-06, + "loss": 0.647, + "step": 145 + }, + { + "epoch": 1.2478632478632479, + "grad_norm": 0.7982026276207213, + "learning_rate": 4.915103890855574e-06, + "loss": 0.6042, + "step": 146 + }, + { + "epoch": 1.2564102564102564, + "grad_norm": 0.8589595835743361, + "learning_rate": 4.913943688802497e-06, + "loss": 0.6211, + "step": 147 + }, + { + "epoch": 1.264957264957265, + "grad_norm": 0.9630850664374468, + "learning_rate": 4.912775751532047e-06, + "loss": 0.7537, + "step": 148 + }, + { + "epoch": 1.2735042735042734, + "grad_norm": 1.084629843197026, + "learning_rate": 4.91160008278675e-06, + "loss": 0.651, + "step": 149 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 1.077510994503243, + "learning_rate": 4.9104166863339065e-06, + "loss": 0.6962, + "step": 150 + }, + { + "epoch": 1.2905982905982907, + "grad_norm": 0.9523844356052744, + "learning_rate": 4.90922556596558e-06, + "loss": 0.9121, + "step": 151 + }, + { + "epoch": 1.2991452991452992, + "grad_norm": 0.8170982239546886, + "learning_rate": 4.908026725498586e-06, + "loss": 0.5877, + "step": 152 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 0.8321214392716167, + "learning_rate": 4.9068201687744774e-06, + "loss": 0.6241, + "step": 153 + }, + { + "epoch": 1.3162393162393162, + "grad_norm": 0.744635231568867, + "learning_rate": 4.905605899659532e-06, + "loss": 0.5604, + "step": 154 + }, + { + "epoch": 1.3247863247863247, + "grad_norm": 0.9902531868558738, + "learning_rate": 4.90438392204474e-06, + "loss": 0.7455, + "step": 155 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.871166691569676, + "learning_rate": 4.903154239845798e-06, + "loss": 0.7321, + "step": 156 + }, + { + "epoch": 1.341880341880342, + "grad_norm": 0.9899344615854253, + "learning_rate": 4.901916857003084e-06, + "loss": 0.9039, + "step": 157 + }, + { + "epoch": 1.3504273504273505, + "grad_norm": 0.771145237334227, + "learning_rate": 4.9006717774816585e-06, + "loss": 0.9487, + "step": 158 + }, + { + "epoch": 1.358974358974359, + "grad_norm": 0.8764015362376394, + "learning_rate": 4.8994190052712406e-06, + "loss": 0.4107, + "step": 159 + }, + { + "epoch": 1.3675213675213675, + "grad_norm": 1.0257151713300179, + "learning_rate": 4.898158544386201e-06, + "loss": 0.9209, + "step": 160 + }, + { + "epoch": 1.376068376068376, + "grad_norm": 0.6891258050787235, + "learning_rate": 4.896890398865548e-06, + "loss": 0.473, + "step": 161 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 0.8800514494711961, + "learning_rate": 4.895614572772916e-06, + "loss": 0.5578, + "step": 162 + }, + { + "epoch": 1.393162393162393, + "grad_norm": 1.0877581722845024, + "learning_rate": 4.894331070196548e-06, + "loss": 0.7935, + "step": 163 + }, + { + "epoch": 1.4017094017094016, + "grad_norm": 0.7380344836306842, + "learning_rate": 4.893039895249288e-06, + "loss": 0.5807, + "step": 164 + }, + { + "epoch": 1.4102564102564101, + "grad_norm": 0.9374611560377141, + "learning_rate": 4.8917410520685635e-06, + "loss": 0.7081, + "step": 165 + }, + { + "epoch": 1.4188034188034189, + "grad_norm": 0.9888064163493844, + "learning_rate": 4.890434544816375e-06, + "loss": 0.621, + "step": 166 + }, + { + "epoch": 1.4273504273504274, + "grad_norm": 3.3286973317835074, + "learning_rate": 4.889120377679282e-06, + "loss": 0.6302, + "step": 167 + }, + { + "epoch": 1.435897435897436, + "grad_norm": 0.8952054360752244, + "learning_rate": 4.887798554868388e-06, + "loss": 0.5884, + "step": 168 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.8659549188864638, + "learning_rate": 4.88646908061933e-06, + "loss": 0.5574, + "step": 169 + }, + { + "epoch": 1.452991452991453, + "grad_norm": 0.9342371205015247, + "learning_rate": 4.885131959192262e-06, + "loss": 0.5912, + "step": 170 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 0.9723104482463416, + "learning_rate": 4.883787194871841e-06, + "loss": 0.5887, + "step": 171 + }, + { + "epoch": 1.4700854700854702, + "grad_norm": 0.7578829988984429, + "learning_rate": 4.882434791967219e-06, + "loss": 0.6065, + "step": 172 + }, + { + "epoch": 1.4786324786324787, + "grad_norm": 0.8891827518266336, + "learning_rate": 4.881074754812021e-06, + "loss": 0.5384, + "step": 173 + }, + { + "epoch": 1.4871794871794872, + "grad_norm": 1.4201619598272113, + "learning_rate": 4.879707087764336e-06, + "loss": 0.6335, + "step": 174 + }, + { + "epoch": 1.4957264957264957, + "grad_norm": 0.7970843509207138, + "learning_rate": 4.878331795206705e-06, + "loss": 0.7274, + "step": 175 + }, + { + "epoch": 1.5042735042735043, + "grad_norm": 1.0616993712483198, + "learning_rate": 4.876948881546101e-06, + "loss": 0.6813, + "step": 176 + }, + { + "epoch": 1.5128205128205128, + "grad_norm": 0.8826451632766461, + "learning_rate": 4.875558351213918e-06, + "loss": 0.6883, + "step": 177 + }, + { + "epoch": 1.5213675213675213, + "grad_norm": 0.8241904159222645, + "learning_rate": 4.874160208665958e-06, + "loss": 0.4233, + "step": 178 + }, + { + "epoch": 1.5299145299145298, + "grad_norm": 1.0537700071390603, + "learning_rate": 4.872754458382416e-06, + "loss": 0.6817, + "step": 179 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.8301266822469117, + "learning_rate": 4.8713411048678635e-06, + "loss": 0.8945, + "step": 180 + }, + { + "epoch": 1.547008547008547, + "grad_norm": 0.8550186654323384, + "learning_rate": 4.869920152651239e-06, + "loss": 0.7814, + "step": 181 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 1.3037022506462685, + "learning_rate": 4.868491606285823e-06, + "loss": 0.7902, + "step": 182 + }, + { + "epoch": 1.564102564102564, + "grad_norm": 1.710455466821508, + "learning_rate": 4.86705547034924e-06, + "loss": 0.7026, + "step": 183 + }, + { + "epoch": 1.5726495726495726, + "grad_norm": 0.8504039369254447, + "learning_rate": 4.865611749443428e-06, + "loss": 0.6511, + "step": 184 + }, + { + "epoch": 1.5811965811965814, + "grad_norm": 0.8002095192240137, + "learning_rate": 4.864160448194632e-06, + "loss": 0.5932, + "step": 185 + }, + { + "epoch": 1.5897435897435899, + "grad_norm": 0.95226059536856, + "learning_rate": 4.862701571253387e-06, + "loss": 0.5688, + "step": 186 + }, + { + "epoch": 1.5982905982905984, + "grad_norm": 0.7962164201947765, + "learning_rate": 4.861235123294505e-06, + "loss": 0.5607, + "step": 187 + }, + { + "epoch": 1.606837606837607, + "grad_norm": 1.0060466476736434, + "learning_rate": 4.859761109017056e-06, + "loss": 0.5513, + "step": 188 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.7785587180515594, + "learning_rate": 4.858279533144358e-06, + "loss": 0.6487, + "step": 189 + }, + { + "epoch": 1.623931623931624, + "grad_norm": 0.8540481332902222, + "learning_rate": 4.856790400423958e-06, + "loss": 0.6159, + "step": 190 + }, + { + "epoch": 1.6324786324786325, + "grad_norm": 0.8289468924265531, + "learning_rate": 4.8552937156276185e-06, + "loss": 0.5114, + "step": 191 + }, + { + "epoch": 1.641025641025641, + "grad_norm": 0.9526151621277151, + "learning_rate": 4.8537894835513e-06, + "loss": 0.9854, + "step": 192 + }, + { + "epoch": 1.6495726495726495, + "grad_norm": 0.7910488490007835, + "learning_rate": 4.8522777090151505e-06, + "loss": 0.5659, + "step": 193 + }, + { + "epoch": 1.658119658119658, + "grad_norm": 0.9166695744046246, + "learning_rate": 4.8507583968634845e-06, + "loss": 0.6635, + "step": 194 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.8411571133758534, + "learning_rate": 4.849231551964771e-06, + "loss": 0.5762, + "step": 195 + }, + { + "epoch": 1.6752136752136753, + "grad_norm": 1.0037339289648717, + "learning_rate": 4.847697179211618e-06, + "loss": 0.4801, + "step": 196 + }, + { + "epoch": 1.6837606837606838, + "grad_norm": 0.9650823545296664, + "learning_rate": 4.8461552835207524e-06, + "loss": 0.5585, + "step": 197 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 0.7514863818600783, + "learning_rate": 4.844605869833011e-06, + "loss": 0.5708, + "step": 198 + }, + { + "epoch": 1.7008547008547008, + "grad_norm": 0.9806737566272027, + "learning_rate": 4.84304894311332e-06, + "loss": 0.8652, + "step": 199 + }, + { + "epoch": 1.7094017094017095, + "grad_norm": 0.8228043634560646, + "learning_rate": 4.841484508350679e-06, + "loss": 0.5162, + "step": 200 + }, + { + "epoch": 1.717948717948718, + "grad_norm": 0.8211227168659619, + "learning_rate": 4.839912570558148e-06, + "loss": 0.5694, + "step": 201 + }, + { + "epoch": 1.7264957264957266, + "grad_norm": 0.8195609975365399, + "learning_rate": 4.838333134772828e-06, + "loss": 0.4863, + "step": 202 + }, + { + "epoch": 1.735042735042735, + "grad_norm": 0.974131586369127, + "learning_rate": 4.836746206055849e-06, + "loss": 0.551, + "step": 203 + }, + { + "epoch": 1.7435897435897436, + "grad_norm": 0.9708650745354207, + "learning_rate": 4.835151789492348e-06, + "loss": 0.6683, + "step": 204 + }, + { + "epoch": 1.7521367521367521, + "grad_norm": 0.8306266939889401, + "learning_rate": 4.83354989019146e-06, + "loss": 0.5167, + "step": 205 + }, + { + "epoch": 1.7606837606837606, + "grad_norm": 0.8692823244900135, + "learning_rate": 4.831940513286293e-06, + "loss": 0.6876, + "step": 206 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 0.9425701508115182, + "learning_rate": 4.83032366393392e-06, + "loss": 0.6458, + "step": 207 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.9353325892585003, + "learning_rate": 4.828699347315357e-06, + "loss": 0.8378, + "step": 208 + }, + { + "epoch": 1.7863247863247862, + "grad_norm": 0.9162028835750752, + "learning_rate": 4.827067568635546e-06, + "loss": 0.5463, + "step": 209 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 1.047609340341694, + "learning_rate": 4.825428333123346e-06, + "loss": 0.7311, + "step": 210 + }, + { + "epoch": 1.8034188034188035, + "grad_norm": 1.0234012438481053, + "learning_rate": 4.823781646031505e-06, + "loss": 0.7, + "step": 211 + }, + { + "epoch": 1.811965811965812, + "grad_norm": 0.9046824788092985, + "learning_rate": 4.822127512636652e-06, + "loss": 0.3795, + "step": 212 + }, + { + "epoch": 1.8205128205128205, + "grad_norm": 0.9534823492778117, + "learning_rate": 4.820465938239274e-06, + "loss": 0.6623, + "step": 213 + }, + { + "epoch": 1.8290598290598292, + "grad_norm": 0.9956882984441957, + "learning_rate": 4.8187969281637054e-06, + "loss": 0.6584, + "step": 214 + }, + { + "epoch": 1.8376068376068377, + "grad_norm": 0.9077439371948645, + "learning_rate": 4.817120487758105e-06, + "loss": 0.6596, + "step": 215 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 0.781496567826663, + "learning_rate": 4.815436622394442e-06, + "loss": 0.5389, + "step": 216 + }, + { + "epoch": 1.8547008547008548, + "grad_norm": 0.9458504816016381, + "learning_rate": 4.813745337468478e-06, + "loss": 0.7911, + "step": 217 + }, + { + "epoch": 1.8632478632478633, + "grad_norm": 0.8741433154654207, + "learning_rate": 4.8120466383997486e-06, + "loss": 0.6354, + "step": 218 + }, + { + "epoch": 1.8717948717948718, + "grad_norm": 0.9740111296003311, + "learning_rate": 4.81034053063155e-06, + "loss": 0.6901, + "step": 219 + }, + { + "epoch": 1.8803418803418803, + "grad_norm": 0.8376294681303905, + "learning_rate": 4.8086270196309174e-06, + "loss": 0.8544, + "step": 220 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.9076663090188682, + "learning_rate": 4.806906110888606e-06, + "loss": 0.6304, + "step": 221 + }, + { + "epoch": 1.8974358974358974, + "grad_norm": 0.9713152148006099, + "learning_rate": 4.805177809919081e-06, + "loss": 0.7956, + "step": 222 + }, + { + "epoch": 1.9059829059829059, + "grad_norm": 1.0226062030376948, + "learning_rate": 4.803442122260494e-06, + "loss": 0.6901, + "step": 223 + }, + { + "epoch": 1.9145299145299144, + "grad_norm": 0.8119065258323606, + "learning_rate": 4.801699053474663e-06, + "loss": 0.5318, + "step": 224 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.9150490205225476, + "learning_rate": 4.799948609147061e-06, + "loss": 0.5784, + "step": 225 + }, + { + "epoch": 1.9316239316239316, + "grad_norm": 0.9035676881199971, + "learning_rate": 4.798190794886795e-06, + "loss": 0.6402, + "step": 226 + }, + { + "epoch": 1.9401709401709402, + "grad_norm": 0.7583311032119001, + "learning_rate": 4.796425616326588e-06, + "loss": 0.4664, + "step": 227 + }, + { + "epoch": 1.9487179487179487, + "grad_norm": 0.9669728608522254, + "learning_rate": 4.79465307912276e-06, + "loss": 0.5637, + "step": 228 + }, + { + "epoch": 1.9572649572649574, + "grad_norm": 0.8667261712455026, + "learning_rate": 4.792873188955213e-06, + "loss": 0.8446, + "step": 229 + }, + { + "epoch": 1.965811965811966, + "grad_norm": 0.9687898626902562, + "learning_rate": 4.791085951527408e-06, + "loss": 0.5547, + "step": 230 + }, + { + "epoch": 1.9743589743589745, + "grad_norm": 0.8157471679820112, + "learning_rate": 4.789291372566352e-06, + "loss": 0.675, + "step": 231 + }, + { + "epoch": 1.982905982905983, + "grad_norm": 0.8596776874736839, + "learning_rate": 4.787489457822576e-06, + "loss": 0.7064, + "step": 232 + }, + { + "epoch": 1.9914529914529915, + "grad_norm": 0.992779009649159, + "learning_rate": 4.785680213070117e-06, + "loss": 0.9077, + "step": 233 + }, + { + "epoch": 2.0, + "grad_norm": 0.9030091834967064, + "learning_rate": 4.783863644106502e-06, + "loss": 0.6325, + "step": 234 + }, + { + "epoch": 2.0085470085470085, + "grad_norm": 0.9423554920024213, + "learning_rate": 4.782039756752728e-06, + "loss": 0.5702, + "step": 235 + }, + { + "epoch": 2.017094017094017, + "grad_norm": 0.8379965111584239, + "learning_rate": 4.780208556853239e-06, + "loss": 0.4504, + "step": 236 + }, + { + "epoch": 2.0256410256410255, + "grad_norm": 0.7824169343393776, + "learning_rate": 4.7783700502759145e-06, + "loss": 0.5999, + "step": 237 + }, + { + "epoch": 2.034188034188034, + "grad_norm": 0.8558656070638038, + "learning_rate": 4.776524242912047e-06, + "loss": 0.7578, + "step": 238 + }, + { + "epoch": 2.0427350427350426, + "grad_norm": 0.8526003041858661, + "learning_rate": 4.774671140676325e-06, + "loss": 0.5964, + "step": 239 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 0.9112032508436396, + "learning_rate": 4.77281074950681e-06, + "loss": 0.8572, + "step": 240 + }, + { + "epoch": 2.0598290598290596, + "grad_norm": 0.8581968072977434, + "learning_rate": 4.77094307536492e-06, + "loss": 0.5085, + "step": 241 + }, + { + "epoch": 2.0683760683760686, + "grad_norm": 0.7253249437985466, + "learning_rate": 4.769068124235413e-06, + "loss": 0.4323, + "step": 242 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 1.0234933231378351, + "learning_rate": 4.7671859021263635e-06, + "loss": 0.6929, + "step": 243 + }, + { + "epoch": 2.0854700854700856, + "grad_norm": 0.8536915333923158, + "learning_rate": 4.765296415069146e-06, + "loss": 0.5469, + "step": 244 + }, + { + "epoch": 2.094017094017094, + "grad_norm": 0.8417797765711249, + "learning_rate": 4.763399669118414e-06, + "loss": 0.7197, + "step": 245 + }, + { + "epoch": 2.1025641025641026, + "grad_norm": 0.7504923334053624, + "learning_rate": 4.761495670352081e-06, + "loss": 0.5067, + "step": 246 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.8825999832331881, + "learning_rate": 4.759584424871302e-06, + "loss": 0.5412, + "step": 247 + }, + { + "epoch": 2.1196581196581197, + "grad_norm": 0.7981758550893384, + "learning_rate": 4.757665938800453e-06, + "loss": 0.7171, + "step": 248 + }, + { + "epoch": 2.128205128205128, + "grad_norm": 0.7853772293068558, + "learning_rate": 4.755740218287113e-06, + "loss": 0.3035, + "step": 249 + }, + { + "epoch": 2.1367521367521367, + "grad_norm": 0.7971592537457343, + "learning_rate": 4.753807269502041e-06, + "loss": 0.6156, + "step": 250 + }, + { + "epoch": 2.1452991452991452, + "grad_norm": 0.7213055348629424, + "learning_rate": 4.7518670986391576e-06, + "loss": 0.4422, + "step": 251 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 0.8314372014471705, + "learning_rate": 4.749919711915531e-06, + "loss": 0.6491, + "step": 252 + }, + { + "epoch": 2.1623931623931623, + "grad_norm": 0.8906551957721053, + "learning_rate": 4.747965115571345e-06, + "loss": 0.5359, + "step": 253 + }, + { + "epoch": 2.1709401709401708, + "grad_norm": 0.8109146133188956, + "learning_rate": 4.746003315869889e-06, + "loss": 0.5431, + "step": 254 + }, + { + "epoch": 2.1794871794871793, + "grad_norm": 1.0629574061098976, + "learning_rate": 4.744034319097536e-06, + "loss": 0.5012, + "step": 255 + }, + { + "epoch": 2.1880341880341883, + "grad_norm": 0.8497678709421456, + "learning_rate": 4.742058131563718e-06, + "loss": 0.5794, + "step": 256 + }, + { + "epoch": 2.1965811965811968, + "grad_norm": 0.8307773403474242, + "learning_rate": 4.7400747596009125e-06, + "loss": 0.4953, + "step": 257 + }, + { + "epoch": 2.2051282051282053, + "grad_norm": 0.8059782529981341, + "learning_rate": 4.738084209564617e-06, + "loss": 0.4722, + "step": 258 + }, + { + "epoch": 2.213675213675214, + "grad_norm": 0.7633142931516772, + "learning_rate": 4.73608648783333e-06, + "loss": 0.535, + "step": 259 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.8445214601230614, + "learning_rate": 4.734081600808531e-06, + "loss": 0.6529, + "step": 260 + }, + { + "epoch": 2.230769230769231, + "grad_norm": 0.7888800398644392, + "learning_rate": 4.73206955491466e-06, + "loss": 0.4053, + "step": 261 + }, + { + "epoch": 2.2393162393162394, + "grad_norm": 0.8353629817434199, + "learning_rate": 4.7300503565990985e-06, + "loss": 0.4727, + "step": 262 + }, + { + "epoch": 2.247863247863248, + "grad_norm": 0.7104386481467555, + "learning_rate": 4.728024012332145e-06, + "loss": 0.4199, + "step": 263 + }, + { + "epoch": 2.2564102564102564, + "grad_norm": 0.8725564755846308, + "learning_rate": 4.725990528606996e-06, + "loss": 0.4241, + "step": 264 + }, + { + "epoch": 2.264957264957265, + "grad_norm": 0.7957719847040631, + "learning_rate": 4.723949911939728e-06, + "loss": 0.5536, + "step": 265 + }, + { + "epoch": 2.2735042735042734, + "grad_norm": 1.082844473015555, + "learning_rate": 4.7219021688692725e-06, + "loss": 0.5833, + "step": 266 + }, + { + "epoch": 2.282051282051282, + "grad_norm": 0.9065232591964502, + "learning_rate": 4.719847305957398e-06, + "loss": 0.5619, + "step": 267 + }, + { + "epoch": 2.2905982905982905, + "grad_norm": 0.8547973656651287, + "learning_rate": 4.717785329788685e-06, + "loss": 0.6832, + "step": 268 + }, + { + "epoch": 2.299145299145299, + "grad_norm": 0.9296372212216126, + "learning_rate": 4.715716246970511e-06, + "loss": 0.7332, + "step": 269 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.7861517591016323, + "learning_rate": 4.7136400641330245e-06, + "loss": 0.5523, + "step": 270 + }, + { + "epoch": 2.316239316239316, + "grad_norm": 0.8432735931836609, + "learning_rate": 4.7115567879291265e-06, + "loss": 0.4783, + "step": 271 + }, + { + "epoch": 2.324786324786325, + "grad_norm": 0.810227255259576, + "learning_rate": 4.709466425034445e-06, + "loss": 0.6832, + "step": 272 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.8627425682827192, + "learning_rate": 4.707368982147318e-06, + "loss": 0.6009, + "step": 273 + }, + { + "epoch": 2.341880341880342, + "grad_norm": 0.8933833494570184, + "learning_rate": 4.705264465988771e-06, + "loss": 0.6507, + "step": 274 + }, + { + "epoch": 2.3504273504273505, + "grad_norm": 0.9536316043302439, + "learning_rate": 4.703152883302498e-06, + "loss": 0.4248, + "step": 275 + }, + { + "epoch": 2.358974358974359, + "grad_norm": 0.8128802583501903, + "learning_rate": 4.701034240854829e-06, + "loss": 0.5562, + "step": 276 + }, + { + "epoch": 2.3675213675213675, + "grad_norm": 0.858099919053899, + "learning_rate": 4.6989085454347236e-06, + "loss": 0.5479, + "step": 277 + }, + { + "epoch": 2.376068376068376, + "grad_norm": 0.8472803686361211, + "learning_rate": 4.696775803853739e-06, + "loss": 0.5566, + "step": 278 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 0.7851374043861304, + "learning_rate": 4.694636022946012e-06, + "loss": 0.5711, + "step": 279 + }, + { + "epoch": 2.393162393162393, + "grad_norm": 0.8317592786523537, + "learning_rate": 4.692489209568234e-06, + "loss": 0.7418, + "step": 280 + }, + { + "epoch": 2.4017094017094016, + "grad_norm": 0.8853581482172665, + "learning_rate": 4.690335370599633e-06, + "loss": 0.6775, + "step": 281 + }, + { + "epoch": 2.41025641025641, + "grad_norm": 0.8539229096616547, + "learning_rate": 4.68817451294195e-06, + "loss": 0.5026, + "step": 282 + }, + { + "epoch": 2.4188034188034186, + "grad_norm": 0.8208950002596763, + "learning_rate": 4.686006643519415e-06, + "loss": 0.6944, + "step": 283 + }, + { + "epoch": 2.427350427350427, + "grad_norm": 0.8841090381845832, + "learning_rate": 4.683831769278729e-06, + "loss": 0.5945, + "step": 284 + }, + { + "epoch": 2.435897435897436, + "grad_norm": 0.8232663064958912, + "learning_rate": 4.681649897189036e-06, + "loss": 0.556, + "step": 285 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.8148096515474825, + "learning_rate": 4.679461034241906e-06, + "loss": 0.5738, + "step": 286 + }, + { + "epoch": 2.452991452991453, + "grad_norm": 0.8082914935046664, + "learning_rate": 4.677265187451311e-06, + "loss": 0.4538, + "step": 287 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 0.9043659946127143, + "learning_rate": 4.675062363853599e-06, + "loss": 0.641, + "step": 288 + }, + { + "epoch": 2.47008547008547, + "grad_norm": 0.6764037851210951, + "learning_rate": 4.672852570507476e-06, + "loss": 0.3586, + "step": 289 + }, + { + "epoch": 2.4786324786324787, + "grad_norm": 0.7621893086172707, + "learning_rate": 4.670635814493985e-06, + "loss": 0.5006, + "step": 290 + }, + { + "epoch": 2.4871794871794872, + "grad_norm": 0.7696631077204455, + "learning_rate": 4.668412102916474e-06, + "loss": 0.4888, + "step": 291 + }, + { + "epoch": 2.4957264957264957, + "grad_norm": 0.8724597404888246, + "learning_rate": 4.666181442900584e-06, + "loss": 0.6726, + "step": 292 + }, + { + "epoch": 2.5042735042735043, + "grad_norm": 1.0178600439419474, + "learning_rate": 4.663943841594219e-06, + "loss": 0.8028, + "step": 293 + }, + { + "epoch": 2.5128205128205128, + "grad_norm": 0.821176529609733, + "learning_rate": 4.6616993061675275e-06, + "loss": 0.4614, + "step": 294 + }, + { + "epoch": 2.5213675213675213, + "grad_norm": 0.954374041308384, + "learning_rate": 4.659447843812876e-06, + "loss": 0.5892, + "step": 295 + }, + { + "epoch": 2.52991452991453, + "grad_norm": 0.8797608205693254, + "learning_rate": 4.657189461744829e-06, + "loss": 0.5927, + "step": 296 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 0.8384246808599966, + "learning_rate": 4.654924167200124e-06, + "loss": 0.5969, + "step": 297 + }, + { + "epoch": 2.547008547008547, + "grad_norm": 0.8469590124967289, + "learning_rate": 4.652651967437647e-06, + "loss": 0.4979, + "step": 298 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.7910351749484542, + "learning_rate": 4.650372869738415e-06, + "loss": 0.3562, + "step": 299 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.9453743074559633, + "learning_rate": 4.648086881405542e-06, + "loss": 0.5599, + "step": 300 + }, + { + "epoch": 2.5726495726495724, + "grad_norm": 0.8425619907341163, + "learning_rate": 4.6457940097642315e-06, + "loss": 0.3276, + "step": 301 + }, + { + "epoch": 2.5811965811965814, + "grad_norm": 0.8428502499947056, + "learning_rate": 4.643494262161735e-06, + "loss": 0.4904, + "step": 302 + }, + { + "epoch": 2.58974358974359, + "grad_norm": 0.9186239857634485, + "learning_rate": 4.6411876459673435e-06, + "loss": 0.6082, + "step": 303 + }, + { + "epoch": 2.5982905982905984, + "grad_norm": 0.8347795234984768, + "learning_rate": 4.638874168572355e-06, + "loss": 0.5763, + "step": 304 + }, + { + "epoch": 2.606837606837607, + "grad_norm": 0.9453440342446296, + "learning_rate": 4.636553837390051e-06, + "loss": 0.7632, + "step": 305 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 0.9624505274395776, + "learning_rate": 4.634226659855681e-06, + "loss": 0.5405, + "step": 306 + }, + { + "epoch": 2.623931623931624, + "grad_norm": 0.739440573623984, + "learning_rate": 4.631892643426428e-06, + "loss": 0.6543, + "step": 307 + }, + { + "epoch": 2.6324786324786325, + "grad_norm": 0.8183326036817442, + "learning_rate": 4.629551795581393e-06, + "loss": 0.4819, + "step": 308 + }, + { + "epoch": 2.641025641025641, + "grad_norm": 1.0800027921184026, + "learning_rate": 4.627204123821563e-06, + "loss": 0.6374, + "step": 309 + }, + { + "epoch": 2.6495726495726495, + "grad_norm": 0.8644012525735127, + "learning_rate": 4.624849635669797e-06, + "loss": 0.676, + "step": 310 + }, + { + "epoch": 2.658119658119658, + "grad_norm": 0.9303352301220938, + "learning_rate": 4.622488338670792e-06, + "loss": 0.6964, + "step": 311 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.8250093622508519, + "learning_rate": 4.620120240391065e-06, + "loss": 0.4156, + "step": 312 + }, + { + "epoch": 2.6752136752136755, + "grad_norm": 0.8760662748622655, + "learning_rate": 4.617745348418928e-06, + "loss": 0.8247, + "step": 313 + }, + { + "epoch": 2.683760683760684, + "grad_norm": 0.7826247149013078, + "learning_rate": 4.61536367036446e-06, + "loss": 0.4056, + "step": 314 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 0.9719580059372056, + "learning_rate": 4.612975213859487e-06, + "loss": 0.6637, + "step": 315 + }, + { + "epoch": 2.700854700854701, + "grad_norm": 0.7519148907524309, + "learning_rate": 4.6105799865575565e-06, + "loss": 0.3255, + "step": 316 + }, + { + "epoch": 2.7094017094017095, + "grad_norm": 0.8302883777619188, + "learning_rate": 4.60817799613391e-06, + "loss": 0.5064, + "step": 317 + }, + { + "epoch": 2.717948717948718, + "grad_norm": 0.7963915256809042, + "learning_rate": 4.605769250285462e-06, + "loss": 0.5783, + "step": 318 + }, + { + "epoch": 2.7264957264957266, + "grad_norm": 0.8526922440352511, + "learning_rate": 4.603353756730775e-06, + "loss": 0.8161, + "step": 319 + }, + { + "epoch": 2.735042735042735, + "grad_norm": 0.7735568824902, + "learning_rate": 4.600931523210032e-06, + "loss": 0.4646, + "step": 320 + }, + { + "epoch": 2.7435897435897436, + "grad_norm": 0.7480599875712123, + "learning_rate": 4.598502557485015e-06, + "loss": 0.5126, + "step": 321 + }, + { + "epoch": 2.752136752136752, + "grad_norm": 0.978569107209946, + "learning_rate": 4.5960668673390776e-06, + "loss": 0.7456, + "step": 322 + }, + { + "epoch": 2.7606837606837606, + "grad_norm": 0.8165048476366037, + "learning_rate": 4.59362446057712e-06, + "loss": 0.5948, + "step": 323 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 0.8341162025089808, + "learning_rate": 4.591175345025567e-06, + "loss": 0.3868, + "step": 324 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.78033690790899, + "learning_rate": 4.588719528532342e-06, + "loss": 0.5951, + "step": 325 + }, + { + "epoch": 2.786324786324786, + "grad_norm": 0.9040375747082394, + "learning_rate": 4.586257018966837e-06, + "loss": 0.7382, + "step": 326 + }, + { + "epoch": 2.7948717948717947, + "grad_norm": 0.9062791192886377, + "learning_rate": 4.583787824219894e-06, + "loss": 0.5906, + "step": 327 + }, + { + "epoch": 2.8034188034188032, + "grad_norm": 1.0120723259107143, + "learning_rate": 4.5813119522037765e-06, + "loss": 0.6833, + "step": 328 + }, + { + "epoch": 2.8119658119658117, + "grad_norm": 0.865348091293651, + "learning_rate": 4.578829410852145e-06, + "loss": 0.7787, + "step": 329 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.8348180991577413, + "learning_rate": 4.5763402081200295e-06, + "loss": 0.4408, + "step": 330 + }, + { + "epoch": 2.8290598290598292, + "grad_norm": 0.8011745133788588, + "learning_rate": 4.573844351983807e-06, + "loss": 0.4633, + "step": 331 + }, + { + "epoch": 2.8376068376068377, + "grad_norm": 0.8293170512614328, + "learning_rate": 4.571341850441175e-06, + "loss": 0.4687, + "step": 332 + }, + { + "epoch": 2.8461538461538463, + "grad_norm": 0.8232430705506593, + "learning_rate": 4.568832711511125e-06, + "loss": 0.6406, + "step": 333 + }, + { + "epoch": 2.8547008547008548, + "grad_norm": 0.6611830954064016, + "learning_rate": 4.566316943233916e-06, + "loss": 0.5333, + "step": 334 + }, + { + "epoch": 2.8632478632478633, + "grad_norm": 0.7427595171913142, + "learning_rate": 4.56379455367105e-06, + "loss": 0.4849, + "step": 335 + }, + { + "epoch": 2.871794871794872, + "grad_norm": 0.776740740513331, + "learning_rate": 4.561265550905251e-06, + "loss": 0.3279, + "step": 336 + }, + { + "epoch": 2.8803418803418803, + "grad_norm": 0.8914208083154344, + "learning_rate": 4.558729943040427e-06, + "loss": 0.5588, + "step": 337 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.9028547313318527, + "learning_rate": 4.556187738201656e-06, + "loss": 0.5921, + "step": 338 + }, + { + "epoch": 2.8974358974358974, + "grad_norm": 0.9286852889816591, + "learning_rate": 4.553638944535155e-06, + "loss": 0.5297, + "step": 339 + }, + { + "epoch": 2.905982905982906, + "grad_norm": 0.8351144281666316, + "learning_rate": 4.551083570208251e-06, + "loss": 0.5956, + "step": 340 + }, + { + "epoch": 2.9145299145299144, + "grad_norm": 0.7309337413729242, + "learning_rate": 4.548521623409364e-06, + "loss": 0.4151, + "step": 341 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 0.9346639117201893, + "learning_rate": 4.545953112347967e-06, + "loss": 0.5964, + "step": 342 + }, + { + "epoch": 2.931623931623932, + "grad_norm": 0.81834023562854, + "learning_rate": 4.543378045254575e-06, + "loss": 0.5455, + "step": 343 + }, + { + "epoch": 2.9401709401709404, + "grad_norm": 0.9994568354494042, + "learning_rate": 4.540796430380706e-06, + "loss": 0.4972, + "step": 344 + }, + { + "epoch": 2.948717948717949, + "grad_norm": 0.9623762851964687, + "learning_rate": 4.538208275998861e-06, + "loss": 0.6229, + "step": 345 + }, + { + "epoch": 2.9572649572649574, + "grad_norm": 0.8503131665697095, + "learning_rate": 4.535613590402497e-06, + "loss": 0.5439, + "step": 346 + }, + { + "epoch": 2.965811965811966, + "grad_norm": 0.8914338237199254, + "learning_rate": 4.533012381905999e-06, + "loss": 0.4731, + "step": 347 + }, + { + "epoch": 2.9743589743589745, + "grad_norm": 0.8050974375681713, + "learning_rate": 4.530404658844654e-06, + "loss": 0.3666, + "step": 348 + }, + { + "epoch": 2.982905982905983, + "grad_norm": 0.9766521405817877, + "learning_rate": 4.527790429574623e-06, + "loss": 0.6397, + "step": 349 + }, + { + "epoch": 2.9914529914529915, + "grad_norm": 1.0716768145207356, + "learning_rate": 4.525169702472917e-06, + "loss": 0.3234, + "step": 350 + }, + { + "epoch": 3.0, + "grad_norm": 0.7414645916740304, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4125, + "step": 351 + }, + { + "epoch": 3.0085470085470085, + "grad_norm": 0.8304413817295728, + "learning_rate": 4.519908788386605e-06, + "loss": 0.4659, + "step": 352 + }, + { + "epoch": 3.017094017094017, + "grad_norm": 0.7735458981648151, + "learning_rate": 4.51726861826002e-06, + "loss": 0.5033, + "step": 353 + }, + { + "epoch": 3.0256410256410255, + "grad_norm": 0.7595322185814292, + "learning_rate": 4.514621984017748e-06, + "loss": 0.303, + "step": 354 + }, + { + "epoch": 3.034188034188034, + "grad_norm": 0.7875034261369316, + "learning_rate": 4.511968894140639e-06, + "loss": 0.648, + "step": 355 + }, + { + "epoch": 3.0427350427350426, + "grad_norm": 0.8203545208978309, + "learning_rate": 4.509309357130227e-06, + "loss": 0.4485, + "step": 356 + }, + { + "epoch": 3.051282051282051, + "grad_norm": 0.7946456857985079, + "learning_rate": 4.5066433815087076e-06, + "loss": 0.4074, + "step": 357 + }, + { + "epoch": 3.0598290598290596, + "grad_norm": 1.2003143687959523, + "learning_rate": 4.503970975818905e-06, + "loss": 0.4335, + "step": 358 + }, + { + "epoch": 3.0683760683760686, + "grad_norm": 0.9408760380102702, + "learning_rate": 4.501292148624251e-06, + "loss": 0.4196, + "step": 359 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 0.9144025827497951, + "learning_rate": 4.498606908508754e-06, + "loss": 0.4479, + "step": 360 + }, + { + "epoch": 3.0854700854700856, + "grad_norm": 0.7687844481709335, + "learning_rate": 4.495915264076967e-06, + "loss": 0.4121, + "step": 361 + }, + { + "epoch": 3.094017094017094, + "grad_norm": 0.7537385468723475, + "learning_rate": 4.493217223953974e-06, + "loss": 0.4461, + "step": 362 + }, + { + "epoch": 3.1025641025641026, + "grad_norm": 0.7718050015382629, + "learning_rate": 4.490512796785344e-06, + "loss": 0.3579, + "step": 363 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.8146700557707305, + "learning_rate": 4.48780199123712e-06, + "loss": 0.3657, + "step": 364 + }, + { + "epoch": 3.1196581196581197, + "grad_norm": 0.7766206702534757, + "learning_rate": 4.485084815995778e-06, + "loss": 0.3605, + "step": 365 + }, + { + "epoch": 3.128205128205128, + "grad_norm": 0.9320602711654982, + "learning_rate": 4.482361279768209e-06, + "loss": 0.4754, + "step": 366 + }, + { + "epoch": 3.1367521367521367, + "grad_norm": 1.002563816761898, + "learning_rate": 4.479631391281685e-06, + "loss": 0.725, + "step": 367 + }, + { + "epoch": 3.1452991452991452, + "grad_norm": 0.8764474287712758, + "learning_rate": 4.476895159283835e-06, + "loss": 0.6423, + "step": 368 + }, + { + "epoch": 3.1538461538461537, + "grad_norm": 0.8441568311922192, + "learning_rate": 4.474152592542613e-06, + "loss": 0.5277, + "step": 369 + }, + { + "epoch": 3.1623931623931623, + "grad_norm": 0.8743031497554583, + "learning_rate": 4.4714036998462715e-06, + "loss": 0.5851, + "step": 370 + }, + { + "epoch": 3.1709401709401708, + "grad_norm": 0.933207995427482, + "learning_rate": 4.4686484900033375e-06, + "loss": 0.433, + "step": 371 + }, + { + "epoch": 3.1794871794871793, + "grad_norm": 0.9190018249217231, + "learning_rate": 4.465886971842578e-06, + "loss": 0.4419, + "step": 372 + }, + { + "epoch": 3.1880341880341883, + "grad_norm": 0.8266660789991668, + "learning_rate": 4.463119154212972e-06, + "loss": 0.4605, + "step": 373 + }, + { + "epoch": 3.1965811965811968, + "grad_norm": 0.76565165126929, + "learning_rate": 4.46034504598369e-06, + "loss": 0.5145, + "step": 374 + }, + { + "epoch": 3.2051282051282053, + "grad_norm": 0.8511966627383084, + "learning_rate": 4.457564656044056e-06, + "loss": 0.3751, + "step": 375 + }, + { + "epoch": 3.213675213675214, + "grad_norm": 0.8298685003297344, + "learning_rate": 4.454777993303524e-06, + "loss": 0.7063, + "step": 376 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.8842731250398438, + "learning_rate": 4.451985066691649e-06, + "loss": 0.3819, + "step": 377 + }, + { + "epoch": 3.230769230769231, + "grad_norm": 0.7150831598781322, + "learning_rate": 4.449185885158056e-06, + "loss": 0.3984, + "step": 378 + }, + { + "epoch": 3.2393162393162394, + "grad_norm": 0.8069520223481081, + "learning_rate": 4.446380457672417e-06, + "loss": 0.3641, + "step": 379 + }, + { + "epoch": 3.247863247863248, + "grad_norm": 0.8676544763614075, + "learning_rate": 4.443568793224415e-06, + "loss": 0.5014, + "step": 380 + }, + { + "epoch": 3.2564102564102564, + "grad_norm": 0.8556378002114667, + "learning_rate": 4.44075090082372e-06, + "loss": 0.4846, + "step": 381 + }, + { + "epoch": 3.264957264957265, + "grad_norm": 0.889152669192053, + "learning_rate": 4.437926789499959e-06, + "loss": 0.5168, + "step": 382 + }, + { + "epoch": 3.2735042735042734, + "grad_norm": 0.6725425463954209, + "learning_rate": 4.435096468302687e-06, + "loss": 0.308, + "step": 383 + }, + { + "epoch": 3.282051282051282, + "grad_norm": 0.8877402712011035, + "learning_rate": 4.432259946301355e-06, + "loss": 0.4128, + "step": 384 + }, + { + "epoch": 3.2905982905982905, + "grad_norm": 0.8445915729478649, + "learning_rate": 4.429417232585288e-06, + "loss": 0.3867, + "step": 385 + }, + { + "epoch": 3.299145299145299, + "grad_norm": 0.7327654841257518, + "learning_rate": 4.42656833626365e-06, + "loss": 0.2801, + "step": 386 + }, + { + "epoch": 3.3076923076923075, + "grad_norm": 0.9029002294542466, + "learning_rate": 4.423713266465415e-06, + "loss": 0.4407, + "step": 387 + }, + { + "epoch": 3.316239316239316, + "grad_norm": 0.8184482649220772, + "learning_rate": 4.4208520323393425e-06, + "loss": 0.3226, + "step": 388 + }, + { + "epoch": 3.324786324786325, + "grad_norm": 0.9024192814094689, + "learning_rate": 4.417984643053941e-06, + "loss": 0.6365, + "step": 389 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.7111467138011386, + "learning_rate": 4.415111107797445e-06, + "loss": 0.4378, + "step": 390 + }, + { + "epoch": 3.341880341880342, + "grad_norm": 0.9441614638935245, + "learning_rate": 4.412231435777784e-06, + "loss": 0.4225, + "step": 391 + }, + { + "epoch": 3.3504273504273505, + "grad_norm": 2.221227624135174, + "learning_rate": 4.409345636222549e-06, + "loss": 0.5193, + "step": 392 + }, + { + "epoch": 3.358974358974359, + "grad_norm": 0.8877919988116371, + "learning_rate": 4.406453718378968e-06, + "loss": 0.7117, + "step": 393 + }, + { + "epoch": 3.3675213675213675, + "grad_norm": 0.8771932944932836, + "learning_rate": 4.4035556915138745e-06, + "loss": 0.5035, + "step": 394 + }, + { + "epoch": 3.376068376068376, + "grad_norm": 0.9387814704326412, + "learning_rate": 4.400651564913676e-06, + "loss": 0.5797, + "step": 395 + }, + { + "epoch": 3.3846153846153846, + "grad_norm": 0.8244733294329837, + "learning_rate": 4.397741347884329e-06, + "loss": 0.5068, + "step": 396 + }, + { + "epoch": 3.393162393162393, + "grad_norm": 0.9494021493478192, + "learning_rate": 4.394825049751303e-06, + "loss": 0.5993, + "step": 397 + }, + { + "epoch": 3.4017094017094016, + "grad_norm": 0.7221780905312348, + "learning_rate": 4.391902679859557e-06, + "loss": 0.251, + "step": 398 + }, + { + "epoch": 3.41025641025641, + "grad_norm": 0.8268617275414766, + "learning_rate": 4.388974247573501e-06, + "loss": 0.4459, + "step": 399 + }, + { + "epoch": 3.4188034188034186, + "grad_norm": 1.1369326581432304, + "learning_rate": 4.386039762276976e-06, + "loss": 0.4272, + "step": 400 + }, + { + "epoch": 3.427350427350427, + "grad_norm": 0.8576090900488215, + "learning_rate": 4.3830992333732185e-06, + "loss": 0.4957, + "step": 401 + }, + { + "epoch": 3.435897435897436, + "grad_norm": 0.8819239615707368, + "learning_rate": 4.3801526702848306e-06, + "loss": 0.5069, + "step": 402 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.7180964553933423, + "learning_rate": 4.377200082453748e-06, + "loss": 0.4055, + "step": 403 + }, + { + "epoch": 3.452991452991453, + "grad_norm": 0.8424182432690462, + "learning_rate": 4.374241479341216e-06, + "loss": 0.3744, + "step": 404 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 0.8448473931955931, + "learning_rate": 4.3712768704277535e-06, + "loss": 0.4433, + "step": 405 + }, + { + "epoch": 3.47008547008547, + "grad_norm": 0.8768307747327547, + "learning_rate": 4.368306265213122e-06, + "loss": 0.4239, + "step": 406 + }, + { + "epoch": 3.4786324786324787, + "grad_norm": 0.7559664092738253, + "learning_rate": 4.365329673216301e-06, + "loss": 0.3114, + "step": 407 + }, + { + "epoch": 3.4871794871794872, + "grad_norm": 0.7811272037089495, + "learning_rate": 4.3623471039754525e-06, + "loss": 0.6167, + "step": 408 + }, + { + "epoch": 3.4957264957264957, + "grad_norm": 0.8050998843783346, + "learning_rate": 4.359358567047892e-06, + "loss": 0.4296, + "step": 409 + }, + { + "epoch": 3.5042735042735043, + "grad_norm": 0.764913405706021, + "learning_rate": 4.356364072010059e-06, + "loss": 0.5632, + "step": 410 + }, + { + "epoch": 3.5128205128205128, + "grad_norm": 1.0200860018866134, + "learning_rate": 4.35336362845748e-06, + "loss": 0.7716, + "step": 411 + }, + { + "epoch": 3.5213675213675213, + "grad_norm": 0.8071339648683941, + "learning_rate": 4.35035724600475e-06, + "loss": 0.6755, + "step": 412 + }, + { + "epoch": 3.52991452991453, + "grad_norm": 0.8934834178549457, + "learning_rate": 4.347344934285492e-06, + "loss": 0.5716, + "step": 413 + }, + { + "epoch": 3.5384615384615383, + "grad_norm": 0.9013447212285678, + "learning_rate": 4.3443267029523265e-06, + "loss": 0.4951, + "step": 414 + }, + { + "epoch": 3.547008547008547, + "grad_norm": 0.7467615404534375, + "learning_rate": 4.3413025616768426e-06, + "loss": 0.5141, + "step": 415 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.8109187006489191, + "learning_rate": 4.338272520149572e-06, + "loss": 0.4856, + "step": 416 + }, + { + "epoch": 3.564102564102564, + "grad_norm": 0.7978268359836372, + "learning_rate": 4.335236588079949e-06, + "loss": 0.4645, + "step": 417 + }, + { + "epoch": 3.5726495726495724, + "grad_norm": 0.7976611760330177, + "learning_rate": 4.332194775196282e-06, + "loss": 0.5088, + "step": 418 + }, + { + "epoch": 3.5811965811965814, + "grad_norm": 0.7273065066610752, + "learning_rate": 4.329147091245729e-06, + "loss": 0.3724, + "step": 419 + }, + { + "epoch": 3.58974358974359, + "grad_norm": 0.7627333803959988, + "learning_rate": 4.326093545994258e-06, + "loss": 0.3721, + "step": 420 + }, + { + "epoch": 3.5982905982905984, + "grad_norm": 0.8883554964464617, + "learning_rate": 4.3230341492266195e-06, + "loss": 0.3726, + "step": 421 + }, + { + "epoch": 3.606837606837607, + "grad_norm": 0.766873829622007, + "learning_rate": 4.3199689107463125e-06, + "loss": 0.3011, + "step": 422 + }, + { + "epoch": 3.6153846153846154, + "grad_norm": 1.050080591838294, + "learning_rate": 4.316897840375558e-06, + "loss": 0.4624, + "step": 423 + }, + { + "epoch": 3.623931623931624, + "grad_norm": 0.7431746820429679, + "learning_rate": 4.313820947955265e-06, + "loss": 0.4298, + "step": 424 + }, + { + "epoch": 3.6324786324786325, + "grad_norm": 0.7784354492853087, + "learning_rate": 4.310738243344996e-06, + "loss": 0.3922, + "step": 425 + }, + { + "epoch": 3.641025641025641, + "grad_norm": 0.7444834058751557, + "learning_rate": 4.307649736422939e-06, + "loss": 0.2623, + "step": 426 + }, + { + "epoch": 3.6495726495726495, + "grad_norm": 0.7977973139058897, + "learning_rate": 4.304555437085876e-06, + "loss": 0.373, + "step": 427 + }, + { + "epoch": 3.658119658119658, + "grad_norm": 0.8485658441757654, + "learning_rate": 4.301455355249148e-06, + "loss": 0.5303, + "step": 428 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.8224556079644156, + "learning_rate": 4.2983495008466285e-06, + "loss": 0.3876, + "step": 429 + }, + { + "epoch": 3.6752136752136755, + "grad_norm": 0.7754703882264418, + "learning_rate": 4.2952378838306855e-06, + "loss": 0.4484, + "step": 430 + }, + { + "epoch": 3.683760683760684, + "grad_norm": 0.8570419199997498, + "learning_rate": 4.292120514172154e-06, + "loss": 0.4782, + "step": 431 + }, + { + "epoch": 3.6923076923076925, + "grad_norm": 0.7756027229434488, + "learning_rate": 4.288997401860303e-06, + "loss": 0.3856, + "step": 432 + }, + { + "epoch": 3.700854700854701, + "grad_norm": 0.8073968441637798, + "learning_rate": 4.285868556902803e-06, + "loss": 0.4776, + "step": 433 + }, + { + "epoch": 3.7094017094017095, + "grad_norm": 0.8790251187999025, + "learning_rate": 4.2827339893256935e-06, + "loss": 0.4384, + "step": 434 + }, + { + "epoch": 3.717948717948718, + "grad_norm": 0.8528022888286432, + "learning_rate": 4.279593709173352e-06, + "loss": 0.4662, + "step": 435 + }, + { + "epoch": 3.7264957264957266, + "grad_norm": 0.8226819943495567, + "learning_rate": 4.276447726508461e-06, + "loss": 0.5625, + "step": 436 + }, + { + "epoch": 3.735042735042735, + "grad_norm": 0.85329699161591, + "learning_rate": 4.273296051411978e-06, + "loss": 0.548, + "step": 437 + }, + { + "epoch": 3.7435897435897436, + "grad_norm": 0.8815606194105096, + "learning_rate": 4.2701386939830966e-06, + "loss": 0.6318, + "step": 438 + }, + { + "epoch": 3.752136752136752, + "grad_norm": 0.9148527978934294, + "learning_rate": 4.2669756643392255e-06, + "loss": 0.6247, + "step": 439 + }, + { + "epoch": 3.7606837606837606, + "grad_norm": 0.7579395274439618, + "learning_rate": 4.263806972615943e-06, + "loss": 0.3085, + "step": 440 + }, + { + "epoch": 3.769230769230769, + "grad_norm": 0.8419223552392247, + "learning_rate": 4.260632628966974e-06, + "loss": 0.3967, + "step": 441 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.7554240130558576, + "learning_rate": 4.257452643564155e-06, + "loss": 0.398, + "step": 442 + }, + { + "epoch": 3.786324786324786, + "grad_norm": 0.8507117836828787, + "learning_rate": 4.254267026597399e-06, + "loss": 0.5881, + "step": 443 + }, + { + "epoch": 3.7948717948717947, + "grad_norm": 0.7353099260117935, + "learning_rate": 4.251075788274667e-06, + "loss": 0.2005, + "step": 444 + }, + { + "epoch": 3.8034188034188032, + "grad_norm": 0.8712901741487685, + "learning_rate": 4.247878938821929e-06, + "loss": 0.4499, + "step": 445 + }, + { + "epoch": 3.8119658119658117, + "grad_norm": 0.8974249580810392, + "learning_rate": 4.2446764884831404e-06, + "loss": 0.5255, + "step": 446 + }, + { + "epoch": 3.8205128205128203, + "grad_norm": 0.8829948352253479, + "learning_rate": 4.2414684475202014e-06, + "loss": 0.4319, + "step": 447 + }, + { + "epoch": 3.8290598290598292, + "grad_norm": 0.8605823079252658, + "learning_rate": 4.238254826212925e-06, + "loss": 0.4846, + "step": 448 + }, + { + "epoch": 3.8376068376068377, + "grad_norm": 0.8041945944691401, + "learning_rate": 4.2350356348590096e-06, + "loss": 0.4267, + "step": 449 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 0.870824196997254, + "learning_rate": 4.231810883773999e-06, + "loss": 0.5941, + "step": 450 + }, + { + "epoch": 3.8547008547008548, + "grad_norm": 0.9593235942305645, + "learning_rate": 4.228580583291254e-06, + "loss": 0.4393, + "step": 451 + }, + { + "epoch": 3.8632478632478633, + "grad_norm": 1.0063955556890787, + "learning_rate": 4.225344743761918e-06, + "loss": 0.4246, + "step": 452 + }, + { + "epoch": 3.871794871794872, + "grad_norm": 0.8202784309796189, + "learning_rate": 4.2221033755548835e-06, + "loss": 0.3854, + "step": 453 + }, + { + "epoch": 3.8803418803418803, + "grad_norm": 1.0562117712056716, + "learning_rate": 4.218856489056758e-06, + "loss": 0.2155, + "step": 454 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.8464596519960794, + "learning_rate": 4.215604094671835e-06, + "loss": 0.5801, + "step": 455 + }, + { + "epoch": 3.8974358974358974, + "grad_norm": 0.8504550548583248, + "learning_rate": 4.2123462028220505e-06, + "loss": 0.5785, + "step": 456 + }, + { + "epoch": 3.905982905982906, + "grad_norm": 0.9630832827635597, + "learning_rate": 4.209082823946965e-06, + "loss": 0.7302, + "step": 457 + }, + { + "epoch": 3.9145299145299144, + "grad_norm": 0.8968524846983676, + "learning_rate": 4.205813968503717e-06, + "loss": 0.4906, + "step": 458 + }, + { + "epoch": 3.9230769230769234, + "grad_norm": 0.8547398378994598, + "learning_rate": 4.202539646966993e-06, + "loss": 0.414, + "step": 459 + }, + { + "epoch": 3.931623931623932, + "grad_norm": 0.6844619258431609, + "learning_rate": 4.1992598698289985e-06, + "loss": 0.3025, + "step": 460 + }, + { + "epoch": 3.9401709401709404, + "grad_norm": 0.8949935454247467, + "learning_rate": 4.1959746475994175e-06, + "loss": 0.5164, + "step": 461 + }, + { + "epoch": 3.948717948717949, + "grad_norm": 0.8379831635152304, + "learning_rate": 4.1926839908053855e-06, + "loss": 0.4645, + "step": 462 + }, + { + "epoch": 3.9572649572649574, + "grad_norm": 0.762265401669729, + "learning_rate": 4.189387909991448e-06, + "loss": 0.4256, + "step": 463 + }, + { + "epoch": 3.965811965811966, + "grad_norm": 0.9100475030814085, + "learning_rate": 4.186086415719537e-06, + "loss": 0.4741, + "step": 464 + }, + { + "epoch": 3.9743589743589745, + "grad_norm": 0.8453866689986883, + "learning_rate": 4.182779518568925e-06, + "loss": 0.504, + "step": 465 + }, + { + "epoch": 3.982905982905983, + "grad_norm": 0.8103875920447905, + "learning_rate": 4.179467229136205e-06, + "loss": 0.2742, + "step": 466 + }, + { + "epoch": 3.9914529914529915, + "grad_norm": 1.1325517449404694, + "learning_rate": 4.176149558035241e-06, + "loss": 0.5041, + "step": 467 + }, + { + "epoch": 4.0, + "grad_norm": 0.8145184159975724, + "learning_rate": 4.172826515897146e-06, + "loss": 0.3232, + "step": 468 + }, + { + "epoch": 4.0085470085470085, + "grad_norm": 0.8456967300947733, + "learning_rate": 4.169498113370245e-06, + "loss": 0.5478, + "step": 469 + }, + { + "epoch": 4.017094017094017, + "grad_norm": 0.7549586169194719, + "learning_rate": 4.166164361120036e-06, + "loss": 0.2411, + "step": 470 + }, + { + "epoch": 4.0256410256410255, + "grad_norm": 0.8089576849557826, + "learning_rate": 4.162825269829165e-06, + "loss": 0.4338, + "step": 471 + }, + { + "epoch": 4.034188034188034, + "grad_norm": 0.6762049989116309, + "learning_rate": 4.15948085019738e-06, + "loss": 0.2678, + "step": 472 + }, + { + "epoch": 4.042735042735043, + "grad_norm": 0.8918685147268757, + "learning_rate": 4.156131112941509e-06, + "loss": 0.4549, + "step": 473 + }, + { + "epoch": 4.051282051282051, + "grad_norm": 1.028450458974256, + "learning_rate": 4.152776068795416e-06, + "loss": 0.5193, + "step": 474 + }, + { + "epoch": 4.05982905982906, + "grad_norm": 0.8593042738505275, + "learning_rate": 4.149415728509971e-06, + "loss": 0.4555, + "step": 475 + }, + { + "epoch": 4.068376068376068, + "grad_norm": 0.8941008995651688, + "learning_rate": 4.146050102853015e-06, + "loss": 0.3665, + "step": 476 + }, + { + "epoch": 4.076923076923077, + "grad_norm": 0.9327966593805307, + "learning_rate": 4.1426792026093274e-06, + "loss": 0.6724, + "step": 477 + }, + { + "epoch": 4.085470085470085, + "grad_norm": 0.8100547548689432, + "learning_rate": 4.139303038580586e-06, + "loss": 0.3283, + "step": 478 + }, + { + "epoch": 4.094017094017094, + "grad_norm": 0.719264457249393, + "learning_rate": 4.135921621585338e-06, + "loss": 0.2572, + "step": 479 + }, + { + "epoch": 4.102564102564102, + "grad_norm": 0.8641442922634487, + "learning_rate": 4.1325349624589625e-06, + "loss": 0.4605, + "step": 480 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.8430072813650188, + "learning_rate": 4.129143072053639e-06, + "loss": 0.3969, + "step": 481 + }, + { + "epoch": 4.119658119658119, + "grad_norm": 0.7899792800908239, + "learning_rate": 4.125745961238305e-06, + "loss": 0.2622, + "step": 482 + }, + { + "epoch": 4.128205128205128, + "grad_norm": 0.8939354624686937, + "learning_rate": 4.122343640898628e-06, + "loss": 0.4555, + "step": 483 + }, + { + "epoch": 4.136752136752137, + "grad_norm": 0.8543201855817723, + "learning_rate": 4.118936121936973e-06, + "loss": 0.4876, + "step": 484 + }, + { + "epoch": 4.145299145299146, + "grad_norm": 0.6680196413010284, + "learning_rate": 4.115523415272358e-06, + "loss": 0.3388, + "step": 485 + }, + { + "epoch": 4.153846153846154, + "grad_norm": 0.9087424068651639, + "learning_rate": 4.112105531840427e-06, + "loss": 0.6114, + "step": 486 + }, + { + "epoch": 4.162393162393163, + "grad_norm": 0.8714388993623177, + "learning_rate": 4.1086824825934126e-06, + "loss": 0.2663, + "step": 487 + }, + { + "epoch": 4.170940170940171, + "grad_norm": 0.9931837553159588, + "learning_rate": 4.1052542785001e-06, + "loss": 0.264, + "step": 488 + }, + { + "epoch": 4.17948717948718, + "grad_norm": 0.9239576219551864, + "learning_rate": 4.101820930545792e-06, + "loss": 0.4188, + "step": 489 + }, + { + "epoch": 4.188034188034188, + "grad_norm": 0.8835490304209346, + "learning_rate": 4.098382449732276e-06, + "loss": 0.4669, + "step": 490 + }, + { + "epoch": 4.196581196581197, + "grad_norm": 1.0141618276410034, + "learning_rate": 4.094938847077784e-06, + "loss": 0.5037, + "step": 491 + }, + { + "epoch": 4.205128205128205, + "grad_norm": 0.8527934027685333, + "learning_rate": 4.091490133616965e-06, + "loss": 0.22, + "step": 492 + }, + { + "epoch": 4.213675213675214, + "grad_norm": 0.8461432923951197, + "learning_rate": 4.08803632040084e-06, + "loss": 0.2893, + "step": 493 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.741023345781096, + "learning_rate": 4.084577418496775e-06, + "loss": 0.3295, + "step": 494 + }, + { + "epoch": 4.230769230769231, + "grad_norm": 0.7688995720250164, + "learning_rate": 4.081113438988443e-06, + "loss": 0.2804, + "step": 495 + }, + { + "epoch": 4.239316239316239, + "grad_norm": 0.9418546280795675, + "learning_rate": 4.077644392975785e-06, + "loss": 0.2362, + "step": 496 + }, + { + "epoch": 4.247863247863248, + "grad_norm": 0.8808691764925719, + "learning_rate": 4.074170291574975e-06, + "loss": 0.4459, + "step": 497 + }, + { + "epoch": 4.256410256410256, + "grad_norm": 0.8611529751464202, + "learning_rate": 4.0706911459183915e-06, + "loss": 0.4481, + "step": 498 + }, + { + "epoch": 4.264957264957265, + "grad_norm": 0.689017158073508, + "learning_rate": 4.067206967154575e-06, + "loss": 0.2915, + "step": 499 + }, + { + "epoch": 4.273504273504273, + "grad_norm": 0.8342711752591144, + "learning_rate": 4.063717766448194e-06, + "loss": 0.3025, + "step": 500 + }, + { + "epoch": 4.282051282051282, + "grad_norm": 0.7552302159412333, + "learning_rate": 4.060223554980007e-06, + "loss": 0.4213, + "step": 501 + }, + { + "epoch": 4.2905982905982905, + "grad_norm": 0.7055560735263742, + "learning_rate": 4.056724343946832e-06, + "loss": 0.168, + "step": 502 + }, + { + "epoch": 4.299145299145299, + "grad_norm": 0.8345661950385564, + "learning_rate": 4.053220144561506e-06, + "loss": 0.2601, + "step": 503 + }, + { + "epoch": 4.3076923076923075, + "grad_norm": 0.7767602930561285, + "learning_rate": 4.049710968052851e-06, + "loss": 0.4115, + "step": 504 + }, + { + "epoch": 4.316239316239316, + "grad_norm": 0.7597842473349811, + "learning_rate": 4.046196825665638e-06, + "loss": 0.3013, + "step": 505 + }, + { + "epoch": 4.3247863247863245, + "grad_norm": 0.8218417665919933, + "learning_rate": 4.042677728660549e-06, + "loss": 0.2268, + "step": 506 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.8387501902514646, + "learning_rate": 4.039153688314146e-06, + "loss": 0.4442, + "step": 507 + }, + { + "epoch": 4.3418803418803416, + "grad_norm": 0.9426516698668568, + "learning_rate": 4.035624715918827e-06, + "loss": 0.4359, + "step": 508 + }, + { + "epoch": 4.35042735042735, + "grad_norm": 0.8098999588083045, + "learning_rate": 4.032090822782798e-06, + "loss": 0.3859, + "step": 509 + }, + { + "epoch": 4.358974358974359, + "grad_norm": 0.8245264905162911, + "learning_rate": 4.028552020230031e-06, + "loss": 0.29, + "step": 510 + }, + { + "epoch": 4.367521367521368, + "grad_norm": 0.8375221036962769, + "learning_rate": 4.0250083196002285e-06, + "loss": 0.3568, + "step": 511 + }, + { + "epoch": 4.3760683760683765, + "grad_norm": 0.7413180645147043, + "learning_rate": 4.021459732248792e-06, + "loss": 0.2319, + "step": 512 + }, + { + "epoch": 4.384615384615385, + "grad_norm": 6.263164745538754, + "learning_rate": 4.017906269546778e-06, + "loss": 0.2853, + "step": 513 + }, + { + "epoch": 4.3931623931623935, + "grad_norm": 0.9453800593068873, + "learning_rate": 4.014347942880869e-06, + "loss": 0.2482, + "step": 514 + }, + { + "epoch": 4.401709401709402, + "grad_norm": 0.803412607202504, + "learning_rate": 4.0107847636533314e-06, + "loss": 0.274, + "step": 515 + }, + { + "epoch": 4.410256410256411, + "grad_norm": 0.6071379342770246, + "learning_rate": 4.0072167432819804e-06, + "loss": 0.2204, + "step": 516 + }, + { + "epoch": 4.418803418803419, + "grad_norm": 0.8526809501053975, + "learning_rate": 4.003643893200148e-06, + "loss": 0.3746, + "step": 517 + }, + { + "epoch": 4.427350427350428, + "grad_norm": 1.1170624575831807, + "learning_rate": 4.000066224856636e-06, + "loss": 0.4389, + "step": 518 + }, + { + "epoch": 4.435897435897436, + "grad_norm": 0.8714209677695344, + "learning_rate": 3.996483749715694e-06, + "loss": 0.3663, + "step": 519 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.8681016473178148, + "learning_rate": 3.992896479256966e-06, + "loss": 0.3868, + "step": 520 + }, + { + "epoch": 4.452991452991453, + "grad_norm": 0.8431474276532333, + "learning_rate": 3.989304424975468e-06, + "loss": 0.2684, + "step": 521 + }, + { + "epoch": 4.461538461538462, + "grad_norm": 0.8768461034809396, + "learning_rate": 3.985707598381544e-06, + "loss": 0.439, + "step": 522 + }, + { + "epoch": 4.47008547008547, + "grad_norm": 0.8417846294982461, + "learning_rate": 3.9821060110008295e-06, + "loss": 0.422, + "step": 523 + }, + { + "epoch": 4.478632478632479, + "grad_norm": 0.8997660894045065, + "learning_rate": 3.978499674374214e-06, + "loss": 0.4011, + "step": 524 + }, + { + "epoch": 4.487179487179487, + "grad_norm": 0.8281409585698364, + "learning_rate": 3.974888600057808e-06, + "loss": 0.2545, + "step": 525 + }, + { + "epoch": 4.495726495726496, + "grad_norm": 0.9489099979655634, + "learning_rate": 3.971272799622903e-06, + "loss": 0.4931, + "step": 526 + }, + { + "epoch": 4.504273504273504, + "grad_norm": 0.8608170335327134, + "learning_rate": 3.967652284655933e-06, + "loss": 0.2562, + "step": 527 + }, + { + "epoch": 4.512820512820513, + "grad_norm": 0.8290547376675292, + "learning_rate": 3.964027066758442e-06, + "loss": 0.2171, + "step": 528 + }, + { + "epoch": 4.521367521367521, + "grad_norm": 0.8608815039412893, + "learning_rate": 3.960397157547043e-06, + "loss": 0.3892, + "step": 529 + }, + { + "epoch": 4.52991452991453, + "grad_norm": 0.965861951508712, + "learning_rate": 3.956762568653378e-06, + "loss": 0.4057, + "step": 530 + }, + { + "epoch": 4.538461538461538, + "grad_norm": 0.8733549209558379, + "learning_rate": 3.953123311724092e-06, + "loss": 0.4154, + "step": 531 + }, + { + "epoch": 4.547008547008547, + "grad_norm": 0.6653677165284325, + "learning_rate": 3.9494793984207815e-06, + "loss": 0.2331, + "step": 532 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 1.004281844464304, + "learning_rate": 3.945830840419966e-06, + "loss": 0.4135, + "step": 533 + }, + { + "epoch": 4.564102564102564, + "grad_norm": 0.7585768676680708, + "learning_rate": 3.942177649413051e-06, + "loss": 0.4307, + "step": 534 + }, + { + "epoch": 4.572649572649572, + "grad_norm": 0.8861421749427683, + "learning_rate": 3.938519837106284e-06, + "loss": 0.2649, + "step": 535 + }, + { + "epoch": 4.581196581196581, + "grad_norm": 0.7497323044095112, + "learning_rate": 3.9348574152207245e-06, + "loss": 0.3153, + "step": 536 + }, + { + "epoch": 4.589743589743589, + "grad_norm": 0.7485592898441834, + "learning_rate": 3.931190395492198e-06, + "loss": 0.2989, + "step": 537 + }, + { + "epoch": 4.598290598290598, + "grad_norm": 0.8109828165333799, + "learning_rate": 3.92751878967127e-06, + "loss": 0.4264, + "step": 538 + }, + { + "epoch": 4.6068376068376065, + "grad_norm": 0.7435044944321713, + "learning_rate": 3.923842609523195e-06, + "loss": 0.3741, + "step": 539 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 0.6527824018088557, + "learning_rate": 3.92016186682789e-06, + "loss": 0.2872, + "step": 540 + }, + { + "epoch": 4.6239316239316235, + "grad_norm": 0.8364632365210702, + "learning_rate": 3.91647657337989e-06, + "loss": 0.4051, + "step": 541 + }, + { + "epoch": 4.632478632478632, + "grad_norm": 0.7308772252045037, + "learning_rate": 3.9127867409883145e-06, + "loss": 0.4665, + "step": 542 + }, + { + "epoch": 4.641025641025641, + "grad_norm": 0.714687471906677, + "learning_rate": 3.909092381476824e-06, + "loss": 0.3013, + "step": 543 + }, + { + "epoch": 4.64957264957265, + "grad_norm": 0.9203048234486699, + "learning_rate": 3.905393506683589e-06, + "loss": 0.5556, + "step": 544 + }, + { + "epoch": 4.6581196581196584, + "grad_norm": 0.8296100085678216, + "learning_rate": 3.901690128461248e-06, + "loss": 0.3578, + "step": 545 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 1.057400490981615, + "learning_rate": 3.897982258676867e-06, + "loss": 0.4101, + "step": 546 + }, + { + "epoch": 4.6752136752136755, + "grad_norm": 1.0095630702142258, + "learning_rate": 3.894269909211911e-06, + "loss": 0.4108, + "step": 547 + }, + { + "epoch": 4.683760683760684, + "grad_norm": 0.8490424531903906, + "learning_rate": 3.890553091962193e-06, + "loss": 0.3049, + "step": 548 + }, + { + "epoch": 4.6923076923076925, + "grad_norm": 0.8836815839622476, + "learning_rate": 3.8868318188378475e-06, + "loss": 0.2335, + "step": 549 + }, + { + "epoch": 4.700854700854701, + "grad_norm": 0.7553422146759685, + "learning_rate": 3.883106101763285e-06, + "loss": 0.3286, + "step": 550 + }, + { + "epoch": 4.7094017094017095, + "grad_norm": 0.9266604794055423, + "learning_rate": 3.879375952677156e-06, + "loss": 0.3113, + "step": 551 + }, + { + "epoch": 4.717948717948718, + "grad_norm": 0.7998462296426596, + "learning_rate": 3.875641383532313e-06, + "loss": 0.3974, + "step": 552 + }, + { + "epoch": 4.726495726495727, + "grad_norm": 0.7721339739950999, + "learning_rate": 3.871902406295775e-06, + "loss": 0.2705, + "step": 553 + }, + { + "epoch": 4.735042735042735, + "grad_norm": 0.7208568188475144, + "learning_rate": 3.868159032948681e-06, + "loss": 0.3908, + "step": 554 + }, + { + "epoch": 4.743589743589744, + "grad_norm": 0.886277830497899, + "learning_rate": 3.8644112754862614e-06, + "loss": 0.4494, + "step": 555 + }, + { + "epoch": 4.752136752136752, + "grad_norm": 0.9008645068145198, + "learning_rate": 3.860659145917794e-06, + "loss": 0.3228, + "step": 556 + }, + { + "epoch": 4.760683760683761, + "grad_norm": 0.7673395207334885, + "learning_rate": 3.856902656266563e-06, + "loss": 0.3048, + "step": 557 + }, + { + "epoch": 4.769230769230769, + "grad_norm": 0.8698675752196239, + "learning_rate": 3.853141818569829e-06, + "loss": 0.3509, + "step": 558 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 1.050646507319603, + "learning_rate": 3.849376644878783e-06, + "loss": 0.2981, + "step": 559 + }, + { + "epoch": 4.786324786324786, + "grad_norm": 0.7929756502011445, + "learning_rate": 3.84560714725851e-06, + "loss": 0.3696, + "step": 560 + }, + { + "epoch": 4.794871794871795, + "grad_norm": 0.7303218759520699, + "learning_rate": 3.841833337787951e-06, + "loss": 0.4221, + "step": 561 + }, + { + "epoch": 4.803418803418803, + "grad_norm": 0.9455279566847924, + "learning_rate": 3.838055228559864e-06, + "loss": 0.3173, + "step": 562 + }, + { + "epoch": 4.811965811965812, + "grad_norm": 0.8120892388463951, + "learning_rate": 3.834272831680785e-06, + "loss": 0.4401, + "step": 563 + }, + { + "epoch": 4.82051282051282, + "grad_norm": 0.9040278606499692, + "learning_rate": 3.830486159270991e-06, + "loss": 0.4469, + "step": 564 + }, + { + "epoch": 4.829059829059829, + "grad_norm": 0.7713925407833625, + "learning_rate": 3.826695223464455e-06, + "loss": 0.4895, + "step": 565 + }, + { + "epoch": 4.837606837606837, + "grad_norm": 0.7906937990955305, + "learning_rate": 3.822900036408815e-06, + "loss": 0.4848, + "step": 566 + }, + { + "epoch": 4.846153846153846, + "grad_norm": 0.7618954650882707, + "learning_rate": 3.819100610265332e-06, + "loss": 0.3965, + "step": 567 + }, + { + "epoch": 4.854700854700854, + "grad_norm": 0.8218662694671668, + "learning_rate": 3.815296957208849e-06, + "loss": 0.3004, + "step": 568 + }, + { + "epoch": 4.863247863247864, + "grad_norm": 0.9075356752746051, + "learning_rate": 3.811489089427756e-06, + "loss": 0.3602, + "step": 569 + }, + { + "epoch": 4.871794871794872, + "grad_norm": 1.0079500894091658, + "learning_rate": 3.8076770191239444e-06, + "loss": 0.6662, + "step": 570 + }, + { + "epoch": 4.880341880341881, + "grad_norm": 0.9358756861041553, + "learning_rate": 3.8038607585127762e-06, + "loss": 0.6168, + "step": 571 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.7803465983190693, + "learning_rate": 3.8000403198230385e-06, + "loss": 0.2579, + "step": 572 + }, + { + "epoch": 4.897435897435898, + "grad_norm": 0.7743839576202268, + "learning_rate": 3.7962157152969093e-06, + "loss": 0.2667, + "step": 573 + }, + { + "epoch": 4.905982905982906, + "grad_norm": 0.9137844362982184, + "learning_rate": 3.7923869571899115e-06, + "loss": 0.2822, + "step": 574 + }, + { + "epoch": 4.914529914529915, + "grad_norm": 0.9527902955445767, + "learning_rate": 3.7885540577708806e-06, + "loss": 0.4376, + "step": 575 + }, + { + "epoch": 4.923076923076923, + "grad_norm": 0.9681874312358377, + "learning_rate": 3.7847170293219223e-06, + "loss": 0.3069, + "step": 576 + }, + { + "epoch": 4.931623931623932, + "grad_norm": 0.8238895402593656, + "learning_rate": 3.780875884138372e-06, + "loss": 0.1889, + "step": 577 + }, + { + "epoch": 4.94017094017094, + "grad_norm": 0.918981736118876, + "learning_rate": 3.7770306345287577e-06, + "loss": 0.4307, + "step": 578 + }, + { + "epoch": 4.948717948717949, + "grad_norm": 0.8454182927023547, + "learning_rate": 3.7731812928147593e-06, + "loss": 0.3387, + "step": 579 + }, + { + "epoch": 4.957264957264957, + "grad_norm": 0.7835233380602107, + "learning_rate": 3.76932787133117e-06, + "loss": 0.2332, + "step": 580 + }, + { + "epoch": 4.965811965811966, + "grad_norm": 0.8919386512222267, + "learning_rate": 3.7654703824258544e-06, + "loss": 0.4062, + "step": 581 + }, + { + "epoch": 4.9743589743589745, + "grad_norm": 1.0307902409547174, + "learning_rate": 3.7616088384597138e-06, + "loss": 0.6783, + "step": 582 + }, + { + "epoch": 4.982905982905983, + "grad_norm": 0.8082634397054688, + "learning_rate": 3.757743251806639e-06, + "loss": 0.3732, + "step": 583 + }, + { + "epoch": 4.9914529914529915, + "grad_norm": 0.7740530355915068, + "learning_rate": 3.753873634853481e-06, + "loss": 0.376, + "step": 584 + }, + { + "epoch": 5.0, + "grad_norm": 0.8908301366802671, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.3181, + "step": 585 + }, + { + "epoch": 5.0085470085470085, + "grad_norm": 0.8069432751813113, + "learning_rate": 3.746122359658834e-06, + "loss": 0.3528, + "step": 586 + }, + { + "epoch": 5.017094017094017, + "grad_norm": 0.8379587111320108, + "learning_rate": 3.7422407262554567e-06, + "loss": 0.5588, + "step": 587 + }, + { + "epoch": 5.0256410256410255, + "grad_norm": 0.7122032449476201, + "learning_rate": 3.738355112228134e-06, + "loss": 0.2672, + "step": 588 + }, + { + "epoch": 5.034188034188034, + "grad_norm": 0.9524584213957907, + "learning_rate": 3.7344655300278887e-06, + "loss": 0.4974, + "step": 589 + }, + { + "epoch": 5.042735042735043, + "grad_norm": 0.8103317648549345, + "learning_rate": 3.7305719921184626e-06, + "loss": 0.2324, + "step": 590 + }, + { + "epoch": 5.051282051282051, + "grad_norm": 0.7079055681673191, + "learning_rate": 3.7266745109762668e-06, + "loss": 0.2059, + "step": 591 + }, + { + "epoch": 5.05982905982906, + "grad_norm": 0.9947949971700039, + "learning_rate": 3.7227730990903556e-06, + "loss": 0.4619, + "step": 592 + }, + { + "epoch": 5.068376068376068, + "grad_norm": 1.233347361263019, + "learning_rate": 3.718867768962371e-06, + "loss": 0.2852, + "step": 593 + }, + { + "epoch": 5.076923076923077, + "grad_norm": 0.9632110639765008, + "learning_rate": 3.714958533106515e-06, + "loss": 0.4309, + "step": 594 + }, + { + "epoch": 5.085470085470085, + "grad_norm": 0.8286550755670258, + "learning_rate": 3.711045404049507e-06, + "loss": 0.3396, + "step": 595 + }, + { + "epoch": 5.094017094017094, + "grad_norm": 0.9713598976350867, + "learning_rate": 3.7071283943305367e-06, + "loss": 0.3712, + "step": 596 + }, + { + "epoch": 5.102564102564102, + "grad_norm": 0.8550775572776252, + "learning_rate": 3.7032075165012323e-06, + "loss": 0.261, + "step": 597 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 1.181204605664654, + "learning_rate": 3.699282783125616e-06, + "loss": 0.4192, + "step": 598 + }, + { + "epoch": 5.119658119658119, + "grad_norm": 0.8230527538892907, + "learning_rate": 3.6953542067800647e-06, + "loss": 0.4015, + "step": 599 + }, + { + "epoch": 5.128205128205128, + "grad_norm": 0.662879386627582, + "learning_rate": 3.6914218000532697e-06, + "loss": 0.2604, + "step": 600 + }, + { + "epoch": 5.136752136752137, + "grad_norm": 0.8571575604386643, + "learning_rate": 3.6874855755461975e-06, + "loss": 0.2278, + "step": 601 + }, + { + "epoch": 5.145299145299146, + "grad_norm": 0.7657497421323586, + "learning_rate": 3.683545545872045e-06, + "loss": 0.236, + "step": 602 + }, + { + "epoch": 5.153846153846154, + "grad_norm": 0.9438673580943562, + "learning_rate": 3.679601723656205e-06, + "loss": 0.4039, + "step": 603 + }, + { + "epoch": 5.162393162393163, + "grad_norm": 0.7320407216705076, + "learning_rate": 3.675654121536225e-06, + "loss": 0.1617, + "step": 604 + }, + { + "epoch": 5.170940170940171, + "grad_norm": 0.9168657405228899, + "learning_rate": 3.6717027521617593e-06, + "loss": 0.3958, + "step": 605 + }, + { + "epoch": 5.17948717948718, + "grad_norm": 0.8856897705058108, + "learning_rate": 3.667747628194539e-06, + "loss": 0.3021, + "step": 606 + }, + { + "epoch": 5.188034188034188, + "grad_norm": 0.8848451433593771, + "learning_rate": 3.6637887623083235e-06, + "loss": 0.284, + "step": 607 + }, + { + "epoch": 5.196581196581197, + "grad_norm": 0.7931777936733362, + "learning_rate": 3.6598261671888623e-06, + "loss": 0.2598, + "step": 608 + }, + { + "epoch": 5.205128205128205, + "grad_norm": 0.9647752279410728, + "learning_rate": 3.655859855533859e-06, + "loss": 0.2347, + "step": 609 + }, + { + "epoch": 5.213675213675214, + "grad_norm": 0.8027939690593247, + "learning_rate": 3.651889840052922e-06, + "loss": 0.2356, + "step": 610 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 0.8976635229956506, + "learning_rate": 3.6479161334675294e-06, + "loss": 0.3416, + "step": 611 + }, + { + "epoch": 5.230769230769231, + "grad_norm": 0.7907973289926297, + "learning_rate": 3.643938748510989e-06, + "loss": 0.2042, + "step": 612 + }, + { + "epoch": 5.239316239316239, + "grad_norm": 0.9851284719548342, + "learning_rate": 3.6399576979283914e-06, + "loss": 0.4832, + "step": 613 + }, + { + "epoch": 5.247863247863248, + "grad_norm": 0.757028631944513, + "learning_rate": 3.6359729944765785e-06, + "loss": 0.3568, + "step": 614 + }, + { + "epoch": 5.256410256410256, + "grad_norm": 1.0804566273764746, + "learning_rate": 3.631984650924094e-06, + "loss": 0.3044, + "step": 615 + }, + { + "epoch": 5.264957264957265, + "grad_norm": 1.0035092854743617, + "learning_rate": 3.6279926800511455e-06, + "loss": 0.5795, + "step": 616 + }, + { + "epoch": 5.273504273504273, + "grad_norm": 0.833070471688987, + "learning_rate": 3.623997094649566e-06, + "loss": 0.2358, + "step": 617 + }, + { + "epoch": 5.282051282051282, + "grad_norm": 10.073832467148229, + "learning_rate": 3.6199979075227707e-06, + "loss": 0.2122, + "step": 618 + }, + { + "epoch": 5.2905982905982905, + "grad_norm": 0.9327671566598944, + "learning_rate": 3.6159951314857145e-06, + "loss": 0.1887, + "step": 619 + }, + { + "epoch": 5.299145299145299, + "grad_norm": 0.8129982683347555, + "learning_rate": 3.6119887793648535e-06, + "loss": 0.3573, + "step": 620 + }, + { + "epoch": 5.3076923076923075, + "grad_norm": 0.9313154955914471, + "learning_rate": 3.607978863998104e-06, + "loss": 0.2051, + "step": 621 + }, + { + "epoch": 5.316239316239316, + "grad_norm": 0.7081388827230642, + "learning_rate": 3.6039653982347977e-06, + "loss": 0.136, + "step": 622 + }, + { + "epoch": 5.3247863247863245, + "grad_norm": 0.9219903379895183, + "learning_rate": 3.5999483949356458e-06, + "loss": 0.1471, + "step": 623 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.7251881597091862, + "learning_rate": 3.595927866972694e-06, + "loss": 0.2349, + "step": 624 + }, + { + "epoch": 5.3418803418803416, + "grad_norm": 0.9020851683327609, + "learning_rate": 3.5919038272292824e-06, + "loss": 0.3872, + "step": 625 + }, + { + "epoch": 5.35042735042735, + "grad_norm": 0.6985204767840774, + "learning_rate": 3.587876288600004e-06, + "loss": 0.1627, + "step": 626 + }, + { + "epoch": 5.358974358974359, + "grad_norm": 0.7857839377226197, + "learning_rate": 3.583845263990664e-06, + "loss": 0.1391, + "step": 627 + }, + { + "epoch": 5.367521367521368, + "grad_norm": 0.7809786139005731, + "learning_rate": 3.5798107663182386e-06, + "loss": 0.2627, + "step": 628 + }, + { + "epoch": 5.3760683760683765, + "grad_norm": 0.626075541867184, + "learning_rate": 3.5757728085108318e-06, + "loss": 0.1824, + "step": 629 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 0.9968721295278316, + "learning_rate": 3.5717314035076355e-06, + "loss": 0.2707, + "step": 630 + }, + { + "epoch": 5.3931623931623935, + "grad_norm": 0.643224962362624, + "learning_rate": 3.5676865642588894e-06, + "loss": 0.206, + "step": 631 + }, + { + "epoch": 5.401709401709402, + "grad_norm": 0.8976091064580924, + "learning_rate": 3.563638303725835e-06, + "loss": 0.3809, + "step": 632 + }, + { + "epoch": 5.410256410256411, + "grad_norm": 0.8108000905897182, + "learning_rate": 3.559586634880679e-06, + "loss": 0.3308, + "step": 633 + }, + { + "epoch": 5.418803418803419, + "grad_norm": 0.9784876560845137, + "learning_rate": 3.5555315707065496e-06, + "loss": 0.2254, + "step": 634 + }, + { + "epoch": 5.427350427350428, + "grad_norm": 0.9203036555176567, + "learning_rate": 3.551473124197454e-06, + "loss": 0.223, + "step": 635 + }, + { + "epoch": 5.435897435897436, + "grad_norm": 1.0744310232489362, + "learning_rate": 3.5474113083582382e-06, + "loss": 0.4916, + "step": 636 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 0.8487492865506904, + "learning_rate": 3.543346136204545e-06, + "loss": 0.4219, + "step": 637 + }, + { + "epoch": 5.452991452991453, + "grad_norm": 1.067112173154662, + "learning_rate": 3.539277620762772e-06, + "loss": 0.4644, + "step": 638 + }, + { + "epoch": 5.461538461538462, + "grad_norm": 0.7961984213609168, + "learning_rate": 3.53520577507003e-06, + "loss": 0.3426, + "step": 639 + }, + { + "epoch": 5.47008547008547, + "grad_norm": 0.9902480110932708, + "learning_rate": 3.5311306121741017e-06, + "loss": 0.2114, + "step": 640 + }, + { + "epoch": 5.478632478632479, + "grad_norm": 0.7549750491704087, + "learning_rate": 3.5270521451333984e-06, + "loss": 0.2697, + "step": 641 + }, + { + "epoch": 5.487179487179487, + "grad_norm": 0.8077593813783904, + "learning_rate": 3.522970387016919e-06, + "loss": 0.235, + "step": 642 + }, + { + "epoch": 5.495726495726496, + "grad_norm": 1.1397711670646056, + "learning_rate": 3.5188853509042105e-06, + "loss": 0.3049, + "step": 643 + }, + { + "epoch": 5.504273504273504, + "grad_norm": 0.9371126916927187, + "learning_rate": 3.5147970498853214e-06, + "loss": 0.2272, + "step": 644 + }, + { + "epoch": 5.512820512820513, + "grad_norm": 0.85247512093159, + "learning_rate": 3.5107054970607624e-06, + "loss": 0.1913, + "step": 645 + }, + { + "epoch": 5.521367521367521, + "grad_norm": 0.6709218528404395, + "learning_rate": 3.5066107055414677e-06, + "loss": 0.1069, + "step": 646 + }, + { + "epoch": 5.52991452991453, + "grad_norm": 0.9707847068514217, + "learning_rate": 3.5025126884487447e-06, + "loss": 0.3735, + "step": 647 + }, + { + "epoch": 5.538461538461538, + "grad_norm": 0.8774435456307486, + "learning_rate": 3.4984114589142388e-06, + "loss": 0.1399, + "step": 648 + }, + { + "epoch": 5.547008547008547, + "grad_norm": 0.703068990656212, + "learning_rate": 3.4943070300798913e-06, + "loss": 0.1523, + "step": 649 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.9322284365828467, + "learning_rate": 3.4901994150978926e-06, + "loss": 0.542, + "step": 650 + }, + { + "epoch": 5.564102564102564, + "grad_norm": 0.8731981374849843, + "learning_rate": 3.4860886271306433e-06, + "loss": 0.2596, + "step": 651 + }, + { + "epoch": 5.572649572649572, + "grad_norm": 0.7056414707665756, + "learning_rate": 3.481974679350712e-06, + "loss": 0.2854, + "step": 652 + }, + { + "epoch": 5.581196581196581, + "grad_norm": 0.9683966340382449, + "learning_rate": 3.4778575849407924e-06, + "loss": 0.1741, + "step": 653 + }, + { + "epoch": 5.589743589743589, + "grad_norm": 0.8837532066130346, + "learning_rate": 3.473737357093662e-06, + "loss": 0.1644, + "step": 654 + }, + { + "epoch": 5.598290598290598, + "grad_norm": 0.885697744276433, + "learning_rate": 3.4696140090121377e-06, + "loss": 0.1639, + "step": 655 + }, + { + "epoch": 5.6068376068376065, + "grad_norm": 0.6822269980868856, + "learning_rate": 3.465487553909035e-06, + "loss": 0.2013, + "step": 656 + }, + { + "epoch": 5.615384615384615, + "grad_norm": 0.8666554158855085, + "learning_rate": 3.461358005007128e-06, + "loss": 0.3809, + "step": 657 + }, + { + "epoch": 5.6239316239316235, + "grad_norm": 0.8424133430662399, + "learning_rate": 3.4572253755390996e-06, + "loss": 0.4638, + "step": 658 + }, + { + "epoch": 5.632478632478632, + "grad_norm": 0.7084488043778324, + "learning_rate": 3.4530896787475083e-06, + "loss": 0.2495, + "step": 659 + }, + { + "epoch": 5.641025641025641, + "grad_norm": 0.7454913434451906, + "learning_rate": 3.4489509278847415e-06, + "loss": 0.1966, + "step": 660 + }, + { + "epoch": 5.64957264957265, + "grad_norm": 0.7489666539547467, + "learning_rate": 3.44480913621297e-06, + "loss": 0.3321, + "step": 661 + }, + { + "epoch": 5.6581196581196584, + "grad_norm": 0.8058163715427511, + "learning_rate": 3.44066431700411e-06, + "loss": 0.1957, + "step": 662 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.7440212094445409, + "learning_rate": 3.436516483539781e-06, + "loss": 0.2801, + "step": 663 + }, + { + "epoch": 5.6752136752136755, + "grad_norm": 0.8916063043156528, + "learning_rate": 3.432365649111257e-06, + "loss": 0.3145, + "step": 664 + }, + { + "epoch": 5.683760683760684, + "grad_norm": 0.7370090850118274, + "learning_rate": 3.428211827019434e-06, + "loss": 0.2048, + "step": 665 + }, + { + "epoch": 5.6923076923076925, + "grad_norm": 0.5871642655097474, + "learning_rate": 3.4240550305747776e-06, + "loss": 0.1435, + "step": 666 + }, + { + "epoch": 5.700854700854701, + "grad_norm": 0.891319020868617, + "learning_rate": 3.4198952730972845e-06, + "loss": 0.1379, + "step": 667 + }, + { + "epoch": 5.7094017094017095, + "grad_norm": 0.9071253346493615, + "learning_rate": 3.4157325679164416e-06, + "loss": 0.2694, + "step": 668 + }, + { + "epoch": 5.717948717948718, + "grad_norm": 0.7319345015547237, + "learning_rate": 3.4115669283711795e-06, + "loss": 0.1798, + "step": 669 + }, + { + "epoch": 5.726495726495727, + "grad_norm": 0.8775265149736555, + "learning_rate": 3.407398367809832e-06, + "loss": 0.3048, + "step": 670 + }, + { + "epoch": 5.735042735042735, + "grad_norm": 0.9093807763167042, + "learning_rate": 3.403226899590096e-06, + "loss": 0.2582, + "step": 671 + }, + { + "epoch": 5.743589743589744, + "grad_norm": 0.8072037853041546, + "learning_rate": 3.3990525370789793e-06, + "loss": 0.3216, + "step": 672 + }, + { + "epoch": 5.752136752136752, + "grad_norm": 0.7283202704225692, + "learning_rate": 3.3948752936527722e-06, + "loss": 0.1974, + "step": 673 + }, + { + "epoch": 5.760683760683761, + "grad_norm": 1.052656186926491, + "learning_rate": 3.3906951826969905e-06, + "loss": 0.2879, + "step": 674 + }, + { + "epoch": 5.769230769230769, + "grad_norm": 0.9222727700013065, + "learning_rate": 3.386512217606339e-06, + "loss": 0.3203, + "step": 675 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.6737226893721318, + "learning_rate": 3.3823264117846722e-06, + "loss": 0.2091, + "step": 676 + }, + { + "epoch": 5.786324786324786, + "grad_norm": 0.8344812169388631, + "learning_rate": 3.378137778644945e-06, + "loss": 0.1914, + "step": 677 + }, + { + "epoch": 5.794871794871795, + "grad_norm": 0.7313452201207752, + "learning_rate": 3.3739463316091696e-06, + "loss": 0.2082, + "step": 678 + }, + { + "epoch": 5.803418803418803, + "grad_norm": 1.0196584768311239, + "learning_rate": 3.369752084108381e-06, + "loss": 0.3903, + "step": 679 + }, + { + "epoch": 5.811965811965812, + "grad_norm": 0.8240003204407577, + "learning_rate": 3.3655550495825824e-06, + "loss": 0.2762, + "step": 680 + }, + { + "epoch": 5.82051282051282, + "grad_norm": 0.7512931928983033, + "learning_rate": 3.3613552414807093e-06, + "loss": 0.2175, + "step": 681 + }, + { + "epoch": 5.829059829059829, + "grad_norm": 0.6871042497938531, + "learning_rate": 3.3571526732605875e-06, + "loss": 0.2537, + "step": 682 + }, + { + "epoch": 5.837606837606837, + "grad_norm": 0.7647048425944158, + "learning_rate": 3.352947358388884e-06, + "loss": 0.316, + "step": 683 + }, + { + "epoch": 5.846153846153846, + "grad_norm": 1.0523474677444042, + "learning_rate": 3.3487393103410683e-06, + "loss": 0.3565, + "step": 684 + }, + { + "epoch": 5.854700854700854, + "grad_norm": 1.0053910019090018, + "learning_rate": 3.3445285426013683e-06, + "loss": 0.2339, + "step": 685 + }, + { + "epoch": 5.863247863247864, + "grad_norm": 1.1075820032173571, + "learning_rate": 3.3403150686627267e-06, + "loss": 0.1635, + "step": 686 + }, + { + "epoch": 5.871794871794872, + "grad_norm": 0.850285980968383, + "learning_rate": 3.336098902026758e-06, + "loss": 0.2759, + "step": 687 + }, + { + "epoch": 5.880341880341881, + "grad_norm": 0.9845363196465164, + "learning_rate": 3.331880056203706e-06, + "loss": 0.2653, + "step": 688 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.7994207294988291, + "learning_rate": 3.3276585447123957e-06, + "loss": 0.3057, + "step": 689 + }, + { + "epoch": 5.897435897435898, + "grad_norm": 0.9930164416674668, + "learning_rate": 3.3234343810801995e-06, + "loss": 0.3275, + "step": 690 + }, + { + "epoch": 5.905982905982906, + "grad_norm": 0.8107243099975281, + "learning_rate": 3.319207578842985e-06, + "loss": 0.2233, + "step": 691 + }, + { + "epoch": 5.914529914529915, + "grad_norm": 0.8979682402306377, + "learning_rate": 3.314978151545076e-06, + "loss": 0.4968, + "step": 692 + }, + { + "epoch": 5.923076923076923, + "grad_norm": 1.037164824170856, + "learning_rate": 3.3107461127392072e-06, + "loss": 0.2247, + "step": 693 + }, + { + "epoch": 5.931623931623932, + "grad_norm": 0.7226994441140007, + "learning_rate": 3.306511475986482e-06, + "loss": 0.2398, + "step": 694 + }, + { + "epoch": 5.94017094017094, + "grad_norm": 0.9559487252695029, + "learning_rate": 3.3022742548563293e-06, + "loss": 0.4466, + "step": 695 + }, + { + "epoch": 5.948717948717949, + "grad_norm": 0.9680850683049722, + "learning_rate": 3.2980344629264583e-06, + "loss": 0.2387, + "step": 696 + }, + { + "epoch": 5.957264957264957, + "grad_norm": 0.7532667312418817, + "learning_rate": 3.293792113782816e-06, + "loss": 0.1821, + "step": 697 + }, + { + "epoch": 5.965811965811966, + "grad_norm": 1.057782960709235, + "learning_rate": 3.289547221019546e-06, + "loss": 0.202, + "step": 698 + }, + { + "epoch": 5.9743589743589745, + "grad_norm": 0.7047064734557095, + "learning_rate": 3.285299798238938e-06, + "loss": 0.4263, + "step": 699 + }, + { + "epoch": 5.982905982905983, + "grad_norm": 0.6085610798312747, + "learning_rate": 3.281049859051394e-06, + "loss": 0.1122, + "step": 700 + }, + { + "epoch": 5.9914529914529915, + "grad_norm": 0.8539344831378322, + "learning_rate": 3.276797417075377e-06, + "loss": 0.2858, + "step": 701 + }, + { + "epoch": 6.0, + "grad_norm": 1.0174301806268455, + "learning_rate": 3.272542485937369e-06, + "loss": 0.2751, + "step": 702 + }, + { + "epoch": 6.0085470085470085, + "grad_norm": 0.8594263350344946, + "learning_rate": 3.26828507927183e-06, + "loss": 0.1316, + "step": 703 + }, + { + "epoch": 6.017094017094017, + "grad_norm": 0.7086701921056165, + "learning_rate": 3.264025210721153e-06, + "loss": 0.1628, + "step": 704 + }, + { + "epoch": 6.0256410256410255, + "grad_norm": 0.8721167889196723, + "learning_rate": 3.2597628939356174e-06, + "loss": 0.2331, + "step": 705 + }, + { + "epoch": 6.034188034188034, + "grad_norm": 0.9728775357235206, + "learning_rate": 3.25549814257335e-06, + "loss": 0.2376, + "step": 706 + }, + { + "epoch": 6.042735042735043, + "grad_norm": 0.928726012701592, + "learning_rate": 3.2512309703002776e-06, + "loss": 0.2871, + "step": 707 + }, + { + "epoch": 6.051282051282051, + "grad_norm": 1.111083204994824, + "learning_rate": 3.2469613907900847e-06, + "loss": 0.166, + "step": 708 + }, + { + "epoch": 6.05982905982906, + "grad_norm": 1.1257660148157198, + "learning_rate": 3.2426894177241707e-06, + "loss": 0.3438, + "step": 709 + }, + { + "epoch": 6.068376068376068, + "grad_norm": 1.068916052026842, + "learning_rate": 3.2384150647916033e-06, + "loss": 0.1003, + "step": 710 + }, + { + "epoch": 6.076923076923077, + "grad_norm": 1.3536671412811536, + "learning_rate": 3.2341383456890776e-06, + "loss": 0.2147, + "step": 711 + }, + { + "epoch": 6.085470085470085, + "grad_norm": 0.8748429383440107, + "learning_rate": 3.229859274120869e-06, + "loss": 0.0892, + "step": 712 + }, + { + "epoch": 6.094017094017094, + "grad_norm": 0.8616158755713197, + "learning_rate": 3.2255778637987935e-06, + "loss": 0.1496, + "step": 713 + }, + { + "epoch": 6.102564102564102, + "grad_norm": 0.7495676746876246, + "learning_rate": 3.2212941284421595e-06, + "loss": 0.1524, + "step": 714 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.8671271794505848, + "learning_rate": 3.217008081777726e-06, + "loss": 0.0848, + "step": 715 + }, + { + "epoch": 6.119658119658119, + "grad_norm": 1.0037109617812803, + "learning_rate": 3.2127197375396596e-06, + "loss": 0.5167, + "step": 716 + }, + { + "epoch": 6.128205128205128, + "grad_norm": 0.9008170358083694, + "learning_rate": 3.208429109469488e-06, + "loss": 0.2012, + "step": 717 + }, + { + "epoch": 6.136752136752137, + "grad_norm": 0.8869908267151387, + "learning_rate": 3.204136211316057e-06, + "loss": 0.293, + "step": 718 + }, + { + "epoch": 6.145299145299146, + "grad_norm": 0.6786676791676538, + "learning_rate": 3.199841056835489e-06, + "loss": 0.1845, + "step": 719 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 0.9254122112443579, + "learning_rate": 3.195543659791132e-06, + "loss": 0.2013, + "step": 720 + }, + { + "epoch": 6.162393162393163, + "grad_norm": 0.8742089134571687, + "learning_rate": 3.191244033953524e-06, + "loss": 0.251, + "step": 721 + }, + { + "epoch": 6.170940170940171, + "grad_norm": 1.1184341098009263, + "learning_rate": 3.1869421931003446e-06, + "loss": 0.0712, + "step": 722 + }, + { + "epoch": 6.17948717948718, + "grad_norm": 0.8781294540758994, + "learning_rate": 3.182638151016369e-06, + "loss": 0.2918, + "step": 723 + }, + { + "epoch": 6.188034188034188, + "grad_norm": 0.7726159764192209, + "learning_rate": 3.1783319214934274e-06, + "loss": 0.3152, + "step": 724 + }, + { + "epoch": 6.196581196581197, + "grad_norm": 0.7818897804716108, + "learning_rate": 3.17402351833036e-06, + "loss": 0.2823, + "step": 725 + }, + { + "epoch": 6.205128205128205, + "grad_norm": 1.0357434492953472, + "learning_rate": 3.1697129553329708e-06, + "loss": 0.4138, + "step": 726 + }, + { + "epoch": 6.213675213675214, + "grad_norm": 0.7576741761488609, + "learning_rate": 3.1654002463139854e-06, + "loss": 0.2245, + "step": 727 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 1.0668651055363016, + "learning_rate": 3.1610854050930063e-06, + "loss": 0.1697, + "step": 728 + }, + { + "epoch": 6.230769230769231, + "grad_norm": 0.8263885838889453, + "learning_rate": 3.1567684454964674e-06, + "loss": 0.2849, + "step": 729 + }, + { + "epoch": 6.239316239316239, + "grad_norm": 0.897280538956934, + "learning_rate": 3.1524493813575936e-06, + "loss": 0.3428, + "step": 730 + }, + { + "epoch": 6.247863247863248, + "grad_norm": 1.1418192612891902, + "learning_rate": 3.1481282265163493e-06, + "loss": 0.3214, + "step": 731 + }, + { + "epoch": 6.256410256410256, + "grad_norm": 0.8665101374271217, + "learning_rate": 3.1438049948194006e-06, + "loss": 0.111, + "step": 732 + }, + { + "epoch": 6.264957264957265, + "grad_norm": 0.8646101090642209, + "learning_rate": 3.1394797001200705e-06, + "loss": 0.1649, + "step": 733 + }, + { + "epoch": 6.273504273504273, + "grad_norm": 0.7281704329321338, + "learning_rate": 3.1351523562782893e-06, + "loss": 0.0735, + "step": 734 + }, + { + "epoch": 6.282051282051282, + "grad_norm": 0.8724737804025101, + "learning_rate": 3.1308229771605546e-06, + "loss": 0.0932, + "step": 735 + }, + { + "epoch": 6.2905982905982905, + "grad_norm": 0.7411862867946514, + "learning_rate": 3.1264915766398872e-06, + "loss": 0.1875, + "step": 736 + }, + { + "epoch": 6.299145299145299, + "grad_norm": 0.6934460194520883, + "learning_rate": 3.1221581685957837e-06, + "loss": 0.1958, + "step": 737 + }, + { + "epoch": 6.3076923076923075, + "grad_norm": 0.9759139901992064, + "learning_rate": 3.117822766914174e-06, + "loss": 0.361, + "step": 738 + }, + { + "epoch": 6.316239316239316, + "grad_norm": 0.8089748492620781, + "learning_rate": 3.1134853854873774e-06, + "loss": 0.2047, + "step": 739 + }, + { + "epoch": 6.3247863247863245, + "grad_norm": 0.7887119389629432, + "learning_rate": 3.109146038214055e-06, + "loss": 0.2278, + "step": 740 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 0.9218221429987968, + "learning_rate": 3.1048047389991693e-06, + "loss": 0.2887, + "step": 741 + }, + { + "epoch": 6.3418803418803416, + "grad_norm": 0.7478781085448379, + "learning_rate": 3.1004615017539375e-06, + "loss": 0.1427, + "step": 742 + }, + { + "epoch": 6.35042735042735, + "grad_norm": 0.8557863575124964, + "learning_rate": 3.096116340395783e-06, + "loss": 0.2219, + "step": 743 + }, + { + "epoch": 6.358974358974359, + "grad_norm": 1.0726756668060886, + "learning_rate": 3.0917692688483023e-06, + "loss": 0.1425, + "step": 744 + }, + { + "epoch": 6.367521367521368, + "grad_norm": 0.7797929578816128, + "learning_rate": 3.0874203010412057e-06, + "loss": 0.1801, + "step": 745 + }, + { + "epoch": 6.3760683760683765, + "grad_norm": 0.9547590433959044, + "learning_rate": 3.0830694509102835e-06, + "loss": 0.1125, + "step": 746 + }, + { + "epoch": 6.384615384615385, + "grad_norm": 1.2416444033785454, + "learning_rate": 3.0787167323973584e-06, + "loss": 0.2306, + "step": 747 + }, + { + "epoch": 6.3931623931623935, + "grad_norm": 0.6885959295233361, + "learning_rate": 3.074362159450236e-06, + "loss": 0.1142, + "step": 748 + }, + { + "epoch": 6.401709401709402, + "grad_norm": 0.5879587487315899, + "learning_rate": 3.070005746022669e-06, + "loss": 0.1052, + "step": 749 + }, + { + "epoch": 6.410256410256411, + "grad_norm": 1.1695009064228226, + "learning_rate": 3.0656475060743065e-06, + "loss": 0.3528, + "step": 750 + }, + { + "epoch": 6.418803418803419, + "grad_norm": 0.8469776850696916, + "learning_rate": 3.061287453570646e-06, + "loss": 0.3548, + "step": 751 + }, + { + "epoch": 6.427350427350428, + "grad_norm": 0.7680536865979709, + "learning_rate": 3.056925602483e-06, + "loss": 0.1638, + "step": 752 + }, + { + "epoch": 6.435897435897436, + "grad_norm": 0.74533754769051, + "learning_rate": 3.052561966788441e-06, + "loss": 0.1684, + "step": 753 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 0.8649414778571741, + "learning_rate": 3.0481965604697582e-06, + "loss": 0.1592, + "step": 754 + }, + { + "epoch": 6.452991452991453, + "grad_norm": 0.6651871547467371, + "learning_rate": 3.043829397515419e-06, + "loss": 0.2045, + "step": 755 + }, + { + "epoch": 6.461538461538462, + "grad_norm": 0.7896759182994997, + "learning_rate": 3.039460491919516e-06, + "loss": 0.3469, + "step": 756 + }, + { + "epoch": 6.47008547008547, + "grad_norm": 0.7284989154139125, + "learning_rate": 3.0350898576817268e-06, + "loss": 0.1932, + "step": 757 + }, + { + "epoch": 6.478632478632479, + "grad_norm": 0.8736911419143972, + "learning_rate": 3.03071750880727e-06, + "loss": 0.2464, + "step": 758 + }, + { + "epoch": 6.487179487179487, + "grad_norm": 0.7594717864593361, + "learning_rate": 3.0263434593068562e-06, + "loss": 0.2081, + "step": 759 + }, + { + "epoch": 6.495726495726496, + "grad_norm": 0.9355239477373309, + "learning_rate": 3.021967723196647e-06, + "loss": 0.3858, + "step": 760 + }, + { + "epoch": 6.504273504273504, + "grad_norm": 0.9974022702585318, + "learning_rate": 3.017590314498208e-06, + "loss": 0.1876, + "step": 761 + }, + { + "epoch": 6.512820512820513, + "grad_norm": 0.8613360050143386, + "learning_rate": 3.0132112472384652e-06, + "loss": 0.0735, + "step": 762 + }, + { + "epoch": 6.521367521367521, + "grad_norm": 0.8783727189757656, + "learning_rate": 3.0088305354496574e-06, + "loss": 0.3261, + "step": 763 + }, + { + "epoch": 6.52991452991453, + "grad_norm": 0.9569640404194898, + "learning_rate": 3.004448193169294e-06, + "loss": 0.1975, + "step": 764 + }, + { + "epoch": 6.538461538461538, + "grad_norm": 0.8657428651377301, + "learning_rate": 3.0000642344401115e-06, + "loss": 0.1689, + "step": 765 + }, + { + "epoch": 6.547008547008547, + "grad_norm": 0.8116797215365377, + "learning_rate": 2.9956786733100225e-06, + "loss": 0.2281, + "step": 766 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 0.7459806019934861, + "learning_rate": 2.9912915238320755e-06, + "loss": 0.1812, + "step": 767 + }, + { + "epoch": 6.564102564102564, + "grad_norm": 0.9075555801622364, + "learning_rate": 2.9869028000644102e-06, + "loss": 0.301, + "step": 768 + }, + { + "epoch": 6.572649572649572, + "grad_norm": 1.0365280365594374, + "learning_rate": 2.9825125160702096e-06, + "loss": 0.3363, + "step": 769 + }, + { + "epoch": 6.581196581196581, + "grad_norm": 1.0681940817467501, + "learning_rate": 2.978120685917656e-06, + "loss": 0.1981, + "step": 770 + }, + { + "epoch": 6.589743589743589, + "grad_norm": 0.6063787298533598, + "learning_rate": 2.9737273236798868e-06, + "loss": 0.0629, + "step": 771 + }, + { + "epoch": 6.598290598290598, + "grad_norm": 0.7740559201346899, + "learning_rate": 2.9693324434349486e-06, + "loss": 0.1796, + "step": 772 + }, + { + "epoch": 6.6068376068376065, + "grad_norm": 0.874415643424101, + "learning_rate": 2.9649360592657526e-06, + "loss": 0.153, + "step": 773 + }, + { + "epoch": 6.615384615384615, + "grad_norm": 1.0167511568733048, + "learning_rate": 2.960538185260029e-06, + "loss": 0.2641, + "step": 774 + }, + { + "epoch": 6.6239316239316235, + "grad_norm": 0.7831542303453528, + "learning_rate": 2.956138835510282e-06, + "loss": 0.2015, + "step": 775 + }, + { + "epoch": 6.632478632478632, + "grad_norm": 0.9313522049165068, + "learning_rate": 2.9517380241137437e-06, + "loss": 0.1415, + "step": 776 + }, + { + "epoch": 6.641025641025641, + "grad_norm": 0.8601509036277558, + "learning_rate": 2.9473357651723324e-06, + "loss": 0.1626, + "step": 777 + }, + { + "epoch": 6.64957264957265, + "grad_norm": 0.9035371429077345, + "learning_rate": 2.942932072792602e-06, + "loss": 0.3335, + "step": 778 + }, + { + "epoch": 6.6581196581196584, + "grad_norm": 0.8289789818585666, + "learning_rate": 2.938526961085701e-06, + "loss": 0.096, + "step": 779 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.04127393541744, + "learning_rate": 2.9341204441673267e-06, + "loss": 0.1766, + "step": 780 + }, + { + "epoch": 6.6752136752136755, + "grad_norm": 0.8898603983416691, + "learning_rate": 2.929712536157677e-06, + "loss": 0.1996, + "step": 781 + }, + { + "epoch": 6.683760683760684, + "grad_norm": 0.8654179207006909, + "learning_rate": 2.925303251181411e-06, + "loss": 0.3013, + "step": 782 + }, + { + "epoch": 6.6923076923076925, + "grad_norm": 0.613657904979391, + "learning_rate": 2.920892603367596e-06, + "loss": 0.1672, + "step": 783 + }, + { + "epoch": 6.700854700854701, + "grad_norm": 0.7781355440372467, + "learning_rate": 2.916480606849671e-06, + "loss": 0.3259, + "step": 784 + }, + { + "epoch": 6.7094017094017095, + "grad_norm": 0.8947088549656963, + "learning_rate": 2.9120672757653917e-06, + "loss": 0.2424, + "step": 785 + }, + { + "epoch": 6.717948717948718, + "grad_norm": 0.960212554161266, + "learning_rate": 2.907652624256794e-06, + "loss": 0.2845, + "step": 786 + }, + { + "epoch": 6.726495726495727, + "grad_norm": 0.8486856515975575, + "learning_rate": 2.903236666470143e-06, + "loss": 0.2724, + "step": 787 + }, + { + "epoch": 6.735042735042735, + "grad_norm": 1.0322254546822134, + "learning_rate": 2.89881941655589e-06, + "loss": 0.1461, + "step": 788 + }, + { + "epoch": 6.743589743589744, + "grad_norm": 0.7674403559804235, + "learning_rate": 2.8944008886686288e-06, + "loss": 0.1028, + "step": 789 + }, + { + "epoch": 6.752136752136752, + "grad_norm": 1.1043201498622655, + "learning_rate": 2.889981096967045e-06, + "loss": 0.3388, + "step": 790 + }, + { + "epoch": 6.760683760683761, + "grad_norm": 1.6187698046626917, + "learning_rate": 2.8855600556138757e-06, + "loss": 0.1716, + "step": 791 + }, + { + "epoch": 6.769230769230769, + "grad_norm": 0.8037761816880599, + "learning_rate": 2.881137778775864e-06, + "loss": 0.0999, + "step": 792 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 0.9542605497236784, + "learning_rate": 2.876714280623708e-06, + "loss": 0.1289, + "step": 793 + }, + { + "epoch": 6.786324786324786, + "grad_norm": 0.6727881003007951, + "learning_rate": 2.872289575332023e-06, + "loss": 0.1749, + "step": 794 + }, + { + "epoch": 6.794871794871795, + "grad_norm": 0.8180347661987107, + "learning_rate": 2.8678636770792907e-06, + "loss": 0.1947, + "step": 795 + }, + { + "epoch": 6.803418803418803, + "grad_norm": 0.670627358457358, + "learning_rate": 2.863436600047815e-06, + "loss": 0.1994, + "step": 796 + }, + { + "epoch": 6.811965811965812, + "grad_norm": 0.9167632135320165, + "learning_rate": 2.8590083584236792e-06, + "loss": 0.2602, + "step": 797 + }, + { + "epoch": 6.82051282051282, + "grad_norm": 0.9336105527397447, + "learning_rate": 2.854578966396697e-06, + "loss": 0.2185, + "step": 798 + }, + { + "epoch": 6.829059829059829, + "grad_norm": 0.5738993328361307, + "learning_rate": 2.8501484381603685e-06, + "loss": 0.1137, + "step": 799 + }, + { + "epoch": 6.837606837606837, + "grad_norm": 1.1698886548057332, + "learning_rate": 2.8457167879118332e-06, + "loss": 0.3665, + "step": 800 + }, + { + "epoch": 6.846153846153846, + "grad_norm": 0.8358950075375864, + "learning_rate": 2.8412840298518295e-06, + "loss": 0.2201, + "step": 801 + }, + { + "epoch": 6.854700854700854, + "grad_norm": 1.0664271372394594, + "learning_rate": 2.836850178184642e-06, + "loss": 0.13, + "step": 802 + }, + { + "epoch": 6.863247863247864, + "grad_norm": 0.7714166530100259, + "learning_rate": 2.8324152471180634e-06, + "loss": 0.1483, + "step": 803 + }, + { + "epoch": 6.871794871794872, + "grad_norm": 0.7116665658763507, + "learning_rate": 2.8279792508633415e-06, + "loss": 0.0972, + "step": 804 + }, + { + "epoch": 6.880341880341881, + "grad_norm": 1.0305206757103451, + "learning_rate": 2.8235422036351384e-06, + "loss": 0.2386, + "step": 805 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 0.6737644564750103, + "learning_rate": 2.8191041196514874e-06, + "loss": 0.0862, + "step": 806 + }, + { + "epoch": 6.897435897435898, + "grad_norm": 1.0213339444859935, + "learning_rate": 2.8146650131337376e-06, + "loss": 0.2497, + "step": 807 + }, + { + "epoch": 6.905982905982906, + "grad_norm": 1.2015414358677925, + "learning_rate": 2.81022489830652e-06, + "loss": 0.1404, + "step": 808 + }, + { + "epoch": 6.914529914529915, + "grad_norm": 0.7713933139465124, + "learning_rate": 2.8057837893976958e-06, + "loss": 0.2377, + "step": 809 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 1.040720942073932, + "learning_rate": 2.8013417006383078e-06, + "loss": 0.2294, + "step": 810 + }, + { + "epoch": 6.931623931623932, + "grad_norm": 0.8131320254780423, + "learning_rate": 2.7968986462625436e-06, + "loss": 0.2105, + "step": 811 + }, + { + "epoch": 6.94017094017094, + "grad_norm": 0.8827385433058145, + "learning_rate": 2.7924546405076837e-06, + "loss": 0.2545, + "step": 812 + }, + { + "epoch": 6.948717948717949, + "grad_norm": 1.0866373931985307, + "learning_rate": 2.788009697614053e-06, + "loss": 0.1797, + "step": 813 + }, + { + "epoch": 6.957264957264957, + "grad_norm": 0.8286348008973615, + "learning_rate": 2.7835638318249856e-06, + "loss": 0.3427, + "step": 814 + }, + { + "epoch": 6.965811965811966, + "grad_norm": 0.7069844316283795, + "learning_rate": 2.7791170573867698e-06, + "loss": 0.1923, + "step": 815 + }, + { + "epoch": 6.9743589743589745, + "grad_norm": 0.8259615583243639, + "learning_rate": 2.7746693885486044e-06, + "loss": 0.2327, + "step": 816 + }, + { + "epoch": 6.982905982905983, + "grad_norm": 0.8224327747519812, + "learning_rate": 2.770220839562556e-06, + "loss": 0.2679, + "step": 817 + }, + { + "epoch": 6.9914529914529915, + "grad_norm": 0.9872759388663668, + "learning_rate": 2.765771424683513e-06, + "loss": 0.0969, + "step": 818 + }, + { + "epoch": 7.0, + "grad_norm": 0.7830387364303183, + "learning_rate": 2.761321158169134e-06, + "loss": 0.2424, + "step": 819 + }, + { + "epoch": 7.0085470085470085, + "grad_norm": 0.811449830083304, + "learning_rate": 2.7568700542798112e-06, + "loss": 0.1186, + "step": 820 + }, + { + "epoch": 7.017094017094017, + "grad_norm": 0.8884968910045412, + "learning_rate": 2.7524181272786153e-06, + "loss": 0.36, + "step": 821 + }, + { + "epoch": 7.0256410256410255, + "grad_norm": 0.8762185444900101, + "learning_rate": 2.747965391431261e-06, + "loss": 0.1558, + "step": 822 + }, + { + "epoch": 7.034188034188034, + "grad_norm": 0.6490106067899655, + "learning_rate": 2.743511861006049e-06, + "loss": 0.1165, + "step": 823 + }, + { + "epoch": 7.042735042735043, + "grad_norm": 1.2117907753992274, + "learning_rate": 2.739057550273828e-06, + "loss": 0.1867, + "step": 824 + }, + { + "epoch": 7.051282051282051, + "grad_norm": 0.5267319029801504, + "learning_rate": 2.7346024735079483e-06, + "loss": 0.0796, + "step": 825 + }, + { + "epoch": 7.05982905982906, + "grad_norm": 1.017628519036157, + "learning_rate": 2.7301466449842147e-06, + "loss": 0.1076, + "step": 826 + }, + { + "epoch": 7.068376068376068, + "grad_norm": 1.128531455235049, + "learning_rate": 2.725690078980838e-06, + "loss": 0.3083, + "step": 827 + }, + { + "epoch": 7.076923076923077, + "grad_norm": 0.8562518470839942, + "learning_rate": 2.7212327897783963e-06, + "loss": 0.2254, + "step": 828 + }, + { + "epoch": 7.085470085470085, + "grad_norm": 0.6525538186830495, + "learning_rate": 2.7167747916597825e-06, + "loss": 0.0773, + "step": 829 + }, + { + "epoch": 7.094017094017094, + "grad_norm": 0.7411130620062173, + "learning_rate": 2.7123160989101623e-06, + "loss": 0.2117, + "step": 830 + }, + { + "epoch": 7.102564102564102, + "grad_norm": 0.928232225419419, + "learning_rate": 2.7078567258169264e-06, + "loss": 0.1877, + "step": 831 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 0.566317948447876, + "learning_rate": 2.703396686669646e-06, + "loss": 0.0725, + "step": 832 + }, + { + "epoch": 7.119658119658119, + "grad_norm": 0.5667066947319995, + "learning_rate": 2.698935995760027e-06, + "loss": 0.1054, + "step": 833 + }, + { + "epoch": 7.128205128205128, + "grad_norm": 0.7285759378868003, + "learning_rate": 2.6944746673818623e-06, + "loss": 0.1234, + "step": 834 + }, + { + "epoch": 7.136752136752137, + "grad_norm": 0.9036310236315552, + "learning_rate": 2.6900127158309903e-06, + "loss": 0.1491, + "step": 835 + }, + { + "epoch": 7.145299145299146, + "grad_norm": 1.1011326912978958, + "learning_rate": 2.6855501554052433e-06, + "loss": 0.2026, + "step": 836 + }, + { + "epoch": 7.153846153846154, + "grad_norm": 0.7635833117980484, + "learning_rate": 2.6810870004044065e-06, + "loss": 0.0551, + "step": 837 + }, + { + "epoch": 7.162393162393163, + "grad_norm": 0.6872152173732621, + "learning_rate": 2.6766232651301694e-06, + "loss": 0.0774, + "step": 838 + }, + { + "epoch": 7.170940170940171, + "grad_norm": 0.9105534731343037, + "learning_rate": 2.672158963886082e-06, + "loss": 0.2154, + "step": 839 + }, + { + "epoch": 7.17948717948718, + "grad_norm": 0.8941926255951779, + "learning_rate": 2.667694110977506e-06, + "loss": 0.1053, + "step": 840 + }, + { + "epoch": 7.188034188034188, + "grad_norm": 0.8494727638062671, + "learning_rate": 2.6632287207115735e-06, + "loss": 0.1311, + "step": 841 + }, + { + "epoch": 7.196581196581197, + "grad_norm": 1.052548792291065, + "learning_rate": 2.6587628073971366e-06, + "loss": 0.3121, + "step": 842 + }, + { + "epoch": 7.205128205128205, + "grad_norm": 0.6867370020751574, + "learning_rate": 2.654296385344724e-06, + "loss": 0.1466, + "step": 843 + }, + { + "epoch": 7.213675213675214, + "grad_norm": 1.0121071427225556, + "learning_rate": 2.6498294688664937e-06, + "loss": 0.2121, + "step": 844 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 1.1519304302625446, + "learning_rate": 2.6453620722761897e-06, + "loss": 0.1331, + "step": 845 + }, + { + "epoch": 7.230769230769231, + "grad_norm": 0.8501355517389577, + "learning_rate": 2.6408942098890937e-06, + "loss": 0.221, + "step": 846 + }, + { + "epoch": 7.239316239316239, + "grad_norm": 0.8315914453456078, + "learning_rate": 2.6364258960219794e-06, + "loss": 0.2355, + "step": 847 + }, + { + "epoch": 7.247863247863248, + "grad_norm": 0.6522813347545385, + "learning_rate": 2.631957144993068e-06, + "loss": 0.095, + "step": 848 + }, + { + "epoch": 7.256410256410256, + "grad_norm": 0.8036110735240778, + "learning_rate": 2.6274879711219816e-06, + "loss": 0.2501, + "step": 849 + }, + { + "epoch": 7.264957264957265, + "grad_norm": 1.0254631408066757, + "learning_rate": 2.6230183887296955e-06, + "loss": 0.3126, + "step": 850 + }, + { + "epoch": 7.273504273504273, + "grad_norm": 1.0272875948331592, + "learning_rate": 2.6185484121384974e-06, + "loss": 0.2403, + "step": 851 + }, + { + "epoch": 7.282051282051282, + "grad_norm": 0.7868876719936091, + "learning_rate": 2.6140780556719354e-06, + "loss": 0.0742, + "step": 852 + }, + { + "epoch": 7.2905982905982905, + "grad_norm": 0.6363777029874365, + "learning_rate": 2.6096073336547757e-06, + "loss": 0.1132, + "step": 853 + }, + { + "epoch": 7.299145299145299, + "grad_norm": 0.7336756499861784, + "learning_rate": 2.6051362604129553e-06, + "loss": 0.1026, + "step": 854 + }, + { + "epoch": 7.3076923076923075, + "grad_norm": 0.775541539538005, + "learning_rate": 2.6006648502735384e-06, + "loss": 0.1338, + "step": 855 + }, + { + "epoch": 7.316239316239316, + "grad_norm": 0.7702004755767091, + "learning_rate": 2.5961931175646658e-06, + "loss": 0.0985, + "step": 856 + }, + { + "epoch": 7.3247863247863245, + "grad_norm": 0.6316838953297962, + "learning_rate": 2.591721076615517e-06, + "loss": 0.1023, + "step": 857 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 0.7607344265011406, + "learning_rate": 2.587248741756253e-06, + "loss": 0.1742, + "step": 858 + }, + { + "epoch": 7.3418803418803416, + "grad_norm": 0.6901095678494874, + "learning_rate": 2.5827761273179795e-06, + "loss": 0.2782, + "step": 859 + }, + { + "epoch": 7.35042735042735, + "grad_norm": 1.0303443002912978, + "learning_rate": 2.578303247632701e-06, + "loss": 0.1444, + "step": 860 + }, + { + "epoch": 7.358974358974359, + "grad_norm": 0.879389257012702, + "learning_rate": 2.5738301170332665e-06, + "loss": 0.2016, + "step": 861 + }, + { + "epoch": 7.367521367521368, + "grad_norm": 0.758006315638393, + "learning_rate": 2.5693567498533315e-06, + "loss": 0.107, + "step": 862 + }, + { + "epoch": 7.3760683760683765, + "grad_norm": 0.741991263996712, + "learning_rate": 2.5648831604273117e-06, + "loss": 0.1058, + "step": 863 + }, + { + "epoch": 7.384615384615385, + "grad_norm": 0.5885181500724646, + "learning_rate": 2.560409363090331e-06, + "loss": 0.0387, + "step": 864 + }, + { + "epoch": 7.3931623931623935, + "grad_norm": 0.6251334241704289, + "learning_rate": 2.555935372178183e-06, + "loss": 0.0529, + "step": 865 + }, + { + "epoch": 7.401709401709402, + "grad_norm": 0.7108430489857732, + "learning_rate": 2.5514612020272792e-06, + "loss": 0.0515, + "step": 866 + }, + { + "epoch": 7.410256410256411, + "grad_norm": 0.5671854664656416, + "learning_rate": 2.546986866974606e-06, + "loss": 0.0936, + "step": 867 + }, + { + "epoch": 7.418803418803419, + "grad_norm": 0.7055757070638874, + "learning_rate": 2.54251238135768e-06, + "loss": 0.1075, + "step": 868 + }, + { + "epoch": 7.427350427350428, + "grad_norm": 1.003935743284772, + "learning_rate": 2.5380377595144984e-06, + "loss": 0.2216, + "step": 869 + }, + { + "epoch": 7.435897435897436, + "grad_norm": 0.9449024720361391, + "learning_rate": 2.533563015783494e-06, + "loss": 0.2309, + "step": 870 + }, + { + "epoch": 7.444444444444445, + "grad_norm": 0.7129554344380816, + "learning_rate": 2.5290881645034932e-06, + "loss": 0.1252, + "step": 871 + }, + { + "epoch": 7.452991452991453, + "grad_norm": 1.0412648511502645, + "learning_rate": 2.524613220013664e-06, + "loss": 0.0476, + "step": 872 + }, + { + "epoch": 7.461538461538462, + "grad_norm": 1.4336752325274813, + "learning_rate": 2.5201381966534748e-06, + "loss": 0.433, + "step": 873 + }, + { + "epoch": 7.47008547008547, + "grad_norm": 1.2607405383601524, + "learning_rate": 2.515663108762648e-06, + "loss": 0.1975, + "step": 874 + }, + { + "epoch": 7.478632478632479, + "grad_norm": 0.9558946359084702, + "learning_rate": 2.511187970681109e-06, + "loss": 0.139, + "step": 875 + }, + { + "epoch": 7.487179487179487, + "grad_norm": 0.8425542820829937, + "learning_rate": 2.5067127967489464e-06, + "loss": 0.1183, + "step": 876 + }, + { + "epoch": 7.495726495726496, + "grad_norm": 0.628921282602329, + "learning_rate": 2.5022376013063653e-06, + "loss": 0.0976, + "step": 877 + }, + { + "epoch": 7.504273504273504, + "grad_norm": 0.7245192855197514, + "learning_rate": 2.497762398693636e-06, + "loss": 0.1356, + "step": 878 + }, + { + "epoch": 7.512820512820513, + "grad_norm": 0.8889602741073305, + "learning_rate": 2.493287203251054e-06, + "loss": 0.2406, + "step": 879 + }, + { + "epoch": 7.521367521367521, + "grad_norm": 1.2959721074337187, + "learning_rate": 2.4888120293188915e-06, + "loss": 0.2891, + "step": 880 + }, + { + "epoch": 7.52991452991453, + "grad_norm": 0.927145734964671, + "learning_rate": 2.484336891237353e-06, + "loss": 0.186, + "step": 881 + }, + { + "epoch": 7.538461538461538, + "grad_norm": 0.7231331416771678, + "learning_rate": 2.4798618033465256e-06, + "loss": 0.1549, + "step": 882 + }, + { + "epoch": 7.547008547008547, + "grad_norm": 0.7356989966291199, + "learning_rate": 2.4753867799863365e-06, + "loss": 0.0729, + "step": 883 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 1.1297589765273541, + "learning_rate": 2.470911835496508e-06, + "loss": 0.1264, + "step": 884 + }, + { + "epoch": 7.564102564102564, + "grad_norm": 0.7268249250400882, + "learning_rate": 2.466436984216507e-06, + "loss": 0.1948, + "step": 885 + }, + { + "epoch": 7.572649572649572, + "grad_norm": 1.0746091944481344, + "learning_rate": 2.4619622404855025e-06, + "loss": 0.0925, + "step": 886 + }, + { + "epoch": 7.581196581196581, + "grad_norm": 0.9343825196353718, + "learning_rate": 2.4574876186423203e-06, + "loss": 0.1994, + "step": 887 + }, + { + "epoch": 7.589743589743589, + "grad_norm": 0.8818256523384325, + "learning_rate": 2.4530131330253946e-06, + "loss": 0.3004, + "step": 888 + }, + { + "epoch": 7.598290598290598, + "grad_norm": 0.6933798747538589, + "learning_rate": 2.4485387979727216e-06, + "loss": 0.0969, + "step": 889 + }, + { + "epoch": 7.6068376068376065, + "grad_norm": 1.05137189120055, + "learning_rate": 2.4440646278218178e-06, + "loss": 0.1033, + "step": 890 + }, + { + "epoch": 7.615384615384615, + "grad_norm": 0.8078492077071381, + "learning_rate": 2.43959063690967e-06, + "loss": 0.1876, + "step": 891 + }, + { + "epoch": 7.6239316239316235, + "grad_norm": 0.8109878148999148, + "learning_rate": 2.435116839572689e-06, + "loss": 0.2169, + "step": 892 + }, + { + "epoch": 7.632478632478632, + "grad_norm": 1.056400836892595, + "learning_rate": 2.430643250146669e-06, + "loss": 0.2165, + "step": 893 + }, + { + "epoch": 7.641025641025641, + "grad_norm": 0.8285219700841594, + "learning_rate": 2.426169882966735e-06, + "loss": 0.2727, + "step": 894 + }, + { + "epoch": 7.64957264957265, + "grad_norm": 0.9095468547064846, + "learning_rate": 2.4216967523673e-06, + "loss": 0.1749, + "step": 895 + }, + { + "epoch": 7.6581196581196584, + "grad_norm": 0.5146017507927608, + "learning_rate": 2.4172238726820205e-06, + "loss": 0.0588, + "step": 896 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 0.7662282433470485, + "learning_rate": 2.4127512582437486e-06, + "loss": 0.1758, + "step": 897 + }, + { + "epoch": 7.6752136752136755, + "grad_norm": 0.8890016478017211, + "learning_rate": 2.4082789233844837e-06, + "loss": 0.1344, + "step": 898 + }, + { + "epoch": 7.683760683760684, + "grad_norm": 0.8045854211885298, + "learning_rate": 2.403806882435334e-06, + "loss": 0.1668, + "step": 899 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 0.9376743164546943, + "learning_rate": 2.399335149726463e-06, + "loss": 0.1746, + "step": 900 + }, + { + "epoch": 7.700854700854701, + "grad_norm": 0.8484344804273012, + "learning_rate": 2.394863739587045e-06, + "loss": 0.0338, + "step": 901 + }, + { + "epoch": 7.7094017094017095, + "grad_norm": 0.8856756769399486, + "learning_rate": 2.3903926663452255e-06, + "loss": 0.1161, + "step": 902 + }, + { + "epoch": 7.717948717948718, + "grad_norm": 0.6232667832934703, + "learning_rate": 2.385921944328066e-06, + "loss": 0.0549, + "step": 903 + }, + { + "epoch": 7.726495726495727, + "grad_norm": 1.0427742949381442, + "learning_rate": 2.3814515878615035e-06, + "loss": 0.1645, + "step": 904 + }, + { + "epoch": 7.735042735042735, + "grad_norm": 0.8121694579839741, + "learning_rate": 2.376981611270305e-06, + "loss": 0.1664, + "step": 905 + }, + { + "epoch": 7.743589743589744, + "grad_norm": 0.7659730280591774, + "learning_rate": 2.372512028878019e-06, + "loss": 0.1028, + "step": 906 + }, + { + "epoch": 7.752136752136752, + "grad_norm": 0.5835938881519352, + "learning_rate": 2.3680428550069327e-06, + "loss": 0.0853, + "step": 907 + }, + { + "epoch": 7.760683760683761, + "grad_norm": 0.6898088371986518, + "learning_rate": 2.3635741039780214e-06, + "loss": 0.1352, + "step": 908 + }, + { + "epoch": 7.769230769230769, + "grad_norm": 1.0941235835301037, + "learning_rate": 2.3591057901109063e-06, + "loss": 0.2384, + "step": 909 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 1.0985158914441384, + "learning_rate": 2.3546379277238107e-06, + "loss": 0.1356, + "step": 910 + }, + { + "epoch": 7.786324786324786, + "grad_norm": 0.8285970493804228, + "learning_rate": 2.3501705311335067e-06, + "loss": 0.0798, + "step": 911 + }, + { + "epoch": 7.794871794871795, + "grad_norm": 0.7048258885001851, + "learning_rate": 2.3457036146552766e-06, + "loss": 0.0889, + "step": 912 + }, + { + "epoch": 7.803418803418803, + "grad_norm": 1.3004201397168884, + "learning_rate": 2.341237192602864e-06, + "loss": 0.2357, + "step": 913 + }, + { + "epoch": 7.811965811965812, + "grad_norm": 0.8101870477166999, + "learning_rate": 2.336771279288427e-06, + "loss": 0.0772, + "step": 914 + }, + { + "epoch": 7.82051282051282, + "grad_norm": 0.93339167381936, + "learning_rate": 2.332305889022494e-06, + "loss": 0.0886, + "step": 915 + }, + { + "epoch": 7.829059829059829, + "grad_norm": 0.7680114645074881, + "learning_rate": 2.3278410361139198e-06, + "loss": 0.2533, + "step": 916 + }, + { + "epoch": 7.837606837606837, + "grad_norm": 1.0280566949846137, + "learning_rate": 2.3233767348698314e-06, + "loss": 0.1258, + "step": 917 + }, + { + "epoch": 7.846153846153846, + "grad_norm": 0.7516417773033046, + "learning_rate": 2.3189129995955944e-06, + "loss": 0.2266, + "step": 918 + }, + { + "epoch": 7.854700854700854, + "grad_norm": 0.9280988237858805, + "learning_rate": 2.314449844594758e-06, + "loss": 0.2605, + "step": 919 + }, + { + "epoch": 7.863247863247864, + "grad_norm": 0.9031395365665326, + "learning_rate": 2.3099872841690105e-06, + "loss": 0.2555, + "step": 920 + }, + { + "epoch": 7.871794871794872, + "grad_norm": 0.8324029751037804, + "learning_rate": 2.305525332618138e-06, + "loss": 0.1983, + "step": 921 + }, + { + "epoch": 7.880341880341881, + "grad_norm": 1.0373932446642553, + "learning_rate": 2.3010640042399748e-06, + "loss": 0.2108, + "step": 922 + }, + { + "epoch": 7.888888888888889, + "grad_norm": 0.7324143576403153, + "learning_rate": 2.296603313330355e-06, + "loss": 0.0424, + "step": 923 + }, + { + "epoch": 7.897435897435898, + "grad_norm": 0.8313171728758167, + "learning_rate": 2.2921432741830744e-06, + "loss": 0.2036, + "step": 924 + }, + { + "epoch": 7.905982905982906, + "grad_norm": 0.5819353015291171, + "learning_rate": 2.2876839010898377e-06, + "loss": 0.0989, + "step": 925 + }, + { + "epoch": 7.914529914529915, + "grad_norm": 0.9657210621158413, + "learning_rate": 2.283225208340218e-06, + "loss": 0.1124, + "step": 926 + }, + { + "epoch": 7.923076923076923, + "grad_norm": 0.8348772843589627, + "learning_rate": 2.2787672102216045e-06, + "loss": 0.1371, + "step": 927 + }, + { + "epoch": 7.931623931623932, + "grad_norm": 1.0071415066110063, + "learning_rate": 2.2743099210191623e-06, + "loss": 0.2917, + "step": 928 + }, + { + "epoch": 7.94017094017094, + "grad_norm": 0.7480676950520273, + "learning_rate": 2.2698533550157865e-06, + "loss": 0.1117, + "step": 929 + }, + { + "epoch": 7.948717948717949, + "grad_norm": 0.6984275302001026, + "learning_rate": 2.265397526492052e-06, + "loss": 0.1257, + "step": 930 + }, + { + "epoch": 7.957264957264957, + "grad_norm": 1.0136360419533816, + "learning_rate": 2.2609424497261723e-06, + "loss": 0.1716, + "step": 931 + }, + { + "epoch": 7.965811965811966, + "grad_norm": 0.789818664864703, + "learning_rate": 2.2564881389939524e-06, + "loss": 0.1089, + "step": 932 + }, + { + "epoch": 7.9743589743589745, + "grad_norm": 0.801543110176476, + "learning_rate": 2.25203460856874e-06, + "loss": 0.1356, + "step": 933 + }, + { + "epoch": 7.982905982905983, + "grad_norm": 1.079998000143528, + "learning_rate": 2.2475818727213843e-06, + "loss": 0.1016, + "step": 934 + }, + { + "epoch": 7.9914529914529915, + "grad_norm": 1.2224492900647317, + "learning_rate": 2.24312994572019e-06, + "loss": 0.0684, + "step": 935 + }, + { + "epoch": 8.0, + "grad_norm": 0.7840113283449409, + "learning_rate": 2.238678841830867e-06, + "loss": 0.26, + "step": 936 + }, + { + "epoch": 8.008547008547009, + "grad_norm": 0.6801213213487027, + "learning_rate": 2.2342285753164876e-06, + "loss": 0.0874, + "step": 937 + }, + { + "epoch": 8.017094017094017, + "grad_norm": 0.8205242974812631, + "learning_rate": 2.2297791604374443e-06, + "loss": 0.0564, + "step": 938 + }, + { + "epoch": 8.025641025641026, + "grad_norm": 0.5970829689339425, + "learning_rate": 2.2253306114513964e-06, + "loss": 0.0707, + "step": 939 + }, + { + "epoch": 8.034188034188034, + "grad_norm": 0.7521912608976296, + "learning_rate": 2.220882942613231e-06, + "loss": 0.1745, + "step": 940 + }, + { + "epoch": 8.042735042735043, + "grad_norm": 0.7838835824189015, + "learning_rate": 2.2164361681750148e-06, + "loss": 0.0816, + "step": 941 + }, + { + "epoch": 8.051282051282051, + "grad_norm": 0.8426679690138106, + "learning_rate": 2.2119903023859475e-06, + "loss": 0.0916, + "step": 942 + }, + { + "epoch": 8.05982905982906, + "grad_norm": 1.1073502756448632, + "learning_rate": 2.2075453594923175e-06, + "loss": 0.1688, + "step": 943 + }, + { + "epoch": 8.068376068376068, + "grad_norm": 0.9922388944200498, + "learning_rate": 2.2031013537374564e-06, + "loss": 0.1957, + "step": 944 + }, + { + "epoch": 8.076923076923077, + "grad_norm": 0.6640668878424266, + "learning_rate": 2.1986582993616926e-06, + "loss": 0.0987, + "step": 945 + }, + { + "epoch": 8.085470085470085, + "grad_norm": 1.0112842704537792, + "learning_rate": 2.194216210602305e-06, + "loss": 0.1037, + "step": 946 + }, + { + "epoch": 8.094017094017094, + "grad_norm": 0.8175471256641587, + "learning_rate": 2.1897751016934802e-06, + "loss": 0.1412, + "step": 947 + }, + { + "epoch": 8.102564102564102, + "grad_norm": 0.5118679059079263, + "learning_rate": 2.1853349868662637e-06, + "loss": 0.0745, + "step": 948 + }, + { + "epoch": 8.11111111111111, + "grad_norm": 0.9461630409984287, + "learning_rate": 2.1808958803485134e-06, + "loss": 0.1013, + "step": 949 + }, + { + "epoch": 8.11965811965812, + "grad_norm": 0.9858064756039681, + "learning_rate": 2.1764577963648616e-06, + "loss": 0.143, + "step": 950 + }, + { + "epoch": 8.128205128205128, + "grad_norm": 0.8827829292121262, + "learning_rate": 2.1720207491366598e-06, + "loss": 0.0624, + "step": 951 + }, + { + "epoch": 8.136752136752136, + "grad_norm": 0.6546653319429925, + "learning_rate": 2.167584752881937e-06, + "loss": 0.0823, + "step": 952 + }, + { + "epoch": 8.145299145299145, + "grad_norm": 1.0092688749232046, + "learning_rate": 2.163149821815358e-06, + "loss": 0.235, + "step": 953 + }, + { + "epoch": 8.153846153846153, + "grad_norm": 0.8771490405627712, + "learning_rate": 2.1587159701481718e-06, + "loss": 0.2528, + "step": 954 + }, + { + "epoch": 8.162393162393162, + "grad_norm": 0.946709599885579, + "learning_rate": 2.154283212088168e-06, + "loss": 0.1608, + "step": 955 + }, + { + "epoch": 8.17094017094017, + "grad_norm": 0.5952914827869664, + "learning_rate": 2.1498515618396327e-06, + "loss": 0.1172, + "step": 956 + }, + { + "epoch": 8.179487179487179, + "grad_norm": 0.6912864743108678, + "learning_rate": 2.145421033603304e-06, + "loss": 0.0112, + "step": 957 + }, + { + "epoch": 8.188034188034187, + "grad_norm": 0.6662988373552396, + "learning_rate": 2.1409916415763216e-06, + "loss": 0.0755, + "step": 958 + }, + { + "epoch": 8.196581196581196, + "grad_norm": 0.7960679547494807, + "learning_rate": 2.1365633999521852e-06, + "loss": 0.0607, + "step": 959 + }, + { + "epoch": 8.205128205128204, + "grad_norm": 0.8508884811265452, + "learning_rate": 2.1321363229207097e-06, + "loss": 0.1314, + "step": 960 + }, + { + "epoch": 8.213675213675213, + "grad_norm": 0.8298713309207288, + "learning_rate": 2.127710424667978e-06, + "loss": 0.1361, + "step": 961 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 0.5117722405915415, + "learning_rate": 2.1232857193762923e-06, + "loss": 0.0464, + "step": 962 + }, + { + "epoch": 8.23076923076923, + "grad_norm": 0.5227615536686315, + "learning_rate": 2.1188622212241366e-06, + "loss": 0.0492, + "step": 963 + }, + { + "epoch": 8.239316239316238, + "grad_norm": 1.0146157619895304, + "learning_rate": 2.114439944386125e-06, + "loss": 0.1391, + "step": 964 + }, + { + "epoch": 8.247863247863247, + "grad_norm": 1.002266709631584, + "learning_rate": 2.1100189030329557e-06, + "loss": 0.1537, + "step": 965 + }, + { + "epoch": 8.256410256410255, + "grad_norm": 0.6498632887785525, + "learning_rate": 2.105599111331372e-06, + "loss": 0.0452, + "step": 966 + }, + { + "epoch": 8.264957264957266, + "grad_norm": 0.9333635521000488, + "learning_rate": 2.101180583444111e-06, + "loss": 0.1001, + "step": 967 + }, + { + "epoch": 8.273504273504274, + "grad_norm": 0.8310263706256398, + "learning_rate": 2.0967633335298583e-06, + "loss": 0.1492, + "step": 968 + }, + { + "epoch": 8.282051282051283, + "grad_norm": 0.5899470844056727, + "learning_rate": 2.0923473757432073e-06, + "loss": 0.0545, + "step": 969 + }, + { + "epoch": 8.290598290598291, + "grad_norm": 1.0233804469655359, + "learning_rate": 2.0879327242346096e-06, + "loss": 0.0926, + "step": 970 + }, + { + "epoch": 8.2991452991453, + "grad_norm": 1.0819059667991204, + "learning_rate": 2.0835193931503297e-06, + "loss": 0.1311, + "step": 971 + }, + { + "epoch": 8.307692307692308, + "grad_norm": 0.46285369597354903, + "learning_rate": 2.079107396632404e-06, + "loss": 0.1046, + "step": 972 + }, + { + "epoch": 8.316239316239317, + "grad_norm": 1.0596263414741827, + "learning_rate": 2.0746967488185903e-06, + "loss": 0.1922, + "step": 973 + }, + { + "epoch": 8.324786324786325, + "grad_norm": 0.652779105438659, + "learning_rate": 2.0702874638423233e-06, + "loss": 0.1108, + "step": 974 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 0.8934098392963533, + "learning_rate": 2.0658795558326745e-06, + "loss": 0.1306, + "step": 975 + }, + { + "epoch": 8.341880341880342, + "grad_norm": 0.6486169868086549, + "learning_rate": 2.0614730389143004e-06, + "loss": 0.0647, + "step": 976 + }, + { + "epoch": 8.350427350427351, + "grad_norm": 0.6465097746250824, + "learning_rate": 2.057067927207399e-06, + "loss": 0.0592, + "step": 977 + }, + { + "epoch": 8.35897435897436, + "grad_norm": 0.9112154978141302, + "learning_rate": 2.052664234827668e-06, + "loss": 0.1674, + "step": 978 + }, + { + "epoch": 8.367521367521368, + "grad_norm": 1.049107550845732, + "learning_rate": 2.048261975886256e-06, + "loss": 0.2277, + "step": 979 + }, + { + "epoch": 8.376068376068377, + "grad_norm": 0.6487345522229879, + "learning_rate": 2.0438611644897186e-06, + "loss": 0.0773, + "step": 980 + }, + { + "epoch": 8.384615384615385, + "grad_norm": 0.5377657888294962, + "learning_rate": 2.0394618147399713e-06, + "loss": 0.1007, + "step": 981 + }, + { + "epoch": 8.393162393162394, + "grad_norm": 0.6621738996727564, + "learning_rate": 2.0350639407342474e-06, + "loss": 0.0513, + "step": 982 + }, + { + "epoch": 8.401709401709402, + "grad_norm": 0.4872035195253045, + "learning_rate": 2.030667556565052e-06, + "loss": 0.0265, + "step": 983 + }, + { + "epoch": 8.41025641025641, + "grad_norm": 0.9739209495675087, + "learning_rate": 2.026272676320114e-06, + "loss": 0.1961, + "step": 984 + }, + { + "epoch": 8.418803418803419, + "grad_norm": 0.5372233867525106, + "learning_rate": 2.021879314082344e-06, + "loss": 0.1294, + "step": 985 + }, + { + "epoch": 8.427350427350428, + "grad_norm": 0.8921860227920625, + "learning_rate": 2.0174874839297912e-06, + "loss": 0.1127, + "step": 986 + }, + { + "epoch": 8.435897435897436, + "grad_norm": 0.9625191449275724, + "learning_rate": 2.01309719993559e-06, + "loss": 0.1532, + "step": 987 + }, + { + "epoch": 8.444444444444445, + "grad_norm": 1.028032603670202, + "learning_rate": 2.0087084761679245e-06, + "loss": 0.1056, + "step": 988 + }, + { + "epoch": 8.452991452991453, + "grad_norm": 0.6625782427295153, + "learning_rate": 2.0043213266899787e-06, + "loss": 0.1335, + "step": 989 + }, + { + "epoch": 8.461538461538462, + "grad_norm": 1.0598695524412451, + "learning_rate": 1.9999357655598894e-06, + "loss": 0.2141, + "step": 990 + }, + { + "epoch": 8.47008547008547, + "grad_norm": 0.6690236702990353, + "learning_rate": 1.995551806830706e-06, + "loss": 0.1276, + "step": 991 + }, + { + "epoch": 8.478632478632479, + "grad_norm": 0.60166856759265, + "learning_rate": 1.9911694645503443e-06, + "loss": 0.0579, + "step": 992 + }, + { + "epoch": 8.487179487179487, + "grad_norm": 0.9847593137774459, + "learning_rate": 1.986788752761536e-06, + "loss": 0.111, + "step": 993 + }, + { + "epoch": 8.495726495726496, + "grad_norm": 0.982024009501397, + "learning_rate": 1.9824096855017922e-06, + "loss": 0.2799, + "step": 994 + }, + { + "epoch": 8.504273504273504, + "grad_norm": 0.7761724159948599, + "learning_rate": 1.978032276803354e-06, + "loss": 0.1645, + "step": 995 + }, + { + "epoch": 8.512820512820513, + "grad_norm": 0.8471655309381408, + "learning_rate": 1.9736565406931446e-06, + "loss": 0.1332, + "step": 996 + }, + { + "epoch": 8.521367521367521, + "grad_norm": 0.951981738853562, + "learning_rate": 1.969282491192731e-06, + "loss": 0.1401, + "step": 997 + }, + { + "epoch": 8.52991452991453, + "grad_norm": 0.6923660244868259, + "learning_rate": 1.9649101423182732e-06, + "loss": 0.0843, + "step": 998 + }, + { + "epoch": 8.538461538461538, + "grad_norm": 0.8000814315215867, + "learning_rate": 1.960539508080485e-06, + "loss": 0.2153, + "step": 999 + }, + { + "epoch": 8.547008547008547, + "grad_norm": 0.6617963864096438, + "learning_rate": 1.956170602484582e-06, + "loss": 0.0804, + "step": 1000 + }, + { + "epoch": 8.555555555555555, + "grad_norm": 0.8352900774008456, + "learning_rate": 1.9518034395302413e-06, + "loss": 0.2491, + "step": 1001 + }, + { + "epoch": 8.564102564102564, + "grad_norm": 0.5497872023851514, + "learning_rate": 1.94743803321156e-06, + "loss": 0.0348, + "step": 1002 + }, + { + "epoch": 8.572649572649572, + "grad_norm": 0.5285674845565532, + "learning_rate": 1.9430743975170004e-06, + "loss": 0.0767, + "step": 1003 + }, + { + "epoch": 8.581196581196581, + "grad_norm": 0.9005822868435749, + "learning_rate": 1.938712546429354e-06, + "loss": 0.1976, + "step": 1004 + }, + { + "epoch": 8.58974358974359, + "grad_norm": 0.8375381738831286, + "learning_rate": 1.934352493925695e-06, + "loss": 0.2831, + "step": 1005 + }, + { + "epoch": 8.598290598290598, + "grad_norm": 0.7340948938782793, + "learning_rate": 1.9299942539773316e-06, + "loss": 0.0736, + "step": 1006 + }, + { + "epoch": 8.606837606837606, + "grad_norm": 0.39349115495287446, + "learning_rate": 1.925637840549764e-06, + "loss": 0.05, + "step": 1007 + }, + { + "epoch": 8.615384615384615, + "grad_norm": 0.8352497021594969, + "learning_rate": 1.921283267602643e-06, + "loss": 0.133, + "step": 1008 + }, + { + "epoch": 8.623931623931623, + "grad_norm": 0.8392365988719928, + "learning_rate": 1.9169305490897173e-06, + "loss": 0.1195, + "step": 1009 + }, + { + "epoch": 8.632478632478632, + "grad_norm": 0.7904108060873761, + "learning_rate": 1.9125796989587947e-06, + "loss": 0.1938, + "step": 1010 + }, + { + "epoch": 8.64102564102564, + "grad_norm": 0.8645717752205156, + "learning_rate": 1.9082307311516985e-06, + "loss": 0.1548, + "step": 1011 + }, + { + "epoch": 8.649572649572649, + "grad_norm": 0.5395543064003644, + "learning_rate": 1.9038836596042174e-06, + "loss": 0.0941, + "step": 1012 + }, + { + "epoch": 8.658119658119658, + "grad_norm": 0.9345772701049747, + "learning_rate": 1.8995384982460636e-06, + "loss": 0.1512, + "step": 1013 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 0.7617202728674364, + "learning_rate": 1.895195261000831e-06, + "loss": 0.0691, + "step": 1014 + }, + { + "epoch": 8.675213675213675, + "grad_norm": 0.4683005730689552, + "learning_rate": 1.8908539617859455e-06, + "loss": 0.0309, + "step": 1015 + }, + { + "epoch": 8.683760683760683, + "grad_norm": 1.0820714752748837, + "learning_rate": 1.8865146145126228e-06, + "loss": 0.095, + "step": 1016 + }, + { + "epoch": 8.692307692307692, + "grad_norm": 0.5167947937067587, + "learning_rate": 1.8821772330858259e-06, + "loss": 0.0576, + "step": 1017 + }, + { + "epoch": 8.7008547008547, + "grad_norm": 0.5618671513966412, + "learning_rate": 1.877841831404217e-06, + "loss": 0.0563, + "step": 1018 + }, + { + "epoch": 8.709401709401709, + "grad_norm": 0.7395751100091337, + "learning_rate": 1.873508423360113e-06, + "loss": 0.0616, + "step": 1019 + }, + { + "epoch": 8.717948717948717, + "grad_norm": 0.9613190639149091, + "learning_rate": 1.8691770228394458e-06, + "loss": 0.3077, + "step": 1020 + }, + { + "epoch": 8.726495726495726, + "grad_norm": 0.7355368158189639, + "learning_rate": 1.8648476437217117e-06, + "loss": 0.1128, + "step": 1021 + }, + { + "epoch": 8.735042735042736, + "grad_norm": 0.6420518723233007, + "learning_rate": 1.8605202998799299e-06, + "loss": 0.138, + "step": 1022 + }, + { + "epoch": 8.743589743589745, + "grad_norm": 0.8047403589848431, + "learning_rate": 1.8561950051805994e-06, + "loss": 0.1726, + "step": 1023 + }, + { + "epoch": 8.752136752136753, + "grad_norm": 0.7614318567549201, + "learning_rate": 1.8518717734836522e-06, + "loss": 0.1479, + "step": 1024 + }, + { + "epoch": 8.760683760683762, + "grad_norm": 0.9470748204080588, + "learning_rate": 1.8475506186424075e-06, + "loss": 0.1853, + "step": 1025 + }, + { + "epoch": 8.76923076923077, + "grad_norm": 1.6630512840142453, + "learning_rate": 1.8432315545035328e-06, + "loss": 0.0757, + "step": 1026 + }, + { + "epoch": 8.777777777777779, + "grad_norm": 1.2434105142184977, + "learning_rate": 1.8389145949069953e-06, + "loss": 0.0686, + "step": 1027 + }, + { + "epoch": 8.786324786324787, + "grad_norm": 0.8550828646975722, + "learning_rate": 1.8345997536860154e-06, + "loss": 0.1574, + "step": 1028 + }, + { + "epoch": 8.794871794871796, + "grad_norm": 0.6583015306693577, + "learning_rate": 1.83028704466703e-06, + "loss": 0.0519, + "step": 1029 + }, + { + "epoch": 8.803418803418804, + "grad_norm": 0.8504516863547416, + "learning_rate": 1.8259764816696413e-06, + "loss": 0.0567, + "step": 1030 + }, + { + "epoch": 8.811965811965813, + "grad_norm": 1.2133627934038262, + "learning_rate": 1.8216680785065734e-06, + "loss": 0.1414, + "step": 1031 + }, + { + "epoch": 8.820512820512821, + "grad_norm": 0.7975309833677979, + "learning_rate": 1.8173618489836315e-06, + "loss": 0.0269, + "step": 1032 + }, + { + "epoch": 8.82905982905983, + "grad_norm": 0.8326288867470483, + "learning_rate": 1.813057806899656e-06, + "loss": 0.1349, + "step": 1033 + }, + { + "epoch": 8.837606837606838, + "grad_norm": 0.8892203026048484, + "learning_rate": 1.8087559660464766e-06, + "loss": 0.1477, + "step": 1034 + }, + { + "epoch": 8.846153846153847, + "grad_norm": 1.045254181038715, + "learning_rate": 1.8044563402088686e-06, + "loss": 0.1232, + "step": 1035 + }, + { + "epoch": 8.854700854700855, + "grad_norm": 1.0373914190120757, + "learning_rate": 1.800158943164512e-06, + "loss": 0.0521, + "step": 1036 + }, + { + "epoch": 8.863247863247864, + "grad_norm": 0.4542191780940718, + "learning_rate": 1.7958637886839437e-06, + "loss": 0.0367, + "step": 1037 + }, + { + "epoch": 8.871794871794872, + "grad_norm": 0.8711063804247088, + "learning_rate": 1.7915708905305124e-06, + "loss": 0.097, + "step": 1038 + }, + { + "epoch": 8.88034188034188, + "grad_norm": 0.7418297441426122, + "learning_rate": 1.7872802624603408e-06, + "loss": 0.1599, + "step": 1039 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 0.6832073792079075, + "learning_rate": 1.7829919182222752e-06, + "loss": 0.103, + "step": 1040 + }, + { + "epoch": 8.897435897435898, + "grad_norm": 1.139330698907758, + "learning_rate": 1.7787058715578415e-06, + "loss": 0.1249, + "step": 1041 + }, + { + "epoch": 8.905982905982906, + "grad_norm": 0.7542113764593271, + "learning_rate": 1.7744221362012075e-06, + "loss": 0.0849, + "step": 1042 + }, + { + "epoch": 8.914529914529915, + "grad_norm": 0.8049453679611847, + "learning_rate": 1.7701407258791323e-06, + "loss": 0.1231, + "step": 1043 + }, + { + "epoch": 8.923076923076923, + "grad_norm": 0.7350710259171148, + "learning_rate": 1.7658616543109237e-06, + "loss": 0.1335, + "step": 1044 + }, + { + "epoch": 8.931623931623932, + "grad_norm": 0.8851916034876457, + "learning_rate": 1.7615849352083975e-06, + "loss": 0.1177, + "step": 1045 + }, + { + "epoch": 8.94017094017094, + "grad_norm": 0.48977542938325314, + "learning_rate": 1.7573105822758307e-06, + "loss": 0.0143, + "step": 1046 + }, + { + "epoch": 8.948717948717949, + "grad_norm": 0.8288418037510138, + "learning_rate": 1.753038609209916e-06, + "loss": 0.0476, + "step": 1047 + }, + { + "epoch": 8.957264957264957, + "grad_norm": 0.6358081416813472, + "learning_rate": 1.7487690296997234e-06, + "loss": 0.0985, + "step": 1048 + }, + { + "epoch": 8.965811965811966, + "grad_norm": 0.6362613648171749, + "learning_rate": 1.7445018574266514e-06, + "loss": 0.0773, + "step": 1049 + }, + { + "epoch": 8.974358974358974, + "grad_norm": 0.7461142400839225, + "learning_rate": 1.740237106064383e-06, + "loss": 0.1196, + "step": 1050 + }, + { + "epoch": 8.982905982905983, + "grad_norm": 0.8936483774841266, + "learning_rate": 1.7359747892788476e-06, + "loss": 0.196, + "step": 1051 + }, + { + "epoch": 8.991452991452991, + "grad_norm": 0.4840749262955058, + "learning_rate": 1.7317149207281697e-06, + "loss": 0.0993, + "step": 1052 + }, + { + "epoch": 9.0, + "grad_norm": 1.0037508288233707, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.2391, + "step": 1053 + }, + { + "epoch": 9.008547008547009, + "grad_norm": 0.5469203775844427, + "learning_rate": 1.723202582924624e-06, + "loss": 0.1241, + "step": 1054 + }, + { + "epoch": 9.017094017094017, + "grad_norm": 0.6115702018377079, + "learning_rate": 1.7189501409486061e-06, + "loss": 0.0685, + "step": 1055 + }, + { + "epoch": 9.025641025641026, + "grad_norm": 0.5787625117983263, + "learning_rate": 1.7147002017610626e-06, + "loss": 0.1274, + "step": 1056 + }, + { + "epoch": 9.034188034188034, + "grad_norm": 0.6270696251364306, + "learning_rate": 1.7104527789804554e-06, + "loss": 0.0868, + "step": 1057 + }, + { + "epoch": 9.042735042735043, + "grad_norm": 0.543963227824456, + "learning_rate": 1.7062078862171838e-06, + "loss": 0.1412, + "step": 1058 + }, + { + "epoch": 9.051282051282051, + "grad_norm": 0.7749740607472595, + "learning_rate": 1.7019655370735428e-06, + "loss": 0.1233, + "step": 1059 + }, + { + "epoch": 9.05982905982906, + "grad_norm": 0.9217914154541708, + "learning_rate": 1.6977257451436714e-06, + "loss": 0.1321, + "step": 1060 + }, + { + "epoch": 9.068376068376068, + "grad_norm": 0.6165308553771175, + "learning_rate": 1.6934885240135179e-06, + "loss": 0.0547, + "step": 1061 + }, + { + "epoch": 9.076923076923077, + "grad_norm": 0.5919180728326213, + "learning_rate": 1.6892538872607936e-06, + "loss": 0.0612, + "step": 1062 + }, + { + "epoch": 9.085470085470085, + "grad_norm": 1.0790808763695527, + "learning_rate": 1.6850218484549247e-06, + "loss": 0.0709, + "step": 1063 + }, + { + "epoch": 9.094017094017094, + "grad_norm": 1.0528333030837587, + "learning_rate": 1.6807924211570151e-06, + "loss": 0.1405, + "step": 1064 + }, + { + "epoch": 9.102564102564102, + "grad_norm": 0.7128081472757782, + "learning_rate": 1.6765656189198013e-06, + "loss": 0.0618, + "step": 1065 + }, + { + "epoch": 9.11111111111111, + "grad_norm": 0.3276318721882367, + "learning_rate": 1.6723414552876052e-06, + "loss": 0.0148, + "step": 1066 + }, + { + "epoch": 9.11965811965812, + "grad_norm": 0.5501475064232987, + "learning_rate": 1.6681199437962952e-06, + "loss": 0.023, + "step": 1067 + }, + { + "epoch": 9.128205128205128, + "grad_norm": 1.0861109374111326, + "learning_rate": 1.663901097973243e-06, + "loss": 0.2336, + "step": 1068 + }, + { + "epoch": 9.136752136752136, + "grad_norm": 1.0153934263214417, + "learning_rate": 1.6596849313372737e-06, + "loss": 0.0865, + "step": 1069 + }, + { + "epoch": 9.145299145299145, + "grad_norm": 0.818922631869137, + "learning_rate": 1.6554714573986325e-06, + "loss": 0.172, + "step": 1070 + }, + { + "epoch": 9.153846153846153, + "grad_norm": 1.0151873298589587, + "learning_rate": 1.6512606896589323e-06, + "loss": 0.1137, + "step": 1071 + }, + { + "epoch": 9.162393162393162, + "grad_norm": 0.8559221976647785, + "learning_rate": 1.647052641611117e-06, + "loss": 0.3052, + "step": 1072 + }, + { + "epoch": 9.17094017094017, + "grad_norm": 0.6823398560394819, + "learning_rate": 1.6428473267394135e-06, + "loss": 0.0456, + "step": 1073 + }, + { + "epoch": 9.179487179487179, + "grad_norm": 0.7232126386293811, + "learning_rate": 1.6386447585192911e-06, + "loss": 0.0662, + "step": 1074 + }, + { + "epoch": 9.188034188034187, + "grad_norm": 0.8760431001553861, + "learning_rate": 1.6344449504174193e-06, + "loss": 0.0734, + "step": 1075 + }, + { + "epoch": 9.196581196581196, + "grad_norm": 0.7741351713881812, + "learning_rate": 1.63024791589162e-06, + "loss": 0.096, + "step": 1076 + }, + { + "epoch": 9.205128205128204, + "grad_norm": 0.5196258263774313, + "learning_rate": 1.6260536683908302e-06, + "loss": 0.0829, + "step": 1077 + }, + { + "epoch": 9.213675213675213, + "grad_norm": 0.8037020755326458, + "learning_rate": 1.621862221355056e-06, + "loss": 0.1224, + "step": 1078 + }, + { + "epoch": 9.222222222222221, + "grad_norm": 0.7282477385753159, + "learning_rate": 1.6176735882153284e-06, + "loss": 0.1329, + "step": 1079 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 0.7135098215001101, + "learning_rate": 1.613487782393661e-06, + "loss": 0.1125, + "step": 1080 + }, + { + "epoch": 9.239316239316238, + "grad_norm": 0.49568130995063514, + "learning_rate": 1.6093048173030108e-06, + "loss": 0.0488, + "step": 1081 + }, + { + "epoch": 9.247863247863247, + "grad_norm": 0.6156142870138479, + "learning_rate": 1.6051247063472286e-06, + "loss": 0.1364, + "step": 1082 + }, + { + "epoch": 9.256410256410255, + "grad_norm": 0.6682930787471676, + "learning_rate": 1.6009474629210202e-06, + "loss": 0.0712, + "step": 1083 + }, + { + "epoch": 9.264957264957266, + "grad_norm": 0.6640845493688758, + "learning_rate": 1.5967731004099057e-06, + "loss": 0.0641, + "step": 1084 + }, + { + "epoch": 9.273504273504274, + "grad_norm": 0.7636323793775699, + "learning_rate": 1.5926016321901688e-06, + "loss": 0.0429, + "step": 1085 + }, + { + "epoch": 9.282051282051283, + "grad_norm": 0.8877230380963692, + "learning_rate": 1.5884330716288215e-06, + "loss": 0.0643, + "step": 1086 + }, + { + "epoch": 9.290598290598291, + "grad_norm": 0.33476740323934123, + "learning_rate": 1.5842674320835598e-06, + "loss": 0.0205, + "step": 1087 + }, + { + "epoch": 9.2991452991453, + "grad_norm": 0.6572203225317508, + "learning_rate": 1.5801047269027164e-06, + "loss": 0.0686, + "step": 1088 + }, + { + "epoch": 9.307692307692308, + "grad_norm": 0.9194169349218544, + "learning_rate": 1.5759449694252226e-06, + "loss": 0.0975, + "step": 1089 + }, + { + "epoch": 9.316239316239317, + "grad_norm": 0.7860733352724719, + "learning_rate": 1.571788172980566e-06, + "loss": 0.1256, + "step": 1090 + }, + { + "epoch": 9.324786324786325, + "grad_norm": 0.35704020366142286, + "learning_rate": 1.567634350888743e-06, + "loss": 0.0108, + "step": 1091 + }, + { + "epoch": 9.333333333333334, + "grad_norm": 0.5165940959681908, + "learning_rate": 1.56348351646022e-06, + "loss": 0.0392, + "step": 1092 + }, + { + "epoch": 9.341880341880342, + "grad_norm": 1.0167691098950664, + "learning_rate": 1.5593356829958906e-06, + "loss": 0.1348, + "step": 1093 + }, + { + "epoch": 9.350427350427351, + "grad_norm": 0.7949727993522546, + "learning_rate": 1.5551908637870316e-06, + "loss": 0.0468, + "step": 1094 + }, + { + "epoch": 9.35897435897436, + "grad_norm": 0.6917120775734487, + "learning_rate": 1.5510490721152594e-06, + "loss": 0.1252, + "step": 1095 + }, + { + "epoch": 9.367521367521368, + "grad_norm": 0.563045044386508, + "learning_rate": 1.5469103212524917e-06, + "loss": 0.0396, + "step": 1096 + }, + { + "epoch": 9.376068376068377, + "grad_norm": 1.189868604604147, + "learning_rate": 1.5427746244609015e-06, + "loss": 0.1178, + "step": 1097 + }, + { + "epoch": 9.384615384615385, + "grad_norm": 0.8655690423744861, + "learning_rate": 1.5386419949928732e-06, + "loss": 0.1011, + "step": 1098 + }, + { + "epoch": 9.393162393162394, + "grad_norm": 0.27480320961328897, + "learning_rate": 1.534512446090965e-06, + "loss": 0.0076, + "step": 1099 + }, + { + "epoch": 9.401709401709402, + "grad_norm": 0.898643735539036, + "learning_rate": 1.5303859909878632e-06, + "loss": 0.0732, + "step": 1100 + }, + { + "epoch": 9.41025641025641, + "grad_norm": 0.6781316543310858, + "learning_rate": 1.5262626429063385e-06, + "loss": 0.0258, + "step": 1101 + }, + { + "epoch": 9.418803418803419, + "grad_norm": 0.5356639554354824, + "learning_rate": 1.5221424150592078e-06, + "loss": 0.0224, + "step": 1102 + }, + { + "epoch": 9.427350427350428, + "grad_norm": 0.7036579428853127, + "learning_rate": 1.518025320649289e-06, + "loss": 0.1014, + "step": 1103 + }, + { + "epoch": 9.435897435897436, + "grad_norm": 0.8127569945947543, + "learning_rate": 1.5139113728693575e-06, + "loss": 0.0729, + "step": 1104 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 0.4071231607971864, + "learning_rate": 1.509800584902108e-06, + "loss": 0.0244, + "step": 1105 + }, + { + "epoch": 9.452991452991453, + "grad_norm": 0.8364670311624951, + "learning_rate": 1.5056929699201095e-06, + "loss": 0.0945, + "step": 1106 + }, + { + "epoch": 9.461538461538462, + "grad_norm": 0.4954553788829718, + "learning_rate": 1.5015885410857617e-06, + "loss": 0.0606, + "step": 1107 + }, + { + "epoch": 9.47008547008547, + "grad_norm": 0.5658789103410351, + "learning_rate": 1.4974873115512561e-06, + "loss": 0.0747, + "step": 1108 + }, + { + "epoch": 9.478632478632479, + "grad_norm": 0.7649082423627214, + "learning_rate": 1.4933892944585331e-06, + "loss": 0.0679, + "step": 1109 + }, + { + "epoch": 9.487179487179487, + "grad_norm": 0.47749330261738837, + "learning_rate": 1.489294502939238e-06, + "loss": 0.0497, + "step": 1110 + }, + { + "epoch": 9.495726495726496, + "grad_norm": 0.5052962598593574, + "learning_rate": 1.4852029501146797e-06, + "loss": 0.077, + "step": 1111 + }, + { + "epoch": 9.504273504273504, + "grad_norm": 0.8130091469943291, + "learning_rate": 1.4811146490957903e-06, + "loss": 0.1415, + "step": 1112 + }, + { + "epoch": 9.512820512820513, + "grad_norm": 0.8200553661974441, + "learning_rate": 1.477029612983082e-06, + "loss": 0.0908, + "step": 1113 + }, + { + "epoch": 9.521367521367521, + "grad_norm": 0.4874352736293081, + "learning_rate": 1.4729478548666027e-06, + "loss": 0.0833, + "step": 1114 + }, + { + "epoch": 9.52991452991453, + "grad_norm": 1.0044705325481182, + "learning_rate": 1.468869387825899e-06, + "loss": 0.1289, + "step": 1115 + }, + { + "epoch": 9.538461538461538, + "grad_norm": 0.6176728966289926, + "learning_rate": 1.4647942249299708e-06, + "loss": 0.0646, + "step": 1116 + }, + { + "epoch": 9.547008547008547, + "grad_norm": 0.2590378983739352, + "learning_rate": 1.4607223792372283e-06, + "loss": 0.0066, + "step": 1117 + }, + { + "epoch": 9.555555555555555, + "grad_norm": 0.9995222239324348, + "learning_rate": 1.4566538637954556e-06, + "loss": 0.1536, + "step": 1118 + }, + { + "epoch": 9.564102564102564, + "grad_norm": 1.0559167819192246, + "learning_rate": 1.452588691641763e-06, + "loss": 0.112, + "step": 1119 + }, + { + "epoch": 9.572649572649572, + "grad_norm": 1.040684684533516, + "learning_rate": 1.4485268758025467e-06, + "loss": 0.2305, + "step": 1120 + }, + { + "epoch": 9.581196581196581, + "grad_norm": 0.813868331480055, + "learning_rate": 1.4444684292934508e-06, + "loss": 0.0973, + "step": 1121 + }, + { + "epoch": 9.58974358974359, + "grad_norm": 0.9200586391191897, + "learning_rate": 1.4404133651193214e-06, + "loss": 0.0995, + "step": 1122 + }, + { + "epoch": 9.598290598290598, + "grad_norm": 0.7501002335879925, + "learning_rate": 1.436361696274166e-06, + "loss": 0.1735, + "step": 1123 + }, + { + "epoch": 9.606837606837606, + "grad_norm": 0.6965668620221329, + "learning_rate": 1.4323134357411114e-06, + "loss": 0.0925, + "step": 1124 + }, + { + "epoch": 9.615384615384615, + "grad_norm": 0.5897020132686239, + "learning_rate": 1.4282685964923643e-06, + "loss": 0.0364, + "step": 1125 + }, + { + "epoch": 9.623931623931623, + "grad_norm": 0.4921094546193598, + "learning_rate": 1.4242271914891688e-06, + "loss": 0.0612, + "step": 1126 + }, + { + "epoch": 9.632478632478632, + "grad_norm": 0.674541085706552, + "learning_rate": 1.4201892336817616e-06, + "loss": 0.1126, + "step": 1127 + }, + { + "epoch": 9.64102564102564, + "grad_norm": 0.9111233476933359, + "learning_rate": 1.4161547360093364e-06, + "loss": 0.0828, + "step": 1128 + }, + { + "epoch": 9.649572649572649, + "grad_norm": 0.7087024813552596, + "learning_rate": 1.4121237113999975e-06, + "loss": 0.1226, + "step": 1129 + }, + { + "epoch": 9.658119658119658, + "grad_norm": 0.6092477040379468, + "learning_rate": 1.4080961727707185e-06, + "loss": 0.0601, + "step": 1130 + }, + { + "epoch": 9.666666666666666, + "grad_norm": 0.6395452684949057, + "learning_rate": 1.4040721330273063e-06, + "loss": 0.0502, + "step": 1131 + }, + { + "epoch": 9.675213675213675, + "grad_norm": 0.5101956373827701, + "learning_rate": 1.4000516050643549e-06, + "loss": 0.032, + "step": 1132 + }, + { + "epoch": 9.683760683760683, + "grad_norm": 0.5625589364023198, + "learning_rate": 1.3960346017652027e-06, + "loss": 0.0248, + "step": 1133 + }, + { + "epoch": 9.692307692307692, + "grad_norm": 0.6634624344092164, + "learning_rate": 1.3920211360018971e-06, + "loss": 0.0512, + "step": 1134 + }, + { + "epoch": 9.7008547008547, + "grad_norm": 0.7224811521885878, + "learning_rate": 1.3880112206351476e-06, + "loss": 0.0983, + "step": 1135 + }, + { + "epoch": 9.709401709401709, + "grad_norm": 0.5523514956679569, + "learning_rate": 1.3840048685142863e-06, + "loss": 0.1096, + "step": 1136 + }, + { + "epoch": 9.717948717948717, + "grad_norm": 0.6629443177230425, + "learning_rate": 1.3800020924772295e-06, + "loss": 0.0514, + "step": 1137 + }, + { + "epoch": 9.726495726495726, + "grad_norm": 0.500052262142771, + "learning_rate": 1.3760029053504346e-06, + "loss": 0.0756, + "step": 1138 + }, + { + "epoch": 9.735042735042736, + "grad_norm": 0.7645112724299366, + "learning_rate": 1.372007319948855e-06, + "loss": 0.1317, + "step": 1139 + }, + { + "epoch": 9.743589743589745, + "grad_norm": 0.4826084146079522, + "learning_rate": 1.3680153490759074e-06, + "loss": 0.027, + "step": 1140 + }, + { + "epoch": 9.752136752136753, + "grad_norm": 0.6166594006265274, + "learning_rate": 1.3640270055234227e-06, + "loss": 0.1439, + "step": 1141 + }, + { + "epoch": 9.760683760683762, + "grad_norm": 0.912758359405943, + "learning_rate": 1.360042302071609e-06, + "loss": 0.0884, + "step": 1142 + }, + { + "epoch": 9.76923076923077, + "grad_norm": 0.8898677341687786, + "learning_rate": 1.356061251489012e-06, + "loss": 0.1925, + "step": 1143 + }, + { + "epoch": 9.777777777777779, + "grad_norm": 0.794175190514908, + "learning_rate": 1.3520838665324704e-06, + "loss": 0.067, + "step": 1144 + }, + { + "epoch": 9.786324786324787, + "grad_norm": 0.5056181154976642, + "learning_rate": 1.3481101599470794e-06, + "loss": 0.0584, + "step": 1145 + }, + { + "epoch": 9.794871794871796, + "grad_norm": 0.7530316981449607, + "learning_rate": 1.3441401444661416e-06, + "loss": 0.1078, + "step": 1146 + }, + { + "epoch": 9.803418803418804, + "grad_norm": 0.46970294469799134, + "learning_rate": 1.3401738328111374e-06, + "loss": 0.0502, + "step": 1147 + }, + { + "epoch": 9.811965811965813, + "grad_norm": 0.4670561098785269, + "learning_rate": 1.336211237691678e-06, + "loss": 0.08, + "step": 1148 + }, + { + "epoch": 9.820512820512821, + "grad_norm": 0.9544088217636255, + "learning_rate": 1.3322523718054615e-06, + "loss": 0.0836, + "step": 1149 + }, + { + "epoch": 9.82905982905983, + "grad_norm": 0.9109700464158846, + "learning_rate": 1.328297247838241e-06, + "loss": 0.0952, + "step": 1150 + }, + { + "epoch": 9.837606837606838, + "grad_norm": 0.7733074193148459, + "learning_rate": 1.3243458784637763e-06, + "loss": 0.144, + "step": 1151 + }, + { + "epoch": 9.846153846153847, + "grad_norm": 0.7022197374436578, + "learning_rate": 1.320398276343795e-06, + "loss": 0.134, + "step": 1152 + }, + { + "epoch": 9.854700854700855, + "grad_norm": 0.9098047058181238, + "learning_rate": 1.3164544541279555e-06, + "loss": 0.1418, + "step": 1153 + }, + { + "epoch": 9.863247863247864, + "grad_norm": 0.532927177425754, + "learning_rate": 1.3125144244538038e-06, + "loss": 0.0912, + "step": 1154 + }, + { + "epoch": 9.871794871794872, + "grad_norm": 0.5578707839247825, + "learning_rate": 1.3085781999467303e-06, + "loss": 0.0474, + "step": 1155 + }, + { + "epoch": 9.88034188034188, + "grad_norm": 1.0787188197476811, + "learning_rate": 1.304645793219936e-06, + "loss": 0.1303, + "step": 1156 + }, + { + "epoch": 9.88888888888889, + "grad_norm": 0.5720067454545272, + "learning_rate": 1.3007172168743854e-06, + "loss": 0.0295, + "step": 1157 + }, + { + "epoch": 9.897435897435898, + "grad_norm": 0.6813484673780763, + "learning_rate": 1.2967924834987687e-06, + "loss": 0.134, + "step": 1158 + }, + { + "epoch": 9.905982905982906, + "grad_norm": 0.4114842099127276, + "learning_rate": 1.2928716056694637e-06, + "loss": 0.0094, + "step": 1159 + }, + { + "epoch": 9.914529914529915, + "grad_norm": 0.8382263936677936, + "learning_rate": 1.288954595950494e-06, + "loss": 0.1134, + "step": 1160 + }, + { + "epoch": 9.923076923076923, + "grad_norm": 1.1972460246350214, + "learning_rate": 1.285041466893485e-06, + "loss": 0.0304, + "step": 1161 + }, + { + "epoch": 9.931623931623932, + "grad_norm": 0.9141699196867842, + "learning_rate": 1.2811322310376303e-06, + "loss": 0.1376, + "step": 1162 + }, + { + "epoch": 9.94017094017094, + "grad_norm": 0.9960126860177222, + "learning_rate": 1.2772269009096456e-06, + "loss": 0.1626, + "step": 1163 + }, + { + "epoch": 9.948717948717949, + "grad_norm": 0.5481120652670447, + "learning_rate": 1.2733254890237334e-06, + "loss": 0.0397, + "step": 1164 + }, + { + "epoch": 9.957264957264957, + "grad_norm": 0.7753907538829564, + "learning_rate": 1.2694280078815382e-06, + "loss": 0.0258, + "step": 1165 + }, + { + "epoch": 9.965811965811966, + "grad_norm": 0.3447580989241403, + "learning_rate": 1.2655344699721111e-06, + "loss": 0.03, + "step": 1166 + }, + { + "epoch": 9.974358974358974, + "grad_norm": 0.5310755009164717, + "learning_rate": 1.2616448877718672e-06, + "loss": 0.0356, + "step": 1167 + }, + { + "epoch": 9.982905982905983, + "grad_norm": 1.1489485415815224, + "learning_rate": 1.257759273744545e-06, + "loss": 0.1356, + "step": 1168 + }, + { + "epoch": 9.991452991452991, + "grad_norm": 0.8228009876608517, + "learning_rate": 1.253877640341166e-06, + "loss": 0.143, + "step": 1169 + }, + { + "epoch": 10.0, + "grad_norm": 0.4149657424465853, + "learning_rate": 1.2500000000000007e-06, + "loss": 0.0409, + "step": 1170 + }, + { + "epoch": 10.008547008547009, + "grad_norm": 0.7528373835054892, + "learning_rate": 1.2461263651465194e-06, + "loss": 0.072, + "step": 1171 + }, + { + "epoch": 10.017094017094017, + "grad_norm": 0.5693118765989197, + "learning_rate": 1.2422567481933604e-06, + "loss": 0.0482, + "step": 1172 + }, + { + "epoch": 10.025641025641026, + "grad_norm": 0.8967770899898463, + "learning_rate": 1.2383911615402873e-06, + "loss": 0.0923, + "step": 1173 + }, + { + "epoch": 10.034188034188034, + "grad_norm": 0.8133267721334558, + "learning_rate": 1.2345296175741466e-06, + "loss": 0.119, + "step": 1174 + }, + { + "epoch": 10.042735042735043, + "grad_norm": 0.27346171070876685, + "learning_rate": 1.2306721286688312e-06, + "loss": 0.0074, + "step": 1175 + }, + { + "epoch": 10.051282051282051, + "grad_norm": 0.47296973060426656, + "learning_rate": 1.226818707185242e-06, + "loss": 0.0555, + "step": 1176 + }, + { + "epoch": 10.05982905982906, + "grad_norm": 0.7330167223259335, + "learning_rate": 1.2229693654712433e-06, + "loss": 0.129, + "step": 1177 + }, + { + "epoch": 10.068376068376068, + "grad_norm": 0.5383813730777611, + "learning_rate": 1.2191241158616284e-06, + "loss": 0.0502, + "step": 1178 + }, + { + "epoch": 10.076923076923077, + "grad_norm": 0.451721049722562, + "learning_rate": 1.2152829706780786e-06, + "loss": 0.0178, + "step": 1179 + }, + { + "epoch": 10.085470085470085, + "grad_norm": 0.8743661988137066, + "learning_rate": 1.2114459422291205e-06, + "loss": 0.0818, + "step": 1180 + }, + { + "epoch": 10.094017094017094, + "grad_norm": 0.4089134604414421, + "learning_rate": 1.2076130428100894e-06, + "loss": 0.0176, + "step": 1181 + }, + { + "epoch": 10.102564102564102, + "grad_norm": 0.6019224728360073, + "learning_rate": 1.203784284703091e-06, + "loss": 0.1036, + "step": 1182 + }, + { + "epoch": 10.11111111111111, + "grad_norm": 0.6988787974863384, + "learning_rate": 1.1999596801769617e-06, + "loss": 0.0825, + "step": 1183 + }, + { + "epoch": 10.11965811965812, + "grad_norm": 0.3663857880245144, + "learning_rate": 1.196139241487225e-06, + "loss": 0.0363, + "step": 1184 + }, + { + "epoch": 10.128205128205128, + "grad_norm": 0.6280681792226723, + "learning_rate": 1.1923229808760565e-06, + "loss": 0.0776, + "step": 1185 + }, + { + "epoch": 10.136752136752136, + "grad_norm": 0.39634122729164556, + "learning_rate": 1.1885109105722454e-06, + "loss": 0.0116, + "step": 1186 + }, + { + "epoch": 10.145299145299145, + "grad_norm": 0.5081856806271968, + "learning_rate": 1.184703042791151e-06, + "loss": 0.0541, + "step": 1187 + }, + { + "epoch": 10.153846153846153, + "grad_norm": 0.5514164069512796, + "learning_rate": 1.1808993897346679e-06, + "loss": 0.0355, + "step": 1188 + }, + { + "epoch": 10.162393162393162, + "grad_norm": 0.3944092444525037, + "learning_rate": 1.1770999635911857e-06, + "loss": 0.0263, + "step": 1189 + }, + { + "epoch": 10.17094017094017, + "grad_norm": 0.4668680631140163, + "learning_rate": 1.1733047765355466e-06, + "loss": 0.0283, + "step": 1190 + }, + { + "epoch": 10.179487179487179, + "grad_norm": 0.48820262814502324, + "learning_rate": 1.1695138407290101e-06, + "loss": 0.0519, + "step": 1191 + }, + { + "epoch": 10.188034188034187, + "grad_norm": 0.8782440276516138, + "learning_rate": 1.1657271683192156e-06, + "loss": 0.1342, + "step": 1192 + }, + { + "epoch": 10.196581196581196, + "grad_norm": 0.6483304893287919, + "learning_rate": 1.1619447714401367e-06, + "loss": 0.0909, + "step": 1193 + }, + { + "epoch": 10.205128205128204, + "grad_norm": 0.9154554603400102, + "learning_rate": 1.1581666622120494e-06, + "loss": 0.1041, + "step": 1194 + }, + { + "epoch": 10.213675213675213, + "grad_norm": 0.8140879592063411, + "learning_rate": 1.154392852741491e-06, + "loss": 0.1302, + "step": 1195 + }, + { + "epoch": 10.222222222222221, + "grad_norm": 0.632195949813156, + "learning_rate": 1.1506233551212186e-06, + "loss": 0.0345, + "step": 1196 + }, + { + "epoch": 10.23076923076923, + "grad_norm": 0.6331347366030665, + "learning_rate": 1.1468581814301718e-06, + "loss": 0.0605, + "step": 1197 + }, + { + "epoch": 10.239316239316238, + "grad_norm": 0.5852831167921697, + "learning_rate": 1.1430973437334375e-06, + "loss": 0.0595, + "step": 1198 + }, + { + "epoch": 10.247863247863247, + "grad_norm": 0.5502452330425632, + "learning_rate": 1.1393408540822073e-06, + "loss": 0.0947, + "step": 1199 + }, + { + "epoch": 10.256410256410255, + "grad_norm": 0.6661243529811738, + "learning_rate": 1.1355887245137383e-06, + "loss": 0.1082, + "step": 1200 + }, + { + "epoch": 10.264957264957266, + "grad_norm": 0.692366618170434, + "learning_rate": 1.1318409670513194e-06, + "loss": 0.0552, + "step": 1201 + }, + { + "epoch": 10.273504273504274, + "grad_norm": 0.41183248684256724, + "learning_rate": 1.1280975937042263e-06, + "loss": 0.0138, + "step": 1202 + }, + { + "epoch": 10.282051282051283, + "grad_norm": 0.3922345475410547, + "learning_rate": 1.1243586164676873e-06, + "loss": 0.0332, + "step": 1203 + }, + { + "epoch": 10.290598290598291, + "grad_norm": 0.4986802355158306, + "learning_rate": 1.1206240473228447e-06, + "loss": 0.0464, + "step": 1204 + }, + { + "epoch": 10.2991452991453, + "grad_norm": 0.5761580848948239, + "learning_rate": 1.1168938982367162e-06, + "loss": 0.1282, + "step": 1205 + }, + { + "epoch": 10.307692307692308, + "grad_norm": 0.48963370949226304, + "learning_rate": 1.1131681811621529e-06, + "loss": 0.0494, + "step": 1206 + }, + { + "epoch": 10.316239316239317, + "grad_norm": 0.30709075253427076, + "learning_rate": 1.1094469080378076e-06, + "loss": 0.0372, + "step": 1207 + }, + { + "epoch": 10.324786324786325, + "grad_norm": 0.363856094256232, + "learning_rate": 1.1057300907880904e-06, + "loss": 0.0152, + "step": 1208 + }, + { + "epoch": 10.333333333333334, + "grad_norm": 0.3184443824187863, + "learning_rate": 1.1020177413231334e-06, + "loss": 0.0366, + "step": 1209 + }, + { + "epoch": 10.341880341880342, + "grad_norm": 0.2350464111005662, + "learning_rate": 1.0983098715387528e-06, + "loss": 0.0048, + "step": 1210 + }, + { + "epoch": 10.350427350427351, + "grad_norm": 1.2205776925052512, + "learning_rate": 1.0946064933164117e-06, + "loss": 0.0724, + "step": 1211 + }, + { + "epoch": 10.35897435897436, + "grad_norm": 0.7312356567575174, + "learning_rate": 1.0909076185231762e-06, + "loss": 0.1381, + "step": 1212 + }, + { + "epoch": 10.367521367521368, + "grad_norm": 0.5863298364245818, + "learning_rate": 1.0872132590116866e-06, + "loss": 0.052, + "step": 1213 + }, + { + "epoch": 10.376068376068377, + "grad_norm": 0.3717821624930494, + "learning_rate": 1.0835234266201109e-06, + "loss": 0.0242, + "step": 1214 + }, + { + "epoch": 10.384615384615385, + "grad_norm": 0.1928478705969336, + "learning_rate": 1.079838133172111e-06, + "loss": 0.0049, + "step": 1215 + }, + { + "epoch": 10.393162393162394, + "grad_norm": 0.32877528239235754, + "learning_rate": 1.0761573904768054e-06, + "loss": 0.0143, + "step": 1216 + }, + { + "epoch": 10.401709401709402, + "grad_norm": 0.702253447542617, + "learning_rate": 1.0724812103287304e-06, + "loss": 0.0888, + "step": 1217 + }, + { + "epoch": 10.41025641025641, + "grad_norm": 0.4895957665769553, + "learning_rate": 1.0688096045078023e-06, + "loss": 0.027, + "step": 1218 + }, + { + "epoch": 10.418803418803419, + "grad_norm": 0.5389561115166098, + "learning_rate": 1.0651425847792767e-06, + "loss": 0.042, + "step": 1219 + }, + { + "epoch": 10.427350427350428, + "grad_norm": 0.32986659917452915, + "learning_rate": 1.061480162893716e-06, + "loss": 0.0118, + "step": 1220 + }, + { + "epoch": 10.435897435897436, + "grad_norm": 0.5670031788128372, + "learning_rate": 1.0578223505869494e-06, + "loss": 0.0561, + "step": 1221 + }, + { + "epoch": 10.444444444444445, + "grad_norm": 0.9213400434155533, + "learning_rate": 1.0541691595800338e-06, + "loss": 0.1897, + "step": 1222 + }, + { + "epoch": 10.452991452991453, + "grad_norm": 0.7140637315596772, + "learning_rate": 1.0505206015792194e-06, + "loss": 0.1617, + "step": 1223 + }, + { + "epoch": 10.461538461538462, + "grad_norm": 0.6165789747562992, + "learning_rate": 1.0468766882759094e-06, + "loss": 0.1076, + "step": 1224 + }, + { + "epoch": 10.47008547008547, + "grad_norm": 0.47363819608468016, + "learning_rate": 1.043237431346622e-06, + "loss": 0.0485, + "step": 1225 + }, + { + "epoch": 10.478632478632479, + "grad_norm": 0.34733303744291694, + "learning_rate": 1.0396028424529578e-06, + "loss": 0.014, + "step": 1226 + }, + { + "epoch": 10.487179487179487, + "grad_norm": 0.5114382942206728, + "learning_rate": 1.0359729332415582e-06, + "loss": 0.0347, + "step": 1227 + }, + { + "epoch": 10.495726495726496, + "grad_norm": 1.1524497729712029, + "learning_rate": 1.032347715344067e-06, + "loss": 0.1512, + "step": 1228 + }, + { + "epoch": 10.504273504273504, + "grad_norm": 0.39107233323596885, + "learning_rate": 1.0287272003770982e-06, + "loss": 0.0251, + "step": 1229 + }, + { + "epoch": 10.512820512820513, + "grad_norm": 0.5521904639634848, + "learning_rate": 1.0251113999421936e-06, + "loss": 0.0487, + "step": 1230 + }, + { + "epoch": 10.521367521367521, + "grad_norm": 0.865100144503052, + "learning_rate": 1.0215003256257874e-06, + "loss": 0.031, + "step": 1231 + }, + { + "epoch": 10.52991452991453, + "grad_norm": 0.7486109733499217, + "learning_rate": 1.0178939889991717e-06, + "loss": 0.0586, + "step": 1232 + }, + { + "epoch": 10.538461538461538, + "grad_norm": 0.5210171135789035, + "learning_rate": 1.014292401618457e-06, + "loss": 0.0453, + "step": 1233 + }, + { + "epoch": 10.547008547008547, + "grad_norm": 0.6957074639868348, + "learning_rate": 1.0106955750245323e-06, + "loss": 0.0592, + "step": 1234 + }, + { + "epoch": 10.555555555555555, + "grad_norm": 0.46970468661795917, + "learning_rate": 1.0071035207430352e-06, + "loss": 0.0371, + "step": 1235 + }, + { + "epoch": 10.564102564102564, + "grad_norm": 0.9700613365583296, + "learning_rate": 1.0035162502843073e-06, + "loss": 0.1562, + "step": 1236 + }, + { + "epoch": 10.572649572649572, + "grad_norm": 0.5923127833018439, + "learning_rate": 9.999337751433643e-07, + "loss": 0.0958, + "step": 1237 + }, + { + "epoch": 10.581196581196581, + "grad_norm": 0.7836272236314462, + "learning_rate": 9.963561067998531e-07, + "loss": 0.1466, + "step": 1238 + }, + { + "epoch": 10.58974358974359, + "grad_norm": 0.5903667674523515, + "learning_rate": 9.927832567180193e-07, + "loss": 0.0754, + "step": 1239 + }, + { + "epoch": 10.598290598290598, + "grad_norm": 0.5989987601274536, + "learning_rate": 9.892152363466692e-07, + "loss": 0.083, + "step": 1240 + }, + { + "epoch": 10.606837606837606, + "grad_norm": 0.7580993772394548, + "learning_rate": 9.856520571191316e-07, + "loss": 0.0482, + "step": 1241 + }, + { + "epoch": 10.615384615384615, + "grad_norm": 0.6157661057336996, + "learning_rate": 9.820937304532221e-07, + "loss": 0.0659, + "step": 1242 + }, + { + "epoch": 10.623931623931623, + "grad_norm": 0.3535321781443096, + "learning_rate": 9.78540267751209e-07, + "loss": 0.0128, + "step": 1243 + }, + { + "epoch": 10.632478632478632, + "grad_norm": 0.5739629799831032, + "learning_rate": 9.749916803997717e-07, + "loss": 0.1112, + "step": 1244 + }, + { + "epoch": 10.64102564102564, + "grad_norm": 0.5198724203896348, + "learning_rate": 9.714479797699695e-07, + "loss": 0.0846, + "step": 1245 + }, + { + "epoch": 10.649572649572649, + "grad_norm": 1.0956938439727597, + "learning_rate": 9.679091772172021e-07, + "loss": 0.0334, + "step": 1246 + }, + { + "epoch": 10.658119658119658, + "grad_norm": 0.6896219245087443, + "learning_rate": 9.643752840811734e-07, + "loss": 0.1022, + "step": 1247 + }, + { + "epoch": 10.666666666666666, + "grad_norm": 0.7358048567920182, + "learning_rate": 9.608463116858544e-07, + "loss": 0.1266, + "step": 1248 + }, + { + "epoch": 10.675213675213675, + "grad_norm": 0.6241086544923451, + "learning_rate": 9.573222713394513e-07, + "loss": 0.0542, + "step": 1249 + }, + { + "epoch": 10.683760683760683, + "grad_norm": 0.7913785676217334, + "learning_rate": 9.538031743343628e-07, + "loss": 0.2033, + "step": 1250 + }, + { + "epoch": 10.692307692307692, + "grad_norm": 0.6768475684484199, + "learning_rate": 9.502890319471491e-07, + "loss": 0.1312, + "step": 1251 + }, + { + "epoch": 10.7008547008547, + "grad_norm": 0.369422414370594, + "learning_rate": 9.467798554384946e-07, + "loss": 0.0389, + "step": 1252 + }, + { + "epoch": 10.709401709401709, + "grad_norm": 0.5810099654433328, + "learning_rate": 9.432756560531691e-07, + "loss": 0.1071, + "step": 1253 + }, + { + "epoch": 10.717948717948717, + "grad_norm": 0.41263470769639693, + "learning_rate": 9.397764450199937e-07, + "loss": 0.0129, + "step": 1254 + }, + { + "epoch": 10.726495726495726, + "grad_norm": 0.7689917693636449, + "learning_rate": 9.362822335518062e-07, + "loss": 0.1342, + "step": 1255 + }, + { + "epoch": 10.735042735042736, + "grad_norm": 0.3029264104977626, + "learning_rate": 9.327930328454249e-07, + "loss": 0.0098, + "step": 1256 + }, + { + "epoch": 10.743589743589745, + "grad_norm": 0.5769986168100306, + "learning_rate": 9.293088540816081e-07, + "loss": 0.0681, + "step": 1257 + }, + { + "epoch": 10.752136752136753, + "grad_norm": 0.6153157026662167, + "learning_rate": 9.258297084250256e-07, + "loss": 0.1117, + "step": 1258 + }, + { + "epoch": 10.760683760683762, + "grad_norm": 0.5001175270228171, + "learning_rate": 9.22355607024217e-07, + "loss": 0.0647, + "step": 1259 + }, + { + "epoch": 10.76923076923077, + "grad_norm": 0.41031875179006044, + "learning_rate": 9.188865610115572e-07, + "loss": 0.049, + "step": 1260 + }, + { + "epoch": 10.777777777777779, + "grad_norm": 0.6200771849234717, + "learning_rate": 9.154225815032242e-07, + "loss": 0.0605, + "step": 1261 + }, + { + "epoch": 10.786324786324787, + "grad_norm": 0.47359609380322704, + "learning_rate": 9.119636795991605e-07, + "loss": 0.0341, + "step": 1262 + }, + { + "epoch": 10.794871794871796, + "grad_norm": 0.5788256097687582, + "learning_rate": 9.085098663830366e-07, + "loss": 0.0482, + "step": 1263 + }, + { + "epoch": 10.803418803418804, + "grad_norm": 0.5159569439778875, + "learning_rate": 9.050611529222167e-07, + "loss": 0.0552, + "step": 1264 + }, + { + "epoch": 10.811965811965813, + "grad_norm": 0.7146916985301787, + "learning_rate": 9.01617550267726e-07, + "loss": 0.108, + "step": 1265 + }, + { + "epoch": 10.820512820512821, + "grad_norm": 0.4995849685364073, + "learning_rate": 8.98179069454209e-07, + "loss": 0.0261, + "step": 1266 + }, + { + "epoch": 10.82905982905983, + "grad_norm": 0.8280784490328196, + "learning_rate": 8.947457214999006e-07, + "loss": 0.096, + "step": 1267 + }, + { + "epoch": 10.837606837606838, + "grad_norm": 0.7122046826322141, + "learning_rate": 8.91317517406588e-07, + "loss": 0.0925, + "step": 1268 + }, + { + "epoch": 10.846153846153847, + "grad_norm": 0.4857030492009978, + "learning_rate": 8.878944681595742e-07, + "loss": 0.0481, + "step": 1269 + }, + { + "epoch": 10.854700854700855, + "grad_norm": 0.561507137918128, + "learning_rate": 8.844765847276432e-07, + "loss": 0.0408, + "step": 1270 + }, + { + "epoch": 10.863247863247864, + "grad_norm": 0.7758353154061892, + "learning_rate": 8.810638780630279e-07, + "loss": 0.1096, + "step": 1271 + }, + { + "epoch": 10.871794871794872, + "grad_norm": 0.515619620193862, + "learning_rate": 8.776563591013729e-07, + "loss": 0.005, + "step": 1272 + }, + { + "epoch": 10.88034188034188, + "grad_norm": 0.9442077883117976, + "learning_rate": 8.742540387616966e-07, + "loss": 0.1571, + "step": 1273 + }, + { + "epoch": 10.88888888888889, + "grad_norm": 1.0257043051233612, + "learning_rate": 8.708569279463622e-07, + "loss": 0.1633, + "step": 1274 + }, + { + "epoch": 10.897435897435898, + "grad_norm": 0.459285913736046, + "learning_rate": 8.674650375410379e-07, + "loss": 0.0502, + "step": 1275 + }, + { + "epoch": 10.905982905982906, + "grad_norm": 0.9103469998042978, + "learning_rate": 8.640783784146625e-07, + "loss": 0.0768, + "step": 1276 + }, + { + "epoch": 10.914529914529915, + "grad_norm": 0.6507237935820749, + "learning_rate": 8.606969614194144e-07, + "loss": 0.1222, + "step": 1277 + }, + { + "epoch": 10.923076923076923, + "grad_norm": 0.5369364732448973, + "learning_rate": 8.573207973906736e-07, + "loss": 0.0935, + "step": 1278 + }, + { + "epoch": 10.931623931623932, + "grad_norm": 0.4727652663160645, + "learning_rate": 8.539498971469848e-07, + "loss": 0.0117, + "step": 1279 + }, + { + "epoch": 10.94017094017094, + "grad_norm": 0.6326787172262908, + "learning_rate": 8.505842714900298e-07, + "loss": 0.127, + "step": 1280 + }, + { + "epoch": 10.948717948717949, + "grad_norm": 0.5802629230404729, + "learning_rate": 8.472239312045851e-07, + "loss": 0.0445, + "step": 1281 + }, + { + "epoch": 10.957264957264957, + "grad_norm": 0.518789507611408, + "learning_rate": 8.438688870584913e-07, + "loss": 0.0894, + "step": 1282 + }, + { + "epoch": 10.965811965811966, + "grad_norm": 0.736324468031325, + "learning_rate": 8.405191498026197e-07, + "loss": 0.0655, + "step": 1283 + }, + { + "epoch": 10.974358974358974, + "grad_norm": 0.7115113974384445, + "learning_rate": 8.371747301708358e-07, + "loss": 0.0763, + "step": 1284 + }, + { + "epoch": 10.982905982905983, + "grad_norm": 0.5593515387241527, + "learning_rate": 8.338356388799637e-07, + "loss": 0.0163, + "step": 1285 + }, + { + "epoch": 10.991452991452991, + "grad_norm": 0.8641104682123674, + "learning_rate": 8.305018866297562e-07, + "loss": 0.0364, + "step": 1286 + }, + { + "epoch": 11.0, + "grad_norm": 0.5402027419043104, + "learning_rate": 8.271734841028553e-07, + "loss": 0.0477, + "step": 1287 + }, + { + "epoch": 11.008547008547009, + "grad_norm": 0.5911273896802919, + "learning_rate": 8.238504419647602e-07, + "loss": 0.0877, + "step": 1288 + }, + { + "epoch": 11.017094017094017, + "grad_norm": 0.33997822462745325, + "learning_rate": 8.205327708637958e-07, + "loss": 0.0254, + "step": 1289 + }, + { + "epoch": 11.025641025641026, + "grad_norm": 0.49353807343817413, + "learning_rate": 8.172204814310741e-07, + "loss": 0.0769, + "step": 1290 + }, + { + "epoch": 11.034188034188034, + "grad_norm": 0.3881327917898042, + "learning_rate": 8.139135842804638e-07, + "loss": 0.0336, + "step": 1291 + }, + { + "epoch": 11.042735042735043, + "grad_norm": 0.691569011124054, + "learning_rate": 8.106120900085526e-07, + "loss": 0.0267, + "step": 1292 + }, + { + "epoch": 11.051282051282051, + "grad_norm": 0.29706237464447505, + "learning_rate": 8.073160091946156e-07, + "loss": 0.0225, + "step": 1293 + }, + { + "epoch": 11.05982905982906, + "grad_norm": 0.5077009563429163, + "learning_rate": 8.040253524005834e-07, + "loss": 0.0514, + "step": 1294 + }, + { + "epoch": 11.068376068376068, + "grad_norm": 0.9891213054149579, + "learning_rate": 8.007401301710022e-07, + "loss": 0.094, + "step": 1295 + }, + { + "epoch": 11.076923076923077, + "grad_norm": 0.41721832260093394, + "learning_rate": 7.974603530330069e-07, + "loss": 0.0428, + "step": 1296 + }, + { + "epoch": 11.085470085470085, + "grad_norm": 0.09346754112243624, + "learning_rate": 7.941860314962843e-07, + "loss": 0.0027, + "step": 1297 + }, + { + "epoch": 11.094017094017094, + "grad_norm": 0.5782118619571033, + "learning_rate": 7.909171760530351e-07, + "loss": 0.0908, + "step": 1298 + }, + { + "epoch": 11.102564102564102, + "grad_norm": 0.37375362031134285, + "learning_rate": 7.876537971779493e-07, + "loss": 0.0469, + "step": 1299 + }, + { + "epoch": 11.11111111111111, + "grad_norm": 0.71782364636737, + "learning_rate": 7.843959053281663e-07, + "loss": 0.1492, + "step": 1300 + }, + { + "epoch": 11.11965811965812, + "grad_norm": 0.6855105794017308, + "learning_rate": 7.811435109432417e-07, + "loss": 0.0959, + "step": 1301 + }, + { + "epoch": 11.128205128205128, + "grad_norm": 0.3250302852365095, + "learning_rate": 7.778966244451169e-07, + "loss": 0.0226, + "step": 1302 + }, + { + "epoch": 11.136752136752136, + "grad_norm": 0.5110437520863196, + "learning_rate": 7.746552562380829e-07, + "loss": 0.0731, + "step": 1303 + }, + { + "epoch": 11.145299145299145, + "grad_norm": 0.4993479038849234, + "learning_rate": 7.714194167087466e-07, + "loss": 0.0406, + "step": 1304 + }, + { + "epoch": 11.153846153846153, + "grad_norm": 0.34456639896835917, + "learning_rate": 7.681891162260016e-07, + "loss": 0.0337, + "step": 1305 + }, + { + "epoch": 11.162393162393162, + "grad_norm": 0.647114798756596, + "learning_rate": 7.649643651409916e-07, + "loss": 0.1163, + "step": 1306 + }, + { + "epoch": 11.17094017094017, + "grad_norm": 0.6597078559115684, + "learning_rate": 7.617451737870754e-07, + "loss": 0.0987, + "step": 1307 + }, + { + "epoch": 11.179487179487179, + "grad_norm": 0.5012062943764185, + "learning_rate": 7.585315524797998e-07, + "loss": 0.046, + "step": 1308 + }, + { + "epoch": 11.188034188034187, + "grad_norm": 0.3789882400899952, + "learning_rate": 7.553235115168598e-07, + "loss": 0.028, + "step": 1309 + }, + { + "epoch": 11.196581196581196, + "grad_norm": 0.18142287558793085, + "learning_rate": 7.521210611780715e-07, + "loss": 0.0058, + "step": 1310 + }, + { + "epoch": 11.205128205128204, + "grad_norm": 0.378803728058699, + "learning_rate": 7.489242117253342e-07, + "loss": 0.0313, + "step": 1311 + }, + { + "epoch": 11.213675213675213, + "grad_norm": 0.4789925234922484, + "learning_rate": 7.457329734026012e-07, + "loss": 0.072, + "step": 1312 + }, + { + "epoch": 11.222222222222221, + "grad_norm": 0.3146472429753484, + "learning_rate": 7.425473564358457e-07, + "loss": 0.0151, + "step": 1313 + }, + { + "epoch": 11.23076923076923, + "grad_norm": 0.34429324177174386, + "learning_rate": 7.393673710330271e-07, + "loss": 0.0272, + "step": 1314 + }, + { + "epoch": 11.239316239316238, + "grad_norm": 0.9134534949643333, + "learning_rate": 7.361930273840581e-07, + "loss": 0.1517, + "step": 1315 + }, + { + "epoch": 11.247863247863247, + "grad_norm": 0.5776250431801766, + "learning_rate": 7.330243356607758e-07, + "loss": 0.0614, + "step": 1316 + }, + { + "epoch": 11.256410256410255, + "grad_norm": 0.6828276002890209, + "learning_rate": 7.298613060169035e-07, + "loss": 0.1427, + "step": 1317 + }, + { + "epoch": 11.264957264957266, + "grad_norm": 0.5870274651894514, + "learning_rate": 7.267039485880225e-07, + "loss": 0.0464, + "step": 1318 + }, + { + "epoch": 11.273504273504274, + "grad_norm": 0.31497786244699366, + "learning_rate": 7.235522734915393e-07, + "loss": 0.0318, + "step": 1319 + }, + { + "epoch": 11.282051282051283, + "grad_norm": 0.38817987373377083, + "learning_rate": 7.204062908266491e-07, + "loss": 0.0418, + "step": 1320 + }, + { + "epoch": 11.290598290598291, + "grad_norm": 0.7395579672928589, + "learning_rate": 7.172660106743073e-07, + "loss": 0.0726, + "step": 1321 + }, + { + "epoch": 11.2991452991453, + "grad_norm": 0.6617468807072606, + "learning_rate": 7.14131443097198e-07, + "loss": 0.1204, + "step": 1322 + }, + { + "epoch": 11.307692307692308, + "grad_norm": 0.39065128812433864, + "learning_rate": 7.110025981396976e-07, + "loss": 0.0464, + "step": 1323 + }, + { + "epoch": 11.316239316239317, + "grad_norm": 0.3353924587905785, + "learning_rate": 7.078794858278462e-07, + "loss": 0.0195, + "step": 1324 + }, + { + "epoch": 11.324786324786325, + "grad_norm": 0.7270883932112006, + "learning_rate": 7.047621161693152e-07, + "loss": 0.1163, + "step": 1325 + }, + { + "epoch": 11.333333333333334, + "grad_norm": 0.2013736973419809, + "learning_rate": 7.016504991533727e-07, + "loss": 0.0045, + "step": 1326 + }, + { + "epoch": 11.341880341880342, + "grad_norm": 0.5377819821360028, + "learning_rate": 6.985446447508526e-07, + "loss": 0.0629, + "step": 1327 + }, + { + "epoch": 11.350427350427351, + "grad_norm": 0.7904997992935608, + "learning_rate": 6.954445629141246e-07, + "loss": 0.127, + "step": 1328 + }, + { + "epoch": 11.35897435897436, + "grad_norm": 0.5448615589144118, + "learning_rate": 6.923502635770618e-07, + "loss": 0.0704, + "step": 1329 + }, + { + "epoch": 11.367521367521368, + "grad_norm": 0.7324833370630733, + "learning_rate": 6.892617566550044e-07, + "loss": 0.0725, + "step": 1330 + }, + { + "epoch": 11.376068376068377, + "grad_norm": 0.4079972569261083, + "learning_rate": 6.861790520447356e-07, + "loss": 0.0249, + "step": 1331 + }, + { + "epoch": 11.384615384615385, + "grad_norm": 0.3360927561523562, + "learning_rate": 6.831021596244425e-07, + "loss": 0.026, + "step": 1332 + }, + { + "epoch": 11.393162393162394, + "grad_norm": 0.4744642260817377, + "learning_rate": 6.800310892536884e-07, + "loss": 0.0397, + "step": 1333 + }, + { + "epoch": 11.401709401709402, + "grad_norm": 0.25327380853084464, + "learning_rate": 6.769658507733815e-07, + "loss": 0.0114, + "step": 1334 + }, + { + "epoch": 11.41025641025641, + "grad_norm": 0.4251209808676598, + "learning_rate": 6.739064540057425e-07, + "loss": 0.0394, + "step": 1335 + }, + { + "epoch": 11.418803418803419, + "grad_norm": 0.5531908728480924, + "learning_rate": 6.708529087542717e-07, + "loss": 0.0769, + "step": 1336 + }, + { + "epoch": 11.427350427350428, + "grad_norm": 0.48252102964953725, + "learning_rate": 6.678052248037184e-07, + "loss": 0.0842, + "step": 1337 + }, + { + "epoch": 11.435897435897436, + "grad_norm": 0.5455617564825324, + "learning_rate": 6.64763411920053e-07, + "loss": 0.041, + "step": 1338 + }, + { + "epoch": 11.444444444444445, + "grad_norm": 0.5918199647503406, + "learning_rate": 6.617274798504286e-07, + "loss": 0.0656, + "step": 1339 + }, + { + "epoch": 11.452991452991453, + "grad_norm": 0.36042607672574983, + "learning_rate": 6.586974383231573e-07, + "loss": 0.0161, + "step": 1340 + }, + { + "epoch": 11.461538461538462, + "grad_norm": 0.5747374588172919, + "learning_rate": 6.556732970476748e-07, + "loss": 0.0967, + "step": 1341 + }, + { + "epoch": 11.47008547008547, + "grad_norm": 0.3626825420456705, + "learning_rate": 6.526550657145089e-07, + "loss": 0.0325, + "step": 1342 + }, + { + "epoch": 11.478632478632479, + "grad_norm": 0.7375881253726265, + "learning_rate": 6.496427539952499e-07, + "loss": 0.0559, + "step": 1343 + }, + { + "epoch": 11.487179487179487, + "grad_norm": 0.4282538497196272, + "learning_rate": 6.4663637154252e-07, + "loss": 0.0635, + "step": 1344 + }, + { + "epoch": 11.495726495726496, + "grad_norm": 0.5249222332292881, + "learning_rate": 6.436359279899426e-07, + "loss": 0.0575, + "step": 1345 + }, + { + "epoch": 11.504273504273504, + "grad_norm": 0.5911125039984814, + "learning_rate": 6.406414329521079e-07, + "loss": 0.0362, + "step": 1346 + }, + { + "epoch": 11.512820512820513, + "grad_norm": 0.6291188878291657, + "learning_rate": 6.376528960245476e-07, + "loss": 0.0416, + "step": 1347 + }, + { + "epoch": 11.521367521367521, + "grad_norm": 1.0438630139472773, + "learning_rate": 6.346703267836998e-07, + "loss": 0.085, + "step": 1348 + }, + { + "epoch": 11.52991452991453, + "grad_norm": 0.3318124571152316, + "learning_rate": 6.316937347868787e-07, + "loss": 0.0337, + "step": 1349 + }, + { + "epoch": 11.538461538461538, + "grad_norm": 0.4881877370841047, + "learning_rate": 6.28723129572247e-07, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 11.547008547008547, + "grad_norm": 0.42206065565383477, + "learning_rate": 6.257585206587843e-07, + "loss": 0.0383, + "step": 1351 + }, + { + "epoch": 11.555555555555555, + "grad_norm": 0.7835268070693865, + "learning_rate": 6.227999175462521e-07, + "loss": 0.0897, + "step": 1352 + }, + { + "epoch": 11.564102564102564, + "grad_norm": 0.43435236055674364, + "learning_rate": 6.198473297151705e-07, + "loss": 0.0424, + "step": 1353 + }, + { + "epoch": 11.572649572649572, + "grad_norm": 0.5417439990210136, + "learning_rate": 6.169007666267824e-07, + "loss": 0.082, + "step": 1354 + }, + { + "epoch": 11.581196581196581, + "grad_norm": 0.38045323551448207, + "learning_rate": 6.139602377230247e-07, + "loss": 0.0378, + "step": 1355 + }, + { + "epoch": 11.58974358974359, + "grad_norm": 0.20022917291801898, + "learning_rate": 6.110257524264998e-07, + "loss": 0.0045, + "step": 1356 + }, + { + "epoch": 11.598290598290598, + "grad_norm": 0.8737992978550385, + "learning_rate": 6.080973201404444e-07, + "loss": 0.1648, + "step": 1357 + }, + { + "epoch": 11.606837606837606, + "grad_norm": 0.43375747282423105, + "learning_rate": 6.051749502486967e-07, + "loss": 0.0603, + "step": 1358 + }, + { + "epoch": 11.615384615384615, + "grad_norm": 0.40339600429776185, + "learning_rate": 6.022586521156714e-07, + "loss": 0.0427, + "step": 1359 + }, + { + "epoch": 11.623931623931623, + "grad_norm": 0.32496203424257325, + "learning_rate": 5.993484350863246e-07, + "loss": 0.0246, + "step": 1360 + }, + { + "epoch": 11.632478632478632, + "grad_norm": 0.08515217413894376, + "learning_rate": 5.964443084861265e-07, + "loss": 0.0018, + "step": 1361 + }, + { + "epoch": 11.64102564102564, + "grad_norm": 0.45210961448699233, + "learning_rate": 5.935462816210325e-07, + "loss": 0.0732, + "step": 1362 + }, + { + "epoch": 11.649572649572649, + "grad_norm": 0.6607902661129196, + "learning_rate": 5.906543637774512e-07, + "loss": 0.1566, + "step": 1363 + }, + { + "epoch": 11.658119658119658, + "grad_norm": 0.579241782197535, + "learning_rate": 5.877685642222163e-07, + "loss": 0.0624, + "step": 1364 + }, + { + "epoch": 11.666666666666666, + "grad_norm": 0.6270729600768505, + "learning_rate": 5.848888922025553e-07, + "loss": 0.0542, + "step": 1365 + }, + { + "epoch": 11.675213675213675, + "grad_norm": 0.24964227547767157, + "learning_rate": 5.820153569460596e-07, + "loss": 0.0082, + "step": 1366 + }, + { + "epoch": 11.683760683760683, + "grad_norm": 0.49031923515529713, + "learning_rate": 5.791479676606587e-07, + "loss": 0.0393, + "step": 1367 + }, + { + "epoch": 11.692307692307692, + "grad_norm": 0.6758857364179156, + "learning_rate": 5.762867335345851e-07, + "loss": 0.0886, + "step": 1368 + }, + { + "epoch": 11.7008547008547, + "grad_norm": 0.4810336398940464, + "learning_rate": 5.734316637363505e-07, + "loss": 0.0568, + "step": 1369 + }, + { + "epoch": 11.709401709401709, + "grad_norm": 0.3710878858390517, + "learning_rate": 5.705827674147125e-07, + "loss": 0.0188, + "step": 1370 + }, + { + "epoch": 11.717948717948717, + "grad_norm": 0.5475396711113039, + "learning_rate": 5.67740053698646e-07, + "loss": 0.0387, + "step": 1371 + }, + { + "epoch": 11.726495726495726, + "grad_norm": 0.4567651524798183, + "learning_rate": 5.649035316973142e-07, + "loss": 0.0337, + "step": 1372 + }, + { + "epoch": 11.735042735042736, + "grad_norm": 0.15447178250306332, + "learning_rate": 5.620732105000415e-07, + "loss": 0.0051, + "step": 1373 + }, + { + "epoch": 11.743589743589745, + "grad_norm": 0.5477727950392361, + "learning_rate": 5.5924909917628e-07, + "loss": 0.0725, + "step": 1374 + }, + { + "epoch": 11.752136752136753, + "grad_norm": 0.3721327831035405, + "learning_rate": 5.564312067755856e-07, + "loss": 0.0257, + "step": 1375 + }, + { + "epoch": 11.760683760683762, + "grad_norm": 0.5341727624288521, + "learning_rate": 5.536195423275839e-07, + "loss": 0.1048, + "step": 1376 + }, + { + "epoch": 11.76923076923077, + "grad_norm": 0.31087455820578935, + "learning_rate": 5.508141148419443e-07, + "loss": 0.0318, + "step": 1377 + }, + { + "epoch": 11.777777777777779, + "grad_norm": 0.6628966101866233, + "learning_rate": 5.48014933308352e-07, + "loss": 0.0887, + "step": 1378 + }, + { + "epoch": 11.786324786324787, + "grad_norm": 0.49027928460694736, + "learning_rate": 5.45222006696477e-07, + "loss": 0.0377, + "step": 1379 + }, + { + "epoch": 11.794871794871796, + "grad_norm": 0.5691374119125762, + "learning_rate": 5.424353439559446e-07, + "loss": 0.0483, + "step": 1380 + }, + { + "epoch": 11.803418803418804, + "grad_norm": 0.5610287799733852, + "learning_rate": 5.396549540163106e-07, + "loss": 0.0316, + "step": 1381 + }, + { + "epoch": 11.811965811965813, + "grad_norm": 0.4984471059555396, + "learning_rate": 5.36880845787028e-07, + "loss": 0.0357, + "step": 1382 + }, + { + "epoch": 11.820512820512821, + "grad_norm": 0.6272781584817222, + "learning_rate": 5.341130281574233e-07, + "loss": 0.0827, + "step": 1383 + }, + { + "epoch": 11.82905982905983, + "grad_norm": 0.6115936866920039, + "learning_rate": 5.313515099966627e-07, + "loss": 0.056, + "step": 1384 + }, + { + "epoch": 11.837606837606838, + "grad_norm": 0.4513746929956304, + "learning_rate": 5.28596300153728e-07, + "loss": 0.0504, + "step": 1385 + }, + { + "epoch": 11.846153846153847, + "grad_norm": 0.6712793842418363, + "learning_rate": 5.258474074573878e-07, + "loss": 0.1243, + "step": 1386 + }, + { + "epoch": 11.854700854700855, + "grad_norm": 0.38720868482490745, + "learning_rate": 5.231048407161657e-07, + "loss": 0.031, + "step": 1387 + }, + { + "epoch": 11.863247863247864, + "grad_norm": 0.5695039021760533, + "learning_rate": 5.203686087183149e-07, + "loss": 0.0785, + "step": 1388 + }, + { + "epoch": 11.871794871794872, + "grad_norm": 0.8750851731120838, + "learning_rate": 5.176387202317915e-07, + "loss": 0.1761, + "step": 1389 + }, + { + "epoch": 11.88034188034188, + "grad_norm": 0.6257864639644699, + "learning_rate": 5.149151840042224e-07, + "loss": 0.125, + "step": 1390 + }, + { + "epoch": 11.88888888888889, + "grad_norm": 0.41421763085001945, + "learning_rate": 5.121980087628802e-07, + "loss": 0.0376, + "step": 1391 + }, + { + "epoch": 11.897435897435898, + "grad_norm": 0.5926832943736985, + "learning_rate": 5.094872032146562e-07, + "loss": 0.0296, + "step": 1392 + }, + { + "epoch": 11.905982905982906, + "grad_norm": 0.9497155581880431, + "learning_rate": 5.06782776046027e-07, + "loss": 0.0237, + "step": 1393 + }, + { + "epoch": 11.914529914529915, + "grad_norm": 0.4394719509685079, + "learning_rate": 5.040847359230327e-07, + "loss": 0.0431, + "step": 1394 + }, + { + "epoch": 11.923076923076923, + "grad_norm": 0.5155848596346819, + "learning_rate": 5.013930914912477e-07, + "loss": 0.0372, + "step": 1395 + }, + { + "epoch": 11.931623931623932, + "grad_norm": 0.4328919090524444, + "learning_rate": 4.98707851375749e-07, + "loss": 0.0421, + "step": 1396 + }, + { + "epoch": 11.94017094017094, + "grad_norm": 0.4715230395024146, + "learning_rate": 4.96029024181095e-07, + "loss": 0.0448, + "step": 1397 + }, + { + "epoch": 11.948717948717949, + "grad_norm": 0.35700048562694225, + "learning_rate": 4.933566184912931e-07, + "loss": 0.023, + "step": 1398 + }, + { + "epoch": 11.957264957264957, + "grad_norm": 0.5586094473199622, + "learning_rate": 4.906906428697736e-07, + "loss": 0.0127, + "step": 1399 + }, + { + "epoch": 11.965811965811966, + "grad_norm": 0.7247335039292072, + "learning_rate": 4.880311058593617e-07, + "loss": 0.0598, + "step": 1400 + }, + { + "epoch": 11.974358974358974, + "grad_norm": 0.5470831532818907, + "learning_rate": 4.853780159822521e-07, + "loss": 0.0436, + "step": 1401 + }, + { + "epoch": 11.982905982905983, + "grad_norm": 0.4622223877451198, + "learning_rate": 4.827313817399809e-07, + "loss": 0.0676, + "step": 1402 + }, + { + "epoch": 11.991452991452991, + "grad_norm": 0.4091103436084329, + "learning_rate": 4.800912116133955e-07, + "loss": 0.0255, + "step": 1403 + }, + { + "epoch": 12.0, + "grad_norm": 0.7340458945691303, + "learning_rate": 4.774575140626317e-07, + "loss": 0.0752, + "step": 1404 + }, + { + "epoch": 12.008547008547009, + "grad_norm": 0.5210990574780048, + "learning_rate": 4.748302975270838e-07, + "loss": 0.0693, + "step": 1405 + }, + { + "epoch": 12.017094017094017, + "grad_norm": 0.4788263979171196, + "learning_rate": 4.7220957042537793e-07, + "loss": 0.062, + "step": 1406 + }, + { + "epoch": 12.025641025641026, + "grad_norm": 0.36329152290724714, + "learning_rate": 4.6959534115534665e-07, + "loss": 0.0427, + "step": 1407 + }, + { + "epoch": 12.034188034188034, + "grad_norm": 0.34183052198104663, + "learning_rate": 4.669876180940014e-07, + "loss": 0.0343, + "step": 1408 + }, + { + "epoch": 12.042735042735043, + "grad_norm": 0.4294908397852798, + "learning_rate": 4.6438640959750285e-07, + "loss": 0.03, + "step": 1409 + }, + { + "epoch": 12.051282051282051, + "grad_norm": 0.3310588008300683, + "learning_rate": 4.617917240011394e-07, + "loss": 0.0185, + "step": 1410 + }, + { + "epoch": 12.05982905982906, + "grad_norm": 0.26661733443446145, + "learning_rate": 4.592035696192948e-07, + "loss": 0.0165, + "step": 1411 + }, + { + "epoch": 12.068376068376068, + "grad_norm": 0.45245941537813045, + "learning_rate": 4.566219547454251e-07, + "loss": 0.0546, + "step": 1412 + }, + { + "epoch": 12.076923076923077, + "grad_norm": 0.44678544747622206, + "learning_rate": 4.5404688765203236e-07, + "loss": 0.0422, + "step": 1413 + }, + { + "epoch": 12.085470085470085, + "grad_norm": 0.42280662183770285, + "learning_rate": 4.514783765906369e-07, + "loss": 0.0456, + "step": 1414 + }, + { + "epoch": 12.094017094017094, + "grad_norm": 0.630588141911986, + "learning_rate": 4.489164297917492e-07, + "loss": 0.0725, + "step": 1415 + }, + { + "epoch": 12.102564102564102, + "grad_norm": 0.6957962761795572, + "learning_rate": 4.463610554648459e-07, + "loss": 0.0719, + "step": 1416 + }, + { + "epoch": 12.11111111111111, + "grad_norm": 0.2344619909817438, + "learning_rate": 4.438122617983442e-07, + "loss": 0.0085, + "step": 1417 + }, + { + "epoch": 12.11965811965812, + "grad_norm": 0.5297503159148729, + "learning_rate": 4.4127005695957374e-07, + "loss": 0.0426, + "step": 1418 + }, + { + "epoch": 12.128205128205128, + "grad_norm": 0.4180850364931802, + "learning_rate": 4.3873444909474985e-07, + "loss": 0.0465, + "step": 1419 + }, + { + "epoch": 12.136752136752136, + "grad_norm": 0.494857947438109, + "learning_rate": 4.3620544632895e-07, + "loss": 0.0558, + "step": 1420 + }, + { + "epoch": 12.145299145299145, + "grad_norm": 0.5494346854035879, + "learning_rate": 4.336830567660855e-07, + "loss": 0.072, + "step": 1421 + }, + { + "epoch": 12.153846153846153, + "grad_norm": 0.505720729616524, + "learning_rate": 4.311672884888757e-07, + "loss": 0.0531, + "step": 1422 + }, + { + "epoch": 12.162393162393162, + "grad_norm": 0.3882375970304378, + "learning_rate": 4.286581495588249e-07, + "loss": 0.0254, + "step": 1423 + }, + { + "epoch": 12.17094017094017, + "grad_norm": 0.5868112593989185, + "learning_rate": 4.2615564801619325e-07, + "loss": 0.0882, + "step": 1424 + }, + { + "epoch": 12.179487179487179, + "grad_norm": 0.3634578454157606, + "learning_rate": 4.2365979187997094e-07, + "loss": 0.0161, + "step": 1425 + }, + { + "epoch": 12.188034188034187, + "grad_norm": 0.3645718143750022, + "learning_rate": 4.21170589147856e-07, + "loss": 0.0361, + "step": 1426 + }, + { + "epoch": 12.196581196581196, + "grad_norm": 0.21409222209999557, + "learning_rate": 4.1868804779622437e-07, + "loss": 0.009, + "step": 1427 + }, + { + "epoch": 12.205128205128204, + "grad_norm": 0.46176542250006986, + "learning_rate": 4.1621217578010686e-07, + "loss": 0.0558, + "step": 1428 + }, + { + "epoch": 12.213675213675213, + "grad_norm": 0.4530777351923695, + "learning_rate": 4.137429810331639e-07, + "loss": 0.0356, + "step": 1429 + }, + { + "epoch": 12.222222222222221, + "grad_norm": 0.4564673214893363, + "learning_rate": 4.1128047146765936e-07, + "loss": 0.0738, + "step": 1430 + }, + { + "epoch": 12.23076923076923, + "grad_norm": 0.5444892972993393, + "learning_rate": 4.088246549744332e-07, + "loss": 0.0457, + "step": 1431 + }, + { + "epoch": 12.239316239316238, + "grad_norm": 0.46279984642252764, + "learning_rate": 4.063755394228811e-07, + "loss": 0.0411, + "step": 1432 + }, + { + "epoch": 12.247863247863247, + "grad_norm": 0.27693768948677727, + "learning_rate": 4.039331326609239e-07, + "loss": 0.0232, + "step": 1433 + }, + { + "epoch": 12.256410256410255, + "grad_norm": 0.43398332814610746, + "learning_rate": 4.0149744251498537e-07, + "loss": 0.0338, + "step": 1434 + }, + { + "epoch": 12.264957264957266, + "grad_norm": 0.7156656613045878, + "learning_rate": 3.990684767899677e-07, + "loss": 0.0649, + "step": 1435 + }, + { + "epoch": 12.273504273504274, + "grad_norm": 1.6223644181519623, + "learning_rate": 3.9664624326922447e-07, + "loss": 0.0726, + "step": 1436 + }, + { + "epoch": 12.282051282051283, + "grad_norm": 0.5691517529033704, + "learning_rate": 3.9423074971453785e-07, + "loss": 0.1054, + "step": 1437 + }, + { + "epoch": 12.290598290598291, + "grad_norm": 0.25125116996613145, + "learning_rate": 3.918220038660908e-07, + "loss": 0.0081, + "step": 1438 + }, + { + "epoch": 12.2991452991453, + "grad_norm": 0.5363768285869193, + "learning_rate": 3.8942001344244416e-07, + "loss": 0.1076, + "step": 1439 + }, + { + "epoch": 12.307692307692308, + "grad_norm": 0.3908348952351424, + "learning_rate": 3.8702478614051353e-07, + "loss": 0.0472, + "step": 1440 + }, + { + "epoch": 12.316239316239317, + "grad_norm": 0.4980122155106956, + "learning_rate": 3.846363296355404e-07, + "loss": 0.0684, + "step": 1441 + }, + { + "epoch": 12.324786324786325, + "grad_norm": 0.7839866964738976, + "learning_rate": 3.822546515810724e-07, + "loss": 0.136, + "step": 1442 + }, + { + "epoch": 12.333333333333334, + "grad_norm": 0.2920246859904782, + "learning_rate": 3.798797596089351e-07, + "loss": 0.0154, + "step": 1443 + }, + { + "epoch": 12.341880341880342, + "grad_norm": 0.34194465726755663, + "learning_rate": 3.7751166132920877e-07, + "loss": 0.0424, + "step": 1444 + }, + { + "epoch": 12.350427350427351, + "grad_norm": 0.27774192769005757, + "learning_rate": 3.751503643302035e-07, + "loss": 0.0172, + "step": 1445 + }, + { + "epoch": 12.35897435897436, + "grad_norm": 0.4910903215917028, + "learning_rate": 3.727958761784375e-07, + "loss": 0.0594, + "step": 1446 + }, + { + "epoch": 12.367521367521368, + "grad_norm": 0.680826297771005, + "learning_rate": 3.7044820441860806e-07, + "loss": 0.1152, + "step": 1447 + }, + { + "epoch": 12.376068376068377, + "grad_norm": 0.6415763390791678, + "learning_rate": 3.681073565735718e-07, + "loss": 0.0776, + "step": 1448 + }, + { + "epoch": 12.384615384615385, + "grad_norm": 0.5833589699943083, + "learning_rate": 3.6577334014432003e-07, + "loss": 0.1001, + "step": 1449 + }, + { + "epoch": 12.393162393162394, + "grad_norm": 0.5599742675848512, + "learning_rate": 3.634461626099495e-07, + "loss": 0.0769, + "step": 1450 + }, + { + "epoch": 12.401709401709402, + "grad_norm": 0.23408880880648503, + "learning_rate": 3.611258314276461e-07, + "loss": 0.0095, + "step": 1451 + }, + { + "epoch": 12.41025641025641, + "grad_norm": 0.5420648942933317, + "learning_rate": 3.5881235403265713e-07, + "loss": 0.0709, + "step": 1452 + }, + { + "epoch": 12.418803418803419, + "grad_norm": 0.9151307781476735, + "learning_rate": 3.56505737838265e-07, + "loss": 0.1828, + "step": 1453 + }, + { + "epoch": 12.427350427350428, + "grad_norm": 0.5051641079170742, + "learning_rate": 3.5420599023576946e-07, + "loss": 0.0474, + "step": 1454 + }, + { + "epoch": 12.435897435897436, + "grad_norm": 0.43289408007834856, + "learning_rate": 3.51913118594458e-07, + "loss": 0.0373, + "step": 1455 + }, + { + "epoch": 12.444444444444445, + "grad_norm": 0.45036332783573607, + "learning_rate": 3.4962713026158697e-07, + "loss": 0.0531, + "step": 1456 + }, + { + "epoch": 12.452991452991453, + "grad_norm": 0.3970269144308973, + "learning_rate": 3.473480325623535e-07, + "loss": 0.0402, + "step": 1457 + }, + { + "epoch": 12.461538461538462, + "grad_norm": 0.4664586229855834, + "learning_rate": 3.450758327998768e-07, + "loss": 0.0378, + "step": 1458 + }, + { + "epoch": 12.47008547008547, + "grad_norm": 0.5833404030851905, + "learning_rate": 3.428105382551716e-07, + "loss": 0.1227, + "step": 1459 + }, + { + "epoch": 12.478632478632479, + "grad_norm": 0.18900716916813645, + "learning_rate": 3.405521561871247e-07, + "loss": 0.0071, + "step": 1460 + }, + { + "epoch": 12.487179487179487, + "grad_norm": 0.6329825613088673, + "learning_rate": 3.3830069383247343e-07, + "loss": 0.1368, + "step": 1461 + }, + { + "epoch": 12.495726495726496, + "grad_norm": 0.3111158537112211, + "learning_rate": 3.3605615840578224e-07, + "loss": 0.0153, + "step": 1462 + }, + { + "epoch": 12.504273504273504, + "grad_norm": 0.15672363439125053, + "learning_rate": 3.3381855709941733e-07, + "loss": 0.0025, + "step": 1463 + }, + { + "epoch": 12.512820512820513, + "grad_norm": 0.2541815963874005, + "learning_rate": 3.315878970835268e-07, + "loss": 0.0116, + "step": 1464 + }, + { + "epoch": 12.521367521367521, + "grad_norm": 0.48599298068031166, + "learning_rate": 3.293641855060162e-07, + "loss": 0.0386, + "step": 1465 + }, + { + "epoch": 12.52991452991453, + "grad_norm": 0.42569652919107487, + "learning_rate": 3.2714742949252447e-07, + "loss": 0.0412, + "step": 1466 + }, + { + "epoch": 12.538461538461538, + "grad_norm": 0.4651470442081948, + "learning_rate": 3.249376361464021e-07, + "loss": 0.0621, + "step": 1467 + }, + { + "epoch": 12.547008547008547, + "grad_norm": 0.35693088161836933, + "learning_rate": 3.227348125486904e-07, + "loss": 0.0283, + "step": 1468 + }, + { + "epoch": 12.555555555555555, + "grad_norm": 0.5401582449926521, + "learning_rate": 3.2053896575809426e-07, + "loss": 0.0574, + "step": 1469 + }, + { + "epoch": 12.564102564102564, + "grad_norm": 0.5180556722548045, + "learning_rate": 3.1835010281096426e-07, + "loss": 0.0307, + "step": 1470 + }, + { + "epoch": 12.572649572649572, + "grad_norm": 0.5064784455421214, + "learning_rate": 3.1616823072127157e-07, + "loss": 0.0638, + "step": 1471 + }, + { + "epoch": 12.581196581196581, + "grad_norm": 0.2832584057082466, + "learning_rate": 3.1399335648058555e-07, + "loss": 0.0172, + "step": 1472 + }, + { + "epoch": 12.58974358974359, + "grad_norm": 0.07907910657271316, + "learning_rate": 3.118254870580506e-07, + "loss": 0.0019, + "step": 1473 + }, + { + "epoch": 12.598290598290598, + "grad_norm": 0.4856065855582885, + "learning_rate": 3.096646294003675e-07, + "loss": 0.0786, + "step": 1474 + }, + { + "epoch": 12.606837606837606, + "grad_norm": 0.5611585905476629, + "learning_rate": 3.075107904317667e-07, + "loss": 0.0493, + "step": 1475 + }, + { + "epoch": 12.615384615384615, + "grad_norm": 0.48631084188031415, + "learning_rate": 3.0536397705398845e-07, + "loss": 0.1089, + "step": 1476 + }, + { + "epoch": 12.623931623931623, + "grad_norm": 0.5762370364964107, + "learning_rate": 3.032241961462612e-07, + "loss": 0.0783, + "step": 1477 + }, + { + "epoch": 12.632478632478632, + "grad_norm": 0.25360897774454116, + "learning_rate": 3.010914545652771e-07, + "loss": 0.0135, + "step": 1478 + }, + { + "epoch": 12.64102564102564, + "grad_norm": 0.363938511193013, + "learning_rate": 2.9896575914517166e-07, + "loss": 0.0306, + "step": 1479 + }, + { + "epoch": 12.649572649572649, + "grad_norm": 0.1858863424391372, + "learning_rate": 2.9684711669750313e-07, + "loss": 0.0048, + "step": 1480 + }, + { + "epoch": 12.658119658119658, + "grad_norm": 0.24369363096646676, + "learning_rate": 2.9473553401122875e-07, + "loss": 0.0103, + "step": 1481 + }, + { + "epoch": 12.666666666666666, + "grad_norm": 0.41698180965055937, + "learning_rate": 2.9263101785268253e-07, + "loss": 0.0356, + "step": 1482 + }, + { + "epoch": 12.675213675213675, + "grad_norm": 0.2670049194018334, + "learning_rate": 2.9053357496555635e-07, + "loss": 0.0104, + "step": 1483 + }, + { + "epoch": 12.683760683760683, + "grad_norm": 0.17605755616268962, + "learning_rate": 2.8844321207087465e-07, + "loss": 0.0051, + "step": 1484 + }, + { + "epoch": 12.692307692307692, + "grad_norm": 0.4103155743327306, + "learning_rate": 2.8635993586697555e-07, + "loss": 0.0345, + "step": 1485 + }, + { + "epoch": 12.7008547008547, + "grad_norm": 0.4605812519261627, + "learning_rate": 2.84283753029489e-07, + "loss": 0.0229, + "step": 1486 + }, + { + "epoch": 12.709401709401709, + "grad_norm": 0.40217176084041906, + "learning_rate": 2.822146702113157e-07, + "loss": 0.0486, + "step": 1487 + }, + { + "epoch": 12.717948717948717, + "grad_norm": 0.30183863999283517, + "learning_rate": 2.8015269404260333e-07, + "loss": 0.0234, + "step": 1488 + }, + { + "epoch": 12.726495726495726, + "grad_norm": 0.43189722616688536, + "learning_rate": 2.780978311307278e-07, + "loss": 0.028, + "step": 1489 + }, + { + "epoch": 12.735042735042736, + "grad_norm": 0.18951438153753558, + "learning_rate": 2.7605008806027205e-07, + "loss": 0.0056, + "step": 1490 + }, + { + "epoch": 12.743589743589745, + "grad_norm": 0.5437323913691973, + "learning_rate": 2.7400947139300443e-07, + "loss": 0.0827, + "step": 1491 + }, + { + "epoch": 12.752136752136753, + "grad_norm": 0.40461808187778764, + "learning_rate": 2.7197598766785544e-07, + "loss": 0.0484, + "step": 1492 + }, + { + "epoch": 12.760683760683762, + "grad_norm": 0.42001625712471485, + "learning_rate": 2.6994964340090163e-07, + "loss": 0.0497, + "step": 1493 + }, + { + "epoch": 12.76923076923077, + "grad_norm": 0.2970457000400673, + "learning_rate": 2.679304450853401e-07, + "loss": 0.0166, + "step": 1494 + }, + { + "epoch": 12.777777777777779, + "grad_norm": 0.7433458397781355, + "learning_rate": 2.6591839919146963e-07, + "loss": 0.093, + "step": 1495 + }, + { + "epoch": 12.786324786324787, + "grad_norm": 0.5356885326565816, + "learning_rate": 2.6391351216667053e-07, + "loss": 0.0625, + "step": 1496 + }, + { + "epoch": 12.794871794871796, + "grad_norm": 0.30520132963784746, + "learning_rate": 2.6191579043538333e-07, + "loss": 0.0287, + "step": 1497 + }, + { + "epoch": 12.803418803418804, + "grad_norm": 0.4444882375877483, + "learning_rate": 2.599252403990873e-07, + "loss": 0.0426, + "step": 1498 + }, + { + "epoch": 12.811965811965813, + "grad_norm": 0.41946477409959565, + "learning_rate": 2.5794186843628247e-07, + "loss": 0.0485, + "step": 1499 + }, + { + "epoch": 12.820512820512821, + "grad_norm": 0.47615560566109044, + "learning_rate": 2.5596568090246546e-07, + "loss": 0.0585, + "step": 1500 + }, + { + "epoch": 12.82905982905983, + "grad_norm": 0.25278361952778927, + "learning_rate": 2.539966841301117e-07, + "loss": 0.0188, + "step": 1501 + }, + { + "epoch": 12.837606837606838, + "grad_norm": 0.7435079955100731, + "learning_rate": 2.5203488442865574e-07, + "loss": 0.1664, + "step": 1502 + }, + { + "epoch": 12.846153846153847, + "grad_norm": 0.4905397007029981, + "learning_rate": 2.5008028808446995e-07, + "loss": 0.0644, + "step": 1503 + }, + { + "epoch": 12.854700854700855, + "grad_norm": 0.3579408204459078, + "learning_rate": 2.481329013608419e-07, + "loss": 0.0161, + "step": 1504 + }, + { + "epoch": 12.863247863247864, + "grad_norm": 0.3690746454860683, + "learning_rate": 2.4619273049796e-07, + "loss": 0.0147, + "step": 1505 + }, + { + "epoch": 12.871794871794872, + "grad_norm": 0.5539640918674135, + "learning_rate": 2.4425978171288807e-07, + "loss": 0.0746, + "step": 1506 + }, + { + "epoch": 12.88034188034188, + "grad_norm": 0.398801152487831, + "learning_rate": 2.4233406119954756e-07, + "loss": 0.057, + "step": 1507 + }, + { + "epoch": 12.88888888888889, + "grad_norm": 0.28829865983970265, + "learning_rate": 2.404155751286988e-07, + "loss": 0.0131, + "step": 1508 + }, + { + "epoch": 12.897435897435898, + "grad_norm": 0.10754494414168028, + "learning_rate": 2.385043296479195e-07, + "loss": 0.0024, + "step": 1509 + }, + { + "epoch": 12.905982905982906, + "grad_norm": 0.4987503634158719, + "learning_rate": 2.3660033088158646e-07, + "loss": 0.0357, + "step": 1510 + }, + { + "epoch": 12.914529914529915, + "grad_norm": 0.7081484429156921, + "learning_rate": 2.3470358493085433e-07, + "loss": 0.1405, + "step": 1511 + }, + { + "epoch": 12.923076923076923, + "grad_norm": 0.3988280902988456, + "learning_rate": 2.3281409787363652e-07, + "loss": 0.0436, + "step": 1512 + }, + { + "epoch": 12.931623931623932, + "grad_norm": 0.26292834769791484, + "learning_rate": 2.309318757645873e-07, + "loss": 0.0118, + "step": 1513 + }, + { + "epoch": 12.94017094017094, + "grad_norm": 0.2947911611296075, + "learning_rate": 2.2905692463508045e-07, + "loss": 0.0135, + "step": 1514 + }, + { + "epoch": 12.948717948717949, + "grad_norm": 0.36055620968224433, + "learning_rate": 2.271892504931905e-07, + "loss": 0.0364, + "step": 1515 + }, + { + "epoch": 12.957264957264957, + "grad_norm": 0.6805417037367615, + "learning_rate": 2.253288593236755e-07, + "loss": 0.0844, + "step": 1516 + }, + { + "epoch": 12.965811965811966, + "grad_norm": 0.3855234002350498, + "learning_rate": 2.234757570879534e-07, + "loss": 0.0244, + "step": 1517 + }, + { + "epoch": 12.974358974358974, + "grad_norm": 0.7122804793044082, + "learning_rate": 2.2162994972408647e-07, + "loss": 0.1131, + "step": 1518 + }, + { + "epoch": 12.982905982905983, + "grad_norm": 0.2188849119906226, + "learning_rate": 2.1979144314676239e-07, + "loss": 0.0073, + "step": 1519 + }, + { + "epoch": 12.991452991452991, + "grad_norm": 0.47128998500439206, + "learning_rate": 2.17960243247273e-07, + "loss": 0.0437, + "step": 1520 + }, + { + "epoch": 13.0, + "grad_norm": 0.5310311333782931, + "learning_rate": 2.1613635589349756e-07, + "loss": 0.065, + "step": 1521 + }, + { + "epoch": 13.008547008547009, + "grad_norm": 0.3479275975126997, + "learning_rate": 2.1431978692988298e-07, + "loss": 0.0612, + "step": 1522 + }, + { + "epoch": 13.017094017094017, + "grad_norm": 0.3523745771232036, + "learning_rate": 2.1251054217742484e-07, + "loss": 0.0348, + "step": 1523 + }, + { + "epoch": 13.025641025641026, + "grad_norm": 0.48983824212131266, + "learning_rate": 2.107086274336484e-07, + "loss": 0.0506, + "step": 1524 + }, + { + "epoch": 13.034188034188034, + "grad_norm": 0.5469787413822318, + "learning_rate": 2.0891404847259267e-07, + "loss": 0.0375, + "step": 1525 + }, + { + "epoch": 13.042735042735043, + "grad_norm": 0.3376707731497175, + "learning_rate": 2.0712681104478742e-07, + "loss": 0.0378, + "step": 1526 + }, + { + "epoch": 13.051282051282051, + "grad_norm": 0.5034242344115563, + "learning_rate": 2.0534692087724017e-07, + "loss": 0.0555, + "step": 1527 + }, + { + "epoch": 13.05982905982906, + "grad_norm": 0.25502809856153064, + "learning_rate": 2.0357438367341248e-07, + "loss": 0.0124, + "step": 1528 + }, + { + "epoch": 13.068376068376068, + "grad_norm": 0.0696046335947878, + "learning_rate": 2.0180920511320562e-07, + "loss": 0.0017, + "step": 1529 + }, + { + "epoch": 13.076923076923077, + "grad_norm": 0.3575175521969691, + "learning_rate": 2.0005139085293945e-07, + "loss": 0.0353, + "step": 1530 + }, + { + "epoch": 13.085470085470085, + "grad_norm": 0.4869086930900781, + "learning_rate": 1.983009465253377e-07, + "loss": 0.0705, + "step": 1531 + }, + { + "epoch": 13.094017094017094, + "grad_norm": 0.5167937928937567, + "learning_rate": 1.9655787773950691e-07, + "loss": 0.085, + "step": 1532 + }, + { + "epoch": 13.102564102564102, + "grad_norm": 0.29527783197774443, + "learning_rate": 1.9482219008091885e-07, + "loss": 0.0157, + "step": 1533 + }, + { + "epoch": 13.11111111111111, + "grad_norm": 0.3252991705388003, + "learning_rate": 1.9309388911139427e-07, + "loss": 0.0223, + "step": 1534 + }, + { + "epoch": 13.11965811965812, + "grad_norm": 0.33335729772770656, + "learning_rate": 1.9137298036908392e-07, + "loss": 0.025, + "step": 1535 + }, + { + "epoch": 13.128205128205128, + "grad_norm": 0.39648258193280794, + "learning_rate": 1.896594693684503e-07, + "loss": 0.0334, + "step": 1536 + }, + { + "epoch": 13.136752136752136, + "grad_norm": 0.35775145766381444, + "learning_rate": 1.8795336160025157e-07, + "loss": 0.0395, + "step": 1537 + }, + { + "epoch": 13.145299145299145, + "grad_norm": 0.5482622815094932, + "learning_rate": 1.8625466253152314e-07, + "loss": 0.0654, + "step": 1538 + }, + { + "epoch": 13.153846153846153, + "grad_norm": 0.3848712493794455, + "learning_rate": 1.8456337760555915e-07, + "loss": 0.0728, + "step": 1539 + }, + { + "epoch": 13.162393162393162, + "grad_norm": 0.47502030198058615, + "learning_rate": 1.8287951224189555e-07, + "loss": 0.0839, + "step": 1540 + }, + { + "epoch": 13.17094017094017, + "grad_norm": 0.3083495744880248, + "learning_rate": 1.8120307183629533e-07, + "loss": 0.0185, + "step": 1541 + }, + { + "epoch": 13.179487179487179, + "grad_norm": 0.12747381613610742, + "learning_rate": 1.7953406176072636e-07, + "loss": 0.0038, + "step": 1542 + }, + { + "epoch": 13.188034188034187, + "grad_norm": 0.4041400206367801, + "learning_rate": 1.778724873633486e-07, + "loss": 0.0573, + "step": 1543 + }, + { + "epoch": 13.196581196581196, + "grad_norm": 0.4612137803899462, + "learning_rate": 1.7621835396849528e-07, + "loss": 0.0485, + "step": 1544 + }, + { + "epoch": 13.205128205128204, + "grad_norm": 0.25259858595605755, + "learning_rate": 1.745716668766545e-07, + "loss": 0.0119, + "step": 1545 + }, + { + "epoch": 13.213675213675213, + "grad_norm": 0.21737171693683865, + "learning_rate": 1.7293243136445398e-07, + "loss": 0.0084, + "step": 1546 + }, + { + "epoch": 13.222222222222221, + "grad_norm": 0.5257948866349633, + "learning_rate": 1.713006526846439e-07, + "loss": 0.1058, + "step": 1547 + }, + { + "epoch": 13.23076923076923, + "grad_norm": 0.3647445265942455, + "learning_rate": 1.6967633606608082e-07, + "loss": 0.0306, + "step": 1548 + }, + { + "epoch": 13.239316239316238, + "grad_norm": 0.6273652237714832, + "learning_rate": 1.6805948671370726e-07, + "loss": 0.0965, + "step": 1549 + }, + { + "epoch": 13.247863247863247, + "grad_norm": 0.4250500011918908, + "learning_rate": 1.6645010980854082e-07, + "loss": 0.0336, + "step": 1550 + }, + { + "epoch": 13.256410256410255, + "grad_norm": 0.2541269183698496, + "learning_rate": 1.6484821050765209e-07, + "loss": 0.0207, + "step": 1551 + }, + { + "epoch": 13.264957264957266, + "grad_norm": 0.6916501583191542, + "learning_rate": 1.6325379394415168e-07, + "loss": 0.0892, + "step": 1552 + }, + { + "epoch": 13.273504273504274, + "grad_norm": 0.5419859168716888, + "learning_rate": 1.6166686522717217e-07, + "loss": 0.0845, + "step": 1553 + }, + { + "epoch": 13.282051282051283, + "grad_norm": 0.31897878123049517, + "learning_rate": 1.600874294418528e-07, + "loss": 0.0281, + "step": 1554 + }, + { + "epoch": 13.290598290598291, + "grad_norm": 0.37425385936599204, + "learning_rate": 1.5851549164932118e-07, + "loss": 0.0336, + "step": 1555 + }, + { + "epoch": 13.2991452991453, + "grad_norm": 0.2631276266077297, + "learning_rate": 1.569510568866803e-07, + "loss": 0.0182, + "step": 1556 + }, + { + "epoch": 13.307692307692308, + "grad_norm": 0.53551321371183, + "learning_rate": 1.5539413016698923e-07, + "loss": 0.0862, + "step": 1557 + }, + { + "epoch": 13.316239316239317, + "grad_norm": 0.573266375875409, + "learning_rate": 1.5384471647924781e-07, + "loss": 0.0848, + "step": 1558 + }, + { + "epoch": 13.324786324786325, + "grad_norm": 0.34799927862110314, + "learning_rate": 1.5230282078838255e-07, + "loss": 0.0323, + "step": 1559 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 0.3502320548848171, + "learning_rate": 1.507684480352292e-07, + "loss": 0.027, + "step": 1560 + }, + { + "epoch": 13.341880341880342, + "grad_norm": 0.47342922032100826, + "learning_rate": 1.4924160313651598e-07, + "loss": 0.0559, + "step": 1561 + }, + { + "epoch": 13.350427350427351, + "grad_norm": 0.3824196536808161, + "learning_rate": 1.4772229098485053e-07, + "loss": 0.0346, + "step": 1562 + }, + { + "epoch": 13.35897435897436, + "grad_norm": 0.6077069983234155, + "learning_rate": 1.46210516448701e-07, + "loss": 0.0883, + "step": 1563 + }, + { + "epoch": 13.367521367521368, + "grad_norm": 0.20174686833329616, + "learning_rate": 1.447062843723826e-07, + "loss": 0.0076, + "step": 1564 + }, + { + "epoch": 13.376068376068377, + "grad_norm": 0.3472339294769054, + "learning_rate": 1.432095995760424e-07, + "loss": 0.0281, + "step": 1565 + }, + { + "epoch": 13.384615384615385, + "grad_norm": 0.5395406632267746, + "learning_rate": 1.417204668556421e-07, + "loss": 0.0608, + "step": 1566 + }, + { + "epoch": 13.393162393162394, + "grad_norm": 0.26952215154290254, + "learning_rate": 1.402388909829447e-07, + "loss": 0.0118, + "step": 1567 + }, + { + "epoch": 13.401709401709402, + "grad_norm": 0.3298420949413113, + "learning_rate": 1.387648767054961e-07, + "loss": 0.031, + "step": 1568 + }, + { + "epoch": 13.41025641025641, + "grad_norm": 0.563406459481279, + "learning_rate": 1.3729842874661365e-07, + "loss": 0.113, + "step": 1569 + }, + { + "epoch": 13.418803418803419, + "grad_norm": 0.574681376341801, + "learning_rate": 1.35839551805369e-07, + "loss": 0.0804, + "step": 1570 + }, + { + "epoch": 13.427350427350428, + "grad_norm": 0.3623717101138977, + "learning_rate": 1.3438825055657246e-07, + "loss": 0.0234, + "step": 1571 + }, + { + "epoch": 13.435897435897436, + "grad_norm": 0.4861599303938593, + "learning_rate": 1.3294452965076031e-07, + "loss": 0.0372, + "step": 1572 + }, + { + "epoch": 13.444444444444445, + "grad_norm": 0.34085334824763863, + "learning_rate": 1.31508393714177e-07, + "loss": 0.0206, + "step": 1573 + }, + { + "epoch": 13.452991452991453, + "grad_norm": 0.3258647492947037, + "learning_rate": 1.3007984734876217e-07, + "loss": 0.0258, + "step": 1574 + }, + { + "epoch": 13.461538461538462, + "grad_norm": 0.6071085443076429, + "learning_rate": 1.286588951321363e-07, + "loss": 0.0776, + "step": 1575 + }, + { + "epoch": 13.47008547008547, + "grad_norm": 0.5031416201199156, + "learning_rate": 1.272455416175844e-07, + "loss": 0.0207, + "step": 1576 + }, + { + "epoch": 13.478632478632479, + "grad_norm": 0.3268535611369224, + "learning_rate": 1.258397913340423e-07, + "loss": 0.0355, + "step": 1577 + }, + { + "epoch": 13.487179487179487, + "grad_norm": 0.2851643628328826, + "learning_rate": 1.2444164878608307e-07, + "loss": 0.0221, + "step": 1578 + }, + { + "epoch": 13.495726495726496, + "grad_norm": 0.2217773892646123, + "learning_rate": 1.2305111845390043e-07, + "loss": 0.0087, + "step": 1579 + }, + { + "epoch": 13.504273504273504, + "grad_norm": 0.5523812907991107, + "learning_rate": 1.2166820479329572e-07, + "loss": 0.077, + "step": 1580 + }, + { + "epoch": 13.512820512820513, + "grad_norm": 0.221037095991797, + "learning_rate": 1.2029291223566413e-07, + "loss": 0.0121, + "step": 1581 + }, + { + "epoch": 13.521367521367521, + "grad_norm": 0.39931183898142225, + "learning_rate": 1.1892524518797993e-07, + "loss": 0.0256, + "step": 1582 + }, + { + "epoch": 13.52991452991453, + "grad_norm": 0.4816009623881913, + "learning_rate": 1.1756520803278188e-07, + "loss": 0.0561, + "step": 1583 + }, + { + "epoch": 13.538461538461538, + "grad_norm": 0.12466048888011573, + "learning_rate": 1.1621280512815941e-07, + "loss": 0.0039, + "step": 1584 + }, + { + "epoch": 13.547008547008547, + "grad_norm": 0.5208821923816866, + "learning_rate": 1.1486804080773878e-07, + "loss": 0.0711, + "step": 1585 + }, + { + "epoch": 13.555555555555555, + "grad_norm": 0.3562879598580212, + "learning_rate": 1.1353091938067024e-07, + "loss": 0.038, + "step": 1586 + }, + { + "epoch": 13.564102564102564, + "grad_norm": 0.4062408749849314, + "learning_rate": 1.1220144513161197e-07, + "loss": 0.044, + "step": 1587 + }, + { + "epoch": 13.572649572649572, + "grad_norm": 0.5458930519363671, + "learning_rate": 1.1087962232071814e-07, + "loss": 0.0765, + "step": 1588 + }, + { + "epoch": 13.581196581196581, + "grad_norm": 0.29040690525508095, + "learning_rate": 1.0956545518362532e-07, + "loss": 0.0242, + "step": 1589 + }, + { + "epoch": 13.58974358974359, + "grad_norm": 0.2792827354829269, + "learning_rate": 1.0825894793143721e-07, + "loss": 0.0264, + "step": 1590 + }, + { + "epoch": 13.598290598290598, + "grad_norm": 0.24223454782550927, + "learning_rate": 1.0696010475071267e-07, + "loss": 0.014, + "step": 1591 + }, + { + "epoch": 13.606837606837606, + "grad_norm": 0.4732093758685518, + "learning_rate": 1.0566892980345245e-07, + "loss": 0.0445, + "step": 1592 + }, + { + "epoch": 13.615384615384615, + "grad_norm": 0.5689145979311558, + "learning_rate": 1.0438542722708444e-07, + "loss": 0.0748, + "step": 1593 + }, + { + "epoch": 13.623931623931623, + "grad_norm": 0.5134034697634273, + "learning_rate": 1.0310960113445179e-07, + "loss": 0.0577, + "step": 1594 + }, + { + "epoch": 13.632478632478632, + "grad_norm": 0.12181138755781606, + "learning_rate": 1.0184145561379949e-07, + "loss": 0.0024, + "step": 1595 + }, + { + "epoch": 13.64102564102564, + "grad_norm": 0.42930052739485963, + "learning_rate": 1.0058099472876004e-07, + "loss": 0.0726, + "step": 1596 + }, + { + "epoch": 13.649572649572649, + "grad_norm": 0.5417065797654803, + "learning_rate": 9.932822251834173e-08, + "loss": 0.1092, + "step": 1597 + }, + { + "epoch": 13.658119658119658, + "grad_norm": 0.44463681603211913, + "learning_rate": 9.808314299691591e-08, + "loss": 0.0518, + "step": 1598 + }, + { + "epoch": 13.666666666666666, + "grad_norm": 0.4935889402856598, + "learning_rate": 9.684576015420277e-08, + "loss": 0.0572, + "step": 1599 + }, + { + "epoch": 13.675213675213675, + "grad_norm": 0.3010999309662844, + "learning_rate": 9.561607795526007e-08, + "loss": 0.0236, + "step": 1600 + }, + { + "epoch": 13.683760683760683, + "grad_norm": 0.4115555617813383, + "learning_rate": 9.439410034046942e-08, + "loss": 0.0355, + "step": 1601 + }, + { + "epoch": 13.692307692307692, + "grad_norm": 0.23478638733192264, + "learning_rate": 9.317983122552332e-08, + "loss": 0.0054, + "step": 1602 + }, + { + "epoch": 13.7008547008547, + "grad_norm": 0.400836154779937, + "learning_rate": 9.197327450141402e-08, + "loss": 0.0422, + "step": 1603 + }, + { + "epoch": 13.709401709401709, + "grad_norm": 0.3816228650221198, + "learning_rate": 9.077443403441994e-08, + "loss": 0.0352, + "step": 1604 + }, + { + "epoch": 13.717948717948717, + "grad_norm": 0.2824770084219213, + "learning_rate": 8.958331366609424e-08, + "loss": 0.0181, + "step": 1605 + }, + { + "epoch": 13.726495726495726, + "grad_norm": 0.40392339013272, + "learning_rate": 8.839991721325047e-08, + "loss": 0.0338, + "step": 1606 + }, + { + "epoch": 13.735042735042736, + "grad_norm": 0.3746265965673062, + "learning_rate": 8.72242484679528e-08, + "loss": 0.0333, + "step": 1607 + }, + { + "epoch": 13.743589743589745, + "grad_norm": 0.4591987932732178, + "learning_rate": 8.605631119750297e-08, + "loss": 0.0426, + "step": 1608 + }, + { + "epoch": 13.752136752136753, + "grad_norm": 0.5298373214853962, + "learning_rate": 8.489610914442697e-08, + "loss": 0.0919, + "step": 1609 + }, + { + "epoch": 13.760683760683762, + "grad_norm": 0.5254343214165913, + "learning_rate": 8.374364602646512e-08, + "loss": 0.0598, + "step": 1610 + }, + { + "epoch": 13.76923076923077, + "grad_norm": 0.2749095870578622, + "learning_rate": 8.259892553655946e-08, + "loss": 0.0097, + "step": 1611 + }, + { + "epoch": 13.777777777777779, + "grad_norm": 0.46888063486471226, + "learning_rate": 8.146195134284052e-08, + "loss": 0.0509, + "step": 1612 + }, + { + "epoch": 13.786324786324787, + "grad_norm": 0.5940355968456708, + "learning_rate": 8.033272708861673e-08, + "loss": 0.0554, + "step": 1613 + }, + { + "epoch": 13.794871794871796, + "grad_norm": 0.3992974882422677, + "learning_rate": 7.921125639236416e-08, + "loss": 0.0434, + "step": 1614 + }, + { + "epoch": 13.803418803418804, + "grad_norm": 0.39756035665555717, + "learning_rate": 7.809754284771181e-08, + "loss": 0.0545, + "step": 1615 + }, + { + "epoch": 13.811965811965813, + "grad_norm": 0.41475580652501604, + "learning_rate": 7.699159002343248e-08, + "loss": 0.0425, + "step": 1616 + }, + { + "epoch": 13.820512820512821, + "grad_norm": 0.2936759248756884, + "learning_rate": 7.589340146343077e-08, + "loss": 0.0287, + "step": 1617 + }, + { + "epoch": 13.82905982905983, + "grad_norm": 0.3087411584867849, + "learning_rate": 7.48029806867312e-08, + "loss": 0.026, + "step": 1618 + }, + { + "epoch": 13.837606837606838, + "grad_norm": 0.5275078873095708, + "learning_rate": 7.372033118746708e-08, + "loss": 0.0581, + "step": 1619 + }, + { + "epoch": 13.846153846153847, + "grad_norm": 0.17224019225984827, + "learning_rate": 7.264545643486997e-08, + "loss": 0.0059, + "step": 1620 + }, + { + "epoch": 13.854700854700855, + "grad_norm": 0.6334224216022728, + "learning_rate": 7.157835987325807e-08, + "loss": 0.1366, + "step": 1621 + }, + { + "epoch": 13.863247863247864, + "grad_norm": 0.46343142790855263, + "learning_rate": 7.051904492202472e-08, + "loss": 0.0127, + "step": 1622 + }, + { + "epoch": 13.871794871794872, + "grad_norm": 0.37134376223079346, + "learning_rate": 6.946751497562909e-08, + "loss": 0.0255, + "step": 1623 + }, + { + "epoch": 13.88034188034188, + "grad_norm": 0.5367950100162845, + "learning_rate": 6.842377340358252e-08, + "loss": 0.0488, + "step": 1624 + }, + { + "epoch": 13.88888888888889, + "grad_norm": 0.4398420912857589, + "learning_rate": 6.738782355044048e-08, + "loss": 0.0478, + "step": 1625 + }, + { + "epoch": 13.897435897435898, + "grad_norm": 0.48278027579001104, + "learning_rate": 6.635966873579063e-08, + "loss": 0.0691, + "step": 1626 + }, + { + "epoch": 13.905982905982906, + "grad_norm": 0.3897204443938729, + "learning_rate": 6.5339312254242e-08, + "loss": 0.0468, + "step": 1627 + }, + { + "epoch": 13.914529914529915, + "grad_norm": 0.450160795007421, + "learning_rate": 6.432675737541499e-08, + "loss": 0.0468, + "step": 1628 + }, + { + "epoch": 13.923076923076923, + "grad_norm": 0.4085390644340357, + "learning_rate": 6.332200734393057e-08, + "loss": 0.0402, + "step": 1629 + }, + { + "epoch": 13.931623931623932, + "grad_norm": 0.34213546747552515, + "learning_rate": 6.232506537939942e-08, + "loss": 0.0271, + "step": 1630 + }, + { + "epoch": 13.94017094017094, + "grad_norm": 0.6332223020088068, + "learning_rate": 6.13359346764128e-08, + "loss": 0.1059, + "step": 1631 + }, + { + "epoch": 13.948717948717949, + "grad_norm": 0.6022072045276918, + "learning_rate": 6.035461840453116e-08, + "loss": 0.1051, + "step": 1632 + }, + { + "epoch": 13.957264957264957, + "grad_norm": 0.514020698868604, + "learning_rate": 5.938111970827526e-08, + "loss": 0.0802, + "step": 1633 + }, + { + "epoch": 13.965811965811966, + "grad_norm": 0.07354635381931279, + "learning_rate": 5.841544170711422e-08, + "loss": 0.0016, + "step": 1634 + }, + { + "epoch": 13.974358974358974, + "grad_norm": 0.27811119952922675, + "learning_rate": 5.745758749545749e-08, + "loss": 0.0154, + "step": 1635 + }, + { + "epoch": 13.982905982905983, + "grad_norm": 0.5656664303415825, + "learning_rate": 5.650756014264347e-08, + "loss": 0.0775, + "step": 1636 + }, + { + "epoch": 13.991452991452991, + "grad_norm": 0.4503223978301062, + "learning_rate": 5.556536269293006e-08, + "loss": 0.046, + "step": 1637 + }, + { + "epoch": 14.0, + "grad_norm": 0.4582080471389822, + "learning_rate": 5.463099816548578e-08, + "loss": 0.0648, + "step": 1638 + }, + { + "epoch": 14.008547008547009, + "grad_norm": 0.2483000115491485, + "learning_rate": 5.3704469554379527e-08, + "loss": 0.0108, + "step": 1639 + }, + { + "epoch": 14.017094017094017, + "grad_norm": 0.5165846852271239, + "learning_rate": 5.278577982857025e-08, + "loss": 0.0615, + "step": 1640 + }, + { + "epoch": 14.025641025641026, + "grad_norm": 0.5347615828908445, + "learning_rate": 5.1874931931897854e-08, + "loss": 0.0691, + "step": 1641 + }, + { + "epoch": 14.034188034188034, + "grad_norm": 0.4100464066416389, + "learning_rate": 5.097192878307455e-08, + "loss": 0.0388, + "step": 1642 + }, + { + "epoch": 14.042735042735043, + "grad_norm": 0.6027519857440805, + "learning_rate": 5.0076773275675174e-08, + "loss": 0.1063, + "step": 1643 + }, + { + "epoch": 14.051282051282051, + "grad_norm": 0.2553611985836167, + "learning_rate": 4.91894682781266e-08, + "loss": 0.0135, + "step": 1644 + }, + { + "epoch": 14.05982905982906, + "grad_norm": 0.5578825939428207, + "learning_rate": 4.831001663370083e-08, + "loss": 0.108, + "step": 1645 + }, + { + "epoch": 14.068376068376068, + "grad_norm": 0.4895323748851477, + "learning_rate": 4.743842116050334e-08, + "loss": 0.043, + "step": 1646 + }, + { + "epoch": 14.076923076923077, + "grad_norm": 0.4128574776178798, + "learning_rate": 4.657468465146642e-08, + "loss": 0.0324, + "step": 1647 + }, + { + "epoch": 14.085470085470085, + "grad_norm": 0.46410297469667755, + "learning_rate": 4.571880987433886e-08, + "loss": 0.0566, + "step": 1648 + }, + { + "epoch": 14.094017094017094, + "grad_norm": 0.3848244141143089, + "learning_rate": 4.487079957167767e-08, + "loss": 0.0359, + "step": 1649 + }, + { + "epoch": 14.102564102564102, + "grad_norm": 0.3511233417593998, + "learning_rate": 4.40306564608381e-08, + "loss": 0.0368, + "step": 1650 + }, + { + "epoch": 14.11111111111111, + "grad_norm": 0.1622634968755906, + "learning_rate": 4.319838323396691e-08, + "loss": 0.0058, + "step": 1651 + }, + { + "epoch": 14.11965811965812, + "grad_norm": 0.21186337433136335, + "learning_rate": 4.237398255799191e-08, + "loss": 0.0081, + "step": 1652 + }, + { + "epoch": 14.128205128205128, + "grad_norm": 0.3672513072749568, + "learning_rate": 4.155745707461467e-08, + "loss": 0.0222, + "step": 1653 + }, + { + "epoch": 14.136752136752136, + "grad_norm": 0.4195684480150308, + "learning_rate": 4.0748809400301403e-08, + "loss": 0.0536, + "step": 1654 + }, + { + "epoch": 14.145299145299145, + "grad_norm": 0.42261292114989146, + "learning_rate": 3.994804212627462e-08, + "loss": 0.0441, + "step": 1655 + }, + { + "epoch": 14.153846153846153, + "grad_norm": 0.25385726081335536, + "learning_rate": 3.9155157818505654e-08, + "loss": 0.0112, + "step": 1656 + }, + { + "epoch": 14.162393162393162, + "grad_norm": 0.5415475073424915, + "learning_rate": 3.8370159017704636e-08, + "loss": 0.0725, + "step": 1657 + }, + { + "epoch": 14.17094017094017, + "grad_norm": 0.5554466464885877, + "learning_rate": 3.759304823931359e-08, + "loss": 0.0651, + "step": 1658 + }, + { + "epoch": 14.179487179487179, + "grad_norm": 0.47897150903692604, + "learning_rate": 3.682382797349976e-08, + "loss": 0.041, + "step": 1659 + }, + { + "epoch": 14.188034188034187, + "grad_norm": 0.3348930277986746, + "learning_rate": 3.6062500685143943e-08, + "loss": 0.0316, + "step": 1660 + }, + { + "epoch": 14.196581196581196, + "grad_norm": 0.5813358682096855, + "learning_rate": 3.5309068813836056e-08, + "loss": 0.0748, + "step": 1661 + }, + { + "epoch": 14.205128205128204, + "grad_norm": 0.5989693941469737, + "learning_rate": 3.4563534773866256e-08, + "loss": 0.1531, + "step": 1662 + }, + { + "epoch": 14.213675213675213, + "grad_norm": 0.3821820832883375, + "learning_rate": 3.382590095421606e-08, + "loss": 0.0352, + "step": 1663 + }, + { + "epoch": 14.222222222222221, + "grad_norm": 0.30134449071243946, + "learning_rate": 3.309616971855195e-08, + "loss": 0.0297, + "step": 1664 + }, + { + "epoch": 14.23076923076923, + "grad_norm": 0.4161419876130332, + "learning_rate": 3.237434340521789e-08, + "loss": 0.0468, + "step": 1665 + }, + { + "epoch": 14.239316239316238, + "grad_norm": 0.5095699132041175, + "learning_rate": 3.166042432722671e-08, + "loss": 0.0964, + "step": 1666 + }, + { + "epoch": 14.247863247863247, + "grad_norm": 0.286414200855273, + "learning_rate": 3.095441477225347e-08, + "loss": 0.0246, + "step": 1667 + }, + { + "epoch": 14.256410256410255, + "grad_norm": 0.41134110401011575, + "learning_rate": 3.025631700262877e-08, + "loss": 0.0423, + "step": 1668 + }, + { + "epoch": 14.264957264957266, + "grad_norm": 0.4134548650175047, + "learning_rate": 2.9566133255329864e-08, + "loss": 0.0493, + "step": 1669 + }, + { + "epoch": 14.273504273504274, + "grad_norm": 0.38804088243316787, + "learning_rate": 2.888386574197488e-08, + "loss": 0.0432, + "step": 1670 + }, + { + "epoch": 14.282051282051283, + "grad_norm": 0.11441142627700734, + "learning_rate": 2.8209516648814996e-08, + "loss": 0.0023, + "step": 1671 + }, + { + "epoch": 14.290598290598291, + "grad_norm": 0.7652757240500461, + "learning_rate": 2.7543088136727792e-08, + "loss": 0.0667, + "step": 1672 + }, + { + "epoch": 14.2991452991453, + "grad_norm": 0.2044348005003793, + "learning_rate": 2.688458234121033e-08, + "loss": 0.0069, + "step": 1673 + }, + { + "epoch": 14.307692307692308, + "grad_norm": 0.5310235005102, + "learning_rate": 2.6234001372372196e-08, + "loss": 0.0796, + "step": 1674 + }, + { + "epoch": 14.316239316239317, + "grad_norm": 0.34314194974327356, + "learning_rate": 2.5591347314928572e-08, + "loss": 0.0364, + "step": 1675 + }, + { + "epoch": 14.324786324786325, + "grad_norm": 0.5649921942906795, + "learning_rate": 2.495662222819356e-08, + "loss": 0.0804, + "step": 1676 + }, + { + "epoch": 14.333333333333334, + "grad_norm": 0.3315299484835121, + "learning_rate": 2.4329828146074096e-08, + "loss": 0.0186, + "step": 1677 + }, + { + "epoch": 14.341880341880342, + "grad_norm": 0.38110887495178786, + "learning_rate": 2.3710967077063275e-08, + "loss": 0.0458, + "step": 1678 + }, + { + "epoch": 14.350427350427351, + "grad_norm": 0.3064729882502098, + "learning_rate": 2.310004100423313e-08, + "loss": 0.0137, + "step": 1679 + }, + { + "epoch": 14.35897435897436, + "grad_norm": 0.4986358072772935, + "learning_rate": 2.2497051885228825e-08, + "loss": 0.0603, + "step": 1680 + }, + { + "epoch": 14.367521367521368, + "grad_norm": 0.22672327025207323, + "learning_rate": 2.190200165226336e-08, + "loss": 0.0097, + "step": 1681 + }, + { + "epoch": 14.376068376068377, + "grad_norm": 0.4968406611382955, + "learning_rate": 2.131489221210953e-08, + "loss": 0.0867, + "step": 1682 + }, + { + "epoch": 14.384615384615385, + "grad_norm": 0.3838968595965074, + "learning_rate": 2.0735725446094924e-08, + "loss": 0.0387, + "step": 1683 + }, + { + "epoch": 14.393162393162394, + "grad_norm": 0.3404780059832446, + "learning_rate": 2.016450321009611e-08, + "loss": 0.028, + "step": 1684 + }, + { + "epoch": 14.401709401709402, + "grad_norm": 0.3873785523798569, + "learning_rate": 1.9601227334531958e-08, + "loss": 0.0353, + "step": 1685 + }, + { + "epoch": 14.41025641025641, + "grad_norm": 0.48467481839954696, + "learning_rate": 1.904589962435782e-08, + "loss": 0.0553, + "step": 1686 + }, + { + "epoch": 14.418803418803419, + "grad_norm": 0.4249363956567172, + "learning_rate": 1.8498521859060814e-08, + "loss": 0.0614, + "step": 1687 + }, + { + "epoch": 14.427350427350428, + "grad_norm": 0.5422143620874084, + "learning_rate": 1.795909579265259e-08, + "loss": 0.0819, + "step": 1688 + }, + { + "epoch": 14.435897435897436, + "grad_norm": 0.31554573773900985, + "learning_rate": 1.7427623153664364e-08, + "loss": 0.0199, + "step": 1689 + }, + { + "epoch": 14.444444444444445, + "grad_norm": 0.5292406231146032, + "learning_rate": 1.6904105645142443e-08, + "loss": 0.0642, + "step": 1690 + }, + { + "epoch": 14.452991452991453, + "grad_norm": 0.33194448347383515, + "learning_rate": 1.638854494464104e-08, + "loss": 0.0441, + "step": 1691 + }, + { + "epoch": 14.461538461538462, + "grad_norm": 0.484854828866342, + "learning_rate": 1.5880942704217528e-08, + "loss": 0.0496, + "step": 1692 + }, + { + "epoch": 14.47008547008547, + "grad_norm": 0.43023899300400237, + "learning_rate": 1.5381300550427748e-08, + "loss": 0.0499, + "step": 1693 + }, + { + "epoch": 14.478632478632479, + "grad_norm": 0.5504551057502816, + "learning_rate": 1.4889620084319878e-08, + "loss": 0.0711, + "step": 1694 + }, + { + "epoch": 14.487179487179487, + "grad_norm": 0.17208942331996188, + "learning_rate": 1.4405902881430289e-08, + "loss": 0.0062, + "step": 1695 + }, + { + "epoch": 14.495726495726496, + "grad_norm": 0.4286125913530637, + "learning_rate": 1.393015049177715e-08, + "loss": 0.0502, + "step": 1696 + }, + { + "epoch": 14.504273504273504, + "grad_norm": 0.4889904434508282, + "learning_rate": 1.3462364439857379e-08, + "loss": 0.0571, + "step": 1697 + }, + { + "epoch": 14.512820512820513, + "grad_norm": 0.4541729938104265, + "learning_rate": 1.3002546224639146e-08, + "loss": 0.0438, + "step": 1698 + }, + { + "epoch": 14.521367521367521, + "grad_norm": 0.638295519083425, + "learning_rate": 1.2550697319560211e-08, + "loss": 0.1397, + "step": 1699 + }, + { + "epoch": 14.52991452991453, + "grad_norm": 0.4341569849655031, + "learning_rate": 1.2106819172520434e-08, + "loss": 0.0517, + "step": 1700 + }, + { + "epoch": 14.538461538461538, + "grad_norm": 0.4068406338564505, + "learning_rate": 1.1670913205878431e-08, + "loss": 0.0718, + "step": 1701 + }, + { + "epoch": 14.547008547008547, + "grad_norm": 0.32365525965014147, + "learning_rate": 1.1242980816447147e-08, + "loss": 0.0196, + "step": 1702 + }, + { + "epoch": 14.555555555555555, + "grad_norm": 0.19470073348110065, + "learning_rate": 1.0823023375489128e-08, + "loss": 0.0076, + "step": 1703 + }, + { + "epoch": 14.564102564102564, + "grad_norm": 0.11448802274939457, + "learning_rate": 1.0411042228711254e-08, + "loss": 0.0028, + "step": 1704 + }, + { + "epoch": 14.572649572649572, + "grad_norm": 0.6691841832251686, + "learning_rate": 1.0007038696262517e-08, + "loss": 0.0916, + "step": 1705 + }, + { + "epoch": 14.581196581196581, + "grad_norm": 0.28586370985153986, + "learning_rate": 9.611014072727354e-09, + "loss": 0.0144, + "step": 1706 + }, + { + "epoch": 14.58974358974359, + "grad_norm": 0.5898421301273966, + "learning_rate": 9.222969627123435e-09, + "loss": 0.0968, + "step": 1707 + }, + { + "epoch": 14.598290598290598, + "grad_norm": 0.16072954161372915, + "learning_rate": 8.842906602896661e-09, + "loss": 0.005, + "step": 1708 + }, + { + "epoch": 14.606837606837606, + "grad_norm": 0.2759661423413884, + "learning_rate": 8.470826217917006e-09, + "loss": 0.0215, + "step": 1709 + }, + { + "epoch": 14.615384615384615, + "grad_norm": 0.6260797976325166, + "learning_rate": 8.106729664475178e-09, + "loss": 0.1173, + "step": 1710 + }, + { + "epoch": 14.623931623931623, + "grad_norm": 0.31248553543290747, + "learning_rate": 7.750618109278464e-09, + "loss": 0.0175, + "step": 1711 + }, + { + "epoch": 14.632478632478632, + "grad_norm": 0.27412422037840384, + "learning_rate": 7.402492693447671e-09, + "loss": 0.0158, + "step": 1712 + }, + { + "epoch": 14.64102564102564, + "grad_norm": 0.435526459964807, + "learning_rate": 7.062354532512416e-09, + "loss": 0.0313, + "step": 1713 + }, + { + "epoch": 14.649572649572649, + "grad_norm": 0.5786644177120002, + "learning_rate": 6.730204716407507e-09, + "loss": 0.0898, + "step": 1714 + }, + { + "epoch": 14.658119658119658, + "grad_norm": 0.19256453915909177, + "learning_rate": 6.406044309471004e-09, + "loss": 0.0083, + "step": 1715 + }, + { + "epoch": 14.666666666666666, + "grad_norm": 0.375666393092363, + "learning_rate": 6.089874350439507e-09, + "loss": 0.0299, + "step": 1716 + }, + { + "epoch": 14.675213675213675, + "grad_norm": 0.5319710614208967, + "learning_rate": 5.781695852444258e-09, + "loss": 0.0983, + "step": 1717 + }, + { + "epoch": 14.683760683760683, + "grad_norm": 0.35482923106684494, + "learning_rate": 5.481509803009766e-09, + "loss": 0.0305, + "step": 1718 + }, + { + "epoch": 14.692307692307692, + "grad_norm": 0.19746564564826924, + "learning_rate": 5.189317164049634e-09, + "loss": 0.0103, + "step": 1719 + }, + { + "epoch": 14.7008547008547, + "grad_norm": 0.4282057813149622, + "learning_rate": 4.905118871862402e-09, + "loss": 0.0368, + "step": 1720 + }, + { + "epoch": 14.709401709401709, + "grad_norm": 0.07684527835039032, + "learning_rate": 4.62891583713071e-09, + "loss": 0.002, + "step": 1721 + }, + { + "epoch": 14.717948717948717, + "grad_norm": 0.5770419703333539, + "learning_rate": 4.3607089449165806e-09, + "loss": 0.1029, + "step": 1722 + }, + { + "epoch": 14.726495726495726, + "grad_norm": 0.3026257207483837, + "learning_rate": 4.100499054659757e-09, + "loss": 0.0175, + "step": 1723 + }, + { + "epoch": 14.735042735042736, + "grad_norm": 0.43807229538708947, + "learning_rate": 3.848287000174089e-09, + "loss": 0.0409, + "step": 1724 + }, + { + "epoch": 14.743589743589745, + "grad_norm": 0.32564952319806223, + "learning_rate": 3.6040735896455957e-09, + "loss": 0.0189, + "step": 1725 + }, + { + "epoch": 14.752136752136753, + "grad_norm": 0.42691653228759036, + "learning_rate": 3.367859605628854e-09, + "loss": 0.0333, + "step": 1726 + }, + { + "epoch": 14.760683760683762, + "grad_norm": 0.32786135774227537, + "learning_rate": 3.139645805046165e-09, + "loss": 0.0214, + "step": 1727 + }, + { + "epoch": 14.76923076923077, + "grad_norm": 0.2744239467920148, + "learning_rate": 2.919432919183396e-09, + "loss": 0.0131, + "step": 1728 + }, + { + "epoch": 14.777777777777779, + "grad_norm": 0.30419227440175195, + "learning_rate": 2.7072216536885855e-09, + "loss": 0.0161, + "step": 1729 + }, + { + "epoch": 14.786324786324787, + "grad_norm": 0.6233695528799661, + "learning_rate": 2.5030126885694505e-09, + "loss": 0.0987, + "step": 1730 + }, + { + "epoch": 14.794871794871796, + "grad_norm": 0.29663516180844607, + "learning_rate": 2.3068066781908873e-09, + "loss": 0.023, + "step": 1731 + }, + { + "epoch": 14.803418803418804, + "grad_norm": 0.32598395103645494, + "learning_rate": 2.118604251273859e-09, + "loss": 0.0308, + "step": 1732 + }, + { + "epoch": 14.811965811965813, + "grad_norm": 0.6182730441581095, + "learning_rate": 1.9384060108923463e-09, + "loss": 0.0955, + "step": 1733 + }, + { + "epoch": 14.820512820512821, + "grad_norm": 0.1185843301282404, + "learning_rate": 1.766212534471401e-09, + "loss": 0.0035, + "step": 1734 + }, + { + "epoch": 14.82905982905983, + "grad_norm": 0.5723507056543405, + "learning_rate": 1.6020243737865926e-09, + "loss": 0.0876, + "step": 1735 + }, + { + "epoch": 14.837606837606838, + "grad_norm": 0.2061314944219676, + "learning_rate": 1.4458420549606777e-09, + "loss": 0.0154, + "step": 1736 + }, + { + "epoch": 14.846153846153847, + "grad_norm": 0.6249720933757367, + "learning_rate": 1.297666078462767e-09, + "loss": 0.1111, + "step": 1737 + }, + { + "epoch": 14.854700854700855, + "grad_norm": 0.2860798677708746, + "learning_rate": 1.1574969191061047e-09, + "loss": 0.0237, + "step": 1738 + }, + { + "epoch": 14.863247863247864, + "grad_norm": 0.280044513165246, + "learning_rate": 1.0253350260480688e-09, + "loss": 0.0228, + "step": 1739 + }, + { + "epoch": 14.871794871794872, + "grad_norm": 0.4494241840830116, + "learning_rate": 9.011808227865626e-10, + "loss": 0.0508, + "step": 1740 + }, + { + "epoch": 14.88034188034188, + "grad_norm": 0.35579500426583943, + "learning_rate": 7.850347071597376e-10, + "loss": 0.0236, + "step": 1741 + }, + { + "epoch": 14.88888888888889, + "grad_norm": 0.13002283698556336, + "learning_rate": 6.768970513457151e-10, + "loss": 0.004, + "step": 1742 + }, + { + "epoch": 14.897435897435898, + "grad_norm": 0.3928491595882552, + "learning_rate": 5.767682018595344e-10, + "loss": 0.0294, + "step": 1743 + }, + { + "epoch": 14.905982905982906, + "grad_norm": 0.31808212261880264, + "learning_rate": 4.846484795528739e-10, + "loss": 0.0263, + "step": 1744 + }, + { + "epoch": 14.914529914529915, + "grad_norm": 0.27391217697985504, + "learning_rate": 4.0053817961321903e-10, + "loss": 0.0188, + "step": 1745 + }, + { + "epoch": 14.923076923076923, + "grad_norm": 0.43076628005985973, + "learning_rate": 3.2443757156330746e-10, + "loss": 0.0529, + "step": 1746 + }, + { + "epoch": 14.931623931623932, + "grad_norm": 0.5306801307778587, + "learning_rate": 2.563468992586304e-10, + "loss": 0.0675, + "step": 1747 + }, + { + "epoch": 14.94017094017094, + "grad_norm": 0.3592276721288456, + "learning_rate": 1.9626638088854344e-10, + "loss": 0.0273, + "step": 1748 + }, + { + "epoch": 14.948717948717949, + "grad_norm": 0.11763294528593544, + "learning_rate": 1.4419620897432318e-10, + "loss": 0.0034, + "step": 1749 + }, + { + "epoch": 14.957264957264957, + "grad_norm": 0.44419236728112543, + "learning_rate": 1.0013655036916758e-10, + "loss": 0.0876, + "step": 1750 + }, + { + "epoch": 14.965811965811966, + "grad_norm": 0.06513844509503423, + "learning_rate": 6.408754625736313e-11, + "loss": 0.0013, + "step": 1751 + }, + { + "epoch": 14.974358974358974, + "grad_norm": 0.32609380727131143, + "learning_rate": 3.604931215400731e-11, + "loss": 0.0196, + "step": 1752 + }, + { + "epoch": 14.982905982905983, + "grad_norm": 0.5441264595409655, + "learning_rate": 1.6021937904731054e-11, + "loss": 0.0642, + "step": 1753 + }, + { + "epoch": 14.991452991452991, + "grad_norm": 0.21315543474359916, + "learning_rate": 4.005487684866083e-12, + "loss": 0.0091, + "step": 1754 + }, + { + "epoch": 15.0, + "grad_norm": 0.38663943234097126, + "learning_rate": 0.0, + "loss": 0.034, + "step": 1755 + }, + { + "epoch": 15.0, + "step": 1755, + "total_flos": 170425732071424.0, + "train_loss": 0.2854139506731957, + "train_runtime": 13259.2544, + "train_samples_per_second": 0.924, + "train_steps_per_second": 0.132 + } + ], + "logging_steps": 1, + "max_steps": 1755, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 170425732071424.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}