{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 13065, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003827018752391887, "grad_norm": 1.0470714569091797, "learning_rate": 0.0, "loss": 1.2161, "step": 1 }, { "epoch": 0.0007654037504783774, "grad_norm": 1.1862388849258423, "learning_rate": 5.1020408163265303e-08, "loss": 1.3157, "step": 2 }, { "epoch": 0.001148105625717566, "grad_norm": 1.0985468626022339, "learning_rate": 1.0204081632653061e-07, "loss": 1.1799, "step": 3 }, { "epoch": 0.0015308075009567547, "grad_norm": 1.0894755125045776, "learning_rate": 1.5306122448979592e-07, "loss": 1.2403, "step": 4 }, { "epoch": 0.0019135093761959434, "grad_norm": 0.939231276512146, "learning_rate": 2.0408163265306121e-07, "loss": 1.2529, "step": 5 }, { "epoch": 0.002296211251435132, "grad_norm": 1.0093467235565186, "learning_rate": 2.5510204081632656e-07, "loss": 1.1994, "step": 6 }, { "epoch": 0.0026789131266743206, "grad_norm": 0.9747366905212402, "learning_rate": 3.0612244897959183e-07, "loss": 1.2743, "step": 7 }, { "epoch": 0.0030616150019135095, "grad_norm": 0.9402337074279785, "learning_rate": 3.5714285714285716e-07, "loss": 1.2864, "step": 8 }, { "epoch": 0.003444316877152698, "grad_norm": 1.0305005311965942, "learning_rate": 4.0816326530612243e-07, "loss": 1.2991, "step": 9 }, { "epoch": 0.003827018752391887, "grad_norm": 0.96921706199646, "learning_rate": 4.591836734693878e-07, "loss": 1.2717, "step": 10 }, { "epoch": 0.004209720627631076, "grad_norm": 1.0017738342285156, "learning_rate": 5.102040816326531e-07, "loss": 1.2183, "step": 11 }, { "epoch": 0.004592422502870264, "grad_norm": 1.0692427158355713, "learning_rate": 5.612244897959184e-07, "loss": 1.2483, "step": 12 }, { "epoch": 0.004975124378109453, "grad_norm": 1.05031418800354, "learning_rate": 6.122448979591837e-07, "loss": 1.1991, "step": 13 }, { "epoch": 0.005357826253348641, "grad_norm": 1.059110164642334, "learning_rate": 6.632653061224491e-07, "loss": 1.2757, "step": 14 }, { "epoch": 0.0057405281285878304, "grad_norm": 0.9646329283714294, "learning_rate": 7.142857142857143e-07, "loss": 1.1975, "step": 15 }, { "epoch": 0.006123230003827019, "grad_norm": 0.9525495171546936, "learning_rate": 7.653061224489796e-07, "loss": 1.2378, "step": 16 }, { "epoch": 0.006505931879066207, "grad_norm": 1.0997979640960693, "learning_rate": 8.163265306122449e-07, "loss": 1.2017, "step": 17 }, { "epoch": 0.006888633754305396, "grad_norm": 1.008820652961731, "learning_rate": 8.673469387755103e-07, "loss": 1.208, "step": 18 }, { "epoch": 0.007271335629544585, "grad_norm": 1.0060170888900757, "learning_rate": 9.183673469387756e-07, "loss": 1.2222, "step": 19 }, { "epoch": 0.007654037504783774, "grad_norm": 1.0678315162658691, "learning_rate": 9.69387755102041e-07, "loss": 1.2071, "step": 20 }, { "epoch": 0.008036739380022962, "grad_norm": 1.0120142698287964, "learning_rate": 1.0204081632653063e-06, "loss": 1.1401, "step": 21 }, { "epoch": 0.008419441255262151, "grad_norm": 0.9455398321151733, "learning_rate": 1.0714285714285714e-06, "loss": 1.2188, "step": 22 }, { "epoch": 0.008802143130501339, "grad_norm": 1.064963936805725, "learning_rate": 1.122448979591837e-06, "loss": 1.1924, "step": 23 }, { "epoch": 0.009184845005740528, "grad_norm": 0.9984447360038757, "learning_rate": 1.1734693877551022e-06, "loss": 1.2089, "step": 24 }, { "epoch": 0.009567546880979716, "grad_norm": 1.06656014919281, "learning_rate": 1.2244897959183673e-06, "loss": 1.2195, "step": 25 }, { "epoch": 0.009950248756218905, "grad_norm": 0.9282373785972595, "learning_rate": 1.2755102040816329e-06, "loss": 1.3016, "step": 26 }, { "epoch": 0.010332950631458095, "grad_norm": 0.9531068205833435, "learning_rate": 1.3265306122448982e-06, "loss": 1.252, "step": 27 }, { "epoch": 0.010715652506697282, "grad_norm": 0.8497799634933472, "learning_rate": 1.3775510204081633e-06, "loss": 1.262, "step": 28 }, { "epoch": 0.011098354381936472, "grad_norm": 0.8742958903312683, "learning_rate": 1.4285714285714286e-06, "loss": 1.2076, "step": 29 }, { "epoch": 0.011481056257175661, "grad_norm": 0.8289865255355835, "learning_rate": 1.479591836734694e-06, "loss": 1.186, "step": 30 }, { "epoch": 0.011863758132414848, "grad_norm": 0.843896210193634, "learning_rate": 1.5306122448979593e-06, "loss": 1.119, "step": 31 }, { "epoch": 0.012246460007654038, "grad_norm": 0.9261254072189331, "learning_rate": 1.5816326530612248e-06, "loss": 1.1725, "step": 32 }, { "epoch": 0.012629161882893225, "grad_norm": 0.8471044898033142, "learning_rate": 1.6326530612244897e-06, "loss": 1.1863, "step": 33 }, { "epoch": 0.013011863758132415, "grad_norm": 0.8763002157211304, "learning_rate": 1.6836734693877552e-06, "loss": 1.1707, "step": 34 }, { "epoch": 0.013394565633371604, "grad_norm": 0.8116949200630188, "learning_rate": 1.7346938775510206e-06, "loss": 1.1727, "step": 35 }, { "epoch": 0.013777267508610792, "grad_norm": 0.8081701397895813, "learning_rate": 1.7857142857142859e-06, "loss": 1.1366, "step": 36 }, { "epoch": 0.014159969383849981, "grad_norm": 0.7538459897041321, "learning_rate": 1.8367346938775512e-06, "loss": 1.1781, "step": 37 }, { "epoch": 0.01454267125908917, "grad_norm": 0.6645963191986084, "learning_rate": 1.8877551020408163e-06, "loss": 1.132, "step": 38 }, { "epoch": 0.014925373134328358, "grad_norm": 0.6984105110168457, "learning_rate": 1.938775510204082e-06, "loss": 1.1343, "step": 39 }, { "epoch": 0.015308075009567547, "grad_norm": 0.7099324464797974, "learning_rate": 1.989795918367347e-06, "loss": 1.1385, "step": 40 }, { "epoch": 0.015690776884806735, "grad_norm": 0.6204515099525452, "learning_rate": 2.0408163265306125e-06, "loss": 1.0954, "step": 41 }, { "epoch": 0.016073478760045924, "grad_norm": 0.6240230798721313, "learning_rate": 2.0918367346938776e-06, "loss": 1.1775, "step": 42 }, { "epoch": 0.016456180635285114, "grad_norm": 0.6010141372680664, "learning_rate": 2.1428571428571427e-06, "loss": 1.154, "step": 43 }, { "epoch": 0.016838882510524303, "grad_norm": 0.7546166181564331, "learning_rate": 2.1938775510204083e-06, "loss": 1.1446, "step": 44 }, { "epoch": 0.01722158438576349, "grad_norm": 0.6318696141242981, "learning_rate": 2.244897959183674e-06, "loss": 1.1769, "step": 45 }, { "epoch": 0.017604286261002678, "grad_norm": 0.662175178527832, "learning_rate": 2.295918367346939e-06, "loss": 1.1725, "step": 46 }, { "epoch": 0.017986988136241867, "grad_norm": 0.6639765501022339, "learning_rate": 2.3469387755102044e-06, "loss": 1.139, "step": 47 }, { "epoch": 0.018369690011481057, "grad_norm": 0.6464126706123352, "learning_rate": 2.3979591836734696e-06, "loss": 1.1531, "step": 48 }, { "epoch": 0.018752391886720246, "grad_norm": 0.8371584415435791, "learning_rate": 2.4489795918367347e-06, "loss": 1.0944, "step": 49 }, { "epoch": 0.019135093761959432, "grad_norm": 0.7249736189842224, "learning_rate": 2.5e-06, "loss": 1.1306, "step": 50 }, { "epoch": 0.01951779563719862, "grad_norm": 0.6618884205818176, "learning_rate": 2.5510204081632657e-06, "loss": 1.1012, "step": 51 }, { "epoch": 0.01990049751243781, "grad_norm": 0.7077500224113464, "learning_rate": 2.602040816326531e-06, "loss": 1.1033, "step": 52 }, { "epoch": 0.020283199387677, "grad_norm": 0.7692103385925293, "learning_rate": 2.6530612244897964e-06, "loss": 1.2126, "step": 53 }, { "epoch": 0.02066590126291619, "grad_norm": 0.6467433571815491, "learning_rate": 2.7040816326530615e-06, "loss": 1.1488, "step": 54 }, { "epoch": 0.02104860313815538, "grad_norm": 0.657614529132843, "learning_rate": 2.7551020408163266e-06, "loss": 1.1838, "step": 55 }, { "epoch": 0.021431305013394564, "grad_norm": 0.5774511098861694, "learning_rate": 2.8061224489795917e-06, "loss": 1.1179, "step": 56 }, { "epoch": 0.021814006888633754, "grad_norm": 0.5620210766792297, "learning_rate": 2.8571428571428573e-06, "loss": 1.1095, "step": 57 }, { "epoch": 0.022196708763872943, "grad_norm": 0.5795685648918152, "learning_rate": 2.908163265306123e-06, "loss": 1.1118, "step": 58 }, { "epoch": 0.022579410639112132, "grad_norm": 0.5146721005439758, "learning_rate": 2.959183673469388e-06, "loss": 1.0787, "step": 59 }, { "epoch": 0.022962112514351322, "grad_norm": 0.5769478678703308, "learning_rate": 3.0102040816326534e-06, "loss": 1.0921, "step": 60 }, { "epoch": 0.023344814389590508, "grad_norm": 0.5526740550994873, "learning_rate": 3.0612244897959185e-06, "loss": 1.0955, "step": 61 }, { "epoch": 0.023727516264829697, "grad_norm": 0.5817594528198242, "learning_rate": 3.112244897959184e-06, "loss": 1.0991, "step": 62 }, { "epoch": 0.024110218140068886, "grad_norm": 0.5312849283218384, "learning_rate": 3.1632653061224496e-06, "loss": 1.1036, "step": 63 }, { "epoch": 0.024492920015308076, "grad_norm": 0.5835770964622498, "learning_rate": 3.2142857142857147e-06, "loss": 1.1085, "step": 64 }, { "epoch": 0.024875621890547265, "grad_norm": 0.5763447284698486, "learning_rate": 3.2653061224489794e-06, "loss": 0.9873, "step": 65 }, { "epoch": 0.02525832376578645, "grad_norm": 0.6604176163673401, "learning_rate": 3.316326530612245e-06, "loss": 1.0647, "step": 66 }, { "epoch": 0.02564102564102564, "grad_norm": 0.5478323698043823, "learning_rate": 3.3673469387755105e-06, "loss": 1.0951, "step": 67 }, { "epoch": 0.02602372751626483, "grad_norm": 0.534952700138092, "learning_rate": 3.4183673469387756e-06, "loss": 1.1444, "step": 68 }, { "epoch": 0.02640642939150402, "grad_norm": 0.5427554845809937, "learning_rate": 3.469387755102041e-06, "loss": 1.1034, "step": 69 }, { "epoch": 0.026789131266743208, "grad_norm": 0.5796560645103455, "learning_rate": 3.5204081632653062e-06, "loss": 1.1159, "step": 70 }, { "epoch": 0.027171833141982394, "grad_norm": 0.5019088387489319, "learning_rate": 3.5714285714285718e-06, "loss": 1.0352, "step": 71 }, { "epoch": 0.027554535017221583, "grad_norm": 0.5082002282142639, "learning_rate": 3.6224489795918373e-06, "loss": 1.0558, "step": 72 }, { "epoch": 0.027937236892460773, "grad_norm": 0.6052938103675842, "learning_rate": 3.6734693877551024e-06, "loss": 1.1061, "step": 73 }, { "epoch": 0.028319938767699962, "grad_norm": 0.48267167806625366, "learning_rate": 3.724489795918368e-06, "loss": 1.1048, "step": 74 }, { "epoch": 0.02870264064293915, "grad_norm": 0.5059983730316162, "learning_rate": 3.7755102040816327e-06, "loss": 0.9949, "step": 75 }, { "epoch": 0.02908534251817834, "grad_norm": 0.518397867679596, "learning_rate": 3.826530612244898e-06, "loss": 1.1018, "step": 76 }, { "epoch": 0.029468044393417527, "grad_norm": 0.47104519605636597, "learning_rate": 3.877551020408164e-06, "loss": 1.0287, "step": 77 }, { "epoch": 0.029850746268656716, "grad_norm": 0.4876571595668793, "learning_rate": 3.928571428571429e-06, "loss": 0.9938, "step": 78 }, { "epoch": 0.030233448143895905, "grad_norm": 0.45038989186286926, "learning_rate": 3.979591836734694e-06, "loss": 1.0165, "step": 79 }, { "epoch": 0.030616150019135095, "grad_norm": 0.5359657406806946, "learning_rate": 4.03061224489796e-06, "loss": 1.0505, "step": 80 }, { "epoch": 0.030998851894374284, "grad_norm": 0.4649733603000641, "learning_rate": 4.081632653061225e-06, "loss": 1.0479, "step": 81 }, { "epoch": 0.03138155376961347, "grad_norm": 0.5655681490898132, "learning_rate": 4.13265306122449e-06, "loss": 1.022, "step": 82 }, { "epoch": 0.03176425564485266, "grad_norm": 0.49515441060066223, "learning_rate": 4.183673469387755e-06, "loss": 1.0706, "step": 83 }, { "epoch": 0.03214695752009185, "grad_norm": 0.5692971348762512, "learning_rate": 4.234693877551021e-06, "loss": 1.0011, "step": 84 }, { "epoch": 0.032529659395331034, "grad_norm": 0.47044047713279724, "learning_rate": 4.2857142857142855e-06, "loss": 1.0068, "step": 85 }, { "epoch": 0.03291236127057023, "grad_norm": 0.45850053429603577, "learning_rate": 4.336734693877551e-06, "loss": 0.9753, "step": 86 }, { "epoch": 0.03329506314580941, "grad_norm": 0.4482385516166687, "learning_rate": 4.3877551020408165e-06, "loss": 0.9636, "step": 87 }, { "epoch": 0.033677765021048606, "grad_norm": 0.4905047118663788, "learning_rate": 4.438775510204082e-06, "loss": 0.9946, "step": 88 }, { "epoch": 0.03406046689628779, "grad_norm": 0.46900540590286255, "learning_rate": 4.489795918367348e-06, "loss": 1.104, "step": 89 }, { "epoch": 0.03444316877152698, "grad_norm": 0.47829490900039673, "learning_rate": 4.540816326530613e-06, "loss": 1.0084, "step": 90 }, { "epoch": 0.03482587064676617, "grad_norm": 0.5256441831588745, "learning_rate": 4.591836734693878e-06, "loss": 1.0724, "step": 91 }, { "epoch": 0.035208572522005356, "grad_norm": 0.4520411789417267, "learning_rate": 4.642857142857144e-06, "loss": 0.971, "step": 92 }, { "epoch": 0.03559127439724455, "grad_norm": 0.5247328281402588, "learning_rate": 4.693877551020409e-06, "loss": 1.0251, "step": 93 }, { "epoch": 0.035973976272483735, "grad_norm": 0.6844139099121094, "learning_rate": 4.744897959183674e-06, "loss": 1.0369, "step": 94 }, { "epoch": 0.03635667814772292, "grad_norm": 0.4872569739818573, "learning_rate": 4.795918367346939e-06, "loss": 1.0162, "step": 95 }, { "epoch": 0.03673938002296211, "grad_norm": 0.4742879867553711, "learning_rate": 4.846938775510204e-06, "loss": 0.9565, "step": 96 }, { "epoch": 0.0371220818982013, "grad_norm": 0.43982839584350586, "learning_rate": 4.897959183673469e-06, "loss": 0.9907, "step": 97 }, { "epoch": 0.03750478377344049, "grad_norm": 0.5756723284721375, "learning_rate": 4.948979591836735e-06, "loss": 1.0337, "step": 98 }, { "epoch": 0.03788748564867968, "grad_norm": 0.5395206212997437, "learning_rate": 5e-06, "loss": 0.997, "step": 99 }, { "epoch": 0.038270187523918864, "grad_norm": 0.4611997902393341, "learning_rate": 5.0510204081632655e-06, "loss": 0.9466, "step": 100 }, { "epoch": 0.03865288939915806, "grad_norm": 0.5089948773384094, "learning_rate": 5.1020408163265315e-06, "loss": 0.9507, "step": 101 }, { "epoch": 0.03903559127439724, "grad_norm": 0.4774022400379181, "learning_rate": 5.153061224489796e-06, "loss": 1.0482, "step": 102 }, { "epoch": 0.039418293149636435, "grad_norm": 0.5364905595779419, "learning_rate": 5.204081632653062e-06, "loss": 1.0697, "step": 103 }, { "epoch": 0.03980099502487562, "grad_norm": 0.4609629511833191, "learning_rate": 5.255102040816327e-06, "loss": 0.9926, "step": 104 }, { "epoch": 0.040183696900114814, "grad_norm": 0.4585234224796295, "learning_rate": 5.306122448979593e-06, "loss": 1.0221, "step": 105 }, { "epoch": 0.040566398775354, "grad_norm": 0.45140090584754944, "learning_rate": 5.357142857142857e-06, "loss": 0.9713, "step": 106 }, { "epoch": 0.040949100650593186, "grad_norm": 0.49967166781425476, "learning_rate": 5.408163265306123e-06, "loss": 0.9612, "step": 107 }, { "epoch": 0.04133180252583238, "grad_norm": 0.45559340715408325, "learning_rate": 5.459183673469388e-06, "loss": 0.9503, "step": 108 }, { "epoch": 0.041714504401071564, "grad_norm": 0.44254982471466064, "learning_rate": 5.510204081632653e-06, "loss": 1.0522, "step": 109 }, { "epoch": 0.04209720627631076, "grad_norm": 0.5875375270843506, "learning_rate": 5.561224489795919e-06, "loss": 0.957, "step": 110 }, { "epoch": 0.04247990815154994, "grad_norm": 0.5845476984977722, "learning_rate": 5.6122448979591834e-06, "loss": 1.0481, "step": 111 }, { "epoch": 0.04286261002678913, "grad_norm": 0.5500447154045105, "learning_rate": 5.663265306122449e-06, "loss": 0.9799, "step": 112 }, { "epoch": 0.04324531190202832, "grad_norm": 0.47424161434173584, "learning_rate": 5.7142857142857145e-06, "loss": 0.9475, "step": 113 }, { "epoch": 0.04362801377726751, "grad_norm": 0.5487874746322632, "learning_rate": 5.7653061224489805e-06, "loss": 0.9911, "step": 114 }, { "epoch": 0.0440107156525067, "grad_norm": 0.482888400554657, "learning_rate": 5.816326530612246e-06, "loss": 0.9489, "step": 115 }, { "epoch": 0.044393417527745886, "grad_norm": 0.47023066878318787, "learning_rate": 5.867346938775511e-06, "loss": 0.9713, "step": 116 }, { "epoch": 0.04477611940298507, "grad_norm": 0.4742276072502136, "learning_rate": 5.918367346938776e-06, "loss": 0.9496, "step": 117 }, { "epoch": 0.045158821278224265, "grad_norm": 0.4852151870727539, "learning_rate": 5.969387755102042e-06, "loss": 0.9077, "step": 118 }, { "epoch": 0.04554152315346345, "grad_norm": 0.4497857093811035, "learning_rate": 6.020408163265307e-06, "loss": 1.0174, "step": 119 }, { "epoch": 0.045924225028702644, "grad_norm": 0.48187682032585144, "learning_rate": 6.071428571428571e-06, "loss": 1.0018, "step": 120 }, { "epoch": 0.04630692690394183, "grad_norm": 0.4756864309310913, "learning_rate": 6.122448979591837e-06, "loss": 0.9621, "step": 121 }, { "epoch": 0.046689628779181015, "grad_norm": 0.43599390983581543, "learning_rate": 6.173469387755102e-06, "loss": 0.9397, "step": 122 }, { "epoch": 0.04707233065442021, "grad_norm": 0.4520189166069031, "learning_rate": 6.224489795918368e-06, "loss": 0.9393, "step": 123 }, { "epoch": 0.047455032529659394, "grad_norm": 0.4620411992073059, "learning_rate": 6.275510204081633e-06, "loss": 0.9554, "step": 124 }, { "epoch": 0.04783773440489859, "grad_norm": 0.49836039543151855, "learning_rate": 6.326530612244899e-06, "loss": 0.9806, "step": 125 }, { "epoch": 0.04822043628013777, "grad_norm": 0.4458112418651581, "learning_rate": 6.3775510204081635e-06, "loss": 0.9483, "step": 126 }, { "epoch": 0.04860313815537696, "grad_norm": 0.4571883976459503, "learning_rate": 6.4285714285714295e-06, "loss": 0.9161, "step": 127 }, { "epoch": 0.04898584003061615, "grad_norm": 0.48349729180336, "learning_rate": 6.4795918367346946e-06, "loss": 0.8674, "step": 128 }, { "epoch": 0.04936854190585534, "grad_norm": 0.48979923129081726, "learning_rate": 6.530612244897959e-06, "loss": 0.9522, "step": 129 }, { "epoch": 0.04975124378109453, "grad_norm": 0.48730936646461487, "learning_rate": 6.581632653061225e-06, "loss": 0.8748, "step": 130 }, { "epoch": 0.050133945656333716, "grad_norm": 0.5095522403717041, "learning_rate": 6.63265306122449e-06, "loss": 0.9102, "step": 131 }, { "epoch": 0.0505166475315729, "grad_norm": 0.5574443936347961, "learning_rate": 6.683673469387756e-06, "loss": 0.962, "step": 132 }, { "epoch": 0.050899349406812094, "grad_norm": 0.5862312316894531, "learning_rate": 6.734693877551021e-06, "loss": 0.9191, "step": 133 }, { "epoch": 0.05128205128205128, "grad_norm": 0.5606891512870789, "learning_rate": 6.785714285714287e-06, "loss": 0.9132, "step": 134 }, { "epoch": 0.05166475315729047, "grad_norm": 0.5124997496604919, "learning_rate": 6.836734693877551e-06, "loss": 0.9198, "step": 135 }, { "epoch": 0.05204745503252966, "grad_norm": 0.4736429452896118, "learning_rate": 6.887755102040817e-06, "loss": 0.8699, "step": 136 }, { "epoch": 0.052430156907768845, "grad_norm": 0.4190003275871277, "learning_rate": 6.938775510204082e-06, "loss": 0.9046, "step": 137 }, { "epoch": 0.05281285878300804, "grad_norm": 0.48180967569351196, "learning_rate": 6.989795918367348e-06, "loss": 0.9333, "step": 138 }, { "epoch": 0.05319556065824722, "grad_norm": 0.5333188772201538, "learning_rate": 7.0408163265306125e-06, "loss": 0.9121, "step": 139 }, { "epoch": 0.053578262533486416, "grad_norm": 0.5261953473091125, "learning_rate": 7.091836734693878e-06, "loss": 0.9, "step": 140 }, { "epoch": 0.0539609644087256, "grad_norm": 0.604557991027832, "learning_rate": 7.1428571428571436e-06, "loss": 0.9124, "step": 141 }, { "epoch": 0.05434366628396479, "grad_norm": 0.4716319441795349, "learning_rate": 7.193877551020409e-06, "loss": 0.8781, "step": 142 }, { "epoch": 0.05472636815920398, "grad_norm": 0.5105631947517395, "learning_rate": 7.244897959183675e-06, "loss": 0.9182, "step": 143 }, { "epoch": 0.05510907003444317, "grad_norm": 0.5327917337417603, "learning_rate": 7.295918367346939e-06, "loss": 0.9256, "step": 144 }, { "epoch": 0.05549177190968236, "grad_norm": 0.5028150081634521, "learning_rate": 7.346938775510205e-06, "loss": 0.8789, "step": 145 }, { "epoch": 0.055874473784921545, "grad_norm": 0.4696621000766754, "learning_rate": 7.39795918367347e-06, "loss": 0.9515, "step": 146 }, { "epoch": 0.05625717566016074, "grad_norm": 0.5244583487510681, "learning_rate": 7.448979591836736e-06, "loss": 0.8974, "step": 147 }, { "epoch": 0.056639877535399924, "grad_norm": 0.5413923263549805, "learning_rate": 7.500000000000001e-06, "loss": 0.9241, "step": 148 }, { "epoch": 0.05702257941063911, "grad_norm": 0.5443688035011292, "learning_rate": 7.551020408163265e-06, "loss": 0.9564, "step": 149 }, { "epoch": 0.0574052812858783, "grad_norm": 0.47243309020996094, "learning_rate": 7.602040816326531e-06, "loss": 0.9388, "step": 150 }, { "epoch": 0.05778798316111749, "grad_norm": 0.48446881771087646, "learning_rate": 7.653061224489796e-06, "loss": 0.8474, "step": 151 }, { "epoch": 0.05817068503635668, "grad_norm": 0.4934194087982178, "learning_rate": 7.704081632653061e-06, "loss": 0.9026, "step": 152 }, { "epoch": 0.05855338691159587, "grad_norm": 0.5428822636604309, "learning_rate": 7.755102040816327e-06, "loss": 0.9406, "step": 153 }, { "epoch": 0.05893608878683505, "grad_norm": 0.5220634341239929, "learning_rate": 7.806122448979593e-06, "loss": 0.8356, "step": 154 }, { "epoch": 0.059318790662074246, "grad_norm": 0.49670955538749695, "learning_rate": 7.857142857142858e-06, "loss": 0.8765, "step": 155 }, { "epoch": 0.05970149253731343, "grad_norm": 0.5247951745986938, "learning_rate": 7.908163265306124e-06, "loss": 0.8565, "step": 156 }, { "epoch": 0.060084194412552624, "grad_norm": 0.4811428189277649, "learning_rate": 7.959183673469388e-06, "loss": 0.9383, "step": 157 }, { "epoch": 0.06046689628779181, "grad_norm": 0.5022870302200317, "learning_rate": 8.010204081632654e-06, "loss": 0.9598, "step": 158 }, { "epoch": 0.060849598163030996, "grad_norm": 0.4738556146621704, "learning_rate": 8.06122448979592e-06, "loss": 0.8259, "step": 159 }, { "epoch": 0.06123230003827019, "grad_norm": 0.4968925416469574, "learning_rate": 8.112244897959184e-06, "loss": 0.9179, "step": 160 }, { "epoch": 0.061615001913509375, "grad_norm": 0.5133926868438721, "learning_rate": 8.16326530612245e-06, "loss": 0.8786, "step": 161 }, { "epoch": 0.06199770378874857, "grad_norm": 0.5027845501899719, "learning_rate": 8.214285714285714e-06, "loss": 0.9818, "step": 162 }, { "epoch": 0.062380405663987754, "grad_norm": 0.49573755264282227, "learning_rate": 8.26530612244898e-06, "loss": 0.9224, "step": 163 }, { "epoch": 0.06276310753922694, "grad_norm": 0.5092456936836243, "learning_rate": 8.316326530612246e-06, "loss": 0.9417, "step": 164 }, { "epoch": 0.06314580941446613, "grad_norm": 0.5030081868171692, "learning_rate": 8.36734693877551e-06, "loss": 0.8152, "step": 165 }, { "epoch": 0.06352851128970533, "grad_norm": 0.36900198459625244, "learning_rate": 8.418367346938776e-06, "loss": 0.8006, "step": 166 }, { "epoch": 0.06391121316494451, "grad_norm": 0.4496842622756958, "learning_rate": 8.469387755102042e-06, "loss": 0.8787, "step": 167 }, { "epoch": 0.0642939150401837, "grad_norm": 0.4092608392238617, "learning_rate": 8.520408163265307e-06, "loss": 0.8226, "step": 168 }, { "epoch": 0.06467661691542288, "grad_norm": 0.45240938663482666, "learning_rate": 8.571428571428571e-06, "loss": 0.8679, "step": 169 }, { "epoch": 0.06505931879066207, "grad_norm": 0.5009729266166687, "learning_rate": 8.622448979591837e-06, "loss": 0.8525, "step": 170 }, { "epoch": 0.06544202066590127, "grad_norm": 0.49245187640190125, "learning_rate": 8.673469387755103e-06, "loss": 0.8501, "step": 171 }, { "epoch": 0.06582472254114045, "grad_norm": 0.46312233805656433, "learning_rate": 8.724489795918369e-06, "loss": 0.8729, "step": 172 }, { "epoch": 0.06620742441637964, "grad_norm": 0.42403554916381836, "learning_rate": 8.775510204081633e-06, "loss": 0.8614, "step": 173 }, { "epoch": 0.06659012629161883, "grad_norm": 0.5533359050750732, "learning_rate": 8.826530612244899e-06, "loss": 0.8871, "step": 174 }, { "epoch": 0.06697282816685801, "grad_norm": 0.47463229298591614, "learning_rate": 8.877551020408163e-06, "loss": 0.884, "step": 175 }, { "epoch": 0.06735553004209721, "grad_norm": 0.47151339054107666, "learning_rate": 8.92857142857143e-06, "loss": 0.8311, "step": 176 }, { "epoch": 0.0677382319173364, "grad_norm": 0.4776838421821594, "learning_rate": 8.979591836734695e-06, "loss": 0.8594, "step": 177 }, { "epoch": 0.06812093379257558, "grad_norm": 0.47697770595550537, "learning_rate": 9.03061224489796e-06, "loss": 0.8921, "step": 178 }, { "epoch": 0.06850363566781477, "grad_norm": 0.5196052193641663, "learning_rate": 9.081632653061225e-06, "loss": 0.9104, "step": 179 }, { "epoch": 0.06888633754305395, "grad_norm": 0.46096763014793396, "learning_rate": 9.13265306122449e-06, "loss": 0.9264, "step": 180 }, { "epoch": 0.06926903941829315, "grad_norm": 0.4584346115589142, "learning_rate": 9.183673469387756e-06, "loss": 0.8591, "step": 181 }, { "epoch": 0.06965174129353234, "grad_norm": 0.6072789430618286, "learning_rate": 9.234693877551022e-06, "loss": 0.9005, "step": 182 }, { "epoch": 0.07003444316877153, "grad_norm": 0.4603135287761688, "learning_rate": 9.285714285714288e-06, "loss": 0.8881, "step": 183 }, { "epoch": 0.07041714504401071, "grad_norm": 0.5829232335090637, "learning_rate": 9.336734693877552e-06, "loss": 0.8657, "step": 184 }, { "epoch": 0.0707998469192499, "grad_norm": 0.4620888829231262, "learning_rate": 9.387755102040818e-06, "loss": 0.8981, "step": 185 }, { "epoch": 0.0711825487944891, "grad_norm": 0.4570513367652893, "learning_rate": 9.438775510204082e-06, "loss": 0.9086, "step": 186 }, { "epoch": 0.07156525066972828, "grad_norm": 0.4923151731491089, "learning_rate": 9.489795918367348e-06, "loss": 0.9013, "step": 187 }, { "epoch": 0.07194795254496747, "grad_norm": 0.46703165769577026, "learning_rate": 9.540816326530612e-06, "loss": 0.8472, "step": 188 }, { "epoch": 0.07233065442020666, "grad_norm": 0.4954587519168854, "learning_rate": 9.591836734693878e-06, "loss": 0.8744, "step": 189 }, { "epoch": 0.07271335629544584, "grad_norm": 0.41913801431655884, "learning_rate": 9.642857142857144e-06, "loss": 0.8826, "step": 190 }, { "epoch": 0.07309605817068504, "grad_norm": 0.42646127939224243, "learning_rate": 9.693877551020408e-06, "loss": 0.8131, "step": 191 }, { "epoch": 0.07347876004592423, "grad_norm": 0.48656418919563293, "learning_rate": 9.744897959183674e-06, "loss": 0.8716, "step": 192 }, { "epoch": 0.07386146192116341, "grad_norm": 0.4388425052165985, "learning_rate": 9.795918367346939e-06, "loss": 0.8115, "step": 193 }, { "epoch": 0.0742441637964026, "grad_norm": 0.432698130607605, "learning_rate": 9.846938775510205e-06, "loss": 0.7428, "step": 194 }, { "epoch": 0.07462686567164178, "grad_norm": 0.4851953387260437, "learning_rate": 9.89795918367347e-06, "loss": 0.758, "step": 195 }, { "epoch": 0.07500956754688098, "grad_norm": 0.4156672954559326, "learning_rate": 9.948979591836737e-06, "loss": 0.8463, "step": 196 }, { "epoch": 0.07539226942212017, "grad_norm": 0.43970349431037903, "learning_rate": 1e-05, "loss": 0.8181, "step": 197 }, { "epoch": 0.07577497129735936, "grad_norm": 0.4336532652378082, "learning_rate": 1.0051020408163265e-05, "loss": 0.8643, "step": 198 }, { "epoch": 0.07615767317259854, "grad_norm": 0.44039225578308105, "learning_rate": 1.0102040816326531e-05, "loss": 0.8278, "step": 199 }, { "epoch": 0.07654037504783773, "grad_norm": 0.5034912824630737, "learning_rate": 1.0153061224489797e-05, "loss": 0.8259, "step": 200 }, { "epoch": 0.07692307692307693, "grad_norm": 0.513757050037384, "learning_rate": 1.0204081632653063e-05, "loss": 0.8761, "step": 201 }, { "epoch": 0.07730577879831611, "grad_norm": 0.532858669757843, "learning_rate": 1.0255102040816327e-05, "loss": 0.8086, "step": 202 }, { "epoch": 0.0776884806735553, "grad_norm": 0.4935529828071594, "learning_rate": 1.0306122448979591e-05, "loss": 0.864, "step": 203 }, { "epoch": 0.07807118254879448, "grad_norm": 0.5176499485969543, "learning_rate": 1.0357142857142859e-05, "loss": 0.8712, "step": 204 }, { "epoch": 0.07845388442403368, "grad_norm": 0.45639920234680176, "learning_rate": 1.0408163265306123e-05, "loss": 0.8395, "step": 205 }, { "epoch": 0.07883658629927287, "grad_norm": 0.46378815174102783, "learning_rate": 1.045918367346939e-05, "loss": 0.8759, "step": 206 }, { "epoch": 0.07921928817451206, "grad_norm": 0.4393512010574341, "learning_rate": 1.0510204081632654e-05, "loss": 0.794, "step": 207 }, { "epoch": 0.07960199004975124, "grad_norm": 0.6285015344619751, "learning_rate": 1.0561224489795918e-05, "loss": 0.9104, "step": 208 }, { "epoch": 0.07998469192499043, "grad_norm": 0.46917811036109924, "learning_rate": 1.0612244897959186e-05, "loss": 0.8483, "step": 209 }, { "epoch": 0.08036739380022963, "grad_norm": 0.47618624567985535, "learning_rate": 1.066326530612245e-05, "loss": 0.8345, "step": 210 }, { "epoch": 0.08075009567546881, "grad_norm": 0.5428972244262695, "learning_rate": 1.0714285714285714e-05, "loss": 0.8882, "step": 211 }, { "epoch": 0.081132797550708, "grad_norm": 0.5112171173095703, "learning_rate": 1.076530612244898e-05, "loss": 0.7946, "step": 212 }, { "epoch": 0.08151549942594719, "grad_norm": 0.4824351370334625, "learning_rate": 1.0816326530612246e-05, "loss": 0.8604, "step": 213 }, { "epoch": 0.08189820130118637, "grad_norm": 0.482271283864975, "learning_rate": 1.0867346938775512e-05, "loss": 0.8371, "step": 214 }, { "epoch": 0.08228090317642557, "grad_norm": 0.4499357342720032, "learning_rate": 1.0918367346938776e-05, "loss": 0.8498, "step": 215 }, { "epoch": 0.08266360505166476, "grad_norm": 0.439635694026947, "learning_rate": 1.096938775510204e-05, "loss": 0.8031, "step": 216 }, { "epoch": 0.08304630692690394, "grad_norm": 0.4271392822265625, "learning_rate": 1.1020408163265306e-05, "loss": 0.8042, "step": 217 }, { "epoch": 0.08342900880214313, "grad_norm": 0.4434744119644165, "learning_rate": 1.1071428571428572e-05, "loss": 0.8447, "step": 218 }, { "epoch": 0.08381171067738231, "grad_norm": 0.47778061032295227, "learning_rate": 1.1122448979591838e-05, "loss": 0.8467, "step": 219 }, { "epoch": 0.08419441255262151, "grad_norm": 0.4379095435142517, "learning_rate": 1.1173469387755103e-05, "loss": 0.804, "step": 220 }, { "epoch": 0.0845771144278607, "grad_norm": 0.40598347783088684, "learning_rate": 1.1224489795918367e-05, "loss": 0.7635, "step": 221 }, { "epoch": 0.08495981630309989, "grad_norm": 0.5504045486450195, "learning_rate": 1.1275510204081635e-05, "loss": 0.9028, "step": 222 }, { "epoch": 0.08534251817833907, "grad_norm": 0.48893967270851135, "learning_rate": 1.1326530612244899e-05, "loss": 0.8452, "step": 223 }, { "epoch": 0.08572522005357826, "grad_norm": 0.470890611410141, "learning_rate": 1.1377551020408165e-05, "loss": 0.9067, "step": 224 }, { "epoch": 0.08610792192881746, "grad_norm": 0.4344172775745392, "learning_rate": 1.1428571428571429e-05, "loss": 0.799, "step": 225 }, { "epoch": 0.08649062380405664, "grad_norm": 0.4766117036342621, "learning_rate": 1.1479591836734697e-05, "loss": 0.93, "step": 226 }, { "epoch": 0.08687332567929583, "grad_norm": 0.4237784445285797, "learning_rate": 1.1530612244897961e-05, "loss": 0.7863, "step": 227 }, { "epoch": 0.08725602755453502, "grad_norm": 0.4329286813735962, "learning_rate": 1.1581632653061225e-05, "loss": 0.754, "step": 228 }, { "epoch": 0.0876387294297742, "grad_norm": 0.5776365399360657, "learning_rate": 1.1632653061224491e-05, "loss": 0.8435, "step": 229 }, { "epoch": 0.0880214313050134, "grad_norm": 0.443359375, "learning_rate": 1.1683673469387755e-05, "loss": 0.8763, "step": 230 }, { "epoch": 0.08840413318025259, "grad_norm": 0.4582411050796509, "learning_rate": 1.1734693877551021e-05, "loss": 0.7738, "step": 231 }, { "epoch": 0.08878683505549177, "grad_norm": 0.45713692903518677, "learning_rate": 1.1785714285714287e-05, "loss": 0.8869, "step": 232 }, { "epoch": 0.08916953693073096, "grad_norm": 0.4324229955673218, "learning_rate": 1.1836734693877552e-05, "loss": 0.8225, "step": 233 }, { "epoch": 0.08955223880597014, "grad_norm": 0.4590579569339752, "learning_rate": 1.1887755102040816e-05, "loss": 0.8542, "step": 234 }, { "epoch": 0.08993494068120934, "grad_norm": 0.4642913341522217, "learning_rate": 1.1938775510204084e-05, "loss": 0.8032, "step": 235 }, { "epoch": 0.09031764255644853, "grad_norm": 0.45513075590133667, "learning_rate": 1.1989795918367348e-05, "loss": 0.8033, "step": 236 }, { "epoch": 0.09070034443168772, "grad_norm": 0.45588746666908264, "learning_rate": 1.2040816326530614e-05, "loss": 0.8443, "step": 237 }, { "epoch": 0.0910830463069269, "grad_norm": 0.45800238847732544, "learning_rate": 1.2091836734693878e-05, "loss": 0.8573, "step": 238 }, { "epoch": 0.09146574818216609, "grad_norm": 0.42988210916519165, "learning_rate": 1.2142857142857142e-05, "loss": 0.7567, "step": 239 }, { "epoch": 0.09184845005740529, "grad_norm": 0.44002604484558105, "learning_rate": 1.219387755102041e-05, "loss": 0.7638, "step": 240 }, { "epoch": 0.09223115193264447, "grad_norm": 0.4613363742828369, "learning_rate": 1.2244897959183674e-05, "loss": 0.7739, "step": 241 }, { "epoch": 0.09261385380788366, "grad_norm": 0.43171459436416626, "learning_rate": 1.229591836734694e-05, "loss": 0.7865, "step": 242 }, { "epoch": 0.09299655568312284, "grad_norm": 0.4794076085090637, "learning_rate": 1.2346938775510204e-05, "loss": 0.8584, "step": 243 }, { "epoch": 0.09337925755836203, "grad_norm": 0.547084629535675, "learning_rate": 1.2397959183673472e-05, "loss": 0.8799, "step": 244 }, { "epoch": 0.09376195943360123, "grad_norm": 0.4860130846500397, "learning_rate": 1.2448979591836736e-05, "loss": 0.7778, "step": 245 }, { "epoch": 0.09414466130884042, "grad_norm": 0.4342957139015198, "learning_rate": 1.25e-05, "loss": 0.8777, "step": 246 }, { "epoch": 0.0945273631840796, "grad_norm": 0.44497427344322205, "learning_rate": 1.2551020408163267e-05, "loss": 0.8622, "step": 247 }, { "epoch": 0.09491006505931879, "grad_norm": 0.7338611483573914, "learning_rate": 1.260204081632653e-05, "loss": 0.7776, "step": 248 }, { "epoch": 0.09529276693455797, "grad_norm": 0.45473599433898926, "learning_rate": 1.2653061224489798e-05, "loss": 0.8078, "step": 249 }, { "epoch": 0.09567546880979717, "grad_norm": 0.44900840520858765, "learning_rate": 1.2704081632653063e-05, "loss": 0.7758, "step": 250 }, { "epoch": 0.09605817068503636, "grad_norm": 0.4683789014816284, "learning_rate": 1.2755102040816327e-05, "loss": 0.8655, "step": 251 }, { "epoch": 0.09644087256027555, "grad_norm": 0.4763778746128082, "learning_rate": 1.2806122448979591e-05, "loss": 0.8288, "step": 252 }, { "epoch": 0.09682357443551473, "grad_norm": 0.4903474450111389, "learning_rate": 1.2857142857142859e-05, "loss": 0.8067, "step": 253 }, { "epoch": 0.09720627631075392, "grad_norm": 0.4505247175693512, "learning_rate": 1.2908163265306123e-05, "loss": 0.7982, "step": 254 }, { "epoch": 0.09758897818599312, "grad_norm": 0.480345219373703, "learning_rate": 1.2959183673469389e-05, "loss": 0.7519, "step": 255 }, { "epoch": 0.0979716800612323, "grad_norm": 0.45448145270347595, "learning_rate": 1.3010204081632653e-05, "loss": 0.7963, "step": 256 }, { "epoch": 0.09835438193647149, "grad_norm": 0.5136796236038208, "learning_rate": 1.3061224489795918e-05, "loss": 0.8257, "step": 257 }, { "epoch": 0.09873708381171067, "grad_norm": 0.5116167664527893, "learning_rate": 1.3112244897959185e-05, "loss": 0.9071, "step": 258 }, { "epoch": 0.09911978568694986, "grad_norm": 0.46745070815086365, "learning_rate": 1.316326530612245e-05, "loss": 0.8022, "step": 259 }, { "epoch": 0.09950248756218906, "grad_norm": 0.49566754698753357, "learning_rate": 1.3214285714285716e-05, "loss": 0.8189, "step": 260 }, { "epoch": 0.09988518943742825, "grad_norm": 0.4588281214237213, "learning_rate": 1.326530612244898e-05, "loss": 0.8019, "step": 261 }, { "epoch": 0.10026789131266743, "grad_norm": 0.46862688660621643, "learning_rate": 1.3316326530612247e-05, "loss": 0.9166, "step": 262 }, { "epoch": 0.10065059318790662, "grad_norm": 0.48254990577697754, "learning_rate": 1.3367346938775512e-05, "loss": 0.7411, "step": 263 }, { "epoch": 0.1010332950631458, "grad_norm": 0.43509092926979065, "learning_rate": 1.3418367346938776e-05, "loss": 0.7469, "step": 264 }, { "epoch": 0.101415996938385, "grad_norm": 0.5085695385932922, "learning_rate": 1.3469387755102042e-05, "loss": 0.7973, "step": 265 }, { "epoch": 0.10179869881362419, "grad_norm": 0.49721789360046387, "learning_rate": 1.3520408163265306e-05, "loss": 0.8184, "step": 266 }, { "epoch": 0.10218140068886337, "grad_norm": 0.4744129776954651, "learning_rate": 1.3571428571428574e-05, "loss": 0.7871, "step": 267 }, { "epoch": 0.10256410256410256, "grad_norm": 0.44555607438087463, "learning_rate": 1.3622448979591838e-05, "loss": 0.8135, "step": 268 }, { "epoch": 0.10294680443934175, "grad_norm": 0.46480438113212585, "learning_rate": 1.3673469387755102e-05, "loss": 0.7877, "step": 269 }, { "epoch": 0.10332950631458095, "grad_norm": 0.48376762866973877, "learning_rate": 1.3724489795918368e-05, "loss": 0.8283, "step": 270 }, { "epoch": 0.10371220818982013, "grad_norm": 0.44515371322631836, "learning_rate": 1.3775510204081634e-05, "loss": 0.8109, "step": 271 }, { "epoch": 0.10409491006505932, "grad_norm": 0.49347105622291565, "learning_rate": 1.38265306122449e-05, "loss": 0.7996, "step": 272 }, { "epoch": 0.1044776119402985, "grad_norm": 0.44547903537750244, "learning_rate": 1.3877551020408165e-05, "loss": 0.7439, "step": 273 }, { "epoch": 0.10486031381553769, "grad_norm": 0.5297123193740845, "learning_rate": 1.3928571428571429e-05, "loss": 0.79, "step": 274 }, { "epoch": 0.10524301569077689, "grad_norm": 0.47679299116134644, "learning_rate": 1.3979591836734696e-05, "loss": 0.8709, "step": 275 }, { "epoch": 0.10562571756601608, "grad_norm": 0.46401384472846985, "learning_rate": 1.403061224489796e-05, "loss": 0.801, "step": 276 }, { "epoch": 0.10600841944125526, "grad_norm": 0.6553977727890015, "learning_rate": 1.4081632653061225e-05, "loss": 0.7897, "step": 277 }, { "epoch": 0.10639112131649445, "grad_norm": 0.43892979621887207, "learning_rate": 1.4132653061224491e-05, "loss": 0.7618, "step": 278 }, { "epoch": 0.10677382319173363, "grad_norm": 0.4523659348487854, "learning_rate": 1.4183673469387755e-05, "loss": 0.7787, "step": 279 }, { "epoch": 0.10715652506697283, "grad_norm": 0.48000937700271606, "learning_rate": 1.4234693877551023e-05, "loss": 0.8192, "step": 280 }, { "epoch": 0.10753922694221202, "grad_norm": 0.5044941902160645, "learning_rate": 1.4285714285714287e-05, "loss": 0.7275, "step": 281 }, { "epoch": 0.1079219288174512, "grad_norm": 0.4577247202396393, "learning_rate": 1.4336734693877551e-05, "loss": 0.6971, "step": 282 }, { "epoch": 0.10830463069269039, "grad_norm": 0.502692699432373, "learning_rate": 1.4387755102040817e-05, "loss": 0.7618, "step": 283 }, { "epoch": 0.10868733256792958, "grad_norm": 0.492830753326416, "learning_rate": 1.4438775510204083e-05, "loss": 0.796, "step": 284 }, { "epoch": 0.10907003444316878, "grad_norm": 0.47037985920906067, "learning_rate": 1.448979591836735e-05, "loss": 0.7535, "step": 285 }, { "epoch": 0.10945273631840796, "grad_norm": 0.5385987758636475, "learning_rate": 1.4540816326530614e-05, "loss": 0.7609, "step": 286 }, { "epoch": 0.10983543819364715, "grad_norm": 0.6284303069114685, "learning_rate": 1.4591836734693878e-05, "loss": 0.8719, "step": 287 }, { "epoch": 0.11021814006888633, "grad_norm": 0.46581244468688965, "learning_rate": 1.4642857142857144e-05, "loss": 0.8284, "step": 288 }, { "epoch": 0.11060084194412553, "grad_norm": 0.46286582946777344, "learning_rate": 1.469387755102041e-05, "loss": 0.8485, "step": 289 }, { "epoch": 0.11098354381936472, "grad_norm": 0.5009508728981018, "learning_rate": 1.4744897959183676e-05, "loss": 0.7433, "step": 290 }, { "epoch": 0.1113662456946039, "grad_norm": 0.5283079743385315, "learning_rate": 1.479591836734694e-05, "loss": 0.8872, "step": 291 }, { "epoch": 0.11174894756984309, "grad_norm": 0.4892589747905731, "learning_rate": 1.4846938775510204e-05, "loss": 0.8128, "step": 292 }, { "epoch": 0.11213164944508228, "grad_norm": 0.4638614058494568, "learning_rate": 1.4897959183673472e-05, "loss": 0.7987, "step": 293 }, { "epoch": 0.11251435132032148, "grad_norm": 0.4975008964538574, "learning_rate": 1.4948979591836736e-05, "loss": 0.7463, "step": 294 }, { "epoch": 0.11289705319556066, "grad_norm": 0.48714739084243774, "learning_rate": 1.5000000000000002e-05, "loss": 0.8055, "step": 295 }, { "epoch": 0.11327975507079985, "grad_norm": 0.574885368347168, "learning_rate": 1.5051020408163266e-05, "loss": 0.7923, "step": 296 }, { "epoch": 0.11366245694603903, "grad_norm": 0.4364047348499298, "learning_rate": 1.510204081632653e-05, "loss": 0.7928, "step": 297 }, { "epoch": 0.11404515882127822, "grad_norm": 0.4947887659072876, "learning_rate": 1.5153061224489798e-05, "loss": 0.758, "step": 298 }, { "epoch": 0.11442786069651742, "grad_norm": 0.5770400166511536, "learning_rate": 1.5204081632653063e-05, "loss": 0.8013, "step": 299 }, { "epoch": 0.1148105625717566, "grad_norm": 0.4479968249797821, "learning_rate": 1.5255102040816327e-05, "loss": 0.8511, "step": 300 }, { "epoch": 0.11519326444699579, "grad_norm": 0.49974581599235535, "learning_rate": 1.530612244897959e-05, "loss": 0.8587, "step": 301 }, { "epoch": 0.11557596632223498, "grad_norm": 0.4741688370704651, "learning_rate": 1.535714285714286e-05, "loss": 0.8093, "step": 302 }, { "epoch": 0.11595866819747416, "grad_norm": 0.44475021958351135, "learning_rate": 1.5408163265306123e-05, "loss": 0.8308, "step": 303 }, { "epoch": 0.11634137007271336, "grad_norm": 0.4802582263946533, "learning_rate": 1.545918367346939e-05, "loss": 0.8448, "step": 304 }, { "epoch": 0.11672407194795255, "grad_norm": 0.6063730716705322, "learning_rate": 1.5510204081632655e-05, "loss": 0.8552, "step": 305 }, { "epoch": 0.11710677382319173, "grad_norm": 0.4845573306083679, "learning_rate": 1.556122448979592e-05, "loss": 0.7448, "step": 306 }, { "epoch": 0.11748947569843092, "grad_norm": 0.4838191568851471, "learning_rate": 1.5612244897959187e-05, "loss": 0.8283, "step": 307 }, { "epoch": 0.1178721775736701, "grad_norm": 0.4681665301322937, "learning_rate": 1.566326530612245e-05, "loss": 0.8101, "step": 308 }, { "epoch": 0.1182548794489093, "grad_norm": 0.43254604935646057, "learning_rate": 1.5714285714285715e-05, "loss": 0.719, "step": 309 }, { "epoch": 0.11863758132414849, "grad_norm": 0.7901787161827087, "learning_rate": 1.576530612244898e-05, "loss": 0.7854, "step": 310 }, { "epoch": 0.11902028319938768, "grad_norm": 0.4775210916996002, "learning_rate": 1.5816326530612247e-05, "loss": 0.7684, "step": 311 }, { "epoch": 0.11940298507462686, "grad_norm": 0.7116644978523254, "learning_rate": 1.586734693877551e-05, "loss": 0.828, "step": 312 }, { "epoch": 0.11978568694986605, "grad_norm": 0.46708840131759644, "learning_rate": 1.5918367346938776e-05, "loss": 0.7364, "step": 313 }, { "epoch": 0.12016838882510525, "grad_norm": 0.5275240540504456, "learning_rate": 1.596938775510204e-05, "loss": 0.8246, "step": 314 }, { "epoch": 0.12055109070034443, "grad_norm": 0.49078604578971863, "learning_rate": 1.6020408163265308e-05, "loss": 0.8392, "step": 315 }, { "epoch": 0.12093379257558362, "grad_norm": 0.4368308484554291, "learning_rate": 1.6071428571428572e-05, "loss": 0.7818, "step": 316 }, { "epoch": 0.1213164944508228, "grad_norm": 0.48200276494026184, "learning_rate": 1.612244897959184e-05, "loss": 0.7949, "step": 317 }, { "epoch": 0.12169919632606199, "grad_norm": 0.4894062280654907, "learning_rate": 1.6173469387755104e-05, "loss": 0.7963, "step": 318 }, { "epoch": 0.12208189820130119, "grad_norm": 0.5265064835548401, "learning_rate": 1.6224489795918368e-05, "loss": 0.741, "step": 319 }, { "epoch": 0.12246460007654038, "grad_norm": 0.45836132764816284, "learning_rate": 1.6275510204081636e-05, "loss": 0.8055, "step": 320 }, { "epoch": 0.12284730195177956, "grad_norm": 0.4770452082157135, "learning_rate": 1.63265306122449e-05, "loss": 0.8073, "step": 321 }, { "epoch": 0.12323000382701875, "grad_norm": 0.4482954442501068, "learning_rate": 1.6377551020408164e-05, "loss": 0.7803, "step": 322 }, { "epoch": 0.12361270570225794, "grad_norm": 0.505903422832489, "learning_rate": 1.642857142857143e-05, "loss": 0.8003, "step": 323 }, { "epoch": 0.12399540757749714, "grad_norm": 0.5284017324447632, "learning_rate": 1.6479591836734696e-05, "loss": 0.7863, "step": 324 }, { "epoch": 0.12437810945273632, "grad_norm": 0.4585797190666199, "learning_rate": 1.653061224489796e-05, "loss": 0.7391, "step": 325 }, { "epoch": 0.12476081132797551, "grad_norm": 0.44927603006362915, "learning_rate": 1.6581632653061225e-05, "loss": 0.7736, "step": 326 }, { "epoch": 0.1251435132032147, "grad_norm": 0.4714924097061157, "learning_rate": 1.6632653061224492e-05, "loss": 0.7668, "step": 327 }, { "epoch": 0.12552621507845388, "grad_norm": 0.5143863558769226, "learning_rate": 1.6683673469387757e-05, "loss": 0.7219, "step": 328 }, { "epoch": 0.12590891695369308, "grad_norm": 0.44385406374931335, "learning_rate": 1.673469387755102e-05, "loss": 0.8309, "step": 329 }, { "epoch": 0.12629161882893225, "grad_norm": 0.5264196395874023, "learning_rate": 1.678571428571429e-05, "loss": 0.8312, "step": 330 }, { "epoch": 0.12667432070417145, "grad_norm": 0.5065248012542725, "learning_rate": 1.6836734693877553e-05, "loss": 0.7861, "step": 331 }, { "epoch": 0.12705702257941065, "grad_norm": 0.47825491428375244, "learning_rate": 1.6887755102040817e-05, "loss": 0.8487, "step": 332 }, { "epoch": 0.12743972445464982, "grad_norm": 0.4384194314479828, "learning_rate": 1.6938775510204085e-05, "loss": 0.7344, "step": 333 }, { "epoch": 0.12782242632988902, "grad_norm": 0.42705562710762024, "learning_rate": 1.698979591836735e-05, "loss": 0.7161, "step": 334 }, { "epoch": 0.1282051282051282, "grad_norm": 0.4848678410053253, "learning_rate": 1.7040816326530613e-05, "loss": 0.7736, "step": 335 }, { "epoch": 0.1285878300803674, "grad_norm": 0.5217268466949463, "learning_rate": 1.7091836734693878e-05, "loss": 0.7137, "step": 336 }, { "epoch": 0.1289705319556066, "grad_norm": 0.475907564163208, "learning_rate": 1.7142857142857142e-05, "loss": 0.7592, "step": 337 }, { "epoch": 0.12935323383084577, "grad_norm": 0.4915025532245636, "learning_rate": 1.719387755102041e-05, "loss": 0.8668, "step": 338 }, { "epoch": 0.12973593570608496, "grad_norm": 0.6131678223609924, "learning_rate": 1.7244897959183674e-05, "loss": 0.751, "step": 339 }, { "epoch": 0.13011863758132414, "grad_norm": 0.5695495009422302, "learning_rate": 1.729591836734694e-05, "loss": 0.8207, "step": 340 }, { "epoch": 0.13050133945656334, "grad_norm": 0.4591576159000397, "learning_rate": 1.7346938775510206e-05, "loss": 0.6867, "step": 341 }, { "epoch": 0.13088404133180254, "grad_norm": 0.475982666015625, "learning_rate": 1.7397959183673473e-05, "loss": 0.7169, "step": 342 }, { "epoch": 0.1312667432070417, "grad_norm": 0.47769996523857117, "learning_rate": 1.7448979591836738e-05, "loss": 0.7516, "step": 343 }, { "epoch": 0.1316494450822809, "grad_norm": 0.9795849323272705, "learning_rate": 1.7500000000000002e-05, "loss": 0.7173, "step": 344 }, { "epoch": 0.13203214695752008, "grad_norm": 0.47921645641326904, "learning_rate": 1.7551020408163266e-05, "loss": 0.8067, "step": 345 }, { "epoch": 0.13241484883275928, "grad_norm": 0.48762401938438416, "learning_rate": 1.760204081632653e-05, "loss": 0.8887, "step": 346 }, { "epoch": 0.13279755070799848, "grad_norm": 0.5272188186645508, "learning_rate": 1.7653061224489798e-05, "loss": 0.7868, "step": 347 }, { "epoch": 0.13318025258323765, "grad_norm": 0.5041695833206177, "learning_rate": 1.7704081632653062e-05, "loss": 0.8244, "step": 348 }, { "epoch": 0.13356295445847685, "grad_norm": 0.5150105953216553, "learning_rate": 1.7755102040816327e-05, "loss": 0.7778, "step": 349 }, { "epoch": 0.13394565633371602, "grad_norm": 0.46310463547706604, "learning_rate": 1.780612244897959e-05, "loss": 0.7272, "step": 350 }, { "epoch": 0.13432835820895522, "grad_norm": 0.5399727821350098, "learning_rate": 1.785714285714286e-05, "loss": 0.8466, "step": 351 }, { "epoch": 0.13471106008419442, "grad_norm": 0.4699437618255615, "learning_rate": 1.7908163265306123e-05, "loss": 0.7355, "step": 352 }, { "epoch": 0.1350937619594336, "grad_norm": 0.504329264163971, "learning_rate": 1.795918367346939e-05, "loss": 0.8423, "step": 353 }, { "epoch": 0.1354764638346728, "grad_norm": 0.6426067352294922, "learning_rate": 1.8010204081632655e-05, "loss": 0.7207, "step": 354 }, { "epoch": 0.13585916570991197, "grad_norm": 0.5362403392791748, "learning_rate": 1.806122448979592e-05, "loss": 0.7281, "step": 355 }, { "epoch": 0.13624186758515117, "grad_norm": 0.4878808856010437, "learning_rate": 1.8112244897959187e-05, "loss": 0.7383, "step": 356 }, { "epoch": 0.13662456946039037, "grad_norm": 0.5546179413795471, "learning_rate": 1.816326530612245e-05, "loss": 0.776, "step": 357 }, { "epoch": 0.13700727133562954, "grad_norm": 0.4362420439720154, "learning_rate": 1.8214285714285715e-05, "loss": 0.7538, "step": 358 }, { "epoch": 0.13738997321086874, "grad_norm": 0.4572063088417053, "learning_rate": 1.826530612244898e-05, "loss": 0.7427, "step": 359 }, { "epoch": 0.1377726750861079, "grad_norm": 0.5076887607574463, "learning_rate": 1.8316326530612247e-05, "loss": 0.7839, "step": 360 }, { "epoch": 0.1381553769613471, "grad_norm": 0.49158430099487305, "learning_rate": 1.836734693877551e-05, "loss": 0.7109, "step": 361 }, { "epoch": 0.1385380788365863, "grad_norm": 0.5068415403366089, "learning_rate": 1.8418367346938776e-05, "loss": 0.9069, "step": 362 }, { "epoch": 0.13892078071182548, "grad_norm": 0.5111059546470642, "learning_rate": 1.8469387755102043e-05, "loss": 0.717, "step": 363 }, { "epoch": 0.13930348258706468, "grad_norm": 0.5775309205055237, "learning_rate": 1.8520408163265307e-05, "loss": 0.828, "step": 364 }, { "epoch": 0.13968618446230385, "grad_norm": 0.49432775378227234, "learning_rate": 1.8571428571428575e-05, "loss": 0.8183, "step": 365 }, { "epoch": 0.14006888633754305, "grad_norm": 0.5059371590614319, "learning_rate": 1.862244897959184e-05, "loss": 0.7544, "step": 366 }, { "epoch": 0.14045158821278225, "grad_norm": 0.59481281042099, "learning_rate": 1.8673469387755104e-05, "loss": 0.7213, "step": 367 }, { "epoch": 0.14083429008802142, "grad_norm": 0.5257335305213928, "learning_rate": 1.8724489795918368e-05, "loss": 0.7496, "step": 368 }, { "epoch": 0.14121699196326062, "grad_norm": 0.5574237108230591, "learning_rate": 1.8775510204081636e-05, "loss": 0.7916, "step": 369 }, { "epoch": 0.1415996938384998, "grad_norm": 0.4593428671360016, "learning_rate": 1.88265306122449e-05, "loss": 0.7888, "step": 370 }, { "epoch": 0.141982395713739, "grad_norm": 0.5101248025894165, "learning_rate": 1.8877551020408164e-05, "loss": 0.7619, "step": 371 }, { "epoch": 0.1423650975889782, "grad_norm": 0.4678976237773895, "learning_rate": 1.892857142857143e-05, "loss": 0.8096, "step": 372 }, { "epoch": 0.14274779946421737, "grad_norm": 0.4980526864528656, "learning_rate": 1.8979591836734696e-05, "loss": 0.8204, "step": 373 }, { "epoch": 0.14313050133945657, "grad_norm": 0.5028175115585327, "learning_rate": 1.903061224489796e-05, "loss": 0.7811, "step": 374 }, { "epoch": 0.14351320321469574, "grad_norm": 0.5716807246208191, "learning_rate": 1.9081632653061225e-05, "loss": 0.7622, "step": 375 }, { "epoch": 0.14389590508993494, "grad_norm": 0.4889722466468811, "learning_rate": 1.9132653061224492e-05, "loss": 0.7936, "step": 376 }, { "epoch": 0.14427860696517414, "grad_norm": 0.6214013695716858, "learning_rate": 1.9183673469387756e-05, "loss": 0.7434, "step": 377 }, { "epoch": 0.1446613088404133, "grad_norm": 0.4586067199707031, "learning_rate": 1.9234693877551024e-05, "loss": 0.7516, "step": 378 }, { "epoch": 0.1450440107156525, "grad_norm": 0.5999574661254883, "learning_rate": 1.928571428571429e-05, "loss": 0.7797, "step": 379 }, { "epoch": 0.14542671259089168, "grad_norm": 0.5201244950294495, "learning_rate": 1.9336734693877553e-05, "loss": 0.7496, "step": 380 }, { "epoch": 0.14580941446613088, "grad_norm": 0.5804024934768677, "learning_rate": 1.9387755102040817e-05, "loss": 0.916, "step": 381 }, { "epoch": 0.14619211634137008, "grad_norm": 0.4436352849006653, "learning_rate": 1.9438775510204085e-05, "loss": 0.7659, "step": 382 }, { "epoch": 0.14657481821660925, "grad_norm": 0.5320150852203369, "learning_rate": 1.948979591836735e-05, "loss": 0.7416, "step": 383 }, { "epoch": 0.14695752009184845, "grad_norm": 0.5179184079170227, "learning_rate": 1.9540816326530613e-05, "loss": 0.7356, "step": 384 }, { "epoch": 0.14734022196708763, "grad_norm": 0.4989006519317627, "learning_rate": 1.9591836734693877e-05, "loss": 0.7313, "step": 385 }, { "epoch": 0.14772292384232683, "grad_norm": 0.5252915024757385, "learning_rate": 1.9642857142857145e-05, "loss": 0.7644, "step": 386 }, { "epoch": 0.14810562571756603, "grad_norm": 0.5016194581985474, "learning_rate": 1.969387755102041e-05, "loss": 0.7765, "step": 387 }, { "epoch": 0.1484883275928052, "grad_norm": 0.5654856562614441, "learning_rate": 1.9744897959183677e-05, "loss": 0.8292, "step": 388 }, { "epoch": 0.1488710294680444, "grad_norm": 0.49477460980415344, "learning_rate": 1.979591836734694e-05, "loss": 0.768, "step": 389 }, { "epoch": 0.14925373134328357, "grad_norm": 0.5155985355377197, "learning_rate": 1.9846938775510205e-05, "loss": 0.801, "step": 390 }, { "epoch": 0.14963643321852277, "grad_norm": 0.4874078929424286, "learning_rate": 1.9897959183673473e-05, "loss": 0.8311, "step": 391 }, { "epoch": 0.15001913509376197, "grad_norm": 0.6127997040748596, "learning_rate": 1.9948979591836737e-05, "loss": 0.8777, "step": 392 }, { "epoch": 0.15040183696900114, "grad_norm": 0.5528757572174072, "learning_rate": 2e-05, "loss": 0.7919, "step": 393 }, { "epoch": 0.15078453884424034, "grad_norm": 0.5106461048126221, "learning_rate": 1.999999969273657e-05, "loss": 0.809, "step": 394 }, { "epoch": 0.1511672407194795, "grad_norm": 0.4746157228946686, "learning_rate": 1.999999877094629e-05, "loss": 0.6977, "step": 395 }, { "epoch": 0.1515499425947187, "grad_norm": 0.491228848695755, "learning_rate": 1.9999997234629224e-05, "loss": 0.7484, "step": 396 }, { "epoch": 0.1519326444699579, "grad_norm": 0.5519547462463379, "learning_rate": 1.9999995083785458e-05, "loss": 0.8429, "step": 397 }, { "epoch": 0.15231534634519708, "grad_norm": 0.6026712656021118, "learning_rate": 1.9999992318415134e-05, "loss": 0.7803, "step": 398 }, { "epoch": 0.15269804822043628, "grad_norm": 0.555978536605835, "learning_rate": 1.9999988938518417e-05, "loss": 0.7594, "step": 399 }, { "epoch": 0.15308075009567546, "grad_norm": 0.5221860408782959, "learning_rate": 1.9999984944095514e-05, "loss": 0.8271, "step": 400 }, { "epoch": 0.15346345197091465, "grad_norm": 0.5649492144584656, "learning_rate": 1.9999980335146668e-05, "loss": 0.8286, "step": 401 }, { "epoch": 0.15384615384615385, "grad_norm": 0.5375712513923645, "learning_rate": 1.999997511167217e-05, "loss": 0.7739, "step": 402 }, { "epoch": 0.15422885572139303, "grad_norm": 0.4926639795303345, "learning_rate": 1.9999969273672334e-05, "loss": 0.7614, "step": 403 }, { "epoch": 0.15461155759663223, "grad_norm": 0.5006320476531982, "learning_rate": 1.9999962821147523e-05, "loss": 0.7093, "step": 404 }, { "epoch": 0.1549942594718714, "grad_norm": 0.4833860695362091, "learning_rate": 1.999995575409813e-05, "loss": 0.7624, "step": 405 }, { "epoch": 0.1553769613471106, "grad_norm": 0.486387699842453, "learning_rate": 1.9999948072524593e-05, "loss": 0.764, "step": 406 }, { "epoch": 0.1557596632223498, "grad_norm": 0.5455716252326965, "learning_rate": 1.9999939776427382e-05, "loss": 0.8042, "step": 407 }, { "epoch": 0.15614236509758897, "grad_norm": 0.4572210907936096, "learning_rate": 1.9999930865807006e-05, "loss": 0.6785, "step": 408 }, { "epoch": 0.15652506697282817, "grad_norm": 0.49035879969596863, "learning_rate": 1.9999921340664016e-05, "loss": 0.814, "step": 409 }, { "epoch": 0.15690776884806737, "grad_norm": 0.4861501455307007, "learning_rate": 1.9999911200998992e-05, "loss": 0.7637, "step": 410 }, { "epoch": 0.15729047072330654, "grad_norm": 0.5286645293235779, "learning_rate": 1.9999900446812558e-05, "loss": 0.7959, "step": 411 }, { "epoch": 0.15767317259854574, "grad_norm": 0.5439308881759644, "learning_rate": 1.999988907810538e-05, "loss": 0.8302, "step": 412 }, { "epoch": 0.1580558744737849, "grad_norm": 0.4900946021080017, "learning_rate": 1.9999877094878154e-05, "loss": 0.7395, "step": 413 }, { "epoch": 0.1584385763490241, "grad_norm": 0.44722995162010193, "learning_rate": 1.9999864497131615e-05, "loss": 0.7564, "step": 414 }, { "epoch": 0.1588212782242633, "grad_norm": 0.5095186233520508, "learning_rate": 1.999985128486654e-05, "loss": 0.7999, "step": 415 }, { "epoch": 0.15920398009950248, "grad_norm": 0.5084940791130066, "learning_rate": 1.9999837458083738e-05, "loss": 0.7483, "step": 416 }, { "epoch": 0.15958668197474168, "grad_norm": 0.5234619975090027, "learning_rate": 1.9999823016784057e-05, "loss": 0.7321, "step": 417 }, { "epoch": 0.15996938384998086, "grad_norm": 0.5672624111175537, "learning_rate": 1.9999807960968393e-05, "loss": 0.743, "step": 418 }, { "epoch": 0.16035208572522006, "grad_norm": 0.528971254825592, "learning_rate": 1.999979229063766e-05, "loss": 0.7881, "step": 419 }, { "epoch": 0.16073478760045926, "grad_norm": 0.5318459868431091, "learning_rate": 1.9999776005792827e-05, "loss": 0.8002, "step": 420 }, { "epoch": 0.16111748947569843, "grad_norm": 0.5341087579727173, "learning_rate": 1.9999759106434896e-05, "loss": 0.8315, "step": 421 }, { "epoch": 0.16150019135093763, "grad_norm": 0.5429895520210266, "learning_rate": 1.9999741592564903e-05, "loss": 0.8327, "step": 422 }, { "epoch": 0.1618828932261768, "grad_norm": 0.5071070194244385, "learning_rate": 1.9999723464183925e-05, "loss": 0.7581, "step": 423 }, { "epoch": 0.162265595101416, "grad_norm": 0.5377606153488159, "learning_rate": 1.9999704721293077e-05, "loss": 0.7771, "step": 424 }, { "epoch": 0.1626482969766552, "grad_norm": 0.5914068818092346, "learning_rate": 1.999968536389351e-05, "loss": 0.8336, "step": 425 }, { "epoch": 0.16303099885189437, "grad_norm": 0.5248625874519348, "learning_rate": 1.9999665391986413e-05, "loss": 0.7773, "step": 426 }, { "epoch": 0.16341370072713357, "grad_norm": 0.5387527942657471, "learning_rate": 1.999964480557301e-05, "loss": 0.7673, "step": 427 }, { "epoch": 0.16379640260237274, "grad_norm": 0.5271666646003723, "learning_rate": 1.999962360465457e-05, "loss": 0.759, "step": 428 }, { "epoch": 0.16417910447761194, "grad_norm": 0.46156519651412964, "learning_rate": 1.99996017892324e-05, "loss": 0.7155, "step": 429 }, { "epoch": 0.16456180635285114, "grad_norm": 0.5919661521911621, "learning_rate": 1.9999579359307836e-05, "loss": 0.7487, "step": 430 }, { "epoch": 0.16494450822809031, "grad_norm": 0.5138614177703857, "learning_rate": 1.9999556314882252e-05, "loss": 0.8351, "step": 431 }, { "epoch": 0.1653272101033295, "grad_norm": 0.574366569519043, "learning_rate": 1.9999532655957074e-05, "loss": 0.7318, "step": 432 }, { "epoch": 0.16570991197856869, "grad_norm": 0.5230963230133057, "learning_rate": 1.999950838253375e-05, "loss": 0.7673, "step": 433 }, { "epoch": 0.16609261385380789, "grad_norm": 0.49305540323257446, "learning_rate": 1.9999483494613767e-05, "loss": 0.7223, "step": 434 }, { "epoch": 0.16647531572904709, "grad_norm": 0.5517773032188416, "learning_rate": 1.9999457992198663e-05, "loss": 0.7723, "step": 435 }, { "epoch": 0.16685801760428626, "grad_norm": 0.5525909066200256, "learning_rate": 1.9999431875290005e-05, "loss": 0.7843, "step": 436 }, { "epoch": 0.16724071947952546, "grad_norm": 0.4797762334346771, "learning_rate": 1.999940514388939e-05, "loss": 0.7998, "step": 437 }, { "epoch": 0.16762342135476463, "grad_norm": 0.5330603122711182, "learning_rate": 1.9999377797998468e-05, "loss": 0.7672, "step": 438 }, { "epoch": 0.16800612323000383, "grad_norm": 0.5842723846435547, "learning_rate": 1.999934983761892e-05, "loss": 0.7611, "step": 439 }, { "epoch": 0.16838882510524303, "grad_norm": 0.6112608909606934, "learning_rate": 1.9999321262752457e-05, "loss": 0.6989, "step": 440 }, { "epoch": 0.1687715269804822, "grad_norm": 0.541223406791687, "learning_rate": 1.999929207340084e-05, "loss": 0.7785, "step": 441 }, { "epoch": 0.1691542288557214, "grad_norm": 0.5532671809196472, "learning_rate": 1.9999262269565863e-05, "loss": 0.7355, "step": 442 }, { "epoch": 0.16953693073096057, "grad_norm": 0.5270472168922424, "learning_rate": 1.9999231851249353e-05, "loss": 0.8825, "step": 443 }, { "epoch": 0.16991963260619977, "grad_norm": 0.5031540393829346, "learning_rate": 1.999920081845319e-05, "loss": 0.7129, "step": 444 }, { "epoch": 0.17030233448143897, "grad_norm": 0.5788050889968872, "learning_rate": 1.999916917117927e-05, "loss": 0.7663, "step": 445 }, { "epoch": 0.17068503635667814, "grad_norm": 0.5520285964012146, "learning_rate": 1.999913690942954e-05, "loss": 0.8264, "step": 446 }, { "epoch": 0.17106773823191734, "grad_norm": 0.49639350175857544, "learning_rate": 1.9999104033205986e-05, "loss": 0.7482, "step": 447 }, { "epoch": 0.17145044010715652, "grad_norm": 0.5077463984489441, "learning_rate": 1.9999070542510628e-05, "loss": 0.7079, "step": 448 }, { "epoch": 0.17183314198239572, "grad_norm": 0.498809278011322, "learning_rate": 1.999903643734552e-05, "loss": 0.8224, "step": 449 }, { "epoch": 0.17221584385763491, "grad_norm": 0.5286426544189453, "learning_rate": 1.9999001717712763e-05, "loss": 0.808, "step": 450 }, { "epoch": 0.1725985457328741, "grad_norm": 0.5452371835708618, "learning_rate": 1.999896638361449e-05, "loss": 0.7119, "step": 451 }, { "epoch": 0.1729812476081133, "grad_norm": 0.5616328716278076, "learning_rate": 1.9998930435052867e-05, "loss": 0.838, "step": 452 }, { "epoch": 0.17336394948335246, "grad_norm": 0.581000030040741, "learning_rate": 1.9998893872030106e-05, "loss": 0.7512, "step": 453 }, { "epoch": 0.17374665135859166, "grad_norm": 0.44894999265670776, "learning_rate": 1.999885669454846e-05, "loss": 0.6849, "step": 454 }, { "epoch": 0.17412935323383086, "grad_norm": 0.5801690220832825, "learning_rate": 1.9998818902610203e-05, "loss": 0.8033, "step": 455 }, { "epoch": 0.17451205510907003, "grad_norm": 0.6682469248771667, "learning_rate": 1.9998780496217665e-05, "loss": 0.8007, "step": 456 }, { "epoch": 0.17489475698430923, "grad_norm": 0.4814477562904358, "learning_rate": 1.9998741475373204e-05, "loss": 0.7569, "step": 457 }, { "epoch": 0.1752774588595484, "grad_norm": 0.5485851764678955, "learning_rate": 1.9998701840079215e-05, "loss": 0.8103, "step": 458 }, { "epoch": 0.1756601607347876, "grad_norm": 0.552594780921936, "learning_rate": 1.999866159033814e-05, "loss": 0.7669, "step": 459 }, { "epoch": 0.1760428626100268, "grad_norm": 0.6417049765586853, "learning_rate": 1.999862072615245e-05, "loss": 0.7509, "step": 460 }, { "epoch": 0.17642556448526597, "grad_norm": 0.4882282316684723, "learning_rate": 1.999857924752465e-05, "loss": 0.7336, "step": 461 }, { "epoch": 0.17680826636050517, "grad_norm": 0.4956763684749603, "learning_rate": 1.9998537154457298e-05, "loss": 0.6961, "step": 462 }, { "epoch": 0.17719096823574434, "grad_norm": 0.518681526184082, "learning_rate": 1.9998494446952973e-05, "loss": 0.7641, "step": 463 }, { "epoch": 0.17757367011098354, "grad_norm": 0.5170573592185974, "learning_rate": 1.999845112501431e-05, "loss": 0.7616, "step": 464 }, { "epoch": 0.17795637198622274, "grad_norm": 0.5130677819252014, "learning_rate": 1.999840718864396e-05, "loss": 0.7201, "step": 465 }, { "epoch": 0.17833907386146192, "grad_norm": 0.5364904403686523, "learning_rate": 1.999836263784463e-05, "loss": 0.7018, "step": 466 }, { "epoch": 0.17872177573670112, "grad_norm": 0.4806588888168335, "learning_rate": 1.9998317472619053e-05, "loss": 0.8173, "step": 467 }, { "epoch": 0.1791044776119403, "grad_norm": 0.5018374919891357, "learning_rate": 1.999827169297001e-05, "loss": 0.7355, "step": 468 }, { "epoch": 0.1794871794871795, "grad_norm": 0.5573753118515015, "learning_rate": 1.9998225298900308e-05, "loss": 0.7945, "step": 469 }, { "epoch": 0.1798698813624187, "grad_norm": 0.48167383670806885, "learning_rate": 1.99981782904128e-05, "loss": 0.7723, "step": 470 }, { "epoch": 0.18025258323765786, "grad_norm": 0.5045027136802673, "learning_rate": 1.999813066751038e-05, "loss": 0.7239, "step": 471 }, { "epoch": 0.18063528511289706, "grad_norm": 0.5494056344032288, "learning_rate": 1.9998082430195967e-05, "loss": 0.7376, "step": 472 }, { "epoch": 0.18101798698813623, "grad_norm": 0.5252105593681335, "learning_rate": 1.9998033578472533e-05, "loss": 0.7763, "step": 473 }, { "epoch": 0.18140068886337543, "grad_norm": 0.4952108860015869, "learning_rate": 1.9997984112343074e-05, "loss": 0.7465, "step": 474 }, { "epoch": 0.18178339073861463, "grad_norm": 0.5058620572090149, "learning_rate": 1.999793403181063e-05, "loss": 0.7653, "step": 475 }, { "epoch": 0.1821660926138538, "grad_norm": 0.5246459245681763, "learning_rate": 1.9997883336878282e-05, "loss": 0.7511, "step": 476 }, { "epoch": 0.182548794489093, "grad_norm": 0.5394855737686157, "learning_rate": 1.9997832027549148e-05, "loss": 0.8265, "step": 477 }, { "epoch": 0.18293149636433217, "grad_norm": 0.48656630516052246, "learning_rate": 1.999778010382637e-05, "loss": 0.7637, "step": 478 }, { "epoch": 0.18331419823957137, "grad_norm": 0.5747602581977844, "learning_rate": 1.9997727565713147e-05, "loss": 0.7213, "step": 479 }, { "epoch": 0.18369690011481057, "grad_norm": 0.5310860276222229, "learning_rate": 1.999767441321271e-05, "loss": 0.7649, "step": 480 }, { "epoch": 0.18407960199004975, "grad_norm": 0.5117557048797607, "learning_rate": 1.999762064632832e-05, "loss": 0.7598, "step": 481 }, { "epoch": 0.18446230386528895, "grad_norm": 0.5319780111312866, "learning_rate": 1.999756626506328e-05, "loss": 0.7276, "step": 482 }, { "epoch": 0.18484500574052812, "grad_norm": 0.4577291011810303, "learning_rate": 1.9997511269420936e-05, "loss": 0.6625, "step": 483 }, { "epoch": 0.18522770761576732, "grad_norm": 0.5676308870315552, "learning_rate": 1.999745565940467e-05, "loss": 0.7862, "step": 484 }, { "epoch": 0.18561040949100652, "grad_norm": 0.6844309568405151, "learning_rate": 1.9997399435017893e-05, "loss": 0.6958, "step": 485 }, { "epoch": 0.1859931113662457, "grad_norm": 0.5327829718589783, "learning_rate": 1.999734259626406e-05, "loss": 0.7172, "step": 486 }, { "epoch": 0.1863758132414849, "grad_norm": 0.5475878119468689, "learning_rate": 1.999728514314667e-05, "loss": 0.6898, "step": 487 }, { "epoch": 0.18675851511672406, "grad_norm": 0.5247138738632202, "learning_rate": 1.999722707566925e-05, "loss": 0.7582, "step": 488 }, { "epoch": 0.18714121699196326, "grad_norm": 0.5034151077270508, "learning_rate": 1.999716839383537e-05, "loss": 0.6862, "step": 489 }, { "epoch": 0.18752391886720246, "grad_norm": 0.506783127784729, "learning_rate": 1.999710909764863e-05, "loss": 0.7782, "step": 490 }, { "epoch": 0.18790662074244163, "grad_norm": 0.5248714685440063, "learning_rate": 1.9997049187112684e-05, "loss": 0.7298, "step": 491 }, { "epoch": 0.18828932261768083, "grad_norm": 0.5521115660667419, "learning_rate": 1.9996988662231204e-05, "loss": 0.7616, "step": 492 }, { "epoch": 0.18867202449292, "grad_norm": 0.5282880663871765, "learning_rate": 1.999692752300792e-05, "loss": 0.7612, "step": 493 }, { "epoch": 0.1890547263681592, "grad_norm": 0.5053582787513733, "learning_rate": 1.999686576944658e-05, "loss": 0.8109, "step": 494 }, { "epoch": 0.1894374282433984, "grad_norm": 0.6051093339920044, "learning_rate": 1.9996803401550978e-05, "loss": 0.7132, "step": 495 }, { "epoch": 0.18982013011863758, "grad_norm": 0.5760282874107361, "learning_rate": 1.9996740419324955e-05, "loss": 0.8184, "step": 496 }, { "epoch": 0.19020283199387678, "grad_norm": 0.5599878430366516, "learning_rate": 1.9996676822772377e-05, "loss": 0.7237, "step": 497 }, { "epoch": 0.19058553386911595, "grad_norm": 0.5526458024978638, "learning_rate": 1.999661261189715e-05, "loss": 0.7437, "step": 498 }, { "epoch": 0.19096823574435515, "grad_norm": 0.534202516078949, "learning_rate": 1.9996547786703228e-05, "loss": 0.7638, "step": 499 }, { "epoch": 0.19135093761959435, "grad_norm": 0.5392880439758301, "learning_rate": 1.9996482347194585e-05, "loss": 0.6912, "step": 500 }, { "epoch": 0.19173363949483352, "grad_norm": 0.505568265914917, "learning_rate": 1.9996416293375246e-05, "loss": 0.7752, "step": 501 }, { "epoch": 0.19211634137007272, "grad_norm": 0.5926176309585571, "learning_rate": 1.9996349625249267e-05, "loss": 0.8198, "step": 502 }, { "epoch": 0.1924990432453119, "grad_norm": 0.5755407810211182, "learning_rate": 1.999628234282075e-05, "loss": 0.7799, "step": 503 }, { "epoch": 0.1928817451205511, "grad_norm": 0.5041341185569763, "learning_rate": 1.999621444609383e-05, "loss": 0.6497, "step": 504 }, { "epoch": 0.1932644469957903, "grad_norm": 0.531568169593811, "learning_rate": 1.999614593507268e-05, "loss": 0.7243, "step": 505 }, { "epoch": 0.19364714887102946, "grad_norm": 0.4860071539878845, "learning_rate": 1.9996076809761503e-05, "loss": 0.787, "step": 506 }, { "epoch": 0.19402985074626866, "grad_norm": 0.5583129525184631, "learning_rate": 1.999600707016456e-05, "loss": 0.6303, "step": 507 }, { "epoch": 0.19441255262150783, "grad_norm": 0.5536578297615051, "learning_rate": 1.999593671628612e-05, "loss": 0.8736, "step": 508 }, { "epoch": 0.19479525449674703, "grad_norm": 0.533761739730835, "learning_rate": 1.9995865748130518e-05, "loss": 0.764, "step": 509 }, { "epoch": 0.19517795637198623, "grad_norm": 0.5193125009536743, "learning_rate": 1.999579416570211e-05, "loss": 0.7544, "step": 510 }, { "epoch": 0.1955606582472254, "grad_norm": 0.5278550386428833, "learning_rate": 1.99957219690053e-05, "loss": 0.7807, "step": 511 }, { "epoch": 0.1959433601224646, "grad_norm": 0.5460487008094788, "learning_rate": 1.9995649158044516e-05, "loss": 0.7881, "step": 512 }, { "epoch": 0.19632606199770378, "grad_norm": 0.5392619371414185, "learning_rate": 1.9995575732824243e-05, "loss": 0.6963, "step": 513 }, { "epoch": 0.19670876387294298, "grad_norm": 0.5544105172157288, "learning_rate": 1.9995501693348988e-05, "loss": 0.7107, "step": 514 }, { "epoch": 0.19709146574818218, "grad_norm": 0.5181266069412231, "learning_rate": 1.99954270396233e-05, "loss": 0.8226, "step": 515 }, { "epoch": 0.19747416762342135, "grad_norm": 0.508459746837616, "learning_rate": 1.9995351771651766e-05, "loss": 0.7721, "step": 516 }, { "epoch": 0.19785686949866055, "grad_norm": 0.5154515504837036, "learning_rate": 1.9995275889439015e-05, "loss": 0.7579, "step": 517 }, { "epoch": 0.19823957137389972, "grad_norm": 0.5264533758163452, "learning_rate": 1.999519939298971e-05, "loss": 0.7211, "step": 518 }, { "epoch": 0.19862227324913892, "grad_norm": 0.6307706236839294, "learning_rate": 1.9995122282308547e-05, "loss": 0.834, "step": 519 }, { "epoch": 0.19900497512437812, "grad_norm": 0.5358056426048279, "learning_rate": 1.999504455740027e-05, "loss": 0.7741, "step": 520 }, { "epoch": 0.1993876769996173, "grad_norm": 0.6136643290519714, "learning_rate": 1.9994966218269655e-05, "loss": 0.8153, "step": 521 }, { "epoch": 0.1997703788748565, "grad_norm": 0.49198248982429504, "learning_rate": 1.9994887264921513e-05, "loss": 0.7252, "step": 522 }, { "epoch": 0.20015308075009566, "grad_norm": 0.5141664743423462, "learning_rate": 1.9994807697360694e-05, "loss": 0.7552, "step": 523 }, { "epoch": 0.20053578262533486, "grad_norm": 0.5442894697189331, "learning_rate": 1.9994727515592097e-05, "loss": 0.7372, "step": 524 }, { "epoch": 0.20091848450057406, "grad_norm": 0.5106886625289917, "learning_rate": 1.999464671962064e-05, "loss": 0.7406, "step": 525 }, { "epoch": 0.20130118637581323, "grad_norm": 0.7622138857841492, "learning_rate": 1.999456530945129e-05, "loss": 0.7954, "step": 526 }, { "epoch": 0.20168388825105243, "grad_norm": 0.4762383997440338, "learning_rate": 1.9994483285089054e-05, "loss": 0.7139, "step": 527 }, { "epoch": 0.2020665901262916, "grad_norm": 0.5939384698867798, "learning_rate": 1.999440064653897e-05, "loss": 0.7569, "step": 528 }, { "epoch": 0.2024492920015308, "grad_norm": 0.5281030535697937, "learning_rate": 1.9994317393806115e-05, "loss": 0.7133, "step": 529 }, { "epoch": 0.20283199387677, "grad_norm": 0.5377398729324341, "learning_rate": 1.999423352689561e-05, "loss": 0.6817, "step": 530 }, { "epoch": 0.20321469575200918, "grad_norm": 0.5254453420639038, "learning_rate": 1.99941490458126e-05, "loss": 0.7744, "step": 531 }, { "epoch": 0.20359739762724838, "grad_norm": 0.513671875, "learning_rate": 1.999406395056229e-05, "loss": 0.7649, "step": 532 }, { "epoch": 0.20398009950248755, "grad_norm": 0.49534738063812256, "learning_rate": 1.9993978241149892e-05, "loss": 0.8037, "step": 533 }, { "epoch": 0.20436280137772675, "grad_norm": 0.5144739747047424, "learning_rate": 1.9993891917580685e-05, "loss": 0.6986, "step": 534 }, { "epoch": 0.20474550325296595, "grad_norm": 0.49943894147872925, "learning_rate": 1.9993804979859974e-05, "loss": 0.6892, "step": 535 }, { "epoch": 0.20512820512820512, "grad_norm": 0.5595836043357849, "learning_rate": 1.9993717427993098e-05, "loss": 0.7462, "step": 536 }, { "epoch": 0.20551090700344432, "grad_norm": 0.5196743607521057, "learning_rate": 1.9993629261985438e-05, "loss": 0.686, "step": 537 }, { "epoch": 0.2058936088786835, "grad_norm": 0.5609855055809021, "learning_rate": 1.999354048184241e-05, "loss": 0.7595, "step": 538 }, { "epoch": 0.2062763107539227, "grad_norm": 0.5139284729957581, "learning_rate": 1.9993451087569475e-05, "loss": 0.7452, "step": 539 }, { "epoch": 0.2066590126291619, "grad_norm": 0.5207282900810242, "learning_rate": 1.9993361079172123e-05, "loss": 0.7343, "step": 540 }, { "epoch": 0.20704171450440106, "grad_norm": 0.6209056973457336, "learning_rate": 1.999327045665588e-05, "loss": 0.768, "step": 541 }, { "epoch": 0.20742441637964026, "grad_norm": 0.5371406674385071, "learning_rate": 1.9993179220026325e-05, "loss": 0.7721, "step": 542 }, { "epoch": 0.20780711825487944, "grad_norm": 0.5622643232345581, "learning_rate": 1.9993087369289063e-05, "loss": 0.7286, "step": 543 }, { "epoch": 0.20818982013011864, "grad_norm": 0.5616574883460999, "learning_rate": 1.9992994904449733e-05, "loss": 0.8243, "step": 544 }, { "epoch": 0.20857252200535784, "grad_norm": 0.5429919362068176, "learning_rate": 1.999290182551402e-05, "loss": 0.7054, "step": 545 }, { "epoch": 0.208955223880597, "grad_norm": 0.5362637639045715, "learning_rate": 1.9992808132487643e-05, "loss": 0.779, "step": 546 }, { "epoch": 0.2093379257558362, "grad_norm": 0.5088674426078796, "learning_rate": 1.9992713825376363e-05, "loss": 0.7541, "step": 547 }, { "epoch": 0.20972062763107538, "grad_norm": 0.5936992168426514, "learning_rate": 1.9992618904185972e-05, "loss": 0.7335, "step": 548 }, { "epoch": 0.21010332950631458, "grad_norm": 0.5467650294303894, "learning_rate": 1.9992523368922305e-05, "loss": 0.8323, "step": 549 }, { "epoch": 0.21048603138155378, "grad_norm": 0.5413325428962708, "learning_rate": 1.999242721959123e-05, "loss": 0.7651, "step": 550 }, { "epoch": 0.21086873325679295, "grad_norm": 0.5206835865974426, "learning_rate": 1.999233045619866e-05, "loss": 0.6638, "step": 551 }, { "epoch": 0.21125143513203215, "grad_norm": 0.5475010871887207, "learning_rate": 1.999223307875054e-05, "loss": 0.6724, "step": 552 }, { "epoch": 0.21163413700727132, "grad_norm": 0.5290173292160034, "learning_rate": 1.9992135087252847e-05, "loss": 0.6426, "step": 553 }, { "epoch": 0.21201683888251052, "grad_norm": 0.5268495678901672, "learning_rate": 1.9992036481711618e-05, "loss": 0.8069, "step": 554 }, { "epoch": 0.21239954075774972, "grad_norm": 0.5609273910522461, "learning_rate": 1.9991937262132898e-05, "loss": 0.7533, "step": 555 }, { "epoch": 0.2127822426329889, "grad_norm": 0.49837806820869446, "learning_rate": 1.9991837428522793e-05, "loss": 0.7658, "step": 556 }, { "epoch": 0.2131649445082281, "grad_norm": 0.5094565749168396, "learning_rate": 1.999173698088743e-05, "loss": 0.7749, "step": 557 }, { "epoch": 0.21354764638346727, "grad_norm": 0.5439174175262451, "learning_rate": 1.999163591923299e-05, "loss": 0.7508, "step": 558 }, { "epoch": 0.21393034825870647, "grad_norm": 0.5346569418907166, "learning_rate": 1.9991534243565682e-05, "loss": 0.7735, "step": 559 }, { "epoch": 0.21431305013394567, "grad_norm": 0.7179418206214905, "learning_rate": 1.9991431953891752e-05, "loss": 0.7629, "step": 560 }, { "epoch": 0.21469575200918484, "grad_norm": 0.4893960952758789, "learning_rate": 1.9991329050217486e-05, "loss": 0.7118, "step": 561 }, { "epoch": 0.21507845388442404, "grad_norm": 0.534938633441925, "learning_rate": 1.999122553254921e-05, "loss": 0.7271, "step": 562 }, { "epoch": 0.2154611557596632, "grad_norm": 0.6037063598632812, "learning_rate": 1.999112140089328e-05, "loss": 0.7948, "step": 563 }, { "epoch": 0.2158438576349024, "grad_norm": 0.5526931285858154, "learning_rate": 1.99910166552561e-05, "loss": 0.7186, "step": 564 }, { "epoch": 0.2162265595101416, "grad_norm": 0.5631595253944397, "learning_rate": 1.9990911295644106e-05, "loss": 0.7574, "step": 565 }, { "epoch": 0.21660926138538078, "grad_norm": 0.6179533004760742, "learning_rate": 1.9990805322063772e-05, "loss": 0.7353, "step": 566 }, { "epoch": 0.21699196326061998, "grad_norm": 0.5125918388366699, "learning_rate": 1.9990698734521614e-05, "loss": 0.7313, "step": 567 }, { "epoch": 0.21737466513585915, "grad_norm": 0.575333833694458, "learning_rate": 1.9990591533024176e-05, "loss": 0.7511, "step": 568 }, { "epoch": 0.21775736701109835, "grad_norm": 0.6563008427619934, "learning_rate": 1.9990483717578047e-05, "loss": 0.7157, "step": 569 }, { "epoch": 0.21814006888633755, "grad_norm": 0.7214829921722412, "learning_rate": 1.9990375288189858e-05, "loss": 0.7481, "step": 570 }, { "epoch": 0.21852277076157672, "grad_norm": 0.6002402901649475, "learning_rate": 1.999026624486627e-05, "loss": 0.8771, "step": 571 }, { "epoch": 0.21890547263681592, "grad_norm": 0.5676529407501221, "learning_rate": 1.9990156587613976e-05, "loss": 0.7695, "step": 572 }, { "epoch": 0.21928817451205512, "grad_norm": 0.5576393604278564, "learning_rate": 1.9990046316439724e-05, "loss": 0.7518, "step": 573 }, { "epoch": 0.2196708763872943, "grad_norm": 0.5221457481384277, "learning_rate": 1.9989935431350287e-05, "loss": 0.76, "step": 574 }, { "epoch": 0.2200535782625335, "grad_norm": 0.5309351682662964, "learning_rate": 1.9989823932352485e-05, "loss": 0.7403, "step": 575 }, { "epoch": 0.22043628013777267, "grad_norm": 0.5889729857444763, "learning_rate": 1.9989711819453158e-05, "loss": 0.7686, "step": 576 }, { "epoch": 0.22081898201301187, "grad_norm": 0.5781514048576355, "learning_rate": 1.9989599092659207e-05, "loss": 0.7271, "step": 577 }, { "epoch": 0.22120168388825107, "grad_norm": 0.5552148222923279, "learning_rate": 1.9989485751977553e-05, "loss": 0.7164, "step": 578 }, { "epoch": 0.22158438576349024, "grad_norm": 0.5834025740623474, "learning_rate": 1.9989371797415163e-05, "loss": 0.7381, "step": 579 }, { "epoch": 0.22196708763872944, "grad_norm": 0.553257942199707, "learning_rate": 1.9989257228979038e-05, "loss": 0.7627, "step": 580 }, { "epoch": 0.2223497895139686, "grad_norm": 0.5733938217163086, "learning_rate": 1.998914204667622e-05, "loss": 0.7246, "step": 581 }, { "epoch": 0.2227324913892078, "grad_norm": 0.6232308149337769, "learning_rate": 1.998902625051379e-05, "loss": 0.7052, "step": 582 }, { "epoch": 0.223115193264447, "grad_norm": 0.501899778842926, "learning_rate": 1.9988909840498866e-05, "loss": 0.723, "step": 583 }, { "epoch": 0.22349789513968618, "grad_norm": 0.5032399892807007, "learning_rate": 1.998879281663859e-05, "loss": 0.6779, "step": 584 }, { "epoch": 0.22388059701492538, "grad_norm": 0.5386418104171753, "learning_rate": 1.998867517894016e-05, "loss": 0.7345, "step": 585 }, { "epoch": 0.22426329889016455, "grad_norm": 0.515798807144165, "learning_rate": 1.998855692741081e-05, "loss": 0.7728, "step": 586 }, { "epoch": 0.22464600076540375, "grad_norm": 0.586979866027832, "learning_rate": 1.99884380620578e-05, "loss": 0.85, "step": 587 }, { "epoch": 0.22502870264064295, "grad_norm": 0.5033718347549438, "learning_rate": 1.9988318582888442e-05, "loss": 0.8081, "step": 588 }, { "epoch": 0.22541140451588212, "grad_norm": 0.5045295357704163, "learning_rate": 1.9988198489910068e-05, "loss": 0.7504, "step": 589 }, { "epoch": 0.22579410639112132, "grad_norm": 0.5230786204338074, "learning_rate": 1.9988077783130068e-05, "loss": 0.7888, "step": 590 }, { "epoch": 0.2261768082663605, "grad_norm": 0.5741920471191406, "learning_rate": 1.998795646255585e-05, "loss": 0.7658, "step": 591 }, { "epoch": 0.2265595101415997, "grad_norm": 0.5234642028808594, "learning_rate": 1.9987834528194877e-05, "loss": 0.7566, "step": 592 }, { "epoch": 0.2269422120168389, "grad_norm": 0.5031620860099792, "learning_rate": 1.9987711980054642e-05, "loss": 0.7819, "step": 593 }, { "epoch": 0.22732491389207807, "grad_norm": 0.5254848599433899, "learning_rate": 1.9987588818142675e-05, "loss": 0.6956, "step": 594 }, { "epoch": 0.22770761576731727, "grad_norm": 0.5294387936592102, "learning_rate": 1.998746504246654e-05, "loss": 0.6981, "step": 595 }, { "epoch": 0.22809031764255644, "grad_norm": 0.6205242276191711, "learning_rate": 1.9987340653033847e-05, "loss": 0.8418, "step": 596 }, { "epoch": 0.22847301951779564, "grad_norm": 0.6437031626701355, "learning_rate": 1.9987215649852244e-05, "loss": 0.7205, "step": 597 }, { "epoch": 0.22885572139303484, "grad_norm": 0.632239580154419, "learning_rate": 1.9987090032929405e-05, "loss": 0.7522, "step": 598 }, { "epoch": 0.229238423268274, "grad_norm": 0.5294379591941833, "learning_rate": 1.9986963802273055e-05, "loss": 0.6861, "step": 599 }, { "epoch": 0.2296211251435132, "grad_norm": 0.5243014097213745, "learning_rate": 1.9986836957890947e-05, "loss": 0.8285, "step": 600 }, { "epoch": 0.23000382701875238, "grad_norm": 0.48958373069763184, "learning_rate": 1.9986709499790883e-05, "loss": 0.7514, "step": 601 }, { "epoch": 0.23038652889399158, "grad_norm": 0.5310447216033936, "learning_rate": 1.9986581427980687e-05, "loss": 0.7383, "step": 602 }, { "epoch": 0.23076923076923078, "grad_norm": 0.5616538524627686, "learning_rate": 1.9986452742468235e-05, "loss": 0.6639, "step": 603 }, { "epoch": 0.23115193264446995, "grad_norm": 0.53178870677948, "learning_rate": 1.9986323443261433e-05, "loss": 0.7172, "step": 604 }, { "epoch": 0.23153463451970915, "grad_norm": 0.5285511016845703, "learning_rate": 1.998619353036823e-05, "loss": 0.7477, "step": 605 }, { "epoch": 0.23191733639494833, "grad_norm": 0.5728214383125305, "learning_rate": 1.9986063003796602e-05, "loss": 0.7326, "step": 606 }, { "epoch": 0.23230003827018753, "grad_norm": 0.5429890155792236, "learning_rate": 1.9985931863554575e-05, "loss": 0.7141, "step": 607 }, { "epoch": 0.23268274014542673, "grad_norm": 0.5801789164543152, "learning_rate": 1.9985800109650212e-05, "loss": 0.745, "step": 608 }, { "epoch": 0.2330654420206659, "grad_norm": 0.5733997225761414, "learning_rate": 1.9985667742091603e-05, "loss": 0.703, "step": 609 }, { "epoch": 0.2334481438959051, "grad_norm": 0.5002754926681519, "learning_rate": 1.9985534760886884e-05, "loss": 0.6971, "step": 610 }, { "epoch": 0.23383084577114427, "grad_norm": 0.6038405895233154, "learning_rate": 1.998540116604423e-05, "loss": 0.7186, "step": 611 }, { "epoch": 0.23421354764638347, "grad_norm": 0.5009385347366333, "learning_rate": 1.9985266957571848e-05, "loss": 0.7566, "step": 612 }, { "epoch": 0.23459624952162267, "grad_norm": 0.635793924331665, "learning_rate": 1.9985132135477987e-05, "loss": 0.7409, "step": 613 }, { "epoch": 0.23497895139686184, "grad_norm": 0.5844504237174988, "learning_rate": 1.9984996699770928e-05, "loss": 0.7557, "step": 614 }, { "epoch": 0.23536165327210104, "grad_norm": 0.5570359826087952, "learning_rate": 1.9984860650459002e-05, "loss": 0.7775, "step": 615 }, { "epoch": 0.2357443551473402, "grad_norm": 0.5345900058746338, "learning_rate": 1.998472398755056e-05, "loss": 0.7236, "step": 616 }, { "epoch": 0.2361270570225794, "grad_norm": 0.5413364171981812, "learning_rate": 1.9984586711054012e-05, "loss": 0.7686, "step": 617 }, { "epoch": 0.2365097588978186, "grad_norm": 0.5696339011192322, "learning_rate": 1.998444882097778e-05, "loss": 0.7669, "step": 618 }, { "epoch": 0.23689246077305778, "grad_norm": 0.5856554508209229, "learning_rate": 1.9984310317330348e-05, "loss": 0.8698, "step": 619 }, { "epoch": 0.23727516264829698, "grad_norm": 0.8564842939376831, "learning_rate": 1.9984171200120223e-05, "loss": 0.7781, "step": 620 }, { "epoch": 0.23765786452353616, "grad_norm": 0.5498459339141846, "learning_rate": 1.9984031469355957e-05, "loss": 0.8303, "step": 621 }, { "epoch": 0.23804056639877536, "grad_norm": 0.5684852600097656, "learning_rate": 1.9983891125046135e-05, "loss": 0.7748, "step": 622 }, { "epoch": 0.23842326827401455, "grad_norm": 0.6633450388908386, "learning_rate": 1.9983750167199377e-05, "loss": 0.7368, "step": 623 }, { "epoch": 0.23880597014925373, "grad_norm": 0.5357542037963867, "learning_rate": 1.9983608595824356e-05, "loss": 0.7997, "step": 624 }, { "epoch": 0.23918867202449293, "grad_norm": 0.5325158834457397, "learning_rate": 1.9983466410929764e-05, "loss": 0.7178, "step": 625 }, { "epoch": 0.2395713738997321, "grad_norm": 0.50383460521698, "learning_rate": 1.998332361252434e-05, "loss": 0.726, "step": 626 }, { "epoch": 0.2399540757749713, "grad_norm": 0.6681680083274841, "learning_rate": 1.998318020061686e-05, "loss": 0.8144, "step": 627 }, { "epoch": 0.2403367776502105, "grad_norm": 0.6343321800231934, "learning_rate": 1.9983036175216138e-05, "loss": 0.7823, "step": 628 }, { "epoch": 0.24071947952544967, "grad_norm": 0.4960142970085144, "learning_rate": 1.998289153633102e-05, "loss": 0.7985, "step": 629 }, { "epoch": 0.24110218140068887, "grad_norm": 0.7593303918838501, "learning_rate": 1.9982746283970405e-05, "loss": 0.8037, "step": 630 }, { "epoch": 0.24148488327592804, "grad_norm": 0.5798348188400269, "learning_rate": 1.998260041814321e-05, "loss": 0.736, "step": 631 }, { "epoch": 0.24186758515116724, "grad_norm": 0.5602174401283264, "learning_rate": 1.99824539388584e-05, "loss": 0.7002, "step": 632 }, { "epoch": 0.24225028702640644, "grad_norm": 0.5119494199752808, "learning_rate": 1.9982306846124975e-05, "loss": 0.7438, "step": 633 }, { "epoch": 0.2426329889016456, "grad_norm": 0.5956265330314636, "learning_rate": 1.998215913995198e-05, "loss": 0.8357, "step": 634 }, { "epoch": 0.2430156907768848, "grad_norm": 0.5304882526397705, "learning_rate": 1.998201082034849e-05, "loss": 0.7561, "step": 635 }, { "epoch": 0.24339839265212398, "grad_norm": 0.699135959148407, "learning_rate": 1.9981861887323614e-05, "loss": 0.6776, "step": 636 }, { "epoch": 0.24378109452736318, "grad_norm": 0.5598679184913635, "learning_rate": 1.998171234088651e-05, "loss": 0.7176, "step": 637 }, { "epoch": 0.24416379640260238, "grad_norm": 0.5642784237861633, "learning_rate": 1.998156218104637e-05, "loss": 0.7369, "step": 638 }, { "epoch": 0.24454649827784156, "grad_norm": 0.4992322623729706, "learning_rate": 1.998141140781242e-05, "loss": 0.6946, "step": 639 }, { "epoch": 0.24492920015308076, "grad_norm": 0.5206226110458374, "learning_rate": 1.998126002119392e-05, "loss": 0.7365, "step": 640 }, { "epoch": 0.24531190202831993, "grad_norm": 0.6561776995658875, "learning_rate": 1.9981108021200177e-05, "loss": 0.8194, "step": 641 }, { "epoch": 0.24569460390355913, "grad_norm": 0.7439214587211609, "learning_rate": 1.9980955407840534e-05, "loss": 0.6526, "step": 642 }, { "epoch": 0.24607730577879833, "grad_norm": 0.540381669998169, "learning_rate": 1.9980802181124364e-05, "loss": 0.7713, "step": 643 }, { "epoch": 0.2464600076540375, "grad_norm": 0.4653822183609009, "learning_rate": 1.998064834106109e-05, "loss": 0.6651, "step": 644 }, { "epoch": 0.2468427095292767, "grad_norm": 0.5686643123626709, "learning_rate": 1.9980493887660165e-05, "loss": 0.7212, "step": 645 }, { "epoch": 0.24722541140451587, "grad_norm": 0.5055914521217346, "learning_rate": 1.9980338820931074e-05, "loss": 0.645, "step": 646 }, { "epoch": 0.24760811327975507, "grad_norm": 0.550740122795105, "learning_rate": 1.998018314088335e-05, "loss": 0.7179, "step": 647 }, { "epoch": 0.24799081515499427, "grad_norm": 0.4740290343761444, "learning_rate": 1.998002684752656e-05, "loss": 0.7323, "step": 648 }, { "epoch": 0.24837351703023344, "grad_norm": 0.5614973902702332, "learning_rate": 1.997986994087031e-05, "loss": 0.7216, "step": 649 }, { "epoch": 0.24875621890547264, "grad_norm": 0.5400650501251221, "learning_rate": 1.9979712420924245e-05, "loss": 0.7602, "step": 650 }, { "epoch": 0.24913892078071181, "grad_norm": 0.5080827474594116, "learning_rate": 1.997955428769804e-05, "loss": 0.6855, "step": 651 }, { "epoch": 0.24952162265595101, "grad_norm": 0.5731632113456726, "learning_rate": 1.9979395541201412e-05, "loss": 0.7174, "step": 652 }, { "epoch": 0.24990432453119021, "grad_norm": 0.5469997525215149, "learning_rate": 1.997923618144412e-05, "loss": 0.8203, "step": 653 }, { "epoch": 0.2502870264064294, "grad_norm": 0.5845215916633606, "learning_rate": 1.997907620843595e-05, "loss": 0.7295, "step": 654 }, { "epoch": 0.25066972828166856, "grad_norm": 0.5663894414901733, "learning_rate": 1.9978915622186744e-05, "loss": 0.6808, "step": 655 }, { "epoch": 0.25105243015690776, "grad_norm": 0.5909819006919861, "learning_rate": 1.9978754422706365e-05, "loss": 0.7639, "step": 656 }, { "epoch": 0.25143513203214696, "grad_norm": 0.5499590039253235, "learning_rate": 1.9978592610004716e-05, "loss": 0.7692, "step": 657 }, { "epoch": 0.25181783390738616, "grad_norm": 0.5714154839515686, "learning_rate": 1.9978430184091742e-05, "loss": 0.7797, "step": 658 }, { "epoch": 0.25220053578262536, "grad_norm": 0.5364381670951843, "learning_rate": 1.9978267144977432e-05, "loss": 0.8129, "step": 659 }, { "epoch": 0.2525832376578645, "grad_norm": 0.49599629640579224, "learning_rate": 1.9978103492671797e-05, "loss": 0.7631, "step": 660 }, { "epoch": 0.2529659395331037, "grad_norm": 0.5081931948661804, "learning_rate": 1.9977939227184896e-05, "loss": 0.788, "step": 661 }, { "epoch": 0.2533486414083429, "grad_norm": 0.533089816570282, "learning_rate": 1.9977774348526823e-05, "loss": 0.6813, "step": 662 }, { "epoch": 0.2537313432835821, "grad_norm": 0.49581179022789, "learning_rate": 1.9977608856707712e-05, "loss": 0.6417, "step": 663 }, { "epoch": 0.2541140451588213, "grad_norm": 0.5220790505409241, "learning_rate": 1.9977442751737734e-05, "loss": 0.7944, "step": 664 }, { "epoch": 0.25449674703406044, "grad_norm": 0.564741849899292, "learning_rate": 1.997727603362709e-05, "loss": 0.6611, "step": 665 }, { "epoch": 0.25487944890929964, "grad_norm": 0.5509733557701111, "learning_rate": 1.9977108702386033e-05, "loss": 0.7854, "step": 666 }, { "epoch": 0.25526215078453884, "grad_norm": 0.5129890441894531, "learning_rate": 1.9976940758024843e-05, "loss": 0.7249, "step": 667 }, { "epoch": 0.25564485265977804, "grad_norm": 0.534566342830658, "learning_rate": 1.997677220055384e-05, "loss": 0.6918, "step": 668 }, { "epoch": 0.25602755453501724, "grad_norm": 0.5784416198730469, "learning_rate": 1.9976603029983383e-05, "loss": 0.765, "step": 669 }, { "epoch": 0.2564102564102564, "grad_norm": 0.54595947265625, "learning_rate": 1.9976433246323867e-05, "loss": 0.7124, "step": 670 }, { "epoch": 0.2567929582854956, "grad_norm": 0.5956503748893738, "learning_rate": 1.9976262849585725e-05, "loss": 0.7361, "step": 671 }, { "epoch": 0.2571756601607348, "grad_norm": 0.4822946786880493, "learning_rate": 1.9976091839779434e-05, "loss": 0.7571, "step": 672 }, { "epoch": 0.257558362035974, "grad_norm": 0.5388430953025818, "learning_rate": 1.9975920216915496e-05, "loss": 0.6828, "step": 673 }, { "epoch": 0.2579410639112132, "grad_norm": 0.5461109280586243, "learning_rate": 1.997574798100446e-05, "loss": 0.781, "step": 674 }, { "epoch": 0.25832376578645233, "grad_norm": 0.5748773217201233, "learning_rate": 1.997557513205691e-05, "loss": 0.7037, "step": 675 }, { "epoch": 0.25870646766169153, "grad_norm": 0.6259620189666748, "learning_rate": 1.9975401670083473e-05, "loss": 0.6707, "step": 676 }, { "epoch": 0.25908916953693073, "grad_norm": 0.5020027756690979, "learning_rate": 1.99752275950948e-05, "loss": 0.7097, "step": 677 }, { "epoch": 0.25947187141216993, "grad_norm": 0.5833941698074341, "learning_rate": 1.9975052907101596e-05, "loss": 0.612, "step": 678 }, { "epoch": 0.25985457328740913, "grad_norm": 0.5696139335632324, "learning_rate": 1.997487760611459e-05, "loss": 0.7387, "step": 679 }, { "epoch": 0.2602372751626483, "grad_norm": 0.512054979801178, "learning_rate": 1.9974701692144563e-05, "loss": 0.7261, "step": 680 }, { "epoch": 0.2606199770378875, "grad_norm": 0.5764826536178589, "learning_rate": 1.9974525165202313e-05, "loss": 0.7392, "step": 681 }, { "epoch": 0.2610026789131267, "grad_norm": 0.5058150887489319, "learning_rate": 1.99743480252987e-05, "loss": 0.7583, "step": 682 }, { "epoch": 0.2613853807883659, "grad_norm": 0.4955865144729614, "learning_rate": 1.9974170272444604e-05, "loss": 0.7266, "step": 683 }, { "epoch": 0.2617680826636051, "grad_norm": 0.5127694606781006, "learning_rate": 1.997399190665095e-05, "loss": 0.7422, "step": 684 }, { "epoch": 0.2621507845388442, "grad_norm": 0.6336838006973267, "learning_rate": 1.9973812927928695e-05, "loss": 0.7284, "step": 685 }, { "epoch": 0.2625334864140834, "grad_norm": 0.5562102794647217, "learning_rate": 1.9973633336288846e-05, "loss": 0.7333, "step": 686 }, { "epoch": 0.2629161882893226, "grad_norm": 0.49975255131721497, "learning_rate": 1.997345313174243e-05, "loss": 0.6592, "step": 687 }, { "epoch": 0.2632988901645618, "grad_norm": 0.5093178749084473, "learning_rate": 1.9973272314300527e-05, "loss": 0.7434, "step": 688 }, { "epoch": 0.263681592039801, "grad_norm": 0.5402156710624695, "learning_rate": 1.9973090883974246e-05, "loss": 0.7419, "step": 689 }, { "epoch": 0.26406429391504016, "grad_norm": 0.4797593057155609, "learning_rate": 1.997290884077474e-05, "loss": 0.6641, "step": 690 }, { "epoch": 0.26444699579027936, "grad_norm": 0.5111425518989563, "learning_rate": 1.9972726184713194e-05, "loss": 0.7431, "step": 691 }, { "epoch": 0.26482969766551856, "grad_norm": 0.5458868741989136, "learning_rate": 1.997254291580083e-05, "loss": 0.7776, "step": 692 }, { "epoch": 0.26521239954075776, "grad_norm": 0.5894204378128052, "learning_rate": 1.9972359034048915e-05, "loss": 0.7255, "step": 693 }, { "epoch": 0.26559510141599696, "grad_norm": 0.5263329148292542, "learning_rate": 1.9972174539468746e-05, "loss": 0.7203, "step": 694 }, { "epoch": 0.2659778032912361, "grad_norm": 0.5233591794967651, "learning_rate": 1.997198943207166e-05, "loss": 0.7325, "step": 695 }, { "epoch": 0.2663605051664753, "grad_norm": 0.48319825530052185, "learning_rate": 1.9971803711869032e-05, "loss": 0.7368, "step": 696 }, { "epoch": 0.2667432070417145, "grad_norm": 0.5528940558433533, "learning_rate": 1.997161737887228e-05, "loss": 0.7553, "step": 697 }, { "epoch": 0.2671259089169537, "grad_norm": 0.5565774440765381, "learning_rate": 1.997143043309285e-05, "loss": 0.7074, "step": 698 }, { "epoch": 0.2675086107921929, "grad_norm": 0.6781579852104187, "learning_rate": 1.997124287454223e-05, "loss": 0.7811, "step": 699 }, { "epoch": 0.26789131266743205, "grad_norm": 0.6325992941856384, "learning_rate": 1.9971054703231954e-05, "loss": 0.7444, "step": 700 }, { "epoch": 0.26827401454267125, "grad_norm": 0.5260130167007446, "learning_rate": 1.9970865919173574e-05, "loss": 0.7188, "step": 701 }, { "epoch": 0.26865671641791045, "grad_norm": 0.6460747718811035, "learning_rate": 1.99706765223787e-05, "loss": 0.7123, "step": 702 }, { "epoch": 0.26903941829314965, "grad_norm": 0.5294407606124878, "learning_rate": 1.9970486512858964e-05, "loss": 0.7234, "step": 703 }, { "epoch": 0.26942212016838885, "grad_norm": 0.5754542350769043, "learning_rate": 1.9970295890626047e-05, "loss": 0.7698, "step": 704 }, { "epoch": 0.269804822043628, "grad_norm": 0.5077725648880005, "learning_rate": 1.9970104655691665e-05, "loss": 0.7523, "step": 705 }, { "epoch": 0.2701875239188672, "grad_norm": 0.5685206651687622, "learning_rate": 1.9969912808067568e-05, "loss": 0.6643, "step": 706 }, { "epoch": 0.2705702257941064, "grad_norm": 0.5985338687896729, "learning_rate": 1.996972034776554e-05, "loss": 0.7974, "step": 707 }, { "epoch": 0.2709529276693456, "grad_norm": 0.4850582778453827, "learning_rate": 1.9969527274797413e-05, "loss": 0.7651, "step": 708 }, { "epoch": 0.2713356295445848, "grad_norm": 0.5971077680587769, "learning_rate": 1.9969333589175056e-05, "loss": 0.7502, "step": 709 }, { "epoch": 0.27171833141982393, "grad_norm": 0.49548599123954773, "learning_rate": 1.9969139290910367e-05, "loss": 0.656, "step": 710 }, { "epoch": 0.27210103329506313, "grad_norm": 0.6211652755737305, "learning_rate": 1.9968944380015286e-05, "loss": 0.6988, "step": 711 }, { "epoch": 0.27248373517030233, "grad_norm": 0.5529072284698486, "learning_rate": 1.996874885650179e-05, "loss": 0.7297, "step": 712 }, { "epoch": 0.27286643704554153, "grad_norm": 0.5275421738624573, "learning_rate": 1.9968552720381897e-05, "loss": 0.7757, "step": 713 }, { "epoch": 0.27324913892078073, "grad_norm": 0.6187619566917419, "learning_rate": 1.996835597166766e-05, "loss": 0.704, "step": 714 }, { "epoch": 0.2736318407960199, "grad_norm": 0.5329996347427368, "learning_rate": 1.9968158610371164e-05, "loss": 0.777, "step": 715 }, { "epoch": 0.2740145426712591, "grad_norm": 0.5629217028617859, "learning_rate": 1.9967960636504548e-05, "loss": 0.7382, "step": 716 }, { "epoch": 0.2743972445464983, "grad_norm": 0.6393334865570068, "learning_rate": 1.9967762050079967e-05, "loss": 0.7686, "step": 717 }, { "epoch": 0.2747799464217375, "grad_norm": 0.608506977558136, "learning_rate": 1.9967562851109633e-05, "loss": 0.7184, "step": 718 }, { "epoch": 0.2751626482969767, "grad_norm": 0.6470226645469666, "learning_rate": 1.9967363039605783e-05, "loss": 0.8041, "step": 719 }, { "epoch": 0.2755453501722158, "grad_norm": 0.5248188376426697, "learning_rate": 1.9967162615580697e-05, "loss": 0.6525, "step": 720 }, { "epoch": 0.275928052047455, "grad_norm": 0.5207483172416687, "learning_rate": 1.9966961579046692e-05, "loss": 0.7169, "step": 721 }, { "epoch": 0.2763107539226942, "grad_norm": 0.5286962389945984, "learning_rate": 1.996675993001612e-05, "loss": 0.7192, "step": 722 }, { "epoch": 0.2766934557979334, "grad_norm": 0.5619462132453918, "learning_rate": 1.996655766850138e-05, "loss": 0.6838, "step": 723 }, { "epoch": 0.2770761576731726, "grad_norm": 0.6214057803153992, "learning_rate": 1.996635479451489e-05, "loss": 0.7915, "step": 724 }, { "epoch": 0.27745885954841176, "grad_norm": 0.5412583351135254, "learning_rate": 1.9966151308069127e-05, "loss": 0.743, "step": 725 }, { "epoch": 0.27784156142365096, "grad_norm": 0.5423333644866943, "learning_rate": 1.9965947209176592e-05, "loss": 0.7204, "step": 726 }, { "epoch": 0.27822426329889016, "grad_norm": 0.5646645426750183, "learning_rate": 1.9965742497849824e-05, "loss": 0.7542, "step": 727 }, { "epoch": 0.27860696517412936, "grad_norm": 0.6134679913520813, "learning_rate": 1.9965537174101407e-05, "loss": 0.7224, "step": 728 }, { "epoch": 0.27898966704936856, "grad_norm": 0.5197445154190063, "learning_rate": 1.996533123794396e-05, "loss": 0.6966, "step": 729 }, { "epoch": 0.2793723689246077, "grad_norm": 0.5603775382041931, "learning_rate": 1.996512468939014e-05, "loss": 0.7828, "step": 730 }, { "epoch": 0.2797550707998469, "grad_norm": 0.592644214630127, "learning_rate": 1.9964917528452633e-05, "loss": 0.6499, "step": 731 }, { "epoch": 0.2801377726750861, "grad_norm": 0.4922853708267212, "learning_rate": 1.9964709755144174e-05, "loss": 0.7531, "step": 732 }, { "epoch": 0.2805204745503253, "grad_norm": 0.5239735841751099, "learning_rate": 1.996450136947753e-05, "loss": 0.7555, "step": 733 }, { "epoch": 0.2809031764255645, "grad_norm": 0.569957435131073, "learning_rate": 1.996429237146551e-05, "loss": 0.71, "step": 734 }, { "epoch": 0.28128587830080365, "grad_norm": 0.5475341081619263, "learning_rate": 1.996408276112095e-05, "loss": 0.7563, "step": 735 }, { "epoch": 0.28166858017604285, "grad_norm": 0.6342677474021912, "learning_rate": 1.9963872538456738e-05, "loss": 0.6759, "step": 736 }, { "epoch": 0.28205128205128205, "grad_norm": 0.5920348167419434, "learning_rate": 1.9963661703485796e-05, "loss": 0.7733, "step": 737 }, { "epoch": 0.28243398392652125, "grad_norm": 0.4960639178752899, "learning_rate": 1.9963450256221066e-05, "loss": 0.6981, "step": 738 }, { "epoch": 0.28281668580176045, "grad_norm": 0.5531309843063354, "learning_rate": 1.9963238196675558e-05, "loss": 0.6773, "step": 739 }, { "epoch": 0.2831993876769996, "grad_norm": 0.5010950565338135, "learning_rate": 1.9963025524862296e-05, "loss": 0.7441, "step": 740 }, { "epoch": 0.2835820895522388, "grad_norm": 0.5893356204032898, "learning_rate": 1.9962812240794344e-05, "loss": 0.7057, "step": 741 }, { "epoch": 0.283964791427478, "grad_norm": 0.4984580874443054, "learning_rate": 1.996259834448482e-05, "loss": 0.7446, "step": 742 }, { "epoch": 0.2843474933027172, "grad_norm": 0.600356936454773, "learning_rate": 1.9962383835946862e-05, "loss": 0.766, "step": 743 }, { "epoch": 0.2847301951779564, "grad_norm": 0.4853595197200775, "learning_rate": 1.9962168715193654e-05, "loss": 0.6924, "step": 744 }, { "epoch": 0.28511289705319554, "grad_norm": 0.5481930375099182, "learning_rate": 1.9961952982238415e-05, "loss": 0.657, "step": 745 }, { "epoch": 0.28549559892843474, "grad_norm": 0.5396609902381897, "learning_rate": 1.9961736637094402e-05, "loss": 0.6657, "step": 746 }, { "epoch": 0.28587830080367393, "grad_norm": 0.511504054069519, "learning_rate": 1.996151967977491e-05, "loss": 0.6919, "step": 747 }, { "epoch": 0.28626100267891313, "grad_norm": 0.5346115231513977, "learning_rate": 1.9961302110293275e-05, "loss": 0.7314, "step": 748 }, { "epoch": 0.28664370455415233, "grad_norm": 0.4844016134738922, "learning_rate": 1.996108392866286e-05, "loss": 0.7915, "step": 749 }, { "epoch": 0.2870264064293915, "grad_norm": 0.4900115728378296, "learning_rate": 1.996086513489708e-05, "loss": 0.7445, "step": 750 }, { "epoch": 0.2874091083046307, "grad_norm": 0.5207425951957703, "learning_rate": 1.9960645729009375e-05, "loss": 0.6738, "step": 751 }, { "epoch": 0.2877918101798699, "grad_norm": 0.5490735769271851, "learning_rate": 1.9960425711013233e-05, "loss": 0.7345, "step": 752 }, { "epoch": 0.2881745120551091, "grad_norm": 0.5723102688789368, "learning_rate": 1.9960205080922176e-05, "loss": 0.7513, "step": 753 }, { "epoch": 0.2885572139303483, "grad_norm": 0.5332742929458618, "learning_rate": 1.9959983838749753e-05, "loss": 0.6693, "step": 754 }, { "epoch": 0.2889399158055874, "grad_norm": 0.5558417439460754, "learning_rate": 1.995976198450957e-05, "loss": 0.7312, "step": 755 }, { "epoch": 0.2893226176808266, "grad_norm": 0.5559164881706238, "learning_rate": 1.9959539518215253e-05, "loss": 0.695, "step": 756 }, { "epoch": 0.2897053195560658, "grad_norm": 0.5109314918518066, "learning_rate": 1.9959316439880477e-05, "loss": 0.6947, "step": 757 }, { "epoch": 0.290088021431305, "grad_norm": 0.564274787902832, "learning_rate": 1.9959092749518955e-05, "loss": 0.6589, "step": 758 }, { "epoch": 0.2904707233065442, "grad_norm": 0.5325790643692017, "learning_rate": 1.9958868447144423e-05, "loss": 0.8161, "step": 759 }, { "epoch": 0.29085342518178336, "grad_norm": 0.5076199173927307, "learning_rate": 1.995864353277067e-05, "loss": 0.7125, "step": 760 }, { "epoch": 0.29123612705702256, "grad_norm": 0.5963625311851501, "learning_rate": 1.9958418006411522e-05, "loss": 0.7248, "step": 761 }, { "epoch": 0.29161882893226176, "grad_norm": 0.5159624218940735, "learning_rate": 1.995819186808083e-05, "loss": 0.7525, "step": 762 }, { "epoch": 0.29200153080750096, "grad_norm": 0.557902991771698, "learning_rate": 1.9957965117792496e-05, "loss": 0.7702, "step": 763 }, { "epoch": 0.29238423268274016, "grad_norm": 0.6088110208511353, "learning_rate": 1.9957737755560455e-05, "loss": 0.7443, "step": 764 }, { "epoch": 0.2927669345579793, "grad_norm": 0.5802242755889893, "learning_rate": 1.995750978139868e-05, "loss": 0.6766, "step": 765 }, { "epoch": 0.2931496364332185, "grad_norm": 0.5632839202880859, "learning_rate": 1.9957281195321174e-05, "loss": 0.7095, "step": 766 }, { "epoch": 0.2935323383084577, "grad_norm": 0.5265945196151733, "learning_rate": 1.995705199734199e-05, "loss": 0.7686, "step": 767 }, { "epoch": 0.2939150401836969, "grad_norm": 0.57957524061203, "learning_rate": 1.9956822187475214e-05, "loss": 0.7801, "step": 768 }, { "epoch": 0.2942977420589361, "grad_norm": 0.5191106200218201, "learning_rate": 1.9956591765734962e-05, "loss": 0.6363, "step": 769 }, { "epoch": 0.29468044393417525, "grad_norm": 0.5478788614273071, "learning_rate": 1.99563607321354e-05, "loss": 0.7539, "step": 770 }, { "epoch": 0.29506314580941445, "grad_norm": 0.5279152393341064, "learning_rate": 1.995612908669072e-05, "loss": 0.7282, "step": 771 }, { "epoch": 0.29544584768465365, "grad_norm": 0.6664388179779053, "learning_rate": 1.9955896829415162e-05, "loss": 0.6907, "step": 772 }, { "epoch": 0.29582854955989285, "grad_norm": 0.5148594975471497, "learning_rate": 1.9955663960323e-05, "loss": 0.7579, "step": 773 }, { "epoch": 0.29621125143513205, "grad_norm": 0.5545429587364197, "learning_rate": 1.995543047942854e-05, "loss": 0.7197, "step": 774 }, { "epoch": 0.2965939533103712, "grad_norm": 0.5493525266647339, "learning_rate": 1.995519638674613e-05, "loss": 0.646, "step": 775 }, { "epoch": 0.2969766551856104, "grad_norm": 0.5815727710723877, "learning_rate": 1.9954961682290163e-05, "loss": 0.7177, "step": 776 }, { "epoch": 0.2973593570608496, "grad_norm": 0.5408259034156799, "learning_rate": 1.995472636607505e-05, "loss": 0.7152, "step": 777 }, { "epoch": 0.2977420589360888, "grad_norm": 0.5350371599197388, "learning_rate": 1.9954490438115262e-05, "loss": 0.7033, "step": 778 }, { "epoch": 0.298124760811328, "grad_norm": 0.6190057396888733, "learning_rate": 1.9954253898425297e-05, "loss": 0.656, "step": 779 }, { "epoch": 0.29850746268656714, "grad_norm": 0.5293673276901245, "learning_rate": 1.995401674701968e-05, "loss": 0.8076, "step": 780 }, { "epoch": 0.29889016456180634, "grad_norm": 0.5548465251922607, "learning_rate": 1.9953778983913e-05, "loss": 0.6957, "step": 781 }, { "epoch": 0.29927286643704554, "grad_norm": 0.5383663773536682, "learning_rate": 1.995354060911986e-05, "loss": 0.6706, "step": 782 }, { "epoch": 0.29965556831228474, "grad_norm": 0.5491650700569153, "learning_rate": 1.9953301622654902e-05, "loss": 0.7772, "step": 783 }, { "epoch": 0.30003827018752394, "grad_norm": 0.5765327215194702, "learning_rate": 1.995306202453283e-05, "loss": 0.6835, "step": 784 }, { "epoch": 0.3004209720627631, "grad_norm": 0.551202118396759, "learning_rate": 1.995282181476835e-05, "loss": 0.7647, "step": 785 }, { "epoch": 0.3008036739380023, "grad_norm": 0.5883597731590271, "learning_rate": 1.9952580993376234e-05, "loss": 0.6864, "step": 786 }, { "epoch": 0.3011863758132415, "grad_norm": 0.5916661024093628, "learning_rate": 1.9952339560371277e-05, "loss": 0.7201, "step": 787 }, { "epoch": 0.3015690776884807, "grad_norm": 0.6164562702178955, "learning_rate": 1.995209751576832e-05, "loss": 0.7185, "step": 788 }, { "epoch": 0.3019517795637199, "grad_norm": 0.5861438512802124, "learning_rate": 1.9951854859582234e-05, "loss": 0.7028, "step": 789 }, { "epoch": 0.302334481438959, "grad_norm": 0.5329659581184387, "learning_rate": 1.995161159182793e-05, "loss": 0.7492, "step": 790 }, { "epoch": 0.3027171833141982, "grad_norm": 0.5562418103218079, "learning_rate": 1.995136771252036e-05, "loss": 0.7163, "step": 791 }, { "epoch": 0.3030998851894374, "grad_norm": 0.6309837102890015, "learning_rate": 1.9951123221674508e-05, "loss": 0.7277, "step": 792 }, { "epoch": 0.3034825870646766, "grad_norm": 0.560936689376831, "learning_rate": 1.9950878119305403e-05, "loss": 0.7584, "step": 793 }, { "epoch": 0.3038652889399158, "grad_norm": 0.5844101905822754, "learning_rate": 1.9950632405428105e-05, "loss": 0.7062, "step": 794 }, { "epoch": 0.30424799081515497, "grad_norm": 0.5422306656837463, "learning_rate": 1.9950386080057708e-05, "loss": 0.7623, "step": 795 }, { "epoch": 0.30463069269039417, "grad_norm": 0.5733110904693604, "learning_rate": 1.9950139143209362e-05, "loss": 0.7043, "step": 796 }, { "epoch": 0.30501339456563337, "grad_norm": 0.5510596036911011, "learning_rate": 1.994989159489823e-05, "loss": 0.7157, "step": 797 }, { "epoch": 0.30539609644087257, "grad_norm": 0.5421299934387207, "learning_rate": 1.9949643435139528e-05, "loss": 0.703, "step": 798 }, { "epoch": 0.30577879831611177, "grad_norm": 0.594139814376831, "learning_rate": 1.994939466394851e-05, "loss": 0.6843, "step": 799 }, { "epoch": 0.3061615001913509, "grad_norm": 0.5820221900939941, "learning_rate": 1.994914528134046e-05, "loss": 0.7007, "step": 800 }, { "epoch": 0.3065442020665901, "grad_norm": 0.5571632981300354, "learning_rate": 1.9948895287330707e-05, "loss": 0.7095, "step": 801 }, { "epoch": 0.3069269039418293, "grad_norm": 0.593104362487793, "learning_rate": 1.994864468193461e-05, "loss": 0.73, "step": 802 }, { "epoch": 0.3073096058170685, "grad_norm": 0.5340127944946289, "learning_rate": 1.994839346516757e-05, "loss": 0.7443, "step": 803 }, { "epoch": 0.3076923076923077, "grad_norm": 0.5475616455078125, "learning_rate": 1.994814163704502e-05, "loss": 0.7503, "step": 804 }, { "epoch": 0.30807500956754685, "grad_norm": 0.48921892046928406, "learning_rate": 1.9947889197582445e-05, "loss": 0.7388, "step": 805 }, { "epoch": 0.30845771144278605, "grad_norm": 0.5314140319824219, "learning_rate": 1.9947636146795357e-05, "loss": 0.7465, "step": 806 }, { "epoch": 0.30884041331802525, "grad_norm": 0.5047615170478821, "learning_rate": 1.99473824846993e-05, "loss": 0.6368, "step": 807 }, { "epoch": 0.30922311519326445, "grad_norm": 0.5747345089912415, "learning_rate": 1.9947128211309866e-05, "loss": 0.6985, "step": 808 }, { "epoch": 0.30960581706850365, "grad_norm": 0.5089448690414429, "learning_rate": 1.994687332664268e-05, "loss": 0.7392, "step": 809 }, { "epoch": 0.3099885189437428, "grad_norm": 0.5869911909103394, "learning_rate": 1.994661783071341e-05, "loss": 0.7135, "step": 810 }, { "epoch": 0.310371220818982, "grad_norm": 0.555362343788147, "learning_rate": 1.994636172353775e-05, "loss": 0.7012, "step": 811 }, { "epoch": 0.3107539226942212, "grad_norm": 0.6641100645065308, "learning_rate": 1.994610500513144e-05, "loss": 0.731, "step": 812 }, { "epoch": 0.3111366245694604, "grad_norm": 0.5598411560058594, "learning_rate": 1.9945847675510256e-05, "loss": 0.6811, "step": 813 }, { "epoch": 0.3115193264446996, "grad_norm": 0.5722761750221252, "learning_rate": 1.9945589734690016e-05, "loss": 0.6372, "step": 814 }, { "epoch": 0.31190202831993874, "grad_norm": 0.5531580448150635, "learning_rate": 1.994533118268657e-05, "loss": 0.7087, "step": 815 }, { "epoch": 0.31228473019517794, "grad_norm": 0.517867386341095, "learning_rate": 1.99450720195158e-05, "loss": 0.7007, "step": 816 }, { "epoch": 0.31266743207041714, "grad_norm": 0.5209311842918396, "learning_rate": 1.994481224519364e-05, "loss": 0.6661, "step": 817 }, { "epoch": 0.31305013394565634, "grad_norm": 0.5756950378417969, "learning_rate": 1.994455185973605e-05, "loss": 0.7341, "step": 818 }, { "epoch": 0.31343283582089554, "grad_norm": 0.6631211638450623, "learning_rate": 1.9944290863159032e-05, "loss": 0.6422, "step": 819 }, { "epoch": 0.31381553769613474, "grad_norm": 0.49340736865997314, "learning_rate": 1.994402925547863e-05, "loss": 0.6591, "step": 820 }, { "epoch": 0.3141982395713739, "grad_norm": 0.6339350938796997, "learning_rate": 1.994376703671091e-05, "loss": 0.7462, "step": 821 }, { "epoch": 0.3145809414466131, "grad_norm": 0.5481440424919128, "learning_rate": 1.9943504206871992e-05, "loss": 0.7267, "step": 822 }, { "epoch": 0.3149636433218523, "grad_norm": 0.57209712266922, "learning_rate": 1.994324076597803e-05, "loss": 0.6816, "step": 823 }, { "epoch": 0.3153463451970915, "grad_norm": 0.5556782484054565, "learning_rate": 1.9942976714045206e-05, "loss": 0.6627, "step": 824 }, { "epoch": 0.3157290470723307, "grad_norm": 0.548581600189209, "learning_rate": 1.9942712051089755e-05, "loss": 0.7904, "step": 825 }, { "epoch": 0.3161117489475698, "grad_norm": 0.5716879963874817, "learning_rate": 1.9942446777127938e-05, "loss": 0.7182, "step": 826 }, { "epoch": 0.316494450822809, "grad_norm": 0.5050795078277588, "learning_rate": 1.9942180892176055e-05, "loss": 0.7093, "step": 827 }, { "epoch": 0.3168771526980482, "grad_norm": 0.5813159942626953, "learning_rate": 1.9941914396250447e-05, "loss": 0.865, "step": 828 }, { "epoch": 0.3172598545732874, "grad_norm": 0.6130154728889465, "learning_rate": 1.994164728936749e-05, "loss": 0.7679, "step": 829 }, { "epoch": 0.3176425564485266, "grad_norm": 0.5387080311775208, "learning_rate": 1.9941379571543597e-05, "loss": 0.7103, "step": 830 }, { "epoch": 0.31802525832376577, "grad_norm": 0.5429013967514038, "learning_rate": 1.9941111242795226e-05, "loss": 0.7252, "step": 831 }, { "epoch": 0.31840796019900497, "grad_norm": 0.5605617761611938, "learning_rate": 1.9940842303138857e-05, "loss": 0.6671, "step": 832 }, { "epoch": 0.31879066207424417, "grad_norm": 0.6801189184188843, "learning_rate": 1.9940572752591027e-05, "loss": 0.6714, "step": 833 }, { "epoch": 0.31917336394948337, "grad_norm": 0.6532358527183533, "learning_rate": 1.9940302591168294e-05, "loss": 0.7048, "step": 834 }, { "epoch": 0.31955606582472257, "grad_norm": 0.5708308219909668, "learning_rate": 1.9940031818887266e-05, "loss": 0.7808, "step": 835 }, { "epoch": 0.3199387676999617, "grad_norm": 0.6285911202430725, "learning_rate": 1.9939760435764576e-05, "loss": 0.7127, "step": 836 }, { "epoch": 0.3203214695752009, "grad_norm": 0.5769196152687073, "learning_rate": 1.9939488441816903e-05, "loss": 0.7509, "step": 837 }, { "epoch": 0.3207041714504401, "grad_norm": 0.5985997319221497, "learning_rate": 1.9939215837060962e-05, "loss": 0.6845, "step": 838 }, { "epoch": 0.3210868733256793, "grad_norm": 0.5896108150482178, "learning_rate": 1.993894262151351e-05, "loss": 0.6437, "step": 839 }, { "epoch": 0.3214695752009185, "grad_norm": 0.5583901405334473, "learning_rate": 1.9938668795191334e-05, "loss": 0.6807, "step": 840 }, { "epoch": 0.32185227707615766, "grad_norm": 0.589591920375824, "learning_rate": 1.9938394358111258e-05, "loss": 0.7778, "step": 841 }, { "epoch": 0.32223497895139686, "grad_norm": 0.5394251942634583, "learning_rate": 1.993811931029015e-05, "loss": 0.801, "step": 842 }, { "epoch": 0.32261768082663606, "grad_norm": 0.5719064474105835, "learning_rate": 1.993784365174491e-05, "loss": 0.6499, "step": 843 }, { "epoch": 0.32300038270187525, "grad_norm": 0.5558503866195679, "learning_rate": 1.993756738249248e-05, "loss": 0.6956, "step": 844 }, { "epoch": 0.32338308457711445, "grad_norm": 0.5064206123352051, "learning_rate": 1.9937290502549837e-05, "loss": 0.7477, "step": 845 }, { "epoch": 0.3237657864523536, "grad_norm": 0.5815936923027039, "learning_rate": 1.9937013011933998e-05, "loss": 0.7551, "step": 846 }, { "epoch": 0.3241484883275928, "grad_norm": 0.5544987916946411, "learning_rate": 1.9936734910662013e-05, "loss": 0.7163, "step": 847 }, { "epoch": 0.324531190202832, "grad_norm": 0.5443788170814514, "learning_rate": 1.993645619875097e-05, "loss": 0.7472, "step": 848 }, { "epoch": 0.3249138920780712, "grad_norm": 0.5969615578651428, "learning_rate": 1.9936176876218004e-05, "loss": 0.6634, "step": 849 }, { "epoch": 0.3252965939533104, "grad_norm": 0.5604581832885742, "learning_rate": 1.9935896943080272e-05, "loss": 0.6564, "step": 850 }, { "epoch": 0.32567929582854954, "grad_norm": 0.56931471824646, "learning_rate": 1.9935616399354984e-05, "loss": 0.6834, "step": 851 }, { "epoch": 0.32606199770378874, "grad_norm": 0.5562780499458313, "learning_rate": 1.9935335245059372e-05, "loss": 0.7145, "step": 852 }, { "epoch": 0.32644469957902794, "grad_norm": 0.5678125619888306, "learning_rate": 1.993505348021072e-05, "loss": 0.7712, "step": 853 }, { "epoch": 0.32682740145426714, "grad_norm": 0.5570697784423828, "learning_rate": 1.9934771104826343e-05, "loss": 0.6986, "step": 854 }, { "epoch": 0.32721010332950634, "grad_norm": 0.5637919902801514, "learning_rate": 1.9934488118923586e-05, "loss": 0.6974, "step": 855 }, { "epoch": 0.3275928052047455, "grad_norm": 0.5980667471885681, "learning_rate": 1.9934204522519854e-05, "loss": 0.8049, "step": 856 }, { "epoch": 0.3279755070799847, "grad_norm": 0.5635457634925842, "learning_rate": 1.9933920315632557e-05, "loss": 0.8427, "step": 857 }, { "epoch": 0.3283582089552239, "grad_norm": 0.5852844715118408, "learning_rate": 1.9933635498279177e-05, "loss": 0.7466, "step": 858 }, { "epoch": 0.3287409108304631, "grad_norm": 0.533606767654419, "learning_rate": 1.9933350070477205e-05, "loss": 0.7945, "step": 859 }, { "epoch": 0.3291236127057023, "grad_norm": 0.5195430517196655, "learning_rate": 1.9933064032244185e-05, "loss": 0.6702, "step": 860 }, { "epoch": 0.32950631458094143, "grad_norm": 0.5806863307952881, "learning_rate": 1.9932777383597695e-05, "loss": 0.688, "step": 861 }, { "epoch": 0.32988901645618063, "grad_norm": 0.5212812423706055, "learning_rate": 1.9932490124555353e-05, "loss": 0.7024, "step": 862 }, { "epoch": 0.33027171833141983, "grad_norm": 0.50531005859375, "learning_rate": 1.993220225513481e-05, "loss": 0.7881, "step": 863 }, { "epoch": 0.330654420206659, "grad_norm": 0.5628938674926758, "learning_rate": 1.993191377535375e-05, "loss": 0.7445, "step": 864 }, { "epoch": 0.3310371220818982, "grad_norm": 0.5364320874214172, "learning_rate": 1.993162468522991e-05, "loss": 0.7397, "step": 865 }, { "epoch": 0.33141982395713737, "grad_norm": 0.6063359975814819, "learning_rate": 1.9931334984781053e-05, "loss": 0.6882, "step": 866 }, { "epoch": 0.33180252583237657, "grad_norm": 0.5646716952323914, "learning_rate": 1.993104467402498e-05, "loss": 0.7198, "step": 867 }, { "epoch": 0.33218522770761577, "grad_norm": 0.5220961570739746, "learning_rate": 1.993075375297953e-05, "loss": 0.7265, "step": 868 }, { "epoch": 0.33256792958285497, "grad_norm": 0.5361908674240112, "learning_rate": 1.9930462221662584e-05, "loss": 0.6734, "step": 869 }, { "epoch": 0.33295063145809417, "grad_norm": 0.519233226776123, "learning_rate": 1.993017008009206e-05, "loss": 0.6746, "step": 870 }, { "epoch": 0.3333333333333333, "grad_norm": 0.542789101600647, "learning_rate": 1.9929877328285902e-05, "loss": 0.7726, "step": 871 }, { "epoch": 0.3337160352085725, "grad_norm": 0.5429500341415405, "learning_rate": 1.9929583966262108e-05, "loss": 0.6801, "step": 872 }, { "epoch": 0.3340987370838117, "grad_norm": 0.5644886493682861, "learning_rate": 1.9929289994038706e-05, "loss": 0.7714, "step": 873 }, { "epoch": 0.3344814389590509, "grad_norm": 0.5244765281677246, "learning_rate": 1.9928995411633755e-05, "loss": 0.7072, "step": 874 }, { "epoch": 0.3348641408342901, "grad_norm": 0.5819005370140076, "learning_rate": 1.992870021906536e-05, "loss": 0.8176, "step": 875 }, { "epoch": 0.33524684270952926, "grad_norm": 0.5092201828956604, "learning_rate": 1.992840441635167e-05, "loss": 0.7141, "step": 876 }, { "epoch": 0.33562954458476846, "grad_norm": 0.5020915269851685, "learning_rate": 1.992810800351085e-05, "loss": 0.67, "step": 877 }, { "epoch": 0.33601224646000766, "grad_norm": 0.6086021065711975, "learning_rate": 1.9927810980561125e-05, "loss": 0.7942, "step": 878 }, { "epoch": 0.33639494833524686, "grad_norm": 0.6519919633865356, "learning_rate": 1.9927513347520743e-05, "loss": 0.7456, "step": 879 }, { "epoch": 0.33677765021048606, "grad_norm": 0.5811492204666138, "learning_rate": 1.9927215104407997e-05, "loss": 0.6963, "step": 880 }, { "epoch": 0.3371603520857252, "grad_norm": 0.5556333661079407, "learning_rate": 1.992691625124121e-05, "loss": 0.7896, "step": 881 }, { "epoch": 0.3375430539609644, "grad_norm": 0.5148698687553406, "learning_rate": 1.9926616788038756e-05, "loss": 0.6961, "step": 882 }, { "epoch": 0.3379257558362036, "grad_norm": 0.5917673110961914, "learning_rate": 1.992631671481903e-05, "loss": 0.8501, "step": 883 }, { "epoch": 0.3383084577114428, "grad_norm": 0.6360171437263489, "learning_rate": 1.9926016031600474e-05, "loss": 0.7028, "step": 884 }, { "epoch": 0.338691159586682, "grad_norm": 0.5471616983413696, "learning_rate": 1.9925714738401568e-05, "loss": 0.7367, "step": 885 }, { "epoch": 0.33907386146192114, "grad_norm": 0.5500317811965942, "learning_rate": 1.9925412835240826e-05, "loss": 0.7615, "step": 886 }, { "epoch": 0.33945656333716034, "grad_norm": 0.5347245335578918, "learning_rate": 1.9925110322136802e-05, "loss": 0.6925, "step": 887 }, { "epoch": 0.33983926521239954, "grad_norm": 0.5439125299453735, "learning_rate": 1.992480719910808e-05, "loss": 0.7607, "step": 888 }, { "epoch": 0.34022196708763874, "grad_norm": 0.5712805390357971, "learning_rate": 1.9924503466173297e-05, "loss": 0.7467, "step": 889 }, { "epoch": 0.34060466896287794, "grad_norm": 0.5271598696708679, "learning_rate": 1.9924199123351113e-05, "loss": 0.7655, "step": 890 }, { "epoch": 0.3409873708381171, "grad_norm": 0.4724038541316986, "learning_rate": 1.9923894170660233e-05, "loss": 0.6962, "step": 891 }, { "epoch": 0.3413700727133563, "grad_norm": 0.5130577087402344, "learning_rate": 1.9923588608119394e-05, "loss": 0.6558, "step": 892 }, { "epoch": 0.3417527745885955, "grad_norm": 0.6236817836761475, "learning_rate": 1.9923282435747376e-05, "loss": 0.691, "step": 893 }, { "epoch": 0.3421354764638347, "grad_norm": 0.495248943567276, "learning_rate": 1.992297565356299e-05, "loss": 0.6929, "step": 894 }, { "epoch": 0.3425181783390739, "grad_norm": 0.5144777297973633, "learning_rate": 1.9922668261585096e-05, "loss": 0.6958, "step": 895 }, { "epoch": 0.34290088021431303, "grad_norm": 0.508453369140625, "learning_rate": 1.992236025983258e-05, "loss": 0.6512, "step": 896 }, { "epoch": 0.34328358208955223, "grad_norm": 0.5630110502243042, "learning_rate": 1.992205164832437e-05, "loss": 0.7039, "step": 897 }, { "epoch": 0.34366628396479143, "grad_norm": 0.5666761994361877, "learning_rate": 1.992174242707943e-05, "loss": 0.7533, "step": 898 }, { "epoch": 0.34404898584003063, "grad_norm": 0.5150330662727356, "learning_rate": 1.9921432596116763e-05, "loss": 0.6784, "step": 899 }, { "epoch": 0.34443168771526983, "grad_norm": 1.5539579391479492, "learning_rate": 1.992112215545541e-05, "loss": 0.7085, "step": 900 }, { "epoch": 0.344814389590509, "grad_norm": 0.541276216506958, "learning_rate": 1.9920811105114445e-05, "loss": 0.6214, "step": 901 }, { "epoch": 0.3451970914657482, "grad_norm": 0.6649701595306396, "learning_rate": 1.992049944511299e-05, "loss": 0.8169, "step": 902 }, { "epoch": 0.3455797933409874, "grad_norm": 0.540753185749054, "learning_rate": 1.9920187175470187e-05, "loss": 0.7849, "step": 903 }, { "epoch": 0.3459624952162266, "grad_norm": 0.5677890181541443, "learning_rate": 1.9919874296205237e-05, "loss": 0.7289, "step": 904 }, { "epoch": 0.3463451970914658, "grad_norm": 0.5662000179290771, "learning_rate": 1.991956080733736e-05, "loss": 0.6844, "step": 905 }, { "epoch": 0.3467278989667049, "grad_norm": 0.5500953793525696, "learning_rate": 1.991924670888582e-05, "loss": 0.7371, "step": 906 }, { "epoch": 0.3471106008419441, "grad_norm": 0.5688446760177612, "learning_rate": 1.9918932000869924e-05, "loss": 0.6809, "step": 907 }, { "epoch": 0.3474933027171833, "grad_norm": 0.5735496878623962, "learning_rate": 1.991861668330901e-05, "loss": 0.7149, "step": 908 }, { "epoch": 0.3478760045924225, "grad_norm": 0.5361505150794983, "learning_rate": 1.9918300756222452e-05, "loss": 0.7537, "step": 909 }, { "epoch": 0.3482587064676617, "grad_norm": 0.5362344980239868, "learning_rate": 1.991798421962967e-05, "loss": 0.7066, "step": 910 }, { "epoch": 0.34864140834290086, "grad_norm": 0.6420547366142273, "learning_rate": 1.9917667073550114e-05, "loss": 0.6899, "step": 911 }, { "epoch": 0.34902411021814006, "grad_norm": 0.5579440593719482, "learning_rate": 1.991734931800327e-05, "loss": 0.745, "step": 912 }, { "epoch": 0.34940681209337926, "grad_norm": 0.5138629078865051, "learning_rate": 1.9917030953008666e-05, "loss": 0.7362, "step": 913 }, { "epoch": 0.34978951396861846, "grad_norm": 0.5159977674484253, "learning_rate": 1.9916711978585873e-05, "loss": 0.6507, "step": 914 }, { "epoch": 0.35017221584385766, "grad_norm": 0.5332601070404053, "learning_rate": 1.9916392394754483e-05, "loss": 0.7264, "step": 915 }, { "epoch": 0.3505549177190968, "grad_norm": 0.519873321056366, "learning_rate": 1.9916072201534143e-05, "loss": 0.7147, "step": 916 }, { "epoch": 0.350937619594336, "grad_norm": 0.570414125919342, "learning_rate": 1.9915751398944525e-05, "loss": 0.7857, "step": 917 }, { "epoch": 0.3513203214695752, "grad_norm": 0.5243626236915588, "learning_rate": 1.9915429987005347e-05, "loss": 0.6448, "step": 918 }, { "epoch": 0.3517030233448144, "grad_norm": 0.5294333100318909, "learning_rate": 1.9915107965736356e-05, "loss": 0.7753, "step": 919 }, { "epoch": 0.3520857252200536, "grad_norm": 0.5424880981445312, "learning_rate": 1.9914785335157344e-05, "loss": 0.7477, "step": 920 }, { "epoch": 0.35246842709529275, "grad_norm": 0.6554422974586487, "learning_rate": 1.9914462095288137e-05, "loss": 0.7501, "step": 921 }, { "epoch": 0.35285112897053195, "grad_norm": 0.5518964529037476, "learning_rate": 1.99141382461486e-05, "loss": 0.6226, "step": 922 }, { "epoch": 0.35323383084577115, "grad_norm": 0.5006068348884583, "learning_rate": 1.9913813787758637e-05, "loss": 0.7747, "step": 923 }, { "epoch": 0.35361653272101035, "grad_norm": 0.6079138517379761, "learning_rate": 1.991348872013818e-05, "loss": 0.731, "step": 924 }, { "epoch": 0.35399923459624955, "grad_norm": 0.6097971796989441, "learning_rate": 1.991316304330721e-05, "loss": 0.6646, "step": 925 }, { "epoch": 0.3543819364714887, "grad_norm": 0.5348948240280151, "learning_rate": 1.991283675728574e-05, "loss": 0.6962, "step": 926 }, { "epoch": 0.3547646383467279, "grad_norm": 0.5824871063232422, "learning_rate": 1.9912509862093817e-05, "loss": 0.6559, "step": 927 }, { "epoch": 0.3551473402219671, "grad_norm": 0.5410217046737671, "learning_rate": 1.9912182357751534e-05, "loss": 0.749, "step": 928 }, { "epoch": 0.3555300420972063, "grad_norm": 0.569269061088562, "learning_rate": 1.9911854244279016e-05, "loss": 0.7261, "step": 929 }, { "epoch": 0.3559127439724455, "grad_norm": 0.6498334407806396, "learning_rate": 1.9911525521696428e-05, "loss": 0.665, "step": 930 }, { "epoch": 0.35629544584768463, "grad_norm": 0.5723531246185303, "learning_rate": 1.991119619002397e-05, "loss": 0.7414, "step": 931 }, { "epoch": 0.35667814772292383, "grad_norm": 0.48659729957580566, "learning_rate": 1.991086624928188e-05, "loss": 0.6469, "step": 932 }, { "epoch": 0.35706084959816303, "grad_norm": 0.512479841709137, "learning_rate": 1.991053569949043e-05, "loss": 0.7832, "step": 933 }, { "epoch": 0.35744355147340223, "grad_norm": 0.5120828151702881, "learning_rate": 1.991020454066994e-05, "loss": 0.7492, "step": 934 }, { "epoch": 0.35782625334864143, "grad_norm": 0.5498716235160828, "learning_rate": 1.990987277284076e-05, "loss": 0.7166, "step": 935 }, { "epoch": 0.3582089552238806, "grad_norm": 0.4993622899055481, "learning_rate": 1.9909540396023267e-05, "loss": 0.6427, "step": 936 }, { "epoch": 0.3585916570991198, "grad_norm": 0.5796472430229187, "learning_rate": 1.9909207410237902e-05, "loss": 0.7111, "step": 937 }, { "epoch": 0.358974358974359, "grad_norm": 0.6634133458137512, "learning_rate": 1.990887381550512e-05, "loss": 0.6809, "step": 938 }, { "epoch": 0.3593570608495982, "grad_norm": 0.5390592217445374, "learning_rate": 1.9908539611845418e-05, "loss": 0.7987, "step": 939 }, { "epoch": 0.3597397627248374, "grad_norm": 0.5719312429428101, "learning_rate": 1.9908204799279342e-05, "loss": 0.7083, "step": 940 }, { "epoch": 0.3601224646000765, "grad_norm": 0.5316107869148254, "learning_rate": 1.990786937782746e-05, "loss": 0.6127, "step": 941 }, { "epoch": 0.3605051664753157, "grad_norm": 0.5526863932609558, "learning_rate": 1.9907533347510387e-05, "loss": 0.686, "step": 942 }, { "epoch": 0.3608878683505549, "grad_norm": 0.6173678040504456, "learning_rate": 1.9907196708348775e-05, "loss": 0.7531, "step": 943 }, { "epoch": 0.3612705702257941, "grad_norm": 0.5507813096046448, "learning_rate": 1.9906859460363307e-05, "loss": 0.6871, "step": 944 }, { "epoch": 0.3616532721010333, "grad_norm": 0.5507002472877502, "learning_rate": 1.9906521603574713e-05, "loss": 0.6502, "step": 945 }, { "epoch": 0.36203597397627246, "grad_norm": 0.5084413886070251, "learning_rate": 1.9906183138003754e-05, "loss": 0.6882, "step": 946 }, { "epoch": 0.36241867585151166, "grad_norm": 0.7549517154693604, "learning_rate": 1.9905844063671228e-05, "loss": 0.8353, "step": 947 }, { "epoch": 0.36280137772675086, "grad_norm": 0.5853334069252014, "learning_rate": 1.990550438059797e-05, "loss": 0.6638, "step": 948 }, { "epoch": 0.36318407960199006, "grad_norm": 0.5651171803474426, "learning_rate": 1.9905164088804857e-05, "loss": 0.7856, "step": 949 }, { "epoch": 0.36356678147722926, "grad_norm": 0.5354325771331787, "learning_rate": 1.9904823188312804e-05, "loss": 0.7351, "step": 950 }, { "epoch": 0.3639494833524684, "grad_norm": 0.5385192632675171, "learning_rate": 1.9904481679142754e-05, "loss": 0.747, "step": 951 }, { "epoch": 0.3643321852277076, "grad_norm": 0.604126513004303, "learning_rate": 1.9904139561315695e-05, "loss": 0.7276, "step": 952 }, { "epoch": 0.3647148871029468, "grad_norm": 0.6031460165977478, "learning_rate": 1.9903796834852654e-05, "loss": 0.7317, "step": 953 }, { "epoch": 0.365097588978186, "grad_norm": 0.5515976548194885, "learning_rate": 1.990345349977469e-05, "loss": 0.6659, "step": 954 }, { "epoch": 0.3654802908534252, "grad_norm": 0.5289283990859985, "learning_rate": 1.9903109556102907e-05, "loss": 0.7198, "step": 955 }, { "epoch": 0.36586299272866435, "grad_norm": 0.5509209036827087, "learning_rate": 1.9902765003858434e-05, "loss": 0.7476, "step": 956 }, { "epoch": 0.36624569460390355, "grad_norm": 0.513640820980072, "learning_rate": 1.990241984306245e-05, "loss": 0.6969, "step": 957 }, { "epoch": 0.36662839647914275, "grad_norm": 0.49532178044319153, "learning_rate": 1.990207407373616e-05, "loss": 0.6731, "step": 958 }, { "epoch": 0.36701109835438195, "grad_norm": 0.5710733532905579, "learning_rate": 1.990172769590082e-05, "loss": 0.7866, "step": 959 }, { "epoch": 0.36739380022962115, "grad_norm": 0.5301476716995239, "learning_rate": 1.990138070957771e-05, "loss": 0.7469, "step": 960 }, { "epoch": 0.3677765021048603, "grad_norm": 0.5255497694015503, "learning_rate": 1.990103311478815e-05, "loss": 0.7328, "step": 961 }, { "epoch": 0.3681592039800995, "grad_norm": 0.6061772108078003, "learning_rate": 1.9900684911553517e-05, "loss": 0.6887, "step": 962 }, { "epoch": 0.3685419058553387, "grad_norm": 0.5228190422058105, "learning_rate": 1.9900336099895195e-05, "loss": 0.6837, "step": 963 }, { "epoch": 0.3689246077305779, "grad_norm": 0.5348122119903564, "learning_rate": 1.989998667983462e-05, "loss": 0.7642, "step": 964 }, { "epoch": 0.3693073096058171, "grad_norm": 0.5372776985168457, "learning_rate": 1.9899636651393266e-05, "loss": 0.6766, "step": 965 }, { "epoch": 0.36969001148105624, "grad_norm": 0.7497754693031311, "learning_rate": 1.9899286014592646e-05, "loss": 0.7156, "step": 966 }, { "epoch": 0.37007271335629544, "grad_norm": 0.6075822114944458, "learning_rate": 1.9898934769454307e-05, "loss": 0.7125, "step": 967 }, { "epoch": 0.37045541523153463, "grad_norm": 0.500117838382721, "learning_rate": 1.9898582915999834e-05, "loss": 0.633, "step": 968 }, { "epoch": 0.37083811710677383, "grad_norm": 0.5109246969223022, "learning_rate": 1.9898230454250848e-05, "loss": 0.6619, "step": 969 }, { "epoch": 0.37122081898201303, "grad_norm": 0.49672022461891174, "learning_rate": 1.9897877384229007e-05, "loss": 0.634, "step": 970 }, { "epoch": 0.3716035208572522, "grad_norm": 0.5103479027748108, "learning_rate": 1.9897523705956012e-05, "loss": 0.7004, "step": 971 }, { "epoch": 0.3719862227324914, "grad_norm": 0.5359560251235962, "learning_rate": 1.9897169419453594e-05, "loss": 0.7164, "step": 972 }, { "epoch": 0.3723689246077306, "grad_norm": 0.5160733461380005, "learning_rate": 1.989681452474353e-05, "loss": 0.6712, "step": 973 }, { "epoch": 0.3727516264829698, "grad_norm": 0.5045751333236694, "learning_rate": 1.9896459021847622e-05, "loss": 0.6908, "step": 974 }, { "epoch": 0.373134328358209, "grad_norm": 0.6267533302307129, "learning_rate": 1.989610291078772e-05, "loss": 0.6604, "step": 975 }, { "epoch": 0.3735170302334481, "grad_norm": 0.5244762897491455, "learning_rate": 1.9895746191585713e-05, "loss": 0.7111, "step": 976 }, { "epoch": 0.3738997321086873, "grad_norm": 0.5464669466018677, "learning_rate": 1.9895388864263515e-05, "loss": 0.7296, "step": 977 }, { "epoch": 0.3742824339839265, "grad_norm": 0.5526725649833679, "learning_rate": 1.9895030928843083e-05, "loss": 0.7588, "step": 978 }, { "epoch": 0.3746651358591657, "grad_norm": 0.49762359261512756, "learning_rate": 1.9894672385346422e-05, "loss": 0.6826, "step": 979 }, { "epoch": 0.3750478377344049, "grad_norm": 0.5642932057380676, "learning_rate": 1.9894313233795563e-05, "loss": 0.6784, "step": 980 }, { "epoch": 0.37543053960964407, "grad_norm": 0.5513096451759338, "learning_rate": 1.9893953474212572e-05, "loss": 0.698, "step": 981 }, { "epoch": 0.37581324148488326, "grad_norm": 0.5303071737289429, "learning_rate": 1.989359310661956e-05, "loss": 0.6684, "step": 982 }, { "epoch": 0.37619594336012246, "grad_norm": 0.5268600583076477, "learning_rate": 1.9893232131038672e-05, "loss": 0.6308, "step": 983 }, { "epoch": 0.37657864523536166, "grad_norm": 0.5320382118225098, "learning_rate": 1.9892870547492094e-05, "loss": 0.7447, "step": 984 }, { "epoch": 0.37696134711060086, "grad_norm": 0.5626268982887268, "learning_rate": 1.9892508356002044e-05, "loss": 0.7568, "step": 985 }, { "epoch": 0.37734404898584, "grad_norm": 0.5310444235801697, "learning_rate": 1.9892145556590776e-05, "loss": 0.7486, "step": 986 }, { "epoch": 0.3777267508610792, "grad_norm": 0.5288851261138916, "learning_rate": 1.989178214928059e-05, "loss": 0.8012, "step": 987 }, { "epoch": 0.3781094527363184, "grad_norm": 0.5373882055282593, "learning_rate": 1.989141813409382e-05, "loss": 0.6622, "step": 988 }, { "epoch": 0.3784921546115576, "grad_norm": 0.5631815195083618, "learning_rate": 1.9891053511052827e-05, "loss": 0.7137, "step": 989 }, { "epoch": 0.3788748564867968, "grad_norm": 0.5636327862739563, "learning_rate": 1.9890688280180027e-05, "loss": 0.7939, "step": 990 }, { "epoch": 0.37925755836203595, "grad_norm": 0.5601379871368408, "learning_rate": 1.9890322441497857e-05, "loss": 0.6701, "step": 991 }, { "epoch": 0.37964026023727515, "grad_norm": 0.5870500206947327, "learning_rate": 1.9889955995028808e-05, "loss": 0.689, "step": 992 }, { "epoch": 0.38002296211251435, "grad_norm": 0.6326809525489807, "learning_rate": 1.9889588940795392e-05, "loss": 0.6992, "step": 993 }, { "epoch": 0.38040566398775355, "grad_norm": 0.524488091468811, "learning_rate": 1.9889221278820166e-05, "loss": 0.6983, "step": 994 }, { "epoch": 0.38078836586299275, "grad_norm": 0.5604749917984009, "learning_rate": 1.9888853009125726e-05, "loss": 0.6679, "step": 995 }, { "epoch": 0.3811710677382319, "grad_norm": 0.5131329894065857, "learning_rate": 1.98884841317347e-05, "loss": 0.6796, "step": 996 }, { "epoch": 0.3815537696134711, "grad_norm": 0.6016077995300293, "learning_rate": 1.9888114646669762e-05, "loss": 0.724, "step": 997 }, { "epoch": 0.3819364714887103, "grad_norm": 0.506019115447998, "learning_rate": 1.9887744553953615e-05, "loss": 0.6735, "step": 998 }, { "epoch": 0.3823191733639495, "grad_norm": 0.5794334411621094, "learning_rate": 1.9887373853609e-05, "loss": 0.6483, "step": 999 }, { "epoch": 0.3827018752391887, "grad_norm": 0.6084061861038208, "learning_rate": 1.98870025456587e-05, "loss": 0.6906, "step": 1000 }, { "epoch": 0.38308457711442784, "grad_norm": 0.5255367755889893, "learning_rate": 1.9886630630125532e-05, "loss": 0.666, "step": 1001 }, { "epoch": 0.38346727898966704, "grad_norm": 0.5539889335632324, "learning_rate": 1.9886258107032352e-05, "loss": 0.8216, "step": 1002 }, { "epoch": 0.38384998086490624, "grad_norm": 0.5907905697822571, "learning_rate": 1.9885884976402053e-05, "loss": 0.702, "step": 1003 }, { "epoch": 0.38423268274014544, "grad_norm": 0.5751607418060303, "learning_rate": 1.988551123825756e-05, "loss": 0.6567, "step": 1004 }, { "epoch": 0.38461538461538464, "grad_norm": 0.48887938261032104, "learning_rate": 1.9885136892621845e-05, "loss": 0.6392, "step": 1005 }, { "epoch": 0.3849980864906238, "grad_norm": 0.6035395264625549, "learning_rate": 1.9884761939517916e-05, "loss": 0.716, "step": 1006 }, { "epoch": 0.385380788365863, "grad_norm": 0.648949146270752, "learning_rate": 1.9884386378968806e-05, "loss": 0.7071, "step": 1007 }, { "epoch": 0.3857634902411022, "grad_norm": 0.5420228242874146, "learning_rate": 1.98840102109976e-05, "loss": 0.74, "step": 1008 }, { "epoch": 0.3861461921163414, "grad_norm": 0.5710490942001343, "learning_rate": 1.9883633435627412e-05, "loss": 0.74, "step": 1009 }, { "epoch": 0.3865288939915806, "grad_norm": 0.7276522517204285, "learning_rate": 1.9883256052881398e-05, "loss": 0.764, "step": 1010 }, { "epoch": 0.3869115958668197, "grad_norm": 0.570376455783844, "learning_rate": 1.9882878062782752e-05, "loss": 0.6833, "step": 1011 }, { "epoch": 0.3872942977420589, "grad_norm": 0.5231733322143555, "learning_rate": 1.9882499465354696e-05, "loss": 0.6966, "step": 1012 }, { "epoch": 0.3876769996172981, "grad_norm": 0.5794960856437683, "learning_rate": 1.9882120260620497e-05, "loss": 0.6327, "step": 1013 }, { "epoch": 0.3880597014925373, "grad_norm": 0.5193080902099609, "learning_rate": 1.988174044860346e-05, "loss": 0.7307, "step": 1014 }, { "epoch": 0.3884424033677765, "grad_norm": 0.5572579503059387, "learning_rate": 1.988136002932693e-05, "loss": 0.7665, "step": 1015 }, { "epoch": 0.38882510524301567, "grad_norm": 0.5321922302246094, "learning_rate": 1.9880979002814275e-05, "loss": 0.6584, "step": 1016 }, { "epoch": 0.38920780711825487, "grad_norm": 0.5470019578933716, "learning_rate": 1.9880597369088917e-05, "loss": 0.7476, "step": 1017 }, { "epoch": 0.38959050899349407, "grad_norm": 0.5650668144226074, "learning_rate": 1.9880215128174304e-05, "loss": 0.711, "step": 1018 }, { "epoch": 0.38997321086873327, "grad_norm": 0.5758969783782959, "learning_rate": 1.987983228009393e-05, "loss": 0.684, "step": 1019 }, { "epoch": 0.39035591274397247, "grad_norm": 0.5503025054931641, "learning_rate": 1.9879448824871322e-05, "loss": 0.663, "step": 1020 }, { "epoch": 0.3907386146192116, "grad_norm": 0.6301083564758301, "learning_rate": 1.987906476253004e-05, "loss": 0.7592, "step": 1021 }, { "epoch": 0.3911213164944508, "grad_norm": 0.5038923621177673, "learning_rate": 1.987868009309369e-05, "loss": 0.7052, "step": 1022 }, { "epoch": 0.39150401836969, "grad_norm": 0.550819993019104, "learning_rate": 1.987829481658591e-05, "loss": 0.7753, "step": 1023 }, { "epoch": 0.3918867202449292, "grad_norm": 0.5625050067901611, "learning_rate": 1.9877908933030373e-05, "loss": 0.7639, "step": 1024 }, { "epoch": 0.3922694221201684, "grad_norm": 0.539334237575531, "learning_rate": 1.9877522442450798e-05, "loss": 0.6844, "step": 1025 }, { "epoch": 0.39265212399540755, "grad_norm": 0.5458476543426514, "learning_rate": 1.987713534487093e-05, "loss": 0.6861, "step": 1026 }, { "epoch": 0.39303482587064675, "grad_norm": 0.5698636770248413, "learning_rate": 1.987674764031456e-05, "loss": 0.6475, "step": 1027 }, { "epoch": 0.39341752774588595, "grad_norm": 0.5293007493019104, "learning_rate": 1.9876359328805518e-05, "loss": 0.6509, "step": 1028 }, { "epoch": 0.39380022962112515, "grad_norm": 0.5487672686576843, "learning_rate": 1.987597041036766e-05, "loss": 0.7386, "step": 1029 }, { "epoch": 0.39418293149636435, "grad_norm": 0.5557478070259094, "learning_rate": 1.987558088502489e-05, "loss": 0.7296, "step": 1030 }, { "epoch": 0.3945656333716035, "grad_norm": 0.5562498569488525, "learning_rate": 1.987519075280114e-05, "loss": 0.7209, "step": 1031 }, { "epoch": 0.3949483352468427, "grad_norm": 0.5868808031082153, "learning_rate": 1.9874800013720394e-05, "loss": 0.6556, "step": 1032 }, { "epoch": 0.3953310371220819, "grad_norm": 0.5634583234786987, "learning_rate": 1.9874408667806657e-05, "loss": 0.8293, "step": 1033 }, { "epoch": 0.3957137389973211, "grad_norm": 0.531012237071991, "learning_rate": 1.9874016715083978e-05, "loss": 0.6957, "step": 1034 }, { "epoch": 0.3960964408725603, "grad_norm": 0.7760922312736511, "learning_rate": 1.9873624155576447e-05, "loss": 0.7423, "step": 1035 }, { "epoch": 0.39647914274779944, "grad_norm": 0.5588914155960083, "learning_rate": 1.9873230989308184e-05, "loss": 0.6661, "step": 1036 }, { "epoch": 0.39686184462303864, "grad_norm": 0.5719295740127563, "learning_rate": 1.9872837216303353e-05, "loss": 0.7572, "step": 1037 }, { "epoch": 0.39724454649827784, "grad_norm": 0.525124192237854, "learning_rate": 1.9872442836586156e-05, "loss": 0.7913, "step": 1038 }, { "epoch": 0.39762724837351704, "grad_norm": 0.566474974155426, "learning_rate": 1.987204785018082e-05, "loss": 0.7189, "step": 1039 }, { "epoch": 0.39800995024875624, "grad_norm": 0.6149770021438599, "learning_rate": 1.9871652257111623e-05, "loss": 0.7176, "step": 1040 }, { "epoch": 0.3983926521239954, "grad_norm": 0.582068145275116, "learning_rate": 1.9871256057402874e-05, "loss": 0.7702, "step": 1041 }, { "epoch": 0.3987753539992346, "grad_norm": 0.5966618061065674, "learning_rate": 1.9870859251078923e-05, "loss": 0.7757, "step": 1042 }, { "epoch": 0.3991580558744738, "grad_norm": 0.48878297209739685, "learning_rate": 1.9870461838164152e-05, "loss": 0.5579, "step": 1043 }, { "epoch": 0.399540757749713, "grad_norm": 0.5114944577217102, "learning_rate": 1.9870063818682985e-05, "loss": 0.7551, "step": 1044 }, { "epoch": 0.3999234596249522, "grad_norm": 0.5548186898231506, "learning_rate": 1.9869665192659878e-05, "loss": 0.7069, "step": 1045 }, { "epoch": 0.4003061615001913, "grad_norm": 0.5750588774681091, "learning_rate": 1.9869265960119334e-05, "loss": 0.7766, "step": 1046 }, { "epoch": 0.4006888633754305, "grad_norm": 0.527833878993988, "learning_rate": 1.986886612108588e-05, "loss": 0.7761, "step": 1047 }, { "epoch": 0.4010715652506697, "grad_norm": 0.5710975527763367, "learning_rate": 1.9868465675584096e-05, "loss": 0.7049, "step": 1048 }, { "epoch": 0.4014542671259089, "grad_norm": 0.5661287903785706, "learning_rate": 1.986806462363858e-05, "loss": 0.7351, "step": 1049 }, { "epoch": 0.4018369690011481, "grad_norm": 0.5419279336929321, "learning_rate": 1.9867662965273982e-05, "loss": 0.6869, "step": 1050 }, { "epoch": 0.40221967087638727, "grad_norm": 0.6278996467590332, "learning_rate": 1.9867260700514986e-05, "loss": 0.8049, "step": 1051 }, { "epoch": 0.40260237275162647, "grad_norm": 0.5864235162734985, "learning_rate": 1.9866857829386314e-05, "loss": 0.6696, "step": 1052 }, { "epoch": 0.40298507462686567, "grad_norm": 0.5489898920059204, "learning_rate": 1.9866454351912717e-05, "loss": 0.7447, "step": 1053 }, { "epoch": 0.40336777650210487, "grad_norm": 0.5650928616523743, "learning_rate": 1.9866050268118995e-05, "loss": 0.7999, "step": 1054 }, { "epoch": 0.40375047837734407, "grad_norm": 0.6671600937843323, "learning_rate": 1.9865645578029982e-05, "loss": 0.7897, "step": 1055 }, { "epoch": 0.4041331802525832, "grad_norm": 0.4790136516094208, "learning_rate": 1.986524028167054e-05, "loss": 0.6426, "step": 1056 }, { "epoch": 0.4045158821278224, "grad_norm": 0.5261064767837524, "learning_rate": 1.9864834379065583e-05, "loss": 0.6765, "step": 1057 }, { "epoch": 0.4048985840030616, "grad_norm": 0.502359926700592, "learning_rate": 1.986442787024005e-05, "loss": 0.7001, "step": 1058 }, { "epoch": 0.4052812858783008, "grad_norm": 0.5598413944244385, "learning_rate": 1.9864020755218926e-05, "loss": 0.675, "step": 1059 }, { "epoch": 0.40566398775354, "grad_norm": 0.5808597803115845, "learning_rate": 1.9863613034027224e-05, "loss": 0.7634, "step": 1060 }, { "epoch": 0.40604668962877916, "grad_norm": 0.5676493644714355, "learning_rate": 1.9863204706690004e-05, "loss": 0.7145, "step": 1061 }, { "epoch": 0.40642939150401836, "grad_norm": 0.5320488214492798, "learning_rate": 1.9862795773232358e-05, "loss": 0.6841, "step": 1062 }, { "epoch": 0.40681209337925756, "grad_norm": 0.5240735411643982, "learning_rate": 1.9862386233679415e-05, "loss": 0.7154, "step": 1063 }, { "epoch": 0.40719479525449676, "grad_norm": 0.5748835802078247, "learning_rate": 1.9861976088056344e-05, "loss": 0.6748, "step": 1064 }, { "epoch": 0.40757749712973596, "grad_norm": 0.5585904717445374, "learning_rate": 1.9861565336388348e-05, "loss": 0.7069, "step": 1065 }, { "epoch": 0.4079601990049751, "grad_norm": 0.5579940676689148, "learning_rate": 1.986115397870067e-05, "loss": 0.7281, "step": 1066 }, { "epoch": 0.4083429008802143, "grad_norm": 0.5261317491531372, "learning_rate": 1.9860742015018585e-05, "loss": 0.7128, "step": 1067 }, { "epoch": 0.4087256027554535, "grad_norm": 0.5721328854560852, "learning_rate": 1.986032944536741e-05, "loss": 0.6329, "step": 1068 }, { "epoch": 0.4091083046306927, "grad_norm": 0.5710325241088867, "learning_rate": 1.9859916269772506e-05, "loss": 0.6864, "step": 1069 }, { "epoch": 0.4094910065059319, "grad_norm": 0.5756151676177979, "learning_rate": 1.9859502488259258e-05, "loss": 0.6921, "step": 1070 }, { "epoch": 0.40987370838117104, "grad_norm": 0.5156763195991516, "learning_rate": 1.985908810085309e-05, "loss": 0.6787, "step": 1071 }, { "epoch": 0.41025641025641024, "grad_norm": 0.5647479295730591, "learning_rate": 1.985867310757948e-05, "loss": 0.6775, "step": 1072 }, { "epoch": 0.41063911213164944, "grad_norm": 0.5172876715660095, "learning_rate": 1.9858257508463915e-05, "loss": 0.7441, "step": 1073 }, { "epoch": 0.41102181400688864, "grad_norm": 0.502260148525238, "learning_rate": 1.9857841303531943e-05, "loss": 0.7686, "step": 1074 }, { "epoch": 0.41140451588212784, "grad_norm": 0.5718316435813904, "learning_rate": 1.9857424492809142e-05, "loss": 0.6872, "step": 1075 }, { "epoch": 0.411787217757367, "grad_norm": 0.5978792309761047, "learning_rate": 1.9857007076321127e-05, "loss": 0.6968, "step": 1076 }, { "epoch": 0.4121699196326062, "grad_norm": 0.5558916926383972, "learning_rate": 1.9856589054093542e-05, "loss": 0.6717, "step": 1077 }, { "epoch": 0.4125526215078454, "grad_norm": 0.5348724722862244, "learning_rate": 1.985617042615208e-05, "loss": 0.666, "step": 1078 }, { "epoch": 0.4129353233830846, "grad_norm": 0.5780853629112244, "learning_rate": 1.9855751192522467e-05, "loss": 0.7649, "step": 1079 }, { "epoch": 0.4133180252583238, "grad_norm": 0.537470817565918, "learning_rate": 1.985533135323047e-05, "loss": 0.6525, "step": 1080 }, { "epoch": 0.41370072713356293, "grad_norm": 0.5623329281806946, "learning_rate": 1.9854910908301882e-05, "loss": 0.7606, "step": 1081 }, { "epoch": 0.41408342900880213, "grad_norm": 0.5329267978668213, "learning_rate": 1.985448985776254e-05, "loss": 0.69, "step": 1082 }, { "epoch": 0.41446613088404133, "grad_norm": 0.585917592048645, "learning_rate": 1.9854068201638327e-05, "loss": 0.7315, "step": 1083 }, { "epoch": 0.41484883275928053, "grad_norm": 0.5866027474403381, "learning_rate": 1.9853645939955147e-05, "loss": 0.7084, "step": 1084 }, { "epoch": 0.41523153463451973, "grad_norm": 0.6199507117271423, "learning_rate": 1.985322307273896e-05, "loss": 0.8007, "step": 1085 }, { "epoch": 0.41561423650975887, "grad_norm": 0.8702028393745422, "learning_rate": 1.985279960001574e-05, "loss": 0.6982, "step": 1086 }, { "epoch": 0.41599693838499807, "grad_norm": 0.48944559693336487, "learning_rate": 1.9852375521811508e-05, "loss": 0.7449, "step": 1087 }, { "epoch": 0.41637964026023727, "grad_norm": 0.5187018513679504, "learning_rate": 1.985195083815234e-05, "loss": 0.6265, "step": 1088 }, { "epoch": 0.41676234213547647, "grad_norm": 0.5369208455085754, "learning_rate": 1.9851525549064324e-05, "loss": 0.6359, "step": 1089 }, { "epoch": 0.41714504401071567, "grad_norm": 0.5401303172111511, "learning_rate": 1.9851099654573595e-05, "loss": 0.7111, "step": 1090 }, { "epoch": 0.4175277458859548, "grad_norm": 0.5817647576332092, "learning_rate": 1.985067315470633e-05, "loss": 0.7128, "step": 1091 }, { "epoch": 0.417910447761194, "grad_norm": 0.5440465211868286, "learning_rate": 1.9850246049488734e-05, "loss": 0.705, "step": 1092 }, { "epoch": 0.4182931496364332, "grad_norm": 0.5465120077133179, "learning_rate": 1.9849818338947058e-05, "loss": 0.7674, "step": 1093 }, { "epoch": 0.4186758515116724, "grad_norm": 3.516484022140503, "learning_rate": 1.9849390023107583e-05, "loss": 0.7323, "step": 1094 }, { "epoch": 0.4190585533869116, "grad_norm": 0.5569886565208435, "learning_rate": 1.984896110199663e-05, "loss": 0.7132, "step": 1095 }, { "epoch": 0.41944125526215076, "grad_norm": 0.5238699316978455, "learning_rate": 1.9848531575640558e-05, "loss": 0.6748, "step": 1096 }, { "epoch": 0.41982395713738996, "grad_norm": 0.7358837723731995, "learning_rate": 1.984810144406576e-05, "loss": 0.7868, "step": 1097 }, { "epoch": 0.42020665901262916, "grad_norm": 0.5585016012191772, "learning_rate": 1.9847670707298675e-05, "loss": 0.6715, "step": 1098 }, { "epoch": 0.42058936088786836, "grad_norm": 0.5397026538848877, "learning_rate": 1.984723936536577e-05, "loss": 0.7398, "step": 1099 }, { "epoch": 0.42097206276310756, "grad_norm": 0.5886635184288025, "learning_rate": 1.984680741829355e-05, "loss": 0.704, "step": 1100 }, { "epoch": 0.4213547646383467, "grad_norm": 0.5278101563453674, "learning_rate": 1.984637486610856e-05, "loss": 0.7049, "step": 1101 }, { "epoch": 0.4217374665135859, "grad_norm": 0.5634078979492188, "learning_rate": 1.9845941708837385e-05, "loss": 0.6803, "step": 1102 }, { "epoch": 0.4221201683888251, "grad_norm": 0.5529354214668274, "learning_rate": 1.9845507946506637e-05, "loss": 0.7132, "step": 1103 }, { "epoch": 0.4225028702640643, "grad_norm": 0.5877954363822937, "learning_rate": 1.984507357914298e-05, "loss": 0.7656, "step": 1104 }, { "epoch": 0.4228855721393035, "grad_norm": 0.5334131121635437, "learning_rate": 1.98446386067731e-05, "loss": 0.7422, "step": 1105 }, { "epoch": 0.42326827401454264, "grad_norm": 0.6679038405418396, "learning_rate": 1.9844203029423733e-05, "loss": 0.7598, "step": 1106 }, { "epoch": 0.42365097588978184, "grad_norm": 0.5457921028137207, "learning_rate": 1.984376684712164e-05, "loss": 0.7112, "step": 1107 }, { "epoch": 0.42403367776502104, "grad_norm": 0.569377064704895, "learning_rate": 1.984333005989363e-05, "loss": 0.7002, "step": 1108 }, { "epoch": 0.42441637964026024, "grad_norm": 0.520605742931366, "learning_rate": 1.9842892667766546e-05, "loss": 0.737, "step": 1109 }, { "epoch": 0.42479908151549944, "grad_norm": 0.5613458752632141, "learning_rate": 1.984245467076726e-05, "loss": 0.6527, "step": 1110 }, { "epoch": 0.4251817833907386, "grad_norm": 0.5814681053161621, "learning_rate": 1.98420160689227e-05, "loss": 0.7677, "step": 1111 }, { "epoch": 0.4255644852659778, "grad_norm": 0.5499116778373718, "learning_rate": 1.9841576862259805e-05, "loss": 0.7506, "step": 1112 }, { "epoch": 0.425947187141217, "grad_norm": 0.6438210010528564, "learning_rate": 1.984113705080558e-05, "loss": 0.7474, "step": 1113 }, { "epoch": 0.4263298890164562, "grad_norm": 0.550987184047699, "learning_rate": 1.9840696634587035e-05, "loss": 0.708, "step": 1114 }, { "epoch": 0.4267125908916954, "grad_norm": 0.5253874659538269, "learning_rate": 1.9840255613631254e-05, "loss": 0.7451, "step": 1115 }, { "epoch": 0.42709529276693453, "grad_norm": 0.5452373623847961, "learning_rate": 1.9839813987965323e-05, "loss": 0.7309, "step": 1116 }, { "epoch": 0.42747799464217373, "grad_norm": 0.5847927331924438, "learning_rate": 1.983937175761639e-05, "loss": 0.7495, "step": 1117 }, { "epoch": 0.42786069651741293, "grad_norm": 0.6100080609321594, "learning_rate": 1.9838928922611634e-05, "loss": 0.7186, "step": 1118 }, { "epoch": 0.42824339839265213, "grad_norm": 0.5147436857223511, "learning_rate": 1.983848548297826e-05, "loss": 0.7451, "step": 1119 }, { "epoch": 0.42862610026789133, "grad_norm": 0.527008593082428, "learning_rate": 1.983804143874352e-05, "loss": 0.6554, "step": 1120 }, { "epoch": 0.4290088021431305, "grad_norm": 0.5884060263633728, "learning_rate": 1.9837596789934704e-05, "loss": 0.7057, "step": 1121 }, { "epoch": 0.4293915040183697, "grad_norm": 0.6611606478691101, "learning_rate": 1.9837151536579143e-05, "loss": 0.669, "step": 1122 }, { "epoch": 0.4297742058936089, "grad_norm": 0.572204053401947, "learning_rate": 1.9836705678704187e-05, "loss": 0.7126, "step": 1123 }, { "epoch": 0.4301569077688481, "grad_norm": 0.6692376732826233, "learning_rate": 1.9836259216337242e-05, "loss": 0.6924, "step": 1124 }, { "epoch": 0.4305396096440873, "grad_norm": 0.5671500563621521, "learning_rate": 1.9835812149505744e-05, "loss": 0.7246, "step": 1125 }, { "epoch": 0.4309223115193264, "grad_norm": 0.6035650372505188, "learning_rate": 1.9835364478237165e-05, "loss": 0.6191, "step": 1126 }, { "epoch": 0.4313050133945656, "grad_norm": 0.5857300162315369, "learning_rate": 1.9834916202559014e-05, "loss": 0.7361, "step": 1127 }, { "epoch": 0.4316877152698048, "grad_norm": 0.5053495764732361, "learning_rate": 1.9834467322498844e-05, "loss": 0.6337, "step": 1128 }, { "epoch": 0.432070417145044, "grad_norm": 0.5717106461524963, "learning_rate": 1.9834017838084234e-05, "loss": 0.7097, "step": 1129 }, { "epoch": 0.4324531190202832, "grad_norm": 0.6714223027229309, "learning_rate": 1.983356774934281e-05, "loss": 0.7087, "step": 1130 }, { "epoch": 0.43283582089552236, "grad_norm": 0.5678625106811523, "learning_rate": 1.983311705630223e-05, "loss": 0.7259, "step": 1131 }, { "epoch": 0.43321852277076156, "grad_norm": 0.5380966663360596, "learning_rate": 1.9832665758990193e-05, "loss": 0.6573, "step": 1132 }, { "epoch": 0.43360122464600076, "grad_norm": 0.8286049365997314, "learning_rate": 1.9832213857434426e-05, "loss": 0.6695, "step": 1133 }, { "epoch": 0.43398392652123996, "grad_norm": 0.6323550939559937, "learning_rate": 1.9831761351662705e-05, "loss": 0.6934, "step": 1134 }, { "epoch": 0.43436662839647916, "grad_norm": 0.5323156118392944, "learning_rate": 1.9831308241702837e-05, "loss": 0.7177, "step": 1135 }, { "epoch": 0.4347493302717183, "grad_norm": 0.6076926589012146, "learning_rate": 1.9830854527582665e-05, "loss": 0.689, "step": 1136 }, { "epoch": 0.4351320321469575, "grad_norm": 0.5261543989181519, "learning_rate": 1.983040020933007e-05, "loss": 0.6961, "step": 1137 }, { "epoch": 0.4355147340221967, "grad_norm": 0.6162883043289185, "learning_rate": 1.9829945286972975e-05, "loss": 0.7283, "step": 1138 }, { "epoch": 0.4358974358974359, "grad_norm": 0.5980687737464905, "learning_rate": 1.9829489760539334e-05, "loss": 0.6808, "step": 1139 }, { "epoch": 0.4362801377726751, "grad_norm": 0.5238384008407593, "learning_rate": 1.982903363005714e-05, "loss": 0.6651, "step": 1140 }, { "epoch": 0.43666283964791425, "grad_norm": 0.5370144844055176, "learning_rate": 1.9828576895554423e-05, "loss": 0.6893, "step": 1141 }, { "epoch": 0.43704554152315345, "grad_norm": 0.521984875202179, "learning_rate": 1.9828119557059254e-05, "loss": 0.6965, "step": 1142 }, { "epoch": 0.43742824339839265, "grad_norm": 0.6948754787445068, "learning_rate": 1.9827661614599734e-05, "loss": 0.7143, "step": 1143 }, { "epoch": 0.43781094527363185, "grad_norm": 0.561779797077179, "learning_rate": 1.9827203068204005e-05, "loss": 0.7584, "step": 1144 }, { "epoch": 0.43819364714887105, "grad_norm": 0.5535571575164795, "learning_rate": 1.9826743917900245e-05, "loss": 0.7324, "step": 1145 }, { "epoch": 0.43857634902411025, "grad_norm": 0.5271081328392029, "learning_rate": 1.9826284163716673e-05, "loss": 0.6579, "step": 1146 }, { "epoch": 0.4389590508993494, "grad_norm": 0.578034520149231, "learning_rate": 1.9825823805681543e-05, "loss": 0.7461, "step": 1147 }, { "epoch": 0.4393417527745886, "grad_norm": 0.544985294342041, "learning_rate": 1.982536284382314e-05, "loss": 0.7201, "step": 1148 }, { "epoch": 0.4397244546498278, "grad_norm": 0.5447714328765869, "learning_rate": 1.9824901278169797e-05, "loss": 0.6971, "step": 1149 }, { "epoch": 0.440107156525067, "grad_norm": 0.5153583884239197, "learning_rate": 1.9824439108749875e-05, "loss": 0.6836, "step": 1150 }, { "epoch": 0.4404898584003062, "grad_norm": 0.5426744818687439, "learning_rate": 1.9823976335591777e-05, "loss": 0.6305, "step": 1151 }, { "epoch": 0.44087256027554533, "grad_norm": 0.5327655673027039, "learning_rate": 1.9823512958723942e-05, "loss": 0.7112, "step": 1152 }, { "epoch": 0.44125526215078453, "grad_norm": 0.5520485043525696, "learning_rate": 1.982304897817484e-05, "loss": 0.7015, "step": 1153 }, { "epoch": 0.44163796402602373, "grad_norm": 0.551723837852478, "learning_rate": 1.9822584393972993e-05, "loss": 0.7719, "step": 1154 }, { "epoch": 0.44202066590126293, "grad_norm": 0.5156102180480957, "learning_rate": 1.9822119206146948e-05, "loss": 0.6514, "step": 1155 }, { "epoch": 0.44240336777650213, "grad_norm": 0.529034435749054, "learning_rate": 1.9821653414725286e-05, "loss": 0.7073, "step": 1156 }, { "epoch": 0.4427860696517413, "grad_norm": 0.5149195194244385, "learning_rate": 1.9821187019736644e-05, "loss": 0.6547, "step": 1157 }, { "epoch": 0.4431687715269805, "grad_norm": 0.5080868601799011, "learning_rate": 1.9820720021209667e-05, "loss": 0.6824, "step": 1158 }, { "epoch": 0.4435514734022197, "grad_norm": 0.5305992960929871, "learning_rate": 1.9820252419173064e-05, "loss": 0.7445, "step": 1159 }, { "epoch": 0.4439341752774589, "grad_norm": 0.5259260535240173, "learning_rate": 1.981978421365557e-05, "loss": 0.7037, "step": 1160 }, { "epoch": 0.4443168771526981, "grad_norm": 0.5861746072769165, "learning_rate": 1.9819315404685955e-05, "loss": 0.7469, "step": 1161 }, { "epoch": 0.4446995790279372, "grad_norm": 0.6060369610786438, "learning_rate": 1.9818845992293027e-05, "loss": 0.7709, "step": 1162 }, { "epoch": 0.4450822809031764, "grad_norm": 0.5601010322570801, "learning_rate": 1.9818375976505635e-05, "loss": 0.7486, "step": 1163 }, { "epoch": 0.4454649827784156, "grad_norm": 0.5240575075149536, "learning_rate": 1.9817905357352662e-05, "loss": 0.7016, "step": 1164 }, { "epoch": 0.4458476846536548, "grad_norm": 0.5229506492614746, "learning_rate": 1.9817434134863028e-05, "loss": 0.7368, "step": 1165 }, { "epoch": 0.446230386528894, "grad_norm": 0.5815767049789429, "learning_rate": 1.9816962309065696e-05, "loss": 0.7606, "step": 1166 }, { "epoch": 0.44661308840413316, "grad_norm": 0.539306104183197, "learning_rate": 1.9816489879989653e-05, "loss": 0.7474, "step": 1167 }, { "epoch": 0.44699579027937236, "grad_norm": 0.5196320414543152, "learning_rate": 1.9816016847663938e-05, "loss": 0.6937, "step": 1168 }, { "epoch": 0.44737849215461156, "grad_norm": 0.8112896084785461, "learning_rate": 1.9815543212117614e-05, "loss": 0.7086, "step": 1169 }, { "epoch": 0.44776119402985076, "grad_norm": 0.5070350766181946, "learning_rate": 1.9815068973379792e-05, "loss": 0.72, "step": 1170 }, { "epoch": 0.44814389590508996, "grad_norm": 0.5030051469802856, "learning_rate": 1.9814594131479614e-05, "loss": 0.7097, "step": 1171 }, { "epoch": 0.4485265977803291, "grad_norm": 0.8525295257568359, "learning_rate": 1.9814118686446258e-05, "loss": 0.732, "step": 1172 }, { "epoch": 0.4489092996555683, "grad_norm": 0.530771791934967, "learning_rate": 1.9813642638308943e-05, "loss": 0.6606, "step": 1173 }, { "epoch": 0.4492920015308075, "grad_norm": 0.5427703261375427, "learning_rate": 1.9813165987096926e-05, "loss": 0.7233, "step": 1174 }, { "epoch": 0.4496747034060467, "grad_norm": 0.5518602728843689, "learning_rate": 1.9812688732839497e-05, "loss": 0.7153, "step": 1175 }, { "epoch": 0.4500574052812859, "grad_norm": 0.5782102942466736, "learning_rate": 1.981221087556598e-05, "loss": 0.6796, "step": 1176 }, { "epoch": 0.45044010715652505, "grad_norm": 0.5538005232810974, "learning_rate": 1.9811732415305746e-05, "loss": 0.7175, "step": 1177 }, { "epoch": 0.45082280903176425, "grad_norm": 0.5671499967575073, "learning_rate": 1.9811253352088197e-05, "loss": 0.7017, "step": 1178 }, { "epoch": 0.45120551090700345, "grad_norm": 0.576509952545166, "learning_rate": 1.981077368594277e-05, "loss": 0.6711, "step": 1179 }, { "epoch": 0.45158821278224265, "grad_norm": 0.5235339999198914, "learning_rate": 1.9810293416898944e-05, "loss": 0.7541, "step": 1180 }, { "epoch": 0.45197091465748185, "grad_norm": 0.5522798895835876, "learning_rate": 1.980981254498623e-05, "loss": 0.75, "step": 1181 }, { "epoch": 0.452353616532721, "grad_norm": 0.5231978893280029, "learning_rate": 1.9809331070234185e-05, "loss": 0.7117, "step": 1182 }, { "epoch": 0.4527363184079602, "grad_norm": 0.5666417479515076, "learning_rate": 1.9808848992672392e-05, "loss": 0.6807, "step": 1183 }, { "epoch": 0.4531190202831994, "grad_norm": 0.5716787576675415, "learning_rate": 1.9808366312330476e-05, "loss": 0.7579, "step": 1184 }, { "epoch": 0.4535017221584386, "grad_norm": 0.5339987277984619, "learning_rate": 1.98078830292381e-05, "loss": 0.6897, "step": 1185 }, { "epoch": 0.4538844240336778, "grad_norm": 0.5152406692504883, "learning_rate": 1.9807399143424965e-05, "loss": 0.7034, "step": 1186 }, { "epoch": 0.45426712590891694, "grad_norm": 0.5395106673240662, "learning_rate": 1.9806914654920803e-05, "loss": 0.7156, "step": 1187 }, { "epoch": 0.45464982778415614, "grad_norm": 0.5999433398246765, "learning_rate": 1.980642956375539e-05, "loss": 0.7291, "step": 1188 }, { "epoch": 0.45503252965939534, "grad_norm": 0.5049673318862915, "learning_rate": 1.9805943869958538e-05, "loss": 0.8211, "step": 1189 }, { "epoch": 0.45541523153463453, "grad_norm": 0.572972297668457, "learning_rate": 1.9805457573560087e-05, "loss": 0.7367, "step": 1190 }, { "epoch": 0.45579793340987373, "grad_norm": 0.5572946667671204, "learning_rate": 1.980497067458993e-05, "loss": 0.7502, "step": 1191 }, { "epoch": 0.4561806352851129, "grad_norm": 0.557876706123352, "learning_rate": 1.980448317307798e-05, "loss": 0.7423, "step": 1192 }, { "epoch": 0.4565633371603521, "grad_norm": 0.4964842200279236, "learning_rate": 1.9803995069054204e-05, "loss": 0.7869, "step": 1193 }, { "epoch": 0.4569460390355913, "grad_norm": 0.5532469749450684, "learning_rate": 1.9803506362548588e-05, "loss": 0.7996, "step": 1194 }, { "epoch": 0.4573287409108305, "grad_norm": 0.5647322535514832, "learning_rate": 1.9803017053591168e-05, "loss": 0.7521, "step": 1195 }, { "epoch": 0.4577114427860697, "grad_norm": 0.512520968914032, "learning_rate": 1.980252714221202e-05, "loss": 0.7609, "step": 1196 }, { "epoch": 0.4580941446613088, "grad_norm": 0.5729935765266418, "learning_rate": 1.980203662844124e-05, "loss": 0.66, "step": 1197 }, { "epoch": 0.458476846536548, "grad_norm": 0.691275954246521, "learning_rate": 1.9801545512308982e-05, "loss": 0.7756, "step": 1198 }, { "epoch": 0.4588595484117872, "grad_norm": 0.5463365316390991, "learning_rate": 1.9801053793845415e-05, "loss": 0.7131, "step": 1199 }, { "epoch": 0.4592422502870264, "grad_norm": 0.545625627040863, "learning_rate": 1.9800561473080764e-05, "loss": 0.7096, "step": 1200 }, { "epoch": 0.4596249521622656, "grad_norm": 0.5227223634719849, "learning_rate": 1.9800068550045282e-05, "loss": 0.6323, "step": 1201 }, { "epoch": 0.46000765403750477, "grad_norm": 0.5616353750228882, "learning_rate": 1.9799575024769257e-05, "loss": 0.7081, "step": 1202 }, { "epoch": 0.46039035591274396, "grad_norm": 0.5405026078224182, "learning_rate": 1.979908089728302e-05, "loss": 0.637, "step": 1203 }, { "epoch": 0.46077305778798316, "grad_norm": 0.5239979028701782, "learning_rate": 1.9798586167616943e-05, "loss": 0.6964, "step": 1204 }, { "epoch": 0.46115575966322236, "grad_norm": 0.5032397508621216, "learning_rate": 1.9798090835801418e-05, "loss": 0.7366, "step": 1205 }, { "epoch": 0.46153846153846156, "grad_norm": 0.5186507105827332, "learning_rate": 1.9797594901866887e-05, "loss": 0.7452, "step": 1206 }, { "epoch": 0.4619211634137007, "grad_norm": 0.5679971575737, "learning_rate": 1.979709836584383e-05, "loss": 0.667, "step": 1207 }, { "epoch": 0.4623038652889399, "grad_norm": 0.5616445541381836, "learning_rate": 1.979660122776276e-05, "loss": 0.6696, "step": 1208 }, { "epoch": 0.4626865671641791, "grad_norm": 0.538846492767334, "learning_rate": 1.9796103487654225e-05, "loss": 0.7243, "step": 1209 }, { "epoch": 0.4630692690394183, "grad_norm": 0.5538100004196167, "learning_rate": 1.9795605145548813e-05, "loss": 0.6868, "step": 1210 }, { "epoch": 0.4634519709146575, "grad_norm": 0.5236196517944336, "learning_rate": 1.979510620147715e-05, "loss": 0.7881, "step": 1211 }, { "epoch": 0.46383467278989665, "grad_norm": 0.5713673830032349, "learning_rate": 1.9794606655469895e-05, "loss": 0.7817, "step": 1212 }, { "epoch": 0.46421737466513585, "grad_norm": 0.4885382354259491, "learning_rate": 1.9794106507557748e-05, "loss": 0.8508, "step": 1213 }, { "epoch": 0.46460007654037505, "grad_norm": 0.5285899043083191, "learning_rate": 1.9793605757771446e-05, "loss": 0.7141, "step": 1214 }, { "epoch": 0.46498277841561425, "grad_norm": 0.5704699754714966, "learning_rate": 1.9793104406141757e-05, "loss": 0.6687, "step": 1215 }, { "epoch": 0.46536548029085345, "grad_norm": 0.5315327644348145, "learning_rate": 1.9792602452699496e-05, "loss": 0.6497, "step": 1216 }, { "epoch": 0.4657481821660926, "grad_norm": 0.5379560589790344, "learning_rate": 1.9792099897475505e-05, "loss": 0.5891, "step": 1217 }, { "epoch": 0.4661308840413318, "grad_norm": 0.5214300751686096, "learning_rate": 1.9791596740500673e-05, "loss": 0.6864, "step": 1218 }, { "epoch": 0.466513585916571, "grad_norm": 0.5123255252838135, "learning_rate": 1.9791092981805907e-05, "loss": 0.6534, "step": 1219 }, { "epoch": 0.4668962877918102, "grad_norm": 0.5160566568374634, "learning_rate": 1.979058862142218e-05, "loss": 0.6625, "step": 1220 }, { "epoch": 0.4672789896670494, "grad_norm": 0.5376072525978088, "learning_rate": 1.979008365938048e-05, "loss": 0.6708, "step": 1221 }, { "epoch": 0.46766169154228854, "grad_norm": 0.5503467321395874, "learning_rate": 1.9789578095711837e-05, "loss": 0.621, "step": 1222 }, { "epoch": 0.46804439341752774, "grad_norm": 0.6214192509651184, "learning_rate": 1.9789071930447323e-05, "loss": 0.6862, "step": 1223 }, { "epoch": 0.46842709529276694, "grad_norm": 0.5587108731269836, "learning_rate": 1.9788565163618037e-05, "loss": 0.6894, "step": 1224 }, { "epoch": 0.46880979716800614, "grad_norm": 0.5399854183197021, "learning_rate": 1.9788057795255128e-05, "loss": 0.7399, "step": 1225 }, { "epoch": 0.46919249904324534, "grad_norm": 0.5708882212638855, "learning_rate": 1.978754982538977e-05, "loss": 0.6496, "step": 1226 }, { "epoch": 0.4695752009184845, "grad_norm": 0.56450355052948, "learning_rate": 1.9787041254053178e-05, "loss": 0.6749, "step": 1227 }, { "epoch": 0.4699579027937237, "grad_norm": 0.48988550901412964, "learning_rate": 1.9786532081276615e-05, "loss": 0.77, "step": 1228 }, { "epoch": 0.4703406046689629, "grad_norm": 0.544632613658905, "learning_rate": 1.9786022307091358e-05, "loss": 0.7721, "step": 1229 }, { "epoch": 0.4707233065442021, "grad_norm": 0.6062704920768738, "learning_rate": 1.978551193152874e-05, "loss": 0.728, "step": 1230 }, { "epoch": 0.4711060084194413, "grad_norm": 0.6523524522781372, "learning_rate": 1.978500095462013e-05, "loss": 0.7385, "step": 1231 }, { "epoch": 0.4714887102946804, "grad_norm": 0.5624481439590454, "learning_rate": 1.9784489376396925e-05, "loss": 0.6338, "step": 1232 }, { "epoch": 0.4718714121699196, "grad_norm": 0.7525923848152161, "learning_rate": 1.9783977196890558e-05, "loss": 0.656, "step": 1233 }, { "epoch": 0.4722541140451588, "grad_norm": 0.7398058176040649, "learning_rate": 1.9783464416132507e-05, "loss": 0.7041, "step": 1234 }, { "epoch": 0.472636815920398, "grad_norm": 0.5348504781723022, "learning_rate": 1.9782951034154286e-05, "loss": 0.671, "step": 1235 }, { "epoch": 0.4730195177956372, "grad_norm": 0.625778317451477, "learning_rate": 1.9782437050987443e-05, "loss": 0.6532, "step": 1236 }, { "epoch": 0.47340221967087637, "grad_norm": 0.5594839453697205, "learning_rate": 1.978192246666356e-05, "loss": 0.8554, "step": 1237 }, { "epoch": 0.47378492154611557, "grad_norm": 0.5355791449546814, "learning_rate": 1.9781407281214267e-05, "loss": 0.6611, "step": 1238 }, { "epoch": 0.47416762342135477, "grad_norm": 0.5353894233703613, "learning_rate": 1.9780891494671216e-05, "loss": 0.666, "step": 1239 }, { "epoch": 0.47455032529659397, "grad_norm": 0.9072039723396301, "learning_rate": 1.9780375107066106e-05, "loss": 0.7901, "step": 1240 }, { "epoch": 0.47493302717183317, "grad_norm": 0.6462231278419495, "learning_rate": 1.977985811843067e-05, "loss": 0.7037, "step": 1241 }, { "epoch": 0.4753157290470723, "grad_norm": 0.5111790299415588, "learning_rate": 1.9779340528796684e-05, "loss": 0.6595, "step": 1242 }, { "epoch": 0.4756984309223115, "grad_norm": 0.5016728043556213, "learning_rate": 1.9778822338195944e-05, "loss": 0.7606, "step": 1243 }, { "epoch": 0.4760811327975507, "grad_norm": 0.5085366368293762, "learning_rate": 1.9778303546660304e-05, "loss": 0.6252, "step": 1244 }, { "epoch": 0.4764638346727899, "grad_norm": 0.5292803645133972, "learning_rate": 1.9777784154221642e-05, "loss": 0.6378, "step": 1245 }, { "epoch": 0.4768465365480291, "grad_norm": 0.5785952806472778, "learning_rate": 1.9777264160911875e-05, "loss": 0.6783, "step": 1246 }, { "epoch": 0.47722923842326825, "grad_norm": 0.5814845561981201, "learning_rate": 1.9776743566762956e-05, "loss": 0.7536, "step": 1247 }, { "epoch": 0.47761194029850745, "grad_norm": 0.5012936592102051, "learning_rate": 1.9776222371806883e-05, "loss": 0.5959, "step": 1248 }, { "epoch": 0.47799464217374665, "grad_norm": 0.529917299747467, "learning_rate": 1.9775700576075684e-05, "loss": 0.6767, "step": 1249 }, { "epoch": 0.47837734404898585, "grad_norm": 0.5789620876312256, "learning_rate": 1.9775178179601417e-05, "loss": 0.6638, "step": 1250 }, { "epoch": 0.47876004592422505, "grad_norm": 0.5115686655044556, "learning_rate": 1.9774655182416195e-05, "loss": 0.7072, "step": 1251 }, { "epoch": 0.4791427477994642, "grad_norm": 0.5506501793861389, "learning_rate": 1.9774131584552147e-05, "loss": 0.652, "step": 1252 }, { "epoch": 0.4795254496747034, "grad_norm": 0.6184877157211304, "learning_rate": 1.977360738604146e-05, "loss": 0.6821, "step": 1253 }, { "epoch": 0.4799081515499426, "grad_norm": 0.5370518565177917, "learning_rate": 1.977308258691634e-05, "loss": 0.7001, "step": 1254 }, { "epoch": 0.4802908534251818, "grad_norm": 0.5962867736816406, "learning_rate": 1.9772557187209037e-05, "loss": 0.7278, "step": 1255 }, { "epoch": 0.480673555300421, "grad_norm": 0.5659542679786682, "learning_rate": 1.9772031186951848e-05, "loss": 0.68, "step": 1256 }, { "epoch": 0.48105625717566014, "grad_norm": 0.5202198028564453, "learning_rate": 1.9771504586177082e-05, "loss": 0.7274, "step": 1257 }, { "epoch": 0.48143895905089934, "grad_norm": 0.5611588358879089, "learning_rate": 1.9770977384917116e-05, "loss": 0.6872, "step": 1258 }, { "epoch": 0.48182166092613854, "grad_norm": 0.5047485828399658, "learning_rate": 1.9770449583204336e-05, "loss": 0.6077, "step": 1259 }, { "epoch": 0.48220436280137774, "grad_norm": 0.49799051880836487, "learning_rate": 1.976992118107118e-05, "loss": 0.7485, "step": 1260 }, { "epoch": 0.48258706467661694, "grad_norm": 0.5853896737098694, "learning_rate": 1.9769392178550123e-05, "loss": 0.6704, "step": 1261 }, { "epoch": 0.4829697665518561, "grad_norm": 0.5685197114944458, "learning_rate": 1.976886257567367e-05, "loss": 0.6173, "step": 1262 }, { "epoch": 0.4833524684270953, "grad_norm": 0.5187684297561646, "learning_rate": 1.976833237247437e-05, "loss": 0.7157, "step": 1263 }, { "epoch": 0.4837351703023345, "grad_norm": 0.5328609943389893, "learning_rate": 1.9767801568984797e-05, "loss": 0.63, "step": 1264 }, { "epoch": 0.4841178721775737, "grad_norm": 0.6008513569831848, "learning_rate": 1.9767270165237582e-05, "loss": 0.6786, "step": 1265 }, { "epoch": 0.4845005740528129, "grad_norm": 0.5199552774429321, "learning_rate": 1.9766738161265375e-05, "loss": 0.6562, "step": 1266 }, { "epoch": 0.484883275928052, "grad_norm": 0.5293965935707092, "learning_rate": 1.976620555710087e-05, "loss": 0.6283, "step": 1267 }, { "epoch": 0.4852659778032912, "grad_norm": 0.571964681148529, "learning_rate": 1.9765672352776794e-05, "loss": 0.7506, "step": 1268 }, { "epoch": 0.4856486796785304, "grad_norm": 0.5672798752784729, "learning_rate": 1.9765138548325917e-05, "loss": 0.6645, "step": 1269 }, { "epoch": 0.4860313815537696, "grad_norm": 0.6062446236610413, "learning_rate": 1.9764604143781046e-05, "loss": 0.6707, "step": 1270 }, { "epoch": 0.4864140834290088, "grad_norm": 0.5226308107376099, "learning_rate": 1.9764069139175012e-05, "loss": 0.6746, "step": 1271 }, { "epoch": 0.48679678530424797, "grad_norm": 0.5342475771903992, "learning_rate": 1.9763533534540706e-05, "loss": 0.6724, "step": 1272 }, { "epoch": 0.48717948717948717, "grad_norm": 0.5810508728027344, "learning_rate": 1.9762997329911027e-05, "loss": 0.7579, "step": 1273 }, { "epoch": 0.48756218905472637, "grad_norm": 0.6969314217567444, "learning_rate": 1.9762460525318937e-05, "loss": 0.7458, "step": 1274 }, { "epoch": 0.48794489092996557, "grad_norm": 0.557000458240509, "learning_rate": 1.9761923120797423e-05, "loss": 0.7207, "step": 1275 }, { "epoch": 0.48832759280520477, "grad_norm": 0.5460852980613708, "learning_rate": 1.9761385116379507e-05, "loss": 0.717, "step": 1276 }, { "epoch": 0.4887102946804439, "grad_norm": 0.530967116355896, "learning_rate": 1.9760846512098252e-05, "loss": 0.7386, "step": 1277 }, { "epoch": 0.4890929965556831, "grad_norm": 0.5307694673538208, "learning_rate": 1.9760307307986753e-05, "loss": 0.6835, "step": 1278 }, { "epoch": 0.4894756984309223, "grad_norm": 0.49468573927879333, "learning_rate": 1.975976750407815e-05, "loss": 0.6813, "step": 1279 }, { "epoch": 0.4898584003061615, "grad_norm": 0.5238416194915771, "learning_rate": 1.9759227100405618e-05, "loss": 0.7411, "step": 1280 }, { "epoch": 0.4902411021814007, "grad_norm": 0.5172258615493774, "learning_rate": 1.9758686097002358e-05, "loss": 0.5991, "step": 1281 }, { "epoch": 0.49062380405663986, "grad_norm": 0.5818792581558228, "learning_rate": 1.9758144493901622e-05, "loss": 0.7784, "step": 1282 }, { "epoch": 0.49100650593187906, "grad_norm": 0.5687139630317688, "learning_rate": 1.9757602291136694e-05, "loss": 0.7696, "step": 1283 }, { "epoch": 0.49138920780711826, "grad_norm": 0.5196610689163208, "learning_rate": 1.975705948874089e-05, "loss": 0.7456, "step": 1284 }, { "epoch": 0.49177190968235746, "grad_norm": 0.5685257315635681, "learning_rate": 1.975651608674757e-05, "loss": 0.6639, "step": 1285 }, { "epoch": 0.49215461155759666, "grad_norm": 0.7594094276428223, "learning_rate": 1.9755972085190124e-05, "loss": 0.7258, "step": 1286 }, { "epoch": 0.4925373134328358, "grad_norm": 0.6045129895210266, "learning_rate": 1.9755427484101984e-05, "loss": 0.7545, "step": 1287 }, { "epoch": 0.492920015308075, "grad_norm": 0.5634876489639282, "learning_rate": 1.9754882283516615e-05, "loss": 0.7511, "step": 1288 }, { "epoch": 0.4933027171833142, "grad_norm": 0.534619152545929, "learning_rate": 1.9754336483467528e-05, "loss": 0.6609, "step": 1289 }, { "epoch": 0.4936854190585534, "grad_norm": 0.5590626001358032, "learning_rate": 1.9753790083988256e-05, "loss": 0.7801, "step": 1290 }, { "epoch": 0.4940681209337926, "grad_norm": 0.5630539655685425, "learning_rate": 1.975324308511238e-05, "loss": 0.6393, "step": 1291 }, { "epoch": 0.49445082280903174, "grad_norm": 0.593304455280304, "learning_rate": 1.9752695486873516e-05, "loss": 0.7014, "step": 1292 }, { "epoch": 0.49483352468427094, "grad_norm": 0.5209435224533081, "learning_rate": 1.975214728930531e-05, "loss": 0.683, "step": 1293 }, { "epoch": 0.49521622655951014, "grad_norm": 0.49509626626968384, "learning_rate": 1.975159849244146e-05, "loss": 0.6523, "step": 1294 }, { "epoch": 0.49559892843474934, "grad_norm": 0.5887961387634277, "learning_rate": 1.975104909631568e-05, "loss": 0.6607, "step": 1295 }, { "epoch": 0.49598163030998854, "grad_norm": 0.6323246955871582, "learning_rate": 1.975049910096174e-05, "loss": 0.7624, "step": 1296 }, { "epoch": 0.4963643321852277, "grad_norm": 0.5350232720375061, "learning_rate": 1.9749948506413433e-05, "loss": 0.6919, "step": 1297 }, { "epoch": 0.4967470340604669, "grad_norm": 0.5825594663619995, "learning_rate": 1.97493973127046e-05, "loss": 0.7379, "step": 1298 }, { "epoch": 0.4971297359357061, "grad_norm": 0.5421949028968811, "learning_rate": 1.974884551986911e-05, "loss": 0.6523, "step": 1299 }, { "epoch": 0.4975124378109453, "grad_norm": 0.5585834980010986, "learning_rate": 1.974829312794087e-05, "loss": 0.64, "step": 1300 }, { "epoch": 0.4978951396861845, "grad_norm": 0.5088152289390564, "learning_rate": 1.9747740136953833e-05, "loss": 0.706, "step": 1301 }, { "epoch": 0.49827784156142363, "grad_norm": 0.5460655689239502, "learning_rate": 1.9747186546941976e-05, "loss": 0.6745, "step": 1302 }, { "epoch": 0.49866054343666283, "grad_norm": 0.5044805407524109, "learning_rate": 1.9746632357939318e-05, "loss": 0.6624, "step": 1303 }, { "epoch": 0.49904324531190203, "grad_norm": 0.650441586971283, "learning_rate": 1.9746077569979916e-05, "loss": 0.7139, "step": 1304 }, { "epoch": 0.49942594718714123, "grad_norm": 0.5581428408622742, "learning_rate": 1.9745522183097867e-05, "loss": 0.655, "step": 1305 }, { "epoch": 0.49980864906238043, "grad_norm": 0.5802026391029358, "learning_rate": 1.97449661973273e-05, "loss": 0.6646, "step": 1306 }, { "epoch": 0.5001913509376196, "grad_norm": 0.5137327909469604, "learning_rate": 1.9744409612702378e-05, "loss": 0.6596, "step": 1307 }, { "epoch": 0.5005740528128588, "grad_norm": 0.6012839674949646, "learning_rate": 1.9743852429257305e-05, "loss": 0.7491, "step": 1308 }, { "epoch": 0.500956754688098, "grad_norm": 0.6072969436645508, "learning_rate": 1.974329464702633e-05, "loss": 0.6427, "step": 1309 }, { "epoch": 0.5013394565633371, "grad_norm": 0.6672323346138, "learning_rate": 1.974273626604372e-05, "loss": 0.7423, "step": 1310 }, { "epoch": 0.5017221584385764, "grad_norm": 0.6104515790939331, "learning_rate": 1.974217728634379e-05, "loss": 0.7411, "step": 1311 }, { "epoch": 0.5021048603138155, "grad_norm": 0.5168175101280212, "learning_rate": 1.97416177079609e-05, "loss": 0.7445, "step": 1312 }, { "epoch": 0.5024875621890548, "grad_norm": 0.5647799372673035, "learning_rate": 1.9741057530929425e-05, "loss": 0.7381, "step": 1313 }, { "epoch": 0.5028702640642939, "grad_norm": 0.5679032206535339, "learning_rate": 1.9740496755283796e-05, "loss": 0.7498, "step": 1314 }, { "epoch": 0.5032529659395331, "grad_norm": 0.6036396026611328, "learning_rate": 1.9739935381058476e-05, "loss": 0.6745, "step": 1315 }, { "epoch": 0.5036356678147723, "grad_norm": 0.5271382927894592, "learning_rate": 1.9739373408287962e-05, "loss": 0.728, "step": 1316 }, { "epoch": 0.5040183696900115, "grad_norm": 0.5341216325759888, "learning_rate": 1.9738810837006783e-05, "loss": 0.7376, "step": 1317 }, { "epoch": 0.5044010715652507, "grad_norm": 0.52583909034729, "learning_rate": 1.973824766724952e-05, "loss": 0.6671, "step": 1318 }, { "epoch": 0.5047837734404899, "grad_norm": 0.5861115455627441, "learning_rate": 1.973768389905077e-05, "loss": 0.7312, "step": 1319 }, { "epoch": 0.505166475315729, "grad_norm": 0.5877363681793213, "learning_rate": 1.9737119532445188e-05, "loss": 0.6914, "step": 1320 }, { "epoch": 0.5055491771909683, "grad_norm": 0.516252338886261, "learning_rate": 1.973655456746745e-05, "loss": 0.689, "step": 1321 }, { "epoch": 0.5059318790662074, "grad_norm": 0.5190399289131165, "learning_rate": 1.973598900415228e-05, "loss": 0.7331, "step": 1322 }, { "epoch": 0.5063145809414467, "grad_norm": 0.49807754158973694, "learning_rate": 1.9735422842534427e-05, "loss": 0.694, "step": 1323 }, { "epoch": 0.5066972828166858, "grad_norm": 0.53472900390625, "learning_rate": 1.973485608264869e-05, "loss": 0.6596, "step": 1324 }, { "epoch": 0.507079984691925, "grad_norm": 0.5503033399581909, "learning_rate": 1.973428872452989e-05, "loss": 0.7479, "step": 1325 }, { "epoch": 0.5074626865671642, "grad_norm": 0.5059475898742676, "learning_rate": 1.9733720768212898e-05, "loss": 0.6948, "step": 1326 }, { "epoch": 0.5078453884424033, "grad_norm": 0.5270992517471313, "learning_rate": 1.9733152213732614e-05, "loss": 0.6815, "step": 1327 }, { "epoch": 0.5082280903176426, "grad_norm": 0.6486445069313049, "learning_rate": 1.9732583061123983e-05, "loss": 0.6641, "step": 1328 }, { "epoch": 0.5086107921928817, "grad_norm": 0.5719577670097351, "learning_rate": 1.9732013310421975e-05, "loss": 0.7004, "step": 1329 }, { "epoch": 0.5089934940681209, "grad_norm": 0.5741063952445984, "learning_rate": 1.97314429616616e-05, "loss": 0.6812, "step": 1330 }, { "epoch": 0.5093761959433601, "grad_norm": 0.5202080607414246, "learning_rate": 1.9730872014877915e-05, "loss": 0.6777, "step": 1331 }, { "epoch": 0.5097588978185993, "grad_norm": 0.5318107008934021, "learning_rate": 1.9730300470106e-05, "loss": 0.6588, "step": 1332 }, { "epoch": 0.5101415996938385, "grad_norm": 0.5315122008323669, "learning_rate": 1.9729728327380987e-05, "loss": 0.7025, "step": 1333 }, { "epoch": 0.5105243015690777, "grad_norm": 0.5469088554382324, "learning_rate": 1.9729155586738026e-05, "loss": 0.746, "step": 1334 }, { "epoch": 0.5109070034443168, "grad_norm": 0.519136369228363, "learning_rate": 1.9728582248212312e-05, "loss": 0.6531, "step": 1335 }, { "epoch": 0.5112897053195561, "grad_norm": 0.5522601008415222, "learning_rate": 1.972800831183909e-05, "loss": 0.7141, "step": 1336 }, { "epoch": 0.5116724071947952, "grad_norm": 0.5031684637069702, "learning_rate": 1.972743377765362e-05, "loss": 0.6428, "step": 1337 }, { "epoch": 0.5120551090700345, "grad_norm": 0.616713285446167, "learning_rate": 1.972685864569121e-05, "loss": 0.6978, "step": 1338 }, { "epoch": 0.5124378109452736, "grad_norm": 0.5954180359840393, "learning_rate": 1.9726282915987208e-05, "loss": 0.6789, "step": 1339 }, { "epoch": 0.5128205128205128, "grad_norm": 0.5677024722099304, "learning_rate": 1.972570658857699e-05, "loss": 0.6927, "step": 1340 }, { "epoch": 0.513203214695752, "grad_norm": 0.4738870859146118, "learning_rate": 1.9725129663495976e-05, "loss": 0.6967, "step": 1341 }, { "epoch": 0.5135859165709912, "grad_norm": 0.5972160696983337, "learning_rate": 1.9724552140779616e-05, "loss": 0.6595, "step": 1342 }, { "epoch": 0.5139686184462304, "grad_norm": 0.5165050625801086, "learning_rate": 1.9723974020463403e-05, "loss": 0.7043, "step": 1343 }, { "epoch": 0.5143513203214696, "grad_norm": 0.5164828300476074, "learning_rate": 1.972339530258286e-05, "loss": 0.7225, "step": 1344 }, { "epoch": 0.5147340221967087, "grad_norm": 0.5929709076881409, "learning_rate": 1.9722815987173557e-05, "loss": 0.7282, "step": 1345 }, { "epoch": 0.515116724071948, "grad_norm": 0.5537583231925964, "learning_rate": 1.9722236074271092e-05, "loss": 0.6977, "step": 1346 }, { "epoch": 0.5154994259471871, "grad_norm": 0.4811393618583679, "learning_rate": 1.97216555639111e-05, "loss": 0.7072, "step": 1347 }, { "epoch": 0.5158821278224264, "grad_norm": 0.5779057145118713, "learning_rate": 1.9721074456129257e-05, "loss": 0.7371, "step": 1348 }, { "epoch": 0.5162648296976655, "grad_norm": 0.5510604381561279, "learning_rate": 1.972049275096127e-05, "loss": 0.6676, "step": 1349 }, { "epoch": 0.5166475315729047, "grad_norm": 0.5454850792884827, "learning_rate": 1.9719910448442893e-05, "loss": 0.6833, "step": 1350 }, { "epoch": 0.5170302334481439, "grad_norm": 0.5455752015113831, "learning_rate": 1.9719327548609904e-05, "loss": 0.6227, "step": 1351 }, { "epoch": 0.5174129353233831, "grad_norm": 0.5890477299690247, "learning_rate": 1.971874405149813e-05, "loss": 0.7082, "step": 1352 }, { "epoch": 0.5177956371986223, "grad_norm": 0.5616897940635681, "learning_rate": 1.971815995714342e-05, "loss": 0.705, "step": 1353 }, { "epoch": 0.5181783390738615, "grad_norm": 0.5962591767311096, "learning_rate": 1.971757526558168e-05, "loss": 0.7333, "step": 1354 }, { "epoch": 0.5185610409491006, "grad_norm": 0.5643134713172913, "learning_rate": 1.9716989976848828e-05, "loss": 0.6209, "step": 1355 }, { "epoch": 0.5189437428243399, "grad_norm": 0.510465681552887, "learning_rate": 1.971640409098084e-05, "loss": 0.6491, "step": 1356 }, { "epoch": 0.519326444699579, "grad_norm": 0.5551455020904541, "learning_rate": 1.9715817608013716e-05, "loss": 0.8171, "step": 1357 }, { "epoch": 0.5197091465748183, "grad_norm": 0.5422292351722717, "learning_rate": 1.97152305279835e-05, "loss": 0.6355, "step": 1358 }, { "epoch": 0.5200918484500574, "grad_norm": 0.8132827281951904, "learning_rate": 1.9714642850926264e-05, "loss": 0.686, "step": 1359 }, { "epoch": 0.5204745503252965, "grad_norm": 0.49563005566596985, "learning_rate": 1.971405457687813e-05, "loss": 0.6923, "step": 1360 }, { "epoch": 0.5208572522005358, "grad_norm": 0.5173966288566589, "learning_rate": 1.9713465705875247e-05, "loss": 0.6552, "step": 1361 }, { "epoch": 0.521239954075775, "grad_norm": 0.522104024887085, "learning_rate": 1.9712876237953798e-05, "loss": 0.6828, "step": 1362 }, { "epoch": 0.5216226559510142, "grad_norm": 0.5552956461906433, "learning_rate": 1.971228617315001e-05, "loss": 0.7942, "step": 1363 }, { "epoch": 0.5220053578262533, "grad_norm": 0.5154722929000854, "learning_rate": 1.971169551150015e-05, "loss": 0.667, "step": 1364 }, { "epoch": 0.5223880597014925, "grad_norm": 0.5052900910377502, "learning_rate": 1.9711104253040504e-05, "loss": 0.6133, "step": 1365 }, { "epoch": 0.5227707615767317, "grad_norm": 0.5529662370681763, "learning_rate": 1.9710512397807418e-05, "loss": 0.7067, "step": 1366 }, { "epoch": 0.5231534634519709, "grad_norm": 0.5322500467300415, "learning_rate": 1.9709919945837255e-05, "loss": 0.6923, "step": 1367 }, { "epoch": 0.5235361653272101, "grad_norm": 0.5414328575134277, "learning_rate": 1.9709326897166424e-05, "loss": 0.7104, "step": 1368 }, { "epoch": 0.5239188672024493, "grad_norm": 0.5648317933082581, "learning_rate": 1.9708733251831376e-05, "loss": 0.7165, "step": 1369 }, { "epoch": 0.5243015690776884, "grad_norm": 0.5596868991851807, "learning_rate": 1.9708139009868583e-05, "loss": 0.7047, "step": 1370 }, { "epoch": 0.5246842709529277, "grad_norm": 0.5302459001541138, "learning_rate": 1.970754417131457e-05, "loss": 0.5894, "step": 1371 }, { "epoch": 0.5250669728281668, "grad_norm": 0.5655054450035095, "learning_rate": 1.970694873620589e-05, "loss": 0.6526, "step": 1372 }, { "epoch": 0.5254496747034061, "grad_norm": 0.5534198880195618, "learning_rate": 1.970635270457913e-05, "loss": 0.7237, "step": 1373 }, { "epoch": 0.5258323765786452, "grad_norm": 0.5487428903579712, "learning_rate": 1.970575607647092e-05, "loss": 0.7108, "step": 1374 }, { "epoch": 0.5262150784538844, "grad_norm": 0.49397027492523193, "learning_rate": 1.9705158851917923e-05, "loss": 0.6889, "step": 1375 }, { "epoch": 0.5265977803291236, "grad_norm": 0.5514860153198242, "learning_rate": 1.9704561030956846e-05, "loss": 0.7125, "step": 1376 }, { "epoch": 0.5269804822043628, "grad_norm": 0.509323239326477, "learning_rate": 1.970396261362442e-05, "loss": 0.6703, "step": 1377 }, { "epoch": 0.527363184079602, "grad_norm": 0.49629029631614685, "learning_rate": 1.9703363599957418e-05, "loss": 0.6922, "step": 1378 }, { "epoch": 0.5277458859548412, "grad_norm": 0.5294306874275208, "learning_rate": 1.970276398999266e-05, "loss": 0.6675, "step": 1379 }, { "epoch": 0.5281285878300803, "grad_norm": 0.5770675539970398, "learning_rate": 1.9702163783766987e-05, "loss": 0.7131, "step": 1380 }, { "epoch": 0.5285112897053196, "grad_norm": 0.558680534362793, "learning_rate": 1.9701562981317286e-05, "loss": 0.7282, "step": 1381 }, { "epoch": 0.5288939915805587, "grad_norm": 0.649315595626831, "learning_rate": 1.9700961582680476e-05, "loss": 0.6663, "step": 1382 }, { "epoch": 0.529276693455798, "grad_norm": 0.5313507914543152, "learning_rate": 1.9700359587893513e-05, "loss": 0.6963, "step": 1383 }, { "epoch": 0.5296593953310371, "grad_norm": 0.5567432641983032, "learning_rate": 1.96997569969934e-05, "loss": 0.7451, "step": 1384 }, { "epoch": 0.5300420972062763, "grad_norm": 0.5308191180229187, "learning_rate": 1.9699153810017154e-05, "loss": 0.6425, "step": 1385 }, { "epoch": 0.5304247990815155, "grad_norm": 0.5477274656295776, "learning_rate": 1.969855002700185e-05, "loss": 0.6348, "step": 1386 }, { "epoch": 0.5308075009567547, "grad_norm": 0.5502591133117676, "learning_rate": 1.9697945647984595e-05, "loss": 0.7047, "step": 1387 }, { "epoch": 0.5311902028319939, "grad_norm": 0.5264895558357239, "learning_rate": 1.9697340673002525e-05, "loss": 0.7125, "step": 1388 }, { "epoch": 0.5315729047072331, "grad_norm": 0.5558783411979675, "learning_rate": 1.969673510209282e-05, "loss": 0.7171, "step": 1389 }, { "epoch": 0.5319556065824722, "grad_norm": 0.5855070352554321, "learning_rate": 1.9696128935292692e-05, "loss": 0.6649, "step": 1390 }, { "epoch": 0.5323383084577115, "grad_norm": 0.5193771123886108, "learning_rate": 1.969552217263939e-05, "loss": 0.6991, "step": 1391 }, { "epoch": 0.5327210103329506, "grad_norm": 0.5097098350524902, "learning_rate": 1.969491481417021e-05, "loss": 0.6832, "step": 1392 }, { "epoch": 0.5331037122081899, "grad_norm": 0.5975263118743896, "learning_rate": 1.9694306859922463e-05, "loss": 0.6979, "step": 1393 }, { "epoch": 0.533486414083429, "grad_norm": 0.5090343952178955, "learning_rate": 1.969369830993352e-05, "loss": 0.6589, "step": 1394 }, { "epoch": 0.5338691159586682, "grad_norm": 0.5041846632957458, "learning_rate": 1.9693089164240772e-05, "loss": 0.7723, "step": 1395 }, { "epoch": 0.5342518178339074, "grad_norm": 0.5334237217903137, "learning_rate": 1.9692479422881654e-05, "loss": 0.7361, "step": 1396 }, { "epoch": 0.5346345197091465, "grad_norm": 0.5353453159332275, "learning_rate": 1.969186908589364e-05, "loss": 0.6734, "step": 1397 }, { "epoch": 0.5350172215843858, "grad_norm": 0.5436659455299377, "learning_rate": 1.969125815331423e-05, "loss": 0.7172, "step": 1398 }, { "epoch": 0.535399923459625, "grad_norm": 0.5676171183586121, "learning_rate": 1.9690646625180968e-05, "loss": 0.6894, "step": 1399 }, { "epoch": 0.5357826253348641, "grad_norm": 0.5104290843009949, "learning_rate": 1.9690034501531443e-05, "loss": 0.7198, "step": 1400 }, { "epoch": 0.5361653272101033, "grad_norm": 0.5903849601745605, "learning_rate": 1.9689421782403265e-05, "loss": 0.7236, "step": 1401 }, { "epoch": 0.5365480290853425, "grad_norm": 0.5169079899787903, "learning_rate": 1.9688808467834083e-05, "loss": 0.8066, "step": 1402 }, { "epoch": 0.5369307309605817, "grad_norm": 0.5310946702957153, "learning_rate": 1.9688194557861597e-05, "loss": 0.7051, "step": 1403 }, { "epoch": 0.5373134328358209, "grad_norm": 0.6128215193748474, "learning_rate": 1.9687580052523527e-05, "loss": 0.6036, "step": 1404 }, { "epoch": 0.53769613471106, "grad_norm": 0.6487993597984314, "learning_rate": 1.9686964951857636e-05, "loss": 0.6774, "step": 1405 }, { "epoch": 0.5380788365862993, "grad_norm": 0.5014602541923523, "learning_rate": 1.9686349255901724e-05, "loss": 0.6301, "step": 1406 }, { "epoch": 0.5384615384615384, "grad_norm": 0.6653391718864441, "learning_rate": 1.968573296469363e-05, "loss": 0.7431, "step": 1407 }, { "epoch": 0.5388442403367777, "grad_norm": 0.5498612523078918, "learning_rate": 1.9685116078271224e-05, "loss": 0.718, "step": 1408 }, { "epoch": 0.5392269422120168, "grad_norm": 0.5975430607795715, "learning_rate": 1.9684498596672416e-05, "loss": 0.7444, "step": 1409 }, { "epoch": 0.539609644087256, "grad_norm": 0.5311407446861267, "learning_rate": 1.9683880519935154e-05, "loss": 0.7691, "step": 1410 }, { "epoch": 0.5399923459624952, "grad_norm": 0.5955147743225098, "learning_rate": 1.9683261848097415e-05, "loss": 0.7732, "step": 1411 }, { "epoch": 0.5403750478377344, "grad_norm": 0.509845495223999, "learning_rate": 1.9682642581197225e-05, "loss": 0.6516, "step": 1412 }, { "epoch": 0.5407577497129736, "grad_norm": 0.48705023527145386, "learning_rate": 1.9682022719272633e-05, "loss": 0.6674, "step": 1413 }, { "epoch": 0.5411404515882128, "grad_norm": 0.544196367263794, "learning_rate": 1.9681402262361737e-05, "loss": 0.645, "step": 1414 }, { "epoch": 0.5415231534634519, "grad_norm": 0.627152144908905, "learning_rate": 1.968078121050266e-05, "loss": 0.7445, "step": 1415 }, { "epoch": 0.5419058553386912, "grad_norm": 0.5627769827842712, "learning_rate": 1.968015956373357e-05, "loss": 0.7334, "step": 1416 }, { "epoch": 0.5422885572139303, "grad_norm": 0.5218785405158997, "learning_rate": 1.9679537322092672e-05, "loss": 0.7287, "step": 1417 }, { "epoch": 0.5426712590891696, "grad_norm": 0.5342346429824829, "learning_rate": 1.96789144856182e-05, "loss": 0.6169, "step": 1418 }, { "epoch": 0.5430539609644087, "grad_norm": 0.5178425312042236, "learning_rate": 1.9678291054348432e-05, "loss": 0.7034, "step": 1419 }, { "epoch": 0.5434366628396479, "grad_norm": 0.49896883964538574, "learning_rate": 1.9677667028321674e-05, "loss": 0.6037, "step": 1420 }, { "epoch": 0.5438193647148871, "grad_norm": 0.5381090044975281, "learning_rate": 1.9677042407576282e-05, "loss": 0.697, "step": 1421 }, { "epoch": 0.5442020665901263, "grad_norm": 0.5418798327445984, "learning_rate": 1.9676417192150637e-05, "loss": 0.7362, "step": 1422 }, { "epoch": 0.5445847684653655, "grad_norm": 0.5376259684562683, "learning_rate": 1.9675791382083157e-05, "loss": 0.6709, "step": 1423 }, { "epoch": 0.5449674703406047, "grad_norm": 0.4910535216331482, "learning_rate": 1.9675164977412303e-05, "loss": 0.698, "step": 1424 }, { "epoch": 0.5453501722158438, "grad_norm": 0.5204727649688721, "learning_rate": 1.9674537978176572e-05, "loss": 0.7317, "step": 1425 }, { "epoch": 0.5457328740910831, "grad_norm": 0.5862488746643066, "learning_rate": 1.967391038441449e-05, "loss": 0.792, "step": 1426 }, { "epoch": 0.5461155759663222, "grad_norm": 0.5073144435882568, "learning_rate": 1.9673282196164625e-05, "loss": 0.6511, "step": 1427 }, { "epoch": 0.5464982778415615, "grad_norm": 0.5114550590515137, "learning_rate": 1.9672653413465584e-05, "loss": 0.7809, "step": 1428 }, { "epoch": 0.5468809797168006, "grad_norm": 0.5123797655105591, "learning_rate": 1.9672024036356006e-05, "loss": 0.6672, "step": 1429 }, { "epoch": 0.5472636815920398, "grad_norm": 0.5244854688644409, "learning_rate": 1.967139406487456e-05, "loss": 0.7251, "step": 1430 }, { "epoch": 0.547646383467279, "grad_norm": 0.5916690826416016, "learning_rate": 1.9670763499059978e-05, "loss": 0.6628, "step": 1431 }, { "epoch": 0.5480290853425182, "grad_norm": 0.5644987225532532, "learning_rate": 1.9670132338950992e-05, "loss": 0.7346, "step": 1432 }, { "epoch": 0.5484117872177574, "grad_norm": 0.5635762214660645, "learning_rate": 1.9669500584586394e-05, "loss": 0.6835, "step": 1433 }, { "epoch": 0.5487944890929966, "grad_norm": 0.6269950270652771, "learning_rate": 1.966886823600501e-05, "loss": 0.6584, "step": 1434 }, { "epoch": 0.5491771909682357, "grad_norm": 0.5182039737701416, "learning_rate": 1.9668235293245698e-05, "loss": 0.6324, "step": 1435 }, { "epoch": 0.549559892843475, "grad_norm": 0.4941788613796234, "learning_rate": 1.966760175634735e-05, "loss": 0.7366, "step": 1436 }, { "epoch": 0.5499425947187141, "grad_norm": 0.5396627187728882, "learning_rate": 1.9666967625348907e-05, "loss": 0.7492, "step": 1437 }, { "epoch": 0.5503252965939534, "grad_norm": 0.5781134366989136, "learning_rate": 1.966633290028933e-05, "loss": 0.7684, "step": 1438 }, { "epoch": 0.5507079984691925, "grad_norm": 0.5289052724838257, "learning_rate": 1.966569758120763e-05, "loss": 0.6512, "step": 1439 }, { "epoch": 0.5510907003444316, "grad_norm": 0.5260624885559082, "learning_rate": 1.9665061668142847e-05, "loss": 0.7416, "step": 1440 }, { "epoch": 0.5514734022196709, "grad_norm": 0.5263771414756775, "learning_rate": 1.966442516113406e-05, "loss": 0.6818, "step": 1441 }, { "epoch": 0.55185610409491, "grad_norm": 0.6710653901100159, "learning_rate": 1.966378806022038e-05, "loss": 0.7078, "step": 1442 }, { "epoch": 0.5522388059701493, "grad_norm": 0.6474928855895996, "learning_rate": 1.9663150365440962e-05, "loss": 0.7372, "step": 1443 }, { "epoch": 0.5526215078453884, "grad_norm": 0.48644202947616577, "learning_rate": 1.9662512076834995e-05, "loss": 0.6863, "step": 1444 }, { "epoch": 0.5530042097206276, "grad_norm": 0.5143247246742249, "learning_rate": 1.9661873194441704e-05, "loss": 0.6823, "step": 1445 }, { "epoch": 0.5533869115958668, "grad_norm": 0.5222007632255554, "learning_rate": 1.966123371830035e-05, "loss": 0.6634, "step": 1446 }, { "epoch": 0.553769613471106, "grad_norm": 0.5084363222122192, "learning_rate": 1.9660593648450227e-05, "loss": 0.7412, "step": 1447 }, { "epoch": 0.5541523153463452, "grad_norm": 0.5185539126396179, "learning_rate": 1.965995298493067e-05, "loss": 0.6469, "step": 1448 }, { "epoch": 0.5545350172215844, "grad_norm": 0.4942879378795624, "learning_rate": 1.9659311727781052e-05, "loss": 0.609, "step": 1449 }, { "epoch": 0.5549177190968235, "grad_norm": 0.5638726949691772, "learning_rate": 1.965866987704078e-05, "loss": 0.759, "step": 1450 }, { "epoch": 0.5553004209720628, "grad_norm": 0.5843246579170227, "learning_rate": 1.9658027432749293e-05, "loss": 0.6471, "step": 1451 }, { "epoch": 0.5556831228473019, "grad_norm": 0.5184071660041809, "learning_rate": 1.9657384394946078e-05, "loss": 0.7001, "step": 1452 }, { "epoch": 0.5560658247225412, "grad_norm": 0.49200931191444397, "learning_rate": 1.9656740763670645e-05, "loss": 0.6245, "step": 1453 }, { "epoch": 0.5564485265977803, "grad_norm": 0.616764485836029, "learning_rate": 1.965609653896255e-05, "loss": 0.7366, "step": 1454 }, { "epoch": 0.5568312284730195, "grad_norm": 0.5636755228042603, "learning_rate": 1.965545172086138e-05, "loss": 0.7395, "step": 1455 }, { "epoch": 0.5572139303482587, "grad_norm": 0.5616733431816101, "learning_rate": 1.9654806309406767e-05, "loss": 0.6994, "step": 1456 }, { "epoch": 0.5575966322234979, "grad_norm": 0.572848916053772, "learning_rate": 1.9654160304638364e-05, "loss": 0.6531, "step": 1457 }, { "epoch": 0.5579793340987371, "grad_norm": 0.5838314890861511, "learning_rate": 1.9653513706595877e-05, "loss": 0.6922, "step": 1458 }, { "epoch": 0.5583620359739763, "grad_norm": 0.4823254644870758, "learning_rate": 1.965286651531904e-05, "loss": 0.5881, "step": 1459 }, { "epoch": 0.5587447378492154, "grad_norm": 0.5051684975624084, "learning_rate": 1.965221873084762e-05, "loss": 0.7011, "step": 1460 }, { "epoch": 0.5591274397244547, "grad_norm": 0.602725625038147, "learning_rate": 1.9651570353221427e-05, "loss": 0.7155, "step": 1461 }, { "epoch": 0.5595101415996938, "grad_norm": 0.56202632188797, "learning_rate": 1.9650921382480312e-05, "loss": 0.6948, "step": 1462 }, { "epoch": 0.5598928434749331, "grad_norm": 0.5687233209609985, "learning_rate": 1.9650271818664148e-05, "loss": 0.7145, "step": 1463 }, { "epoch": 0.5602755453501722, "grad_norm": 0.58930903673172, "learning_rate": 1.9649621661812856e-05, "loss": 0.7229, "step": 1464 }, { "epoch": 0.5606582472254114, "grad_norm": 0.5275293588638306, "learning_rate": 1.964897091196639e-05, "loss": 0.6546, "step": 1465 }, { "epoch": 0.5610409491006506, "grad_norm": 0.6472257971763611, "learning_rate": 1.964831956916474e-05, "loss": 0.7125, "step": 1466 }, { "epoch": 0.5614236509758898, "grad_norm": 0.5875588059425354, "learning_rate": 1.964766763344793e-05, "loss": 0.7021, "step": 1467 }, { "epoch": 0.561806352851129, "grad_norm": 0.5256525874137878, "learning_rate": 1.9647015104856026e-05, "loss": 0.7157, "step": 1468 }, { "epoch": 0.5621890547263682, "grad_norm": 0.559641420841217, "learning_rate": 1.964636198342913e-05, "loss": 0.6956, "step": 1469 }, { "epoch": 0.5625717566016073, "grad_norm": 0.6424188613891602, "learning_rate": 1.964570826920737e-05, "loss": 0.6845, "step": 1470 }, { "epoch": 0.5629544584768466, "grad_norm": 0.6070991158485413, "learning_rate": 1.964505396223093e-05, "loss": 0.7542, "step": 1471 }, { "epoch": 0.5633371603520857, "grad_norm": 0.6664294600486755, "learning_rate": 1.964439906254001e-05, "loss": 0.6744, "step": 1472 }, { "epoch": 0.563719862227325, "grad_norm": 0.5328034162521362, "learning_rate": 1.9643743570174855e-05, "loss": 0.6768, "step": 1473 }, { "epoch": 0.5641025641025641, "grad_norm": 0.5598429441452026, "learning_rate": 1.9643087485175752e-05, "loss": 0.6069, "step": 1474 }, { "epoch": 0.5644852659778032, "grad_norm": 0.5825220346450806, "learning_rate": 1.9642430807583017e-05, "loss": 0.7021, "step": 1475 }, { "epoch": 0.5648679678530425, "grad_norm": 0.5010420083999634, "learning_rate": 1.9641773537437006e-05, "loss": 0.6485, "step": 1476 }, { "epoch": 0.5652506697282816, "grad_norm": 0.6629123091697693, "learning_rate": 1.964111567477811e-05, "loss": 0.7062, "step": 1477 }, { "epoch": 0.5656333716035209, "grad_norm": 0.5777814984321594, "learning_rate": 1.964045721964675e-05, "loss": 0.7577, "step": 1478 }, { "epoch": 0.56601607347876, "grad_norm": 0.5641427636146545, "learning_rate": 1.96397981720834e-05, "loss": 0.6318, "step": 1479 }, { "epoch": 0.5663987753539992, "grad_norm": 0.5449649691581726, "learning_rate": 1.963913853212855e-05, "loss": 0.7014, "step": 1480 }, { "epoch": 0.5667814772292384, "grad_norm": 0.6006320714950562, "learning_rate": 1.9638478299822746e-05, "loss": 0.6945, "step": 1481 }, { "epoch": 0.5671641791044776, "grad_norm": 0.5194278955459595, "learning_rate": 1.9637817475206554e-05, "loss": 0.6364, "step": 1482 }, { "epoch": 0.5675468809797168, "grad_norm": 0.5044655203819275, "learning_rate": 1.963715605832059e-05, "loss": 0.6738, "step": 1483 }, { "epoch": 0.567929582854956, "grad_norm": 0.6247344017028809, "learning_rate": 1.9636494049205493e-05, "loss": 0.7244, "step": 1484 }, { "epoch": 0.5683122847301951, "grad_norm": 0.6157674789428711, "learning_rate": 1.963583144790195e-05, "loss": 0.6724, "step": 1485 }, { "epoch": 0.5686949866054344, "grad_norm": 0.6908137202262878, "learning_rate": 1.9635168254450673e-05, "loss": 0.7029, "step": 1486 }, { "epoch": 0.5690776884806735, "grad_norm": 0.5351104140281677, "learning_rate": 1.963450446889243e-05, "loss": 0.6486, "step": 1487 }, { "epoch": 0.5694603903559128, "grad_norm": 0.5820773839950562, "learning_rate": 1.9633840091268e-05, "loss": 0.6132, "step": 1488 }, { "epoch": 0.5698430922311519, "grad_norm": 0.6177107095718384, "learning_rate": 1.9633175121618212e-05, "loss": 0.7807, "step": 1489 }, { "epoch": 0.5702257941063911, "grad_norm": 0.6470410823822021, "learning_rate": 1.9632509559983937e-05, "loss": 0.7388, "step": 1490 }, { "epoch": 0.5706084959816303, "grad_norm": 0.5660357475280762, "learning_rate": 1.963184340640607e-05, "loss": 0.7258, "step": 1491 }, { "epoch": 0.5709911978568695, "grad_norm": 0.5911499857902527, "learning_rate": 1.9631176660925555e-05, "loss": 0.6856, "step": 1492 }, { "epoch": 0.5713738997321087, "grad_norm": 0.6619125008583069, "learning_rate": 1.9630509323583356e-05, "loss": 0.6849, "step": 1493 }, { "epoch": 0.5717566016073479, "grad_norm": 0.5437648892402649, "learning_rate": 1.9629841394420487e-05, "loss": 0.7545, "step": 1494 }, { "epoch": 0.572139303482587, "grad_norm": 0.543850302696228, "learning_rate": 1.9629172873477995e-05, "loss": 0.6478, "step": 1495 }, { "epoch": 0.5725220053578263, "grad_norm": 0.5994136333465576, "learning_rate": 1.962850376079696e-05, "loss": 0.7556, "step": 1496 }, { "epoch": 0.5729047072330654, "grad_norm": 0.5513026714324951, "learning_rate": 1.96278340564185e-05, "loss": 0.7186, "step": 1497 }, { "epoch": 0.5732874091083047, "grad_norm": 0.6127933859825134, "learning_rate": 1.9627163760383777e-05, "loss": 0.693, "step": 1498 }, { "epoch": 0.5736701109835438, "grad_norm": 0.571232795715332, "learning_rate": 1.9626492872733974e-05, "loss": 0.7295, "step": 1499 }, { "epoch": 0.574052812858783, "grad_norm": 0.5419303178787231, "learning_rate": 1.9625821393510323e-05, "loss": 0.6931, "step": 1500 }, { "epoch": 0.5744355147340222, "grad_norm": 0.5883557200431824, "learning_rate": 1.9625149322754092e-05, "loss": 0.6682, "step": 1501 }, { "epoch": 0.5748182166092614, "grad_norm": 0.5479043126106262, "learning_rate": 1.9624476660506574e-05, "loss": 0.7741, "step": 1502 }, { "epoch": 0.5752009184845006, "grad_norm": 0.5455323457717896, "learning_rate": 1.9623803406809106e-05, "loss": 0.6698, "step": 1503 }, { "epoch": 0.5755836203597398, "grad_norm": 0.5867158770561218, "learning_rate": 1.962312956170307e-05, "loss": 0.7133, "step": 1504 }, { "epoch": 0.5759663222349789, "grad_norm": 0.6857865452766418, "learning_rate": 1.9622455125229868e-05, "loss": 0.6681, "step": 1505 }, { "epoch": 0.5763490241102182, "grad_norm": 0.6241140961647034, "learning_rate": 1.962178009743095e-05, "loss": 0.6931, "step": 1506 }, { "epoch": 0.5767317259854573, "grad_norm": 0.5433399081230164, "learning_rate": 1.9621104478347788e-05, "loss": 0.6655, "step": 1507 }, { "epoch": 0.5771144278606966, "grad_norm": 0.5045892000198364, "learning_rate": 1.9620428268021915e-05, "loss": 0.7492, "step": 1508 }, { "epoch": 0.5774971297359357, "grad_norm": 0.6546459794044495, "learning_rate": 1.9619751466494882e-05, "loss": 0.692, "step": 1509 }, { "epoch": 0.5778798316111748, "grad_norm": 0.5830736756324768, "learning_rate": 1.9619074073808275e-05, "loss": 0.716, "step": 1510 }, { "epoch": 0.5782625334864141, "grad_norm": 0.5677217245101929, "learning_rate": 1.9618396090003723e-05, "loss": 0.7906, "step": 1511 }, { "epoch": 0.5786452353616532, "grad_norm": 0.5160881280899048, "learning_rate": 1.961771751512289e-05, "loss": 0.625, "step": 1512 }, { "epoch": 0.5790279372368925, "grad_norm": 0.5519261956214905, "learning_rate": 1.9617038349207482e-05, "loss": 0.694, "step": 1513 }, { "epoch": 0.5794106391121316, "grad_norm": 0.5584532022476196, "learning_rate": 1.9616358592299233e-05, "loss": 0.7286, "step": 1514 }, { "epoch": 0.5797933409873708, "grad_norm": 0.5756310224533081, "learning_rate": 1.961567824443991e-05, "loss": 0.6754, "step": 1515 }, { "epoch": 0.58017604286261, "grad_norm": 0.5685321092605591, "learning_rate": 1.9614997305671327e-05, "loss": 0.6268, "step": 1516 }, { "epoch": 0.5805587447378492, "grad_norm": 0.5493777990341187, "learning_rate": 1.9614315776035334e-05, "loss": 0.7533, "step": 1517 }, { "epoch": 0.5809414466130884, "grad_norm": 0.6185341477394104, "learning_rate": 1.96136336555738e-05, "loss": 0.7217, "step": 1518 }, { "epoch": 0.5813241484883276, "grad_norm": 0.5998911261558533, "learning_rate": 1.9612950944328654e-05, "loss": 0.7322, "step": 1519 }, { "epoch": 0.5817068503635667, "grad_norm": 0.5957605838775635, "learning_rate": 1.961226764234185e-05, "loss": 0.7645, "step": 1520 }, { "epoch": 0.582089552238806, "grad_norm": 0.538483738899231, "learning_rate": 1.9611583749655374e-05, "loss": 0.5997, "step": 1521 }, { "epoch": 0.5824722541140451, "grad_norm": 0.5337823033332825, "learning_rate": 1.9610899266311258e-05, "loss": 0.6983, "step": 1522 }, { "epoch": 0.5828549559892844, "grad_norm": 0.5592231154441833, "learning_rate": 1.961021419235156e-05, "loss": 0.728, "step": 1523 }, { "epoch": 0.5832376578645235, "grad_norm": 0.49810466170310974, "learning_rate": 1.960952852781838e-05, "loss": 0.6967, "step": 1524 }, { "epoch": 0.5836203597397627, "grad_norm": 0.5395834445953369, "learning_rate": 1.9608842272753858e-05, "loss": 0.6864, "step": 1525 }, { "epoch": 0.5840030616150019, "grad_norm": 0.5759785771369934, "learning_rate": 1.9608155427200162e-05, "loss": 0.6673, "step": 1526 }, { "epoch": 0.5843857634902411, "grad_norm": 0.5607790946960449, "learning_rate": 1.9607467991199508e-05, "loss": 0.7757, "step": 1527 }, { "epoch": 0.5847684653654803, "grad_norm": 0.5442545413970947, "learning_rate": 1.9606779964794135e-05, "loss": 0.6309, "step": 1528 }, { "epoch": 0.5851511672407195, "grad_norm": 0.568960964679718, "learning_rate": 1.9606091348026323e-05, "loss": 0.7486, "step": 1529 }, { "epoch": 0.5855338691159586, "grad_norm": 0.532088577747345, "learning_rate": 1.9605402140938387e-05, "loss": 0.6819, "step": 1530 }, { "epoch": 0.5859165709911979, "grad_norm": 0.5004754662513733, "learning_rate": 1.960471234357269e-05, "loss": 0.7104, "step": 1531 }, { "epoch": 0.586299272866437, "grad_norm": 0.5872828960418701, "learning_rate": 1.9604021955971615e-05, "loss": 0.691, "step": 1532 }, { "epoch": 0.5866819747416763, "grad_norm": 0.5860136151313782, "learning_rate": 1.960333097817759e-05, "loss": 0.6817, "step": 1533 }, { "epoch": 0.5870646766169154, "grad_norm": 0.532212495803833, "learning_rate": 1.9602639410233074e-05, "loss": 0.7113, "step": 1534 }, { "epoch": 0.5874473784921546, "grad_norm": 0.5687962174415588, "learning_rate": 1.960194725218057e-05, "loss": 0.7762, "step": 1535 }, { "epoch": 0.5878300803673938, "grad_norm": 0.5478495359420776, "learning_rate": 1.9601254504062614e-05, "loss": 0.71, "step": 1536 }, { "epoch": 0.588212782242633, "grad_norm": 0.548232913017273, "learning_rate": 1.9600561165921774e-05, "loss": 0.796, "step": 1537 }, { "epoch": 0.5885954841178722, "grad_norm": 0.525065004825592, "learning_rate": 1.959986723780066e-05, "loss": 0.7463, "step": 1538 }, { "epoch": 0.5889781859931114, "grad_norm": 0.5928045511245728, "learning_rate": 1.9599172719741914e-05, "loss": 0.7171, "step": 1539 }, { "epoch": 0.5893608878683505, "grad_norm": 0.5054778456687927, "learning_rate": 1.9598477611788214e-05, "loss": 0.6151, "step": 1540 }, { "epoch": 0.5897435897435898, "grad_norm": 0.5453140735626221, "learning_rate": 1.9597781913982278e-05, "loss": 0.7612, "step": 1541 }, { "epoch": 0.5901262916188289, "grad_norm": 0.5268307328224182, "learning_rate": 1.9597085626366863e-05, "loss": 0.7058, "step": 1542 }, { "epoch": 0.5905089934940682, "grad_norm": 0.5441411733627319, "learning_rate": 1.959638874898475e-05, "loss": 0.6455, "step": 1543 }, { "epoch": 0.5908916953693073, "grad_norm": 0.5381253957748413, "learning_rate": 1.959569128187877e-05, "loss": 0.7045, "step": 1544 }, { "epoch": 0.5912743972445464, "grad_norm": 0.5452741980552673, "learning_rate": 1.9594993225091784e-05, "loss": 0.6939, "step": 1545 }, { "epoch": 0.5916570991197857, "grad_norm": 0.5623897314071655, "learning_rate": 1.9594294578666685e-05, "loss": 0.6569, "step": 1546 }, { "epoch": 0.5920398009950248, "grad_norm": 0.5128586888313293, "learning_rate": 1.9593595342646407e-05, "loss": 0.6749, "step": 1547 }, { "epoch": 0.5924225028702641, "grad_norm": 0.5699991583824158, "learning_rate": 1.9592895517073925e-05, "loss": 0.7506, "step": 1548 }, { "epoch": 0.5928052047455032, "grad_norm": 0.5161445140838623, "learning_rate": 1.9592195101992238e-05, "loss": 0.6281, "step": 1549 }, { "epoch": 0.5931879066207424, "grad_norm": 0.5361670851707458, "learning_rate": 1.95914940974444e-05, "loss": 0.6673, "step": 1550 }, { "epoch": 0.5935706084959816, "grad_norm": 0.5245701670646667, "learning_rate": 1.9590792503473477e-05, "loss": 0.6527, "step": 1551 }, { "epoch": 0.5939533103712208, "grad_norm": 0.5230910778045654, "learning_rate": 1.959009032012259e-05, "loss": 0.6874, "step": 1552 }, { "epoch": 0.59433601224646, "grad_norm": 0.6149885058403015, "learning_rate": 1.958938754743489e-05, "loss": 0.7072, "step": 1553 }, { "epoch": 0.5947187141216992, "grad_norm": 0.5234848856925964, "learning_rate": 1.958868418545356e-05, "loss": 0.6606, "step": 1554 }, { "epoch": 0.5951014159969383, "grad_norm": 0.5014810562133789, "learning_rate": 1.9587980234221833e-05, "loss": 0.7, "step": 1555 }, { "epoch": 0.5954841178721776, "grad_norm": 0.5532161593437195, "learning_rate": 1.958727569378296e-05, "loss": 0.6887, "step": 1556 }, { "epoch": 0.5958668197474167, "grad_norm": 0.4791255295276642, "learning_rate": 1.9586570564180238e-05, "loss": 0.696, "step": 1557 }, { "epoch": 0.596249521622656, "grad_norm": 0.5221503376960754, "learning_rate": 1.9585864845457e-05, "loss": 0.6145, "step": 1558 }, { "epoch": 0.5966322234978951, "grad_norm": 0.5222460031509399, "learning_rate": 1.9585158537656618e-05, "loss": 0.7241, "step": 1559 }, { "epoch": 0.5970149253731343, "grad_norm": 0.5904268026351929, "learning_rate": 1.9584451640822493e-05, "loss": 0.706, "step": 1560 }, { "epoch": 0.5973976272483735, "grad_norm": 0.5746265649795532, "learning_rate": 1.9583744154998065e-05, "loss": 0.6252, "step": 1561 }, { "epoch": 0.5977803291236127, "grad_norm": 0.5449187755584717, "learning_rate": 1.958303608022681e-05, "loss": 0.7069, "step": 1562 }, { "epoch": 0.5981630309988519, "grad_norm": 0.5712835192680359, "learning_rate": 1.9582327416552248e-05, "loss": 0.6319, "step": 1563 }, { "epoch": 0.5985457328740911, "grad_norm": 0.5061448812484741, "learning_rate": 1.958161816401792e-05, "loss": 0.7446, "step": 1564 }, { "epoch": 0.5989284347493302, "grad_norm": 0.5179334878921509, "learning_rate": 1.9580908322667416e-05, "loss": 0.5715, "step": 1565 }, { "epoch": 0.5993111366245695, "grad_norm": 0.48915961384773254, "learning_rate": 1.9580197892544354e-05, "loss": 0.6025, "step": 1566 }, { "epoch": 0.5996938384998086, "grad_norm": 0.6005991697311401, "learning_rate": 1.95794868736924e-05, "loss": 0.669, "step": 1567 }, { "epoch": 0.6000765403750479, "grad_norm": 0.5055102109909058, "learning_rate": 1.9578775266155236e-05, "loss": 0.6823, "step": 1568 }, { "epoch": 0.600459242250287, "grad_norm": 0.4862108826637268, "learning_rate": 1.9578063069976604e-05, "loss": 0.6302, "step": 1569 }, { "epoch": 0.6008419441255262, "grad_norm": 0.5503808259963989, "learning_rate": 1.9577350285200262e-05, "loss": 0.6842, "step": 1570 }, { "epoch": 0.6012246460007654, "grad_norm": 0.5669888854026794, "learning_rate": 1.9576636911870016e-05, "loss": 0.6637, "step": 1571 }, { "epoch": 0.6016073478760046, "grad_norm": 0.6233569979667664, "learning_rate": 1.9575922950029704e-05, "loss": 0.6865, "step": 1572 }, { "epoch": 0.6019900497512438, "grad_norm": 0.5546138286590576, "learning_rate": 1.95752083997232e-05, "loss": 0.7022, "step": 1573 }, { "epoch": 0.602372751626483, "grad_norm": 0.525173008441925, "learning_rate": 1.9574493260994417e-05, "loss": 0.6174, "step": 1574 }, { "epoch": 0.6027554535017221, "grad_norm": 0.5212791562080383, "learning_rate": 1.9573777533887305e-05, "loss": 0.6756, "step": 1575 }, { "epoch": 0.6031381553769614, "grad_norm": 0.50221186876297, "learning_rate": 1.957306121844584e-05, "loss": 0.6127, "step": 1576 }, { "epoch": 0.6035208572522005, "grad_norm": 0.5468408465385437, "learning_rate": 1.9572344314714047e-05, "loss": 0.735, "step": 1577 }, { "epoch": 0.6039035591274398, "grad_norm": 0.5563971996307373, "learning_rate": 1.9571626822735975e-05, "loss": 0.7037, "step": 1578 }, { "epoch": 0.6042862610026789, "grad_norm": 0.5318216681480408, "learning_rate": 1.9570908742555724e-05, "loss": 0.7009, "step": 1579 }, { "epoch": 0.604668962877918, "grad_norm": 0.5090997815132141, "learning_rate": 1.957019007421742e-05, "loss": 0.6624, "step": 1580 }, { "epoch": 0.6050516647531573, "grad_norm": 0.5679251551628113, "learning_rate": 1.9569470817765226e-05, "loss": 0.604, "step": 1581 }, { "epoch": 0.6054343666283964, "grad_norm": 0.5535605549812317, "learning_rate": 1.956875097324334e-05, "loss": 0.6736, "step": 1582 }, { "epoch": 0.6058170685036357, "grad_norm": 0.5660558938980103, "learning_rate": 1.9568030540696003e-05, "loss": 0.693, "step": 1583 }, { "epoch": 0.6061997703788748, "grad_norm": 0.481094628572464, "learning_rate": 1.9567309520167483e-05, "loss": 0.6923, "step": 1584 }, { "epoch": 0.606582472254114, "grad_norm": 0.5238268375396729, "learning_rate": 1.9566587911702088e-05, "loss": 0.7218, "step": 1585 }, { "epoch": 0.6069651741293532, "grad_norm": 0.5873443484306335, "learning_rate": 1.956586571534417e-05, "loss": 0.6589, "step": 1586 }, { "epoch": 0.6073478760045924, "grad_norm": 0.5287145972251892, "learning_rate": 1.9565142931138103e-05, "loss": 0.7043, "step": 1587 }, { "epoch": 0.6077305778798316, "grad_norm": 0.5206516981124878, "learning_rate": 1.9564419559128302e-05, "loss": 0.662, "step": 1588 }, { "epoch": 0.6081132797550708, "grad_norm": 0.5219056010246277, "learning_rate": 1.9563695599359233e-05, "loss": 0.7585, "step": 1589 }, { "epoch": 0.6084959816303099, "grad_norm": 0.600476861000061, "learning_rate": 1.9562971051875367e-05, "loss": 0.7358, "step": 1590 }, { "epoch": 0.6088786835055492, "grad_norm": 0.4753425419330597, "learning_rate": 1.9562245916721245e-05, "loss": 0.7168, "step": 1591 }, { "epoch": 0.6092613853807883, "grad_norm": 0.5960062742233276, "learning_rate": 1.9561520193941424e-05, "loss": 0.682, "step": 1592 }, { "epoch": 0.6096440872560276, "grad_norm": 0.5524174571037292, "learning_rate": 1.9560793883580496e-05, "loss": 0.6902, "step": 1593 }, { "epoch": 0.6100267891312667, "grad_norm": 0.5929354429244995, "learning_rate": 1.9560066985683103e-05, "loss": 0.6884, "step": 1594 }, { "epoch": 0.6104094910065059, "grad_norm": 0.578347384929657, "learning_rate": 1.955933950029391e-05, "loss": 0.7009, "step": 1595 }, { "epoch": 0.6107921928817451, "grad_norm": 0.5395039319992065, "learning_rate": 1.955861142745762e-05, "loss": 0.6851, "step": 1596 }, { "epoch": 0.6111748947569843, "grad_norm": 0.5647489428520203, "learning_rate": 1.955788276721898e-05, "loss": 0.6869, "step": 1597 }, { "epoch": 0.6115575966322235, "grad_norm": 0.5871720910072327, "learning_rate": 1.9557153519622768e-05, "loss": 0.7341, "step": 1598 }, { "epoch": 0.6119402985074627, "grad_norm": 0.5364786386489868, "learning_rate": 1.9556423684713795e-05, "loss": 0.6638, "step": 1599 }, { "epoch": 0.6123230003827018, "grad_norm": 0.6204370856285095, "learning_rate": 1.9555693262536914e-05, "loss": 0.7331, "step": 1600 }, { "epoch": 0.6127057022579411, "grad_norm": 0.5359656810760498, "learning_rate": 1.9554962253137014e-05, "loss": 0.6995, "step": 1601 }, { "epoch": 0.6130884041331802, "grad_norm": 0.5319023132324219, "learning_rate": 1.955423065655901e-05, "loss": 0.7081, "step": 1602 }, { "epoch": 0.6134711060084195, "grad_norm": 0.6231010556221008, "learning_rate": 1.955349847284787e-05, "loss": 0.7696, "step": 1603 }, { "epoch": 0.6138538078836586, "grad_norm": 0.5613243579864502, "learning_rate": 1.955276570204858e-05, "loss": 0.7199, "step": 1604 }, { "epoch": 0.6142365097588978, "grad_norm": 0.5897523164749146, "learning_rate": 1.9552032344206173e-05, "loss": 0.6879, "step": 1605 }, { "epoch": 0.614619211634137, "grad_norm": 0.5714329481124878, "learning_rate": 1.955129839936572e-05, "loss": 0.6465, "step": 1606 }, { "epoch": 0.6150019135093762, "grad_norm": 0.5618292093276978, "learning_rate": 1.9550563867572315e-05, "loss": 0.6997, "step": 1607 }, { "epoch": 0.6153846153846154, "grad_norm": 0.5401606559753418, "learning_rate": 1.954982874887111e-05, "loss": 0.6742, "step": 1608 }, { "epoch": 0.6157673172598546, "grad_norm": 0.5059478282928467, "learning_rate": 1.9549093043307266e-05, "loss": 0.7008, "step": 1609 }, { "epoch": 0.6161500191350937, "grad_norm": 0.48666757345199585, "learning_rate": 1.9548356750926008e-05, "loss": 0.6534, "step": 1610 }, { "epoch": 0.616532721010333, "grad_norm": 0.5100866556167603, "learning_rate": 1.9547619871772575e-05, "loss": 0.6509, "step": 1611 }, { "epoch": 0.6169154228855721, "grad_norm": 0.6310868859291077, "learning_rate": 1.9546882405892247e-05, "loss": 0.7182, "step": 1612 }, { "epoch": 0.6172981247608114, "grad_norm": 0.5471295714378357, "learning_rate": 1.954614435333035e-05, "loss": 0.6977, "step": 1613 }, { "epoch": 0.6176808266360505, "grad_norm": 0.5331815481185913, "learning_rate": 1.9545405714132236e-05, "loss": 0.7494, "step": 1614 }, { "epoch": 0.6180635285112897, "grad_norm": 0.6361591219902039, "learning_rate": 1.9544666488343296e-05, "loss": 0.6834, "step": 1615 }, { "epoch": 0.6184462303865289, "grad_norm": 0.5564016699790955, "learning_rate": 1.9543926676008964e-05, "loss": 0.748, "step": 1616 }, { "epoch": 0.618828932261768, "grad_norm": 0.5565066337585449, "learning_rate": 1.9543186277174694e-05, "loss": 0.6933, "step": 1617 }, { "epoch": 0.6192116341370073, "grad_norm": 0.5591948628425598, "learning_rate": 1.954244529188599e-05, "loss": 0.7324, "step": 1618 }, { "epoch": 0.6195943360122464, "grad_norm": 0.5414474010467529, "learning_rate": 1.954170372018839e-05, "loss": 0.7272, "step": 1619 }, { "epoch": 0.6199770378874856, "grad_norm": 0.5388268828392029, "learning_rate": 1.9540961562127457e-05, "loss": 0.7492, "step": 1620 }, { "epoch": 0.6203597397627248, "grad_norm": 0.5379191040992737, "learning_rate": 1.9540218817748808e-05, "loss": 0.7332, "step": 1621 }, { "epoch": 0.620742441637964, "grad_norm": 0.5719488859176636, "learning_rate": 1.9539475487098083e-05, "loss": 0.6932, "step": 1622 }, { "epoch": 0.6211251435132032, "grad_norm": 0.5252376198768616, "learning_rate": 1.9538731570220962e-05, "loss": 0.662, "step": 1623 }, { "epoch": 0.6215078453884424, "grad_norm": 0.5677130818367004, "learning_rate": 1.9537987067163156e-05, "loss": 0.7634, "step": 1624 }, { "epoch": 0.6218905472636815, "grad_norm": 0.5611857175827026, "learning_rate": 1.9537241977970426e-05, "loss": 0.803, "step": 1625 }, { "epoch": 0.6222732491389208, "grad_norm": 0.5566733479499817, "learning_rate": 1.9536496302688556e-05, "loss": 0.7209, "step": 1626 }, { "epoch": 0.6226559510141599, "grad_norm": 0.6072419285774231, "learning_rate": 1.9535750041363363e-05, "loss": 0.7829, "step": 1627 }, { "epoch": 0.6230386528893992, "grad_norm": 0.5002434849739075, "learning_rate": 1.9535003194040713e-05, "loss": 0.6442, "step": 1628 }, { "epoch": 0.6234213547646383, "grad_norm": 0.5229623913764954, "learning_rate": 1.95342557607665e-05, "loss": 0.7293, "step": 1629 }, { "epoch": 0.6238040566398775, "grad_norm": 0.5276324152946472, "learning_rate": 1.9533507741586664e-05, "loss": 0.6833, "step": 1630 }, { "epoch": 0.6241867585151167, "grad_norm": 0.5351231694221497, "learning_rate": 1.953275913654716e-05, "loss": 0.68, "step": 1631 }, { "epoch": 0.6245694603903559, "grad_norm": 0.49417850375175476, "learning_rate": 1.9532009945694e-05, "loss": 0.7179, "step": 1632 }, { "epoch": 0.6249521622655951, "grad_norm": 0.5580313801765442, "learning_rate": 1.9531260169073222e-05, "loss": 0.69, "step": 1633 }, { "epoch": 0.6253348641408343, "grad_norm": 0.5742224454879761, "learning_rate": 1.9530509806730896e-05, "loss": 0.6653, "step": 1634 }, { "epoch": 0.6257175660160735, "grad_norm": 0.5318869948387146, "learning_rate": 1.952975885871314e-05, "loss": 0.6907, "step": 1635 }, { "epoch": 0.6261002678913127, "grad_norm": 0.4785587191581726, "learning_rate": 1.9529007325066103e-05, "loss": 0.6345, "step": 1636 }, { "epoch": 0.6264829697665518, "grad_norm": 0.5589016675949097, "learning_rate": 1.9528255205835966e-05, "loss": 0.806, "step": 1637 }, { "epoch": 0.6268656716417911, "grad_norm": 0.5109735131263733, "learning_rate": 1.9527502501068948e-05, "loss": 0.7049, "step": 1638 }, { "epoch": 0.6272483735170302, "grad_norm": 0.5262353420257568, "learning_rate": 1.9526749210811306e-05, "loss": 0.711, "step": 1639 }, { "epoch": 0.6276310753922695, "grad_norm": 0.5422343015670776, "learning_rate": 1.9525995335109333e-05, "loss": 0.6543, "step": 1640 }, { "epoch": 0.6280137772675086, "grad_norm": 0.5026019215583801, "learning_rate": 1.9525240874009354e-05, "loss": 0.7356, "step": 1641 }, { "epoch": 0.6283964791427478, "grad_norm": 0.5333892703056335, "learning_rate": 1.9524485827557737e-05, "loss": 0.7301, "step": 1642 }, { "epoch": 0.628779181017987, "grad_norm": 0.5616320371627808, "learning_rate": 1.952373019580087e-05, "loss": 0.7028, "step": 1643 }, { "epoch": 0.6291618828932262, "grad_norm": 0.545152485370636, "learning_rate": 1.9522973978785204e-05, "loss": 0.569, "step": 1644 }, { "epoch": 0.6295445847684654, "grad_norm": 0.5837751030921936, "learning_rate": 1.95222171765572e-05, "loss": 0.657, "step": 1645 }, { "epoch": 0.6299272866437046, "grad_norm": 0.5100137591362, "learning_rate": 1.9521459789163372e-05, "loss": 0.7195, "step": 1646 }, { "epoch": 0.6303099885189437, "grad_norm": 0.4836386442184448, "learning_rate": 1.9520701816650262e-05, "loss": 0.705, "step": 1647 }, { "epoch": 0.630692690394183, "grad_norm": 0.5696287155151367, "learning_rate": 1.9519943259064444e-05, "loss": 0.6083, "step": 1648 }, { "epoch": 0.6310753922694221, "grad_norm": 0.5313151478767395, "learning_rate": 1.951918411645254e-05, "loss": 0.6802, "step": 1649 }, { "epoch": 0.6314580941446614, "grad_norm": 0.5342072248458862, "learning_rate": 1.9518424388861196e-05, "loss": 0.6993, "step": 1650 }, { "epoch": 0.6318407960199005, "grad_norm": 0.5382024645805359, "learning_rate": 1.9517664076337106e-05, "loss": 0.7363, "step": 1651 }, { "epoch": 0.6322234978951397, "grad_norm": 0.5634157657623291, "learning_rate": 1.9516903178926987e-05, "loss": 0.7342, "step": 1652 }, { "epoch": 0.6326061997703789, "grad_norm": 0.585382878780365, "learning_rate": 1.9516141696677602e-05, "loss": 0.7725, "step": 1653 }, { "epoch": 0.632988901645618, "grad_norm": 0.574517011642456, "learning_rate": 1.9515379629635743e-05, "loss": 0.6683, "step": 1654 }, { "epoch": 0.6333716035208573, "grad_norm": 0.5429658889770508, "learning_rate": 1.9514616977848243e-05, "loss": 0.6699, "step": 1655 }, { "epoch": 0.6337543053960965, "grad_norm": 0.5593811273574829, "learning_rate": 1.9513853741361973e-05, "loss": 0.7604, "step": 1656 }, { "epoch": 0.6341370072713356, "grad_norm": 0.5651026368141174, "learning_rate": 1.9513089920223825e-05, "loss": 0.7762, "step": 1657 }, { "epoch": 0.6345197091465749, "grad_norm": 0.5617815852165222, "learning_rate": 1.951232551448075e-05, "loss": 0.7035, "step": 1658 }, { "epoch": 0.634902411021814, "grad_norm": 0.5633507370948792, "learning_rate": 1.9511560524179714e-05, "loss": 0.6678, "step": 1659 }, { "epoch": 0.6352851128970533, "grad_norm": 0.5940945744514465, "learning_rate": 1.9510794949367734e-05, "loss": 0.7067, "step": 1660 }, { "epoch": 0.6356678147722924, "grad_norm": 0.5226981043815613, "learning_rate": 1.951002879009185e-05, "loss": 0.6833, "step": 1661 }, { "epoch": 0.6360505166475315, "grad_norm": 0.5983591079711914, "learning_rate": 1.9509262046399153e-05, "loss": 0.7258, "step": 1662 }, { "epoch": 0.6364332185227708, "grad_norm": 0.5460703372955322, "learning_rate": 1.950849471833675e-05, "loss": 0.7339, "step": 1663 }, { "epoch": 0.6368159203980099, "grad_norm": 0.5570838451385498, "learning_rate": 1.950772680595181e-05, "loss": 0.6747, "step": 1664 }, { "epoch": 0.6371986222732492, "grad_norm": 0.5814378261566162, "learning_rate": 1.950695830929151e-05, "loss": 0.6292, "step": 1665 }, { "epoch": 0.6375813241484883, "grad_norm": 0.5608204007148743, "learning_rate": 1.9506189228403084e-05, "loss": 0.6709, "step": 1666 }, { "epoch": 0.6379640260237275, "grad_norm": 0.5298590660095215, "learning_rate": 1.9505419563333793e-05, "loss": 0.696, "step": 1667 }, { "epoch": 0.6383467278989667, "grad_norm": 0.5065222382545471, "learning_rate": 1.9504649314130927e-05, "loss": 0.7286, "step": 1668 }, { "epoch": 0.6387294297742059, "grad_norm": 0.5199479460716248, "learning_rate": 1.9503878480841832e-05, "loss": 0.6617, "step": 1669 }, { "epoch": 0.6391121316494451, "grad_norm": 0.5431360006332397, "learning_rate": 1.9503107063513873e-05, "loss": 0.692, "step": 1670 }, { "epoch": 0.6394948335246843, "grad_norm": 0.4820219576358795, "learning_rate": 1.950233506219445e-05, "loss": 0.6203, "step": 1671 }, { "epoch": 0.6398775353999234, "grad_norm": 0.5168941617012024, "learning_rate": 1.9501562476931014e-05, "loss": 0.6367, "step": 1672 }, { "epoch": 0.6402602372751627, "grad_norm": 0.5502641797065735, "learning_rate": 1.9500789307771035e-05, "loss": 0.678, "step": 1673 }, { "epoch": 0.6406429391504018, "grad_norm": 0.5485039949417114, "learning_rate": 1.9500015554762034e-05, "loss": 0.7055, "step": 1674 }, { "epoch": 0.6410256410256411, "grad_norm": 0.56076979637146, "learning_rate": 1.9499241217951552e-05, "loss": 0.7448, "step": 1675 }, { "epoch": 0.6414083429008802, "grad_norm": 0.5513243079185486, "learning_rate": 1.9498466297387176e-05, "loss": 0.6792, "step": 1676 }, { "epoch": 0.6417910447761194, "grad_norm": 0.5869731903076172, "learning_rate": 1.949769079311653e-05, "loss": 0.7067, "step": 1677 }, { "epoch": 0.6421737466513586, "grad_norm": 0.5357140302658081, "learning_rate": 1.949691470518727e-05, "loss": 0.6954, "step": 1678 }, { "epoch": 0.6425564485265978, "grad_norm": 0.5254952311515808, "learning_rate": 1.9496138033647088e-05, "loss": 0.7648, "step": 1679 }, { "epoch": 0.642939150401837, "grad_norm": 0.5370014309883118, "learning_rate": 1.9495360778543713e-05, "loss": 0.7844, "step": 1680 }, { "epoch": 0.6433218522770762, "grad_norm": 0.5149961113929749, "learning_rate": 1.9494582939924908e-05, "loss": 0.6522, "step": 1681 }, { "epoch": 0.6437045541523153, "grad_norm": 0.5630367994308472, "learning_rate": 1.9493804517838475e-05, "loss": 0.7584, "step": 1682 }, { "epoch": 0.6440872560275546, "grad_norm": 0.5289002656936646, "learning_rate": 1.9493025512332245e-05, "loss": 0.6507, "step": 1683 }, { "epoch": 0.6444699579027937, "grad_norm": 0.5757426023483276, "learning_rate": 1.9492245923454102e-05, "loss": 0.6678, "step": 1684 }, { "epoch": 0.644852659778033, "grad_norm": 0.507201611995697, "learning_rate": 1.949146575125194e-05, "loss": 0.6493, "step": 1685 }, { "epoch": 0.6452353616532721, "grad_norm": 0.5838504433631897, "learning_rate": 1.949068499577371e-05, "loss": 0.7026, "step": 1686 }, { "epoch": 0.6456180635285113, "grad_norm": 0.6987410187721252, "learning_rate": 1.9489903657067394e-05, "loss": 0.6529, "step": 1687 }, { "epoch": 0.6460007654037505, "grad_norm": 0.6377722024917603, "learning_rate": 1.9489121735181e-05, "loss": 0.7006, "step": 1688 }, { "epoch": 0.6463834672789897, "grad_norm": 0.5069819092750549, "learning_rate": 1.9488339230162583e-05, "loss": 0.6444, "step": 1689 }, { "epoch": 0.6467661691542289, "grad_norm": 0.5609286427497864, "learning_rate": 1.948755614206023e-05, "loss": 0.6672, "step": 1690 }, { "epoch": 0.647148871029468, "grad_norm": 0.49013614654541016, "learning_rate": 1.948677247092207e-05, "loss": 0.6662, "step": 1691 }, { "epoch": 0.6475315729047072, "grad_norm": 0.5721766352653503, "learning_rate": 1.9485988216796246e-05, "loss": 0.7551, "step": 1692 }, { "epoch": 0.6479142747799465, "grad_norm": 0.5129331350326538, "learning_rate": 1.9485203379730968e-05, "loss": 0.6988, "step": 1693 }, { "epoch": 0.6482969766551856, "grad_norm": 0.5394241809844971, "learning_rate": 1.948441795977446e-05, "loss": 0.6107, "step": 1694 }, { "epoch": 0.6486796785304249, "grad_norm": 0.548125147819519, "learning_rate": 1.948363195697499e-05, "loss": 0.7289, "step": 1695 }, { "epoch": 0.649062380405664, "grad_norm": 0.5056767463684082, "learning_rate": 1.9482845371380853e-05, "loss": 0.7143, "step": 1696 }, { "epoch": 0.6494450822809031, "grad_norm": 0.5013774633407593, "learning_rate": 1.9482058203040397e-05, "loss": 0.7004, "step": 1697 }, { "epoch": 0.6498277841561424, "grad_norm": 0.5850799083709717, "learning_rate": 1.948127045200199e-05, "loss": 0.6639, "step": 1698 }, { "epoch": 0.6502104860313815, "grad_norm": 0.5670098066329956, "learning_rate": 1.948048211831404e-05, "loss": 0.7317, "step": 1699 }, { "epoch": 0.6505931879066208, "grad_norm": 0.5836971402168274, "learning_rate": 1.9479693202024995e-05, "loss": 0.7476, "step": 1700 }, { "epoch": 0.6509758897818599, "grad_norm": 0.5758711099624634, "learning_rate": 1.947890370318334e-05, "loss": 0.6588, "step": 1701 }, { "epoch": 0.6513585916570991, "grad_norm": 0.5824857950210571, "learning_rate": 1.9478113621837584e-05, "loss": 0.7232, "step": 1702 }, { "epoch": 0.6517412935323383, "grad_norm": 0.5467429757118225, "learning_rate": 1.9477322958036284e-05, "loss": 0.6634, "step": 1703 }, { "epoch": 0.6521239954075775, "grad_norm": 0.5211171507835388, "learning_rate": 1.9476531711828027e-05, "loss": 0.6422, "step": 1704 }, { "epoch": 0.6525066972828167, "grad_norm": 0.5313950777053833, "learning_rate": 1.9475739883261435e-05, "loss": 0.6261, "step": 1705 }, { "epoch": 0.6528893991580559, "grad_norm": 0.6162922382354736, "learning_rate": 1.9474947472385176e-05, "loss": 0.7308, "step": 1706 }, { "epoch": 0.653272101033295, "grad_norm": 0.5493375062942505, "learning_rate": 1.9474154479247936e-05, "loss": 0.7135, "step": 1707 }, { "epoch": 0.6536548029085343, "grad_norm": 0.6159459948539734, "learning_rate": 1.9473360903898456e-05, "loss": 0.6994, "step": 1708 }, { "epoch": 0.6540375047837734, "grad_norm": 0.5361166596412659, "learning_rate": 1.9472566746385495e-05, "loss": 0.6697, "step": 1709 }, { "epoch": 0.6544202066590127, "grad_norm": 0.5972000360488892, "learning_rate": 1.9471772006757858e-05, "loss": 0.6347, "step": 1710 }, { "epoch": 0.6548029085342518, "grad_norm": 0.5408409833908081, "learning_rate": 1.9470976685064386e-05, "loss": 0.7958, "step": 1711 }, { "epoch": 0.655185610409491, "grad_norm": 0.5511062741279602, "learning_rate": 1.9470180781353956e-05, "loss": 0.6727, "step": 1712 }, { "epoch": 0.6555683122847302, "grad_norm": 0.5019368529319763, "learning_rate": 1.9469384295675475e-05, "loss": 0.6241, "step": 1713 }, { "epoch": 0.6559510141599694, "grad_norm": 0.504653811454773, "learning_rate": 1.9468587228077888e-05, "loss": 0.6646, "step": 1714 }, { "epoch": 0.6563337160352086, "grad_norm": 0.5384987592697144, "learning_rate": 1.9467789578610178e-05, "loss": 0.7097, "step": 1715 }, { "epoch": 0.6567164179104478, "grad_norm": 0.5616456866264343, "learning_rate": 1.9466991347321364e-05, "loss": 0.7977, "step": 1716 }, { "epoch": 0.6570991197856869, "grad_norm": 0.5321367383003235, "learning_rate": 1.94661925342605e-05, "loss": 0.7328, "step": 1717 }, { "epoch": 0.6574818216609262, "grad_norm": 0.5794159173965454, "learning_rate": 1.9465393139476673e-05, "loss": 0.6203, "step": 1718 }, { "epoch": 0.6578645235361653, "grad_norm": 0.569313645362854, "learning_rate": 1.9464593163019006e-05, "loss": 0.6409, "step": 1719 }, { "epoch": 0.6582472254114046, "grad_norm": 0.5063784718513489, "learning_rate": 1.9463792604936667e-05, "loss": 0.6109, "step": 1720 }, { "epoch": 0.6586299272866437, "grad_norm": 0.5375789999961853, "learning_rate": 1.946299146527885e-05, "loss": 0.7195, "step": 1721 }, { "epoch": 0.6590126291618829, "grad_norm": 0.5430029034614563, "learning_rate": 1.9462189744094776e-05, "loss": 0.8081, "step": 1722 }, { "epoch": 0.6593953310371221, "grad_norm": 0.5235605239868164, "learning_rate": 1.9461387441433732e-05, "loss": 0.677, "step": 1723 }, { "epoch": 0.6597780329123613, "grad_norm": 0.594463050365448, "learning_rate": 1.9460584557345007e-05, "loss": 0.6589, "step": 1724 }, { "epoch": 0.6601607347876005, "grad_norm": 0.5054609179496765, "learning_rate": 1.9459781091877946e-05, "loss": 0.6731, "step": 1725 }, { "epoch": 0.6605434366628397, "grad_norm": 0.5478920936584473, "learning_rate": 1.9458977045081924e-05, "loss": 0.6856, "step": 1726 }, { "epoch": 0.6609261385380788, "grad_norm": 0.5191294550895691, "learning_rate": 1.9458172417006347e-05, "loss": 0.6938, "step": 1727 }, { "epoch": 0.661308840413318, "grad_norm": 0.7837832570075989, "learning_rate": 1.9457367207700672e-05, "loss": 0.7291, "step": 1728 }, { "epoch": 0.6616915422885572, "grad_norm": 0.4591367840766907, "learning_rate": 1.945656141721437e-05, "loss": 0.6535, "step": 1729 }, { "epoch": 0.6620742441637965, "grad_norm": 0.5333532094955444, "learning_rate": 1.9455755045596967e-05, "loss": 0.6662, "step": 1730 }, { "epoch": 0.6624569460390356, "grad_norm": 0.5619720816612244, "learning_rate": 1.945494809289801e-05, "loss": 0.7299, "step": 1731 }, { "epoch": 0.6628396479142747, "grad_norm": 0.5478049516677856, "learning_rate": 1.9454140559167096e-05, "loss": 0.7139, "step": 1732 }, { "epoch": 0.663222349789514, "grad_norm": 0.5332092046737671, "learning_rate": 1.9453332444453844e-05, "loss": 0.7427, "step": 1733 }, { "epoch": 0.6636050516647531, "grad_norm": 0.46387404203414917, "learning_rate": 1.945252374880792e-05, "loss": 0.6593, "step": 1734 }, { "epoch": 0.6639877535399924, "grad_norm": 0.628686249256134, "learning_rate": 1.9451714472279012e-05, "loss": 0.6776, "step": 1735 }, { "epoch": 0.6643704554152315, "grad_norm": 0.5677107572555542, "learning_rate": 1.9450904614916864e-05, "loss": 0.6608, "step": 1736 }, { "epoch": 0.6647531572904707, "grad_norm": 0.6093522906303406, "learning_rate": 1.9450094176771235e-05, "loss": 0.7514, "step": 1737 }, { "epoch": 0.6651358591657099, "grad_norm": 0.5802770853042603, "learning_rate": 1.944928315789193e-05, "loss": 0.6979, "step": 1738 }, { "epoch": 0.6655185610409491, "grad_norm": 0.5627919435501099, "learning_rate": 1.944847155832879e-05, "loss": 0.7467, "step": 1739 }, { "epoch": 0.6659012629161883, "grad_norm": 0.6108699440956116, "learning_rate": 1.944765937813169e-05, "loss": 0.6806, "step": 1740 }, { "epoch": 0.6662839647914275, "grad_norm": 0.5988759994506836, "learning_rate": 1.9446846617350542e-05, "loss": 0.713, "step": 1741 }, { "epoch": 0.6666666666666666, "grad_norm": 0.5677027702331543, "learning_rate": 1.9446033276035288e-05, "loss": 0.654, "step": 1742 }, { "epoch": 0.6670493685419059, "grad_norm": 0.5053319931030273, "learning_rate": 1.944521935423591e-05, "loss": 0.7159, "step": 1743 }, { "epoch": 0.667432070417145, "grad_norm": 0.5704641342163086, "learning_rate": 1.9444404852002433e-05, "loss": 0.7698, "step": 1744 }, { "epoch": 0.6678147722923843, "grad_norm": 0.5444302558898926, "learning_rate": 1.9443589769384906e-05, "loss": 0.625, "step": 1745 }, { "epoch": 0.6681974741676234, "grad_norm": 0.6009384393692017, "learning_rate": 1.9442774106433412e-05, "loss": 0.6878, "step": 1746 }, { "epoch": 0.6685801760428626, "grad_norm": 0.5354681015014648, "learning_rate": 1.9441957863198087e-05, "loss": 0.6251, "step": 1747 }, { "epoch": 0.6689628779181018, "grad_norm": 0.5260025262832642, "learning_rate": 1.9441141039729082e-05, "loss": 0.6697, "step": 1748 }, { "epoch": 0.669345579793341, "grad_norm": 0.5687674283981323, "learning_rate": 1.94403236360766e-05, "loss": 0.6835, "step": 1749 }, { "epoch": 0.6697282816685802, "grad_norm": 0.5413052439689636, "learning_rate": 1.9439505652290867e-05, "loss": 0.7748, "step": 1750 }, { "epoch": 0.6701109835438194, "grad_norm": 0.6811854839324951, "learning_rate": 1.943868708842215e-05, "loss": 0.6759, "step": 1751 }, { "epoch": 0.6704936854190585, "grad_norm": 0.5666257739067078, "learning_rate": 1.943786794452076e-05, "loss": 0.7267, "step": 1752 }, { "epoch": 0.6708763872942978, "grad_norm": 0.5349835157394409, "learning_rate": 1.9437048220637028e-05, "loss": 0.6794, "step": 1753 }, { "epoch": 0.6712590891695369, "grad_norm": 0.5481505990028381, "learning_rate": 1.943622791682133e-05, "loss": 0.7192, "step": 1754 }, { "epoch": 0.6716417910447762, "grad_norm": 0.648455023765564, "learning_rate": 1.9435407033124073e-05, "loss": 0.7409, "step": 1755 }, { "epoch": 0.6720244929200153, "grad_norm": 0.5477964282035828, "learning_rate": 1.943458556959571e-05, "loss": 0.7471, "step": 1756 }, { "epoch": 0.6724071947952545, "grad_norm": 0.5448601245880127, "learning_rate": 1.9433763526286716e-05, "loss": 0.6417, "step": 1757 }, { "epoch": 0.6727898966704937, "grad_norm": 0.5310268402099609, "learning_rate": 1.943294090324761e-05, "loss": 0.7056, "step": 1758 }, { "epoch": 0.6731725985457329, "grad_norm": 0.5730711221694946, "learning_rate": 1.9432117700528943e-05, "loss": 0.6991, "step": 1759 }, { "epoch": 0.6735553004209721, "grad_norm": 0.6213411092758179, "learning_rate": 1.9431293918181305e-05, "loss": 0.7162, "step": 1760 }, { "epoch": 0.6739380022962113, "grad_norm": 0.5565881729125977, "learning_rate": 1.9430469556255317e-05, "loss": 0.7342, "step": 1761 }, { "epoch": 0.6743207041714504, "grad_norm": 0.5723482370376587, "learning_rate": 1.9429644614801644e-05, "loss": 0.7344, "step": 1762 }, { "epoch": 0.6747034060466897, "grad_norm": 0.6100386381149292, "learning_rate": 1.942881909387097e-05, "loss": 0.7139, "step": 1763 }, { "epoch": 0.6750861079219288, "grad_norm": 0.5615648031234741, "learning_rate": 1.942799299351404e-05, "loss": 0.638, "step": 1764 }, { "epoch": 0.6754688097971681, "grad_norm": 0.5793516039848328, "learning_rate": 1.942716631378161e-05, "loss": 0.6224, "step": 1765 }, { "epoch": 0.6758515116724072, "grad_norm": 0.5434484481811523, "learning_rate": 1.9426339054724482e-05, "loss": 0.669, "step": 1766 }, { "epoch": 0.6762342135476463, "grad_norm": 0.5700794458389282, "learning_rate": 1.9425511216393495e-05, "loss": 0.647, "step": 1767 }, { "epoch": 0.6766169154228856, "grad_norm": 0.6434406638145447, "learning_rate": 1.9424682798839525e-05, "loss": 0.677, "step": 1768 }, { "epoch": 0.6769996172981247, "grad_norm": 0.5569505095481873, "learning_rate": 1.9423853802113476e-05, "loss": 0.6376, "step": 1769 }, { "epoch": 0.677382319173364, "grad_norm": 0.581112802028656, "learning_rate": 1.942302422626629e-05, "loss": 0.7012, "step": 1770 }, { "epoch": 0.6777650210486031, "grad_norm": 0.546038031578064, "learning_rate": 1.942219407134896e-05, "loss": 0.7137, "step": 1771 }, { "epoch": 0.6781477229238423, "grad_norm": 0.559614896774292, "learning_rate": 1.9421363337412483e-05, "loss": 0.7612, "step": 1772 }, { "epoch": 0.6785304247990815, "grad_norm": 0.5340948700904846, "learning_rate": 1.942053202450792e-05, "loss": 0.7034, "step": 1773 }, { "epoch": 0.6789131266743207, "grad_norm": 0.5637003779411316, "learning_rate": 1.941970013268636e-05, "loss": 0.7129, "step": 1774 }, { "epoch": 0.6792958285495599, "grad_norm": 0.5325061678886414, "learning_rate": 1.9418867661998916e-05, "loss": 0.6621, "step": 1775 }, { "epoch": 0.6796785304247991, "grad_norm": 0.6030668020248413, "learning_rate": 1.9418034612496754e-05, "loss": 0.652, "step": 1776 }, { "epoch": 0.6800612323000382, "grad_norm": 0.5824204683303833, "learning_rate": 1.9417200984231063e-05, "loss": 0.6666, "step": 1777 }, { "epoch": 0.6804439341752775, "grad_norm": 0.5422678589820862, "learning_rate": 1.941636677725307e-05, "loss": 0.6872, "step": 1778 }, { "epoch": 0.6808266360505166, "grad_norm": 0.5482543706893921, "learning_rate": 1.9415531991614043e-05, "loss": 0.6565, "step": 1779 }, { "epoch": 0.6812093379257559, "grad_norm": 0.5726454854011536, "learning_rate": 1.941469662736528e-05, "loss": 0.5936, "step": 1780 }, { "epoch": 0.681592039800995, "grad_norm": 0.5359926223754883, "learning_rate": 1.9413860684558117e-05, "loss": 0.6478, "step": 1781 }, { "epoch": 0.6819747416762342, "grad_norm": 0.5101439952850342, "learning_rate": 1.9413024163243928e-05, "loss": 0.7079, "step": 1782 }, { "epoch": 0.6823574435514734, "grad_norm": 0.5144621133804321, "learning_rate": 1.9412187063474116e-05, "loss": 0.5934, "step": 1783 }, { "epoch": 0.6827401454267126, "grad_norm": 0.6188265681266785, "learning_rate": 1.9411349385300118e-05, "loss": 0.6655, "step": 1784 }, { "epoch": 0.6831228473019518, "grad_norm": 0.5832834243774414, "learning_rate": 1.941051112877342e-05, "loss": 0.6992, "step": 1785 }, { "epoch": 0.683505549177191, "grad_norm": 0.57710862159729, "learning_rate": 1.940967229394553e-05, "loss": 0.6302, "step": 1786 }, { "epoch": 0.6838882510524301, "grad_norm": 0.5605683922767639, "learning_rate": 1.9408832880868e-05, "loss": 0.7246, "step": 1787 }, { "epoch": 0.6842709529276694, "grad_norm": 0.5344046354293823, "learning_rate": 1.9407992889592412e-05, "loss": 0.6651, "step": 1788 }, { "epoch": 0.6846536548029085, "grad_norm": 0.5576102137565613, "learning_rate": 1.9407152320170385e-05, "loss": 0.7258, "step": 1789 }, { "epoch": 0.6850363566781478, "grad_norm": 0.48910844326019287, "learning_rate": 1.940631117265358e-05, "loss": 0.6868, "step": 1790 }, { "epoch": 0.6854190585533869, "grad_norm": 0.6048798561096191, "learning_rate": 1.940546944709368e-05, "loss": 0.7779, "step": 1791 }, { "epoch": 0.6858017604286261, "grad_norm": 0.5294688940048218, "learning_rate": 1.940462714354242e-05, "loss": 0.701, "step": 1792 }, { "epoch": 0.6861844623038653, "grad_norm": 0.6042173504829407, "learning_rate": 1.9403784262051548e-05, "loss": 0.7414, "step": 1793 }, { "epoch": 0.6865671641791045, "grad_norm": 0.5529136657714844, "learning_rate": 1.9402940802672876e-05, "loss": 0.7998, "step": 1794 }, { "epoch": 0.6869498660543437, "grad_norm": 0.5041884779930115, "learning_rate": 1.940209676545823e-05, "loss": 0.694, "step": 1795 }, { "epoch": 0.6873325679295829, "grad_norm": 0.5987876653671265, "learning_rate": 1.9401252150459478e-05, "loss": 0.6917, "step": 1796 }, { "epoch": 0.687715269804822, "grad_norm": 0.5611307621002197, "learning_rate": 1.9400406957728527e-05, "loss": 0.7746, "step": 1797 }, { "epoch": 0.6880979716800613, "grad_norm": 0.5727817416191101, "learning_rate": 1.939956118731731e-05, "loss": 0.7018, "step": 1798 }, { "epoch": 0.6884806735553004, "grad_norm": 0.5102805495262146, "learning_rate": 1.9398714839277808e-05, "loss": 0.7178, "step": 1799 }, { "epoch": 0.6888633754305397, "grad_norm": 0.5262888669967651, "learning_rate": 1.9397867913662032e-05, "loss": 0.7108, "step": 1800 }, { "epoch": 0.6892460773057788, "grad_norm": 0.5577698945999146, "learning_rate": 1.939702041052202e-05, "loss": 0.8084, "step": 1801 }, { "epoch": 0.689628779181018, "grad_norm": 0.5308541059494019, "learning_rate": 1.939617232990987e-05, "loss": 0.6767, "step": 1802 }, { "epoch": 0.6900114810562572, "grad_norm": 0.5234506726264954, "learning_rate": 1.9395323671877677e-05, "loss": 0.6397, "step": 1803 }, { "epoch": 0.6903941829314963, "grad_norm": 0.547825813293457, "learning_rate": 1.9394474436477607e-05, "loss": 0.6257, "step": 1804 }, { "epoch": 0.6907768848067356, "grad_norm": 0.5433498620986938, "learning_rate": 1.9393624623761848e-05, "loss": 0.7667, "step": 1805 }, { "epoch": 0.6911595866819747, "grad_norm": 0.5680157542228699, "learning_rate": 1.9392774233782615e-05, "loss": 0.7083, "step": 1806 }, { "epoch": 0.6915422885572139, "grad_norm": 0.5540896654129028, "learning_rate": 1.939192326659217e-05, "loss": 0.6642, "step": 1807 }, { "epoch": 0.6919249904324531, "grad_norm": 0.5984779000282288, "learning_rate": 1.9391071722242814e-05, "loss": 0.731, "step": 1808 }, { "epoch": 0.6923076923076923, "grad_norm": 0.6552509069442749, "learning_rate": 1.939021960078687e-05, "loss": 0.7868, "step": 1809 }, { "epoch": 0.6926903941829315, "grad_norm": 0.5391627550125122, "learning_rate": 1.938936690227671e-05, "loss": 0.7412, "step": 1810 }, { "epoch": 0.6930730960581707, "grad_norm": 0.5769292712211609, "learning_rate": 1.938851362676472e-05, "loss": 0.6763, "step": 1811 }, { "epoch": 0.6934557979334098, "grad_norm": 0.5149970650672913, "learning_rate": 1.9387659774303353e-05, "loss": 0.6135, "step": 1812 }, { "epoch": 0.6938384998086491, "grad_norm": 0.5812851786613464, "learning_rate": 1.9386805344945066e-05, "loss": 0.7393, "step": 1813 }, { "epoch": 0.6942212016838882, "grad_norm": 0.5346975922584534, "learning_rate": 1.9385950338742377e-05, "loss": 0.6542, "step": 1814 }, { "epoch": 0.6946039035591275, "grad_norm": 0.5627533197402954, "learning_rate": 1.9385094755747826e-05, "loss": 0.599, "step": 1815 }, { "epoch": 0.6949866054343666, "grad_norm": 0.5163363814353943, "learning_rate": 1.938423859601399e-05, "loss": 0.6545, "step": 1816 }, { "epoch": 0.6953693073096058, "grad_norm": 0.5441470742225647, "learning_rate": 1.9383381859593476e-05, "loss": 0.743, "step": 1817 }, { "epoch": 0.695752009184845, "grad_norm": 0.6033336520195007, "learning_rate": 1.9382524546538944e-05, "loss": 0.7419, "step": 1818 }, { "epoch": 0.6961347110600842, "grad_norm": 0.5469204187393188, "learning_rate": 1.9381666656903068e-05, "loss": 0.6897, "step": 1819 }, { "epoch": 0.6965174129353234, "grad_norm": 0.5476014018058777, "learning_rate": 1.9380808190738577e-05, "loss": 0.6051, "step": 1820 }, { "epoch": 0.6969001148105626, "grad_norm": 0.5052337646484375, "learning_rate": 1.9379949148098217e-05, "loss": 0.6996, "step": 1821 }, { "epoch": 0.6972828166858017, "grad_norm": 0.5469549894332886, "learning_rate": 1.9379089529034785e-05, "loss": 0.6719, "step": 1822 }, { "epoch": 0.697665518561041, "grad_norm": 0.5284478664398193, "learning_rate": 1.9378229333601104e-05, "loss": 0.6854, "step": 1823 }, { "epoch": 0.6980482204362801, "grad_norm": 0.659647524356842, "learning_rate": 1.9377368561850036e-05, "loss": 0.8249, "step": 1824 }, { "epoch": 0.6984309223115194, "grad_norm": 0.5125284194946289, "learning_rate": 1.9376507213834476e-05, "loss": 0.7052, "step": 1825 }, { "epoch": 0.6988136241867585, "grad_norm": 0.48219624161720276, "learning_rate": 1.937564528960736e-05, "loss": 0.7211, "step": 1826 }, { "epoch": 0.6991963260619977, "grad_norm": 0.567832887172699, "learning_rate": 1.9374782789221655e-05, "loss": 0.6974, "step": 1827 }, { "epoch": 0.6995790279372369, "grad_norm": 0.5445326566696167, "learning_rate": 1.9373919712730357e-05, "loss": 0.7594, "step": 1828 }, { "epoch": 0.6999617298124761, "grad_norm": 0.5128849744796753, "learning_rate": 1.937305606018651e-05, "loss": 0.6695, "step": 1829 }, { "epoch": 0.7003444316877153, "grad_norm": 0.616830587387085, "learning_rate": 1.9372191831643195e-05, "loss": 0.6384, "step": 1830 }, { "epoch": 0.7007271335629545, "grad_norm": 0.532188892364502, "learning_rate": 1.9371327027153507e-05, "loss": 0.7058, "step": 1831 }, { "epoch": 0.7011098354381936, "grad_norm": 0.5616138577461243, "learning_rate": 1.93704616467706e-05, "loss": 0.7188, "step": 1832 }, { "epoch": 0.7014925373134329, "grad_norm": 0.5165753364562988, "learning_rate": 1.936959569054765e-05, "loss": 0.6254, "step": 1833 }, { "epoch": 0.701875239188672, "grad_norm": 0.5785284042358398, "learning_rate": 1.9368729158537875e-05, "loss": 0.7113, "step": 1834 }, { "epoch": 0.7022579410639113, "grad_norm": 0.5367897748947144, "learning_rate": 1.9367862050794524e-05, "loss": 0.7406, "step": 1835 }, { "epoch": 0.7026406429391504, "grad_norm": 0.5277400016784668, "learning_rate": 1.936699436737088e-05, "loss": 0.5881, "step": 1836 }, { "epoch": 0.7030233448143896, "grad_norm": 0.5221449732780457, "learning_rate": 1.9366126108320273e-05, "loss": 0.7578, "step": 1837 }, { "epoch": 0.7034060466896288, "grad_norm": 0.49230536818504333, "learning_rate": 1.936525727369605e-05, "loss": 0.6419, "step": 1838 }, { "epoch": 0.703788748564868, "grad_norm": 0.5410023927688599, "learning_rate": 1.936438786355161e-05, "loss": 0.7006, "step": 1839 }, { "epoch": 0.7041714504401072, "grad_norm": 0.5984060168266296, "learning_rate": 1.9363517877940378e-05, "loss": 0.6858, "step": 1840 }, { "epoch": 0.7045541523153463, "grad_norm": 0.5170323252677917, "learning_rate": 1.9362647316915817e-05, "loss": 0.6014, "step": 1841 }, { "epoch": 0.7049368541905855, "grad_norm": 0.5853543877601624, "learning_rate": 1.9361776180531428e-05, "loss": 0.6975, "step": 1842 }, { "epoch": 0.7053195560658247, "grad_norm": 0.5287882685661316, "learning_rate": 1.936090446884074e-05, "loss": 0.726, "step": 1843 }, { "epoch": 0.7057022579410639, "grad_norm": 0.5411827564239502, "learning_rate": 1.9360032181897325e-05, "loss": 0.6204, "step": 1844 }, { "epoch": 0.7060849598163031, "grad_norm": 0.5450449585914612, "learning_rate": 1.9359159319754786e-05, "loss": 0.7341, "step": 1845 }, { "epoch": 0.7064676616915423, "grad_norm": 0.6558800935745239, "learning_rate": 1.9358285882466764e-05, "loss": 0.6243, "step": 1846 }, { "epoch": 0.7068503635667814, "grad_norm": 0.5117479562759399, "learning_rate": 1.9357411870086933e-05, "loss": 0.6999, "step": 1847 }, { "epoch": 0.7072330654420207, "grad_norm": 0.5576404929161072, "learning_rate": 1.9356537282669004e-05, "loss": 0.7293, "step": 1848 }, { "epoch": 0.7076157673172598, "grad_norm": 0.5554692149162292, "learning_rate": 1.9355662120266726e-05, "loss": 0.6702, "step": 1849 }, { "epoch": 0.7079984691924991, "grad_norm": 0.5051576495170593, "learning_rate": 1.9354786382933873e-05, "loss": 0.6632, "step": 1850 }, { "epoch": 0.7083811710677382, "grad_norm": 0.6135920882225037, "learning_rate": 1.9353910070724268e-05, "loss": 0.7026, "step": 1851 }, { "epoch": 0.7087638729429774, "grad_norm": 0.5272507071495056, "learning_rate": 1.935303318369176e-05, "loss": 0.7063, "step": 1852 }, { "epoch": 0.7091465748182166, "grad_norm": 0.49568748474121094, "learning_rate": 1.9352155721890234e-05, "loss": 0.677, "step": 1853 }, { "epoch": 0.7095292766934558, "grad_norm": 0.501797080039978, "learning_rate": 1.9351277685373615e-05, "loss": 0.615, "step": 1854 }, { "epoch": 0.709911978568695, "grad_norm": 0.6247240900993347, "learning_rate": 1.935039907419586e-05, "loss": 0.6909, "step": 1855 }, { "epoch": 0.7102946804439342, "grad_norm": 0.5120430588722229, "learning_rate": 1.934951988841096e-05, "loss": 0.6442, "step": 1856 }, { "epoch": 0.7106773823191733, "grad_norm": 0.5720070600509644, "learning_rate": 1.934864012807295e-05, "loss": 0.7131, "step": 1857 }, { "epoch": 0.7110600841944126, "grad_norm": 0.5084592700004578, "learning_rate": 1.934775979323589e-05, "loss": 0.6901, "step": 1858 }, { "epoch": 0.7114427860696517, "grad_norm": 0.5235263705253601, "learning_rate": 1.9346878883953873e-05, "loss": 0.7146, "step": 1859 }, { "epoch": 0.711825487944891, "grad_norm": 0.5393877029418945, "learning_rate": 1.9345997400281038e-05, "loss": 0.7407, "step": 1860 }, { "epoch": 0.7122081898201301, "grad_norm": 0.5145605802536011, "learning_rate": 1.934511534227156e-05, "loss": 0.5956, "step": 1861 }, { "epoch": 0.7125908916953693, "grad_norm": 0.5320399403572083, "learning_rate": 1.9344232709979636e-05, "loss": 0.656, "step": 1862 }, { "epoch": 0.7129735935706085, "grad_norm": 0.5209212899208069, "learning_rate": 1.9343349503459507e-05, "loss": 0.6957, "step": 1863 }, { "epoch": 0.7133562954458477, "grad_norm": 0.5690443515777588, "learning_rate": 1.9342465722765453e-05, "loss": 0.7918, "step": 1864 }, { "epoch": 0.7137389973210869, "grad_norm": 0.5112349987030029, "learning_rate": 1.934158136795178e-05, "loss": 0.6899, "step": 1865 }, { "epoch": 0.7141216991963261, "grad_norm": 0.5342438817024231, "learning_rate": 1.934069643907284e-05, "loss": 0.7632, "step": 1866 }, { "epoch": 0.7145044010715652, "grad_norm": 0.5196400284767151, "learning_rate": 1.9339810936183e-05, "loss": 0.7133, "step": 1867 }, { "epoch": 0.7148871029468045, "grad_norm": 0.5301933288574219, "learning_rate": 1.9338924859336695e-05, "loss": 0.7341, "step": 1868 }, { "epoch": 0.7152698048220436, "grad_norm": 0.6180863380432129, "learning_rate": 1.933803820858837e-05, "loss": 0.7793, "step": 1869 }, { "epoch": 0.7156525066972829, "grad_norm": 0.5935837626457214, "learning_rate": 1.9337150983992505e-05, "loss": 0.7111, "step": 1870 }, { "epoch": 0.716035208572522, "grad_norm": 0.5369552969932556, "learning_rate": 1.933626318560363e-05, "loss": 0.7135, "step": 1871 }, { "epoch": 0.7164179104477612, "grad_norm": 0.6628219485282898, "learning_rate": 1.93353748134763e-05, "loss": 0.645, "step": 1872 }, { "epoch": 0.7168006123230004, "grad_norm": 0.48591184616088867, "learning_rate": 1.933448586766511e-05, "loss": 0.6952, "step": 1873 }, { "epoch": 0.7171833141982396, "grad_norm": 0.5864787101745605, "learning_rate": 1.9333596348224687e-05, "loss": 0.664, "step": 1874 }, { "epoch": 0.7175660160734788, "grad_norm": 0.5576381683349609, "learning_rate": 1.933270625520969e-05, "loss": 0.6245, "step": 1875 }, { "epoch": 0.717948717948718, "grad_norm": 0.5477436780929565, "learning_rate": 1.9331815588674825e-05, "loss": 0.6419, "step": 1876 }, { "epoch": 0.7183314198239571, "grad_norm": 0.5283337831497192, "learning_rate": 1.933092434867482e-05, "loss": 0.6726, "step": 1877 }, { "epoch": 0.7187141216991964, "grad_norm": 0.555468738079071, "learning_rate": 1.9330032535264447e-05, "loss": 0.7436, "step": 1878 }, { "epoch": 0.7190968235744355, "grad_norm": 0.5592958927154541, "learning_rate": 1.932914014849851e-05, "loss": 0.7471, "step": 1879 }, { "epoch": 0.7194795254496748, "grad_norm": 0.5406386852264404, "learning_rate": 1.9328247188431848e-05, "loss": 0.6786, "step": 1880 }, { "epoch": 0.7198622273249139, "grad_norm": 0.5361269116401672, "learning_rate": 1.9327353655119336e-05, "loss": 0.6815, "step": 1881 }, { "epoch": 0.720244929200153, "grad_norm": 0.5579361319541931, "learning_rate": 1.9326459548615884e-05, "loss": 0.6725, "step": 1882 }, { "epoch": 0.7206276310753923, "grad_norm": 0.47636473178863525, "learning_rate": 1.932556486897644e-05, "loss": 0.6321, "step": 1883 }, { "epoch": 0.7210103329506314, "grad_norm": 0.5070143342018127, "learning_rate": 1.9324669616255978e-05, "loss": 0.6805, "step": 1884 }, { "epoch": 0.7213930348258707, "grad_norm": 0.5089043378829956, "learning_rate": 1.932377379050952e-05, "loss": 0.6085, "step": 1885 }, { "epoch": 0.7217757367011098, "grad_norm": 0.48525479435920715, "learning_rate": 1.9322877391792113e-05, "loss": 0.657, "step": 1886 }, { "epoch": 0.722158438576349, "grad_norm": 0.5223888158798218, "learning_rate": 1.9321980420158843e-05, "loss": 0.5959, "step": 1887 }, { "epoch": 0.7225411404515882, "grad_norm": 0.5827596187591553, "learning_rate": 1.9321082875664834e-05, "loss": 0.6845, "step": 1888 }, { "epoch": 0.7229238423268274, "grad_norm": 0.5548449754714966, "learning_rate": 1.9320184758365245e-05, "loss": 0.6645, "step": 1889 }, { "epoch": 0.7233065442020666, "grad_norm": 0.50011146068573, "learning_rate": 1.931928606831526e-05, "loss": 0.6685, "step": 1890 }, { "epoch": 0.7236892460773058, "grad_norm": 0.551304817199707, "learning_rate": 1.9318386805570108e-05, "loss": 0.7285, "step": 1891 }, { "epoch": 0.7240719479525449, "grad_norm": 0.5477525591850281, "learning_rate": 1.9317486970185054e-05, "loss": 0.6655, "step": 1892 }, { "epoch": 0.7244546498277842, "grad_norm": 0.5342088937759399, "learning_rate": 1.9316586562215397e-05, "loss": 0.6379, "step": 1893 }, { "epoch": 0.7248373517030233, "grad_norm": 0.5067463517189026, "learning_rate": 1.931568558171647e-05, "loss": 0.6569, "step": 1894 }, { "epoch": 0.7252200535782626, "grad_norm": 0.5258552432060242, "learning_rate": 1.931478402874363e-05, "loss": 0.6052, "step": 1895 }, { "epoch": 0.7256027554535017, "grad_norm": 0.4998440742492676, "learning_rate": 1.9313881903352293e-05, "loss": 0.7023, "step": 1896 }, { "epoch": 0.7259854573287409, "grad_norm": 0.5187048316001892, "learning_rate": 1.9312979205597887e-05, "loss": 0.7531, "step": 1897 }, { "epoch": 0.7263681592039801, "grad_norm": 0.5225951671600342, "learning_rate": 1.9312075935535892e-05, "loss": 0.7315, "step": 1898 }, { "epoch": 0.7267508610792193, "grad_norm": 0.5729700326919556, "learning_rate": 1.9311172093221813e-05, "loss": 0.6522, "step": 1899 }, { "epoch": 0.7271335629544585, "grad_norm": 0.5525944828987122, "learning_rate": 1.9310267678711194e-05, "loss": 0.7527, "step": 1900 }, { "epoch": 0.7275162648296977, "grad_norm": 0.5372529625892639, "learning_rate": 1.9309362692059617e-05, "loss": 0.6405, "step": 1901 }, { "epoch": 0.7278989667049368, "grad_norm": 0.4906664192676544, "learning_rate": 1.930845713332269e-05, "loss": 0.7429, "step": 1902 }, { "epoch": 0.7282816685801761, "grad_norm": 0.545875072479248, "learning_rate": 1.9307551002556065e-05, "loss": 0.6849, "step": 1903 }, { "epoch": 0.7286643704554152, "grad_norm": 0.5236281156539917, "learning_rate": 1.9306644299815432e-05, "loss": 0.7612, "step": 1904 }, { "epoch": 0.7290470723306545, "grad_norm": 0.5684908628463745, "learning_rate": 1.9305737025156497e-05, "loss": 0.7185, "step": 1905 }, { "epoch": 0.7294297742058936, "grad_norm": 0.6053718328475952, "learning_rate": 1.9304829178635028e-05, "loss": 0.708, "step": 1906 }, { "epoch": 0.7298124760811328, "grad_norm": 0.5678989291191101, "learning_rate": 1.9303920760306804e-05, "loss": 0.7134, "step": 1907 }, { "epoch": 0.730195177956372, "grad_norm": 0.5570694208145142, "learning_rate": 1.9303011770227657e-05, "loss": 0.7201, "step": 1908 }, { "epoch": 0.7305778798316112, "grad_norm": 0.6383375525474548, "learning_rate": 1.9302102208453444e-05, "loss": 0.6057, "step": 1909 }, { "epoch": 0.7309605817068504, "grad_norm": 0.5100163221359253, "learning_rate": 1.930119207504006e-05, "loss": 0.6754, "step": 1910 }, { "epoch": 0.7313432835820896, "grad_norm": 0.5580726265907288, "learning_rate": 1.9300281370043433e-05, "loss": 0.6736, "step": 1911 }, { "epoch": 0.7317259854573287, "grad_norm": 0.5205092430114746, "learning_rate": 1.9299370093519534e-05, "loss": 0.7339, "step": 1912 }, { "epoch": 0.732108687332568, "grad_norm": 0.6054539680480957, "learning_rate": 1.929845824552436e-05, "loss": 0.7258, "step": 1913 }, { "epoch": 0.7324913892078071, "grad_norm": 0.6401503086090088, "learning_rate": 1.9297545826113947e-05, "loss": 0.6662, "step": 1914 }, { "epoch": 0.7328740910830464, "grad_norm": 0.5224330425262451, "learning_rate": 1.9296632835344362e-05, "loss": 0.6348, "step": 1915 }, { "epoch": 0.7332567929582855, "grad_norm": 0.6423154473304749, "learning_rate": 1.929571927327172e-05, "loss": 0.6514, "step": 1916 }, { "epoch": 0.7336394948335246, "grad_norm": 0.6030638813972473, "learning_rate": 1.9294805139952148e-05, "loss": 0.7923, "step": 1917 }, { "epoch": 0.7340221967087639, "grad_norm": 0.4953066408634186, "learning_rate": 1.9293890435441835e-05, "loss": 0.6715, "step": 1918 }, { "epoch": 0.734404898584003, "grad_norm": 0.5241521000862122, "learning_rate": 1.9292975159796982e-05, "loss": 0.6162, "step": 1919 }, { "epoch": 0.7347876004592423, "grad_norm": 0.4971122145652771, "learning_rate": 1.9292059313073846e-05, "loss": 0.6379, "step": 1920 }, { "epoch": 0.7351703023344814, "grad_norm": 0.5500872135162354, "learning_rate": 1.92911428953287e-05, "loss": 0.6835, "step": 1921 }, { "epoch": 0.7355530042097206, "grad_norm": 0.5715925693511963, "learning_rate": 1.929022590661786e-05, "loss": 0.6318, "step": 1922 }, { "epoch": 0.7359357060849598, "grad_norm": 0.5345056056976318, "learning_rate": 1.9289308346997683e-05, "loss": 0.7415, "step": 1923 }, { "epoch": 0.736318407960199, "grad_norm": 0.5891109108924866, "learning_rate": 1.928839021652455e-05, "loss": 0.7075, "step": 1924 }, { "epoch": 0.7367011098354382, "grad_norm": 0.5729769468307495, "learning_rate": 1.9287471515254887e-05, "loss": 0.7302, "step": 1925 }, { "epoch": 0.7370838117106774, "grad_norm": 0.5085662603378296, "learning_rate": 1.928655224324515e-05, "loss": 0.6456, "step": 1926 }, { "epoch": 0.7374665135859165, "grad_norm": 0.5152777433395386, "learning_rate": 1.9285632400551826e-05, "loss": 0.7308, "step": 1927 }, { "epoch": 0.7378492154611558, "grad_norm": 0.5050517916679382, "learning_rate": 1.9284711987231447e-05, "loss": 0.6673, "step": 1928 }, { "epoch": 0.7382319173363949, "grad_norm": 0.6027451753616333, "learning_rate": 1.928379100334057e-05, "loss": 0.7445, "step": 1929 }, { "epoch": 0.7386146192116342, "grad_norm": 0.5183762311935425, "learning_rate": 1.92828694489358e-05, "loss": 0.67, "step": 1930 }, { "epoch": 0.7389973210868733, "grad_norm": 0.7116043567657471, "learning_rate": 1.9281947324073763e-05, "loss": 0.7487, "step": 1931 }, { "epoch": 0.7393800229621125, "grad_norm": 0.5807106494903564, "learning_rate": 1.9281024628811127e-05, "loss": 0.6674, "step": 1932 }, { "epoch": 0.7397627248373517, "grad_norm": 0.5301859378814697, "learning_rate": 1.9280101363204592e-05, "loss": 0.6552, "step": 1933 }, { "epoch": 0.7401454267125909, "grad_norm": 0.6070111989974976, "learning_rate": 1.92791775273109e-05, "loss": 0.7281, "step": 1934 }, { "epoch": 0.7405281285878301, "grad_norm": 0.5213584303855896, "learning_rate": 1.9278253121186816e-05, "loss": 0.7792, "step": 1935 }, { "epoch": 0.7409108304630693, "grad_norm": 0.49406492710113525, "learning_rate": 1.9277328144889156e-05, "loss": 0.6354, "step": 1936 }, { "epoch": 0.7412935323383084, "grad_norm": 0.549826979637146, "learning_rate": 1.9276402598474754e-05, "loss": 0.7106, "step": 1937 }, { "epoch": 0.7416762342135477, "grad_norm": 0.5245624780654907, "learning_rate": 1.9275476482000497e-05, "loss": 0.6795, "step": 1938 }, { "epoch": 0.7420589360887868, "grad_norm": 0.5432856678962708, "learning_rate": 1.927454979552329e-05, "loss": 0.6696, "step": 1939 }, { "epoch": 0.7424416379640261, "grad_norm": 0.5541824698448181, "learning_rate": 1.9273622539100077e-05, "loss": 0.7592, "step": 1940 }, { "epoch": 0.7428243398392652, "grad_norm": 0.5019318461418152, "learning_rate": 1.927269471278785e-05, "loss": 0.7003, "step": 1941 }, { "epoch": 0.7432070417145044, "grad_norm": 0.6375231742858887, "learning_rate": 1.9271766316643624e-05, "loss": 0.6966, "step": 1942 }, { "epoch": 0.7435897435897436, "grad_norm": 0.548819363117218, "learning_rate": 1.9270837350724444e-05, "loss": 0.7029, "step": 1943 }, { "epoch": 0.7439724454649828, "grad_norm": 0.6604312658309937, "learning_rate": 1.9269907815087407e-05, "loss": 0.6679, "step": 1944 }, { "epoch": 0.744355147340222, "grad_norm": 0.5668247938156128, "learning_rate": 1.926897770978963e-05, "loss": 0.702, "step": 1945 }, { "epoch": 0.7447378492154612, "grad_norm": 0.5155757665634155, "learning_rate": 1.9268047034888272e-05, "loss": 0.6636, "step": 1946 }, { "epoch": 0.7451205510907003, "grad_norm": 0.5536683797836304, "learning_rate": 1.9267115790440526e-05, "loss": 0.667, "step": 1947 }, { "epoch": 0.7455032529659396, "grad_norm": 0.647136390209198, "learning_rate": 1.9266183976503616e-05, "loss": 0.6903, "step": 1948 }, { "epoch": 0.7458859548411787, "grad_norm": 0.5688952207565308, "learning_rate": 1.9265251593134807e-05, "loss": 0.8497, "step": 1949 }, { "epoch": 0.746268656716418, "grad_norm": 0.6436936259269714, "learning_rate": 1.92643186403914e-05, "loss": 0.7529, "step": 1950 }, { "epoch": 0.7466513585916571, "grad_norm": 0.5691399574279785, "learning_rate": 1.926338511833072e-05, "loss": 0.7013, "step": 1951 }, { "epoch": 0.7470340604668962, "grad_norm": 0.5146892070770264, "learning_rate": 1.9262451027010143e-05, "loss": 0.7309, "step": 1952 }, { "epoch": 0.7474167623421355, "grad_norm": 0.5383132100105286, "learning_rate": 1.9261516366487062e-05, "loss": 0.6083, "step": 1953 }, { "epoch": 0.7477994642173746, "grad_norm": 0.5885443091392517, "learning_rate": 1.9260581136818923e-05, "loss": 0.7032, "step": 1954 }, { "epoch": 0.7481821660926139, "grad_norm": 0.6102935075759888, "learning_rate": 1.9259645338063193e-05, "loss": 0.5847, "step": 1955 }, { "epoch": 0.748564867967853, "grad_norm": 0.5791459083557129, "learning_rate": 1.925870897027738e-05, "loss": 0.7484, "step": 1956 }, { "epoch": 0.7489475698430922, "grad_norm": 0.5150757431983948, "learning_rate": 1.9257772033519032e-05, "loss": 0.665, "step": 1957 }, { "epoch": 0.7493302717183314, "grad_norm": 0.537121057510376, "learning_rate": 1.925683452784572e-05, "loss": 0.671, "step": 1958 }, { "epoch": 0.7497129735935706, "grad_norm": 0.6060574054718018, "learning_rate": 1.9255896453315054e-05, "loss": 0.646, "step": 1959 }, { "epoch": 0.7500956754688098, "grad_norm": 0.49905192852020264, "learning_rate": 1.925495780998469e-05, "loss": 0.6633, "step": 1960 }, { "epoch": 0.750478377344049, "grad_norm": 0.5165275931358337, "learning_rate": 1.9254018597912302e-05, "loss": 0.6079, "step": 1961 }, { "epoch": 0.7508610792192881, "grad_norm": 0.5717229247093201, "learning_rate": 1.925307881715561e-05, "loss": 0.7204, "step": 1962 }, { "epoch": 0.7512437810945274, "grad_norm": 0.5728121399879456, "learning_rate": 1.925213846777237e-05, "loss": 0.7158, "step": 1963 }, { "epoch": 0.7516264829697665, "grad_norm": 0.5310381054878235, "learning_rate": 1.925119754982036e-05, "loss": 0.659, "step": 1964 }, { "epoch": 0.7520091848450058, "grad_norm": 0.7134553790092468, "learning_rate": 1.925025606335741e-05, "loss": 0.6904, "step": 1965 }, { "epoch": 0.7523918867202449, "grad_norm": 0.5235660076141357, "learning_rate": 1.9249314008441375e-05, "loss": 0.7143, "step": 1966 }, { "epoch": 0.7527745885954841, "grad_norm": 0.5169049501419067, "learning_rate": 1.9248371385130144e-05, "loss": 0.6211, "step": 1967 }, { "epoch": 0.7531572904707233, "grad_norm": 0.5448389053344727, "learning_rate": 1.924742819348165e-05, "loss": 0.6607, "step": 1968 }, { "epoch": 0.7535399923459625, "grad_norm": 0.5206490755081177, "learning_rate": 1.9246484433553844e-05, "loss": 0.6702, "step": 1969 }, { "epoch": 0.7539226942212017, "grad_norm": 0.505161702632904, "learning_rate": 1.9245540105404733e-05, "loss": 0.657, "step": 1970 }, { "epoch": 0.7543053960964409, "grad_norm": 0.540014386177063, "learning_rate": 1.924459520909234e-05, "loss": 0.6678, "step": 1971 }, { "epoch": 0.75468809797168, "grad_norm": 0.5043179392814636, "learning_rate": 1.9243649744674745e-05, "loss": 0.6712, "step": 1972 }, { "epoch": 0.7550707998469193, "grad_norm": 0.5089402794837952, "learning_rate": 1.9242703712210035e-05, "loss": 0.6267, "step": 1973 }, { "epoch": 0.7554535017221584, "grad_norm": 0.5160390734672546, "learning_rate": 1.924175711175635e-05, "loss": 0.6449, "step": 1974 }, { "epoch": 0.7558362035973977, "grad_norm": 0.5189663767814636, "learning_rate": 1.9240809943371864e-05, "loss": 0.7238, "step": 1975 }, { "epoch": 0.7562189054726368, "grad_norm": 0.4849562644958496, "learning_rate": 1.923986220711478e-05, "loss": 0.6316, "step": 1976 }, { "epoch": 0.756601607347876, "grad_norm": 0.5198491811752319, "learning_rate": 1.9238913903043345e-05, "loss": 0.7505, "step": 1977 }, { "epoch": 0.7569843092231152, "grad_norm": 0.7462255358695984, "learning_rate": 1.9237965031215826e-05, "loss": 0.6438, "step": 1978 }, { "epoch": 0.7573670110983544, "grad_norm": 0.5523308515548706, "learning_rate": 1.923701559169054e-05, "loss": 0.736, "step": 1979 }, { "epoch": 0.7577497129735936, "grad_norm": 0.5456462502479553, "learning_rate": 1.923606558452583e-05, "loss": 0.7624, "step": 1980 }, { "epoch": 0.7581324148488328, "grad_norm": 0.5377681851387024, "learning_rate": 1.923511500978008e-05, "loss": 0.6636, "step": 1981 }, { "epoch": 0.7585151167240719, "grad_norm": 0.5904213786125183, "learning_rate": 1.92341638675117e-05, "loss": 0.6391, "step": 1982 }, { "epoch": 0.7588978185993112, "grad_norm": 0.5250484943389893, "learning_rate": 1.923321215777914e-05, "loss": 0.6727, "step": 1983 }, { "epoch": 0.7592805204745503, "grad_norm": 0.5157269239425659, "learning_rate": 1.9232259880640894e-05, "loss": 0.6521, "step": 1984 }, { "epoch": 0.7596632223497896, "grad_norm": 0.6111825704574585, "learning_rate": 1.9231307036155472e-05, "loss": 0.7106, "step": 1985 }, { "epoch": 0.7600459242250287, "grad_norm": 0.5119845271110535, "learning_rate": 1.9230353624381436e-05, "loss": 0.7092, "step": 1986 }, { "epoch": 0.7604286261002678, "grad_norm": 0.5575418472290039, "learning_rate": 1.922939964537737e-05, "loss": 0.7118, "step": 1987 }, { "epoch": 0.7608113279755071, "grad_norm": 0.5526330471038818, "learning_rate": 1.92284450992019e-05, "loss": 0.6715, "step": 1988 }, { "epoch": 0.7611940298507462, "grad_norm": 0.4921630918979645, "learning_rate": 1.9227489985913687e-05, "loss": 0.5962, "step": 1989 }, { "epoch": 0.7615767317259855, "grad_norm": 0.6838455200195312, "learning_rate": 1.9226534305571428e-05, "loss": 0.6476, "step": 1990 }, { "epoch": 0.7619594336012246, "grad_norm": 0.509598433971405, "learning_rate": 1.9225578058233845e-05, "loss": 0.6754, "step": 1991 }, { "epoch": 0.7623421354764638, "grad_norm": 0.6063412427902222, "learning_rate": 1.922462124395971e-05, "loss": 0.7029, "step": 1992 }, { "epoch": 0.762724837351703, "grad_norm": 0.5657756328582764, "learning_rate": 1.9223663862807817e-05, "loss": 0.7322, "step": 1993 }, { "epoch": 0.7631075392269422, "grad_norm": 0.6237136721611023, "learning_rate": 1.9222705914836998e-05, "loss": 0.7288, "step": 1994 }, { "epoch": 0.7634902411021814, "grad_norm": 0.5713965892791748, "learning_rate": 1.9221747400106124e-05, "loss": 0.663, "step": 1995 }, { "epoch": 0.7638729429774206, "grad_norm": 0.49599793553352356, "learning_rate": 1.9220788318674098e-05, "loss": 0.8023, "step": 1996 }, { "epoch": 0.7642556448526597, "grad_norm": 0.5351353287696838, "learning_rate": 1.921982867059986e-05, "loss": 0.7033, "step": 1997 }, { "epoch": 0.764638346727899, "grad_norm": 0.6471113562583923, "learning_rate": 1.9218868455942378e-05, "loss": 0.7301, "step": 1998 }, { "epoch": 0.7650210486031381, "grad_norm": 0.5379230976104736, "learning_rate": 1.9217907674760666e-05, "loss": 0.6663, "step": 1999 }, { "epoch": 0.7654037504783774, "grad_norm": 0.5627852082252502, "learning_rate": 1.9216946327113765e-05, "loss": 0.7254, "step": 2000 }, { "epoch": 0.7657864523536165, "grad_norm": 0.5728996396064758, "learning_rate": 1.9215984413060746e-05, "loss": 0.6773, "step": 2001 }, { "epoch": 0.7661691542288557, "grad_norm": 0.5508913993835449, "learning_rate": 1.921502193266073e-05, "loss": 0.7098, "step": 2002 }, { "epoch": 0.7665518561040949, "grad_norm": 0.5150412321090698, "learning_rate": 1.921405888597286e-05, "loss": 0.6201, "step": 2003 }, { "epoch": 0.7669345579793341, "grad_norm": 0.5563148260116577, "learning_rate": 1.9213095273056318e-05, "loss": 0.6205, "step": 2004 }, { "epoch": 0.7673172598545733, "grad_norm": 0.584912896156311, "learning_rate": 1.921213109397032e-05, "loss": 0.6498, "step": 2005 }, { "epoch": 0.7676999617298125, "grad_norm": 0.5034434199333191, "learning_rate": 1.9211166348774117e-05, "loss": 0.655, "step": 2006 }, { "epoch": 0.7680826636050516, "grad_norm": 0.5769985914230347, "learning_rate": 1.9210201037526996e-05, "loss": 0.6475, "step": 2007 }, { "epoch": 0.7684653654802909, "grad_norm": 0.6037774682044983, "learning_rate": 1.920923516028828e-05, "loss": 0.6372, "step": 2008 }, { "epoch": 0.76884806735553, "grad_norm": 0.50749272108078, "learning_rate": 1.920826871711732e-05, "loss": 0.7201, "step": 2009 }, { "epoch": 0.7692307692307693, "grad_norm": 0.5959120988845825, "learning_rate": 1.9207301708073514e-05, "loss": 0.7247, "step": 2010 }, { "epoch": 0.7696134711060084, "grad_norm": 0.5399760603904724, "learning_rate": 1.920633413321628e-05, "loss": 0.6313, "step": 2011 }, { "epoch": 0.7699961729812476, "grad_norm": 0.5903515219688416, "learning_rate": 1.9205365992605078e-05, "loss": 0.7402, "step": 2012 }, { "epoch": 0.7703788748564868, "grad_norm": 0.5262232422828674, "learning_rate": 1.920439728629941e-05, "loss": 0.7731, "step": 2013 }, { "epoch": 0.770761576731726, "grad_norm": 0.5348026156425476, "learning_rate": 1.9203428014358797e-05, "loss": 0.5728, "step": 2014 }, { "epoch": 0.7711442786069652, "grad_norm": 0.6223875284194946, "learning_rate": 1.9202458176842813e-05, "loss": 0.7177, "step": 2015 }, { "epoch": 0.7715269804822044, "grad_norm": 0.5896192789077759, "learning_rate": 1.920148777381105e-05, "loss": 0.7323, "step": 2016 }, { "epoch": 0.7719096823574435, "grad_norm": 0.6358504891395569, "learning_rate": 1.920051680532314e-05, "loss": 0.6714, "step": 2017 }, { "epoch": 0.7722923842326828, "grad_norm": 0.5691512823104858, "learning_rate": 1.919954527143876e-05, "loss": 0.6819, "step": 2018 }, { "epoch": 0.7726750861079219, "grad_norm": 0.5379306674003601, "learning_rate": 1.9198573172217606e-05, "loss": 0.669, "step": 2019 }, { "epoch": 0.7730577879831612, "grad_norm": 0.5549985766410828, "learning_rate": 1.919760050771942e-05, "loss": 0.6375, "step": 2020 }, { "epoch": 0.7734404898584003, "grad_norm": 0.7526847720146179, "learning_rate": 1.919662727800397e-05, "loss": 0.7472, "step": 2021 }, { "epoch": 0.7738231917336394, "grad_norm": 0.5242685675621033, "learning_rate": 1.919565348313107e-05, "loss": 0.6144, "step": 2022 }, { "epoch": 0.7742058936088787, "grad_norm": 0.5418250560760498, "learning_rate": 1.919467912316056e-05, "loss": 0.6668, "step": 2023 }, { "epoch": 0.7745885954841178, "grad_norm": 0.5476940274238586, "learning_rate": 1.9193704198152316e-05, "loss": 0.737, "step": 2024 }, { "epoch": 0.7749712973593571, "grad_norm": 0.5517264008522034, "learning_rate": 1.919272870816625e-05, "loss": 0.6911, "step": 2025 }, { "epoch": 0.7753539992345962, "grad_norm": 0.5385427474975586, "learning_rate": 1.919175265326231e-05, "loss": 0.6573, "step": 2026 }, { "epoch": 0.7757367011098354, "grad_norm": 0.5249191522598267, "learning_rate": 1.9190776033500472e-05, "loss": 0.7152, "step": 2027 }, { "epoch": 0.7761194029850746, "grad_norm": 0.47937268018722534, "learning_rate": 1.918979884894076e-05, "loss": 0.6658, "step": 2028 }, { "epoch": 0.7765021048603138, "grad_norm": 0.5562266707420349, "learning_rate": 1.9188821099643217e-05, "loss": 0.6885, "step": 2029 }, { "epoch": 0.776884806735553, "grad_norm": 0.5117166638374329, "learning_rate": 1.9187842785667935e-05, "loss": 0.6781, "step": 2030 }, { "epoch": 0.7772675086107922, "grad_norm": 0.5161581039428711, "learning_rate": 1.9186863907075026e-05, "loss": 0.6939, "step": 2031 }, { "epoch": 0.7776502104860313, "grad_norm": 0.48864907026290894, "learning_rate": 1.9185884463924656e-05, "loss": 0.6756, "step": 2032 }, { "epoch": 0.7780329123612706, "grad_norm": 0.5728349685668945, "learning_rate": 1.9184904456277002e-05, "loss": 0.6485, "step": 2033 }, { "epoch": 0.7784156142365097, "grad_norm": 0.5951014757156372, "learning_rate": 1.9183923884192297e-05, "loss": 0.6761, "step": 2034 }, { "epoch": 0.778798316111749, "grad_norm": 0.5778472423553467, "learning_rate": 1.9182942747730796e-05, "loss": 0.7283, "step": 2035 }, { "epoch": 0.7791810179869881, "grad_norm": 0.5307621955871582, "learning_rate": 1.9181961046952795e-05, "loss": 0.7752, "step": 2036 }, { "epoch": 0.7795637198622273, "grad_norm": 0.566340446472168, "learning_rate": 1.918097878191862e-05, "loss": 0.7429, "step": 2037 }, { "epoch": 0.7799464217374665, "grad_norm": 0.5374646782875061, "learning_rate": 1.917999595268863e-05, "loss": 0.6181, "step": 2038 }, { "epoch": 0.7803291236127057, "grad_norm": 0.6123586893081665, "learning_rate": 1.917901255932323e-05, "loss": 0.6862, "step": 2039 }, { "epoch": 0.7807118254879449, "grad_norm": 0.5349882245063782, "learning_rate": 1.917802860188285e-05, "loss": 0.6339, "step": 2040 }, { "epoch": 0.7810945273631841, "grad_norm": 0.5455144047737122, "learning_rate": 1.9177044080427957e-05, "loss": 0.6051, "step": 2041 }, { "epoch": 0.7814772292384232, "grad_norm": 0.5724525451660156, "learning_rate": 1.917605899501905e-05, "loss": 0.644, "step": 2042 }, { "epoch": 0.7818599311136625, "grad_norm": 0.5525273084640503, "learning_rate": 1.9175073345716665e-05, "loss": 0.759, "step": 2043 }, { "epoch": 0.7822426329889016, "grad_norm": 0.5073444843292236, "learning_rate": 1.9174087132581372e-05, "loss": 0.649, "step": 2044 }, { "epoch": 0.7826253348641409, "grad_norm": 0.559809148311615, "learning_rate": 1.917310035567378e-05, "loss": 0.7228, "step": 2045 }, { "epoch": 0.78300803673938, "grad_norm": 0.5855394601821899, "learning_rate": 1.917211301505453e-05, "loss": 0.7458, "step": 2046 }, { "epoch": 0.7833907386146192, "grad_norm": 0.6232478022575378, "learning_rate": 1.9171125110784295e-05, "loss": 0.6768, "step": 2047 }, { "epoch": 0.7837734404898584, "grad_norm": 0.5795620679855347, "learning_rate": 1.9170136642923784e-05, "loss": 0.6537, "step": 2048 }, { "epoch": 0.7841561423650976, "grad_norm": 0.5185753107070923, "learning_rate": 1.9169147611533737e-05, "loss": 0.6814, "step": 2049 }, { "epoch": 0.7845388442403368, "grad_norm": 0.46485820412635803, "learning_rate": 1.9168158016674937e-05, "loss": 0.6968, "step": 2050 }, { "epoch": 0.784921546115576, "grad_norm": 0.5598355531692505, "learning_rate": 1.91671678584082e-05, "loss": 0.6636, "step": 2051 }, { "epoch": 0.7853042479908151, "grad_norm": 0.5756394267082214, "learning_rate": 1.9166177136794367e-05, "loss": 0.7028, "step": 2052 }, { "epoch": 0.7856869498660544, "grad_norm": 0.5462226867675781, "learning_rate": 1.9165185851894327e-05, "loss": 0.7304, "step": 2053 }, { "epoch": 0.7860696517412935, "grad_norm": 0.5087311267852783, "learning_rate": 1.916419400376899e-05, "loss": 0.6495, "step": 2054 }, { "epoch": 0.7864523536165328, "grad_norm": 0.5554178953170776, "learning_rate": 1.9163201592479315e-05, "loss": 0.7437, "step": 2055 }, { "epoch": 0.7868350554917719, "grad_norm": 0.551313042640686, "learning_rate": 1.9162208618086287e-05, "loss": 0.7529, "step": 2056 }, { "epoch": 0.787217757367011, "grad_norm": 0.5162830352783203, "learning_rate": 1.9161215080650923e-05, "loss": 0.5838, "step": 2057 }, { "epoch": 0.7876004592422503, "grad_norm": 0.5121774077415466, "learning_rate": 1.9160220980234284e-05, "loss": 0.7163, "step": 2058 }, { "epoch": 0.7879831611174895, "grad_norm": 0.5413571000099182, "learning_rate": 1.9159226316897454e-05, "loss": 0.6354, "step": 2059 }, { "epoch": 0.7883658629927287, "grad_norm": 0.5307399034500122, "learning_rate": 1.915823109070156e-05, "loss": 0.6884, "step": 2060 }, { "epoch": 0.7887485648679678, "grad_norm": 0.5138938426971436, "learning_rate": 1.915723530170776e-05, "loss": 0.7248, "step": 2061 }, { "epoch": 0.789131266743207, "grad_norm": 0.5549039244651794, "learning_rate": 1.9156238949977254e-05, "loss": 0.6714, "step": 2062 }, { "epoch": 0.7895139686184462, "grad_norm": 0.5184617042541504, "learning_rate": 1.9155242035571268e-05, "loss": 0.612, "step": 2063 }, { "epoch": 0.7898966704936854, "grad_norm": 0.5149233341217041, "learning_rate": 1.915424455855106e-05, "loss": 0.7198, "step": 2064 }, { "epoch": 0.7902793723689246, "grad_norm": 0.574002742767334, "learning_rate": 1.915324651897793e-05, "loss": 0.711, "step": 2065 }, { "epoch": 0.7906620742441638, "grad_norm": 0.6037258505821228, "learning_rate": 1.9152247916913214e-05, "loss": 0.732, "step": 2066 }, { "epoch": 0.7910447761194029, "grad_norm": 0.5340920090675354, "learning_rate": 1.9151248752418278e-05, "loss": 0.7021, "step": 2067 }, { "epoch": 0.7914274779946422, "grad_norm": 0.557054340839386, "learning_rate": 1.9150249025554516e-05, "loss": 0.6905, "step": 2068 }, { "epoch": 0.7918101798698813, "grad_norm": 0.5235397815704346, "learning_rate": 1.914924873638337e-05, "loss": 0.676, "step": 2069 }, { "epoch": 0.7921928817451206, "grad_norm": 0.5839309096336365, "learning_rate": 1.9148247884966314e-05, "loss": 0.6696, "step": 2070 }, { "epoch": 0.7925755836203597, "grad_norm": 0.6205877661705017, "learning_rate": 1.9147246471364848e-05, "loss": 0.6469, "step": 2071 }, { "epoch": 0.7929582854955989, "grad_norm": 0.5210297107696533, "learning_rate": 1.914624449564051e-05, "loss": 0.6217, "step": 2072 }, { "epoch": 0.7933409873708381, "grad_norm": 0.5480197072029114, "learning_rate": 1.9145241957854875e-05, "loss": 0.6754, "step": 2073 }, { "epoch": 0.7937236892460773, "grad_norm": 0.6663427352905273, "learning_rate": 1.9144238858069556e-05, "loss": 0.6283, "step": 2074 }, { "epoch": 0.7941063911213165, "grad_norm": 0.609613299369812, "learning_rate": 1.9143235196346194e-05, "loss": 0.6375, "step": 2075 }, { "epoch": 0.7944890929965557, "grad_norm": 0.5188632011413574, "learning_rate": 1.9142230972746463e-05, "loss": 0.6101, "step": 2076 }, { "epoch": 0.7948717948717948, "grad_norm": 0.529485285282135, "learning_rate": 1.914122618733208e-05, "loss": 0.6865, "step": 2077 }, { "epoch": 0.7952544967470341, "grad_norm": 0.5795204639434814, "learning_rate": 1.914022084016479e-05, "loss": 0.7099, "step": 2078 }, { "epoch": 0.7956371986222732, "grad_norm": 0.5879075527191162, "learning_rate": 1.9139214931306372e-05, "loss": 0.702, "step": 2079 }, { "epoch": 0.7960199004975125, "grad_norm": 0.5287312269210815, "learning_rate": 1.9138208460818647e-05, "loss": 0.645, "step": 2080 }, { "epoch": 0.7964026023727516, "grad_norm": 0.5489304065704346, "learning_rate": 1.913720142876346e-05, "loss": 0.6855, "step": 2081 }, { "epoch": 0.7967853042479908, "grad_norm": 0.5626754760742188, "learning_rate": 1.91361938352027e-05, "loss": 0.619, "step": 2082 }, { "epoch": 0.79716800612323, "grad_norm": 0.47284913063049316, "learning_rate": 1.9135185680198284e-05, "loss": 0.6512, "step": 2083 }, { "epoch": 0.7975507079984692, "grad_norm": 0.4955426752567291, "learning_rate": 1.913417696381217e-05, "loss": 0.6724, "step": 2084 }, { "epoch": 0.7979334098737084, "grad_norm": 0.6089184880256653, "learning_rate": 1.9133167686106338e-05, "loss": 0.7461, "step": 2085 }, { "epoch": 0.7983161117489476, "grad_norm": 0.5618055462837219, "learning_rate": 1.913215784714282e-05, "loss": 0.7215, "step": 2086 }, { "epoch": 0.7986988136241867, "grad_norm": 0.49155259132385254, "learning_rate": 1.9131147446983663e-05, "loss": 0.6116, "step": 2087 }, { "epoch": 0.799081515499426, "grad_norm": 0.505859911441803, "learning_rate": 1.9130136485690966e-05, "loss": 0.6285, "step": 2088 }, { "epoch": 0.7994642173746651, "grad_norm": 0.697469174861908, "learning_rate": 1.9129124963326856e-05, "loss": 0.7243, "step": 2089 }, { "epoch": 0.7998469192499044, "grad_norm": 0.5414753556251526, "learning_rate": 1.912811287995349e-05, "loss": 0.8164, "step": 2090 }, { "epoch": 0.8002296211251435, "grad_norm": 0.5904883146286011, "learning_rate": 1.912710023563307e-05, "loss": 0.7429, "step": 2091 }, { "epoch": 0.8006123230003827, "grad_norm": 0.6672569513320923, "learning_rate": 1.9126087030427813e-05, "loss": 0.703, "step": 2092 }, { "epoch": 0.8009950248756219, "grad_norm": 0.5307592153549194, "learning_rate": 1.9125073264399993e-05, "loss": 0.8261, "step": 2093 }, { "epoch": 0.801377726750861, "grad_norm": 0.5156726241111755, "learning_rate": 1.912405893761191e-05, "loss": 0.7029, "step": 2094 }, { "epoch": 0.8017604286261003, "grad_norm": 0.5335444211959839, "learning_rate": 1.9123044050125892e-05, "loss": 0.7718, "step": 2095 }, { "epoch": 0.8021431305013395, "grad_norm": 0.5854575634002686, "learning_rate": 1.9122028602004307e-05, "loss": 0.6523, "step": 2096 }, { "epoch": 0.8025258323765786, "grad_norm": 0.4991576671600342, "learning_rate": 1.912101259330956e-05, "loss": 0.6129, "step": 2097 }, { "epoch": 0.8029085342518179, "grad_norm": 0.5656649470329285, "learning_rate": 1.9119996024104083e-05, "loss": 0.7142, "step": 2098 }, { "epoch": 0.803291236127057, "grad_norm": 0.5352083444595337, "learning_rate": 1.911897889445035e-05, "loss": 0.7046, "step": 2099 }, { "epoch": 0.8036739380022963, "grad_norm": 0.4954148828983307, "learning_rate": 1.911796120441087e-05, "loss": 0.6244, "step": 2100 }, { "epoch": 0.8040566398775354, "grad_norm": 0.5616485476493835, "learning_rate": 1.9116942954048175e-05, "loss": 0.6784, "step": 2101 }, { "epoch": 0.8044393417527745, "grad_norm": 0.6107630729675293, "learning_rate": 1.9115924143424843e-05, "loss": 0.6582, "step": 2102 }, { "epoch": 0.8048220436280138, "grad_norm": 0.5442965030670166, "learning_rate": 1.911490477260348e-05, "loss": 0.6903, "step": 2103 }, { "epoch": 0.8052047455032529, "grad_norm": 0.5437005758285522, "learning_rate": 1.9113884841646736e-05, "loss": 0.7197, "step": 2104 }, { "epoch": 0.8055874473784922, "grad_norm": 0.48110392689704895, "learning_rate": 1.911286435061728e-05, "loss": 0.6005, "step": 2105 }, { "epoch": 0.8059701492537313, "grad_norm": 0.5145890712738037, "learning_rate": 1.9111843299577833e-05, "loss": 0.612, "step": 2106 }, { "epoch": 0.8063528511289705, "grad_norm": 0.5903363227844238, "learning_rate": 1.9110821688591136e-05, "loss": 0.609, "step": 2107 }, { "epoch": 0.8067355530042097, "grad_norm": 0.5126757025718689, "learning_rate": 1.9109799517719966e-05, "loss": 0.6394, "step": 2108 }, { "epoch": 0.8071182548794489, "grad_norm": 0.5002396702766418, "learning_rate": 1.9108776787027145e-05, "loss": 0.6815, "step": 2109 }, { "epoch": 0.8075009567546881, "grad_norm": 0.5444457530975342, "learning_rate": 1.910775349657552e-05, "loss": 0.7073, "step": 2110 }, { "epoch": 0.8078836586299273, "grad_norm": 0.6209067106246948, "learning_rate": 1.9106729646427973e-05, "loss": 0.7202, "step": 2111 }, { "epoch": 0.8082663605051664, "grad_norm": 0.5934401154518127, "learning_rate": 1.910570523664743e-05, "loss": 0.6958, "step": 2112 }, { "epoch": 0.8086490623804057, "grad_norm": 0.5570154190063477, "learning_rate": 1.9104680267296827e-05, "loss": 0.6749, "step": 2113 }, { "epoch": 0.8090317642556448, "grad_norm": 0.5509297847747803, "learning_rate": 1.910365473843917e-05, "loss": 0.6631, "step": 2114 }, { "epoch": 0.8094144661308841, "grad_norm": 0.5598213076591492, "learning_rate": 1.9102628650137468e-05, "loss": 0.6427, "step": 2115 }, { "epoch": 0.8097971680061232, "grad_norm": 0.7804170250892639, "learning_rate": 1.9101602002454785e-05, "loss": 0.6571, "step": 2116 }, { "epoch": 0.8101798698813624, "grad_norm": 0.4993966519832611, "learning_rate": 1.9100574795454204e-05, "loss": 0.6648, "step": 2117 }, { "epoch": 0.8105625717566016, "grad_norm": 0.5515482425689697, "learning_rate": 1.909954702919886e-05, "loss": 0.664, "step": 2118 }, { "epoch": 0.8109452736318408, "grad_norm": 0.5608924627304077, "learning_rate": 1.90985187037519e-05, "loss": 0.7064, "step": 2119 }, { "epoch": 0.81132797550708, "grad_norm": 0.6408185958862305, "learning_rate": 1.9097489819176522e-05, "loss": 0.7201, "step": 2120 }, { "epoch": 0.8117106773823192, "grad_norm": 0.5714882612228394, "learning_rate": 1.9096460375535953e-05, "loss": 0.7257, "step": 2121 }, { "epoch": 0.8120933792575583, "grad_norm": 0.5743690729141235, "learning_rate": 1.9095430372893464e-05, "loss": 0.784, "step": 2122 }, { "epoch": 0.8124760811327976, "grad_norm": 0.5765624046325684, "learning_rate": 1.9094399811312337e-05, "loss": 0.6878, "step": 2123 }, { "epoch": 0.8128587830080367, "grad_norm": 0.5841130018234253, "learning_rate": 1.909336869085591e-05, "loss": 0.6538, "step": 2124 }, { "epoch": 0.813241484883276, "grad_norm": 0.4914608597755432, "learning_rate": 1.909233701158755e-05, "loss": 0.6645, "step": 2125 }, { "epoch": 0.8136241867585151, "grad_norm": 0.5361962914466858, "learning_rate": 1.9091304773570653e-05, "loss": 0.6757, "step": 2126 }, { "epoch": 0.8140068886337543, "grad_norm": 0.5551583170890808, "learning_rate": 1.9090271976868654e-05, "loss": 0.6995, "step": 2127 }, { "epoch": 0.8143895905089935, "grad_norm": 0.6141475439071655, "learning_rate": 1.9089238621545022e-05, "loss": 0.7054, "step": 2128 }, { "epoch": 0.8147722923842327, "grad_norm": 0.5292297005653381, "learning_rate": 1.9088204707663256e-05, "loss": 0.6569, "step": 2129 }, { "epoch": 0.8151549942594719, "grad_norm": 0.5221912860870361, "learning_rate": 1.9087170235286898e-05, "loss": 0.6402, "step": 2130 }, { "epoch": 0.815537696134711, "grad_norm": 0.539077877998352, "learning_rate": 1.9086135204479516e-05, "loss": 0.7171, "step": 2131 }, { "epoch": 0.8159203980099502, "grad_norm": 0.5428900718688965, "learning_rate": 1.9085099615304715e-05, "loss": 0.7953, "step": 2132 }, { "epoch": 0.8163030998851895, "grad_norm": 0.5239648818969727, "learning_rate": 1.9084063467826137e-05, "loss": 0.6836, "step": 2133 }, { "epoch": 0.8166858017604286, "grad_norm": 0.5200762152671814, "learning_rate": 1.9083026762107453e-05, "loss": 0.6181, "step": 2134 }, { "epoch": 0.8170685036356679, "grad_norm": 0.5478535890579224, "learning_rate": 1.9081989498212373e-05, "loss": 0.8254, "step": 2135 }, { "epoch": 0.817451205510907, "grad_norm": 0.5428305864334106, "learning_rate": 1.9080951676204642e-05, "loss": 0.6179, "step": 2136 }, { "epoch": 0.8178339073861461, "grad_norm": 0.5201843976974487, "learning_rate": 1.907991329614803e-05, "loss": 0.6883, "step": 2137 }, { "epoch": 0.8182166092613854, "grad_norm": 0.5803486108779907, "learning_rate": 1.9078874358106357e-05, "loss": 0.614, "step": 2138 }, { "epoch": 0.8185993111366245, "grad_norm": 0.539722204208374, "learning_rate": 1.9077834862143465e-05, "loss": 0.7728, "step": 2139 }, { "epoch": 0.8189820130118638, "grad_norm": 0.5094454884529114, "learning_rate": 1.907679480832323e-05, "loss": 0.7508, "step": 2140 }, { "epoch": 0.8193647148871029, "grad_norm": 0.5245801210403442, "learning_rate": 1.9075754196709574e-05, "loss": 0.6648, "step": 2141 }, { "epoch": 0.8197474167623421, "grad_norm": 0.5615615844726562, "learning_rate": 1.9074713027366436e-05, "loss": 0.7365, "step": 2142 }, { "epoch": 0.8201301186375813, "grad_norm": 0.5734975934028625, "learning_rate": 1.9073671300357802e-05, "loss": 0.6894, "step": 2143 }, { "epoch": 0.8205128205128205, "grad_norm": 0.5041593313217163, "learning_rate": 1.9072629015747695e-05, "loss": 0.6661, "step": 2144 }, { "epoch": 0.8208955223880597, "grad_norm": 0.565788209438324, "learning_rate": 1.9071586173600162e-05, "loss": 0.6431, "step": 2145 }, { "epoch": 0.8212782242632989, "grad_norm": 0.528772234916687, "learning_rate": 1.9070542773979284e-05, "loss": 0.7757, "step": 2146 }, { "epoch": 0.821660926138538, "grad_norm": 0.5806124806404114, "learning_rate": 1.9069498816949185e-05, "loss": 0.754, "step": 2147 }, { "epoch": 0.8220436280137773, "grad_norm": 0.5186758041381836, "learning_rate": 1.906845430257402e-05, "loss": 0.6406, "step": 2148 }, { "epoch": 0.8224263298890164, "grad_norm": 0.5334534645080566, "learning_rate": 1.9067409230917974e-05, "loss": 0.7433, "step": 2149 }, { "epoch": 0.8228090317642557, "grad_norm": 0.5699700117111206, "learning_rate": 1.9066363602045276e-05, "loss": 0.6572, "step": 2150 }, { "epoch": 0.8231917336394948, "grad_norm": 0.5564050078392029, "learning_rate": 1.9065317416020176e-05, "loss": 0.6888, "step": 2151 }, { "epoch": 0.823574435514734, "grad_norm": 0.5358455181121826, "learning_rate": 1.9064270672906962e-05, "loss": 0.6185, "step": 2152 }, { "epoch": 0.8239571373899732, "grad_norm": 0.5014235377311707, "learning_rate": 1.906322337276997e-05, "loss": 0.6931, "step": 2153 }, { "epoch": 0.8243398392652124, "grad_norm": 0.531119704246521, "learning_rate": 1.906217551567355e-05, "loss": 0.5963, "step": 2154 }, { "epoch": 0.8247225411404516, "grad_norm": 0.5798108577728271, "learning_rate": 1.90611271016821e-05, "loss": 0.6776, "step": 2155 }, { "epoch": 0.8251052430156908, "grad_norm": 0.591350793838501, "learning_rate": 1.906007813086005e-05, "loss": 0.7248, "step": 2156 }, { "epoch": 0.8254879448909299, "grad_norm": 0.5554988384246826, "learning_rate": 1.9059028603271856e-05, "loss": 0.7625, "step": 2157 }, { "epoch": 0.8258706467661692, "grad_norm": 0.5988894104957581, "learning_rate": 1.9057978518982016e-05, "loss": 0.8039, "step": 2158 }, { "epoch": 0.8262533486414083, "grad_norm": 0.5713659524917603, "learning_rate": 1.9056927878055062e-05, "loss": 0.7184, "step": 2159 }, { "epoch": 0.8266360505166476, "grad_norm": 0.547332763671875, "learning_rate": 1.905587668055556e-05, "loss": 0.7049, "step": 2160 }, { "epoch": 0.8270187523918867, "grad_norm": 0.567808210849762, "learning_rate": 1.905482492654811e-05, "loss": 0.7316, "step": 2161 }, { "epoch": 0.8274014542671259, "grad_norm": 0.5863398909568787, "learning_rate": 1.905377261609734e-05, "loss": 0.7206, "step": 2162 }, { "epoch": 0.8277841561423651, "grad_norm": 0.5517461895942688, "learning_rate": 1.905271974926792e-05, "loss": 0.6625, "step": 2163 }, { "epoch": 0.8281668580176043, "grad_norm": 0.5713918209075928, "learning_rate": 1.905166632612455e-05, "loss": 0.7404, "step": 2164 }, { "epoch": 0.8285495598928435, "grad_norm": 0.5380989909172058, "learning_rate": 1.9050612346731968e-05, "loss": 0.622, "step": 2165 }, { "epoch": 0.8289322617680827, "grad_norm": 0.4985780417919159, "learning_rate": 1.9049557811154943e-05, "loss": 0.6515, "step": 2166 }, { "epoch": 0.8293149636433218, "grad_norm": 0.6258448362350464, "learning_rate": 1.904850271945828e-05, "loss": 0.7275, "step": 2167 }, { "epoch": 0.8296976655185611, "grad_norm": 0.547441303730011, "learning_rate": 1.9047447071706816e-05, "loss": 0.7536, "step": 2168 }, { "epoch": 0.8300803673938002, "grad_norm": 0.5583265423774719, "learning_rate": 1.9046390867965422e-05, "loss": 0.7313, "step": 2169 }, { "epoch": 0.8304630692690395, "grad_norm": 0.5424691438674927, "learning_rate": 1.904533410829901e-05, "loss": 0.7276, "step": 2170 }, { "epoch": 0.8308457711442786, "grad_norm": 0.6491246819496155, "learning_rate": 1.9044276792772512e-05, "loss": 0.6664, "step": 2171 }, { "epoch": 0.8312284730195177, "grad_norm": 0.5366805791854858, "learning_rate": 1.904321892145091e-05, "loss": 0.6895, "step": 2172 }, { "epoch": 0.831611174894757, "grad_norm": 0.5115657448768616, "learning_rate": 1.9042160494399212e-05, "loss": 0.7527, "step": 2173 }, { "epoch": 0.8319938767699961, "grad_norm": 0.59858238697052, "learning_rate": 1.904110151168246e-05, "loss": 0.761, "step": 2174 }, { "epoch": 0.8323765786452354, "grad_norm": 0.4911678433418274, "learning_rate": 1.9040041973365725e-05, "loss": 0.6893, "step": 2175 }, { "epoch": 0.8327592805204745, "grad_norm": 0.5977715253829956, "learning_rate": 1.903898187951413e-05, "loss": 0.714, "step": 2176 }, { "epoch": 0.8331419823957137, "grad_norm": 0.546829104423523, "learning_rate": 1.9037921230192815e-05, "loss": 0.6559, "step": 2177 }, { "epoch": 0.8335246842709529, "grad_norm": 0.5301912426948547, "learning_rate": 1.903686002546696e-05, "loss": 0.6519, "step": 2178 }, { "epoch": 0.8339073861461921, "grad_norm": 0.5640426874160767, "learning_rate": 1.9035798265401778e-05, "loss": 0.6882, "step": 2179 }, { "epoch": 0.8342900880214313, "grad_norm": 0.540257453918457, "learning_rate": 1.903473595006252e-05, "loss": 0.6807, "step": 2180 }, { "epoch": 0.8346727898966705, "grad_norm": 0.5671212673187256, "learning_rate": 1.9033673079514466e-05, "loss": 0.8063, "step": 2181 }, { "epoch": 0.8350554917719096, "grad_norm": 0.5494288802146912, "learning_rate": 1.9032609653822932e-05, "loss": 0.6266, "step": 2182 }, { "epoch": 0.8354381936471489, "grad_norm": 0.5839433670043945, "learning_rate": 1.9031545673053266e-05, "loss": 0.6128, "step": 2183 }, { "epoch": 0.835820895522388, "grad_norm": 0.5645805597305298, "learning_rate": 1.903048113727086e-05, "loss": 0.7069, "step": 2184 }, { "epoch": 0.8362035973976273, "grad_norm": 0.5968703031539917, "learning_rate": 1.9029416046541125e-05, "loss": 0.7036, "step": 2185 }, { "epoch": 0.8365862992728664, "grad_norm": 0.5425674915313721, "learning_rate": 1.9028350400929516e-05, "loss": 0.672, "step": 2186 }, { "epoch": 0.8369690011481056, "grad_norm": 0.6507921814918518, "learning_rate": 1.902728420050152e-05, "loss": 0.7523, "step": 2187 }, { "epoch": 0.8373517030233448, "grad_norm": 0.510736882686615, "learning_rate": 1.902621744532266e-05, "loss": 0.684, "step": 2188 }, { "epoch": 0.837734404898584, "grad_norm": 0.5142451524734497, "learning_rate": 1.9025150135458487e-05, "loss": 0.659, "step": 2189 }, { "epoch": 0.8381171067738232, "grad_norm": 0.7356585264205933, "learning_rate": 1.9024082270974595e-05, "loss": 0.703, "step": 2190 }, { "epoch": 0.8384998086490624, "grad_norm": 0.5426244735717773, "learning_rate": 1.9023013851936603e-05, "loss": 0.6722, "step": 2191 }, { "epoch": 0.8388825105243015, "grad_norm": 0.5628562569618225, "learning_rate": 1.902194487841017e-05, "loss": 0.7072, "step": 2192 }, { "epoch": 0.8392652123995408, "grad_norm": 0.5392438769340515, "learning_rate": 1.9020875350460984e-05, "loss": 0.6855, "step": 2193 }, { "epoch": 0.8396479142747799, "grad_norm": 0.5231603384017944, "learning_rate": 1.9019805268154775e-05, "loss": 0.6462, "step": 2194 }, { "epoch": 0.8400306161500192, "grad_norm": 0.5619972348213196, "learning_rate": 1.9018734631557304e-05, "loss": 0.6536, "step": 2195 }, { "epoch": 0.8404133180252583, "grad_norm": 0.5081596970558167, "learning_rate": 1.901766344073436e-05, "loss": 0.6902, "step": 2196 }, { "epoch": 0.8407960199004975, "grad_norm": 0.5113386511802673, "learning_rate": 1.901659169575177e-05, "loss": 0.6154, "step": 2197 }, { "epoch": 0.8411787217757367, "grad_norm": 0.5153869986534119, "learning_rate": 1.9015519396675395e-05, "loss": 0.6652, "step": 2198 }, { "epoch": 0.8415614236509759, "grad_norm": 0.5175416469573975, "learning_rate": 1.9014446543571137e-05, "loss": 0.7092, "step": 2199 }, { "epoch": 0.8419441255262151, "grad_norm": 0.5548285245895386, "learning_rate": 1.901337313650492e-05, "loss": 0.6573, "step": 2200 }, { "epoch": 0.8423268274014543, "grad_norm": 0.523889422416687, "learning_rate": 1.901229917554271e-05, "loss": 0.6069, "step": 2201 }, { "epoch": 0.8427095292766934, "grad_norm": 0.5390679240226746, "learning_rate": 1.90112246607505e-05, "loss": 0.6814, "step": 2202 }, { "epoch": 0.8430922311519327, "grad_norm": 0.5099616646766663, "learning_rate": 1.9010149592194334e-05, "loss": 0.6552, "step": 2203 }, { "epoch": 0.8434749330271718, "grad_norm": 0.510606586933136, "learning_rate": 1.9009073969940264e-05, "loss": 0.5851, "step": 2204 }, { "epoch": 0.8438576349024111, "grad_norm": 0.7332969903945923, "learning_rate": 1.9007997794054398e-05, "loss": 0.6654, "step": 2205 }, { "epoch": 0.8442403367776502, "grad_norm": 0.48656976222991943, "learning_rate": 1.9006921064602867e-05, "loss": 0.6383, "step": 2206 }, { "epoch": 0.8446230386528893, "grad_norm": 0.5406942367553711, "learning_rate": 1.900584378165184e-05, "loss": 0.7154, "step": 2207 }, { "epoch": 0.8450057405281286, "grad_norm": 0.5146381258964539, "learning_rate": 1.9004765945267518e-05, "loss": 0.6319, "step": 2208 }, { "epoch": 0.8453884424033677, "grad_norm": 0.5233501195907593, "learning_rate": 1.9003687555516137e-05, "loss": 0.6895, "step": 2209 }, { "epoch": 0.845771144278607, "grad_norm": 0.5838407278060913, "learning_rate": 1.900260861246397e-05, "loss": 0.6501, "step": 2210 }, { "epoch": 0.8461538461538461, "grad_norm": 0.5260056853294373, "learning_rate": 1.9001529116177315e-05, "loss": 0.6618, "step": 2211 }, { "epoch": 0.8465365480290853, "grad_norm": 0.548311173915863, "learning_rate": 1.9000449066722516e-05, "loss": 0.6724, "step": 2212 }, { "epoch": 0.8469192499043245, "grad_norm": 0.5347469449043274, "learning_rate": 1.8999368464165943e-05, "loss": 0.7285, "step": 2213 }, { "epoch": 0.8473019517795637, "grad_norm": 0.6435689926147461, "learning_rate": 1.8998287308573996e-05, "loss": 0.7438, "step": 2214 }, { "epoch": 0.8476846536548029, "grad_norm": 0.5399803519248962, "learning_rate": 1.8997205600013124e-05, "loss": 0.5725, "step": 2215 }, { "epoch": 0.8480673555300421, "grad_norm": 0.47036314010620117, "learning_rate": 1.8996123338549798e-05, "loss": 0.6736, "step": 2216 }, { "epoch": 0.8484500574052812, "grad_norm": 0.4950539469718933, "learning_rate": 1.899504052425052e-05, "loss": 0.6644, "step": 2217 }, { "epoch": 0.8488327592805205, "grad_norm": 0.5822178721427917, "learning_rate": 1.899395715718184e-05, "loss": 0.6648, "step": 2218 }, { "epoch": 0.8492154611557596, "grad_norm": 0.5690000653266907, "learning_rate": 1.899287323741033e-05, "loss": 0.6859, "step": 2219 }, { "epoch": 0.8495981630309989, "grad_norm": 0.5896934270858765, "learning_rate": 1.89917887650026e-05, "loss": 0.7241, "step": 2220 }, { "epoch": 0.849980864906238, "grad_norm": 0.5868052840232849, "learning_rate": 1.8990703740025296e-05, "loss": 0.708, "step": 2221 }, { "epoch": 0.8503635667814772, "grad_norm": 0.5917852520942688, "learning_rate": 1.8989618162545092e-05, "loss": 0.7663, "step": 2222 }, { "epoch": 0.8507462686567164, "grad_norm": 0.6619212627410889, "learning_rate": 1.8988532032628702e-05, "loss": 0.6689, "step": 2223 }, { "epoch": 0.8511289705319556, "grad_norm": 0.606124222278595, "learning_rate": 1.898744535034287e-05, "loss": 0.6983, "step": 2224 }, { "epoch": 0.8515116724071948, "grad_norm": 0.6227298974990845, "learning_rate": 1.8986358115754378e-05, "loss": 0.6486, "step": 2225 }, { "epoch": 0.851894374282434, "grad_norm": 0.5361924767494202, "learning_rate": 1.8985270328930036e-05, "loss": 0.7124, "step": 2226 }, { "epoch": 0.8522770761576731, "grad_norm": 0.5105598568916321, "learning_rate": 1.8984181989936697e-05, "loss": 0.6139, "step": 2227 }, { "epoch": 0.8526597780329124, "grad_norm": 0.5386499762535095, "learning_rate": 1.8983093098841236e-05, "loss": 0.6603, "step": 2228 }, { "epoch": 0.8530424799081515, "grad_norm": 0.5569700598716736, "learning_rate": 1.898200365571057e-05, "loss": 0.6953, "step": 2229 }, { "epoch": 0.8534251817833908, "grad_norm": 0.513370931148529, "learning_rate": 1.898091366061165e-05, "loss": 0.6653, "step": 2230 }, { "epoch": 0.8538078836586299, "grad_norm": 0.5130563974380493, "learning_rate": 1.8979823113611462e-05, "loss": 0.6776, "step": 2231 }, { "epoch": 0.8541905855338691, "grad_norm": 0.46241307258605957, "learning_rate": 1.8978732014777014e-05, "loss": 0.6491, "step": 2232 }, { "epoch": 0.8545732874091083, "grad_norm": 0.6019946932792664, "learning_rate": 1.8977640364175367e-05, "loss": 0.7149, "step": 2233 }, { "epoch": 0.8549559892843475, "grad_norm": 0.5693157315254211, "learning_rate": 1.89765481618736e-05, "loss": 0.7396, "step": 2234 }, { "epoch": 0.8553386911595867, "grad_norm": 0.5239552855491638, "learning_rate": 1.897545540793883e-05, "loss": 0.6916, "step": 2235 }, { "epoch": 0.8557213930348259, "grad_norm": 0.5537455081939697, "learning_rate": 1.8974362102438215e-05, "loss": 0.6446, "step": 2236 }, { "epoch": 0.856104094910065, "grad_norm": 0.5443614721298218, "learning_rate": 1.8973268245438943e-05, "loss": 0.7036, "step": 2237 }, { "epoch": 0.8564867967853043, "grad_norm": 0.5539005994796753, "learning_rate": 1.8972173837008225e-05, "loss": 0.6882, "step": 2238 }, { "epoch": 0.8568694986605434, "grad_norm": 0.5154517292976379, "learning_rate": 1.8971078877213323e-05, "loss": 0.6525, "step": 2239 }, { "epoch": 0.8572522005357827, "grad_norm": 0.5497653484344482, "learning_rate": 1.8969983366121524e-05, "loss": 0.6916, "step": 2240 }, { "epoch": 0.8576349024110218, "grad_norm": 0.5206841826438904, "learning_rate": 1.896888730380015e-05, "loss": 0.6974, "step": 2241 }, { "epoch": 0.858017604286261, "grad_norm": 0.4973767399787903, "learning_rate": 1.8967790690316555e-05, "loss": 0.7523, "step": 2242 }, { "epoch": 0.8584003061615002, "grad_norm": 0.49616336822509766, "learning_rate": 1.896669352573813e-05, "loss": 0.6668, "step": 2243 }, { "epoch": 0.8587830080367393, "grad_norm": 0.546058714389801, "learning_rate": 1.89655958101323e-05, "loss": 0.7315, "step": 2244 }, { "epoch": 0.8591657099119786, "grad_norm": 0.5046958923339844, "learning_rate": 1.896449754356652e-05, "loss": 0.6125, "step": 2245 }, { "epoch": 0.8595484117872177, "grad_norm": 0.546845018863678, "learning_rate": 1.8963398726108284e-05, "loss": 0.6795, "step": 2246 }, { "epoch": 0.8599311136624569, "grad_norm": 0.5560593605041504, "learning_rate": 1.8962299357825115e-05, "loss": 0.6847, "step": 2247 }, { "epoch": 0.8603138155376961, "grad_norm": 0.5817855596542358, "learning_rate": 1.8961199438784576e-05, "loss": 0.6399, "step": 2248 }, { "epoch": 0.8606965174129353, "grad_norm": 0.5224149823188782, "learning_rate": 1.8960098969054253e-05, "loss": 0.7012, "step": 2249 }, { "epoch": 0.8610792192881745, "grad_norm": 0.5107572674751282, "learning_rate": 1.8958997948701784e-05, "loss": 0.6296, "step": 2250 }, { "epoch": 0.8614619211634137, "grad_norm": 0.5142855048179626, "learning_rate": 1.8957896377794817e-05, "loss": 0.6806, "step": 2251 }, { "epoch": 0.8618446230386528, "grad_norm": 0.5397214293479919, "learning_rate": 1.8956794256401056e-05, "loss": 0.7065, "step": 2252 }, { "epoch": 0.8622273249138921, "grad_norm": 0.587313711643219, "learning_rate": 1.8955691584588222e-05, "loss": 0.7121, "step": 2253 }, { "epoch": 0.8626100267891312, "grad_norm": 0.5812573432922363, "learning_rate": 1.8954588362424084e-05, "loss": 0.7152, "step": 2254 }, { "epoch": 0.8629927286643705, "grad_norm": 0.5675926804542542, "learning_rate": 1.895348458997643e-05, "loss": 0.6367, "step": 2255 }, { "epoch": 0.8633754305396096, "grad_norm": 0.590056836605072, "learning_rate": 1.89523802673131e-05, "loss": 0.6629, "step": 2256 }, { "epoch": 0.8637581324148488, "grad_norm": 0.5007932782173157, "learning_rate": 1.895127539450195e-05, "loss": 0.633, "step": 2257 }, { "epoch": 0.864140834290088, "grad_norm": 0.551047146320343, "learning_rate": 1.8950169971610876e-05, "loss": 0.6731, "step": 2258 }, { "epoch": 0.8645235361653272, "grad_norm": 0.5829090476036072, "learning_rate": 1.8949063998707817e-05, "loss": 0.6483, "step": 2259 }, { "epoch": 0.8649062380405664, "grad_norm": 0.5180804133415222, "learning_rate": 1.8947957475860736e-05, "loss": 0.5965, "step": 2260 }, { "epoch": 0.8652889399158056, "grad_norm": 0.549214243888855, "learning_rate": 1.8946850403137623e-05, "loss": 0.6621, "step": 2261 }, { "epoch": 0.8656716417910447, "grad_norm": 0.5474393367767334, "learning_rate": 1.8945742780606518e-05, "loss": 0.6766, "step": 2262 }, { "epoch": 0.866054343666284, "grad_norm": 0.583943247795105, "learning_rate": 1.894463460833549e-05, "loss": 0.6402, "step": 2263 }, { "epoch": 0.8664370455415231, "grad_norm": 0.5505298376083374, "learning_rate": 1.8943525886392635e-05, "loss": 0.656, "step": 2264 }, { "epoch": 0.8668197474167624, "grad_norm": 0.5505328178405762, "learning_rate": 1.8942416614846086e-05, "loss": 0.6386, "step": 2265 }, { "epoch": 0.8672024492920015, "grad_norm": 0.5288465619087219, "learning_rate": 1.8941306793764015e-05, "loss": 0.7311, "step": 2266 }, { "epoch": 0.8675851511672407, "grad_norm": 0.5294044613838196, "learning_rate": 1.8940196423214616e-05, "loss": 0.6625, "step": 2267 }, { "epoch": 0.8679678530424799, "grad_norm": 0.4711208939552307, "learning_rate": 1.8939085503266135e-05, "loss": 0.6584, "step": 2268 }, { "epoch": 0.8683505549177191, "grad_norm": 0.5943913459777832, "learning_rate": 1.8937974033986834e-05, "loss": 0.7205, "step": 2269 }, { "epoch": 0.8687332567929583, "grad_norm": 0.5382689833641052, "learning_rate": 1.8936862015445013e-05, "loss": 0.7898, "step": 2270 }, { "epoch": 0.8691159586681975, "grad_norm": 0.616124153137207, "learning_rate": 1.893574944770901e-05, "loss": 0.7113, "step": 2271 }, { "epoch": 0.8694986605434366, "grad_norm": 0.5482029318809509, "learning_rate": 1.8934636330847205e-05, "loss": 0.6416, "step": 2272 }, { "epoch": 0.8698813624186759, "grad_norm": 0.5308566689491272, "learning_rate": 1.8933522664927993e-05, "loss": 0.5936, "step": 2273 }, { "epoch": 0.870264064293915, "grad_norm": 0.5740435719490051, "learning_rate": 1.893240845001981e-05, "loss": 0.7103, "step": 2274 }, { "epoch": 0.8706467661691543, "grad_norm": 0.4915345013141632, "learning_rate": 1.8931293686191132e-05, "loss": 0.6444, "step": 2275 }, { "epoch": 0.8710294680443934, "grad_norm": 0.5520486235618591, "learning_rate": 1.8930178373510464e-05, "loss": 0.6997, "step": 2276 }, { "epoch": 0.8714121699196326, "grad_norm": 0.5470171570777893, "learning_rate": 1.8929062512046346e-05, "loss": 0.6662, "step": 2277 }, { "epoch": 0.8717948717948718, "grad_norm": 0.531303882598877, "learning_rate": 1.8927946101867348e-05, "loss": 0.6715, "step": 2278 }, { "epoch": 0.872177573670111, "grad_norm": 0.5352877378463745, "learning_rate": 1.8926829143042075e-05, "loss": 0.6942, "step": 2279 }, { "epoch": 0.8725602755453502, "grad_norm": 0.6136537790298462, "learning_rate": 1.892571163563917e-05, "loss": 0.7335, "step": 2280 }, { "epoch": 0.8729429774205893, "grad_norm": 0.5086985230445862, "learning_rate": 1.892459357972731e-05, "loss": 0.7, "step": 2281 }, { "epoch": 0.8733256792958285, "grad_norm": 0.5519249439239502, "learning_rate": 1.8923474975375195e-05, "loss": 0.7249, "step": 2282 }, { "epoch": 0.8737083811710677, "grad_norm": 0.5921387672424316, "learning_rate": 1.892235582265157e-05, "loss": 0.7803, "step": 2283 }, { "epoch": 0.8740910830463069, "grad_norm": 0.5441846251487732, "learning_rate": 1.892123612162521e-05, "loss": 0.7457, "step": 2284 }, { "epoch": 0.8744737849215461, "grad_norm": 0.5264445543289185, "learning_rate": 1.892011587236492e-05, "loss": 0.6626, "step": 2285 }, { "epoch": 0.8748564867967853, "grad_norm": 0.5198737978935242, "learning_rate": 1.891899507493955e-05, "loss": 0.6383, "step": 2286 }, { "epoch": 0.8752391886720245, "grad_norm": 0.5498431324958801, "learning_rate": 1.8917873729417974e-05, "loss": 0.7107, "step": 2287 }, { "epoch": 0.8756218905472637, "grad_norm": 0.5191537141799927, "learning_rate": 1.8916751835869095e-05, "loss": 0.6964, "step": 2288 }, { "epoch": 0.8760045924225028, "grad_norm": 0.5066076517105103, "learning_rate": 1.8915629394361862e-05, "loss": 0.664, "step": 2289 }, { "epoch": 0.8763872942977421, "grad_norm": 0.6167012453079224, "learning_rate": 1.8914506404965252e-05, "loss": 0.6488, "step": 2290 }, { "epoch": 0.8767699961729812, "grad_norm": 0.5609169006347656, "learning_rate": 1.8913382867748275e-05, "loss": 0.6857, "step": 2291 }, { "epoch": 0.8771526980482205, "grad_norm": 0.5107314586639404, "learning_rate": 1.8912258782779976e-05, "loss": 0.7036, "step": 2292 }, { "epoch": 0.8775353999234596, "grad_norm": 0.649328351020813, "learning_rate": 1.891113415012943e-05, "loss": 0.7781, "step": 2293 }, { "epoch": 0.8779181017986988, "grad_norm": 0.5486709475517273, "learning_rate": 1.891000896986575e-05, "loss": 0.7088, "step": 2294 }, { "epoch": 0.878300803673938, "grad_norm": 0.7092811465263367, "learning_rate": 1.8908883242058084e-05, "loss": 0.7192, "step": 2295 }, { "epoch": 0.8786835055491772, "grad_norm": 0.571381151676178, "learning_rate": 1.890775696677561e-05, "loss": 0.6334, "step": 2296 }, { "epoch": 0.8790662074244164, "grad_norm": 0.5252084732055664, "learning_rate": 1.890663014408754e-05, "loss": 0.7112, "step": 2297 }, { "epoch": 0.8794489092996556, "grad_norm": 0.5136157870292664, "learning_rate": 1.8905502774063115e-05, "loss": 0.667, "step": 2298 }, { "epoch": 0.8798316111748947, "grad_norm": 0.5348042845726013, "learning_rate": 1.8904374856771624e-05, "loss": 0.6619, "step": 2299 }, { "epoch": 0.880214313050134, "grad_norm": 0.5407000184059143, "learning_rate": 1.8903246392282376e-05, "loss": 0.6743, "step": 2300 }, { "epoch": 0.8805970149253731, "grad_norm": 0.562211275100708, "learning_rate": 1.890211738066472e-05, "loss": 0.6695, "step": 2301 }, { "epoch": 0.8809797168006124, "grad_norm": 0.5136865973472595, "learning_rate": 1.8900987821988038e-05, "loss": 0.6592, "step": 2302 }, { "epoch": 0.8813624186758515, "grad_norm": 0.5673660039901733, "learning_rate": 1.8899857716321737e-05, "loss": 0.6539, "step": 2303 }, { "epoch": 0.8817451205510907, "grad_norm": 0.5392155051231384, "learning_rate": 1.8898727063735268e-05, "loss": 0.6788, "step": 2304 }, { "epoch": 0.8821278224263299, "grad_norm": 0.5635517835617065, "learning_rate": 1.889759586429812e-05, "loss": 0.6674, "step": 2305 }, { "epoch": 0.8825105243015691, "grad_norm": 0.6220323443412781, "learning_rate": 1.8896464118079804e-05, "loss": 0.6765, "step": 2306 }, { "epoch": 0.8828932261768083, "grad_norm": 0.5441206097602844, "learning_rate": 1.889533182514986e-05, "loss": 0.747, "step": 2307 }, { "epoch": 0.8832759280520475, "grad_norm": 0.5624338388442993, "learning_rate": 1.8894198985577885e-05, "loss": 0.629, "step": 2308 }, { "epoch": 0.8836586299272866, "grad_norm": 0.5376296639442444, "learning_rate": 1.8893065599433485e-05, "loss": 0.6941, "step": 2309 }, { "epoch": 0.8840413318025259, "grad_norm": 0.5309484601020813, "learning_rate": 1.889193166678631e-05, "loss": 0.6672, "step": 2310 }, { "epoch": 0.884424033677765, "grad_norm": 0.7597054243087769, "learning_rate": 1.8890797187706052e-05, "loss": 0.6755, "step": 2311 }, { "epoch": 0.8848067355530043, "grad_norm": 0.5799849033355713, "learning_rate": 1.8889662162262418e-05, "loss": 0.6979, "step": 2312 }, { "epoch": 0.8851894374282434, "grad_norm": 0.5219376683235168, "learning_rate": 1.8888526590525165e-05, "loss": 0.692, "step": 2313 }, { "epoch": 0.8855721393034826, "grad_norm": 0.5500605702400208, "learning_rate": 1.8887390472564073e-05, "loss": 0.6659, "step": 2314 }, { "epoch": 0.8859548411787218, "grad_norm": 0.5041612982749939, "learning_rate": 1.888625380844896e-05, "loss": 0.7206, "step": 2315 }, { "epoch": 0.886337543053961, "grad_norm": 0.5424149632453918, "learning_rate": 1.888511659824968e-05, "loss": 0.7021, "step": 2316 }, { "epoch": 0.8867202449292002, "grad_norm": 0.549458384513855, "learning_rate": 1.8883978842036114e-05, "loss": 0.5987, "step": 2317 }, { "epoch": 0.8871029468044394, "grad_norm": 0.5443065166473389, "learning_rate": 1.8882840539878178e-05, "loss": 0.6776, "step": 2318 }, { "epoch": 0.8874856486796785, "grad_norm": 0.516432523727417, "learning_rate": 1.8881701691845827e-05, "loss": 0.6803, "step": 2319 }, { "epoch": 0.8878683505549178, "grad_norm": 0.5450880527496338, "learning_rate": 1.888056229800905e-05, "loss": 0.6995, "step": 2320 }, { "epoch": 0.8882510524301569, "grad_norm": 0.5616101026535034, "learning_rate": 1.8879422358437863e-05, "loss": 0.7251, "step": 2321 }, { "epoch": 0.8886337543053962, "grad_norm": 0.5294768810272217, "learning_rate": 1.8878281873202315e-05, "loss": 0.6875, "step": 2322 }, { "epoch": 0.8890164561806353, "grad_norm": 0.6771349310874939, "learning_rate": 1.8877140842372496e-05, "loss": 0.7116, "step": 2323 }, { "epoch": 0.8893991580558744, "grad_norm": 0.535860002040863, "learning_rate": 1.8875999266018524e-05, "loss": 0.6897, "step": 2324 }, { "epoch": 0.8897818599311137, "grad_norm": 0.5338298678398132, "learning_rate": 1.887485714421055e-05, "loss": 0.7269, "step": 2325 }, { "epoch": 0.8901645618063528, "grad_norm": 0.6157966256141663, "learning_rate": 1.8873714477018764e-05, "loss": 0.6973, "step": 2326 }, { "epoch": 0.8905472636815921, "grad_norm": 0.5834113359451294, "learning_rate": 1.8872571264513384e-05, "loss": 0.6573, "step": 2327 }, { "epoch": 0.8909299655568312, "grad_norm": 0.5796091556549072, "learning_rate": 1.8871427506764665e-05, "loss": 0.7183, "step": 2328 }, { "epoch": 0.8913126674320704, "grad_norm": 0.5503896474838257, "learning_rate": 1.8870283203842888e-05, "loss": 0.7093, "step": 2329 }, { "epoch": 0.8916953693073096, "grad_norm": 0.581525981426239, "learning_rate": 1.886913835581838e-05, "loss": 0.6496, "step": 2330 }, { "epoch": 0.8920780711825488, "grad_norm": 0.5966241955757141, "learning_rate": 1.8867992962761495e-05, "loss": 0.8231, "step": 2331 }, { "epoch": 0.892460773057788, "grad_norm": 0.8431763052940369, "learning_rate": 1.8866847024742617e-05, "loss": 0.6502, "step": 2332 }, { "epoch": 0.8928434749330272, "grad_norm": 0.5371373295783997, "learning_rate": 1.886570054183217e-05, "loss": 0.7528, "step": 2333 }, { "epoch": 0.8932261768082663, "grad_norm": 0.53851717710495, "learning_rate": 1.886455351410061e-05, "loss": 0.7472, "step": 2334 }, { "epoch": 0.8936088786835056, "grad_norm": 0.5782144665718079, "learning_rate": 1.886340594161842e-05, "loss": 0.6036, "step": 2335 }, { "epoch": 0.8939915805587447, "grad_norm": 0.5757951140403748, "learning_rate": 1.886225782445612e-05, "loss": 0.689, "step": 2336 }, { "epoch": 0.894374282433984, "grad_norm": 0.4919421672821045, "learning_rate": 1.886110916268427e-05, "loss": 0.634, "step": 2337 }, { "epoch": 0.8947569843092231, "grad_norm": 0.5365685224533081, "learning_rate": 1.8859959956373456e-05, "loss": 0.7276, "step": 2338 }, { "epoch": 0.8951396861844623, "grad_norm": 0.5551812052726746, "learning_rate": 1.8858810205594304e-05, "loss": 0.6715, "step": 2339 }, { "epoch": 0.8955223880597015, "grad_norm": 0.5788233876228333, "learning_rate": 1.885765991041746e-05, "loss": 0.7109, "step": 2340 }, { "epoch": 0.8959050899349407, "grad_norm": 0.541479766368866, "learning_rate": 1.8856509070913625e-05, "loss": 0.7149, "step": 2341 }, { "epoch": 0.8962877918101799, "grad_norm": 0.559782087802887, "learning_rate": 1.885535768715351e-05, "loss": 0.7283, "step": 2342 }, { "epoch": 0.8966704936854191, "grad_norm": 0.502561628818512, "learning_rate": 1.885420575920788e-05, "loss": 0.6924, "step": 2343 }, { "epoch": 0.8970531955606582, "grad_norm": 0.5336918234825134, "learning_rate": 1.8853053287147512e-05, "loss": 0.6633, "step": 2344 }, { "epoch": 0.8974358974358975, "grad_norm": 0.5033063292503357, "learning_rate": 1.8851900271043243e-05, "loss": 0.6652, "step": 2345 }, { "epoch": 0.8978185993111366, "grad_norm": 0.5209484696388245, "learning_rate": 1.8850746710965918e-05, "loss": 0.5758, "step": 2346 }, { "epoch": 0.8982013011863759, "grad_norm": 0.5578566789627075, "learning_rate": 1.884959260698643e-05, "loss": 0.63, "step": 2347 }, { "epoch": 0.898584003061615, "grad_norm": 0.5455530881881714, "learning_rate": 1.8848437959175703e-05, "loss": 0.7294, "step": 2348 }, { "epoch": 0.8989667049368542, "grad_norm": 0.665499210357666, "learning_rate": 1.884728276760469e-05, "loss": 0.6586, "step": 2349 }, { "epoch": 0.8993494068120934, "grad_norm": 0.5069064497947693, "learning_rate": 1.884612703234439e-05, "loss": 0.6181, "step": 2350 }, { "epoch": 0.8997321086873326, "grad_norm": 0.5046830773353577, "learning_rate": 1.8844970753465813e-05, "loss": 0.7782, "step": 2351 }, { "epoch": 0.9001148105625718, "grad_norm": 0.550284743309021, "learning_rate": 1.884381393104002e-05, "loss": 0.7232, "step": 2352 }, { "epoch": 0.900497512437811, "grad_norm": 0.5948193073272705, "learning_rate": 1.8842656565138107e-05, "loss": 0.6994, "step": 2353 }, { "epoch": 0.9008802143130501, "grad_norm": 0.48913976550102234, "learning_rate": 1.884149865583119e-05, "loss": 0.5608, "step": 2354 }, { "epoch": 0.9012629161882894, "grad_norm": 0.5252088308334351, "learning_rate": 1.8840340203190427e-05, "loss": 0.7133, "step": 2355 }, { "epoch": 0.9016456180635285, "grad_norm": 0.503268837928772, "learning_rate": 1.883918120728701e-05, "loss": 0.6111, "step": 2356 }, { "epoch": 0.9020283199387678, "grad_norm": 0.5134750008583069, "learning_rate": 1.883802166819216e-05, "loss": 0.7709, "step": 2357 }, { "epoch": 0.9024110218140069, "grad_norm": 0.610112726688385, "learning_rate": 1.8836861585977138e-05, "loss": 0.6648, "step": 2358 }, { "epoch": 0.902793723689246, "grad_norm": 0.5335385203361511, "learning_rate": 1.883570096071323e-05, "loss": 0.6894, "step": 2359 }, { "epoch": 0.9031764255644853, "grad_norm": 0.5207417011260986, "learning_rate": 1.883453979247176e-05, "loss": 0.6625, "step": 2360 }, { "epoch": 0.9035591274397244, "grad_norm": 0.5515102744102478, "learning_rate": 1.883337808132409e-05, "loss": 0.6943, "step": 2361 }, { "epoch": 0.9039418293149637, "grad_norm": 0.5564811825752258, "learning_rate": 1.88322158273416e-05, "loss": 0.6443, "step": 2362 }, { "epoch": 0.9043245311902028, "grad_norm": 0.5694364309310913, "learning_rate": 1.8831053030595723e-05, "loss": 0.6339, "step": 2363 }, { "epoch": 0.904707233065442, "grad_norm": 0.5241259932518005, "learning_rate": 1.882988969115791e-05, "loss": 0.6165, "step": 2364 }, { "epoch": 0.9050899349406812, "grad_norm": 0.4890950918197632, "learning_rate": 1.8828725809099657e-05, "loss": 0.6425, "step": 2365 }, { "epoch": 0.9054726368159204, "grad_norm": 0.5155773162841797, "learning_rate": 1.8827561384492482e-05, "loss": 0.7154, "step": 2366 }, { "epoch": 0.9058553386911596, "grad_norm": 0.6193656921386719, "learning_rate": 1.8826396417407952e-05, "loss": 0.6985, "step": 2367 }, { "epoch": 0.9062380405663988, "grad_norm": 0.5089056491851807, "learning_rate": 1.882523090791764e-05, "loss": 0.6852, "step": 2368 }, { "epoch": 0.9066207424416379, "grad_norm": 0.5832858681678772, "learning_rate": 1.8824064856093187e-05, "loss": 0.7079, "step": 2369 }, { "epoch": 0.9070034443168772, "grad_norm": 0.5154018402099609, "learning_rate": 1.882289826200624e-05, "loss": 0.7507, "step": 2370 }, { "epoch": 0.9073861461921163, "grad_norm": 0.5249848365783691, "learning_rate": 1.8821731125728493e-05, "loss": 0.7827, "step": 2371 }, { "epoch": 0.9077688480673556, "grad_norm": 0.5394926071166992, "learning_rate": 1.882056344733167e-05, "loss": 0.6572, "step": 2372 }, { "epoch": 0.9081515499425947, "grad_norm": 0.5929439067840576, "learning_rate": 1.8819395226887524e-05, "loss": 0.6661, "step": 2373 }, { "epoch": 0.9085342518178339, "grad_norm": 0.6309826970100403, "learning_rate": 1.881822646446785e-05, "loss": 0.6617, "step": 2374 }, { "epoch": 0.9089169536930731, "grad_norm": 0.5654677152633667, "learning_rate": 1.8817057160144466e-05, "loss": 0.6854, "step": 2375 }, { "epoch": 0.9092996555683123, "grad_norm": 0.5439044237136841, "learning_rate": 1.881588731398924e-05, "loss": 0.6595, "step": 2376 }, { "epoch": 0.9096823574435515, "grad_norm": 0.6070613265037537, "learning_rate": 1.881471692607405e-05, "loss": 0.6355, "step": 2377 }, { "epoch": 0.9100650593187907, "grad_norm": 0.5354739427566528, "learning_rate": 1.8813545996470825e-05, "loss": 0.7618, "step": 2378 }, { "epoch": 0.9104477611940298, "grad_norm": 0.57608962059021, "learning_rate": 1.881237452525152e-05, "loss": 0.7704, "step": 2379 }, { "epoch": 0.9108304630692691, "grad_norm": 0.4809272587299347, "learning_rate": 1.8811202512488126e-05, "loss": 0.6287, "step": 2380 }, { "epoch": 0.9112131649445082, "grad_norm": 0.5648482441902161, "learning_rate": 1.881002995825267e-05, "loss": 0.7472, "step": 2381 }, { "epoch": 0.9115958668197475, "grad_norm": 0.5374367833137512, "learning_rate": 1.88088568626172e-05, "loss": 0.7262, "step": 2382 }, { "epoch": 0.9119785686949866, "grad_norm": 0.5355334877967834, "learning_rate": 1.8807683225653815e-05, "loss": 0.6726, "step": 2383 }, { "epoch": 0.9123612705702258, "grad_norm": 0.5435540676116943, "learning_rate": 1.8806509047434633e-05, "loss": 0.6706, "step": 2384 }, { "epoch": 0.912743972445465, "grad_norm": 0.5214954614639282, "learning_rate": 1.880533432803181e-05, "loss": 0.6448, "step": 2385 }, { "epoch": 0.9131266743207042, "grad_norm": 0.5257498621940613, "learning_rate": 1.8804159067517535e-05, "loss": 0.726, "step": 2386 }, { "epoch": 0.9135093761959434, "grad_norm": 0.5655977129936218, "learning_rate": 1.8802983265964035e-05, "loss": 0.6579, "step": 2387 }, { "epoch": 0.9138920780711826, "grad_norm": 0.5810930728912354, "learning_rate": 1.8801806923443566e-05, "loss": 0.6158, "step": 2388 }, { "epoch": 0.9142747799464217, "grad_norm": 0.5117050409317017, "learning_rate": 1.8800630040028415e-05, "loss": 0.6729, "step": 2389 }, { "epoch": 0.914657481821661, "grad_norm": 0.5574482679367065, "learning_rate": 1.8799452615790907e-05, "loss": 0.7415, "step": 2390 }, { "epoch": 0.9150401836969001, "grad_norm": 0.6100360751152039, "learning_rate": 1.879827465080339e-05, "loss": 0.6544, "step": 2391 }, { "epoch": 0.9154228855721394, "grad_norm": 0.8719677925109863, "learning_rate": 1.8797096145138264e-05, "loss": 0.6578, "step": 2392 }, { "epoch": 0.9158055874473785, "grad_norm": 0.599470853805542, "learning_rate": 1.8795917098867946e-05, "loss": 0.6405, "step": 2393 }, { "epoch": 0.9161882893226176, "grad_norm": 0.5538224577903748, "learning_rate": 1.879473751206489e-05, "loss": 0.6748, "step": 2394 }, { "epoch": 0.9165709911978569, "grad_norm": 0.5210962891578674, "learning_rate": 1.879355738480159e-05, "loss": 0.5932, "step": 2395 }, { "epoch": 0.916953693073096, "grad_norm": 0.5480614304542542, "learning_rate": 1.8792376717150566e-05, "loss": 0.6798, "step": 2396 }, { "epoch": 0.9173363949483353, "grad_norm": 0.5528345704078674, "learning_rate": 1.8791195509184366e-05, "loss": 0.7531, "step": 2397 }, { "epoch": 0.9177190968235744, "grad_norm": 0.6698029637336731, "learning_rate": 1.8790013760975593e-05, "loss": 0.7343, "step": 2398 }, { "epoch": 0.9181017986988136, "grad_norm": 0.6260066628456116, "learning_rate": 1.8788831472596853e-05, "loss": 0.6865, "step": 2399 }, { "epoch": 0.9184845005740528, "grad_norm": 0.5725381970405579, "learning_rate": 1.878764864412081e-05, "loss": 0.7642, "step": 2400 }, { "epoch": 0.918867202449292, "grad_norm": 0.6128880977630615, "learning_rate": 1.878646527562015e-05, "loss": 0.6917, "step": 2401 }, { "epoch": 0.9192499043245312, "grad_norm": 0.7735032439231873, "learning_rate": 1.8785281367167596e-05, "loss": 0.7188, "step": 2402 }, { "epoch": 0.9196326061997704, "grad_norm": 0.5887590050697327, "learning_rate": 1.8784096918835897e-05, "loss": 0.8232, "step": 2403 }, { "epoch": 0.9200153080750095, "grad_norm": 0.5293758511543274, "learning_rate": 1.8782911930697847e-05, "loss": 0.6819, "step": 2404 }, { "epoch": 0.9203980099502488, "grad_norm": 0.6229714155197144, "learning_rate": 1.878172640282626e-05, "loss": 0.6795, "step": 2405 }, { "epoch": 0.9207807118254879, "grad_norm": 0.7043350338935852, "learning_rate": 1.8780540335293994e-05, "loss": 0.6981, "step": 2406 }, { "epoch": 0.9211634137007272, "grad_norm": 0.6043265461921692, "learning_rate": 1.877935372817394e-05, "loss": 0.7206, "step": 2407 }, { "epoch": 0.9215461155759663, "grad_norm": 0.5265960693359375, "learning_rate": 1.877816658153901e-05, "loss": 0.6503, "step": 2408 }, { "epoch": 0.9219288174512055, "grad_norm": 0.501714289188385, "learning_rate": 1.8776978895462164e-05, "loss": 0.6444, "step": 2409 }, { "epoch": 0.9223115193264447, "grad_norm": 0.5428301692008972, "learning_rate": 1.8775790670016383e-05, "loss": 0.6593, "step": 2410 }, { "epoch": 0.9226942212016839, "grad_norm": 0.741820752620697, "learning_rate": 1.877460190527469e-05, "loss": 0.6761, "step": 2411 }, { "epoch": 0.9230769230769231, "grad_norm": 0.5132878422737122, "learning_rate": 1.8773412601310138e-05, "loss": 0.6061, "step": 2412 }, { "epoch": 0.9234596249521623, "grad_norm": 0.5516854524612427, "learning_rate": 1.877222275819581e-05, "loss": 0.6647, "step": 2413 }, { "epoch": 0.9238423268274014, "grad_norm": 0.5468112230300903, "learning_rate": 1.8771032376004828e-05, "loss": 0.6022, "step": 2414 }, { "epoch": 0.9242250287026407, "grad_norm": 0.5856238603591919, "learning_rate": 1.8769841454810343e-05, "loss": 0.7591, "step": 2415 }, { "epoch": 0.9246077305778798, "grad_norm": 0.4816870391368866, "learning_rate": 1.876864999468554e-05, "loss": 0.6288, "step": 2416 }, { "epoch": 0.9249904324531191, "grad_norm": 0.5273502469062805, "learning_rate": 1.876745799570364e-05, "loss": 0.6814, "step": 2417 }, { "epoch": 0.9253731343283582, "grad_norm": 0.5483396649360657, "learning_rate": 1.876626545793789e-05, "loss": 0.6447, "step": 2418 }, { "epoch": 0.9257558362035974, "grad_norm": 0.5050948858261108, "learning_rate": 1.876507238146158e-05, "loss": 0.704, "step": 2419 }, { "epoch": 0.9261385380788366, "grad_norm": 0.5400721430778503, "learning_rate": 1.8763878766348026e-05, "loss": 0.7737, "step": 2420 }, { "epoch": 0.9265212399540758, "grad_norm": 0.6075438857078552, "learning_rate": 1.876268461267057e-05, "loss": 0.7018, "step": 2421 }, { "epoch": 0.926903941829315, "grad_norm": 0.4986242651939392, "learning_rate": 1.876148992050261e-05, "loss": 0.7244, "step": 2422 }, { "epoch": 0.9272866437045542, "grad_norm": 0.7161392569541931, "learning_rate": 1.8760294689917556e-05, "loss": 0.6238, "step": 2423 }, { "epoch": 0.9276693455797933, "grad_norm": 0.5724283456802368, "learning_rate": 1.875909892098886e-05, "loss": 0.8362, "step": 2424 }, { "epoch": 0.9280520474550326, "grad_norm": 0.5652454495429993, "learning_rate": 1.8757902613790002e-05, "loss": 0.6426, "step": 2425 }, { "epoch": 0.9284347493302717, "grad_norm": 0.4913378953933716, "learning_rate": 1.87567057683945e-05, "loss": 0.6301, "step": 2426 }, { "epoch": 0.928817451205511, "grad_norm": 0.5447685718536377, "learning_rate": 1.8755508384875903e-05, "loss": 0.6774, "step": 2427 }, { "epoch": 0.9292001530807501, "grad_norm": 0.5226560831069946, "learning_rate": 1.8754310463307796e-05, "loss": 0.6235, "step": 2428 }, { "epoch": 0.9295828549559892, "grad_norm": 0.5380014181137085, "learning_rate": 1.8753112003763792e-05, "loss": 0.6034, "step": 2429 }, { "epoch": 0.9299655568312285, "grad_norm": 0.4946866035461426, "learning_rate": 1.8751913006317542e-05, "loss": 0.7043, "step": 2430 }, { "epoch": 0.9303482587064676, "grad_norm": 0.5909304022789001, "learning_rate": 1.8750713471042722e-05, "loss": 0.8012, "step": 2431 }, { "epoch": 0.9307309605817069, "grad_norm": 0.5878859758377075, "learning_rate": 1.8749513398013053e-05, "loss": 0.6429, "step": 2432 }, { "epoch": 0.931113662456946, "grad_norm": 0.5623537302017212, "learning_rate": 1.8748312787302282e-05, "loss": 0.7223, "step": 2433 }, { "epoch": 0.9314963643321852, "grad_norm": 0.5557054877281189, "learning_rate": 1.8747111638984185e-05, "loss": 0.7141, "step": 2434 }, { "epoch": 0.9318790662074244, "grad_norm": 0.5359923839569092, "learning_rate": 1.874590995313258e-05, "loss": 0.6878, "step": 2435 }, { "epoch": 0.9322617680826636, "grad_norm": 0.5302033424377441, "learning_rate": 1.8744707729821313e-05, "loss": 0.6649, "step": 2436 }, { "epoch": 0.9326444699579028, "grad_norm": 0.544875979423523, "learning_rate": 1.874350496912426e-05, "loss": 0.6781, "step": 2437 }, { "epoch": 0.933027171833142, "grad_norm": 0.5071401596069336, "learning_rate": 1.8742301671115342e-05, "loss": 0.623, "step": 2438 }, { "epoch": 0.9334098737083811, "grad_norm": 0.48915764689445496, "learning_rate": 1.87410978358685e-05, "loss": 0.6821, "step": 2439 }, { "epoch": 0.9337925755836204, "grad_norm": 0.5403692722320557, "learning_rate": 1.873989346345771e-05, "loss": 0.6825, "step": 2440 }, { "epoch": 0.9341752774588595, "grad_norm": 0.5674801468849182, "learning_rate": 1.8738688553956987e-05, "loss": 0.714, "step": 2441 }, { "epoch": 0.9345579793340988, "grad_norm": 0.5048158168792725, "learning_rate": 1.8737483107440378e-05, "loss": 0.6674, "step": 2442 }, { "epoch": 0.9349406812093379, "grad_norm": 0.5099855065345764, "learning_rate": 1.873627712398196e-05, "loss": 0.6538, "step": 2443 }, { "epoch": 0.9353233830845771, "grad_norm": 0.500532329082489, "learning_rate": 1.873507060365584e-05, "loss": 0.5899, "step": 2444 }, { "epoch": 0.9357060849598163, "grad_norm": 0.5178689360618591, "learning_rate": 1.8733863546536167e-05, "loss": 0.6814, "step": 2445 }, { "epoch": 0.9360887868350555, "grad_norm": 0.5527170300483704, "learning_rate": 1.8732655952697114e-05, "loss": 0.6434, "step": 2446 }, { "epoch": 0.9364714887102947, "grad_norm": 0.47672954201698303, "learning_rate": 1.873144782221289e-05, "loss": 0.6823, "step": 2447 }, { "epoch": 0.9368541905855339, "grad_norm": 0.534945547580719, "learning_rate": 1.8730239155157746e-05, "loss": 0.6775, "step": 2448 }, { "epoch": 0.937236892460773, "grad_norm": 0.5242097973823547, "learning_rate": 1.8729029951605947e-05, "loss": 0.7204, "step": 2449 }, { "epoch": 0.9376195943360123, "grad_norm": 0.5717682838439941, "learning_rate": 1.872782021163181e-05, "loss": 0.6605, "step": 2450 }, { "epoch": 0.9380022962112514, "grad_norm": 0.8140515089035034, "learning_rate": 1.8726609935309677e-05, "loss": 0.6734, "step": 2451 }, { "epoch": 0.9383849980864907, "grad_norm": 0.5281966328620911, "learning_rate": 1.8725399122713914e-05, "loss": 0.7396, "step": 2452 }, { "epoch": 0.9387676999617298, "grad_norm": 0.48519179224967957, "learning_rate": 1.8724187773918937e-05, "loss": 0.5754, "step": 2453 }, { "epoch": 0.939150401836969, "grad_norm": 0.48081696033477783, "learning_rate": 1.8722975888999183e-05, "loss": 0.7097, "step": 2454 }, { "epoch": 0.9395331037122082, "grad_norm": 0.5109480619430542, "learning_rate": 1.8721763468029125e-05, "loss": 0.6634, "step": 2455 }, { "epoch": 0.9399158055874474, "grad_norm": 0.5105809569358826, "learning_rate": 1.8720550511083273e-05, "loss": 0.7, "step": 2456 }, { "epoch": 0.9402985074626866, "grad_norm": 0.5235955119132996, "learning_rate": 1.8719337018236162e-05, "loss": 0.7251, "step": 2457 }, { "epoch": 0.9406812093379258, "grad_norm": 0.5229055881500244, "learning_rate": 1.871812298956237e-05, "loss": 0.7035, "step": 2458 }, { "epoch": 0.9410639112131649, "grad_norm": 0.5601255297660828, "learning_rate": 1.8716908425136498e-05, "loss": 0.7276, "step": 2459 }, { "epoch": 0.9414466130884042, "grad_norm": 0.5416072607040405, "learning_rate": 1.8715693325033185e-05, "loss": 0.5663, "step": 2460 }, { "epoch": 0.9418293149636433, "grad_norm": 0.4800127446651459, "learning_rate": 1.8714477689327104e-05, "loss": 0.6653, "step": 2461 }, { "epoch": 0.9422120168388826, "grad_norm": 0.5877089500427246, "learning_rate": 1.8713261518092956e-05, "loss": 0.6493, "step": 2462 }, { "epoch": 0.9425947187141217, "grad_norm": 0.4694516658782959, "learning_rate": 1.871204481140548e-05, "loss": 0.6548, "step": 2463 }, { "epoch": 0.9429774205893608, "grad_norm": 0.5524033904075623, "learning_rate": 1.8710827569339444e-05, "loss": 0.7282, "step": 2464 }, { "epoch": 0.9433601224646001, "grad_norm": 0.5500392913818359, "learning_rate": 1.8709609791969655e-05, "loss": 0.686, "step": 2465 }, { "epoch": 0.9437428243398392, "grad_norm": 0.47887593507766724, "learning_rate": 1.8708391479370945e-05, "loss": 0.6546, "step": 2466 }, { "epoch": 0.9441255262150785, "grad_norm": 0.5013276934623718, "learning_rate": 1.8707172631618184e-05, "loss": 0.67, "step": 2467 }, { "epoch": 0.9445082280903176, "grad_norm": 0.50898677110672, "learning_rate": 1.870595324878627e-05, "loss": 0.6307, "step": 2468 }, { "epoch": 0.9448909299655568, "grad_norm": 0.5781949162483215, "learning_rate": 1.8704733330950146e-05, "loss": 0.7144, "step": 2469 }, { "epoch": 0.945273631840796, "grad_norm": 0.4989788234233856, "learning_rate": 1.870351287818477e-05, "loss": 0.6698, "step": 2470 }, { "epoch": 0.9456563337160352, "grad_norm": 0.4959215819835663, "learning_rate": 1.8702291890565146e-05, "loss": 0.6661, "step": 2471 }, { "epoch": 0.9460390355912744, "grad_norm": 0.5457781553268433, "learning_rate": 1.8701070368166307e-05, "loss": 0.6972, "step": 2472 }, { "epoch": 0.9464217374665136, "grad_norm": 0.552743673324585, "learning_rate": 1.8699848311063317e-05, "loss": 0.6642, "step": 2473 }, { "epoch": 0.9468044393417527, "grad_norm": 0.5822766423225403, "learning_rate": 1.8698625719331278e-05, "loss": 0.7617, "step": 2474 }, { "epoch": 0.947187141216992, "grad_norm": 0.55756676197052, "learning_rate": 1.869740259304532e-05, "loss": 0.6625, "step": 2475 }, { "epoch": 0.9475698430922311, "grad_norm": 0.5614320039749146, "learning_rate": 1.8696178932280605e-05, "loss": 0.6461, "step": 2476 }, { "epoch": 0.9479525449674704, "grad_norm": 0.4960680902004242, "learning_rate": 1.8694954737112337e-05, "loss": 0.7154, "step": 2477 }, { "epoch": 0.9483352468427095, "grad_norm": 0.656847357749939, "learning_rate": 1.8693730007615737e-05, "loss": 0.7187, "step": 2478 }, { "epoch": 0.9487179487179487, "grad_norm": 0.5088282823562622, "learning_rate": 1.8692504743866074e-05, "loss": 0.6342, "step": 2479 }, { "epoch": 0.9491006505931879, "grad_norm": 0.5299701690673828, "learning_rate": 1.869127894593864e-05, "loss": 0.6896, "step": 2480 }, { "epoch": 0.9494833524684271, "grad_norm": 0.5447501540184021, "learning_rate": 1.869005261390877e-05, "loss": 0.6309, "step": 2481 }, { "epoch": 0.9498660543436663, "grad_norm": 0.662951648235321, "learning_rate": 1.868882574785182e-05, "loss": 0.6554, "step": 2482 }, { "epoch": 0.9502487562189055, "grad_norm": 0.5835161805152893, "learning_rate": 1.8687598347843182e-05, "loss": 0.6069, "step": 2483 }, { "epoch": 0.9506314580941446, "grad_norm": 0.5936782360076904, "learning_rate": 1.868637041395829e-05, "loss": 0.7894, "step": 2484 }, { "epoch": 0.9510141599693839, "grad_norm": 0.5266033411026001, "learning_rate": 1.8685141946272602e-05, "loss": 0.6871, "step": 2485 }, { "epoch": 0.951396861844623, "grad_norm": 0.5214227437973022, "learning_rate": 1.8683912944861606e-05, "loss": 0.5833, "step": 2486 }, { "epoch": 0.9517795637198623, "grad_norm": 0.5633607506752014, "learning_rate": 1.8682683409800825e-05, "loss": 0.7055, "step": 2487 }, { "epoch": 0.9521622655951014, "grad_norm": 0.5885217785835266, "learning_rate": 1.8681453341165832e-05, "loss": 0.7683, "step": 2488 }, { "epoch": 0.9525449674703406, "grad_norm": 0.5001779198646545, "learning_rate": 1.8680222739032202e-05, "loss": 0.6215, "step": 2489 }, { "epoch": 0.9529276693455798, "grad_norm": 0.5379462242126465, "learning_rate": 1.8678991603475566e-05, "loss": 0.7192, "step": 2490 }, { "epoch": 0.953310371220819, "grad_norm": 0.5258882641792297, "learning_rate": 1.8677759934571584e-05, "loss": 0.6232, "step": 2491 }, { "epoch": 0.9536930730960582, "grad_norm": 0.5363120436668396, "learning_rate": 1.867652773239594e-05, "loss": 0.6905, "step": 2492 }, { "epoch": 0.9540757749712974, "grad_norm": 0.5430044531822205, "learning_rate": 1.8675294997024353e-05, "loss": 0.6894, "step": 2493 }, { "epoch": 0.9544584768465365, "grad_norm": 0.5400209426879883, "learning_rate": 1.867406172853259e-05, "loss": 0.7633, "step": 2494 }, { "epoch": 0.9548411787217758, "grad_norm": 0.5387817025184631, "learning_rate": 1.8672827926996424e-05, "loss": 0.807, "step": 2495 }, { "epoch": 0.9552238805970149, "grad_norm": 0.6013066172599792, "learning_rate": 1.8671593592491683e-05, "loss": 0.7086, "step": 2496 }, { "epoch": 0.9556065824722542, "grad_norm": 0.5686445236206055, "learning_rate": 1.8670358725094225e-05, "loss": 0.6667, "step": 2497 }, { "epoch": 0.9559892843474933, "grad_norm": 0.5522993206977844, "learning_rate": 1.8669123324879924e-05, "loss": 0.692, "step": 2498 }, { "epoch": 0.9563719862227325, "grad_norm": 0.5302451252937317, "learning_rate": 1.8667887391924712e-05, "loss": 0.726, "step": 2499 }, { "epoch": 0.9567546880979717, "grad_norm": 0.6052833199501038, "learning_rate": 1.8666650926304527e-05, "loss": 0.6268, "step": 2500 }, { "epoch": 0.9571373899732109, "grad_norm": 0.524660050868988, "learning_rate": 1.8665413928095364e-05, "loss": 0.6231, "step": 2501 }, { "epoch": 0.9575200918484501, "grad_norm": 0.5684884190559387, "learning_rate": 1.8664176397373236e-05, "loss": 0.7383, "step": 2502 }, { "epoch": 0.9579027937236892, "grad_norm": 0.6010488867759705, "learning_rate": 1.8662938334214188e-05, "loss": 0.721, "step": 2503 }, { "epoch": 0.9582854955989284, "grad_norm": 0.5208972096443176, "learning_rate": 1.8661699738694308e-05, "loss": 0.7717, "step": 2504 }, { "epoch": 0.9586681974741676, "grad_norm": 0.6475250720977783, "learning_rate": 1.8660460610889712e-05, "loss": 0.7073, "step": 2505 }, { "epoch": 0.9590508993494068, "grad_norm": 0.5364944338798523, "learning_rate": 1.865922095087654e-05, "loss": 0.6398, "step": 2506 }, { "epoch": 0.959433601224646, "grad_norm": 0.5307921171188354, "learning_rate": 1.8657980758730984e-05, "loss": 0.6554, "step": 2507 }, { "epoch": 0.9598163030998852, "grad_norm": 0.5262178778648376, "learning_rate": 1.8656740034529247e-05, "loss": 0.6303, "step": 2508 }, { "epoch": 0.9601990049751243, "grad_norm": 0.5332175493240356, "learning_rate": 1.865549877834758e-05, "loss": 0.6981, "step": 2509 }, { "epoch": 0.9605817068503636, "grad_norm": 0.5391271114349365, "learning_rate": 1.865425699026226e-05, "loss": 0.5748, "step": 2510 }, { "epoch": 0.9609644087256027, "grad_norm": 0.5404515266418457, "learning_rate": 1.8653014670349597e-05, "loss": 0.7228, "step": 2511 }, { "epoch": 0.961347110600842, "grad_norm": 0.5738024115562439, "learning_rate": 1.8651771818685937e-05, "loss": 0.6661, "step": 2512 }, { "epoch": 0.9617298124760811, "grad_norm": 0.5545884370803833, "learning_rate": 1.8650528435347657e-05, "loss": 0.6623, "step": 2513 }, { "epoch": 0.9621125143513203, "grad_norm": 0.6049636006355286, "learning_rate": 1.8649284520411165e-05, "loss": 0.6811, "step": 2514 }, { "epoch": 0.9624952162265595, "grad_norm": 0.5282649993896484, "learning_rate": 1.8648040073952903e-05, "loss": 0.7612, "step": 2515 }, { "epoch": 0.9628779181017987, "grad_norm": 0.5233682990074158, "learning_rate": 1.8646795096049347e-05, "loss": 0.6883, "step": 2516 }, { "epoch": 0.9632606199770379, "grad_norm": 0.5262303948402405, "learning_rate": 1.8645549586777e-05, "loss": 0.6133, "step": 2517 }, { "epoch": 0.9636433218522771, "grad_norm": 0.5247983336448669, "learning_rate": 1.8644303546212407e-05, "loss": 0.6352, "step": 2518 }, { "epoch": 0.9640260237275162, "grad_norm": 0.49849560856819153, "learning_rate": 1.8643056974432134e-05, "loss": 0.6696, "step": 2519 }, { "epoch": 0.9644087256027555, "grad_norm": 0.5126455426216125, "learning_rate": 1.8641809871512792e-05, "loss": 0.6829, "step": 2520 }, { "epoch": 0.9647914274779946, "grad_norm": 0.5522153973579407, "learning_rate": 1.864056223753102e-05, "loss": 0.6912, "step": 2521 }, { "epoch": 0.9651741293532339, "grad_norm": 0.6418414115905762, "learning_rate": 1.8639314072563484e-05, "loss": 0.6467, "step": 2522 }, { "epoch": 0.965556831228473, "grad_norm": 0.569180965423584, "learning_rate": 1.863806537668689e-05, "loss": 0.6388, "step": 2523 }, { "epoch": 0.9659395331037122, "grad_norm": 0.5275533199310303, "learning_rate": 1.863681614997797e-05, "loss": 0.7103, "step": 2524 }, { "epoch": 0.9663222349789514, "grad_norm": 0.5503414273262024, "learning_rate": 1.8635566392513498e-05, "loss": 0.6648, "step": 2525 }, { "epoch": 0.9667049368541906, "grad_norm": 0.5421643257141113, "learning_rate": 1.8634316104370267e-05, "loss": 0.6551, "step": 2526 }, { "epoch": 0.9670876387294298, "grad_norm": 0.5218026638031006, "learning_rate": 1.863306528562512e-05, "loss": 0.5848, "step": 2527 }, { "epoch": 0.967470340604669, "grad_norm": 0.5677403807640076, "learning_rate": 1.863181393635492e-05, "loss": 0.6932, "step": 2528 }, { "epoch": 0.9678530424799081, "grad_norm": 0.48642751574516296, "learning_rate": 1.863056205663656e-05, "loss": 0.66, "step": 2529 }, { "epoch": 0.9682357443551474, "grad_norm": 0.5424838066101074, "learning_rate": 1.8629309646546977e-05, "loss": 0.6267, "step": 2530 }, { "epoch": 0.9686184462303865, "grad_norm": 0.5918376445770264, "learning_rate": 1.862805670616313e-05, "loss": 0.6596, "step": 2531 }, { "epoch": 0.9690011481056258, "grad_norm": 0.5071219801902771, "learning_rate": 1.8626803235562025e-05, "loss": 0.6652, "step": 2532 }, { "epoch": 0.9693838499808649, "grad_norm": 0.5408348441123962, "learning_rate": 1.8625549234820685e-05, "loss": 0.6944, "step": 2533 }, { "epoch": 0.969766551856104, "grad_norm": 0.5570545196533203, "learning_rate": 1.862429470401617e-05, "loss": 0.6314, "step": 2534 }, { "epoch": 0.9701492537313433, "grad_norm": 0.5052222609519958, "learning_rate": 1.862303964322558e-05, "loss": 0.7021, "step": 2535 }, { "epoch": 0.9705319556065825, "grad_norm": 0.5683215260505676, "learning_rate": 1.8621784052526032e-05, "loss": 0.6312, "step": 2536 }, { "epoch": 0.9709146574818217, "grad_norm": 0.553483784198761, "learning_rate": 1.8620527931994695e-05, "loss": 0.7338, "step": 2537 }, { "epoch": 0.9712973593570609, "grad_norm": 0.51225346326828, "learning_rate": 1.8619271281708756e-05, "loss": 0.6807, "step": 2538 }, { "epoch": 0.9716800612323, "grad_norm": 0.4755615293979645, "learning_rate": 1.8618014101745444e-05, "loss": 0.6778, "step": 2539 }, { "epoch": 0.9720627631075393, "grad_norm": 0.5407717227935791, "learning_rate": 1.8616756392182012e-05, "loss": 0.6333, "step": 2540 }, { "epoch": 0.9724454649827784, "grad_norm": 0.5603088736534119, "learning_rate": 1.8615498153095746e-05, "loss": 0.674, "step": 2541 }, { "epoch": 0.9728281668580177, "grad_norm": 0.4737756550312042, "learning_rate": 1.8614239384563982e-05, "loss": 0.6997, "step": 2542 }, { "epoch": 0.9732108687332568, "grad_norm": 0.5328311324119568, "learning_rate": 1.8612980086664062e-05, "loss": 0.6878, "step": 2543 }, { "epoch": 0.9735935706084959, "grad_norm": 0.6374053359031677, "learning_rate": 1.8611720259473376e-05, "loss": 0.7353, "step": 2544 }, { "epoch": 0.9739762724837352, "grad_norm": 0.5566831231117249, "learning_rate": 1.8610459903069346e-05, "loss": 0.6049, "step": 2545 }, { "epoch": 0.9743589743589743, "grad_norm": 0.5484935641288757, "learning_rate": 1.860919901752942e-05, "loss": 0.6574, "step": 2546 }, { "epoch": 0.9747416762342136, "grad_norm": 0.568035364151001, "learning_rate": 1.860793760293109e-05, "loss": 0.7293, "step": 2547 }, { "epoch": 0.9751243781094527, "grad_norm": 0.5488471984863281, "learning_rate": 1.8606675659351863e-05, "loss": 0.6597, "step": 2548 }, { "epoch": 0.9755070799846919, "grad_norm": 0.5796221494674683, "learning_rate": 1.8605413186869304e-05, "loss": 0.7905, "step": 2549 }, { "epoch": 0.9758897818599311, "grad_norm": 0.5450961589813232, "learning_rate": 1.860415018556098e-05, "loss": 0.6676, "step": 2550 }, { "epoch": 0.9762724837351703, "grad_norm": 0.5847272872924805, "learning_rate": 1.8602886655504513e-05, "loss": 0.7701, "step": 2551 }, { "epoch": 0.9766551856104095, "grad_norm": 0.5140928626060486, "learning_rate": 1.860162259677755e-05, "loss": 0.6301, "step": 2552 }, { "epoch": 0.9770378874856487, "grad_norm": 0.5374216437339783, "learning_rate": 1.860035800945777e-05, "loss": 0.6638, "step": 2553 }, { "epoch": 0.9774205893608878, "grad_norm": 0.5364008545875549, "learning_rate": 1.8599092893622884e-05, "loss": 0.7236, "step": 2554 }, { "epoch": 0.9778032912361271, "grad_norm": 0.5860774517059326, "learning_rate": 1.859782724935064e-05, "loss": 0.6608, "step": 2555 }, { "epoch": 0.9781859931113662, "grad_norm": 0.7715074419975281, "learning_rate": 1.8596561076718814e-05, "loss": 0.6841, "step": 2556 }, { "epoch": 0.9785686949866055, "grad_norm": 0.641567587852478, "learning_rate": 1.8595294375805217e-05, "loss": 0.6709, "step": 2557 }, { "epoch": 0.9789513968618446, "grad_norm": 0.5068246126174927, "learning_rate": 1.8594027146687688e-05, "loss": 0.6878, "step": 2558 }, { "epoch": 0.9793340987370838, "grad_norm": 0.5228655338287354, "learning_rate": 1.8592759389444103e-05, "loss": 0.6828, "step": 2559 }, { "epoch": 0.979716800612323, "grad_norm": 0.5837813019752502, "learning_rate": 1.8591491104152366e-05, "loss": 0.6462, "step": 2560 }, { "epoch": 0.9800995024875622, "grad_norm": 0.5268024206161499, "learning_rate": 1.8590222290890424e-05, "loss": 0.6299, "step": 2561 }, { "epoch": 0.9804822043628014, "grad_norm": 0.5706527829170227, "learning_rate": 1.8588952949736244e-05, "loss": 0.6236, "step": 2562 }, { "epoch": 0.9808649062380406, "grad_norm": 0.5196132659912109, "learning_rate": 1.858768308076783e-05, "loss": 0.6598, "step": 2563 }, { "epoch": 0.9812476081132797, "grad_norm": 0.5789106488227844, "learning_rate": 1.8586412684063217e-05, "loss": 0.6299, "step": 2564 }, { "epoch": 0.981630309988519, "grad_norm": 0.5456148386001587, "learning_rate": 1.8585141759700484e-05, "loss": 0.6198, "step": 2565 }, { "epoch": 0.9820130118637581, "grad_norm": 0.5581376552581787, "learning_rate": 1.858387030775772e-05, "loss": 0.6218, "step": 2566 }, { "epoch": 0.9823957137389974, "grad_norm": 0.5003324151039124, "learning_rate": 1.858259832831307e-05, "loss": 0.7023, "step": 2567 }, { "epoch": 0.9827784156142365, "grad_norm": 0.5084156394004822, "learning_rate": 1.858132582144469e-05, "loss": 0.6275, "step": 2568 }, { "epoch": 0.9831611174894757, "grad_norm": 0.5179926156997681, "learning_rate": 1.858005278723079e-05, "loss": 0.629, "step": 2569 }, { "epoch": 0.9835438193647149, "grad_norm": 0.5186851620674133, "learning_rate": 1.8578779225749593e-05, "loss": 0.6987, "step": 2570 }, { "epoch": 0.983926521239954, "grad_norm": 1.0313093662261963, "learning_rate": 1.8577505137079367e-05, "loss": 0.6568, "step": 2571 }, { "epoch": 0.9843092231151933, "grad_norm": 0.5106679797172546, "learning_rate": 1.8576230521298407e-05, "loss": 0.6388, "step": 2572 }, { "epoch": 0.9846919249904325, "grad_norm": 0.5179961919784546, "learning_rate": 1.857495537848504e-05, "loss": 0.6312, "step": 2573 }, { "epoch": 0.9850746268656716, "grad_norm": 0.5299312472343445, "learning_rate": 1.8573679708717628e-05, "loss": 0.6646, "step": 2574 }, { "epoch": 0.9854573287409109, "grad_norm": 0.5318065285682678, "learning_rate": 1.8572403512074568e-05, "loss": 0.6493, "step": 2575 }, { "epoch": 0.98584003061615, "grad_norm": 0.5459107160568237, "learning_rate": 1.857112678863428e-05, "loss": 0.6608, "step": 2576 }, { "epoch": 0.9862227324913893, "grad_norm": 0.5573206543922424, "learning_rate": 1.8569849538475227e-05, "loss": 0.6411, "step": 2577 }, { "epoch": 0.9866054343666284, "grad_norm": 0.7686195373535156, "learning_rate": 1.8568571761675893e-05, "loss": 0.6554, "step": 2578 }, { "epoch": 0.9869881362418675, "grad_norm": 0.5893393754959106, "learning_rate": 1.8567293458314808e-05, "loss": 0.6942, "step": 2579 }, { "epoch": 0.9873708381171068, "grad_norm": 0.5681552290916443, "learning_rate": 1.8566014628470525e-05, "loss": 0.7612, "step": 2580 }, { "epoch": 0.9877535399923459, "grad_norm": 0.5493112206459045, "learning_rate": 1.8564735272221628e-05, "loss": 0.7338, "step": 2581 }, { "epoch": 0.9881362418675852, "grad_norm": 0.5560922622680664, "learning_rate": 1.856345538964674e-05, "loss": 0.6153, "step": 2582 }, { "epoch": 0.9885189437428243, "grad_norm": 0.5046100616455078, "learning_rate": 1.8562174980824514e-05, "loss": 0.732, "step": 2583 }, { "epoch": 0.9889016456180635, "grad_norm": 0.5947822332382202, "learning_rate": 1.856089404583363e-05, "loss": 0.7027, "step": 2584 }, { "epoch": 0.9892843474933027, "grad_norm": 0.5125589370727539, "learning_rate": 1.855961258475281e-05, "loss": 0.6643, "step": 2585 }, { "epoch": 0.9896670493685419, "grad_norm": 0.5097452998161316, "learning_rate": 1.8558330597660803e-05, "loss": 0.6862, "step": 2586 }, { "epoch": 0.9900497512437811, "grad_norm": 0.517119288444519, "learning_rate": 1.8557048084636386e-05, "loss": 0.6795, "step": 2587 }, { "epoch": 0.9904324531190203, "grad_norm": 0.5736740827560425, "learning_rate": 1.8555765045758377e-05, "loss": 0.6656, "step": 2588 }, { "epoch": 0.9908151549942594, "grad_norm": 0.5036564469337463, "learning_rate": 1.855448148110562e-05, "loss": 0.6521, "step": 2589 }, { "epoch": 0.9911978568694987, "grad_norm": 0.5891701579093933, "learning_rate": 1.8553197390757e-05, "loss": 0.6568, "step": 2590 }, { "epoch": 0.9915805587447378, "grad_norm": 0.49302443861961365, "learning_rate": 1.8551912774791415e-05, "loss": 0.7091, "step": 2591 }, { "epoch": 0.9919632606199771, "grad_norm": 0.4985717236995697, "learning_rate": 1.855062763328782e-05, "loss": 0.6609, "step": 2592 }, { "epoch": 0.9923459624952162, "grad_norm": 0.5529389977455139, "learning_rate": 1.8549341966325186e-05, "loss": 0.7252, "step": 2593 }, { "epoch": 0.9927286643704554, "grad_norm": 0.5472580790519714, "learning_rate": 1.8548055773982518e-05, "loss": 0.6413, "step": 2594 }, { "epoch": 0.9931113662456946, "grad_norm": 0.5264586210250854, "learning_rate": 1.8546769056338857e-05, "loss": 0.6781, "step": 2595 }, { "epoch": 0.9934940681209338, "grad_norm": 0.5274324417114258, "learning_rate": 1.854548181347328e-05, "loss": 0.6999, "step": 2596 }, { "epoch": 0.993876769996173, "grad_norm": 0.5339049100875854, "learning_rate": 1.8544194045464888e-05, "loss": 0.747, "step": 2597 }, { "epoch": 0.9942594718714122, "grad_norm": 0.541072428226471, "learning_rate": 1.8542905752392816e-05, "loss": 0.6926, "step": 2598 }, { "epoch": 0.9946421737466513, "grad_norm": 0.5627388954162598, "learning_rate": 1.8541616934336236e-05, "loss": 0.7165, "step": 2599 }, { "epoch": 0.9950248756218906, "grad_norm": 0.5362045168876648, "learning_rate": 1.854032759137435e-05, "loss": 0.6419, "step": 2600 }, { "epoch": 0.9954075774971297, "grad_norm": 0.5299434661865234, "learning_rate": 1.8539037723586387e-05, "loss": 0.6232, "step": 2601 }, { "epoch": 0.995790279372369, "grad_norm": 0.5564910173416138, "learning_rate": 1.8537747331051615e-05, "loss": 0.8027, "step": 2602 }, { "epoch": 0.9961729812476081, "grad_norm": 0.5507721304893494, "learning_rate": 1.8536456413849338e-05, "loss": 0.6563, "step": 2603 }, { "epoch": 0.9965556831228473, "grad_norm": 0.5277629494667053, "learning_rate": 1.8535164972058876e-05, "loss": 0.699, "step": 2604 }, { "epoch": 0.9969383849980865, "grad_norm": 0.5393261909484863, "learning_rate": 1.8533873005759602e-05, "loss": 0.6849, "step": 2605 }, { "epoch": 0.9973210868733257, "grad_norm": 0.5208573937416077, "learning_rate": 1.8532580515030904e-05, "loss": 0.6726, "step": 2606 }, { "epoch": 0.9977037887485649, "grad_norm": 0.5816627740859985, "learning_rate": 1.853128749995221e-05, "loss": 0.6537, "step": 2607 }, { "epoch": 0.9980864906238041, "grad_norm": 0.5266723036766052, "learning_rate": 1.8529993960602977e-05, "loss": 0.7022, "step": 2608 }, { "epoch": 0.9984691924990432, "grad_norm": 0.5424588322639465, "learning_rate": 1.8528699897062703e-05, "loss": 0.6873, "step": 2609 }, { "epoch": 0.9988518943742825, "grad_norm": 0.5431889891624451, "learning_rate": 1.8527405309410905e-05, "loss": 0.7531, "step": 2610 }, { "epoch": 0.9992345962495216, "grad_norm": 0.5416121482849121, "learning_rate": 1.852611019772715e-05, "loss": 0.7489, "step": 2611 }, { "epoch": 0.9996172981247609, "grad_norm": 0.5009639263153076, "learning_rate": 1.852481456209101e-05, "loss": 0.6785, "step": 2612 }, { "epoch": 1.0, "grad_norm": 0.4957413077354431, "learning_rate": 1.852351840258211e-05, "loss": 0.6422, "step": 2613 }, { "epoch": 1.0003827018752391, "grad_norm": 0.49977633357048035, "learning_rate": 1.8522221719280112e-05, "loss": 0.6758, "step": 2614 }, { "epoch": 1.0007654037504783, "grad_norm": 0.5344793796539307, "learning_rate": 1.8520924512264696e-05, "loss": 0.7603, "step": 2615 }, { "epoch": 1.0011481056257177, "grad_norm": 0.4930453598499298, "learning_rate": 1.8519626781615575e-05, "loss": 0.637, "step": 2616 }, { "epoch": 1.0015308075009568, "grad_norm": 0.49273478984832764, "learning_rate": 1.85183285274125e-05, "loss": 0.6176, "step": 2617 }, { "epoch": 1.001913509376196, "grad_norm": 0.5198045969009399, "learning_rate": 1.8517029749735254e-05, "loss": 0.7146, "step": 2618 }, { "epoch": 1.002296211251435, "grad_norm": 0.5382727384567261, "learning_rate": 1.8515730448663647e-05, "loss": 0.6978, "step": 2619 }, { "epoch": 1.0026789131266742, "grad_norm": 0.5674942135810852, "learning_rate": 1.8514430624277525e-05, "loss": 0.7009, "step": 2620 }, { "epoch": 1.0030616150019136, "grad_norm": 0.5543457269668579, "learning_rate": 1.8513130276656772e-05, "loss": 0.6747, "step": 2621 }, { "epoch": 1.0034443168771527, "grad_norm": 0.5982082486152649, "learning_rate": 1.851182940588129e-05, "loss": 0.6591, "step": 2622 }, { "epoch": 1.0038270187523919, "grad_norm": 0.487307608127594, "learning_rate": 1.8510528012031027e-05, "loss": 0.6567, "step": 2623 }, { "epoch": 1.004209720627631, "grad_norm": 0.5274561643600464, "learning_rate": 1.850922609518595e-05, "loss": 0.651, "step": 2624 }, { "epoch": 1.0045924225028702, "grad_norm": 0.5209740996360779, "learning_rate": 1.850792365542607e-05, "loss": 0.6491, "step": 2625 }, { "epoch": 1.0049751243781095, "grad_norm": 0.5558923482894897, "learning_rate": 1.8506620692831427e-05, "loss": 0.7038, "step": 2626 }, { "epoch": 1.0053578262533487, "grad_norm": 0.5676994323730469, "learning_rate": 1.850531720748209e-05, "loss": 0.6398, "step": 2627 }, { "epoch": 1.0057405281285878, "grad_norm": 0.5119205117225647, "learning_rate": 1.850401319945816e-05, "loss": 0.74, "step": 2628 }, { "epoch": 1.006123230003827, "grad_norm": 0.5802038908004761, "learning_rate": 1.8502708668839773e-05, "loss": 0.6592, "step": 2629 }, { "epoch": 1.0065059318790661, "grad_norm": 0.5277078151702881, "learning_rate": 1.8501403615707098e-05, "loss": 0.6976, "step": 2630 }, { "epoch": 1.0068886337543055, "grad_norm": 0.48014625906944275, "learning_rate": 1.850009804014033e-05, "loss": 0.6232, "step": 2631 }, { "epoch": 1.0072713356295446, "grad_norm": 0.7340759038925171, "learning_rate": 1.84987919422197e-05, "loss": 0.693, "step": 2632 }, { "epoch": 1.0076540375047838, "grad_norm": 0.5496671795845032, "learning_rate": 1.8497485322025473e-05, "loss": 0.7159, "step": 2633 }, { "epoch": 1.008036739380023, "grad_norm": 0.5644863247871399, "learning_rate": 1.849617817963795e-05, "loss": 0.6084, "step": 2634 }, { "epoch": 1.008419441255262, "grad_norm": 0.5178579688072205, "learning_rate": 1.849487051513745e-05, "loss": 0.5975, "step": 2635 }, { "epoch": 1.0088021431305014, "grad_norm": 0.5177514553070068, "learning_rate": 1.8493562328604333e-05, "loss": 0.642, "step": 2636 }, { "epoch": 1.0091848450057406, "grad_norm": 0.5080211758613586, "learning_rate": 1.8492253620118994e-05, "loss": 0.7299, "step": 2637 }, { "epoch": 1.0095675468809797, "grad_norm": 0.5315657258033752, "learning_rate": 1.8490944389761858e-05, "loss": 0.6692, "step": 2638 }, { "epoch": 1.0099502487562189, "grad_norm": 0.5168536901473999, "learning_rate": 1.8489634637613375e-05, "loss": 0.6624, "step": 2639 }, { "epoch": 1.010332950631458, "grad_norm": 0.49643638730049133, "learning_rate": 1.848832436375404e-05, "loss": 0.7425, "step": 2640 }, { "epoch": 1.0107156525066974, "grad_norm": 0.573516845703125, "learning_rate": 1.848701356826437e-05, "loss": 0.7575, "step": 2641 }, { "epoch": 1.0110983543819365, "grad_norm": 0.4943254888057709, "learning_rate": 1.848570225122491e-05, "loss": 0.6885, "step": 2642 }, { "epoch": 1.0114810562571757, "grad_norm": 0.5715441107749939, "learning_rate": 1.8484390412716253e-05, "loss": 0.6224, "step": 2643 }, { "epoch": 1.0118637581324148, "grad_norm": 0.48207616806030273, "learning_rate": 1.8483078052819012e-05, "loss": 0.5672, "step": 2644 }, { "epoch": 1.012246460007654, "grad_norm": 0.5200220346450806, "learning_rate": 1.8481765171613836e-05, "loss": 0.769, "step": 2645 }, { "epoch": 1.0126291618828933, "grad_norm": 0.5209426879882812, "learning_rate": 1.8480451769181403e-05, "loss": 0.622, "step": 2646 }, { "epoch": 1.0130118637581325, "grad_norm": 0.5386373996734619, "learning_rate": 1.8479137845602426e-05, "loss": 0.628, "step": 2647 }, { "epoch": 1.0133945656333716, "grad_norm": 0.5281029343605042, "learning_rate": 1.847782340095765e-05, "loss": 0.7311, "step": 2648 }, { "epoch": 1.0137772675086107, "grad_norm": 0.47776171565055847, "learning_rate": 1.847650843532785e-05, "loss": 0.5894, "step": 2649 }, { "epoch": 1.01415996938385, "grad_norm": 0.5460483431816101, "learning_rate": 1.8475192948793832e-05, "loss": 0.6642, "step": 2650 }, { "epoch": 1.0145426712590893, "grad_norm": 0.49827510118484497, "learning_rate": 1.8473876941436443e-05, "loss": 0.6449, "step": 2651 }, { "epoch": 1.0149253731343284, "grad_norm": 0.5183588862419128, "learning_rate": 1.847256041333655e-05, "loss": 0.6926, "step": 2652 }, { "epoch": 1.0153080750095675, "grad_norm": 0.5372808575630188, "learning_rate": 1.8471243364575057e-05, "loss": 0.7028, "step": 2653 }, { "epoch": 1.0156907768848067, "grad_norm": 0.534022331237793, "learning_rate": 1.84699257952329e-05, "loss": 0.6673, "step": 2654 }, { "epoch": 1.0160734787600458, "grad_norm": 0.5136264562606812, "learning_rate": 1.846860770539105e-05, "loss": 0.6334, "step": 2655 }, { "epoch": 1.0164561806352852, "grad_norm": 0.500325083732605, "learning_rate": 1.846728909513051e-05, "loss": 0.6759, "step": 2656 }, { "epoch": 1.0168388825105243, "grad_norm": 0.571131706237793, "learning_rate": 1.84659699645323e-05, "loss": 0.7327, "step": 2657 }, { "epoch": 1.0172215843857635, "grad_norm": 0.5893007516860962, "learning_rate": 1.8464650313677496e-05, "loss": 0.6335, "step": 2658 }, { "epoch": 1.0176042862610026, "grad_norm": 0.5298492908477783, "learning_rate": 1.846333014264719e-05, "loss": 0.6486, "step": 2659 }, { "epoch": 1.0179869881362418, "grad_norm": 0.46710795164108276, "learning_rate": 1.8462009451522512e-05, "loss": 0.6508, "step": 2660 }, { "epoch": 1.0183696900114811, "grad_norm": 0.4992779493331909, "learning_rate": 1.8460688240384618e-05, "loss": 0.6778, "step": 2661 }, { "epoch": 1.0187523918867203, "grad_norm": 0.5138274431228638, "learning_rate": 1.8459366509314703e-05, "loss": 0.6989, "step": 2662 }, { "epoch": 1.0191350937619594, "grad_norm": 0.5169967412948608, "learning_rate": 1.845804425839399e-05, "loss": 0.6625, "step": 2663 }, { "epoch": 1.0195177956371986, "grad_norm": 0.532369077205658, "learning_rate": 1.845672148770373e-05, "loss": 0.6944, "step": 2664 }, { "epoch": 1.0199004975124377, "grad_norm": 0.49084025621414185, "learning_rate": 1.845539819732522e-05, "loss": 0.5758, "step": 2665 }, { "epoch": 1.020283199387677, "grad_norm": 0.5340083837509155, "learning_rate": 1.8454074387339777e-05, "loss": 0.7318, "step": 2666 }, { "epoch": 1.0206659012629162, "grad_norm": 0.5609795451164246, "learning_rate": 1.845275005782875e-05, "loss": 0.7118, "step": 2667 }, { "epoch": 1.0210486031381554, "grad_norm": 0.5840969681739807, "learning_rate": 1.845142520887352e-05, "loss": 0.7179, "step": 2668 }, { "epoch": 1.0214313050133945, "grad_norm": 0.562938392162323, "learning_rate": 1.8450099840555512e-05, "loss": 0.7219, "step": 2669 }, { "epoch": 1.0218140068886337, "grad_norm": 0.5015725493431091, "learning_rate": 1.8448773952956164e-05, "loss": 0.7227, "step": 2670 }, { "epoch": 1.022196708763873, "grad_norm": 0.5272977352142334, "learning_rate": 1.844744754615696e-05, "loss": 0.7096, "step": 2671 }, { "epoch": 1.0225794106391122, "grad_norm": 0.5029600262641907, "learning_rate": 1.844612062023941e-05, "loss": 0.7048, "step": 2672 }, { "epoch": 1.0229621125143513, "grad_norm": 0.5552951693534851, "learning_rate": 1.8444793175285057e-05, "loss": 0.6412, "step": 2673 }, { "epoch": 1.0233448143895905, "grad_norm": 0.5180416703224182, "learning_rate": 1.8443465211375474e-05, "loss": 0.6712, "step": 2674 }, { "epoch": 1.0237275162648296, "grad_norm": 0.5770550966262817, "learning_rate": 1.8442136728592275e-05, "loss": 0.6945, "step": 2675 }, { "epoch": 1.024110218140069, "grad_norm": 0.5332086682319641, "learning_rate": 1.8440807727017093e-05, "loss": 0.6209, "step": 2676 }, { "epoch": 1.0244929200153081, "grad_norm": 0.5130305886268616, "learning_rate": 1.84394782067316e-05, "loss": 0.5841, "step": 2677 }, { "epoch": 1.0248756218905473, "grad_norm": 0.5670604109764099, "learning_rate": 1.84381481678175e-05, "loss": 0.626, "step": 2678 }, { "epoch": 1.0252583237657864, "grad_norm": 0.5771763920783997, "learning_rate": 1.843681761035652e-05, "loss": 0.7085, "step": 2679 }, { "epoch": 1.0256410256410255, "grad_norm": 0.5074794888496399, "learning_rate": 1.843548653443044e-05, "loss": 0.635, "step": 2680 }, { "epoch": 1.026023727516265, "grad_norm": 0.5584529638290405, "learning_rate": 1.843415494012105e-05, "loss": 0.6323, "step": 2681 }, { "epoch": 1.026406429391504, "grad_norm": 0.5340223908424377, "learning_rate": 1.8432822827510177e-05, "loss": 0.7118, "step": 2682 }, { "epoch": 1.0267891312667432, "grad_norm": 0.4946165084838867, "learning_rate": 1.8431490196679686e-05, "loss": 0.6668, "step": 2683 }, { "epoch": 1.0271718331419823, "grad_norm": 0.5204297304153442, "learning_rate": 1.8430157047711473e-05, "loss": 0.6796, "step": 2684 }, { "epoch": 1.0275545350172215, "grad_norm": 0.676088273525238, "learning_rate": 1.8428823380687464e-05, "loss": 0.6565, "step": 2685 }, { "epoch": 1.0279372368924609, "grad_norm": 0.5387211441993713, "learning_rate": 1.842748919568961e-05, "loss": 0.7123, "step": 2686 }, { "epoch": 1.0283199387677, "grad_norm": 0.6237301826477051, "learning_rate": 1.842615449279991e-05, "loss": 0.6944, "step": 2687 }, { "epoch": 1.0287026406429391, "grad_norm": 0.5682514309883118, "learning_rate": 1.8424819272100375e-05, "loss": 0.7211, "step": 2688 }, { "epoch": 1.0290853425181783, "grad_norm": 0.5081998109817505, "learning_rate": 1.8423483533673065e-05, "loss": 0.6611, "step": 2689 }, { "epoch": 1.0294680443934174, "grad_norm": 0.5591248869895935, "learning_rate": 1.842214727760006e-05, "loss": 0.719, "step": 2690 }, { "epoch": 1.0298507462686568, "grad_norm": 0.5304175615310669, "learning_rate": 1.842081050396348e-05, "loss": 0.6313, "step": 2691 }, { "epoch": 1.030233448143896, "grad_norm": 0.5669243335723877, "learning_rate": 1.8419473212845473e-05, "loss": 0.6551, "step": 2692 }, { "epoch": 1.030616150019135, "grad_norm": 0.5699199438095093, "learning_rate": 1.8418135404328218e-05, "loss": 0.6288, "step": 2693 }, { "epoch": 1.0309988518943742, "grad_norm": 0.5850335359573364, "learning_rate": 1.8416797078493928e-05, "loss": 0.6325, "step": 2694 }, { "epoch": 1.0313815537696134, "grad_norm": 0.5345771312713623, "learning_rate": 1.8415458235424844e-05, "loss": 0.7749, "step": 2695 }, { "epoch": 1.0317642556448527, "grad_norm": 0.47930553555488586, "learning_rate": 1.8414118875203243e-05, "loss": 0.5982, "step": 2696 }, { "epoch": 1.032146957520092, "grad_norm": 0.5423949956893921, "learning_rate": 1.8412778997911434e-05, "loss": 0.7154, "step": 2697 }, { "epoch": 1.032529659395331, "grad_norm": 0.5567170977592468, "learning_rate": 1.8411438603631754e-05, "loss": 0.6843, "step": 2698 }, { "epoch": 1.0329123612705702, "grad_norm": 0.5544547438621521, "learning_rate": 1.8410097692446575e-05, "loss": 0.6824, "step": 2699 }, { "epoch": 1.0332950631458093, "grad_norm": 0.5172803401947021, "learning_rate": 1.8408756264438298e-05, "loss": 0.6896, "step": 2700 }, { "epoch": 1.0336777650210487, "grad_norm": 0.5529794692993164, "learning_rate": 1.840741431968936e-05, "loss": 0.7071, "step": 2701 }, { "epoch": 1.0340604668962878, "grad_norm": 0.4841984212398529, "learning_rate": 1.8406071858282226e-05, "loss": 0.6505, "step": 2702 }, { "epoch": 1.034443168771527, "grad_norm": 0.5417487621307373, "learning_rate": 1.840472888029939e-05, "loss": 0.6904, "step": 2703 }, { "epoch": 1.0348258706467661, "grad_norm": 0.5414745211601257, "learning_rate": 1.8403385385823392e-05, "loss": 0.6178, "step": 2704 }, { "epoch": 1.0352085725220053, "grad_norm": 0.5374239683151245, "learning_rate": 1.8402041374936783e-05, "loss": 0.651, "step": 2705 }, { "epoch": 1.0355912743972446, "grad_norm": 0.5005358457565308, "learning_rate": 1.8400696847722158e-05, "loss": 0.6526, "step": 2706 }, { "epoch": 1.0359739762724838, "grad_norm": 0.5214084982872009, "learning_rate": 1.8399351804262142e-05, "loss": 0.6641, "step": 2707 }, { "epoch": 1.036356678147723, "grad_norm": 0.5328978896141052, "learning_rate": 1.8398006244639397e-05, "loss": 0.7543, "step": 2708 }, { "epoch": 1.036739380022962, "grad_norm": 0.5273274779319763, "learning_rate": 1.8396660168936605e-05, "loss": 0.6586, "step": 2709 }, { "epoch": 1.0371220818982012, "grad_norm": 0.5909596681594849, "learning_rate": 1.8395313577236486e-05, "loss": 0.6774, "step": 2710 }, { "epoch": 1.0375047837734406, "grad_norm": 0.5393558144569397, "learning_rate": 1.83939664696218e-05, "loss": 0.5936, "step": 2711 }, { "epoch": 1.0378874856486797, "grad_norm": 0.531682014465332, "learning_rate": 1.839261884617532e-05, "loss": 0.7378, "step": 2712 }, { "epoch": 1.0382701875239189, "grad_norm": 0.5085123181343079, "learning_rate": 1.8391270706979864e-05, "loss": 0.6745, "step": 2713 }, { "epoch": 1.038652889399158, "grad_norm": 0.5727180242538452, "learning_rate": 1.8389922052118278e-05, "loss": 0.6406, "step": 2714 }, { "epoch": 1.0390355912743972, "grad_norm": 0.5163426995277405, "learning_rate": 1.8388572881673446e-05, "loss": 0.6716, "step": 2715 }, { "epoch": 1.0394182931496365, "grad_norm": 0.5381858944892883, "learning_rate": 1.8387223195728276e-05, "loss": 0.6258, "step": 2716 }, { "epoch": 1.0398009950248757, "grad_norm": 0.5261530876159668, "learning_rate": 1.8385872994365705e-05, "loss": 0.6653, "step": 2717 }, { "epoch": 1.0401836969001148, "grad_norm": 0.4890611171722412, "learning_rate": 1.8384522277668713e-05, "loss": 0.5189, "step": 2718 }, { "epoch": 1.040566398775354, "grad_norm": 0.5032864212989807, "learning_rate": 1.8383171045720302e-05, "loss": 0.7046, "step": 2719 }, { "epoch": 1.040949100650593, "grad_norm": 0.5455443859100342, "learning_rate": 1.8381819298603508e-05, "loss": 0.7298, "step": 2720 }, { "epoch": 1.0413318025258325, "grad_norm": 0.528840959072113, "learning_rate": 1.8380467036401398e-05, "loss": 0.6997, "step": 2721 }, { "epoch": 1.0417145044010716, "grad_norm": 0.5727002024650574, "learning_rate": 1.8379114259197076e-05, "loss": 0.6869, "step": 2722 }, { "epoch": 1.0420972062763108, "grad_norm": 0.5402343273162842, "learning_rate": 1.8377760967073673e-05, "loss": 0.6545, "step": 2723 }, { "epoch": 1.04247990815155, "grad_norm": 0.5287296772003174, "learning_rate": 1.8376407160114355e-05, "loss": 0.7099, "step": 2724 }, { "epoch": 1.042862610026789, "grad_norm": 0.5384796261787415, "learning_rate": 1.8375052838402307e-05, "loss": 0.6105, "step": 2725 }, { "epoch": 1.0432453119020284, "grad_norm": 0.5456178784370422, "learning_rate": 1.8373698002020768e-05, "loss": 0.7273, "step": 2726 }, { "epoch": 1.0436280137772675, "grad_norm": 0.5408119559288025, "learning_rate": 1.8372342651052988e-05, "loss": 0.7169, "step": 2727 }, { "epoch": 1.0440107156525067, "grad_norm": 0.5340107679367065, "learning_rate": 1.8370986785582265e-05, "loss": 0.6644, "step": 2728 }, { "epoch": 1.0443934175277458, "grad_norm": 0.49418091773986816, "learning_rate": 1.836963040569191e-05, "loss": 0.5827, "step": 2729 }, { "epoch": 1.044776119402985, "grad_norm": 0.5778838992118835, "learning_rate": 1.836827351146528e-05, "loss": 0.619, "step": 2730 }, { "epoch": 1.0451588212782243, "grad_norm": 0.5300496220588684, "learning_rate": 1.8366916102985768e-05, "loss": 0.6687, "step": 2731 }, { "epoch": 1.0455415231534635, "grad_norm": 0.5910166501998901, "learning_rate": 1.836555818033678e-05, "loss": 0.653, "step": 2732 }, { "epoch": 1.0459242250287026, "grad_norm": 0.5385996103286743, "learning_rate": 1.8364199743601768e-05, "loss": 0.6792, "step": 2733 }, { "epoch": 1.0463069269039418, "grad_norm": 0.5411009788513184, "learning_rate": 1.836284079286421e-05, "loss": 0.7115, "step": 2734 }, { "epoch": 1.046689628779181, "grad_norm": 0.554228663444519, "learning_rate": 1.8361481328207623e-05, "loss": 0.5883, "step": 2735 }, { "epoch": 1.0470723306544203, "grad_norm": 0.5388482213020325, "learning_rate": 1.836012134971554e-05, "loss": 0.6358, "step": 2736 }, { "epoch": 1.0474550325296594, "grad_norm": 0.550409197807312, "learning_rate": 1.8358760857471546e-05, "loss": 0.6407, "step": 2737 }, { "epoch": 1.0478377344048986, "grad_norm": 0.5540940165519714, "learning_rate": 1.835739985155924e-05, "loss": 0.6584, "step": 2738 }, { "epoch": 1.0482204362801377, "grad_norm": 0.517909049987793, "learning_rate": 1.8356038332062258e-05, "loss": 0.6296, "step": 2739 }, { "epoch": 1.0486031381553769, "grad_norm": 0.5601116418838501, "learning_rate": 1.8354676299064274e-05, "loss": 0.656, "step": 2740 }, { "epoch": 1.0489858400306162, "grad_norm": 0.5583581924438477, "learning_rate": 1.8353313752648986e-05, "loss": 0.6319, "step": 2741 }, { "epoch": 1.0493685419058554, "grad_norm": 0.5612464547157288, "learning_rate": 1.8351950692900127e-05, "loss": 0.6578, "step": 2742 }, { "epoch": 1.0497512437810945, "grad_norm": 0.5641632080078125, "learning_rate": 1.8350587119901462e-05, "loss": 0.6898, "step": 2743 }, { "epoch": 1.0501339456563337, "grad_norm": 0.5067662596702576, "learning_rate": 1.8349223033736784e-05, "loss": 0.7276, "step": 2744 }, { "epoch": 1.0505166475315728, "grad_norm": 0.5306946039199829, "learning_rate": 1.834785843448992e-05, "loss": 0.7542, "step": 2745 }, { "epoch": 1.0508993494068122, "grad_norm": 0.5454068779945374, "learning_rate": 1.834649332224473e-05, "loss": 0.7172, "step": 2746 }, { "epoch": 1.0512820512820513, "grad_norm": 0.495207816362381, "learning_rate": 1.8345127697085102e-05, "loss": 0.6536, "step": 2747 }, { "epoch": 1.0516647531572905, "grad_norm": 0.5269585847854614, "learning_rate": 1.8343761559094958e-05, "loss": 0.6527, "step": 2748 }, { "epoch": 1.0520474550325296, "grad_norm": 0.5033565759658813, "learning_rate": 1.8342394908358247e-05, "loss": 0.687, "step": 2749 }, { "epoch": 1.0524301569077688, "grad_norm": 0.5388094186782837, "learning_rate": 1.834102774495896e-05, "loss": 0.6296, "step": 2750 }, { "epoch": 1.0528128587830081, "grad_norm": 0.5434271693229675, "learning_rate": 1.8339660068981114e-05, "loss": 0.7085, "step": 2751 }, { "epoch": 1.0531955606582473, "grad_norm": 0.569441556930542, "learning_rate": 1.833829188050875e-05, "loss": 0.6846, "step": 2752 }, { "epoch": 1.0535782625334864, "grad_norm": 0.5297559499740601, "learning_rate": 1.8336923179625944e-05, "loss": 0.6918, "step": 2753 }, { "epoch": 1.0539609644087256, "grad_norm": 0.5386055111885071, "learning_rate": 1.8335553966416816e-05, "loss": 0.6771, "step": 2754 }, { "epoch": 1.0543436662839647, "grad_norm": 0.6124333143234253, "learning_rate": 1.8334184240965506e-05, "loss": 0.6359, "step": 2755 }, { "epoch": 1.054726368159204, "grad_norm": 0.6565502285957336, "learning_rate": 1.8332814003356182e-05, "loss": 0.7581, "step": 2756 }, { "epoch": 1.0551090700344432, "grad_norm": 0.5440715551376343, "learning_rate": 1.8331443253673045e-05, "loss": 0.6509, "step": 2757 }, { "epoch": 1.0554917719096824, "grad_norm": 0.5452346801757812, "learning_rate": 1.8330071992000346e-05, "loss": 0.7229, "step": 2758 }, { "epoch": 1.0558744737849215, "grad_norm": 0.5558221936225891, "learning_rate": 1.8328700218422343e-05, "loss": 0.6566, "step": 2759 }, { "epoch": 1.0562571756601606, "grad_norm": 0.5490821599960327, "learning_rate": 1.832732793302334e-05, "loss": 0.7167, "step": 2760 }, { "epoch": 1.0566398775354, "grad_norm": 0.5130245089530945, "learning_rate": 1.8325955135887657e-05, "loss": 0.6244, "step": 2761 }, { "epoch": 1.0570225794106392, "grad_norm": 0.5339406132698059, "learning_rate": 1.8324581827099665e-05, "loss": 0.6433, "step": 2762 }, { "epoch": 1.0574052812858783, "grad_norm": 0.572760820388794, "learning_rate": 1.8323208006743757e-05, "loss": 0.6122, "step": 2763 }, { "epoch": 1.0577879831611174, "grad_norm": 0.5670487880706787, "learning_rate": 1.8321833674904358e-05, "loss": 0.739, "step": 2764 }, { "epoch": 1.0581706850363566, "grad_norm": 0.5250357389450073, "learning_rate": 1.832045883166592e-05, "loss": 0.7241, "step": 2765 }, { "epoch": 1.058553386911596, "grad_norm": 0.5670422911643982, "learning_rate": 1.8319083477112936e-05, "loss": 0.6995, "step": 2766 }, { "epoch": 1.058936088786835, "grad_norm": 0.6100444793701172, "learning_rate": 1.8317707611329924e-05, "loss": 0.6339, "step": 2767 }, { "epoch": 1.0593187906620742, "grad_norm": 0.5121865272521973, "learning_rate": 1.8316331234401434e-05, "loss": 0.6805, "step": 2768 }, { "epoch": 1.0597014925373134, "grad_norm": 0.5550234317779541, "learning_rate": 1.8314954346412048e-05, "loss": 0.6648, "step": 2769 }, { "epoch": 1.0600841944125525, "grad_norm": 0.5749727487564087, "learning_rate": 1.8313576947446377e-05, "loss": 0.6291, "step": 2770 }, { "epoch": 1.060466896287792, "grad_norm": 0.6253642439842224, "learning_rate": 1.831219903758907e-05, "loss": 0.731, "step": 2771 }, { "epoch": 1.060849598163031, "grad_norm": 0.5998815894126892, "learning_rate": 1.83108206169248e-05, "loss": 0.6817, "step": 2772 }, { "epoch": 1.0612323000382702, "grad_norm": 0.5094248652458191, "learning_rate": 1.830944168553828e-05, "loss": 0.6595, "step": 2773 }, { "epoch": 1.0616150019135093, "grad_norm": 0.5523145198822021, "learning_rate": 1.8308062243514242e-05, "loss": 0.6954, "step": 2774 }, { "epoch": 1.0619977037887485, "grad_norm": 0.5292519927024841, "learning_rate": 1.830668229093746e-05, "loss": 0.6275, "step": 2775 }, { "epoch": 1.0623804056639878, "grad_norm": 0.5452808737754822, "learning_rate": 1.8305301827892735e-05, "loss": 0.644, "step": 2776 }, { "epoch": 1.062763107539227, "grad_norm": 0.5645712018013, "learning_rate": 1.83039208544649e-05, "loss": 0.5593, "step": 2777 }, { "epoch": 1.0631458094144661, "grad_norm": 0.4845796227455139, "learning_rate": 1.830253937073882e-05, "loss": 0.5633, "step": 2778 }, { "epoch": 1.0635285112897053, "grad_norm": 0.5394136309623718, "learning_rate": 1.8301157376799394e-05, "loss": 0.6249, "step": 2779 }, { "epoch": 1.0639112131649444, "grad_norm": 0.5624867677688599, "learning_rate": 1.8299774872731543e-05, "loss": 0.702, "step": 2780 }, { "epoch": 1.0642939150401838, "grad_norm": 0.572916567325592, "learning_rate": 1.8298391858620232e-05, "loss": 0.6696, "step": 2781 }, { "epoch": 1.064676616915423, "grad_norm": 0.5420809984207153, "learning_rate": 1.8297008334550445e-05, "loss": 0.6521, "step": 2782 }, { "epoch": 1.065059318790662, "grad_norm": 0.5743457078933716, "learning_rate": 1.8295624300607206e-05, "loss": 0.6933, "step": 2783 }, { "epoch": 1.0654420206659012, "grad_norm": 0.5378336906433105, "learning_rate": 1.829423975687557e-05, "loss": 0.7196, "step": 2784 }, { "epoch": 1.0658247225411404, "grad_norm": 0.5372422933578491, "learning_rate": 1.829285470344062e-05, "loss": 0.6516, "step": 2785 }, { "epoch": 1.0662074244163797, "grad_norm": 0.5394554734230042, "learning_rate": 1.8291469140387462e-05, "loss": 0.6381, "step": 2786 }, { "epoch": 1.0665901262916189, "grad_norm": 0.5065298676490784, "learning_rate": 1.829008306780126e-05, "loss": 0.6659, "step": 2787 }, { "epoch": 1.066972828166858, "grad_norm": 0.555059015750885, "learning_rate": 1.8288696485767174e-05, "loss": 0.6878, "step": 2788 }, { "epoch": 1.0673555300420972, "grad_norm": 0.4859851896762848, "learning_rate": 1.8287309394370427e-05, "loss": 0.6528, "step": 2789 }, { "epoch": 1.0677382319173363, "grad_norm": 0.531288743019104, "learning_rate": 1.828592179369625e-05, "loss": 0.5802, "step": 2790 }, { "epoch": 1.0681209337925757, "grad_norm": 0.5355536937713623, "learning_rate": 1.8284533683829923e-05, "loss": 0.7691, "step": 2791 }, { "epoch": 1.0685036356678148, "grad_norm": 0.5305525660514832, "learning_rate": 1.828314506485674e-05, "loss": 0.601, "step": 2792 }, { "epoch": 1.068886337543054, "grad_norm": 0.5282543301582336, "learning_rate": 1.8281755936862043e-05, "loss": 0.6236, "step": 2793 }, { "epoch": 1.069269039418293, "grad_norm": 0.5421457290649414, "learning_rate": 1.8280366299931194e-05, "loss": 0.7384, "step": 2794 }, { "epoch": 1.0696517412935322, "grad_norm": 0.5275017619132996, "learning_rate": 1.827897615414959e-05, "loss": 0.6079, "step": 2795 }, { "epoch": 1.0700344431687716, "grad_norm": 0.512546718120575, "learning_rate": 1.8277585499602663e-05, "loss": 0.6386, "step": 2796 }, { "epoch": 1.0704171450440108, "grad_norm": 0.6709668636322021, "learning_rate": 1.827619433637587e-05, "loss": 0.6574, "step": 2797 }, { "epoch": 1.07079984691925, "grad_norm": 0.5278890132904053, "learning_rate": 1.8274802664554697e-05, "loss": 0.7021, "step": 2798 }, { "epoch": 1.071182548794489, "grad_norm": 0.5446144938468933, "learning_rate": 1.827341048422467e-05, "loss": 0.7343, "step": 2799 }, { "epoch": 1.0715652506697282, "grad_norm": 0.5151923894882202, "learning_rate": 1.8272017795471345e-05, "loss": 0.6706, "step": 2800 }, { "epoch": 1.0719479525449676, "grad_norm": 0.4956144094467163, "learning_rate": 1.8270624598380303e-05, "loss": 0.6225, "step": 2801 }, { "epoch": 1.0723306544202067, "grad_norm": 0.5541746616363525, "learning_rate": 1.8269230893037163e-05, "loss": 0.6749, "step": 2802 }, { "epoch": 1.0727133562954458, "grad_norm": 0.5250786542892456, "learning_rate": 1.8267836679527567e-05, "loss": 0.6338, "step": 2803 }, { "epoch": 1.073096058170685, "grad_norm": 0.5858698487281799, "learning_rate": 1.8266441957937194e-05, "loss": 0.6772, "step": 2804 }, { "epoch": 1.0734787600459241, "grad_norm": 0.5650739669799805, "learning_rate": 1.8265046728351757e-05, "loss": 0.6224, "step": 2805 }, { "epoch": 1.0738614619211635, "grad_norm": 0.5209460258483887, "learning_rate": 1.8263650990856993e-05, "loss": 0.5992, "step": 2806 }, { "epoch": 1.0742441637964026, "grad_norm": 0.56777024269104, "learning_rate": 1.826225474553868e-05, "loss": 0.6785, "step": 2807 }, { "epoch": 1.0746268656716418, "grad_norm": 0.5504165291786194, "learning_rate": 1.826085799248261e-05, "loss": 0.658, "step": 2808 }, { "epoch": 1.075009567546881, "grad_norm": 0.5586565732955933, "learning_rate": 1.8259460731774624e-05, "loss": 0.7287, "step": 2809 }, { "epoch": 1.07539226942212, "grad_norm": 0.5551797747612, "learning_rate": 1.825806296350059e-05, "loss": 0.6127, "step": 2810 }, { "epoch": 1.0757749712973594, "grad_norm": 0.5316354036331177, "learning_rate": 1.82566646877464e-05, "loss": 0.6743, "step": 2811 }, { "epoch": 1.0761576731725986, "grad_norm": 0.5998116731643677, "learning_rate": 1.8255265904597986e-05, "loss": 0.6425, "step": 2812 }, { "epoch": 1.0765403750478377, "grad_norm": 0.5211967825889587, "learning_rate": 1.82538666141413e-05, "loss": 0.6194, "step": 2813 }, { "epoch": 1.0769230769230769, "grad_norm": 0.5869531631469727, "learning_rate": 1.8252466816462345e-05, "loss": 0.6998, "step": 2814 }, { "epoch": 1.077305778798316, "grad_norm": 0.5226860046386719, "learning_rate": 1.8251066511647127e-05, "loss": 0.6588, "step": 2815 }, { "epoch": 1.0776884806735554, "grad_norm": 0.6914331912994385, "learning_rate": 1.8249665699781707e-05, "loss": 0.7165, "step": 2816 }, { "epoch": 1.0780711825487945, "grad_norm": 0.5783628225326538, "learning_rate": 1.8248264380952168e-05, "loss": 0.6969, "step": 2817 }, { "epoch": 1.0784538844240337, "grad_norm": 0.5205628275871277, "learning_rate": 1.8246862555244624e-05, "loss": 0.6791, "step": 2818 }, { "epoch": 1.0788365862992728, "grad_norm": 0.5232817530632019, "learning_rate": 1.824546022274522e-05, "loss": 0.8025, "step": 2819 }, { "epoch": 1.079219288174512, "grad_norm": 0.5422163605690002, "learning_rate": 1.8244057383540133e-05, "loss": 0.6277, "step": 2820 }, { "epoch": 1.0796019900497513, "grad_norm": 0.5507021546363831, "learning_rate": 1.8242654037715573e-05, "loss": 0.6227, "step": 2821 }, { "epoch": 1.0799846919249905, "grad_norm": 0.5179495215415955, "learning_rate": 1.824125018535778e-05, "loss": 0.6692, "step": 2822 }, { "epoch": 1.0803673938002296, "grad_norm": 0.5737230777740479, "learning_rate": 1.8239845826553023e-05, "loss": 0.6916, "step": 2823 }, { "epoch": 1.0807500956754688, "grad_norm": 0.5430436134338379, "learning_rate": 1.82384409613876e-05, "loss": 0.6801, "step": 2824 }, { "epoch": 1.081132797550708, "grad_norm": 0.5496267676353455, "learning_rate": 1.823703558994785e-05, "loss": 0.6539, "step": 2825 }, { "epoch": 1.0815154994259473, "grad_norm": 0.5222635865211487, "learning_rate": 1.8235629712320134e-05, "loss": 0.6366, "step": 2826 }, { "epoch": 1.0818982013011864, "grad_norm": 0.5462832450866699, "learning_rate": 1.823422332859085e-05, "loss": 0.6574, "step": 2827 }, { "epoch": 1.0822809031764256, "grad_norm": 0.5467646718025208, "learning_rate": 1.8232816438846418e-05, "loss": 0.6215, "step": 2828 }, { "epoch": 1.0826636050516647, "grad_norm": 0.5497494339942932, "learning_rate": 1.82314090431733e-05, "loss": 0.6715, "step": 2829 }, { "epoch": 1.0830463069269038, "grad_norm": 0.5553709864616394, "learning_rate": 1.8230001141657983e-05, "loss": 0.671, "step": 2830 }, { "epoch": 1.0834290088021432, "grad_norm": 0.5769476294517517, "learning_rate": 1.8228592734386983e-05, "loss": 0.6842, "step": 2831 }, { "epoch": 1.0838117106773824, "grad_norm": 0.5381067991256714, "learning_rate": 1.8227183821446854e-05, "loss": 0.6282, "step": 2832 }, { "epoch": 1.0841944125526215, "grad_norm": 0.5403822064399719, "learning_rate": 1.822577440292418e-05, "loss": 0.6009, "step": 2833 }, { "epoch": 1.0845771144278606, "grad_norm": 0.5485720634460449, "learning_rate": 1.822436447890557e-05, "loss": 0.6781, "step": 2834 }, { "epoch": 1.0849598163030998, "grad_norm": 0.5762686133384705, "learning_rate": 1.8222954049477667e-05, "loss": 0.6723, "step": 2835 }, { "epoch": 1.0853425181783392, "grad_norm": 0.5639861822128296, "learning_rate": 1.8221543114727146e-05, "loss": 0.7231, "step": 2836 }, { "epoch": 1.0857252200535783, "grad_norm": 0.5491145253181458, "learning_rate": 1.8220131674740713e-05, "loss": 0.6493, "step": 2837 }, { "epoch": 1.0861079219288174, "grad_norm": 0.5699709057807922, "learning_rate": 1.8218719729605108e-05, "loss": 0.7045, "step": 2838 }, { "epoch": 1.0864906238040566, "grad_norm": 0.5678272843360901, "learning_rate": 1.8217307279407094e-05, "loss": 0.7055, "step": 2839 }, { "epoch": 1.0868733256792957, "grad_norm": 0.5426042079925537, "learning_rate": 1.8215894324233475e-05, "loss": 0.7238, "step": 2840 }, { "epoch": 1.087256027554535, "grad_norm": 0.539477527141571, "learning_rate": 1.8214480864171075e-05, "loss": 0.6445, "step": 2841 }, { "epoch": 1.0876387294297742, "grad_norm": 0.5038633942604065, "learning_rate": 1.8213066899306756e-05, "loss": 0.5934, "step": 2842 }, { "epoch": 1.0880214313050134, "grad_norm": 0.5462212562561035, "learning_rate": 1.8211652429727413e-05, "loss": 0.678, "step": 2843 }, { "epoch": 1.0884041331802525, "grad_norm": 0.5340138077735901, "learning_rate": 1.821023745551997e-05, "loss": 0.6438, "step": 2844 }, { "epoch": 1.0887868350554917, "grad_norm": 0.5719809532165527, "learning_rate": 1.8208821976771376e-05, "loss": 0.6027, "step": 2845 }, { "epoch": 1.089169536930731, "grad_norm": 0.5166266560554504, "learning_rate": 1.8207405993568622e-05, "loss": 0.645, "step": 2846 }, { "epoch": 1.0895522388059702, "grad_norm": 0.5468419194221497, "learning_rate": 1.8205989505998717e-05, "loss": 0.6065, "step": 2847 }, { "epoch": 1.0899349406812093, "grad_norm": 0.523361086845398, "learning_rate": 1.8204572514148716e-05, "loss": 0.5967, "step": 2848 }, { "epoch": 1.0903176425564485, "grad_norm": 0.6374710202217102, "learning_rate": 1.8203155018105687e-05, "loss": 0.7212, "step": 2849 }, { "epoch": 1.0907003444316876, "grad_norm": 0.6424232125282288, "learning_rate": 1.8201737017956746e-05, "loss": 0.5972, "step": 2850 }, { "epoch": 1.091083046306927, "grad_norm": 0.5609733462333679, "learning_rate": 1.8200318513789036e-05, "loss": 0.6159, "step": 2851 }, { "epoch": 1.0914657481821661, "grad_norm": 0.5970624685287476, "learning_rate": 1.819889950568972e-05, "loss": 0.655, "step": 2852 }, { "epoch": 1.0918484500574053, "grad_norm": 0.5444765686988831, "learning_rate": 1.8197479993746002e-05, "loss": 0.6536, "step": 2853 }, { "epoch": 1.0922311519326444, "grad_norm": 0.6039867401123047, "learning_rate": 1.8196059978045117e-05, "loss": 0.6766, "step": 2854 }, { "epoch": 1.0926138538078836, "grad_norm": 0.5987497568130493, "learning_rate": 1.8194639458674325e-05, "loss": 0.6067, "step": 2855 }, { "epoch": 1.092996555683123, "grad_norm": 0.5705089569091797, "learning_rate": 1.8193218435720927e-05, "loss": 0.6665, "step": 2856 }, { "epoch": 1.093379257558362, "grad_norm": 0.5422529578208923, "learning_rate": 1.8191796909272244e-05, "loss": 0.7461, "step": 2857 }, { "epoch": 1.0937619594336012, "grad_norm": 0.6137611865997314, "learning_rate": 1.8190374879415634e-05, "loss": 0.7528, "step": 2858 }, { "epoch": 1.0941446613088404, "grad_norm": 0.5613712072372437, "learning_rate": 1.8188952346238483e-05, "loss": 0.7129, "step": 2859 }, { "epoch": 1.0945273631840795, "grad_norm": 0.5774744749069214, "learning_rate": 1.818752930982821e-05, "loss": 0.6079, "step": 2860 }, { "epoch": 1.0949100650593189, "grad_norm": 0.6159073114395142, "learning_rate": 1.818610577027227e-05, "loss": 0.7425, "step": 2861 }, { "epoch": 1.095292766934558, "grad_norm": 0.5808877944946289, "learning_rate": 1.8184681727658134e-05, "loss": 0.6448, "step": 2862 }, { "epoch": 1.0956754688097972, "grad_norm": 0.5235018134117126, "learning_rate": 1.818325718207332e-05, "loss": 0.6786, "step": 2863 }, { "epoch": 1.0960581706850363, "grad_norm": 0.7616720199584961, "learning_rate": 1.818183213360537e-05, "loss": 0.6489, "step": 2864 }, { "epoch": 1.0964408725602754, "grad_norm": 0.5443903207778931, "learning_rate": 1.818040658234185e-05, "loss": 0.6829, "step": 2865 }, { "epoch": 1.0968235744355148, "grad_norm": 0.562629222869873, "learning_rate": 1.8178980528370372e-05, "loss": 0.7278, "step": 2866 }, { "epoch": 1.097206276310754, "grad_norm": 0.6197822093963623, "learning_rate": 1.8177553971778564e-05, "loss": 0.6967, "step": 2867 }, { "epoch": 1.097588978185993, "grad_norm": 0.5763746500015259, "learning_rate": 1.81761269126541e-05, "loss": 0.6073, "step": 2868 }, { "epoch": 1.0979716800612322, "grad_norm": 0.5674311518669128, "learning_rate": 1.817469935108467e-05, "loss": 0.6657, "step": 2869 }, { "epoch": 1.0983543819364714, "grad_norm": 0.5695037841796875, "learning_rate": 1.8173271287158005e-05, "loss": 0.643, "step": 2870 }, { "epoch": 1.0987370838117108, "grad_norm": 0.5721834897994995, "learning_rate": 1.817184272096186e-05, "loss": 0.672, "step": 2871 }, { "epoch": 1.09911978568695, "grad_norm": 0.5074792504310608, "learning_rate": 1.817041365258403e-05, "loss": 0.7263, "step": 2872 }, { "epoch": 1.099502487562189, "grad_norm": 0.5332947373390198, "learning_rate": 1.8168984082112327e-05, "loss": 0.607, "step": 2873 }, { "epoch": 1.0998851894374282, "grad_norm": 0.5462673306465149, "learning_rate": 1.816755400963461e-05, "loss": 0.6405, "step": 2874 }, { "epoch": 1.1002678913126673, "grad_norm": 0.562000036239624, "learning_rate": 1.8166123435238754e-05, "loss": 0.6153, "step": 2875 }, { "epoch": 1.1006505931879067, "grad_norm": 0.5641255974769592, "learning_rate": 1.8164692359012676e-05, "loss": 0.6862, "step": 2876 }, { "epoch": 1.1010332950631458, "grad_norm": 0.517539381980896, "learning_rate": 1.8163260781044317e-05, "loss": 0.6271, "step": 2877 }, { "epoch": 1.101415996938385, "grad_norm": 0.4950191378593445, "learning_rate": 1.8161828701421654e-05, "loss": 0.6824, "step": 2878 }, { "epoch": 1.1017986988136241, "grad_norm": 0.5269961357116699, "learning_rate": 1.816039612023269e-05, "loss": 0.6261, "step": 2879 }, { "epoch": 1.1021814006888633, "grad_norm": 0.5390366911888123, "learning_rate": 1.8158963037565463e-05, "loss": 0.5888, "step": 2880 }, { "epoch": 1.1025641025641026, "grad_norm": 0.5086110830307007, "learning_rate": 1.8157529453508036e-05, "loss": 0.817, "step": 2881 }, { "epoch": 1.1029468044393418, "grad_norm": 0.550534188747406, "learning_rate": 1.815609536814851e-05, "loss": 0.6915, "step": 2882 }, { "epoch": 1.103329506314581, "grad_norm": 0.5194905400276184, "learning_rate": 1.815466078157501e-05, "loss": 0.7264, "step": 2883 }, { "epoch": 1.10371220818982, "grad_norm": 0.5347272753715515, "learning_rate": 1.81532256938757e-05, "loss": 0.6638, "step": 2884 }, { "epoch": 1.1040949100650592, "grad_norm": 0.538184642791748, "learning_rate": 1.8151790105138768e-05, "loss": 0.7132, "step": 2885 }, { "epoch": 1.1044776119402986, "grad_norm": 0.6017094254493713, "learning_rate": 1.8150354015452435e-05, "loss": 0.694, "step": 2886 }, { "epoch": 1.1048603138155377, "grad_norm": 0.6415824294090271, "learning_rate": 1.8148917424904952e-05, "loss": 0.725, "step": 2887 }, { "epoch": 1.1052430156907769, "grad_norm": 0.6106433272361755, "learning_rate": 1.81474803335846e-05, "loss": 0.6474, "step": 2888 }, { "epoch": 1.105625717566016, "grad_norm": 0.5710487961769104, "learning_rate": 1.8146042741579694e-05, "loss": 0.6056, "step": 2889 }, { "epoch": 1.1060084194412552, "grad_norm": 0.523276686668396, "learning_rate": 1.8144604648978575e-05, "loss": 0.674, "step": 2890 }, { "epoch": 1.1063911213164945, "grad_norm": 0.4944958984851837, "learning_rate": 1.8143166055869622e-05, "loss": 0.6589, "step": 2891 }, { "epoch": 1.1067738231917337, "grad_norm": 0.4974925220012665, "learning_rate": 1.8141726962341237e-05, "loss": 0.6645, "step": 2892 }, { "epoch": 1.1071565250669728, "grad_norm": 0.5372302532196045, "learning_rate": 1.814028736848186e-05, "loss": 0.6959, "step": 2893 }, { "epoch": 1.107539226942212, "grad_norm": 0.5534840226173401, "learning_rate": 1.813884727437995e-05, "loss": 0.654, "step": 2894 }, { "epoch": 1.107921928817451, "grad_norm": 0.5454028844833374, "learning_rate": 1.8137406680124017e-05, "loss": 0.7276, "step": 2895 }, { "epoch": 1.1083046306926905, "grad_norm": 0.5747724771499634, "learning_rate": 1.8135965585802577e-05, "loss": 0.7009, "step": 2896 }, { "epoch": 1.1086873325679296, "grad_norm": 0.534392237663269, "learning_rate": 1.8134523991504198e-05, "loss": 0.6974, "step": 2897 }, { "epoch": 1.1090700344431688, "grad_norm": 0.56070476770401, "learning_rate": 1.8133081897317467e-05, "loss": 0.7044, "step": 2898 }, { "epoch": 1.109452736318408, "grad_norm": 0.552684485912323, "learning_rate": 1.8131639303331e-05, "loss": 0.6507, "step": 2899 }, { "epoch": 1.109835438193647, "grad_norm": 0.5475170612335205, "learning_rate": 1.8130196209633455e-05, "loss": 0.6252, "step": 2900 }, { "epoch": 1.1102181400688864, "grad_norm": 0.523971676826477, "learning_rate": 1.812875261631351e-05, "loss": 0.641, "step": 2901 }, { "epoch": 1.1106008419441256, "grad_norm": 0.56175696849823, "learning_rate": 1.812730852345988e-05, "loss": 0.6138, "step": 2902 }, { "epoch": 1.1109835438193647, "grad_norm": 0.5634353160858154, "learning_rate": 1.812586393116131e-05, "loss": 0.7079, "step": 2903 }, { "epoch": 1.1113662456946038, "grad_norm": 0.48679620027542114, "learning_rate": 1.8124418839506566e-05, "loss": 0.6375, "step": 2904 }, { "epoch": 1.111748947569843, "grad_norm": 0.5090281367301941, "learning_rate": 1.812297324858446e-05, "loss": 0.7039, "step": 2905 }, { "epoch": 1.1121316494450824, "grad_norm": 0.5609080791473389, "learning_rate": 1.8121527158483828e-05, "loss": 0.6395, "step": 2906 }, { "epoch": 1.1125143513203215, "grad_norm": 0.659885585308075, "learning_rate": 1.812008056929353e-05, "loss": 0.66, "step": 2907 }, { "epoch": 1.1128970531955606, "grad_norm": 0.5357038974761963, "learning_rate": 1.811863348110247e-05, "loss": 0.6929, "step": 2908 }, { "epoch": 1.1132797550707998, "grad_norm": 0.5777322053909302, "learning_rate": 1.8117185893999568e-05, "loss": 0.6169, "step": 2909 }, { "epoch": 1.113662456946039, "grad_norm": 0.6371073126792908, "learning_rate": 1.811573780807379e-05, "loss": 0.6635, "step": 2910 }, { "epoch": 1.1140451588212783, "grad_norm": 0.495525985956192, "learning_rate": 1.811428922341412e-05, "loss": 0.6012, "step": 2911 }, { "epoch": 1.1144278606965174, "grad_norm": 0.5517506003379822, "learning_rate": 1.8112840140109576e-05, "loss": 0.6542, "step": 2912 }, { "epoch": 1.1148105625717566, "grad_norm": 0.6500807404518127, "learning_rate": 1.811139055824921e-05, "loss": 0.5947, "step": 2913 }, { "epoch": 1.1151932644469957, "grad_norm": 0.5488268733024597, "learning_rate": 1.8109940477922108e-05, "loss": 0.5736, "step": 2914 }, { "epoch": 1.1155759663222349, "grad_norm": 0.5892224907875061, "learning_rate": 1.810848989921737e-05, "loss": 0.7067, "step": 2915 }, { "epoch": 1.1159586681974742, "grad_norm": 0.5753273963928223, "learning_rate": 1.810703882222415e-05, "loss": 0.636, "step": 2916 }, { "epoch": 1.1163413700727134, "grad_norm": 0.5209982991218567, "learning_rate": 1.8105587247031612e-05, "loss": 0.5922, "step": 2917 }, { "epoch": 1.1167240719479525, "grad_norm": 0.5403946042060852, "learning_rate": 1.8104135173728962e-05, "loss": 0.6932, "step": 2918 }, { "epoch": 1.1171067738231917, "grad_norm": 0.5634833574295044, "learning_rate": 1.8102682602405434e-05, "loss": 0.7554, "step": 2919 }, { "epoch": 1.1174894756984308, "grad_norm": 0.505500078201294, "learning_rate": 1.8101229533150294e-05, "loss": 0.6218, "step": 2920 }, { "epoch": 1.1178721775736702, "grad_norm": 0.5061744451522827, "learning_rate": 1.8099775966052834e-05, "loss": 0.5973, "step": 2921 }, { "epoch": 1.1182548794489093, "grad_norm": 0.5739994049072266, "learning_rate": 1.809832190120238e-05, "loss": 0.6907, "step": 2922 }, { "epoch": 1.1186375813241485, "grad_norm": 0.5332978367805481, "learning_rate": 1.809686733868829e-05, "loss": 0.6812, "step": 2923 }, { "epoch": 1.1190202831993876, "grad_norm": 0.5748330354690552, "learning_rate": 1.809541227859995e-05, "loss": 0.6754, "step": 2924 }, { "epoch": 1.1194029850746268, "grad_norm": 0.5811464190483093, "learning_rate": 1.8093956721026774e-05, "loss": 0.6696, "step": 2925 }, { "epoch": 1.1197856869498661, "grad_norm": 0.5844445824623108, "learning_rate": 1.809250066605822e-05, "loss": 0.6938, "step": 2926 }, { "epoch": 1.1201683888251053, "grad_norm": 0.5884419083595276, "learning_rate": 1.8091044113783756e-05, "loss": 0.6783, "step": 2927 }, { "epoch": 1.1205510907003444, "grad_norm": 0.5188353061676025, "learning_rate": 1.8089587064292896e-05, "loss": 0.5952, "step": 2928 }, { "epoch": 1.1209337925755836, "grad_norm": 0.5295062065124512, "learning_rate": 1.8088129517675173e-05, "loss": 0.6384, "step": 2929 }, { "epoch": 1.1213164944508227, "grad_norm": 0.5584462881088257, "learning_rate": 1.8086671474020167e-05, "loss": 0.6405, "step": 2930 }, { "epoch": 1.121699196326062, "grad_norm": 0.5581779479980469, "learning_rate": 1.8085212933417474e-05, "loss": 0.67, "step": 2931 }, { "epoch": 1.1220818982013012, "grad_norm": 0.593387246131897, "learning_rate": 1.8083753895956726e-05, "loss": 0.5846, "step": 2932 }, { "epoch": 1.1224646000765404, "grad_norm": 0.5676791071891785, "learning_rate": 1.8082294361727583e-05, "loss": 0.6435, "step": 2933 }, { "epoch": 1.1228473019517795, "grad_norm": 0.5276166796684265, "learning_rate": 1.808083433081974e-05, "loss": 0.613, "step": 2934 }, { "epoch": 1.1232300038270187, "grad_norm": 0.5761513113975525, "learning_rate": 1.8079373803322914e-05, "loss": 0.6626, "step": 2935 }, { "epoch": 1.123612705702258, "grad_norm": 0.5254824757575989, "learning_rate": 1.8077912779326868e-05, "loss": 0.606, "step": 2936 }, { "epoch": 1.1239954075774972, "grad_norm": 0.5325479507446289, "learning_rate": 1.807645125892138e-05, "loss": 0.6767, "step": 2937 }, { "epoch": 1.1243781094527363, "grad_norm": 0.5544779300689697, "learning_rate": 1.807498924219626e-05, "loss": 0.7223, "step": 2938 }, { "epoch": 1.1247608113279755, "grad_norm": 0.5212403535842896, "learning_rate": 1.807352672924136e-05, "loss": 0.6656, "step": 2939 }, { "epoch": 1.1251435132032146, "grad_norm": 0.5291479229927063, "learning_rate": 1.8072063720146553e-05, "loss": 0.7029, "step": 2940 }, { "epoch": 1.125526215078454, "grad_norm": 0.5511521697044373, "learning_rate": 1.8070600215001746e-05, "loss": 0.7332, "step": 2941 }, { "epoch": 1.125908916953693, "grad_norm": 0.6031075716018677, "learning_rate": 1.8069136213896874e-05, "loss": 0.6838, "step": 2942 }, { "epoch": 1.1262916188289323, "grad_norm": 0.5243281722068787, "learning_rate": 1.80676717169219e-05, "loss": 0.6236, "step": 2943 }, { "epoch": 1.1266743207041714, "grad_norm": 0.5401199460029602, "learning_rate": 1.806620672416683e-05, "loss": 0.6877, "step": 2944 }, { "epoch": 1.1270570225794105, "grad_norm": 0.5372428297996521, "learning_rate": 1.806474123572169e-05, "loss": 0.6531, "step": 2945 }, { "epoch": 1.12743972445465, "grad_norm": 0.590384304523468, "learning_rate": 1.806327525167653e-05, "loss": 0.6037, "step": 2946 }, { "epoch": 1.127822426329889, "grad_norm": 0.5326341986656189, "learning_rate": 1.8061808772121444e-05, "loss": 0.6536, "step": 2947 }, { "epoch": 1.1282051282051282, "grad_norm": 0.6027411818504333, "learning_rate": 1.806034179714655e-05, "loss": 0.6665, "step": 2948 }, { "epoch": 1.1285878300803673, "grad_norm": 0.5661752820014954, "learning_rate": 1.8058874326842004e-05, "loss": 0.6363, "step": 2949 }, { "epoch": 1.1289705319556065, "grad_norm": 0.5263052582740784, "learning_rate": 1.8057406361297974e-05, "loss": 0.6558, "step": 2950 }, { "epoch": 1.1293532338308458, "grad_norm": 0.5433693528175354, "learning_rate": 1.8055937900604683e-05, "loss": 0.6525, "step": 2951 }, { "epoch": 1.129735935706085, "grad_norm": 0.5413323044776917, "learning_rate": 1.805446894485236e-05, "loss": 0.6407, "step": 2952 }, { "epoch": 1.1301186375813241, "grad_norm": 0.5427802801132202, "learning_rate": 1.8052999494131288e-05, "loss": 0.5791, "step": 2953 }, { "epoch": 1.1305013394565633, "grad_norm": 0.5116299390792847, "learning_rate": 1.8051529548531758e-05, "loss": 0.6698, "step": 2954 }, { "epoch": 1.1308840413318024, "grad_norm": 0.535752534866333, "learning_rate": 1.8050059108144108e-05, "loss": 0.7215, "step": 2955 }, { "epoch": 1.1312667432070418, "grad_norm": 0.5639509558677673, "learning_rate": 1.80485881730587e-05, "loss": 0.7937, "step": 2956 }, { "epoch": 1.131649445082281, "grad_norm": 0.5782310962677002, "learning_rate": 1.8047116743365926e-05, "loss": 0.6825, "step": 2957 }, { "epoch": 1.13203214695752, "grad_norm": 0.5726993680000305, "learning_rate": 1.804564481915621e-05, "loss": 0.6811, "step": 2958 }, { "epoch": 1.1324148488327592, "grad_norm": 0.585019588470459, "learning_rate": 1.8044172400520006e-05, "loss": 0.6806, "step": 2959 }, { "epoch": 1.1327975507079984, "grad_norm": 0.5535844564437866, "learning_rate": 1.80426994875478e-05, "loss": 0.6329, "step": 2960 }, { "epoch": 1.1331802525832377, "grad_norm": 0.49964043498039246, "learning_rate": 1.8041226080330098e-05, "loss": 0.7287, "step": 2961 }, { "epoch": 1.1335629544584769, "grad_norm": 0.5918294787406921, "learning_rate": 1.8039752178957453e-05, "loss": 0.6375, "step": 2962 }, { "epoch": 1.133945656333716, "grad_norm": 0.5546256303787231, "learning_rate": 1.803827778352044e-05, "loss": 0.6379, "step": 2963 }, { "epoch": 1.1343283582089552, "grad_norm": 0.5215321183204651, "learning_rate": 1.8036802894109658e-05, "loss": 0.6906, "step": 2964 }, { "epoch": 1.1347110600841943, "grad_norm": 0.5186118483543396, "learning_rate": 1.8035327510815752e-05, "loss": 0.6393, "step": 2965 }, { "epoch": 1.1350937619594337, "grad_norm": 0.5761914849281311, "learning_rate": 1.803385163372938e-05, "loss": 0.6909, "step": 2966 }, { "epoch": 1.1354764638346728, "grad_norm": 0.5477498769760132, "learning_rate": 1.8032375262941245e-05, "loss": 0.6976, "step": 2967 }, { "epoch": 1.135859165709912, "grad_norm": 0.5580676794052124, "learning_rate": 1.803089839854207e-05, "loss": 0.6556, "step": 2968 }, { "epoch": 1.136241867585151, "grad_norm": 0.5684165954589844, "learning_rate": 1.8029421040622614e-05, "loss": 0.625, "step": 2969 }, { "epoch": 1.1366245694603903, "grad_norm": 0.5164238810539246, "learning_rate": 1.8027943189273664e-05, "loss": 0.6602, "step": 2970 }, { "epoch": 1.1370072713356296, "grad_norm": 0.5199002027511597, "learning_rate": 1.8026464844586038e-05, "loss": 0.6912, "step": 2971 }, { "epoch": 1.1373899732108688, "grad_norm": 0.4801240563392639, "learning_rate": 1.8024986006650584e-05, "loss": 0.6798, "step": 2972 }, { "epoch": 1.137772675086108, "grad_norm": 0.6423991322517395, "learning_rate": 1.802350667555818e-05, "loss": 0.6876, "step": 2973 }, { "epoch": 1.138155376961347, "grad_norm": 0.5490931868553162, "learning_rate": 1.8022026851399737e-05, "loss": 0.6931, "step": 2974 }, { "epoch": 1.1385380788365862, "grad_norm": 0.5292242765426636, "learning_rate": 1.802054653426619e-05, "loss": 0.6332, "step": 2975 }, { "epoch": 1.1389207807118256, "grad_norm": 0.5427055954933167, "learning_rate": 1.8019065724248514e-05, "loss": 0.5783, "step": 2976 }, { "epoch": 1.1393034825870647, "grad_norm": 0.6263815760612488, "learning_rate": 1.8017584421437707e-05, "loss": 0.7095, "step": 2977 }, { "epoch": 1.1396861844623039, "grad_norm": 0.5339562892913818, "learning_rate": 1.8016102625924796e-05, "loss": 0.6387, "step": 2978 }, { "epoch": 1.140068886337543, "grad_norm": 0.5711956024169922, "learning_rate": 1.8014620337800846e-05, "loss": 0.7342, "step": 2979 }, { "epoch": 1.1404515882127821, "grad_norm": 0.561844527721405, "learning_rate": 1.8013137557156944e-05, "loss": 0.6792, "step": 2980 }, { "epoch": 1.1408342900880215, "grad_norm": 0.548886775970459, "learning_rate": 1.801165428408421e-05, "loss": 0.7487, "step": 2981 }, { "epoch": 1.1412169919632607, "grad_norm": 0.5406917929649353, "learning_rate": 1.80101705186738e-05, "loss": 0.7044, "step": 2982 }, { "epoch": 1.1415996938384998, "grad_norm": 0.5447388887405396, "learning_rate": 1.800868626101689e-05, "loss": 0.7576, "step": 2983 }, { "epoch": 1.141982395713739, "grad_norm": 0.5802203416824341, "learning_rate": 1.8007201511204694e-05, "loss": 0.666, "step": 2984 }, { "epoch": 1.142365097588978, "grad_norm": 0.5702853202819824, "learning_rate": 1.8005716269328454e-05, "loss": 0.7755, "step": 2985 }, { "epoch": 1.1427477994642175, "grad_norm": 0.5304261445999146, "learning_rate": 1.800423053547944e-05, "loss": 0.6399, "step": 2986 }, { "epoch": 1.1431305013394566, "grad_norm": 0.5553671717643738, "learning_rate": 1.8002744309748964e-05, "loss": 0.6725, "step": 2987 }, { "epoch": 1.1435132032146957, "grad_norm": 0.5801923274993896, "learning_rate": 1.8001257592228346e-05, "loss": 0.6855, "step": 2988 }, { "epoch": 1.1438959050899349, "grad_norm": 0.5858103036880493, "learning_rate": 1.799977038300895e-05, "loss": 0.6997, "step": 2989 }, { "epoch": 1.144278606965174, "grad_norm": 0.5785283446311951, "learning_rate": 1.7998282682182176e-05, "loss": 0.6702, "step": 2990 }, { "epoch": 1.1446613088404134, "grad_norm": 0.5409015417098999, "learning_rate": 1.7996794489839445e-05, "loss": 0.6467, "step": 2991 }, { "epoch": 1.1450440107156525, "grad_norm": 0.5675731301307678, "learning_rate": 1.799530580607221e-05, "loss": 0.6926, "step": 2992 }, { "epoch": 1.1454267125908917, "grad_norm": 0.5330316424369812, "learning_rate": 1.7993816630971953e-05, "loss": 0.6715, "step": 2993 }, { "epoch": 1.1458094144661308, "grad_norm": 0.5581218004226685, "learning_rate": 1.799232696463019e-05, "loss": 0.6896, "step": 2994 }, { "epoch": 1.14619211634137, "grad_norm": 0.6044667959213257, "learning_rate": 1.799083680713846e-05, "loss": 0.7125, "step": 2995 }, { "epoch": 1.1465748182166093, "grad_norm": 0.5303691029548645, "learning_rate": 1.7989346158588342e-05, "loss": 0.734, "step": 2996 }, { "epoch": 1.1469575200918485, "grad_norm": 0.5589876174926758, "learning_rate": 1.798785501907144e-05, "loss": 0.6239, "step": 2997 }, { "epoch": 1.1473402219670876, "grad_norm": 0.6180453896522522, "learning_rate": 1.7986363388679387e-05, "loss": 0.6859, "step": 2998 }, { "epoch": 1.1477229238423268, "grad_norm": 0.5235795974731445, "learning_rate": 1.798487126750385e-05, "loss": 0.6883, "step": 2999 }, { "epoch": 1.148105625717566, "grad_norm": 0.6682568788528442, "learning_rate": 1.798337865563652e-05, "loss": 0.5863, "step": 3000 }, { "epoch": 1.1484883275928053, "grad_norm": 0.5281704068183899, "learning_rate": 1.7981885553169127e-05, "loss": 0.6282, "step": 3001 }, { "epoch": 1.1488710294680444, "grad_norm": 0.5349992513656616, "learning_rate": 1.7980391960193425e-05, "loss": 0.6607, "step": 3002 }, { "epoch": 1.1492537313432836, "grad_norm": 0.5184538960456848, "learning_rate": 1.7978897876801192e-05, "loss": 0.7069, "step": 3003 }, { "epoch": 1.1496364332185227, "grad_norm": 0.5716302394866943, "learning_rate": 1.797740330308425e-05, "loss": 0.5868, "step": 3004 }, { "epoch": 1.1500191350937619, "grad_norm": 0.5467389822006226, "learning_rate": 1.7975908239134447e-05, "loss": 0.6496, "step": 3005 }, { "epoch": 1.1504018369690012, "grad_norm": 0.5270695686340332, "learning_rate": 1.7974412685043657e-05, "loss": 0.6778, "step": 3006 }, { "epoch": 1.1507845388442404, "grad_norm": 0.5374653339385986, "learning_rate": 1.7972916640903782e-05, "loss": 0.548, "step": 3007 }, { "epoch": 1.1511672407194795, "grad_norm": 0.5630999207496643, "learning_rate": 1.7971420106806762e-05, "loss": 0.6882, "step": 3008 }, { "epoch": 1.1515499425947187, "grad_norm": 0.5779929161071777, "learning_rate": 1.7969923082844557e-05, "loss": 0.6255, "step": 3009 }, { "epoch": 1.1519326444699578, "grad_norm": 0.6154858469963074, "learning_rate": 1.7968425569109173e-05, "loss": 0.6475, "step": 3010 }, { "epoch": 1.1523153463451972, "grad_norm": 0.5682929158210754, "learning_rate": 1.7966927565692627e-05, "loss": 0.648, "step": 3011 }, { "epoch": 1.1526980482204363, "grad_norm": 0.5996136665344238, "learning_rate": 1.796542907268698e-05, "loss": 0.6601, "step": 3012 }, { "epoch": 1.1530807500956755, "grad_norm": 0.5570274591445923, "learning_rate": 1.7963930090184318e-05, "loss": 0.6653, "step": 3013 }, { "epoch": 1.1534634519709146, "grad_norm": 0.5380955338478088, "learning_rate": 1.7962430618276753e-05, "loss": 0.6277, "step": 3014 }, { "epoch": 1.1538461538461537, "grad_norm": 0.4905976355075836, "learning_rate": 1.796093065705644e-05, "loss": 0.6096, "step": 3015 }, { "epoch": 1.154228855721393, "grad_norm": 0.5052477717399597, "learning_rate": 1.795943020661555e-05, "loss": 0.6126, "step": 3016 }, { "epoch": 1.1546115575966323, "grad_norm": 0.5724315643310547, "learning_rate": 1.795792926704629e-05, "loss": 0.57, "step": 3017 }, { "epoch": 1.1549942594718714, "grad_norm": 0.5466816425323486, "learning_rate": 1.7956427838440896e-05, "loss": 0.6777, "step": 3018 }, { "epoch": 1.1553769613471105, "grad_norm": 0.5569403767585754, "learning_rate": 1.795492592089164e-05, "loss": 0.7187, "step": 3019 }, { "epoch": 1.1557596632223497, "grad_norm": 0.5502663254737854, "learning_rate": 1.7953423514490817e-05, "loss": 0.7688, "step": 3020 }, { "epoch": 1.156142365097589, "grad_norm": 0.5143882036209106, "learning_rate": 1.7951920619330746e-05, "loss": 0.5522, "step": 3021 }, { "epoch": 1.1565250669728282, "grad_norm": 0.5625560879707336, "learning_rate": 1.7950417235503796e-05, "loss": 0.5452, "step": 3022 }, { "epoch": 1.1569077688480673, "grad_norm": 0.5526366829872131, "learning_rate": 1.7948913363102342e-05, "loss": 0.6618, "step": 3023 }, { "epoch": 1.1572904707233065, "grad_norm": 0.6500884890556335, "learning_rate": 1.794740900221881e-05, "loss": 0.688, "step": 3024 }, { "epoch": 1.1576731725985456, "grad_norm": 0.6201344132423401, "learning_rate": 1.7945904152945647e-05, "loss": 0.6767, "step": 3025 }, { "epoch": 1.158055874473785, "grad_norm": 0.561040997505188, "learning_rate": 1.7944398815375326e-05, "loss": 0.7081, "step": 3026 }, { "epoch": 1.1584385763490241, "grad_norm": 0.5240119695663452, "learning_rate": 1.7942892989600355e-05, "loss": 0.6325, "step": 3027 }, { "epoch": 1.1588212782242633, "grad_norm": 0.5329996943473816, "learning_rate": 1.7941386675713272e-05, "loss": 0.767, "step": 3028 }, { "epoch": 1.1592039800995024, "grad_norm": 0.6040669083595276, "learning_rate": 1.793987987380664e-05, "loss": 0.7026, "step": 3029 }, { "epoch": 1.1595866819747416, "grad_norm": 0.5640117526054382, "learning_rate": 1.7938372583973063e-05, "loss": 0.64, "step": 3030 }, { "epoch": 1.159969383849981, "grad_norm": 0.5163224339485168, "learning_rate": 1.793686480630516e-05, "loss": 0.683, "step": 3031 }, { "epoch": 1.16035208572522, "grad_norm": 0.5879204869270325, "learning_rate": 1.79353565408956e-05, "loss": 0.6385, "step": 3032 }, { "epoch": 1.1607347876004592, "grad_norm": 0.5658045411109924, "learning_rate": 1.7933847787837053e-05, "loss": 0.6824, "step": 3033 }, { "epoch": 1.1611174894756984, "grad_norm": 0.551884114742279, "learning_rate": 1.793233854722225e-05, "loss": 0.6701, "step": 3034 }, { "epoch": 1.1615001913509375, "grad_norm": 0.5149215459823608, "learning_rate": 1.7930828819143935e-05, "loss": 0.6675, "step": 3035 }, { "epoch": 1.1618828932261769, "grad_norm": 0.5138813257217407, "learning_rate": 1.792931860369488e-05, "loss": 0.6992, "step": 3036 }, { "epoch": 1.162265595101416, "grad_norm": 0.5590100884437561, "learning_rate": 1.7927807900967896e-05, "loss": 0.694, "step": 3037 }, { "epoch": 1.1626482969766552, "grad_norm": 0.49375978112220764, "learning_rate": 1.7926296711055818e-05, "loss": 0.652, "step": 3038 }, { "epoch": 1.1630309988518943, "grad_norm": 0.4904802143573761, "learning_rate": 1.7924785034051514e-05, "loss": 0.5724, "step": 3039 }, { "epoch": 1.1634137007271335, "grad_norm": 0.5105817317962646, "learning_rate": 1.7923272870047878e-05, "loss": 0.6637, "step": 3040 }, { "epoch": 1.1637964026023728, "grad_norm": 0.5336229801177979, "learning_rate": 1.7921760219137838e-05, "loss": 0.6468, "step": 3041 }, { "epoch": 1.164179104477612, "grad_norm": 0.5195471048355103, "learning_rate": 1.7920247081414354e-05, "loss": 0.6022, "step": 3042 }, { "epoch": 1.1645618063528511, "grad_norm": 0.5177996158599854, "learning_rate": 1.7918733456970408e-05, "loss": 0.6131, "step": 3043 }, { "epoch": 1.1649445082280903, "grad_norm": 0.5019701719284058, "learning_rate": 1.7917219345899017e-05, "loss": 0.6717, "step": 3044 }, { "epoch": 1.1653272101033294, "grad_norm": 0.5226551294326782, "learning_rate": 1.791570474829323e-05, "loss": 0.5541, "step": 3045 }, { "epoch": 1.1657099119785688, "grad_norm": 0.5442278385162354, "learning_rate": 1.7914189664246116e-05, "loss": 0.6936, "step": 3046 }, { "epoch": 1.166092613853808, "grad_norm": 0.526441752910614, "learning_rate": 1.7912674093850787e-05, "loss": 0.7112, "step": 3047 }, { "epoch": 1.166475315729047, "grad_norm": 0.5321860313415527, "learning_rate": 1.7911158037200377e-05, "loss": 0.7339, "step": 3048 }, { "epoch": 1.1668580176042862, "grad_norm": 0.5413110852241516, "learning_rate": 1.7909641494388057e-05, "loss": 0.6549, "step": 3049 }, { "epoch": 1.1672407194795253, "grad_norm": 0.574988842010498, "learning_rate": 1.7908124465507012e-05, "loss": 0.6175, "step": 3050 }, { "epoch": 1.1676234213547647, "grad_norm": 0.5274472236633301, "learning_rate": 1.790660695065048e-05, "loss": 0.6713, "step": 3051 }, { "epoch": 1.1680061232300039, "grad_norm": 0.518136203289032, "learning_rate": 1.7905088949911705e-05, "loss": 0.6982, "step": 3052 }, { "epoch": 1.168388825105243, "grad_norm": 0.5345322489738464, "learning_rate": 1.790357046338398e-05, "loss": 0.6657, "step": 3053 }, { "epoch": 1.1687715269804821, "grad_norm": 0.5661484003067017, "learning_rate": 1.7902051491160615e-05, "loss": 0.7077, "step": 3054 }, { "epoch": 1.1691542288557213, "grad_norm": 0.5619734525680542, "learning_rate": 1.7900532033334958e-05, "loss": 0.6492, "step": 3055 }, { "epoch": 1.1695369307309607, "grad_norm": 0.5769252181053162, "learning_rate": 1.7899012090000383e-05, "loss": 0.771, "step": 3056 }, { "epoch": 1.1699196326061998, "grad_norm": 0.5670433640480042, "learning_rate": 1.7897491661250295e-05, "loss": 0.6559, "step": 3057 }, { "epoch": 1.170302334481439, "grad_norm": 0.5164825320243835, "learning_rate": 1.7895970747178124e-05, "loss": 0.6494, "step": 3058 }, { "epoch": 1.170685036356678, "grad_norm": 0.5154972076416016, "learning_rate": 1.789444934787734e-05, "loss": 0.7062, "step": 3059 }, { "epoch": 1.1710677382319172, "grad_norm": 0.5387123227119446, "learning_rate": 1.789292746344144e-05, "loss": 0.6976, "step": 3060 }, { "epoch": 1.1714504401071566, "grad_norm": 0.5258303880691528, "learning_rate": 1.789140509396394e-05, "loss": 0.6555, "step": 3061 }, { "epoch": 1.1718331419823957, "grad_norm": 0.5416663289070129, "learning_rate": 1.7889882239538394e-05, "loss": 0.6171, "step": 3062 }, { "epoch": 1.1722158438576349, "grad_norm": 0.5327554941177368, "learning_rate": 1.7888358900258392e-05, "loss": 0.6449, "step": 3063 }, { "epoch": 1.172598545732874, "grad_norm": 0.5156478881835938, "learning_rate": 1.7886835076217538e-05, "loss": 0.6585, "step": 3064 }, { "epoch": 1.1729812476081132, "grad_norm": 0.5457035899162292, "learning_rate": 1.7885310767509486e-05, "loss": 0.6621, "step": 3065 }, { "epoch": 1.1733639494833525, "grad_norm": 0.5373361110687256, "learning_rate": 1.7883785974227903e-05, "loss": 0.615, "step": 3066 }, { "epoch": 1.1737466513585917, "grad_norm": 0.8750335574150085, "learning_rate": 1.7882260696466492e-05, "loss": 0.7802, "step": 3067 }, { "epoch": 1.1741293532338308, "grad_norm": 0.5514951944351196, "learning_rate": 1.7880734934318984e-05, "loss": 0.6675, "step": 3068 }, { "epoch": 1.17451205510907, "grad_norm": 0.5615848302841187, "learning_rate": 1.7879208687879145e-05, "loss": 0.6083, "step": 3069 }, { "epoch": 1.1748947569843091, "grad_norm": 0.549576997756958, "learning_rate": 1.7877681957240764e-05, "loss": 0.7003, "step": 3070 }, { "epoch": 1.1752774588595485, "grad_norm": 0.5433852076530457, "learning_rate": 1.787615474249766e-05, "loss": 0.6021, "step": 3071 }, { "epoch": 1.1756601607347876, "grad_norm": 0.5328896045684814, "learning_rate": 1.7874627043743692e-05, "loss": 0.6569, "step": 3072 }, { "epoch": 1.1760428626100268, "grad_norm": 0.5578414797782898, "learning_rate": 1.7873098861072736e-05, "loss": 0.7608, "step": 3073 }, { "epoch": 1.176425564485266, "grad_norm": 0.5558559894561768, "learning_rate": 1.7871570194578706e-05, "loss": 0.6693, "step": 3074 }, { "epoch": 1.176808266360505, "grad_norm": 0.5299286246299744, "learning_rate": 1.7870041044355536e-05, "loss": 0.6986, "step": 3075 }, { "epoch": 1.1771909682357444, "grad_norm": 0.5089366436004639, "learning_rate": 1.7868511410497205e-05, "loss": 0.6448, "step": 3076 }, { "epoch": 1.1775736701109836, "grad_norm": 0.5783634781837463, "learning_rate": 1.7866981293097708e-05, "loss": 0.7293, "step": 3077 }, { "epoch": 1.1779563719862227, "grad_norm": 0.5423774719238281, "learning_rate": 1.7865450692251073e-05, "loss": 0.733, "step": 3078 }, { "epoch": 1.1783390738614619, "grad_norm": 0.5360353589057922, "learning_rate": 1.786391960805137e-05, "loss": 0.7106, "step": 3079 }, { "epoch": 1.178721775736701, "grad_norm": 0.5264966487884521, "learning_rate": 1.7862388040592672e-05, "loss": 0.6893, "step": 3080 }, { "epoch": 1.1791044776119404, "grad_norm": 0.5321733951568604, "learning_rate": 1.7860855989969114e-05, "loss": 0.6835, "step": 3081 }, { "epoch": 1.1794871794871795, "grad_norm": 0.5153892040252686, "learning_rate": 1.7859323456274832e-05, "loss": 0.6801, "step": 3082 }, { "epoch": 1.1798698813624187, "grad_norm": 0.5204899311065674, "learning_rate": 1.785779043960401e-05, "loss": 0.68, "step": 3083 }, { "epoch": 1.1802525832376578, "grad_norm": 0.5073541402816772, "learning_rate": 1.785625694005086e-05, "loss": 0.6288, "step": 3084 }, { "epoch": 1.180635285112897, "grad_norm": 0.5237618684768677, "learning_rate": 1.785472295770961e-05, "loss": 0.7389, "step": 3085 }, { "epoch": 1.1810179869881363, "grad_norm": 0.56654953956604, "learning_rate": 1.7853188492674536e-05, "loss": 0.6995, "step": 3086 }, { "epoch": 1.1814006888633755, "grad_norm": 0.5160045027732849, "learning_rate": 1.785165354503993e-05, "loss": 0.6437, "step": 3087 }, { "epoch": 1.1817833907386146, "grad_norm": 0.5700361728668213, "learning_rate": 1.785011811490012e-05, "loss": 0.6759, "step": 3088 }, { "epoch": 1.1821660926138537, "grad_norm": 0.5725778341293335, "learning_rate": 1.7848582202349464e-05, "loss": 0.6436, "step": 3089 }, { "epoch": 1.182548794489093, "grad_norm": 0.5302454233169556, "learning_rate": 1.7847045807482347e-05, "loss": 0.6971, "step": 3090 }, { "epoch": 1.1829314963643323, "grad_norm": 0.5214876532554626, "learning_rate": 1.7845508930393182e-05, "loss": 0.7218, "step": 3091 }, { "epoch": 1.1833141982395714, "grad_norm": 0.48413628339767456, "learning_rate": 1.784397157117642e-05, "loss": 0.6834, "step": 3092 }, { "epoch": 1.1836969001148105, "grad_norm": 0.5152813196182251, "learning_rate": 1.784243372992653e-05, "loss": 0.7165, "step": 3093 }, { "epoch": 1.1840796019900497, "grad_norm": 0.5214792490005493, "learning_rate": 1.784089540673802e-05, "loss": 0.6772, "step": 3094 }, { "epoch": 1.1844623038652888, "grad_norm": 0.5429422855377197, "learning_rate": 1.7839356601705416e-05, "loss": 0.7328, "step": 3095 }, { "epoch": 1.1848450057405282, "grad_norm": 0.5910903811454773, "learning_rate": 1.7837817314923296e-05, "loss": 0.7493, "step": 3096 }, { "epoch": 1.1852277076157673, "grad_norm": 0.5771308541297913, "learning_rate": 1.783627754648624e-05, "loss": 0.5835, "step": 3097 }, { "epoch": 1.1856104094910065, "grad_norm": 0.556025505065918, "learning_rate": 1.7834737296488886e-05, "loss": 0.6847, "step": 3098 }, { "epoch": 1.1859931113662456, "grad_norm": 0.5414633750915527, "learning_rate": 1.7833196565025867e-05, "loss": 0.6544, "step": 3099 }, { "epoch": 1.1863758132414848, "grad_norm": 0.5026599168777466, "learning_rate": 1.7831655352191882e-05, "loss": 0.6461, "step": 3100 }, { "epoch": 1.1867585151167241, "grad_norm": 0.5681809186935425, "learning_rate": 1.783011365808163e-05, "loss": 0.6373, "step": 3101 }, { "epoch": 1.1871412169919633, "grad_norm": 0.5443688035011292, "learning_rate": 1.7828571482789865e-05, "loss": 0.6288, "step": 3102 }, { "epoch": 1.1875239188672024, "grad_norm": 0.5772762894630432, "learning_rate": 1.7827028826411345e-05, "loss": 0.6599, "step": 3103 }, { "epoch": 1.1879066207424416, "grad_norm": 0.5226432681083679, "learning_rate": 1.7825485689040882e-05, "loss": 0.5911, "step": 3104 }, { "epoch": 1.1882893226176807, "grad_norm": 0.5330497026443481, "learning_rate": 1.7823942070773298e-05, "loss": 0.5879, "step": 3105 }, { "epoch": 1.18867202449292, "grad_norm": 0.5524353981018066, "learning_rate": 1.782239797170345e-05, "loss": 0.6637, "step": 3106 }, { "epoch": 1.1890547263681592, "grad_norm": 0.5964124798774719, "learning_rate": 1.782085339192624e-05, "loss": 0.6578, "step": 3107 }, { "epoch": 1.1894374282433984, "grad_norm": 0.5260863304138184, "learning_rate": 1.7819308331536572e-05, "loss": 0.7476, "step": 3108 }, { "epoch": 1.1898201301186375, "grad_norm": 0.5296857953071594, "learning_rate": 1.7817762790629406e-05, "loss": 0.6501, "step": 3109 }, { "epoch": 1.1902028319938767, "grad_norm": 0.5716163516044617, "learning_rate": 1.781621676929971e-05, "loss": 0.6416, "step": 3110 }, { "epoch": 1.190585533869116, "grad_norm": 0.5150159597396851, "learning_rate": 1.7814670267642495e-05, "loss": 0.5777, "step": 3111 }, { "epoch": 1.1909682357443552, "grad_norm": 0.5444925427436829, "learning_rate": 1.78131232857528e-05, "loss": 0.6484, "step": 3112 }, { "epoch": 1.1913509376195943, "grad_norm": 0.5387150645256042, "learning_rate": 1.781157582372569e-05, "loss": 0.6807, "step": 3113 }, { "epoch": 1.1917336394948335, "grad_norm": 0.5446691513061523, "learning_rate": 1.7810027881656257e-05, "loss": 0.6443, "step": 3114 }, { "epoch": 1.1921163413700726, "grad_norm": 0.5936750173568726, "learning_rate": 1.780847945963963e-05, "loss": 0.5899, "step": 3115 }, { "epoch": 1.192499043245312, "grad_norm": 0.5825581550598145, "learning_rate": 1.7806930557770965e-05, "loss": 0.7086, "step": 3116 }, { "epoch": 1.1928817451205511, "grad_norm": 0.5384499430656433, "learning_rate": 1.7805381176145443e-05, "loss": 0.6463, "step": 3117 }, { "epoch": 1.1932644469957903, "grad_norm": 0.5382066369056702, "learning_rate": 1.7803831314858276e-05, "loss": 0.6337, "step": 3118 }, { "epoch": 1.1936471488710294, "grad_norm": 0.4991798400878906, "learning_rate": 1.7802280974004717e-05, "loss": 0.6638, "step": 3119 }, { "epoch": 1.1940298507462686, "grad_norm": 0.5484002828598022, "learning_rate": 1.7800730153680023e-05, "loss": 0.656, "step": 3120 }, { "epoch": 1.194412552621508, "grad_norm": 0.5421544909477234, "learning_rate": 1.779917885397951e-05, "loss": 0.597, "step": 3121 }, { "epoch": 1.194795254496747, "grad_norm": 0.5294302105903625, "learning_rate": 1.77976270749985e-05, "loss": 0.6636, "step": 3122 }, { "epoch": 1.1951779563719862, "grad_norm": 0.47925329208374023, "learning_rate": 1.779607481683236e-05, "loss": 0.642, "step": 3123 }, { "epoch": 1.1955606582472253, "grad_norm": 0.5063508152961731, "learning_rate": 1.7794522079576476e-05, "loss": 0.6326, "step": 3124 }, { "epoch": 1.1959433601224645, "grad_norm": 0.5942808985710144, "learning_rate": 1.7792968863326273e-05, "loss": 0.6892, "step": 3125 }, { "epoch": 1.1963260619977039, "grad_norm": 0.5831581354141235, "learning_rate": 1.77914151681772e-05, "loss": 0.6428, "step": 3126 }, { "epoch": 1.196708763872943, "grad_norm": 0.5064157247543335, "learning_rate": 1.778986099422473e-05, "loss": 0.6281, "step": 3127 }, { "epoch": 1.1970914657481821, "grad_norm": 0.5684926509857178, "learning_rate": 1.7788306341564373e-05, "loss": 0.7282, "step": 3128 }, { "epoch": 1.1974741676234213, "grad_norm": 0.5056917667388916, "learning_rate": 1.7786751210291674e-05, "loss": 0.6392, "step": 3129 }, { "epoch": 1.1978568694986604, "grad_norm": 0.6160793900489807, "learning_rate": 1.778519560050219e-05, "loss": 0.7747, "step": 3130 }, { "epoch": 1.1982395713738998, "grad_norm": 0.535057008266449, "learning_rate": 1.778363951229152e-05, "loss": 0.7118, "step": 3131 }, { "epoch": 1.198622273249139, "grad_norm": 0.5065245628356934, "learning_rate": 1.7782082945755296e-05, "loss": 0.7113, "step": 3132 }, { "epoch": 1.199004975124378, "grad_norm": 0.522930383682251, "learning_rate": 1.778052590098916e-05, "loss": 0.6878, "step": 3133 }, { "epoch": 1.1993876769996172, "grad_norm": 0.5242587327957153, "learning_rate": 1.7778968378088815e-05, "loss": 0.6746, "step": 3134 }, { "epoch": 1.1997703788748564, "grad_norm": 0.4892423152923584, "learning_rate": 1.777741037714996e-05, "loss": 0.6059, "step": 3135 }, { "epoch": 1.2001530807500957, "grad_norm": 0.5419400334358215, "learning_rate": 1.7775851898268347e-05, "loss": 0.6396, "step": 3136 }, { "epoch": 1.200535782625335, "grad_norm": 0.5421422719955444, "learning_rate": 1.777429294153974e-05, "loss": 0.6308, "step": 3137 }, { "epoch": 1.200918484500574, "grad_norm": 0.5699341297149658, "learning_rate": 1.7772733507059947e-05, "loss": 0.5791, "step": 3138 }, { "epoch": 1.2013011863758132, "grad_norm": 0.5921555161476135, "learning_rate": 1.77711735949248e-05, "loss": 0.6859, "step": 3139 }, { "epoch": 1.2016838882510523, "grad_norm": 0.5051217675209045, "learning_rate": 1.776961320523016e-05, "loss": 0.6586, "step": 3140 }, { "epoch": 1.2020665901262917, "grad_norm": 0.6064521670341492, "learning_rate": 1.7768052338071916e-05, "loss": 0.6314, "step": 3141 }, { "epoch": 1.2024492920015308, "grad_norm": 0.6199438571929932, "learning_rate": 1.7766490993545984e-05, "loss": 0.6147, "step": 3142 }, { "epoch": 1.20283199387677, "grad_norm": 0.6231129765510559, "learning_rate": 1.7764929171748316e-05, "loss": 0.672, "step": 3143 }, { "epoch": 1.2032146957520091, "grad_norm": 0.584602415561676, "learning_rate": 1.776336687277489e-05, "loss": 0.6958, "step": 3144 }, { "epoch": 1.2035973976272483, "grad_norm": 0.6529060006141663, "learning_rate": 1.7761804096721714e-05, "loss": 0.6686, "step": 3145 }, { "epoch": 1.2039800995024876, "grad_norm": 0.6060982942581177, "learning_rate": 1.7760240843684825e-05, "loss": 0.6823, "step": 3146 }, { "epoch": 1.2043628013777268, "grad_norm": 0.5494998693466187, "learning_rate": 1.775867711376029e-05, "loss": 0.702, "step": 3147 }, { "epoch": 1.204745503252966, "grad_norm": 0.5233975648880005, "learning_rate": 1.77571129070442e-05, "loss": 0.6198, "step": 3148 }, { "epoch": 1.205128205128205, "grad_norm": 0.5007618069648743, "learning_rate": 1.7755548223632684e-05, "loss": 0.6506, "step": 3149 }, { "epoch": 1.2055109070034442, "grad_norm": 0.544653058052063, "learning_rate": 1.775398306362189e-05, "loss": 0.6309, "step": 3150 }, { "epoch": 1.2058936088786836, "grad_norm": 0.5458307266235352, "learning_rate": 1.775241742710801e-05, "loss": 0.6763, "step": 3151 }, { "epoch": 1.2062763107539227, "grad_norm": 0.615001380443573, "learning_rate": 1.7750851314187254e-05, "loss": 0.6432, "step": 3152 }, { "epoch": 1.2066590126291619, "grad_norm": 0.5707245469093323, "learning_rate": 1.774928472495586e-05, "loss": 0.6962, "step": 3153 }, { "epoch": 1.207041714504401, "grad_norm": 0.5520814061164856, "learning_rate": 1.77477176595101e-05, "loss": 0.7225, "step": 3154 }, { "epoch": 1.2074244163796402, "grad_norm": 0.520765483379364, "learning_rate": 1.7746150117946278e-05, "loss": 0.6689, "step": 3155 }, { "epoch": 1.2078071182548795, "grad_norm": 0.5440402030944824, "learning_rate": 1.774458210036072e-05, "loss": 0.7167, "step": 3156 }, { "epoch": 1.2081898201301187, "grad_norm": 0.7262790203094482, "learning_rate": 1.7743013606849787e-05, "loss": 0.7619, "step": 3157 }, { "epoch": 1.2085725220053578, "grad_norm": 0.6472504138946533, "learning_rate": 1.7741444637509864e-05, "loss": 0.7094, "step": 3158 }, { "epoch": 1.208955223880597, "grad_norm": 0.5272620320320129, "learning_rate": 1.7739875192437372e-05, "loss": 0.6981, "step": 3159 }, { "epoch": 1.209337925755836, "grad_norm": 0.5675913095474243, "learning_rate": 1.773830527172876e-05, "loss": 0.7158, "step": 3160 }, { "epoch": 1.2097206276310755, "grad_norm": 0.5238757729530334, "learning_rate": 1.7736734875480493e-05, "loss": 0.7001, "step": 3161 }, { "epoch": 1.2101033295063146, "grad_norm": 0.58238685131073, "learning_rate": 1.7735164003789087e-05, "loss": 0.6114, "step": 3162 }, { "epoch": 1.2104860313815538, "grad_norm": 0.7079142928123474, "learning_rate": 1.7733592656751076e-05, "loss": 0.682, "step": 3163 }, { "epoch": 1.210868733256793, "grad_norm": 0.5527577996253967, "learning_rate": 1.7732020834463016e-05, "loss": 0.728, "step": 3164 }, { "epoch": 1.211251435132032, "grad_norm": 0.5521705746650696, "learning_rate": 1.7730448537021504e-05, "loss": 0.6866, "step": 3165 }, { "epoch": 1.2116341370072714, "grad_norm": 0.6060081720352173, "learning_rate": 1.772887576452316e-05, "loss": 0.6423, "step": 3166 }, { "epoch": 1.2120168388825105, "grad_norm": 0.6088805198669434, "learning_rate": 1.772730251706464e-05, "loss": 0.6202, "step": 3167 }, { "epoch": 1.2123995407577497, "grad_norm": 0.5986154079437256, "learning_rate": 1.7725728794742617e-05, "loss": 0.7225, "step": 3168 }, { "epoch": 1.2127822426329888, "grad_norm": 0.531085193157196, "learning_rate": 1.772415459765381e-05, "loss": 0.7164, "step": 3169 }, { "epoch": 1.213164944508228, "grad_norm": 0.5867754817008972, "learning_rate": 1.7722579925894948e-05, "loss": 0.6102, "step": 3170 }, { "epoch": 1.2135476463834673, "grad_norm": 0.6414543986320496, "learning_rate": 1.7721004779562805e-05, "loss": 0.6446, "step": 3171 }, { "epoch": 1.2139303482587065, "grad_norm": 0.5842898488044739, "learning_rate": 1.7719429158754175e-05, "loss": 0.6434, "step": 3172 }, { "epoch": 1.2143130501339456, "grad_norm": 0.5612961649894714, "learning_rate": 1.7717853063565886e-05, "loss": 0.6477, "step": 3173 }, { "epoch": 1.2146957520091848, "grad_norm": 0.5649941563606262, "learning_rate": 1.7716276494094792e-05, "loss": 0.7325, "step": 3174 }, { "epoch": 1.215078453884424, "grad_norm": 0.5071946382522583, "learning_rate": 1.7714699450437778e-05, "loss": 0.714, "step": 3175 }, { "epoch": 1.2154611557596633, "grad_norm": 0.6029782891273499, "learning_rate": 1.7713121932691755e-05, "loss": 0.6061, "step": 3176 }, { "epoch": 1.2158438576349024, "grad_norm": 0.5633662939071655, "learning_rate": 1.7711543940953667e-05, "loss": 0.6181, "step": 3177 }, { "epoch": 1.2162265595101416, "grad_norm": 0.5389094948768616, "learning_rate": 1.7709965475320493e-05, "loss": 0.6587, "step": 3178 }, { "epoch": 1.2166092613853807, "grad_norm": 0.508328378200531, "learning_rate": 1.770838653588922e-05, "loss": 0.7111, "step": 3179 }, { "epoch": 1.2169919632606199, "grad_norm": 0.5325565338134766, "learning_rate": 1.7706807122756894e-05, "loss": 0.6701, "step": 3180 }, { "epoch": 1.2173746651358592, "grad_norm": 0.5661947727203369, "learning_rate": 1.7705227236020562e-05, "loss": 0.705, "step": 3181 }, { "epoch": 1.2177573670110984, "grad_norm": 0.5757995843887329, "learning_rate": 1.7703646875777315e-05, "loss": 0.7218, "step": 3182 }, { "epoch": 1.2181400688863375, "grad_norm": 0.615358293056488, "learning_rate": 1.770206604212427e-05, "loss": 0.6463, "step": 3183 }, { "epoch": 1.2185227707615767, "grad_norm": 0.5292381048202515, "learning_rate": 1.7700484735158584e-05, "loss": 0.5785, "step": 3184 }, { "epoch": 1.2189054726368158, "grad_norm": 0.5448427796363831, "learning_rate": 1.7698902954977416e-05, "loss": 0.7103, "step": 3185 }, { "epoch": 1.2192881745120552, "grad_norm": 0.5506073832511902, "learning_rate": 1.769732070167798e-05, "loss": 0.7408, "step": 3186 }, { "epoch": 1.2196708763872943, "grad_norm": 0.5429569482803345, "learning_rate": 1.769573797535751e-05, "loss": 0.5771, "step": 3187 }, { "epoch": 1.2200535782625335, "grad_norm": 0.5558047890663147, "learning_rate": 1.7694154776113265e-05, "loss": 0.7448, "step": 3188 }, { "epoch": 1.2204362801377726, "grad_norm": 0.7137319445610046, "learning_rate": 1.769257110404254e-05, "loss": 0.673, "step": 3189 }, { "epoch": 1.2208189820130118, "grad_norm": 0.52690589427948, "learning_rate": 1.7690986959242653e-05, "loss": 0.7118, "step": 3190 }, { "epoch": 1.2212016838882511, "grad_norm": 0.5224199891090393, "learning_rate": 1.7689402341810956e-05, "loss": 0.6084, "step": 3191 }, { "epoch": 1.2215843857634903, "grad_norm": 0.5181595087051392, "learning_rate": 1.768781725184483e-05, "loss": 0.7464, "step": 3192 }, { "epoch": 1.2219670876387294, "grad_norm": 0.5421761274337769, "learning_rate": 1.7686231689441676e-05, "loss": 0.66, "step": 3193 }, { "epoch": 1.2223497895139686, "grad_norm": 1.2133089303970337, "learning_rate": 1.768464565469894e-05, "loss": 0.6504, "step": 3194 }, { "epoch": 1.2227324913892077, "grad_norm": 0.5721228718757629, "learning_rate": 1.7683059147714084e-05, "loss": 0.5294, "step": 3195 }, { "epoch": 1.223115193264447, "grad_norm": 0.49611231684684753, "learning_rate": 1.7681472168584598e-05, "loss": 0.7097, "step": 3196 }, { "epoch": 1.2234978951396862, "grad_norm": 0.6374525427818298, "learning_rate": 1.7679884717408015e-05, "loss": 0.6373, "step": 3197 }, { "epoch": 1.2238805970149254, "grad_norm": 0.5605248808860779, "learning_rate": 1.7678296794281886e-05, "loss": 0.6929, "step": 3198 }, { "epoch": 1.2242632988901645, "grad_norm": 0.5289474725723267, "learning_rate": 1.7676708399303784e-05, "loss": 0.6619, "step": 3199 }, { "epoch": 1.2246460007654036, "grad_norm": 0.5405463576316833, "learning_rate": 1.7675119532571334e-05, "loss": 0.6437, "step": 3200 }, { "epoch": 1.225028702640643, "grad_norm": 0.5452598929405212, "learning_rate": 1.767353019418217e-05, "loss": 0.6689, "step": 3201 }, { "epoch": 1.2254114045158822, "grad_norm": 0.5843181014060974, "learning_rate": 1.7671940384233957e-05, "loss": 0.7177, "step": 3202 }, { "epoch": 1.2257941063911213, "grad_norm": 0.6069718599319458, "learning_rate": 1.76703501028244e-05, "loss": 0.6855, "step": 3203 }, { "epoch": 1.2261768082663604, "grad_norm": 0.5774433612823486, "learning_rate": 1.766875935005122e-05, "loss": 0.6994, "step": 3204 }, { "epoch": 1.2265595101415996, "grad_norm": 0.5650824308395386, "learning_rate": 1.7667168126012175e-05, "loss": 0.6538, "step": 3205 }, { "epoch": 1.226942212016839, "grad_norm": 0.5215293169021606, "learning_rate": 1.7665576430805053e-05, "loss": 0.7048, "step": 3206 }, { "epoch": 1.227324913892078, "grad_norm": 0.5755517482757568, "learning_rate": 1.7663984264527666e-05, "loss": 0.5507, "step": 3207 }, { "epoch": 1.2277076157673172, "grad_norm": 0.5516165494918823, "learning_rate": 1.7662391627277856e-05, "loss": 0.677, "step": 3208 }, { "epoch": 1.2280903176425564, "grad_norm": 0.5148214101791382, "learning_rate": 1.7660798519153493e-05, "loss": 0.7365, "step": 3209 }, { "epoch": 1.2284730195177955, "grad_norm": 0.5518274307250977, "learning_rate": 1.7659204940252487e-05, "loss": 0.6017, "step": 3210 }, { "epoch": 1.228855721393035, "grad_norm": 0.5215811133384705, "learning_rate": 1.7657610890672758e-05, "loss": 0.7563, "step": 3211 }, { "epoch": 1.229238423268274, "grad_norm": 0.607500433921814, "learning_rate": 1.7656016370512268e-05, "loss": 0.6916, "step": 3212 }, { "epoch": 1.2296211251435132, "grad_norm": 0.5462857484817505, "learning_rate": 1.7654421379869004e-05, "loss": 0.7721, "step": 3213 }, { "epoch": 1.2300038270187523, "grad_norm": 0.5749751329421997, "learning_rate": 1.765282591884098e-05, "loss": 0.6172, "step": 3214 }, { "epoch": 1.2303865288939915, "grad_norm": 0.4852350652217865, "learning_rate": 1.7651229987526247e-05, "loss": 0.6867, "step": 3215 }, { "epoch": 1.2307692307692308, "grad_norm": 0.5173105597496033, "learning_rate": 1.7649633586022873e-05, "loss": 0.6721, "step": 3216 }, { "epoch": 1.23115193264447, "grad_norm": 0.5956243276596069, "learning_rate": 1.7648036714428967e-05, "loss": 0.6447, "step": 3217 }, { "epoch": 1.2315346345197091, "grad_norm": 0.6429666876792908, "learning_rate": 1.7646439372842657e-05, "loss": 0.6836, "step": 3218 }, { "epoch": 1.2319173363949483, "grad_norm": 0.5668161511421204, "learning_rate": 1.7644841561362107e-05, "loss": 0.7064, "step": 3219 }, { "epoch": 1.2323000382701874, "grad_norm": 0.6651352047920227, "learning_rate": 1.7643243280085506e-05, "loss": 0.6903, "step": 3220 }, { "epoch": 1.2326827401454268, "grad_norm": 0.5475209355354309, "learning_rate": 1.764164452911107e-05, "loss": 0.7943, "step": 3221 }, { "epoch": 1.233065442020666, "grad_norm": 0.6439547538757324, "learning_rate": 1.764004530853705e-05, "loss": 0.7598, "step": 3222 }, { "epoch": 1.233448143895905, "grad_norm": 0.5534660816192627, "learning_rate": 1.763844561846172e-05, "loss": 0.5971, "step": 3223 }, { "epoch": 1.2338308457711442, "grad_norm": 0.5687600374221802, "learning_rate": 1.7636845458983386e-05, "loss": 0.6808, "step": 3224 }, { "epoch": 1.2342135476463834, "grad_norm": 0.5259628295898438, "learning_rate": 1.7635244830200387e-05, "loss": 0.6645, "step": 3225 }, { "epoch": 1.2345962495216227, "grad_norm": 0.5375024080276489, "learning_rate": 1.7633643732211076e-05, "loss": 0.6311, "step": 3226 }, { "epoch": 1.2349789513968619, "grad_norm": 0.5478159785270691, "learning_rate": 1.7632042165113848e-05, "loss": 0.815, "step": 3227 }, { "epoch": 1.235361653272101, "grad_norm": 0.5421873331069946, "learning_rate": 1.763044012900713e-05, "loss": 0.6845, "step": 3228 }, { "epoch": 1.2357443551473402, "grad_norm": 0.5113847851753235, "learning_rate": 1.7628837623989366e-05, "loss": 0.7758, "step": 3229 }, { "epoch": 1.2361270570225793, "grad_norm": 0.486473947763443, "learning_rate": 1.7627234650159037e-05, "loss": 0.6947, "step": 3230 }, { "epoch": 1.2365097588978187, "grad_norm": 0.5600213408470154, "learning_rate": 1.762563120761464e-05, "loss": 0.7093, "step": 3231 }, { "epoch": 1.2368924607730578, "grad_norm": 0.5454961061477661, "learning_rate": 1.7624027296454728e-05, "loss": 0.6673, "step": 3232 }, { "epoch": 1.237275162648297, "grad_norm": 0.6061163544654846, "learning_rate": 1.7622422916777853e-05, "loss": 0.6848, "step": 3233 }, { "epoch": 1.237657864523536, "grad_norm": 0.531745433807373, "learning_rate": 1.7620818068682614e-05, "loss": 0.6615, "step": 3234 }, { "epoch": 1.2380405663987752, "grad_norm": 0.5857110619544983, "learning_rate": 1.7619212752267628e-05, "loss": 0.6703, "step": 3235 }, { "epoch": 1.2384232682740146, "grad_norm": 0.582463264465332, "learning_rate": 1.7617606967631555e-05, "loss": 0.6499, "step": 3236 }, { "epoch": 1.2388059701492538, "grad_norm": 0.49252861738204956, "learning_rate": 1.7616000714873065e-05, "loss": 0.6673, "step": 3237 }, { "epoch": 1.239188672024493, "grad_norm": 0.5118803381919861, "learning_rate": 1.7614393994090873e-05, "loss": 0.6464, "step": 3238 }, { "epoch": 1.239571373899732, "grad_norm": 0.5009279251098633, "learning_rate": 1.761278680538371e-05, "loss": 0.6363, "step": 3239 }, { "epoch": 1.2399540757749712, "grad_norm": 0.5029326677322388, "learning_rate": 1.761117914885035e-05, "loss": 0.6388, "step": 3240 }, { "epoch": 1.2403367776502106, "grad_norm": 0.5135133862495422, "learning_rate": 1.7609571024589582e-05, "loss": 0.6273, "step": 3241 }, { "epoch": 1.2407194795254497, "grad_norm": 0.4708830714225769, "learning_rate": 1.7607962432700233e-05, "loss": 0.6216, "step": 3242 }, { "epoch": 1.2411021814006888, "grad_norm": 0.5175661444664001, "learning_rate": 1.7606353373281155e-05, "loss": 0.6277, "step": 3243 }, { "epoch": 1.241484883275928, "grad_norm": 0.5665274262428284, "learning_rate": 1.7604743846431224e-05, "loss": 0.6797, "step": 3244 }, { "epoch": 1.2418675851511671, "grad_norm": 0.5175936818122864, "learning_rate": 1.7603133852249357e-05, "loss": 0.6612, "step": 3245 }, { "epoch": 1.2422502870264065, "grad_norm": 0.5226302742958069, "learning_rate": 1.7601523390834487e-05, "loss": 0.7418, "step": 3246 }, { "epoch": 1.2426329889016456, "grad_norm": 0.5438722372055054, "learning_rate": 1.7599912462285585e-05, "loss": 0.6255, "step": 3247 }, { "epoch": 1.2430156907768848, "grad_norm": 0.5723857283592224, "learning_rate": 1.7598301066701646e-05, "loss": 0.7362, "step": 3248 }, { "epoch": 1.243398392652124, "grad_norm": 0.5339874029159546, "learning_rate": 1.759668920418169e-05, "loss": 0.6251, "step": 3249 }, { "epoch": 1.243781094527363, "grad_norm": 0.6111608743667603, "learning_rate": 1.7595076874824774e-05, "loss": 0.6356, "step": 3250 }, { "epoch": 1.2441637964026024, "grad_norm": 0.5695353746414185, "learning_rate": 1.7593464078729983e-05, "loss": 0.7162, "step": 3251 }, { "epoch": 1.2445464982778416, "grad_norm": 0.5203403830528259, "learning_rate": 1.7591850815996423e-05, "loss": 0.5992, "step": 3252 }, { "epoch": 1.2449292001530807, "grad_norm": 0.5921613574028015, "learning_rate": 1.7590237086723235e-05, "loss": 0.7205, "step": 3253 }, { "epoch": 1.2453119020283199, "grad_norm": 0.5700814723968506, "learning_rate": 1.7588622891009586e-05, "loss": 0.6363, "step": 3254 }, { "epoch": 1.245694603903559, "grad_norm": 0.5269098281860352, "learning_rate": 1.7587008228954677e-05, "loss": 0.5974, "step": 3255 }, { "epoch": 1.2460773057787984, "grad_norm": 0.5355486869812012, "learning_rate": 1.7585393100657726e-05, "loss": 0.6644, "step": 3256 }, { "epoch": 1.2464600076540375, "grad_norm": 0.5159132480621338, "learning_rate": 1.7583777506217995e-05, "loss": 0.6451, "step": 3257 }, { "epoch": 1.2468427095292767, "grad_norm": 0.510017991065979, "learning_rate": 1.7582161445734757e-05, "loss": 0.6817, "step": 3258 }, { "epoch": 1.2472254114045158, "grad_norm": 0.5201574563980103, "learning_rate": 1.758054491930733e-05, "loss": 0.6785, "step": 3259 }, { "epoch": 1.247608113279755, "grad_norm": 0.559138298034668, "learning_rate": 1.7578927927035056e-05, "loss": 0.6199, "step": 3260 }, { "epoch": 1.2479908151549943, "grad_norm": 0.5250104665756226, "learning_rate": 1.75773104690173e-05, "loss": 0.664, "step": 3261 }, { "epoch": 1.2483735170302335, "grad_norm": 0.5482470989227295, "learning_rate": 1.7575692545353456e-05, "loss": 0.6738, "step": 3262 }, { "epoch": 1.2487562189054726, "grad_norm": 0.6086686849594116, "learning_rate": 1.757407415614296e-05, "loss": 0.6332, "step": 3263 }, { "epoch": 1.2491389207807118, "grad_norm": 0.5794693231582642, "learning_rate": 1.757245530148525e-05, "loss": 0.6195, "step": 3264 }, { "epoch": 1.249521622655951, "grad_norm": 0.5203121304512024, "learning_rate": 1.7570835981479824e-05, "loss": 0.6288, "step": 3265 }, { "epoch": 1.2499043245311903, "grad_norm": 0.5561046004295349, "learning_rate": 1.7569216196226185e-05, "loss": 0.6321, "step": 3266 }, { "epoch": 1.2502870264064294, "grad_norm": 0.5190437436103821, "learning_rate": 1.7567595945823878e-05, "loss": 0.689, "step": 3267 }, { "epoch": 1.2506697282816686, "grad_norm": 0.528041422367096, "learning_rate": 1.7565975230372473e-05, "loss": 0.6415, "step": 3268 }, { "epoch": 1.2510524301569077, "grad_norm": 0.5371917486190796, "learning_rate": 1.7564354049971557e-05, "loss": 0.6621, "step": 3269 }, { "epoch": 1.2514351320321468, "grad_norm": 0.5422175526618958, "learning_rate": 1.756273240472077e-05, "loss": 0.581, "step": 3270 }, { "epoch": 1.2518178339073862, "grad_norm": 0.49179041385650635, "learning_rate": 1.7561110294719755e-05, "loss": 0.6786, "step": 3271 }, { "epoch": 1.2522005357826254, "grad_norm": 0.5627067685127258, "learning_rate": 1.75594877200682e-05, "loss": 0.6787, "step": 3272 }, { "epoch": 1.2525832376578645, "grad_norm": 0.5671417713165283, "learning_rate": 1.7557864680865817e-05, "loss": 0.6143, "step": 3273 }, { "epoch": 1.2529659395331036, "grad_norm": 0.5404809713363647, "learning_rate": 1.755624117721235e-05, "loss": 0.6483, "step": 3274 }, { "epoch": 1.2533486414083428, "grad_norm": 0.4864318370819092, "learning_rate": 1.7554617209207552e-05, "loss": 0.6653, "step": 3275 }, { "epoch": 1.2537313432835822, "grad_norm": 0.5402732491493225, "learning_rate": 1.755299277695124e-05, "loss": 0.5274, "step": 3276 }, { "epoch": 1.2541140451588213, "grad_norm": 0.5468334555625916, "learning_rate": 1.755136788054323e-05, "loss": 0.7189, "step": 3277 }, { "epoch": 1.2544967470340604, "grad_norm": 0.5038975477218628, "learning_rate": 1.7549742520083373e-05, "loss": 0.658, "step": 3278 }, { "epoch": 1.2548794489092996, "grad_norm": 0.6174642443656921, "learning_rate": 1.7548116695671557e-05, "loss": 0.6737, "step": 3279 }, { "epoch": 1.2552621507845387, "grad_norm": 0.5958371162414551, "learning_rate": 1.7546490407407694e-05, "loss": 0.6582, "step": 3280 }, { "epoch": 1.255644852659778, "grad_norm": 0.5272994041442871, "learning_rate": 1.7544863655391723e-05, "loss": 0.7076, "step": 3281 }, { "epoch": 1.2560275545350172, "grad_norm": 0.5486589074134827, "learning_rate": 1.754323643972361e-05, "loss": 0.7003, "step": 3282 }, { "epoch": 1.2564102564102564, "grad_norm": 0.5838080048561096, "learning_rate": 1.754160876050335e-05, "loss": 0.624, "step": 3283 }, { "epoch": 1.2567929582854955, "grad_norm": 0.5359195470809937, "learning_rate": 1.7539980617830975e-05, "loss": 0.7553, "step": 3284 }, { "epoch": 1.2571756601607347, "grad_norm": 0.5981696844100952, "learning_rate": 1.753835201180653e-05, "loss": 0.6925, "step": 3285 }, { "epoch": 1.257558362035974, "grad_norm": 0.5063390731811523, "learning_rate": 1.753672294253011e-05, "loss": 0.5987, "step": 3286 }, { "epoch": 1.2579410639112132, "grad_norm": 0.5712844133377075, "learning_rate": 1.753509341010181e-05, "loss": 0.6276, "step": 3287 }, { "epoch": 1.2583237657864523, "grad_norm": 0.5198637247085571, "learning_rate": 1.753346341462178e-05, "loss": 0.6511, "step": 3288 }, { "epoch": 1.2587064676616915, "grad_norm": 0.49926677346229553, "learning_rate": 1.7531832956190186e-05, "loss": 0.6786, "step": 3289 }, { "epoch": 1.2590891695369306, "grad_norm": 0.5602369904518127, "learning_rate": 1.7530202034907223e-05, "loss": 0.6518, "step": 3290 }, { "epoch": 1.25947187141217, "grad_norm": 0.5684513449668884, "learning_rate": 1.7528570650873115e-05, "loss": 0.6964, "step": 3291 }, { "epoch": 1.2598545732874091, "grad_norm": 0.7049539685249329, "learning_rate": 1.752693880418811e-05, "loss": 0.6977, "step": 3292 }, { "epoch": 1.2602372751626483, "grad_norm": 0.5736395120620728, "learning_rate": 1.7525306494952498e-05, "loss": 0.6526, "step": 3293 }, { "epoch": 1.2606199770378874, "grad_norm": 0.5819072723388672, "learning_rate": 1.752367372326659e-05, "loss": 0.6484, "step": 3294 }, { "epoch": 1.2610026789131266, "grad_norm": 0.5467433333396912, "learning_rate": 1.7522040489230715e-05, "loss": 0.6455, "step": 3295 }, { "epoch": 1.261385380788366, "grad_norm": 0.5243082642555237, "learning_rate": 1.752040679294524e-05, "loss": 0.6762, "step": 3296 }, { "epoch": 1.261768082663605, "grad_norm": 0.5181486010551453, "learning_rate": 1.751877263451057e-05, "loss": 0.6558, "step": 3297 }, { "epoch": 1.2621507845388442, "grad_norm": 0.5628703236579895, "learning_rate": 1.751713801402712e-05, "loss": 0.6557, "step": 3298 }, { "epoch": 1.2625334864140834, "grad_norm": 0.5928704142570496, "learning_rate": 1.7515502931595344e-05, "loss": 0.6355, "step": 3299 }, { "epoch": 1.2629161882893225, "grad_norm": 0.5286141037940979, "learning_rate": 1.7513867387315727e-05, "loss": 0.6351, "step": 3300 }, { "epoch": 1.2632988901645619, "grad_norm": 0.5243015885353088, "learning_rate": 1.7512231381288767e-05, "loss": 0.6173, "step": 3301 }, { "epoch": 1.263681592039801, "grad_norm": 0.5866624712944031, "learning_rate": 1.751059491361501e-05, "loss": 0.6679, "step": 3302 }, { "epoch": 1.2640642939150402, "grad_norm": 0.5217955708503723, "learning_rate": 1.750895798439502e-05, "loss": 0.6985, "step": 3303 }, { "epoch": 1.2644469957902793, "grad_norm": 0.5545191168785095, "learning_rate": 1.7507320593729383e-05, "loss": 0.7599, "step": 3304 }, { "epoch": 1.2648296976655184, "grad_norm": 0.5676888227462769, "learning_rate": 1.7505682741718734e-05, "loss": 0.655, "step": 3305 }, { "epoch": 1.2652123995407578, "grad_norm": 0.5002974271774292, "learning_rate": 1.7504044428463713e-05, "loss": 0.6492, "step": 3306 }, { "epoch": 1.265595101415997, "grad_norm": 0.5717249512672424, "learning_rate": 1.7502405654065002e-05, "loss": 0.7042, "step": 3307 }, { "epoch": 1.265977803291236, "grad_norm": 0.5565614700317383, "learning_rate": 1.750076641862331e-05, "loss": 0.6693, "step": 3308 }, { "epoch": 1.2663605051664752, "grad_norm": 0.5209079384803772, "learning_rate": 1.749912672223937e-05, "loss": 0.7744, "step": 3309 }, { "epoch": 1.2667432070417144, "grad_norm": 0.530799150466919, "learning_rate": 1.749748656501395e-05, "loss": 0.6685, "step": 3310 }, { "epoch": 1.2671259089169538, "grad_norm": 0.5010607838630676, "learning_rate": 1.7495845947047836e-05, "loss": 0.6784, "step": 3311 }, { "epoch": 1.267508610792193, "grad_norm": 0.5354277491569519, "learning_rate": 1.7494204868441848e-05, "loss": 0.6619, "step": 3312 }, { "epoch": 1.267891312667432, "grad_norm": 0.5248739123344421, "learning_rate": 1.7492563329296843e-05, "loss": 0.5897, "step": 3313 }, { "epoch": 1.2682740145426712, "grad_norm": 0.5133988857269287, "learning_rate": 1.749092132971369e-05, "loss": 0.6811, "step": 3314 }, { "epoch": 1.2686567164179103, "grad_norm": 0.47736895084381104, "learning_rate": 1.7489278869793297e-05, "loss": 0.6137, "step": 3315 }, { "epoch": 1.2690394182931497, "grad_norm": 0.508309543132782, "learning_rate": 1.74876359496366e-05, "loss": 0.6443, "step": 3316 }, { "epoch": 1.2694221201683888, "grad_norm": 0.5173004269599915, "learning_rate": 1.7485992569344556e-05, "loss": 0.6667, "step": 3317 }, { "epoch": 1.269804822043628, "grad_norm": 0.5434055328369141, "learning_rate": 1.7484348729018153e-05, "loss": 0.7227, "step": 3318 }, { "epoch": 1.2701875239188671, "grad_norm": 0.5463588833808899, "learning_rate": 1.748270442875842e-05, "loss": 0.6453, "step": 3319 }, { "epoch": 1.2705702257941063, "grad_norm": 0.5148978233337402, "learning_rate": 1.74810596686664e-05, "loss": 0.6235, "step": 3320 }, { "epoch": 1.2709529276693456, "grad_norm": 0.5346746444702148, "learning_rate": 1.747941444884316e-05, "loss": 0.642, "step": 3321 }, { "epoch": 1.2713356295445848, "grad_norm": 0.5379209518432617, "learning_rate": 1.747776876938981e-05, "loss": 0.6136, "step": 3322 }, { "epoch": 1.271718331419824, "grad_norm": 0.5466813445091248, "learning_rate": 1.747612263040748e-05, "loss": 0.6587, "step": 3323 }, { "epoch": 1.272101033295063, "grad_norm": 0.5409952402114868, "learning_rate": 1.7474476031997335e-05, "loss": 0.649, "step": 3324 }, { "epoch": 1.2724837351703022, "grad_norm": 0.5828019976615906, "learning_rate": 1.747282897426055e-05, "loss": 0.7522, "step": 3325 }, { "epoch": 1.2728664370455416, "grad_norm": 0.5381157398223877, "learning_rate": 1.7471181457298354e-05, "loss": 0.6366, "step": 3326 }, { "epoch": 1.2732491389207807, "grad_norm": 0.5637904405593872, "learning_rate": 1.7469533481211984e-05, "loss": 0.6975, "step": 3327 }, { "epoch": 1.2736318407960199, "grad_norm": 0.48953431844711304, "learning_rate": 1.7467885046102717e-05, "loss": 0.677, "step": 3328 }, { "epoch": 1.274014542671259, "grad_norm": 0.5606407523155212, "learning_rate": 1.746623615207185e-05, "loss": 0.631, "step": 3329 }, { "epoch": 1.2743972445464982, "grad_norm": 0.5292462706565857, "learning_rate": 1.7464586799220713e-05, "loss": 0.6808, "step": 3330 }, { "epoch": 1.2747799464217375, "grad_norm": 0.5007873773574829, "learning_rate": 1.7462936987650666e-05, "loss": 0.681, "step": 3331 }, { "epoch": 1.2751626482969767, "grad_norm": 0.5599896907806396, "learning_rate": 1.746128671746309e-05, "loss": 0.7212, "step": 3332 }, { "epoch": 1.2755453501722158, "grad_norm": 0.5427311062812805, "learning_rate": 1.7459635988759398e-05, "loss": 0.6998, "step": 3333 }, { "epoch": 1.275928052047455, "grad_norm": 0.5365706086158752, "learning_rate": 1.745798480164104e-05, "loss": 0.6663, "step": 3334 }, { "epoch": 1.276310753922694, "grad_norm": 0.5720147490501404, "learning_rate": 1.7456333156209475e-05, "loss": 0.6748, "step": 3335 }, { "epoch": 1.2766934557979335, "grad_norm": 0.5188844203948975, "learning_rate": 1.7454681052566206e-05, "loss": 0.6521, "step": 3336 }, { "epoch": 1.2770761576731726, "grad_norm": 0.554415225982666, "learning_rate": 1.7453028490812764e-05, "loss": 0.6672, "step": 3337 }, { "epoch": 1.2774588595484118, "grad_norm": 0.5661869049072266, "learning_rate": 1.7451375471050694e-05, "loss": 0.6001, "step": 3338 }, { "epoch": 1.277841561423651, "grad_norm": 0.5033111572265625, "learning_rate": 1.7449721993381586e-05, "loss": 0.6136, "step": 3339 }, { "epoch": 1.27822426329889, "grad_norm": 0.5822563767433167, "learning_rate": 1.7448068057907047e-05, "loss": 0.8018, "step": 3340 }, { "epoch": 1.2786069651741294, "grad_norm": 0.5462537407875061, "learning_rate": 1.7446413664728718e-05, "loss": 0.6537, "step": 3341 }, { "epoch": 1.2789896670493686, "grad_norm": 0.54083651304245, "learning_rate": 1.7444758813948265e-05, "loss": 0.6277, "step": 3342 }, { "epoch": 1.2793723689246077, "grad_norm": 0.5243209600448608, "learning_rate": 1.744310350566738e-05, "loss": 0.677, "step": 3343 }, { "epoch": 1.2797550707998468, "grad_norm": 0.5546015501022339, "learning_rate": 1.744144773998779e-05, "loss": 0.6578, "step": 3344 }, { "epoch": 1.280137772675086, "grad_norm": 0.5646459460258484, "learning_rate": 1.7439791517011246e-05, "loss": 0.612, "step": 3345 }, { "epoch": 1.2805204745503254, "grad_norm": 0.5356569290161133, "learning_rate": 1.7438134836839524e-05, "loss": 0.6529, "step": 3346 }, { "epoch": 1.2809031764255645, "grad_norm": 0.5573431849479675, "learning_rate": 1.7436477699574437e-05, "loss": 0.6701, "step": 3347 }, { "epoch": 1.2812858783008036, "grad_norm": 0.5443512797355652, "learning_rate": 1.7434820105317816e-05, "loss": 0.7099, "step": 3348 }, { "epoch": 1.2816685801760428, "grad_norm": 0.5148206949234009, "learning_rate": 1.7433162054171525e-05, "loss": 0.6649, "step": 3349 }, { "epoch": 1.282051282051282, "grad_norm": 0.5630031824111938, "learning_rate": 1.7431503546237455e-05, "loss": 0.7503, "step": 3350 }, { "epoch": 1.2824339839265213, "grad_norm": 0.5767777562141418, "learning_rate": 1.7429844581617532e-05, "loss": 0.5722, "step": 3351 }, { "epoch": 1.2828166858017604, "grad_norm": 0.5349550247192383, "learning_rate": 1.7428185160413694e-05, "loss": 0.6134, "step": 3352 }, { "epoch": 1.2831993876769996, "grad_norm": 0.5363196134567261, "learning_rate": 1.7426525282727927e-05, "loss": 0.7305, "step": 3353 }, { "epoch": 1.2835820895522387, "grad_norm": 0.5956946611404419, "learning_rate": 1.7424864948662228e-05, "loss": 0.632, "step": 3354 }, { "epoch": 1.2839647914274779, "grad_norm": 0.5593138337135315, "learning_rate": 1.742320415831863e-05, "loss": 0.704, "step": 3355 }, { "epoch": 1.2843474933027172, "grad_norm": 0.5613971948623657, "learning_rate": 1.7421542911799198e-05, "loss": 0.6756, "step": 3356 }, { "epoch": 1.2847301951779564, "grad_norm": 0.5994872450828552, "learning_rate": 1.7419881209206012e-05, "loss": 0.705, "step": 3357 }, { "epoch": 1.2851128970531955, "grad_norm": 0.5094663500785828, "learning_rate": 1.7418219050641197e-05, "loss": 0.6827, "step": 3358 }, { "epoch": 1.2854955989284347, "grad_norm": 0.5625669360160828, "learning_rate": 1.7416556436206885e-05, "loss": 0.613, "step": 3359 }, { "epoch": 1.2858783008036738, "grad_norm": 0.5361120104789734, "learning_rate": 1.7414893366005263e-05, "loss": 0.7425, "step": 3360 }, { "epoch": 1.2862610026789132, "grad_norm": 0.4875270426273346, "learning_rate": 1.741322984013852e-05, "loss": 0.5413, "step": 3361 }, { "epoch": 1.2866437045541523, "grad_norm": 0.5850022435188293, "learning_rate": 1.7411565858708885e-05, "loss": 0.6306, "step": 3362 }, { "epoch": 1.2870264064293915, "grad_norm": 0.6224614977836609, "learning_rate": 1.7409901421818623e-05, "loss": 0.7037, "step": 3363 }, { "epoch": 1.2874091083046306, "grad_norm": 0.5022472739219666, "learning_rate": 1.740823652957001e-05, "loss": 0.6355, "step": 3364 }, { "epoch": 1.2877918101798698, "grad_norm": 0.520267903804779, "learning_rate": 1.7406571182065356e-05, "loss": 0.6303, "step": 3365 }, { "epoch": 1.2881745120551091, "grad_norm": 0.5195624828338623, "learning_rate": 1.740490537940701e-05, "loss": 0.6435, "step": 3366 }, { "epoch": 1.2885572139303483, "grad_norm": 0.49893733859062195, "learning_rate": 1.7403239121697333e-05, "loss": 0.6786, "step": 3367 }, { "epoch": 1.2889399158055874, "grad_norm": 0.5676876306533813, "learning_rate": 1.7401572409038725e-05, "loss": 0.7575, "step": 3368 }, { "epoch": 1.2893226176808266, "grad_norm": 0.5398663878440857, "learning_rate": 1.7399905241533606e-05, "loss": 0.6981, "step": 3369 }, { "epoch": 1.2897053195560657, "grad_norm": 0.5529617071151733, "learning_rate": 1.739823761928443e-05, "loss": 0.7432, "step": 3370 }, { "epoch": 1.290088021431305, "grad_norm": 0.5454068183898926, "learning_rate": 1.739656954239368e-05, "loss": 0.6866, "step": 3371 }, { "epoch": 1.2904707233065442, "grad_norm": 0.5823728442192078, "learning_rate": 1.7394901010963857e-05, "loss": 0.6473, "step": 3372 }, { "epoch": 1.2908534251817834, "grad_norm": 0.5664229989051819, "learning_rate": 1.73932320250975e-05, "loss": 0.6644, "step": 3373 }, { "epoch": 1.2912361270570225, "grad_norm": 0.4843754172325134, "learning_rate": 1.7391562584897176e-05, "loss": 0.7284, "step": 3374 }, { "epoch": 1.2916188289322617, "grad_norm": 0.5639331936836243, "learning_rate": 1.7389892690465476e-05, "loss": 0.7345, "step": 3375 }, { "epoch": 1.292001530807501, "grad_norm": 0.5585648417472839, "learning_rate": 1.7388222341905016e-05, "loss": 0.6264, "step": 3376 }, { "epoch": 1.2923842326827402, "grad_norm": 0.5589930415153503, "learning_rate": 1.738655153931844e-05, "loss": 0.661, "step": 3377 }, { "epoch": 1.2927669345579793, "grad_norm": 0.5691714882850647, "learning_rate": 1.7384880282808433e-05, "loss": 0.6922, "step": 3378 }, { "epoch": 1.2931496364332185, "grad_norm": 0.5813454389572144, "learning_rate": 1.7383208572477692e-05, "loss": 0.6824, "step": 3379 }, { "epoch": 1.2935323383084576, "grad_norm": 0.5487279295921326, "learning_rate": 1.7381536408428948e-05, "loss": 0.6428, "step": 3380 }, { "epoch": 1.293915040183697, "grad_norm": 0.5020553469657898, "learning_rate": 1.7379863790764963e-05, "loss": 0.6087, "step": 3381 }, { "epoch": 1.294297742058936, "grad_norm": 0.5290857553482056, "learning_rate": 1.737819071958852e-05, "loss": 0.6234, "step": 3382 }, { "epoch": 1.2946804439341753, "grad_norm": 0.5094029903411865, "learning_rate": 1.737651719500244e-05, "loss": 0.7477, "step": 3383 }, { "epoch": 1.2950631458094144, "grad_norm": 0.5106810331344604, "learning_rate": 1.7374843217109557e-05, "loss": 0.6711, "step": 3384 }, { "epoch": 1.2954458476846535, "grad_norm": 0.5434792637825012, "learning_rate": 1.737316878601275e-05, "loss": 0.6677, "step": 3385 }, { "epoch": 1.295828549559893, "grad_norm": 0.5140631198883057, "learning_rate": 1.737149390181491e-05, "loss": 0.6489, "step": 3386 }, { "epoch": 1.296211251435132, "grad_norm": 0.47987139225006104, "learning_rate": 1.7369818564618966e-05, "loss": 0.6467, "step": 3387 }, { "epoch": 1.2965939533103712, "grad_norm": 0.5367193818092346, "learning_rate": 1.7368142774527874e-05, "loss": 0.6793, "step": 3388 }, { "epoch": 1.2969766551856103, "grad_norm": 0.5137706995010376, "learning_rate": 1.736646653164461e-05, "loss": 0.6507, "step": 3389 }, { "epoch": 1.2973593570608495, "grad_norm": 0.5467406511306763, "learning_rate": 1.7364789836072192e-05, "loss": 0.6924, "step": 3390 }, { "epoch": 1.2977420589360888, "grad_norm": 0.5462122559547424, "learning_rate": 1.7363112687913654e-05, "loss": 0.5947, "step": 3391 }, { "epoch": 1.298124760811328, "grad_norm": 0.5296015739440918, "learning_rate": 1.736143508727206e-05, "loss": 0.728, "step": 3392 }, { "epoch": 1.2985074626865671, "grad_norm": 0.5249717831611633, "learning_rate": 1.73597570342505e-05, "loss": 0.6127, "step": 3393 }, { "epoch": 1.2988901645618063, "grad_norm": 0.5309066772460938, "learning_rate": 1.73580785289521e-05, "loss": 0.5744, "step": 3394 }, { "epoch": 1.2992728664370454, "grad_norm": 0.5210626721382141, "learning_rate": 1.7356399571480007e-05, "loss": 0.6971, "step": 3395 }, { "epoch": 1.2996555683122848, "grad_norm": 0.5103476643562317, "learning_rate": 1.73547201619374e-05, "loss": 0.5929, "step": 3396 }, { "epoch": 1.300038270187524, "grad_norm": 0.5410709977149963, "learning_rate": 1.7353040300427476e-05, "loss": 0.6121, "step": 3397 }, { "epoch": 1.300420972062763, "grad_norm": 0.5259401798248291, "learning_rate": 1.7351359987053476e-05, "loss": 0.6868, "step": 3398 }, { "epoch": 1.3008036739380022, "grad_norm": 0.5422089695930481, "learning_rate": 1.7349679221918655e-05, "loss": 0.6378, "step": 3399 }, { "epoch": 1.3011863758132414, "grad_norm": 0.5150445103645325, "learning_rate": 1.7347998005126296e-05, "loss": 0.5978, "step": 3400 }, { "epoch": 1.3015690776884807, "grad_norm": 0.5310264229774475, "learning_rate": 1.7346316336779725e-05, "loss": 0.6896, "step": 3401 }, { "epoch": 1.3019517795637199, "grad_norm": 0.5197222828865051, "learning_rate": 1.7344634216982275e-05, "loss": 0.6865, "step": 3402 }, { "epoch": 1.302334481438959, "grad_norm": 0.5535228848457336, "learning_rate": 1.7342951645837326e-05, "loss": 0.7288, "step": 3403 }, { "epoch": 1.3027171833141982, "grad_norm": 0.5112469792366028, "learning_rate": 1.7341268623448266e-05, "loss": 0.6079, "step": 3404 }, { "epoch": 1.3030998851894373, "grad_norm": 0.5431948304176331, "learning_rate": 1.733958514991853e-05, "loss": 0.6834, "step": 3405 }, { "epoch": 1.3034825870646767, "grad_norm": 0.5318964719772339, "learning_rate": 1.733790122535157e-05, "loss": 0.7092, "step": 3406 }, { "epoch": 1.3038652889399158, "grad_norm": 0.5587689876556396, "learning_rate": 1.7336216849850866e-05, "loss": 0.6695, "step": 3407 }, { "epoch": 1.304247990815155, "grad_norm": 0.5273523330688477, "learning_rate": 1.7334532023519925e-05, "loss": 0.6525, "step": 3408 }, { "epoch": 1.3046306926903941, "grad_norm": 0.5355141162872314, "learning_rate": 1.733284674646229e-05, "loss": 0.6582, "step": 3409 }, { "epoch": 1.3050133945656333, "grad_norm": 0.5430492162704468, "learning_rate": 1.733116101878152e-05, "loss": 0.6785, "step": 3410 }, { "epoch": 1.3053960964408726, "grad_norm": 0.5307841300964355, "learning_rate": 1.7329474840581212e-05, "loss": 0.6264, "step": 3411 }, { "epoch": 1.3057787983161118, "grad_norm": 0.5366126894950867, "learning_rate": 1.7327788211964985e-05, "loss": 0.7115, "step": 3412 }, { "epoch": 1.306161500191351, "grad_norm": 0.5325263142585754, "learning_rate": 1.7326101133036485e-05, "loss": 0.665, "step": 3413 }, { "epoch": 1.30654420206659, "grad_norm": 0.4968659579753876, "learning_rate": 1.732441360389939e-05, "loss": 0.6431, "step": 3414 }, { "epoch": 1.3069269039418292, "grad_norm": 0.5357723236083984, "learning_rate": 1.7322725624657404e-05, "loss": 0.6179, "step": 3415 }, { "epoch": 1.3073096058170686, "grad_norm": 0.5142333507537842, "learning_rate": 1.7321037195414252e-05, "loss": 0.5842, "step": 3416 }, { "epoch": 1.3076923076923077, "grad_norm": 0.5332732796669006, "learning_rate": 1.73193483162737e-05, "loss": 0.7046, "step": 3417 }, { "epoch": 1.3080750095675469, "grad_norm": 0.5285913944244385, "learning_rate": 1.7317658987339524e-05, "loss": 0.6076, "step": 3418 }, { "epoch": 1.308457711442786, "grad_norm": 0.5137394070625305, "learning_rate": 1.7315969208715552e-05, "loss": 0.7031, "step": 3419 }, { "epoch": 1.3088404133180251, "grad_norm": 0.5034336447715759, "learning_rate": 1.7314278980505615e-05, "loss": 0.7167, "step": 3420 }, { "epoch": 1.3092231151932645, "grad_norm": 0.6226645708084106, "learning_rate": 1.7312588302813585e-05, "loss": 0.6937, "step": 3421 }, { "epoch": 1.3096058170685037, "grad_norm": 0.5680915713310242, "learning_rate": 1.731089717574336e-05, "loss": 0.6878, "step": 3422 }, { "epoch": 1.3099885189437428, "grad_norm": 0.5887991189956665, "learning_rate": 1.730920559939886e-05, "loss": 0.5687, "step": 3423 }, { "epoch": 1.310371220818982, "grad_norm": 0.5907261967658997, "learning_rate": 1.730751357388404e-05, "loss": 0.5614, "step": 3424 }, { "epoch": 1.310753922694221, "grad_norm": 0.576385498046875, "learning_rate": 1.730582109930288e-05, "loss": 0.6326, "step": 3425 }, { "epoch": 1.3111366245694605, "grad_norm": 0.5445957183837891, "learning_rate": 1.730412817575939e-05, "loss": 0.612, "step": 3426 }, { "epoch": 1.3115193264446996, "grad_norm": 0.5532810091972351, "learning_rate": 1.7302434803357596e-05, "loss": 0.6749, "step": 3427 }, { "epoch": 1.3119020283199387, "grad_norm": 0.5431328415870667, "learning_rate": 1.730074098220157e-05, "loss": 0.6661, "step": 3428 }, { "epoch": 1.3122847301951779, "grad_norm": 0.539672315120697, "learning_rate": 1.7299046712395394e-05, "loss": 0.7138, "step": 3429 }, { "epoch": 1.312667432070417, "grad_norm": 0.6310951113700867, "learning_rate": 1.7297351994043187e-05, "loss": 0.5851, "step": 3430 }, { "epoch": 1.3130501339456564, "grad_norm": 0.587348222732544, "learning_rate": 1.72956568272491e-05, "loss": 0.624, "step": 3431 }, { "epoch": 1.3134328358208955, "grad_norm": 0.629986584186554, "learning_rate": 1.72939612121173e-05, "loss": 0.7017, "step": 3432 }, { "epoch": 1.3138155376961347, "grad_norm": 0.5815560221672058, "learning_rate": 1.7292265148751987e-05, "loss": 0.7528, "step": 3433 }, { "epoch": 1.3141982395713738, "grad_norm": 0.550757646560669, "learning_rate": 1.7290568637257394e-05, "loss": 0.6604, "step": 3434 }, { "epoch": 1.314580941446613, "grad_norm": 0.5850407481193542, "learning_rate": 1.7288871677737767e-05, "loss": 0.6333, "step": 3435 }, { "epoch": 1.3149636433218523, "grad_norm": 0.5216822624206543, "learning_rate": 1.7287174270297397e-05, "loss": 0.7074, "step": 3436 }, { "epoch": 1.3153463451970915, "grad_norm": 0.5566686391830444, "learning_rate": 1.7285476415040588e-05, "loss": 0.6974, "step": 3437 }, { "epoch": 1.3157290470723306, "grad_norm": 0.5957866907119751, "learning_rate": 1.7283778112071683e-05, "loss": 0.6604, "step": 3438 }, { "epoch": 1.3161117489475698, "grad_norm": 0.5745619535446167, "learning_rate": 1.7282079361495045e-05, "loss": 0.5858, "step": 3439 }, { "epoch": 1.316494450822809, "grad_norm": 0.5440539717674255, "learning_rate": 1.7280380163415068e-05, "loss": 0.7399, "step": 3440 }, { "epoch": 1.3168771526980483, "grad_norm": 0.5450443625450134, "learning_rate": 1.727868051793617e-05, "loss": 0.6781, "step": 3441 }, { "epoch": 1.3172598545732874, "grad_norm": 0.5401238203048706, "learning_rate": 1.72769804251628e-05, "loss": 0.6958, "step": 3442 }, { "epoch": 1.3176425564485266, "grad_norm": 0.5985117554664612, "learning_rate": 1.7275279885199435e-05, "loss": 0.6719, "step": 3443 }, { "epoch": 1.3180252583237657, "grad_norm": 0.5467671155929565, "learning_rate": 1.727357889815058e-05, "loss": 0.662, "step": 3444 }, { "epoch": 1.3184079601990049, "grad_norm": 0.5493303537368774, "learning_rate": 1.7271877464120753e-05, "loss": 0.8082, "step": 3445 }, { "epoch": 1.3187906620742442, "grad_norm": 0.5583283305168152, "learning_rate": 1.7270175583214525e-05, "loss": 0.6561, "step": 3446 }, { "epoch": 1.3191733639494834, "grad_norm": 0.5763624310493469, "learning_rate": 1.7268473255536473e-05, "loss": 0.7144, "step": 3447 }, { "epoch": 1.3195560658247225, "grad_norm": 0.5218399167060852, "learning_rate": 1.726677048119121e-05, "loss": 0.6373, "step": 3448 }, { "epoch": 1.3199387676999617, "grad_norm": 0.5199733376502991, "learning_rate": 1.7265067260283387e-05, "loss": 0.6564, "step": 3449 }, { "epoch": 1.3203214695752008, "grad_norm": 0.5624281764030457, "learning_rate": 1.7263363592917663e-05, "loss": 0.6314, "step": 3450 }, { "epoch": 1.3207041714504402, "grad_norm": 0.5758853554725647, "learning_rate": 1.7261659479198728e-05, "loss": 0.6939, "step": 3451 }, { "epoch": 1.3210868733256793, "grad_norm": 0.5981304049491882, "learning_rate": 1.725995491923131e-05, "loss": 0.6151, "step": 3452 }, { "epoch": 1.3214695752009185, "grad_norm": 0.4916823208332062, "learning_rate": 1.725824991312016e-05, "loss": 0.6607, "step": 3453 }, { "epoch": 1.3218522770761576, "grad_norm": 0.5593554973602295, "learning_rate": 1.725654446097005e-05, "loss": 0.6506, "step": 3454 }, { "epoch": 1.3222349789513967, "grad_norm": 0.5384451746940613, "learning_rate": 1.7254838562885794e-05, "loss": 0.6597, "step": 3455 }, { "epoch": 1.322617680826636, "grad_norm": 0.6606580018997192, "learning_rate": 1.7253132218972217e-05, "loss": 0.6952, "step": 3456 }, { "epoch": 1.3230003827018753, "grad_norm": 0.5752317905426025, "learning_rate": 1.7251425429334178e-05, "loss": 0.7098, "step": 3457 }, { "epoch": 1.3233830845771144, "grad_norm": 0.5158082842826843, "learning_rate": 1.7249718194076564e-05, "loss": 0.6813, "step": 3458 }, { "epoch": 1.3237657864523535, "grad_norm": 0.5519260764122009, "learning_rate": 1.7248010513304296e-05, "loss": 0.6952, "step": 3459 }, { "epoch": 1.3241484883275927, "grad_norm": 0.5978283882141113, "learning_rate": 1.7246302387122307e-05, "loss": 0.7624, "step": 3460 }, { "epoch": 1.324531190202832, "grad_norm": 0.5257342457771301, "learning_rate": 1.724459381563557e-05, "loss": 0.6473, "step": 3461 }, { "epoch": 1.3249138920780712, "grad_norm": 0.507608950138092, "learning_rate": 1.724288479894908e-05, "loss": 0.7377, "step": 3462 }, { "epoch": 1.3252965939533103, "grad_norm": 0.5201870203018188, "learning_rate": 1.724117533716786e-05, "loss": 0.6468, "step": 3463 }, { "epoch": 1.3256792958285495, "grad_norm": 0.5054258704185486, "learning_rate": 1.7239465430396964e-05, "loss": 0.6689, "step": 3464 }, { "epoch": 1.3260619977037886, "grad_norm": 0.5721175074577332, "learning_rate": 1.7237755078741467e-05, "loss": 0.6564, "step": 3465 }, { "epoch": 1.326444699579028, "grad_norm": 0.5829220414161682, "learning_rate": 1.7236044282306475e-05, "loss": 0.6887, "step": 3466 }, { "epoch": 1.3268274014542671, "grad_norm": 0.5654780268669128, "learning_rate": 1.7234333041197127e-05, "loss": 0.6696, "step": 3467 }, { "epoch": 1.3272101033295063, "grad_norm": 0.5133846998214722, "learning_rate": 1.7232621355518575e-05, "loss": 0.6798, "step": 3468 }, { "epoch": 1.3275928052047454, "grad_norm": 0.5582302808761597, "learning_rate": 1.7230909225376012e-05, "loss": 0.6924, "step": 3469 }, { "epoch": 1.3279755070799846, "grad_norm": 0.569391667842865, "learning_rate": 1.722919665087465e-05, "loss": 0.6691, "step": 3470 }, { "epoch": 1.328358208955224, "grad_norm": 0.5360039472579956, "learning_rate": 1.7227483632119734e-05, "loss": 0.7446, "step": 3471 }, { "epoch": 1.328740910830463, "grad_norm": 0.5386156439781189, "learning_rate": 1.722577016921653e-05, "loss": 0.692, "step": 3472 }, { "epoch": 1.3291236127057022, "grad_norm": 0.6301005482673645, "learning_rate": 1.7224056262270338e-05, "loss": 0.6361, "step": 3473 }, { "epoch": 1.3295063145809414, "grad_norm": 0.49686160683631897, "learning_rate": 1.7222341911386484e-05, "loss": 0.6179, "step": 3474 }, { "epoch": 1.3298890164561805, "grad_norm": 0.5496760010719299, "learning_rate": 1.7220627116670314e-05, "loss": 0.677, "step": 3475 }, { "epoch": 1.3302717183314199, "grad_norm": 0.4888838529586792, "learning_rate": 1.7218911878227208e-05, "loss": 0.6505, "step": 3476 }, { "epoch": 1.330654420206659, "grad_norm": 0.5295485258102417, "learning_rate": 1.721719619616258e-05, "loss": 0.6366, "step": 3477 }, { "epoch": 1.3310371220818982, "grad_norm": 0.5256691575050354, "learning_rate": 1.7215480070581853e-05, "loss": 0.5628, "step": 3478 }, { "epoch": 1.3314198239571373, "grad_norm": 0.5940430164337158, "learning_rate": 1.721376350159049e-05, "loss": 0.6207, "step": 3479 }, { "epoch": 1.3318025258323765, "grad_norm": 0.5817030072212219, "learning_rate": 1.721204648929398e-05, "loss": 0.6221, "step": 3480 }, { "epoch": 1.3321852277076158, "grad_norm": 0.5363978147506714, "learning_rate": 1.721032903379784e-05, "loss": 0.6086, "step": 3481 }, { "epoch": 1.332567929582855, "grad_norm": 0.5912573933601379, "learning_rate": 1.7208611135207607e-05, "loss": 0.6703, "step": 3482 }, { "epoch": 1.3329506314580941, "grad_norm": 0.5375508069992065, "learning_rate": 1.720689279362886e-05, "loss": 0.6271, "step": 3483 }, { "epoch": 1.3333333333333333, "grad_norm": 0.5622987747192383, "learning_rate": 1.7205174009167185e-05, "loss": 0.6457, "step": 3484 }, { "epoch": 1.3337160352085724, "grad_norm": 0.5374427437782288, "learning_rate": 1.720345478192821e-05, "loss": 0.6593, "step": 3485 }, { "epoch": 1.3340987370838118, "grad_norm": 0.4838193356990814, "learning_rate": 1.7201735112017587e-05, "loss": 0.6312, "step": 3486 }, { "epoch": 1.334481438959051, "grad_norm": 0.4776071012020111, "learning_rate": 1.7200014999541e-05, "loss": 0.6207, "step": 3487 }, { "epoch": 1.33486414083429, "grad_norm": 0.5911425948143005, "learning_rate": 1.7198294444604143e-05, "loss": 0.6428, "step": 3488 }, { "epoch": 1.3352468427095292, "grad_norm": 0.5977736711502075, "learning_rate": 1.7196573447312753e-05, "loss": 0.6998, "step": 3489 }, { "epoch": 1.3356295445847683, "grad_norm": 0.5509235262870789, "learning_rate": 1.719485200777259e-05, "loss": 0.6428, "step": 3490 }, { "epoch": 1.3360122464600077, "grad_norm": 0.49853402376174927, "learning_rate": 1.719313012608945e-05, "loss": 0.6576, "step": 3491 }, { "epoch": 1.3363949483352469, "grad_norm": 0.532072126865387, "learning_rate": 1.7191407802369132e-05, "loss": 0.6597, "step": 3492 }, { "epoch": 1.336777650210486, "grad_norm": 0.546273410320282, "learning_rate": 1.718968503671749e-05, "loss": 0.6677, "step": 3493 }, { "epoch": 1.3371603520857251, "grad_norm": 0.5796959400177002, "learning_rate": 1.7187961829240385e-05, "loss": 0.6107, "step": 3494 }, { "epoch": 1.3375430539609643, "grad_norm": 0.6254673600196838, "learning_rate": 1.7186238180043716e-05, "loss": 0.6478, "step": 3495 }, { "epoch": 1.3379257558362037, "grad_norm": 0.5599147081375122, "learning_rate": 1.7184514089233403e-05, "loss": 0.6503, "step": 3496 }, { "epoch": 1.3383084577114428, "grad_norm": 0.5052818059921265, "learning_rate": 1.71827895569154e-05, "loss": 0.5993, "step": 3497 }, { "epoch": 1.338691159586682, "grad_norm": 0.5956594944000244, "learning_rate": 1.718106458319568e-05, "loss": 0.6738, "step": 3498 }, { "epoch": 1.339073861461921, "grad_norm": 0.5221593976020813, "learning_rate": 1.7179339168180255e-05, "loss": 0.7175, "step": 3499 }, { "epoch": 1.3394565633371602, "grad_norm": 0.5548490881919861, "learning_rate": 1.7177613311975145e-05, "loss": 0.6325, "step": 3500 }, { "epoch": 1.3398392652123996, "grad_norm": 0.516389012336731, "learning_rate": 1.7175887014686417e-05, "loss": 0.6867, "step": 3501 }, { "epoch": 1.3402219670876387, "grad_norm": 0.6002725958824158, "learning_rate": 1.7174160276420156e-05, "loss": 0.7491, "step": 3502 }, { "epoch": 1.3406046689628779, "grad_norm": 0.5096220374107361, "learning_rate": 1.7172433097282468e-05, "loss": 0.6818, "step": 3503 }, { "epoch": 1.340987370838117, "grad_norm": 0.5640214085578918, "learning_rate": 1.71707054773795e-05, "loss": 0.6601, "step": 3504 }, { "epoch": 1.3413700727133562, "grad_norm": 0.5866783857345581, "learning_rate": 1.7168977416817417e-05, "loss": 0.6001, "step": 3505 }, { "epoch": 1.3417527745885955, "grad_norm": 0.5069500207901001, "learning_rate": 1.716724891570241e-05, "loss": 0.7541, "step": 3506 }, { "epoch": 1.3421354764638347, "grad_norm": 0.5539551973342896, "learning_rate": 1.7165519974140706e-05, "loss": 0.6279, "step": 3507 }, { "epoch": 1.3425181783390738, "grad_norm": 0.6942571401596069, "learning_rate": 1.7163790592238548e-05, "loss": 0.6629, "step": 3508 }, { "epoch": 1.342900880214313, "grad_norm": 0.5817781090736389, "learning_rate": 1.716206077010221e-05, "loss": 0.6837, "step": 3509 }, { "epoch": 1.3432835820895521, "grad_norm": 0.5386359691619873, "learning_rate": 1.7160330507837998e-05, "loss": 0.6502, "step": 3510 }, { "epoch": 1.3436662839647915, "grad_norm": 0.5914651155471802, "learning_rate": 1.7158599805552242e-05, "loss": 0.6639, "step": 3511 }, { "epoch": 1.3440489858400306, "grad_norm": 0.5535217523574829, "learning_rate": 1.7156868663351295e-05, "loss": 0.7161, "step": 3512 }, { "epoch": 1.3444316877152698, "grad_norm": 0.503495454788208, "learning_rate": 1.7155137081341545e-05, "loss": 0.6878, "step": 3513 }, { "epoch": 1.344814389590509, "grad_norm": 0.5473951697349548, "learning_rate": 1.7153405059629396e-05, "loss": 0.6766, "step": 3514 }, { "epoch": 1.345197091465748, "grad_norm": 0.5318741202354431, "learning_rate": 1.7151672598321286e-05, "loss": 0.6821, "step": 3515 }, { "epoch": 1.3455797933409874, "grad_norm": 0.5247941017150879, "learning_rate": 1.7149939697523687e-05, "loss": 0.6527, "step": 3516 }, { "epoch": 1.3459624952162266, "grad_norm": 0.4991472065448761, "learning_rate": 1.7148206357343084e-05, "loss": 0.6843, "step": 3517 }, { "epoch": 1.3463451970914657, "grad_norm": 0.5197606086730957, "learning_rate": 1.7146472577885995e-05, "loss": 0.6697, "step": 3518 }, { "epoch": 1.3467278989667049, "grad_norm": 0.5342605710029602, "learning_rate": 1.714473835925897e-05, "loss": 0.7045, "step": 3519 }, { "epoch": 1.347110600841944, "grad_norm": 0.5159749984741211, "learning_rate": 1.7143003701568574e-05, "loss": 0.6049, "step": 3520 }, { "epoch": 1.3474933027171834, "grad_norm": 0.5267810225486755, "learning_rate": 1.7141268604921414e-05, "loss": 0.6729, "step": 3521 }, { "epoch": 1.3478760045924225, "grad_norm": 0.5752356648445129, "learning_rate": 1.7139533069424116e-05, "loss": 0.6688, "step": 3522 }, { "epoch": 1.3482587064676617, "grad_norm": 0.554821789264679, "learning_rate": 1.7137797095183327e-05, "loss": 0.6143, "step": 3523 }, { "epoch": 1.3486414083429008, "grad_norm": 0.5086885690689087, "learning_rate": 1.713606068230573e-05, "loss": 0.7722, "step": 3524 }, { "epoch": 1.34902411021814, "grad_norm": 0.6088413000106812, "learning_rate": 1.7134323830898036e-05, "loss": 0.699, "step": 3525 }, { "epoch": 1.3494068120933793, "grad_norm": 0.539847731590271, "learning_rate": 1.7132586541066975e-05, "loss": 0.6736, "step": 3526 }, { "epoch": 1.3497895139686185, "grad_norm": 0.5391380786895752, "learning_rate": 1.713084881291931e-05, "loss": 0.6889, "step": 3527 }, { "epoch": 1.3501722158438576, "grad_norm": 0.5493136048316956, "learning_rate": 1.7129110646561827e-05, "loss": 0.563, "step": 3528 }, { "epoch": 1.3505549177190967, "grad_norm": 0.5450697541236877, "learning_rate": 1.7127372042101344e-05, "loss": 0.6652, "step": 3529 }, { "epoch": 1.350937619594336, "grad_norm": 0.5615074634552002, "learning_rate": 1.7125632999644698e-05, "loss": 0.6696, "step": 3530 }, { "epoch": 1.3513203214695753, "grad_norm": 0.5223785042762756, "learning_rate": 1.7123893519298764e-05, "loss": 0.6213, "step": 3531 }, { "epoch": 1.3517030233448144, "grad_norm": 0.5429962873458862, "learning_rate": 1.7122153601170438e-05, "loss": 0.6413, "step": 3532 }, { "epoch": 1.3520857252200535, "grad_norm": 0.5405092239379883, "learning_rate": 1.7120413245366636e-05, "loss": 0.6829, "step": 3533 }, { "epoch": 1.3524684270952927, "grad_norm": 0.5808344483375549, "learning_rate": 1.711867245199431e-05, "loss": 0.7018, "step": 3534 }, { "epoch": 1.3528511289705318, "grad_norm": 0.5708465576171875, "learning_rate": 1.7116931221160438e-05, "loss": 0.6637, "step": 3535 }, { "epoch": 1.3532338308457712, "grad_norm": 0.5679522156715393, "learning_rate": 1.7115189552972027e-05, "loss": 0.678, "step": 3536 }, { "epoch": 1.3536165327210103, "grad_norm": 0.5599812865257263, "learning_rate": 1.7113447447536102e-05, "loss": 0.6746, "step": 3537 }, { "epoch": 1.3539992345962495, "grad_norm": 0.5207595229148865, "learning_rate": 1.7111704904959716e-05, "loss": 0.6091, "step": 3538 }, { "epoch": 1.3543819364714886, "grad_norm": 0.5717136263847351, "learning_rate": 1.7109961925349963e-05, "loss": 0.7346, "step": 3539 }, { "epoch": 1.3547646383467278, "grad_norm": 0.6045215725898743, "learning_rate": 1.7108218508813944e-05, "loss": 0.6379, "step": 3540 }, { "epoch": 1.3551473402219671, "grad_norm": 0.6761857867240906, "learning_rate": 1.710647465545881e-05, "loss": 0.569, "step": 3541 }, { "epoch": 1.3555300420972063, "grad_norm": 0.5079129934310913, "learning_rate": 1.710473036539171e-05, "loss": 0.6077, "step": 3542 }, { "epoch": 1.3559127439724454, "grad_norm": 0.5532659888267517, "learning_rate": 1.7102985638719842e-05, "loss": 0.6477, "step": 3543 }, { "epoch": 1.3562954458476846, "grad_norm": 0.4953748881816864, "learning_rate": 1.7101240475550427e-05, "loss": 0.6732, "step": 3544 }, { "epoch": 1.3566781477229237, "grad_norm": 0.6237345933914185, "learning_rate": 1.7099494875990708e-05, "loss": 0.6737, "step": 3545 }, { "epoch": 1.357060849598163, "grad_norm": 0.5531752109527588, "learning_rate": 1.7097748840147954e-05, "loss": 0.671, "step": 3546 }, { "epoch": 1.3574435514734022, "grad_norm": 0.5209479928016663, "learning_rate": 1.7096002368129466e-05, "loss": 0.6779, "step": 3547 }, { "epoch": 1.3578262533486414, "grad_norm": 0.5292243361473083, "learning_rate": 1.709425546004257e-05, "loss": 0.6294, "step": 3548 }, { "epoch": 1.3582089552238805, "grad_norm": 0.5177050232887268, "learning_rate": 1.709250811599462e-05, "loss": 0.741, "step": 3549 }, { "epoch": 1.3585916570991197, "grad_norm": 0.5739982724189758, "learning_rate": 1.709076033609299e-05, "loss": 0.626, "step": 3550 }, { "epoch": 1.358974358974359, "grad_norm": 0.5147891640663147, "learning_rate": 1.7089012120445085e-05, "loss": 0.6545, "step": 3551 }, { "epoch": 1.3593570608495982, "grad_norm": 0.5185917615890503, "learning_rate": 1.7087263469158343e-05, "loss": 0.6497, "step": 3552 }, { "epoch": 1.3597397627248373, "grad_norm": 0.5739821791648865, "learning_rate": 1.7085514382340223e-05, "loss": 0.595, "step": 3553 }, { "epoch": 1.3601224646000765, "grad_norm": 0.5712764263153076, "learning_rate": 1.7083764860098206e-05, "loss": 0.5516, "step": 3554 }, { "epoch": 1.3605051664753156, "grad_norm": 0.502826988697052, "learning_rate": 1.708201490253981e-05, "loss": 0.6241, "step": 3555 }, { "epoch": 1.360887868350555, "grad_norm": 0.5291908383369446, "learning_rate": 1.7080264509772574e-05, "loss": 0.6849, "step": 3556 }, { "epoch": 1.3612705702257941, "grad_norm": 0.5874804258346558, "learning_rate": 1.7078513681904055e-05, "loss": 0.6169, "step": 3557 }, { "epoch": 1.3616532721010333, "grad_norm": 0.5367396473884583, "learning_rate": 1.707676241904186e-05, "loss": 0.6655, "step": 3558 }, { "epoch": 1.3620359739762724, "grad_norm": 0.5970855355262756, "learning_rate": 1.7075010721293602e-05, "loss": 0.6575, "step": 3559 }, { "epoch": 1.3624186758515116, "grad_norm": 0.5217001438140869, "learning_rate": 1.707325858876693e-05, "loss": 0.6457, "step": 3560 }, { "epoch": 1.362801377726751, "grad_norm": 0.585705578327179, "learning_rate": 1.7071506021569515e-05, "loss": 0.6884, "step": 3561 }, { "epoch": 1.36318407960199, "grad_norm": 0.4948258399963379, "learning_rate": 1.7069753019809053e-05, "loss": 0.6729, "step": 3562 }, { "epoch": 1.3635667814772292, "grad_norm": 0.5172722935676575, "learning_rate": 1.706799958359328e-05, "loss": 0.6426, "step": 3563 }, { "epoch": 1.3639494833524684, "grad_norm": 0.5618462562561035, "learning_rate": 1.7066245713029945e-05, "loss": 0.6743, "step": 3564 }, { "epoch": 1.3643321852277075, "grad_norm": 0.5564031004905701, "learning_rate": 1.7064491408226822e-05, "loss": 0.6311, "step": 3565 }, { "epoch": 1.3647148871029469, "grad_norm": 0.5678279399871826, "learning_rate": 1.706273666929173e-05, "loss": 0.708, "step": 3566 }, { "epoch": 1.365097588978186, "grad_norm": 0.5157721042633057, "learning_rate": 1.706098149633249e-05, "loss": 0.6288, "step": 3567 }, { "epoch": 1.3654802908534251, "grad_norm": 0.5942360162734985, "learning_rate": 1.705922588945697e-05, "loss": 0.6012, "step": 3568 }, { "epoch": 1.3658629927286643, "grad_norm": 0.554533839225769, "learning_rate": 1.7057469848773057e-05, "loss": 0.614, "step": 3569 }, { "epoch": 1.3662456946039034, "grad_norm": 0.635668933391571, "learning_rate": 1.705571337438866e-05, "loss": 0.6055, "step": 3570 }, { "epoch": 1.3666283964791428, "grad_norm": 0.49819907546043396, "learning_rate": 1.705395646641172e-05, "loss": 0.5893, "step": 3571 }, { "epoch": 1.367011098354382, "grad_norm": 0.5161516666412354, "learning_rate": 1.7052199124950207e-05, "loss": 0.579, "step": 3572 }, { "epoch": 1.367393800229621, "grad_norm": 0.5827211141586304, "learning_rate": 1.7050441350112108e-05, "loss": 0.6314, "step": 3573 }, { "epoch": 1.3677765021048602, "grad_norm": 0.6045976877212524, "learning_rate": 1.7048683142005448e-05, "loss": 0.7163, "step": 3574 }, { "epoch": 1.3681592039800994, "grad_norm": 0.5885894298553467, "learning_rate": 1.7046924500738272e-05, "loss": 0.726, "step": 3575 }, { "epoch": 1.3685419058553387, "grad_norm": 0.5568593740463257, "learning_rate": 1.704516542641866e-05, "loss": 0.6366, "step": 3576 }, { "epoch": 1.368924607730578, "grad_norm": 0.5345310568809509, "learning_rate": 1.7043405919154703e-05, "loss": 0.5943, "step": 3577 }, { "epoch": 1.369307309605817, "grad_norm": 0.5741827487945557, "learning_rate": 1.7041645979054527e-05, "loss": 0.6595, "step": 3578 }, { "epoch": 1.3696900114810562, "grad_norm": 0.5862584710121155, "learning_rate": 1.7039885606226293e-05, "loss": 0.6298, "step": 3579 }, { "epoch": 1.3700727133562953, "grad_norm": 0.5380381345748901, "learning_rate": 1.7038124800778173e-05, "loss": 0.6177, "step": 3580 }, { "epoch": 1.3704554152315347, "grad_norm": 0.5840367674827576, "learning_rate": 1.7036363562818374e-05, "loss": 0.6878, "step": 3581 }, { "epoch": 1.3708381171067738, "grad_norm": 0.5838548541069031, "learning_rate": 1.7034601892455137e-05, "loss": 0.71, "step": 3582 }, { "epoch": 1.371220818982013, "grad_norm": 0.5172257423400879, "learning_rate": 1.7032839789796712e-05, "loss": 0.6339, "step": 3583 }, { "epoch": 1.3716035208572521, "grad_norm": 0.5966460108757019, "learning_rate": 1.7031077254951385e-05, "loss": 0.6367, "step": 3584 }, { "epoch": 1.3719862227324913, "grad_norm": 0.559958279132843, "learning_rate": 1.7029314288027477e-05, "loss": 0.6718, "step": 3585 }, { "epoch": 1.3723689246077306, "grad_norm": 0.6215289235115051, "learning_rate": 1.7027550889133316e-05, "loss": 0.6086, "step": 3586 }, { "epoch": 1.3727516264829698, "grad_norm": 0.5743451118469238, "learning_rate": 1.702578705837728e-05, "loss": 0.6738, "step": 3587 }, { "epoch": 1.373134328358209, "grad_norm": 0.5758944749832153, "learning_rate": 1.702402279586775e-05, "loss": 0.5615, "step": 3588 }, { "epoch": 1.373517030233448, "grad_norm": 0.5924113392829895, "learning_rate": 1.7022258101713153e-05, "loss": 0.6462, "step": 3589 }, { "epoch": 1.3738997321086872, "grad_norm": 0.49153050780296326, "learning_rate": 1.702049297602193e-05, "loss": 0.6484, "step": 3590 }, { "epoch": 1.3742824339839266, "grad_norm": 0.5182503461837769, "learning_rate": 1.701872741890255e-05, "loss": 0.6204, "step": 3591 }, { "epoch": 1.3746651358591657, "grad_norm": 0.6357426643371582, "learning_rate": 1.7016961430463516e-05, "loss": 0.606, "step": 3592 }, { "epoch": 1.3750478377344049, "grad_norm": 0.5918871164321899, "learning_rate": 1.701519501081335e-05, "loss": 0.6168, "step": 3593 }, { "epoch": 1.375430539609644, "grad_norm": 0.515683114528656, "learning_rate": 1.7013428160060606e-05, "loss": 0.6201, "step": 3594 }, { "epoch": 1.3758132414848832, "grad_norm": 0.5327391624450684, "learning_rate": 1.701166087831386e-05, "loss": 0.6313, "step": 3595 }, { "epoch": 1.3761959433601225, "grad_norm": 0.5219769477844238, "learning_rate": 1.7009893165681713e-05, "loss": 0.6047, "step": 3596 }, { "epoch": 1.3765786452353617, "grad_norm": 0.5608281493186951, "learning_rate": 1.7008125022272805e-05, "loss": 0.5845, "step": 3597 }, { "epoch": 1.3769613471106008, "grad_norm": 0.5070093274116516, "learning_rate": 1.700635644819578e-05, "loss": 0.6842, "step": 3598 }, { "epoch": 1.37734404898584, "grad_norm": 0.5512410402297974, "learning_rate": 1.7004587443559334e-05, "loss": 0.6679, "step": 3599 }, { "epoch": 1.377726750861079, "grad_norm": 0.5532315969467163, "learning_rate": 1.700281800847217e-05, "loss": 0.7184, "step": 3600 }, { "epoch": 1.3781094527363185, "grad_norm": 0.5068445801734924, "learning_rate": 1.7001048143043028e-05, "loss": 0.6095, "step": 3601 }, { "epoch": 1.3784921546115576, "grad_norm": 0.5488535761833191, "learning_rate": 1.699927784738067e-05, "loss": 0.636, "step": 3602 }, { "epoch": 1.3788748564867968, "grad_norm": 0.5657810568809509, "learning_rate": 1.699750712159388e-05, "loss": 0.7196, "step": 3603 }, { "epoch": 1.379257558362036, "grad_norm": 0.5349979400634766, "learning_rate": 1.6995735965791483e-05, "loss": 0.6096, "step": 3604 }, { "epoch": 1.379640260237275, "grad_norm": 0.5169501304626465, "learning_rate": 1.6993964380082317e-05, "loss": 0.6925, "step": 3605 }, { "epoch": 1.3800229621125144, "grad_norm": 0.5557631850242615, "learning_rate": 1.6992192364575247e-05, "loss": 0.6787, "step": 3606 }, { "epoch": 1.3804056639877536, "grad_norm": 0.524323582649231, "learning_rate": 1.6990419919379174e-05, "loss": 0.6866, "step": 3607 }, { "epoch": 1.3807883658629927, "grad_norm": 0.5341280698776245, "learning_rate": 1.6988647044603017e-05, "loss": 0.6443, "step": 3608 }, { "epoch": 1.3811710677382318, "grad_norm": 0.519007682800293, "learning_rate": 1.6986873740355725e-05, "loss": 0.6475, "step": 3609 }, { "epoch": 1.381553769613471, "grad_norm": 0.5357455015182495, "learning_rate": 1.698510000674627e-05, "loss": 0.6864, "step": 3610 }, { "epoch": 1.3819364714887103, "grad_norm": 0.5229552984237671, "learning_rate": 1.6983325843883657e-05, "loss": 0.5871, "step": 3611 }, { "epoch": 1.3823191733639495, "grad_norm": 0.5701537728309631, "learning_rate": 1.6981551251876905e-05, "loss": 0.7502, "step": 3612 }, { "epoch": 1.3827018752391886, "grad_norm": 0.5335500836372375, "learning_rate": 1.6979776230835076e-05, "loss": 0.6713, "step": 3613 }, { "epoch": 1.3830845771144278, "grad_norm": 0.5287372469902039, "learning_rate": 1.697800078086725e-05, "loss": 0.6762, "step": 3614 }, { "epoch": 1.383467278989667, "grad_norm": 0.5044769048690796, "learning_rate": 1.6976224902082524e-05, "loss": 0.6917, "step": 3615 }, { "epoch": 1.3838499808649063, "grad_norm": 0.5633739233016968, "learning_rate": 1.6974448594590037e-05, "loss": 0.7741, "step": 3616 }, { "epoch": 1.3842326827401454, "grad_norm": 0.5103923678398132, "learning_rate": 1.697267185849895e-05, "loss": 0.6182, "step": 3617 }, { "epoch": 1.3846153846153846, "grad_norm": 0.5387341976165771, "learning_rate": 1.6970894693918444e-05, "loss": 0.6586, "step": 3618 }, { "epoch": 1.3849980864906237, "grad_norm": 0.5471880435943604, "learning_rate": 1.696911710095773e-05, "loss": 0.7038, "step": 3619 }, { "epoch": 1.3853807883658629, "grad_norm": 0.5287713408470154, "learning_rate": 1.6967339079726047e-05, "loss": 0.6565, "step": 3620 }, { "epoch": 1.3857634902411022, "grad_norm": 0.5148517489433289, "learning_rate": 1.6965560630332666e-05, "loss": 0.64, "step": 3621 }, { "epoch": 1.3861461921163414, "grad_norm": 0.5431538224220276, "learning_rate": 1.6963781752886865e-05, "loss": 0.6384, "step": 3622 }, { "epoch": 1.3865288939915805, "grad_norm": 0.506580650806427, "learning_rate": 1.6962002447497972e-05, "loss": 0.6182, "step": 3623 }, { "epoch": 1.3869115958668197, "grad_norm": 0.5383666157722473, "learning_rate": 1.6960222714275323e-05, "loss": 0.7151, "step": 3624 }, { "epoch": 1.3872942977420588, "grad_norm": 0.48395881056785583, "learning_rate": 1.695844255332829e-05, "loss": 0.5808, "step": 3625 }, { "epoch": 1.3876769996172982, "grad_norm": 0.5270757079124451, "learning_rate": 1.6956661964766264e-05, "loss": 0.7076, "step": 3626 }, { "epoch": 1.3880597014925373, "grad_norm": 0.5110002160072327, "learning_rate": 1.6954880948698678e-05, "loss": 0.6637, "step": 3627 }, { "epoch": 1.3884424033677765, "grad_norm": 0.5544290542602539, "learning_rate": 1.695309950523497e-05, "loss": 0.6589, "step": 3628 }, { "epoch": 1.3888251052430156, "grad_norm": 0.5555459260940552, "learning_rate": 1.695131763448462e-05, "loss": 0.6736, "step": 3629 }, { "epoch": 1.3892078071182548, "grad_norm": 0.5743347406387329, "learning_rate": 1.694953533655712e-05, "loss": 0.6882, "step": 3630 }, { "epoch": 1.3895905089934941, "grad_norm": 0.5281102657318115, "learning_rate": 1.6947752611562012e-05, "loss": 0.6388, "step": 3631 }, { "epoch": 1.3899732108687333, "grad_norm": 0.5202227830886841, "learning_rate": 1.6945969459608837e-05, "loss": 0.7203, "step": 3632 }, { "epoch": 1.3903559127439724, "grad_norm": 0.5337592959403992, "learning_rate": 1.694418588080718e-05, "loss": 0.6518, "step": 3633 }, { "epoch": 1.3907386146192116, "grad_norm": 0.503725528717041, "learning_rate": 1.6942401875266644e-05, "loss": 0.6868, "step": 3634 }, { "epoch": 1.3911213164944507, "grad_norm": 0.5190379023551941, "learning_rate": 1.694061744309686e-05, "loss": 0.6251, "step": 3635 }, { "epoch": 1.39150401836969, "grad_norm": 0.5779746770858765, "learning_rate": 1.6938832584407494e-05, "loss": 0.6406, "step": 3636 }, { "epoch": 1.3918867202449292, "grad_norm": 0.5734394192695618, "learning_rate": 1.693704729930822e-05, "loss": 0.6989, "step": 3637 }, { "epoch": 1.3922694221201684, "grad_norm": 0.5242670774459839, "learning_rate": 1.6935261587908757e-05, "loss": 0.5828, "step": 3638 }, { "epoch": 1.3926521239954075, "grad_norm": 0.5365371704101562, "learning_rate": 1.6933475450318834e-05, "loss": 0.7028, "step": 3639 }, { "epoch": 1.3930348258706466, "grad_norm": 0.5319308638572693, "learning_rate": 1.6931688886648217e-05, "loss": 0.6005, "step": 3640 }, { "epoch": 1.393417527745886, "grad_norm": 0.49582499265670776, "learning_rate": 1.69299018970067e-05, "loss": 0.5585, "step": 3641 }, { "epoch": 1.3938002296211252, "grad_norm": 0.5268990397453308, "learning_rate": 1.692811448150409e-05, "loss": 0.651, "step": 3642 }, { "epoch": 1.3941829314963643, "grad_norm": 0.567378580570221, "learning_rate": 1.6926326640250238e-05, "loss": 0.6193, "step": 3643 }, { "epoch": 1.3945656333716034, "grad_norm": 0.5327368974685669, "learning_rate": 1.6924538373355004e-05, "loss": 0.5572, "step": 3644 }, { "epoch": 1.3949483352468426, "grad_norm": 0.5235344767570496, "learning_rate": 1.6922749680928286e-05, "loss": 0.6389, "step": 3645 }, { "epoch": 1.395331037122082, "grad_norm": 0.5341357588768005, "learning_rate": 1.692096056308e-05, "loss": 0.6511, "step": 3646 }, { "epoch": 1.395713738997321, "grad_norm": 0.49571698904037476, "learning_rate": 1.6919171019920094e-05, "loss": 0.7038, "step": 3647 }, { "epoch": 1.3960964408725602, "grad_norm": 0.5007657408714294, "learning_rate": 1.691738105155854e-05, "loss": 0.6304, "step": 3648 }, { "epoch": 1.3964791427477994, "grad_norm": 0.5544379949569702, "learning_rate": 1.691559065810534e-05, "loss": 0.6533, "step": 3649 }, { "epoch": 1.3968618446230385, "grad_norm": 0.5139579772949219, "learning_rate": 1.6913799839670514e-05, "loss": 0.6154, "step": 3650 }, { "epoch": 1.397244546498278, "grad_norm": 0.5770754218101501, "learning_rate": 1.6912008596364113e-05, "loss": 0.6585, "step": 3651 }, { "epoch": 1.397627248373517, "grad_norm": 0.6028351187705994, "learning_rate": 1.6910216928296214e-05, "loss": 0.677, "step": 3652 }, { "epoch": 1.3980099502487562, "grad_norm": 0.5384200215339661, "learning_rate": 1.6908424835576923e-05, "loss": 0.6331, "step": 3653 }, { "epoch": 1.3983926521239953, "grad_norm": 0.5615752339363098, "learning_rate": 1.6906632318316367e-05, "loss": 0.7367, "step": 3654 }, { "epoch": 1.3987753539992345, "grad_norm": 0.5612069964408875, "learning_rate": 1.6904839376624697e-05, "loss": 0.6128, "step": 3655 }, { "epoch": 1.3991580558744738, "grad_norm": 0.5550077557563782, "learning_rate": 1.6903046010612096e-05, "loss": 0.6402, "step": 3656 }, { "epoch": 1.399540757749713, "grad_norm": 0.5570629239082336, "learning_rate": 1.6901252220388776e-05, "loss": 0.6264, "step": 3657 }, { "epoch": 1.3999234596249521, "grad_norm": 0.5625937581062317, "learning_rate": 1.6899458006064967e-05, "loss": 0.623, "step": 3658 }, { "epoch": 1.4003061615001913, "grad_norm": 0.5212867259979248, "learning_rate": 1.689766336775093e-05, "loss": 0.5716, "step": 3659 }, { "epoch": 1.4006888633754304, "grad_norm": 0.5656701922416687, "learning_rate": 1.6895868305556945e-05, "loss": 0.6873, "step": 3660 }, { "epoch": 1.4010715652506698, "grad_norm": 0.6853770613670349, "learning_rate": 1.6894072819593327e-05, "loss": 0.6903, "step": 3661 }, { "epoch": 1.401454267125909, "grad_norm": 0.48135697841644287, "learning_rate": 1.6892276909970418e-05, "loss": 0.6412, "step": 3662 }, { "epoch": 1.401836969001148, "grad_norm": 0.534408688545227, "learning_rate": 1.689048057679857e-05, "loss": 0.6822, "step": 3663 }, { "epoch": 1.4022196708763872, "grad_norm": 0.5098583102226257, "learning_rate": 1.6888683820188186e-05, "loss": 0.6077, "step": 3664 }, { "epoch": 1.4026023727516264, "grad_norm": 0.4907713532447815, "learning_rate": 1.688688664024967e-05, "loss": 0.6117, "step": 3665 }, { "epoch": 1.4029850746268657, "grad_norm": 0.5175735354423523, "learning_rate": 1.6885089037093472e-05, "loss": 0.6576, "step": 3666 }, { "epoch": 1.4033677765021049, "grad_norm": 0.5516359210014343, "learning_rate": 1.6883291010830055e-05, "loss": 0.6355, "step": 3667 }, { "epoch": 1.403750478377344, "grad_norm": 0.5744474530220032, "learning_rate": 1.688149256156991e-05, "loss": 0.6059, "step": 3668 }, { "epoch": 1.4041331802525832, "grad_norm": 0.4982525110244751, "learning_rate": 1.6879693689423563e-05, "loss": 0.6748, "step": 3669 }, { "epoch": 1.4045158821278223, "grad_norm": 0.5132076740264893, "learning_rate": 1.687789439450156e-05, "loss": 0.6492, "step": 3670 }, { "epoch": 1.4048985840030617, "grad_norm": 0.5509145259857178, "learning_rate": 1.6876094676914463e-05, "loss": 0.6044, "step": 3671 }, { "epoch": 1.4052812858783008, "grad_norm": 0.6168286800384521, "learning_rate": 1.687429453677288e-05, "loss": 0.6964, "step": 3672 }, { "epoch": 1.40566398775354, "grad_norm": 0.5344570875167847, "learning_rate": 1.6872493974187425e-05, "loss": 0.6421, "step": 3673 }, { "epoch": 1.406046689628779, "grad_norm": 0.5160139799118042, "learning_rate": 1.6870692989268755e-05, "loss": 0.614, "step": 3674 }, { "epoch": 1.4064293915040182, "grad_norm": 0.5237826108932495, "learning_rate": 1.6868891582127546e-05, "loss": 0.6688, "step": 3675 }, { "epoch": 1.4068120933792576, "grad_norm": 0.5535359382629395, "learning_rate": 1.686708975287449e-05, "loss": 0.6366, "step": 3676 }, { "epoch": 1.4071947952544968, "grad_norm": 0.5742279887199402, "learning_rate": 1.6865287501620325e-05, "loss": 0.726, "step": 3677 }, { "epoch": 1.407577497129736, "grad_norm": 0.5049348473548889, "learning_rate": 1.68634848284758e-05, "loss": 0.6449, "step": 3678 }, { "epoch": 1.407960199004975, "grad_norm": 0.5475499629974365, "learning_rate": 1.686168173355169e-05, "loss": 0.5918, "step": 3679 }, { "epoch": 1.4083429008802142, "grad_norm": 0.5225620269775391, "learning_rate": 1.6859878216958806e-05, "loss": 0.6295, "step": 3680 }, { "epoch": 1.4087256027554536, "grad_norm": 0.520590603351593, "learning_rate": 1.6858074278807975e-05, "loss": 0.6859, "step": 3681 }, { "epoch": 1.4091083046306927, "grad_norm": 0.5312633514404297, "learning_rate": 1.6856269919210056e-05, "loss": 0.6282, "step": 3682 }, { "epoch": 1.4094910065059318, "grad_norm": 0.6145686507225037, "learning_rate": 1.6854465138275933e-05, "loss": 0.7333, "step": 3683 }, { "epoch": 1.409873708381171, "grad_norm": 0.5308599472045898, "learning_rate": 1.685265993611651e-05, "loss": 0.5774, "step": 3684 }, { "epoch": 1.4102564102564101, "grad_norm": 0.5586947202682495, "learning_rate": 1.6850854312842724e-05, "loss": 0.6973, "step": 3685 }, { "epoch": 1.4106391121316495, "grad_norm": 0.5070253610610962, "learning_rate": 1.6849048268565537e-05, "loss": 0.5534, "step": 3686 }, { "epoch": 1.4110218140068886, "grad_norm": 0.492570161819458, "learning_rate": 1.6847241803395937e-05, "loss": 0.6298, "step": 3687 }, { "epoch": 1.4114045158821278, "grad_norm": 0.5410642623901367, "learning_rate": 1.684543491744493e-05, "loss": 0.6579, "step": 3688 }, { "epoch": 1.411787217757367, "grad_norm": 0.5262538194656372, "learning_rate": 1.6843627610823556e-05, "loss": 0.636, "step": 3689 }, { "epoch": 1.412169919632606, "grad_norm": 0.5956889390945435, "learning_rate": 1.684181988364288e-05, "loss": 0.5878, "step": 3690 }, { "epoch": 1.4125526215078454, "grad_norm": 0.5498132705688477, "learning_rate": 1.6840011736013994e-05, "loss": 0.6707, "step": 3691 }, { "epoch": 1.4129353233830846, "grad_norm": 0.5348090529441833, "learning_rate": 1.6838203168048012e-05, "loss": 0.6564, "step": 3692 }, { "epoch": 1.4133180252583237, "grad_norm": 0.502848744392395, "learning_rate": 1.6836394179856073e-05, "loss": 0.5903, "step": 3693 }, { "epoch": 1.4137007271335629, "grad_norm": 0.5195391178131104, "learning_rate": 1.6834584771549345e-05, "loss": 0.6828, "step": 3694 }, { "epoch": 1.414083429008802, "grad_norm": 0.5007060170173645, "learning_rate": 1.6832774943239022e-05, "loss": 0.6929, "step": 3695 }, { "epoch": 1.4144661308840414, "grad_norm": 0.513887882232666, "learning_rate": 1.6830964695036324e-05, "loss": 0.6708, "step": 3696 }, { "epoch": 1.4148488327592805, "grad_norm": 0.5761421918869019, "learning_rate": 1.6829154027052493e-05, "loss": 0.7039, "step": 3697 }, { "epoch": 1.4152315346345197, "grad_norm": 0.5177212357521057, "learning_rate": 1.6827342939398802e-05, "loss": 0.663, "step": 3698 }, { "epoch": 1.4156142365097588, "grad_norm": 0.5408876538276672, "learning_rate": 1.6825531432186545e-05, "loss": 0.7017, "step": 3699 }, { "epoch": 1.415996938384998, "grad_norm": 0.5386651754379272, "learning_rate": 1.6823719505527042e-05, "loss": 0.6755, "step": 3700 }, { "epoch": 1.4163796402602373, "grad_norm": 0.541867733001709, "learning_rate": 1.6821907159531644e-05, "loss": 0.5899, "step": 3701 }, { "epoch": 1.4167623421354765, "grad_norm": 0.5177562832832336, "learning_rate": 1.682009439431173e-05, "loss": 0.6745, "step": 3702 }, { "epoch": 1.4171450440107156, "grad_norm": 0.5238727331161499, "learning_rate": 1.6818281209978688e-05, "loss": 0.6891, "step": 3703 }, { "epoch": 1.4175277458859548, "grad_norm": 0.5539349317550659, "learning_rate": 1.681646760664395e-05, "loss": 0.6415, "step": 3704 }, { "epoch": 1.417910447761194, "grad_norm": 0.6412811875343323, "learning_rate": 1.6814653584418966e-05, "loss": 0.6419, "step": 3705 }, { "epoch": 1.4182931496364333, "grad_norm": 0.6050126552581787, "learning_rate": 1.6812839143415207e-05, "loss": 0.6681, "step": 3706 }, { "epoch": 1.4186758515116724, "grad_norm": 0.6137129068374634, "learning_rate": 1.6811024283744183e-05, "loss": 0.7029, "step": 3707 }, { "epoch": 1.4190585533869116, "grad_norm": 0.5438085794448853, "learning_rate": 1.6809209005517423e-05, "loss": 0.7218, "step": 3708 }, { "epoch": 1.4194412552621507, "grad_norm": 0.5267171859741211, "learning_rate": 1.680739330884647e-05, "loss": 0.5961, "step": 3709 }, { "epoch": 1.4198239571373898, "grad_norm": 0.5386074185371399, "learning_rate": 1.680557719384292e-05, "loss": 0.6565, "step": 3710 }, { "epoch": 1.4202066590126292, "grad_norm": 0.5437086820602417, "learning_rate": 1.6803760660618357e-05, "loss": 0.5866, "step": 3711 }, { "epoch": 1.4205893608878684, "grad_norm": 0.5507184267044067, "learning_rate": 1.680194370928443e-05, "loss": 0.6387, "step": 3712 }, { "epoch": 1.4209720627631075, "grad_norm": 0.5925731062889099, "learning_rate": 1.6800126339952784e-05, "loss": 0.6824, "step": 3713 }, { "epoch": 1.4213547646383466, "grad_norm": 0.521874189376831, "learning_rate": 1.6798308552735107e-05, "loss": 0.6961, "step": 3714 }, { "epoch": 1.4217374665135858, "grad_norm": 0.5174279808998108, "learning_rate": 1.6796490347743112e-05, "loss": 0.6594, "step": 3715 }, { "epoch": 1.4221201683888252, "grad_norm": 0.6279808878898621, "learning_rate": 1.6794671725088522e-05, "loss": 0.7066, "step": 3716 }, { "epoch": 1.4225028702640643, "grad_norm": 0.5320529341697693, "learning_rate": 1.6792852684883098e-05, "loss": 0.7013, "step": 3717 }, { "epoch": 1.4228855721393034, "grad_norm": 0.5656452178955078, "learning_rate": 1.679103322723863e-05, "loss": 0.6372, "step": 3718 }, { "epoch": 1.4232682740145426, "grad_norm": 0.5279425978660583, "learning_rate": 1.6789213352266926e-05, "loss": 0.5803, "step": 3719 }, { "epoch": 1.4236509758897817, "grad_norm": 0.5428298115730286, "learning_rate": 1.6787393060079825e-05, "loss": 0.6316, "step": 3720 }, { "epoch": 1.424033677765021, "grad_norm": 0.5199066996574402, "learning_rate": 1.6785572350789182e-05, "loss": 0.6511, "step": 3721 }, { "epoch": 1.4244163796402602, "grad_norm": 0.5680964589118958, "learning_rate": 1.6783751224506892e-05, "loss": 0.6675, "step": 3722 }, { "epoch": 1.4247990815154994, "grad_norm": 0.5772886872291565, "learning_rate": 1.678192968134486e-05, "loss": 0.7032, "step": 3723 }, { "epoch": 1.4251817833907385, "grad_norm": 0.5452038049697876, "learning_rate": 1.6780107721415032e-05, "loss": 0.6222, "step": 3724 }, { "epoch": 1.4255644852659777, "grad_norm": 0.5433860421180725, "learning_rate": 1.6778285344829373e-05, "loss": 0.6329, "step": 3725 }, { "epoch": 1.425947187141217, "grad_norm": 0.5721851587295532, "learning_rate": 1.6776462551699867e-05, "loss": 0.6922, "step": 3726 }, { "epoch": 1.4263298890164562, "grad_norm": 0.5155309438705444, "learning_rate": 1.6774639342138536e-05, "loss": 0.762, "step": 3727 }, { "epoch": 1.4267125908916953, "grad_norm": 0.5372326970100403, "learning_rate": 1.6772815716257414e-05, "loss": 0.6948, "step": 3728 }, { "epoch": 1.4270952927669345, "grad_norm": 0.5834089517593384, "learning_rate": 1.6770991674168567e-05, "loss": 0.634, "step": 3729 }, { "epoch": 1.4274779946421736, "grad_norm": 0.5213291049003601, "learning_rate": 1.67691672159841e-05, "loss": 0.7088, "step": 3730 }, { "epoch": 1.427860696517413, "grad_norm": 0.5195742845535278, "learning_rate": 1.6767342341816117e-05, "loss": 0.6262, "step": 3731 }, { "epoch": 1.4282433983926521, "grad_norm": 0.47501862049102783, "learning_rate": 1.6765517051776766e-05, "loss": 0.57, "step": 3732 }, { "epoch": 1.4286261002678913, "grad_norm": 0.5183603167533875, "learning_rate": 1.6763691345978223e-05, "loss": 0.6901, "step": 3733 }, { "epoch": 1.4290088021431304, "grad_norm": 0.5565842986106873, "learning_rate": 1.6761865224532666e-05, "loss": 0.5938, "step": 3734 }, { "epoch": 1.4293915040183696, "grad_norm": 0.5202518701553345, "learning_rate": 1.6760038687552333e-05, "loss": 0.6846, "step": 3735 }, { "epoch": 1.429774205893609, "grad_norm": 0.5521090030670166, "learning_rate": 1.675821173514946e-05, "loss": 0.599, "step": 3736 }, { "epoch": 1.430156907768848, "grad_norm": 0.5843809843063354, "learning_rate": 1.6756384367436317e-05, "loss": 0.6912, "step": 3737 }, { "epoch": 1.4305396096440872, "grad_norm": 0.5525131821632385, "learning_rate": 1.675455658452521e-05, "loss": 0.6721, "step": 3738 }, { "epoch": 1.4309223115193264, "grad_norm": 0.5013397336006165, "learning_rate": 1.6752728386528452e-05, "loss": 0.6926, "step": 3739 }, { "epoch": 1.4313050133945655, "grad_norm": 0.5819587111473083, "learning_rate": 1.6750899773558392e-05, "loss": 0.7547, "step": 3740 }, { "epoch": 1.4316877152698049, "grad_norm": 0.5324735641479492, "learning_rate": 1.6749070745727403e-05, "loss": 0.6461, "step": 3741 }, { "epoch": 1.432070417145044, "grad_norm": 0.5161858201026917, "learning_rate": 1.6747241303147888e-05, "loss": 0.6616, "step": 3742 }, { "epoch": 1.4324531190202832, "grad_norm": 0.547639012336731, "learning_rate": 1.674541144593227e-05, "loss": 0.6216, "step": 3743 }, { "epoch": 1.4328358208955223, "grad_norm": 0.5316040515899658, "learning_rate": 1.6743581174192995e-05, "loss": 0.6541, "step": 3744 }, { "epoch": 1.4332185227707614, "grad_norm": 0.5314698219299316, "learning_rate": 1.6741750488042542e-05, "loss": 0.6262, "step": 3745 }, { "epoch": 1.4336012246460008, "grad_norm": 0.4937327206134796, "learning_rate": 1.6739919387593407e-05, "loss": 0.7385, "step": 3746 }, { "epoch": 1.43398392652124, "grad_norm": 0.5924541354179382, "learning_rate": 1.673808787295812e-05, "loss": 0.7504, "step": 3747 }, { "epoch": 1.434366628396479, "grad_norm": 0.5524366497993469, "learning_rate": 1.6736255944249234e-05, "loss": 0.6905, "step": 3748 }, { "epoch": 1.4347493302717182, "grad_norm": 0.5007689595222473, "learning_rate": 1.6734423601579323e-05, "loss": 0.6249, "step": 3749 }, { "epoch": 1.4351320321469574, "grad_norm": 0.5315425395965576, "learning_rate": 1.6732590845060986e-05, "loss": 0.6562, "step": 3750 }, { "epoch": 1.4355147340221968, "grad_norm": 0.534283459186554, "learning_rate": 1.6730757674806858e-05, "loss": 0.6787, "step": 3751 }, { "epoch": 1.435897435897436, "grad_norm": 0.5872358679771423, "learning_rate": 1.6728924090929587e-05, "loss": 0.6613, "step": 3752 }, { "epoch": 1.436280137772675, "grad_norm": 0.5438349843025208, "learning_rate": 1.6727090093541854e-05, "loss": 0.5995, "step": 3753 }, { "epoch": 1.4366628396479142, "grad_norm": 0.515120267868042, "learning_rate": 1.672525568275636e-05, "loss": 0.6256, "step": 3754 }, { "epoch": 1.4370455415231533, "grad_norm": 0.5349109172821045, "learning_rate": 1.6723420858685838e-05, "loss": 0.603, "step": 3755 }, { "epoch": 1.4374282433983927, "grad_norm": 0.5115794539451599, "learning_rate": 1.6721585621443044e-05, "loss": 0.6724, "step": 3756 }, { "epoch": 1.4378109452736318, "grad_norm": 0.6508148908615112, "learning_rate": 1.6719749971140756e-05, "loss": 0.6431, "step": 3757 }, { "epoch": 1.438193647148871, "grad_norm": 0.5174590945243835, "learning_rate": 1.6717913907891777e-05, "loss": 0.6388, "step": 3758 }, { "epoch": 1.4385763490241104, "grad_norm": 0.5241878032684326, "learning_rate": 1.6716077431808944e-05, "loss": 0.6099, "step": 3759 }, { "epoch": 1.4389590508993493, "grad_norm": 0.5009501576423645, "learning_rate": 1.671424054300511e-05, "loss": 0.7224, "step": 3760 }, { "epoch": 1.4393417527745886, "grad_norm": 0.5714031457901001, "learning_rate": 1.6712403241593157e-05, "loss": 0.645, "step": 3761 }, { "epoch": 1.4397244546498278, "grad_norm": 0.5114065408706665, "learning_rate": 1.6710565527685988e-05, "loss": 0.6178, "step": 3762 }, { "epoch": 1.440107156525067, "grad_norm": 0.5941129922866821, "learning_rate": 1.6708727401396542e-05, "loss": 0.7412, "step": 3763 }, { "epoch": 1.4404898584003063, "grad_norm": 0.5394287109375, "learning_rate": 1.6706888862837774e-05, "loss": 0.7055, "step": 3764 }, { "epoch": 1.4408725602755452, "grad_norm": 0.5629622936248779, "learning_rate": 1.670504991212267e-05, "loss": 0.6893, "step": 3765 }, { "epoch": 1.4412552621507846, "grad_norm": 0.5809664726257324, "learning_rate": 1.670321054936423e-05, "loss": 0.8261, "step": 3766 }, { "epoch": 1.4416379640260237, "grad_norm": 0.5620447397232056, "learning_rate": 1.67013707746755e-05, "loss": 0.6149, "step": 3767 }, { "epoch": 1.4420206659012629, "grad_norm": 0.5944066643714905, "learning_rate": 1.6699530588169528e-05, "loss": 0.7312, "step": 3768 }, { "epoch": 1.4424033677765022, "grad_norm": 0.575411319732666, "learning_rate": 1.6697689989959407e-05, "loss": 0.748, "step": 3769 }, { "epoch": 1.4427860696517412, "grad_norm": 0.5439212322235107, "learning_rate": 1.669584898015824e-05, "loss": 0.6271, "step": 3770 }, { "epoch": 1.4431687715269805, "grad_norm": 0.562751293182373, "learning_rate": 1.6694007558879168e-05, "loss": 0.5354, "step": 3771 }, { "epoch": 1.4435514734022197, "grad_norm": 0.5671093463897705, "learning_rate": 1.6692165726235346e-05, "loss": 0.6236, "step": 3772 }, { "epoch": 1.4439341752774588, "grad_norm": 0.5399303436279297, "learning_rate": 1.6690323482339962e-05, "loss": 0.6684, "step": 3773 }, { "epoch": 1.4443168771526982, "grad_norm": 0.5584139823913574, "learning_rate": 1.6688480827306224e-05, "loss": 0.6395, "step": 3774 }, { "epoch": 1.444699579027937, "grad_norm": 0.5027498602867126, "learning_rate": 1.6686637761247377e-05, "loss": 0.7156, "step": 3775 }, { "epoch": 1.4450822809031765, "grad_norm": 0.5930926203727722, "learning_rate": 1.6684794284276668e-05, "loss": 0.5935, "step": 3776 }, { "epoch": 1.4454649827784156, "grad_norm": 0.54514479637146, "learning_rate": 1.6682950396507394e-05, "loss": 0.691, "step": 3777 }, { "epoch": 1.4458476846536548, "grad_norm": 0.5989216566085815, "learning_rate": 1.6681106098052868e-05, "loss": 0.7537, "step": 3778 }, { "epoch": 1.4462303865288941, "grad_norm": 0.4991592466831207, "learning_rate": 1.6679261389026418e-05, "loss": 0.5785, "step": 3779 }, { "epoch": 1.446613088404133, "grad_norm": 0.5450389385223389, "learning_rate": 1.667741626954142e-05, "loss": 0.6712, "step": 3780 }, { "epoch": 1.4469957902793724, "grad_norm": 0.5127086043357849, "learning_rate": 1.6675570739711245e-05, "loss": 0.6125, "step": 3781 }, { "epoch": 1.4473784921546116, "grad_norm": 0.5437016487121582, "learning_rate": 1.667372479964932e-05, "loss": 0.6595, "step": 3782 }, { "epoch": 1.4477611940298507, "grad_norm": 0.5249236822128296, "learning_rate": 1.6671878449469074e-05, "loss": 0.5934, "step": 3783 }, { "epoch": 1.44814389590509, "grad_norm": 0.555696427822113, "learning_rate": 1.6670031689283973e-05, "loss": 0.592, "step": 3784 }, { "epoch": 1.448526597780329, "grad_norm": 0.5166023969650269, "learning_rate": 1.6668184519207507e-05, "loss": 0.6334, "step": 3785 }, { "epoch": 1.4489092996555684, "grad_norm": 0.5339528322219849, "learning_rate": 1.666633693935319e-05, "loss": 0.6789, "step": 3786 }, { "epoch": 1.4492920015308075, "grad_norm": 0.5113990902900696, "learning_rate": 1.6664488949834558e-05, "loss": 0.64, "step": 3787 }, { "epoch": 1.4496747034060466, "grad_norm": 0.5513786673545837, "learning_rate": 1.6662640550765178e-05, "loss": 0.6481, "step": 3788 }, { "epoch": 1.450057405281286, "grad_norm": 0.5274580717086792, "learning_rate": 1.666079174225863e-05, "loss": 0.7336, "step": 3789 }, { "epoch": 1.450440107156525, "grad_norm": 0.5045140385627747, "learning_rate": 1.6658942524428542e-05, "loss": 0.7043, "step": 3790 }, { "epoch": 1.4508228090317643, "grad_norm": 0.5579679012298584, "learning_rate": 1.6657092897388546e-05, "loss": 0.6652, "step": 3791 }, { "epoch": 1.4512055109070034, "grad_norm": 0.5112223029136658, "learning_rate": 1.6655242861252304e-05, "loss": 0.5972, "step": 3792 }, { "epoch": 1.4515882127822426, "grad_norm": 0.6025789976119995, "learning_rate": 1.665339241613351e-05, "loss": 0.7226, "step": 3793 }, { "epoch": 1.451970914657482, "grad_norm": 0.5499840378761292, "learning_rate": 1.6651541562145877e-05, "loss": 0.6569, "step": 3794 }, { "epoch": 1.4523536165327209, "grad_norm": 0.6750298738479614, "learning_rate": 1.664969029940315e-05, "loss": 0.7385, "step": 3795 }, { "epoch": 1.4527363184079602, "grad_norm": 0.5385761857032776, "learning_rate": 1.6647838628019088e-05, "loss": 0.6715, "step": 3796 }, { "epoch": 1.4531190202831994, "grad_norm": 0.5605955719947815, "learning_rate": 1.664598654810748e-05, "loss": 0.5668, "step": 3797 }, { "epoch": 1.4535017221584385, "grad_norm": 0.5594416856765747, "learning_rate": 1.6644134059782144e-05, "loss": 0.6269, "step": 3798 }, { "epoch": 1.453884424033678, "grad_norm": 0.5708343386650085, "learning_rate": 1.6642281163156922e-05, "loss": 0.6863, "step": 3799 }, { "epoch": 1.4542671259089168, "grad_norm": 0.5951197147369385, "learning_rate": 1.664042785834568e-05, "loss": 0.6872, "step": 3800 }, { "epoch": 1.4546498277841562, "grad_norm": 0.5220427513122559, "learning_rate": 1.6638574145462305e-05, "loss": 0.6142, "step": 3801 }, { "epoch": 1.4550325296593953, "grad_norm": 0.5519853830337524, "learning_rate": 1.6636720024620718e-05, "loss": 0.6602, "step": 3802 }, { "epoch": 1.4554152315346345, "grad_norm": 0.5481212735176086, "learning_rate": 1.663486549593485e-05, "loss": 0.7013, "step": 3803 }, { "epoch": 1.4557979334098738, "grad_norm": 0.6405293941497803, "learning_rate": 1.6633010559518678e-05, "loss": 0.6193, "step": 3804 }, { "epoch": 1.4561806352851128, "grad_norm": 0.6135008931159973, "learning_rate": 1.6631155215486182e-05, "loss": 0.636, "step": 3805 }, { "epoch": 1.4565633371603521, "grad_norm": 0.5616983771324158, "learning_rate": 1.6629299463951388e-05, "loss": 0.7197, "step": 3806 }, { "epoch": 1.4569460390355913, "grad_norm": 0.5481765866279602, "learning_rate": 1.662744330502833e-05, "loss": 0.7215, "step": 3807 }, { "epoch": 1.4573287409108304, "grad_norm": 0.5759325623512268, "learning_rate": 1.662558673883108e-05, "loss": 0.6512, "step": 3808 }, { "epoch": 1.4577114427860698, "grad_norm": 0.5349867939949036, "learning_rate": 1.662372976547372e-05, "loss": 0.6591, "step": 3809 }, { "epoch": 1.4580941446613087, "grad_norm": 0.582432210445404, "learning_rate": 1.6621872385070374e-05, "loss": 0.6714, "step": 3810 }, { "epoch": 1.458476846536548, "grad_norm": 0.6000100374221802, "learning_rate": 1.662001459773518e-05, "loss": 0.6546, "step": 3811 }, { "epoch": 1.4588595484117872, "grad_norm": 0.5139645934104919, "learning_rate": 1.6618156403582305e-05, "loss": 0.7234, "step": 3812 }, { "epoch": 1.4592422502870264, "grad_norm": 0.5273248553276062, "learning_rate": 1.661629780272594e-05, "loss": 0.6211, "step": 3813 }, { "epoch": 1.4596249521622657, "grad_norm": 0.5015040040016174, "learning_rate": 1.66144387952803e-05, "loss": 0.6406, "step": 3814 }, { "epoch": 1.4600076540375047, "grad_norm": 0.5847607254981995, "learning_rate": 1.6612579381359624e-05, "loss": 0.6885, "step": 3815 }, { "epoch": 1.460390355912744, "grad_norm": 0.546302080154419, "learning_rate": 1.6610719561078184e-05, "loss": 0.673, "step": 3816 }, { "epoch": 1.4607730577879832, "grad_norm": 0.5084646344184875, "learning_rate": 1.6608859334550266e-05, "loss": 0.6023, "step": 3817 }, { "epoch": 1.4611557596632223, "grad_norm": 0.5518094301223755, "learning_rate": 1.6606998701890186e-05, "loss": 0.6798, "step": 3818 }, { "epoch": 1.4615384615384617, "grad_norm": 0.6360760927200317, "learning_rate": 1.660513766321229e-05, "loss": 0.6315, "step": 3819 }, { "epoch": 1.4619211634137006, "grad_norm": 0.5146000981330872, "learning_rate": 1.6603276218630932e-05, "loss": 0.6579, "step": 3820 }, { "epoch": 1.46230386528894, "grad_norm": 0.5366308093070984, "learning_rate": 1.6601414368260518e-05, "loss": 0.59, "step": 3821 }, { "epoch": 1.462686567164179, "grad_norm": 0.5551469922065735, "learning_rate": 1.6599552112215455e-05, "loss": 0.6946, "step": 3822 }, { "epoch": 1.4630692690394183, "grad_norm": 0.542209804058075, "learning_rate": 1.6597689450610183e-05, "loss": 0.6332, "step": 3823 }, { "epoch": 1.4634519709146576, "grad_norm": 0.5737065672874451, "learning_rate": 1.659582638355917e-05, "loss": 0.6514, "step": 3824 }, { "epoch": 1.4638346727898965, "grad_norm": 0.5406222343444824, "learning_rate": 1.6593962911176906e-05, "loss": 0.6873, "step": 3825 }, { "epoch": 1.464217374665136, "grad_norm": 0.5529142618179321, "learning_rate": 1.659209903357791e-05, "loss": 0.6265, "step": 3826 }, { "epoch": 1.464600076540375, "grad_norm": 0.547341525554657, "learning_rate": 1.6590234750876716e-05, "loss": 0.6843, "step": 3827 }, { "epoch": 1.4649827784156142, "grad_norm": 0.5644741654396057, "learning_rate": 1.6588370063187892e-05, "loss": 0.6994, "step": 3828 }, { "epoch": 1.4653654802908536, "grad_norm": 0.5035999417304993, "learning_rate": 1.6586504970626027e-05, "loss": 0.6544, "step": 3829 }, { "epoch": 1.4657481821660925, "grad_norm": 0.5363783836364746, "learning_rate": 1.6584639473305738e-05, "loss": 0.6723, "step": 3830 }, { "epoch": 1.4661308840413318, "grad_norm": 0.5562039613723755, "learning_rate": 1.6582773571341662e-05, "loss": 0.6442, "step": 3831 }, { "epoch": 1.466513585916571, "grad_norm": 0.49098891019821167, "learning_rate": 1.658090726484847e-05, "loss": 0.6754, "step": 3832 }, { "epoch": 1.4668962877918101, "grad_norm": 0.4911652207374573, "learning_rate": 1.6579040553940843e-05, "loss": 0.6834, "step": 3833 }, { "epoch": 1.4672789896670495, "grad_norm": 0.5225595235824585, "learning_rate": 1.6577173438733502e-05, "loss": 0.6338, "step": 3834 }, { "epoch": 1.4676616915422884, "grad_norm": 0.5229998826980591, "learning_rate": 1.6575305919341186e-05, "loss": 0.698, "step": 3835 }, { "epoch": 1.4680443934175278, "grad_norm": 0.5311759114265442, "learning_rate": 1.6573437995878653e-05, "loss": 0.6803, "step": 3836 }, { "epoch": 1.468427095292767, "grad_norm": 0.5260611772537231, "learning_rate": 1.6571569668460693e-05, "loss": 0.6393, "step": 3837 }, { "epoch": 1.468809797168006, "grad_norm": 0.5430733561515808, "learning_rate": 1.656970093720213e-05, "loss": 0.6617, "step": 3838 }, { "epoch": 1.4691924990432454, "grad_norm": 0.5895853638648987, "learning_rate": 1.6567831802217785e-05, "loss": 0.6901, "step": 3839 }, { "epoch": 1.4695752009184844, "grad_norm": 0.4877072274684906, "learning_rate": 1.6565962263622538e-05, "loss": 0.6058, "step": 3840 }, { "epoch": 1.4699579027937237, "grad_norm": 0.5364777445793152, "learning_rate": 1.6564092321531272e-05, "loss": 0.6243, "step": 3841 }, { "epoch": 1.4703406046689629, "grad_norm": 0.5244858264923096, "learning_rate": 1.6562221976058894e-05, "loss": 0.5662, "step": 3842 }, { "epoch": 1.470723306544202, "grad_norm": 0.5635048151016235, "learning_rate": 1.656035122732035e-05, "loss": 0.7164, "step": 3843 }, { "epoch": 1.4711060084194414, "grad_norm": 0.5775884985923767, "learning_rate": 1.6558480075430594e-05, "loss": 0.6755, "step": 3844 }, { "epoch": 1.4714887102946803, "grad_norm": 0.533913254737854, "learning_rate": 1.655660852050462e-05, "loss": 0.6418, "step": 3845 }, { "epoch": 1.4718714121699197, "grad_norm": 0.5233728885650635, "learning_rate": 1.6554736562657442e-05, "loss": 0.591, "step": 3846 }, { "epoch": 1.4722541140451588, "grad_norm": 0.5213394165039062, "learning_rate": 1.655286420200409e-05, "loss": 0.602, "step": 3847 }, { "epoch": 1.472636815920398, "grad_norm": 0.5711614489555359, "learning_rate": 1.6550991438659627e-05, "loss": 0.6799, "step": 3848 }, { "epoch": 1.4730195177956373, "grad_norm": 0.5841376781463623, "learning_rate": 1.654911827273914e-05, "loss": 0.622, "step": 3849 }, { "epoch": 1.4734022196708763, "grad_norm": 0.5366480946540833, "learning_rate": 1.654724470435775e-05, "loss": 0.6363, "step": 3850 }, { "epoch": 1.4737849215461156, "grad_norm": 0.49876490235328674, "learning_rate": 1.6545370733630574e-05, "loss": 0.5932, "step": 3851 }, { "epoch": 1.4741676234213548, "grad_norm": 0.5512809157371521, "learning_rate": 1.6543496360672786e-05, "loss": 0.6642, "step": 3852 }, { "epoch": 1.474550325296594, "grad_norm": 0.5624339580535889, "learning_rate": 1.6541621585599567e-05, "loss": 0.6707, "step": 3853 }, { "epoch": 1.4749330271718333, "grad_norm": 0.5874694585800171, "learning_rate": 1.6539746408526128e-05, "loss": 0.6608, "step": 3854 }, { "epoch": 1.4753157290470722, "grad_norm": 0.5756096839904785, "learning_rate": 1.6537870829567705e-05, "loss": 0.6495, "step": 3855 }, { "epoch": 1.4756984309223116, "grad_norm": 0.5200847387313843, "learning_rate": 1.6535994848839552e-05, "loss": 0.6983, "step": 3856 }, { "epoch": 1.4760811327975507, "grad_norm": 0.597507655620575, "learning_rate": 1.6534118466456955e-05, "loss": 0.6933, "step": 3857 }, { "epoch": 1.4764638346727899, "grad_norm": 0.5726972818374634, "learning_rate": 1.6532241682535232e-05, "loss": 0.671, "step": 3858 }, { "epoch": 1.4768465365480292, "grad_norm": 0.5360260605812073, "learning_rate": 1.6530364497189704e-05, "loss": 0.5865, "step": 3859 }, { "epoch": 1.4772292384232681, "grad_norm": 0.5496785640716553, "learning_rate": 1.6528486910535733e-05, "loss": 0.6945, "step": 3860 }, { "epoch": 1.4776119402985075, "grad_norm": 0.5067858099937439, "learning_rate": 1.6526608922688704e-05, "loss": 0.6718, "step": 3861 }, { "epoch": 1.4779946421737467, "grad_norm": 0.5103477239608765, "learning_rate": 1.6524730533764025e-05, "loss": 0.6677, "step": 3862 }, { "epoch": 1.4783773440489858, "grad_norm": 0.5302661657333374, "learning_rate": 1.6522851743877124e-05, "loss": 0.6496, "step": 3863 }, { "epoch": 1.4787600459242252, "grad_norm": 0.5592770576477051, "learning_rate": 1.652097255314346e-05, "loss": 0.6419, "step": 3864 }, { "epoch": 1.479142747799464, "grad_norm": 0.5422284603118896, "learning_rate": 1.6519092961678515e-05, "loss": 0.6689, "step": 3865 }, { "epoch": 1.4795254496747035, "grad_norm": 0.5915776491165161, "learning_rate": 1.6517212969597793e-05, "loss": 0.6604, "step": 3866 }, { "epoch": 1.4799081515499426, "grad_norm": 0.5152393579483032, "learning_rate": 1.6515332577016826e-05, "loss": 0.6557, "step": 3867 }, { "epoch": 1.4802908534251817, "grad_norm": 0.5361080765724182, "learning_rate": 1.6513451784051165e-05, "loss": 0.6745, "step": 3868 }, { "epoch": 1.480673555300421, "grad_norm": 0.5105699896812439, "learning_rate": 1.6511570590816398e-05, "loss": 0.6748, "step": 3869 }, { "epoch": 1.48105625717566, "grad_norm": 0.5078539252281189, "learning_rate": 1.6509688997428125e-05, "loss": 0.6869, "step": 3870 }, { "epoch": 1.4814389590508994, "grad_norm": 0.5590525269508362, "learning_rate": 1.6507807004001974e-05, "loss": 0.6492, "step": 3871 }, { "epoch": 1.4818216609261385, "grad_norm": 0.5193614363670349, "learning_rate": 1.65059246106536e-05, "loss": 0.7009, "step": 3872 }, { "epoch": 1.4822043628013777, "grad_norm": 0.49274933338165283, "learning_rate": 1.6504041817498676e-05, "loss": 0.6381, "step": 3873 }, { "epoch": 1.482587064676617, "grad_norm": 0.5461486577987671, "learning_rate": 1.6502158624652915e-05, "loss": 0.6785, "step": 3874 }, { "epoch": 1.482969766551856, "grad_norm": 0.6126235723495483, "learning_rate": 1.650027503223204e-05, "loss": 0.6256, "step": 3875 }, { "epoch": 1.4833524684270953, "grad_norm": 0.681445300579071, "learning_rate": 1.6498391040351796e-05, "loss": 0.6939, "step": 3876 }, { "epoch": 1.4837351703023345, "grad_norm": 0.49148550629615784, "learning_rate": 1.6496506649127968e-05, "loss": 0.6195, "step": 3877 }, { "epoch": 1.4841178721775736, "grad_norm": 0.5258722305297852, "learning_rate": 1.649462185867635e-05, "loss": 0.6109, "step": 3878 }, { "epoch": 1.484500574052813, "grad_norm": 0.5453488826751709, "learning_rate": 1.6492736669112773e-05, "loss": 0.6576, "step": 3879 }, { "epoch": 1.484883275928052, "grad_norm": 0.5476211905479431, "learning_rate": 1.6490851080553085e-05, "loss": 0.6942, "step": 3880 }, { "epoch": 1.4852659778032913, "grad_norm": 0.5315927267074585, "learning_rate": 1.648896509311316e-05, "loss": 0.6201, "step": 3881 }, { "epoch": 1.4856486796785304, "grad_norm": 0.5181908011436462, "learning_rate": 1.64870787069089e-05, "loss": 0.6567, "step": 3882 }, { "epoch": 1.4860313815537696, "grad_norm": 0.5764490962028503, "learning_rate": 1.6485191922056225e-05, "loss": 0.613, "step": 3883 }, { "epoch": 1.486414083429009, "grad_norm": 0.5670636296272278, "learning_rate": 1.6483304738671083e-05, "loss": 0.6513, "step": 3884 }, { "epoch": 1.4867967853042479, "grad_norm": 0.6490976810455322, "learning_rate": 1.648141715686945e-05, "loss": 0.625, "step": 3885 }, { "epoch": 1.4871794871794872, "grad_norm": 0.5418419241905212, "learning_rate": 1.6479529176767316e-05, "loss": 0.6598, "step": 3886 }, { "epoch": 1.4875621890547264, "grad_norm": 0.4982435405254364, "learning_rate": 1.647764079848071e-05, "loss": 0.6747, "step": 3887 }, { "epoch": 1.4879448909299655, "grad_norm": 0.5355984568595886, "learning_rate": 1.6475752022125674e-05, "loss": 0.6081, "step": 3888 }, { "epoch": 1.4883275928052049, "grad_norm": 0.5404819846153259, "learning_rate": 1.647386284781828e-05, "loss": 0.6212, "step": 3889 }, { "epoch": 1.4887102946804438, "grad_norm": 0.5268921256065369, "learning_rate": 1.6471973275674618e-05, "loss": 0.7178, "step": 3890 }, { "epoch": 1.4890929965556832, "grad_norm": 0.5154486298561096, "learning_rate": 1.6470083305810817e-05, "loss": 0.6243, "step": 3891 }, { "epoch": 1.4894756984309223, "grad_norm": 0.5406738519668579, "learning_rate": 1.646819293834301e-05, "loss": 0.6973, "step": 3892 }, { "epoch": 1.4898584003061615, "grad_norm": 0.5380112528800964, "learning_rate": 1.6466302173387374e-05, "loss": 0.6304, "step": 3893 }, { "epoch": 1.4902411021814008, "grad_norm": 0.48008155822753906, "learning_rate": 1.6464411011060097e-05, "loss": 0.5564, "step": 3894 }, { "epoch": 1.4906238040566397, "grad_norm": 0.5246208906173706, "learning_rate": 1.6462519451477396e-05, "loss": 0.6797, "step": 3895 }, { "epoch": 1.4910065059318791, "grad_norm": 0.5608070492744446, "learning_rate": 1.6460627494755512e-05, "loss": 0.6339, "step": 3896 }, { "epoch": 1.4913892078071183, "grad_norm": 0.532351016998291, "learning_rate": 1.6458735141010716e-05, "loss": 0.6906, "step": 3897 }, { "epoch": 1.4917719096823574, "grad_norm": 0.5811026096343994, "learning_rate": 1.645684239035929e-05, "loss": 0.7723, "step": 3898 }, { "epoch": 1.4921546115575968, "grad_norm": 0.5194271802902222, "learning_rate": 1.6454949242917555e-05, "loss": 0.7207, "step": 3899 }, { "epoch": 1.4925373134328357, "grad_norm": 0.5425914525985718, "learning_rate": 1.645305569880185e-05, "loss": 0.667, "step": 3900 }, { "epoch": 1.492920015308075, "grad_norm": 0.5169320106506348, "learning_rate": 1.645116175812853e-05, "loss": 0.6609, "step": 3901 }, { "epoch": 1.4933027171833142, "grad_norm": 0.5260623097419739, "learning_rate": 1.6449267421013994e-05, "loss": 0.6243, "step": 3902 }, { "epoch": 1.4936854190585533, "grad_norm": 0.5464056134223938, "learning_rate": 1.644737268757465e-05, "loss": 0.6593, "step": 3903 }, { "epoch": 1.4940681209337927, "grad_norm": 0.558152973651886, "learning_rate": 1.6445477557926932e-05, "loss": 0.6392, "step": 3904 }, { "epoch": 1.4944508228090316, "grad_norm": 0.5374472737312317, "learning_rate": 1.64435820321873e-05, "loss": 0.6352, "step": 3905 }, { "epoch": 1.494833524684271, "grad_norm": 0.5460878610610962, "learning_rate": 1.6441686110472243e-05, "loss": 0.6827, "step": 3906 }, { "epoch": 1.4952162265595101, "grad_norm": 0.5261766314506531, "learning_rate": 1.643978979289827e-05, "loss": 0.5797, "step": 3907 }, { "epoch": 1.4955989284347493, "grad_norm": 0.5271217823028564, "learning_rate": 1.6437893079581915e-05, "loss": 0.6403, "step": 3908 }, { "epoch": 1.4959816303099887, "grad_norm": 0.5463591814041138, "learning_rate": 1.643599597063973e-05, "loss": 0.6409, "step": 3909 }, { "epoch": 1.4963643321852276, "grad_norm": 0.5505545139312744, "learning_rate": 1.6434098466188307e-05, "loss": 0.6498, "step": 3910 }, { "epoch": 1.496747034060467, "grad_norm": 0.5520131587982178, "learning_rate": 1.6432200566344245e-05, "loss": 0.6934, "step": 3911 }, { "epoch": 1.497129735935706, "grad_norm": 0.5245305895805359, "learning_rate": 1.6430302271224178e-05, "loss": 0.5756, "step": 3912 }, { "epoch": 1.4975124378109452, "grad_norm": 0.5917728543281555, "learning_rate": 1.6428403580944765e-05, "loss": 0.7075, "step": 3913 }, { "epoch": 1.4978951396861846, "grad_norm": 0.5597856044769287, "learning_rate": 1.642650449562268e-05, "loss": 0.6329, "step": 3914 }, { "epoch": 1.4982778415614235, "grad_norm": 0.5140689611434937, "learning_rate": 1.642460501537463e-05, "loss": 0.6134, "step": 3915 }, { "epoch": 1.4986605434366629, "grad_norm": 0.5458993315696716, "learning_rate": 1.642270514031734e-05, "loss": 0.641, "step": 3916 }, { "epoch": 1.499043245311902, "grad_norm": 0.5211390852928162, "learning_rate": 1.642080487056757e-05, "loss": 0.6494, "step": 3917 }, { "epoch": 1.4994259471871412, "grad_norm": 0.5356349349021912, "learning_rate": 1.6418904206242084e-05, "loss": 0.7272, "step": 3918 }, { "epoch": 1.4998086490623805, "grad_norm": 0.5566579103469849, "learning_rate": 1.6417003147457693e-05, "loss": 0.7028, "step": 3919 }, { "epoch": 1.5001913509376195, "grad_norm": 0.5271283388137817, "learning_rate": 1.641510169433122e-05, "loss": 0.7056, "step": 3920 }, { "epoch": 1.5005740528128588, "grad_norm": 0.6114487648010254, "learning_rate": 1.6413199846979514e-05, "loss": 0.7047, "step": 3921 }, { "epoch": 1.500956754688098, "grad_norm": 0.5146735310554504, "learning_rate": 1.641129760551945e-05, "loss": 0.5719, "step": 3922 }, { "epoch": 1.5013394565633371, "grad_norm": 0.5254828929901123, "learning_rate": 1.640939497006792e-05, "loss": 0.684, "step": 3923 }, { "epoch": 1.5017221584385765, "grad_norm": 0.569430947303772, "learning_rate": 1.6407491940741853e-05, "loss": 0.6666, "step": 3924 }, { "epoch": 1.5021048603138154, "grad_norm": 0.541837751865387, "learning_rate": 1.6405588517658194e-05, "loss": 0.6125, "step": 3925 }, { "epoch": 1.5024875621890548, "grad_norm": 0.5279228687286377, "learning_rate": 1.640368470093391e-05, "loss": 0.641, "step": 3926 }, { "epoch": 1.502870264064294, "grad_norm": 0.5111411213874817, "learning_rate": 1.6401780490685998e-05, "loss": 0.6379, "step": 3927 }, { "epoch": 1.503252965939533, "grad_norm": 0.5383403897285461, "learning_rate": 1.6399875887031476e-05, "loss": 0.6373, "step": 3928 }, { "epoch": 1.5036356678147724, "grad_norm": 0.5616130828857422, "learning_rate": 1.639797089008739e-05, "loss": 0.7486, "step": 3929 }, { "epoch": 1.5040183696900113, "grad_norm": 0.5372121334075928, "learning_rate": 1.63960654999708e-05, "loss": 0.6653, "step": 3930 }, { "epoch": 1.5044010715652507, "grad_norm": 0.5121235251426697, "learning_rate": 1.6394159716798807e-05, "loss": 0.6125, "step": 3931 }, { "epoch": 1.5047837734404899, "grad_norm": 0.5483638644218445, "learning_rate": 1.639225354068852e-05, "loss": 0.6101, "step": 3932 }, { "epoch": 1.505166475315729, "grad_norm": 0.5589113235473633, "learning_rate": 1.639034697175708e-05, "loss": 0.6978, "step": 3933 }, { "epoch": 1.5055491771909684, "grad_norm": 0.5078133940696716, "learning_rate": 1.638844001012165e-05, "loss": 0.6596, "step": 3934 }, { "epoch": 1.5059318790662073, "grad_norm": 0.5384703874588013, "learning_rate": 1.6386532655899418e-05, "loss": 0.6121, "step": 3935 }, { "epoch": 1.5063145809414467, "grad_norm": 0.8371486067771912, "learning_rate": 1.63846249092076e-05, "loss": 0.6512, "step": 3936 }, { "epoch": 1.5066972828166858, "grad_norm": 0.596724808216095, "learning_rate": 1.638271677016343e-05, "loss": 0.6849, "step": 3937 }, { "epoch": 1.507079984691925, "grad_norm": 0.5632964372634888, "learning_rate": 1.638080823888416e-05, "loss": 0.6727, "step": 3938 }, { "epoch": 1.5074626865671643, "grad_norm": 0.5412147045135498, "learning_rate": 1.6378899315487088e-05, "loss": 0.621, "step": 3939 }, { "epoch": 1.5078453884424032, "grad_norm": 0.471047580242157, "learning_rate": 1.6376990000089513e-05, "loss": 0.6298, "step": 3940 }, { "epoch": 1.5082280903176426, "grad_norm": 0.5961466431617737, "learning_rate": 1.6375080292808774e-05, "loss": 0.6659, "step": 3941 }, { "epoch": 1.5086107921928817, "grad_norm": 0.5106171369552612, "learning_rate": 1.6373170193762225e-05, "loss": 0.6352, "step": 3942 }, { "epoch": 1.5089934940681209, "grad_norm": 0.512830913066864, "learning_rate": 1.637125970306724e-05, "loss": 0.5928, "step": 3943 }, { "epoch": 1.5093761959433603, "grad_norm": 0.59549880027771, "learning_rate": 1.636934882084124e-05, "loss": 0.6329, "step": 3944 }, { "epoch": 1.5097588978185992, "grad_norm": 0.5417449474334717, "learning_rate": 1.6367437547201634e-05, "loss": 0.6373, "step": 3945 }, { "epoch": 1.5101415996938385, "grad_norm": 0.5312244892120361, "learning_rate": 1.636552588226589e-05, "loss": 0.5665, "step": 3946 }, { "epoch": 1.5105243015690777, "grad_norm": 0.5098130106925964, "learning_rate": 1.6363613826151477e-05, "loss": 0.6654, "step": 3947 }, { "epoch": 1.5109070034443168, "grad_norm": 0.5271446704864502, "learning_rate": 1.63617013789759e-05, "loss": 0.6543, "step": 3948 }, { "epoch": 1.5112897053195562, "grad_norm": 0.5462010502815247, "learning_rate": 1.6359788540856682e-05, "loss": 0.6875, "step": 3949 }, { "epoch": 1.5116724071947951, "grad_norm": 0.5188846588134766, "learning_rate": 1.6357875311911375e-05, "loss": 0.6812, "step": 3950 }, { "epoch": 1.5120551090700345, "grad_norm": 0.5775516033172607, "learning_rate": 1.6355961692257545e-05, "loss": 0.6276, "step": 3951 }, { "epoch": 1.5124378109452736, "grad_norm": 0.6105210781097412, "learning_rate": 1.63540476820128e-05, "loss": 0.5913, "step": 3952 }, { "epoch": 1.5128205128205128, "grad_norm": 0.6184600591659546, "learning_rate": 1.635213328129475e-05, "loss": 0.6375, "step": 3953 }, { "epoch": 1.5132032146957521, "grad_norm": 0.5597871541976929, "learning_rate": 1.6350218490221047e-05, "loss": 0.6978, "step": 3954 }, { "epoch": 1.513585916570991, "grad_norm": 0.4929431676864624, "learning_rate": 1.6348303308909357e-05, "loss": 0.7199, "step": 3955 }, { "epoch": 1.5139686184462304, "grad_norm": 0.5163165926933289, "learning_rate": 1.6346387737477375e-05, "loss": 0.6361, "step": 3956 }, { "epoch": 1.5143513203214696, "grad_norm": 0.6633158326148987, "learning_rate": 1.6344471776042814e-05, "loss": 0.7033, "step": 3957 }, { "epoch": 1.5147340221967087, "grad_norm": 0.5321021676063538, "learning_rate": 1.634255542472342e-05, "loss": 0.6719, "step": 3958 }, { "epoch": 1.515116724071948, "grad_norm": 0.5214015245437622, "learning_rate": 1.6340638683636956e-05, "loss": 0.6334, "step": 3959 }, { "epoch": 1.515499425947187, "grad_norm": 0.5319516062736511, "learning_rate": 1.633872155290121e-05, "loss": 0.6557, "step": 3960 }, { "epoch": 1.5158821278224264, "grad_norm": 0.5323466062545776, "learning_rate": 1.6336804032633998e-05, "loss": 0.7308, "step": 3961 }, { "epoch": 1.5162648296976655, "grad_norm": 0.5896013379096985, "learning_rate": 1.6334886122953155e-05, "loss": 0.6393, "step": 3962 }, { "epoch": 1.5166475315729047, "grad_norm": 0.5210293531417847, "learning_rate": 1.633296782397654e-05, "loss": 0.6471, "step": 3963 }, { "epoch": 1.517030233448144, "grad_norm": 0.49539947509765625, "learning_rate": 1.6331049135822035e-05, "loss": 0.623, "step": 3964 }, { "epoch": 1.517412935323383, "grad_norm": 0.520236611366272, "learning_rate": 1.632913005860756e-05, "loss": 0.7754, "step": 3965 }, { "epoch": 1.5177956371986223, "grad_norm": 0.5878621935844421, "learning_rate": 1.632721059245103e-05, "loss": 0.6899, "step": 3966 }, { "epoch": 1.5181783390738615, "grad_norm": 0.5308201313018799, "learning_rate": 1.632529073747042e-05, "loss": 0.6568, "step": 3967 }, { "epoch": 1.5185610409491006, "grad_norm": 0.6138604283332825, "learning_rate": 1.6323370493783696e-05, "loss": 0.7424, "step": 3968 }, { "epoch": 1.51894374282434, "grad_norm": 0.5120649933815002, "learning_rate": 1.632144986150887e-05, "loss": 0.6651, "step": 3969 }, { "epoch": 1.519326444699579, "grad_norm": 0.5662031769752502, "learning_rate": 1.6319528840763967e-05, "loss": 0.6277, "step": 3970 }, { "epoch": 1.5197091465748183, "grad_norm": 0.5947843194007874, "learning_rate": 1.6317607431667038e-05, "loss": 0.683, "step": 3971 }, { "epoch": 1.5200918484500574, "grad_norm": 0.5630815029144287, "learning_rate": 1.631568563433616e-05, "loss": 0.6983, "step": 3972 }, { "epoch": 1.5204745503252965, "grad_norm": 0.5367321968078613, "learning_rate": 1.6313763448889435e-05, "loss": 0.6437, "step": 3973 }, { "epoch": 1.520857252200536, "grad_norm": 0.524355411529541, "learning_rate": 1.631184087544498e-05, "loss": 0.6892, "step": 3974 }, { "epoch": 1.5212399540757748, "grad_norm": 0.5595932602882385, "learning_rate": 1.6309917914120953e-05, "loss": 0.6109, "step": 3975 }, { "epoch": 1.5216226559510142, "grad_norm": 0.5210201144218445, "learning_rate": 1.6307994565035517e-05, "loss": 0.6904, "step": 3976 }, { "epoch": 1.5220053578262533, "grad_norm": 0.5352857708930969, "learning_rate": 1.6306070828306862e-05, "loss": 0.678, "step": 3977 }, { "epoch": 1.5223880597014925, "grad_norm": 0.5470892190933228, "learning_rate": 1.630414670405322e-05, "loss": 0.7023, "step": 3978 }, { "epoch": 1.5227707615767319, "grad_norm": 0.5054042339324951, "learning_rate": 1.6302222192392825e-05, "loss": 0.6692, "step": 3979 }, { "epoch": 1.5231534634519708, "grad_norm": 0.5587280988693237, "learning_rate": 1.630029729344395e-05, "loss": 0.6988, "step": 3980 }, { "epoch": 1.5235361653272101, "grad_norm": 0.5920350551605225, "learning_rate": 1.6298372007324873e-05, "loss": 0.619, "step": 3981 }, { "epoch": 1.5239188672024493, "grad_norm": 0.5533884167671204, "learning_rate": 1.629644633415392e-05, "loss": 0.7527, "step": 3982 }, { "epoch": 1.5243015690776884, "grad_norm": 0.5591922402381897, "learning_rate": 1.6294520274049422e-05, "loss": 0.6128, "step": 3983 }, { "epoch": 1.5246842709529278, "grad_norm": 0.4864802658557892, "learning_rate": 1.6292593827129745e-05, "loss": 0.6765, "step": 3984 }, { "epoch": 1.5250669728281667, "grad_norm": 0.5369486808776855, "learning_rate": 1.6290666993513272e-05, "loss": 0.7213, "step": 3985 }, { "epoch": 1.525449674703406, "grad_norm": 0.5456950664520264, "learning_rate": 1.6288739773318413e-05, "loss": 0.6156, "step": 3986 }, { "epoch": 1.5258323765786452, "grad_norm": 0.539135754108429, "learning_rate": 1.62868121666636e-05, "loss": 0.6688, "step": 3987 }, { "epoch": 1.5262150784538844, "grad_norm": 0.5680748224258423, "learning_rate": 1.6284884173667287e-05, "loss": 0.6917, "step": 3988 }, { "epoch": 1.5265977803291237, "grad_norm": 0.5862008929252625, "learning_rate": 1.628295579444796e-05, "loss": 0.6903, "step": 3989 }, { "epoch": 1.5269804822043627, "grad_norm": 0.5248652696609497, "learning_rate": 1.628102702912412e-05, "loss": 0.6503, "step": 3990 }, { "epoch": 1.527363184079602, "grad_norm": 0.510648250579834, "learning_rate": 1.6279097877814294e-05, "loss": 0.6845, "step": 3991 }, { "epoch": 1.5277458859548412, "grad_norm": 0.5247638821601868, "learning_rate": 1.6277168340637034e-05, "loss": 0.656, "step": 3992 }, { "epoch": 1.5281285878300803, "grad_norm": 0.49587175250053406, "learning_rate": 1.627523841771092e-05, "loss": 0.5604, "step": 3993 }, { "epoch": 1.5285112897053197, "grad_norm": 0.6100490093231201, "learning_rate": 1.6273308109154545e-05, "loss": 0.6468, "step": 3994 }, { "epoch": 1.5288939915805586, "grad_norm": 0.5963635444641113, "learning_rate": 1.6271377415086533e-05, "loss": 0.6116, "step": 3995 }, { "epoch": 1.529276693455798, "grad_norm": 0.5232284665107727, "learning_rate": 1.6269446335625533e-05, "loss": 0.6794, "step": 3996 }, { "epoch": 1.5296593953310371, "grad_norm": 0.593130886554718, "learning_rate": 1.6267514870890208e-05, "loss": 0.6493, "step": 3997 }, { "epoch": 1.5300420972062763, "grad_norm": 0.5639086961746216, "learning_rate": 1.6265583020999257e-05, "loss": 0.683, "step": 3998 }, { "epoch": 1.5304247990815156, "grad_norm": 0.5000501275062561, "learning_rate": 1.6263650786071405e-05, "loss": 0.6175, "step": 3999 }, { "epoch": 1.5308075009567546, "grad_norm": 0.6222371459007263, "learning_rate": 1.6261718166225374e-05, "loss": 0.6728, "step": 4000 }, { "epoch": 1.531190202831994, "grad_norm": 0.5593092441558838, "learning_rate": 1.6259785161579946e-05, "loss": 0.6377, "step": 4001 }, { "epoch": 1.531572904707233, "grad_norm": 0.5490189790725708, "learning_rate": 1.6257851772253906e-05, "loss": 0.6263, "step": 4002 }, { "epoch": 1.5319556065824722, "grad_norm": 0.5886465907096863, "learning_rate": 1.6255917998366054e-05, "loss": 0.6636, "step": 4003 }, { "epoch": 1.5323383084577116, "grad_norm": 0.5572566986083984, "learning_rate": 1.6253983840035243e-05, "loss": 0.6845, "step": 4004 }, { "epoch": 1.5327210103329505, "grad_norm": 0.5370848178863525, "learning_rate": 1.6252049297380315e-05, "loss": 0.6943, "step": 4005 }, { "epoch": 1.5331037122081899, "grad_norm": 0.5057812333106995, "learning_rate": 1.625011437052017e-05, "loss": 0.7222, "step": 4006 }, { "epoch": 1.533486414083429, "grad_norm": 0.48455360531806946, "learning_rate": 1.6248179059573705e-05, "loss": 0.6401, "step": 4007 }, { "epoch": 1.5338691159586682, "grad_norm": 0.5252039432525635, "learning_rate": 1.6246243364659845e-05, "loss": 0.7079, "step": 4008 }, { "epoch": 1.5342518178339075, "grad_norm": 0.5457437634468079, "learning_rate": 1.6244307285897556e-05, "loss": 0.677, "step": 4009 }, { "epoch": 1.5346345197091464, "grad_norm": 0.48895302414894104, "learning_rate": 1.6242370823405807e-05, "loss": 0.6502, "step": 4010 }, { "epoch": 1.5350172215843858, "grad_norm": 0.5887897610664368, "learning_rate": 1.62404339773036e-05, "loss": 0.6585, "step": 4011 }, { "epoch": 1.535399923459625, "grad_norm": 0.5325651168823242, "learning_rate": 1.6238496747709958e-05, "loss": 0.6893, "step": 4012 }, { "epoch": 1.535782625334864, "grad_norm": 0.5790703296661377, "learning_rate": 1.6236559134743935e-05, "loss": 0.6515, "step": 4013 }, { "epoch": 1.5361653272101035, "grad_norm": 0.514498233795166, "learning_rate": 1.62346211385246e-05, "loss": 0.6806, "step": 4014 }, { "epoch": 1.5365480290853424, "grad_norm": 0.555513858795166, "learning_rate": 1.6232682759171045e-05, "loss": 0.6363, "step": 4015 }, { "epoch": 1.5369307309605817, "grad_norm": 0.5191698670387268, "learning_rate": 1.6230743996802387e-05, "loss": 0.6757, "step": 4016 }, { "epoch": 1.537313432835821, "grad_norm": 0.6493743062019348, "learning_rate": 1.6228804851537777e-05, "loss": 0.6599, "step": 4017 }, { "epoch": 1.53769613471106, "grad_norm": 0.5028771162033081, "learning_rate": 1.6226865323496373e-05, "loss": 0.7361, "step": 4018 }, { "epoch": 1.5380788365862994, "grad_norm": 0.5321173071861267, "learning_rate": 1.6224925412797366e-05, "loss": 0.6908, "step": 4019 }, { "epoch": 1.5384615384615383, "grad_norm": 0.5454524755477905, "learning_rate": 1.6222985119559967e-05, "loss": 0.71, "step": 4020 }, { "epoch": 1.5388442403367777, "grad_norm": 0.5162585973739624, "learning_rate": 1.6221044443903418e-05, "loss": 0.6653, "step": 4021 }, { "epoch": 1.5392269422120168, "grad_norm": 0.499148964881897, "learning_rate": 1.6219103385946975e-05, "loss": 0.7344, "step": 4022 }, { "epoch": 1.539609644087256, "grad_norm": 0.5741094350814819, "learning_rate": 1.621716194580992e-05, "loss": 0.6912, "step": 4023 }, { "epoch": 1.5399923459624953, "grad_norm": 0.5145421624183655, "learning_rate": 1.621522012361156e-05, "loss": 0.6269, "step": 4024 }, { "epoch": 1.5403750478377343, "grad_norm": 0.5165121555328369, "learning_rate": 1.621327791947123e-05, "loss": 0.5693, "step": 4025 }, { "epoch": 1.5407577497129736, "grad_norm": 0.5087985992431641, "learning_rate": 1.6211335333508277e-05, "loss": 0.6502, "step": 4026 }, { "epoch": 1.5411404515882128, "grad_norm": 0.4853403866291046, "learning_rate": 1.620939236584208e-05, "loss": 0.621, "step": 4027 }, { "epoch": 1.541523153463452, "grad_norm": 0.6030787229537964, "learning_rate": 1.6207449016592045e-05, "loss": 0.7542, "step": 4028 }, { "epoch": 1.5419058553386913, "grad_norm": 0.552666962146759, "learning_rate": 1.6205505285877587e-05, "loss": 0.5764, "step": 4029 }, { "epoch": 1.5422885572139302, "grad_norm": 0.4865206182003021, "learning_rate": 1.6203561173818163e-05, "loss": 0.6419, "step": 4030 }, { "epoch": 1.5426712590891696, "grad_norm": 0.5056977272033691, "learning_rate": 1.6201616680533234e-05, "loss": 0.6782, "step": 4031 }, { "epoch": 1.5430539609644087, "grad_norm": 0.4815840423107147, "learning_rate": 1.6199671806142304e-05, "loss": 0.5658, "step": 4032 }, { "epoch": 1.5434366628396479, "grad_norm": 0.5309726595878601, "learning_rate": 1.619772655076488e-05, "loss": 0.6948, "step": 4033 }, { "epoch": 1.5438193647148872, "grad_norm": 0.4551936089992523, "learning_rate": 1.6195780914520514e-05, "loss": 0.7288, "step": 4034 }, { "epoch": 1.5442020665901262, "grad_norm": 0.5422321557998657, "learning_rate": 1.6193834897528766e-05, "loss": 0.6519, "step": 4035 }, { "epoch": 1.5445847684653655, "grad_norm": 0.558538019657135, "learning_rate": 1.619188849990922e-05, "loss": 0.735, "step": 4036 }, { "epoch": 1.5449674703406047, "grad_norm": 0.5521723628044128, "learning_rate": 1.6189941721781495e-05, "loss": 0.7084, "step": 4037 }, { "epoch": 1.5453501722158438, "grad_norm": 0.7013563513755798, "learning_rate": 1.6187994563265223e-05, "loss": 0.6498, "step": 4038 }, { "epoch": 1.5457328740910832, "grad_norm": 0.5214457511901855, "learning_rate": 1.6186047024480058e-05, "loss": 0.624, "step": 4039 }, { "epoch": 1.546115575966322, "grad_norm": 0.5196923613548279, "learning_rate": 1.6184099105545686e-05, "loss": 0.6472, "step": 4040 }, { "epoch": 1.5464982778415615, "grad_norm": 0.5420656204223633, "learning_rate": 1.618215080658181e-05, "loss": 0.6359, "step": 4041 }, { "epoch": 1.5468809797168006, "grad_norm": 0.5084579586982727, "learning_rate": 1.6180202127708157e-05, "loss": 0.6289, "step": 4042 }, { "epoch": 1.5472636815920398, "grad_norm": 0.5086448192596436, "learning_rate": 1.6178253069044487e-05, "loss": 0.6187, "step": 4043 }, { "epoch": 1.5476463834672791, "grad_norm": 0.5092979073524475, "learning_rate": 1.6176303630710563e-05, "loss": 0.7151, "step": 4044 }, { "epoch": 1.548029085342518, "grad_norm": 0.5681091547012329, "learning_rate": 1.6174353812826186e-05, "loss": 0.6338, "step": 4045 }, { "epoch": 1.5484117872177574, "grad_norm": 0.5149807929992676, "learning_rate": 1.617240361551118e-05, "loss": 0.6091, "step": 4046 }, { "epoch": 1.5487944890929966, "grad_norm": 0.5271354913711548, "learning_rate": 1.6170453038885394e-05, "loss": 0.6894, "step": 4047 }, { "epoch": 1.5491771909682357, "grad_norm": 0.5587602257728577, "learning_rate": 1.6168502083068692e-05, "loss": 0.5936, "step": 4048 }, { "epoch": 1.549559892843475, "grad_norm": 0.5067407488822937, "learning_rate": 1.6166550748180962e-05, "loss": 0.7066, "step": 4049 }, { "epoch": 1.549942594718714, "grad_norm": 0.5447636842727661, "learning_rate": 1.6164599034342122e-05, "loss": 0.6182, "step": 4050 }, { "epoch": 1.5503252965939534, "grad_norm": 0.5179388523101807, "learning_rate": 1.6162646941672114e-05, "loss": 0.6997, "step": 4051 }, { "epoch": 1.5507079984691925, "grad_norm": 0.5071699619293213, "learning_rate": 1.6160694470290893e-05, "loss": 0.5594, "step": 4052 }, { "epoch": 1.5510907003444316, "grad_norm": 0.5415440201759338, "learning_rate": 1.6158741620318447e-05, "loss": 0.6926, "step": 4053 }, { "epoch": 1.551473402219671, "grad_norm": 0.5444996356964111, "learning_rate": 1.6156788391874783e-05, "loss": 0.7026, "step": 4054 }, { "epoch": 1.55185610409491, "grad_norm": 0.5297889709472656, "learning_rate": 1.615483478507993e-05, "loss": 0.7112, "step": 4055 }, { "epoch": 1.5522388059701493, "grad_norm": 0.5308390855789185, "learning_rate": 1.6152880800053946e-05, "loss": 0.6757, "step": 4056 }, { "epoch": 1.5526215078453884, "grad_norm": 0.5329200625419617, "learning_rate": 1.6150926436916907e-05, "loss": 0.6954, "step": 4057 }, { "epoch": 1.5530042097206276, "grad_norm": 0.511081874370575, "learning_rate": 1.6148971695788914e-05, "loss": 0.5963, "step": 4058 }, { "epoch": 1.553386911595867, "grad_norm": 0.5728087425231934, "learning_rate": 1.6147016576790094e-05, "loss": 0.6493, "step": 4059 }, { "epoch": 1.5537696134711059, "grad_norm": 0.530965268611908, "learning_rate": 1.6145061080040587e-05, "loss": 0.533, "step": 4060 }, { "epoch": 1.5541523153463452, "grad_norm": 0.5502596497535706, "learning_rate": 1.614310520566057e-05, "loss": 0.6701, "step": 4061 }, { "epoch": 1.5545350172215844, "grad_norm": 0.5653761625289917, "learning_rate": 1.6141148953770237e-05, "loss": 0.5992, "step": 4062 }, { "epoch": 1.5549177190968235, "grad_norm": 0.5248960852622986, "learning_rate": 1.61391923244898e-05, "loss": 0.6201, "step": 4063 }, { "epoch": 1.555300420972063, "grad_norm": 0.5533770322799683, "learning_rate": 1.6137235317939503e-05, "loss": 0.6226, "step": 4064 }, { "epoch": 1.5556831228473018, "grad_norm": 0.5451905727386475, "learning_rate": 1.6135277934239605e-05, "loss": 0.7044, "step": 4065 }, { "epoch": 1.5560658247225412, "grad_norm": 0.5196478962898254, "learning_rate": 1.6133320173510398e-05, "loss": 0.5861, "step": 4066 }, { "epoch": 1.5564485265977803, "grad_norm": 0.5556396842002869, "learning_rate": 1.613136203587219e-05, "loss": 0.6185, "step": 4067 }, { "epoch": 1.5568312284730195, "grad_norm": 0.5416322350502014, "learning_rate": 1.612940352144531e-05, "loss": 0.6686, "step": 4068 }, { "epoch": 1.5572139303482588, "grad_norm": 0.5521867275238037, "learning_rate": 1.612744463035012e-05, "loss": 0.6528, "step": 4069 }, { "epoch": 1.5575966322234978, "grad_norm": 0.5882201194763184, "learning_rate": 1.6125485362706995e-05, "loss": 0.6311, "step": 4070 }, { "epoch": 1.5579793340987371, "grad_norm": 0.536458432674408, "learning_rate": 1.612352571863634e-05, "loss": 0.5704, "step": 4071 }, { "epoch": 1.5583620359739763, "grad_norm": 0.5393508672714233, "learning_rate": 1.612156569825858e-05, "loss": 0.6634, "step": 4072 }, { "epoch": 1.5587447378492154, "grad_norm": 0.4773359000682831, "learning_rate": 1.6119605301694156e-05, "loss": 0.6984, "step": 4073 }, { "epoch": 1.5591274397244548, "grad_norm": 0.5180994272232056, "learning_rate": 1.6117644529063552e-05, "loss": 0.6229, "step": 4074 }, { "epoch": 1.5595101415996937, "grad_norm": 0.5259363651275635, "learning_rate": 1.6115683380487253e-05, "loss": 0.62, "step": 4075 }, { "epoch": 1.559892843474933, "grad_norm": 0.5111142992973328, "learning_rate": 1.6113721856085783e-05, "loss": 0.6644, "step": 4076 }, { "epoch": 1.5602755453501722, "grad_norm": 0.5209270715713501, "learning_rate": 1.611175995597968e-05, "loss": 0.7018, "step": 4077 }, { "epoch": 1.5606582472254114, "grad_norm": 0.5356357097625732, "learning_rate": 1.610979768028951e-05, "loss": 0.7176, "step": 4078 }, { "epoch": 1.5610409491006507, "grad_norm": 0.4991685450077057, "learning_rate": 1.6107835029135854e-05, "loss": 0.6114, "step": 4079 }, { "epoch": 1.5614236509758896, "grad_norm": 0.5254079699516296, "learning_rate": 1.610587200263933e-05, "loss": 0.633, "step": 4080 }, { "epoch": 1.561806352851129, "grad_norm": 0.592283308506012, "learning_rate": 1.6103908600920567e-05, "loss": 0.6733, "step": 4081 }, { "epoch": 1.5621890547263682, "grad_norm": 0.5079001188278198, "learning_rate": 1.610194482410022e-05, "loss": 0.6722, "step": 4082 }, { "epoch": 1.5625717566016073, "grad_norm": 0.5029669404029846, "learning_rate": 1.6099980672298975e-05, "loss": 0.6661, "step": 4083 }, { "epoch": 1.5629544584768467, "grad_norm": 0.5287566781044006, "learning_rate": 1.6098016145637528e-05, "loss": 0.6718, "step": 4084 }, { "epoch": 1.5633371603520856, "grad_norm": 0.5231010913848877, "learning_rate": 1.609605124423661e-05, "loss": 0.5985, "step": 4085 }, { "epoch": 1.563719862227325, "grad_norm": 0.5145679712295532, "learning_rate": 1.6094085968216963e-05, "loss": 0.6782, "step": 4086 }, { "epoch": 1.564102564102564, "grad_norm": 0.5014981031417847, "learning_rate": 1.6092120317699364e-05, "loss": 0.6594, "step": 4087 }, { "epoch": 1.5644852659778032, "grad_norm": 0.6056356430053711, "learning_rate": 1.6090154292804604e-05, "loss": 0.7003, "step": 4088 }, { "epoch": 1.5648679678530426, "grad_norm": 0.633143961429596, "learning_rate": 1.60881878936535e-05, "loss": 0.6389, "step": 4089 }, { "epoch": 1.5652506697282815, "grad_norm": 0.5218345522880554, "learning_rate": 1.6086221120366895e-05, "loss": 0.6311, "step": 4090 }, { "epoch": 1.565633371603521, "grad_norm": 0.539315402507782, "learning_rate": 1.608425397306565e-05, "loss": 0.6666, "step": 4091 }, { "epoch": 1.56601607347876, "grad_norm": 0.5079124569892883, "learning_rate": 1.6082286451870656e-05, "loss": 0.6339, "step": 4092 }, { "epoch": 1.5663987753539992, "grad_norm": 0.5412386059761047, "learning_rate": 1.6080318556902816e-05, "loss": 0.6311, "step": 4093 }, { "epoch": 1.5667814772292386, "grad_norm": 0.6470586657524109, "learning_rate": 1.6078350288283068e-05, "loss": 0.6984, "step": 4094 }, { "epoch": 1.5671641791044775, "grad_norm": 0.6310622096061707, "learning_rate": 1.6076381646132367e-05, "loss": 0.6604, "step": 4095 }, { "epoch": 1.5675468809797168, "grad_norm": 0.5399909019470215, "learning_rate": 1.6074412630571685e-05, "loss": 0.5465, "step": 4096 }, { "epoch": 1.567929582854956, "grad_norm": 0.5618472695350647, "learning_rate": 1.607244324172203e-05, "loss": 0.6308, "step": 4097 }, { "epoch": 1.5683122847301951, "grad_norm": 0.5681276321411133, "learning_rate": 1.607047347970443e-05, "loss": 0.7091, "step": 4098 }, { "epoch": 1.5686949866054345, "grad_norm": 0.5881242156028748, "learning_rate": 1.6068503344639917e-05, "loss": 0.658, "step": 4099 }, { "epoch": 1.5690776884806734, "grad_norm": 0.5858713984489441, "learning_rate": 1.6066532836649577e-05, "loss": 0.7003, "step": 4100 }, { "epoch": 1.5694603903559128, "grad_norm": 0.5156630277633667, "learning_rate": 1.6064561955854497e-05, "loss": 0.6808, "step": 4101 }, { "epoch": 1.569843092231152, "grad_norm": 0.5646254420280457, "learning_rate": 1.606259070237579e-05, "loss": 0.7174, "step": 4102 }, { "epoch": 1.570225794106391, "grad_norm": 0.6386151909828186, "learning_rate": 1.6060619076334594e-05, "loss": 0.5728, "step": 4103 }, { "epoch": 1.5706084959816304, "grad_norm": 0.49307602643966675, "learning_rate": 1.605864707785208e-05, "loss": 0.6027, "step": 4104 }, { "epoch": 1.5709911978568694, "grad_norm": 0.5384640693664551, "learning_rate": 1.6056674707049423e-05, "loss": 0.6924, "step": 4105 }, { "epoch": 1.5713738997321087, "grad_norm": 0.584233283996582, "learning_rate": 1.6054701964047835e-05, "loss": 0.6505, "step": 4106 }, { "epoch": 1.5717566016073479, "grad_norm": 0.7192216515541077, "learning_rate": 1.6052728848968542e-05, "loss": 0.6204, "step": 4107 }, { "epoch": 1.572139303482587, "grad_norm": 0.6260219216346741, "learning_rate": 1.6050755361932807e-05, "loss": 0.5856, "step": 4108 }, { "epoch": 1.5725220053578264, "grad_norm": 0.5519225001335144, "learning_rate": 1.6048781503061896e-05, "loss": 0.6548, "step": 4109 }, { "epoch": 1.5729047072330653, "grad_norm": 0.5112871527671814, "learning_rate": 1.6046807272477112e-05, "loss": 0.64, "step": 4110 }, { "epoch": 1.5732874091083047, "grad_norm": 0.565451443195343, "learning_rate": 1.604483267029978e-05, "loss": 0.6346, "step": 4111 }, { "epoch": 1.5736701109835438, "grad_norm": 0.5523509383201599, "learning_rate": 1.6042857696651237e-05, "loss": 0.6585, "step": 4112 }, { "epoch": 1.574052812858783, "grad_norm": 0.5806711912155151, "learning_rate": 1.6040882351652855e-05, "loss": 0.6762, "step": 4113 }, { "epoch": 1.5744355147340223, "grad_norm": 0.5154079794883728, "learning_rate": 1.6038906635426027e-05, "loss": 0.6238, "step": 4114 }, { "epoch": 1.5748182166092612, "grad_norm": 0.5278206467628479, "learning_rate": 1.6036930548092158e-05, "loss": 0.6694, "step": 4115 }, { "epoch": 1.5752009184845006, "grad_norm": 0.5284342169761658, "learning_rate": 1.603495408977269e-05, "loss": 0.5937, "step": 4116 }, { "epoch": 1.5755836203597398, "grad_norm": 0.5828486680984497, "learning_rate": 1.6032977260589077e-05, "loss": 0.7264, "step": 4117 }, { "epoch": 1.575966322234979, "grad_norm": 0.6035677194595337, "learning_rate": 1.603100006066281e-05, "loss": 0.6453, "step": 4118 }, { "epoch": 1.5763490241102183, "grad_norm": 0.5790857672691345, "learning_rate": 1.6029022490115383e-05, "loss": 0.5695, "step": 4119 }, { "epoch": 1.5767317259854572, "grad_norm": 0.527378261089325, "learning_rate": 1.602704454906833e-05, "loss": 0.7207, "step": 4120 }, { "epoch": 1.5771144278606966, "grad_norm": 0.5209848284721375, "learning_rate": 1.6025066237643198e-05, "loss": 0.6113, "step": 4121 }, { "epoch": 1.5774971297359357, "grad_norm": 0.5486105680465698, "learning_rate": 1.6023087555961558e-05, "loss": 0.6285, "step": 4122 }, { "epoch": 1.5778798316111748, "grad_norm": 0.542062520980835, "learning_rate": 1.6021108504145005e-05, "loss": 0.666, "step": 4123 }, { "epoch": 1.5782625334864142, "grad_norm": 0.5454142093658447, "learning_rate": 1.6019129082315163e-05, "loss": 0.6486, "step": 4124 }, { "epoch": 1.5786452353616531, "grad_norm": 0.5834380984306335, "learning_rate": 1.6017149290593664e-05, "loss": 0.7485, "step": 4125 }, { "epoch": 1.5790279372368925, "grad_norm": 0.5782944560050964, "learning_rate": 1.601516912910218e-05, "loss": 0.6789, "step": 4126 }, { "epoch": 1.5794106391121316, "grad_norm": 0.5866331458091736, "learning_rate": 1.601318859796239e-05, "loss": 0.6722, "step": 4127 }, { "epoch": 1.5797933409873708, "grad_norm": 0.5846785306930542, "learning_rate": 1.6011207697296007e-05, "loss": 0.5932, "step": 4128 }, { "epoch": 1.5801760428626102, "grad_norm": 0.54570072889328, "learning_rate": 1.6009226427224764e-05, "loss": 0.6396, "step": 4129 }, { "epoch": 1.580558744737849, "grad_norm": 0.5595234036445618, "learning_rate": 1.600724478787041e-05, "loss": 0.6701, "step": 4130 }, { "epoch": 1.5809414466130884, "grad_norm": 0.5738050937652588, "learning_rate": 1.600526277935473e-05, "loss": 0.6975, "step": 4131 }, { "epoch": 1.5813241484883276, "grad_norm": 0.7084988355636597, "learning_rate": 1.6003280401799516e-05, "loss": 0.6718, "step": 4132 }, { "epoch": 1.5817068503635667, "grad_norm": 0.5290881395339966, "learning_rate": 1.6001297655326593e-05, "loss": 0.6655, "step": 4133 }, { "epoch": 1.582089552238806, "grad_norm": 0.5420892238616943, "learning_rate": 1.599931454005781e-05, "loss": 0.6762, "step": 4134 }, { "epoch": 1.582472254114045, "grad_norm": 0.5711716413497925, "learning_rate": 1.599733105611503e-05, "loss": 0.6317, "step": 4135 }, { "epoch": 1.5828549559892844, "grad_norm": 0.558039128780365, "learning_rate": 1.5995347203620142e-05, "loss": 0.7415, "step": 4136 }, { "epoch": 1.5832376578645235, "grad_norm": 0.5097461342811584, "learning_rate": 1.5993362982695067e-05, "loss": 0.632, "step": 4137 }, { "epoch": 1.5836203597397627, "grad_norm": 0.5651438236236572, "learning_rate": 1.599137839346173e-05, "loss": 0.6834, "step": 4138 }, { "epoch": 1.584003061615002, "grad_norm": 0.5402020812034607, "learning_rate": 1.59893934360421e-05, "loss": 0.5775, "step": 4139 }, { "epoch": 1.584385763490241, "grad_norm": 0.5423325300216675, "learning_rate": 1.598740811055815e-05, "loss": 0.6598, "step": 4140 }, { "epoch": 1.5847684653654803, "grad_norm": 0.5295654535293579, "learning_rate": 1.5985422417131883e-05, "loss": 0.613, "step": 4141 }, { "epoch": 1.5851511672407195, "grad_norm": 0.5587893724441528, "learning_rate": 1.5983436355885333e-05, "loss": 0.644, "step": 4142 }, { "epoch": 1.5855338691159586, "grad_norm": 0.5745251774787903, "learning_rate": 1.5981449926940545e-05, "loss": 0.6078, "step": 4143 }, { "epoch": 1.585916570991198, "grad_norm": 0.5498107671737671, "learning_rate": 1.597946313041959e-05, "loss": 0.6415, "step": 4144 }, { "epoch": 1.586299272866437, "grad_norm": 0.591945230960846, "learning_rate": 1.597747596644456e-05, "loss": 0.6526, "step": 4145 }, { "epoch": 1.5866819747416763, "grad_norm": 0.5403217077255249, "learning_rate": 1.5975488435137573e-05, "loss": 0.6074, "step": 4146 }, { "epoch": 1.5870646766169154, "grad_norm": 0.5654319524765015, "learning_rate": 1.597350053662077e-05, "loss": 0.6595, "step": 4147 }, { "epoch": 1.5874473784921546, "grad_norm": 0.5318201184272766, "learning_rate": 1.597151227101631e-05, "loss": 0.5841, "step": 4148 }, { "epoch": 1.587830080367394, "grad_norm": 0.517686128616333, "learning_rate": 1.596952363844638e-05, "loss": 0.5986, "step": 4149 }, { "epoch": 1.5882127822426328, "grad_norm": 0.5070990324020386, "learning_rate": 1.5967534639033188e-05, "loss": 0.6025, "step": 4150 }, { "epoch": 1.5885954841178722, "grad_norm": 0.5868526697158813, "learning_rate": 1.5965545272898957e-05, "loss": 0.7512, "step": 4151 }, { "epoch": 1.5889781859931114, "grad_norm": 0.5805437564849854, "learning_rate": 1.5963555540165946e-05, "loss": 0.6767, "step": 4152 }, { "epoch": 1.5893608878683505, "grad_norm": 0.5025380849838257, "learning_rate": 1.5961565440956422e-05, "loss": 0.5886, "step": 4153 }, { "epoch": 1.5897435897435899, "grad_norm": 0.5236551761627197, "learning_rate": 1.5959574975392684e-05, "loss": 0.5412, "step": 4154 }, { "epoch": 1.5901262916188288, "grad_norm": 0.5111422538757324, "learning_rate": 1.595758414359706e-05, "loss": 0.6975, "step": 4155 }, { "epoch": 1.5905089934940682, "grad_norm": 0.5281949043273926, "learning_rate": 1.595559294569188e-05, "loss": 0.6042, "step": 4156 }, { "epoch": 1.5908916953693073, "grad_norm": 0.523349404335022, "learning_rate": 1.5953601381799517e-05, "loss": 0.6854, "step": 4157 }, { "epoch": 1.5912743972445464, "grad_norm": 0.501585841178894, "learning_rate": 1.5951609452042354e-05, "loss": 0.6586, "step": 4158 }, { "epoch": 1.5916570991197858, "grad_norm": 0.539099395275116, "learning_rate": 1.59496171565428e-05, "loss": 0.6127, "step": 4159 }, { "epoch": 1.5920398009950247, "grad_norm": 0.5686036944389343, "learning_rate": 1.594762449542329e-05, "loss": 0.6051, "step": 4160 }, { "epoch": 1.592422502870264, "grad_norm": 0.5311874747276306, "learning_rate": 1.594563146880628e-05, "loss": 0.7008, "step": 4161 }, { "epoch": 1.5928052047455032, "grad_norm": 0.5174441933631897, "learning_rate": 1.5943638076814235e-05, "loss": 0.6386, "step": 4162 }, { "epoch": 1.5931879066207424, "grad_norm": 0.5040134787559509, "learning_rate": 1.5941644319569665e-05, "loss": 0.6783, "step": 4163 }, { "epoch": 1.5935706084959818, "grad_norm": 0.5161314010620117, "learning_rate": 1.593965019719509e-05, "loss": 0.6265, "step": 4164 }, { "epoch": 1.5939533103712207, "grad_norm": 0.5267512202262878, "learning_rate": 1.593765570981306e-05, "loss": 0.6595, "step": 4165 }, { "epoch": 1.59433601224646, "grad_norm": 0.5509853363037109, "learning_rate": 1.5935660857546128e-05, "loss": 0.6083, "step": 4166 }, { "epoch": 1.5947187141216992, "grad_norm": 0.5261800289154053, "learning_rate": 1.593366564051689e-05, "loss": 0.5875, "step": 4167 }, { "epoch": 1.5951014159969383, "grad_norm": 0.4800848960876465, "learning_rate": 1.593167005884796e-05, "loss": 0.7028, "step": 4168 }, { "epoch": 1.5954841178721777, "grad_norm": 0.582895815372467, "learning_rate": 1.5929674112661972e-05, "loss": 0.7961, "step": 4169 }, { "epoch": 1.5958668197474166, "grad_norm": 0.5192604064941406, "learning_rate": 1.5927677802081577e-05, "loss": 0.6184, "step": 4170 }, { "epoch": 1.596249521622656, "grad_norm": 0.49857762455940247, "learning_rate": 1.592568112722945e-05, "loss": 0.7114, "step": 4171 }, { "epoch": 1.5966322234978951, "grad_norm": 0.521378755569458, "learning_rate": 1.5923684088228308e-05, "loss": 0.6845, "step": 4172 }, { "epoch": 1.5970149253731343, "grad_norm": 0.5451171398162842, "learning_rate": 1.592168668520086e-05, "loss": 0.7062, "step": 4173 }, { "epoch": 1.5973976272483736, "grad_norm": 0.5135816335678101, "learning_rate": 1.591968891826986e-05, "loss": 0.7115, "step": 4174 }, { "epoch": 1.5977803291236126, "grad_norm": 0.49221742153167725, "learning_rate": 1.5917690787558073e-05, "loss": 0.6174, "step": 4175 }, { "epoch": 1.598163030998852, "grad_norm": 0.5670404434204102, "learning_rate": 1.5915692293188287e-05, "loss": 0.68, "step": 4176 }, { "epoch": 1.598545732874091, "grad_norm": 0.5200201869010925, "learning_rate": 1.5913693435283317e-05, "loss": 0.5922, "step": 4177 }, { "epoch": 1.5989284347493302, "grad_norm": 0.5194621086120605, "learning_rate": 1.5911694213965997e-05, "loss": 0.6293, "step": 4178 }, { "epoch": 1.5993111366245696, "grad_norm": 0.5593679547309875, "learning_rate": 1.590969462935919e-05, "loss": 0.6705, "step": 4179 }, { "epoch": 1.5996938384998085, "grad_norm": 0.5671465396881104, "learning_rate": 1.590769468158577e-05, "loss": 0.6941, "step": 4180 }, { "epoch": 1.6000765403750479, "grad_norm": 0.5374941229820251, "learning_rate": 1.590569437076864e-05, "loss": 0.6486, "step": 4181 }, { "epoch": 1.600459242250287, "grad_norm": 0.5207922458648682, "learning_rate": 1.5903693697030723e-05, "loss": 0.6478, "step": 4182 }, { "epoch": 1.6008419441255262, "grad_norm": 0.5749903321266174, "learning_rate": 1.5901692660494973e-05, "loss": 0.5948, "step": 4183 }, { "epoch": 1.6012246460007655, "grad_norm": 0.5077226758003235, "learning_rate": 1.5899691261284353e-05, "loss": 0.6745, "step": 4184 }, { "epoch": 1.6016073478760045, "grad_norm": 0.5103320479393005, "learning_rate": 1.5897689499521853e-05, "loss": 0.6612, "step": 4185 }, { "epoch": 1.6019900497512438, "grad_norm": 0.5480393767356873, "learning_rate": 1.589568737533049e-05, "loss": 0.6543, "step": 4186 }, { "epoch": 1.602372751626483, "grad_norm": 0.522807240486145, "learning_rate": 1.5893684888833304e-05, "loss": 0.6011, "step": 4187 }, { "epoch": 1.602755453501722, "grad_norm": 0.5566440224647522, "learning_rate": 1.589168204015334e-05, "loss": 0.6862, "step": 4188 }, { "epoch": 1.6031381553769615, "grad_norm": 0.5340921878814697, "learning_rate": 1.5889678829413694e-05, "loss": 0.6133, "step": 4189 }, { "epoch": 1.6035208572522004, "grad_norm": 0.47898608446121216, "learning_rate": 1.5887675256737454e-05, "loss": 0.6517, "step": 4190 }, { "epoch": 1.6039035591274398, "grad_norm": 0.539457380771637, "learning_rate": 1.588567132224776e-05, "loss": 0.6037, "step": 4191 }, { "epoch": 1.604286261002679, "grad_norm": 0.5517003536224365, "learning_rate": 1.5883667026067745e-05, "loss": 0.6738, "step": 4192 }, { "epoch": 1.604668962877918, "grad_norm": 0.603654682636261, "learning_rate": 1.5881662368320588e-05, "loss": 0.664, "step": 4193 }, { "epoch": 1.6050516647531574, "grad_norm": 0.5059731602668762, "learning_rate": 1.5879657349129477e-05, "loss": 0.6776, "step": 4194 }, { "epoch": 1.6054343666283963, "grad_norm": 0.5008291602134705, "learning_rate": 1.5877651968617627e-05, "loss": 0.594, "step": 4195 }, { "epoch": 1.6058170685036357, "grad_norm": 0.5945414304733276, "learning_rate": 1.587564622690827e-05, "loss": 0.6308, "step": 4196 }, { "epoch": 1.6061997703788748, "grad_norm": 0.5783547163009644, "learning_rate": 1.587364012412467e-05, "loss": 0.6507, "step": 4197 }, { "epoch": 1.606582472254114, "grad_norm": 0.5894497036933899, "learning_rate": 1.5871633660390107e-05, "loss": 0.5506, "step": 4198 }, { "epoch": 1.6069651741293534, "grad_norm": 0.5480672717094421, "learning_rate": 1.586962683582788e-05, "loss": 0.6694, "step": 4199 }, { "epoch": 1.6073478760045923, "grad_norm": 0.5079948306083679, "learning_rate": 1.5867619650561313e-05, "loss": 0.6638, "step": 4200 }, { "epoch": 1.6077305778798316, "grad_norm": 0.5203515887260437, "learning_rate": 1.5865612104713756e-05, "loss": 0.6587, "step": 4201 }, { "epoch": 1.6081132797550708, "grad_norm": 0.5227818489074707, "learning_rate": 1.5863604198408584e-05, "loss": 0.6436, "step": 4202 }, { "epoch": 1.60849598163031, "grad_norm": 0.5568510293960571, "learning_rate": 1.586159593176917e-05, "loss": 0.6805, "step": 4203 }, { "epoch": 1.6088786835055493, "grad_norm": 0.5167678594589233, "learning_rate": 1.5859587304918946e-05, "loss": 0.636, "step": 4204 }, { "epoch": 1.6092613853807882, "grad_norm": 0.5234500765800476, "learning_rate": 1.585757831798134e-05, "loss": 0.67, "step": 4205 }, { "epoch": 1.6096440872560276, "grad_norm": 0.563599169254303, "learning_rate": 1.5855568971079808e-05, "loss": 0.6191, "step": 4206 }, { "epoch": 1.6100267891312667, "grad_norm": 0.5528417229652405, "learning_rate": 1.5853559264337835e-05, "loss": 0.696, "step": 4207 }, { "epoch": 1.6104094910065059, "grad_norm": 0.5814376473426819, "learning_rate": 1.5851549197878914e-05, "loss": 0.7136, "step": 4208 }, { "epoch": 1.6107921928817452, "grad_norm": 0.5285678505897522, "learning_rate": 1.5849538771826578e-05, "loss": 0.5932, "step": 4209 }, { "epoch": 1.6111748947569842, "grad_norm": 0.4842156767845154, "learning_rate": 1.5847527986304372e-05, "loss": 0.626, "step": 4210 }, { "epoch": 1.6115575966322235, "grad_norm": 0.565146803855896, "learning_rate": 1.584551684143586e-05, "loss": 0.6388, "step": 4211 }, { "epoch": 1.6119402985074627, "grad_norm": 0.5364747047424316, "learning_rate": 1.5843505337344633e-05, "loss": 0.6254, "step": 4212 }, { "epoch": 1.6123230003827018, "grad_norm": 0.5454295873641968, "learning_rate": 1.5841493474154307e-05, "loss": 0.6665, "step": 4213 }, { "epoch": 1.6127057022579412, "grad_norm": 0.5284351706504822, "learning_rate": 1.583948125198851e-05, "loss": 0.6037, "step": 4214 }, { "epoch": 1.61308840413318, "grad_norm": 0.5404253602027893, "learning_rate": 1.5837468670970906e-05, "loss": 0.5687, "step": 4215 }, { "epoch": 1.6134711060084195, "grad_norm": 0.5989180207252502, "learning_rate": 1.5835455731225167e-05, "loss": 0.6938, "step": 4216 }, { "epoch": 1.6138538078836586, "grad_norm": 0.6362990140914917, "learning_rate": 1.5833442432875e-05, "loss": 0.5828, "step": 4217 }, { "epoch": 1.6142365097588978, "grad_norm": 0.5417254567146301, "learning_rate": 1.583142877604412e-05, "loss": 0.6411, "step": 4218 }, { "epoch": 1.6146192116341371, "grad_norm": 0.49253159761428833, "learning_rate": 1.582941476085628e-05, "loss": 0.6077, "step": 4219 }, { "epoch": 1.615001913509376, "grad_norm": 0.5474430322647095, "learning_rate": 1.582740038743524e-05, "loss": 0.6639, "step": 4220 }, { "epoch": 1.6153846153846154, "grad_norm": 0.5787261724472046, "learning_rate": 1.582538565590479e-05, "loss": 0.657, "step": 4221 }, { "epoch": 1.6157673172598546, "grad_norm": 0.49621066451072693, "learning_rate": 1.582337056638874e-05, "loss": 0.6737, "step": 4222 }, { "epoch": 1.6161500191350937, "grad_norm": 0.4955839514732361, "learning_rate": 1.582135511901093e-05, "loss": 0.5927, "step": 4223 }, { "epoch": 1.616532721010333, "grad_norm": 0.5432429909706116, "learning_rate": 1.5819339313895204e-05, "loss": 0.6676, "step": 4224 }, { "epoch": 1.616915422885572, "grad_norm": 0.5219000577926636, "learning_rate": 1.581732315116545e-05, "loss": 0.7145, "step": 4225 }, { "epoch": 1.6172981247608114, "grad_norm": 0.5502670407295227, "learning_rate": 1.5815306630945553e-05, "loss": 0.6182, "step": 4226 }, { "epoch": 1.6176808266360505, "grad_norm": 0.5144481658935547, "learning_rate": 1.581328975335944e-05, "loss": 0.7031, "step": 4227 }, { "epoch": 1.6180635285112897, "grad_norm": 0.5516034364700317, "learning_rate": 1.581127251853106e-05, "loss": 0.61, "step": 4228 }, { "epoch": 1.618446230386529, "grad_norm": 0.5240404009819031, "learning_rate": 1.5809254926584366e-05, "loss": 0.6844, "step": 4229 }, { "epoch": 1.618828932261768, "grad_norm": 0.5165302157402039, "learning_rate": 1.5807236977643356e-05, "loss": 0.6633, "step": 4230 }, { "epoch": 1.6192116341370073, "grad_norm": 0.49160006642341614, "learning_rate": 1.580521867183203e-05, "loss": 0.6688, "step": 4231 }, { "epoch": 1.6195943360122464, "grad_norm": 0.5447586178779602, "learning_rate": 1.5803200009274423e-05, "loss": 0.682, "step": 4232 }, { "epoch": 1.6199770378874856, "grad_norm": 0.5379782915115356, "learning_rate": 1.580118099009458e-05, "loss": 0.6825, "step": 4233 }, { "epoch": 1.620359739762725, "grad_norm": 0.5196921229362488, "learning_rate": 1.579916161441658e-05, "loss": 0.6227, "step": 4234 }, { "epoch": 1.6207424416379639, "grad_norm": 0.5791500210762024, "learning_rate": 1.5797141882364528e-05, "loss": 0.6339, "step": 4235 }, { "epoch": 1.6211251435132032, "grad_norm": 0.6063637733459473, "learning_rate": 1.579512179406253e-05, "loss": 0.6061, "step": 4236 }, { "epoch": 1.6215078453884424, "grad_norm": 0.5479731559753418, "learning_rate": 1.5793101349634725e-05, "loss": 0.6006, "step": 4237 }, { "epoch": 1.6218905472636815, "grad_norm": 0.5534972548484802, "learning_rate": 1.579108054920528e-05, "loss": 0.6543, "step": 4238 }, { "epoch": 1.622273249138921, "grad_norm": 0.6015211343765259, "learning_rate": 1.5789059392898383e-05, "loss": 0.7019, "step": 4239 }, { "epoch": 1.6226559510141598, "grad_norm": 0.5335915088653564, "learning_rate": 1.5787037880838226e-05, "loss": 0.7153, "step": 4240 }, { "epoch": 1.6230386528893992, "grad_norm": 0.5477595925331116, "learning_rate": 1.5785016013149054e-05, "loss": 0.6805, "step": 4241 }, { "epoch": 1.6234213547646383, "grad_norm": 0.5091144442558289, "learning_rate": 1.57829937899551e-05, "loss": 0.6652, "step": 4242 }, { "epoch": 1.6238040566398775, "grad_norm": 0.528877317905426, "learning_rate": 1.5780971211380644e-05, "loss": 0.6289, "step": 4243 }, { "epoch": 1.6241867585151168, "grad_norm": 0.5757085680961609, "learning_rate": 1.5778948277549973e-05, "loss": 0.5571, "step": 4244 }, { "epoch": 1.6245694603903558, "grad_norm": 0.6224637031555176, "learning_rate": 1.5776924988587406e-05, "loss": 0.6535, "step": 4245 }, { "epoch": 1.6249521622655951, "grad_norm": 0.5676321983337402, "learning_rate": 1.5774901344617282e-05, "loss": 0.6683, "step": 4246 }, { "epoch": 1.6253348641408343, "grad_norm": 0.49722370505332947, "learning_rate": 1.5772877345763955e-05, "loss": 0.6217, "step": 4247 }, { "epoch": 1.6257175660160734, "grad_norm": 0.5581941604614258, "learning_rate": 1.5770852992151806e-05, "loss": 0.5891, "step": 4248 }, { "epoch": 1.6261002678913128, "grad_norm": 0.5805266499519348, "learning_rate": 1.5768828283905238e-05, "loss": 0.6085, "step": 4249 }, { "epoch": 1.6264829697665517, "grad_norm": 0.5295156240463257, "learning_rate": 1.5766803221148676e-05, "loss": 0.6355, "step": 4250 }, { "epoch": 1.626865671641791, "grad_norm": 0.495796263217926, "learning_rate": 1.576477780400656e-05, "loss": 0.6213, "step": 4251 }, { "epoch": 1.6272483735170302, "grad_norm": 0.49408429861068726, "learning_rate": 1.576275203260336e-05, "loss": 0.6137, "step": 4252 }, { "epoch": 1.6276310753922694, "grad_norm": 0.5836955308914185, "learning_rate": 1.5760725907063566e-05, "loss": 0.684, "step": 4253 }, { "epoch": 1.6280137772675087, "grad_norm": 0.5471921563148499, "learning_rate": 1.5758699427511694e-05, "loss": 0.6414, "step": 4254 }, { "epoch": 1.6283964791427477, "grad_norm": 0.5240886211395264, "learning_rate": 1.5756672594072268e-05, "loss": 0.6423, "step": 4255 }, { "epoch": 1.628779181017987, "grad_norm": 0.529484212398529, "learning_rate": 1.5754645406869843e-05, "loss": 0.6724, "step": 4256 }, { "epoch": 1.6291618828932262, "grad_norm": 0.6638882160186768, "learning_rate": 1.5752617866029005e-05, "loss": 0.5951, "step": 4257 }, { "epoch": 1.6295445847684653, "grad_norm": 0.4932021200656891, "learning_rate": 1.5750589971674338e-05, "loss": 0.6164, "step": 4258 }, { "epoch": 1.6299272866437047, "grad_norm": 0.5590762495994568, "learning_rate": 1.574856172393047e-05, "loss": 0.7168, "step": 4259 }, { "epoch": 1.6303099885189436, "grad_norm": 0.48899951577186584, "learning_rate": 1.5746533122922044e-05, "loss": 0.6868, "step": 4260 }, { "epoch": 1.630692690394183, "grad_norm": 0.5107108354568481, "learning_rate": 1.5744504168773716e-05, "loss": 0.6243, "step": 4261 }, { "epoch": 1.631075392269422, "grad_norm": 0.534608781337738, "learning_rate": 1.5742474861610177e-05, "loss": 0.71, "step": 4262 }, { "epoch": 1.6314580941446613, "grad_norm": 0.6165154576301575, "learning_rate": 1.574044520155613e-05, "loss": 0.5554, "step": 4263 }, { "epoch": 1.6318407960199006, "grad_norm": 0.5857166647911072, "learning_rate": 1.57384151887363e-05, "loss": 0.7219, "step": 4264 }, { "epoch": 1.6322234978951395, "grad_norm": 0.4892512857913971, "learning_rate": 1.5736384823275445e-05, "loss": 0.621, "step": 4265 }, { "epoch": 1.632606199770379, "grad_norm": 0.5416974425315857, "learning_rate": 1.5734354105298332e-05, "loss": 0.6781, "step": 4266 }, { "epoch": 1.632988901645618, "grad_norm": 0.5511426329612732, "learning_rate": 1.5732323034929753e-05, "loss": 0.5999, "step": 4267 }, { "epoch": 1.6333716035208572, "grad_norm": 0.6228882670402527, "learning_rate": 1.5730291612294523e-05, "loss": 0.6192, "step": 4268 }, { "epoch": 1.6337543053960966, "grad_norm": 0.5223386287689209, "learning_rate": 1.5728259837517475e-05, "loss": 0.6849, "step": 4269 }, { "epoch": 1.6341370072713355, "grad_norm": 0.5295674800872803, "learning_rate": 1.5726227710723478e-05, "loss": 0.6857, "step": 4270 }, { "epoch": 1.6345197091465749, "grad_norm": 0.7357922792434692, "learning_rate": 1.57241952320374e-05, "loss": 0.6034, "step": 4271 }, { "epoch": 1.634902411021814, "grad_norm": 0.5290018916130066, "learning_rate": 1.572216240158415e-05, "loss": 0.6247, "step": 4272 }, { "epoch": 1.6352851128970531, "grad_norm": 0.5671287775039673, "learning_rate": 1.572012921948865e-05, "loss": 0.6948, "step": 4273 }, { "epoch": 1.6356678147722925, "grad_norm": 0.4957449734210968, "learning_rate": 1.5718095685875836e-05, "loss": 0.6539, "step": 4274 }, { "epoch": 1.6360505166475314, "grad_norm": 0.543829619884491, "learning_rate": 1.5716061800870685e-05, "loss": 0.6309, "step": 4275 }, { "epoch": 1.6364332185227708, "grad_norm": 0.5203392505645752, "learning_rate": 1.571402756459818e-05, "loss": 0.5957, "step": 4276 }, { "epoch": 1.63681592039801, "grad_norm": 0.49227362871170044, "learning_rate": 1.5711992977183327e-05, "loss": 0.6278, "step": 4277 }, { "epoch": 1.637198622273249, "grad_norm": 0.5585498213768005, "learning_rate": 1.5709958038751163e-05, "loss": 0.6402, "step": 4278 }, { "epoch": 1.6375813241484884, "grad_norm": 0.5382294058799744, "learning_rate": 1.5707922749426735e-05, "loss": 0.545, "step": 4279 }, { "epoch": 1.6379640260237274, "grad_norm": 0.5707585215568542, "learning_rate": 1.5705887109335124e-05, "loss": 0.6375, "step": 4280 }, { "epoch": 1.6383467278989667, "grad_norm": 0.549027681350708, "learning_rate": 1.570385111860142e-05, "loss": 0.6203, "step": 4281 }, { "epoch": 1.6387294297742059, "grad_norm": 0.5694654583930969, "learning_rate": 1.5701814777350743e-05, "loss": 0.6751, "step": 4282 }, { "epoch": 1.639112131649445, "grad_norm": 0.5456705093383789, "learning_rate": 1.5699778085708226e-05, "loss": 0.7098, "step": 4283 }, { "epoch": 1.6394948335246844, "grad_norm": 0.5431959629058838, "learning_rate": 1.5697741043799036e-05, "loss": 0.6432, "step": 4284 }, { "epoch": 1.6398775353999233, "grad_norm": 0.47257593274116516, "learning_rate": 1.569570365174835e-05, "loss": 0.6539, "step": 4285 }, { "epoch": 1.6402602372751627, "grad_norm": 0.5382380485534668, "learning_rate": 1.5693665909681377e-05, "loss": 0.6767, "step": 4286 }, { "epoch": 1.6406429391504018, "grad_norm": 0.49865880608558655, "learning_rate": 1.5691627817723335e-05, "loss": 0.6904, "step": 4287 }, { "epoch": 1.641025641025641, "grad_norm": 0.4790586829185486, "learning_rate": 1.5689589375999473e-05, "loss": 0.6461, "step": 4288 }, { "epoch": 1.6414083429008803, "grad_norm": 0.672833263874054, "learning_rate": 1.5687550584635058e-05, "loss": 0.7577, "step": 4289 }, { "epoch": 1.6417910447761193, "grad_norm": 0.5307454466819763, "learning_rate": 1.568551144375538e-05, "loss": 0.6395, "step": 4290 }, { "epoch": 1.6421737466513586, "grad_norm": 0.7639450430870056, "learning_rate": 1.5683471953485755e-05, "loss": 0.6827, "step": 4291 }, { "epoch": 1.6425564485265978, "grad_norm": 0.5689762830734253, "learning_rate": 1.56814321139515e-05, "loss": 0.6277, "step": 4292 }, { "epoch": 1.642939150401837, "grad_norm": 0.5208433866500854, "learning_rate": 1.5679391925277988e-05, "loss": 0.6242, "step": 4293 }, { "epoch": 1.6433218522770763, "grad_norm": 0.5022462606430054, "learning_rate": 1.567735138759058e-05, "loss": 0.6009, "step": 4294 }, { "epoch": 1.6437045541523152, "grad_norm": 0.5236585736274719, "learning_rate": 1.567531050101468e-05, "loss": 0.6958, "step": 4295 }, { "epoch": 1.6440872560275546, "grad_norm": 0.5711921453475952, "learning_rate": 1.56732692656757e-05, "loss": 0.6902, "step": 4296 }, { "epoch": 1.6444699579027937, "grad_norm": 0.5388635396957397, "learning_rate": 1.5671227681699082e-05, "loss": 0.6475, "step": 4297 }, { "epoch": 1.6448526597780329, "grad_norm": 0.49668648838996887, "learning_rate": 1.566918574921029e-05, "loss": 0.6305, "step": 4298 }, { "epoch": 1.6452353616532722, "grad_norm": 0.5200216174125671, "learning_rate": 1.56671434683348e-05, "loss": 0.6321, "step": 4299 }, { "epoch": 1.6456180635285111, "grad_norm": 0.5486312508583069, "learning_rate": 1.5665100839198123e-05, "loss": 0.6836, "step": 4300 }, { "epoch": 1.6460007654037505, "grad_norm": 0.536111056804657, "learning_rate": 1.5663057861925777e-05, "loss": 0.6239, "step": 4301 }, { "epoch": 1.6463834672789897, "grad_norm": 0.5012050867080688, "learning_rate": 1.5661014536643314e-05, "loss": 0.6493, "step": 4302 }, { "epoch": 1.6467661691542288, "grad_norm": 0.5579233765602112, "learning_rate": 1.5658970863476298e-05, "loss": 0.643, "step": 4303 }, { "epoch": 1.6471488710294682, "grad_norm": 0.5654094815254211, "learning_rate": 1.565692684255032e-05, "loss": 0.667, "step": 4304 }, { "epoch": 1.647531572904707, "grad_norm": 0.5148199200630188, "learning_rate": 1.565488247399099e-05, "loss": 0.6337, "step": 4305 }, { "epoch": 1.6479142747799465, "grad_norm": 0.572525143623352, "learning_rate": 1.565283775792394e-05, "loss": 0.6472, "step": 4306 }, { "epoch": 1.6482969766551856, "grad_norm": 0.5721274614334106, "learning_rate": 1.5650792694474826e-05, "loss": 0.7336, "step": 4307 }, { "epoch": 1.6486796785304247, "grad_norm": 0.46624869108200073, "learning_rate": 1.564874728376932e-05, "loss": 0.668, "step": 4308 }, { "epoch": 1.649062380405664, "grad_norm": 0.4958135485649109, "learning_rate": 1.5646701525933114e-05, "loss": 0.6441, "step": 4309 }, { "epoch": 1.649445082280903, "grad_norm": 0.5904421806335449, "learning_rate": 1.5644655421091933e-05, "loss": 0.654, "step": 4310 }, { "epoch": 1.6498277841561424, "grad_norm": 0.4952678680419922, "learning_rate": 1.5642608969371512e-05, "loss": 0.6577, "step": 4311 }, { "epoch": 1.6502104860313815, "grad_norm": 0.5139767527580261, "learning_rate": 1.5640562170897608e-05, "loss": 0.6435, "step": 4312 }, { "epoch": 1.6505931879066207, "grad_norm": 0.6465383768081665, "learning_rate": 1.563851502579601e-05, "loss": 0.7113, "step": 4313 }, { "epoch": 1.65097588978186, "grad_norm": 0.49905505776405334, "learning_rate": 1.563646753419251e-05, "loss": 0.6474, "step": 4314 }, { "epoch": 1.651358591657099, "grad_norm": 0.5673899054527283, "learning_rate": 1.563441969621294e-05, "loss": 0.7101, "step": 4315 }, { "epoch": 1.6517412935323383, "grad_norm": 0.5152559876441956, "learning_rate": 1.5632371511983143e-05, "loss": 0.6415, "step": 4316 }, { "epoch": 1.6521239954075775, "grad_norm": 0.5092588663101196, "learning_rate": 1.563032298162899e-05, "loss": 0.601, "step": 4317 }, { "epoch": 1.6525066972828166, "grad_norm": 0.49037617444992065, "learning_rate": 1.562827410527636e-05, "loss": 0.6584, "step": 4318 }, { "epoch": 1.652889399158056, "grad_norm": 0.5325514674186707, "learning_rate": 1.5626224883051165e-05, "loss": 0.6832, "step": 4319 }, { "epoch": 1.653272101033295, "grad_norm": 0.5366780757904053, "learning_rate": 1.5624175315079336e-05, "loss": 0.6954, "step": 4320 }, { "epoch": 1.6536548029085343, "grad_norm": 0.5533644556999207, "learning_rate": 1.5622125401486826e-05, "loss": 0.6576, "step": 4321 }, { "epoch": 1.6540375047837734, "grad_norm": 0.5357879996299744, "learning_rate": 1.5620075142399606e-05, "loss": 0.663, "step": 4322 }, { "epoch": 1.6544202066590126, "grad_norm": 0.5243419408798218, "learning_rate": 1.561802453794367e-05, "loss": 0.525, "step": 4323 }, { "epoch": 1.654802908534252, "grad_norm": 0.5038297772407532, "learning_rate": 1.5615973588245036e-05, "loss": 0.6292, "step": 4324 }, { "epoch": 1.6551856104094909, "grad_norm": 0.6009093523025513, "learning_rate": 1.5613922293429733e-05, "loss": 0.6976, "step": 4325 }, { "epoch": 1.6555683122847302, "grad_norm": 0.5400561094284058, "learning_rate": 1.5611870653623826e-05, "loss": 0.6371, "step": 4326 }, { "epoch": 1.6559510141599694, "grad_norm": 0.5378261208534241, "learning_rate": 1.560981866895339e-05, "loss": 0.6081, "step": 4327 }, { "epoch": 1.6563337160352085, "grad_norm": 0.5496990084648132, "learning_rate": 1.5607766339544528e-05, "loss": 0.6524, "step": 4328 }, { "epoch": 1.6567164179104479, "grad_norm": 0.5874646306037903, "learning_rate": 1.5605713665523356e-05, "loss": 0.6402, "step": 4329 }, { "epoch": 1.6570991197856868, "grad_norm": 0.5266179442405701, "learning_rate": 1.5603660647016022e-05, "loss": 0.6435, "step": 4330 }, { "epoch": 1.6574818216609262, "grad_norm": 0.5702356696128845, "learning_rate": 1.5601607284148687e-05, "loss": 0.6989, "step": 4331 }, { "epoch": 1.6578645235361653, "grad_norm": 0.6913303136825562, "learning_rate": 1.5599553577047535e-05, "loss": 0.689, "step": 4332 }, { "epoch": 1.6582472254114045, "grad_norm": 0.5515636205673218, "learning_rate": 1.5597499525838774e-05, "loss": 0.6379, "step": 4333 }, { "epoch": 1.6586299272866438, "grad_norm": 0.6058421730995178, "learning_rate": 1.5595445130648625e-05, "loss": 0.5553, "step": 4334 }, { "epoch": 1.6590126291618827, "grad_norm": 0.5690128207206726, "learning_rate": 1.5593390391603347e-05, "loss": 0.6851, "step": 4335 }, { "epoch": 1.6593953310371221, "grad_norm": 0.5882741212844849, "learning_rate": 1.55913353088292e-05, "loss": 0.6691, "step": 4336 }, { "epoch": 1.6597780329123613, "grad_norm": 0.4865604043006897, "learning_rate": 1.5589279882452476e-05, "loss": 0.6509, "step": 4337 }, { "epoch": 1.6601607347876004, "grad_norm": 0.5618063807487488, "learning_rate": 1.558722411259949e-05, "loss": 0.6309, "step": 4338 }, { "epoch": 1.6605434366628398, "grad_norm": 0.5602952837944031, "learning_rate": 1.5585167999396567e-05, "loss": 0.5865, "step": 4339 }, { "epoch": 1.6609261385380787, "grad_norm": 1.8140685558319092, "learning_rate": 1.5583111542970074e-05, "loss": 0.6635, "step": 4340 }, { "epoch": 1.661308840413318, "grad_norm": 0.4991161823272705, "learning_rate": 1.5581054743446374e-05, "loss": 0.6029, "step": 4341 }, { "epoch": 1.6616915422885572, "grad_norm": 0.5793114900588989, "learning_rate": 1.5578997600951864e-05, "loss": 0.6896, "step": 4342 }, { "epoch": 1.6620742441637963, "grad_norm": 0.5168867707252502, "learning_rate": 1.5576940115612966e-05, "loss": 0.5831, "step": 4343 }, { "epoch": 1.6624569460390357, "grad_norm": 0.4940605163574219, "learning_rate": 1.5574882287556115e-05, "loss": 0.6612, "step": 4344 }, { "epoch": 1.6628396479142746, "grad_norm": 0.5288812518119812, "learning_rate": 1.5572824116907772e-05, "loss": 0.6112, "step": 4345 }, { "epoch": 1.663222349789514, "grad_norm": 0.5414489507675171, "learning_rate": 1.5570765603794413e-05, "loss": 0.5853, "step": 4346 }, { "epoch": 1.6636050516647531, "grad_norm": 0.5472850799560547, "learning_rate": 1.5568706748342545e-05, "loss": 0.6177, "step": 4347 }, { "epoch": 1.6639877535399923, "grad_norm": 0.6141483187675476, "learning_rate": 1.5566647550678684e-05, "loss": 0.6948, "step": 4348 }, { "epoch": 1.6643704554152317, "grad_norm": 0.5179823040962219, "learning_rate": 1.5564588010929375e-05, "loss": 0.7583, "step": 4349 }, { "epoch": 1.6647531572904706, "grad_norm": 0.532174289226532, "learning_rate": 1.5562528129221187e-05, "loss": 0.6514, "step": 4350 }, { "epoch": 1.66513585916571, "grad_norm": 0.5471709966659546, "learning_rate": 1.5560467905680706e-05, "loss": 0.6789, "step": 4351 }, { "epoch": 1.665518561040949, "grad_norm": 0.5539054274559021, "learning_rate": 1.5558407340434528e-05, "loss": 0.6619, "step": 4352 }, { "epoch": 1.6659012629161882, "grad_norm": 0.5780105590820312, "learning_rate": 1.5556346433609286e-05, "loss": 0.6758, "step": 4353 }, { "epoch": 1.6662839647914276, "grad_norm": 0.5983449220657349, "learning_rate": 1.555428518533163e-05, "loss": 0.6964, "step": 4354 }, { "epoch": 1.6666666666666665, "grad_norm": 0.49970725178718567, "learning_rate": 1.555222359572823e-05, "loss": 0.611, "step": 4355 }, { "epoch": 1.6670493685419059, "grad_norm": 0.5202506184577942, "learning_rate": 1.555016166492577e-05, "loss": 0.5951, "step": 4356 }, { "epoch": 1.667432070417145, "grad_norm": 0.5064339637756348, "learning_rate": 1.5548099393050967e-05, "loss": 0.6547, "step": 4357 }, { "epoch": 1.6678147722923842, "grad_norm": 0.5519571900367737, "learning_rate": 1.554603678023055e-05, "loss": 0.6444, "step": 4358 }, { "epoch": 1.6681974741676235, "grad_norm": 0.5384988784790039, "learning_rate": 1.5543973826591276e-05, "loss": 0.5913, "step": 4359 }, { "epoch": 1.6685801760428625, "grad_norm": 0.5731048583984375, "learning_rate": 1.5541910532259914e-05, "loss": 0.7023, "step": 4360 }, { "epoch": 1.6689628779181018, "grad_norm": 0.4837013781070709, "learning_rate": 1.553984689736326e-05, "loss": 0.6577, "step": 4361 }, { "epoch": 1.669345579793341, "grad_norm": 0.5162649750709534, "learning_rate": 1.5537782922028136e-05, "loss": 0.6325, "step": 4362 }, { "epoch": 1.6697282816685801, "grad_norm": 0.49877050518989563, "learning_rate": 1.5535718606381377e-05, "loss": 0.5957, "step": 4363 }, { "epoch": 1.6701109835438195, "grad_norm": 0.4984208643436432, "learning_rate": 1.553365395054983e-05, "loss": 0.605, "step": 4364 }, { "epoch": 1.6704936854190584, "grad_norm": 0.5498892664909363, "learning_rate": 1.5531588954660385e-05, "loss": 0.6684, "step": 4365 }, { "epoch": 1.6708763872942978, "grad_norm": 0.51121586561203, "learning_rate": 1.5529523618839937e-05, "loss": 0.5916, "step": 4366 }, { "epoch": 1.671259089169537, "grad_norm": 0.5661166906356812, "learning_rate": 1.5527457943215408e-05, "loss": 0.6212, "step": 4367 }, { "epoch": 1.671641791044776, "grad_norm": 0.4878019690513611, "learning_rate": 1.5525391927913746e-05, "loss": 0.6517, "step": 4368 }, { "epoch": 1.6720244929200154, "grad_norm": 0.4751785397529602, "learning_rate": 1.5523325573061895e-05, "loss": 0.6362, "step": 4369 }, { "epoch": 1.6724071947952543, "grad_norm": 0.5511400103569031, "learning_rate": 1.5521258878786856e-05, "loss": 0.6005, "step": 4370 }, { "epoch": 1.6727898966704937, "grad_norm": 0.524625837802887, "learning_rate": 1.5519191845215625e-05, "loss": 0.6337, "step": 4371 }, { "epoch": 1.6731725985457329, "grad_norm": 0.5556111931800842, "learning_rate": 1.551712447247523e-05, "loss": 0.6767, "step": 4372 }, { "epoch": 1.673555300420972, "grad_norm": 0.5430702567100525, "learning_rate": 1.5515056760692707e-05, "loss": 0.6576, "step": 4373 }, { "epoch": 1.6739380022962114, "grad_norm": 0.5006523132324219, "learning_rate": 1.5512988709995135e-05, "loss": 0.5692, "step": 4374 }, { "epoch": 1.6743207041714503, "grad_norm": 0.5867803692817688, "learning_rate": 1.5510920320509595e-05, "loss": 0.64, "step": 4375 }, { "epoch": 1.6747034060466897, "grad_norm": 0.5512092113494873, "learning_rate": 1.5508851592363197e-05, "loss": 0.6964, "step": 4376 }, { "epoch": 1.6750861079219288, "grad_norm": 0.5154982805252075, "learning_rate": 1.5506782525683073e-05, "loss": 0.6231, "step": 4377 }, { "epoch": 1.675468809797168, "grad_norm": 0.5232341885566711, "learning_rate": 1.5504713120596363e-05, "loss": 0.6498, "step": 4378 }, { "epoch": 1.6758515116724073, "grad_norm": 0.536321222782135, "learning_rate": 1.5502643377230247e-05, "loss": 0.699, "step": 4379 }, { "epoch": 1.6762342135476462, "grad_norm": 0.5706935524940491, "learning_rate": 1.550057329571191e-05, "loss": 0.6797, "step": 4380 }, { "epoch": 1.6766169154228856, "grad_norm": 0.4935586750507355, "learning_rate": 1.5498502876168566e-05, "loss": 0.5649, "step": 4381 }, { "epoch": 1.6769996172981247, "grad_norm": 0.5674734115600586, "learning_rate": 1.5496432118727453e-05, "loss": 0.6417, "step": 4382 }, { "epoch": 1.677382319173364, "grad_norm": 0.5231019258499146, "learning_rate": 1.5494361023515815e-05, "loss": 0.6915, "step": 4383 }, { "epoch": 1.6777650210486033, "grad_norm": 0.5769078135490417, "learning_rate": 1.5492289590660937e-05, "loss": 0.6315, "step": 4384 }, { "epoch": 1.6781477229238422, "grad_norm": 0.5036982893943787, "learning_rate": 1.5490217820290104e-05, "loss": 0.5584, "step": 4385 }, { "epoch": 1.6785304247990815, "grad_norm": 0.5370412468910217, "learning_rate": 1.548814571253064e-05, "loss": 0.6688, "step": 4386 }, { "epoch": 1.6789131266743207, "grad_norm": 0.5368664860725403, "learning_rate": 1.5486073267509874e-05, "loss": 0.6698, "step": 4387 }, { "epoch": 1.6792958285495598, "grad_norm": 0.5188376903533936, "learning_rate": 1.548400048535517e-05, "loss": 0.5805, "step": 4388 }, { "epoch": 1.6796785304247992, "grad_norm": 0.6235090494155884, "learning_rate": 1.5481927366193904e-05, "loss": 0.6432, "step": 4389 }, { "epoch": 1.6800612323000381, "grad_norm": 0.5031155943870544, "learning_rate": 1.5479853910153475e-05, "loss": 0.6819, "step": 4390 }, { "epoch": 1.6804439341752775, "grad_norm": 0.6022192239761353, "learning_rate": 1.5477780117361302e-05, "loss": 0.6627, "step": 4391 }, { "epoch": 1.6808266360505166, "grad_norm": 0.5620031356811523, "learning_rate": 1.5475705987944822e-05, "loss": 0.627, "step": 4392 }, { "epoch": 1.6812093379257558, "grad_norm": 0.5478487610816956, "learning_rate": 1.54736315220315e-05, "loss": 0.7438, "step": 4393 }, { "epoch": 1.6815920398009951, "grad_norm": 0.5554544925689697, "learning_rate": 1.5471556719748814e-05, "loss": 0.6732, "step": 4394 }, { "epoch": 1.681974741676234, "grad_norm": 0.5851149559020996, "learning_rate": 1.5469481581224274e-05, "loss": 0.6107, "step": 4395 }, { "epoch": 1.6823574435514734, "grad_norm": 0.4861753582954407, "learning_rate": 1.5467406106585388e-05, "loss": 0.5907, "step": 4396 }, { "epoch": 1.6827401454267126, "grad_norm": 0.5862606167793274, "learning_rate": 1.5465330295959717e-05, "loss": 0.683, "step": 4397 }, { "epoch": 1.6831228473019517, "grad_norm": 0.5952194929122925, "learning_rate": 1.5463254149474812e-05, "loss": 0.6353, "step": 4398 }, { "epoch": 1.683505549177191, "grad_norm": 0.7281048893928528, "learning_rate": 1.5461177667258262e-05, "loss": 0.6319, "step": 4399 }, { "epoch": 1.68388825105243, "grad_norm": 0.6419719457626343, "learning_rate": 1.545910084943768e-05, "loss": 0.6764, "step": 4400 }, { "epoch": 1.6842709529276694, "grad_norm": 0.6011815071105957, "learning_rate": 1.545702369614068e-05, "loss": 0.6617, "step": 4401 }, { "epoch": 1.6846536548029085, "grad_norm": 0.5798829197883606, "learning_rate": 1.5454946207494913e-05, "loss": 0.7124, "step": 4402 }, { "epoch": 1.6850363566781477, "grad_norm": 0.5074421167373657, "learning_rate": 1.5452868383628047e-05, "loss": 0.6192, "step": 4403 }, { "epoch": 1.685419058553387, "grad_norm": 0.5076141357421875, "learning_rate": 1.5450790224667768e-05, "loss": 0.6435, "step": 4404 }, { "epoch": 1.685801760428626, "grad_norm": 0.5411178469657898, "learning_rate": 1.544871173074179e-05, "loss": 0.6581, "step": 4405 }, { "epoch": 1.6861844623038653, "grad_norm": 0.5375654697418213, "learning_rate": 1.5446632901977836e-05, "loss": 0.605, "step": 4406 }, { "epoch": 1.6865671641791045, "grad_norm": 0.5569940805435181, "learning_rate": 1.544455373850366e-05, "loss": 0.5989, "step": 4407 }, { "epoch": 1.6869498660543436, "grad_norm": 0.509524405002594, "learning_rate": 1.5442474240447028e-05, "loss": 0.5846, "step": 4408 }, { "epoch": 1.687332567929583, "grad_norm": 0.5708227753639221, "learning_rate": 1.5440394407935732e-05, "loss": 0.6067, "step": 4409 }, { "epoch": 1.687715269804822, "grad_norm": 0.5329142212867737, "learning_rate": 1.5438314241097584e-05, "loss": 0.5888, "step": 4410 }, { "epoch": 1.6880979716800613, "grad_norm": 0.5215886831283569, "learning_rate": 1.543623374006042e-05, "loss": 0.6367, "step": 4411 }, { "epoch": 1.6884806735553004, "grad_norm": 0.4949476420879364, "learning_rate": 1.5434152904952085e-05, "loss": 0.5603, "step": 4412 }, { "epoch": 1.6888633754305395, "grad_norm": 0.5428240299224854, "learning_rate": 1.5432071735900458e-05, "loss": 0.6993, "step": 4413 }, { "epoch": 1.689246077305779, "grad_norm": 0.5335379242897034, "learning_rate": 1.5429990233033427e-05, "loss": 0.6397, "step": 4414 }, { "epoch": 1.6896287791810178, "grad_norm": 0.5573947429656982, "learning_rate": 1.5427908396478908e-05, "loss": 0.6732, "step": 4415 }, { "epoch": 1.6900114810562572, "grad_norm": 0.5351755619049072, "learning_rate": 1.5425826226364837e-05, "loss": 0.716, "step": 4416 }, { "epoch": 1.6903941829314963, "grad_norm": 0.5323948264122009, "learning_rate": 1.5423743722819167e-05, "loss": 0.6088, "step": 4417 }, { "epoch": 1.6907768848067355, "grad_norm": 0.513859748840332, "learning_rate": 1.5421660885969875e-05, "loss": 0.7175, "step": 4418 }, { "epoch": 1.6911595866819749, "grad_norm": 0.5497235059738159, "learning_rate": 1.5419577715944957e-05, "loss": 0.6301, "step": 4419 }, { "epoch": 1.6915422885572138, "grad_norm": 0.5005728006362915, "learning_rate": 1.5417494212872427e-05, "loss": 0.6415, "step": 4420 }, { "epoch": 1.6919249904324531, "grad_norm": 0.5932610630989075, "learning_rate": 1.541541037688033e-05, "loss": 0.6041, "step": 4421 }, { "epoch": 1.6923076923076923, "grad_norm": 0.5631845593452454, "learning_rate": 1.541332620809671e-05, "loss": 0.7217, "step": 4422 }, { "epoch": 1.6926903941829314, "grad_norm": 0.5202962756156921, "learning_rate": 1.5411241706649655e-05, "loss": 0.6942, "step": 4423 }, { "epoch": 1.6930730960581708, "grad_norm": 0.6189414262771606, "learning_rate": 1.540915687266726e-05, "loss": 0.7385, "step": 4424 }, { "epoch": 1.6934557979334097, "grad_norm": 0.5542664527893066, "learning_rate": 1.5407071706277638e-05, "loss": 0.5778, "step": 4425 }, { "epoch": 1.693838499808649, "grad_norm": 0.8870189785957336, "learning_rate": 1.5404986207608938e-05, "loss": 0.6187, "step": 4426 }, { "epoch": 1.6942212016838882, "grad_norm": 0.5069278478622437, "learning_rate": 1.5402900376789316e-05, "loss": 0.6551, "step": 4427 }, { "epoch": 1.6946039035591274, "grad_norm": 0.5511771440505981, "learning_rate": 1.5400814213946948e-05, "loss": 0.6392, "step": 4428 }, { "epoch": 1.6949866054343667, "grad_norm": 0.5597057342529297, "learning_rate": 1.539872771921004e-05, "loss": 0.6998, "step": 4429 }, { "epoch": 1.6953693073096057, "grad_norm": 0.6252185702323914, "learning_rate": 1.5396640892706807e-05, "loss": 0.6429, "step": 4430 }, { "epoch": 1.695752009184845, "grad_norm": 0.5011460185050964, "learning_rate": 1.53945537345655e-05, "loss": 0.6811, "step": 4431 }, { "epoch": 1.6961347110600842, "grad_norm": 0.5398042798042297, "learning_rate": 1.5392466244914368e-05, "loss": 0.6731, "step": 4432 }, { "epoch": 1.6965174129353233, "grad_norm": 0.5699660181999207, "learning_rate": 1.5390378423881695e-05, "loss": 0.664, "step": 4433 }, { "epoch": 1.6969001148105627, "grad_norm": 0.5084646940231323, "learning_rate": 1.5388290271595794e-05, "loss": 0.6781, "step": 4434 }, { "epoch": 1.6972828166858016, "grad_norm": 0.5539363026618958, "learning_rate": 1.538620178818498e-05, "loss": 0.6072, "step": 4435 }, { "epoch": 1.697665518561041, "grad_norm": 0.5551081895828247, "learning_rate": 1.538411297377759e-05, "loss": 0.6731, "step": 4436 }, { "epoch": 1.6980482204362801, "grad_norm": 0.550974428653717, "learning_rate": 1.5382023828501997e-05, "loss": 0.6271, "step": 4437 }, { "epoch": 1.6984309223115193, "grad_norm": 0.5883692502975464, "learning_rate": 1.537993435248658e-05, "loss": 0.6342, "step": 4438 }, { "epoch": 1.6988136241867586, "grad_norm": 0.5694878101348877, "learning_rate": 1.5377844545859745e-05, "loss": 0.7231, "step": 4439 }, { "epoch": 1.6991963260619976, "grad_norm": 0.5235363245010376, "learning_rate": 1.5375754408749915e-05, "loss": 0.6735, "step": 4440 }, { "epoch": 1.699579027937237, "grad_norm": 0.5516543388366699, "learning_rate": 1.5373663941285536e-05, "loss": 0.6581, "step": 4441 }, { "epoch": 1.699961729812476, "grad_norm": 0.551139235496521, "learning_rate": 1.537157314359507e-05, "loss": 0.6466, "step": 4442 }, { "epoch": 1.7003444316877152, "grad_norm": 0.5572298765182495, "learning_rate": 1.5369482015807e-05, "loss": 0.6801, "step": 4443 }, { "epoch": 1.7007271335629546, "grad_norm": 0.5153892040252686, "learning_rate": 1.5367390558049843e-05, "loss": 0.6214, "step": 4444 }, { "epoch": 1.7011098354381935, "grad_norm": 0.4881877899169922, "learning_rate": 1.5365298770452107e-05, "loss": 0.5841, "step": 4445 }, { "epoch": 1.7014925373134329, "grad_norm": 0.5396494269371033, "learning_rate": 1.5363206653142357e-05, "loss": 0.6823, "step": 4446 }, { "epoch": 1.701875239188672, "grad_norm": 0.6144421100616455, "learning_rate": 1.5361114206249147e-05, "loss": 0.7819, "step": 4447 }, { "epoch": 1.7022579410639112, "grad_norm": 0.5415388941764832, "learning_rate": 1.5359021429901063e-05, "loss": 0.652, "step": 4448 }, { "epoch": 1.7026406429391505, "grad_norm": 0.5244187712669373, "learning_rate": 1.5356928324226718e-05, "loss": 0.6374, "step": 4449 }, { "epoch": 1.7030233448143894, "grad_norm": 0.5393893718719482, "learning_rate": 1.535483488935474e-05, "loss": 0.5866, "step": 4450 }, { "epoch": 1.7034060466896288, "grad_norm": 0.49182310700416565, "learning_rate": 1.5352741125413763e-05, "loss": 0.656, "step": 4451 }, { "epoch": 1.703788748564868, "grad_norm": 0.5636366009712219, "learning_rate": 1.5350647032532472e-05, "loss": 0.6394, "step": 4452 }, { "epoch": 1.704171450440107, "grad_norm": 0.5422935485839844, "learning_rate": 1.534855261083954e-05, "loss": 0.6896, "step": 4453 }, { "epoch": 1.7045541523153465, "grad_norm": 0.5551270246505737, "learning_rate": 1.5346457860463687e-05, "loss": 0.7015, "step": 4454 }, { "epoch": 1.7049368541905854, "grad_norm": 0.5393613576889038, "learning_rate": 1.5344362781533632e-05, "loss": 0.7091, "step": 4455 }, { "epoch": 1.7053195560658247, "grad_norm": 0.5268805027008057, "learning_rate": 1.5342267374178126e-05, "loss": 0.6262, "step": 4456 }, { "epoch": 1.705702257941064, "grad_norm": 0.5767546892166138, "learning_rate": 1.534017163852594e-05, "loss": 0.6292, "step": 4457 }, { "epoch": 1.706084959816303, "grad_norm": 0.5949785709381104, "learning_rate": 1.533807557470586e-05, "loss": 0.6834, "step": 4458 }, { "epoch": 1.7064676616915424, "grad_norm": 0.5090019106864929, "learning_rate": 1.5335979182846698e-05, "loss": 0.6899, "step": 4459 }, { "epoch": 1.7068503635667813, "grad_norm": 0.5293718576431274, "learning_rate": 1.5333882463077276e-05, "loss": 0.6224, "step": 4460 }, { "epoch": 1.7072330654420207, "grad_norm": 0.5522685647010803, "learning_rate": 1.5331785415526446e-05, "loss": 0.6818, "step": 4461 }, { "epoch": 1.7076157673172598, "grad_norm": 0.5499504208564758, "learning_rate": 1.5329688040323082e-05, "loss": 0.6552, "step": 4462 }, { "epoch": 1.707998469192499, "grad_norm": 0.5828074216842651, "learning_rate": 1.532759033759607e-05, "loss": 0.7009, "step": 4463 }, { "epoch": 1.7083811710677383, "grad_norm": 0.5790151953697205, "learning_rate": 1.5325492307474314e-05, "loss": 0.6867, "step": 4464 }, { "epoch": 1.7087638729429773, "grad_norm": 0.5574508905410767, "learning_rate": 1.5323393950086757e-05, "loss": 0.6671, "step": 4465 }, { "epoch": 1.7091465748182166, "grad_norm": 0.6547495722770691, "learning_rate": 1.5321295265562334e-05, "loss": 0.6175, "step": 4466 }, { "epoch": 1.7095292766934558, "grad_norm": 0.5064290761947632, "learning_rate": 1.5319196254030026e-05, "loss": 0.623, "step": 4467 }, { "epoch": 1.709911978568695, "grad_norm": 0.5379889607429504, "learning_rate": 1.5317096915618813e-05, "loss": 0.6272, "step": 4468 }, { "epoch": 1.7102946804439343, "grad_norm": 0.5336356163024902, "learning_rate": 1.5314997250457712e-05, "loss": 0.6234, "step": 4469 }, { "epoch": 1.7106773823191732, "grad_norm": 0.5653278231620789, "learning_rate": 1.5312897258675753e-05, "loss": 0.6408, "step": 4470 }, { "epoch": 1.7110600841944126, "grad_norm": 0.4893880784511566, "learning_rate": 1.5310796940401984e-05, "loss": 0.6723, "step": 4471 }, { "epoch": 1.7114427860696517, "grad_norm": 0.5243481397628784, "learning_rate": 1.5308696295765474e-05, "loss": 0.6833, "step": 4472 }, { "epoch": 1.7118254879448909, "grad_norm": 0.5411103367805481, "learning_rate": 1.5306595324895316e-05, "loss": 0.6243, "step": 4473 }, { "epoch": 1.7122081898201302, "grad_norm": 0.5438482165336609, "learning_rate": 1.530449402792062e-05, "loss": 0.6724, "step": 4474 }, { "epoch": 1.7125908916953692, "grad_norm": 0.515373706817627, "learning_rate": 1.5302392404970517e-05, "loss": 0.689, "step": 4475 }, { "epoch": 1.7129735935706085, "grad_norm": 0.5901016592979431, "learning_rate": 1.5300290456174153e-05, "loss": 0.6861, "step": 4476 }, { "epoch": 1.7133562954458477, "grad_norm": 0.5115534663200378, "learning_rate": 1.5298188181660702e-05, "loss": 0.6869, "step": 4477 }, { "epoch": 1.7137389973210868, "grad_norm": 0.5548985004425049, "learning_rate": 1.5296085581559354e-05, "loss": 0.6624, "step": 4478 }, { "epoch": 1.7141216991963262, "grad_norm": 0.6052834987640381, "learning_rate": 1.5293982655999317e-05, "loss": 0.6599, "step": 4479 }, { "epoch": 1.714504401071565, "grad_norm": 0.4986618161201477, "learning_rate": 1.5291879405109825e-05, "loss": 0.7475, "step": 4480 }, { "epoch": 1.7148871029468045, "grad_norm": 0.52466881275177, "learning_rate": 1.528977582902013e-05, "loss": 0.6819, "step": 4481 }, { "epoch": 1.7152698048220436, "grad_norm": 0.5386791825294495, "learning_rate": 1.5287671927859494e-05, "loss": 0.5707, "step": 4482 }, { "epoch": 1.7156525066972828, "grad_norm": 0.582141637802124, "learning_rate": 1.5285567701757216e-05, "loss": 0.6834, "step": 4483 }, { "epoch": 1.7160352085725221, "grad_norm": 0.5797891020774841, "learning_rate": 1.5283463150842604e-05, "loss": 0.7201, "step": 4484 }, { "epoch": 1.716417910447761, "grad_norm": 0.5579447150230408, "learning_rate": 1.5281358275244987e-05, "loss": 0.6373, "step": 4485 }, { "epoch": 1.7168006123230004, "grad_norm": 0.5196929574012756, "learning_rate": 1.5279253075093712e-05, "loss": 0.7019, "step": 4486 }, { "epoch": 1.7171833141982396, "grad_norm": 0.6104020476341248, "learning_rate": 1.5277147550518156e-05, "loss": 0.698, "step": 4487 }, { "epoch": 1.7175660160734787, "grad_norm": 0.5289765000343323, "learning_rate": 1.5275041701647705e-05, "loss": 0.6174, "step": 4488 }, { "epoch": 1.717948717948718, "grad_norm": 0.5654638409614563, "learning_rate": 1.527293552861177e-05, "loss": 0.6864, "step": 4489 }, { "epoch": 1.718331419823957, "grad_norm": 0.4764801859855652, "learning_rate": 1.5270829031539782e-05, "loss": 0.5184, "step": 4490 }, { "epoch": 1.7187141216991964, "grad_norm": 0.6064857840538025, "learning_rate": 1.526872221056119e-05, "loss": 0.6226, "step": 4491 }, { "epoch": 1.7190968235744355, "grad_norm": 0.5518608689308167, "learning_rate": 1.5266615065805463e-05, "loss": 0.6722, "step": 4492 }, { "epoch": 1.7194795254496746, "grad_norm": 0.519381046295166, "learning_rate": 1.526450759740209e-05, "loss": 0.6801, "step": 4493 }, { "epoch": 1.719862227324914, "grad_norm": 0.5144667029380798, "learning_rate": 1.526239980548059e-05, "loss": 0.656, "step": 4494 }, { "epoch": 1.720244929200153, "grad_norm": 0.5680873990058899, "learning_rate": 1.5260291690170474e-05, "loss": 0.7406, "step": 4495 }, { "epoch": 1.7206276310753923, "grad_norm": 0.5153980255126953, "learning_rate": 1.525818325160131e-05, "loss": 0.7261, "step": 4496 }, { "epoch": 1.7210103329506314, "grad_norm": 0.5371862649917603, "learning_rate": 1.5256074489902656e-05, "loss": 0.6348, "step": 4497 }, { "epoch": 1.7213930348258706, "grad_norm": 0.552861213684082, "learning_rate": 1.5253965405204102e-05, "loss": 0.7018, "step": 4498 }, { "epoch": 1.72177573670111, "grad_norm": 0.52130526304245, "learning_rate": 1.525185599763526e-05, "loss": 0.6041, "step": 4499 }, { "epoch": 1.7221584385763489, "grad_norm": 0.5398353338241577, "learning_rate": 1.524974626732576e-05, "loss": 0.6262, "step": 4500 }, { "epoch": 1.7225411404515882, "grad_norm": 0.5053167343139648, "learning_rate": 1.5247636214405248e-05, "loss": 0.591, "step": 4501 }, { "epoch": 1.7229238423268274, "grad_norm": 0.4943530559539795, "learning_rate": 1.5245525839003392e-05, "loss": 0.6743, "step": 4502 }, { "epoch": 1.7233065442020665, "grad_norm": 0.4800105392932892, "learning_rate": 1.524341514124988e-05, "loss": 0.6545, "step": 4503 }, { "epoch": 1.723689246077306, "grad_norm": 0.7421761155128479, "learning_rate": 1.5241304121274425e-05, "loss": 0.6291, "step": 4504 }, { "epoch": 1.7240719479525448, "grad_norm": 0.5302397608757019, "learning_rate": 1.5239192779206748e-05, "loss": 0.6807, "step": 4505 }, { "epoch": 1.7244546498277842, "grad_norm": 0.578159511089325, "learning_rate": 1.5237081115176603e-05, "loss": 0.624, "step": 4506 }, { "epoch": 1.7248373517030233, "grad_norm": 0.6348825097084045, "learning_rate": 1.523496912931375e-05, "loss": 0.7205, "step": 4507 }, { "epoch": 1.7252200535782625, "grad_norm": 0.5663437843322754, "learning_rate": 1.5232856821747981e-05, "loss": 0.7044, "step": 4508 }, { "epoch": 1.7256027554535018, "grad_norm": 0.5744450688362122, "learning_rate": 1.5230744192609104e-05, "loss": 0.7369, "step": 4509 }, { "epoch": 1.7259854573287408, "grad_norm": 0.5574772953987122, "learning_rate": 1.522863124202694e-05, "loss": 0.6346, "step": 4510 }, { "epoch": 1.7263681592039801, "grad_norm": 0.5609472990036011, "learning_rate": 1.5226517970131345e-05, "loss": 0.6305, "step": 4511 }, { "epoch": 1.7267508610792193, "grad_norm": 0.5682723522186279, "learning_rate": 1.5224404377052178e-05, "loss": 0.6023, "step": 4512 }, { "epoch": 1.7271335629544584, "grad_norm": 0.5626649260520935, "learning_rate": 1.5222290462919324e-05, "loss": 0.601, "step": 4513 }, { "epoch": 1.7275162648296978, "grad_norm": 0.5391250848770142, "learning_rate": 1.5220176227862693e-05, "loss": 0.6514, "step": 4514 }, { "epoch": 1.7278989667049367, "grad_norm": 0.5662454962730408, "learning_rate": 1.5218061672012212e-05, "loss": 0.6392, "step": 4515 }, { "epoch": 1.728281668580176, "grad_norm": 0.5804277658462524, "learning_rate": 1.5215946795497818e-05, "loss": 0.6616, "step": 4516 }, { "epoch": 1.7286643704554152, "grad_norm": 0.5216949582099915, "learning_rate": 1.5213831598449487e-05, "loss": 0.6628, "step": 4517 }, { "epoch": 1.7290470723306544, "grad_norm": 0.5748910903930664, "learning_rate": 1.521171608099719e-05, "loss": 0.6202, "step": 4518 }, { "epoch": 1.7294297742058937, "grad_norm": 0.5259443521499634, "learning_rate": 1.5209600243270943e-05, "loss": 0.6432, "step": 4519 }, { "epoch": 1.7298124760811326, "grad_norm": 0.5682885050773621, "learning_rate": 1.5207484085400766e-05, "loss": 0.6469, "step": 4520 }, { "epoch": 1.730195177956372, "grad_norm": 0.61273592710495, "learning_rate": 1.5205367607516699e-05, "loss": 0.6673, "step": 4521 }, { "epoch": 1.7305778798316112, "grad_norm": 0.5704248547554016, "learning_rate": 1.520325080974881e-05, "loss": 0.6072, "step": 4522 }, { "epoch": 1.7309605817068503, "grad_norm": 0.5671982765197754, "learning_rate": 1.5201133692227181e-05, "loss": 0.5982, "step": 4523 }, { "epoch": 1.7313432835820897, "grad_norm": 0.5220487117767334, "learning_rate": 1.519901625508191e-05, "loss": 0.6129, "step": 4524 }, { "epoch": 1.7317259854573286, "grad_norm": 0.4975503981113434, "learning_rate": 1.5196898498443129e-05, "loss": 0.636, "step": 4525 }, { "epoch": 1.732108687332568, "grad_norm": 0.516268789768219, "learning_rate": 1.5194780422440971e-05, "loss": 0.7049, "step": 4526 }, { "epoch": 1.732491389207807, "grad_norm": 0.5469555854797363, "learning_rate": 1.5192662027205601e-05, "loss": 0.6633, "step": 4527 }, { "epoch": 1.7328740910830462, "grad_norm": 0.539199948310852, "learning_rate": 1.5190543312867198e-05, "loss": 0.664, "step": 4528 }, { "epoch": 1.7332567929582856, "grad_norm": 0.6062024831771851, "learning_rate": 1.5188424279555964e-05, "loss": 0.6093, "step": 4529 }, { "epoch": 1.7336394948335245, "grad_norm": 0.5126818418502808, "learning_rate": 1.5186304927402122e-05, "loss": 0.7419, "step": 4530 }, { "epoch": 1.734022196708764, "grad_norm": 0.6182147860527039, "learning_rate": 1.5184185256535907e-05, "loss": 0.6839, "step": 4531 }, { "epoch": 1.734404898584003, "grad_norm": 0.5249072909355164, "learning_rate": 1.5182065267087582e-05, "loss": 0.7284, "step": 4532 }, { "epoch": 1.7347876004592422, "grad_norm": 0.5319894552230835, "learning_rate": 1.5179944959187423e-05, "loss": 0.6679, "step": 4533 }, { "epoch": 1.7351703023344816, "grad_norm": 0.6359682083129883, "learning_rate": 1.5177824332965729e-05, "loss": 0.6595, "step": 4534 }, { "epoch": 1.7355530042097205, "grad_norm": 0.5481525659561157, "learning_rate": 1.5175703388552825e-05, "loss": 0.658, "step": 4535 }, { "epoch": 1.7359357060849598, "grad_norm": 0.5442440509796143, "learning_rate": 1.517358212607904e-05, "loss": 0.6554, "step": 4536 }, { "epoch": 1.736318407960199, "grad_norm": 0.5501551628112793, "learning_rate": 1.5171460545674734e-05, "loss": 0.6854, "step": 4537 }, { "epoch": 1.7367011098354381, "grad_norm": 0.5824860334396362, "learning_rate": 1.5169338647470287e-05, "loss": 0.6365, "step": 4538 }, { "epoch": 1.7370838117106775, "grad_norm": 0.600202739238739, "learning_rate": 1.5167216431596089e-05, "loss": 0.7116, "step": 4539 }, { "epoch": 1.7374665135859164, "grad_norm": 0.5443181991577148, "learning_rate": 1.516509389818256e-05, "loss": 0.7651, "step": 4540 }, { "epoch": 1.7378492154611558, "grad_norm": 0.5352132320404053, "learning_rate": 1.516297104736014e-05, "loss": 0.6389, "step": 4541 }, { "epoch": 1.738231917336395, "grad_norm": 0.56842440366745, "learning_rate": 1.5160847879259275e-05, "loss": 0.6422, "step": 4542 }, { "epoch": 1.738614619211634, "grad_norm": 0.5170071721076965, "learning_rate": 1.5158724394010442e-05, "loss": 0.6468, "step": 4543 }, { "epoch": 1.7389973210868734, "grad_norm": 0.5316141247749329, "learning_rate": 1.5156600591744137e-05, "loss": 0.6241, "step": 4544 }, { "epoch": 1.7393800229621124, "grad_norm": 0.49250906705856323, "learning_rate": 1.5154476472590874e-05, "loss": 0.6364, "step": 4545 }, { "epoch": 1.7397627248373517, "grad_norm": 0.5705097913742065, "learning_rate": 1.5152352036681186e-05, "loss": 0.642, "step": 4546 }, { "epoch": 1.7401454267125909, "grad_norm": 0.5337973237037659, "learning_rate": 1.515022728414562e-05, "loss": 0.6965, "step": 4547 }, { "epoch": 1.74052812858783, "grad_norm": 0.5740866661071777, "learning_rate": 1.5148102215114753e-05, "loss": 0.5863, "step": 4548 }, { "epoch": 1.7409108304630694, "grad_norm": 0.6015119552612305, "learning_rate": 1.5145976829719173e-05, "loss": 0.6231, "step": 4549 }, { "epoch": 1.7412935323383083, "grad_norm": 0.5468730926513672, "learning_rate": 1.5143851128089493e-05, "loss": 0.5876, "step": 4550 }, { "epoch": 1.7416762342135477, "grad_norm": 0.5711395740509033, "learning_rate": 1.5141725110356344e-05, "loss": 0.6856, "step": 4551 }, { "epoch": 1.7420589360887868, "grad_norm": 0.5416901111602783, "learning_rate": 1.513959877665037e-05, "loss": 0.6812, "step": 4552 }, { "epoch": 1.742441637964026, "grad_norm": 0.4900062680244446, "learning_rate": 1.5137472127102246e-05, "loss": 0.6538, "step": 4553 }, { "epoch": 1.7428243398392653, "grad_norm": 0.4995408058166504, "learning_rate": 1.513534516184266e-05, "loss": 0.5883, "step": 4554 }, { "epoch": 1.7432070417145042, "grad_norm": 0.5492715239524841, "learning_rate": 1.5133217881002313e-05, "loss": 0.6258, "step": 4555 }, { "epoch": 1.7435897435897436, "grad_norm": 0.5448857545852661, "learning_rate": 1.5131090284711942e-05, "loss": 0.6817, "step": 4556 }, { "epoch": 1.7439724454649828, "grad_norm": 0.5655015110969543, "learning_rate": 1.5128962373102285e-05, "loss": 0.6561, "step": 4557 }, { "epoch": 1.744355147340222, "grad_norm": 0.5359886884689331, "learning_rate": 1.5126834146304115e-05, "loss": 0.591, "step": 4558 }, { "epoch": 1.7447378492154613, "grad_norm": 0.5810126066207886, "learning_rate": 1.5124705604448208e-05, "loss": 0.7233, "step": 4559 }, { "epoch": 1.7451205510907002, "grad_norm": 0.5753326416015625, "learning_rate": 1.5122576747665379e-05, "loss": 0.6095, "step": 4560 }, { "epoch": 1.7455032529659396, "grad_norm": 0.5852716565132141, "learning_rate": 1.5120447576086446e-05, "loss": 0.7023, "step": 4561 }, { "epoch": 1.7458859548411787, "grad_norm": 0.539341390132904, "learning_rate": 1.5118318089842254e-05, "loss": 0.6262, "step": 4562 }, { "epoch": 1.7462686567164178, "grad_norm": 0.5289041996002197, "learning_rate": 1.5116188289063663e-05, "loss": 0.6606, "step": 4563 }, { "epoch": 1.7466513585916572, "grad_norm": 0.5784135460853577, "learning_rate": 1.5114058173881554e-05, "loss": 0.6802, "step": 4564 }, { "epoch": 1.7470340604668961, "grad_norm": 0.5072954297065735, "learning_rate": 1.5111927744426836e-05, "loss": 0.6906, "step": 4565 }, { "epoch": 1.7474167623421355, "grad_norm": 0.5606713891029358, "learning_rate": 1.5109797000830427e-05, "loss": 0.6306, "step": 4566 }, { "epoch": 1.7477994642173746, "grad_norm": 0.6208239197731018, "learning_rate": 1.510766594322326e-05, "loss": 0.7185, "step": 4567 }, { "epoch": 1.7481821660926138, "grad_norm": 0.5423541069030762, "learning_rate": 1.51055345717363e-05, "loss": 0.6509, "step": 4568 }, { "epoch": 1.7485648679678532, "grad_norm": 0.5558854341506958, "learning_rate": 1.5103402886500526e-05, "loss": 0.6535, "step": 4569 }, { "epoch": 1.748947569843092, "grad_norm": 0.5768556594848633, "learning_rate": 1.5101270887646935e-05, "loss": 0.677, "step": 4570 }, { "epoch": 1.7493302717183314, "grad_norm": 0.5476897954940796, "learning_rate": 1.5099138575306543e-05, "loss": 0.695, "step": 4571 }, { "epoch": 1.7497129735935706, "grad_norm": 0.6125562191009521, "learning_rate": 1.5097005949610385e-05, "loss": 0.6496, "step": 4572 }, { "epoch": 1.7500956754688097, "grad_norm": 0.5820769667625427, "learning_rate": 1.509487301068952e-05, "loss": 0.6434, "step": 4573 }, { "epoch": 1.750478377344049, "grad_norm": 0.5395642518997192, "learning_rate": 1.5092739758675019e-05, "loss": 0.6225, "step": 4574 }, { "epoch": 1.750861079219288, "grad_norm": 0.608199954032898, "learning_rate": 1.509060619369798e-05, "loss": 0.6354, "step": 4575 }, { "epoch": 1.7512437810945274, "grad_norm": 0.5223132967948914, "learning_rate": 1.5088472315889516e-05, "loss": 0.569, "step": 4576 }, { "epoch": 1.7516264829697665, "grad_norm": 0.5125810503959656, "learning_rate": 1.5086338125380757e-05, "loss": 0.6419, "step": 4577 }, { "epoch": 1.7520091848450057, "grad_norm": 0.5299440622329712, "learning_rate": 1.5084203622302855e-05, "loss": 0.7133, "step": 4578 }, { "epoch": 1.752391886720245, "grad_norm": 0.5204107761383057, "learning_rate": 1.508206880678698e-05, "loss": 0.6985, "step": 4579 }, { "epoch": 1.752774588595484, "grad_norm": 0.5406456589698792, "learning_rate": 1.5079933678964327e-05, "loss": 0.6916, "step": 4580 }, { "epoch": 1.7531572904707233, "grad_norm": 0.5004727840423584, "learning_rate": 1.50777982389661e-05, "loss": 0.6352, "step": 4581 }, { "epoch": 1.7535399923459625, "grad_norm": 0.5842272639274597, "learning_rate": 1.5075662486923532e-05, "loss": 0.6302, "step": 4582 }, { "epoch": 1.7539226942212016, "grad_norm": 0.5264187455177307, "learning_rate": 1.5073526422967866e-05, "loss": 0.6384, "step": 4583 }, { "epoch": 1.754305396096441, "grad_norm": 0.657837986946106, "learning_rate": 1.5071390047230371e-05, "loss": 0.6307, "step": 4584 }, { "epoch": 1.75468809797168, "grad_norm": 0.5419890284538269, "learning_rate": 1.5069253359842337e-05, "loss": 0.6585, "step": 4585 }, { "epoch": 1.7550707998469193, "grad_norm": 0.5207778215408325, "learning_rate": 1.5067116360935066e-05, "loss": 0.6586, "step": 4586 }, { "epoch": 1.7554535017221584, "grad_norm": 0.5605060458183289, "learning_rate": 1.5064979050639878e-05, "loss": 0.7275, "step": 4587 }, { "epoch": 1.7558362035973976, "grad_norm": 0.5512086153030396, "learning_rate": 1.5062841429088122e-05, "loss": 0.6866, "step": 4588 }, { "epoch": 1.756218905472637, "grad_norm": 0.48615652322769165, "learning_rate": 1.5060703496411158e-05, "loss": 0.6064, "step": 4589 }, { "epoch": 1.7566016073478758, "grad_norm": 0.5222479701042175, "learning_rate": 1.5058565252740368e-05, "loss": 0.6395, "step": 4590 }, { "epoch": 1.7569843092231152, "grad_norm": 0.5536056160926819, "learning_rate": 1.5056426698207156e-05, "loss": 0.6395, "step": 4591 }, { "epoch": 1.7573670110983544, "grad_norm": 0.5524901151657104, "learning_rate": 1.5054287832942939e-05, "loss": 0.6473, "step": 4592 }, { "epoch": 1.7577497129735935, "grad_norm": 0.5296889543533325, "learning_rate": 1.5052148657079155e-05, "loss": 0.696, "step": 4593 }, { "epoch": 1.7581324148488329, "grad_norm": 0.5379515290260315, "learning_rate": 1.5050009170747264e-05, "loss": 0.7104, "step": 4594 }, { "epoch": 1.7585151167240718, "grad_norm": 0.5245203375816345, "learning_rate": 1.5047869374078742e-05, "loss": 0.6276, "step": 4595 }, { "epoch": 1.7588978185993112, "grad_norm": 0.5919705033302307, "learning_rate": 1.504572926720509e-05, "loss": 0.6732, "step": 4596 }, { "epoch": 1.7592805204745503, "grad_norm": 0.5429064631462097, "learning_rate": 1.5043588850257815e-05, "loss": 0.6841, "step": 4597 }, { "epoch": 1.7596632223497894, "grad_norm": 0.5293561220169067, "learning_rate": 1.5041448123368454e-05, "loss": 0.6227, "step": 4598 }, { "epoch": 1.7600459242250288, "grad_norm": 0.4840903580188751, "learning_rate": 1.5039307086668565e-05, "loss": 0.6533, "step": 4599 }, { "epoch": 1.7604286261002677, "grad_norm": 0.5614187717437744, "learning_rate": 1.5037165740289716e-05, "loss": 0.6902, "step": 4600 }, { "epoch": 1.760811327975507, "grad_norm": 0.5410558581352234, "learning_rate": 1.5035024084363504e-05, "loss": 0.5352, "step": 4601 }, { "epoch": 1.7611940298507462, "grad_norm": 0.5267480611801147, "learning_rate": 1.503288211902153e-05, "loss": 0.6778, "step": 4602 }, { "epoch": 1.7615767317259854, "grad_norm": 0.5526008009910583, "learning_rate": 1.503073984439543e-05, "loss": 0.746, "step": 4603 }, { "epoch": 1.7619594336012248, "grad_norm": 0.4882066249847412, "learning_rate": 1.5028597260616853e-05, "loss": 0.6094, "step": 4604 }, { "epoch": 1.7623421354764637, "grad_norm": 0.5321102142333984, "learning_rate": 1.5026454367817467e-05, "loss": 0.6959, "step": 4605 }, { "epoch": 1.762724837351703, "grad_norm": 0.5434553623199463, "learning_rate": 1.5024311166128956e-05, "loss": 0.6971, "step": 4606 }, { "epoch": 1.7631075392269422, "grad_norm": 0.5128450989723206, "learning_rate": 1.5022167655683022e-05, "loss": 0.6602, "step": 4607 }, { "epoch": 1.7634902411021813, "grad_norm": 0.551014244556427, "learning_rate": 1.50200238366114e-05, "loss": 0.7159, "step": 4608 }, { "epoch": 1.7638729429774207, "grad_norm": 0.6586646437644958, "learning_rate": 1.501787970904582e-05, "loss": 0.6878, "step": 4609 }, { "epoch": 1.7642556448526596, "grad_norm": 0.5548067688941956, "learning_rate": 1.5015735273118056e-05, "loss": 0.6166, "step": 4610 }, { "epoch": 1.764638346727899, "grad_norm": 0.6895891427993774, "learning_rate": 1.5013590528959882e-05, "loss": 0.5932, "step": 4611 }, { "epoch": 1.7650210486031381, "grad_norm": 0.5802630186080933, "learning_rate": 1.5011445476703102e-05, "loss": 0.692, "step": 4612 }, { "epoch": 1.7654037504783773, "grad_norm": 0.5241475105285645, "learning_rate": 1.5009300116479535e-05, "loss": 0.5998, "step": 4613 }, { "epoch": 1.7657864523536166, "grad_norm": 0.5656006336212158, "learning_rate": 1.5007154448421018e-05, "loss": 0.6763, "step": 4614 }, { "epoch": 1.7661691542288556, "grad_norm": 0.5295542478561401, "learning_rate": 1.5005008472659407e-05, "loss": 0.7097, "step": 4615 }, { "epoch": 1.766551856104095, "grad_norm": 0.5251811146736145, "learning_rate": 1.5002862189326583e-05, "loss": 0.5259, "step": 4616 }, { "epoch": 1.766934557979334, "grad_norm": 0.5398306846618652, "learning_rate": 1.5000715598554434e-05, "loss": 0.6246, "step": 4617 }, { "epoch": 1.7673172598545732, "grad_norm": 0.49377432465553284, "learning_rate": 1.4998568700474878e-05, "loss": 0.7168, "step": 4618 }, { "epoch": 1.7676999617298126, "grad_norm": 0.545047402381897, "learning_rate": 1.4996421495219849e-05, "loss": 0.6499, "step": 4619 }, { "epoch": 1.7680826636050515, "grad_norm": 0.5349961519241333, "learning_rate": 1.4994273982921292e-05, "loss": 0.6313, "step": 4620 }, { "epoch": 1.7684653654802909, "grad_norm": 0.5691924691200256, "learning_rate": 1.4992126163711185e-05, "loss": 0.6638, "step": 4621 }, { "epoch": 1.76884806735553, "grad_norm": 0.5224207043647766, "learning_rate": 1.4989978037721513e-05, "loss": 0.6411, "step": 4622 }, { "epoch": 1.7692307692307692, "grad_norm": 0.5423957705497742, "learning_rate": 1.4987829605084284e-05, "loss": 0.7465, "step": 4623 }, { "epoch": 1.7696134711060085, "grad_norm": 0.5166990160942078, "learning_rate": 1.498568086593153e-05, "loss": 0.67, "step": 4624 }, { "epoch": 1.7699961729812475, "grad_norm": 0.5322198867797852, "learning_rate": 1.4983531820395287e-05, "loss": 0.6007, "step": 4625 }, { "epoch": 1.7703788748564868, "grad_norm": 0.49419349431991577, "learning_rate": 1.498138246860763e-05, "loss": 0.6473, "step": 4626 }, { "epoch": 1.770761576731726, "grad_norm": 0.533894956111908, "learning_rate": 1.4979232810700638e-05, "loss": 0.6678, "step": 4627 }, { "epoch": 1.771144278606965, "grad_norm": 0.5425860285758972, "learning_rate": 1.4977082846806408e-05, "loss": 0.704, "step": 4628 }, { "epoch": 1.7715269804822045, "grad_norm": 0.5651950836181641, "learning_rate": 1.497493257705707e-05, "loss": 0.6675, "step": 4629 }, { "epoch": 1.7719096823574434, "grad_norm": 0.5378401279449463, "learning_rate": 1.4972782001584759e-05, "loss": 0.7412, "step": 4630 }, { "epoch": 1.7722923842326828, "grad_norm": 0.5300345420837402, "learning_rate": 1.4970631120521635e-05, "loss": 0.6875, "step": 4631 }, { "epoch": 1.772675086107922, "grad_norm": 0.5530430674552917, "learning_rate": 1.4968479933999878e-05, "loss": 0.6534, "step": 4632 }, { "epoch": 1.773057787983161, "grad_norm": 0.5520253777503967, "learning_rate": 1.4966328442151677e-05, "loss": 0.7212, "step": 4633 }, { "epoch": 1.7734404898584004, "grad_norm": 0.5592941641807556, "learning_rate": 1.4964176645109252e-05, "loss": 0.6838, "step": 4634 }, { "epoch": 1.7738231917336393, "grad_norm": 0.5128092765808105, "learning_rate": 1.4962024543004839e-05, "loss": 0.6837, "step": 4635 }, { "epoch": 1.7742058936088787, "grad_norm": 0.5784416794776917, "learning_rate": 1.4959872135970683e-05, "loss": 0.6907, "step": 4636 }, { "epoch": 1.7745885954841178, "grad_norm": 0.5988723635673523, "learning_rate": 1.4957719424139063e-05, "loss": 0.6708, "step": 4637 }, { "epoch": 1.774971297359357, "grad_norm": 0.5401865839958191, "learning_rate": 1.4955566407642263e-05, "loss": 0.7175, "step": 4638 }, { "epoch": 1.7753539992345964, "grad_norm": 0.5023044943809509, "learning_rate": 1.4953413086612595e-05, "loss": 0.651, "step": 4639 }, { "epoch": 1.7757367011098353, "grad_norm": 0.5285412669181824, "learning_rate": 1.4951259461182389e-05, "loss": 0.6032, "step": 4640 }, { "epoch": 1.7761194029850746, "grad_norm": 0.5713176727294922, "learning_rate": 1.4949105531483983e-05, "loss": 0.6112, "step": 4641 }, { "epoch": 1.7765021048603138, "grad_norm": 0.5291022062301636, "learning_rate": 1.4946951297649748e-05, "loss": 0.6554, "step": 4642 }, { "epoch": 1.776884806735553, "grad_norm": 0.5271397233009338, "learning_rate": 1.4944796759812066e-05, "loss": 0.6171, "step": 4643 }, { "epoch": 1.7772675086107923, "grad_norm": 0.5401374697685242, "learning_rate": 1.4942641918103339e-05, "loss": 0.7494, "step": 4644 }, { "epoch": 1.7776502104860312, "grad_norm": 0.5439696311950684, "learning_rate": 1.4940486772655986e-05, "loss": 0.667, "step": 4645 }, { "epoch": 1.7780329123612706, "grad_norm": 0.5065123438835144, "learning_rate": 1.4938331323602451e-05, "loss": 0.6544, "step": 4646 }, { "epoch": 1.7784156142365097, "grad_norm": 0.49837177991867065, "learning_rate": 1.493617557107519e-05, "loss": 0.7433, "step": 4647 }, { "epoch": 1.7787983161117489, "grad_norm": 0.5310905575752258, "learning_rate": 1.4934019515206675e-05, "loss": 0.6584, "step": 4648 }, { "epoch": 1.7791810179869882, "grad_norm": 0.5047562122344971, "learning_rate": 1.4931863156129406e-05, "loss": 0.6509, "step": 4649 }, { "epoch": 1.7795637198622272, "grad_norm": 0.547585666179657, "learning_rate": 1.4929706493975902e-05, "loss": 0.5928, "step": 4650 }, { "epoch": 1.7799464217374665, "grad_norm": 0.5325350165367126, "learning_rate": 1.4927549528878685e-05, "loss": 0.6842, "step": 4651 }, { "epoch": 1.7803291236127057, "grad_norm": 0.5020607113838196, "learning_rate": 1.4925392260970315e-05, "loss": 0.6198, "step": 4652 }, { "epoch": 1.7807118254879448, "grad_norm": 0.5116055607795715, "learning_rate": 1.4923234690383358e-05, "loss": 0.6623, "step": 4653 }, { "epoch": 1.7810945273631842, "grad_norm": 0.507597804069519, "learning_rate": 1.49210768172504e-05, "loss": 0.7196, "step": 4654 }, { "epoch": 1.781477229238423, "grad_norm": 0.5608073472976685, "learning_rate": 1.4918918641704054e-05, "loss": 0.6425, "step": 4655 }, { "epoch": 1.7818599311136625, "grad_norm": 0.542448103427887, "learning_rate": 1.491676016387694e-05, "loss": 0.637, "step": 4656 }, { "epoch": 1.7822426329889016, "grad_norm": 0.567615270614624, "learning_rate": 1.491460138390171e-05, "loss": 0.6295, "step": 4657 }, { "epoch": 1.7826253348641408, "grad_norm": 0.53093022108078, "learning_rate": 1.491244230191102e-05, "loss": 0.724, "step": 4658 }, { "epoch": 1.7830080367393801, "grad_norm": 0.5556247234344482, "learning_rate": 1.491028291803755e-05, "loss": 0.7299, "step": 4659 }, { "epoch": 1.783390738614619, "grad_norm": 0.504719614982605, "learning_rate": 1.4908123232414006e-05, "loss": 0.6073, "step": 4660 }, { "epoch": 1.7837734404898584, "grad_norm": 0.49375173449516296, "learning_rate": 1.4905963245173105e-05, "loss": 0.6492, "step": 4661 }, { "epoch": 1.7841561423650976, "grad_norm": 0.5604788661003113, "learning_rate": 1.4903802956447581e-05, "loss": 0.7619, "step": 4662 }, { "epoch": 1.7845388442403367, "grad_norm": 0.5662589073181152, "learning_rate": 1.4901642366370193e-05, "loss": 0.6409, "step": 4663 }, { "epoch": 1.784921546115576, "grad_norm": 0.5230194330215454, "learning_rate": 1.4899481475073709e-05, "loss": 0.6886, "step": 4664 }, { "epoch": 1.785304247990815, "grad_norm": 0.5335447192192078, "learning_rate": 1.4897320282690931e-05, "loss": 0.6487, "step": 4665 }, { "epoch": 1.7856869498660544, "grad_norm": 0.5516831874847412, "learning_rate": 1.4895158789354662e-05, "loss": 0.5882, "step": 4666 }, { "epoch": 1.7860696517412935, "grad_norm": 0.5663967132568359, "learning_rate": 1.4892996995197735e-05, "loss": 0.6354, "step": 4667 }, { "epoch": 1.7864523536165327, "grad_norm": 0.5531226396560669, "learning_rate": 1.4890834900353e-05, "loss": 0.6969, "step": 4668 }, { "epoch": 1.786835055491772, "grad_norm": 0.5271188616752625, "learning_rate": 1.4888672504953317e-05, "loss": 0.6538, "step": 4669 }, { "epoch": 1.787217757367011, "grad_norm": 0.5474289655685425, "learning_rate": 1.4886509809131577e-05, "loss": 0.6908, "step": 4670 }, { "epoch": 1.7876004592422503, "grad_norm": 0.4656168222427368, "learning_rate": 1.4884346813020685e-05, "loss": 0.6351, "step": 4671 }, { "epoch": 1.7879831611174895, "grad_norm": 0.5249249935150146, "learning_rate": 1.4882183516753557e-05, "loss": 0.6482, "step": 4672 }, { "epoch": 1.7883658629927286, "grad_norm": 0.5540801286697388, "learning_rate": 1.4880019920463137e-05, "loss": 0.6571, "step": 4673 }, { "epoch": 1.788748564867968, "grad_norm": 0.9662730097770691, "learning_rate": 1.487785602428238e-05, "loss": 0.6395, "step": 4674 }, { "epoch": 1.7891312667432069, "grad_norm": 0.5224184989929199, "learning_rate": 1.4875691828344266e-05, "loss": 0.7167, "step": 4675 }, { "epoch": 1.7895139686184462, "grad_norm": 0.5533115863800049, "learning_rate": 1.4873527332781794e-05, "loss": 0.5902, "step": 4676 }, { "epoch": 1.7898966704936854, "grad_norm": 0.5212864875793457, "learning_rate": 1.4871362537727971e-05, "loss": 0.6595, "step": 4677 }, { "epoch": 1.7902793723689245, "grad_norm": 0.5716767311096191, "learning_rate": 1.4869197443315837e-05, "loss": 0.7214, "step": 4678 }, { "epoch": 1.790662074244164, "grad_norm": 0.5658721923828125, "learning_rate": 1.4867032049678436e-05, "loss": 0.7279, "step": 4679 }, { "epoch": 1.7910447761194028, "grad_norm": 0.5322924852371216, "learning_rate": 1.4864866356948841e-05, "loss": 0.7147, "step": 4680 }, { "epoch": 1.7914274779946422, "grad_norm": 0.545231282711029, "learning_rate": 1.486270036526014e-05, "loss": 0.6232, "step": 4681 }, { "epoch": 1.7918101798698813, "grad_norm": 0.5193164944648743, "learning_rate": 1.486053407474544e-05, "loss": 0.6725, "step": 4682 }, { "epoch": 1.7921928817451205, "grad_norm": 0.5810557007789612, "learning_rate": 1.4858367485537859e-05, "loss": 0.6923, "step": 4683 }, { "epoch": 1.7925755836203598, "grad_norm": 0.6229861378669739, "learning_rate": 1.4856200597770545e-05, "loss": 0.6086, "step": 4684 }, { "epoch": 1.7929582854955988, "grad_norm": 0.5138784050941467, "learning_rate": 1.4854033411576659e-05, "loss": 0.6588, "step": 4685 }, { "epoch": 1.7933409873708381, "grad_norm": 0.5271984338760376, "learning_rate": 1.4851865927089382e-05, "loss": 0.7031, "step": 4686 }, { "epoch": 1.7937236892460773, "grad_norm": 0.5551497936248779, "learning_rate": 1.4849698144441907e-05, "loss": 0.7198, "step": 4687 }, { "epoch": 1.7941063911213164, "grad_norm": 0.4941340684890747, "learning_rate": 1.4847530063767451e-05, "loss": 0.5361, "step": 4688 }, { "epoch": 1.7944890929965558, "grad_norm": 0.557838499546051, "learning_rate": 1.4845361685199251e-05, "loss": 0.5842, "step": 4689 }, { "epoch": 1.7948717948717947, "grad_norm": 0.5311904549598694, "learning_rate": 1.4843193008870556e-05, "loss": 0.5655, "step": 4690 }, { "epoch": 1.795254496747034, "grad_norm": 0.5046560764312744, "learning_rate": 1.4841024034914644e-05, "loss": 0.6448, "step": 4691 }, { "epoch": 1.7956371986222732, "grad_norm": 0.5388524532318115, "learning_rate": 1.4838854763464799e-05, "loss": 0.6168, "step": 4692 }, { "epoch": 1.7960199004975124, "grad_norm": 0.5375342965126038, "learning_rate": 1.4836685194654326e-05, "loss": 0.586, "step": 4693 }, { "epoch": 1.7964026023727517, "grad_norm": 0.4847685992717743, "learning_rate": 1.4834515328616555e-05, "loss": 0.6867, "step": 4694 }, { "epoch": 1.7967853042479907, "grad_norm": 0.5270293951034546, "learning_rate": 1.4832345165484832e-05, "loss": 0.6216, "step": 4695 }, { "epoch": 1.79716800612323, "grad_norm": 0.4990093410015106, "learning_rate": 1.4830174705392512e-05, "loss": 0.7138, "step": 4696 }, { "epoch": 1.7975507079984692, "grad_norm": 0.5106823444366455, "learning_rate": 1.4828003948472984e-05, "loss": 0.603, "step": 4697 }, { "epoch": 1.7979334098737083, "grad_norm": 0.5234810709953308, "learning_rate": 1.4825832894859642e-05, "loss": 0.6912, "step": 4698 }, { "epoch": 1.7983161117489477, "grad_norm": 0.5758552551269531, "learning_rate": 1.4823661544685904e-05, "loss": 0.6612, "step": 4699 }, { "epoch": 1.7986988136241866, "grad_norm": 0.5477757453918457, "learning_rate": 1.4821489898085204e-05, "loss": 0.7025, "step": 4700 }, { "epoch": 1.799081515499426, "grad_norm": 0.5370025038719177, "learning_rate": 1.4819317955190996e-05, "loss": 0.6627, "step": 4701 }, { "epoch": 1.799464217374665, "grad_norm": 0.5651820302009583, "learning_rate": 1.4817145716136759e-05, "loss": 0.6085, "step": 4702 }, { "epoch": 1.7998469192499043, "grad_norm": 0.5786362886428833, "learning_rate": 1.481497318105597e-05, "loss": 0.6684, "step": 4703 }, { "epoch": 1.8002296211251436, "grad_norm": 0.5852587223052979, "learning_rate": 1.4812800350082144e-05, "loss": 0.7125, "step": 4704 }, { "epoch": 1.8006123230003825, "grad_norm": 0.5545000433921814, "learning_rate": 1.4810627223348811e-05, "loss": 0.636, "step": 4705 }, { "epoch": 1.800995024875622, "grad_norm": 0.5624074339866638, "learning_rate": 1.4808453800989505e-05, "loss": 0.7098, "step": 4706 }, { "epoch": 1.801377726750861, "grad_norm": 0.547183632850647, "learning_rate": 1.4806280083137803e-05, "loss": 0.6744, "step": 4707 }, { "epoch": 1.8017604286261002, "grad_norm": 0.6856287717819214, "learning_rate": 1.4804106069927275e-05, "loss": 0.6989, "step": 4708 }, { "epoch": 1.8021431305013396, "grad_norm": 0.5688490867614746, "learning_rate": 1.480193176149152e-05, "loss": 0.677, "step": 4709 }, { "epoch": 1.8025258323765785, "grad_norm": 0.5741695165634155, "learning_rate": 1.4799757157964161e-05, "loss": 0.6383, "step": 4710 }, { "epoch": 1.8029085342518179, "grad_norm": 0.5274592041969299, "learning_rate": 1.4797582259478826e-05, "loss": 0.6555, "step": 4711 }, { "epoch": 1.803291236127057, "grad_norm": 0.5258172750473022, "learning_rate": 1.4795407066169179e-05, "loss": 0.6796, "step": 4712 }, { "epoch": 1.8036739380022961, "grad_norm": 0.5189687013626099, "learning_rate": 1.4793231578168882e-05, "loss": 0.6859, "step": 4713 }, { "epoch": 1.8040566398775355, "grad_norm": 0.5465261936187744, "learning_rate": 1.4791055795611623e-05, "loss": 0.6995, "step": 4714 }, { "epoch": 1.8044393417527744, "grad_norm": 0.5461695194244385, "learning_rate": 1.478887971863112e-05, "loss": 0.6087, "step": 4715 }, { "epoch": 1.8048220436280138, "grad_norm": 0.52660071849823, "learning_rate": 1.4786703347361088e-05, "loss": 0.6022, "step": 4716 }, { "epoch": 1.805204745503253, "grad_norm": 0.5268875360488892, "learning_rate": 1.4784526681935282e-05, "loss": 0.7862, "step": 4717 }, { "epoch": 1.805587447378492, "grad_norm": 0.5406726598739624, "learning_rate": 1.4782349722487457e-05, "loss": 0.6788, "step": 4718 }, { "epoch": 1.8059701492537314, "grad_norm": 0.506401538848877, "learning_rate": 1.4780172469151388e-05, "loss": 0.6451, "step": 4719 }, { "epoch": 1.8063528511289704, "grad_norm": 0.5430780649185181, "learning_rate": 1.4777994922060886e-05, "loss": 0.7579, "step": 4720 }, { "epoch": 1.8067355530042097, "grad_norm": 0.6016284227371216, "learning_rate": 1.4775817081349754e-05, "loss": 0.753, "step": 4721 }, { "epoch": 1.8071182548794489, "grad_norm": 0.5451686978340149, "learning_rate": 1.4773638947151836e-05, "loss": 0.6963, "step": 4722 }, { "epoch": 1.807500956754688, "grad_norm": 0.5032594203948975, "learning_rate": 1.477146051960098e-05, "loss": 0.6628, "step": 4723 }, { "epoch": 1.8078836586299274, "grad_norm": 0.5613216757774353, "learning_rate": 1.4769281798831054e-05, "loss": 0.6701, "step": 4724 }, { "epoch": 1.8082663605051663, "grad_norm": 0.5572973489761353, "learning_rate": 1.4767102784975951e-05, "loss": 0.6222, "step": 4725 }, { "epoch": 1.8086490623804057, "grad_norm": 0.49351054430007935, "learning_rate": 1.4764923478169575e-05, "loss": 0.69, "step": 4726 }, { "epoch": 1.8090317642556448, "grad_norm": 0.5509966015815735, "learning_rate": 1.4762743878545849e-05, "loss": 0.6825, "step": 4727 }, { "epoch": 1.809414466130884, "grad_norm": 0.5359150171279907, "learning_rate": 1.4760563986238719e-05, "loss": 0.6366, "step": 4728 }, { "epoch": 1.8097971680061233, "grad_norm": 0.5743146538734436, "learning_rate": 1.4758383801382141e-05, "loss": 0.7488, "step": 4729 }, { "epoch": 1.8101798698813623, "grad_norm": 0.5071951150894165, "learning_rate": 1.4756203324110094e-05, "loss": 0.585, "step": 4730 }, { "epoch": 1.8105625717566016, "grad_norm": 0.4970850944519043, "learning_rate": 1.475402255455658e-05, "loss": 0.709, "step": 4731 }, { "epoch": 1.8109452736318408, "grad_norm": 0.5560656785964966, "learning_rate": 1.4751841492855602e-05, "loss": 0.578, "step": 4732 }, { "epoch": 1.81132797550708, "grad_norm": 0.5020857453346252, "learning_rate": 1.4749660139141204e-05, "loss": 0.6121, "step": 4733 }, { "epoch": 1.8117106773823193, "grad_norm": 0.5399463772773743, "learning_rate": 1.4747478493547427e-05, "loss": 0.6304, "step": 4734 }, { "epoch": 1.8120933792575582, "grad_norm": 0.5781292915344238, "learning_rate": 1.4745296556208342e-05, "loss": 0.601, "step": 4735 }, { "epoch": 1.8124760811327976, "grad_norm": 0.4978233575820923, "learning_rate": 1.474311432725804e-05, "loss": 0.6893, "step": 4736 }, { "epoch": 1.8128587830080367, "grad_norm": 0.5115368962287903, "learning_rate": 1.4740931806830617e-05, "loss": 0.7864, "step": 4737 }, { "epoch": 1.8132414848832759, "grad_norm": 0.5592079758644104, "learning_rate": 1.47387489950602e-05, "loss": 0.7255, "step": 4738 }, { "epoch": 1.8136241867585152, "grad_norm": 0.8226484060287476, "learning_rate": 1.4736565892080922e-05, "loss": 0.7022, "step": 4739 }, { "epoch": 1.8140068886337541, "grad_norm": 0.580218493938446, "learning_rate": 1.4734382498026946e-05, "loss": 0.6716, "step": 4740 }, { "epoch": 1.8143895905089935, "grad_norm": 0.5816295146942139, "learning_rate": 1.4732198813032452e-05, "loss": 0.6641, "step": 4741 }, { "epoch": 1.8147722923842327, "grad_norm": 0.5754163265228271, "learning_rate": 1.4730014837231624e-05, "loss": 0.5853, "step": 4742 }, { "epoch": 1.8151549942594718, "grad_norm": 1.1272003650665283, "learning_rate": 1.472783057075868e-05, "loss": 0.6128, "step": 4743 }, { "epoch": 1.8155376961347112, "grad_norm": 0.5146432518959045, "learning_rate": 1.472564601374784e-05, "loss": 0.5688, "step": 4744 }, { "epoch": 1.81592039800995, "grad_norm": 0.5633511543273926, "learning_rate": 1.4723461166333361e-05, "loss": 0.6803, "step": 4745 }, { "epoch": 1.8163030998851895, "grad_norm": 0.5513591170310974, "learning_rate": 1.4721276028649506e-05, "loss": 0.7158, "step": 4746 }, { "epoch": 1.8166858017604286, "grad_norm": 0.5463995933532715, "learning_rate": 1.4719090600830555e-05, "loss": 0.7072, "step": 4747 }, { "epoch": 1.8170685036356677, "grad_norm": 0.5368611216545105, "learning_rate": 1.4716904883010804e-05, "loss": 0.6577, "step": 4748 }, { "epoch": 1.817451205510907, "grad_norm": 0.5548350214958191, "learning_rate": 1.471471887532458e-05, "loss": 0.6601, "step": 4749 }, { "epoch": 1.817833907386146, "grad_norm": 0.5377435088157654, "learning_rate": 1.4712532577906214e-05, "loss": 0.7303, "step": 4750 }, { "epoch": 1.8182166092613854, "grad_norm": 0.5158487558364868, "learning_rate": 1.471034599089006e-05, "loss": 0.7015, "step": 4751 }, { "epoch": 1.8185993111366245, "grad_norm": 0.5378150343894958, "learning_rate": 1.4708159114410494e-05, "loss": 0.6784, "step": 4752 }, { "epoch": 1.8189820130118637, "grad_norm": 0.534416913986206, "learning_rate": 1.47059719486019e-05, "loss": 0.6466, "step": 4753 }, { "epoch": 1.819364714887103, "grad_norm": 0.4564569890499115, "learning_rate": 1.4703784493598685e-05, "loss": 0.614, "step": 4754 }, { "epoch": 1.819747416762342, "grad_norm": 0.6962404251098633, "learning_rate": 1.4701596749535276e-05, "loss": 0.7144, "step": 4755 }, { "epoch": 1.8201301186375813, "grad_norm": 0.533453643321991, "learning_rate": 1.469940871654612e-05, "loss": 0.6772, "step": 4756 }, { "epoch": 1.8205128205128205, "grad_norm": 0.5160120129585266, "learning_rate": 1.4697220394765675e-05, "loss": 0.6191, "step": 4757 }, { "epoch": 1.8208955223880596, "grad_norm": 0.534890353679657, "learning_rate": 1.469503178432841e-05, "loss": 0.6547, "step": 4758 }, { "epoch": 1.821278224263299, "grad_norm": 0.5591180324554443, "learning_rate": 1.4692842885368834e-05, "loss": 0.7182, "step": 4759 }, { "epoch": 1.821660926138538, "grad_norm": 0.6475191712379456, "learning_rate": 1.4690653698021456e-05, "loss": 0.7107, "step": 4760 }, { "epoch": 1.8220436280137773, "grad_norm": 0.606683611869812, "learning_rate": 1.4688464222420804e-05, "loss": 0.7072, "step": 4761 }, { "epoch": 1.8224263298890164, "grad_norm": 0.5394673943519592, "learning_rate": 1.4686274458701432e-05, "loss": 0.694, "step": 4762 }, { "epoch": 1.8228090317642556, "grad_norm": 0.5195249319076538, "learning_rate": 1.4684084406997903e-05, "loss": 0.6369, "step": 4763 }, { "epoch": 1.823191733639495, "grad_norm": 0.5155349969863892, "learning_rate": 1.4681894067444805e-05, "loss": 0.6871, "step": 4764 }, { "epoch": 1.8235744355147339, "grad_norm": 0.5214259028434753, "learning_rate": 1.4679703440176739e-05, "loss": 0.6233, "step": 4765 }, { "epoch": 1.8239571373899732, "grad_norm": 0.5202044248580933, "learning_rate": 1.4677512525328322e-05, "loss": 0.5911, "step": 4766 }, { "epoch": 1.8243398392652124, "grad_norm": 0.5761368274688721, "learning_rate": 1.4675321323034198e-05, "loss": 0.6249, "step": 4767 }, { "epoch": 1.8247225411404515, "grad_norm": 0.5547483563423157, "learning_rate": 1.4673129833429013e-05, "loss": 0.6369, "step": 4768 }, { "epoch": 1.8251052430156909, "grad_norm": 0.6132334470748901, "learning_rate": 1.467093805664745e-05, "loss": 0.6838, "step": 4769 }, { "epoch": 1.8254879448909298, "grad_norm": 0.548518717288971, "learning_rate": 1.4668745992824196e-05, "loss": 0.6987, "step": 4770 }, { "epoch": 1.8258706467661692, "grad_norm": 0.5084291696548462, "learning_rate": 1.4666553642093955e-05, "loss": 0.6442, "step": 4771 }, { "epoch": 1.8262533486414083, "grad_norm": 0.5295979976654053, "learning_rate": 1.4664361004591459e-05, "loss": 0.6742, "step": 4772 }, { "epoch": 1.8266360505166475, "grad_norm": 0.5944174528121948, "learning_rate": 1.4662168080451446e-05, "loss": 0.6954, "step": 4773 }, { "epoch": 1.8270187523918868, "grad_norm": 0.6403147578239441, "learning_rate": 1.465997486980868e-05, "loss": 0.7032, "step": 4774 }, { "epoch": 1.8274014542671257, "grad_norm": 0.531080424785614, "learning_rate": 1.4657781372797938e-05, "loss": 0.6666, "step": 4775 }, { "epoch": 1.8277841561423651, "grad_norm": 0.5710650682449341, "learning_rate": 1.4655587589554019e-05, "loss": 0.6818, "step": 4776 }, { "epoch": 1.8281668580176043, "grad_norm": 0.5470076203346252, "learning_rate": 1.4653393520211736e-05, "loss": 0.6564, "step": 4777 }, { "epoch": 1.8285495598928434, "grad_norm": 0.5365594029426575, "learning_rate": 1.465119916490592e-05, "loss": 0.5986, "step": 4778 }, { "epoch": 1.8289322617680828, "grad_norm": 0.6379629969596863, "learning_rate": 1.464900452377142e-05, "loss": 0.6178, "step": 4779 }, { "epoch": 1.8293149636433217, "grad_norm": 0.5010591149330139, "learning_rate": 1.4646809596943101e-05, "loss": 0.6551, "step": 4780 }, { "epoch": 1.829697665518561, "grad_norm": 0.5347120761871338, "learning_rate": 1.464461438455585e-05, "loss": 0.6293, "step": 4781 }, { "epoch": 1.8300803673938002, "grad_norm": 0.5250279307365417, "learning_rate": 1.4642418886744567e-05, "loss": 0.6917, "step": 4782 }, { "epoch": 1.8304630692690393, "grad_norm": 0.49505341053009033, "learning_rate": 1.4640223103644173e-05, "loss": 0.7321, "step": 4783 }, { "epoch": 1.8308457711442787, "grad_norm": 0.5058775544166565, "learning_rate": 1.4638027035389599e-05, "loss": 0.6647, "step": 4784 }, { "epoch": 1.8312284730195176, "grad_norm": 0.5076223015785217, "learning_rate": 1.4635830682115809e-05, "loss": 0.6221, "step": 4785 }, { "epoch": 1.831611174894757, "grad_norm": 0.5900699496269226, "learning_rate": 1.4633634043957767e-05, "loss": 0.7579, "step": 4786 }, { "epoch": 1.8319938767699961, "grad_norm": 0.5532951951026917, "learning_rate": 1.4631437121050462e-05, "loss": 0.6943, "step": 4787 }, { "epoch": 1.8323765786452353, "grad_norm": 0.5337534546852112, "learning_rate": 1.462923991352891e-05, "loss": 0.6319, "step": 4788 }, { "epoch": 1.8327592805204747, "grad_norm": 0.5419719219207764, "learning_rate": 1.4627042421528124e-05, "loss": 0.6228, "step": 4789 }, { "epoch": 1.8331419823957136, "grad_norm": 0.5543385744094849, "learning_rate": 1.4624844645183149e-05, "loss": 0.6775, "step": 4790 }, { "epoch": 1.833524684270953, "grad_norm": 0.5098129510879517, "learning_rate": 1.4622646584629052e-05, "loss": 0.6362, "step": 4791 }, { "epoch": 1.833907386146192, "grad_norm": 0.5531493425369263, "learning_rate": 1.46204482400009e-05, "loss": 0.6612, "step": 4792 }, { "epoch": 1.8342900880214312, "grad_norm": 0.5596335530281067, "learning_rate": 1.4618249611433792e-05, "loss": 0.6651, "step": 4793 }, { "epoch": 1.8346727898966706, "grad_norm": 0.5176159143447876, "learning_rate": 1.4616050699062838e-05, "loss": 0.6981, "step": 4794 }, { "epoch": 1.8350554917719095, "grad_norm": 0.5682605504989624, "learning_rate": 1.4613851503023167e-05, "loss": 0.6275, "step": 4795 }, { "epoch": 1.8354381936471489, "grad_norm": 0.5600401163101196, "learning_rate": 1.4611652023449927e-05, "loss": 0.7016, "step": 4796 }, { "epoch": 1.835820895522388, "grad_norm": 0.5611562132835388, "learning_rate": 1.4609452260478278e-05, "loss": 0.6358, "step": 4797 }, { "epoch": 1.8362035973976272, "grad_norm": 0.5281900763511658, "learning_rate": 1.460725221424341e-05, "loss": 0.7474, "step": 4798 }, { "epoch": 1.8365862992728665, "grad_norm": 0.5322651267051697, "learning_rate": 1.4605051884880512e-05, "loss": 0.6801, "step": 4799 }, { "epoch": 1.8369690011481055, "grad_norm": 0.538286030292511, "learning_rate": 1.4602851272524805e-05, "loss": 0.6318, "step": 4800 }, { "epoch": 1.8373517030233448, "grad_norm": 0.5302243232727051, "learning_rate": 1.4600650377311523e-05, "loss": 0.6666, "step": 4801 }, { "epoch": 1.837734404898584, "grad_norm": 0.5503251552581787, "learning_rate": 1.4598449199375914e-05, "loss": 0.5412, "step": 4802 }, { "epoch": 1.8381171067738231, "grad_norm": 0.5744736194610596, "learning_rate": 1.4596247738853251e-05, "loss": 0.6392, "step": 4803 }, { "epoch": 1.8384998086490625, "grad_norm": 0.495697557926178, "learning_rate": 1.4594045995878814e-05, "loss": 0.6892, "step": 4804 }, { "epoch": 1.8388825105243014, "grad_norm": 0.47178635001182556, "learning_rate": 1.4591843970587906e-05, "loss": 0.5853, "step": 4805 }, { "epoch": 1.8392652123995408, "grad_norm": 0.5174659490585327, "learning_rate": 1.4589641663115854e-05, "loss": 0.6176, "step": 4806 }, { "epoch": 1.83964791427478, "grad_norm": 0.5682302117347717, "learning_rate": 1.4587439073597993e-05, "loss": 0.7247, "step": 4807 }, { "epoch": 1.840030616150019, "grad_norm": 0.5162869095802307, "learning_rate": 1.4585236202169675e-05, "loss": 0.6561, "step": 4808 }, { "epoch": 1.8404133180252584, "grad_norm": 0.5152250528335571, "learning_rate": 1.4583033048966273e-05, "loss": 0.6383, "step": 4809 }, { "epoch": 1.8407960199004973, "grad_norm": 0.5008905529975891, "learning_rate": 1.4580829614123178e-05, "loss": 0.6143, "step": 4810 }, { "epoch": 1.8411787217757367, "grad_norm": 0.5080466866493225, "learning_rate": 1.45786258977758e-05, "loss": 0.6339, "step": 4811 }, { "epoch": 1.8415614236509759, "grad_norm": 0.4919726252555847, "learning_rate": 1.4576421900059558e-05, "loss": 0.6244, "step": 4812 }, { "epoch": 1.841944125526215, "grad_norm": 0.48714256286621094, "learning_rate": 1.4574217621109897e-05, "loss": 0.6376, "step": 4813 }, { "epoch": 1.8423268274014544, "grad_norm": 0.5429942011833191, "learning_rate": 1.457201306106227e-05, "loss": 0.6729, "step": 4814 }, { "epoch": 1.8427095292766933, "grad_norm": 0.5655882358551025, "learning_rate": 1.4569808220052162e-05, "loss": 0.6309, "step": 4815 }, { "epoch": 1.8430922311519327, "grad_norm": 0.5366756916046143, "learning_rate": 1.4567603098215062e-05, "loss": 0.6259, "step": 4816 }, { "epoch": 1.8434749330271718, "grad_norm": 0.5069791078567505, "learning_rate": 1.456539769568648e-05, "loss": 0.6712, "step": 4817 }, { "epoch": 1.843857634902411, "grad_norm": 0.5237774848937988, "learning_rate": 1.4563192012601946e-05, "loss": 0.6652, "step": 4818 }, { "epoch": 1.8442403367776503, "grad_norm": 0.5609771609306335, "learning_rate": 1.4560986049097001e-05, "loss": 0.5958, "step": 4819 }, { "epoch": 1.8446230386528892, "grad_norm": 0.48608657717704773, "learning_rate": 1.4558779805307213e-05, "loss": 0.6036, "step": 4820 }, { "epoch": 1.8450057405281286, "grad_norm": 0.6096404194831848, "learning_rate": 1.4556573281368157e-05, "loss": 0.6604, "step": 4821 }, { "epoch": 1.8453884424033677, "grad_norm": 0.5876935720443726, "learning_rate": 1.4554366477415431e-05, "loss": 0.6727, "step": 4822 }, { "epoch": 1.845771144278607, "grad_norm": 0.5310375690460205, "learning_rate": 1.455215939358465e-05, "loss": 0.6401, "step": 4823 }, { "epoch": 1.8461538461538463, "grad_norm": 0.5665957927703857, "learning_rate": 1.4549952030011448e-05, "loss": 0.6679, "step": 4824 }, { "epoch": 1.8465365480290852, "grad_norm": 0.580021858215332, "learning_rate": 1.4547744386831468e-05, "loss": 0.6537, "step": 4825 }, { "epoch": 1.8469192499043245, "grad_norm": 0.5326604247093201, "learning_rate": 1.4545536464180376e-05, "loss": 0.7097, "step": 4826 }, { "epoch": 1.8473019517795637, "grad_norm": 0.5263270735740662, "learning_rate": 1.4543328262193862e-05, "loss": 0.652, "step": 4827 }, { "epoch": 1.8476846536548028, "grad_norm": 0.5574325919151306, "learning_rate": 1.4541119781007616e-05, "loss": 0.7028, "step": 4828 }, { "epoch": 1.8480673555300422, "grad_norm": 0.520220160484314, "learning_rate": 1.4538911020757363e-05, "loss": 0.6761, "step": 4829 }, { "epoch": 1.8484500574052811, "grad_norm": 0.5350147485733032, "learning_rate": 1.453670198157883e-05, "loss": 0.6319, "step": 4830 }, { "epoch": 1.8488327592805205, "grad_norm": 0.6073192954063416, "learning_rate": 1.4534492663607775e-05, "loss": 0.616, "step": 4831 }, { "epoch": 1.8492154611557596, "grad_norm": 0.5386824607849121, "learning_rate": 1.4532283066979967e-05, "loss": 0.6558, "step": 4832 }, { "epoch": 1.8495981630309988, "grad_norm": 0.5003077387809753, "learning_rate": 1.4530073191831185e-05, "loss": 0.5691, "step": 4833 }, { "epoch": 1.8499808649062381, "grad_norm": 0.581675112247467, "learning_rate": 1.4527863038297233e-05, "loss": 0.6699, "step": 4834 }, { "epoch": 1.850363566781477, "grad_norm": 0.5375751852989197, "learning_rate": 1.4525652606513938e-05, "loss": 0.6146, "step": 4835 }, { "epoch": 1.8507462686567164, "grad_norm": 0.5707883834838867, "learning_rate": 1.4523441896617127e-05, "loss": 0.619, "step": 4836 }, { "epoch": 1.8511289705319556, "grad_norm": 0.550902247428894, "learning_rate": 1.4521230908742665e-05, "loss": 0.684, "step": 4837 }, { "epoch": 1.8515116724071947, "grad_norm": 0.5592581033706665, "learning_rate": 1.4519019643026417e-05, "loss": 0.6551, "step": 4838 }, { "epoch": 1.851894374282434, "grad_norm": 0.5860358476638794, "learning_rate": 1.451680809960427e-05, "loss": 0.6632, "step": 4839 }, { "epoch": 1.852277076157673, "grad_norm": 0.5917476415634155, "learning_rate": 1.4514596278612127e-05, "loss": 0.6709, "step": 4840 }, { "epoch": 1.8526597780329124, "grad_norm": 0.5290408730506897, "learning_rate": 1.4512384180185919e-05, "loss": 0.5402, "step": 4841 }, { "epoch": 1.8530424799081515, "grad_norm": 0.5137609243392944, "learning_rate": 1.4510171804461579e-05, "loss": 0.6243, "step": 4842 }, { "epoch": 1.8534251817833907, "grad_norm": 0.5195699334144592, "learning_rate": 1.4507959151575065e-05, "loss": 0.6179, "step": 4843 }, { "epoch": 1.85380788365863, "grad_norm": 0.5600074529647827, "learning_rate": 1.450574622166235e-05, "loss": 0.5664, "step": 4844 }, { "epoch": 1.854190585533869, "grad_norm": 0.5416440963745117, "learning_rate": 1.4503533014859427e-05, "loss": 0.7503, "step": 4845 }, { "epoch": 1.8545732874091083, "grad_norm": 0.5056224465370178, "learning_rate": 1.4501319531302302e-05, "loss": 0.6964, "step": 4846 }, { "epoch": 1.8549559892843475, "grad_norm": 0.5458555817604065, "learning_rate": 1.4499105771126998e-05, "loss": 0.6615, "step": 4847 }, { "epoch": 1.8553386911595866, "grad_norm": 0.5263186097145081, "learning_rate": 1.4496891734469559e-05, "loss": 0.6865, "step": 4848 }, { "epoch": 1.855721393034826, "grad_norm": 0.5738276243209839, "learning_rate": 1.449467742146604e-05, "loss": 0.7111, "step": 4849 }, { "epoch": 1.856104094910065, "grad_norm": 0.5691863894462585, "learning_rate": 1.4492462832252517e-05, "loss": 0.6562, "step": 4850 }, { "epoch": 1.8564867967853043, "grad_norm": 0.5337293744087219, "learning_rate": 1.4490247966965085e-05, "loss": 0.6842, "step": 4851 }, { "epoch": 1.8568694986605434, "grad_norm": 0.5015528798103333, "learning_rate": 1.4488032825739855e-05, "loss": 0.6951, "step": 4852 }, { "epoch": 1.8572522005357825, "grad_norm": 0.5041957497596741, "learning_rate": 1.448581740871295e-05, "loss": 0.5986, "step": 4853 }, { "epoch": 1.857634902411022, "grad_norm": 0.5807220935821533, "learning_rate": 1.4483601716020512e-05, "loss": 0.6498, "step": 4854 }, { "epoch": 1.8580176042862608, "grad_norm": 0.5411649346351624, "learning_rate": 1.4481385747798705e-05, "loss": 0.7139, "step": 4855 }, { "epoch": 1.8584003061615002, "grad_norm": 0.5305037498474121, "learning_rate": 1.4479169504183703e-05, "loss": 0.7309, "step": 4856 }, { "epoch": 1.8587830080367393, "grad_norm": 0.5222592949867249, "learning_rate": 1.4476952985311704e-05, "loss": 0.6611, "step": 4857 }, { "epoch": 1.8591657099119785, "grad_norm": 0.5243184566497803, "learning_rate": 1.4474736191318917e-05, "loss": 0.6541, "step": 4858 }, { "epoch": 1.8595484117872179, "grad_norm": 0.5372806787490845, "learning_rate": 1.4472519122341566e-05, "loss": 0.6741, "step": 4859 }, { "epoch": 1.8599311136624568, "grad_norm": 0.5681394934654236, "learning_rate": 1.4470301778515902e-05, "loss": 0.7158, "step": 4860 }, { "epoch": 1.8603138155376961, "grad_norm": 0.5435879826545715, "learning_rate": 1.4468084159978184e-05, "loss": 0.7019, "step": 4861 }, { "epoch": 1.8606965174129353, "grad_norm": 0.553519070148468, "learning_rate": 1.4465866266864693e-05, "loss": 0.6931, "step": 4862 }, { "epoch": 1.8610792192881744, "grad_norm": 0.632942795753479, "learning_rate": 1.446364809931172e-05, "loss": 0.6392, "step": 4863 }, { "epoch": 1.8614619211634138, "grad_norm": 0.5270412564277649, "learning_rate": 1.4461429657455579e-05, "loss": 0.6217, "step": 4864 }, { "epoch": 1.8618446230386527, "grad_norm": 0.5547237396240234, "learning_rate": 1.4459210941432602e-05, "loss": 0.61, "step": 4865 }, { "epoch": 1.862227324913892, "grad_norm": 0.5802023410797119, "learning_rate": 1.4456991951379134e-05, "loss": 0.6412, "step": 4866 }, { "epoch": 1.8626100267891312, "grad_norm": 0.572941243648529, "learning_rate": 1.4454772687431533e-05, "loss": 0.6976, "step": 4867 }, { "epoch": 1.8629927286643704, "grad_norm": 0.5148993730545044, "learning_rate": 1.4452553149726186e-05, "loss": 0.7318, "step": 4868 }, { "epoch": 1.8633754305396097, "grad_norm": 0.5563246607780457, "learning_rate": 1.4450333338399484e-05, "loss": 0.6485, "step": 4869 }, { "epoch": 1.8637581324148487, "grad_norm": 0.5982072353363037, "learning_rate": 1.4448113253587843e-05, "loss": 0.6953, "step": 4870 }, { "epoch": 1.864140834290088, "grad_norm": 0.5552899837493896, "learning_rate": 1.4445892895427695e-05, "loss": 0.7127, "step": 4871 }, { "epoch": 1.8645235361653272, "grad_norm": 0.5528488159179688, "learning_rate": 1.4443672264055485e-05, "loss": 0.7418, "step": 4872 }, { "epoch": 1.8649062380405663, "grad_norm": 0.5230633020401001, "learning_rate": 1.4441451359607675e-05, "loss": 0.6459, "step": 4873 }, { "epoch": 1.8652889399158057, "grad_norm": 0.4756128489971161, "learning_rate": 1.4439230182220745e-05, "loss": 0.5505, "step": 4874 }, { "epoch": 1.8656716417910446, "grad_norm": 0.5264442563056946, "learning_rate": 1.4437008732031195e-05, "loss": 0.6804, "step": 4875 }, { "epoch": 1.866054343666284, "grad_norm": 0.5337808728218079, "learning_rate": 1.443478700917554e-05, "loss": 0.7162, "step": 4876 }, { "epoch": 1.8664370455415231, "grad_norm": 0.49837350845336914, "learning_rate": 1.443256501379031e-05, "loss": 0.5768, "step": 4877 }, { "epoch": 1.8668197474167623, "grad_norm": 0.50713050365448, "learning_rate": 1.4430342746012049e-05, "loss": 0.6774, "step": 4878 }, { "epoch": 1.8672024492920016, "grad_norm": 1.0557477474212646, "learning_rate": 1.4428120205977328e-05, "loss": 0.6486, "step": 4879 }, { "epoch": 1.8675851511672406, "grad_norm": 0.5443301200866699, "learning_rate": 1.442589739382272e-05, "loss": 0.6457, "step": 4880 }, { "epoch": 1.86796785304248, "grad_norm": 0.5310903787612915, "learning_rate": 1.442367430968483e-05, "loss": 0.5958, "step": 4881 }, { "epoch": 1.868350554917719, "grad_norm": 0.5383405685424805, "learning_rate": 1.442145095370027e-05, "loss": 0.6493, "step": 4882 }, { "epoch": 1.8687332567929582, "grad_norm": 0.49795374274253845, "learning_rate": 1.4419227326005668e-05, "loss": 0.6693, "step": 4883 }, { "epoch": 1.8691159586681976, "grad_norm": 0.5212081074714661, "learning_rate": 1.4417003426737676e-05, "loss": 0.6495, "step": 4884 }, { "epoch": 1.8694986605434365, "grad_norm": 0.6122345328330994, "learning_rate": 1.4414779256032958e-05, "loss": 0.6481, "step": 4885 }, { "epoch": 1.8698813624186759, "grad_norm": 0.5959057807922363, "learning_rate": 1.4412554814028192e-05, "loss": 0.6932, "step": 4886 }, { "epoch": 1.870264064293915, "grad_norm": 0.5008255839347839, "learning_rate": 1.4410330100860081e-05, "loss": 0.6595, "step": 4887 }, { "epoch": 1.8706467661691542, "grad_norm": 0.49442723393440247, "learning_rate": 1.4408105116665336e-05, "loss": 0.5954, "step": 4888 }, { "epoch": 1.8710294680443935, "grad_norm": 0.5433297753334045, "learning_rate": 1.4405879861580689e-05, "loss": 0.6688, "step": 4889 }, { "epoch": 1.8714121699196324, "grad_norm": 0.5066640973091125, "learning_rate": 1.440365433574289e-05, "loss": 0.7127, "step": 4890 }, { "epoch": 1.8717948717948718, "grad_norm": 0.5587450265884399, "learning_rate": 1.4401428539288697e-05, "loss": 0.6455, "step": 4891 }, { "epoch": 1.872177573670111, "grad_norm": 0.5107596516609192, "learning_rate": 1.43992024723549e-05, "loss": 0.6296, "step": 4892 }, { "epoch": 1.87256027554535, "grad_norm": 0.538602352142334, "learning_rate": 1.4396976135078293e-05, "loss": 0.6508, "step": 4893 }, { "epoch": 1.8729429774205895, "grad_norm": 0.5135806202888489, "learning_rate": 1.4394749527595688e-05, "loss": 0.6411, "step": 4894 }, { "epoch": 1.8733256792958284, "grad_norm": 0.505699872970581, "learning_rate": 1.4392522650043919e-05, "loss": 0.581, "step": 4895 }, { "epoch": 1.8737083811710677, "grad_norm": 0.5032881498336792, "learning_rate": 1.439029550255983e-05, "loss": 0.6456, "step": 4896 }, { "epoch": 1.874091083046307, "grad_norm": 0.5472476482391357, "learning_rate": 1.4388068085280291e-05, "loss": 0.663, "step": 4897 }, { "epoch": 1.874473784921546, "grad_norm": 0.5336350202560425, "learning_rate": 1.4385840398342182e-05, "loss": 0.6743, "step": 4898 }, { "epoch": 1.8748564867967854, "grad_norm": 0.5430163741111755, "learning_rate": 1.4383612441882393e-05, "loss": 0.6899, "step": 4899 }, { "epoch": 1.8752391886720245, "grad_norm": 0.47504204511642456, "learning_rate": 1.4381384216037844e-05, "loss": 0.7055, "step": 4900 }, { "epoch": 1.8756218905472637, "grad_norm": 0.5711771845817566, "learning_rate": 1.4379155720945464e-05, "loss": 0.6274, "step": 4901 }, { "epoch": 1.8760045924225028, "grad_norm": 0.5830507278442383, "learning_rate": 1.4376926956742204e-05, "loss": 0.7077, "step": 4902 }, { "epoch": 1.876387294297742, "grad_norm": 0.5095477104187012, "learning_rate": 1.4374697923565022e-05, "loss": 0.6106, "step": 4903 }, { "epoch": 1.8767699961729813, "grad_norm": 0.5576366186141968, "learning_rate": 1.4372468621550899e-05, "loss": 0.7137, "step": 4904 }, { "epoch": 1.8771526980482205, "grad_norm": 0.5264337658882141, "learning_rate": 1.4370239050836832e-05, "loss": 0.7294, "step": 4905 }, { "epoch": 1.8775353999234596, "grad_norm": 0.5238435864448547, "learning_rate": 1.436800921155984e-05, "loss": 0.6522, "step": 4906 }, { "epoch": 1.8779181017986988, "grad_norm": 2.0951743125915527, "learning_rate": 1.436577910385694e-05, "loss": 0.6492, "step": 4907 }, { "epoch": 1.878300803673938, "grad_norm": 0.5638626217842102, "learning_rate": 1.4363548727865192e-05, "loss": 0.694, "step": 4908 }, { "epoch": 1.8786835055491773, "grad_norm": 0.5361049771308899, "learning_rate": 1.4361318083721646e-05, "loss": 0.5785, "step": 4909 }, { "epoch": 1.8790662074244164, "grad_norm": 0.5507961511611938, "learning_rate": 1.4359087171563388e-05, "loss": 0.6081, "step": 4910 }, { "epoch": 1.8794489092996556, "grad_norm": 0.5364822149276733, "learning_rate": 1.4356855991527517e-05, "loss": 0.5704, "step": 4911 }, { "epoch": 1.8798316111748947, "grad_norm": 0.5337881445884705, "learning_rate": 1.4354624543751139e-05, "loss": 0.7012, "step": 4912 }, { "epoch": 1.8802143130501339, "grad_norm": 0.5549154877662659, "learning_rate": 1.4352392828371385e-05, "loss": 0.6737, "step": 4913 }, { "epoch": 1.8805970149253732, "grad_norm": 0.5169242024421692, "learning_rate": 1.4350160845525394e-05, "loss": 0.6351, "step": 4914 }, { "epoch": 1.8809797168006124, "grad_norm": 0.5234274864196777, "learning_rate": 1.4347928595350336e-05, "loss": 0.7013, "step": 4915 }, { "epoch": 1.8813624186758515, "grad_norm": 0.5535668730735779, "learning_rate": 1.4345696077983387e-05, "loss": 0.6531, "step": 4916 }, { "epoch": 1.8817451205510907, "grad_norm": 0.5331268906593323, "learning_rate": 1.4343463293561734e-05, "loss": 0.6859, "step": 4917 }, { "epoch": 1.8821278224263298, "grad_norm": 0.5714091062545776, "learning_rate": 1.43412302422226e-05, "loss": 0.6638, "step": 4918 }, { "epoch": 1.8825105243015692, "grad_norm": 0.5398744940757751, "learning_rate": 1.43389969241032e-05, "loss": 0.6281, "step": 4919 }, { "epoch": 1.8828932261768083, "grad_norm": 0.5641939043998718, "learning_rate": 1.4336763339340782e-05, "loss": 0.6684, "step": 4920 }, { "epoch": 1.8832759280520475, "grad_norm": 0.5116326212882996, "learning_rate": 1.4334529488072608e-05, "loss": 0.6227, "step": 4921 }, { "epoch": 1.8836586299272866, "grad_norm": 0.493042528629303, "learning_rate": 1.433229537043595e-05, "loss": 0.6862, "step": 4922 }, { "epoch": 1.8840413318025258, "grad_norm": 0.5194539427757263, "learning_rate": 1.433006098656811e-05, "loss": 0.6917, "step": 4923 }, { "epoch": 1.8844240336777651, "grad_norm": 0.5406288504600525, "learning_rate": 1.4327826336606383e-05, "loss": 0.5797, "step": 4924 }, { "epoch": 1.8848067355530043, "grad_norm": 0.5516949892044067, "learning_rate": 1.4325591420688101e-05, "loss": 0.5829, "step": 4925 }, { "epoch": 1.8851894374282434, "grad_norm": 0.5681295990943909, "learning_rate": 1.4323356238950607e-05, "loss": 0.5954, "step": 4926 }, { "epoch": 1.8855721393034826, "grad_norm": 0.49969640374183655, "learning_rate": 1.4321120791531256e-05, "loss": 0.6615, "step": 4927 }, { "epoch": 1.8859548411787217, "grad_norm": 0.5541297793388367, "learning_rate": 1.4318885078567429e-05, "loss": 0.6422, "step": 4928 }, { "epoch": 1.886337543053961, "grad_norm": 0.577060341835022, "learning_rate": 1.4316649100196507e-05, "loss": 0.6599, "step": 4929 }, { "epoch": 1.8867202449292002, "grad_norm": 0.6380683779716492, "learning_rate": 1.4314412856555899e-05, "loss": 0.7056, "step": 4930 }, { "epoch": 1.8871029468044394, "grad_norm": 0.5315225720405579, "learning_rate": 1.4312176347783033e-05, "loss": 0.6225, "step": 4931 }, { "epoch": 1.8874856486796785, "grad_norm": 0.5105995535850525, "learning_rate": 1.4309939574015348e-05, "loss": 0.7026, "step": 4932 }, { "epoch": 1.8878683505549176, "grad_norm": 0.5101585984230042, "learning_rate": 1.4307702535390294e-05, "loss": 0.5922, "step": 4933 }, { "epoch": 1.888251052430157, "grad_norm": 0.5154255628585815, "learning_rate": 1.430546523204535e-05, "loss": 0.5791, "step": 4934 }, { "epoch": 1.8886337543053962, "grad_norm": 0.5114341378211975, "learning_rate": 1.4303227664117993e-05, "loss": 0.6096, "step": 4935 }, { "epoch": 1.8890164561806353, "grad_norm": 0.5704997777938843, "learning_rate": 1.4300989831745745e-05, "loss": 0.6715, "step": 4936 }, { "epoch": 1.8893991580558744, "grad_norm": 0.5084300637245178, "learning_rate": 1.4298751735066114e-05, "loss": 0.6822, "step": 4937 }, { "epoch": 1.8897818599311136, "grad_norm": 0.5098234415054321, "learning_rate": 1.4296513374216639e-05, "loss": 0.699, "step": 4938 }, { "epoch": 1.890164561806353, "grad_norm": 0.4842054843902588, "learning_rate": 1.4294274749334878e-05, "loss": 0.6362, "step": 4939 }, { "epoch": 1.890547263681592, "grad_norm": 0.5318970680236816, "learning_rate": 1.4292035860558394e-05, "loss": 0.6938, "step": 4940 }, { "epoch": 1.8909299655568312, "grad_norm": 0.5050945281982422, "learning_rate": 1.4289796708024775e-05, "loss": 0.6602, "step": 4941 }, { "epoch": 1.8913126674320704, "grad_norm": 0.5632967352867126, "learning_rate": 1.4287557291871625e-05, "loss": 0.6678, "step": 4942 }, { "epoch": 1.8916953693073095, "grad_norm": 0.5375555157661438, "learning_rate": 1.4285317612236563e-05, "loss": 0.6638, "step": 4943 }, { "epoch": 1.892078071182549, "grad_norm": 0.5572227239608765, "learning_rate": 1.428307766925722e-05, "loss": 0.7538, "step": 4944 }, { "epoch": 1.892460773057788, "grad_norm": 0.561226487159729, "learning_rate": 1.4280837463071244e-05, "loss": 0.6328, "step": 4945 }, { "epoch": 1.8928434749330272, "grad_norm": 0.5334022045135498, "learning_rate": 1.427859699381631e-05, "loss": 0.5752, "step": 4946 }, { "epoch": 1.8932261768082663, "grad_norm": 0.500715970993042, "learning_rate": 1.4276356261630096e-05, "loss": 0.5956, "step": 4947 }, { "epoch": 1.8936088786835055, "grad_norm": 0.6037939190864563, "learning_rate": 1.4274115266650299e-05, "loss": 0.6279, "step": 4948 }, { "epoch": 1.8939915805587448, "grad_norm": 0.6215310096740723, "learning_rate": 1.427187400901464e-05, "loss": 0.6929, "step": 4949 }, { "epoch": 1.894374282433984, "grad_norm": 0.5890525579452515, "learning_rate": 1.4269632488860843e-05, "loss": 0.6797, "step": 4950 }, { "epoch": 1.8947569843092231, "grad_norm": 0.5922120809555054, "learning_rate": 1.426739070632666e-05, "loss": 0.705, "step": 4951 }, { "epoch": 1.8951396861844623, "grad_norm": 0.5060815811157227, "learning_rate": 1.4265148661549857e-05, "loss": 0.6484, "step": 4952 }, { "epoch": 1.8955223880597014, "grad_norm": 0.572425365447998, "learning_rate": 1.4262906354668206e-05, "loss": 0.6105, "step": 4953 }, { "epoch": 1.8959050899349408, "grad_norm": 0.5177334547042847, "learning_rate": 1.426066378581951e-05, "loss": 0.6756, "step": 4954 }, { "epoch": 1.89628779181018, "grad_norm": 0.5406126976013184, "learning_rate": 1.4258420955141579e-05, "loss": 0.6994, "step": 4955 }, { "epoch": 1.896670493685419, "grad_norm": 0.5057927370071411, "learning_rate": 1.4256177862772235e-05, "loss": 0.6226, "step": 4956 }, { "epoch": 1.8970531955606582, "grad_norm": 0.49098318815231323, "learning_rate": 1.4253934508849331e-05, "loss": 0.6635, "step": 4957 }, { "epoch": 1.8974358974358974, "grad_norm": 0.4876505732536316, "learning_rate": 1.4251690893510726e-05, "loss": 0.6642, "step": 4958 }, { "epoch": 1.8978185993111367, "grad_norm": 0.49747705459594727, "learning_rate": 1.4249447016894289e-05, "loss": 0.7094, "step": 4959 }, { "epoch": 1.8982013011863759, "grad_norm": 0.5604303479194641, "learning_rate": 1.4247202879137919e-05, "loss": 0.7455, "step": 4960 }, { "epoch": 1.898584003061615, "grad_norm": 0.5298585295677185, "learning_rate": 1.424495848037952e-05, "loss": 0.6868, "step": 4961 }, { "epoch": 1.8989667049368542, "grad_norm": 0.5162067413330078, "learning_rate": 1.4242713820757022e-05, "loss": 0.6283, "step": 4962 }, { "epoch": 1.8993494068120933, "grad_norm": 0.5353745818138123, "learning_rate": 1.424046890040836e-05, "loss": 0.7198, "step": 4963 }, { "epoch": 1.8997321086873327, "grad_norm": 0.5258961915969849, "learning_rate": 1.4238223719471491e-05, "loss": 0.6377, "step": 4964 }, { "epoch": 1.9001148105625718, "grad_norm": 0.5423747301101685, "learning_rate": 1.4235978278084388e-05, "loss": 0.6442, "step": 4965 }, { "epoch": 1.900497512437811, "grad_norm": 0.5270748734474182, "learning_rate": 1.423373257638504e-05, "loss": 0.6452, "step": 4966 }, { "epoch": 1.90088021431305, "grad_norm": 0.515595555305481, "learning_rate": 1.4231486614511452e-05, "loss": 0.7135, "step": 4967 }, { "epoch": 1.9012629161882892, "grad_norm": 0.5585705041885376, "learning_rate": 1.4229240392601647e-05, "loss": 0.6544, "step": 4968 }, { "epoch": 1.9016456180635286, "grad_norm": 0.526411771774292, "learning_rate": 1.4226993910793651e-05, "loss": 0.6656, "step": 4969 }, { "epoch": 1.9020283199387678, "grad_norm": 0.5265370607376099, "learning_rate": 1.4224747169225527e-05, "loss": 0.6964, "step": 4970 }, { "epoch": 1.902411021814007, "grad_norm": 0.5956276655197144, "learning_rate": 1.4222500168035343e-05, "loss": 0.6462, "step": 4971 }, { "epoch": 1.902793723689246, "grad_norm": 0.7444089651107788, "learning_rate": 1.4220252907361177e-05, "loss": 0.6227, "step": 4972 }, { "epoch": 1.9031764255644852, "grad_norm": 0.5294516086578369, "learning_rate": 1.4218005387341132e-05, "loss": 0.6195, "step": 4973 }, { "epoch": 1.9035591274397246, "grad_norm": 0.4940127730369568, "learning_rate": 1.4215757608113323e-05, "loss": 0.6004, "step": 4974 }, { "epoch": 1.9039418293149637, "grad_norm": 1.0191328525543213, "learning_rate": 1.4213509569815884e-05, "loss": 0.6175, "step": 4975 }, { "epoch": 1.9043245311902028, "grad_norm": 0.5820332765579224, "learning_rate": 1.4211261272586965e-05, "loss": 0.7775, "step": 4976 }, { "epoch": 1.904707233065442, "grad_norm": 0.5100592970848083, "learning_rate": 1.4209012716564725e-05, "loss": 0.6739, "step": 4977 }, { "epoch": 1.9050899349406811, "grad_norm": 0.501028835773468, "learning_rate": 1.4206763901887346e-05, "loss": 0.6157, "step": 4978 }, { "epoch": 1.9054726368159205, "grad_norm": 0.524179995059967, "learning_rate": 1.4204514828693023e-05, "loss": 0.7001, "step": 4979 }, { "epoch": 1.9058553386911596, "grad_norm": 0.5168145895004272, "learning_rate": 1.4202265497119971e-05, "loss": 0.6081, "step": 4980 }, { "epoch": 1.9062380405663988, "grad_norm": 0.5327088832855225, "learning_rate": 1.4200015907306415e-05, "loss": 0.684, "step": 4981 }, { "epoch": 1.906620742441638, "grad_norm": 0.5153069496154785, "learning_rate": 1.4197766059390596e-05, "loss": 0.6416, "step": 4982 }, { "epoch": 1.907003444316877, "grad_norm": 0.5108414888381958, "learning_rate": 1.4195515953510777e-05, "loss": 0.5935, "step": 4983 }, { "epoch": 1.9073861461921164, "grad_norm": 0.5037458539009094, "learning_rate": 1.4193265589805232e-05, "loss": 0.6294, "step": 4984 }, { "epoch": 1.9077688480673556, "grad_norm": 0.512245774269104, "learning_rate": 1.4191014968412247e-05, "loss": 0.5735, "step": 4985 }, { "epoch": 1.9081515499425947, "grad_norm": 0.5207228064537048, "learning_rate": 1.4188764089470137e-05, "loss": 0.6404, "step": 4986 }, { "epoch": 1.9085342518178339, "grad_norm": 0.5793163776397705, "learning_rate": 1.418651295311722e-05, "loss": 0.7465, "step": 4987 }, { "epoch": 1.908916953693073, "grad_norm": 0.5918910503387451, "learning_rate": 1.4184261559491837e-05, "loss": 0.6621, "step": 4988 }, { "epoch": 1.9092996555683124, "grad_norm": 0.5374968647956848, "learning_rate": 1.4182009908732337e-05, "loss": 0.6864, "step": 4989 }, { "epoch": 1.9096823574435515, "grad_norm": 0.6002636551856995, "learning_rate": 1.4179758000977093e-05, "loss": 0.6665, "step": 4990 }, { "epoch": 1.9100650593187907, "grad_norm": 0.5861822366714478, "learning_rate": 1.4177505836364494e-05, "loss": 0.6179, "step": 4991 }, { "epoch": 1.9104477611940298, "grad_norm": 0.5836253762245178, "learning_rate": 1.4175253415032937e-05, "loss": 0.6566, "step": 4992 }, { "epoch": 1.910830463069269, "grad_norm": 0.5241498351097107, "learning_rate": 1.4173000737120839e-05, "loss": 0.6952, "step": 4993 }, { "epoch": 1.9112131649445083, "grad_norm": 0.587803065776825, "learning_rate": 1.4170747802766638e-05, "loss": 0.6698, "step": 4994 }, { "epoch": 1.9115958668197475, "grad_norm": 0.5686842799186707, "learning_rate": 1.4168494612108776e-05, "loss": 0.7159, "step": 4995 }, { "epoch": 1.9119785686949866, "grad_norm": 0.5384783148765564, "learning_rate": 1.4166241165285724e-05, "loss": 0.7524, "step": 4996 }, { "epoch": 1.9123612705702258, "grad_norm": 0.5111042261123657, "learning_rate": 1.416398746243596e-05, "loss": 0.6536, "step": 4997 }, { "epoch": 1.912743972445465, "grad_norm": 0.5365068912506104, "learning_rate": 1.4161733503697978e-05, "loss": 0.6927, "step": 4998 }, { "epoch": 1.9131266743207043, "grad_norm": 0.5569556951522827, "learning_rate": 1.4159479289210296e-05, "loss": 0.7174, "step": 4999 }, { "epoch": 1.9135093761959434, "grad_norm": 0.5374916791915894, "learning_rate": 1.4157224819111432e-05, "loss": 0.6368, "step": 5000 }, { "epoch": 1.9138920780711826, "grad_norm": 0.513307511806488, "learning_rate": 1.415497009353994e-05, "loss": 0.6386, "step": 5001 }, { "epoch": 1.9142747799464217, "grad_norm": 0.5580990314483643, "learning_rate": 1.4152715112634369e-05, "loss": 0.6967, "step": 5002 }, { "epoch": 1.9146574818216608, "grad_norm": 0.5280164480209351, "learning_rate": 1.41504598765333e-05, "loss": 0.7631, "step": 5003 }, { "epoch": 1.9150401836969002, "grad_norm": 0.5155099630355835, "learning_rate": 1.414820438537532e-05, "loss": 0.6146, "step": 5004 }, { "epoch": 1.9154228855721394, "grad_norm": 0.5607386827468872, "learning_rate": 1.414594863929904e-05, "loss": 0.6517, "step": 5005 }, { "epoch": 1.9158055874473785, "grad_norm": 0.4946140944957733, "learning_rate": 1.4143692638443074e-05, "loss": 0.5736, "step": 5006 }, { "epoch": 1.9161882893226176, "grad_norm": 0.5614767670631409, "learning_rate": 1.4141436382946065e-05, "loss": 0.6538, "step": 5007 }, { "epoch": 1.9165709911978568, "grad_norm": 0.5639678835868835, "learning_rate": 1.4139179872946664e-05, "loss": 0.6981, "step": 5008 }, { "epoch": 1.9169536930730962, "grad_norm": 0.542691171169281, "learning_rate": 1.413692310858354e-05, "loss": 0.5843, "step": 5009 }, { "epoch": 1.9173363949483353, "grad_norm": 0.5672925114631653, "learning_rate": 1.4134666089995378e-05, "loss": 0.6636, "step": 5010 }, { "epoch": 1.9177190968235744, "grad_norm": 0.5942568182945251, "learning_rate": 1.4132408817320875e-05, "loss": 0.7366, "step": 5011 }, { "epoch": 1.9181017986988136, "grad_norm": 0.5563998818397522, "learning_rate": 1.413015129069875e-05, "loss": 0.6818, "step": 5012 }, { "epoch": 1.9184845005740527, "grad_norm": 0.5135079026222229, "learning_rate": 1.4127893510267735e-05, "loss": 0.6757, "step": 5013 }, { "epoch": 1.918867202449292, "grad_norm": 0.5358787178993225, "learning_rate": 1.4125635476166571e-05, "loss": 0.6594, "step": 5014 }, { "epoch": 1.9192499043245312, "grad_norm": 0.5307397842407227, "learning_rate": 1.4123377188534025e-05, "loss": 0.6892, "step": 5015 }, { "epoch": 1.9196326061997704, "grad_norm": 0.5163781046867371, "learning_rate": 1.4121118647508872e-05, "loss": 0.6243, "step": 5016 }, { "epoch": 1.9200153080750095, "grad_norm": 0.5293192267417908, "learning_rate": 1.4118859853229907e-05, "loss": 0.5791, "step": 5017 }, { "epoch": 1.9203980099502487, "grad_norm": 0.5294682383537292, "learning_rate": 1.4116600805835941e-05, "loss": 0.6016, "step": 5018 }, { "epoch": 1.920780711825488, "grad_norm": 0.5376864075660706, "learning_rate": 1.4114341505465792e-05, "loss": 0.6513, "step": 5019 }, { "epoch": 1.9211634137007272, "grad_norm": 0.5951526761054993, "learning_rate": 1.4112081952258307e-05, "loss": 0.656, "step": 5020 }, { "epoch": 1.9215461155759663, "grad_norm": 0.5239231586456299, "learning_rate": 1.4109822146352338e-05, "loss": 0.6752, "step": 5021 }, { "epoch": 1.9219288174512055, "grad_norm": 0.5797893404960632, "learning_rate": 1.410756208788676e-05, "loss": 0.6868, "step": 5022 }, { "epoch": 1.9223115193264446, "grad_norm": 0.5416980981826782, "learning_rate": 1.4105301777000456e-05, "loss": 0.7252, "step": 5023 }, { "epoch": 1.922694221201684, "grad_norm": 0.5289522409439087, "learning_rate": 1.4103041213832325e-05, "loss": 0.7156, "step": 5024 }, { "epoch": 1.9230769230769231, "grad_norm": 0.6262988448143005, "learning_rate": 1.4100780398521293e-05, "loss": 0.6387, "step": 5025 }, { "epoch": 1.9234596249521623, "grad_norm": 0.6659690141677856, "learning_rate": 1.4098519331206284e-05, "loss": 0.7451, "step": 5026 }, { "epoch": 1.9238423268274014, "grad_norm": 0.5769839882850647, "learning_rate": 1.4096258012026256e-05, "loss": 0.6812, "step": 5027 }, { "epoch": 1.9242250287026406, "grad_norm": 0.5121026039123535, "learning_rate": 1.4093996441120168e-05, "loss": 0.6469, "step": 5028 }, { "epoch": 1.92460773057788, "grad_norm": 0.49374109506607056, "learning_rate": 1.4091734618626998e-05, "loss": 0.6313, "step": 5029 }, { "epoch": 1.924990432453119, "grad_norm": 0.5041234493255615, "learning_rate": 1.4089472544685744e-05, "loss": 0.5896, "step": 5030 }, { "epoch": 1.9253731343283582, "grad_norm": 0.5847843289375305, "learning_rate": 1.4087210219435417e-05, "loss": 0.6867, "step": 5031 }, { "epoch": 1.9257558362035974, "grad_norm": 0.567642092704773, "learning_rate": 1.4084947643015041e-05, "loss": 0.6321, "step": 5032 }, { "epoch": 1.9261385380788365, "grad_norm": 0.5070046186447144, "learning_rate": 1.408268481556366e-05, "loss": 0.6139, "step": 5033 }, { "epoch": 1.9265212399540759, "grad_norm": 0.49578234553337097, "learning_rate": 1.4080421737220327e-05, "loss": 0.6461, "step": 5034 }, { "epoch": 1.926903941829315, "grad_norm": 0.472017765045166, "learning_rate": 1.4078158408124117e-05, "loss": 0.5788, "step": 5035 }, { "epoch": 1.9272866437045542, "grad_norm": 0.5359833836555481, "learning_rate": 1.4075894828414117e-05, "loss": 0.6772, "step": 5036 }, { "epoch": 1.9276693455797933, "grad_norm": 0.5473407506942749, "learning_rate": 1.4073630998229427e-05, "loss": 0.6228, "step": 5037 }, { "epoch": 1.9280520474550324, "grad_norm": 0.5012601613998413, "learning_rate": 1.4071366917709172e-05, "loss": 0.7354, "step": 5038 }, { "epoch": 1.9284347493302718, "grad_norm": 0.5128373503684998, "learning_rate": 1.406910258699248e-05, "loss": 0.6646, "step": 5039 }, { "epoch": 1.928817451205511, "grad_norm": 0.5557908415794373, "learning_rate": 1.4066838006218504e-05, "loss": 0.6468, "step": 5040 }, { "epoch": 1.92920015308075, "grad_norm": 0.5068283677101135, "learning_rate": 1.4064573175526406e-05, "loss": 0.6082, "step": 5041 }, { "epoch": 1.9295828549559892, "grad_norm": 0.4953576624393463, "learning_rate": 1.4062308095055368e-05, "loss": 0.7009, "step": 5042 }, { "epoch": 1.9299655568312284, "grad_norm": 0.5142969489097595, "learning_rate": 1.4060042764944583e-05, "loss": 0.577, "step": 5043 }, { "epoch": 1.9303482587064678, "grad_norm": 0.5057594180107117, "learning_rate": 1.4057777185333265e-05, "loss": 0.6344, "step": 5044 }, { "epoch": 1.930730960581707, "grad_norm": 0.5389426946640015, "learning_rate": 1.4055511356360637e-05, "loss": 0.6806, "step": 5045 }, { "epoch": 1.931113662456946, "grad_norm": 0.5788654088973999, "learning_rate": 1.405324527816594e-05, "loss": 0.6958, "step": 5046 }, { "epoch": 1.9314963643321852, "grad_norm": 0.5543150305747986, "learning_rate": 1.4050978950888429e-05, "loss": 0.6627, "step": 5047 }, { "epoch": 1.9318790662074243, "grad_norm": 0.6269844770431519, "learning_rate": 1.4048712374667383e-05, "loss": 0.6389, "step": 5048 }, { "epoch": 1.9322617680826637, "grad_norm": 0.5131609439849854, "learning_rate": 1.4046445549642085e-05, "loss": 0.6697, "step": 5049 }, { "epoch": 1.9326444699579028, "grad_norm": 0.5510777831077576, "learning_rate": 1.4044178475951835e-05, "loss": 0.6118, "step": 5050 }, { "epoch": 1.933027171833142, "grad_norm": 0.520359218120575, "learning_rate": 1.4041911153735952e-05, "loss": 0.6681, "step": 5051 }, { "epoch": 1.9334098737083811, "grad_norm": 0.49514201283454895, "learning_rate": 1.403964358313377e-05, "loss": 0.6756, "step": 5052 }, { "epoch": 1.9337925755836203, "grad_norm": 0.5360912084579468, "learning_rate": 1.403737576428464e-05, "loss": 0.6644, "step": 5053 }, { "epoch": 1.9341752774588596, "grad_norm": 0.5597091317176819, "learning_rate": 1.4035107697327924e-05, "loss": 0.6872, "step": 5054 }, { "epoch": 1.9345579793340988, "grad_norm": 0.5024288296699524, "learning_rate": 1.4032839382402996e-05, "loss": 0.6161, "step": 5055 }, { "epoch": 1.934940681209338, "grad_norm": 0.5242691040039062, "learning_rate": 1.4030570819649253e-05, "loss": 0.6463, "step": 5056 }, { "epoch": 1.935323383084577, "grad_norm": 0.5991485118865967, "learning_rate": 1.402830200920611e-05, "loss": 0.6688, "step": 5057 }, { "epoch": 1.9357060849598162, "grad_norm": 0.5122311115264893, "learning_rate": 1.4026032951212982e-05, "loss": 0.6506, "step": 5058 }, { "epoch": 1.9360887868350556, "grad_norm": 0.5590986609458923, "learning_rate": 1.4023763645809317e-05, "loss": 0.6918, "step": 5059 }, { "epoch": 1.9364714887102947, "grad_norm": 0.5590054392814636, "learning_rate": 1.4021494093134564e-05, "loss": 0.6258, "step": 5060 }, { "epoch": 1.9368541905855339, "grad_norm": 0.56744784116745, "learning_rate": 1.4019224293328196e-05, "loss": 0.6857, "step": 5061 }, { "epoch": 1.937236892460773, "grad_norm": 0.525385856628418, "learning_rate": 1.4016954246529697e-05, "loss": 0.6227, "step": 5062 }, { "epoch": 1.9376195943360122, "grad_norm": 0.6522575616836548, "learning_rate": 1.4014683952878567e-05, "loss": 0.6993, "step": 5063 }, { "epoch": 1.9380022962112515, "grad_norm": 0.48491835594177246, "learning_rate": 1.4012413412514326e-05, "loss": 0.5896, "step": 5064 }, { "epoch": 1.9383849980864907, "grad_norm": 0.5879744291305542, "learning_rate": 1.4010142625576496e-05, "loss": 0.6402, "step": 5065 }, { "epoch": 1.9387676999617298, "grad_norm": 0.5456363558769226, "learning_rate": 1.4007871592204634e-05, "loss": 0.6528, "step": 5066 }, { "epoch": 1.939150401836969, "grad_norm": 0.5089050531387329, "learning_rate": 1.4005600312538295e-05, "loss": 0.6884, "step": 5067 }, { "epoch": 1.939533103712208, "grad_norm": 0.5059911012649536, "learning_rate": 1.4003328786717053e-05, "loss": 0.6479, "step": 5068 }, { "epoch": 1.9399158055874475, "grad_norm": 0.550723135471344, "learning_rate": 1.4001057014880503e-05, "loss": 0.5881, "step": 5069 }, { "epoch": 1.9402985074626866, "grad_norm": 0.5732541680335999, "learning_rate": 1.3998784997168251e-05, "loss": 0.6977, "step": 5070 }, { "epoch": 1.9406812093379258, "grad_norm": 0.5048913955688477, "learning_rate": 1.3996512733719915e-05, "loss": 0.6332, "step": 5071 }, { "epoch": 1.941063911213165, "grad_norm": 0.7227523326873779, "learning_rate": 1.399424022467514e-05, "loss": 0.6375, "step": 5072 }, { "epoch": 1.941446613088404, "grad_norm": 0.48736488819122314, "learning_rate": 1.3991967470173571e-05, "loss": 0.6126, "step": 5073 }, { "epoch": 1.9418293149636434, "grad_norm": 0.5700148940086365, "learning_rate": 1.3989694470354878e-05, "loss": 0.6751, "step": 5074 }, { "epoch": 1.9422120168388826, "grad_norm": 0.4940975606441498, "learning_rate": 1.398742122535874e-05, "loss": 0.5852, "step": 5075 }, { "epoch": 1.9425947187141217, "grad_norm": 0.5728409290313721, "learning_rate": 1.3985147735324853e-05, "loss": 0.7259, "step": 5076 }, { "epoch": 1.9429774205893608, "grad_norm": 1.0524013042449951, "learning_rate": 1.3982874000392934e-05, "loss": 0.7058, "step": 5077 }, { "epoch": 1.9433601224646, "grad_norm": 0.5214498043060303, "learning_rate": 1.3980600020702708e-05, "loss": 0.6437, "step": 5078 }, { "epoch": 1.9437428243398394, "grad_norm": 0.5640019178390503, "learning_rate": 1.3978325796393919e-05, "loss": 0.611, "step": 5079 }, { "epoch": 1.9441255262150785, "grad_norm": 0.5006248354911804, "learning_rate": 1.3976051327606317e-05, "loss": 0.6363, "step": 5080 }, { "epoch": 1.9445082280903176, "grad_norm": 0.5593833327293396, "learning_rate": 1.3973776614479682e-05, "loss": 0.7448, "step": 5081 }, { "epoch": 1.9448909299655568, "grad_norm": 0.5132530331611633, "learning_rate": 1.3971501657153801e-05, "loss": 0.5715, "step": 5082 }, { "epoch": 1.945273631840796, "grad_norm": 0.5422427654266357, "learning_rate": 1.3969226455768472e-05, "loss": 0.6523, "step": 5083 }, { "epoch": 1.9456563337160353, "grad_norm": 0.5249171257019043, "learning_rate": 1.3966951010463513e-05, "loss": 0.6751, "step": 5084 }, { "epoch": 1.9460390355912744, "grad_norm": 0.5013188123703003, "learning_rate": 1.3964675321378756e-05, "loss": 0.6307, "step": 5085 }, { "epoch": 1.9464217374665136, "grad_norm": 0.5075995922088623, "learning_rate": 1.3962399388654049e-05, "loss": 0.6281, "step": 5086 }, { "epoch": 1.9468044393417527, "grad_norm": 0.577790379524231, "learning_rate": 1.396012321242926e-05, "loss": 0.654, "step": 5087 }, { "epoch": 1.9471871412169919, "grad_norm": 0.5364044904708862, "learning_rate": 1.3957846792844256e-05, "loss": 0.6388, "step": 5088 }, { "epoch": 1.9475698430922312, "grad_norm": 0.607263445854187, "learning_rate": 1.3955570130038936e-05, "loss": 0.6316, "step": 5089 }, { "epoch": 1.9479525449674704, "grad_norm": 0.547904908657074, "learning_rate": 1.3953293224153205e-05, "loss": 0.5594, "step": 5090 }, { "epoch": 1.9483352468427095, "grad_norm": 0.5323928594589233, "learning_rate": 1.395101607532698e-05, "loss": 0.6846, "step": 5091 }, { "epoch": 1.9487179487179487, "grad_norm": 0.6030524373054504, "learning_rate": 1.3948738683700209e-05, "loss": 0.6288, "step": 5092 }, { "epoch": 1.9491006505931878, "grad_norm": 0.5102348923683167, "learning_rate": 1.3946461049412835e-05, "loss": 0.6628, "step": 5093 }, { "epoch": 1.9494833524684272, "grad_norm": 0.5170794725418091, "learning_rate": 1.3944183172604828e-05, "loss": 0.671, "step": 5094 }, { "epoch": 1.9498660543436663, "grad_norm": 0.49778616428375244, "learning_rate": 1.3941905053416171e-05, "loss": 0.6594, "step": 5095 }, { "epoch": 1.9502487562189055, "grad_norm": 0.5893067121505737, "learning_rate": 1.3939626691986857e-05, "loss": 0.6256, "step": 5096 }, { "epoch": 1.9506314580941446, "grad_norm": 0.5914115309715271, "learning_rate": 1.3937348088456897e-05, "loss": 0.5758, "step": 5097 }, { "epoch": 1.9510141599693838, "grad_norm": 0.5200788974761963, "learning_rate": 1.3935069242966322e-05, "loss": 0.6302, "step": 5098 }, { "epoch": 1.9513968618446231, "grad_norm": 0.5142930150032043, "learning_rate": 1.3932790155655166e-05, "loss": 0.5961, "step": 5099 }, { "epoch": 1.9517795637198623, "grad_norm": 0.5790538191795349, "learning_rate": 1.3930510826663498e-05, "loss": 0.6197, "step": 5100 }, { "epoch": 1.9521622655951014, "grad_norm": 0.48665758967399597, "learning_rate": 1.3928231256131372e-05, "loss": 0.6475, "step": 5101 }, { "epoch": 1.9525449674703406, "grad_norm": 0.5728985667228699, "learning_rate": 1.3925951444198885e-05, "loss": 0.7455, "step": 5102 }, { "epoch": 1.9529276693455797, "grad_norm": 0.6128982901573181, "learning_rate": 1.3923671391006135e-05, "loss": 0.6638, "step": 5103 }, { "epoch": 1.953310371220819, "grad_norm": 0.5015314221382141, "learning_rate": 1.392139109669324e-05, "loss": 0.6906, "step": 5104 }, { "epoch": 1.9536930730960582, "grad_norm": 0.5724260807037354, "learning_rate": 1.3919110561400325e-05, "loss": 0.6674, "step": 5105 }, { "epoch": 1.9540757749712974, "grad_norm": 0.5315262079238892, "learning_rate": 1.3916829785267533e-05, "loss": 0.6108, "step": 5106 }, { "epoch": 1.9544584768465365, "grad_norm": 0.5055209994316101, "learning_rate": 1.3914548768435032e-05, "loss": 0.6068, "step": 5107 }, { "epoch": 1.9548411787217757, "grad_norm": 0.5523732900619507, "learning_rate": 1.3912267511042994e-05, "loss": 0.6445, "step": 5108 }, { "epoch": 1.955223880597015, "grad_norm": 0.5061286091804504, "learning_rate": 1.3909986013231609e-05, "loss": 0.7419, "step": 5109 }, { "epoch": 1.9556065824722542, "grad_norm": 0.5125958323478699, "learning_rate": 1.3907704275141077e-05, "loss": 0.673, "step": 5110 }, { "epoch": 1.9559892843474933, "grad_norm": 0.5342481136322021, "learning_rate": 1.3905422296911617e-05, "loss": 0.7006, "step": 5111 }, { "epoch": 1.9563719862227325, "grad_norm": 0.5182138085365295, "learning_rate": 1.3903140078683466e-05, "loss": 0.5919, "step": 5112 }, { "epoch": 1.9567546880979716, "grad_norm": 0.5899636149406433, "learning_rate": 1.3900857620596873e-05, "loss": 0.5831, "step": 5113 }, { "epoch": 1.957137389973211, "grad_norm": 0.5540575385093689, "learning_rate": 1.3898574922792099e-05, "loss": 0.6242, "step": 5114 }, { "epoch": 1.95752009184845, "grad_norm": 0.4901326894760132, "learning_rate": 1.3896291985409423e-05, "loss": 0.6247, "step": 5115 }, { "epoch": 1.9579027937236892, "grad_norm": 0.5516331791877747, "learning_rate": 1.3894008808589135e-05, "loss": 0.6338, "step": 5116 }, { "epoch": 1.9582854955989284, "grad_norm": 0.5270789265632629, "learning_rate": 1.3891725392471547e-05, "loss": 0.634, "step": 5117 }, { "epoch": 1.9586681974741675, "grad_norm": 0.5379692316055298, "learning_rate": 1.3889441737196975e-05, "loss": 0.6124, "step": 5118 }, { "epoch": 1.959050899349407, "grad_norm": 0.523565411567688, "learning_rate": 1.3887157842905764e-05, "loss": 0.7478, "step": 5119 }, { "epoch": 1.959433601224646, "grad_norm": 0.562812089920044, "learning_rate": 1.3884873709738259e-05, "loss": 0.6714, "step": 5120 }, { "epoch": 1.9598163030998852, "grad_norm": 0.4902704358100891, "learning_rate": 1.3882589337834827e-05, "loss": 0.6345, "step": 5121 }, { "epoch": 1.9601990049751243, "grad_norm": 0.5305050015449524, "learning_rate": 1.388030472733585e-05, "loss": 0.696, "step": 5122 }, { "epoch": 1.9605817068503635, "grad_norm": 0.6461338996887207, "learning_rate": 1.3878019878381722e-05, "loss": 0.6818, "step": 5123 }, { "epoch": 1.9609644087256028, "grad_norm": 0.5382905602455139, "learning_rate": 1.3875734791112857e-05, "loss": 0.5915, "step": 5124 }, { "epoch": 1.961347110600842, "grad_norm": 0.6086674332618713, "learning_rate": 1.3873449465669672e-05, "loss": 0.7377, "step": 5125 }, { "epoch": 1.9617298124760811, "grad_norm": 0.532233715057373, "learning_rate": 1.3871163902192614e-05, "loss": 0.7231, "step": 5126 }, { "epoch": 1.9621125143513203, "grad_norm": 0.5646494626998901, "learning_rate": 1.3868878100822139e-05, "loss": 0.5823, "step": 5127 }, { "epoch": 1.9624952162265594, "grad_norm": 0.5593141317367554, "learning_rate": 1.3866592061698705e-05, "loss": 0.6121, "step": 5128 }, { "epoch": 1.9628779181017988, "grad_norm": 0.49423760175704956, "learning_rate": 1.3864305784962804e-05, "loss": 0.692, "step": 5129 }, { "epoch": 1.963260619977038, "grad_norm": 0.5288196206092834, "learning_rate": 1.386201927075493e-05, "loss": 0.6149, "step": 5130 }, { "epoch": 1.963643321852277, "grad_norm": 0.5846158862113953, "learning_rate": 1.3859732519215596e-05, "loss": 0.6289, "step": 5131 }, { "epoch": 1.9640260237275162, "grad_norm": 0.5153148174285889, "learning_rate": 1.3857445530485332e-05, "loss": 0.6243, "step": 5132 }, { "epoch": 1.9644087256027554, "grad_norm": 0.5775570869445801, "learning_rate": 1.3855158304704674e-05, "loss": 0.7122, "step": 5133 }, { "epoch": 1.9647914274779947, "grad_norm": 0.5304251313209534, "learning_rate": 1.3852870842014186e-05, "loss": 0.667, "step": 5134 }, { "epoch": 1.9651741293532339, "grad_norm": 0.5305089354515076, "learning_rate": 1.385058314255443e-05, "loss": 0.6699, "step": 5135 }, { "epoch": 1.965556831228473, "grad_norm": 0.516484797000885, "learning_rate": 1.3848295206465994e-05, "loss": 0.6618, "step": 5136 }, { "epoch": 1.9659395331037122, "grad_norm": 0.5109000205993652, "learning_rate": 1.3846007033889483e-05, "loss": 0.6344, "step": 5137 }, { "epoch": 1.9663222349789513, "grad_norm": 0.5163682699203491, "learning_rate": 1.3843718624965505e-05, "loss": 0.726, "step": 5138 }, { "epoch": 1.9667049368541907, "grad_norm": 0.505703330039978, "learning_rate": 1.3841429979834695e-05, "loss": 0.6725, "step": 5139 }, { "epoch": 1.9670876387294298, "grad_norm": 0.578650712966919, "learning_rate": 1.3839141098637691e-05, "loss": 0.6047, "step": 5140 }, { "epoch": 1.967470340604669, "grad_norm": 0.49557697772979736, "learning_rate": 1.3836851981515149e-05, "loss": 0.5589, "step": 5141 }, { "epoch": 1.967853042479908, "grad_norm": 0.5385852456092834, "learning_rate": 1.3834562628607749e-05, "loss": 0.7278, "step": 5142 }, { "epoch": 1.9682357443551473, "grad_norm": 0.48429349064826965, "learning_rate": 1.3832273040056174e-05, "loss": 0.6269, "step": 5143 }, { "epoch": 1.9686184462303866, "grad_norm": 0.4777600169181824, "learning_rate": 1.3829983216001123e-05, "loss": 0.6316, "step": 5144 }, { "epoch": 1.9690011481056258, "grad_norm": 0.507119357585907, "learning_rate": 1.3827693156583318e-05, "loss": 0.6449, "step": 5145 }, { "epoch": 1.969383849980865, "grad_norm": 0.5230934619903564, "learning_rate": 1.382540286194348e-05, "loss": 0.7239, "step": 5146 }, { "epoch": 1.969766551856104, "grad_norm": 0.537909984588623, "learning_rate": 1.382311233222236e-05, "loss": 0.7233, "step": 5147 }, { "epoch": 1.9701492537313432, "grad_norm": 0.4666156768798828, "learning_rate": 1.3820821567560722e-05, "loss": 0.6264, "step": 5148 }, { "epoch": 1.9705319556065826, "grad_norm": 0.5180084109306335, "learning_rate": 1.3818530568099328e-05, "loss": 0.666, "step": 5149 }, { "epoch": 1.9709146574818217, "grad_norm": 0.5052936673164368, "learning_rate": 1.3816239333978977e-05, "loss": 0.6299, "step": 5150 }, { "epoch": 1.9712973593570609, "grad_norm": 0.5937554240226746, "learning_rate": 1.3813947865340462e-05, "loss": 0.6791, "step": 5151 }, { "epoch": 1.9716800612323, "grad_norm": 0.5167226195335388, "learning_rate": 1.381165616232461e-05, "loss": 0.6756, "step": 5152 }, { "epoch": 1.9720627631075391, "grad_norm": 0.5173790454864502, "learning_rate": 1.3809364225072243e-05, "loss": 0.7067, "step": 5153 }, { "epoch": 1.9724454649827785, "grad_norm": 0.5303701758384705, "learning_rate": 1.380707205372421e-05, "loss": 0.7325, "step": 5154 }, { "epoch": 1.9728281668580177, "grad_norm": 0.5522913932800293, "learning_rate": 1.3804779648421373e-05, "loss": 0.7011, "step": 5155 }, { "epoch": 1.9732108687332568, "grad_norm": 0.4934369623661041, "learning_rate": 1.3802487009304606e-05, "loss": 0.6143, "step": 5156 }, { "epoch": 1.973593570608496, "grad_norm": 0.502781331539154, "learning_rate": 1.3800194136514795e-05, "loss": 0.6278, "step": 5157 }, { "epoch": 1.973976272483735, "grad_norm": 0.5830462574958801, "learning_rate": 1.3797901030192847e-05, "loss": 0.6579, "step": 5158 }, { "epoch": 1.9743589743589745, "grad_norm": 0.553963303565979, "learning_rate": 1.3795607690479678e-05, "loss": 0.6859, "step": 5159 }, { "epoch": 1.9747416762342136, "grad_norm": 0.48884931206703186, "learning_rate": 1.3793314117516218e-05, "loss": 0.6979, "step": 5160 }, { "epoch": 1.9751243781094527, "grad_norm": 0.5112025737762451, "learning_rate": 1.3791020311443415e-05, "loss": 0.678, "step": 5161 }, { "epoch": 1.9755070799846919, "grad_norm": 0.6078619360923767, "learning_rate": 1.3788726272402228e-05, "loss": 0.6463, "step": 5162 }, { "epoch": 1.975889781859931, "grad_norm": 0.5119839310646057, "learning_rate": 1.3786432000533636e-05, "loss": 0.719, "step": 5163 }, { "epoch": 1.9762724837351704, "grad_norm": 0.4974619448184967, "learning_rate": 1.3784137495978623e-05, "loss": 0.5882, "step": 5164 }, { "epoch": 1.9766551856104095, "grad_norm": 0.5197635293006897, "learning_rate": 1.3781842758878197e-05, "loss": 0.6462, "step": 5165 }, { "epoch": 1.9770378874856487, "grad_norm": 0.535258412361145, "learning_rate": 1.377954778937337e-05, "loss": 0.6527, "step": 5166 }, { "epoch": 1.9774205893608878, "grad_norm": 0.8389122486114502, "learning_rate": 1.3777252587605182e-05, "loss": 0.7509, "step": 5167 }, { "epoch": 1.977803291236127, "grad_norm": 0.5199759006500244, "learning_rate": 1.3774957153714672e-05, "loss": 0.6188, "step": 5168 }, { "epoch": 1.9781859931113663, "grad_norm": 0.5231986045837402, "learning_rate": 1.3772661487842906e-05, "loss": 0.6276, "step": 5169 }, { "epoch": 1.9785686949866055, "grad_norm": 0.48059937357902527, "learning_rate": 1.3770365590130955e-05, "loss": 0.6556, "step": 5170 }, { "epoch": 1.9789513968618446, "grad_norm": 0.5459837913513184, "learning_rate": 1.3768069460719909e-05, "loss": 0.6149, "step": 5171 }, { "epoch": 1.9793340987370838, "grad_norm": 0.6127578616142273, "learning_rate": 1.376577309975087e-05, "loss": 0.6143, "step": 5172 }, { "epoch": 1.979716800612323, "grad_norm": 0.5472574234008789, "learning_rate": 1.3763476507364962e-05, "loss": 0.7786, "step": 5173 }, { "epoch": 1.9800995024875623, "grad_norm": 0.4878825545310974, "learning_rate": 1.376117968370331e-05, "loss": 0.6279, "step": 5174 }, { "epoch": 1.9804822043628014, "grad_norm": 0.5258947014808655, "learning_rate": 1.375888262890706e-05, "loss": 0.6584, "step": 5175 }, { "epoch": 1.9808649062380406, "grad_norm": 0.5115512013435364, "learning_rate": 1.3756585343117374e-05, "loss": 0.5901, "step": 5176 }, { "epoch": 1.9812476081132797, "grad_norm": 0.5594764351844788, "learning_rate": 1.3754287826475428e-05, "loss": 0.5873, "step": 5177 }, { "epoch": 1.9816303099885189, "grad_norm": 0.5218705534934998, "learning_rate": 1.3751990079122412e-05, "loss": 0.6849, "step": 5178 }, { "epoch": 1.9820130118637582, "grad_norm": 0.5695292949676514, "learning_rate": 1.3749692101199524e-05, "loss": 0.638, "step": 5179 }, { "epoch": 1.9823957137389974, "grad_norm": 0.5549219846725464, "learning_rate": 1.3747393892847983e-05, "loss": 0.5731, "step": 5180 }, { "epoch": 1.9827784156142365, "grad_norm": 0.5046568512916565, "learning_rate": 1.3745095454209017e-05, "loss": 0.6847, "step": 5181 }, { "epoch": 1.9831611174894757, "grad_norm": 0.5458203554153442, "learning_rate": 1.3742796785423878e-05, "loss": 0.6365, "step": 5182 }, { "epoch": 1.9835438193647148, "grad_norm": 0.5834252238273621, "learning_rate": 1.374049788663382e-05, "loss": 0.7136, "step": 5183 }, { "epoch": 1.9839265212399542, "grad_norm": 0.5810561776161194, "learning_rate": 1.373819875798012e-05, "loss": 0.6078, "step": 5184 }, { "epoch": 1.9843092231151933, "grad_norm": 0.5289281010627747, "learning_rate": 1.373589939960406e-05, "loss": 0.6661, "step": 5185 }, { "epoch": 1.9846919249904325, "grad_norm": 0.6131139397621155, "learning_rate": 1.3733599811646947e-05, "loss": 0.7021, "step": 5186 }, { "epoch": 1.9850746268656716, "grad_norm": 0.5704060792922974, "learning_rate": 1.3731299994250097e-05, "loss": 0.6814, "step": 5187 }, { "epoch": 1.9854573287409107, "grad_norm": 0.5289977788925171, "learning_rate": 1.3728999947554835e-05, "loss": 0.687, "step": 5188 }, { "epoch": 1.98584003061615, "grad_norm": 0.4747069180011749, "learning_rate": 1.3726699671702512e-05, "loss": 0.6663, "step": 5189 }, { "epoch": 1.9862227324913893, "grad_norm": 0.5633859634399414, "learning_rate": 1.3724399166834478e-05, "loss": 0.6391, "step": 5190 }, { "epoch": 1.9866054343666284, "grad_norm": 0.4911225438117981, "learning_rate": 1.3722098433092113e-05, "loss": 0.6852, "step": 5191 }, { "epoch": 1.9869881362418675, "grad_norm": 0.6485611200332642, "learning_rate": 1.37197974706168e-05, "loss": 0.6369, "step": 5192 }, { "epoch": 1.9873708381171067, "grad_norm": 0.49942392110824585, "learning_rate": 1.3717496279549937e-05, "loss": 0.678, "step": 5193 }, { "epoch": 1.987753539992346, "grad_norm": 0.5616331696510315, "learning_rate": 1.3715194860032944e-05, "loss": 0.7494, "step": 5194 }, { "epoch": 1.9881362418675852, "grad_norm": 0.5305683016777039, "learning_rate": 1.3712893212207245e-05, "loss": 0.6897, "step": 5195 }, { "epoch": 1.9885189437428243, "grad_norm": 0.5672233700752258, "learning_rate": 1.3710591336214281e-05, "loss": 0.7204, "step": 5196 }, { "epoch": 1.9889016456180635, "grad_norm": 0.5144281387329102, "learning_rate": 1.3708289232195513e-05, "loss": 0.6343, "step": 5197 }, { "epoch": 1.9892843474933026, "grad_norm": 0.5401279330253601, "learning_rate": 1.3705986900292408e-05, "loss": 0.602, "step": 5198 }, { "epoch": 1.989667049368542, "grad_norm": 0.5375946164131165, "learning_rate": 1.3703684340646453e-05, "loss": 0.6424, "step": 5199 }, { "epoch": 1.9900497512437811, "grad_norm": 0.508700430393219, "learning_rate": 1.3701381553399147e-05, "loss": 0.6365, "step": 5200 }, { "epoch": 1.9904324531190203, "grad_norm": 0.47800213098526, "learning_rate": 1.3699078538691994e-05, "loss": 0.628, "step": 5201 }, { "epoch": 1.9908151549942594, "grad_norm": 0.5030466914176941, "learning_rate": 1.3696775296666536e-05, "loss": 0.6217, "step": 5202 }, { "epoch": 1.9911978568694986, "grad_norm": 0.4894194006919861, "learning_rate": 1.3694471827464302e-05, "loss": 0.6373, "step": 5203 }, { "epoch": 1.991580558744738, "grad_norm": 0.5737351775169373, "learning_rate": 1.3692168131226847e-05, "loss": 0.644, "step": 5204 }, { "epoch": 1.991963260619977, "grad_norm": 0.5317250490188599, "learning_rate": 1.3689864208095742e-05, "loss": 0.6621, "step": 5205 }, { "epoch": 1.9923459624952162, "grad_norm": 0.57501620054245, "learning_rate": 1.3687560058212568e-05, "loss": 0.6476, "step": 5206 }, { "epoch": 1.9927286643704554, "grad_norm": 0.5421673655509949, "learning_rate": 1.3685255681718922e-05, "loss": 0.6446, "step": 5207 }, { "epoch": 1.9931113662456945, "grad_norm": 0.5767539143562317, "learning_rate": 1.3682951078756417e-05, "loss": 0.6929, "step": 5208 }, { "epoch": 1.9934940681209339, "grad_norm": 0.4939613342285156, "learning_rate": 1.368064624946667e-05, "loss": 0.6689, "step": 5209 }, { "epoch": 1.993876769996173, "grad_norm": 0.4793926775455475, "learning_rate": 1.3678341193991324e-05, "loss": 0.6561, "step": 5210 }, { "epoch": 1.9942594718714122, "grad_norm": 0.538559079170227, "learning_rate": 1.3676035912472033e-05, "loss": 0.6467, "step": 5211 }, { "epoch": 1.9946421737466513, "grad_norm": 0.5289633274078369, "learning_rate": 1.3673730405050454e-05, "loss": 0.6857, "step": 5212 }, { "epoch": 1.9950248756218905, "grad_norm": 0.5130329132080078, "learning_rate": 1.3671424671868277e-05, "loss": 0.6665, "step": 5213 }, { "epoch": 1.9954075774971298, "grad_norm": 0.5721285343170166, "learning_rate": 1.3669118713067187e-05, "loss": 0.6278, "step": 5214 }, { "epoch": 1.995790279372369, "grad_norm": 0.5079842209815979, "learning_rate": 1.3666812528788897e-05, "loss": 0.6101, "step": 5215 }, { "epoch": 1.9961729812476081, "grad_norm": 0.5093860030174255, "learning_rate": 1.3664506119175128e-05, "loss": 0.6231, "step": 5216 }, { "epoch": 1.9965556831228473, "grad_norm": 2.2451181411743164, "learning_rate": 1.3662199484367609e-05, "loss": 0.6153, "step": 5217 }, { "epoch": 1.9969383849980864, "grad_norm": 0.6654559969902039, "learning_rate": 1.3659892624508096e-05, "loss": 0.6772, "step": 5218 }, { "epoch": 1.9973210868733258, "grad_norm": 0.5484278202056885, "learning_rate": 1.3657585539738349e-05, "loss": 0.6439, "step": 5219 }, { "epoch": 1.997703788748565, "grad_norm": 0.5333636999130249, "learning_rate": 1.3655278230200144e-05, "loss": 0.5926, "step": 5220 }, { "epoch": 1.998086490623804, "grad_norm": 0.4918729364871979, "learning_rate": 1.365297069603527e-05, "loss": 0.5818, "step": 5221 }, { "epoch": 1.9984691924990432, "grad_norm": 0.5222769975662231, "learning_rate": 1.3650662937385537e-05, "loss": 0.6544, "step": 5222 }, { "epoch": 1.9988518943742823, "grad_norm": 0.5579756498336792, "learning_rate": 1.3648354954392758e-05, "loss": 0.6997, "step": 5223 }, { "epoch": 1.9992345962495217, "grad_norm": 0.5440437197685242, "learning_rate": 1.3646046747198763e-05, "loss": 0.6345, "step": 5224 }, { "epoch": 1.9996172981247609, "grad_norm": 0.5519583821296692, "learning_rate": 1.3643738315945404e-05, "loss": 0.5683, "step": 5225 }, { "epoch": 2.0, "grad_norm": 0.4951680898666382, "learning_rate": 1.3641429660774535e-05, "loss": 0.661, "step": 5226 }, { "epoch": 2.0003827018752394, "grad_norm": 0.5355364084243774, "learning_rate": 1.3639120781828028e-05, "loss": 0.6531, "step": 5227 }, { "epoch": 2.0007654037504783, "grad_norm": 0.5056034326553345, "learning_rate": 1.3636811679247775e-05, "loss": 0.6265, "step": 5228 }, { "epoch": 2.0011481056257177, "grad_norm": 0.5147846341133118, "learning_rate": 1.3634502353175676e-05, "loss": 0.6778, "step": 5229 }, { "epoch": 2.0015308075009566, "grad_norm": 0.5735732913017273, "learning_rate": 1.363219280375364e-05, "loss": 0.6913, "step": 5230 }, { "epoch": 2.001913509376196, "grad_norm": 0.5536653399467468, "learning_rate": 1.3629883031123599e-05, "loss": 0.6279, "step": 5231 }, { "epoch": 2.0022962112514353, "grad_norm": 0.5538913011550903, "learning_rate": 1.3627573035427494e-05, "loss": 0.6471, "step": 5232 }, { "epoch": 2.0026789131266742, "grad_norm": 0.5358171463012695, "learning_rate": 1.3625262816807284e-05, "loss": 0.6157, "step": 5233 }, { "epoch": 2.0030616150019136, "grad_norm": 0.49367743730545044, "learning_rate": 1.3622952375404931e-05, "loss": 0.6108, "step": 5234 }, { "epoch": 2.0034443168771525, "grad_norm": 0.6143732070922852, "learning_rate": 1.3620641711362423e-05, "loss": 0.665, "step": 5235 }, { "epoch": 2.003827018752392, "grad_norm": 0.5615062117576599, "learning_rate": 1.3618330824821753e-05, "loss": 0.5569, "step": 5236 }, { "epoch": 2.0042097206276313, "grad_norm": 0.5346431136131287, "learning_rate": 1.3616019715924936e-05, "loss": 0.6596, "step": 5237 }, { "epoch": 2.00459242250287, "grad_norm": 0.4895579516887665, "learning_rate": 1.361370838481399e-05, "loss": 0.6392, "step": 5238 }, { "epoch": 2.0049751243781095, "grad_norm": 0.5618073344230652, "learning_rate": 1.3611396831630961e-05, "loss": 0.7393, "step": 5239 }, { "epoch": 2.0053578262533485, "grad_norm": 0.5236818194389343, "learning_rate": 1.3609085056517888e-05, "loss": 0.61, "step": 5240 }, { "epoch": 2.005740528128588, "grad_norm": 0.5355179309844971, "learning_rate": 1.3606773059616845e-05, "loss": 0.6504, "step": 5241 }, { "epoch": 2.006123230003827, "grad_norm": 0.6667188405990601, "learning_rate": 1.360446084106991e-05, "loss": 0.5959, "step": 5242 }, { "epoch": 2.006505931879066, "grad_norm": 0.5397460460662842, "learning_rate": 1.3602148401019171e-05, "loss": 0.6306, "step": 5243 }, { "epoch": 2.0068886337543055, "grad_norm": 0.5439111590385437, "learning_rate": 1.3599835739606737e-05, "loss": 0.6042, "step": 5244 }, { "epoch": 2.0072713356295444, "grad_norm": 0.5194686651229858, "learning_rate": 1.3597522856974724e-05, "loss": 0.6523, "step": 5245 }, { "epoch": 2.0076540375047838, "grad_norm": 0.6341765522956848, "learning_rate": 1.3595209753265266e-05, "loss": 0.6393, "step": 5246 }, { "epoch": 2.008036739380023, "grad_norm": 0.5256776809692383, "learning_rate": 1.3592896428620515e-05, "loss": 0.584, "step": 5247 }, { "epoch": 2.008419441255262, "grad_norm": 0.4851900339126587, "learning_rate": 1.3590582883182622e-05, "loss": 0.6445, "step": 5248 }, { "epoch": 2.0088021431305014, "grad_norm": 0.5418990850448608, "learning_rate": 1.3588269117093767e-05, "loss": 0.7041, "step": 5249 }, { "epoch": 2.0091848450057403, "grad_norm": 0.48794522881507874, "learning_rate": 1.3585955130496134e-05, "loss": 0.6527, "step": 5250 }, { "epoch": 2.0095675468809797, "grad_norm": 0.595832109451294, "learning_rate": 1.3583640923531922e-05, "loss": 0.6945, "step": 5251 }, { "epoch": 2.009950248756219, "grad_norm": 0.5891751646995544, "learning_rate": 1.3581326496343353e-05, "loss": 0.5578, "step": 5252 }, { "epoch": 2.010332950631458, "grad_norm": 0.5317770838737488, "learning_rate": 1.3579011849072643e-05, "loss": 0.6971, "step": 5253 }, { "epoch": 2.0107156525066974, "grad_norm": 0.5101879239082336, "learning_rate": 1.3576696981862045e-05, "loss": 0.66, "step": 5254 }, { "epoch": 2.0110983543819363, "grad_norm": 0.4909704029560089, "learning_rate": 1.357438189485381e-05, "loss": 0.6265, "step": 5255 }, { "epoch": 2.0114810562571757, "grad_norm": 0.500870943069458, "learning_rate": 1.35720665881902e-05, "loss": 0.6863, "step": 5256 }, { "epoch": 2.011863758132415, "grad_norm": 0.516210675239563, "learning_rate": 1.3569751062013502e-05, "loss": 0.6973, "step": 5257 }, { "epoch": 2.012246460007654, "grad_norm": 0.5575618147850037, "learning_rate": 1.3567435316466016e-05, "loss": 0.6327, "step": 5258 }, { "epoch": 2.0126291618828933, "grad_norm": 0.5499412417411804, "learning_rate": 1.3565119351690043e-05, "loss": 0.6068, "step": 5259 }, { "epoch": 2.0130118637581322, "grad_norm": 0.5965864658355713, "learning_rate": 1.356280316782791e-05, "loss": 0.5934, "step": 5260 }, { "epoch": 2.0133945656333716, "grad_norm": 0.5181757807731628, "learning_rate": 1.3560486765021947e-05, "loss": 0.6443, "step": 5261 }, { "epoch": 2.013777267508611, "grad_norm": 0.48690128326416016, "learning_rate": 1.355817014341451e-05, "loss": 0.654, "step": 5262 }, { "epoch": 2.01415996938385, "grad_norm": 0.5287138819694519, "learning_rate": 1.3555853303147957e-05, "loss": 0.6428, "step": 5263 }, { "epoch": 2.0145426712590893, "grad_norm": 0.6182487607002258, "learning_rate": 1.3553536244364669e-05, "loss": 0.6274, "step": 5264 }, { "epoch": 2.014925373134328, "grad_norm": 0.5536932945251465, "learning_rate": 1.3551218967207032e-05, "loss": 0.6588, "step": 5265 }, { "epoch": 2.0153080750095675, "grad_norm": 0.5185964107513428, "learning_rate": 1.3548901471817444e-05, "loss": 0.6176, "step": 5266 }, { "epoch": 2.015690776884807, "grad_norm": 0.5107839107513428, "learning_rate": 1.354658375833833e-05, "loss": 0.6578, "step": 5267 }, { "epoch": 2.016073478760046, "grad_norm": 0.5735543966293335, "learning_rate": 1.3544265826912118e-05, "loss": 0.6851, "step": 5268 }, { "epoch": 2.016456180635285, "grad_norm": 0.5035829544067383, "learning_rate": 1.3541947677681246e-05, "loss": 0.661, "step": 5269 }, { "epoch": 2.016838882510524, "grad_norm": 0.5673652291297913, "learning_rate": 1.3539629310788177e-05, "loss": 0.6318, "step": 5270 }, { "epoch": 2.0172215843857635, "grad_norm": 0.5079026222229004, "learning_rate": 1.3537310726375375e-05, "loss": 0.589, "step": 5271 }, { "epoch": 2.017604286261003, "grad_norm": 0.5301870703697205, "learning_rate": 1.3534991924585326e-05, "loss": 0.6393, "step": 5272 }, { "epoch": 2.0179869881362418, "grad_norm": 0.5046024918556213, "learning_rate": 1.3532672905560527e-05, "loss": 0.6123, "step": 5273 }, { "epoch": 2.018369690011481, "grad_norm": 0.582085132598877, "learning_rate": 1.3530353669443486e-05, "loss": 0.5988, "step": 5274 }, { "epoch": 2.01875239188672, "grad_norm": 0.5233427286148071, "learning_rate": 1.3528034216376731e-05, "loss": 0.6601, "step": 5275 }, { "epoch": 2.0191350937619594, "grad_norm": 0.5118962526321411, "learning_rate": 1.3525714546502793e-05, "loss": 0.6488, "step": 5276 }, { "epoch": 2.019517795637199, "grad_norm": 0.46152663230895996, "learning_rate": 1.3523394659964224e-05, "loss": 0.5666, "step": 5277 }, { "epoch": 2.0199004975124377, "grad_norm": 0.4878464639186859, "learning_rate": 1.352107455690359e-05, "loss": 0.5689, "step": 5278 }, { "epoch": 2.020283199387677, "grad_norm": 0.567036509513855, "learning_rate": 1.3518754237463463e-05, "loss": 0.6179, "step": 5279 }, { "epoch": 2.020665901262916, "grad_norm": 0.5744159817695618, "learning_rate": 1.3516433701786438e-05, "loss": 0.5806, "step": 5280 }, { "epoch": 2.0210486031381554, "grad_norm": 0.5768238306045532, "learning_rate": 1.351411295001511e-05, "loss": 0.6023, "step": 5281 }, { "epoch": 2.0214313050133947, "grad_norm": 0.5138521194458008, "learning_rate": 1.3511791982292103e-05, "loss": 0.609, "step": 5282 }, { "epoch": 2.0218140068886337, "grad_norm": 0.5265240669250488, "learning_rate": 1.3509470798760048e-05, "loss": 0.7057, "step": 5283 }, { "epoch": 2.022196708763873, "grad_norm": 0.505206823348999, "learning_rate": 1.350714939956158e-05, "loss": 0.6035, "step": 5284 }, { "epoch": 2.022579410639112, "grad_norm": 0.5164929628372192, "learning_rate": 1.350482778483936e-05, "loss": 0.6208, "step": 5285 }, { "epoch": 2.0229621125143513, "grad_norm": 0.5764365196228027, "learning_rate": 1.3502505954736058e-05, "loss": 0.5414, "step": 5286 }, { "epoch": 2.0233448143895907, "grad_norm": 0.590478241443634, "learning_rate": 1.3500183909394356e-05, "loss": 0.6514, "step": 5287 }, { "epoch": 2.0237275162648296, "grad_norm": 0.5675897598266602, "learning_rate": 1.3497861648956949e-05, "loss": 0.5504, "step": 5288 }, { "epoch": 2.024110218140069, "grad_norm": 0.5738194584846497, "learning_rate": 1.3495539173566546e-05, "loss": 0.6659, "step": 5289 }, { "epoch": 2.024492920015308, "grad_norm": 0.5273186564445496, "learning_rate": 1.349321648336587e-05, "loss": 0.6024, "step": 5290 }, { "epoch": 2.0248756218905473, "grad_norm": 0.569487988948822, "learning_rate": 1.3490893578497657e-05, "loss": 0.6401, "step": 5291 }, { "epoch": 2.0252583237657866, "grad_norm": 0.5916973352432251, "learning_rate": 1.3488570459104656e-05, "loss": 0.5521, "step": 5292 }, { "epoch": 2.0256410256410255, "grad_norm": 0.5940636396408081, "learning_rate": 1.3486247125329627e-05, "loss": 0.6619, "step": 5293 }, { "epoch": 2.026023727516265, "grad_norm": 0.5991669297218323, "learning_rate": 1.3483923577315347e-05, "loss": 0.7002, "step": 5294 }, { "epoch": 2.026406429391504, "grad_norm": 0.5297185182571411, "learning_rate": 1.3481599815204604e-05, "loss": 0.6063, "step": 5295 }, { "epoch": 2.026789131266743, "grad_norm": 0.5051356554031372, "learning_rate": 1.3479275839140197e-05, "loss": 0.613, "step": 5296 }, { "epoch": 2.0271718331419826, "grad_norm": 0.547937273979187, "learning_rate": 1.3476951649264944e-05, "loss": 0.6903, "step": 5297 }, { "epoch": 2.0275545350172215, "grad_norm": 0.564597487449646, "learning_rate": 1.3474627245721675e-05, "loss": 0.6764, "step": 5298 }, { "epoch": 2.027937236892461, "grad_norm": 0.5641602277755737, "learning_rate": 1.3472302628653224e-05, "loss": 0.7009, "step": 5299 }, { "epoch": 2.0283199387677, "grad_norm": 0.5681803822517395, "learning_rate": 1.3469977798202447e-05, "loss": 0.6978, "step": 5300 }, { "epoch": 2.028702640642939, "grad_norm": 0.5617138743400574, "learning_rate": 1.3467652754512213e-05, "loss": 0.6518, "step": 5301 }, { "epoch": 2.0290853425181785, "grad_norm": 0.5467539429664612, "learning_rate": 1.34653274977254e-05, "loss": 0.6725, "step": 5302 }, { "epoch": 2.0294680443934174, "grad_norm": 0.5137493014335632, "learning_rate": 1.3463002027984907e-05, "loss": 0.6977, "step": 5303 }, { "epoch": 2.029850746268657, "grad_norm": 0.5277265310287476, "learning_rate": 1.3460676345433635e-05, "loss": 0.6618, "step": 5304 }, { "epoch": 2.0302334481438957, "grad_norm": 0.5391505360603333, "learning_rate": 1.3458350450214501e-05, "loss": 0.6651, "step": 5305 }, { "epoch": 2.030616150019135, "grad_norm": 0.5061625838279724, "learning_rate": 1.3456024342470445e-05, "loss": 0.6406, "step": 5306 }, { "epoch": 2.0309988518943745, "grad_norm": 0.5196017622947693, "learning_rate": 1.3453698022344406e-05, "loss": 0.6336, "step": 5307 }, { "epoch": 2.0313815537696134, "grad_norm": 0.5310797095298767, "learning_rate": 1.3451371489979346e-05, "loss": 0.6884, "step": 5308 }, { "epoch": 2.0317642556448527, "grad_norm": 0.5268329381942749, "learning_rate": 1.3449044745518235e-05, "loss": 0.6576, "step": 5309 }, { "epoch": 2.0321469575200917, "grad_norm": 0.5304811596870422, "learning_rate": 1.344671778910406e-05, "loss": 0.7165, "step": 5310 }, { "epoch": 2.032529659395331, "grad_norm": 0.5653570294380188, "learning_rate": 1.3444390620879816e-05, "loss": 0.699, "step": 5311 }, { "epoch": 2.0329123612705704, "grad_norm": 0.5601182579994202, "learning_rate": 1.3442063240988517e-05, "loss": 0.6183, "step": 5312 }, { "epoch": 2.0332950631458093, "grad_norm": 0.49808186292648315, "learning_rate": 1.3439735649573182e-05, "loss": 0.6448, "step": 5313 }, { "epoch": 2.0336777650210487, "grad_norm": 0.5071684122085571, "learning_rate": 1.3437407846776854e-05, "loss": 0.661, "step": 5314 }, { "epoch": 2.0340604668962876, "grad_norm": 0.5322943329811096, "learning_rate": 1.3435079832742577e-05, "loss": 0.6534, "step": 5315 }, { "epoch": 2.034443168771527, "grad_norm": 0.5875592827796936, "learning_rate": 1.3432751607613415e-05, "loss": 0.5789, "step": 5316 }, { "epoch": 2.0348258706467663, "grad_norm": 0.5410504341125488, "learning_rate": 1.3430423171532445e-05, "loss": 0.6245, "step": 5317 }, { "epoch": 2.0352085725220053, "grad_norm": 0.5096706748008728, "learning_rate": 1.3428094524642756e-05, "loss": 0.6528, "step": 5318 }, { "epoch": 2.0355912743972446, "grad_norm": 0.5299837589263916, "learning_rate": 1.3425765667087452e-05, "loss": 0.5979, "step": 5319 }, { "epoch": 2.0359739762724836, "grad_norm": 0.5070053935050964, "learning_rate": 1.342343659900964e-05, "loss": 0.7146, "step": 5320 }, { "epoch": 2.036356678147723, "grad_norm": 0.5211166143417358, "learning_rate": 1.3421107320552453e-05, "loss": 0.7084, "step": 5321 }, { "epoch": 2.0367393800229623, "grad_norm": 0.5414469838142395, "learning_rate": 1.3418777831859031e-05, "loss": 0.5885, "step": 5322 }, { "epoch": 2.037122081898201, "grad_norm": 0.5955721735954285, "learning_rate": 1.3416448133072525e-05, "loss": 0.7064, "step": 5323 }, { "epoch": 2.0375047837734406, "grad_norm": 0.5270006656646729, "learning_rate": 1.3414118224336105e-05, "loss": 0.6221, "step": 5324 }, { "epoch": 2.0378874856486795, "grad_norm": 0.5851909518241882, "learning_rate": 1.3411788105792951e-05, "loss": 0.6171, "step": 5325 }, { "epoch": 2.038270187523919, "grad_norm": 0.5406605005264282, "learning_rate": 1.3409457777586247e-05, "loss": 0.6667, "step": 5326 }, { "epoch": 2.0386528893991582, "grad_norm": 0.5703496336936951, "learning_rate": 1.3407127239859203e-05, "loss": 0.6484, "step": 5327 }, { "epoch": 2.039035591274397, "grad_norm": 0.523350715637207, "learning_rate": 1.340479649275504e-05, "loss": 0.6511, "step": 5328 }, { "epoch": 2.0394182931496365, "grad_norm": 0.5669201612472534, "learning_rate": 1.3402465536416984e-05, "loss": 0.5894, "step": 5329 }, { "epoch": 2.0398009950248754, "grad_norm": 0.564652681350708, "learning_rate": 1.3400134370988283e-05, "loss": 0.6461, "step": 5330 }, { "epoch": 2.040183696900115, "grad_norm": 0.590325117111206, "learning_rate": 1.3397802996612187e-05, "loss": 0.6292, "step": 5331 }, { "epoch": 2.040566398775354, "grad_norm": 0.5465552806854248, "learning_rate": 1.3395471413431968e-05, "loss": 0.6201, "step": 5332 }, { "epoch": 2.040949100650593, "grad_norm": 0.5317513346672058, "learning_rate": 1.339313962159091e-05, "loss": 0.6838, "step": 5333 }, { "epoch": 2.0413318025258325, "grad_norm": 0.5536258220672607, "learning_rate": 1.3390807621232308e-05, "loss": 0.634, "step": 5334 }, { "epoch": 2.0417145044010714, "grad_norm": 0.5400696396827698, "learning_rate": 1.3388475412499469e-05, "loss": 0.6039, "step": 5335 }, { "epoch": 2.0420972062763108, "grad_norm": 0.5231133699417114, "learning_rate": 1.3386142995535711e-05, "loss": 0.704, "step": 5336 }, { "epoch": 2.04247990815155, "grad_norm": 0.5076937079429626, "learning_rate": 1.3383810370484367e-05, "loss": 0.6238, "step": 5337 }, { "epoch": 2.042862610026789, "grad_norm": 0.5154184699058533, "learning_rate": 1.338147753748879e-05, "loss": 0.6948, "step": 5338 }, { "epoch": 2.0432453119020284, "grad_norm": 0.6288823485374451, "learning_rate": 1.3379144496692332e-05, "loss": 0.6181, "step": 5339 }, { "epoch": 2.0436280137772673, "grad_norm": 0.5268318057060242, "learning_rate": 1.3376811248238366e-05, "loss": 0.6842, "step": 5340 }, { "epoch": 2.0440107156525067, "grad_norm": 0.5423926115036011, "learning_rate": 1.3374477792270276e-05, "loss": 0.6943, "step": 5341 }, { "epoch": 2.044393417527746, "grad_norm": 0.5617265105247498, "learning_rate": 1.337214412893146e-05, "loss": 0.6252, "step": 5342 }, { "epoch": 2.044776119402985, "grad_norm": 0.5174399018287659, "learning_rate": 1.336981025836533e-05, "loss": 0.7308, "step": 5343 }, { "epoch": 2.0451588212782243, "grad_norm": 0.5025792717933655, "learning_rate": 1.3367476180715304e-05, "loss": 0.644, "step": 5344 }, { "epoch": 2.0455415231534633, "grad_norm": 0.5799979567527771, "learning_rate": 1.3365141896124824e-05, "loss": 0.6389, "step": 5345 }, { "epoch": 2.0459242250287026, "grad_norm": 0.5486041307449341, "learning_rate": 1.3362807404737332e-05, "loss": 0.6241, "step": 5346 }, { "epoch": 2.046306926903942, "grad_norm": 0.5042457580566406, "learning_rate": 1.336047270669629e-05, "loss": 0.5964, "step": 5347 }, { "epoch": 2.046689628779181, "grad_norm": 0.5568341612815857, "learning_rate": 1.3358137802145174e-05, "loss": 0.6819, "step": 5348 }, { "epoch": 2.0470723306544203, "grad_norm": 0.5242086052894592, "learning_rate": 1.3355802691227466e-05, "loss": 0.6003, "step": 5349 }, { "epoch": 2.047455032529659, "grad_norm": 0.5690749287605286, "learning_rate": 1.335346737408667e-05, "loss": 0.6385, "step": 5350 }, { "epoch": 2.0478377344048986, "grad_norm": 0.591959536075592, "learning_rate": 1.3351131850866295e-05, "loss": 0.6714, "step": 5351 }, { "epoch": 2.048220436280138, "grad_norm": 0.5529099106788635, "learning_rate": 1.3348796121709862e-05, "loss": 0.6544, "step": 5352 }, { "epoch": 2.048603138155377, "grad_norm": 0.5768452286720276, "learning_rate": 1.3346460186760913e-05, "loss": 0.6887, "step": 5353 }, { "epoch": 2.0489858400306162, "grad_norm": 0.5399566292762756, "learning_rate": 1.3344124046162996e-05, "loss": 0.5643, "step": 5354 }, { "epoch": 2.049368541905855, "grad_norm": 0.5460125803947449, "learning_rate": 1.3341787700059672e-05, "loss": 0.6587, "step": 5355 }, { "epoch": 2.0497512437810945, "grad_norm": 0.5021165013313293, "learning_rate": 1.3339451148594515e-05, "loss": 0.549, "step": 5356 }, { "epoch": 2.050133945656334, "grad_norm": 0.5477648973464966, "learning_rate": 1.3337114391911113e-05, "loss": 0.6145, "step": 5357 }, { "epoch": 2.050516647531573, "grad_norm": 0.5205659866333008, "learning_rate": 1.3334777430153067e-05, "loss": 0.6254, "step": 5358 }, { "epoch": 2.050899349406812, "grad_norm": 0.5109203457832336, "learning_rate": 1.333244026346399e-05, "loss": 0.6456, "step": 5359 }, { "epoch": 2.051282051282051, "grad_norm": 0.6867050528526306, "learning_rate": 1.3330102891987507e-05, "loss": 0.6188, "step": 5360 }, { "epoch": 2.0516647531572905, "grad_norm": 0.4792887270450592, "learning_rate": 1.3327765315867253e-05, "loss": 0.6575, "step": 5361 }, { "epoch": 2.05204745503253, "grad_norm": 0.586313009262085, "learning_rate": 1.3325427535246877e-05, "loss": 0.6223, "step": 5362 }, { "epoch": 2.0524301569077688, "grad_norm": 0.5433412194252014, "learning_rate": 1.3323089550270053e-05, "loss": 0.6442, "step": 5363 }, { "epoch": 2.052812858783008, "grad_norm": 0.5016582012176514, "learning_rate": 1.3320751361080446e-05, "loss": 0.4899, "step": 5364 }, { "epoch": 2.053195560658247, "grad_norm": 0.529922604560852, "learning_rate": 1.3318412967821743e-05, "loss": 0.6492, "step": 5365 }, { "epoch": 2.0535782625334864, "grad_norm": 0.5594286918640137, "learning_rate": 1.3316074370637653e-05, "loss": 0.671, "step": 5366 }, { "epoch": 2.0539609644087258, "grad_norm": 0.5921157598495483, "learning_rate": 1.3313735569671882e-05, "loss": 0.6704, "step": 5367 }, { "epoch": 2.0543436662839647, "grad_norm": 0.5074508786201477, "learning_rate": 1.3311396565068156e-05, "loss": 0.6015, "step": 5368 }, { "epoch": 2.054726368159204, "grad_norm": 0.47704118490219116, "learning_rate": 1.3309057356970218e-05, "loss": 0.6385, "step": 5369 }, { "epoch": 2.055109070034443, "grad_norm": 0.5095381140708923, "learning_rate": 1.3306717945521814e-05, "loss": 0.6232, "step": 5370 }, { "epoch": 2.0554917719096824, "grad_norm": 0.5084190964698792, "learning_rate": 1.3304378330866711e-05, "loss": 0.5752, "step": 5371 }, { "epoch": 2.0558744737849217, "grad_norm": 0.5551927089691162, "learning_rate": 1.330203851314868e-05, "loss": 0.6395, "step": 5372 }, { "epoch": 2.0562571756601606, "grad_norm": 0.5122650265693665, "learning_rate": 1.3299698492511512e-05, "loss": 0.6004, "step": 5373 }, { "epoch": 2.0566398775354, "grad_norm": 0.5663692951202393, "learning_rate": 1.329735826909901e-05, "loss": 0.6597, "step": 5374 }, { "epoch": 2.057022579410639, "grad_norm": 0.5352078080177307, "learning_rate": 1.3295017843054981e-05, "loss": 0.6616, "step": 5375 }, { "epoch": 2.0574052812858783, "grad_norm": 0.6252171397209167, "learning_rate": 1.3292677214523255e-05, "loss": 0.5829, "step": 5376 }, { "epoch": 2.0577879831611177, "grad_norm": 0.5545435547828674, "learning_rate": 1.3290336383647667e-05, "loss": 0.6496, "step": 5377 }, { "epoch": 2.0581706850363566, "grad_norm": 0.5396726727485657, "learning_rate": 1.3287995350572067e-05, "loss": 0.624, "step": 5378 }, { "epoch": 2.058553386911596, "grad_norm": 0.5275074243545532, "learning_rate": 1.3285654115440323e-05, "loss": 0.5233, "step": 5379 }, { "epoch": 2.058936088786835, "grad_norm": 0.5512111783027649, "learning_rate": 1.3283312678396307e-05, "loss": 0.6852, "step": 5380 }, { "epoch": 2.0593187906620742, "grad_norm": 0.5275155901908875, "learning_rate": 1.3280971039583906e-05, "loss": 0.6012, "step": 5381 }, { "epoch": 2.0597014925373136, "grad_norm": 0.5917609930038452, "learning_rate": 1.3278629199147017e-05, "loss": 0.5665, "step": 5382 }, { "epoch": 2.0600841944125525, "grad_norm": 0.5773249268531799, "learning_rate": 1.3276287157229557e-05, "loss": 0.5575, "step": 5383 }, { "epoch": 2.060466896287792, "grad_norm": 0.5071720480918884, "learning_rate": 1.3273944913975452e-05, "loss": 0.6388, "step": 5384 }, { "epoch": 2.060849598163031, "grad_norm": 0.5534452199935913, "learning_rate": 1.3271602469528635e-05, "loss": 0.6057, "step": 5385 }, { "epoch": 2.06123230003827, "grad_norm": 0.5416595935821533, "learning_rate": 1.3269259824033056e-05, "loss": 0.6333, "step": 5386 }, { "epoch": 2.0616150019135095, "grad_norm": 0.5773433446884155, "learning_rate": 1.326691697763268e-05, "loss": 0.6649, "step": 5387 }, { "epoch": 2.0619977037887485, "grad_norm": 0.6163720488548279, "learning_rate": 1.3264573930471476e-05, "loss": 0.7048, "step": 5388 }, { "epoch": 2.062380405663988, "grad_norm": 0.6140479445457458, "learning_rate": 1.3262230682693438e-05, "loss": 0.6829, "step": 5389 }, { "epoch": 2.0627631075392268, "grad_norm": 0.5722364783287048, "learning_rate": 1.325988723444256e-05, "loss": 0.7146, "step": 5390 }, { "epoch": 2.063145809414466, "grad_norm": 0.6037188768386841, "learning_rate": 1.3257543585862851e-05, "loss": 0.64, "step": 5391 }, { "epoch": 2.0635285112897055, "grad_norm": 0.5985523462295532, "learning_rate": 1.3255199737098339e-05, "loss": 0.686, "step": 5392 }, { "epoch": 2.0639112131649444, "grad_norm": 0.5833336710929871, "learning_rate": 1.3252855688293059e-05, "loss": 0.5433, "step": 5393 }, { "epoch": 2.064293915040184, "grad_norm": 0.6092972755432129, "learning_rate": 1.3250511439591055e-05, "loss": 0.5816, "step": 5394 }, { "epoch": 2.0646766169154227, "grad_norm": 0.613431453704834, "learning_rate": 1.3248166991136392e-05, "loss": 0.8033, "step": 5395 }, { "epoch": 2.065059318790662, "grad_norm": 0.5586572885513306, "learning_rate": 1.3245822343073143e-05, "loss": 0.6697, "step": 5396 }, { "epoch": 2.0654420206659014, "grad_norm": 0.5305017828941345, "learning_rate": 1.3243477495545389e-05, "loss": 0.6568, "step": 5397 }, { "epoch": 2.0658247225411404, "grad_norm": 0.5904590487480164, "learning_rate": 1.3241132448697232e-05, "loss": 0.6371, "step": 5398 }, { "epoch": 2.0662074244163797, "grad_norm": 0.6637874841690063, "learning_rate": 1.3238787202672776e-05, "loss": 0.6135, "step": 5399 }, { "epoch": 2.0665901262916186, "grad_norm": 0.6050931215286255, "learning_rate": 1.3236441757616148e-05, "loss": 0.6259, "step": 5400 }, { "epoch": 2.066972828166858, "grad_norm": 0.502000629901886, "learning_rate": 1.3234096113671476e-05, "loss": 0.5949, "step": 5401 }, { "epoch": 2.0673555300420974, "grad_norm": 0.5524563193321228, "learning_rate": 1.323175027098291e-05, "loss": 0.6143, "step": 5402 }, { "epoch": 2.0677382319173363, "grad_norm": 0.7052518725395203, "learning_rate": 1.3229404229694612e-05, "loss": 0.6291, "step": 5403 }, { "epoch": 2.0681209337925757, "grad_norm": 0.6639379262924194, "learning_rate": 1.3227057989950743e-05, "loss": 0.5257, "step": 5404 }, { "epoch": 2.0685036356678146, "grad_norm": 0.5627332329750061, "learning_rate": 1.3224711551895495e-05, "loss": 0.6185, "step": 5405 }, { "epoch": 2.068886337543054, "grad_norm": 0.5135262608528137, "learning_rate": 1.3222364915673059e-05, "loss": 0.6483, "step": 5406 }, { "epoch": 2.0692690394182933, "grad_norm": 0.5065717101097107, "learning_rate": 1.3220018081427637e-05, "loss": 0.5746, "step": 5407 }, { "epoch": 2.0696517412935322, "grad_norm": 0.6230489015579224, "learning_rate": 1.3217671049303458e-05, "loss": 0.6255, "step": 5408 }, { "epoch": 2.0700344431687716, "grad_norm": 0.616702675819397, "learning_rate": 1.3215323819444747e-05, "loss": 0.5523, "step": 5409 }, { "epoch": 2.0704171450440105, "grad_norm": 0.5945949554443359, "learning_rate": 1.3212976391995752e-05, "loss": 0.6491, "step": 5410 }, { "epoch": 2.07079984691925, "grad_norm": 0.5472855567932129, "learning_rate": 1.3210628767100723e-05, "loss": 0.7529, "step": 5411 }, { "epoch": 2.0711825487944893, "grad_norm": 0.5345296859741211, "learning_rate": 1.320828094490393e-05, "loss": 0.6601, "step": 5412 }, { "epoch": 2.071565250669728, "grad_norm": 0.5627256035804749, "learning_rate": 1.3205932925549658e-05, "loss": 0.5924, "step": 5413 }, { "epoch": 2.0719479525449676, "grad_norm": 0.5664548277854919, "learning_rate": 1.3203584709182194e-05, "loss": 0.6319, "step": 5414 }, { "epoch": 2.0723306544202065, "grad_norm": 0.6164005994796753, "learning_rate": 1.3201236295945841e-05, "loss": 0.6887, "step": 5415 }, { "epoch": 2.072713356295446, "grad_norm": 0.5330138206481934, "learning_rate": 1.319888768598492e-05, "loss": 0.6283, "step": 5416 }, { "epoch": 2.073096058170685, "grad_norm": 0.5973674654960632, "learning_rate": 1.3196538879443752e-05, "loss": 0.6396, "step": 5417 }, { "epoch": 2.073478760045924, "grad_norm": 0.5583202838897705, "learning_rate": 1.3194189876466688e-05, "loss": 0.6251, "step": 5418 }, { "epoch": 2.0738614619211635, "grad_norm": 0.5442088842391968, "learning_rate": 1.3191840677198073e-05, "loss": 0.5925, "step": 5419 }, { "epoch": 2.0742441637964024, "grad_norm": 0.5795313119888306, "learning_rate": 1.3189491281782271e-05, "loss": 0.6944, "step": 5420 }, { "epoch": 2.074626865671642, "grad_norm": 0.5239920020103455, "learning_rate": 1.3187141690363667e-05, "loss": 0.5304, "step": 5421 }, { "epoch": 2.075009567546881, "grad_norm": 0.5943015217781067, "learning_rate": 1.3184791903086638e-05, "loss": 0.6412, "step": 5422 }, { "epoch": 2.07539226942212, "grad_norm": 0.5491247177124023, "learning_rate": 1.3182441920095595e-05, "loss": 0.7499, "step": 5423 }, { "epoch": 2.0757749712973594, "grad_norm": 0.5234754681587219, "learning_rate": 1.3180091741534944e-05, "loss": 0.5916, "step": 5424 }, { "epoch": 2.0761576731725984, "grad_norm": 0.5689679384231567, "learning_rate": 1.3177741367549114e-05, "loss": 0.5604, "step": 5425 }, { "epoch": 2.0765403750478377, "grad_norm": 0.5338311791419983, "learning_rate": 1.3175390798282539e-05, "loss": 0.5851, "step": 5426 }, { "epoch": 2.076923076923077, "grad_norm": 0.5259073972702026, "learning_rate": 1.3173040033879666e-05, "loss": 0.6713, "step": 5427 }, { "epoch": 2.077305778798316, "grad_norm": 0.5693382620811462, "learning_rate": 1.3170689074484961e-05, "loss": 0.6752, "step": 5428 }, { "epoch": 2.0776884806735554, "grad_norm": 0.5708796381950378, "learning_rate": 1.3168337920242898e-05, "loss": 0.6687, "step": 5429 }, { "epoch": 2.0780711825487943, "grad_norm": 0.5290303826332092, "learning_rate": 1.3165986571297953e-05, "loss": 0.6697, "step": 5430 }, { "epoch": 2.0784538844240337, "grad_norm": 0.5524734854698181, "learning_rate": 1.316363502779463e-05, "loss": 0.6765, "step": 5431 }, { "epoch": 2.078836586299273, "grad_norm": 0.5889753699302673, "learning_rate": 1.3161283289877433e-05, "loss": 0.5927, "step": 5432 }, { "epoch": 2.079219288174512, "grad_norm": 0.6364218592643738, "learning_rate": 1.3158931357690887e-05, "loss": 0.7021, "step": 5433 }, { "epoch": 2.0796019900497513, "grad_norm": 0.5543997287750244, "learning_rate": 1.3156579231379523e-05, "loss": 0.6307, "step": 5434 }, { "epoch": 2.0799846919249902, "grad_norm": 0.551109254360199, "learning_rate": 1.3154226911087884e-05, "loss": 0.6011, "step": 5435 }, { "epoch": 2.0803673938002296, "grad_norm": 0.603603720664978, "learning_rate": 1.315187439696053e-05, "loss": 0.5778, "step": 5436 }, { "epoch": 2.080750095675469, "grad_norm": 0.5345560908317566, "learning_rate": 1.3149521689142024e-05, "loss": 0.6334, "step": 5437 }, { "epoch": 2.081132797550708, "grad_norm": 0.588088870048523, "learning_rate": 1.3147168787776949e-05, "loss": 0.6145, "step": 5438 }, { "epoch": 2.0815154994259473, "grad_norm": 0.5211451053619385, "learning_rate": 1.31448156930099e-05, "loss": 0.5927, "step": 5439 }, { "epoch": 2.081898201301186, "grad_norm": 0.5314558148384094, "learning_rate": 1.3142462404985476e-05, "loss": 0.6331, "step": 5440 }, { "epoch": 2.0822809031764256, "grad_norm": 0.5466611981391907, "learning_rate": 1.3140108923848297e-05, "loss": 0.6506, "step": 5441 }, { "epoch": 2.082663605051665, "grad_norm": 0.5356952548027039, "learning_rate": 1.3137755249742986e-05, "loss": 0.7768, "step": 5442 }, { "epoch": 2.083046306926904, "grad_norm": 0.5437801480293274, "learning_rate": 1.3135401382814186e-05, "loss": 0.6341, "step": 5443 }, { "epoch": 2.083429008802143, "grad_norm": 0.5285253524780273, "learning_rate": 1.3133047323206549e-05, "loss": 0.6633, "step": 5444 }, { "epoch": 2.083811710677382, "grad_norm": 0.5700019598007202, "learning_rate": 1.3130693071064738e-05, "loss": 0.7459, "step": 5445 }, { "epoch": 2.0841944125526215, "grad_norm": 0.5105715394020081, "learning_rate": 1.3128338626533424e-05, "loss": 0.6191, "step": 5446 }, { "epoch": 2.084577114427861, "grad_norm": 0.48742255568504333, "learning_rate": 1.3125983989757299e-05, "loss": 0.6232, "step": 5447 }, { "epoch": 2.0849598163031, "grad_norm": 0.5170453190803528, "learning_rate": 1.3123629160881057e-05, "loss": 0.606, "step": 5448 }, { "epoch": 2.085342518178339, "grad_norm": 0.599585771560669, "learning_rate": 1.3121274140049417e-05, "loss": 0.705, "step": 5449 }, { "epoch": 2.085725220053578, "grad_norm": 0.5202299952507019, "learning_rate": 1.3118918927407094e-05, "loss": 0.5634, "step": 5450 }, { "epoch": 2.0861079219288174, "grad_norm": 0.5744889378547668, "learning_rate": 1.3116563523098821e-05, "loss": 0.6274, "step": 5451 }, { "epoch": 2.086490623804057, "grad_norm": 0.5113105773925781, "learning_rate": 1.3114207927269349e-05, "loss": 0.5974, "step": 5452 }, { "epoch": 2.0868733256792957, "grad_norm": 0.518900454044342, "learning_rate": 1.3111852140063435e-05, "loss": 0.6759, "step": 5453 }, { "epoch": 2.087256027554535, "grad_norm": 0.5941471457481384, "learning_rate": 1.3109496161625843e-05, "loss": 0.7017, "step": 5454 }, { "epoch": 2.087638729429774, "grad_norm": 0.5602853298187256, "learning_rate": 1.3107139992101363e-05, "loss": 0.5308, "step": 5455 }, { "epoch": 2.0880214313050134, "grad_norm": 0.4938914477825165, "learning_rate": 1.3104783631634778e-05, "loss": 0.6279, "step": 5456 }, { "epoch": 2.0884041331802528, "grad_norm": 0.505513072013855, "learning_rate": 1.3102427080370901e-05, "loss": 0.5669, "step": 5457 }, { "epoch": 2.0887868350554917, "grad_norm": 0.5302906036376953, "learning_rate": 1.3100070338454547e-05, "loss": 0.7061, "step": 5458 }, { "epoch": 2.089169536930731, "grad_norm": 0.4783971309661865, "learning_rate": 1.309771340603054e-05, "loss": 0.6081, "step": 5459 }, { "epoch": 2.08955223880597, "grad_norm": 0.5132931470870972, "learning_rate": 1.3095356283243723e-05, "loss": 0.6158, "step": 5460 }, { "epoch": 2.0899349406812093, "grad_norm": 0.4968332350254059, "learning_rate": 1.3092998970238946e-05, "loss": 0.6112, "step": 5461 }, { "epoch": 2.0903176425564487, "grad_norm": 0.5266866087913513, "learning_rate": 1.3090641467161071e-05, "loss": 0.5332, "step": 5462 }, { "epoch": 2.0907003444316876, "grad_norm": 0.5334839224815369, "learning_rate": 1.308828377415498e-05, "loss": 0.5945, "step": 5463 }, { "epoch": 2.091083046306927, "grad_norm": 0.5726290941238403, "learning_rate": 1.308592589136555e-05, "loss": 0.6302, "step": 5464 }, { "epoch": 2.091465748182166, "grad_norm": 0.5708039402961731, "learning_rate": 1.3083567818937689e-05, "loss": 0.6527, "step": 5465 }, { "epoch": 2.0918484500574053, "grad_norm": 0.4887515604496002, "learning_rate": 1.3081209557016297e-05, "loss": 0.6472, "step": 5466 }, { "epoch": 2.0922311519326446, "grad_norm": 0.5086894035339355, "learning_rate": 1.3078851105746297e-05, "loss": 0.5721, "step": 5467 }, { "epoch": 2.0926138538078836, "grad_norm": 0.878383457660675, "learning_rate": 1.3076492465272632e-05, "loss": 0.6465, "step": 5468 }, { "epoch": 2.092996555683123, "grad_norm": 0.5388166308403015, "learning_rate": 1.3074133635740236e-05, "loss": 0.6541, "step": 5469 }, { "epoch": 2.093379257558362, "grad_norm": 1.0754696130752563, "learning_rate": 1.3071774617294071e-05, "loss": 0.668, "step": 5470 }, { "epoch": 2.093761959433601, "grad_norm": 0.539804995059967, "learning_rate": 1.3069415410079106e-05, "loss": 0.6134, "step": 5471 }, { "epoch": 2.0941446613088406, "grad_norm": 0.556084156036377, "learning_rate": 1.3067056014240311e-05, "loss": 0.6003, "step": 5472 }, { "epoch": 2.0945273631840795, "grad_norm": 0.5323691368103027, "learning_rate": 1.306469642992269e-05, "loss": 0.6776, "step": 5473 }, { "epoch": 2.094910065059319, "grad_norm": 0.6227754354476929, "learning_rate": 1.3062336657271237e-05, "loss": 0.6759, "step": 5474 }, { "epoch": 2.095292766934558, "grad_norm": 0.5693916082382202, "learning_rate": 1.3059976696430973e-05, "loss": 0.6962, "step": 5475 }, { "epoch": 2.095675468809797, "grad_norm": 0.5365363955497742, "learning_rate": 1.305761654754692e-05, "loss": 0.5649, "step": 5476 }, { "epoch": 2.0960581706850365, "grad_norm": 0.5393907427787781, "learning_rate": 1.3055256210764113e-05, "loss": 0.6454, "step": 5477 }, { "epoch": 2.0964408725602754, "grad_norm": 0.538042426109314, "learning_rate": 1.3052895686227604e-05, "loss": 0.6211, "step": 5478 }, { "epoch": 2.096823574435515, "grad_norm": 0.49808788299560547, "learning_rate": 1.3050534974082458e-05, "loss": 0.581, "step": 5479 }, { "epoch": 2.0972062763107537, "grad_norm": 0.5819712281227112, "learning_rate": 1.304817407447374e-05, "loss": 0.6523, "step": 5480 }, { "epoch": 2.097588978185993, "grad_norm": 0.6416563391685486, "learning_rate": 1.3045812987546536e-05, "loss": 0.6316, "step": 5481 }, { "epoch": 2.0979716800612325, "grad_norm": 0.5522867441177368, "learning_rate": 1.3043451713445938e-05, "loss": 0.6831, "step": 5482 }, { "epoch": 2.0983543819364714, "grad_norm": 0.5328803658485413, "learning_rate": 1.304109025231706e-05, "loss": 0.6117, "step": 5483 }, { "epoch": 2.0987370838117108, "grad_norm": 0.5242828130722046, "learning_rate": 1.3038728604305015e-05, "loss": 0.6543, "step": 5484 }, { "epoch": 2.0991197856869497, "grad_norm": 0.5449391007423401, "learning_rate": 1.3036366769554936e-05, "loss": 0.624, "step": 5485 }, { "epoch": 2.099502487562189, "grad_norm": 0.5985726714134216, "learning_rate": 1.303400474821196e-05, "loss": 0.611, "step": 5486 }, { "epoch": 2.0998851894374284, "grad_norm": 0.6019235849380493, "learning_rate": 1.3031642540421241e-05, "loss": 0.6054, "step": 5487 }, { "epoch": 2.1002678913126673, "grad_norm": 0.5454671382904053, "learning_rate": 1.3029280146327943e-05, "loss": 0.6613, "step": 5488 }, { "epoch": 2.1006505931879067, "grad_norm": 0.5513691306114197, "learning_rate": 1.3026917566077242e-05, "loss": 0.7196, "step": 5489 }, { "epoch": 2.1010332950631456, "grad_norm": 0.49010348320007324, "learning_rate": 1.3024554799814326e-05, "loss": 0.6782, "step": 5490 }, { "epoch": 2.101415996938385, "grad_norm": 0.5203431248664856, "learning_rate": 1.3022191847684391e-05, "loss": 0.5922, "step": 5491 }, { "epoch": 2.1017986988136244, "grad_norm": 0.5006888508796692, "learning_rate": 1.301982870983265e-05, "loss": 0.5643, "step": 5492 }, { "epoch": 2.1021814006888633, "grad_norm": 0.5648853778839111, "learning_rate": 1.3017465386404318e-05, "loss": 0.6738, "step": 5493 }, { "epoch": 2.1025641025641026, "grad_norm": 0.6114888787269592, "learning_rate": 1.3015101877544635e-05, "loss": 0.5716, "step": 5494 }, { "epoch": 2.1029468044393416, "grad_norm": 0.5456013679504395, "learning_rate": 1.301273818339884e-05, "loss": 0.6333, "step": 5495 }, { "epoch": 2.103329506314581, "grad_norm": 0.5180480480194092, "learning_rate": 1.3010374304112193e-05, "loss": 0.6452, "step": 5496 }, { "epoch": 2.1037122081898203, "grad_norm": 0.5503971576690674, "learning_rate": 1.3008010239829951e-05, "loss": 0.5837, "step": 5497 }, { "epoch": 2.104094910065059, "grad_norm": 0.547754168510437, "learning_rate": 1.3005645990697404e-05, "loss": 0.6191, "step": 5498 }, { "epoch": 2.1044776119402986, "grad_norm": 0.562238335609436, "learning_rate": 1.3003281556859837e-05, "loss": 0.5768, "step": 5499 }, { "epoch": 2.1048603138155375, "grad_norm": 0.5098243355751038, "learning_rate": 1.3000916938462547e-05, "loss": 0.7232, "step": 5500 }, { "epoch": 2.105243015690777, "grad_norm": 0.6394631862640381, "learning_rate": 1.2998552135650851e-05, "loss": 0.6551, "step": 5501 }, { "epoch": 2.1056257175660162, "grad_norm": 0.4982565939426422, "learning_rate": 1.2996187148570069e-05, "loss": 0.6401, "step": 5502 }, { "epoch": 2.106008419441255, "grad_norm": 0.5456660389900208, "learning_rate": 1.299382197736554e-05, "loss": 0.7007, "step": 5503 }, { "epoch": 2.1063911213164945, "grad_norm": 0.5306207537651062, "learning_rate": 1.2991456622182606e-05, "loss": 0.6816, "step": 5504 }, { "epoch": 2.1067738231917335, "grad_norm": 0.5330739617347717, "learning_rate": 1.2989091083166627e-05, "loss": 0.6389, "step": 5505 }, { "epoch": 2.107156525066973, "grad_norm": 0.5578978061676025, "learning_rate": 1.298672536046297e-05, "loss": 0.6208, "step": 5506 }, { "epoch": 2.107539226942212, "grad_norm": 0.5582525730133057, "learning_rate": 1.2984359454217015e-05, "loss": 0.644, "step": 5507 }, { "epoch": 2.107921928817451, "grad_norm": 0.5030088424682617, "learning_rate": 1.2981993364574152e-05, "loss": 0.6258, "step": 5508 }, { "epoch": 2.1083046306926905, "grad_norm": 0.5991057753562927, "learning_rate": 1.2979627091679793e-05, "loss": 0.6975, "step": 5509 }, { "epoch": 2.1086873325679294, "grad_norm": 0.6223148107528687, "learning_rate": 1.297726063567934e-05, "loss": 0.6336, "step": 5510 }, { "epoch": 2.1090700344431688, "grad_norm": 0.5775443315505981, "learning_rate": 1.297489399671822e-05, "loss": 0.6549, "step": 5511 }, { "epoch": 2.109452736318408, "grad_norm": 0.6155873537063599, "learning_rate": 1.2972527174941876e-05, "loss": 0.5755, "step": 5512 }, { "epoch": 2.109835438193647, "grad_norm": 0.6312095522880554, "learning_rate": 1.2970160170495749e-05, "loss": 0.6255, "step": 5513 }, { "epoch": 2.1102181400688864, "grad_norm": 0.6304394602775574, "learning_rate": 1.2967792983525303e-05, "loss": 0.7227, "step": 5514 }, { "epoch": 2.1106008419441253, "grad_norm": 0.6175702810287476, "learning_rate": 1.2965425614176004e-05, "loss": 0.6626, "step": 5515 }, { "epoch": 2.1109835438193647, "grad_norm": 0.5619997382164001, "learning_rate": 1.2963058062593332e-05, "loss": 0.5859, "step": 5516 }, { "epoch": 2.111366245694604, "grad_norm": 0.5073375105857849, "learning_rate": 1.2960690328922782e-05, "loss": 0.5986, "step": 5517 }, { "epoch": 2.111748947569843, "grad_norm": 0.549524188041687, "learning_rate": 1.295832241330986e-05, "loss": 0.6747, "step": 5518 }, { "epoch": 2.1121316494450824, "grad_norm": 0.5366579294204712, "learning_rate": 1.2955954315900077e-05, "loss": 0.6588, "step": 5519 }, { "epoch": 2.1125143513203213, "grad_norm": 0.5571321249008179, "learning_rate": 1.2953586036838961e-05, "loss": 0.6228, "step": 5520 }, { "epoch": 2.1128970531955606, "grad_norm": 0.7527409195899963, "learning_rate": 1.2951217576272047e-05, "loss": 0.5617, "step": 5521 }, { "epoch": 2.1132797550708, "grad_norm": 0.5658262968063354, "learning_rate": 1.2948848934344885e-05, "loss": 0.5922, "step": 5522 }, { "epoch": 2.113662456946039, "grad_norm": 0.5493008494377136, "learning_rate": 1.2946480111203035e-05, "loss": 0.5676, "step": 5523 }, { "epoch": 2.1140451588212783, "grad_norm": 0.5098698139190674, "learning_rate": 1.2944111106992066e-05, "loss": 0.593, "step": 5524 }, { "epoch": 2.1144278606965172, "grad_norm": 0.6356461644172668, "learning_rate": 1.294174192185756e-05, "loss": 0.5855, "step": 5525 }, { "epoch": 2.1148105625717566, "grad_norm": 0.5829760432243347, "learning_rate": 1.2939372555945112e-05, "loss": 0.6608, "step": 5526 }, { "epoch": 2.115193264446996, "grad_norm": 0.540425717830658, "learning_rate": 1.2937003009400319e-05, "loss": 0.699, "step": 5527 }, { "epoch": 2.115575966322235, "grad_norm": 0.49490854144096375, "learning_rate": 1.2934633282368802e-05, "loss": 0.6226, "step": 5528 }, { "epoch": 2.1159586681974742, "grad_norm": 0.6085807085037231, "learning_rate": 1.2932263374996187e-05, "loss": 0.6359, "step": 5529 }, { "epoch": 2.116341370072713, "grad_norm": 0.5152081251144409, "learning_rate": 1.2929893287428112e-05, "loss": 0.6046, "step": 5530 }, { "epoch": 2.1167240719479525, "grad_norm": 0.5518518090248108, "learning_rate": 1.2927523019810224e-05, "loss": 0.7011, "step": 5531 }, { "epoch": 2.117106773823192, "grad_norm": 0.5158029198646545, "learning_rate": 1.2925152572288178e-05, "loss": 0.7084, "step": 5532 }, { "epoch": 2.117489475698431, "grad_norm": 0.5137454271316528, "learning_rate": 1.292278194500765e-05, "loss": 0.716, "step": 5533 }, { "epoch": 2.11787217757367, "grad_norm": 0.5436176657676697, "learning_rate": 1.2920411138114317e-05, "loss": 0.6862, "step": 5534 }, { "epoch": 2.118254879448909, "grad_norm": 0.5407941341400146, "learning_rate": 1.2918040151753877e-05, "loss": 0.6621, "step": 5535 }, { "epoch": 2.1186375813241485, "grad_norm": 0.5173583626747131, "learning_rate": 1.291566898607203e-05, "loss": 0.5842, "step": 5536 }, { "epoch": 2.119020283199388, "grad_norm": 0.5223385095596313, "learning_rate": 1.291329764121449e-05, "loss": 0.7557, "step": 5537 }, { "epoch": 2.1194029850746268, "grad_norm": 0.5448794960975647, "learning_rate": 1.2910926117326984e-05, "loss": 0.6112, "step": 5538 }, { "epoch": 2.119785686949866, "grad_norm": 0.5665983557701111, "learning_rate": 1.2908554414555247e-05, "loss": 0.5732, "step": 5539 }, { "epoch": 2.120168388825105, "grad_norm": 0.5000305771827698, "learning_rate": 1.2906182533045026e-05, "loss": 0.5777, "step": 5540 }, { "epoch": 2.1205510907003444, "grad_norm": 0.5105348229408264, "learning_rate": 1.2903810472942083e-05, "loss": 0.6199, "step": 5541 }, { "epoch": 2.120933792575584, "grad_norm": 0.6220698952674866, "learning_rate": 1.2901438234392188e-05, "loss": 0.6427, "step": 5542 }, { "epoch": 2.1213164944508227, "grad_norm": 0.508845865726471, "learning_rate": 1.2899065817541114e-05, "loss": 0.6082, "step": 5543 }, { "epoch": 2.121699196326062, "grad_norm": 0.5197914242744446, "learning_rate": 1.289669322253466e-05, "loss": 0.6576, "step": 5544 }, { "epoch": 2.122081898201301, "grad_norm": 0.5172052979469299, "learning_rate": 1.2894320449518624e-05, "loss": 0.6319, "step": 5545 }, { "epoch": 2.1224646000765404, "grad_norm": 0.5469117760658264, "learning_rate": 1.2891947498638824e-05, "loss": 0.6951, "step": 5546 }, { "epoch": 2.1228473019517797, "grad_norm": 0.5082879662513733, "learning_rate": 1.2889574370041076e-05, "loss": 0.7026, "step": 5547 }, { "epoch": 2.1232300038270187, "grad_norm": 0.5438482165336609, "learning_rate": 1.2887201063871223e-05, "loss": 0.6221, "step": 5548 }, { "epoch": 2.123612705702258, "grad_norm": 0.5164670944213867, "learning_rate": 1.2884827580275107e-05, "loss": 0.6316, "step": 5549 }, { "epoch": 2.123995407577497, "grad_norm": 0.535712718963623, "learning_rate": 1.2882453919398588e-05, "loss": 0.6058, "step": 5550 }, { "epoch": 2.1243781094527363, "grad_norm": 0.574087917804718, "learning_rate": 1.2880080081387531e-05, "loss": 0.597, "step": 5551 }, { "epoch": 2.1247608113279757, "grad_norm": 0.5166386365890503, "learning_rate": 1.2877706066387817e-05, "loss": 0.6338, "step": 5552 }, { "epoch": 2.1251435132032146, "grad_norm": 0.5179682374000549, "learning_rate": 1.2875331874545331e-05, "loss": 0.6396, "step": 5553 }, { "epoch": 2.125526215078454, "grad_norm": 0.48640909790992737, "learning_rate": 1.2872957506005981e-05, "loss": 0.631, "step": 5554 }, { "epoch": 2.125908916953693, "grad_norm": 0.5101359486579895, "learning_rate": 1.2870582960915669e-05, "loss": 0.6159, "step": 5555 }, { "epoch": 2.1262916188289323, "grad_norm": 0.5773372650146484, "learning_rate": 1.2868208239420326e-05, "loss": 0.6115, "step": 5556 }, { "epoch": 2.1266743207041716, "grad_norm": 0.5712890625, "learning_rate": 1.286583334166588e-05, "loss": 0.6639, "step": 5557 }, { "epoch": 2.1270570225794105, "grad_norm": 0.5110282897949219, "learning_rate": 1.2863458267798277e-05, "loss": 0.675, "step": 5558 }, { "epoch": 2.12743972445465, "grad_norm": 0.5798378586769104, "learning_rate": 1.2861083017963471e-05, "loss": 0.6228, "step": 5559 }, { "epoch": 2.127822426329889, "grad_norm": 0.5038809776306152, "learning_rate": 1.2858707592307424e-05, "loss": 0.5645, "step": 5560 }, { "epoch": 2.128205128205128, "grad_norm": 0.526238739490509, "learning_rate": 1.2856331990976121e-05, "loss": 0.6483, "step": 5561 }, { "epoch": 2.1285878300803676, "grad_norm": 0.5411843061447144, "learning_rate": 1.285395621411554e-05, "loss": 0.6979, "step": 5562 }, { "epoch": 2.1289705319556065, "grad_norm": 0.5474837422370911, "learning_rate": 1.2851580261871682e-05, "loss": 0.6351, "step": 5563 }, { "epoch": 2.129353233830846, "grad_norm": 0.5335898995399475, "learning_rate": 1.2849204134390559e-05, "loss": 0.6532, "step": 5564 }, { "epoch": 2.1297359357060848, "grad_norm": 0.6079065799713135, "learning_rate": 1.2846827831818187e-05, "loss": 0.6208, "step": 5565 }, { "epoch": 2.130118637581324, "grad_norm": 0.5380761623382568, "learning_rate": 1.2844451354300595e-05, "loss": 0.5508, "step": 5566 }, { "epoch": 2.1305013394565635, "grad_norm": 0.5041224956512451, "learning_rate": 1.2842074701983824e-05, "loss": 0.6093, "step": 5567 }, { "epoch": 2.1308840413318024, "grad_norm": 0.6021047234535217, "learning_rate": 1.283969787501393e-05, "loss": 0.6336, "step": 5568 }, { "epoch": 2.131266743207042, "grad_norm": 0.5399214029312134, "learning_rate": 1.2837320873536974e-05, "loss": 0.6355, "step": 5569 }, { "epoch": 2.1316494450822807, "grad_norm": 0.520233690738678, "learning_rate": 1.283494369769903e-05, "loss": 0.7033, "step": 5570 }, { "epoch": 2.13203214695752, "grad_norm": 0.546693742275238, "learning_rate": 1.2832566347646174e-05, "loss": 0.6375, "step": 5571 }, { "epoch": 2.1324148488327594, "grad_norm": 0.5536788702011108, "learning_rate": 1.2830188823524507e-05, "loss": 0.6083, "step": 5572 }, { "epoch": 2.1327975507079984, "grad_norm": 0.5273149609565735, "learning_rate": 1.2827811125480135e-05, "loss": 0.5807, "step": 5573 }, { "epoch": 2.1331802525832377, "grad_norm": 0.5608332753181458, "learning_rate": 1.2825433253659174e-05, "loss": 0.622, "step": 5574 }, { "epoch": 2.1335629544584767, "grad_norm": 0.5342458486557007, "learning_rate": 1.2823055208207747e-05, "loss": 0.6728, "step": 5575 }, { "epoch": 2.133945656333716, "grad_norm": 0.5409425497055054, "learning_rate": 1.2820676989271993e-05, "loss": 0.6715, "step": 5576 }, { "epoch": 2.1343283582089554, "grad_norm": 0.6039491295814514, "learning_rate": 1.2818298596998062e-05, "loss": 0.7118, "step": 5577 }, { "epoch": 2.1347110600841943, "grad_norm": 0.5533198714256287, "learning_rate": 1.281592003153211e-05, "loss": 0.5894, "step": 5578 }, { "epoch": 2.1350937619594337, "grad_norm": 0.527251660823822, "learning_rate": 1.2813541293020307e-05, "loss": 0.5899, "step": 5579 }, { "epoch": 2.1354764638346726, "grad_norm": 0.5126714110374451, "learning_rate": 1.2811162381608835e-05, "loss": 0.604, "step": 5580 }, { "epoch": 2.135859165709912, "grad_norm": 0.6196866631507874, "learning_rate": 1.2808783297443878e-05, "loss": 0.6063, "step": 5581 }, { "epoch": 2.1362418675851513, "grad_norm": 0.5234975814819336, "learning_rate": 1.2806404040671646e-05, "loss": 0.6961, "step": 5582 }, { "epoch": 2.1366245694603903, "grad_norm": 0.540469229221344, "learning_rate": 1.2804024611438343e-05, "loss": 0.6672, "step": 5583 }, { "epoch": 2.1370072713356296, "grad_norm": 0.5112031102180481, "learning_rate": 1.2801645009890195e-05, "loss": 0.6009, "step": 5584 }, { "epoch": 2.1373899732108685, "grad_norm": 0.5059005618095398, "learning_rate": 1.2799265236173439e-05, "loss": 0.593, "step": 5585 }, { "epoch": 2.137772675086108, "grad_norm": 0.6063156127929688, "learning_rate": 1.279688529043431e-05, "loss": 0.6186, "step": 5586 }, { "epoch": 2.1381553769613473, "grad_norm": 0.5297521948814392, "learning_rate": 1.2794505172819068e-05, "loss": 0.6226, "step": 5587 }, { "epoch": 2.138538078836586, "grad_norm": 0.5138850212097168, "learning_rate": 1.2792124883473973e-05, "loss": 0.6364, "step": 5588 }, { "epoch": 2.1389207807118256, "grad_norm": 0.5854220390319824, "learning_rate": 1.2789744422545306e-05, "loss": 0.6478, "step": 5589 }, { "epoch": 2.1393034825870645, "grad_norm": 0.5290401577949524, "learning_rate": 1.2787363790179348e-05, "loss": 0.6557, "step": 5590 }, { "epoch": 2.139686184462304, "grad_norm": 0.5537073016166687, "learning_rate": 1.2784982986522399e-05, "loss": 0.7022, "step": 5591 }, { "epoch": 2.140068886337543, "grad_norm": 0.5752295255661011, "learning_rate": 1.278260201172076e-05, "loss": 0.6216, "step": 5592 }, { "epoch": 2.140451588212782, "grad_norm": 0.5188144445419312, "learning_rate": 1.2780220865920752e-05, "loss": 0.6824, "step": 5593 }, { "epoch": 2.1408342900880215, "grad_norm": 0.5615350604057312, "learning_rate": 1.2777839549268704e-05, "loss": 0.6045, "step": 5594 }, { "epoch": 2.1412169919632604, "grad_norm": 0.5408236384391785, "learning_rate": 1.2775458061910954e-05, "loss": 0.6692, "step": 5595 }, { "epoch": 2.1415996938385, "grad_norm": 0.5252184271812439, "learning_rate": 1.2773076403993852e-05, "loss": 0.6401, "step": 5596 }, { "epoch": 2.141982395713739, "grad_norm": 0.5023372769355774, "learning_rate": 1.2770694575663752e-05, "loss": 0.6508, "step": 5597 }, { "epoch": 2.142365097588978, "grad_norm": 0.5295570492744446, "learning_rate": 1.2768312577067025e-05, "loss": 0.717, "step": 5598 }, { "epoch": 2.1427477994642175, "grad_norm": 0.5496678948402405, "learning_rate": 1.2765930408350054e-05, "loss": 0.6559, "step": 5599 }, { "epoch": 2.1431305013394564, "grad_norm": 0.5601440072059631, "learning_rate": 1.2763548069659233e-05, "loss": 0.5617, "step": 5600 }, { "epoch": 2.1435132032146957, "grad_norm": 0.49304330348968506, "learning_rate": 1.2761165561140955e-05, "loss": 0.5846, "step": 5601 }, { "epoch": 2.143895905089935, "grad_norm": 0.5078324675559998, "learning_rate": 1.2758782882941638e-05, "loss": 0.6159, "step": 5602 }, { "epoch": 2.144278606965174, "grad_norm": 0.6157850027084351, "learning_rate": 1.2756400035207699e-05, "loss": 0.6759, "step": 5603 }, { "epoch": 2.1446613088404134, "grad_norm": 0.5523136854171753, "learning_rate": 1.2754017018085573e-05, "loss": 0.5871, "step": 5604 }, { "epoch": 2.1450440107156523, "grad_norm": 0.5273779034614563, "learning_rate": 1.2751633831721704e-05, "loss": 0.6026, "step": 5605 }, { "epoch": 2.1454267125908917, "grad_norm": 0.5344384908676147, "learning_rate": 1.2749250476262542e-05, "loss": 0.7248, "step": 5606 }, { "epoch": 2.145809414466131, "grad_norm": 0.5314520001411438, "learning_rate": 1.2746866951854555e-05, "loss": 0.7242, "step": 5607 }, { "epoch": 2.14619211634137, "grad_norm": 0.5205344557762146, "learning_rate": 1.274448325864421e-05, "loss": 0.64, "step": 5608 }, { "epoch": 2.1465748182166093, "grad_norm": 0.5440836548805237, "learning_rate": 1.2742099396778e-05, "loss": 0.6712, "step": 5609 }, { "epoch": 2.1469575200918483, "grad_norm": 0.583565890789032, "learning_rate": 1.2739715366402415e-05, "loss": 0.6679, "step": 5610 }, { "epoch": 2.1473402219670876, "grad_norm": 0.5383120179176331, "learning_rate": 1.273733116766396e-05, "loss": 0.6334, "step": 5611 }, { "epoch": 2.147722923842327, "grad_norm": 0.5272748470306396, "learning_rate": 1.2734946800709149e-05, "loss": 0.6318, "step": 5612 }, { "epoch": 2.148105625717566, "grad_norm": 0.5365722179412842, "learning_rate": 1.2732562265684512e-05, "loss": 0.585, "step": 5613 }, { "epoch": 2.1484883275928053, "grad_norm": 0.5357686877250671, "learning_rate": 1.2730177562736583e-05, "loss": 0.6104, "step": 5614 }, { "epoch": 2.148871029468044, "grad_norm": 0.47784844040870667, "learning_rate": 1.2727792692011906e-05, "loss": 0.6226, "step": 5615 }, { "epoch": 2.1492537313432836, "grad_norm": 0.5445475578308105, "learning_rate": 1.2725407653657042e-05, "loss": 0.6534, "step": 5616 }, { "epoch": 2.149636433218523, "grad_norm": 0.5038131475448608, "learning_rate": 1.2723022447818554e-05, "loss": 0.5466, "step": 5617 }, { "epoch": 2.150019135093762, "grad_norm": 0.5180799961090088, "learning_rate": 1.2720637074643021e-05, "loss": 0.6501, "step": 5618 }, { "epoch": 2.1504018369690012, "grad_norm": 0.517196774482727, "learning_rate": 1.2718251534277034e-05, "loss": 0.6067, "step": 5619 }, { "epoch": 2.15078453884424, "grad_norm": 0.5123803615570068, "learning_rate": 1.2715865826867184e-05, "loss": 0.5521, "step": 5620 }, { "epoch": 2.1511672407194795, "grad_norm": 0.5342884063720703, "learning_rate": 1.2713479952560087e-05, "loss": 0.6938, "step": 5621 }, { "epoch": 2.151549942594719, "grad_norm": 0.5070653557777405, "learning_rate": 1.2711093911502353e-05, "loss": 0.77, "step": 5622 }, { "epoch": 2.151932644469958, "grad_norm": 0.556168794631958, "learning_rate": 1.2708707703840615e-05, "loss": 0.728, "step": 5623 }, { "epoch": 2.152315346345197, "grad_norm": 0.5896899700164795, "learning_rate": 1.2706321329721513e-05, "loss": 0.666, "step": 5624 }, { "epoch": 2.152698048220436, "grad_norm": 0.5803532600402832, "learning_rate": 1.2703934789291696e-05, "loss": 0.7247, "step": 5625 }, { "epoch": 2.1530807500956755, "grad_norm": 0.5455873012542725, "learning_rate": 1.270154808269782e-05, "loss": 0.5152, "step": 5626 }, { "epoch": 2.153463451970915, "grad_norm": 0.5327936410903931, "learning_rate": 1.2699161210086558e-05, "loss": 0.6031, "step": 5627 }, { "epoch": 2.1538461538461537, "grad_norm": 0.5327444076538086, "learning_rate": 1.2696774171604586e-05, "loss": 0.6584, "step": 5628 }, { "epoch": 2.154228855721393, "grad_norm": 0.554894745349884, "learning_rate": 1.2694386967398602e-05, "loss": 0.6549, "step": 5629 }, { "epoch": 2.154611557596632, "grad_norm": 0.5391136407852173, "learning_rate": 1.2691999597615297e-05, "loss": 0.6268, "step": 5630 }, { "epoch": 2.1549942594718714, "grad_norm": 0.5434026718139648, "learning_rate": 1.2689612062401384e-05, "loss": 0.6907, "step": 5631 }, { "epoch": 2.1553769613471108, "grad_norm": 0.5356073379516602, "learning_rate": 1.2687224361903585e-05, "loss": 0.6104, "step": 5632 }, { "epoch": 2.1557596632223497, "grad_norm": 0.5597086548805237, "learning_rate": 1.2684836496268626e-05, "loss": 0.5968, "step": 5633 }, { "epoch": 2.156142365097589, "grad_norm": 0.574825644493103, "learning_rate": 1.2682448465643256e-05, "loss": 0.6596, "step": 5634 }, { "epoch": 2.156525066972828, "grad_norm": 0.5339955687522888, "learning_rate": 1.2680060270174221e-05, "loss": 0.4918, "step": 5635 }, { "epoch": 2.1569077688480673, "grad_norm": 0.5229926705360413, "learning_rate": 1.2677671910008283e-05, "loss": 0.6482, "step": 5636 }, { "epoch": 2.1572904707233067, "grad_norm": 0.5805646181106567, "learning_rate": 1.2675283385292212e-05, "loss": 0.6594, "step": 5637 }, { "epoch": 2.1576731725985456, "grad_norm": 0.49807029962539673, "learning_rate": 1.2672894696172788e-05, "loss": 0.6364, "step": 5638 }, { "epoch": 2.158055874473785, "grad_norm": 0.5590648055076599, "learning_rate": 1.2670505842796804e-05, "loss": 0.6317, "step": 5639 }, { "epoch": 2.158438576349024, "grad_norm": 0.5012333393096924, "learning_rate": 1.2668116825311064e-05, "loss": 0.6225, "step": 5640 }, { "epoch": 2.1588212782242633, "grad_norm": 0.5830447673797607, "learning_rate": 1.2665727643862376e-05, "loss": 0.7141, "step": 5641 }, { "epoch": 2.1592039800995027, "grad_norm": 0.49919265508651733, "learning_rate": 1.2663338298597562e-05, "loss": 0.6145, "step": 5642 }, { "epoch": 2.1595866819747416, "grad_norm": 0.5407682061195374, "learning_rate": 1.2660948789663457e-05, "loss": 0.677, "step": 5643 }, { "epoch": 2.159969383849981, "grad_norm": 0.5187921524047852, "learning_rate": 1.2658559117206897e-05, "loss": 0.756, "step": 5644 }, { "epoch": 2.16035208572522, "grad_norm": 0.5733587145805359, "learning_rate": 1.2656169281374739e-05, "loss": 0.6131, "step": 5645 }, { "epoch": 2.1607347876004592, "grad_norm": 0.5552048087120056, "learning_rate": 1.2653779282313843e-05, "loss": 0.6633, "step": 5646 }, { "epoch": 2.1611174894756986, "grad_norm": 0.5446456670761108, "learning_rate": 1.2651389120171082e-05, "loss": 0.6502, "step": 5647 }, { "epoch": 2.1615001913509375, "grad_norm": 0.5427764654159546, "learning_rate": 1.2648998795093334e-05, "loss": 0.6456, "step": 5648 }, { "epoch": 2.161882893226177, "grad_norm": 0.528989851474762, "learning_rate": 1.2646608307227495e-05, "loss": 0.6106, "step": 5649 }, { "epoch": 2.162265595101416, "grad_norm": 0.514164924621582, "learning_rate": 1.2644217656720465e-05, "loss": 0.6539, "step": 5650 }, { "epoch": 2.162648296976655, "grad_norm": 0.5156978368759155, "learning_rate": 1.2641826843719159e-05, "loss": 0.5921, "step": 5651 }, { "epoch": 2.1630309988518945, "grad_norm": 0.5527496933937073, "learning_rate": 1.263943586837049e-05, "loss": 0.6613, "step": 5652 }, { "epoch": 2.1634137007271335, "grad_norm": 0.5151551365852356, "learning_rate": 1.2637044730821399e-05, "loss": 0.6479, "step": 5653 }, { "epoch": 2.163796402602373, "grad_norm": 0.5245763063430786, "learning_rate": 1.2634653431218826e-05, "loss": 0.6348, "step": 5654 }, { "epoch": 2.1641791044776117, "grad_norm": 0.5306447148323059, "learning_rate": 1.2632261969709723e-05, "loss": 0.5464, "step": 5655 }, { "epoch": 2.164561806352851, "grad_norm": 0.5313665866851807, "learning_rate": 1.2629870346441048e-05, "loss": 0.6159, "step": 5656 }, { "epoch": 2.1649445082280905, "grad_norm": 0.5015019178390503, "learning_rate": 1.2627478561559776e-05, "loss": 0.6034, "step": 5657 }, { "epoch": 2.1653272101033294, "grad_norm": 0.5469040870666504, "learning_rate": 1.2625086615212886e-05, "loss": 0.6391, "step": 5658 }, { "epoch": 2.1657099119785688, "grad_norm": 0.5856290459632874, "learning_rate": 1.2622694507547372e-05, "loss": 0.6204, "step": 5659 }, { "epoch": 2.1660926138538077, "grad_norm": 0.5451115369796753, "learning_rate": 1.2620302238710238e-05, "loss": 0.6971, "step": 5660 }, { "epoch": 2.166475315729047, "grad_norm": 0.5643056631088257, "learning_rate": 1.261790980884849e-05, "loss": 0.6773, "step": 5661 }, { "epoch": 2.1668580176042864, "grad_norm": 0.5507262945175171, "learning_rate": 1.261551721810915e-05, "loss": 0.645, "step": 5662 }, { "epoch": 2.1672407194795253, "grad_norm": 0.532035231590271, "learning_rate": 1.261312446663925e-05, "loss": 0.6525, "step": 5663 }, { "epoch": 2.1676234213547647, "grad_norm": 0.5181592702865601, "learning_rate": 1.2610731554585836e-05, "loss": 0.6154, "step": 5664 }, { "epoch": 2.1680061232300036, "grad_norm": 0.5515012741088867, "learning_rate": 1.2608338482095947e-05, "loss": 0.7099, "step": 5665 }, { "epoch": 2.168388825105243, "grad_norm": 0.5469069480895996, "learning_rate": 1.2605945249316657e-05, "loss": 0.6338, "step": 5666 }, { "epoch": 2.1687715269804824, "grad_norm": 0.5747780203819275, "learning_rate": 1.2603551856395026e-05, "loss": 0.6558, "step": 5667 }, { "epoch": 2.1691542288557213, "grad_norm": 0.4748660922050476, "learning_rate": 1.2601158303478143e-05, "loss": 0.6911, "step": 5668 }, { "epoch": 2.1695369307309607, "grad_norm": 0.5132082104682922, "learning_rate": 1.2598764590713093e-05, "loss": 0.6166, "step": 5669 }, { "epoch": 2.1699196326061996, "grad_norm": 0.4955454468727112, "learning_rate": 1.2596370718246975e-05, "loss": 0.618, "step": 5670 }, { "epoch": 2.170302334481439, "grad_norm": 0.5679891705513, "learning_rate": 1.2593976686226906e-05, "loss": 0.6282, "step": 5671 }, { "epoch": 2.1706850363566783, "grad_norm": 0.5126736164093018, "learning_rate": 1.2591582494799997e-05, "loss": 0.6783, "step": 5672 }, { "epoch": 2.1710677382319172, "grad_norm": 0.4962429404258728, "learning_rate": 1.258918814411338e-05, "loss": 0.5903, "step": 5673 }, { "epoch": 2.1714504401071566, "grad_norm": 0.5186276435852051, "learning_rate": 1.2586793634314201e-05, "loss": 0.5951, "step": 5674 }, { "epoch": 2.1718331419823955, "grad_norm": 0.5478777885437012, "learning_rate": 1.25843989655496e-05, "loss": 0.7067, "step": 5675 }, { "epoch": 2.172215843857635, "grad_norm": 0.5228421688079834, "learning_rate": 1.2582004137966744e-05, "loss": 0.6836, "step": 5676 }, { "epoch": 2.1725985457328743, "grad_norm": 0.5571334362030029, "learning_rate": 1.2579609151712793e-05, "loss": 0.6586, "step": 5677 }, { "epoch": 2.172981247608113, "grad_norm": 0.5458739399909973, "learning_rate": 1.2577214006934929e-05, "loss": 0.6227, "step": 5678 }, { "epoch": 2.1733639494833525, "grad_norm": 0.5193731784820557, "learning_rate": 1.2574818703780343e-05, "loss": 0.6231, "step": 5679 }, { "epoch": 2.1737466513585915, "grad_norm": 0.5748461484909058, "learning_rate": 1.2572423242396228e-05, "loss": 0.5992, "step": 5680 }, { "epoch": 2.174129353233831, "grad_norm": 0.5042511224746704, "learning_rate": 1.2570027622929797e-05, "loss": 0.7179, "step": 5681 }, { "epoch": 2.17451205510907, "grad_norm": 0.589535653591156, "learning_rate": 1.2567631845528264e-05, "loss": 0.6728, "step": 5682 }, { "epoch": 2.174894756984309, "grad_norm": 0.5309666991233826, "learning_rate": 1.2565235910338852e-05, "loss": 0.5831, "step": 5683 }, { "epoch": 2.1752774588595485, "grad_norm": 0.5300456285476685, "learning_rate": 1.2562839817508805e-05, "loss": 0.5923, "step": 5684 }, { "epoch": 2.1756601607347874, "grad_norm": 0.8687590956687927, "learning_rate": 1.2560443567185366e-05, "loss": 0.6302, "step": 5685 }, { "epoch": 2.1760428626100268, "grad_norm": 0.5367029309272766, "learning_rate": 1.255804715951579e-05, "loss": 0.6405, "step": 5686 }, { "epoch": 2.176425564485266, "grad_norm": 0.5229449272155762, "learning_rate": 1.2555650594647347e-05, "loss": 0.6384, "step": 5687 }, { "epoch": 2.176808266360505, "grad_norm": 0.5515431761741638, "learning_rate": 1.2553253872727304e-05, "loss": 0.6098, "step": 5688 }, { "epoch": 2.1771909682357444, "grad_norm": 0.5180490612983704, "learning_rate": 1.2550856993902956e-05, "loss": 0.5564, "step": 5689 }, { "epoch": 2.1775736701109834, "grad_norm": 0.5517380833625793, "learning_rate": 1.2548459958321592e-05, "loss": 0.5923, "step": 5690 }, { "epoch": 2.1779563719862227, "grad_norm": 0.5275505185127258, "learning_rate": 1.2546062766130516e-05, "loss": 0.613, "step": 5691 }, { "epoch": 2.178339073861462, "grad_norm": 0.5556995868682861, "learning_rate": 1.2543665417477045e-05, "loss": 0.6714, "step": 5692 }, { "epoch": 2.178721775736701, "grad_norm": 0.5598067045211792, "learning_rate": 1.2541267912508497e-05, "loss": 0.5976, "step": 5693 }, { "epoch": 2.1791044776119404, "grad_norm": 0.5231569409370422, "learning_rate": 1.253887025137221e-05, "loss": 0.6509, "step": 5694 }, { "epoch": 2.1794871794871793, "grad_norm": 0.6147084832191467, "learning_rate": 1.2536472434215527e-05, "loss": 0.6413, "step": 5695 }, { "epoch": 2.1798698813624187, "grad_norm": 0.5205718278884888, "learning_rate": 1.2534074461185797e-05, "loss": 0.7151, "step": 5696 }, { "epoch": 2.180252583237658, "grad_norm": 0.497534841299057, "learning_rate": 1.2531676332430384e-05, "loss": 0.6196, "step": 5697 }, { "epoch": 2.180635285112897, "grad_norm": 0.5096346139907837, "learning_rate": 1.252927804809666e-05, "loss": 0.5897, "step": 5698 }, { "epoch": 2.1810179869881363, "grad_norm": 0.5309149026870728, "learning_rate": 1.2526879608332003e-05, "loss": 0.5769, "step": 5699 }, { "epoch": 2.1814006888633752, "grad_norm": 0.5222199559211731, "learning_rate": 1.2524481013283809e-05, "loss": 0.651, "step": 5700 }, { "epoch": 2.1817833907386146, "grad_norm": 0.5415173768997192, "learning_rate": 1.2522082263099473e-05, "loss": 0.6218, "step": 5701 }, { "epoch": 2.182166092613854, "grad_norm": 0.5068539977073669, "learning_rate": 1.2519683357926406e-05, "loss": 0.5977, "step": 5702 }, { "epoch": 2.182548794489093, "grad_norm": 0.5430195927619934, "learning_rate": 1.2517284297912027e-05, "loss": 0.687, "step": 5703 }, { "epoch": 2.1829314963643323, "grad_norm": 0.5172205567359924, "learning_rate": 1.2514885083203765e-05, "loss": 0.5974, "step": 5704 }, { "epoch": 2.183314198239571, "grad_norm": 0.4926782548427582, "learning_rate": 1.251248571394906e-05, "loss": 0.6227, "step": 5705 }, { "epoch": 2.1836969001148105, "grad_norm": 0.5340198874473572, "learning_rate": 1.2510086190295358e-05, "loss": 0.626, "step": 5706 }, { "epoch": 2.18407960199005, "grad_norm": 0.5484779477119446, "learning_rate": 1.2507686512390115e-05, "loss": 0.6263, "step": 5707 }, { "epoch": 2.184462303865289, "grad_norm": 0.532910943031311, "learning_rate": 1.2505286680380801e-05, "loss": 0.4893, "step": 5708 }, { "epoch": 2.184845005740528, "grad_norm": 0.5548496246337891, "learning_rate": 1.2502886694414888e-05, "loss": 0.6129, "step": 5709 }, { "epoch": 2.185227707615767, "grad_norm": 0.5490597486495972, "learning_rate": 1.2500486554639866e-05, "loss": 0.6383, "step": 5710 }, { "epoch": 2.1856104094910065, "grad_norm": 0.5062610507011414, "learning_rate": 1.2498086261203226e-05, "loss": 0.6259, "step": 5711 }, { "epoch": 2.185993111366246, "grad_norm": 0.5448955297470093, "learning_rate": 1.2495685814252474e-05, "loss": 0.7038, "step": 5712 }, { "epoch": 2.1863758132414848, "grad_norm": 0.5114791393280029, "learning_rate": 1.2493285213935125e-05, "loss": 0.6387, "step": 5713 }, { "epoch": 2.186758515116724, "grad_norm": 0.5146470069885254, "learning_rate": 1.24908844603987e-05, "loss": 0.5789, "step": 5714 }, { "epoch": 2.187141216991963, "grad_norm": 0.5514643788337708, "learning_rate": 1.2488483553790733e-05, "loss": 0.6955, "step": 5715 }, { "epoch": 2.1875239188672024, "grad_norm": 0.5445388555526733, "learning_rate": 1.248608249425877e-05, "loss": 0.6386, "step": 5716 }, { "epoch": 2.187906620742442, "grad_norm": 0.5932431817054749, "learning_rate": 1.2483681281950355e-05, "loss": 0.6378, "step": 5717 }, { "epoch": 2.1882893226176807, "grad_norm": 0.5112600922584534, "learning_rate": 1.2481279917013054e-05, "loss": 0.5705, "step": 5718 }, { "epoch": 2.18867202449292, "grad_norm": 0.48841479420661926, "learning_rate": 1.2478878399594433e-05, "loss": 0.5672, "step": 5719 }, { "epoch": 2.189054726368159, "grad_norm": 0.5582079887390137, "learning_rate": 1.2476476729842083e-05, "loss": 0.5919, "step": 5720 }, { "epoch": 2.1894374282433984, "grad_norm": 0.5454201102256775, "learning_rate": 1.247407490790358e-05, "loss": 0.6419, "step": 5721 }, { "epoch": 2.1898201301186377, "grad_norm": 0.5105095505714417, "learning_rate": 1.2471672933926526e-05, "loss": 0.6461, "step": 5722 }, { "epoch": 2.1902028319938767, "grad_norm": 0.5789893865585327, "learning_rate": 1.2469270808058531e-05, "loss": 0.693, "step": 5723 }, { "epoch": 2.190585533869116, "grad_norm": 0.5494406223297119, "learning_rate": 1.2466868530447212e-05, "loss": 0.658, "step": 5724 }, { "epoch": 2.190968235744355, "grad_norm": 0.5555405616760254, "learning_rate": 1.2464466101240193e-05, "loss": 0.6707, "step": 5725 }, { "epoch": 2.1913509376195943, "grad_norm": 0.5817223787307739, "learning_rate": 1.2462063520585115e-05, "loss": 0.7321, "step": 5726 }, { "epoch": 2.1917336394948337, "grad_norm": 0.5486766695976257, "learning_rate": 1.2459660788629615e-05, "loss": 0.5997, "step": 5727 }, { "epoch": 2.1921163413700726, "grad_norm": 0.5576833486557007, "learning_rate": 1.2457257905521354e-05, "loss": 0.63, "step": 5728 }, { "epoch": 2.192499043245312, "grad_norm": 0.4808758795261383, "learning_rate": 1.2454854871407993e-05, "loss": 0.6139, "step": 5729 }, { "epoch": 2.192881745120551, "grad_norm": 0.49549728631973267, "learning_rate": 1.2452451686437204e-05, "loss": 0.6009, "step": 5730 }, { "epoch": 2.1932644469957903, "grad_norm": 0.5185689926147461, "learning_rate": 1.2450048350756675e-05, "loss": 0.676, "step": 5731 }, { "epoch": 2.1936471488710296, "grad_norm": 0.49308958649635315, "learning_rate": 1.2447644864514086e-05, "loss": 0.6621, "step": 5732 }, { "epoch": 2.1940298507462686, "grad_norm": 0.5593460202217102, "learning_rate": 1.2445241227857148e-05, "loss": 0.6579, "step": 5733 }, { "epoch": 2.194412552621508, "grad_norm": 0.5181347131729126, "learning_rate": 1.2442837440933568e-05, "loss": 0.6666, "step": 5734 }, { "epoch": 2.194795254496747, "grad_norm": 0.5061633586883545, "learning_rate": 1.2440433503891063e-05, "loss": 0.5721, "step": 5735 }, { "epoch": 2.195177956371986, "grad_norm": 0.5289369225502014, "learning_rate": 1.2438029416877365e-05, "loss": 0.6577, "step": 5736 }, { "epoch": 2.1955606582472256, "grad_norm": 0.5363816022872925, "learning_rate": 1.2435625180040209e-05, "loss": 0.6275, "step": 5737 }, { "epoch": 2.1959433601224645, "grad_norm": 0.5650448203086853, "learning_rate": 1.243322079352734e-05, "loss": 0.6807, "step": 5738 }, { "epoch": 2.196326061997704, "grad_norm": 0.541807234287262, "learning_rate": 1.2430816257486518e-05, "loss": 0.6336, "step": 5739 }, { "epoch": 2.196708763872943, "grad_norm": 0.5684800148010254, "learning_rate": 1.2428411572065506e-05, "loss": 0.6168, "step": 5740 }, { "epoch": 2.197091465748182, "grad_norm": 0.5451333522796631, "learning_rate": 1.2426006737412082e-05, "loss": 0.7361, "step": 5741 }, { "epoch": 2.1974741676234215, "grad_norm": 0.521281361579895, "learning_rate": 1.2423601753674026e-05, "loss": 0.6651, "step": 5742 }, { "epoch": 2.1978568694986604, "grad_norm": 0.5259708762168884, "learning_rate": 1.2421196620999127e-05, "loss": 0.6965, "step": 5743 }, { "epoch": 2.1982395713739, "grad_norm": 0.4948168992996216, "learning_rate": 1.2418791339535194e-05, "loss": 0.5834, "step": 5744 }, { "epoch": 2.1986222732491387, "grad_norm": 0.5354844927787781, "learning_rate": 1.2416385909430033e-05, "loss": 0.6252, "step": 5745 }, { "epoch": 2.199004975124378, "grad_norm": 0.4871520698070526, "learning_rate": 1.241398033083147e-05, "loss": 0.6496, "step": 5746 }, { "epoch": 2.1993876769996175, "grad_norm": 0.5463835000991821, "learning_rate": 1.241157460388733e-05, "loss": 0.5939, "step": 5747 }, { "epoch": 2.1997703788748564, "grad_norm": 0.500802218914032, "learning_rate": 1.240916872874545e-05, "loss": 0.609, "step": 5748 }, { "epoch": 2.2001530807500957, "grad_norm": 0.5829517245292664, "learning_rate": 1.2406762705553678e-05, "loss": 0.6668, "step": 5749 }, { "epoch": 2.2005357826253347, "grad_norm": 0.5102537870407104, "learning_rate": 1.2404356534459876e-05, "loss": 0.5818, "step": 5750 }, { "epoch": 2.200918484500574, "grad_norm": 0.4992401599884033, "learning_rate": 1.2401950215611902e-05, "loss": 0.5846, "step": 5751 }, { "epoch": 2.2013011863758134, "grad_norm": 0.49928948283195496, "learning_rate": 1.239954374915764e-05, "loss": 0.6249, "step": 5752 }, { "epoch": 2.2016838882510523, "grad_norm": 0.5225181579589844, "learning_rate": 1.2397137135244961e-05, "loss": 0.6641, "step": 5753 }, { "epoch": 2.2020665901262917, "grad_norm": 0.5606725811958313, "learning_rate": 1.2394730374021769e-05, "loss": 0.5669, "step": 5754 }, { "epoch": 2.2024492920015306, "grad_norm": 0.5892246961593628, "learning_rate": 1.2392323465635962e-05, "loss": 0.6155, "step": 5755 }, { "epoch": 2.20283199387677, "grad_norm": 0.6080582737922668, "learning_rate": 1.238991641023545e-05, "loss": 0.6831, "step": 5756 }, { "epoch": 2.2032146957520093, "grad_norm": 0.5723065733909607, "learning_rate": 1.2387509207968157e-05, "loss": 0.7165, "step": 5757 }, { "epoch": 2.2035973976272483, "grad_norm": 0.6065109968185425, "learning_rate": 1.2385101858982004e-05, "loss": 0.6717, "step": 5758 }, { "epoch": 2.2039800995024876, "grad_norm": 0.5569303631782532, "learning_rate": 1.2382694363424939e-05, "loss": 0.6135, "step": 5759 }, { "epoch": 2.2043628013777266, "grad_norm": 0.5383920073509216, "learning_rate": 1.2380286721444903e-05, "loss": 0.6251, "step": 5760 }, { "epoch": 2.204745503252966, "grad_norm": 0.514500617980957, "learning_rate": 1.237787893318985e-05, "loss": 0.6733, "step": 5761 }, { "epoch": 2.2051282051282053, "grad_norm": 0.581891655921936, "learning_rate": 1.2375470998807755e-05, "loss": 0.6959, "step": 5762 }, { "epoch": 2.205510907003444, "grad_norm": 0.5222253203392029, "learning_rate": 1.2373062918446581e-05, "loss": 0.6172, "step": 5763 }, { "epoch": 2.2058936088786836, "grad_norm": 0.508111834526062, "learning_rate": 1.2370654692254314e-05, "loss": 0.5738, "step": 5764 }, { "epoch": 2.2062763107539225, "grad_norm": 0.4912688434123993, "learning_rate": 1.2368246320378952e-05, "loss": 0.6079, "step": 5765 }, { "epoch": 2.206659012629162, "grad_norm": 0.5198652744293213, "learning_rate": 1.2365837802968488e-05, "loss": 0.6449, "step": 5766 }, { "epoch": 2.2070417145044012, "grad_norm": 0.56208735704422, "learning_rate": 1.2363429140170938e-05, "loss": 0.6678, "step": 5767 }, { "epoch": 2.20742441637964, "grad_norm": 0.5413572788238525, "learning_rate": 1.2361020332134314e-05, "loss": 0.606, "step": 5768 }, { "epoch": 2.2078071182548795, "grad_norm": 0.5422149896621704, "learning_rate": 1.2358611379006648e-05, "loss": 0.6676, "step": 5769 }, { "epoch": 2.2081898201301184, "grad_norm": 0.5558492541313171, "learning_rate": 1.2356202280935981e-05, "loss": 0.5943, "step": 5770 }, { "epoch": 2.208572522005358, "grad_norm": 0.5417395830154419, "learning_rate": 1.235379303807035e-05, "loss": 0.6773, "step": 5771 }, { "epoch": 2.208955223880597, "grad_norm": 0.5282338857650757, "learning_rate": 1.2351383650557816e-05, "loss": 0.6614, "step": 5772 }, { "epoch": 2.209337925755836, "grad_norm": 0.512575089931488, "learning_rate": 1.2348974118546436e-05, "loss": 0.6805, "step": 5773 }, { "epoch": 2.2097206276310755, "grad_norm": 0.5233507752418518, "learning_rate": 1.2346564442184288e-05, "loss": 0.5795, "step": 5774 }, { "epoch": 2.2101033295063144, "grad_norm": 0.559846818447113, "learning_rate": 1.234415462161945e-05, "loss": 0.6977, "step": 5775 }, { "epoch": 2.2104860313815538, "grad_norm": 0.5753258466720581, "learning_rate": 1.2341744657000017e-05, "loss": 0.6394, "step": 5776 }, { "epoch": 2.210868733256793, "grad_norm": 0.5540636777877808, "learning_rate": 1.233933454847408e-05, "loss": 0.6775, "step": 5777 }, { "epoch": 2.211251435132032, "grad_norm": 0.6311403512954712, "learning_rate": 1.2336924296189752e-05, "loss": 0.663, "step": 5778 }, { "epoch": 2.2116341370072714, "grad_norm": 0.5464850664138794, "learning_rate": 1.2334513900295144e-05, "loss": 0.6337, "step": 5779 }, { "epoch": 2.2120168388825103, "grad_norm": 0.5377113819122314, "learning_rate": 1.233210336093839e-05, "loss": 0.6538, "step": 5780 }, { "epoch": 2.2123995407577497, "grad_norm": 0.5794731974601746, "learning_rate": 1.2329692678267619e-05, "loss": 0.6932, "step": 5781 }, { "epoch": 2.212782242632989, "grad_norm": 0.6356885433197021, "learning_rate": 1.2327281852430972e-05, "loss": 0.7898, "step": 5782 }, { "epoch": 2.213164944508228, "grad_norm": 0.5222490429878235, "learning_rate": 1.2324870883576602e-05, "loss": 0.6053, "step": 5783 }, { "epoch": 2.2135476463834673, "grad_norm": 0.5478712320327759, "learning_rate": 1.2322459771852673e-05, "loss": 0.6707, "step": 5784 }, { "epoch": 2.2139303482587063, "grad_norm": 0.5268785953521729, "learning_rate": 1.2320048517407352e-05, "loss": 0.6457, "step": 5785 }, { "epoch": 2.2143130501339456, "grad_norm": 0.5591649413108826, "learning_rate": 1.2317637120388816e-05, "loss": 0.5366, "step": 5786 }, { "epoch": 2.214695752009185, "grad_norm": 0.5722110867500305, "learning_rate": 1.2315225580945252e-05, "loss": 0.6817, "step": 5787 }, { "epoch": 2.215078453884424, "grad_norm": 0.5464453101158142, "learning_rate": 1.2312813899224857e-05, "loss": 0.6838, "step": 5788 }, { "epoch": 2.2154611557596633, "grad_norm": 0.5851995944976807, "learning_rate": 1.2310402075375834e-05, "loss": 0.6131, "step": 5789 }, { "epoch": 2.215843857634902, "grad_norm": 0.562410295009613, "learning_rate": 1.2307990109546397e-05, "loss": 0.6641, "step": 5790 }, { "epoch": 2.2162265595101416, "grad_norm": 0.5091689825057983, "learning_rate": 1.2305578001884768e-05, "loss": 0.6392, "step": 5791 }, { "epoch": 2.216609261385381, "grad_norm": 0.5036776661872864, "learning_rate": 1.2303165752539175e-05, "loss": 0.6107, "step": 5792 }, { "epoch": 2.21699196326062, "grad_norm": 0.5299052596092224, "learning_rate": 1.2300753361657863e-05, "loss": 0.6372, "step": 5793 }, { "epoch": 2.2173746651358592, "grad_norm": 0.5266646146774292, "learning_rate": 1.2298340829389071e-05, "loss": 0.6666, "step": 5794 }, { "epoch": 2.217757367011098, "grad_norm": 0.5249107480049133, "learning_rate": 1.2295928155881063e-05, "loss": 0.5923, "step": 5795 }, { "epoch": 2.2181400688863375, "grad_norm": 0.5505281090736389, "learning_rate": 1.22935153412821e-05, "loss": 0.5744, "step": 5796 }, { "epoch": 2.218522770761577, "grad_norm": 0.487519234418869, "learning_rate": 1.2291102385740458e-05, "loss": 0.626, "step": 5797 }, { "epoch": 2.218905472636816, "grad_norm": 0.4950845539569855, "learning_rate": 1.2288689289404424e-05, "loss": 0.6921, "step": 5798 }, { "epoch": 2.219288174512055, "grad_norm": 0.538537859916687, "learning_rate": 1.2286276052422278e-05, "loss": 0.6192, "step": 5799 }, { "epoch": 2.219670876387294, "grad_norm": 0.5258589386940002, "learning_rate": 1.2283862674942331e-05, "loss": 0.6482, "step": 5800 }, { "epoch": 2.2200535782625335, "grad_norm": 0.5622684359550476, "learning_rate": 1.2281449157112888e-05, "loss": 0.7377, "step": 5801 }, { "epoch": 2.220436280137773, "grad_norm": 0.5303115248680115, "learning_rate": 1.2279035499082263e-05, "loss": 0.6497, "step": 5802 }, { "epoch": 2.2208189820130118, "grad_norm": 0.5187774300575256, "learning_rate": 1.2276621700998783e-05, "loss": 0.7051, "step": 5803 }, { "epoch": 2.221201683888251, "grad_norm": 0.5175257921218872, "learning_rate": 1.2274207763010783e-05, "loss": 0.563, "step": 5804 }, { "epoch": 2.22158438576349, "grad_norm": 0.5227670669555664, "learning_rate": 1.227179368526661e-05, "loss": 0.6245, "step": 5805 }, { "epoch": 2.2219670876387294, "grad_norm": 0.5312772989273071, "learning_rate": 1.2269379467914607e-05, "loss": 0.6737, "step": 5806 }, { "epoch": 2.2223497895139688, "grad_norm": 0.5406265258789062, "learning_rate": 1.2266965111103144e-05, "loss": 0.6625, "step": 5807 }, { "epoch": 2.2227324913892077, "grad_norm": 0.5352779030799866, "learning_rate": 1.2264550614980581e-05, "loss": 0.6886, "step": 5808 }, { "epoch": 2.223115193264447, "grad_norm": 0.5316658020019531, "learning_rate": 1.22621359796953e-05, "loss": 0.6859, "step": 5809 }, { "epoch": 2.223497895139686, "grad_norm": 0.5330570340156555, "learning_rate": 1.2259721205395687e-05, "loss": 0.6816, "step": 5810 }, { "epoch": 2.2238805970149254, "grad_norm": 0.5162027478218079, "learning_rate": 1.2257306292230136e-05, "loss": 0.6329, "step": 5811 }, { "epoch": 2.2242632988901647, "grad_norm": 0.5273109078407288, "learning_rate": 1.225489124034705e-05, "loss": 0.5853, "step": 5812 }, { "epoch": 2.2246460007654036, "grad_norm": 0.5230556726455688, "learning_rate": 1.2252476049894834e-05, "loss": 0.582, "step": 5813 }, { "epoch": 2.225028702640643, "grad_norm": 0.553065299987793, "learning_rate": 1.2250060721021914e-05, "loss": 0.6006, "step": 5814 }, { "epoch": 2.225411404515882, "grad_norm": 0.515099823474884, "learning_rate": 1.2247645253876724e-05, "loss": 0.6322, "step": 5815 }, { "epoch": 2.2257941063911213, "grad_norm": 0.5468950271606445, "learning_rate": 1.224522964860769e-05, "loss": 0.6116, "step": 5816 }, { "epoch": 2.2261768082663607, "grad_norm": 0.6402366757392883, "learning_rate": 1.2242813905363264e-05, "loss": 0.6915, "step": 5817 }, { "epoch": 2.2265595101415996, "grad_norm": 0.5135183930397034, "learning_rate": 1.2240398024291897e-05, "loss": 0.6416, "step": 5818 }, { "epoch": 2.226942212016839, "grad_norm": 0.5433872938156128, "learning_rate": 1.2237982005542053e-05, "loss": 0.6662, "step": 5819 }, { "epoch": 2.227324913892078, "grad_norm": 0.5026704668998718, "learning_rate": 1.2235565849262205e-05, "loss": 0.7256, "step": 5820 }, { "epoch": 2.2277076157673172, "grad_norm": 0.5056248903274536, "learning_rate": 1.2233149555600827e-05, "loss": 0.5879, "step": 5821 }, { "epoch": 2.2280903176425566, "grad_norm": 0.5493441224098206, "learning_rate": 1.2230733124706411e-05, "loss": 0.5371, "step": 5822 }, { "epoch": 2.2284730195177955, "grad_norm": 0.5860801339149475, "learning_rate": 1.222831655672745e-05, "loss": 0.6614, "step": 5823 }, { "epoch": 2.228855721393035, "grad_norm": 0.5296498537063599, "learning_rate": 1.222589985181245e-05, "loss": 0.605, "step": 5824 }, { "epoch": 2.229238423268274, "grad_norm": 0.5176679491996765, "learning_rate": 1.2223483010109927e-05, "loss": 0.6243, "step": 5825 }, { "epoch": 2.229621125143513, "grad_norm": 0.4882084131240845, "learning_rate": 1.2221066031768397e-05, "loss": 0.6286, "step": 5826 }, { "epoch": 2.2300038270187525, "grad_norm": 0.561267077922821, "learning_rate": 1.2218648916936396e-05, "loss": 0.6151, "step": 5827 }, { "epoch": 2.2303865288939915, "grad_norm": 0.5135766863822937, "learning_rate": 1.2216231665762455e-05, "loss": 0.5834, "step": 5828 }, { "epoch": 2.230769230769231, "grad_norm": 0.5327361822128296, "learning_rate": 1.2213814278395125e-05, "loss": 0.6574, "step": 5829 }, { "epoch": 2.2311519326444698, "grad_norm": 0.5483625531196594, "learning_rate": 1.221139675498296e-05, "loss": 0.6659, "step": 5830 }, { "epoch": 2.231534634519709, "grad_norm": 0.5493265986442566, "learning_rate": 1.2208979095674523e-05, "loss": 0.6488, "step": 5831 }, { "epoch": 2.2319173363949485, "grad_norm": 0.5401184558868408, "learning_rate": 1.2206561300618389e-05, "loss": 0.5474, "step": 5832 }, { "epoch": 2.2323000382701874, "grad_norm": 0.5645315647125244, "learning_rate": 1.2204143369963132e-05, "loss": 0.6818, "step": 5833 }, { "epoch": 2.232682740145427, "grad_norm": 0.5739622116088867, "learning_rate": 1.2201725303857344e-05, "loss": 0.7335, "step": 5834 }, { "epoch": 2.2330654420206657, "grad_norm": 0.5348342657089233, "learning_rate": 1.2199307102449625e-05, "loss": 0.6255, "step": 5835 }, { "epoch": 2.233448143895905, "grad_norm": 0.5583150386810303, "learning_rate": 1.2196888765888572e-05, "loss": 0.6256, "step": 5836 }, { "epoch": 2.2338308457711444, "grad_norm": 0.5526531338691711, "learning_rate": 1.2194470294322805e-05, "loss": 0.5998, "step": 5837 }, { "epoch": 2.2342135476463834, "grad_norm": 0.5406566858291626, "learning_rate": 1.2192051687900941e-05, "loss": 0.6924, "step": 5838 }, { "epoch": 2.2345962495216227, "grad_norm": 0.4970090091228485, "learning_rate": 1.218963294677161e-05, "loss": 0.5974, "step": 5839 }, { "epoch": 2.2349789513968616, "grad_norm": 0.5274213552474976, "learning_rate": 1.2187214071083455e-05, "loss": 0.6151, "step": 5840 }, { "epoch": 2.235361653272101, "grad_norm": 0.5661929249763489, "learning_rate": 1.2184795060985122e-05, "loss": 0.6252, "step": 5841 }, { "epoch": 2.2357443551473404, "grad_norm": 0.5422635078430176, "learning_rate": 1.2182375916625258e-05, "loss": 0.6621, "step": 5842 }, { "epoch": 2.2361270570225793, "grad_norm": 0.5309639573097229, "learning_rate": 1.2179956638152535e-05, "loss": 0.6433, "step": 5843 }, { "epoch": 2.2365097588978187, "grad_norm": 0.5959199666976929, "learning_rate": 1.2177537225715615e-05, "loss": 0.6668, "step": 5844 }, { "epoch": 2.2368924607730576, "grad_norm": 0.5161147117614746, "learning_rate": 1.2175117679463187e-05, "loss": 0.599, "step": 5845 }, { "epoch": 2.237275162648297, "grad_norm": 0.5548872351646423, "learning_rate": 1.2172697999543934e-05, "loss": 0.5826, "step": 5846 }, { "epoch": 2.2376578645235363, "grad_norm": 0.5074834823608398, "learning_rate": 1.217027818610655e-05, "loss": 0.664, "step": 5847 }, { "epoch": 2.2380405663987752, "grad_norm": 0.5449039340019226, "learning_rate": 1.2167858239299743e-05, "loss": 0.6472, "step": 5848 }, { "epoch": 2.2384232682740146, "grad_norm": 0.5313143730163574, "learning_rate": 1.216543815927222e-05, "loss": 0.6352, "step": 5849 }, { "epoch": 2.2388059701492535, "grad_norm": 0.5174048542976379, "learning_rate": 1.2163017946172707e-05, "loss": 0.614, "step": 5850 }, { "epoch": 2.239188672024493, "grad_norm": 0.5609890818595886, "learning_rate": 1.2160597600149933e-05, "loss": 0.725, "step": 5851 }, { "epoch": 2.2395713738997323, "grad_norm": 0.5029460787773132, "learning_rate": 1.2158177121352629e-05, "loss": 0.5925, "step": 5852 }, { "epoch": 2.239954075774971, "grad_norm": 0.5373101830482483, "learning_rate": 1.2155756509929541e-05, "loss": 0.7053, "step": 5853 }, { "epoch": 2.2403367776502106, "grad_norm": 0.516917884349823, "learning_rate": 1.2153335766029426e-05, "loss": 0.6559, "step": 5854 }, { "epoch": 2.2407194795254495, "grad_norm": 0.49228447675704956, "learning_rate": 1.2150914889801043e-05, "loss": 0.6839, "step": 5855 }, { "epoch": 2.241102181400689, "grad_norm": 0.591068685054779, "learning_rate": 1.2148493881393164e-05, "loss": 0.6171, "step": 5856 }, { "epoch": 2.241484883275928, "grad_norm": 0.614234447479248, "learning_rate": 1.214607274095456e-05, "loss": 0.6922, "step": 5857 }, { "epoch": 2.241867585151167, "grad_norm": 0.584641695022583, "learning_rate": 1.2143651468634024e-05, "loss": 0.7073, "step": 5858 }, { "epoch": 2.2422502870264065, "grad_norm": 0.5848619341850281, "learning_rate": 1.2141230064580345e-05, "loss": 0.6927, "step": 5859 }, { "epoch": 2.2426329889016454, "grad_norm": 0.5492410063743591, "learning_rate": 1.2138808528942326e-05, "loss": 0.6588, "step": 5860 }, { "epoch": 2.243015690776885, "grad_norm": 0.5584808588027954, "learning_rate": 1.2136386861868778e-05, "loss": 0.5427, "step": 5861 }, { "epoch": 2.243398392652124, "grad_norm": 0.5206058025360107, "learning_rate": 1.213396506350852e-05, "loss": 0.631, "step": 5862 }, { "epoch": 2.243781094527363, "grad_norm": 0.5566229224205017, "learning_rate": 1.213154313401037e-05, "loss": 0.736, "step": 5863 }, { "epoch": 2.2441637964026024, "grad_norm": 0.5362070798873901, "learning_rate": 1.2129121073523171e-05, "loss": 0.6449, "step": 5864 }, { "epoch": 2.2445464982778414, "grad_norm": 0.560560941696167, "learning_rate": 1.2126698882195761e-05, "loss": 0.5748, "step": 5865 }, { "epoch": 2.2449292001530807, "grad_norm": 0.5474836826324463, "learning_rate": 1.2124276560176996e-05, "loss": 0.6326, "step": 5866 }, { "epoch": 2.24531190202832, "grad_norm": 0.5389282703399658, "learning_rate": 1.2121854107615726e-05, "loss": 0.6969, "step": 5867 }, { "epoch": 2.245694603903559, "grad_norm": 0.5289753079414368, "learning_rate": 1.2119431524660821e-05, "loss": 0.6051, "step": 5868 }, { "epoch": 2.2460773057787984, "grad_norm": 0.5256497859954834, "learning_rate": 1.2117008811461153e-05, "loss": 0.6861, "step": 5869 }, { "epoch": 2.2464600076540373, "grad_norm": 0.5218472480773926, "learning_rate": 1.211458596816561e-05, "loss": 0.6151, "step": 5870 }, { "epoch": 2.2468427095292767, "grad_norm": 0.5299076437950134, "learning_rate": 1.2112162994923075e-05, "loss": 0.6484, "step": 5871 }, { "epoch": 2.247225411404516, "grad_norm": 0.5292649269104004, "learning_rate": 1.2109739891882453e-05, "loss": 0.6927, "step": 5872 }, { "epoch": 2.247608113279755, "grad_norm": 0.56682950258255, "learning_rate": 1.2107316659192645e-05, "loss": 0.6547, "step": 5873 }, { "epoch": 2.2479908151549943, "grad_norm": 0.5090750455856323, "learning_rate": 1.2104893297002566e-05, "loss": 0.5638, "step": 5874 }, { "epoch": 2.2483735170302332, "grad_norm": 0.5328126549720764, "learning_rate": 1.2102469805461141e-05, "loss": 0.7127, "step": 5875 }, { "epoch": 2.2487562189054726, "grad_norm": 0.5502026677131653, "learning_rate": 1.2100046184717298e-05, "loss": 0.5842, "step": 5876 }, { "epoch": 2.249138920780712, "grad_norm": 0.5525456666946411, "learning_rate": 1.2097622434919976e-05, "loss": 0.553, "step": 5877 }, { "epoch": 2.249521622655951, "grad_norm": 0.4958241581916809, "learning_rate": 1.2095198556218117e-05, "loss": 0.5815, "step": 5878 }, { "epoch": 2.2499043245311903, "grad_norm": 0.4946977198123932, "learning_rate": 1.2092774548760679e-05, "loss": 0.5443, "step": 5879 }, { "epoch": 2.250287026406429, "grad_norm": 0.5268017053604126, "learning_rate": 1.2090350412696626e-05, "loss": 0.6605, "step": 5880 }, { "epoch": 2.2506697282816686, "grad_norm": 0.4924485981464386, "learning_rate": 1.208792614817492e-05, "loss": 0.6377, "step": 5881 }, { "epoch": 2.251052430156908, "grad_norm": 0.5077912211418152, "learning_rate": 1.2085501755344547e-05, "loss": 0.6216, "step": 5882 }, { "epoch": 2.251435132032147, "grad_norm": 0.49494701623916626, "learning_rate": 1.2083077234354486e-05, "loss": 0.6661, "step": 5883 }, { "epoch": 2.251817833907386, "grad_norm": 0.522515058517456, "learning_rate": 1.208065258535373e-05, "loss": 0.6559, "step": 5884 }, { "epoch": 2.2522005357826256, "grad_norm": 0.5749408006668091, "learning_rate": 1.207822780849129e-05, "loss": 0.6554, "step": 5885 }, { "epoch": 2.2525832376578645, "grad_norm": 0.531258761882782, "learning_rate": 1.2075802903916162e-05, "loss": 0.5927, "step": 5886 }, { "epoch": 2.252965939533104, "grad_norm": 0.5530086159706116, "learning_rate": 1.2073377871777372e-05, "loss": 0.7171, "step": 5887 }, { "epoch": 2.253348641408343, "grad_norm": 0.6449833512306213, "learning_rate": 1.207095271222394e-05, "loss": 0.6246, "step": 5888 }, { "epoch": 2.253731343283582, "grad_norm": 0.5338450074195862, "learning_rate": 1.2068527425404902e-05, "loss": 0.6604, "step": 5889 }, { "epoch": 2.254114045158821, "grad_norm": 0.5554092526435852, "learning_rate": 1.2066102011469295e-05, "loss": 0.6307, "step": 5890 }, { "epoch": 2.2544967470340604, "grad_norm": 0.5205696225166321, "learning_rate": 1.2063676470566171e-05, "loss": 0.5555, "step": 5891 }, { "epoch": 2.2548794489093, "grad_norm": 0.5454459190368652, "learning_rate": 1.2061250802844583e-05, "loss": 0.6484, "step": 5892 }, { "epoch": 2.2552621507845387, "grad_norm": 0.5075467824935913, "learning_rate": 1.2058825008453596e-05, "loss": 0.6352, "step": 5893 }, { "epoch": 2.255644852659778, "grad_norm": 0.49973544478416443, "learning_rate": 1.2056399087542277e-05, "loss": 0.5906, "step": 5894 }, { "epoch": 2.2560275545350175, "grad_norm": 0.5395973324775696, "learning_rate": 1.2053973040259714e-05, "loss": 0.6411, "step": 5895 }, { "epoch": 2.2564102564102564, "grad_norm": 0.4964561462402344, "learning_rate": 1.2051546866754989e-05, "loss": 0.5696, "step": 5896 }, { "epoch": 2.2567929582854958, "grad_norm": 0.5763630867004395, "learning_rate": 1.2049120567177198e-05, "loss": 0.6947, "step": 5897 }, { "epoch": 2.2571756601607347, "grad_norm": 0.5354982018470764, "learning_rate": 1.2046694141675444e-05, "loss": 0.6931, "step": 5898 }, { "epoch": 2.257558362035974, "grad_norm": 0.5348233580589294, "learning_rate": 1.2044267590398831e-05, "loss": 0.6317, "step": 5899 }, { "epoch": 2.257941063911213, "grad_norm": 0.5649446845054626, "learning_rate": 1.2041840913496487e-05, "loss": 0.7072, "step": 5900 }, { "epoch": 2.2583237657864523, "grad_norm": 0.5859045386314392, "learning_rate": 1.2039414111117536e-05, "loss": 0.6083, "step": 5901 }, { "epoch": 2.2587064676616917, "grad_norm": 0.5285983681678772, "learning_rate": 1.2036987183411105e-05, "loss": 0.5967, "step": 5902 }, { "epoch": 2.2590891695369306, "grad_norm": 0.543075442314148, "learning_rate": 1.2034560130526341e-05, "loss": 0.6189, "step": 5903 }, { "epoch": 2.25947187141217, "grad_norm": 0.5257560014724731, "learning_rate": 1.2032132952612388e-05, "loss": 0.6874, "step": 5904 }, { "epoch": 2.2598545732874094, "grad_norm": 0.6080725789070129, "learning_rate": 1.2029705649818408e-05, "loss": 0.6401, "step": 5905 }, { "epoch": 2.2602372751626483, "grad_norm": 0.5980830192565918, "learning_rate": 1.2027278222293566e-05, "loss": 0.5816, "step": 5906 }, { "epoch": 2.2606199770378876, "grad_norm": 0.556251585483551, "learning_rate": 1.2024850670187027e-05, "loss": 0.6812, "step": 5907 }, { "epoch": 2.2610026789131266, "grad_norm": 0.533547580242157, "learning_rate": 1.2022422993647978e-05, "loss": 0.6108, "step": 5908 }, { "epoch": 2.261385380788366, "grad_norm": 0.49302008748054504, "learning_rate": 1.2019995192825598e-05, "loss": 0.5829, "step": 5909 }, { "epoch": 2.261768082663605, "grad_norm": 0.5002306699752808, "learning_rate": 1.2017567267869089e-05, "loss": 0.6733, "step": 5910 }, { "epoch": 2.262150784538844, "grad_norm": 0.5154793858528137, "learning_rate": 1.2015139218927652e-05, "loss": 0.6269, "step": 5911 }, { "epoch": 2.2625334864140836, "grad_norm": 0.53071129322052, "learning_rate": 1.2012711046150496e-05, "loss": 0.7418, "step": 5912 }, { "epoch": 2.2629161882893225, "grad_norm": 0.5334556102752686, "learning_rate": 1.2010282749686839e-05, "loss": 0.6153, "step": 5913 }, { "epoch": 2.263298890164562, "grad_norm": 0.5267828106880188, "learning_rate": 1.2007854329685906e-05, "loss": 0.5834, "step": 5914 }, { "epoch": 2.2636815920398012, "grad_norm": 0.5296624302864075, "learning_rate": 1.200542578629693e-05, "loss": 0.6262, "step": 5915 }, { "epoch": 2.26406429391504, "grad_norm": 0.5277380347251892, "learning_rate": 1.2002997119669156e-05, "loss": 0.6282, "step": 5916 }, { "epoch": 2.2644469957902795, "grad_norm": 0.5539369583129883, "learning_rate": 1.2000568329951822e-05, "loss": 0.7303, "step": 5917 }, { "epoch": 2.2648296976655184, "grad_norm": 0.5271759033203125, "learning_rate": 1.1998139417294193e-05, "loss": 0.6077, "step": 5918 }, { "epoch": 2.265212399540758, "grad_norm": 0.5618340969085693, "learning_rate": 1.1995710381845527e-05, "loss": 0.6475, "step": 5919 }, { "epoch": 2.2655951014159967, "grad_norm": 0.5026712417602539, "learning_rate": 1.1993281223755097e-05, "loss": 0.6534, "step": 5920 }, { "epoch": 2.265977803291236, "grad_norm": 0.6311990022659302, "learning_rate": 1.1990851943172185e-05, "loss": 0.6966, "step": 5921 }, { "epoch": 2.2663605051664755, "grad_norm": 0.5501184463500977, "learning_rate": 1.1988422540246068e-05, "loss": 0.6176, "step": 5922 }, { "epoch": 2.2667432070417144, "grad_norm": 0.503903865814209, "learning_rate": 1.1985993015126044e-05, "loss": 0.6212, "step": 5923 }, { "epoch": 2.2671259089169538, "grad_norm": 0.5631338953971863, "learning_rate": 1.1983563367961414e-05, "loss": 0.6866, "step": 5924 }, { "epoch": 2.267508610792193, "grad_norm": 0.5529137253761292, "learning_rate": 1.1981133598901486e-05, "loss": 0.6323, "step": 5925 }, { "epoch": 2.267891312667432, "grad_norm": 0.5743860006332397, "learning_rate": 1.197870370809558e-05, "loss": 0.6125, "step": 5926 }, { "epoch": 2.2682740145426714, "grad_norm": 0.5717052221298218, "learning_rate": 1.1976273695693015e-05, "loss": 0.6945, "step": 5927 }, { "epoch": 2.2686567164179103, "grad_norm": 0.5341587662696838, "learning_rate": 1.1973843561843117e-05, "loss": 0.7388, "step": 5928 }, { "epoch": 2.2690394182931497, "grad_norm": 0.5959823131561279, "learning_rate": 1.1971413306695232e-05, "loss": 0.648, "step": 5929 }, { "epoch": 2.2694221201683886, "grad_norm": 0.5268690586090088, "learning_rate": 1.1968982930398703e-05, "loss": 0.6296, "step": 5930 }, { "epoch": 2.269804822043628, "grad_norm": 0.5429916381835938, "learning_rate": 1.1966552433102884e-05, "loss": 0.658, "step": 5931 }, { "epoch": 2.2701875239188674, "grad_norm": 0.5678544044494629, "learning_rate": 1.1964121814957136e-05, "loss": 0.6038, "step": 5932 }, { "epoch": 2.2705702257941063, "grad_norm": 0.5367928147315979, "learning_rate": 1.1961691076110825e-05, "loss": 0.6676, "step": 5933 }, { "epoch": 2.2709529276693456, "grad_norm": 0.5316751003265381, "learning_rate": 1.1959260216713325e-05, "loss": 0.649, "step": 5934 }, { "epoch": 2.271335629544585, "grad_norm": 0.5449824333190918, "learning_rate": 1.1956829236914022e-05, "loss": 0.6912, "step": 5935 }, { "epoch": 2.271718331419824, "grad_norm": 0.5286306142807007, "learning_rate": 1.1954398136862308e-05, "loss": 0.6757, "step": 5936 }, { "epoch": 2.2721010332950633, "grad_norm": 0.4975563883781433, "learning_rate": 1.1951966916707577e-05, "loss": 0.6592, "step": 5937 }, { "epoch": 2.272483735170302, "grad_norm": 0.5574039816856384, "learning_rate": 1.1949535576599234e-05, "loss": 0.5682, "step": 5938 }, { "epoch": 2.2728664370455416, "grad_norm": 0.5179893374443054, "learning_rate": 1.1947104116686692e-05, "loss": 0.6259, "step": 5939 }, { "epoch": 2.2732491389207805, "grad_norm": 0.5367581248283386, "learning_rate": 1.1944672537119373e-05, "loss": 0.6214, "step": 5940 }, { "epoch": 2.27363184079602, "grad_norm": 0.6329227685928345, "learning_rate": 1.1942240838046699e-05, "loss": 0.7553, "step": 5941 }, { "epoch": 2.2740145426712592, "grad_norm": 0.5729301571846008, "learning_rate": 1.1939809019618112e-05, "loss": 0.6242, "step": 5942 }, { "epoch": 2.274397244546498, "grad_norm": 0.554943323135376, "learning_rate": 1.1937377081983046e-05, "loss": 0.6756, "step": 5943 }, { "epoch": 2.2747799464217375, "grad_norm": 0.5489736199378967, "learning_rate": 1.1934945025290953e-05, "loss": 0.5745, "step": 5944 }, { "epoch": 2.275162648296977, "grad_norm": 0.5292836427688599, "learning_rate": 1.1932512849691293e-05, "loss": 0.6447, "step": 5945 }, { "epoch": 2.275545350172216, "grad_norm": 0.5134501457214355, "learning_rate": 1.1930080555333525e-05, "loss": 0.5724, "step": 5946 }, { "epoch": 2.275928052047455, "grad_norm": 0.5599271059036255, "learning_rate": 1.1927648142367125e-05, "loss": 0.6749, "step": 5947 }, { "epoch": 2.276310753922694, "grad_norm": 0.5749673843383789, "learning_rate": 1.1925215610941568e-05, "loss": 0.6782, "step": 5948 }, { "epoch": 2.2766934557979335, "grad_norm": 0.5219247341156006, "learning_rate": 1.1922782961206332e-05, "loss": 0.6048, "step": 5949 }, { "epoch": 2.2770761576731724, "grad_norm": 0.5604027509689331, "learning_rate": 1.1920350193310923e-05, "loss": 0.6264, "step": 5950 }, { "epoch": 2.2774588595484118, "grad_norm": 0.5655605792999268, "learning_rate": 1.1917917307404836e-05, "loss": 0.6428, "step": 5951 }, { "epoch": 2.277841561423651, "grad_norm": 0.5519929528236389, "learning_rate": 1.1915484303637577e-05, "loss": 0.6236, "step": 5952 }, { "epoch": 2.27822426329889, "grad_norm": 0.4949408769607544, "learning_rate": 1.1913051182158664e-05, "loss": 0.6341, "step": 5953 }, { "epoch": 2.2786069651741294, "grad_norm": 0.5104151964187622, "learning_rate": 1.1910617943117611e-05, "loss": 0.6064, "step": 5954 }, { "epoch": 2.278989667049369, "grad_norm": 0.5957834720611572, "learning_rate": 1.1908184586663957e-05, "loss": 0.6172, "step": 5955 }, { "epoch": 2.2793723689246077, "grad_norm": 0.5451918840408325, "learning_rate": 1.1905751112947232e-05, "loss": 0.7342, "step": 5956 }, { "epoch": 2.279755070799847, "grad_norm": 0.5609599351882935, "learning_rate": 1.1903317522116984e-05, "loss": 0.68, "step": 5957 }, { "epoch": 2.280137772675086, "grad_norm": 0.6107780933380127, "learning_rate": 1.1900883814322759e-05, "loss": 0.5725, "step": 5958 }, { "epoch": 2.2805204745503254, "grad_norm": 0.5502409338951111, "learning_rate": 1.1898449989714116e-05, "loss": 0.7088, "step": 5959 }, { "epoch": 2.2809031764255643, "grad_norm": 0.5240595936775208, "learning_rate": 1.1896016048440621e-05, "loss": 0.6182, "step": 5960 }, { "epoch": 2.2812858783008036, "grad_norm": 0.49763059616088867, "learning_rate": 1.1893581990651848e-05, "loss": 0.703, "step": 5961 }, { "epoch": 2.281668580176043, "grad_norm": 0.5738383531570435, "learning_rate": 1.1891147816497373e-05, "loss": 0.6712, "step": 5962 }, { "epoch": 2.282051282051282, "grad_norm": 0.5846663117408752, "learning_rate": 1.1888713526126784e-05, "loss": 0.6216, "step": 5963 }, { "epoch": 2.2824339839265213, "grad_norm": 0.5486178398132324, "learning_rate": 1.1886279119689676e-05, "loss": 0.6516, "step": 5964 }, { "epoch": 2.2828166858017607, "grad_norm": 0.5300953388214111, "learning_rate": 1.1883844597335647e-05, "loss": 0.6604, "step": 5965 }, { "epoch": 2.2831993876769996, "grad_norm": 0.5366272330284119, "learning_rate": 1.188140995921431e-05, "loss": 0.6377, "step": 5966 }, { "epoch": 2.283582089552239, "grad_norm": 0.5628576278686523, "learning_rate": 1.1878975205475274e-05, "loss": 0.6764, "step": 5967 }, { "epoch": 2.283964791427478, "grad_norm": 0.5585125088691711, "learning_rate": 1.1876540336268163e-05, "loss": 0.7518, "step": 5968 }, { "epoch": 2.2843474933027172, "grad_norm": 0.5082035064697266, "learning_rate": 1.1874105351742609e-05, "loss": 0.6739, "step": 5969 }, { "epoch": 2.284730195177956, "grad_norm": 0.5825346112251282, "learning_rate": 1.1871670252048245e-05, "loss": 0.5784, "step": 5970 }, { "epoch": 2.2851128970531955, "grad_norm": 0.5569528937339783, "learning_rate": 1.1869235037334716e-05, "loss": 0.6854, "step": 5971 }, { "epoch": 2.285495598928435, "grad_norm": 0.5192355513572693, "learning_rate": 1.1866799707751673e-05, "loss": 0.6406, "step": 5972 }, { "epoch": 2.285878300803674, "grad_norm": 0.5402302145957947, "learning_rate": 1.1864364263448774e-05, "loss": 0.6767, "step": 5973 }, { "epoch": 2.286261002678913, "grad_norm": 0.5462149381637573, "learning_rate": 1.1861928704575677e-05, "loss": 0.6421, "step": 5974 }, { "epoch": 2.2866437045541526, "grad_norm": 0.5560253262519836, "learning_rate": 1.1859493031282063e-05, "loss": 0.6125, "step": 5975 }, { "epoch": 2.2870264064293915, "grad_norm": 0.5469866991043091, "learning_rate": 1.185705724371761e-05, "loss": 0.6438, "step": 5976 }, { "epoch": 2.287409108304631, "grad_norm": 0.5398311018943787, "learning_rate": 1.1854621342031996e-05, "loss": 0.6044, "step": 5977 }, { "epoch": 2.2877918101798698, "grad_norm": 0.5901206135749817, "learning_rate": 1.1852185326374919e-05, "loss": 0.6513, "step": 5978 }, { "epoch": 2.288174512055109, "grad_norm": 0.5715613961219788, "learning_rate": 1.1849749196896077e-05, "loss": 0.6204, "step": 5979 }, { "epoch": 2.288557213930348, "grad_norm": 0.5189023613929749, "learning_rate": 1.1847312953745179e-05, "loss": 0.5417, "step": 5980 }, { "epoch": 2.2889399158055874, "grad_norm": 0.5192238092422485, "learning_rate": 1.1844876597071937e-05, "loss": 0.6479, "step": 5981 }, { "epoch": 2.289322617680827, "grad_norm": 0.5860387086868286, "learning_rate": 1.184244012702607e-05, "loss": 0.6501, "step": 5982 }, { "epoch": 2.2897053195560657, "grad_norm": 0.5507810115814209, "learning_rate": 1.184000354375731e-05, "loss": 0.5976, "step": 5983 }, { "epoch": 2.290088021431305, "grad_norm": 0.5491170883178711, "learning_rate": 1.1837566847415386e-05, "loss": 0.6423, "step": 5984 }, { "epoch": 2.2904707233065444, "grad_norm": 0.5237870216369629, "learning_rate": 1.1835130038150044e-05, "loss": 0.6004, "step": 5985 }, { "epoch": 2.2908534251817834, "grad_norm": 0.5181156992912292, "learning_rate": 1.1832693116111034e-05, "loss": 0.6323, "step": 5986 }, { "epoch": 2.2912361270570227, "grad_norm": 0.5684764385223389, "learning_rate": 1.1830256081448109e-05, "loss": 0.6384, "step": 5987 }, { "epoch": 2.2916188289322617, "grad_norm": 0.582761287689209, "learning_rate": 1.1827818934311025e-05, "loss": 0.6394, "step": 5988 }, { "epoch": 2.292001530807501, "grad_norm": 0.5434165596961975, "learning_rate": 1.1825381674849558e-05, "loss": 0.5757, "step": 5989 }, { "epoch": 2.29238423268274, "grad_norm": 0.5298071503639221, "learning_rate": 1.1822944303213486e-05, "loss": 0.5703, "step": 5990 }, { "epoch": 2.2927669345579793, "grad_norm": 0.5183631181716919, "learning_rate": 1.182050681955259e-05, "loss": 0.6615, "step": 5991 }, { "epoch": 2.2931496364332187, "grad_norm": 0.5635404586791992, "learning_rate": 1.181806922401666e-05, "loss": 0.6414, "step": 5992 }, { "epoch": 2.2935323383084576, "grad_norm": 0.5335788726806641, "learning_rate": 1.1815631516755488e-05, "loss": 0.6005, "step": 5993 }, { "epoch": 2.293915040183697, "grad_norm": 0.538794755935669, "learning_rate": 1.1813193697918884e-05, "loss": 0.6921, "step": 5994 }, { "epoch": 2.2942977420589363, "grad_norm": 0.577709436416626, "learning_rate": 1.1810755767656654e-05, "loss": 0.6441, "step": 5995 }, { "epoch": 2.2946804439341753, "grad_norm": 0.5561738610267639, "learning_rate": 1.180831772611862e-05, "loss": 0.606, "step": 5996 }, { "epoch": 2.2950631458094146, "grad_norm": 0.526869535446167, "learning_rate": 1.1805879573454605e-05, "loss": 0.7086, "step": 5997 }, { "epoch": 2.2954458476846535, "grad_norm": 0.5275442004203796, "learning_rate": 1.1803441309814433e-05, "loss": 0.6511, "step": 5998 }, { "epoch": 2.295828549559893, "grad_norm": 0.5423957705497742, "learning_rate": 1.1801002935347951e-05, "loss": 0.6353, "step": 5999 }, { "epoch": 2.296211251435132, "grad_norm": 0.549455463886261, "learning_rate": 1.1798564450205002e-05, "loss": 0.6819, "step": 6000 }, { "epoch": 2.296593953310371, "grad_norm": 0.514480471611023, "learning_rate": 1.1796125854535432e-05, "loss": 0.6101, "step": 6001 }, { "epoch": 2.2969766551856106, "grad_norm": 0.5534989833831787, "learning_rate": 1.1793687148489109e-05, "loss": 0.6973, "step": 6002 }, { "epoch": 2.2973593570608495, "grad_norm": 0.5817016363143921, "learning_rate": 1.1791248332215886e-05, "loss": 0.7467, "step": 6003 }, { "epoch": 2.297742058936089, "grad_norm": 0.554385244846344, "learning_rate": 1.1788809405865643e-05, "loss": 0.6378, "step": 6004 }, { "epoch": 2.298124760811328, "grad_norm": 0.5426996350288391, "learning_rate": 1.1786370369588257e-05, "loss": 0.6407, "step": 6005 }, { "epoch": 2.298507462686567, "grad_norm": 0.49688777327537537, "learning_rate": 1.1783931223533608e-05, "loss": 0.5782, "step": 6006 }, { "epoch": 2.2988901645618065, "grad_norm": 0.5362728834152222, "learning_rate": 1.17814919678516e-05, "loss": 0.5873, "step": 6007 }, { "epoch": 2.2992728664370454, "grad_norm": 0.5402117967605591, "learning_rate": 1.177905260269212e-05, "loss": 0.6067, "step": 6008 }, { "epoch": 2.299655568312285, "grad_norm": 0.5508279800415039, "learning_rate": 1.1776613128205081e-05, "loss": 0.6658, "step": 6009 }, { "epoch": 2.3000382701875237, "grad_norm": 0.5098780989646912, "learning_rate": 1.177417354454039e-05, "loss": 0.5828, "step": 6010 }, { "epoch": 2.300420972062763, "grad_norm": 0.5129390358924866, "learning_rate": 1.1771733851847968e-05, "loss": 0.6064, "step": 6011 }, { "epoch": 2.3008036739380024, "grad_norm": 0.5744432210922241, "learning_rate": 1.1769294050277744e-05, "loss": 0.6568, "step": 6012 }, { "epoch": 2.3011863758132414, "grad_norm": 0.5411107540130615, "learning_rate": 1.1766854139979647e-05, "loss": 0.6771, "step": 6013 }, { "epoch": 2.3015690776884807, "grad_norm": 0.6091376543045044, "learning_rate": 1.1764414121103613e-05, "loss": 0.6861, "step": 6014 }, { "epoch": 2.30195177956372, "grad_norm": 0.560905396938324, "learning_rate": 1.1761973993799594e-05, "loss": 0.665, "step": 6015 }, { "epoch": 2.302334481438959, "grad_norm": 0.5785747766494751, "learning_rate": 1.1759533758217538e-05, "loss": 0.596, "step": 6016 }, { "epoch": 2.3027171833141984, "grad_norm": 0.4995601177215576, "learning_rate": 1.1757093414507408e-05, "loss": 0.6615, "step": 6017 }, { "epoch": 2.3030998851894373, "grad_norm": 0.5828412175178528, "learning_rate": 1.1754652962819166e-05, "loss": 0.674, "step": 6018 }, { "epoch": 2.3034825870646767, "grad_norm": 0.5887455344200134, "learning_rate": 1.1752212403302785e-05, "loss": 0.5892, "step": 6019 }, { "epoch": 2.3038652889399156, "grad_norm": 0.5220962166786194, "learning_rate": 1.1749771736108245e-05, "loss": 0.6245, "step": 6020 }, { "epoch": 2.304247990815155, "grad_norm": 0.5255992412567139, "learning_rate": 1.174733096138553e-05, "loss": 0.6557, "step": 6021 }, { "epoch": 2.3046306926903943, "grad_norm": 0.49327877163887024, "learning_rate": 1.1744890079284635e-05, "loss": 0.647, "step": 6022 }, { "epoch": 2.3050133945656333, "grad_norm": 0.5456172227859497, "learning_rate": 1.1742449089955557e-05, "loss": 0.7336, "step": 6023 }, { "epoch": 2.3053960964408726, "grad_norm": 0.5264549851417542, "learning_rate": 1.1740007993548299e-05, "loss": 0.6391, "step": 6024 }, { "epoch": 2.305778798316112, "grad_norm": 0.5251181125640869, "learning_rate": 1.1737566790212877e-05, "loss": 0.6444, "step": 6025 }, { "epoch": 2.306161500191351, "grad_norm": 0.6051468253135681, "learning_rate": 1.173512548009931e-05, "loss": 0.6879, "step": 6026 }, { "epoch": 2.3065442020665903, "grad_norm": 0.5753169655799866, "learning_rate": 1.1732684063357615e-05, "loss": 0.5931, "step": 6027 }, { "epoch": 2.306926903941829, "grad_norm": 0.535622775554657, "learning_rate": 1.1730242540137835e-05, "loss": 0.7142, "step": 6028 }, { "epoch": 2.3073096058170686, "grad_norm": 0.5331630110740662, "learning_rate": 1.172780091059e-05, "loss": 0.6094, "step": 6029 }, { "epoch": 2.3076923076923075, "grad_norm": 0.5363110899925232, "learning_rate": 1.1725359174864157e-05, "loss": 0.5958, "step": 6030 }, { "epoch": 2.308075009567547, "grad_norm": 0.558521032333374, "learning_rate": 1.172291733311036e-05, "loss": 0.6066, "step": 6031 }, { "epoch": 2.308457711442786, "grad_norm": 0.5622885823249817, "learning_rate": 1.1720475385478662e-05, "loss": 0.6282, "step": 6032 }, { "epoch": 2.308840413318025, "grad_norm": 0.5895817875862122, "learning_rate": 1.1718033332119132e-05, "loss": 0.6254, "step": 6033 }, { "epoch": 2.3092231151932645, "grad_norm": 0.6127783060073853, "learning_rate": 1.1715591173181834e-05, "loss": 0.5928, "step": 6034 }, { "epoch": 2.309605817068504, "grad_norm": 0.5166054368019104, "learning_rate": 1.1713148908816851e-05, "loss": 0.611, "step": 6035 }, { "epoch": 2.309988518943743, "grad_norm": 0.6037149429321289, "learning_rate": 1.1710706539174268e-05, "loss": 0.5736, "step": 6036 }, { "epoch": 2.310371220818982, "grad_norm": 0.531949520111084, "learning_rate": 1.170826406440417e-05, "loss": 0.5865, "step": 6037 }, { "epoch": 2.310753922694221, "grad_norm": 0.5499215126037598, "learning_rate": 1.1705821484656658e-05, "loss": 0.6414, "step": 6038 }, { "epoch": 2.3111366245694605, "grad_norm": 0.524013876914978, "learning_rate": 1.170337880008183e-05, "loss": 0.572, "step": 6039 }, { "epoch": 2.3115193264446994, "grad_norm": 0.5828987956047058, "learning_rate": 1.17009360108298e-05, "loss": 0.6564, "step": 6040 }, { "epoch": 2.3119020283199387, "grad_norm": 0.5078042149543762, "learning_rate": 1.1698493117050684e-05, "loss": 0.575, "step": 6041 }, { "epoch": 2.312284730195178, "grad_norm": 0.48514828085899353, "learning_rate": 1.1696050118894603e-05, "loss": 0.6214, "step": 6042 }, { "epoch": 2.312667432070417, "grad_norm": 0.6146171689033508, "learning_rate": 1.1693607016511683e-05, "loss": 0.6355, "step": 6043 }, { "epoch": 2.3130501339456564, "grad_norm": 0.5372426509857178, "learning_rate": 1.1691163810052064e-05, "loss": 0.6727, "step": 6044 }, { "epoch": 2.3134328358208958, "grad_norm": 0.5434811115264893, "learning_rate": 1.1688720499665884e-05, "loss": 0.6396, "step": 6045 }, { "epoch": 2.3138155376961347, "grad_norm": 0.529111921787262, "learning_rate": 1.1686277085503294e-05, "loss": 0.5727, "step": 6046 }, { "epoch": 2.314198239571374, "grad_norm": 0.5125353932380676, "learning_rate": 1.168383356771445e-05, "loss": 0.5736, "step": 6047 }, { "epoch": 2.314580941446613, "grad_norm": 0.5525092482566833, "learning_rate": 1.1681389946449504e-05, "loss": 0.672, "step": 6048 }, { "epoch": 2.3149636433218523, "grad_norm": 0.5598242878913879, "learning_rate": 1.1678946221858632e-05, "loss": 0.6384, "step": 6049 }, { "epoch": 2.3153463451970913, "grad_norm": 0.5219502449035645, "learning_rate": 1.1676502394092002e-05, "loss": 0.6992, "step": 6050 }, { "epoch": 2.3157290470723306, "grad_norm": 0.6052695512771606, "learning_rate": 1.1674058463299798e-05, "loss": 0.5907, "step": 6051 }, { "epoch": 2.31611174894757, "grad_norm": 0.5607175827026367, "learning_rate": 1.1671614429632206e-05, "loss": 0.6478, "step": 6052 }, { "epoch": 2.316494450822809, "grad_norm": 0.538137674331665, "learning_rate": 1.1669170293239411e-05, "loss": 0.6916, "step": 6053 }, { "epoch": 2.3168771526980483, "grad_norm": 0.5720471739768982, "learning_rate": 1.166672605427162e-05, "loss": 0.6736, "step": 6054 }, { "epoch": 2.3172598545732876, "grad_norm": 0.5155527591705322, "learning_rate": 1.1664281712879033e-05, "loss": 0.5958, "step": 6055 }, { "epoch": 2.3176425564485266, "grad_norm": 0.5080094337463379, "learning_rate": 1.1661837269211869e-05, "loss": 0.5907, "step": 6056 }, { "epoch": 2.318025258323766, "grad_norm": 0.5620556473731995, "learning_rate": 1.1659392723420337e-05, "loss": 0.6978, "step": 6057 }, { "epoch": 2.318407960199005, "grad_norm": 0.5707299113273621, "learning_rate": 1.1656948075654667e-05, "loss": 0.6374, "step": 6058 }, { "epoch": 2.3187906620742442, "grad_norm": 0.5829529762268066, "learning_rate": 1.1654503326065084e-05, "loss": 0.6406, "step": 6059 }, { "epoch": 2.319173363949483, "grad_norm": 0.5352677702903748, "learning_rate": 1.1652058474801827e-05, "loss": 0.6153, "step": 6060 }, { "epoch": 2.3195560658247225, "grad_norm": 0.5100828409194946, "learning_rate": 1.1649613522015135e-05, "loss": 0.6083, "step": 6061 }, { "epoch": 2.319938767699962, "grad_norm": 0.531194806098938, "learning_rate": 1.1647168467855265e-05, "loss": 0.6885, "step": 6062 }, { "epoch": 2.320321469575201, "grad_norm": 0.5764731168746948, "learning_rate": 1.1644723312472465e-05, "loss": 0.7333, "step": 6063 }, { "epoch": 2.32070417145044, "grad_norm": 0.550101101398468, "learning_rate": 1.1642278056017003e-05, "loss": 0.6324, "step": 6064 }, { "epoch": 2.3210868733256795, "grad_norm": 0.5280738472938538, "learning_rate": 1.1639832698639137e-05, "loss": 0.5973, "step": 6065 }, { "epoch": 2.3214695752009185, "grad_norm": 0.5487499237060547, "learning_rate": 1.1637387240489148e-05, "loss": 0.5989, "step": 6066 }, { "epoch": 2.321852277076158, "grad_norm": 0.5122355222702026, "learning_rate": 1.1634941681717318e-05, "loss": 0.5895, "step": 6067 }, { "epoch": 2.3222349789513967, "grad_norm": 0.5622780919075012, "learning_rate": 1.1632496022473925e-05, "loss": 0.6142, "step": 6068 }, { "epoch": 2.322617680826636, "grad_norm": 0.5732781887054443, "learning_rate": 1.163005026290927e-05, "loss": 0.6309, "step": 6069 }, { "epoch": 2.323000382701875, "grad_norm": 0.5405258536338806, "learning_rate": 1.1627604403173642e-05, "loss": 0.5771, "step": 6070 }, { "epoch": 2.3233830845771144, "grad_norm": 0.5627906918525696, "learning_rate": 1.1625158443417352e-05, "loss": 0.6719, "step": 6071 }, { "epoch": 2.3237657864523538, "grad_norm": 0.5584981441497803, "learning_rate": 1.1622712383790712e-05, "loss": 0.5732, "step": 6072 }, { "epoch": 2.3241484883275927, "grad_norm": 0.5118237137794495, "learning_rate": 1.1620266224444039e-05, "loss": 0.4711, "step": 6073 }, { "epoch": 2.324531190202832, "grad_norm": 0.5072062611579895, "learning_rate": 1.161781996552765e-05, "loss": 0.6407, "step": 6074 }, { "epoch": 2.3249138920780714, "grad_norm": 0.5605751276016235, "learning_rate": 1.1615373607191877e-05, "loss": 0.6353, "step": 6075 }, { "epoch": 2.3252965939533103, "grad_norm": 0.5387629866600037, "learning_rate": 1.1612927149587056e-05, "loss": 0.6537, "step": 6076 }, { "epoch": 2.3256792958285497, "grad_norm": 0.5763370394706726, "learning_rate": 1.161048059286353e-05, "loss": 0.6964, "step": 6077 }, { "epoch": 2.3260619977037886, "grad_norm": 0.5834887027740479, "learning_rate": 1.1608033937171647e-05, "loss": 0.7126, "step": 6078 }, { "epoch": 2.326444699579028, "grad_norm": 0.521793782711029, "learning_rate": 1.1605587182661758e-05, "loss": 0.674, "step": 6079 }, { "epoch": 2.326827401454267, "grad_norm": 0.5661998391151428, "learning_rate": 1.1603140329484219e-05, "loss": 0.6997, "step": 6080 }, { "epoch": 2.3272101033295063, "grad_norm": 0.5165109038352966, "learning_rate": 1.1600693377789403e-05, "loss": 0.6388, "step": 6081 }, { "epoch": 2.3275928052047457, "grad_norm": 0.5849822163581848, "learning_rate": 1.159824632772768e-05, "loss": 0.6449, "step": 6082 }, { "epoch": 2.3279755070799846, "grad_norm": 0.6048030853271484, "learning_rate": 1.1595799179449424e-05, "loss": 0.7188, "step": 6083 }, { "epoch": 2.328358208955224, "grad_norm": 0.4953518509864807, "learning_rate": 1.1593351933105024e-05, "loss": 0.5675, "step": 6084 }, { "epoch": 2.3287409108304633, "grad_norm": 0.559121310710907, "learning_rate": 1.1590904588844864e-05, "loss": 0.6703, "step": 6085 }, { "epoch": 2.3291236127057022, "grad_norm": 0.5059764981269836, "learning_rate": 1.1588457146819345e-05, "loss": 0.6533, "step": 6086 }, { "epoch": 2.3295063145809416, "grad_norm": 0.5212219953536987, "learning_rate": 1.1586009607178865e-05, "loss": 0.6424, "step": 6087 }, { "epoch": 2.3298890164561805, "grad_norm": 0.5662047266960144, "learning_rate": 1.1583561970073838e-05, "loss": 0.6682, "step": 6088 }, { "epoch": 2.33027171833142, "grad_norm": 0.49949175119400024, "learning_rate": 1.158111423565467e-05, "loss": 0.6078, "step": 6089 }, { "epoch": 2.330654420206659, "grad_norm": 0.5233070254325867, "learning_rate": 1.1578666404071784e-05, "loss": 0.6278, "step": 6090 }, { "epoch": 2.331037122081898, "grad_norm": 0.5336652398109436, "learning_rate": 1.157621847547561e-05, "loss": 0.5659, "step": 6091 }, { "epoch": 2.3314198239571375, "grad_norm": 0.5561338067054749, "learning_rate": 1.1573770450016572e-05, "loss": 0.6965, "step": 6092 }, { "epoch": 2.3318025258323765, "grad_norm": 0.5190878510475159, "learning_rate": 1.1571322327845116e-05, "loss": 0.5979, "step": 6093 }, { "epoch": 2.332185227707616, "grad_norm": 0.5492413640022278, "learning_rate": 1.1568874109111678e-05, "loss": 0.6645, "step": 6094 }, { "epoch": 2.332567929582855, "grad_norm": 0.5243427753448486, "learning_rate": 1.1566425793966714e-05, "loss": 0.6393, "step": 6095 }, { "epoch": 2.332950631458094, "grad_norm": 0.5705350041389465, "learning_rate": 1.1563977382560676e-05, "loss": 0.6767, "step": 6096 }, { "epoch": 2.3333333333333335, "grad_norm": 0.5072084665298462, "learning_rate": 1.1561528875044026e-05, "loss": 0.657, "step": 6097 }, { "epoch": 2.3337160352085724, "grad_norm": 0.537065327167511, "learning_rate": 1.1559080271567233e-05, "loss": 0.576, "step": 6098 }, { "epoch": 2.3340987370838118, "grad_norm": 0.5070728063583374, "learning_rate": 1.1556631572280765e-05, "loss": 0.5955, "step": 6099 }, { "epoch": 2.3344814389590507, "grad_norm": 0.8743232488632202, "learning_rate": 1.1554182777335108e-05, "loss": 0.6601, "step": 6100 }, { "epoch": 2.33486414083429, "grad_norm": 0.5699942111968994, "learning_rate": 1.1551733886880745e-05, "loss": 0.601, "step": 6101 }, { "epoch": 2.3352468427095294, "grad_norm": 0.546633780002594, "learning_rate": 1.1549284901068163e-05, "loss": 0.5886, "step": 6102 }, { "epoch": 2.3356295445847683, "grad_norm": 0.5287055969238281, "learning_rate": 1.1546835820047863e-05, "loss": 0.5449, "step": 6103 }, { "epoch": 2.3360122464600077, "grad_norm": 0.5245034694671631, "learning_rate": 1.1544386643970346e-05, "loss": 0.5203, "step": 6104 }, { "epoch": 2.336394948335247, "grad_norm": 0.5412701964378357, "learning_rate": 1.154193737298612e-05, "loss": 0.6422, "step": 6105 }, { "epoch": 2.336777650210486, "grad_norm": 0.5061604976654053, "learning_rate": 1.1539488007245704e-05, "loss": 0.6405, "step": 6106 }, { "epoch": 2.3371603520857254, "grad_norm": 0.567762553691864, "learning_rate": 1.153703854689961e-05, "loss": 0.5731, "step": 6107 }, { "epoch": 2.3375430539609643, "grad_norm": 0.5155295729637146, "learning_rate": 1.153458899209837e-05, "loss": 0.6362, "step": 6108 }, { "epoch": 2.3379257558362037, "grad_norm": 0.6122921705245972, "learning_rate": 1.1532139342992515e-05, "loss": 0.6505, "step": 6109 }, { "epoch": 2.3383084577114426, "grad_norm": 0.5202809572219849, "learning_rate": 1.1529689599732577e-05, "loss": 0.5676, "step": 6110 }, { "epoch": 2.338691159586682, "grad_norm": 0.5440065860748291, "learning_rate": 1.1527239762469107e-05, "loss": 0.6962, "step": 6111 }, { "epoch": 2.3390738614619213, "grad_norm": 0.5323384404182434, "learning_rate": 1.152478983135265e-05, "loss": 0.6794, "step": 6112 }, { "epoch": 2.3394565633371602, "grad_norm": 0.5637457370758057, "learning_rate": 1.152233980653376e-05, "loss": 0.5907, "step": 6113 }, { "epoch": 2.3398392652123996, "grad_norm": 0.48855459690093994, "learning_rate": 1.1519889688163002e-05, "loss": 0.6018, "step": 6114 }, { "epoch": 2.340221967087639, "grad_norm": 0.49326249957084656, "learning_rate": 1.1517439476390934e-05, "loss": 0.6143, "step": 6115 }, { "epoch": 2.340604668962878, "grad_norm": 0.5898305177688599, "learning_rate": 1.151498917136814e-05, "loss": 0.717, "step": 6116 }, { "epoch": 2.3409873708381173, "grad_norm": 0.5110900402069092, "learning_rate": 1.151253877324519e-05, "loss": 0.5879, "step": 6117 }, { "epoch": 2.341370072713356, "grad_norm": 0.5574154853820801, "learning_rate": 1.1510088282172667e-05, "loss": 0.7025, "step": 6118 }, { "epoch": 2.3417527745885955, "grad_norm": 0.5243968367576599, "learning_rate": 1.1507637698301164e-05, "loss": 0.6638, "step": 6119 }, { "epoch": 2.3421354764638345, "grad_norm": 0.516819179058075, "learning_rate": 1.150518702178127e-05, "loss": 0.5624, "step": 6120 }, { "epoch": 2.342518178339074, "grad_norm": 0.5477219820022583, "learning_rate": 1.1502736252763596e-05, "loss": 0.5923, "step": 6121 }, { "epoch": 2.342900880214313, "grad_norm": 0.5879136919975281, "learning_rate": 1.150028539139874e-05, "loss": 0.681, "step": 6122 }, { "epoch": 2.343283582089552, "grad_norm": 0.5473641157150269, "learning_rate": 1.1497834437837314e-05, "loss": 0.6705, "step": 6123 }, { "epoch": 2.3436662839647915, "grad_norm": 0.5768722295761108, "learning_rate": 1.1495383392229941e-05, "loss": 0.6372, "step": 6124 }, { "epoch": 2.344048985840031, "grad_norm": 0.6227751970291138, "learning_rate": 1.1492932254727238e-05, "loss": 0.635, "step": 6125 }, { "epoch": 2.3444316877152698, "grad_norm": 0.5662115216255188, "learning_rate": 1.1490481025479838e-05, "loss": 0.5541, "step": 6126 }, { "epoch": 2.344814389590509, "grad_norm": 2.750840663909912, "learning_rate": 1.1488029704638378e-05, "loss": 0.6403, "step": 6127 }, { "epoch": 2.345197091465748, "grad_norm": 0.5794787406921387, "learning_rate": 1.1485578292353492e-05, "loss": 0.6203, "step": 6128 }, { "epoch": 2.3455797933409874, "grad_norm": 0.6063698530197144, "learning_rate": 1.1483126788775828e-05, "loss": 0.7093, "step": 6129 }, { "epoch": 2.3459624952162264, "grad_norm": 0.5214340686798096, "learning_rate": 1.1480675194056041e-05, "loss": 0.6042, "step": 6130 }, { "epoch": 2.3463451970914657, "grad_norm": 0.5443503260612488, "learning_rate": 1.1478223508344783e-05, "loss": 0.6235, "step": 6131 }, { "epoch": 2.346727898966705, "grad_norm": 0.5730525851249695, "learning_rate": 1.1475771731792724e-05, "loss": 0.594, "step": 6132 }, { "epoch": 2.347110600841944, "grad_norm": 0.4979500472545624, "learning_rate": 1.1473319864550523e-05, "loss": 0.6255, "step": 6133 }, { "epoch": 2.3474933027171834, "grad_norm": 0.5337297320365906, "learning_rate": 1.147086790676886e-05, "loss": 0.6379, "step": 6134 }, { "epoch": 2.3478760045924227, "grad_norm": 0.5872949361801147, "learning_rate": 1.1468415858598413e-05, "loss": 0.6463, "step": 6135 }, { "epoch": 2.3482587064676617, "grad_norm": 0.7077239155769348, "learning_rate": 1.1465963720189864e-05, "loss": 0.6649, "step": 6136 }, { "epoch": 2.348641408342901, "grad_norm": 0.5125137567520142, "learning_rate": 1.1463511491693908e-05, "loss": 0.6232, "step": 6137 }, { "epoch": 2.34902411021814, "grad_norm": 0.5115095973014832, "learning_rate": 1.146105917326124e-05, "loss": 0.7111, "step": 6138 }, { "epoch": 2.3494068120933793, "grad_norm": 0.5307428240776062, "learning_rate": 1.1458606765042559e-05, "loss": 0.6332, "step": 6139 }, { "epoch": 2.3497895139686182, "grad_norm": 0.5291560292243958, "learning_rate": 1.1456154267188572e-05, "loss": 0.6134, "step": 6140 }, { "epoch": 2.3501722158438576, "grad_norm": 0.5387807488441467, "learning_rate": 1.1453701679849995e-05, "loss": 0.6457, "step": 6141 }, { "epoch": 2.350554917719097, "grad_norm": 0.515809953212738, "learning_rate": 1.1451249003177546e-05, "loss": 0.6194, "step": 6142 }, { "epoch": 2.350937619594336, "grad_norm": 0.5436244010925293, "learning_rate": 1.1448796237321949e-05, "loss": 0.6773, "step": 6143 }, { "epoch": 2.3513203214695753, "grad_norm": 0.6055291295051575, "learning_rate": 1.1446343382433929e-05, "loss": 0.6702, "step": 6144 }, { "epoch": 2.3517030233448146, "grad_norm": 0.5061083436012268, "learning_rate": 1.144389043866422e-05, "loss": 0.5535, "step": 6145 }, { "epoch": 2.3520857252200535, "grad_norm": 0.5208227038383484, "learning_rate": 1.1441437406163565e-05, "loss": 0.5771, "step": 6146 }, { "epoch": 2.352468427095293, "grad_norm": 0.538378119468689, "learning_rate": 1.143898428508271e-05, "loss": 0.6036, "step": 6147 }, { "epoch": 2.352851128970532, "grad_norm": 0.507703959941864, "learning_rate": 1.1436531075572408e-05, "loss": 0.6822, "step": 6148 }, { "epoch": 2.353233830845771, "grad_norm": 0.5313612818717957, "learning_rate": 1.1434077777783408e-05, "loss": 0.6764, "step": 6149 }, { "epoch": 2.35361653272101, "grad_norm": 0.5129598379135132, "learning_rate": 1.1431624391866476e-05, "loss": 0.682, "step": 6150 }, { "epoch": 2.3539992345962495, "grad_norm": 0.5234012603759766, "learning_rate": 1.142917091797238e-05, "loss": 0.6866, "step": 6151 }, { "epoch": 2.354381936471489, "grad_norm": 0.5059363842010498, "learning_rate": 1.1426717356251893e-05, "loss": 0.6913, "step": 6152 }, { "epoch": 2.354764638346728, "grad_norm": 0.557253360748291, "learning_rate": 1.1424263706855792e-05, "loss": 0.5638, "step": 6153 }, { "epoch": 2.355147340221967, "grad_norm": 0.5316115617752075, "learning_rate": 1.1421809969934859e-05, "loss": 0.6511, "step": 6154 }, { "epoch": 2.3555300420972065, "grad_norm": 0.5657581686973572, "learning_rate": 1.1419356145639879e-05, "loss": 0.6837, "step": 6155 }, { "epoch": 2.3559127439724454, "grad_norm": 0.5346672534942627, "learning_rate": 1.1416902234121658e-05, "loss": 0.5623, "step": 6156 }, { "epoch": 2.356295445847685, "grad_norm": 0.5178198218345642, "learning_rate": 1.1414448235530984e-05, "loss": 0.7069, "step": 6157 }, { "epoch": 2.3566781477229237, "grad_norm": 0.4903124272823334, "learning_rate": 1.1411994150018668e-05, "loss": 0.5944, "step": 6158 }, { "epoch": 2.357060849598163, "grad_norm": 0.5168110728263855, "learning_rate": 1.140953997773552e-05, "loss": 0.5809, "step": 6159 }, { "epoch": 2.357443551473402, "grad_norm": 0.5366156101226807, "learning_rate": 1.1407085718832348e-05, "loss": 0.6629, "step": 6160 }, { "epoch": 2.3578262533486414, "grad_norm": 0.4988226592540741, "learning_rate": 1.1404631373459981e-05, "loss": 0.6648, "step": 6161 }, { "epoch": 2.3582089552238807, "grad_norm": 0.5630804300308228, "learning_rate": 1.1402176941769242e-05, "loss": 0.704, "step": 6162 }, { "epoch": 2.3585916570991197, "grad_norm": 0.5694405436515808, "learning_rate": 1.1399722423910966e-05, "loss": 0.7082, "step": 6163 }, { "epoch": 2.358974358974359, "grad_norm": 0.5914484858512878, "learning_rate": 1.1397267820035986e-05, "loss": 0.5219, "step": 6164 }, { "epoch": 2.3593570608495984, "grad_norm": 0.4895511269569397, "learning_rate": 1.139481313029514e-05, "loss": 0.6203, "step": 6165 }, { "epoch": 2.3597397627248373, "grad_norm": 0.5819926261901855, "learning_rate": 1.1392358354839286e-05, "loss": 0.5503, "step": 6166 }, { "epoch": 2.3601224646000767, "grad_norm": 0.5537598729133606, "learning_rate": 1.1389903493819265e-05, "loss": 0.6276, "step": 6167 }, { "epoch": 2.3605051664753156, "grad_norm": 0.5126705765724182, "learning_rate": 1.1387448547385947e-05, "loss": 0.6032, "step": 6168 }, { "epoch": 2.360887868350555, "grad_norm": 0.564017117023468, "learning_rate": 1.1384993515690183e-05, "loss": 0.6724, "step": 6169 }, { "epoch": 2.361270570225794, "grad_norm": 0.5728877782821655, "learning_rate": 1.1382538398882847e-05, "loss": 0.5982, "step": 6170 }, { "epoch": 2.3616532721010333, "grad_norm": 0.616949737071991, "learning_rate": 1.1380083197114816e-05, "loss": 0.6264, "step": 6171 }, { "epoch": 2.3620359739762726, "grad_norm": 0.5184482932090759, "learning_rate": 1.1377627910536964e-05, "loss": 0.6541, "step": 6172 }, { "epoch": 2.3624186758515116, "grad_norm": 0.5035645961761475, "learning_rate": 1.1375172539300173e-05, "loss": 0.6082, "step": 6173 }, { "epoch": 2.362801377726751, "grad_norm": 0.5572062730789185, "learning_rate": 1.1372717083555338e-05, "loss": 0.6903, "step": 6174 }, { "epoch": 2.3631840796019903, "grad_norm": 0.5972555875778198, "learning_rate": 1.1370261543453347e-05, "loss": 0.7221, "step": 6175 }, { "epoch": 2.363566781477229, "grad_norm": 0.602232813835144, "learning_rate": 1.1367805919145106e-05, "loss": 0.6606, "step": 6176 }, { "epoch": 2.3639494833524686, "grad_norm": 0.5414847731590271, "learning_rate": 1.1365350210781516e-05, "loss": 0.6375, "step": 6177 }, { "epoch": 2.3643321852277075, "grad_norm": 0.5370460152626038, "learning_rate": 1.1362894418513487e-05, "loss": 0.5779, "step": 6178 }, { "epoch": 2.364714887102947, "grad_norm": 0.5910100340843201, "learning_rate": 1.1360438542491934e-05, "loss": 0.6659, "step": 6179 }, { "epoch": 2.365097588978186, "grad_norm": 0.5042476058006287, "learning_rate": 1.1357982582867777e-05, "loss": 0.6756, "step": 6180 }, { "epoch": 2.365480290853425, "grad_norm": 0.5433396100997925, "learning_rate": 1.1355526539791943e-05, "loss": 0.6883, "step": 6181 }, { "epoch": 2.3658629927286645, "grad_norm": 0.5970562696456909, "learning_rate": 1.1353070413415363e-05, "loss": 0.6028, "step": 6182 }, { "epoch": 2.3662456946039034, "grad_norm": 0.5080413222312927, "learning_rate": 1.135061420388897e-05, "loss": 0.6067, "step": 6183 }, { "epoch": 2.366628396479143, "grad_norm": 0.5382479429244995, "learning_rate": 1.1348157911363709e-05, "loss": 0.7212, "step": 6184 }, { "epoch": 2.367011098354382, "grad_norm": 0.5597609281539917, "learning_rate": 1.1345701535990516e-05, "loss": 0.6309, "step": 6185 }, { "epoch": 2.367393800229621, "grad_norm": 0.5743227005004883, "learning_rate": 1.1343245077920353e-05, "loss": 0.6675, "step": 6186 }, { "epoch": 2.3677765021048605, "grad_norm": 0.4712161719799042, "learning_rate": 1.134078853730417e-05, "loss": 0.6398, "step": 6187 }, { "epoch": 2.3681592039800994, "grad_norm": 0.5156522989273071, "learning_rate": 1.1338331914292931e-05, "loss": 0.6281, "step": 6188 }, { "epoch": 2.3685419058553387, "grad_norm": 0.508332371711731, "learning_rate": 1.1335875209037598e-05, "loss": 0.6084, "step": 6189 }, { "epoch": 2.3689246077305777, "grad_norm": 0.5191062688827515, "learning_rate": 1.1333418421689144e-05, "loss": 0.6035, "step": 6190 }, { "epoch": 2.369307309605817, "grad_norm": 0.5955939292907715, "learning_rate": 1.1330961552398548e-05, "loss": 0.5596, "step": 6191 }, { "epoch": 2.3696900114810564, "grad_norm": 0.5514528155326843, "learning_rate": 1.132850460131679e-05, "loss": 0.5943, "step": 6192 }, { "epoch": 2.3700727133562953, "grad_norm": 0.5206118822097778, "learning_rate": 1.1326047568594852e-05, "loss": 0.6265, "step": 6193 }, { "epoch": 2.3704554152315347, "grad_norm": 0.5405824780464172, "learning_rate": 1.1323590454383732e-05, "loss": 0.6926, "step": 6194 }, { "epoch": 2.370838117106774, "grad_norm": 0.5350494384765625, "learning_rate": 1.132113325883442e-05, "loss": 0.615, "step": 6195 }, { "epoch": 2.371220818982013, "grad_norm": 0.5200092196464539, "learning_rate": 1.131867598209792e-05, "loss": 0.6511, "step": 6196 }, { "epoch": 2.3716035208572523, "grad_norm": 0.5778239965438843, "learning_rate": 1.1316218624325239e-05, "loss": 0.6508, "step": 6197 }, { "epoch": 2.3719862227324913, "grad_norm": 0.5276181101799011, "learning_rate": 1.1313761185667391e-05, "loss": 0.6653, "step": 6198 }, { "epoch": 2.3723689246077306, "grad_norm": 0.5829154849052429, "learning_rate": 1.1311303666275384e-05, "loss": 0.6778, "step": 6199 }, { "epoch": 2.3727516264829696, "grad_norm": 0.5423204302787781, "learning_rate": 1.1308846066300245e-05, "loss": 0.6435, "step": 6200 }, { "epoch": 2.373134328358209, "grad_norm": 0.48475414514541626, "learning_rate": 1.1306388385892998e-05, "loss": 0.5589, "step": 6201 }, { "epoch": 2.3735170302334483, "grad_norm": 0.5283805131912231, "learning_rate": 1.1303930625204677e-05, "loss": 0.6885, "step": 6202 }, { "epoch": 2.373899732108687, "grad_norm": 0.5377120971679688, "learning_rate": 1.1301472784386318e-05, "loss": 0.6227, "step": 6203 }, { "epoch": 2.3742824339839266, "grad_norm": 0.5368050932884216, "learning_rate": 1.1299014863588955e-05, "loss": 0.683, "step": 6204 }, { "epoch": 2.374665135859166, "grad_norm": 0.5170769095420837, "learning_rate": 1.129655686296364e-05, "loss": 0.6046, "step": 6205 }, { "epoch": 2.375047837734405, "grad_norm": 0.5464915633201599, "learning_rate": 1.1294098782661425e-05, "loss": 0.6761, "step": 6206 }, { "epoch": 2.3754305396096442, "grad_norm": 0.4777645170688629, "learning_rate": 1.1291640622833362e-05, "loss": 0.5608, "step": 6207 }, { "epoch": 2.375813241484883, "grad_norm": 0.5210279822349548, "learning_rate": 1.1289182383630515e-05, "loss": 0.6352, "step": 6208 }, { "epoch": 2.3761959433601225, "grad_norm": 0.5333783626556396, "learning_rate": 1.1286724065203945e-05, "loss": 0.6281, "step": 6209 }, { "epoch": 2.3765786452353614, "grad_norm": 0.5269767642021179, "learning_rate": 1.1284265667704723e-05, "loss": 0.5524, "step": 6210 }, { "epoch": 2.376961347110601, "grad_norm": 0.5302755832672119, "learning_rate": 1.128180719128393e-05, "loss": 0.5892, "step": 6211 }, { "epoch": 2.37734404898584, "grad_norm": 0.5180466771125793, "learning_rate": 1.1279348636092634e-05, "loss": 0.6545, "step": 6212 }, { "epoch": 2.377726750861079, "grad_norm": 0.5795738697052002, "learning_rate": 1.1276890002281935e-05, "loss": 0.6517, "step": 6213 }, { "epoch": 2.3781094527363185, "grad_norm": 0.5532697439193726, "learning_rate": 1.1274431290002912e-05, "loss": 0.642, "step": 6214 }, { "epoch": 2.378492154611558, "grad_norm": 0.513659656047821, "learning_rate": 1.127197249940666e-05, "loss": 0.6148, "step": 6215 }, { "epoch": 2.3788748564867968, "grad_norm": 0.5101425647735596, "learning_rate": 1.1269513630644286e-05, "loss": 0.608, "step": 6216 }, { "epoch": 2.379257558362036, "grad_norm": 0.530034601688385, "learning_rate": 1.1267054683866885e-05, "loss": 0.6989, "step": 6217 }, { "epoch": 2.379640260237275, "grad_norm": 0.565155029296875, "learning_rate": 1.1264595659225572e-05, "loss": 0.6048, "step": 6218 }, { "epoch": 2.3800229621125144, "grad_norm": 0.5231576561927795, "learning_rate": 1.1262136556871457e-05, "loss": 0.6511, "step": 6219 }, { "epoch": 2.3804056639877533, "grad_norm": 0.5599167346954346, "learning_rate": 1.1259677376955657e-05, "loss": 0.6598, "step": 6220 }, { "epoch": 2.3807883658629927, "grad_norm": 0.5112417340278625, "learning_rate": 1.1257218119629305e-05, "loss": 0.6468, "step": 6221 }, { "epoch": 2.381171067738232, "grad_norm": 0.48910099267959595, "learning_rate": 1.1254758785043516e-05, "loss": 0.6875, "step": 6222 }, { "epoch": 2.381553769613471, "grad_norm": 0.5682770609855652, "learning_rate": 1.1252299373349434e-05, "loss": 0.6825, "step": 6223 }, { "epoch": 2.3819364714887103, "grad_norm": 0.5452709197998047, "learning_rate": 1.1249839884698189e-05, "loss": 0.5243, "step": 6224 }, { "epoch": 2.3823191733639497, "grad_norm": 0.5276846289634705, "learning_rate": 1.1247380319240924e-05, "loss": 0.6312, "step": 6225 }, { "epoch": 2.3827018752391886, "grad_norm": 0.5691794157028198, "learning_rate": 1.1244920677128786e-05, "loss": 0.6416, "step": 6226 }, { "epoch": 2.383084577114428, "grad_norm": 0.5471081137657166, "learning_rate": 1.1242460958512929e-05, "loss": 0.6753, "step": 6227 }, { "epoch": 2.383467278989667, "grad_norm": 0.545671284198761, "learning_rate": 1.1240001163544512e-05, "loss": 0.6242, "step": 6228 }, { "epoch": 2.3838499808649063, "grad_norm": 0.5506085157394409, "learning_rate": 1.123754129237469e-05, "loss": 0.6633, "step": 6229 }, { "epoch": 2.384232682740145, "grad_norm": 0.5150423049926758, "learning_rate": 1.1235081345154628e-05, "loss": 0.6855, "step": 6230 }, { "epoch": 2.3846153846153846, "grad_norm": 0.5922170281410217, "learning_rate": 1.1232621322035502e-05, "loss": 0.6485, "step": 6231 }, { "epoch": 2.384998086490624, "grad_norm": 0.5337657928466797, "learning_rate": 1.1230161223168483e-05, "loss": 0.5033, "step": 6232 }, { "epoch": 2.385380788365863, "grad_norm": 0.5264856219291687, "learning_rate": 1.1227701048704754e-05, "loss": 0.6745, "step": 6233 }, { "epoch": 2.3857634902411022, "grad_norm": 0.49886906147003174, "learning_rate": 1.1225240798795498e-05, "loss": 0.6687, "step": 6234 }, { "epoch": 2.3861461921163416, "grad_norm": 0.5471172332763672, "learning_rate": 1.1222780473591902e-05, "loss": 0.5919, "step": 6235 }, { "epoch": 2.3865288939915805, "grad_norm": 0.5713621973991394, "learning_rate": 1.1220320073245156e-05, "loss": 0.6916, "step": 6236 }, { "epoch": 2.38691159586682, "grad_norm": 0.5500684976577759, "learning_rate": 1.121785959790647e-05, "loss": 0.6099, "step": 6237 }, { "epoch": 2.387294297742059, "grad_norm": 0.5332363247871399, "learning_rate": 1.1215399047727039e-05, "loss": 0.6881, "step": 6238 }, { "epoch": 2.387676999617298, "grad_norm": 0.48657500743865967, "learning_rate": 1.1212938422858071e-05, "loss": 0.6949, "step": 6239 }, { "epoch": 2.388059701492537, "grad_norm": 0.5289338827133179, "learning_rate": 1.1210477723450775e-05, "loss": 0.5837, "step": 6240 }, { "epoch": 2.3884424033677765, "grad_norm": 0.5469245314598083, "learning_rate": 1.1208016949656373e-05, "loss": 0.5822, "step": 6241 }, { "epoch": 2.388825105243016, "grad_norm": 0.513710618019104, "learning_rate": 1.1205556101626084e-05, "loss": 0.6615, "step": 6242 }, { "epoch": 2.3892078071182548, "grad_norm": 0.5259149074554443, "learning_rate": 1.1203095179511134e-05, "loss": 0.6459, "step": 6243 }, { "epoch": 2.389590508993494, "grad_norm": 0.5300850868225098, "learning_rate": 1.1200634183462754e-05, "loss": 0.5772, "step": 6244 }, { "epoch": 2.3899732108687335, "grad_norm": 0.5537452101707458, "learning_rate": 1.1198173113632179e-05, "loss": 0.6561, "step": 6245 }, { "epoch": 2.3903559127439724, "grad_norm": 0.6119998097419739, "learning_rate": 1.1195711970170643e-05, "loss": 0.6842, "step": 6246 }, { "epoch": 2.3907386146192118, "grad_norm": 0.5685131549835205, "learning_rate": 1.1193250753229398e-05, "loss": 0.6754, "step": 6247 }, { "epoch": 2.3911213164944507, "grad_norm": 0.5428422093391418, "learning_rate": 1.1190789462959686e-05, "loss": 0.6498, "step": 6248 }, { "epoch": 2.39150401836969, "grad_norm": 0.5245223045349121, "learning_rate": 1.1188328099512767e-05, "loss": 0.6787, "step": 6249 }, { "epoch": 2.391886720244929, "grad_norm": 0.5120739936828613, "learning_rate": 1.1185866663039887e-05, "loss": 0.6325, "step": 6250 }, { "epoch": 2.3922694221201684, "grad_norm": 0.5432265996932983, "learning_rate": 1.118340515369232e-05, "loss": 0.6498, "step": 6251 }, { "epoch": 2.3926521239954077, "grad_norm": 0.5741381645202637, "learning_rate": 1.1180943571621326e-05, "loss": 0.7302, "step": 6252 }, { "epoch": 2.3930348258706466, "grad_norm": 0.5175909996032715, "learning_rate": 1.1178481916978175e-05, "loss": 0.6645, "step": 6253 }, { "epoch": 2.393417527745886, "grad_norm": 0.5585779547691345, "learning_rate": 1.1176020189914147e-05, "loss": 0.6533, "step": 6254 }, { "epoch": 2.3938002296211254, "grad_norm": 0.5156822800636292, "learning_rate": 1.1173558390580517e-05, "loss": 0.6379, "step": 6255 }, { "epoch": 2.3941829314963643, "grad_norm": 0.6181535124778748, "learning_rate": 1.117109651912857e-05, "loss": 0.6568, "step": 6256 }, { "epoch": 2.3945656333716037, "grad_norm": 0.5307922959327698, "learning_rate": 1.11686345757096e-05, "loss": 0.6637, "step": 6257 }, { "epoch": 2.3949483352468426, "grad_norm": 0.5193773508071899, "learning_rate": 1.1166172560474894e-05, "loss": 0.6711, "step": 6258 }, { "epoch": 2.395331037122082, "grad_norm": 0.5214883089065552, "learning_rate": 1.116371047357575e-05, "loss": 0.585, "step": 6259 }, { "epoch": 2.395713738997321, "grad_norm": 0.5639206171035767, "learning_rate": 1.116124831516347e-05, "loss": 0.6782, "step": 6260 }, { "epoch": 2.3960964408725602, "grad_norm": 0.5791147947311401, "learning_rate": 1.1158786085389362e-05, "loss": 0.7183, "step": 6261 }, { "epoch": 2.3964791427477996, "grad_norm": 0.5783074498176575, "learning_rate": 1.1156323784404739e-05, "loss": 0.6536, "step": 6262 }, { "epoch": 2.3968618446230385, "grad_norm": 0.5790145993232727, "learning_rate": 1.115386141236091e-05, "loss": 0.6139, "step": 6263 }, { "epoch": 2.397244546498278, "grad_norm": 0.5127957463264465, "learning_rate": 1.1151398969409199e-05, "loss": 0.6648, "step": 6264 }, { "epoch": 2.3976272483735173, "grad_norm": 0.5852875709533691, "learning_rate": 1.1148936455700926e-05, "loss": 0.6614, "step": 6265 }, { "epoch": 2.398009950248756, "grad_norm": 0.5793094038963318, "learning_rate": 1.1146473871387422e-05, "loss": 0.6577, "step": 6266 }, { "epoch": 2.3983926521239955, "grad_norm": 0.4915509819984436, "learning_rate": 1.1144011216620022e-05, "loss": 0.5533, "step": 6267 }, { "epoch": 2.3987753539992345, "grad_norm": 0.5559350252151489, "learning_rate": 1.1141548491550058e-05, "loss": 0.6668, "step": 6268 }, { "epoch": 2.399158055874474, "grad_norm": 0.5339529514312744, "learning_rate": 1.1139085696328871e-05, "loss": 0.5606, "step": 6269 }, { "epoch": 2.3995407577497128, "grad_norm": 0.4746333658695221, "learning_rate": 1.1136622831107806e-05, "loss": 0.5675, "step": 6270 }, { "epoch": 2.399923459624952, "grad_norm": 0.5152091383934021, "learning_rate": 1.1134159896038217e-05, "loss": 0.6408, "step": 6271 }, { "epoch": 2.4003061615001915, "grad_norm": 0.4994767904281616, "learning_rate": 1.1131696891271455e-05, "loss": 0.615, "step": 6272 }, { "epoch": 2.4006888633754304, "grad_norm": 0.5405814051628113, "learning_rate": 1.112923381695888e-05, "loss": 0.7031, "step": 6273 }, { "epoch": 2.40107156525067, "grad_norm": 0.5903508067131042, "learning_rate": 1.112677067325185e-05, "loss": 0.7017, "step": 6274 }, { "epoch": 2.401454267125909, "grad_norm": 0.551230788230896, "learning_rate": 1.1124307460301738e-05, "loss": 0.7137, "step": 6275 }, { "epoch": 2.401836969001148, "grad_norm": 0.5217422246932983, "learning_rate": 1.1121844178259911e-05, "loss": 0.6552, "step": 6276 }, { "epoch": 2.4022196708763874, "grad_norm": 0.5302578210830688, "learning_rate": 1.1119380827277744e-05, "loss": 0.5913, "step": 6277 }, { "epoch": 2.4026023727516264, "grad_norm": 0.5128597617149353, "learning_rate": 1.111691740750662e-05, "loss": 0.6081, "step": 6278 }, { "epoch": 2.4029850746268657, "grad_norm": 0.537866473197937, "learning_rate": 1.111445391909792e-05, "loss": 0.638, "step": 6279 }, { "epoch": 2.4033677765021046, "grad_norm": 0.5546033978462219, "learning_rate": 1.1111990362203034e-05, "loss": 0.6401, "step": 6280 }, { "epoch": 2.403750478377344, "grad_norm": 0.6853591799736023, "learning_rate": 1.110952673697335e-05, "loss": 0.6978, "step": 6281 }, { "epoch": 2.4041331802525834, "grad_norm": 0.5176724195480347, "learning_rate": 1.1107063043560268e-05, "loss": 0.5635, "step": 6282 }, { "epoch": 2.4045158821278223, "grad_norm": 0.5591959357261658, "learning_rate": 1.1104599282115191e-05, "loss": 0.6061, "step": 6283 }, { "epoch": 2.4048985840030617, "grad_norm": 0.5322481393814087, "learning_rate": 1.110213545278952e-05, "loss": 0.6503, "step": 6284 }, { "epoch": 2.405281285878301, "grad_norm": 0.5243991017341614, "learning_rate": 1.1099671555734663e-05, "loss": 0.6324, "step": 6285 }, { "epoch": 2.40566398775354, "grad_norm": 0.5390865206718445, "learning_rate": 1.1097207591102032e-05, "loss": 0.5675, "step": 6286 }, { "epoch": 2.4060466896287793, "grad_norm": 0.5327250361442566, "learning_rate": 1.1094743559043051e-05, "loss": 0.6012, "step": 6287 }, { "epoch": 2.4064293915040182, "grad_norm": 0.5536350607872009, "learning_rate": 1.1092279459709136e-05, "loss": 0.6008, "step": 6288 }, { "epoch": 2.4068120933792576, "grad_norm": 0.5110554695129395, "learning_rate": 1.1089815293251715e-05, "loss": 0.6664, "step": 6289 }, { "epoch": 2.4071947952544965, "grad_norm": 0.5575195550918579, "learning_rate": 1.1087351059822213e-05, "loss": 0.6091, "step": 6290 }, { "epoch": 2.407577497129736, "grad_norm": 0.5790491104125977, "learning_rate": 1.1084886759572069e-05, "loss": 0.6408, "step": 6291 }, { "epoch": 2.4079601990049753, "grad_norm": 0.5533971190452576, "learning_rate": 1.1082422392652718e-05, "loss": 0.6632, "step": 6292 }, { "epoch": 2.408342900880214, "grad_norm": 0.5178888440132141, "learning_rate": 1.1079957959215608e-05, "loss": 0.739, "step": 6293 }, { "epoch": 2.4087256027554536, "grad_norm": 0.5572832226753235, "learning_rate": 1.1077493459412176e-05, "loss": 0.7577, "step": 6294 }, { "epoch": 2.409108304630693, "grad_norm": 0.5157217979431152, "learning_rate": 1.1075028893393877e-05, "loss": 0.6343, "step": 6295 }, { "epoch": 2.409491006505932, "grad_norm": 0.6508147716522217, "learning_rate": 1.1072564261312164e-05, "loss": 0.6327, "step": 6296 }, { "epoch": 2.409873708381171, "grad_norm": 0.5566961765289307, "learning_rate": 1.1070099563318497e-05, "loss": 0.6071, "step": 6297 }, { "epoch": 2.41025641025641, "grad_norm": 0.5532688498497009, "learning_rate": 1.1067634799564335e-05, "loss": 0.6322, "step": 6298 }, { "epoch": 2.4106391121316495, "grad_norm": 0.524275004863739, "learning_rate": 1.1065169970201149e-05, "loss": 0.7362, "step": 6299 }, { "epoch": 2.4110218140068884, "grad_norm": 0.48412269353866577, "learning_rate": 1.1062705075380405e-05, "loss": 0.5902, "step": 6300 }, { "epoch": 2.411404515882128, "grad_norm": 0.545932948589325, "learning_rate": 1.1060240115253578e-05, "loss": 0.6503, "step": 6301 }, { "epoch": 2.411787217757367, "grad_norm": 0.49384021759033203, "learning_rate": 1.1057775089972149e-05, "loss": 0.6061, "step": 6302 }, { "epoch": 2.412169919632606, "grad_norm": 0.5041643977165222, "learning_rate": 1.1055309999687599e-05, "loss": 0.6209, "step": 6303 }, { "epoch": 2.4125526215078454, "grad_norm": 0.5210525989532471, "learning_rate": 1.1052844844551416e-05, "loss": 0.6378, "step": 6304 }, { "epoch": 2.412935323383085, "grad_norm": 0.5490976572036743, "learning_rate": 1.1050379624715087e-05, "loss": 0.7114, "step": 6305 }, { "epoch": 2.4133180252583237, "grad_norm": 0.5492632389068604, "learning_rate": 1.1047914340330106e-05, "loss": 0.6296, "step": 6306 }, { "epoch": 2.413700727133563, "grad_norm": 0.5051327347755432, "learning_rate": 1.1045448991547978e-05, "loss": 0.6283, "step": 6307 }, { "epoch": 2.414083429008802, "grad_norm": 0.566811740398407, "learning_rate": 1.1042983578520197e-05, "loss": 0.6085, "step": 6308 }, { "epoch": 2.4144661308840414, "grad_norm": 0.5337949395179749, "learning_rate": 1.1040518101398277e-05, "loss": 0.606, "step": 6309 }, { "epoch": 2.4148488327592803, "grad_norm": 0.5232440829277039, "learning_rate": 1.103805256033372e-05, "loss": 0.5466, "step": 6310 }, { "epoch": 2.4152315346345197, "grad_norm": 0.4972420930862427, "learning_rate": 1.1035586955478049e-05, "loss": 0.6027, "step": 6311 }, { "epoch": 2.415614236509759, "grad_norm": 0.5367382764816284, "learning_rate": 1.1033121286982776e-05, "loss": 0.6271, "step": 6312 }, { "epoch": 2.415996938384998, "grad_norm": 0.5488976836204529, "learning_rate": 1.103065555499942e-05, "loss": 0.6271, "step": 6313 }, { "epoch": 2.4163796402602373, "grad_norm": 0.4865328073501587, "learning_rate": 1.1028189759679517e-05, "loss": 0.6502, "step": 6314 }, { "epoch": 2.4167623421354767, "grad_norm": 0.5405508279800415, "learning_rate": 1.102572390117459e-05, "loss": 0.6343, "step": 6315 }, { "epoch": 2.4171450440107156, "grad_norm": 0.5553873181343079, "learning_rate": 1.1023257979636171e-05, "loss": 0.6423, "step": 6316 }, { "epoch": 2.417527745885955, "grad_norm": 0.5339360237121582, "learning_rate": 1.1020791995215803e-05, "loss": 0.6541, "step": 6317 }, { "epoch": 2.417910447761194, "grad_norm": 0.5398573875427246, "learning_rate": 1.1018325948065022e-05, "loss": 0.7435, "step": 6318 }, { "epoch": 2.4182931496364333, "grad_norm": 0.5508237481117249, "learning_rate": 1.101585983833538e-05, "loss": 0.6976, "step": 6319 }, { "epoch": 2.418675851511672, "grad_norm": 0.5709373354911804, "learning_rate": 1.1013393666178417e-05, "loss": 0.676, "step": 6320 }, { "epoch": 2.4190585533869116, "grad_norm": 0.6463512778282166, "learning_rate": 1.1010927431745692e-05, "loss": 0.5676, "step": 6321 }, { "epoch": 2.419441255262151, "grad_norm": 0.5152413845062256, "learning_rate": 1.1008461135188764e-05, "loss": 0.6086, "step": 6322 }, { "epoch": 2.41982395713739, "grad_norm": 0.5859897136688232, "learning_rate": 1.1005994776659188e-05, "loss": 0.6759, "step": 6323 }, { "epoch": 2.420206659012629, "grad_norm": 0.6203253865242004, "learning_rate": 1.1003528356308528e-05, "loss": 0.6073, "step": 6324 }, { "epoch": 2.4205893608878686, "grad_norm": 0.5500771999359131, "learning_rate": 1.1001061874288359e-05, "loss": 0.6628, "step": 6325 }, { "epoch": 2.4209720627631075, "grad_norm": 0.5574804544448853, "learning_rate": 1.0998595330750241e-05, "loss": 0.6129, "step": 6326 }, { "epoch": 2.421354764638347, "grad_norm": 0.5557833909988403, "learning_rate": 1.0996128725845764e-05, "loss": 0.6327, "step": 6327 }, { "epoch": 2.421737466513586, "grad_norm": 0.5383809804916382, "learning_rate": 1.09936620597265e-05, "loss": 0.666, "step": 6328 }, { "epoch": 2.422120168388825, "grad_norm": 0.5218943953514099, "learning_rate": 1.0991195332544031e-05, "loss": 0.6899, "step": 6329 }, { "epoch": 2.422502870264064, "grad_norm": 0.5073323845863342, "learning_rate": 1.098872854444995e-05, "loss": 0.6086, "step": 6330 }, { "epoch": 2.4228855721393034, "grad_norm": 0.5048040747642517, "learning_rate": 1.0986261695595837e-05, "loss": 0.6834, "step": 6331 }, { "epoch": 2.423268274014543, "grad_norm": 0.5241096019744873, "learning_rate": 1.0983794786133298e-05, "loss": 0.5705, "step": 6332 }, { "epoch": 2.4236509758897817, "grad_norm": 0.5339519381523132, "learning_rate": 1.0981327816213924e-05, "loss": 0.6404, "step": 6333 }, { "epoch": 2.424033677765021, "grad_norm": 0.5117214918136597, "learning_rate": 1.097886078598932e-05, "loss": 0.7107, "step": 6334 }, { "epoch": 2.4244163796402605, "grad_norm": 0.5716574192047119, "learning_rate": 1.0976393695611095e-05, "loss": 0.6349, "step": 6335 }, { "epoch": 2.4247990815154994, "grad_norm": 0.5562425255775452, "learning_rate": 1.0973926545230849e-05, "loss": 0.6482, "step": 6336 }, { "epoch": 2.4251817833907388, "grad_norm": 0.545874834060669, "learning_rate": 1.09714593350002e-05, "loss": 0.6605, "step": 6337 }, { "epoch": 2.4255644852659777, "grad_norm": 0.5691450834274292, "learning_rate": 1.096899206507077e-05, "loss": 0.6387, "step": 6338 }, { "epoch": 2.425947187141217, "grad_norm": 0.5490900874137878, "learning_rate": 1.0966524735594169e-05, "loss": 0.5766, "step": 6339 }, { "epoch": 2.426329889016456, "grad_norm": 0.5518971681594849, "learning_rate": 1.096405734672203e-05, "loss": 0.6508, "step": 6340 }, { "epoch": 2.4267125908916953, "grad_norm": 0.5505756735801697, "learning_rate": 1.0961589898605975e-05, "loss": 0.7519, "step": 6341 }, { "epoch": 2.4270952927669347, "grad_norm": 0.552278995513916, "learning_rate": 1.0959122391397638e-05, "loss": 0.5981, "step": 6342 }, { "epoch": 2.4274779946421736, "grad_norm": 0.5853802561759949, "learning_rate": 1.0956654825248656e-05, "loss": 0.6882, "step": 6343 }, { "epoch": 2.427860696517413, "grad_norm": 0.529316782951355, "learning_rate": 1.0954187200310661e-05, "loss": 0.5987, "step": 6344 }, { "epoch": 2.4282433983926524, "grad_norm": 0.562225341796875, "learning_rate": 1.0951719516735301e-05, "loss": 0.5682, "step": 6345 }, { "epoch": 2.4286261002678913, "grad_norm": 0.5615450143814087, "learning_rate": 1.094925177467422e-05, "loss": 0.6653, "step": 6346 }, { "epoch": 2.4290088021431306, "grad_norm": 0.49466145038604736, "learning_rate": 1.0946783974279066e-05, "loss": 0.5338, "step": 6347 }, { "epoch": 2.4293915040183696, "grad_norm": 0.5502709150314331, "learning_rate": 1.0944316115701497e-05, "loss": 0.66, "step": 6348 }, { "epoch": 2.429774205893609, "grad_norm": 0.5037339925765991, "learning_rate": 1.0941848199093163e-05, "loss": 0.6532, "step": 6349 }, { "epoch": 2.430156907768848, "grad_norm": 0.5732260942459106, "learning_rate": 1.0939380224605727e-05, "loss": 0.5995, "step": 6350 }, { "epoch": 2.430539609644087, "grad_norm": 0.562326967716217, "learning_rate": 1.0936912192390852e-05, "loss": 0.6941, "step": 6351 }, { "epoch": 2.4309223115193266, "grad_norm": 0.6670188903808594, "learning_rate": 1.0934444102600205e-05, "loss": 0.5864, "step": 6352 }, { "epoch": 2.4313050133945655, "grad_norm": 0.5549088716506958, "learning_rate": 1.093197595538546e-05, "loss": 0.5831, "step": 6353 }, { "epoch": 2.431687715269805, "grad_norm": 0.5261608362197876, "learning_rate": 1.0929507750898289e-05, "loss": 0.6482, "step": 6354 }, { "epoch": 2.4320704171450442, "grad_norm": 0.5208941102027893, "learning_rate": 1.0927039489290368e-05, "loss": 0.6399, "step": 6355 }, { "epoch": 2.432453119020283, "grad_norm": 0.5280900597572327, "learning_rate": 1.0924571170713381e-05, "loss": 0.6669, "step": 6356 }, { "epoch": 2.4328358208955225, "grad_norm": 0.5343762040138245, "learning_rate": 1.092210279531901e-05, "loss": 0.6095, "step": 6357 }, { "epoch": 2.4332185227707614, "grad_norm": 0.5082356333732605, "learning_rate": 1.0919634363258947e-05, "loss": 0.6176, "step": 6358 }, { "epoch": 2.433601224646001, "grad_norm": 0.5429057478904724, "learning_rate": 1.091716587468488e-05, "loss": 0.5804, "step": 6359 }, { "epoch": 2.4339839265212397, "grad_norm": 0.5841774344444275, "learning_rate": 1.0914697329748508e-05, "loss": 0.6115, "step": 6360 }, { "epoch": 2.434366628396479, "grad_norm": 0.5164796710014343, "learning_rate": 1.0912228728601524e-05, "loss": 0.6411, "step": 6361 }, { "epoch": 2.4347493302717185, "grad_norm": 0.5384321212768555, "learning_rate": 1.0909760071395639e-05, "loss": 0.572, "step": 6362 }, { "epoch": 2.4351320321469574, "grad_norm": 0.5652962923049927, "learning_rate": 1.090729135828255e-05, "loss": 0.7674, "step": 6363 }, { "epoch": 2.4355147340221968, "grad_norm": 0.5891969203948975, "learning_rate": 1.0904822589413973e-05, "loss": 0.6509, "step": 6364 }, { "epoch": 2.435897435897436, "grad_norm": 0.6353817582130432, "learning_rate": 1.0902353764941613e-05, "loss": 0.6649, "step": 6365 }, { "epoch": 2.436280137772675, "grad_norm": 0.5305449962615967, "learning_rate": 1.0899884885017191e-05, "loss": 0.6072, "step": 6366 }, { "epoch": 2.4366628396479144, "grad_norm": 0.5289894938468933, "learning_rate": 1.0897415949792427e-05, "loss": 0.6243, "step": 6367 }, { "epoch": 2.4370455415231533, "grad_norm": 0.5784481167793274, "learning_rate": 1.089494695941904e-05, "loss": 0.6104, "step": 6368 }, { "epoch": 2.4374282433983927, "grad_norm": 0.5547499656677246, "learning_rate": 1.089247791404876e-05, "loss": 0.6447, "step": 6369 }, { "epoch": 2.4378109452736316, "grad_norm": 0.5095576643943787, "learning_rate": 1.0890008813833312e-05, "loss": 0.5883, "step": 6370 }, { "epoch": 2.438193647148871, "grad_norm": 0.49317118525505066, "learning_rate": 1.0887539658924433e-05, "loss": 0.6448, "step": 6371 }, { "epoch": 2.4385763490241104, "grad_norm": 0.4998229146003723, "learning_rate": 1.088507044947386e-05, "loss": 0.5605, "step": 6372 }, { "epoch": 2.4389590508993493, "grad_norm": 0.5362765192985535, "learning_rate": 1.0882601185633326e-05, "loss": 0.636, "step": 6373 }, { "epoch": 2.4393417527745886, "grad_norm": 0.5757013559341431, "learning_rate": 1.0880131867554581e-05, "loss": 0.6771, "step": 6374 }, { "epoch": 2.439724454649828, "grad_norm": 0.491895854473114, "learning_rate": 1.087766249538937e-05, "loss": 0.6379, "step": 6375 }, { "epoch": 2.440107156525067, "grad_norm": 0.5438093543052673, "learning_rate": 1.0875193069289434e-05, "loss": 0.6826, "step": 6376 }, { "epoch": 2.4404898584003063, "grad_norm": 0.513931930065155, "learning_rate": 1.0872723589406538e-05, "loss": 0.6282, "step": 6377 }, { "epoch": 2.4408725602755452, "grad_norm": 0.5629894733428955, "learning_rate": 1.087025405589243e-05, "loss": 0.6547, "step": 6378 }, { "epoch": 2.4412552621507846, "grad_norm": 0.626103937625885, "learning_rate": 1.0867784468898875e-05, "loss": 0.629, "step": 6379 }, { "epoch": 2.4416379640260235, "grad_norm": 0.5783993601799011, "learning_rate": 1.0865314828577633e-05, "loss": 0.6809, "step": 6380 }, { "epoch": 2.442020665901263, "grad_norm": 0.5491083860397339, "learning_rate": 1.0862845135080466e-05, "loss": 0.6646, "step": 6381 }, { "epoch": 2.4424033677765022, "grad_norm": 0.5202564001083374, "learning_rate": 1.0860375388559153e-05, "loss": 0.5344, "step": 6382 }, { "epoch": 2.442786069651741, "grad_norm": 0.5697914361953735, "learning_rate": 1.0857905589165459e-05, "loss": 0.6946, "step": 6383 }, { "epoch": 2.4431687715269805, "grad_norm": 0.5171549916267395, "learning_rate": 1.0855435737051159e-05, "loss": 0.5596, "step": 6384 }, { "epoch": 2.44355147340222, "grad_norm": 0.5752720236778259, "learning_rate": 1.0852965832368039e-05, "loss": 0.6436, "step": 6385 }, { "epoch": 2.443934175277459, "grad_norm": 0.5559223890304565, "learning_rate": 1.0850495875267873e-05, "loss": 0.6275, "step": 6386 }, { "epoch": 2.444316877152698, "grad_norm": 0.5699167251586914, "learning_rate": 1.0848025865902455e-05, "loss": 0.6415, "step": 6387 }, { "epoch": 2.444699579027937, "grad_norm": 0.4993203580379486, "learning_rate": 1.0845555804423567e-05, "loss": 0.61, "step": 6388 }, { "epoch": 2.4450822809031765, "grad_norm": 0.533412754535675, "learning_rate": 1.0843085690983002e-05, "loss": 0.5493, "step": 6389 }, { "epoch": 2.4454649827784154, "grad_norm": 0.5090436339378357, "learning_rate": 1.084061552573256e-05, "loss": 0.6156, "step": 6390 }, { "epoch": 2.4458476846536548, "grad_norm": 0.5759556293487549, "learning_rate": 1.083814530882403e-05, "loss": 0.6585, "step": 6391 }, { "epoch": 2.446230386528894, "grad_norm": 0.5355123281478882, "learning_rate": 1.0835675040409223e-05, "loss": 0.7266, "step": 6392 }, { "epoch": 2.446613088404133, "grad_norm": 0.48101499676704407, "learning_rate": 1.083320472063994e-05, "loss": 0.6612, "step": 6393 }, { "epoch": 2.4469957902793724, "grad_norm": 0.5435634851455688, "learning_rate": 1.0830734349667988e-05, "loss": 0.65, "step": 6394 }, { "epoch": 2.447378492154612, "grad_norm": 0.5470604300498962, "learning_rate": 1.082826392764518e-05, "loss": 0.5731, "step": 6395 }, { "epoch": 2.4477611940298507, "grad_norm": 0.5914145112037659, "learning_rate": 1.0825793454723325e-05, "loss": 0.7044, "step": 6396 }, { "epoch": 2.44814389590509, "grad_norm": 0.5292344689369202, "learning_rate": 1.0823322931054245e-05, "loss": 0.6543, "step": 6397 }, { "epoch": 2.448526597780329, "grad_norm": 0.5258802771568298, "learning_rate": 1.082085235678976e-05, "loss": 0.6771, "step": 6398 }, { "epoch": 2.4489092996555684, "grad_norm": 0.5662237405776978, "learning_rate": 1.081838173208169e-05, "loss": 0.6669, "step": 6399 }, { "epoch": 2.4492920015308073, "grad_norm": 0.5339305400848389, "learning_rate": 1.081591105708187e-05, "loss": 0.5725, "step": 6400 }, { "epoch": 2.4496747034060466, "grad_norm": 0.5902783870697021, "learning_rate": 1.0813440331942117e-05, "loss": 0.6743, "step": 6401 }, { "epoch": 2.450057405281286, "grad_norm": 0.5407136082649231, "learning_rate": 1.0810969556814273e-05, "loss": 0.6576, "step": 6402 }, { "epoch": 2.450440107156525, "grad_norm": 0.61275714635849, "learning_rate": 1.080849873185017e-05, "loss": 0.58, "step": 6403 }, { "epoch": 2.4508228090317643, "grad_norm": 0.5159421563148499, "learning_rate": 1.0806027857201649e-05, "loss": 0.5893, "step": 6404 }, { "epoch": 2.4512055109070037, "grad_norm": 0.49280115962028503, "learning_rate": 1.0803556933020551e-05, "loss": 0.6333, "step": 6405 }, { "epoch": 2.4515882127822426, "grad_norm": 0.5490779280662537, "learning_rate": 1.0801085959458719e-05, "loss": 0.6575, "step": 6406 }, { "epoch": 2.451970914657482, "grad_norm": 0.5581007599830627, "learning_rate": 1.0798614936668003e-05, "loss": 0.6863, "step": 6407 }, { "epoch": 2.452353616532721, "grad_norm": 0.5048882961273193, "learning_rate": 1.0796143864800257e-05, "loss": 0.6314, "step": 6408 }, { "epoch": 2.4527363184079602, "grad_norm": 0.5544918775558472, "learning_rate": 1.079367274400733e-05, "loss": 0.5682, "step": 6409 }, { "epoch": 2.453119020283199, "grad_norm": 0.5155327320098877, "learning_rate": 1.0791201574441079e-05, "loss": 0.5203, "step": 6410 }, { "epoch": 2.4535017221584385, "grad_norm": 0.5602041482925415, "learning_rate": 1.0788730356253368e-05, "loss": 0.717, "step": 6411 }, { "epoch": 2.453884424033678, "grad_norm": 1.5090909004211426, "learning_rate": 1.0786259089596053e-05, "loss": 0.6977, "step": 6412 }, { "epoch": 2.454267125908917, "grad_norm": 0.5298274159431458, "learning_rate": 1.0783787774621011e-05, "loss": 0.6684, "step": 6413 }, { "epoch": 2.454649827784156, "grad_norm": 0.5803132653236389, "learning_rate": 1.0781316411480103e-05, "loss": 0.6108, "step": 6414 }, { "epoch": 2.4550325296593956, "grad_norm": 0.571040689945221, "learning_rate": 1.07788450003252e-05, "loss": 0.6302, "step": 6415 }, { "epoch": 2.4554152315346345, "grad_norm": 0.5875695943832397, "learning_rate": 1.077637354130818e-05, "loss": 0.602, "step": 6416 }, { "epoch": 2.455797933409874, "grad_norm": 0.5583468079566956, "learning_rate": 1.0773902034580919e-05, "loss": 0.6708, "step": 6417 }, { "epoch": 2.4561806352851128, "grad_norm": 0.5256684422492981, "learning_rate": 1.0771430480295301e-05, "loss": 0.6631, "step": 6418 }, { "epoch": 2.456563337160352, "grad_norm": 0.5960051417350769, "learning_rate": 1.076895887860321e-05, "loss": 0.6904, "step": 6419 }, { "epoch": 2.456946039035591, "grad_norm": 0.5646334290504456, "learning_rate": 1.0766487229656526e-05, "loss": 0.7162, "step": 6420 }, { "epoch": 2.4573287409108304, "grad_norm": 0.5367555022239685, "learning_rate": 1.0764015533607143e-05, "loss": 0.5763, "step": 6421 }, { "epoch": 2.45771144278607, "grad_norm": 0.541563093662262, "learning_rate": 1.0761543790606954e-05, "loss": 0.6534, "step": 6422 }, { "epoch": 2.4580941446613087, "grad_norm": 0.5326570868492126, "learning_rate": 1.0759072000807853e-05, "loss": 0.5794, "step": 6423 }, { "epoch": 2.458476846536548, "grad_norm": 0.5557588934898376, "learning_rate": 1.0756600164361739e-05, "loss": 0.6368, "step": 6424 }, { "epoch": 2.4588595484117874, "grad_norm": 0.5333808660507202, "learning_rate": 1.0754128281420511e-05, "loss": 0.6232, "step": 6425 }, { "epoch": 2.4592422502870264, "grad_norm": 0.5367006063461304, "learning_rate": 1.0751656352136076e-05, "loss": 0.6421, "step": 6426 }, { "epoch": 2.4596249521622657, "grad_norm": 0.5726892352104187, "learning_rate": 1.0749184376660337e-05, "loss": 0.594, "step": 6427 }, { "epoch": 2.4600076540375047, "grad_norm": 0.5637040734291077, "learning_rate": 1.0746712355145205e-05, "loss": 0.6088, "step": 6428 }, { "epoch": 2.460390355912744, "grad_norm": 0.5166687369346619, "learning_rate": 1.0744240287742595e-05, "loss": 0.5294, "step": 6429 }, { "epoch": 2.460773057787983, "grad_norm": 0.600184440612793, "learning_rate": 1.074176817460442e-05, "loss": 0.6404, "step": 6430 }, { "epoch": 2.4611557596632223, "grad_norm": 0.554369330406189, "learning_rate": 1.0739296015882594e-05, "loss": 0.6534, "step": 6431 }, { "epoch": 2.4615384615384617, "grad_norm": 0.4764150381088257, "learning_rate": 1.0736823811729044e-05, "loss": 0.6064, "step": 6432 }, { "epoch": 2.4619211634137006, "grad_norm": 0.6306803226470947, "learning_rate": 1.0734351562295693e-05, "loss": 0.6273, "step": 6433 }, { "epoch": 2.46230386528894, "grad_norm": 0.5126171112060547, "learning_rate": 1.0731879267734464e-05, "loss": 0.6462, "step": 6434 }, { "epoch": 2.4626865671641793, "grad_norm": 0.5673646330833435, "learning_rate": 1.072940692819729e-05, "loss": 0.6474, "step": 6435 }, { "epoch": 2.4630692690394183, "grad_norm": 0.565199613571167, "learning_rate": 1.0726934543836096e-05, "loss": 0.6464, "step": 6436 }, { "epoch": 2.4634519709146576, "grad_norm": 0.5107318758964539, "learning_rate": 1.0724462114802825e-05, "loss": 0.5937, "step": 6437 }, { "epoch": 2.4638346727898965, "grad_norm": 0.5303379893302917, "learning_rate": 1.0721989641249408e-05, "loss": 0.6341, "step": 6438 }, { "epoch": 2.464217374665136, "grad_norm": 0.5183894038200378, "learning_rate": 1.0719517123327792e-05, "loss": 0.6173, "step": 6439 }, { "epoch": 2.464600076540375, "grad_norm": 0.590753972530365, "learning_rate": 1.0717044561189915e-05, "loss": 0.6499, "step": 6440 }, { "epoch": 2.464982778415614, "grad_norm": 0.5066030025482178, "learning_rate": 1.071457195498772e-05, "loss": 0.5766, "step": 6441 }, { "epoch": 2.4653654802908536, "grad_norm": 0.5569173097610474, "learning_rate": 1.0712099304873161e-05, "loss": 0.6485, "step": 6442 }, { "epoch": 2.4657481821660925, "grad_norm": 0.5427770018577576, "learning_rate": 1.0709626610998185e-05, "loss": 0.7204, "step": 6443 }, { "epoch": 2.466130884041332, "grad_norm": 0.49984419345855713, "learning_rate": 1.070715387351475e-05, "loss": 0.5923, "step": 6444 }, { "epoch": 2.466513585916571, "grad_norm": 0.522834062576294, "learning_rate": 1.0704681092574812e-05, "loss": 0.5827, "step": 6445 }, { "epoch": 2.46689628779181, "grad_norm": 0.6048813462257385, "learning_rate": 1.0702208268330321e-05, "loss": 0.693, "step": 6446 }, { "epoch": 2.4672789896670495, "grad_norm": 0.5053644776344299, "learning_rate": 1.0699735400933248e-05, "loss": 0.6046, "step": 6447 }, { "epoch": 2.4676616915422884, "grad_norm": 0.5710451602935791, "learning_rate": 1.0697262490535558e-05, "loss": 0.6263, "step": 6448 }, { "epoch": 2.468044393417528, "grad_norm": 0.5123350620269775, "learning_rate": 1.069478953728921e-05, "loss": 0.6415, "step": 6449 }, { "epoch": 2.4684270952927667, "grad_norm": 0.6153277158737183, "learning_rate": 1.0692316541346181e-05, "loss": 0.6649, "step": 6450 }, { "epoch": 2.468809797168006, "grad_norm": 0.5219919085502625, "learning_rate": 1.0689843502858441e-05, "loss": 0.5961, "step": 6451 }, { "epoch": 2.4691924990432454, "grad_norm": 0.5498950481414795, "learning_rate": 1.0687370421977964e-05, "loss": 0.6031, "step": 6452 }, { "epoch": 2.4695752009184844, "grad_norm": 0.6073704957962036, "learning_rate": 1.0684897298856727e-05, "loss": 0.6319, "step": 6453 }, { "epoch": 2.4699579027937237, "grad_norm": 0.5250904560089111, "learning_rate": 1.0682424133646712e-05, "loss": 0.6672, "step": 6454 }, { "epoch": 2.470340604668963, "grad_norm": 0.48746365308761597, "learning_rate": 1.06799509264999e-05, "loss": 0.6598, "step": 6455 }, { "epoch": 2.470723306544202, "grad_norm": 0.5954557657241821, "learning_rate": 1.0677477677568278e-05, "loss": 0.6302, "step": 6456 }, { "epoch": 2.4711060084194414, "grad_norm": 0.6562588214874268, "learning_rate": 1.0675004387003829e-05, "loss": 0.6057, "step": 6457 }, { "epoch": 2.4714887102946803, "grad_norm": 0.5411845445632935, "learning_rate": 1.0672531054958553e-05, "loss": 0.62, "step": 6458 }, { "epoch": 2.4718714121699197, "grad_norm": 0.5665258169174194, "learning_rate": 1.0670057681584431e-05, "loss": 0.6707, "step": 6459 }, { "epoch": 2.4722541140451586, "grad_norm": 0.5469112992286682, "learning_rate": 1.0667584267033471e-05, "loss": 0.5951, "step": 6460 }, { "epoch": 2.472636815920398, "grad_norm": 0.5522730946540833, "learning_rate": 1.0665110811457658e-05, "loss": 0.6471, "step": 6461 }, { "epoch": 2.4730195177956373, "grad_norm": 0.5863181948661804, "learning_rate": 1.0662637315009002e-05, "loss": 0.6888, "step": 6462 }, { "epoch": 2.4734022196708763, "grad_norm": 0.5660390257835388, "learning_rate": 1.0660163777839502e-05, "loss": 0.5983, "step": 6463 }, { "epoch": 2.4737849215461156, "grad_norm": 0.553521990776062, "learning_rate": 1.0657690200101163e-05, "loss": 0.6517, "step": 6464 }, { "epoch": 2.474167623421355, "grad_norm": 0.5645022392272949, "learning_rate": 1.0655216581945997e-05, "loss": 0.6163, "step": 6465 }, { "epoch": 2.474550325296594, "grad_norm": 0.5284848809242249, "learning_rate": 1.065274292352601e-05, "loss": 0.6329, "step": 6466 }, { "epoch": 2.4749330271718333, "grad_norm": 0.5734888315200806, "learning_rate": 1.0650269224993217e-05, "loss": 0.6497, "step": 6467 }, { "epoch": 2.475315729047072, "grad_norm": 0.5697460174560547, "learning_rate": 1.0647795486499634e-05, "loss": 0.6826, "step": 6468 }, { "epoch": 2.4756984309223116, "grad_norm": 0.5320362448692322, "learning_rate": 1.064532170819728e-05, "loss": 0.674, "step": 6469 }, { "epoch": 2.4760811327975505, "grad_norm": 0.53005450963974, "learning_rate": 1.0642847890238171e-05, "loss": 0.6601, "step": 6470 }, { "epoch": 2.47646383467279, "grad_norm": 0.5361956357955933, "learning_rate": 1.064037403277433e-05, "loss": 0.5862, "step": 6471 }, { "epoch": 2.476846536548029, "grad_norm": 0.49750441312789917, "learning_rate": 1.0637900135957785e-05, "loss": 0.607, "step": 6472 }, { "epoch": 2.477229238423268, "grad_norm": 0.505830705165863, "learning_rate": 1.0635426199940566e-05, "loss": 0.5855, "step": 6473 }, { "epoch": 2.4776119402985075, "grad_norm": 0.6771740317344666, "learning_rate": 1.06329522248747e-05, "loss": 0.6349, "step": 6474 }, { "epoch": 2.477994642173747, "grad_norm": 0.5440220236778259, "learning_rate": 1.0630478210912216e-05, "loss": 0.7137, "step": 6475 }, { "epoch": 2.478377344048986, "grad_norm": 0.4973439574241638, "learning_rate": 1.0628004158205155e-05, "loss": 0.6574, "step": 6476 }, { "epoch": 2.478760045924225, "grad_norm": 0.5327556133270264, "learning_rate": 1.0625530066905547e-05, "loss": 0.7144, "step": 6477 }, { "epoch": 2.479142747799464, "grad_norm": 0.5513970255851746, "learning_rate": 1.062305593716544e-05, "loss": 0.6398, "step": 6478 }, { "epoch": 2.4795254496747035, "grad_norm": 0.5217998623847961, "learning_rate": 1.0620581769136873e-05, "loss": 0.6944, "step": 6479 }, { "epoch": 2.4799081515499424, "grad_norm": 0.5100957751274109, "learning_rate": 1.0618107562971887e-05, "loss": 0.6585, "step": 6480 }, { "epoch": 2.4802908534251817, "grad_norm": 0.4925716817378998, "learning_rate": 1.061563331882253e-05, "loss": 0.635, "step": 6481 }, { "epoch": 2.480673555300421, "grad_norm": 0.5461499094963074, "learning_rate": 1.0613159036840854e-05, "loss": 0.5968, "step": 6482 }, { "epoch": 2.48105625717566, "grad_norm": 0.5396993160247803, "learning_rate": 1.0610684717178905e-05, "loss": 0.6786, "step": 6483 }, { "epoch": 2.4814389590508994, "grad_norm": 0.5422077775001526, "learning_rate": 1.0608210359988742e-05, "loss": 0.5138, "step": 6484 }, { "epoch": 2.4818216609261388, "grad_norm": 0.5170471668243408, "learning_rate": 1.0605735965422417e-05, "loss": 0.6485, "step": 6485 }, { "epoch": 2.4822043628013777, "grad_norm": 0.49312058091163635, "learning_rate": 1.060326153363199e-05, "loss": 0.574, "step": 6486 }, { "epoch": 2.482587064676617, "grad_norm": 0.6744502186775208, "learning_rate": 1.0600787064769523e-05, "loss": 0.6274, "step": 6487 }, { "epoch": 2.482969766551856, "grad_norm": 0.5296429395675659, "learning_rate": 1.0598312558987072e-05, "loss": 0.5799, "step": 6488 }, { "epoch": 2.4833524684270953, "grad_norm": 0.5346795320510864, "learning_rate": 1.0595838016436713e-05, "loss": 0.6509, "step": 6489 }, { "epoch": 2.4837351703023343, "grad_norm": 0.567384660243988, "learning_rate": 1.0593363437270501e-05, "loss": 0.647, "step": 6490 }, { "epoch": 2.4841178721775736, "grad_norm": 0.549795389175415, "learning_rate": 1.0590888821640517e-05, "loss": 0.6595, "step": 6491 }, { "epoch": 2.484500574052813, "grad_norm": 0.5039892792701721, "learning_rate": 1.0588414169698824e-05, "loss": 0.5529, "step": 6492 }, { "epoch": 2.484883275928052, "grad_norm": 0.5869365334510803, "learning_rate": 1.05859394815975e-05, "loss": 0.6361, "step": 6493 }, { "epoch": 2.4852659778032913, "grad_norm": 0.6215143203735352, "learning_rate": 1.0583464757488621e-05, "loss": 0.6394, "step": 6494 }, { "epoch": 2.4856486796785306, "grad_norm": 0.5350363254547119, "learning_rate": 1.0580989997524269e-05, "loss": 0.6726, "step": 6495 }, { "epoch": 2.4860313815537696, "grad_norm": 0.4990329444408417, "learning_rate": 1.0578515201856516e-05, "loss": 0.6403, "step": 6496 }, { "epoch": 2.486414083429009, "grad_norm": 0.5820246338844299, "learning_rate": 1.057604037063745e-05, "loss": 0.6973, "step": 6497 }, { "epoch": 2.486796785304248, "grad_norm": 0.5285184979438782, "learning_rate": 1.0573565504019157e-05, "loss": 0.6408, "step": 6498 }, { "epoch": 2.4871794871794872, "grad_norm": 0.5061426162719727, "learning_rate": 1.0571090602153724e-05, "loss": 0.6585, "step": 6499 }, { "epoch": 2.487562189054726, "grad_norm": 0.5207504630088806, "learning_rate": 1.0568615665193237e-05, "loss": 0.642, "step": 6500 }, { "epoch": 2.4879448909299655, "grad_norm": 0.548734724521637, "learning_rate": 1.0566140693289792e-05, "loss": 0.6964, "step": 6501 }, { "epoch": 2.488327592805205, "grad_norm": 0.5416510105133057, "learning_rate": 1.0563665686595479e-05, "loss": 0.6486, "step": 6502 }, { "epoch": 2.488710294680444, "grad_norm": 0.539916455745697, "learning_rate": 1.0561190645262393e-05, "loss": 0.651, "step": 6503 }, { "epoch": 2.489092996555683, "grad_norm": 0.6095733642578125, "learning_rate": 1.055871556944264e-05, "loss": 0.7211, "step": 6504 }, { "epoch": 2.4894756984309225, "grad_norm": 0.5005873441696167, "learning_rate": 1.0556240459288312e-05, "loss": 0.6831, "step": 6505 }, { "epoch": 2.4898584003061615, "grad_norm": 0.5296956896781921, "learning_rate": 1.055376531495151e-05, "loss": 0.6179, "step": 6506 }, { "epoch": 2.490241102181401, "grad_norm": 0.5376429557800293, "learning_rate": 1.0551290136584345e-05, "loss": 0.6846, "step": 6507 }, { "epoch": 2.4906238040566397, "grad_norm": 0.556378960609436, "learning_rate": 1.054881492433892e-05, "loss": 0.6082, "step": 6508 }, { "epoch": 2.491006505931879, "grad_norm": 0.5714859962463379, "learning_rate": 1.0546339678367342e-05, "loss": 0.6517, "step": 6509 }, { "epoch": 2.491389207807118, "grad_norm": 0.5039429068565369, "learning_rate": 1.0543864398821723e-05, "loss": 0.5773, "step": 6510 }, { "epoch": 2.4917719096823574, "grad_norm": 0.5465301275253296, "learning_rate": 1.0541389085854177e-05, "loss": 0.6025, "step": 6511 }, { "epoch": 2.4921546115575968, "grad_norm": 0.5617127418518066, "learning_rate": 1.0538913739616817e-05, "loss": 0.6434, "step": 6512 }, { "epoch": 2.4925373134328357, "grad_norm": 0.5073524117469788, "learning_rate": 1.0536438360261761e-05, "loss": 0.6241, "step": 6513 }, { "epoch": 2.492920015308075, "grad_norm": 0.5453078150749207, "learning_rate": 1.0533962947941123e-05, "loss": 0.7259, "step": 6514 }, { "epoch": 2.4933027171833144, "grad_norm": 0.528700053691864, "learning_rate": 1.0531487502807031e-05, "loss": 0.6455, "step": 6515 }, { "epoch": 2.4936854190585533, "grad_norm": 0.5440320372581482, "learning_rate": 1.0529012025011602e-05, "loss": 0.6986, "step": 6516 }, { "epoch": 2.4940681209337927, "grad_norm": 0.5691733956336975, "learning_rate": 1.0526536514706962e-05, "loss": 0.669, "step": 6517 }, { "epoch": 2.4944508228090316, "grad_norm": 0.5506103038787842, "learning_rate": 1.0524060972045243e-05, "loss": 0.5927, "step": 6518 }, { "epoch": 2.494833524684271, "grad_norm": 0.5444128513336182, "learning_rate": 1.0521585397178563e-05, "loss": 0.6214, "step": 6519 }, { "epoch": 2.49521622655951, "grad_norm": 0.5393136143684387, "learning_rate": 1.0519109790259065e-05, "loss": 0.6611, "step": 6520 }, { "epoch": 2.4955989284347493, "grad_norm": 0.5703444480895996, "learning_rate": 1.0516634151438874e-05, "loss": 0.6319, "step": 6521 }, { "epoch": 2.4959816303099887, "grad_norm": 0.5338516235351562, "learning_rate": 1.0514158480870125e-05, "loss": 0.6551, "step": 6522 }, { "epoch": 2.4963643321852276, "grad_norm": 0.5225899815559387, "learning_rate": 1.0511682778704956e-05, "loss": 0.6146, "step": 6523 }, { "epoch": 2.496747034060467, "grad_norm": 0.5779993534088135, "learning_rate": 1.0509207045095506e-05, "loss": 0.6249, "step": 6524 }, { "epoch": 2.4971297359357063, "grad_norm": 0.7114317417144775, "learning_rate": 1.0506731280193919e-05, "loss": 0.6584, "step": 6525 }, { "epoch": 2.4975124378109452, "grad_norm": 0.5293284058570862, "learning_rate": 1.0504255484152327e-05, "loss": 0.7626, "step": 6526 }, { "epoch": 2.4978951396861846, "grad_norm": 0.5697917342185974, "learning_rate": 1.0501779657122884e-05, "loss": 0.6078, "step": 6527 }, { "epoch": 2.4982778415614235, "grad_norm": 0.582059383392334, "learning_rate": 1.0499303799257734e-05, "loss": 0.5758, "step": 6528 }, { "epoch": 2.498660543436663, "grad_norm": 0.5300227403640747, "learning_rate": 1.0496827910709021e-05, "loss": 0.6722, "step": 6529 }, { "epoch": 2.499043245311902, "grad_norm": 0.5324503779411316, "learning_rate": 1.0494351991628903e-05, "loss": 0.5647, "step": 6530 }, { "epoch": 2.499425947187141, "grad_norm": 0.5954743027687073, "learning_rate": 1.0491876042169525e-05, "loss": 0.5419, "step": 6531 }, { "epoch": 2.4998086490623805, "grad_norm": 0.540419340133667, "learning_rate": 1.0489400062483042e-05, "loss": 0.6084, "step": 6532 }, { "epoch": 2.5001913509376195, "grad_norm": 0.5165369510650635, "learning_rate": 1.048692405272161e-05, "loss": 0.6647, "step": 6533 }, { "epoch": 2.500574052812859, "grad_norm": 0.5647760629653931, "learning_rate": 1.048444801303739e-05, "loss": 0.6054, "step": 6534 }, { "epoch": 2.500956754688098, "grad_norm": 0.5268423557281494, "learning_rate": 1.0481971943582534e-05, "loss": 0.5867, "step": 6535 }, { "epoch": 2.501339456563337, "grad_norm": 0.5559113025665283, "learning_rate": 1.0479495844509209e-05, "loss": 0.73, "step": 6536 }, { "epoch": 2.5017221584385765, "grad_norm": 0.5245987772941589, "learning_rate": 1.0477019715969574e-05, "loss": 0.6505, "step": 6537 }, { "epoch": 2.5021048603138154, "grad_norm": 0.5348398089408875, "learning_rate": 1.04745435581158e-05, "loss": 0.6035, "step": 6538 }, { "epoch": 2.5024875621890548, "grad_norm": 0.5331740975379944, "learning_rate": 1.0472067371100048e-05, "loss": 0.6229, "step": 6539 }, { "epoch": 2.5028702640642937, "grad_norm": 0.49879327416419983, "learning_rate": 1.0469591155074488e-05, "loss": 0.6992, "step": 6540 }, { "epoch": 2.503252965939533, "grad_norm": 0.5484578013420105, "learning_rate": 1.046711491019129e-05, "loss": 0.6263, "step": 6541 }, { "epoch": 2.5036356678147724, "grad_norm": 0.5406304001808167, "learning_rate": 1.0464638636602621e-05, "loss": 0.5862, "step": 6542 }, { "epoch": 2.5040183696900113, "grad_norm": 0.9986594915390015, "learning_rate": 1.0462162334460665e-05, "loss": 0.5966, "step": 6543 }, { "epoch": 2.5044010715652507, "grad_norm": 0.49521157145500183, "learning_rate": 1.0459686003917593e-05, "loss": 0.6229, "step": 6544 }, { "epoch": 2.50478377344049, "grad_norm": 0.49035146832466125, "learning_rate": 1.0457209645125577e-05, "loss": 0.6434, "step": 6545 }, { "epoch": 2.505166475315729, "grad_norm": 0.545229971408844, "learning_rate": 1.0454733258236805e-05, "loss": 0.6524, "step": 6546 }, { "epoch": 2.5055491771909684, "grad_norm": 0.529021143913269, "learning_rate": 1.0452256843403448e-05, "loss": 0.6343, "step": 6547 }, { "epoch": 2.5059318790662073, "grad_norm": 0.5186361074447632, "learning_rate": 1.0449780400777694e-05, "loss": 0.5998, "step": 6548 }, { "epoch": 2.5063145809414467, "grad_norm": 0.5354294776916504, "learning_rate": 1.0447303930511728e-05, "loss": 0.6072, "step": 6549 }, { "epoch": 2.5066972828166856, "grad_norm": 0.5945996642112732, "learning_rate": 1.044482743275773e-05, "loss": 0.6209, "step": 6550 }, { "epoch": 2.507079984691925, "grad_norm": 0.5597503781318665, "learning_rate": 1.0442350907667896e-05, "loss": 0.645, "step": 6551 }, { "epoch": 2.5074626865671643, "grad_norm": 0.7135100364685059, "learning_rate": 1.0439874355394407e-05, "loss": 0.6579, "step": 6552 }, { "epoch": 2.5078453884424032, "grad_norm": 0.5318120121955872, "learning_rate": 1.0437397776089455e-05, "loss": 0.7259, "step": 6553 }, { "epoch": 2.5082280903176426, "grad_norm": 0.5689685344696045, "learning_rate": 1.043492116990524e-05, "loss": 0.6858, "step": 6554 }, { "epoch": 2.508610792192882, "grad_norm": 0.5659905672073364, "learning_rate": 1.0432444536993946e-05, "loss": 0.76, "step": 6555 }, { "epoch": 2.508993494068121, "grad_norm": 0.5452008247375488, "learning_rate": 1.0429967877507778e-05, "loss": 0.7028, "step": 6556 }, { "epoch": 2.5093761959433603, "grad_norm": 0.5304660201072693, "learning_rate": 1.0427491191598926e-05, "loss": 0.6132, "step": 6557 }, { "epoch": 2.509758897818599, "grad_norm": 0.5582414865493774, "learning_rate": 1.042501447941959e-05, "loss": 0.5795, "step": 6558 }, { "epoch": 2.5101415996938385, "grad_norm": 0.5680282711982727, "learning_rate": 1.0422537741121977e-05, "loss": 0.6878, "step": 6559 }, { "epoch": 2.5105243015690775, "grad_norm": 0.5293636322021484, "learning_rate": 1.0420060976858284e-05, "loss": 0.6619, "step": 6560 }, { "epoch": 2.510907003444317, "grad_norm": 0.6029543280601501, "learning_rate": 1.0417584186780714e-05, "loss": 0.6222, "step": 6561 }, { "epoch": 2.511289705319556, "grad_norm": 0.5483143925666809, "learning_rate": 1.0415107371041473e-05, "loss": 0.6528, "step": 6562 }, { "epoch": 2.511672407194795, "grad_norm": 0.52250075340271, "learning_rate": 1.0412630529792769e-05, "loss": 0.6032, "step": 6563 }, { "epoch": 2.5120551090700345, "grad_norm": 0.5441731810569763, "learning_rate": 1.041015366318681e-05, "loss": 0.6035, "step": 6564 }, { "epoch": 2.512437810945274, "grad_norm": 0.4991455674171448, "learning_rate": 1.040767677137581e-05, "loss": 0.653, "step": 6565 }, { "epoch": 2.5128205128205128, "grad_norm": 0.5392313599586487, "learning_rate": 1.0405199854511974e-05, "loss": 0.6737, "step": 6566 }, { "epoch": 2.513203214695752, "grad_norm": 0.5709441304206848, "learning_rate": 1.0402722912747518e-05, "loss": 0.718, "step": 6567 }, { "epoch": 2.513585916570991, "grad_norm": 0.5376661419868469, "learning_rate": 1.040024594623466e-05, "loss": 0.6259, "step": 6568 }, { "epoch": 2.5139686184462304, "grad_norm": 0.5172434449195862, "learning_rate": 1.0397768955125612e-05, "loss": 0.5957, "step": 6569 }, { "epoch": 2.5143513203214694, "grad_norm": 0.5459728837013245, "learning_rate": 1.0395291939572593e-05, "loss": 0.7237, "step": 6570 }, { "epoch": 2.5147340221967087, "grad_norm": 0.552105724811554, "learning_rate": 1.039281489972782e-05, "loss": 0.728, "step": 6571 }, { "epoch": 2.515116724071948, "grad_norm": 0.5315303206443787, "learning_rate": 1.0390337835743518e-05, "loss": 0.6261, "step": 6572 }, { "epoch": 2.515499425947187, "grad_norm": 0.5511916279792786, "learning_rate": 1.0387860747771909e-05, "loss": 0.623, "step": 6573 }, { "epoch": 2.5158821278224264, "grad_norm": 0.5422195792198181, "learning_rate": 1.0385383635965214e-05, "loss": 0.724, "step": 6574 }, { "epoch": 2.5162648296976657, "grad_norm": 0.5242536664009094, "learning_rate": 1.038290650047566e-05, "loss": 0.7002, "step": 6575 }, { "epoch": 2.5166475315729047, "grad_norm": 0.576712965965271, "learning_rate": 1.0380429341455472e-05, "loss": 0.6461, "step": 6576 }, { "epoch": 2.517030233448144, "grad_norm": 0.4798590838909149, "learning_rate": 1.0377952159056877e-05, "loss": 0.6392, "step": 6577 }, { "epoch": 2.517412935323383, "grad_norm": 0.5200046300888062, "learning_rate": 1.037547495343211e-05, "loss": 0.5744, "step": 6578 }, { "epoch": 2.5177956371986223, "grad_norm": 0.5608038902282715, "learning_rate": 1.0372997724733396e-05, "loss": 0.5444, "step": 6579 }, { "epoch": 2.5181783390738612, "grad_norm": 0.5063287615776062, "learning_rate": 1.037052047311297e-05, "loss": 0.5622, "step": 6580 }, { "epoch": 2.5185610409491006, "grad_norm": 0.5384572744369507, "learning_rate": 1.0368043198723066e-05, "loss": 0.6125, "step": 6581 }, { "epoch": 2.51894374282434, "grad_norm": 0.5394543409347534, "learning_rate": 1.0365565901715918e-05, "loss": 0.5834, "step": 6582 }, { "epoch": 2.519326444699579, "grad_norm": 0.49964287877082825, "learning_rate": 1.0363088582243766e-05, "loss": 0.6118, "step": 6583 }, { "epoch": 2.5197091465748183, "grad_norm": 0.5678921937942505, "learning_rate": 1.0360611240458839e-05, "loss": 0.6377, "step": 6584 }, { "epoch": 2.5200918484500576, "grad_norm": 0.5080129504203796, "learning_rate": 1.035813387651339e-05, "loss": 0.6315, "step": 6585 }, { "epoch": 2.5204745503252965, "grad_norm": 0.5372555255889893, "learning_rate": 1.0355656490559648e-05, "loss": 0.6165, "step": 6586 }, { "epoch": 2.520857252200536, "grad_norm": 0.5362086892127991, "learning_rate": 1.0353179082749857e-05, "loss": 0.617, "step": 6587 }, { "epoch": 2.521239954075775, "grad_norm": 0.5089673399925232, "learning_rate": 1.0350701653236268e-05, "loss": 0.6243, "step": 6588 }, { "epoch": 2.521622655951014, "grad_norm": 0.49589139223098755, "learning_rate": 1.0348224202171117e-05, "loss": 0.5426, "step": 6589 }, { "epoch": 2.522005357826253, "grad_norm": 0.5191583633422852, "learning_rate": 1.0345746729706655e-05, "loss": 0.6415, "step": 6590 }, { "epoch": 2.5223880597014925, "grad_norm": 0.5252068638801575, "learning_rate": 1.0343269235995127e-05, "loss": 0.6473, "step": 6591 }, { "epoch": 2.522770761576732, "grad_norm": 0.5282584428787231, "learning_rate": 1.034079172118878e-05, "loss": 0.6513, "step": 6592 }, { "epoch": 2.523153463451971, "grad_norm": 0.5320456624031067, "learning_rate": 1.0338314185439868e-05, "loss": 0.7684, "step": 6593 }, { "epoch": 2.52353616532721, "grad_norm": 0.5346137285232544, "learning_rate": 1.0335836628900642e-05, "loss": 0.7452, "step": 6594 }, { "epoch": 2.5239188672024495, "grad_norm": 0.5243627429008484, "learning_rate": 1.033335905172335e-05, "loss": 0.5676, "step": 6595 }, { "epoch": 2.5243015690776884, "grad_norm": 0.5182619094848633, "learning_rate": 1.0330881454060252e-05, "loss": 0.6757, "step": 6596 }, { "epoch": 2.524684270952928, "grad_norm": 0.5069455504417419, "learning_rate": 1.0328403836063597e-05, "loss": 0.6161, "step": 6597 }, { "epoch": 2.5250669728281667, "grad_norm": 0.558615505695343, "learning_rate": 1.0325926197885645e-05, "loss": 0.718, "step": 6598 }, { "epoch": 2.525449674703406, "grad_norm": 0.5595706105232239, "learning_rate": 1.0323448539678653e-05, "loss": 0.7105, "step": 6599 }, { "epoch": 2.525832376578645, "grad_norm": 0.54124915599823, "learning_rate": 1.032097086159488e-05, "loss": 0.6051, "step": 6600 }, { "epoch": 2.5262150784538844, "grad_norm": 0.526761531829834, "learning_rate": 1.0318493163786582e-05, "loss": 0.6614, "step": 6601 }, { "epoch": 2.5265977803291237, "grad_norm": 0.5163949131965637, "learning_rate": 1.0316015446406025e-05, "loss": 0.6885, "step": 6602 }, { "epoch": 2.5269804822043627, "grad_norm": 0.5450451970100403, "learning_rate": 1.031353770960547e-05, "loss": 0.6144, "step": 6603 }, { "epoch": 2.527363184079602, "grad_norm": 0.5121517181396484, "learning_rate": 1.031105995353718e-05, "loss": 0.6178, "step": 6604 }, { "epoch": 2.5277458859548414, "grad_norm": 0.5406484007835388, "learning_rate": 1.0308582178353421e-05, "loss": 0.6924, "step": 6605 }, { "epoch": 2.5281285878300803, "grad_norm": 0.5028522610664368, "learning_rate": 1.0306104384206461e-05, "loss": 0.5663, "step": 6606 }, { "epoch": 2.5285112897053197, "grad_norm": 0.5774726271629333, "learning_rate": 1.0303626571248559e-05, "loss": 0.5596, "step": 6607 }, { "epoch": 2.5288939915805586, "grad_norm": 0.559438169002533, "learning_rate": 1.030114873963199e-05, "loss": 0.6881, "step": 6608 }, { "epoch": 2.529276693455798, "grad_norm": 0.5185205340385437, "learning_rate": 1.0298670889509025e-05, "loss": 0.6477, "step": 6609 }, { "epoch": 2.529659395331037, "grad_norm": 0.5201298594474792, "learning_rate": 1.0296193021031929e-05, "loss": 0.6557, "step": 6610 }, { "epoch": 2.5300420972062763, "grad_norm": 0.5144718885421753, "learning_rate": 1.0293715134352977e-05, "loss": 0.6929, "step": 6611 }, { "epoch": 2.5304247990815156, "grad_norm": 0.5614184141159058, "learning_rate": 1.029123722962444e-05, "loss": 0.5639, "step": 6612 }, { "epoch": 2.5308075009567546, "grad_norm": 0.556796669960022, "learning_rate": 1.0288759306998591e-05, "loss": 0.6323, "step": 6613 }, { "epoch": 2.531190202831994, "grad_norm": 0.5230352878570557, "learning_rate": 1.0286281366627711e-05, "loss": 0.6794, "step": 6614 }, { "epoch": 2.5315729047072333, "grad_norm": 0.6003883481025696, "learning_rate": 1.0283803408664069e-05, "loss": 0.6675, "step": 6615 }, { "epoch": 2.531955606582472, "grad_norm": 0.5102835297584534, "learning_rate": 1.0281325433259947e-05, "loss": 0.5918, "step": 6616 }, { "epoch": 2.5323383084577116, "grad_norm": 0.5552318692207336, "learning_rate": 1.0278847440567619e-05, "loss": 0.6697, "step": 6617 }, { "epoch": 2.5327210103329505, "grad_norm": 0.5504249930381775, "learning_rate": 1.0276369430739368e-05, "loss": 0.5711, "step": 6618 }, { "epoch": 2.53310371220819, "grad_norm": 0.5555195212364197, "learning_rate": 1.0273891403927473e-05, "loss": 0.6266, "step": 6619 }, { "epoch": 2.533486414083429, "grad_norm": 0.5213361978530884, "learning_rate": 1.0271413360284217e-05, "loss": 0.6347, "step": 6620 }, { "epoch": 2.533869115958668, "grad_norm": 0.5574822425842285, "learning_rate": 1.0268935299961878e-05, "loss": 0.6132, "step": 6621 }, { "epoch": 2.5342518178339075, "grad_norm": 0.7856463193893433, "learning_rate": 1.0266457223112742e-05, "loss": 0.6401, "step": 6622 }, { "epoch": 2.5346345197091464, "grad_norm": 0.5074942111968994, "learning_rate": 1.0263979129889095e-05, "loss": 0.6464, "step": 6623 }, { "epoch": 2.535017221584386, "grad_norm": 0.5975372195243835, "learning_rate": 1.0261501020443223e-05, "loss": 0.6285, "step": 6624 }, { "epoch": 2.535399923459625, "grad_norm": 0.5190019607543945, "learning_rate": 1.025902289492741e-05, "loss": 0.6725, "step": 6625 }, { "epoch": 2.535782625334864, "grad_norm": 0.5220121741294861, "learning_rate": 1.0256544753493945e-05, "loss": 0.5901, "step": 6626 }, { "epoch": 2.5361653272101035, "grad_norm": 0.559395432472229, "learning_rate": 1.0254066596295114e-05, "loss": 0.6966, "step": 6627 }, { "epoch": 2.5365480290853424, "grad_norm": 0.5433639287948608, "learning_rate": 1.0251588423483205e-05, "loss": 0.6422, "step": 6628 }, { "epoch": 2.5369307309605817, "grad_norm": 0.5784795880317688, "learning_rate": 1.0249110235210518e-05, "loss": 0.6554, "step": 6629 }, { "epoch": 2.5373134328358207, "grad_norm": 0.4956801235675812, "learning_rate": 1.0246632031629335e-05, "loss": 0.5879, "step": 6630 }, { "epoch": 2.53769613471106, "grad_norm": 0.5504612922668457, "learning_rate": 1.024415381289195e-05, "loss": 0.7314, "step": 6631 }, { "epoch": 2.5380788365862994, "grad_norm": 0.5283480286598206, "learning_rate": 1.0241675579150659e-05, "loss": 0.6609, "step": 6632 }, { "epoch": 2.5384615384615383, "grad_norm": 0.5296286344528198, "learning_rate": 1.0239197330557754e-05, "loss": 0.6747, "step": 6633 }, { "epoch": 2.5388442403367777, "grad_norm": 0.5106634497642517, "learning_rate": 1.0236719067265527e-05, "loss": 0.5545, "step": 6634 }, { "epoch": 2.539226942212017, "grad_norm": 0.49890515208244324, "learning_rate": 1.0234240789426284e-05, "loss": 0.5913, "step": 6635 }, { "epoch": 2.539609644087256, "grad_norm": 0.5922748446464539, "learning_rate": 1.0231762497192311e-05, "loss": 0.681, "step": 6636 }, { "epoch": 2.5399923459624953, "grad_norm": 0.5735593438148499, "learning_rate": 1.0229284190715909e-05, "loss": 0.577, "step": 6637 }, { "epoch": 2.5403750478377343, "grad_norm": 0.48662546277046204, "learning_rate": 1.0226805870149378e-05, "loss": 0.6013, "step": 6638 }, { "epoch": 2.5407577497129736, "grad_norm": 0.5246291756629944, "learning_rate": 1.0224327535645018e-05, "loss": 0.6252, "step": 6639 }, { "epoch": 2.5411404515882126, "grad_norm": 0.5349690914154053, "learning_rate": 1.0221849187355131e-05, "loss": 0.6269, "step": 6640 }, { "epoch": 2.541523153463452, "grad_norm": 0.551642656326294, "learning_rate": 1.0219370825432012e-05, "loss": 0.6459, "step": 6641 }, { "epoch": 2.5419058553386913, "grad_norm": 0.5742986798286438, "learning_rate": 1.0216892450027966e-05, "loss": 0.6856, "step": 6642 }, { "epoch": 2.54228855721393, "grad_norm": 0.506157636642456, "learning_rate": 1.0214414061295299e-05, "loss": 0.6014, "step": 6643 }, { "epoch": 2.5426712590891696, "grad_norm": 0.5298565030097961, "learning_rate": 1.021193565938631e-05, "loss": 0.557, "step": 6644 }, { "epoch": 2.543053960964409, "grad_norm": 0.5893973112106323, "learning_rate": 1.0209457244453304e-05, "loss": 0.5939, "step": 6645 }, { "epoch": 2.543436662839648, "grad_norm": 0.5378790497779846, "learning_rate": 1.0206978816648592e-05, "loss": 0.6374, "step": 6646 }, { "epoch": 2.5438193647148872, "grad_norm": 0.5055114030838013, "learning_rate": 1.020450037612447e-05, "loss": 0.62, "step": 6647 }, { "epoch": 2.544202066590126, "grad_norm": 0.53256756067276, "learning_rate": 1.0202021923033254e-05, "loss": 0.6447, "step": 6648 }, { "epoch": 2.5445847684653655, "grad_norm": 0.49721235036849976, "learning_rate": 1.0199543457527247e-05, "loss": 0.6898, "step": 6649 }, { "epoch": 2.5449674703406044, "grad_norm": 0.5416828393936157, "learning_rate": 1.0197064979758762e-05, "loss": 0.6159, "step": 6650 }, { "epoch": 2.545350172215844, "grad_norm": 0.6009681224822998, "learning_rate": 1.0194586489880103e-05, "loss": 0.6434, "step": 6651 }, { "epoch": 2.545732874091083, "grad_norm": 0.5676518678665161, "learning_rate": 1.0192107988043576e-05, "loss": 0.5395, "step": 6652 }, { "epoch": 2.546115575966322, "grad_norm": 0.5620341300964355, "learning_rate": 1.0189629474401501e-05, "loss": 0.6303, "step": 6653 }, { "epoch": 2.5464982778415615, "grad_norm": 0.5416271090507507, "learning_rate": 1.0187150949106185e-05, "loss": 0.6481, "step": 6654 }, { "epoch": 2.546880979716801, "grad_norm": 0.5021787881851196, "learning_rate": 1.0184672412309942e-05, "loss": 0.5933, "step": 6655 }, { "epoch": 2.5472636815920398, "grad_norm": 0.6097206473350525, "learning_rate": 1.0182193864165084e-05, "loss": 0.6362, "step": 6656 }, { "epoch": 2.547646383467279, "grad_norm": 0.5164451599121094, "learning_rate": 1.0179715304823921e-05, "loss": 0.6463, "step": 6657 }, { "epoch": 2.548029085342518, "grad_norm": 0.5403048396110535, "learning_rate": 1.0177236734438768e-05, "loss": 0.5771, "step": 6658 }, { "epoch": 2.5484117872177574, "grad_norm": 0.5198078751564026, "learning_rate": 1.0174758153161946e-05, "loss": 0.6397, "step": 6659 }, { "epoch": 2.5487944890929963, "grad_norm": 0.5898360013961792, "learning_rate": 1.0172279561145763e-05, "loss": 0.5889, "step": 6660 }, { "epoch": 2.5491771909682357, "grad_norm": 0.5368406772613525, "learning_rate": 1.016980095854254e-05, "loss": 0.5935, "step": 6661 }, { "epoch": 2.549559892843475, "grad_norm": 0.5493205189704895, "learning_rate": 1.016732234550459e-05, "loss": 0.7199, "step": 6662 }, { "epoch": 2.549942594718714, "grad_norm": 0.5177925825119019, "learning_rate": 1.0164843722184232e-05, "loss": 0.5914, "step": 6663 }, { "epoch": 2.5503252965939534, "grad_norm": 0.5489147901535034, "learning_rate": 1.0162365088733788e-05, "loss": 0.5981, "step": 6664 }, { "epoch": 2.5507079984691927, "grad_norm": 0.5339605808258057, "learning_rate": 1.0159886445305569e-05, "loss": 0.6546, "step": 6665 }, { "epoch": 2.5510907003444316, "grad_norm": 0.5585513114929199, "learning_rate": 1.0157407792051903e-05, "loss": 0.6111, "step": 6666 }, { "epoch": 2.551473402219671, "grad_norm": 0.5194693207740784, "learning_rate": 1.0154929129125102e-05, "loss": 0.6669, "step": 6667 }, { "epoch": 2.55185610409491, "grad_norm": 0.5015438795089722, "learning_rate": 1.0152450456677488e-05, "loss": 0.6254, "step": 6668 }, { "epoch": 2.5522388059701493, "grad_norm": 0.4994913935661316, "learning_rate": 1.0149971774861386e-05, "loss": 0.6805, "step": 6669 }, { "epoch": 2.552621507845388, "grad_norm": 0.5338549613952637, "learning_rate": 1.0147493083829116e-05, "loss": 0.5904, "step": 6670 }, { "epoch": 2.5530042097206276, "grad_norm": 0.5345609188079834, "learning_rate": 1.0145014383733e-05, "loss": 0.659, "step": 6671 }, { "epoch": 2.553386911595867, "grad_norm": 0.508087158203125, "learning_rate": 1.014253567472536e-05, "loss": 0.656, "step": 6672 }, { "epoch": 2.553769613471106, "grad_norm": 0.5439314842224121, "learning_rate": 1.014005695695852e-05, "loss": 0.621, "step": 6673 }, { "epoch": 2.5541523153463452, "grad_norm": 0.4904763400554657, "learning_rate": 1.0137578230584804e-05, "loss": 0.6369, "step": 6674 }, { "epoch": 2.5545350172215846, "grad_norm": 0.548213005065918, "learning_rate": 1.0135099495756535e-05, "loss": 0.6085, "step": 6675 }, { "epoch": 2.5549177190968235, "grad_norm": 0.5310295224189758, "learning_rate": 1.0132620752626041e-05, "loss": 0.6759, "step": 6676 }, { "epoch": 2.555300420972063, "grad_norm": 0.5192717909812927, "learning_rate": 1.0130142001345644e-05, "loss": 0.6014, "step": 6677 }, { "epoch": 2.555683122847302, "grad_norm": 0.573000967502594, "learning_rate": 1.012766324206767e-05, "loss": 0.6341, "step": 6678 }, { "epoch": 2.556065824722541, "grad_norm": 0.509099543094635, "learning_rate": 1.0125184474944449e-05, "loss": 0.6903, "step": 6679 }, { "epoch": 2.55644852659778, "grad_norm": 0.5645133852958679, "learning_rate": 1.0122705700128307e-05, "loss": 0.7022, "step": 6680 }, { "epoch": 2.5568312284730195, "grad_norm": 0.5061400532722473, "learning_rate": 1.0120226917771567e-05, "loss": 0.6405, "step": 6681 }, { "epoch": 2.557213930348259, "grad_norm": 0.5923013687133789, "learning_rate": 1.0117748128026561e-05, "loss": 0.6319, "step": 6682 }, { "epoch": 2.5575966322234978, "grad_norm": 0.5489556193351746, "learning_rate": 1.0115269331045615e-05, "loss": 0.6646, "step": 6683 }, { "epoch": 2.557979334098737, "grad_norm": 0.5314682126045227, "learning_rate": 1.0112790526981062e-05, "loss": 0.6014, "step": 6684 }, { "epoch": 2.5583620359739765, "grad_norm": 0.5399160385131836, "learning_rate": 1.0110311715985228e-05, "loss": 0.6918, "step": 6685 }, { "epoch": 2.5587447378492154, "grad_norm": 0.5402072668075562, "learning_rate": 1.0107832898210438e-05, "loss": 0.6093, "step": 6686 }, { "epoch": 2.5591274397244548, "grad_norm": 0.5345472693443298, "learning_rate": 1.010535407380903e-05, "loss": 0.6117, "step": 6687 }, { "epoch": 2.5595101415996937, "grad_norm": 0.493563711643219, "learning_rate": 1.010287524293333e-05, "loss": 0.773, "step": 6688 }, { "epoch": 2.559892843474933, "grad_norm": 0.5229853987693787, "learning_rate": 1.0100396405735673e-05, "loss": 0.6721, "step": 6689 }, { "epoch": 2.560275545350172, "grad_norm": 0.5297348499298096, "learning_rate": 1.0097917562368386e-05, "loss": 0.6848, "step": 6690 }, { "epoch": 2.5606582472254114, "grad_norm": 0.6580557227134705, "learning_rate": 1.00954387129838e-05, "loss": 0.6534, "step": 6691 }, { "epoch": 2.5610409491006507, "grad_norm": 0.5159618258476257, "learning_rate": 1.0092959857734247e-05, "loss": 0.6173, "step": 6692 }, { "epoch": 2.5614236509758896, "grad_norm": 0.5488163828849792, "learning_rate": 1.0090480996772067e-05, "loss": 0.6368, "step": 6693 }, { "epoch": 2.561806352851129, "grad_norm": 0.5847957730293274, "learning_rate": 1.008800213024958e-05, "loss": 0.6838, "step": 6694 }, { "epoch": 2.5621890547263684, "grad_norm": 0.4885942339897156, "learning_rate": 1.0085523258319131e-05, "loss": 0.5672, "step": 6695 }, { "epoch": 2.5625717566016073, "grad_norm": 0.559786856174469, "learning_rate": 1.0083044381133044e-05, "loss": 0.5768, "step": 6696 }, { "epoch": 2.5629544584768467, "grad_norm": 0.5525892972946167, "learning_rate": 1.008056549884366e-05, "loss": 0.6319, "step": 6697 }, { "epoch": 2.5633371603520856, "grad_norm": 0.8743525743484497, "learning_rate": 1.0078086611603308e-05, "loss": 0.657, "step": 6698 }, { "epoch": 2.563719862227325, "grad_norm": 0.5515022873878479, "learning_rate": 1.0075607719564324e-05, "loss": 0.6241, "step": 6699 }, { "epoch": 2.564102564102564, "grad_norm": 0.5043615102767944, "learning_rate": 1.0073128822879044e-05, "loss": 0.5993, "step": 6700 }, { "epoch": 2.5644852659778032, "grad_norm": 0.5443761348724365, "learning_rate": 1.00706499216998e-05, "loss": 0.6495, "step": 6701 }, { "epoch": 2.5648679678530426, "grad_norm": 0.5062729120254517, "learning_rate": 1.0068171016178928e-05, "loss": 0.6806, "step": 6702 }, { "epoch": 2.5652506697282815, "grad_norm": 0.536253035068512, "learning_rate": 1.0065692106468766e-05, "loss": 0.6208, "step": 6703 }, { "epoch": 2.565633371603521, "grad_norm": 0.5468579530715942, "learning_rate": 1.0063213192721643e-05, "loss": 0.6332, "step": 6704 }, { "epoch": 2.5660160734787603, "grad_norm": 0.5647099018096924, "learning_rate": 1.0060734275089906e-05, "loss": 0.6219, "step": 6705 }, { "epoch": 2.566398775353999, "grad_norm": 0.5383147597312927, "learning_rate": 1.005825535372588e-05, "loss": 0.6684, "step": 6706 }, { "epoch": 2.5667814772292386, "grad_norm": 0.5484620332717896, "learning_rate": 1.0055776428781907e-05, "loss": 0.6564, "step": 6707 }, { "epoch": 2.5671641791044775, "grad_norm": 0.5081182718276978, "learning_rate": 1.0053297500410322e-05, "loss": 0.6479, "step": 6708 }, { "epoch": 2.567546880979717, "grad_norm": 0.5789680480957031, "learning_rate": 1.0050818568763462e-05, "loss": 0.5952, "step": 6709 }, { "epoch": 2.5679295828549558, "grad_norm": 0.613879382610321, "learning_rate": 1.0048339633993666e-05, "loss": 0.6721, "step": 6710 }, { "epoch": 2.568312284730195, "grad_norm": 0.534935474395752, "learning_rate": 1.0045860696253272e-05, "loss": 0.6076, "step": 6711 }, { "epoch": 2.5686949866054345, "grad_norm": 0.5226503014564514, "learning_rate": 1.004338175569461e-05, "loss": 0.5702, "step": 6712 }, { "epoch": 2.5690776884806734, "grad_norm": 0.5236576795578003, "learning_rate": 1.0040902812470023e-05, "loss": 0.6198, "step": 6713 }, { "epoch": 2.569460390355913, "grad_norm": 0.5426762104034424, "learning_rate": 1.003842386673185e-05, "loss": 0.6669, "step": 6714 }, { "epoch": 2.569843092231152, "grad_norm": 0.7125771641731262, "learning_rate": 1.0035944918632429e-05, "loss": 0.6827, "step": 6715 }, { "epoch": 2.570225794106391, "grad_norm": 0.5959031581878662, "learning_rate": 1.0033465968324097e-05, "loss": 0.6205, "step": 6716 }, { "epoch": 2.5706084959816304, "grad_norm": 0.561712384223938, "learning_rate": 1.0030987015959185e-05, "loss": 0.5877, "step": 6717 }, { "epoch": 2.5709911978568694, "grad_norm": 0.571399450302124, "learning_rate": 1.0028508061690044e-05, "loss": 0.6845, "step": 6718 }, { "epoch": 2.5713738997321087, "grad_norm": 0.5502716302871704, "learning_rate": 1.0026029105669005e-05, "loss": 0.633, "step": 6719 }, { "epoch": 2.5717566016073476, "grad_norm": 0.5137166380882263, "learning_rate": 1.0023550148048405e-05, "loss": 0.6078, "step": 6720 }, { "epoch": 2.572139303482587, "grad_norm": 0.5245959758758545, "learning_rate": 1.0021071188980586e-05, "loss": 0.6222, "step": 6721 }, { "epoch": 2.5725220053578264, "grad_norm": 0.4954027235507965, "learning_rate": 1.0018592228617885e-05, "loss": 0.6382, "step": 6722 }, { "epoch": 2.5729047072330653, "grad_norm": 0.505368173122406, "learning_rate": 1.0016113267112642e-05, "loss": 0.6538, "step": 6723 }, { "epoch": 2.5732874091083047, "grad_norm": 0.5492420196533203, "learning_rate": 1.0013634304617197e-05, "loss": 0.7403, "step": 6724 }, { "epoch": 2.573670110983544, "grad_norm": 0.5109663605690002, "learning_rate": 1.0011155341283884e-05, "loss": 0.6558, "step": 6725 }, { "epoch": 2.574052812858783, "grad_norm": 0.5357462167739868, "learning_rate": 1.000867637726505e-05, "loss": 0.6687, "step": 6726 }, { "epoch": 2.5744355147340223, "grad_norm": 0.5467975735664368, "learning_rate": 1.0006197412713025e-05, "loss": 0.5835, "step": 6727 }, { "epoch": 2.5748182166092612, "grad_norm": 0.544384241104126, "learning_rate": 1.0003718447780156e-05, "loss": 0.6055, "step": 6728 }, { "epoch": 2.5752009184845006, "grad_norm": 0.5519154071807861, "learning_rate": 1.0001239482618777e-05, "loss": 0.6573, "step": 6729 }, { "epoch": 2.5755836203597395, "grad_norm": 0.4885721206665039, "learning_rate": 9.998760517381226e-06, "loss": 0.657, "step": 6730 }, { "epoch": 2.575966322234979, "grad_norm": 0.5390810966491699, "learning_rate": 9.996281552219846e-06, "loss": 0.6351, "step": 6731 }, { "epoch": 2.5763490241102183, "grad_norm": 0.5144046545028687, "learning_rate": 9.993802587286977e-06, "loss": 0.6063, "step": 6732 }, { "epoch": 2.576731725985457, "grad_norm": 0.4996858835220337, "learning_rate": 9.991323622734953e-06, "loss": 0.5366, "step": 6733 }, { "epoch": 2.5771144278606966, "grad_norm": 0.485145628452301, "learning_rate": 9.988844658716116e-06, "loss": 0.6414, "step": 6734 }, { "epoch": 2.577497129735936, "grad_norm": 0.5264871120452881, "learning_rate": 9.986365695382808e-06, "loss": 0.6369, "step": 6735 }, { "epoch": 2.577879831611175, "grad_norm": 0.5144298672676086, "learning_rate": 9.983886732887361e-06, "loss": 0.6099, "step": 6736 }, { "epoch": 2.578262533486414, "grad_norm": 0.5481277704238892, "learning_rate": 9.981407771382117e-06, "loss": 0.6141, "step": 6737 }, { "epoch": 2.578645235361653, "grad_norm": 0.5097438097000122, "learning_rate": 9.978928811019419e-06, "loss": 0.5663, "step": 6738 }, { "epoch": 2.5790279372368925, "grad_norm": 0.5250925421714783, "learning_rate": 9.976449851951599e-06, "loss": 0.6634, "step": 6739 }, { "epoch": 2.5794106391121314, "grad_norm": 0.5370386242866516, "learning_rate": 9.973970894331e-06, "loss": 0.629, "step": 6740 }, { "epoch": 2.579793340987371, "grad_norm": 0.5534359812736511, "learning_rate": 9.971491938309963e-06, "loss": 0.6219, "step": 6741 }, { "epoch": 2.58017604286261, "grad_norm": 0.5795691609382629, "learning_rate": 9.969012984040816e-06, "loss": 0.708, "step": 6742 }, { "epoch": 2.580558744737849, "grad_norm": 0.5289720892906189, "learning_rate": 9.966534031675907e-06, "loss": 0.684, "step": 6743 }, { "epoch": 2.5809414466130884, "grad_norm": 0.5598954558372498, "learning_rate": 9.96405508136757e-06, "loss": 0.6015, "step": 6744 }, { "epoch": 2.581324148488328, "grad_norm": 0.5364937782287598, "learning_rate": 9.961576133268151e-06, "loss": 0.574, "step": 6745 }, { "epoch": 2.5817068503635667, "grad_norm": 0.5293288230895996, "learning_rate": 9.959097187529978e-06, "loss": 0.6952, "step": 6746 }, { "epoch": 2.582089552238806, "grad_norm": 0.5263007879257202, "learning_rate": 9.956618244305391e-06, "loss": 0.612, "step": 6747 }, { "epoch": 2.582472254114045, "grad_norm": 0.5531501173973083, "learning_rate": 9.954139303746733e-06, "loss": 0.6733, "step": 6748 }, { "epoch": 2.5828549559892844, "grad_norm": 0.5254088044166565, "learning_rate": 9.951660366006335e-06, "loss": 0.6254, "step": 6749 }, { "epoch": 2.5832376578645233, "grad_norm": 0.5562371015548706, "learning_rate": 9.94918143123654e-06, "loss": 0.6293, "step": 6750 }, { "epoch": 2.5836203597397627, "grad_norm": 0.5313664674758911, "learning_rate": 9.946702499589683e-06, "loss": 0.6762, "step": 6751 }, { "epoch": 2.584003061615002, "grad_norm": 0.6101784110069275, "learning_rate": 9.944223571218096e-06, "loss": 0.6476, "step": 6752 }, { "epoch": 2.584385763490241, "grad_norm": 0.6516756415367126, "learning_rate": 9.941744646274121e-06, "loss": 0.6051, "step": 6753 }, { "epoch": 2.5847684653654803, "grad_norm": 0.5324227809906006, "learning_rate": 9.939265724910101e-06, "loss": 0.6348, "step": 6754 }, { "epoch": 2.5851511672407197, "grad_norm": 0.5380114912986755, "learning_rate": 9.936786807278358e-06, "loss": 0.6968, "step": 6755 }, { "epoch": 2.5855338691159586, "grad_norm": 0.6099985837936401, "learning_rate": 9.93430789353124e-06, "loss": 0.6122, "step": 6756 }, { "epoch": 2.585916570991198, "grad_norm": 0.5337097644805908, "learning_rate": 9.931828983821074e-06, "loss": 0.6707, "step": 6757 }, { "epoch": 2.586299272866437, "grad_norm": 0.5289077162742615, "learning_rate": 9.929350078300203e-06, "loss": 0.6202, "step": 6758 }, { "epoch": 2.5866819747416763, "grad_norm": 0.5291690230369568, "learning_rate": 9.92687117712096e-06, "loss": 0.673, "step": 6759 }, { "epoch": 2.587064676616915, "grad_norm": 0.5178725123405457, "learning_rate": 9.924392280435676e-06, "loss": 0.6491, "step": 6760 }, { "epoch": 2.5874473784921546, "grad_norm": 0.5099597573280334, "learning_rate": 9.921913388396697e-06, "loss": 0.6412, "step": 6761 }, { "epoch": 2.587830080367394, "grad_norm": 0.5498343110084534, "learning_rate": 9.919434501156344e-06, "loss": 0.6061, "step": 6762 }, { "epoch": 2.588212782242633, "grad_norm": 0.592340886592865, "learning_rate": 9.916955618866954e-06, "loss": 0.6277, "step": 6763 }, { "epoch": 2.588595484117872, "grad_norm": 0.5904368162155151, "learning_rate": 9.914476741680872e-06, "loss": 0.7354, "step": 6764 }, { "epoch": 2.5889781859931116, "grad_norm": 0.5085833668708801, "learning_rate": 9.911997869750421e-06, "loss": 0.6379, "step": 6765 }, { "epoch": 2.5893608878683505, "grad_norm": 0.5506567358970642, "learning_rate": 9.909519003227937e-06, "loss": 0.6179, "step": 6766 }, { "epoch": 2.58974358974359, "grad_norm": 0.5130301117897034, "learning_rate": 9.907040142265754e-06, "loss": 0.6812, "step": 6767 }, { "epoch": 2.590126291618829, "grad_norm": 0.5425006151199341, "learning_rate": 9.904561287016203e-06, "loss": 0.6688, "step": 6768 }, { "epoch": 2.590508993494068, "grad_norm": 0.5308319330215454, "learning_rate": 9.902082437631617e-06, "loss": 0.7117, "step": 6769 }, { "epoch": 2.590891695369307, "grad_norm": 0.518113911151886, "learning_rate": 9.899603594264327e-06, "loss": 0.5881, "step": 6770 }, { "epoch": 2.5912743972445464, "grad_norm": 0.5398553013801575, "learning_rate": 9.897124757066672e-06, "loss": 0.6766, "step": 6771 }, { "epoch": 2.591657099119786, "grad_norm": 0.5225513577461243, "learning_rate": 9.894645926190973e-06, "loss": 0.6197, "step": 6772 }, { "epoch": 2.5920398009950247, "grad_norm": 0.5725724697113037, "learning_rate": 9.892167101789563e-06, "loss": 0.5685, "step": 6773 }, { "epoch": 2.592422502870264, "grad_norm": 0.5248197913169861, "learning_rate": 9.889688284014777e-06, "loss": 0.6217, "step": 6774 }, { "epoch": 2.5928052047455035, "grad_norm": 0.6106010675430298, "learning_rate": 9.887209473018941e-06, "loss": 0.6227, "step": 6775 }, { "epoch": 2.5931879066207424, "grad_norm": 0.5340655446052551, "learning_rate": 9.884730668954386e-06, "loss": 0.6944, "step": 6776 }, { "epoch": 2.5935706084959818, "grad_norm": 0.5059930086135864, "learning_rate": 9.882251871973442e-06, "loss": 0.653, "step": 6777 }, { "epoch": 2.5939533103712207, "grad_norm": 0.5325576663017273, "learning_rate": 9.879773082228435e-06, "loss": 0.6574, "step": 6778 }, { "epoch": 2.59433601224646, "grad_norm": 0.5199963450431824, "learning_rate": 9.877294299871696e-06, "loss": 0.5408, "step": 6779 }, { "epoch": 2.594718714121699, "grad_norm": 0.5180482864379883, "learning_rate": 9.874815525055556e-06, "loss": 0.6387, "step": 6780 }, { "epoch": 2.5951014159969383, "grad_norm": 0.5477609038352966, "learning_rate": 9.872336757932332e-06, "loss": 0.6728, "step": 6781 }, { "epoch": 2.5954841178721777, "grad_norm": 0.5192123651504517, "learning_rate": 9.86985799865436e-06, "loss": 0.6516, "step": 6782 }, { "epoch": 2.5958668197474166, "grad_norm": 0.5314677953720093, "learning_rate": 9.867379247373962e-06, "loss": 0.6483, "step": 6783 }, { "epoch": 2.596249521622656, "grad_norm": 0.5101203322410583, "learning_rate": 9.864900504243468e-06, "loss": 0.6443, "step": 6784 }, { "epoch": 2.5966322234978954, "grad_norm": 0.5255680084228516, "learning_rate": 9.8624217694152e-06, "loss": 0.5928, "step": 6785 }, { "epoch": 2.5970149253731343, "grad_norm": 0.5062272548675537, "learning_rate": 9.85994304304148e-06, "loss": 0.6038, "step": 6786 }, { "epoch": 2.5973976272483736, "grad_norm": 0.5429773926734924, "learning_rate": 9.857464325274643e-06, "loss": 0.6641, "step": 6787 }, { "epoch": 2.5977803291236126, "grad_norm": 0.5237653851509094, "learning_rate": 9.854985616267002e-06, "loss": 0.6262, "step": 6788 }, { "epoch": 2.598163030998852, "grad_norm": 0.5496804714202881, "learning_rate": 9.852506916170884e-06, "loss": 0.6541, "step": 6789 }, { "epoch": 2.598545732874091, "grad_norm": 0.5407880544662476, "learning_rate": 9.850028225138619e-06, "loss": 0.5836, "step": 6790 }, { "epoch": 2.59892843474933, "grad_norm": 0.5207484364509583, "learning_rate": 9.847549543322514e-06, "loss": 0.5893, "step": 6791 }, { "epoch": 2.5993111366245696, "grad_norm": 0.6814168095588684, "learning_rate": 9.845070870874902e-06, "loss": 0.7614, "step": 6792 }, { "epoch": 2.5996938384998085, "grad_norm": 0.5168951749801636, "learning_rate": 9.842592207948102e-06, "loss": 0.5932, "step": 6793 }, { "epoch": 2.600076540375048, "grad_norm": 0.5920395851135254, "learning_rate": 9.840113554694433e-06, "loss": 0.5529, "step": 6794 }, { "epoch": 2.6004592422502872, "grad_norm": 0.5411787629127502, "learning_rate": 9.837634911266215e-06, "loss": 0.6637, "step": 6795 }, { "epoch": 2.600841944125526, "grad_norm": 0.5188656449317932, "learning_rate": 9.835156277815768e-06, "loss": 0.6316, "step": 6796 }, { "epoch": 2.6012246460007655, "grad_norm": 0.5287262797355652, "learning_rate": 9.832677654495413e-06, "loss": 0.6101, "step": 6797 }, { "epoch": 2.6016073478760045, "grad_norm": 0.5208128690719604, "learning_rate": 9.830199041457462e-06, "loss": 0.6494, "step": 6798 }, { "epoch": 2.601990049751244, "grad_norm": 0.5482460260391235, "learning_rate": 9.827720438854238e-06, "loss": 0.6762, "step": 6799 }, { "epoch": 2.6023727516264827, "grad_norm": 0.5626282095909119, "learning_rate": 9.825241846838058e-06, "loss": 0.6938, "step": 6800 }, { "epoch": 2.602755453501722, "grad_norm": 0.602183997631073, "learning_rate": 9.822763265561233e-06, "loss": 0.5912, "step": 6801 }, { "epoch": 2.6031381553769615, "grad_norm": 0.4537360668182373, "learning_rate": 9.820284695176082e-06, "loss": 0.5647, "step": 6802 }, { "epoch": 2.6035208572522004, "grad_norm": 0.511380136013031, "learning_rate": 9.81780613583492e-06, "loss": 0.6582, "step": 6803 }, { "epoch": 2.6039035591274398, "grad_norm": 0.5324100852012634, "learning_rate": 9.81532758769006e-06, "loss": 0.701, "step": 6804 }, { "epoch": 2.604286261002679, "grad_norm": 0.5450876951217651, "learning_rate": 9.812849050893815e-06, "loss": 0.6957, "step": 6805 }, { "epoch": 2.604668962877918, "grad_norm": 0.5392612814903259, "learning_rate": 9.810370525598502e-06, "loss": 0.6375, "step": 6806 }, { "epoch": 2.6050516647531574, "grad_norm": 0.49901553988456726, "learning_rate": 9.807892011956427e-06, "loss": 0.6173, "step": 6807 }, { "epoch": 2.6054343666283963, "grad_norm": 0.5029535889625549, "learning_rate": 9.8054135101199e-06, "loss": 0.6076, "step": 6808 }, { "epoch": 2.6058170685036357, "grad_norm": 0.5110498666763306, "learning_rate": 9.80293502024124e-06, "loss": 0.6326, "step": 6809 }, { "epoch": 2.6061997703788746, "grad_norm": 0.5179443955421448, "learning_rate": 9.800456542472754e-06, "loss": 0.683, "step": 6810 }, { "epoch": 2.606582472254114, "grad_norm": 0.5396177768707275, "learning_rate": 9.797978076966748e-06, "loss": 0.6295, "step": 6811 }, { "epoch": 2.6069651741293534, "grad_norm": 0.5025389790534973, "learning_rate": 9.795499623875531e-06, "loss": 0.6167, "step": 6812 }, { "epoch": 2.6073478760045923, "grad_norm": 0.5644391179084778, "learning_rate": 9.793021183351413e-06, "loss": 0.6487, "step": 6813 }, { "epoch": 2.6077305778798316, "grad_norm": 0.5034900307655334, "learning_rate": 9.790542755546698e-06, "loss": 0.6351, "step": 6814 }, { "epoch": 2.608113279755071, "grad_norm": 0.5038338899612427, "learning_rate": 9.788064340613692e-06, "loss": 0.6503, "step": 6815 }, { "epoch": 2.60849598163031, "grad_norm": 0.4987518787384033, "learning_rate": 9.785585938704706e-06, "loss": 0.6042, "step": 6816 }, { "epoch": 2.6088786835055493, "grad_norm": 0.5768007636070251, "learning_rate": 9.783107549972037e-06, "loss": 0.6746, "step": 6817 }, { "epoch": 2.6092613853807882, "grad_norm": 0.524143397808075, "learning_rate": 9.78062917456799e-06, "loss": 0.6115, "step": 6818 }, { "epoch": 2.6096440872560276, "grad_norm": 0.5121735334396362, "learning_rate": 9.778150812644874e-06, "loss": 0.6273, "step": 6819 }, { "epoch": 2.6100267891312665, "grad_norm": 0.5229863524436951, "learning_rate": 9.775672464354983e-06, "loss": 0.6773, "step": 6820 }, { "epoch": 2.610409491006506, "grad_norm": 0.5790531635284424, "learning_rate": 9.773194129850624e-06, "loss": 0.6278, "step": 6821 }, { "epoch": 2.6107921928817452, "grad_norm": 0.5824530124664307, "learning_rate": 9.770715809284093e-06, "loss": 0.6038, "step": 6822 }, { "epoch": 2.611174894756984, "grad_norm": 0.5263776779174805, "learning_rate": 9.768237502807692e-06, "loss": 0.6278, "step": 6823 }, { "epoch": 2.6115575966322235, "grad_norm": 0.5257768630981445, "learning_rate": 9.76575921057372e-06, "loss": 0.6419, "step": 6824 }, { "epoch": 2.611940298507463, "grad_norm": 0.5419763326644897, "learning_rate": 9.763280932734472e-06, "loss": 0.6846, "step": 6825 }, { "epoch": 2.612323000382702, "grad_norm": 0.5310291647911072, "learning_rate": 9.760802669442251e-06, "loss": 0.5607, "step": 6826 }, { "epoch": 2.612705702257941, "grad_norm": 0.5402430295944214, "learning_rate": 9.758324420849345e-06, "loss": 0.6635, "step": 6827 }, { "epoch": 2.61308840413318, "grad_norm": 0.5216154456138611, "learning_rate": 9.755846187108052e-06, "loss": 0.6374, "step": 6828 }, { "epoch": 2.6134711060084195, "grad_norm": 0.5334552526473999, "learning_rate": 9.75336796837067e-06, "loss": 0.5625, "step": 6829 }, { "epoch": 2.6138538078836584, "grad_norm": 0.5307459831237793, "learning_rate": 9.750889764789485e-06, "loss": 0.6638, "step": 6830 }, { "epoch": 2.6142365097588978, "grad_norm": 0.552675724029541, "learning_rate": 9.748411576516794e-06, "loss": 0.6208, "step": 6831 }, { "epoch": 2.614619211634137, "grad_norm": 0.5200284719467163, "learning_rate": 9.745933403704891e-06, "loss": 0.6008, "step": 6832 }, { "epoch": 2.615001913509376, "grad_norm": 0.5358573198318481, "learning_rate": 9.743455246506059e-06, "loss": 0.5562, "step": 6833 }, { "epoch": 2.6153846153846154, "grad_norm": 0.5695062279701233, "learning_rate": 9.740977105072591e-06, "loss": 0.648, "step": 6834 }, { "epoch": 2.615767317259855, "grad_norm": 0.551428496837616, "learning_rate": 9.738498979556777e-06, "loss": 0.671, "step": 6835 }, { "epoch": 2.6161500191350937, "grad_norm": 0.529944896697998, "learning_rate": 9.736020870110906e-06, "loss": 0.6881, "step": 6836 }, { "epoch": 2.616532721010333, "grad_norm": 0.5365170240402222, "learning_rate": 9.73354277688726e-06, "loss": 0.5393, "step": 6837 }, { "epoch": 2.616915422885572, "grad_norm": 0.5316312313079834, "learning_rate": 9.731064700038123e-06, "loss": 0.6572, "step": 6838 }, { "epoch": 2.6172981247608114, "grad_norm": 0.5247485637664795, "learning_rate": 9.728586639715788e-06, "loss": 0.7433, "step": 6839 }, { "epoch": 2.6176808266360503, "grad_norm": 0.5425074696540833, "learning_rate": 9.72610859607253e-06, "loss": 0.643, "step": 6840 }, { "epoch": 2.6180635285112897, "grad_norm": 0.48876115679740906, "learning_rate": 9.723630569260634e-06, "loss": 0.5892, "step": 6841 }, { "epoch": 2.618446230386529, "grad_norm": 0.5314688086509705, "learning_rate": 9.721152559432385e-06, "loss": 0.6709, "step": 6842 }, { "epoch": 2.618828932261768, "grad_norm": 0.48900702595710754, "learning_rate": 9.718674566740056e-06, "loss": 0.6659, "step": 6843 }, { "epoch": 2.6192116341370073, "grad_norm": 0.5232649445533752, "learning_rate": 9.716196591335933e-06, "loss": 0.5806, "step": 6844 }, { "epoch": 2.6195943360122467, "grad_norm": 0.5152409076690674, "learning_rate": 9.713718633372294e-06, "loss": 0.5868, "step": 6845 }, { "epoch": 2.6199770378874856, "grad_norm": 0.5175122022628784, "learning_rate": 9.71124069300141e-06, "loss": 0.7103, "step": 6846 }, { "epoch": 2.620359739762725, "grad_norm": 0.5598313808441162, "learning_rate": 9.708762770375565e-06, "loss": 0.6044, "step": 6847 }, { "epoch": 2.620742441637964, "grad_norm": 0.62124103307724, "learning_rate": 9.706284865647028e-06, "loss": 0.6015, "step": 6848 }, { "epoch": 2.6211251435132032, "grad_norm": 0.4997808635234833, "learning_rate": 9.703806978968075e-06, "loss": 0.6218, "step": 6849 }, { "epoch": 2.621507845388442, "grad_norm": 0.5312077403068542, "learning_rate": 9.701329110490978e-06, "loss": 0.5693, "step": 6850 }, { "epoch": 2.6218905472636815, "grad_norm": 0.49382129311561584, "learning_rate": 9.69885126036801e-06, "loss": 0.6531, "step": 6851 }, { "epoch": 2.622273249138921, "grad_norm": 0.5489920377731323, "learning_rate": 9.696373428751443e-06, "loss": 0.6222, "step": 6852 }, { "epoch": 2.62265595101416, "grad_norm": 0.49248358607292175, "learning_rate": 9.693895615793542e-06, "loss": 0.6771, "step": 6853 }, { "epoch": 2.623038652889399, "grad_norm": 0.543222963809967, "learning_rate": 9.691417821646579e-06, "loss": 0.6506, "step": 6854 }, { "epoch": 2.6234213547646386, "grad_norm": 0.4877927303314209, "learning_rate": 9.688940046462823e-06, "loss": 0.6486, "step": 6855 }, { "epoch": 2.6238040566398775, "grad_norm": 0.5432459712028503, "learning_rate": 9.686462290394533e-06, "loss": 0.678, "step": 6856 }, { "epoch": 2.624186758515117, "grad_norm": 0.5599781274795532, "learning_rate": 9.683984553593977e-06, "loss": 0.657, "step": 6857 }, { "epoch": 2.6245694603903558, "grad_norm": 0.5509884357452393, "learning_rate": 9.681506836213421e-06, "loss": 0.5368, "step": 6858 }, { "epoch": 2.624952162265595, "grad_norm": 0.5348765254020691, "learning_rate": 9.679029138405125e-06, "loss": 0.5578, "step": 6859 }, { "epoch": 2.625334864140834, "grad_norm": 0.5163488984107971, "learning_rate": 9.67655146032135e-06, "loss": 0.6169, "step": 6860 }, { "epoch": 2.6257175660160734, "grad_norm": 0.4899837076663971, "learning_rate": 9.67407380211436e-06, "loss": 0.6043, "step": 6861 }, { "epoch": 2.626100267891313, "grad_norm": 0.5891813635826111, "learning_rate": 9.671596163936408e-06, "loss": 0.7065, "step": 6862 }, { "epoch": 2.6264829697665517, "grad_norm": 0.5541038513183594, "learning_rate": 9.66911854593975e-06, "loss": 0.6227, "step": 6863 }, { "epoch": 2.626865671641791, "grad_norm": 0.5463547110557556, "learning_rate": 9.666640948276648e-06, "loss": 0.6377, "step": 6864 }, { "epoch": 2.6272483735170304, "grad_norm": 0.5521708726882935, "learning_rate": 9.664163371099363e-06, "loss": 0.6258, "step": 6865 }, { "epoch": 2.6276310753922694, "grad_norm": 0.6752772927284241, "learning_rate": 9.661685814560133e-06, "loss": 0.6704, "step": 6866 }, { "epoch": 2.6280137772675087, "grad_norm": 0.6297165751457214, "learning_rate": 9.65920827881122e-06, "loss": 0.6976, "step": 6867 }, { "epoch": 2.6283964791427477, "grad_norm": 0.50706946849823, "learning_rate": 9.656730764004878e-06, "loss": 0.6507, "step": 6868 }, { "epoch": 2.628779181017987, "grad_norm": 0.518640398979187, "learning_rate": 9.654253270293348e-06, "loss": 0.6084, "step": 6869 }, { "epoch": 2.629161882893226, "grad_norm": 0.5527084469795227, "learning_rate": 9.651775797828884e-06, "loss": 0.566, "step": 6870 }, { "epoch": 2.6295445847684653, "grad_norm": 0.5207298994064331, "learning_rate": 9.649298346763737e-06, "loss": 0.6938, "step": 6871 }, { "epoch": 2.6299272866437047, "grad_norm": 0.540641188621521, "learning_rate": 9.646820917250144e-06, "loss": 0.6632, "step": 6872 }, { "epoch": 2.6303099885189436, "grad_norm": 0.5627807974815369, "learning_rate": 9.644343509440356e-06, "loss": 0.637, "step": 6873 }, { "epoch": 2.630692690394183, "grad_norm": 0.5418567657470703, "learning_rate": 9.641866123486614e-06, "loss": 0.6518, "step": 6874 }, { "epoch": 2.6310753922694223, "grad_norm": 0.51495361328125, "learning_rate": 9.639388759541163e-06, "loss": 0.6653, "step": 6875 }, { "epoch": 2.6314580941446613, "grad_norm": 0.5637997388839722, "learning_rate": 9.636911417756239e-06, "loss": 0.6317, "step": 6876 }, { "epoch": 2.6318407960199006, "grad_norm": 0.5368965268135071, "learning_rate": 9.634434098284084e-06, "loss": 0.5942, "step": 6877 }, { "epoch": 2.6322234978951395, "grad_norm": 0.574362576007843, "learning_rate": 9.631956801276937e-06, "loss": 0.6786, "step": 6878 }, { "epoch": 2.632606199770379, "grad_norm": 0.525545597076416, "learning_rate": 9.629479526887032e-06, "loss": 0.6856, "step": 6879 }, { "epoch": 2.632988901645618, "grad_norm": 0.5005756616592407, "learning_rate": 9.627002275266606e-06, "loss": 0.6658, "step": 6880 }, { "epoch": 2.633371603520857, "grad_norm": 0.5628889799118042, "learning_rate": 9.624525046567897e-06, "loss": 0.6244, "step": 6881 }, { "epoch": 2.6337543053960966, "grad_norm": 0.5115458369255066, "learning_rate": 9.622047840943127e-06, "loss": 0.6412, "step": 6882 }, { "epoch": 2.6341370072713355, "grad_norm": 0.5374618768692017, "learning_rate": 9.619570658544531e-06, "loss": 0.7245, "step": 6883 }, { "epoch": 2.634519709146575, "grad_norm": 0.5470808744430542, "learning_rate": 9.617093499524346e-06, "loss": 0.6378, "step": 6884 }, { "epoch": 2.634902411021814, "grad_norm": 0.5616141557693481, "learning_rate": 9.61461636403479e-06, "loss": 0.5882, "step": 6885 }, { "epoch": 2.635285112897053, "grad_norm": 0.5281970500946045, "learning_rate": 9.612139252228093e-06, "loss": 0.6302, "step": 6886 }, { "epoch": 2.6356678147722925, "grad_norm": 0.6003969311714172, "learning_rate": 9.609662164256483e-06, "loss": 0.7788, "step": 6887 }, { "epoch": 2.6360505166475314, "grad_norm": 0.5629487633705139, "learning_rate": 9.607185100272181e-06, "loss": 0.6193, "step": 6888 }, { "epoch": 2.636433218522771, "grad_norm": 0.5054855346679688, "learning_rate": 9.604708060427408e-06, "loss": 0.6454, "step": 6889 }, { "epoch": 2.6368159203980097, "grad_norm": 0.5614744424819946, "learning_rate": 9.60223104487439e-06, "loss": 0.6156, "step": 6890 }, { "epoch": 2.637198622273249, "grad_norm": 0.8165290355682373, "learning_rate": 9.599754053765345e-06, "loss": 0.6874, "step": 6891 }, { "epoch": 2.6375813241484884, "grad_norm": 0.5607129335403442, "learning_rate": 9.597277087252484e-06, "loss": 0.6149, "step": 6892 }, { "epoch": 2.6379640260237274, "grad_norm": 0.5260421633720398, "learning_rate": 9.594800145488027e-06, "loss": 0.6231, "step": 6893 }, { "epoch": 2.6383467278989667, "grad_norm": 0.5074437260627747, "learning_rate": 9.592323228624193e-06, "loss": 0.6583, "step": 6894 }, { "epoch": 2.638729429774206, "grad_norm": 0.5056631565093994, "learning_rate": 9.589846336813192e-06, "loss": 0.6875, "step": 6895 }, { "epoch": 2.639112131649445, "grad_norm": 0.58863365650177, "learning_rate": 9.587369470207233e-06, "loss": 0.6228, "step": 6896 }, { "epoch": 2.6394948335246844, "grad_norm": 0.5299451947212219, "learning_rate": 9.584892628958532e-06, "loss": 0.6709, "step": 6897 }, { "epoch": 2.6398775353999233, "grad_norm": 0.5410652756690979, "learning_rate": 9.58241581321929e-06, "loss": 0.6191, "step": 6898 }, { "epoch": 2.6402602372751627, "grad_norm": 0.5274096131324768, "learning_rate": 9.579939023141719e-06, "loss": 0.6529, "step": 6899 }, { "epoch": 2.6406429391504016, "grad_norm": 0.6155027151107788, "learning_rate": 9.577462258878028e-06, "loss": 0.6917, "step": 6900 }, { "epoch": 2.641025641025641, "grad_norm": 0.5319733619689941, "learning_rate": 9.574985520580411e-06, "loss": 0.6371, "step": 6901 }, { "epoch": 2.6414083429008803, "grad_norm": 0.5189396142959595, "learning_rate": 9.572508808401078e-06, "loss": 0.6959, "step": 6902 }, { "epoch": 2.6417910447761193, "grad_norm": 0.4823048412799835, "learning_rate": 9.570032122492224e-06, "loss": 0.5597, "step": 6903 }, { "epoch": 2.6421737466513586, "grad_norm": 0.48448535799980164, "learning_rate": 9.567555463006056e-06, "loss": 0.5866, "step": 6904 }, { "epoch": 2.642556448526598, "grad_norm": 0.5110596418380737, "learning_rate": 9.565078830094762e-06, "loss": 0.593, "step": 6905 }, { "epoch": 2.642939150401837, "grad_norm": 0.5502274632453918, "learning_rate": 9.562602223910545e-06, "loss": 0.6668, "step": 6906 }, { "epoch": 2.6433218522770763, "grad_norm": 0.5419213175773621, "learning_rate": 9.560125644605599e-06, "loss": 0.6925, "step": 6907 }, { "epoch": 2.643704554152315, "grad_norm": 0.562123715877533, "learning_rate": 9.55764909233211e-06, "loss": 0.7022, "step": 6908 }, { "epoch": 2.6440872560275546, "grad_norm": 0.519106924533844, "learning_rate": 9.55517256724227e-06, "loss": 0.6337, "step": 6909 }, { "epoch": 2.6444699579027935, "grad_norm": 0.5673871636390686, "learning_rate": 9.552696069488279e-06, "loss": 0.6599, "step": 6910 }, { "epoch": 2.644852659778033, "grad_norm": 0.5050313472747803, "learning_rate": 9.55021959922231e-06, "loss": 0.6286, "step": 6911 }, { "epoch": 2.645235361653272, "grad_norm": 0.5177398324012756, "learning_rate": 9.547743156596555e-06, "loss": 0.6193, "step": 6912 }, { "epoch": 2.645618063528511, "grad_norm": 0.5623415112495422, "learning_rate": 9.545266741763202e-06, "loss": 0.6161, "step": 6913 }, { "epoch": 2.6460007654037505, "grad_norm": 0.5930346250534058, "learning_rate": 9.542790354874425e-06, "loss": 0.7253, "step": 6914 }, { "epoch": 2.64638346727899, "grad_norm": 0.5792069435119629, "learning_rate": 9.540313996082409e-06, "loss": 0.6771, "step": 6915 }, { "epoch": 2.646766169154229, "grad_norm": 0.5589796900749207, "learning_rate": 9.537837665539335e-06, "loss": 0.6578, "step": 6916 }, { "epoch": 2.647148871029468, "grad_norm": 0.5093567967414856, "learning_rate": 9.535361363397382e-06, "loss": 0.6093, "step": 6917 }, { "epoch": 2.647531572904707, "grad_norm": 0.5834516882896423, "learning_rate": 9.532885089808713e-06, "loss": 0.5976, "step": 6918 }, { "epoch": 2.6479142747799465, "grad_norm": 0.5462430715560913, "learning_rate": 9.530408844925514e-06, "loss": 0.6265, "step": 6919 }, { "epoch": 2.6482969766551854, "grad_norm": 0.47647982835769653, "learning_rate": 9.527932628899956e-06, "loss": 0.6372, "step": 6920 }, { "epoch": 2.6486796785304247, "grad_norm": 0.5470840930938721, "learning_rate": 9.525456441884204e-06, "loss": 0.637, "step": 6921 }, { "epoch": 2.649062380405664, "grad_norm": 0.5639404654502869, "learning_rate": 9.522980284030426e-06, "loss": 0.6276, "step": 6922 }, { "epoch": 2.649445082280903, "grad_norm": 0.5389909148216248, "learning_rate": 9.520504155490794e-06, "loss": 0.7676, "step": 6923 }, { "epoch": 2.6498277841561424, "grad_norm": 0.5604237914085388, "learning_rate": 9.518028056417469e-06, "loss": 0.5726, "step": 6924 }, { "epoch": 2.6502104860313818, "grad_norm": 0.5753512978553772, "learning_rate": 9.515551986962614e-06, "loss": 0.6732, "step": 6925 }, { "epoch": 2.6505931879066207, "grad_norm": 0.7327571511268616, "learning_rate": 9.513075947278394e-06, "loss": 0.692, "step": 6926 }, { "epoch": 2.65097588978186, "grad_norm": 0.5373938083648682, "learning_rate": 9.510599937516963e-06, "loss": 0.6406, "step": 6927 }, { "epoch": 2.651358591657099, "grad_norm": 0.5349684357643127, "learning_rate": 9.508123957830479e-06, "loss": 0.6837, "step": 6928 }, { "epoch": 2.6517412935323383, "grad_norm": 0.5262151956558228, "learning_rate": 9.505648008371097e-06, "loss": 0.5659, "step": 6929 }, { "epoch": 2.6521239954075773, "grad_norm": 0.5440450310707092, "learning_rate": 9.50317208929098e-06, "loss": 0.6946, "step": 6930 }, { "epoch": 2.6525066972828166, "grad_norm": 0.5678083896636963, "learning_rate": 9.500696200742268e-06, "loss": 0.6285, "step": 6931 }, { "epoch": 2.652889399158056, "grad_norm": 0.5254601836204529, "learning_rate": 9.498220342877116e-06, "loss": 0.7047, "step": 6932 }, { "epoch": 2.653272101033295, "grad_norm": 0.49764329195022583, "learning_rate": 9.495744515847674e-06, "loss": 0.6355, "step": 6933 }, { "epoch": 2.6536548029085343, "grad_norm": 0.5569125413894653, "learning_rate": 9.493268719806085e-06, "loss": 0.5969, "step": 6934 }, { "epoch": 2.6540375047837736, "grad_norm": 0.5906564593315125, "learning_rate": 9.490792954904494e-06, "loss": 0.5998, "step": 6935 }, { "epoch": 2.6544202066590126, "grad_norm": 0.5280908942222595, "learning_rate": 9.488317221295049e-06, "loss": 0.6459, "step": 6936 }, { "epoch": 2.654802908534252, "grad_norm": 0.5117020010948181, "learning_rate": 9.485841519129878e-06, "loss": 0.6719, "step": 6937 }, { "epoch": 2.655185610409491, "grad_norm": 0.5445016622543335, "learning_rate": 9.483365848561131e-06, "loss": 0.6897, "step": 6938 }, { "epoch": 2.6555683122847302, "grad_norm": 0.5473846793174744, "learning_rate": 9.48089020974094e-06, "loss": 0.6243, "step": 6939 }, { "epoch": 2.655951014159969, "grad_norm": 0.52458655834198, "learning_rate": 9.478414602821438e-06, "loss": 0.7063, "step": 6940 }, { "epoch": 2.6563337160352085, "grad_norm": 0.4843619465827942, "learning_rate": 9.47593902795476e-06, "loss": 0.5258, "step": 6941 }, { "epoch": 2.656716417910448, "grad_norm": 0.5082101821899414, "learning_rate": 9.473463485293038e-06, "loss": 0.6963, "step": 6942 }, { "epoch": 2.657099119785687, "grad_norm": 0.5618235468864441, "learning_rate": 9.470987974988401e-06, "loss": 0.6256, "step": 6943 }, { "epoch": 2.657481821660926, "grad_norm": 0.5263324975967407, "learning_rate": 9.468512497192972e-06, "loss": 0.7068, "step": 6944 }, { "epoch": 2.6578645235361655, "grad_norm": 0.4820530116558075, "learning_rate": 9.466037052058877e-06, "loss": 0.6567, "step": 6945 }, { "epoch": 2.6582472254114045, "grad_norm": 0.5361441373825073, "learning_rate": 9.463561639738246e-06, "loss": 0.6653, "step": 6946 }, { "epoch": 2.658629927286644, "grad_norm": 0.5920738577842712, "learning_rate": 9.461086260383188e-06, "loss": 0.64, "step": 6947 }, { "epoch": 2.6590126291618827, "grad_norm": 0.539864182472229, "learning_rate": 9.458610914145826e-06, "loss": 0.6452, "step": 6948 }, { "epoch": 2.659395331037122, "grad_norm": 0.5295279026031494, "learning_rate": 9.45613560117828e-06, "loss": 0.6513, "step": 6949 }, { "epoch": 2.659778032912361, "grad_norm": 0.5656508207321167, "learning_rate": 9.453660321632662e-06, "loss": 0.6002, "step": 6950 }, { "epoch": 2.6601607347876004, "grad_norm": 0.6128174662590027, "learning_rate": 9.451185075661083e-06, "loss": 0.6536, "step": 6951 }, { "epoch": 2.6605434366628398, "grad_norm": 0.5504202842712402, "learning_rate": 9.44870986341566e-06, "loss": 0.6234, "step": 6952 }, { "epoch": 2.6609261385380787, "grad_norm": 0.6370477080345154, "learning_rate": 9.446234685048492e-06, "loss": 0.6401, "step": 6953 }, { "epoch": 2.661308840413318, "grad_norm": 0.5349025130271912, "learning_rate": 9.443759540711692e-06, "loss": 0.6247, "step": 6954 }, { "epoch": 2.6616915422885574, "grad_norm": 0.5066938400268555, "learning_rate": 9.441284430557362e-06, "loss": 0.5011, "step": 6955 }, { "epoch": 2.6620742441637963, "grad_norm": 0.5471150279045105, "learning_rate": 9.43880935473761e-06, "loss": 0.6701, "step": 6956 }, { "epoch": 2.6624569460390357, "grad_norm": 0.4897775948047638, "learning_rate": 9.436334313404524e-06, "loss": 0.6294, "step": 6957 }, { "epoch": 2.6628396479142746, "grad_norm": 0.5583629608154297, "learning_rate": 9.433859306710211e-06, "loss": 0.6244, "step": 6958 }, { "epoch": 2.663222349789514, "grad_norm": 0.5320243239402771, "learning_rate": 9.431384334806766e-06, "loss": 0.5883, "step": 6959 }, { "epoch": 2.663605051664753, "grad_norm": 0.5062786340713501, "learning_rate": 9.42890939784628e-06, "loss": 0.6736, "step": 6960 }, { "epoch": 2.6639877535399923, "grad_norm": 0.5999323725700378, "learning_rate": 9.426434495980843e-06, "loss": 0.6229, "step": 6961 }, { "epoch": 2.6643704554152317, "grad_norm": 0.5487567782402039, "learning_rate": 9.423959629362555e-06, "loss": 0.6553, "step": 6962 }, { "epoch": 2.6647531572904706, "grad_norm": 0.5651699304580688, "learning_rate": 9.421484798143487e-06, "loss": 0.6672, "step": 6963 }, { "epoch": 2.66513585916571, "grad_norm": 0.5821321606636047, "learning_rate": 9.419010002475735e-06, "loss": 0.6598, "step": 6964 }, { "epoch": 2.6655185610409493, "grad_norm": 0.5443241000175476, "learning_rate": 9.416535242511382e-06, "loss": 0.6015, "step": 6965 }, { "epoch": 2.6659012629161882, "grad_norm": 0.5251019597053528, "learning_rate": 9.414060518402503e-06, "loss": 0.5886, "step": 6966 }, { "epoch": 2.6662839647914276, "grad_norm": 0.530195951461792, "learning_rate": 9.411585830301179e-06, "loss": 0.5841, "step": 6967 }, { "epoch": 2.6666666666666665, "grad_norm": 0.5200997591018677, "learning_rate": 9.409111178359485e-06, "loss": 0.6244, "step": 6968 }, { "epoch": 2.667049368541906, "grad_norm": 0.5250401496887207, "learning_rate": 9.4066365627295e-06, "loss": 0.6065, "step": 6969 }, { "epoch": 2.667432070417145, "grad_norm": 0.5148866772651672, "learning_rate": 9.404161983563292e-06, "loss": 0.6663, "step": 6970 }, { "epoch": 2.667814772292384, "grad_norm": 0.5491673946380615, "learning_rate": 9.401687441012928e-06, "loss": 0.7214, "step": 6971 }, { "epoch": 2.6681974741676235, "grad_norm": 0.5819565653800964, "learning_rate": 9.399212935230484e-06, "loss": 0.5734, "step": 6972 }, { "epoch": 2.6685801760428625, "grad_norm": 0.5304103493690491, "learning_rate": 9.396738466368014e-06, "loss": 0.5535, "step": 6973 }, { "epoch": 2.668962877918102, "grad_norm": 0.5285202860832214, "learning_rate": 9.394264034577583e-06, "loss": 0.654, "step": 6974 }, { "epoch": 2.669345579793341, "grad_norm": 0.5471457242965698, "learning_rate": 9.391789640011263e-06, "loss": 0.6496, "step": 6975 }, { "epoch": 2.66972828166858, "grad_norm": 0.548608124256134, "learning_rate": 9.389315282821097e-06, "loss": 0.657, "step": 6976 }, { "epoch": 2.6701109835438195, "grad_norm": 0.5396882891654968, "learning_rate": 9.386840963159149e-06, "loss": 0.7008, "step": 6977 }, { "epoch": 2.6704936854190584, "grad_norm": 0.5728716850280762, "learning_rate": 9.384366681177472e-06, "loss": 0.6496, "step": 6978 }, { "epoch": 2.6708763872942978, "grad_norm": 0.5659109354019165, "learning_rate": 9.381892437028117e-06, "loss": 0.5874, "step": 6979 }, { "epoch": 2.6712590891695367, "grad_norm": 0.556602418422699, "learning_rate": 9.379418230863129e-06, "loss": 0.6544, "step": 6980 }, { "epoch": 2.671641791044776, "grad_norm": 0.603344202041626, "learning_rate": 9.37694406283456e-06, "loss": 0.6835, "step": 6981 }, { "epoch": 2.6720244929200154, "grad_norm": 0.523117184638977, "learning_rate": 9.374469933094454e-06, "loss": 0.6067, "step": 6982 }, { "epoch": 2.6724071947952543, "grad_norm": 0.5580435395240784, "learning_rate": 9.371995841794849e-06, "loss": 0.6846, "step": 6983 }, { "epoch": 2.6727898966704937, "grad_norm": 0.5995256900787354, "learning_rate": 9.369521789087784e-06, "loss": 0.639, "step": 6984 }, { "epoch": 2.673172598545733, "grad_norm": 0.570692777633667, "learning_rate": 9.367047775125305e-06, "loss": 0.7439, "step": 6985 }, { "epoch": 2.673555300420972, "grad_norm": 0.5221983790397644, "learning_rate": 9.364573800059437e-06, "loss": 0.6162, "step": 6986 }, { "epoch": 2.6739380022962114, "grad_norm": 0.5885785222053528, "learning_rate": 9.362099864042214e-06, "loss": 0.728, "step": 6987 }, { "epoch": 2.6743207041714503, "grad_norm": 0.5412728786468506, "learning_rate": 9.359625967225673e-06, "loss": 0.6579, "step": 6988 }, { "epoch": 2.6747034060466897, "grad_norm": 0.5495259165763855, "learning_rate": 9.357152109761834e-06, "loss": 0.5774, "step": 6989 }, { "epoch": 2.6750861079219286, "grad_norm": 0.5389127135276794, "learning_rate": 9.354678291802723e-06, "loss": 0.6159, "step": 6990 }, { "epoch": 2.675468809797168, "grad_norm": 0.5086460113525391, "learning_rate": 9.352204513500371e-06, "loss": 0.5793, "step": 6991 }, { "epoch": 2.6758515116724073, "grad_norm": 0.5296508073806763, "learning_rate": 9.349730775006786e-06, "loss": 0.589, "step": 6992 }, { "epoch": 2.6762342135476462, "grad_norm": 0.5013658404350281, "learning_rate": 9.347257076473992e-06, "loss": 0.6129, "step": 6993 }, { "epoch": 2.6766169154228856, "grad_norm": 0.544863760471344, "learning_rate": 9.344783418054005e-06, "loss": 0.709, "step": 6994 }, { "epoch": 2.676999617298125, "grad_norm": 0.5155394077301025, "learning_rate": 9.342309799898838e-06, "loss": 0.6128, "step": 6995 }, { "epoch": 2.677382319173364, "grad_norm": 0.5023505091667175, "learning_rate": 9.339836222160501e-06, "loss": 0.6613, "step": 6996 }, { "epoch": 2.6777650210486033, "grad_norm": 0.5229877233505249, "learning_rate": 9.337362684991e-06, "loss": 0.6297, "step": 6997 }, { "epoch": 2.678147722923842, "grad_norm": 0.5745657086372375, "learning_rate": 9.334889188542344e-06, "loss": 0.6641, "step": 6998 }, { "epoch": 2.6785304247990815, "grad_norm": 0.5700744390487671, "learning_rate": 9.332415732966534e-06, "loss": 0.6944, "step": 6999 }, { "epoch": 2.6789131266743205, "grad_norm": 0.532241702079773, "learning_rate": 9.329942318415569e-06, "loss": 0.5908, "step": 7000 }, { "epoch": 2.67929582854956, "grad_norm": 0.5864928364753723, "learning_rate": 9.327468945041453e-06, "loss": 0.6745, "step": 7001 }, { "epoch": 2.679678530424799, "grad_norm": 0.5579946637153625, "learning_rate": 9.324995612996173e-06, "loss": 0.688, "step": 7002 }, { "epoch": 2.680061232300038, "grad_norm": 0.531552791595459, "learning_rate": 9.322522322431725e-06, "loss": 0.6374, "step": 7003 }, { "epoch": 2.6804439341752775, "grad_norm": 0.5392783880233765, "learning_rate": 9.320049073500105e-06, "loss": 0.5829, "step": 7004 }, { "epoch": 2.680826636050517, "grad_norm": 0.568985104560852, "learning_rate": 9.317575866353293e-06, "loss": 0.662, "step": 7005 }, { "epoch": 2.6812093379257558, "grad_norm": 0.5217303037643433, "learning_rate": 9.315102701143276e-06, "loss": 0.596, "step": 7006 }, { "epoch": 2.681592039800995, "grad_norm": 0.5474341511726379, "learning_rate": 9.312629578022038e-06, "loss": 0.6151, "step": 7007 }, { "epoch": 2.681974741676234, "grad_norm": 0.5337045788764954, "learning_rate": 9.310156497141562e-06, "loss": 0.6201, "step": 7008 }, { "epoch": 2.6823574435514734, "grad_norm": 0.47783133387565613, "learning_rate": 9.30768345865382e-06, "loss": 0.5767, "step": 7009 }, { "epoch": 2.6827401454267124, "grad_norm": 0.5431030988693237, "learning_rate": 9.30521046271079e-06, "loss": 0.6241, "step": 7010 }, { "epoch": 2.6831228473019517, "grad_norm": 0.5095300078392029, "learning_rate": 9.302737509464447e-06, "loss": 0.6717, "step": 7011 }, { "epoch": 2.683505549177191, "grad_norm": 0.6166151762008667, "learning_rate": 9.300264599066755e-06, "loss": 0.6997, "step": 7012 }, { "epoch": 2.68388825105243, "grad_norm": 0.538055956363678, "learning_rate": 9.29779173166968e-06, "loss": 0.6444, "step": 7013 }, { "epoch": 2.6842709529276694, "grad_norm": 0.5540618300437927, "learning_rate": 9.295318907425195e-06, "loss": 0.5986, "step": 7014 }, { "epoch": 2.6846536548029087, "grad_norm": 0.5310130715370178, "learning_rate": 9.292846126485252e-06, "loss": 0.5669, "step": 7015 }, { "epoch": 2.6850363566781477, "grad_norm": 0.552532970905304, "learning_rate": 9.290373389001815e-06, "loss": 0.6727, "step": 7016 }, { "epoch": 2.685419058553387, "grad_norm": 0.5302086472511292, "learning_rate": 9.287900695126844e-06, "loss": 0.6011, "step": 7017 }, { "epoch": 2.685801760428626, "grad_norm": 0.572230875492096, "learning_rate": 9.285428045012282e-06, "loss": 0.6379, "step": 7018 }, { "epoch": 2.6861844623038653, "grad_norm": 0.5008811950683594, "learning_rate": 9.282955438810088e-06, "loss": 0.5953, "step": 7019 }, { "epoch": 2.6865671641791042, "grad_norm": 0.5317485332489014, "learning_rate": 9.280482876672208e-06, "loss": 0.6269, "step": 7020 }, { "epoch": 2.6869498660543436, "grad_norm": 0.5575997233390808, "learning_rate": 9.278010358750594e-06, "loss": 0.6401, "step": 7021 }, { "epoch": 2.687332567929583, "grad_norm": 0.5026775598526001, "learning_rate": 9.275537885197178e-06, "loss": 0.614, "step": 7022 }, { "epoch": 2.687715269804822, "grad_norm": 0.5692101120948792, "learning_rate": 9.273065456163905e-06, "loss": 0.738, "step": 7023 }, { "epoch": 2.6880979716800613, "grad_norm": 0.5344721674919128, "learning_rate": 9.270593071802715e-06, "loss": 0.6708, "step": 7024 }, { "epoch": 2.6884806735553006, "grad_norm": 0.48955920338630676, "learning_rate": 9.26812073226554e-06, "loss": 0.634, "step": 7025 }, { "epoch": 2.6888633754305395, "grad_norm": 0.5238972306251526, "learning_rate": 9.265648437704309e-06, "loss": 0.6271, "step": 7026 }, { "epoch": 2.689246077305779, "grad_norm": 0.5053678154945374, "learning_rate": 9.26317618827096e-06, "loss": 0.6313, "step": 7027 }, { "epoch": 2.689628779181018, "grad_norm": 0.5509716272354126, "learning_rate": 9.260703984117409e-06, "loss": 0.6776, "step": 7028 }, { "epoch": 2.690011481056257, "grad_norm": 0.5173219442367554, "learning_rate": 9.258231825395582e-06, "loss": 0.6678, "step": 7029 }, { "epoch": 2.690394182931496, "grad_norm": 0.5372743606567383, "learning_rate": 9.25575971225741e-06, "loss": 0.5924, "step": 7030 }, { "epoch": 2.6907768848067355, "grad_norm": 0.5490433573722839, "learning_rate": 9.253287644854796e-06, "loss": 0.7021, "step": 7031 }, { "epoch": 2.691159586681975, "grad_norm": 0.5746453404426575, "learning_rate": 9.250815623339664e-06, "loss": 0.5993, "step": 7032 }, { "epoch": 2.691542288557214, "grad_norm": 0.6193386912345886, "learning_rate": 9.248343647863926e-06, "loss": 0.6547, "step": 7033 }, { "epoch": 2.691924990432453, "grad_norm": 0.555070161819458, "learning_rate": 9.24587171857949e-06, "loss": 0.7212, "step": 7034 }, { "epoch": 2.6923076923076925, "grad_norm": 0.5095936059951782, "learning_rate": 9.243399835638263e-06, "loss": 0.5629, "step": 7035 }, { "epoch": 2.6926903941829314, "grad_norm": 0.5234457850456238, "learning_rate": 9.240927999192147e-06, "loss": 0.6936, "step": 7036 }, { "epoch": 2.693073096058171, "grad_norm": 0.5817936062812805, "learning_rate": 9.238456209393049e-06, "loss": 0.6722, "step": 7037 }, { "epoch": 2.6934557979334097, "grad_norm": 0.5570861101150513, "learning_rate": 9.23598446639286e-06, "loss": 0.6221, "step": 7038 }, { "epoch": 2.693838499808649, "grad_norm": 0.657427191734314, "learning_rate": 9.233512770343477e-06, "loss": 0.6113, "step": 7039 }, { "epoch": 2.694221201683888, "grad_norm": 0.5398461222648621, "learning_rate": 9.231041121396795e-06, "loss": 0.6359, "step": 7040 }, { "epoch": 2.6946039035591274, "grad_norm": 0.5162186026573181, "learning_rate": 9.2285695197047e-06, "loss": 0.6348, "step": 7041 }, { "epoch": 2.6949866054343667, "grad_norm": 0.5494723320007324, "learning_rate": 9.226097965419081e-06, "loss": 0.6552, "step": 7042 }, { "epoch": 2.6953693073096057, "grad_norm": 0.5505455732345581, "learning_rate": 9.223626458691823e-06, "loss": 0.6795, "step": 7043 }, { "epoch": 2.695752009184845, "grad_norm": 0.5259919762611389, "learning_rate": 9.221154999674803e-06, "loss": 0.6847, "step": 7044 }, { "epoch": 2.6961347110600844, "grad_norm": 0.6146982312202454, "learning_rate": 9.2186835885199e-06, "loss": 0.6399, "step": 7045 }, { "epoch": 2.6965174129353233, "grad_norm": 0.5756797194480896, "learning_rate": 9.21621222537899e-06, "loss": 0.6643, "step": 7046 }, { "epoch": 2.6969001148105627, "grad_norm": 0.5214305520057678, "learning_rate": 9.213740910403948e-06, "loss": 0.5764, "step": 7047 }, { "epoch": 2.6972828166858016, "grad_norm": 0.5193303823471069, "learning_rate": 9.211269643746637e-06, "loss": 0.6249, "step": 7048 }, { "epoch": 2.697665518561041, "grad_norm": 0.5186196565628052, "learning_rate": 9.208798425558923e-06, "loss": 0.6382, "step": 7049 }, { "epoch": 2.69804822043628, "grad_norm": 0.5561969876289368, "learning_rate": 9.206327255992675e-06, "loss": 0.644, "step": 7050 }, { "epoch": 2.6984309223115193, "grad_norm": 0.5401443839073181, "learning_rate": 9.203856135199746e-06, "loss": 0.5748, "step": 7051 }, { "epoch": 2.6988136241867586, "grad_norm": 0.5668985843658447, "learning_rate": 9.201385063331997e-06, "loss": 0.6032, "step": 7052 }, { "epoch": 2.6991963260619976, "grad_norm": 0.5275164842605591, "learning_rate": 9.198914040541283e-06, "loss": 0.6534, "step": 7053 }, { "epoch": 2.699579027937237, "grad_norm": 0.5404662489891052, "learning_rate": 9.196443066979452e-06, "loss": 0.6642, "step": 7054 }, { "epoch": 2.6999617298124763, "grad_norm": 0.53957599401474, "learning_rate": 9.193972142798353e-06, "loss": 0.6246, "step": 7055 }, { "epoch": 2.700344431687715, "grad_norm": 0.49547770619392395, "learning_rate": 9.191501268149834e-06, "loss": 0.6331, "step": 7056 }, { "epoch": 2.7007271335629546, "grad_norm": 0.5300753712654114, "learning_rate": 9.18903044318573e-06, "loss": 0.6318, "step": 7057 }, { "epoch": 2.7011098354381935, "grad_norm": 0.6067737340927124, "learning_rate": 9.186559668057886e-06, "loss": 0.6348, "step": 7058 }, { "epoch": 2.701492537313433, "grad_norm": 0.5741518139839172, "learning_rate": 9.184088942918137e-06, "loss": 0.5678, "step": 7059 }, { "epoch": 2.701875239188672, "grad_norm": 0.5933037400245667, "learning_rate": 9.181618267918311e-06, "loss": 0.6404, "step": 7060 }, { "epoch": 2.702257941063911, "grad_norm": 0.5099709630012512, "learning_rate": 9.179147643210242e-06, "loss": 0.6521, "step": 7061 }, { "epoch": 2.7026406429391505, "grad_norm": 0.5496928095817566, "learning_rate": 9.176677068945756e-06, "loss": 0.6655, "step": 7062 }, { "epoch": 2.7030233448143894, "grad_norm": 0.5779358148574829, "learning_rate": 9.174206545276678e-06, "loss": 0.6134, "step": 7063 }, { "epoch": 2.703406046689629, "grad_norm": 0.5409708619117737, "learning_rate": 9.171736072354824e-06, "loss": 0.6394, "step": 7064 }, { "epoch": 2.703788748564868, "grad_norm": 0.5561589598655701, "learning_rate": 9.169265650332013e-06, "loss": 0.6446, "step": 7065 }, { "epoch": 2.704171450440107, "grad_norm": 0.5783271193504333, "learning_rate": 9.166795279360065e-06, "loss": 0.6752, "step": 7066 }, { "epoch": 2.7045541523153465, "grad_norm": 0.5137059688568115, "learning_rate": 9.16432495959078e-06, "loss": 0.5918, "step": 7067 }, { "epoch": 2.7049368541905854, "grad_norm": 0.4997424781322479, "learning_rate": 9.161854691175971e-06, "loss": 0.6975, "step": 7068 }, { "epoch": 2.7053195560658247, "grad_norm": 0.53165203332901, "learning_rate": 9.159384474267446e-06, "loss": 0.6458, "step": 7069 }, { "epoch": 2.7057022579410637, "grad_norm": 0.5544179081916809, "learning_rate": 9.156914309017001e-06, "loss": 0.6615, "step": 7070 }, { "epoch": 2.706084959816303, "grad_norm": 0.5410664081573486, "learning_rate": 9.154444195576437e-06, "loss": 0.6444, "step": 7071 }, { "epoch": 2.7064676616915424, "grad_norm": 0.545994222164154, "learning_rate": 9.151974134097552e-06, "loss": 0.6162, "step": 7072 }, { "epoch": 2.7068503635667813, "grad_norm": 0.46674826741218567, "learning_rate": 9.14950412473213e-06, "loss": 0.6117, "step": 7073 }, { "epoch": 2.7072330654420207, "grad_norm": 0.5239441990852356, "learning_rate": 9.147034167631964e-06, "loss": 0.6547, "step": 7074 }, { "epoch": 2.70761576731726, "grad_norm": 0.5550409555435181, "learning_rate": 9.144564262948841e-06, "loss": 0.6823, "step": 7075 }, { "epoch": 2.707998469192499, "grad_norm": 0.5595652461051941, "learning_rate": 9.142094410834546e-06, "loss": 0.6507, "step": 7076 }, { "epoch": 2.7083811710677383, "grad_norm": 0.5871501564979553, "learning_rate": 9.139624611440852e-06, "loss": 0.6792, "step": 7077 }, { "epoch": 2.7087638729429773, "grad_norm": 0.49343448877334595, "learning_rate": 9.137154864919536e-06, "loss": 0.6579, "step": 7078 }, { "epoch": 2.7091465748182166, "grad_norm": 0.5623051524162292, "learning_rate": 9.134685171422372e-06, "loss": 0.5937, "step": 7079 }, { "epoch": 2.7095292766934556, "grad_norm": 0.49633029103279114, "learning_rate": 9.132215531101128e-06, "loss": 0.5734, "step": 7080 }, { "epoch": 2.709911978568695, "grad_norm": 0.5201810598373413, "learning_rate": 9.129745944107571e-06, "loss": 0.5733, "step": 7081 }, { "epoch": 2.7102946804439343, "grad_norm": 0.46948421001434326, "learning_rate": 9.127276410593467e-06, "loss": 0.7003, "step": 7082 }, { "epoch": 2.710677382319173, "grad_norm": 0.5287299156188965, "learning_rate": 9.124806930710569e-06, "loss": 0.6383, "step": 7083 }, { "epoch": 2.7110600841944126, "grad_norm": 0.5252298712730408, "learning_rate": 9.122337504610634e-06, "loss": 0.677, "step": 7084 }, { "epoch": 2.711442786069652, "grad_norm": 0.5138881206512451, "learning_rate": 9.119868132445424e-06, "loss": 0.6112, "step": 7085 }, { "epoch": 2.711825487944891, "grad_norm": 0.510085940361023, "learning_rate": 9.117398814366677e-06, "loss": 0.5615, "step": 7086 }, { "epoch": 2.7122081898201302, "grad_norm": 0.554036557674408, "learning_rate": 9.114929550526144e-06, "loss": 0.6843, "step": 7087 }, { "epoch": 2.712590891695369, "grad_norm": 0.8275859355926514, "learning_rate": 9.112460341075567e-06, "loss": 0.5671, "step": 7088 }, { "epoch": 2.7129735935706085, "grad_norm": 0.5142521262168884, "learning_rate": 9.10999118616669e-06, "loss": 0.6481, "step": 7089 }, { "epoch": 2.7133562954458474, "grad_norm": 0.5224593877792358, "learning_rate": 9.107522085951243e-06, "loss": 0.5909, "step": 7090 }, { "epoch": 2.713738997321087, "grad_norm": 0.5013084411621094, "learning_rate": 9.10505304058096e-06, "loss": 0.6424, "step": 7091 }, { "epoch": 2.714121699196326, "grad_norm": 0.5685811042785645, "learning_rate": 9.102584050207578e-06, "loss": 0.5866, "step": 7092 }, { "epoch": 2.714504401071565, "grad_norm": 0.5180128216743469, "learning_rate": 9.100115114982812e-06, "loss": 0.5706, "step": 7093 }, { "epoch": 2.7148871029468045, "grad_norm": 0.5136368274688721, "learning_rate": 9.09764623505839e-06, "loss": 0.6565, "step": 7094 }, { "epoch": 2.715269804822044, "grad_norm": 0.5086124539375305, "learning_rate": 9.095177410586034e-06, "loss": 0.634, "step": 7095 }, { "epoch": 2.7156525066972828, "grad_norm": 0.5528671741485596, "learning_rate": 9.092708641717452e-06, "loss": 0.6701, "step": 7096 }, { "epoch": 2.716035208572522, "grad_norm": 0.5681154727935791, "learning_rate": 9.090239928604365e-06, "loss": 0.5877, "step": 7097 }, { "epoch": 2.716417910447761, "grad_norm": 0.5248828530311584, "learning_rate": 9.087771271398477e-06, "loss": 0.6804, "step": 7098 }, { "epoch": 2.7168006123230004, "grad_norm": 0.5858815312385559, "learning_rate": 9.085302670251496e-06, "loss": 0.5857, "step": 7099 }, { "epoch": 2.7171833141982393, "grad_norm": 0.5366657972335815, "learning_rate": 9.082834125315121e-06, "loss": 0.5559, "step": 7100 }, { "epoch": 2.7175660160734787, "grad_norm": 0.5276786684989929, "learning_rate": 9.080365636741055e-06, "loss": 0.5934, "step": 7101 }, { "epoch": 2.717948717948718, "grad_norm": 0.5490941405296326, "learning_rate": 9.077897204680994e-06, "loss": 0.664, "step": 7102 }, { "epoch": 2.718331419823957, "grad_norm": 0.4960631728172302, "learning_rate": 9.075428829286624e-06, "loss": 0.68, "step": 7103 }, { "epoch": 2.7187141216991964, "grad_norm": 0.5523174405097961, "learning_rate": 9.072960510709634e-06, "loss": 0.6541, "step": 7104 }, { "epoch": 2.7190968235744357, "grad_norm": 0.5154369473457336, "learning_rate": 9.070492249101714e-06, "loss": 0.5516, "step": 7105 }, { "epoch": 2.7194795254496746, "grad_norm": 0.5505847334861755, "learning_rate": 9.068024044614543e-06, "loss": 0.6883, "step": 7106 }, { "epoch": 2.719862227324914, "grad_norm": 0.5246214270591736, "learning_rate": 9.065555897399795e-06, "loss": 0.659, "step": 7107 }, { "epoch": 2.720244929200153, "grad_norm": 0.495856374502182, "learning_rate": 9.063087807609151e-06, "loss": 0.6296, "step": 7108 }, { "epoch": 2.7206276310753923, "grad_norm": 0.5359637141227722, "learning_rate": 9.060619775394276e-06, "loss": 0.6974, "step": 7109 }, { "epoch": 2.721010332950631, "grad_norm": 0.5412381887435913, "learning_rate": 9.05815180090684e-06, "loss": 0.6825, "step": 7110 }, { "epoch": 2.7213930348258706, "grad_norm": 0.5630905628204346, "learning_rate": 9.055683884298508e-06, "loss": 0.563, "step": 7111 }, { "epoch": 2.72177573670111, "grad_norm": 0.5416079163551331, "learning_rate": 9.053216025720935e-06, "loss": 0.6122, "step": 7112 }, { "epoch": 2.722158438576349, "grad_norm": 0.5460605025291443, "learning_rate": 9.050748225325783e-06, "loss": 0.6192, "step": 7113 }, { "epoch": 2.7225411404515882, "grad_norm": 0.49456560611724854, "learning_rate": 9.0482804832647e-06, "loss": 0.588, "step": 7114 }, { "epoch": 2.7229238423268276, "grad_norm": 0.614726185798645, "learning_rate": 9.045812799689342e-06, "loss": 0.6427, "step": 7115 }, { "epoch": 2.7233065442020665, "grad_norm": 0.5419029593467712, "learning_rate": 9.043345174751347e-06, "loss": 0.5512, "step": 7116 }, { "epoch": 2.723689246077306, "grad_norm": 0.5014411211013794, "learning_rate": 9.040877608602363e-06, "loss": 0.6537, "step": 7117 }, { "epoch": 2.724071947952545, "grad_norm": 0.5441469550132751, "learning_rate": 9.038410101394027e-06, "loss": 0.6156, "step": 7118 }, { "epoch": 2.724454649827784, "grad_norm": 0.5577993392944336, "learning_rate": 9.035942653277973e-06, "loss": 0.612, "step": 7119 }, { "epoch": 2.724837351703023, "grad_norm": 0.517930269241333, "learning_rate": 9.033475264405831e-06, "loss": 0.6479, "step": 7120 }, { "epoch": 2.7252200535782625, "grad_norm": 0.537616491317749, "learning_rate": 9.031007934929237e-06, "loss": 0.6989, "step": 7121 }, { "epoch": 2.725602755453502, "grad_norm": 0.5726365447044373, "learning_rate": 9.028540664999803e-06, "loss": 0.6859, "step": 7122 }, { "epoch": 2.7259854573287408, "grad_norm": 0.587611973285675, "learning_rate": 9.026073454769155e-06, "loss": 0.6887, "step": 7123 }, { "epoch": 2.72636815920398, "grad_norm": 0.5362085103988647, "learning_rate": 9.023606304388912e-06, "loss": 0.657, "step": 7124 }, { "epoch": 2.7267508610792195, "grad_norm": 0.5452587604522705, "learning_rate": 9.021139214010683e-06, "loss": 0.5795, "step": 7125 }, { "epoch": 2.7271335629544584, "grad_norm": 0.49760958552360535, "learning_rate": 9.018672183786077e-06, "loss": 0.6083, "step": 7126 }, { "epoch": 2.727516264829698, "grad_norm": 0.5222731232643127, "learning_rate": 9.016205213866704e-06, "loss": 0.6459, "step": 7127 }, { "epoch": 2.7278989667049367, "grad_norm": 0.5276165008544922, "learning_rate": 9.013738304404167e-06, "loss": 0.5894, "step": 7128 }, { "epoch": 2.728281668580176, "grad_norm": 0.5763657689094543, "learning_rate": 9.011271455550055e-06, "loss": 0.5436, "step": 7129 }, { "epoch": 2.728664370455415, "grad_norm": 0.5201448202133179, "learning_rate": 9.008804667455969e-06, "loss": 0.5954, "step": 7130 }, { "epoch": 2.7290470723306544, "grad_norm": 0.5462809801101685, "learning_rate": 9.006337940273504e-06, "loss": 0.6457, "step": 7131 }, { "epoch": 2.7294297742058937, "grad_norm": 0.561219334602356, "learning_rate": 9.003871274154237e-06, "loss": 0.5967, "step": 7132 }, { "epoch": 2.7298124760811326, "grad_norm": 0.5785240530967712, "learning_rate": 9.001404669249758e-06, "loss": 0.6031, "step": 7133 }, { "epoch": 2.730195177956372, "grad_norm": 0.5772151350975037, "learning_rate": 8.998938125711646e-06, "loss": 0.7191, "step": 7134 }, { "epoch": 2.7305778798316114, "grad_norm": 0.5094403624534607, "learning_rate": 8.996471643691474e-06, "loss": 0.5756, "step": 7135 }, { "epoch": 2.7309605817068503, "grad_norm": 0.5287219285964966, "learning_rate": 8.994005223340815e-06, "loss": 0.6644, "step": 7136 }, { "epoch": 2.7313432835820897, "grad_norm": 0.6165122389793396, "learning_rate": 8.991538864811241e-06, "loss": 0.5684, "step": 7137 }, { "epoch": 2.7317259854573286, "grad_norm": 0.568865954875946, "learning_rate": 8.98907256825431e-06, "loss": 0.7289, "step": 7138 }, { "epoch": 2.732108687332568, "grad_norm": 0.546033501625061, "learning_rate": 8.986606333821584e-06, "loss": 0.6667, "step": 7139 }, { "epoch": 2.732491389207807, "grad_norm": 0.5457695126533508, "learning_rate": 8.98414016166462e-06, "loss": 0.604, "step": 7140 }, { "epoch": 2.7328740910830462, "grad_norm": 0.5460342764854431, "learning_rate": 8.98167405193498e-06, "loss": 0.6457, "step": 7141 }, { "epoch": 2.7332567929582856, "grad_norm": 0.5621662735939026, "learning_rate": 8.979208004784199e-06, "loss": 0.6358, "step": 7142 }, { "epoch": 2.7336394948335245, "grad_norm": 0.5909360647201538, "learning_rate": 8.976742020363829e-06, "loss": 0.7104, "step": 7143 }, { "epoch": 2.734022196708764, "grad_norm": 0.5141475200653076, "learning_rate": 8.974276098825413e-06, "loss": 0.6486, "step": 7144 }, { "epoch": 2.7344048985840033, "grad_norm": 0.4831739068031311, "learning_rate": 8.971810240320484e-06, "loss": 0.6637, "step": 7145 }, { "epoch": 2.734787600459242, "grad_norm": 0.49922457337379456, "learning_rate": 8.969344445000578e-06, "loss": 0.588, "step": 7146 }, { "epoch": 2.7351703023344816, "grad_norm": 0.4915129542350769, "learning_rate": 8.96687871301723e-06, "loss": 0.6541, "step": 7147 }, { "epoch": 2.7355530042097205, "grad_norm": 0.5571674704551697, "learning_rate": 8.964413044521957e-06, "loss": 0.6487, "step": 7148 }, { "epoch": 2.73593570608496, "grad_norm": 0.5192618370056152, "learning_rate": 8.961947439666282e-06, "loss": 0.6552, "step": 7149 }, { "epoch": 2.7363184079601988, "grad_norm": 0.5652652978897095, "learning_rate": 8.959481898601729e-06, "loss": 0.6724, "step": 7150 }, { "epoch": 2.736701109835438, "grad_norm": 0.5309550166130066, "learning_rate": 8.957016421479804e-06, "loss": 0.5835, "step": 7151 }, { "epoch": 2.7370838117106775, "grad_norm": 0.5552018284797668, "learning_rate": 8.954551008452025e-06, "loss": 0.6437, "step": 7152 }, { "epoch": 2.7374665135859164, "grad_norm": 0.5948748588562012, "learning_rate": 8.952085659669895e-06, "loss": 0.6969, "step": 7153 }, { "epoch": 2.737849215461156, "grad_norm": 0.4961393475532532, "learning_rate": 8.949620375284918e-06, "loss": 0.5247, "step": 7154 }, { "epoch": 2.738231917336395, "grad_norm": 0.5419962406158447, "learning_rate": 8.947155155448588e-06, "loss": 0.6543, "step": 7155 }, { "epoch": 2.738614619211634, "grad_norm": 0.5301740169525146, "learning_rate": 8.944690000312401e-06, "loss": 0.6307, "step": 7156 }, { "epoch": 2.7389973210868734, "grad_norm": 0.5074910521507263, "learning_rate": 8.942224910027856e-06, "loss": 0.6525, "step": 7157 }, { "epoch": 2.7393800229621124, "grad_norm": 0.5011751055717468, "learning_rate": 8.939759884746427e-06, "loss": 0.6203, "step": 7158 }, { "epoch": 2.7397627248373517, "grad_norm": 0.5460874438285828, "learning_rate": 8.9372949246196e-06, "loss": 0.5926, "step": 7159 }, { "epoch": 2.7401454267125906, "grad_norm": 0.5394935607910156, "learning_rate": 8.934830029798856e-06, "loss": 0.6442, "step": 7160 }, { "epoch": 2.74052812858783, "grad_norm": 0.5498991012573242, "learning_rate": 8.932365200435668e-06, "loss": 0.6264, "step": 7161 }, { "epoch": 2.7409108304630694, "grad_norm": 0.7525969743728638, "learning_rate": 8.929900436681506e-06, "loss": 0.5464, "step": 7162 }, { "epoch": 2.7412935323383083, "grad_norm": 0.5978761911392212, "learning_rate": 8.927435738687839e-06, "loss": 0.7415, "step": 7163 }, { "epoch": 2.7416762342135477, "grad_norm": 0.5118886232376099, "learning_rate": 8.924971106606126e-06, "loss": 0.7049, "step": 7164 }, { "epoch": 2.742058936088787, "grad_norm": 0.4931068420410156, "learning_rate": 8.922506540587826e-06, "loss": 0.6123, "step": 7165 }, { "epoch": 2.742441637964026, "grad_norm": 0.5247097015380859, "learning_rate": 8.920042040784393e-06, "loss": 0.6088, "step": 7166 }, { "epoch": 2.7428243398392653, "grad_norm": 0.5418390035629272, "learning_rate": 8.917577607347283e-06, "loss": 0.5757, "step": 7167 }, { "epoch": 2.7432070417145042, "grad_norm": 0.5038950443267822, "learning_rate": 8.915113240427933e-06, "loss": 0.5728, "step": 7168 }, { "epoch": 2.7435897435897436, "grad_norm": 0.478930801153183, "learning_rate": 8.912648940177789e-06, "loss": 0.6647, "step": 7169 }, { "epoch": 2.7439724454649825, "grad_norm": 0.5245305299758911, "learning_rate": 8.91018470674829e-06, "loss": 0.6264, "step": 7170 }, { "epoch": 2.744355147340222, "grad_norm": 0.5326768755912781, "learning_rate": 8.907720540290868e-06, "loss": 0.6425, "step": 7171 }, { "epoch": 2.7447378492154613, "grad_norm": 0.5411690473556519, "learning_rate": 8.90525644095695e-06, "loss": 0.6514, "step": 7172 }, { "epoch": 2.7451205510907, "grad_norm": 0.5737541913986206, "learning_rate": 8.902792408897973e-06, "loss": 0.6363, "step": 7173 }, { "epoch": 2.7455032529659396, "grad_norm": 0.5387718677520752, "learning_rate": 8.90032844426534e-06, "loss": 0.6876, "step": 7174 }, { "epoch": 2.745885954841179, "grad_norm": 0.532689094543457, "learning_rate": 8.897864547210484e-06, "loss": 0.6219, "step": 7175 }, { "epoch": 2.746268656716418, "grad_norm": 0.5380871891975403, "learning_rate": 8.895400717884814e-06, "loss": 0.5348, "step": 7176 }, { "epoch": 2.746651358591657, "grad_norm": 0.5747153162956238, "learning_rate": 8.892936956439734e-06, "loss": 0.6452, "step": 7177 }, { "epoch": 2.747034060466896, "grad_norm": 0.7705985307693481, "learning_rate": 8.890473263026653e-06, "loss": 0.6184, "step": 7178 }, { "epoch": 2.7474167623421355, "grad_norm": 0.5509061813354492, "learning_rate": 8.888009637796968e-06, "loss": 0.6409, "step": 7179 }, { "epoch": 2.7477994642173744, "grad_norm": 0.5260021686553955, "learning_rate": 8.885546080902083e-06, "loss": 0.6876, "step": 7180 }, { "epoch": 2.748182166092614, "grad_norm": 0.5387274622917175, "learning_rate": 8.883082592493382e-06, "loss": 0.6175, "step": 7181 }, { "epoch": 2.748564867967853, "grad_norm": 0.5094408392906189, "learning_rate": 8.880619172722257e-06, "loss": 0.62, "step": 7182 }, { "epoch": 2.748947569843092, "grad_norm": 0.5468149185180664, "learning_rate": 8.878155821740094e-06, "loss": 0.5824, "step": 7183 }, { "epoch": 2.7493302717183314, "grad_norm": 0.5276967287063599, "learning_rate": 8.875692539698264e-06, "loss": 0.5458, "step": 7184 }, { "epoch": 2.749712973593571, "grad_norm": 0.52220618724823, "learning_rate": 8.873229326748151e-06, "loss": 0.6149, "step": 7185 }, { "epoch": 2.7500956754688097, "grad_norm": 0.6057054400444031, "learning_rate": 8.870766183041127e-06, "loss": 0.6033, "step": 7186 }, { "epoch": 2.750478377344049, "grad_norm": 0.5092149972915649, "learning_rate": 8.868303108728548e-06, "loss": 0.6606, "step": 7187 }, { "epoch": 2.750861079219288, "grad_norm": 0.537255048751831, "learning_rate": 8.865840103961787e-06, "loss": 0.5859, "step": 7188 }, { "epoch": 2.7512437810945274, "grad_norm": 0.5175155997276306, "learning_rate": 8.863377168892197e-06, "loss": 0.5518, "step": 7189 }, { "epoch": 2.7516264829697663, "grad_norm": 0.5168123245239258, "learning_rate": 8.860914303671134e-06, "loss": 0.6899, "step": 7190 }, { "epoch": 2.7520091848450057, "grad_norm": 0.5650374293327332, "learning_rate": 8.858451508449945e-06, "loss": 0.6499, "step": 7191 }, { "epoch": 2.752391886720245, "grad_norm": 0.580644965171814, "learning_rate": 8.85598878337998e-06, "loss": 0.664, "step": 7192 }, { "epoch": 2.752774588595484, "grad_norm": 0.5429313778877258, "learning_rate": 8.85352612861258e-06, "loss": 0.6504, "step": 7193 }, { "epoch": 2.7531572904707233, "grad_norm": 0.5390105843544006, "learning_rate": 8.851063544299075e-06, "loss": 0.5871, "step": 7194 }, { "epoch": 2.7535399923459627, "grad_norm": 0.5511770248413086, "learning_rate": 8.848601030590801e-06, "loss": 0.6253, "step": 7195 }, { "epoch": 2.7539226942212016, "grad_norm": 0.6095899343490601, "learning_rate": 8.846138587639092e-06, "loss": 0.6816, "step": 7196 }, { "epoch": 2.754305396096441, "grad_norm": 0.6180123090744019, "learning_rate": 8.843676215595265e-06, "loss": 0.7236, "step": 7197 }, { "epoch": 2.75468809797168, "grad_norm": 0.594403862953186, "learning_rate": 8.841213914610638e-06, "loss": 0.7008, "step": 7198 }, { "epoch": 2.7550707998469193, "grad_norm": 0.5860974192619324, "learning_rate": 8.838751684836533e-06, "loss": 0.6706, "step": 7199 }, { "epoch": 2.755453501722158, "grad_norm": 0.5059118270874023, "learning_rate": 8.836289526424253e-06, "loss": 0.5512, "step": 7200 }, { "epoch": 2.7558362035973976, "grad_norm": 0.5802126526832581, "learning_rate": 8.833827439525109e-06, "loss": 0.6701, "step": 7201 }, { "epoch": 2.756218905472637, "grad_norm": 0.5545303821563721, "learning_rate": 8.831365424290405e-06, "loss": 0.6572, "step": 7202 }, { "epoch": 2.756601607347876, "grad_norm": 0.5987486243247986, "learning_rate": 8.828903480871432e-06, "loss": 0.6631, "step": 7203 }, { "epoch": 2.756984309223115, "grad_norm": 0.48983362317085266, "learning_rate": 8.826441609419487e-06, "loss": 0.6747, "step": 7204 }, { "epoch": 2.7573670110983546, "grad_norm": 0.5443428754806519, "learning_rate": 8.823979810085856e-06, "loss": 0.6154, "step": 7205 }, { "epoch": 2.7577497129735935, "grad_norm": 0.5964906215667725, "learning_rate": 8.821518083021827e-06, "loss": 0.7153, "step": 7206 }, { "epoch": 2.758132414848833, "grad_norm": 0.5483643412590027, "learning_rate": 8.819056428378678e-06, "loss": 0.6154, "step": 7207 }, { "epoch": 2.758515116724072, "grad_norm": 0.5435228943824768, "learning_rate": 8.816594846307682e-06, "loss": 0.6502, "step": 7208 }, { "epoch": 2.758897818599311, "grad_norm": 0.5556168556213379, "learning_rate": 8.814133336960114e-06, "loss": 0.6153, "step": 7209 }, { "epoch": 2.75928052047455, "grad_norm": 0.5605584383010864, "learning_rate": 8.811671900487238e-06, "loss": 0.6449, "step": 7210 }, { "epoch": 2.7596632223497894, "grad_norm": 0.5388641953468323, "learning_rate": 8.809210537040314e-06, "loss": 0.6193, "step": 7211 }, { "epoch": 2.760045924225029, "grad_norm": 0.5150550603866577, "learning_rate": 8.806749246770607e-06, "loss": 0.609, "step": 7212 }, { "epoch": 2.7604286261002677, "grad_norm": 0.5312610864639282, "learning_rate": 8.80428802982936e-06, "loss": 0.6442, "step": 7213 }, { "epoch": 2.760811327975507, "grad_norm": 0.6027878522872925, "learning_rate": 8.801826886367826e-06, "loss": 0.618, "step": 7214 }, { "epoch": 2.7611940298507465, "grad_norm": 0.5216258764266968, "learning_rate": 8.799365816537249e-06, "loss": 0.6197, "step": 7215 }, { "epoch": 2.7615767317259854, "grad_norm": 0.5739855170249939, "learning_rate": 8.796904820488869e-06, "loss": 0.5966, "step": 7216 }, { "epoch": 2.7619594336012248, "grad_norm": 0.5135936737060547, "learning_rate": 8.794443898373917e-06, "loss": 0.59, "step": 7217 }, { "epoch": 2.7623421354764637, "grad_norm": 0.5239837765693665, "learning_rate": 8.791983050343627e-06, "loss": 0.6122, "step": 7218 }, { "epoch": 2.762724837351703, "grad_norm": 0.5037060976028442, "learning_rate": 8.789522276549227e-06, "loss": 0.5897, "step": 7219 }, { "epoch": 2.763107539226942, "grad_norm": 0.5936477184295654, "learning_rate": 8.787061577141934e-06, "loss": 0.6853, "step": 7220 }, { "epoch": 2.7634902411021813, "grad_norm": 0.5458961725234985, "learning_rate": 8.784600952272963e-06, "loss": 0.6608, "step": 7221 }, { "epoch": 2.7638729429774207, "grad_norm": 0.5396609902381897, "learning_rate": 8.782140402093534e-06, "loss": 0.633, "step": 7222 }, { "epoch": 2.7642556448526596, "grad_norm": 0.4969136714935303, "learning_rate": 8.779679926754845e-06, "loss": 0.5719, "step": 7223 }, { "epoch": 2.764638346727899, "grad_norm": 0.5493727922439575, "learning_rate": 8.777219526408103e-06, "loss": 0.6347, "step": 7224 }, { "epoch": 2.7650210486031384, "grad_norm": 0.5286442041397095, "learning_rate": 8.774759201204508e-06, "loss": 0.702, "step": 7225 }, { "epoch": 2.7654037504783773, "grad_norm": 0.5861799716949463, "learning_rate": 8.77229895129525e-06, "loss": 0.6332, "step": 7226 }, { "epoch": 2.7657864523536166, "grad_norm": 0.541092574596405, "learning_rate": 8.769838776831518e-06, "loss": 0.6175, "step": 7227 }, { "epoch": 2.7661691542288556, "grad_norm": 0.5512580871582031, "learning_rate": 8.767378677964503e-06, "loss": 0.6417, "step": 7228 }, { "epoch": 2.766551856104095, "grad_norm": 0.5254911184310913, "learning_rate": 8.764918654845373e-06, "loss": 0.6766, "step": 7229 }, { "epoch": 2.766934557979334, "grad_norm": 0.5129702687263489, "learning_rate": 8.762458707625313e-06, "loss": 0.6201, "step": 7230 }, { "epoch": 2.767317259854573, "grad_norm": 0.5533748269081116, "learning_rate": 8.75999883645549e-06, "loss": 0.6813, "step": 7231 }, { "epoch": 2.7676999617298126, "grad_norm": 0.5447983145713806, "learning_rate": 8.757539041487073e-06, "loss": 0.6167, "step": 7232 }, { "epoch": 2.7680826636050515, "grad_norm": 0.5299031734466553, "learning_rate": 8.755079322871215e-06, "loss": 0.6174, "step": 7233 }, { "epoch": 2.768465365480291, "grad_norm": 0.5279706716537476, "learning_rate": 8.75261968075908e-06, "loss": 0.6393, "step": 7234 }, { "epoch": 2.7688480673555302, "grad_norm": 0.5971385836601257, "learning_rate": 8.750160115301816e-06, "loss": 0.6021, "step": 7235 }, { "epoch": 2.769230769230769, "grad_norm": 0.49543246626853943, "learning_rate": 8.747700626650571e-06, "loss": 0.5954, "step": 7236 }, { "epoch": 2.7696134711060085, "grad_norm": 0.5271174311637878, "learning_rate": 8.745241214956484e-06, "loss": 0.7054, "step": 7237 }, { "epoch": 2.7699961729812475, "grad_norm": 0.6613025665283203, "learning_rate": 8.7427818803707e-06, "loss": 0.6067, "step": 7238 }, { "epoch": 2.770378874856487, "grad_norm": 0.5363490581512451, "learning_rate": 8.740322623044344e-06, "loss": 0.6016, "step": 7239 }, { "epoch": 2.7707615767317257, "grad_norm": 0.5019893050193787, "learning_rate": 8.737863443128544e-06, "loss": 0.6475, "step": 7240 }, { "epoch": 2.771144278606965, "grad_norm": 0.5573104023933411, "learning_rate": 8.735404340774433e-06, "loss": 0.5437, "step": 7241 }, { "epoch": 2.7715269804822045, "grad_norm": 0.570365309715271, "learning_rate": 8.732945316133118e-06, "loss": 0.6103, "step": 7242 }, { "epoch": 2.7719096823574434, "grad_norm": 0.517598569393158, "learning_rate": 8.730486369355717e-06, "loss": 0.6446, "step": 7243 }, { "epoch": 2.7722923842326828, "grad_norm": 0.4855019450187683, "learning_rate": 8.728027500593338e-06, "loss": 0.6126, "step": 7244 }, { "epoch": 2.772675086107922, "grad_norm": 0.6775697469711304, "learning_rate": 8.725568709997091e-06, "loss": 0.6524, "step": 7245 }, { "epoch": 2.773057787983161, "grad_norm": 0.5179588794708252, "learning_rate": 8.723109997718066e-06, "loss": 0.567, "step": 7246 }, { "epoch": 2.7734404898584004, "grad_norm": 0.5143736004829407, "learning_rate": 8.720651363907364e-06, "loss": 0.6424, "step": 7247 }, { "epoch": 2.7738231917336393, "grad_norm": 0.5900987982749939, "learning_rate": 8.718192808716076e-06, "loss": 0.596, "step": 7248 }, { "epoch": 2.7742058936088787, "grad_norm": 0.5060738921165466, "learning_rate": 8.71573433229528e-06, "loss": 0.6176, "step": 7249 }, { "epoch": 2.7745885954841176, "grad_norm": 0.4994462728500366, "learning_rate": 8.713275934796055e-06, "loss": 0.6268, "step": 7250 }, { "epoch": 2.774971297359357, "grad_norm": 0.5253398418426514, "learning_rate": 8.710817616369489e-06, "loss": 0.6277, "step": 7251 }, { "epoch": 2.7753539992345964, "grad_norm": 0.502006471157074, "learning_rate": 8.70835937716664e-06, "loss": 0.6961, "step": 7252 }, { "epoch": 2.7757367011098353, "grad_norm": 0.555718719959259, "learning_rate": 8.705901217338576e-06, "loss": 0.6485, "step": 7253 }, { "epoch": 2.7761194029850746, "grad_norm": 0.5300837159156799, "learning_rate": 8.703443137036361e-06, "loss": 0.6006, "step": 7254 }, { "epoch": 2.776502104860314, "grad_norm": 0.5506948232650757, "learning_rate": 8.700985136411047e-06, "loss": 0.6704, "step": 7255 }, { "epoch": 2.776884806735553, "grad_norm": 0.5290852189064026, "learning_rate": 8.698527215613685e-06, "loss": 0.6433, "step": 7256 }, { "epoch": 2.7772675086107923, "grad_norm": 0.579867959022522, "learning_rate": 8.696069374795323e-06, "loss": 0.7202, "step": 7257 }, { "epoch": 2.7776502104860312, "grad_norm": 0.5516603589057922, "learning_rate": 8.693611614107004e-06, "loss": 0.6933, "step": 7258 }, { "epoch": 2.7780329123612706, "grad_norm": 0.5486859083175659, "learning_rate": 8.691153933699758e-06, "loss": 0.679, "step": 7259 }, { "epoch": 2.7784156142365095, "grad_norm": 0.5412517786026001, "learning_rate": 8.68869633372462e-06, "loss": 0.568, "step": 7260 }, { "epoch": 2.778798316111749, "grad_norm": 0.5324066281318665, "learning_rate": 8.686238814332614e-06, "loss": 0.559, "step": 7261 }, { "epoch": 2.7791810179869882, "grad_norm": 0.5283344984054565, "learning_rate": 8.683781375674763e-06, "loss": 0.6158, "step": 7262 }, { "epoch": 2.779563719862227, "grad_norm": 0.5396201014518738, "learning_rate": 8.68132401790208e-06, "loss": 0.7196, "step": 7263 }, { "epoch": 2.7799464217374665, "grad_norm": 0.5138574838638306, "learning_rate": 8.678866741165583e-06, "loss": 0.6038, "step": 7264 }, { "epoch": 2.780329123612706, "grad_norm": 0.5710706114768982, "learning_rate": 8.676409545616271e-06, "loss": 0.5626, "step": 7265 }, { "epoch": 2.780711825487945, "grad_norm": 0.5619781017303467, "learning_rate": 8.673952431405148e-06, "loss": 0.7108, "step": 7266 }, { "epoch": 2.781094527363184, "grad_norm": 0.5264468193054199, "learning_rate": 8.671495398683215e-06, "loss": 0.6584, "step": 7267 }, { "epoch": 2.781477229238423, "grad_norm": 0.5290716886520386, "learning_rate": 8.669038447601454e-06, "loss": 0.5631, "step": 7268 }, { "epoch": 2.7818599311136625, "grad_norm": 0.5448685884475708, "learning_rate": 8.666581578310858e-06, "loss": 0.7505, "step": 7269 }, { "epoch": 2.7822426329889014, "grad_norm": 0.5015613436698914, "learning_rate": 8.664124790962407e-06, "loss": 0.5813, "step": 7270 }, { "epoch": 2.7826253348641408, "grad_norm": 0.5571271181106567, "learning_rate": 8.661668085707074e-06, "loss": 0.6399, "step": 7271 }, { "epoch": 2.78300803673938, "grad_norm": 0.5909960865974426, "learning_rate": 8.659211462695832e-06, "loss": 0.7889, "step": 7272 }, { "epoch": 2.783390738614619, "grad_norm": 0.5389389991760254, "learning_rate": 8.656754922079649e-06, "loss": 0.6351, "step": 7273 }, { "epoch": 2.7837734404898584, "grad_norm": 0.5234535336494446, "learning_rate": 8.654298464009486e-06, "loss": 0.6224, "step": 7274 }, { "epoch": 2.784156142365098, "grad_norm": 0.5252106785774231, "learning_rate": 8.651842088636295e-06, "loss": 0.6053, "step": 7275 }, { "epoch": 2.7845388442403367, "grad_norm": 0.5704923868179321, "learning_rate": 8.64938579611103e-06, "loss": 0.643, "step": 7276 }, { "epoch": 2.784921546115576, "grad_norm": 0.6016185283660889, "learning_rate": 8.646929586584642e-06, "loss": 0.6991, "step": 7277 }, { "epoch": 2.785304247990815, "grad_norm": 0.5433434844017029, "learning_rate": 8.64447346020806e-06, "loss": 0.7088, "step": 7278 }, { "epoch": 2.7856869498660544, "grad_norm": 0.5211987495422363, "learning_rate": 8.642017417132225e-06, "loss": 0.6219, "step": 7279 }, { "epoch": 2.7860696517412933, "grad_norm": 0.557981014251709, "learning_rate": 8.639561457508071e-06, "loss": 0.5476, "step": 7280 }, { "epoch": 2.7864523536165327, "grad_norm": 0.510429859161377, "learning_rate": 8.637105581486518e-06, "loss": 0.6474, "step": 7281 }, { "epoch": 2.786835055491772, "grad_norm": 0.5182579755783081, "learning_rate": 8.634649789218487e-06, "loss": 0.6407, "step": 7282 }, { "epoch": 2.787217757367011, "grad_norm": 0.6667146682739258, "learning_rate": 8.632194080854901e-06, "loss": 0.6536, "step": 7283 }, { "epoch": 2.7876004592422503, "grad_norm": 0.5565563440322876, "learning_rate": 8.629738456546656e-06, "loss": 0.5946, "step": 7284 }, { "epoch": 2.7879831611174897, "grad_norm": 0.566815197467804, "learning_rate": 8.627282916444665e-06, "loss": 0.6468, "step": 7285 }, { "epoch": 2.7883658629927286, "grad_norm": 0.5178815126419067, "learning_rate": 8.624827460699829e-06, "loss": 0.6217, "step": 7286 }, { "epoch": 2.788748564867968, "grad_norm": 0.5215690732002258, "learning_rate": 8.622372089463043e-06, "loss": 0.6884, "step": 7287 }, { "epoch": 2.789131266743207, "grad_norm": 0.5557385683059692, "learning_rate": 8.619916802885188e-06, "loss": 0.6194, "step": 7288 }, { "epoch": 2.7895139686184462, "grad_norm": 0.5658363103866577, "learning_rate": 8.617461601117155e-06, "loss": 0.5895, "step": 7289 }, { "epoch": 2.789896670493685, "grad_norm": 0.5699406862258911, "learning_rate": 8.61500648430982e-06, "loss": 0.6878, "step": 7290 }, { "epoch": 2.7902793723689245, "grad_norm": 0.4977867007255554, "learning_rate": 8.612551452614058e-06, "loss": 0.5909, "step": 7291 }, { "epoch": 2.790662074244164, "grad_norm": 0.5311651825904846, "learning_rate": 8.610096506180735e-06, "loss": 0.6734, "step": 7292 }, { "epoch": 2.791044776119403, "grad_norm": 0.580373227596283, "learning_rate": 8.607641645160721e-06, "loss": 0.6305, "step": 7293 }, { "epoch": 2.791427477994642, "grad_norm": 0.49428287148475647, "learning_rate": 8.605186869704862e-06, "loss": 0.6029, "step": 7294 }, { "epoch": 2.7918101798698816, "grad_norm": 0.4967627227306366, "learning_rate": 8.602732179964017e-06, "loss": 0.6437, "step": 7295 }, { "epoch": 2.7921928817451205, "grad_norm": 0.5404354929924011, "learning_rate": 8.600277576089039e-06, "loss": 0.6729, "step": 7296 }, { "epoch": 2.79257558362036, "grad_norm": 0.538095235824585, "learning_rate": 8.59782305823076e-06, "loss": 0.6546, "step": 7297 }, { "epoch": 2.7929582854955988, "grad_norm": 0.5546205639839172, "learning_rate": 8.595368626540022e-06, "loss": 0.5993, "step": 7298 }, { "epoch": 2.793340987370838, "grad_norm": 0.562349796295166, "learning_rate": 8.592914281167654e-06, "loss": 0.6894, "step": 7299 }, { "epoch": 2.793723689246077, "grad_norm": 0.5966123342514038, "learning_rate": 8.590460022264487e-06, "loss": 0.6251, "step": 7300 }, { "epoch": 2.7941063911213164, "grad_norm": 0.5344566106796265, "learning_rate": 8.588005849981335e-06, "loss": 0.6271, "step": 7301 }, { "epoch": 2.794489092996556, "grad_norm": 0.5576499700546265, "learning_rate": 8.585551764469016e-06, "loss": 0.7148, "step": 7302 }, { "epoch": 2.7948717948717947, "grad_norm": 0.49372151494026184, "learning_rate": 8.583097765878347e-06, "loss": 0.5895, "step": 7303 }, { "epoch": 2.795254496747034, "grad_norm": 0.5296488404273987, "learning_rate": 8.580643854360123e-06, "loss": 0.6779, "step": 7304 }, { "epoch": 2.7956371986222734, "grad_norm": 0.5073195695877075, "learning_rate": 8.578190030065146e-06, "loss": 0.6439, "step": 7305 }, { "epoch": 2.7960199004975124, "grad_norm": 0.5283328294754028, "learning_rate": 8.575736293144213e-06, "loss": 0.6328, "step": 7306 }, { "epoch": 2.7964026023727517, "grad_norm": 0.5498377084732056, "learning_rate": 8.57328264374811e-06, "loss": 0.6313, "step": 7307 }, { "epoch": 2.7967853042479907, "grad_norm": 0.5448032021522522, "learning_rate": 8.570829082027621e-06, "loss": 0.5988, "step": 7308 }, { "epoch": 2.79716800612323, "grad_norm": 0.4973548948764801, "learning_rate": 8.568375608133528e-06, "loss": 0.6252, "step": 7309 }, { "epoch": 2.797550707998469, "grad_norm": 0.5615866780281067, "learning_rate": 8.565922222216596e-06, "loss": 0.7421, "step": 7310 }, { "epoch": 2.7979334098737083, "grad_norm": 0.5562452673912048, "learning_rate": 8.563468924427596e-06, "loss": 0.7072, "step": 7311 }, { "epoch": 2.7983161117489477, "grad_norm": 0.5410261750221252, "learning_rate": 8.561015714917289e-06, "loss": 0.6288, "step": 7312 }, { "epoch": 2.7986988136241866, "grad_norm": 0.5481303334236145, "learning_rate": 8.558562593836438e-06, "loss": 0.612, "step": 7313 }, { "epoch": 2.799081515499426, "grad_norm": 0.5545285940170288, "learning_rate": 8.556109561335784e-06, "loss": 0.6003, "step": 7314 }, { "epoch": 2.7994642173746653, "grad_norm": 0.552125096321106, "learning_rate": 8.553656617566076e-06, "loss": 0.7188, "step": 7315 }, { "epoch": 2.7998469192499043, "grad_norm": 0.5525945425033569, "learning_rate": 8.551203762678056e-06, "loss": 0.6025, "step": 7316 }, { "epoch": 2.8002296211251436, "grad_norm": 0.5092467069625854, "learning_rate": 8.548750996822455e-06, "loss": 0.6225, "step": 7317 }, { "epoch": 2.8006123230003825, "grad_norm": 0.555240273475647, "learning_rate": 8.546298320150003e-06, "loss": 0.6012, "step": 7318 }, { "epoch": 2.800995024875622, "grad_norm": 0.5309153199195862, "learning_rate": 8.54384573281143e-06, "loss": 0.6147, "step": 7319 }, { "epoch": 2.801377726750861, "grad_norm": 0.5064284801483154, "learning_rate": 8.541393234957443e-06, "loss": 0.6802, "step": 7320 }, { "epoch": 2.8017604286261, "grad_norm": 0.541993260383606, "learning_rate": 8.538940826738761e-06, "loss": 0.6468, "step": 7321 }, { "epoch": 2.8021431305013396, "grad_norm": 0.5158272385597229, "learning_rate": 8.536488508306097e-06, "loss": 0.6891, "step": 7322 }, { "epoch": 2.8025258323765785, "grad_norm": 0.4933754503726959, "learning_rate": 8.53403627981014e-06, "loss": 0.5825, "step": 7323 }, { "epoch": 2.802908534251818, "grad_norm": 0.5368970036506653, "learning_rate": 8.531584141401592e-06, "loss": 0.6273, "step": 7324 }, { "epoch": 2.803291236127057, "grad_norm": 0.5158793330192566, "learning_rate": 8.529132093231143e-06, "loss": 0.6568, "step": 7325 }, { "epoch": 2.803673938002296, "grad_norm": 0.534427285194397, "learning_rate": 8.52668013544948e-06, "loss": 0.6204, "step": 7326 }, { "epoch": 2.8040566398775355, "grad_norm": 0.5343006253242493, "learning_rate": 8.52422826820728e-06, "loss": 0.6687, "step": 7327 }, { "epoch": 2.8044393417527744, "grad_norm": 0.5000122785568237, "learning_rate": 8.521776491655215e-06, "loss": 0.5965, "step": 7328 }, { "epoch": 2.804822043628014, "grad_norm": 0.5910434722900391, "learning_rate": 8.519324805943962e-06, "loss": 0.6518, "step": 7329 }, { "epoch": 2.8052047455032527, "grad_norm": 0.5867051482200623, "learning_rate": 8.516873211224173e-06, "loss": 0.7142, "step": 7330 }, { "epoch": 2.805587447378492, "grad_norm": 0.5478551387786865, "learning_rate": 8.51442170764651e-06, "loss": 0.6168, "step": 7331 }, { "epoch": 2.8059701492537314, "grad_norm": 0.5452851057052612, "learning_rate": 8.511970295361627e-06, "loss": 0.6482, "step": 7332 }, { "epoch": 2.8063528511289704, "grad_norm": 0.5224609375, "learning_rate": 8.509518974520164e-06, "loss": 0.5905, "step": 7333 }, { "epoch": 2.8067355530042097, "grad_norm": 0.5402268171310425, "learning_rate": 8.507067745272764e-06, "loss": 0.6995, "step": 7334 }, { "epoch": 2.807118254879449, "grad_norm": 0.5980462431907654, "learning_rate": 8.504616607770064e-06, "loss": 0.6389, "step": 7335 }, { "epoch": 2.807500956754688, "grad_norm": 0.5192909836769104, "learning_rate": 8.502165562162688e-06, "loss": 0.6603, "step": 7336 }, { "epoch": 2.8078836586299274, "grad_norm": 0.554495632648468, "learning_rate": 8.499714608601263e-06, "loss": 0.6633, "step": 7337 }, { "epoch": 2.8082663605051663, "grad_norm": 0.5284677743911743, "learning_rate": 8.497263747236404e-06, "loss": 0.6834, "step": 7338 }, { "epoch": 2.8086490623804057, "grad_norm": 0.5916843414306641, "learning_rate": 8.494812978218732e-06, "loss": 0.6327, "step": 7339 }, { "epoch": 2.8090317642556446, "grad_norm": 0.5702458620071411, "learning_rate": 8.492362301698837e-06, "loss": 0.6261, "step": 7340 }, { "epoch": 2.809414466130884, "grad_norm": 0.515294075012207, "learning_rate": 8.489911717827333e-06, "loss": 0.6942, "step": 7341 }, { "epoch": 2.8097971680061233, "grad_norm": 0.5202558040618896, "learning_rate": 8.487461226754816e-06, "loss": 0.6699, "step": 7342 }, { "epoch": 2.8101798698813623, "grad_norm": 0.5823715329170227, "learning_rate": 8.485010828631863e-06, "loss": 0.7015, "step": 7343 }, { "epoch": 2.8105625717566016, "grad_norm": 0.5616106390953064, "learning_rate": 8.482560523609064e-06, "loss": 0.7032, "step": 7344 }, { "epoch": 2.810945273631841, "grad_norm": 0.5499544739723206, "learning_rate": 8.480110311837002e-06, "loss": 0.6116, "step": 7345 }, { "epoch": 2.81132797550708, "grad_norm": 0.5762054920196533, "learning_rate": 8.477660193466241e-06, "loss": 0.641, "step": 7346 }, { "epoch": 2.8117106773823193, "grad_norm": 0.5203481316566467, "learning_rate": 8.475210168647352e-06, "loss": 0.5888, "step": 7347 }, { "epoch": 2.812093379257558, "grad_norm": 0.5108327865600586, "learning_rate": 8.4727602375309e-06, "loss": 0.6568, "step": 7348 }, { "epoch": 2.8124760811327976, "grad_norm": 0.5210540890693665, "learning_rate": 8.470310400267427e-06, "loss": 0.7397, "step": 7349 }, { "epoch": 2.8128587830080365, "grad_norm": 0.5492504239082336, "learning_rate": 8.467860657007488e-06, "loss": 0.6212, "step": 7350 }, { "epoch": 2.813241484883276, "grad_norm": 0.5129650831222534, "learning_rate": 8.46541100790163e-06, "loss": 0.6652, "step": 7351 }, { "epoch": 2.8136241867585152, "grad_norm": 0.5795044898986816, "learning_rate": 8.462961453100394e-06, "loss": 0.6612, "step": 7352 }, { "epoch": 2.814006888633754, "grad_norm": 0.5663700699806213, "learning_rate": 8.4605119927543e-06, "loss": 0.628, "step": 7353 }, { "epoch": 2.8143895905089935, "grad_norm": 0.5313138365745544, "learning_rate": 8.45806262701388e-06, "loss": 0.5825, "step": 7354 }, { "epoch": 2.814772292384233, "grad_norm": 0.5104140639305115, "learning_rate": 8.455613356029657e-06, "loss": 0.5847, "step": 7355 }, { "epoch": 2.815154994259472, "grad_norm": 0.5409271121025085, "learning_rate": 8.453164179952138e-06, "loss": 0.5837, "step": 7356 }, { "epoch": 2.815537696134711, "grad_norm": 0.5272746086120605, "learning_rate": 8.450715098931838e-06, "loss": 0.6312, "step": 7357 }, { "epoch": 2.81592039800995, "grad_norm": 0.5401822328567505, "learning_rate": 8.448266113119262e-06, "loss": 0.6449, "step": 7358 }, { "epoch": 2.8163030998851895, "grad_norm": 0.5450471639633179, "learning_rate": 8.445817222664897e-06, "loss": 0.5321, "step": 7359 }, { "epoch": 2.8166858017604284, "grad_norm": 0.5581899881362915, "learning_rate": 8.443368427719238e-06, "loss": 0.6513, "step": 7360 }, { "epoch": 2.8170685036356677, "grad_norm": 0.5484660267829895, "learning_rate": 8.440919728432772e-06, "loss": 0.733, "step": 7361 }, { "epoch": 2.817451205510907, "grad_norm": 0.5183884501457214, "learning_rate": 8.438471124955979e-06, "loss": 0.6067, "step": 7362 }, { "epoch": 2.817833907386146, "grad_norm": 0.5221250653266907, "learning_rate": 8.436022617439326e-06, "loss": 0.5523, "step": 7363 }, { "epoch": 2.8182166092613854, "grad_norm": 0.5350903868675232, "learning_rate": 8.433574206033287e-06, "loss": 0.6589, "step": 7364 }, { "epoch": 2.8185993111366248, "grad_norm": 0.5092862844467163, "learning_rate": 8.431125890888323e-06, "loss": 0.6742, "step": 7365 }, { "epoch": 2.8189820130118637, "grad_norm": 0.586317777633667, "learning_rate": 8.428677672154887e-06, "loss": 0.63, "step": 7366 }, { "epoch": 2.819364714887103, "grad_norm": 0.5214250683784485, "learning_rate": 8.426229549983428e-06, "loss": 0.6971, "step": 7367 }, { "epoch": 2.819747416762342, "grad_norm": 0.524108350276947, "learning_rate": 8.423781524524396e-06, "loss": 0.6231, "step": 7368 }, { "epoch": 2.8201301186375813, "grad_norm": 0.5297930836677551, "learning_rate": 8.421333595928218e-06, "loss": 0.5875, "step": 7369 }, { "epoch": 2.8205128205128203, "grad_norm": 0.5175272822380066, "learning_rate": 8.418885764345332e-06, "loss": 0.6084, "step": 7370 }, { "epoch": 2.8208955223880596, "grad_norm": 0.5468437075614929, "learning_rate": 8.416438029926167e-06, "loss": 0.6045, "step": 7371 }, { "epoch": 2.821278224263299, "grad_norm": 0.5287004709243774, "learning_rate": 8.413990392821137e-06, "loss": 0.5846, "step": 7372 }, { "epoch": 2.821660926138538, "grad_norm": 0.5033250451087952, "learning_rate": 8.411542853180657e-06, "loss": 0.5563, "step": 7373 }, { "epoch": 2.8220436280137773, "grad_norm": 0.5143627524375916, "learning_rate": 8.40909541115514e-06, "loss": 0.5952, "step": 7374 }, { "epoch": 2.8224263298890166, "grad_norm": 0.5360648036003113, "learning_rate": 8.40664806689498e-06, "loss": 0.6529, "step": 7375 }, { "epoch": 2.8228090317642556, "grad_norm": 0.5007986426353455, "learning_rate": 8.404200820550577e-06, "loss": 0.519, "step": 7376 }, { "epoch": 2.823191733639495, "grad_norm": 0.5283035635948181, "learning_rate": 8.401753672272321e-06, "loss": 0.6504, "step": 7377 }, { "epoch": 2.823574435514734, "grad_norm": 0.5394375920295715, "learning_rate": 8.3993066222106e-06, "loss": 0.5699, "step": 7378 }, { "epoch": 2.8239571373899732, "grad_norm": 0.6139810085296631, "learning_rate": 8.396859670515784e-06, "loss": 0.6509, "step": 7379 }, { "epoch": 2.824339839265212, "grad_norm": 0.520774245262146, "learning_rate": 8.394412817338247e-06, "loss": 0.6646, "step": 7380 }, { "epoch": 2.8247225411404515, "grad_norm": 0.5068051815032959, "learning_rate": 8.391966062828358e-06, "loss": 0.6441, "step": 7381 }, { "epoch": 2.825105243015691, "grad_norm": 0.6102548241615295, "learning_rate": 8.389519407136471e-06, "loss": 0.59, "step": 7382 }, { "epoch": 2.82548794489093, "grad_norm": 0.5751034617424011, "learning_rate": 8.387072850412944e-06, "loss": 0.7058, "step": 7383 }, { "epoch": 2.825870646766169, "grad_norm": 0.49970439076423645, "learning_rate": 8.384626392808126e-06, "loss": 0.6943, "step": 7384 }, { "epoch": 2.8262533486414085, "grad_norm": 0.513353705406189, "learning_rate": 8.382180034472353e-06, "loss": 0.6086, "step": 7385 }, { "epoch": 2.8266360505166475, "grad_norm": 0.6245337128639221, "learning_rate": 8.379733775555964e-06, "loss": 0.6595, "step": 7386 }, { "epoch": 2.827018752391887, "grad_norm": 0.6014984846115112, "learning_rate": 8.377287616209291e-06, "loss": 0.728, "step": 7387 }, { "epoch": 2.8274014542671257, "grad_norm": 0.5844564437866211, "learning_rate": 8.37484155658265e-06, "loss": 0.6967, "step": 7388 }, { "epoch": 2.827784156142365, "grad_norm": 0.5083109736442566, "learning_rate": 8.37239559682636e-06, "loss": 0.552, "step": 7389 }, { "epoch": 2.828166858017604, "grad_norm": 0.543595552444458, "learning_rate": 8.369949737090734e-06, "loss": 0.6101, "step": 7390 }, { "epoch": 2.8285495598928434, "grad_norm": 0.5278576612472534, "learning_rate": 8.367503977526076e-06, "loss": 0.642, "step": 7391 }, { "epoch": 2.8289322617680828, "grad_norm": 0.5071812868118286, "learning_rate": 8.365058318282685e-06, "loss": 0.634, "step": 7392 }, { "epoch": 2.8293149636433217, "grad_norm": 0.5276254415512085, "learning_rate": 8.36261275951085e-06, "loss": 0.7025, "step": 7393 }, { "epoch": 2.829697665518561, "grad_norm": 0.5188202261924744, "learning_rate": 8.360167301360866e-06, "loss": 0.6234, "step": 7394 }, { "epoch": 2.8300803673938004, "grad_norm": 0.5196437239646912, "learning_rate": 8.357721943983e-06, "loss": 0.6022, "step": 7395 }, { "epoch": 2.8304630692690393, "grad_norm": 0.527555525302887, "learning_rate": 8.355276687527535e-06, "loss": 0.6536, "step": 7396 }, { "epoch": 2.8308457711442787, "grad_norm": 0.502953827381134, "learning_rate": 8.35283153214474e-06, "loss": 0.6735, "step": 7397 }, { "epoch": 2.8312284730195176, "grad_norm": 0.5226002931594849, "learning_rate": 8.350386477984867e-06, "loss": 0.6167, "step": 7398 }, { "epoch": 2.831611174894757, "grad_norm": 0.524909496307373, "learning_rate": 8.347941525198179e-06, "loss": 0.6183, "step": 7399 }, { "epoch": 2.831993876769996, "grad_norm": 0.5038281679153442, "learning_rate": 8.345496673934921e-06, "loss": 0.6582, "step": 7400 }, { "epoch": 2.8323765786452353, "grad_norm": 0.5414998531341553, "learning_rate": 8.343051924345338e-06, "loss": 0.6114, "step": 7401 }, { "epoch": 2.8327592805204747, "grad_norm": 0.5002110004425049, "learning_rate": 8.340607276579664e-06, "loss": 0.645, "step": 7402 }, { "epoch": 2.8331419823957136, "grad_norm": 0.536632239818573, "learning_rate": 8.338162730788131e-06, "loss": 0.6575, "step": 7403 }, { "epoch": 2.833524684270953, "grad_norm": 0.5670983195304871, "learning_rate": 8.335718287120969e-06, "loss": 0.7001, "step": 7404 }, { "epoch": 2.8339073861461923, "grad_norm": 0.515105128288269, "learning_rate": 8.333273945728382e-06, "loss": 0.5928, "step": 7405 }, { "epoch": 2.8342900880214312, "grad_norm": 0.5397792458534241, "learning_rate": 8.330829706760589e-06, "loss": 0.6643, "step": 7406 }, { "epoch": 2.8346727898966706, "grad_norm": 0.5360637307167053, "learning_rate": 8.328385570367801e-06, "loss": 0.6467, "step": 7407 }, { "epoch": 2.8350554917719095, "grad_norm": 0.536686897277832, "learning_rate": 8.325941536700206e-06, "loss": 0.5996, "step": 7408 }, { "epoch": 2.835438193647149, "grad_norm": 0.5065423250198364, "learning_rate": 8.323497605908e-06, "loss": 0.6488, "step": 7409 }, { "epoch": 2.835820895522388, "grad_norm": 0.557621955871582, "learning_rate": 8.321053778141373e-06, "loss": 0.6549, "step": 7410 }, { "epoch": 2.836203597397627, "grad_norm": 0.5550623536109924, "learning_rate": 8.318610053550498e-06, "loss": 0.5656, "step": 7411 }, { "epoch": 2.8365862992728665, "grad_norm": 0.526435911655426, "learning_rate": 8.316166432285554e-06, "loss": 0.5876, "step": 7412 }, { "epoch": 2.8369690011481055, "grad_norm": 0.5090694427490234, "learning_rate": 8.31372291449671e-06, "loss": 0.719, "step": 7413 }, { "epoch": 2.837351703023345, "grad_norm": 0.5328282117843628, "learning_rate": 8.311279500334119e-06, "loss": 0.6151, "step": 7414 }, { "epoch": 2.837734404898584, "grad_norm": 0.5498670339584351, "learning_rate": 8.308836189947939e-06, "loss": 0.6465, "step": 7415 }, { "epoch": 2.838117106773823, "grad_norm": 0.5169886946678162, "learning_rate": 8.306392983488316e-06, "loss": 0.6737, "step": 7416 }, { "epoch": 2.8384998086490625, "grad_norm": 0.5187084078788757, "learning_rate": 8.303949881105402e-06, "loss": 0.6193, "step": 7417 }, { "epoch": 2.8388825105243014, "grad_norm": 0.5249105095863342, "learning_rate": 8.30150688294932e-06, "loss": 0.5633, "step": 7418 }, { "epoch": 2.8392652123995408, "grad_norm": 0.5105767846107483, "learning_rate": 8.2990639891702e-06, "loss": 0.5932, "step": 7419 }, { "epoch": 2.8396479142747797, "grad_norm": 0.5177664160728455, "learning_rate": 8.296621199918172e-06, "loss": 0.6189, "step": 7420 }, { "epoch": 2.840030616150019, "grad_norm": 0.5395061373710632, "learning_rate": 8.294178515343345e-06, "loss": 0.6106, "step": 7421 }, { "epoch": 2.8404133180252584, "grad_norm": 0.5429565906524658, "learning_rate": 8.29173593559583e-06, "loss": 0.599, "step": 7422 }, { "epoch": 2.8407960199004973, "grad_norm": 0.5616645216941833, "learning_rate": 8.289293460825737e-06, "loss": 0.6625, "step": 7423 }, { "epoch": 2.8411787217757367, "grad_norm": 0.5164635181427002, "learning_rate": 8.28685109118315e-06, "loss": 0.6852, "step": 7424 }, { "epoch": 2.841561423650976, "grad_norm": 0.5229648351669312, "learning_rate": 8.284408826818167e-06, "loss": 0.6407, "step": 7425 }, { "epoch": 2.841944125526215, "grad_norm": 0.5475519895553589, "learning_rate": 8.281966667880874e-06, "loss": 0.6174, "step": 7426 }, { "epoch": 2.8423268274014544, "grad_norm": 0.7142263650894165, "learning_rate": 8.279524614521341e-06, "loss": 0.6195, "step": 7427 }, { "epoch": 2.8427095292766933, "grad_norm": 0.8284965753555298, "learning_rate": 8.277082666889642e-06, "loss": 0.5912, "step": 7428 }, { "epoch": 2.8430922311519327, "grad_norm": 0.48829880356788635, "learning_rate": 8.274640825135843e-06, "loss": 0.6451, "step": 7429 }, { "epoch": 2.8434749330271716, "grad_norm": 0.5565791130065918, "learning_rate": 8.272199089410002e-06, "loss": 0.6773, "step": 7430 }, { "epoch": 2.843857634902411, "grad_norm": 0.5656364560127258, "learning_rate": 8.269757459862169e-06, "loss": 0.5845, "step": 7431 }, { "epoch": 2.8442403367776503, "grad_norm": 0.523144006729126, "learning_rate": 8.267315936642385e-06, "loss": 0.6317, "step": 7432 }, { "epoch": 2.8446230386528892, "grad_norm": 0.6628046631813049, "learning_rate": 8.264874519900697e-06, "loss": 0.6889, "step": 7433 }, { "epoch": 2.8450057405281286, "grad_norm": 0.5521656274795532, "learning_rate": 8.262433209787126e-06, "loss": 0.6687, "step": 7434 }, { "epoch": 2.845388442403368, "grad_norm": 0.525571346282959, "learning_rate": 8.259992006451703e-06, "loss": 0.6196, "step": 7435 }, { "epoch": 2.845771144278607, "grad_norm": 0.5605136156082153, "learning_rate": 8.257550910044447e-06, "loss": 0.6268, "step": 7436 }, { "epoch": 2.8461538461538463, "grad_norm": 0.5529643297195435, "learning_rate": 8.255109920715369e-06, "loss": 0.6685, "step": 7437 }, { "epoch": 2.846536548029085, "grad_norm": 0.5592047572135925, "learning_rate": 8.252669038614472e-06, "loss": 0.5915, "step": 7438 }, { "epoch": 2.8469192499043245, "grad_norm": 0.5164852142333984, "learning_rate": 8.25022826389176e-06, "loss": 0.6095, "step": 7439 }, { "epoch": 2.8473019517795635, "grad_norm": 0.56486976146698, "learning_rate": 8.247787596697217e-06, "loss": 0.7066, "step": 7440 }, { "epoch": 2.847684653654803, "grad_norm": 0.5093870162963867, "learning_rate": 8.245347037180837e-06, "loss": 0.6675, "step": 7441 }, { "epoch": 2.848067355530042, "grad_norm": 0.5227140188217163, "learning_rate": 8.242906585492594e-06, "loss": 0.6542, "step": 7442 }, { "epoch": 2.848450057405281, "grad_norm": 0.6753102540969849, "learning_rate": 8.240466241782465e-06, "loss": 0.6566, "step": 7443 }, { "epoch": 2.8488327592805205, "grad_norm": 0.5389211773872375, "learning_rate": 8.238026006200409e-06, "loss": 0.7585, "step": 7444 }, { "epoch": 2.84921546115576, "grad_norm": 0.5486986637115479, "learning_rate": 8.235585878896389e-06, "loss": 0.647, "step": 7445 }, { "epoch": 2.8495981630309988, "grad_norm": 0.5311877727508545, "learning_rate": 8.233145860020358e-06, "loss": 0.6319, "step": 7446 }, { "epoch": 2.849980864906238, "grad_norm": 0.5589559078216553, "learning_rate": 8.23070594972226e-06, "loss": 0.6899, "step": 7447 }, { "epoch": 2.850363566781477, "grad_norm": 0.500910758972168, "learning_rate": 8.228266148152033e-06, "loss": 0.5801, "step": 7448 }, { "epoch": 2.8507462686567164, "grad_norm": 0.5236615538597107, "learning_rate": 8.225826455459615e-06, "loss": 0.7348, "step": 7449 }, { "epoch": 2.8511289705319554, "grad_norm": 0.5254378914833069, "learning_rate": 8.223386871794922e-06, "loss": 0.586, "step": 7450 }, { "epoch": 2.8515116724071947, "grad_norm": 0.529140293598175, "learning_rate": 8.220947397307881e-06, "loss": 0.6448, "step": 7451 }, { "epoch": 2.851894374282434, "grad_norm": 0.5513941049575806, "learning_rate": 8.218508032148405e-06, "loss": 0.7816, "step": 7452 }, { "epoch": 2.852277076157673, "grad_norm": 0.5424204468727112, "learning_rate": 8.216068776466394e-06, "loss": 0.6292, "step": 7453 }, { "epoch": 2.8526597780329124, "grad_norm": 0.5388266444206238, "learning_rate": 8.213629630411747e-06, "loss": 0.6811, "step": 7454 }, { "epoch": 2.8530424799081517, "grad_norm": 0.5397634506225586, "learning_rate": 8.211190594134359e-06, "loss": 0.5777, "step": 7455 }, { "epoch": 2.8534251817833907, "grad_norm": 0.5078646540641785, "learning_rate": 8.208751667784117e-06, "loss": 0.6948, "step": 7456 }, { "epoch": 2.85380788365863, "grad_norm": 0.49996230006217957, "learning_rate": 8.206312851510896e-06, "loss": 0.6089, "step": 7457 }, { "epoch": 2.854190585533869, "grad_norm": 0.5391510725021362, "learning_rate": 8.203874145464568e-06, "loss": 0.6057, "step": 7458 }, { "epoch": 2.8545732874091083, "grad_norm": 0.5135923624038696, "learning_rate": 8.201435549795003e-06, "loss": 0.6223, "step": 7459 }, { "epoch": 2.8549559892843472, "grad_norm": 0.5525499582290649, "learning_rate": 8.19899706465205e-06, "loss": 0.5814, "step": 7460 }, { "epoch": 2.8553386911595866, "grad_norm": 0.530427098274231, "learning_rate": 8.196558690185567e-06, "loss": 0.5851, "step": 7461 }, { "epoch": 2.855721393034826, "grad_norm": 0.5102974772453308, "learning_rate": 8.1941204265454e-06, "loss": 0.5757, "step": 7462 }, { "epoch": 2.856104094910065, "grad_norm": 0.5459819436073303, "learning_rate": 8.191682273881382e-06, "loss": 0.6981, "step": 7463 }, { "epoch": 2.8564867967853043, "grad_norm": 0.5458293557167053, "learning_rate": 8.189244232343348e-06, "loss": 0.6297, "step": 7464 }, { "epoch": 2.8568694986605436, "grad_norm": 0.5220879912376404, "learning_rate": 8.18680630208112e-06, "loss": 0.6293, "step": 7465 }, { "epoch": 2.8572522005357825, "grad_norm": 0.5683251619338989, "learning_rate": 8.184368483244514e-06, "loss": 0.6324, "step": 7466 }, { "epoch": 2.857634902411022, "grad_norm": 0.6367135047912598, "learning_rate": 8.181930775983343e-06, "loss": 0.6832, "step": 7467 }, { "epoch": 2.858017604286261, "grad_norm": 0.554715096950531, "learning_rate": 8.179493180447411e-06, "loss": 0.6601, "step": 7468 }, { "epoch": 2.8584003061615, "grad_norm": 0.6661410331726074, "learning_rate": 8.177055696786516e-06, "loss": 0.6795, "step": 7469 }, { "epoch": 2.858783008036739, "grad_norm": 0.5731185078620911, "learning_rate": 8.174618325150443e-06, "loss": 0.5516, "step": 7470 }, { "epoch": 2.8591657099119785, "grad_norm": 0.5065010786056519, "learning_rate": 8.172181065688978e-06, "loss": 0.6022, "step": 7471 }, { "epoch": 2.859548411787218, "grad_norm": 0.5334993600845337, "learning_rate": 8.169743918551898e-06, "loss": 0.6448, "step": 7472 }, { "epoch": 2.859931113662457, "grad_norm": 0.5470457673072815, "learning_rate": 8.167306883888969e-06, "loss": 0.7253, "step": 7473 }, { "epoch": 2.860313815537696, "grad_norm": 0.5960391163825989, "learning_rate": 8.164869961849956e-06, "loss": 0.6867, "step": 7474 }, { "epoch": 2.8606965174129355, "grad_norm": 0.5408859848976135, "learning_rate": 8.162433152584615e-06, "loss": 0.6857, "step": 7475 }, { "epoch": 2.8610792192881744, "grad_norm": 0.6891456246376038, "learning_rate": 8.159996456242693e-06, "loss": 0.6512, "step": 7476 }, { "epoch": 2.861461921163414, "grad_norm": 0.5279086828231812, "learning_rate": 8.157559872973932e-06, "loss": 0.6548, "step": 7477 }, { "epoch": 2.8618446230386527, "grad_norm": 0.6176167726516724, "learning_rate": 8.15512340292807e-06, "loss": 0.6325, "step": 7478 }, { "epoch": 2.862227324913892, "grad_norm": 0.5273594856262207, "learning_rate": 8.152687046254826e-06, "loss": 0.5958, "step": 7479 }, { "epoch": 2.862610026789131, "grad_norm": 0.4876686632633209, "learning_rate": 8.150250803103927e-06, "loss": 0.6564, "step": 7480 }, { "epoch": 2.8629927286643704, "grad_norm": 0.540444016456604, "learning_rate": 8.147814673625086e-06, "loss": 0.5765, "step": 7481 }, { "epoch": 2.8633754305396097, "grad_norm": 0.5384907722473145, "learning_rate": 8.145378657968009e-06, "loss": 0.6935, "step": 7482 }, { "epoch": 2.8637581324148487, "grad_norm": 0.6225442290306091, "learning_rate": 8.142942756282394e-06, "loss": 0.6015, "step": 7483 }, { "epoch": 2.864140834290088, "grad_norm": 0.5693854093551636, "learning_rate": 8.140506968717937e-06, "loss": 0.6138, "step": 7484 }, { "epoch": 2.8645235361653274, "grad_norm": 0.6124842166900635, "learning_rate": 8.138071295424324e-06, "loss": 0.5539, "step": 7485 }, { "epoch": 2.8649062380405663, "grad_norm": 0.8516815304756165, "learning_rate": 8.135635736551231e-06, "loss": 0.5702, "step": 7486 }, { "epoch": 2.8652889399158057, "grad_norm": 0.5708482265472412, "learning_rate": 8.133200292248329e-06, "loss": 0.6318, "step": 7487 }, { "epoch": 2.8656716417910446, "grad_norm": 0.5404980182647705, "learning_rate": 8.13076496266529e-06, "loss": 0.7501, "step": 7488 }, { "epoch": 2.866054343666284, "grad_norm": 0.577745795249939, "learning_rate": 8.128329747951759e-06, "loss": 0.6379, "step": 7489 }, { "epoch": 2.866437045541523, "grad_norm": 0.5626717805862427, "learning_rate": 8.125894648257396e-06, "loss": 0.5292, "step": 7490 }, { "epoch": 2.8668197474167623, "grad_norm": 0.5026901364326477, "learning_rate": 8.12345966373184e-06, "loss": 0.5907, "step": 7491 }, { "epoch": 2.8672024492920016, "grad_norm": 0.5590662956237793, "learning_rate": 8.12102479452473e-06, "loss": 0.6119, "step": 7492 }, { "epoch": 2.8675851511672406, "grad_norm": 0.5307954549789429, "learning_rate": 8.118590040785693e-06, "loss": 0.5888, "step": 7493 }, { "epoch": 2.86796785304248, "grad_norm": 0.6078107953071594, "learning_rate": 8.116155402664358e-06, "loss": 0.7015, "step": 7494 }, { "epoch": 2.8683505549177193, "grad_norm": 0.5249902009963989, "learning_rate": 8.113720880310327e-06, "loss": 0.6249, "step": 7495 }, { "epoch": 2.868733256792958, "grad_norm": 0.5703251957893372, "learning_rate": 8.111286473873217e-06, "loss": 0.6302, "step": 7496 }, { "epoch": 2.8691159586681976, "grad_norm": 0.5889406800270081, "learning_rate": 8.108852183502627e-06, "loss": 0.6712, "step": 7497 }, { "epoch": 2.8694986605434365, "grad_norm": 0.5152482390403748, "learning_rate": 8.106418009348157e-06, "loss": 0.6595, "step": 7498 }, { "epoch": 2.869881362418676, "grad_norm": 0.5091840624809265, "learning_rate": 8.103983951559382e-06, "loss": 0.5868, "step": 7499 }, { "epoch": 2.870264064293915, "grad_norm": 0.5257546901702881, "learning_rate": 8.101550010285887e-06, "loss": 0.6554, "step": 7500 }, { "epoch": 2.870646766169154, "grad_norm": 0.5411954522132874, "learning_rate": 8.099116185677246e-06, "loss": 0.5578, "step": 7501 }, { "epoch": 2.8710294680443935, "grad_norm": 0.549518346786499, "learning_rate": 8.09668247788302e-06, "loss": 0.6679, "step": 7502 }, { "epoch": 2.8714121699196324, "grad_norm": 0.5687037706375122, "learning_rate": 8.09424888705277e-06, "loss": 0.7046, "step": 7503 }, { "epoch": 2.871794871794872, "grad_norm": 0.5258949398994446, "learning_rate": 8.091815413336048e-06, "loss": 0.6843, "step": 7504 }, { "epoch": 2.872177573670111, "grad_norm": 0.5290899872779846, "learning_rate": 8.089382056882392e-06, "loss": 0.6417, "step": 7505 }, { "epoch": 2.87256027554535, "grad_norm": 0.5248467922210693, "learning_rate": 8.08694881784134e-06, "loss": 0.6895, "step": 7506 }, { "epoch": 2.8729429774205895, "grad_norm": 0.5584715604782104, "learning_rate": 8.084515696362426e-06, "loss": 0.6524, "step": 7507 }, { "epoch": 2.8733256792958284, "grad_norm": 0.5484394431114197, "learning_rate": 8.082082692595167e-06, "loss": 0.6178, "step": 7508 }, { "epoch": 2.8737083811710677, "grad_norm": 0.6098012924194336, "learning_rate": 8.079649806689078e-06, "loss": 0.6826, "step": 7509 }, { "epoch": 2.8740910830463067, "grad_norm": 0.7335825562477112, "learning_rate": 8.077217038793668e-06, "loss": 0.6203, "step": 7510 }, { "epoch": 2.874473784921546, "grad_norm": 0.5139738321304321, "learning_rate": 8.074784389058439e-06, "loss": 0.6422, "step": 7511 }, { "epoch": 2.8748564867967854, "grad_norm": 0.5166439414024353, "learning_rate": 8.072351857632879e-06, "loss": 0.6201, "step": 7512 }, { "epoch": 2.8752391886720243, "grad_norm": 0.5378976464271545, "learning_rate": 8.069919444666474e-06, "loss": 0.591, "step": 7513 }, { "epoch": 2.8756218905472637, "grad_norm": 0.571902871131897, "learning_rate": 8.06748715030871e-06, "loss": 0.6772, "step": 7514 }, { "epoch": 2.876004592422503, "grad_norm": 0.5457777976989746, "learning_rate": 8.065054974709048e-06, "loss": 0.6423, "step": 7515 }, { "epoch": 2.876387294297742, "grad_norm": 0.512710452079773, "learning_rate": 8.062622918016954e-06, "loss": 0.5654, "step": 7516 }, { "epoch": 2.8767699961729813, "grad_norm": 0.5704416036605835, "learning_rate": 8.060190980381892e-06, "loss": 0.7038, "step": 7517 }, { "epoch": 2.8771526980482207, "grad_norm": 0.5242633819580078, "learning_rate": 8.057759161953303e-06, "loss": 0.6565, "step": 7518 }, { "epoch": 2.8775353999234596, "grad_norm": 0.5160413384437561, "learning_rate": 8.05532746288063e-06, "loss": 0.6753, "step": 7519 }, { "epoch": 2.8779181017986986, "grad_norm": 0.5445396900177002, "learning_rate": 8.052895883313312e-06, "loss": 0.6202, "step": 7520 }, { "epoch": 2.878300803673938, "grad_norm": 0.5377331376075745, "learning_rate": 8.050464423400769e-06, "loss": 0.6792, "step": 7521 }, { "epoch": 2.8786835055491773, "grad_norm": 0.5406811833381653, "learning_rate": 8.048033083292426e-06, "loss": 0.6393, "step": 7522 }, { "epoch": 2.879066207424416, "grad_norm": 0.5581687688827515, "learning_rate": 8.045601863137694e-06, "loss": 0.6125, "step": 7523 }, { "epoch": 2.8794489092996556, "grad_norm": 0.54111647605896, "learning_rate": 8.043170763085981e-06, "loss": 0.6479, "step": 7524 }, { "epoch": 2.879831611174895, "grad_norm": 0.5306318998336792, "learning_rate": 8.040739783286677e-06, "loss": 0.6178, "step": 7525 }, { "epoch": 2.880214313050134, "grad_norm": 0.5598363876342773, "learning_rate": 8.038308923889179e-06, "loss": 0.631, "step": 7526 }, { "epoch": 2.8805970149253732, "grad_norm": 0.5011361837387085, "learning_rate": 8.035878185042869e-06, "loss": 0.5659, "step": 7527 }, { "epoch": 2.8809797168006126, "grad_norm": 0.5212870240211487, "learning_rate": 8.033447566897117e-06, "loss": 0.6181, "step": 7528 }, { "epoch": 2.8813624186758515, "grad_norm": 0.5469019412994385, "learning_rate": 8.031017069601298e-06, "loss": 0.6598, "step": 7529 }, { "epoch": 2.8817451205510904, "grad_norm": 0.528819739818573, "learning_rate": 8.028586693304771e-06, "loss": 0.598, "step": 7530 }, { "epoch": 2.88212782242633, "grad_norm": 0.524455726146698, "learning_rate": 8.026156438156885e-06, "loss": 0.6555, "step": 7531 }, { "epoch": 2.882510524301569, "grad_norm": 0.514059841632843, "learning_rate": 8.02372630430699e-06, "loss": 0.6077, "step": 7532 }, { "epoch": 2.882893226176808, "grad_norm": 0.5546913146972656, "learning_rate": 8.021296291904426e-06, "loss": 0.6285, "step": 7533 }, { "epoch": 2.8832759280520475, "grad_norm": 0.5315089225769043, "learning_rate": 8.018866401098516e-06, "loss": 0.6553, "step": 7534 }, { "epoch": 2.883658629927287, "grad_norm": 0.6104320287704468, "learning_rate": 8.016436632038587e-06, "loss": 0.6506, "step": 7535 }, { "epoch": 2.8840413318025258, "grad_norm": 0.5216886401176453, "learning_rate": 8.014006984873959e-06, "loss": 0.6249, "step": 7536 }, { "epoch": 2.884424033677765, "grad_norm": 0.530007541179657, "learning_rate": 8.011577459753936e-06, "loss": 0.7159, "step": 7537 }, { "epoch": 2.8848067355530045, "grad_norm": 0.5576090216636658, "learning_rate": 8.00914805682782e-06, "loss": 0.6705, "step": 7538 }, { "epoch": 2.8851894374282434, "grad_norm": 0.615207850933075, "learning_rate": 8.006718776244905e-06, "loss": 0.5851, "step": 7539 }, { "epoch": 2.8855721393034823, "grad_norm": 0.5709442496299744, "learning_rate": 8.004289618154474e-06, "loss": 0.7379, "step": 7540 }, { "epoch": 2.8859548411787217, "grad_norm": 0.5621057152748108, "learning_rate": 8.001860582705809e-06, "loss": 0.7662, "step": 7541 }, { "epoch": 2.886337543053961, "grad_norm": 0.5026713013648987, "learning_rate": 7.999431670048179e-06, "loss": 0.5885, "step": 7542 }, { "epoch": 2.8867202449292, "grad_norm": 0.6064246296882629, "learning_rate": 7.997002880330851e-06, "loss": 0.6221, "step": 7543 }, { "epoch": 2.8871029468044394, "grad_norm": 0.5862166285514832, "learning_rate": 7.994574213703072e-06, "loss": 0.6859, "step": 7544 }, { "epoch": 2.8874856486796787, "grad_norm": 0.6057085394859314, "learning_rate": 7.992145670314096e-06, "loss": 0.6581, "step": 7545 }, { "epoch": 2.8878683505549176, "grad_norm": 0.5277437567710876, "learning_rate": 7.989717250313165e-06, "loss": 0.7006, "step": 7546 }, { "epoch": 2.888251052430157, "grad_norm": 0.5874389410018921, "learning_rate": 7.987288953849509e-06, "loss": 0.6429, "step": 7547 }, { "epoch": 2.8886337543053964, "grad_norm": 0.5052465200424194, "learning_rate": 7.98486078107235e-06, "loss": 0.6531, "step": 7548 }, { "epoch": 2.8890164561806353, "grad_norm": 0.4927205443382263, "learning_rate": 7.982432732130911e-06, "loss": 0.6706, "step": 7549 }, { "epoch": 2.889399158055874, "grad_norm": 0.5405209064483643, "learning_rate": 7.980004807174403e-06, "loss": 0.6892, "step": 7550 }, { "epoch": 2.8897818599311136, "grad_norm": 0.6321975588798523, "learning_rate": 7.977577006352027e-06, "loss": 0.5918, "step": 7551 }, { "epoch": 2.890164561806353, "grad_norm": 0.5350271463394165, "learning_rate": 7.975149329812974e-06, "loss": 0.6209, "step": 7552 }, { "epoch": 2.890547263681592, "grad_norm": 0.5905309319496155, "learning_rate": 7.97272177770644e-06, "loss": 0.6648, "step": 7553 }, { "epoch": 2.8909299655568312, "grad_norm": 0.5453476309776306, "learning_rate": 7.970294350181595e-06, "loss": 0.698, "step": 7554 }, { "epoch": 2.8913126674320706, "grad_norm": 0.5301173329353333, "learning_rate": 7.967867047387614e-06, "loss": 0.6774, "step": 7555 }, { "epoch": 2.8916953693073095, "grad_norm": 0.5416708588600159, "learning_rate": 7.965439869473664e-06, "loss": 0.5867, "step": 7556 }, { "epoch": 2.892078071182549, "grad_norm": 0.562067985534668, "learning_rate": 7.963012816588899e-06, "loss": 0.6714, "step": 7557 }, { "epoch": 2.8924607730577883, "grad_norm": 0.517331600189209, "learning_rate": 7.960585888882469e-06, "loss": 0.6815, "step": 7558 }, { "epoch": 2.892843474933027, "grad_norm": 0.6165717840194702, "learning_rate": 7.958159086503518e-06, "loss": 0.7691, "step": 7559 }, { "epoch": 2.893226176808266, "grad_norm": 0.5498727560043335, "learning_rate": 7.955732409601172e-06, "loss": 0.5327, "step": 7560 }, { "epoch": 2.8936088786835055, "grad_norm": 0.5229191780090332, "learning_rate": 7.95330585832456e-06, "loss": 0.6118, "step": 7561 }, { "epoch": 2.893991580558745, "grad_norm": 0.5808528661727905, "learning_rate": 7.950879432822804e-06, "loss": 0.6663, "step": 7562 }, { "epoch": 2.8943742824339838, "grad_norm": 0.7083708643913269, "learning_rate": 7.948453133245015e-06, "loss": 0.5767, "step": 7563 }, { "epoch": 2.894756984309223, "grad_norm": 0.504085898399353, "learning_rate": 7.946026959740289e-06, "loss": 0.6967, "step": 7564 }, { "epoch": 2.8951396861844625, "grad_norm": 0.5491620302200317, "learning_rate": 7.943600912457723e-06, "loss": 0.6279, "step": 7565 }, { "epoch": 2.8955223880597014, "grad_norm": 0.5975731015205383, "learning_rate": 7.941174991546409e-06, "loss": 0.6813, "step": 7566 }, { "epoch": 2.895905089934941, "grad_norm": 0.5178430080413818, "learning_rate": 7.938749197155419e-06, "loss": 0.5834, "step": 7567 }, { "epoch": 2.89628779181018, "grad_norm": 0.4938093423843384, "learning_rate": 7.93632352943383e-06, "loss": 0.6175, "step": 7568 }, { "epoch": 2.896670493685419, "grad_norm": 0.5779908895492554, "learning_rate": 7.93389798853071e-06, "loss": 0.6713, "step": 7569 }, { "epoch": 2.897053195560658, "grad_norm": 0.5088751912117004, "learning_rate": 7.931472574595102e-06, "loss": 0.5696, "step": 7570 }, { "epoch": 2.8974358974358974, "grad_norm": 0.5833571553230286, "learning_rate": 7.92904728777606e-06, "loss": 0.6555, "step": 7571 }, { "epoch": 2.8978185993111367, "grad_norm": 0.5416454076766968, "learning_rate": 7.926622128222633e-06, "loss": 0.6635, "step": 7572 }, { "epoch": 2.8982013011863756, "grad_norm": 0.5075896978378296, "learning_rate": 7.924197096083841e-06, "loss": 0.6272, "step": 7573 }, { "epoch": 2.898584003061615, "grad_norm": 0.5328910946846008, "learning_rate": 7.921772191508714e-06, "loss": 0.6037, "step": 7574 }, { "epoch": 2.8989667049368544, "grad_norm": 0.5526772737503052, "learning_rate": 7.91934741464627e-06, "loss": 0.653, "step": 7575 }, { "epoch": 2.8993494068120933, "grad_norm": 0.5421561598777771, "learning_rate": 7.916922765645518e-06, "loss": 0.6139, "step": 7576 }, { "epoch": 2.8997321086873327, "grad_norm": 0.5601520538330078, "learning_rate": 7.914498244655456e-06, "loss": 0.5872, "step": 7577 }, { "epoch": 2.900114810562572, "grad_norm": 0.5235385894775391, "learning_rate": 7.912073851825081e-06, "loss": 0.6565, "step": 7578 }, { "epoch": 2.900497512437811, "grad_norm": 0.5426198840141296, "learning_rate": 7.90964958730338e-06, "loss": 0.5921, "step": 7579 }, { "epoch": 2.90088021431305, "grad_norm": 0.556804358959198, "learning_rate": 7.907225451239324e-06, "loss": 0.6007, "step": 7580 }, { "epoch": 2.9012629161882892, "grad_norm": 0.5276386141777039, "learning_rate": 7.904801443781885e-06, "loss": 0.5865, "step": 7581 }, { "epoch": 2.9016456180635286, "grad_norm": 0.5791065692901611, "learning_rate": 7.902377565080029e-06, "loss": 0.6745, "step": 7582 }, { "epoch": 2.9020283199387675, "grad_norm": 0.5194824934005737, "learning_rate": 7.899953815282705e-06, "loss": 0.6276, "step": 7583 }, { "epoch": 2.902411021814007, "grad_norm": 0.4822940230369568, "learning_rate": 7.89753019453886e-06, "loss": 0.5819, "step": 7584 }, { "epoch": 2.9027937236892463, "grad_norm": 0.47746050357818604, "learning_rate": 7.895106702997437e-06, "loss": 0.6476, "step": 7585 }, { "epoch": 2.903176425564485, "grad_norm": 0.5295459032058716, "learning_rate": 7.892683340807357e-06, "loss": 0.6858, "step": 7586 }, { "epoch": 2.9035591274397246, "grad_norm": 0.5577538013458252, "learning_rate": 7.890260108117549e-06, "loss": 0.7392, "step": 7587 }, { "epoch": 2.903941829314964, "grad_norm": 0.5097273588180542, "learning_rate": 7.887837005076925e-06, "loss": 0.678, "step": 7588 }, { "epoch": 2.904324531190203, "grad_norm": 0.6146194934844971, "learning_rate": 7.885414031834396e-06, "loss": 0.64, "step": 7589 }, { "epoch": 2.9047072330654418, "grad_norm": 0.5283464193344116, "learning_rate": 7.882991188538849e-06, "loss": 0.7416, "step": 7590 }, { "epoch": 2.905089934940681, "grad_norm": 0.5343402028083801, "learning_rate": 7.880568475339184e-06, "loss": 0.6333, "step": 7591 }, { "epoch": 2.9054726368159205, "grad_norm": 0.5046136975288391, "learning_rate": 7.878145892384279e-06, "loss": 0.5614, "step": 7592 }, { "epoch": 2.9058553386911594, "grad_norm": 0.5068636536598206, "learning_rate": 7.87572343982301e-06, "loss": 0.6372, "step": 7593 }, { "epoch": 2.906238040566399, "grad_norm": 0.5610997676849365, "learning_rate": 7.87330111780424e-06, "loss": 0.5748, "step": 7594 }, { "epoch": 2.906620742441638, "grad_norm": 0.5105218887329102, "learning_rate": 7.87087892647683e-06, "loss": 0.6739, "step": 7595 }, { "epoch": 2.907003444316877, "grad_norm": 0.5414740443229675, "learning_rate": 7.868456865989632e-06, "loss": 0.619, "step": 7596 }, { "epoch": 2.9073861461921164, "grad_norm": 0.5040915608406067, "learning_rate": 7.866034936491485e-06, "loss": 0.566, "step": 7597 }, { "epoch": 2.907768848067356, "grad_norm": 0.734248697757721, "learning_rate": 7.863613138131227e-06, "loss": 0.6907, "step": 7598 }, { "epoch": 2.9081515499425947, "grad_norm": 0.5582904815673828, "learning_rate": 7.861191471057677e-06, "loss": 0.6909, "step": 7599 }, { "epoch": 2.9085342518178336, "grad_norm": 0.5404592156410217, "learning_rate": 7.858769935419658e-06, "loss": 0.6347, "step": 7600 }, { "epoch": 2.908916953693073, "grad_norm": 0.5905028581619263, "learning_rate": 7.856348531365978e-06, "loss": 0.7243, "step": 7601 }, { "epoch": 2.9092996555683124, "grad_norm": 0.6106082797050476, "learning_rate": 7.853927259045442e-06, "loss": 0.7285, "step": 7602 }, { "epoch": 2.9096823574435513, "grad_norm": 0.5314397215843201, "learning_rate": 7.85150611860684e-06, "loss": 0.6297, "step": 7603 }, { "epoch": 2.9100650593187907, "grad_norm": 0.5098443627357483, "learning_rate": 7.849085110198957e-06, "loss": 0.6194, "step": 7604 }, { "epoch": 2.91044776119403, "grad_norm": 0.5693724751472473, "learning_rate": 7.846664233970579e-06, "loss": 0.6845, "step": 7605 }, { "epoch": 2.910830463069269, "grad_norm": 0.51168292760849, "learning_rate": 7.84424349007046e-06, "loss": 0.5222, "step": 7606 }, { "epoch": 2.9112131649445083, "grad_norm": 0.6391201019287109, "learning_rate": 7.841822878647373e-06, "loss": 0.6276, "step": 7607 }, { "epoch": 2.9115958668197477, "grad_norm": 0.573422372341156, "learning_rate": 7.839402399850074e-06, "loss": 0.6948, "step": 7608 }, { "epoch": 2.9119785686949866, "grad_norm": 0.5296555757522583, "learning_rate": 7.836982053827296e-06, "loss": 0.6375, "step": 7609 }, { "epoch": 2.9123612705702255, "grad_norm": 0.5941628813743591, "learning_rate": 7.834561840727781e-06, "loss": 0.6261, "step": 7610 }, { "epoch": 2.912743972445465, "grad_norm": 0.551164448261261, "learning_rate": 7.832141760700262e-06, "loss": 0.6551, "step": 7611 }, { "epoch": 2.9131266743207043, "grad_norm": 0.5237770080566406, "learning_rate": 7.829721813893452e-06, "loss": 0.6349, "step": 7612 }, { "epoch": 2.913509376195943, "grad_norm": 0.6370853185653687, "learning_rate": 7.827302000456069e-06, "loss": 0.6942, "step": 7613 }, { "epoch": 2.9138920780711826, "grad_norm": 0.5675819516181946, "learning_rate": 7.824882320536814e-06, "loss": 0.5981, "step": 7614 }, { "epoch": 2.914274779946422, "grad_norm": 0.5611596703529358, "learning_rate": 7.822462774284389e-06, "loss": 0.6071, "step": 7615 }, { "epoch": 2.914657481821661, "grad_norm": 0.5156307220458984, "learning_rate": 7.820043361847468e-06, "loss": 0.6388, "step": 7616 }, { "epoch": 2.9150401836969, "grad_norm": 0.5731513500213623, "learning_rate": 7.817624083374742e-06, "loss": 0.5946, "step": 7617 }, { "epoch": 2.9154228855721396, "grad_norm": 0.5291402339935303, "learning_rate": 7.815204939014884e-06, "loss": 0.6299, "step": 7618 }, { "epoch": 2.9158055874473785, "grad_norm": 0.5353744029998779, "learning_rate": 7.812785928916547e-06, "loss": 0.7166, "step": 7619 }, { "epoch": 2.9161882893226174, "grad_norm": 0.5524737238883972, "learning_rate": 7.810367053228391e-06, "loss": 0.6184, "step": 7620 }, { "epoch": 2.916570991197857, "grad_norm": 0.5389162302017212, "learning_rate": 7.807948312099062e-06, "loss": 0.6024, "step": 7621 }, { "epoch": 2.916953693073096, "grad_norm": 0.5796102285385132, "learning_rate": 7.8055297056772e-06, "loss": 0.6125, "step": 7622 }, { "epoch": 2.917336394948335, "grad_norm": 0.5873121619224548, "learning_rate": 7.80311123411143e-06, "loss": 0.6474, "step": 7623 }, { "epoch": 2.9177190968235744, "grad_norm": 0.5515313744544983, "learning_rate": 7.800692897550381e-06, "loss": 0.6583, "step": 7624 }, { "epoch": 2.918101798698814, "grad_norm": 0.5492766499519348, "learning_rate": 7.798274696142657e-06, "loss": 0.5854, "step": 7625 }, { "epoch": 2.9184845005740527, "grad_norm": 0.5705956816673279, "learning_rate": 7.795856630036871e-06, "loss": 0.6368, "step": 7626 }, { "epoch": 2.918867202449292, "grad_norm": 0.5284010171890259, "learning_rate": 7.793438699381613e-06, "loss": 0.6513, "step": 7627 }, { "epoch": 2.9192499043245315, "grad_norm": 0.5354149341583252, "learning_rate": 7.791020904325478e-06, "loss": 0.6376, "step": 7628 }, { "epoch": 2.9196326061997704, "grad_norm": 0.49906590580940247, "learning_rate": 7.788603245017043e-06, "loss": 0.7542, "step": 7629 }, { "epoch": 2.9200153080750093, "grad_norm": 0.5040558576583862, "learning_rate": 7.786185721604877e-06, "loss": 0.588, "step": 7630 }, { "epoch": 2.9203980099502487, "grad_norm": 0.5268367528915405, "learning_rate": 7.78376833423755e-06, "loss": 0.6094, "step": 7631 }, { "epoch": 2.920780711825488, "grad_norm": 0.5496755838394165, "learning_rate": 7.781351083063609e-06, "loss": 0.595, "step": 7632 }, { "epoch": 2.921163413700727, "grad_norm": 0.5585220456123352, "learning_rate": 7.778933968231603e-06, "loss": 0.5666, "step": 7633 }, { "epoch": 2.9215461155759663, "grad_norm": 0.5419743061065674, "learning_rate": 7.776516989890078e-06, "loss": 0.6583, "step": 7634 }, { "epoch": 2.9219288174512057, "grad_norm": 0.505799412727356, "learning_rate": 7.774100148187553e-06, "loss": 0.6692, "step": 7635 }, { "epoch": 2.9223115193264446, "grad_norm": 0.519078254699707, "learning_rate": 7.771683443272552e-06, "loss": 0.6308, "step": 7636 }, { "epoch": 2.922694221201684, "grad_norm": 0.5369864702224731, "learning_rate": 7.769266875293594e-06, "loss": 0.583, "step": 7637 }, { "epoch": 2.9230769230769234, "grad_norm": 0.5220605731010437, "learning_rate": 7.766850444399176e-06, "loss": 0.6401, "step": 7638 }, { "epoch": 2.9234596249521623, "grad_norm": 0.523038387298584, "learning_rate": 7.764434150737798e-06, "loss": 0.6929, "step": 7639 }, { "epoch": 2.923842326827401, "grad_norm": 0.5043748021125793, "learning_rate": 7.762017994457947e-06, "loss": 0.603, "step": 7640 }, { "epoch": 2.9242250287026406, "grad_norm": 0.5992648005485535, "learning_rate": 7.759601975708105e-06, "loss": 0.6395, "step": 7641 }, { "epoch": 2.92460773057788, "grad_norm": 0.6046569347381592, "learning_rate": 7.757186094636739e-06, "loss": 0.6303, "step": 7642 }, { "epoch": 2.924990432453119, "grad_norm": 0.563970148563385, "learning_rate": 7.754770351392311e-06, "loss": 0.7242, "step": 7643 }, { "epoch": 2.925373134328358, "grad_norm": 0.5450855493545532, "learning_rate": 7.752354746123281e-06, "loss": 0.5917, "step": 7644 }, { "epoch": 2.9257558362035976, "grad_norm": 0.548077404499054, "learning_rate": 7.749939278978087e-06, "loss": 0.6219, "step": 7645 }, { "epoch": 2.9261385380788365, "grad_norm": 0.5607596039772034, "learning_rate": 7.747523950105169e-06, "loss": 0.6908, "step": 7646 }, { "epoch": 2.926521239954076, "grad_norm": 0.5786207318305969, "learning_rate": 7.745108759652957e-06, "loss": 0.7647, "step": 7647 }, { "epoch": 2.9269039418293152, "grad_norm": 0.5422199964523315, "learning_rate": 7.742693707769869e-06, "loss": 0.619, "step": 7648 }, { "epoch": 2.927286643704554, "grad_norm": 0.5529083013534546, "learning_rate": 7.740278794604314e-06, "loss": 0.6767, "step": 7649 }, { "epoch": 2.927669345579793, "grad_norm": 0.514346182346344, "learning_rate": 7.737864020304702e-06, "loss": 0.651, "step": 7650 }, { "epoch": 2.9280520474550324, "grad_norm": 0.5437756776809692, "learning_rate": 7.73544938501942e-06, "loss": 0.6738, "step": 7651 }, { "epoch": 2.928434749330272, "grad_norm": 0.5375494956970215, "learning_rate": 7.733034888896858e-06, "loss": 0.6693, "step": 7652 }, { "epoch": 2.9288174512055107, "grad_norm": 0.6427674293518066, "learning_rate": 7.73062053208539e-06, "loss": 0.5915, "step": 7653 }, { "epoch": 2.92920015308075, "grad_norm": 0.5203753113746643, "learning_rate": 7.728206314733396e-06, "loss": 0.4835, "step": 7654 }, { "epoch": 2.9295828549559895, "grad_norm": 0.5136904120445251, "learning_rate": 7.725792236989218e-06, "loss": 0.5551, "step": 7655 }, { "epoch": 2.9299655568312284, "grad_norm": 0.5525617599487305, "learning_rate": 7.723378299001219e-06, "loss": 0.675, "step": 7656 }, { "epoch": 2.9303482587064678, "grad_norm": 0.4992533326148987, "learning_rate": 7.720964500917742e-06, "loss": 0.5749, "step": 7657 }, { "epoch": 2.930730960581707, "grad_norm": 0.5420740842819214, "learning_rate": 7.718550842887117e-06, "loss": 0.7055, "step": 7658 }, { "epoch": 2.931113662456946, "grad_norm": 0.5519652366638184, "learning_rate": 7.71613732505767e-06, "loss": 0.679, "step": 7659 }, { "epoch": 2.931496364332185, "grad_norm": 0.7087491750717163, "learning_rate": 7.713723947577725e-06, "loss": 0.6038, "step": 7660 }, { "epoch": 2.9318790662074243, "grad_norm": 0.5696932077407837, "learning_rate": 7.711310710595578e-06, "loss": 0.6319, "step": 7661 }, { "epoch": 2.9322617680826637, "grad_norm": 0.5533671975135803, "learning_rate": 7.70889761425954e-06, "loss": 0.6488, "step": 7662 }, { "epoch": 2.9326444699579026, "grad_norm": 0.595146656036377, "learning_rate": 7.706484658717903e-06, "loss": 0.6901, "step": 7663 }, { "epoch": 2.933027171833142, "grad_norm": 0.5362987518310547, "learning_rate": 7.70407184411894e-06, "loss": 0.6175, "step": 7664 }, { "epoch": 2.9334098737083814, "grad_norm": 0.5558404922485352, "learning_rate": 7.701659170610932e-06, "loss": 0.6468, "step": 7665 }, { "epoch": 2.9337925755836203, "grad_norm": 0.5249074697494507, "learning_rate": 7.699246638342142e-06, "loss": 0.6102, "step": 7666 }, { "epoch": 2.9341752774588596, "grad_norm": 0.5575051307678223, "learning_rate": 7.696834247460827e-06, "loss": 0.6143, "step": 7667 }, { "epoch": 2.934557979334099, "grad_norm": 0.5617652535438538, "learning_rate": 7.694421998115235e-06, "loss": 0.6218, "step": 7668 }, { "epoch": 2.934940681209338, "grad_norm": 0.5211566090583801, "learning_rate": 7.692009890453604e-06, "loss": 0.6222, "step": 7669 }, { "epoch": 2.935323383084577, "grad_norm": 0.491285115480423, "learning_rate": 7.68959792462417e-06, "loss": 0.5677, "step": 7670 }, { "epoch": 2.935706084959816, "grad_norm": 0.5311253666877747, "learning_rate": 7.687186100775147e-06, "loss": 0.6, "step": 7671 }, { "epoch": 2.9360887868350556, "grad_norm": 0.5108537673950195, "learning_rate": 7.684774419054748e-06, "loss": 0.5429, "step": 7672 }, { "epoch": 2.9364714887102945, "grad_norm": 0.5252341628074646, "learning_rate": 7.682362879611189e-06, "loss": 0.6429, "step": 7673 }, { "epoch": 2.936854190585534, "grad_norm": 0.5284726023674011, "learning_rate": 7.679951482592651e-06, "loss": 0.6192, "step": 7674 }, { "epoch": 2.9372368924607732, "grad_norm": 0.5532122850418091, "learning_rate": 7.677540228147328e-06, "loss": 0.6287, "step": 7675 }, { "epoch": 2.937619594336012, "grad_norm": 0.5464686155319214, "learning_rate": 7.6751291164234e-06, "loss": 0.6806, "step": 7676 }, { "epoch": 2.9380022962112515, "grad_norm": 0.5731819272041321, "learning_rate": 7.672718147569031e-06, "loss": 0.6983, "step": 7677 }, { "epoch": 2.938384998086491, "grad_norm": 0.49880552291870117, "learning_rate": 7.670307321732383e-06, "loss": 0.6151, "step": 7678 }, { "epoch": 2.93876769996173, "grad_norm": 0.48595547676086426, "learning_rate": 7.66789663906161e-06, "loss": 0.5572, "step": 7679 }, { "epoch": 2.9391504018369687, "grad_norm": 0.5231351852416992, "learning_rate": 7.665486099704858e-06, "loss": 0.5881, "step": 7680 }, { "epoch": 2.939533103712208, "grad_norm": 0.5604549646377563, "learning_rate": 7.663075703810251e-06, "loss": 0.6675, "step": 7681 }, { "epoch": 2.9399158055874475, "grad_norm": 0.6149938106536865, "learning_rate": 7.66066545152592e-06, "loss": 0.7179, "step": 7682 }, { "epoch": 2.9402985074626864, "grad_norm": 0.5581313967704773, "learning_rate": 7.658255342999988e-06, "loss": 0.5659, "step": 7683 }, { "epoch": 2.9406812093379258, "grad_norm": 0.5281447172164917, "learning_rate": 7.655845378380551e-06, "loss": 0.5981, "step": 7684 }, { "epoch": 2.941063911213165, "grad_norm": 0.5293745398521423, "learning_rate": 7.653435557815713e-06, "loss": 0.6212, "step": 7685 }, { "epoch": 2.941446613088404, "grad_norm": 0.5590645670890808, "learning_rate": 7.651025881453566e-06, "loss": 0.6519, "step": 7686 }, { "epoch": 2.9418293149636434, "grad_norm": 0.5545353293418884, "learning_rate": 7.648616349442189e-06, "loss": 0.6036, "step": 7687 }, { "epoch": 2.942212016838883, "grad_norm": 0.5810739994049072, "learning_rate": 7.646206961929652e-06, "loss": 0.6178, "step": 7688 }, { "epoch": 2.9425947187141217, "grad_norm": 0.5372157096862793, "learning_rate": 7.643797719064026e-06, "loss": 0.5967, "step": 7689 }, { "epoch": 2.9429774205893606, "grad_norm": 0.5349552035331726, "learning_rate": 7.641388620993354e-06, "loss": 0.6116, "step": 7690 }, { "epoch": 2.9433601224646, "grad_norm": 0.5303838849067688, "learning_rate": 7.638979667865689e-06, "loss": 0.6266, "step": 7691 }, { "epoch": 2.9437428243398394, "grad_norm": 0.522386908531189, "learning_rate": 7.63657085982907e-06, "loss": 0.6554, "step": 7692 }, { "epoch": 2.9441255262150783, "grad_norm": 0.6326938271522522, "learning_rate": 7.634162197031515e-06, "loss": 0.6835, "step": 7693 }, { "epoch": 2.9445082280903176, "grad_norm": 0.5512003302574158, "learning_rate": 7.631753679621053e-06, "loss": 0.7319, "step": 7694 }, { "epoch": 2.944890929965557, "grad_norm": 0.5229089856147766, "learning_rate": 7.629345307745687e-06, "loss": 0.6232, "step": 7695 }, { "epoch": 2.945273631840796, "grad_norm": 0.5954679846763611, "learning_rate": 7.6269370815534225e-06, "loss": 0.6267, "step": 7696 }, { "epoch": 2.9456563337160353, "grad_norm": 0.6043679714202881, "learning_rate": 7.6245290011922495e-06, "loss": 0.6634, "step": 7697 }, { "epoch": 2.9460390355912747, "grad_norm": 0.5071602463722229, "learning_rate": 7.622121066810148e-06, "loss": 0.6691, "step": 7698 }, { "epoch": 2.9464217374665136, "grad_norm": 0.5337955355644226, "learning_rate": 7.619713278555102e-06, "loss": 0.6172, "step": 7699 }, { "epoch": 2.9468044393417525, "grad_norm": 0.540457010269165, "learning_rate": 7.6173056365750655e-06, "loss": 0.613, "step": 7700 }, { "epoch": 2.947187141216992, "grad_norm": 0.5336350202560425, "learning_rate": 7.6148981410179966e-06, "loss": 0.6656, "step": 7701 }, { "epoch": 2.9475698430922312, "grad_norm": 0.5608001947402954, "learning_rate": 7.6124907920318485e-06, "loss": 0.6548, "step": 7702 }, { "epoch": 2.94795254496747, "grad_norm": 0.5675505995750427, "learning_rate": 7.610083589764552e-06, "loss": 0.6541, "step": 7703 }, { "epoch": 2.9483352468427095, "grad_norm": 0.6524989008903503, "learning_rate": 7.60767653436404e-06, "loss": 0.5597, "step": 7704 }, { "epoch": 2.948717948717949, "grad_norm": 0.49196115136146545, "learning_rate": 7.605269625978237e-06, "loss": 0.5383, "step": 7705 }, { "epoch": 2.949100650593188, "grad_norm": 0.5266632437705994, "learning_rate": 7.6028628647550406e-06, "loss": 0.635, "step": 7706 }, { "epoch": 2.949483352468427, "grad_norm": 0.5800240635871887, "learning_rate": 7.600456250842364e-06, "loss": 0.61, "step": 7707 }, { "epoch": 2.9498660543436666, "grad_norm": 0.5422122478485107, "learning_rate": 7.598049784388097e-06, "loss": 0.678, "step": 7708 }, { "epoch": 2.9502487562189055, "grad_norm": 0.5679247379302979, "learning_rate": 7.595643465540128e-06, "loss": 0.6317, "step": 7709 }, { "epoch": 2.9506314580941444, "grad_norm": 0.5198173522949219, "learning_rate": 7.593237294446324e-06, "loss": 0.6265, "step": 7710 }, { "epoch": 2.9510141599693838, "grad_norm": 0.5545954704284668, "learning_rate": 7.590831271254553e-06, "loss": 0.582, "step": 7711 }, { "epoch": 2.951396861844623, "grad_norm": 0.55815589427948, "learning_rate": 7.588425396112675e-06, "loss": 0.6801, "step": 7712 }, { "epoch": 2.951779563719862, "grad_norm": 0.5237001776695251, "learning_rate": 7.5860196691685316e-06, "loss": 0.6138, "step": 7713 }, { "epoch": 2.9521622655951014, "grad_norm": 0.5138325095176697, "learning_rate": 7.5836140905699665e-06, "loss": 0.6417, "step": 7714 }, { "epoch": 2.952544967470341, "grad_norm": 0.6342098116874695, "learning_rate": 7.58120866046481e-06, "loss": 0.6762, "step": 7715 }, { "epoch": 2.9529276693455797, "grad_norm": 0.525961697101593, "learning_rate": 7.578803379000874e-06, "loss": 0.6507, "step": 7716 }, { "epoch": 2.953310371220819, "grad_norm": 0.5528934597969055, "learning_rate": 7.576398246325978e-06, "loss": 0.5976, "step": 7717 }, { "epoch": 2.9536930730960584, "grad_norm": 0.4863659143447876, "learning_rate": 7.573993262587923e-06, "loss": 0.5597, "step": 7718 }, { "epoch": 2.9540757749712974, "grad_norm": 0.5620710849761963, "learning_rate": 7.571588427934495e-06, "loss": 0.6243, "step": 7719 }, { "epoch": 2.9544584768465363, "grad_norm": 0.5221589803695679, "learning_rate": 7.569183742513484e-06, "loss": 0.6615, "step": 7720 }, { "epoch": 2.9548411787217757, "grad_norm": 0.5423166155815125, "learning_rate": 7.566779206472661e-06, "loss": 0.6068, "step": 7721 }, { "epoch": 2.955223880597015, "grad_norm": 0.5547163486480713, "learning_rate": 7.564374819959795e-06, "loss": 0.6329, "step": 7722 }, { "epoch": 2.955606582472254, "grad_norm": 0.4954940974712372, "learning_rate": 7.561970583122637e-06, "loss": 0.6225, "step": 7723 }, { "epoch": 2.9559892843474933, "grad_norm": 0.5043706893920898, "learning_rate": 7.559566496108938e-06, "loss": 0.6055, "step": 7724 }, { "epoch": 2.9563719862227327, "grad_norm": 0.5526015758514404, "learning_rate": 7.557162559066437e-06, "loss": 0.6311, "step": 7725 }, { "epoch": 2.9567546880979716, "grad_norm": 0.5508992671966553, "learning_rate": 7.554758772142855e-06, "loss": 0.6087, "step": 7726 }, { "epoch": 2.957137389973211, "grad_norm": 0.48761385679244995, "learning_rate": 7.5523551354859135e-06, "loss": 0.6168, "step": 7727 }, { "epoch": 2.9575200918484503, "grad_norm": 0.5304040908813477, "learning_rate": 7.549951649243332e-06, "loss": 0.6095, "step": 7728 }, { "epoch": 2.9579027937236892, "grad_norm": 0.5170394778251648, "learning_rate": 7.547548313562798e-06, "loss": 0.5467, "step": 7729 }, { "epoch": 2.958285495598928, "grad_norm": 0.537041425704956, "learning_rate": 7.545145128592009e-06, "loss": 0.6259, "step": 7730 }, { "epoch": 2.9586681974741675, "grad_norm": 0.5366817116737366, "learning_rate": 7.54274209447865e-06, "loss": 0.6553, "step": 7731 }, { "epoch": 2.959050899349407, "grad_norm": 0.5087372660636902, "learning_rate": 7.540339211370387e-06, "loss": 0.654, "step": 7732 }, { "epoch": 2.959433601224646, "grad_norm": 0.5529883503913879, "learning_rate": 7.537936479414888e-06, "loss": 0.6659, "step": 7733 }, { "epoch": 2.959816303099885, "grad_norm": 0.5552929043769836, "learning_rate": 7.535533898759807e-06, "loss": 0.6602, "step": 7734 }, { "epoch": 2.9601990049751246, "grad_norm": 0.5964593887329102, "learning_rate": 7.533131469552793e-06, "loss": 0.6775, "step": 7735 }, { "epoch": 2.9605817068503635, "grad_norm": 0.5213457942008972, "learning_rate": 7.530729191941472e-06, "loss": 0.7457, "step": 7736 }, { "epoch": 2.960964408725603, "grad_norm": 0.5597192049026489, "learning_rate": 7.528327066073478e-06, "loss": 0.6117, "step": 7737 }, { "epoch": 2.961347110600842, "grad_norm": 0.4918772578239441, "learning_rate": 7.525925092096426e-06, "loss": 0.5334, "step": 7738 }, { "epoch": 2.961729812476081, "grad_norm": 0.5410773158073425, "learning_rate": 7.523523270157922e-06, "loss": 0.6332, "step": 7739 }, { "epoch": 2.96211251435132, "grad_norm": 0.5375485420227051, "learning_rate": 7.521121600405566e-06, "loss": 0.6435, "step": 7740 }, { "epoch": 2.9624952162265594, "grad_norm": 0.5755367279052734, "learning_rate": 7.51872008298695e-06, "loss": 0.6991, "step": 7741 }, { "epoch": 2.962877918101799, "grad_norm": 0.5639457106590271, "learning_rate": 7.516318718049648e-06, "loss": 0.654, "step": 7742 }, { "epoch": 2.9632606199770377, "grad_norm": 0.8127319812774658, "learning_rate": 7.513917505741233e-06, "loss": 0.6769, "step": 7743 }, { "epoch": 2.963643321852277, "grad_norm": 0.5078776478767395, "learning_rate": 7.51151644620927e-06, "loss": 0.6193, "step": 7744 }, { "epoch": 2.9640260237275164, "grad_norm": 0.5462298393249512, "learning_rate": 7.509115539601304e-06, "loss": 0.6083, "step": 7745 }, { "epoch": 2.9644087256027554, "grad_norm": 0.5199597477912903, "learning_rate": 7.506714786064879e-06, "loss": 0.7341, "step": 7746 }, { "epoch": 2.9647914274779947, "grad_norm": 0.5372049808502197, "learning_rate": 7.504314185747527e-06, "loss": 0.6144, "step": 7747 }, { "epoch": 2.965174129353234, "grad_norm": 0.7941120266914368, "learning_rate": 7.501913738796777e-06, "loss": 0.6101, "step": 7748 }, { "epoch": 2.965556831228473, "grad_norm": 0.5357195734977722, "learning_rate": 7.499513445360137e-06, "loss": 0.6637, "step": 7749 }, { "epoch": 2.965939533103712, "grad_norm": 0.5358832478523254, "learning_rate": 7.4971133055851135e-06, "loss": 0.6299, "step": 7750 }, { "epoch": 2.9663222349789513, "grad_norm": 0.5628033876419067, "learning_rate": 7.494713319619202e-06, "loss": 0.6118, "step": 7751 }, { "epoch": 2.9667049368541907, "grad_norm": 0.4980238974094391, "learning_rate": 7.4923134876098855e-06, "loss": 0.6254, "step": 7752 }, { "epoch": 2.9670876387294296, "grad_norm": 0.5517699122428894, "learning_rate": 7.489913809704643e-06, "loss": 0.587, "step": 7753 }, { "epoch": 2.967470340604669, "grad_norm": 0.5144104957580566, "learning_rate": 7.487514286050943e-06, "loss": 0.6473, "step": 7754 }, { "epoch": 2.9678530424799083, "grad_norm": 0.5220093727111816, "learning_rate": 7.485114916796236e-06, "loss": 0.6329, "step": 7755 }, { "epoch": 2.9682357443551473, "grad_norm": 0.5477554202079773, "learning_rate": 7.4827157020879745e-06, "loss": 0.6497, "step": 7756 }, { "epoch": 2.9686184462303866, "grad_norm": 0.54606693983078, "learning_rate": 7.480316642073598e-06, "loss": 0.615, "step": 7757 }, { "epoch": 2.969001148105626, "grad_norm": 0.48373690247535706, "learning_rate": 7.477917736900531e-06, "loss": 0.5422, "step": 7758 }, { "epoch": 2.969383849980865, "grad_norm": 0.5995122194290161, "learning_rate": 7.475518986716193e-06, "loss": 0.5976, "step": 7759 }, { "epoch": 2.969766551856104, "grad_norm": 0.5855494141578674, "learning_rate": 7.473120391667996e-06, "loss": 0.6231, "step": 7760 }, { "epoch": 2.970149253731343, "grad_norm": 0.6040787100791931, "learning_rate": 7.470721951903343e-06, "loss": 0.5963, "step": 7761 }, { "epoch": 2.9705319556065826, "grad_norm": 0.5021753907203674, "learning_rate": 7.468323667569617e-06, "loss": 0.5543, "step": 7762 }, { "epoch": 2.9709146574818215, "grad_norm": 0.49992942810058594, "learning_rate": 7.465925538814204e-06, "loss": 0.5517, "step": 7763 }, { "epoch": 2.971297359357061, "grad_norm": 0.5031715631484985, "learning_rate": 7.4635275657844784e-06, "loss": 0.6169, "step": 7764 }, { "epoch": 2.9716800612323, "grad_norm": 0.5175184607505798, "learning_rate": 7.461129748627793e-06, "loss": 0.5928, "step": 7765 }, { "epoch": 2.972062763107539, "grad_norm": 0.5521246790885925, "learning_rate": 7.4587320874915056e-06, "loss": 0.717, "step": 7766 }, { "epoch": 2.9724454649827785, "grad_norm": 0.47634291648864746, "learning_rate": 7.456334582522962e-06, "loss": 0.6808, "step": 7767 }, { "epoch": 2.972828166858018, "grad_norm": 0.6123420596122742, "learning_rate": 7.453937233869488e-06, "loss": 0.5463, "step": 7768 }, { "epoch": 2.973210868733257, "grad_norm": 0.5353777408599854, "learning_rate": 7.451540041678411e-06, "loss": 0.598, "step": 7769 }, { "epoch": 2.9735935706084957, "grad_norm": 0.5183120965957642, "learning_rate": 7.4491430060970496e-06, "loss": 0.5835, "step": 7770 }, { "epoch": 2.973976272483735, "grad_norm": 0.5220374464988708, "learning_rate": 7.446746127272699e-06, "loss": 0.5719, "step": 7771 }, { "epoch": 2.9743589743589745, "grad_norm": 0.5457521677017212, "learning_rate": 7.444349405352656e-06, "loss": 0.5969, "step": 7772 }, { "epoch": 2.9747416762342134, "grad_norm": 0.540340781211853, "learning_rate": 7.44195284048421e-06, "loss": 0.596, "step": 7773 }, { "epoch": 2.9751243781094527, "grad_norm": 0.6129778027534485, "learning_rate": 7.439556432814638e-06, "loss": 0.6406, "step": 7774 }, { "epoch": 2.975507079984692, "grad_norm": 0.5507714748382568, "learning_rate": 7.437160182491199e-06, "loss": 0.6385, "step": 7775 }, { "epoch": 2.975889781859931, "grad_norm": 0.4889959394931793, "learning_rate": 7.434764089661151e-06, "loss": 0.6441, "step": 7776 }, { "epoch": 2.9762724837351704, "grad_norm": 0.5372911095619202, "learning_rate": 7.432368154471742e-06, "loss": 0.6172, "step": 7777 }, { "epoch": 2.9766551856104098, "grad_norm": 0.5626768469810486, "learning_rate": 7.429972377070207e-06, "loss": 0.6234, "step": 7778 }, { "epoch": 2.9770378874856487, "grad_norm": 0.5139140486717224, "learning_rate": 7.427576757603774e-06, "loss": 0.5899, "step": 7779 }, { "epoch": 2.9774205893608876, "grad_norm": 0.5487284064292908, "learning_rate": 7.425181296219663e-06, "loss": 0.6381, "step": 7780 }, { "epoch": 2.977803291236127, "grad_norm": 0.5589108467102051, "learning_rate": 7.422785993065076e-06, "loss": 0.6116, "step": 7781 }, { "epoch": 2.9781859931113663, "grad_norm": 0.5007521510124207, "learning_rate": 7.4203908482872085e-06, "loss": 0.5632, "step": 7782 }, { "epoch": 2.9785686949866053, "grad_norm": 0.5234444737434387, "learning_rate": 7.4179958620332625e-06, "loss": 0.5645, "step": 7783 }, { "epoch": 2.9789513968618446, "grad_norm": 0.5368333458900452, "learning_rate": 7.415601034450403e-06, "loss": 0.6385, "step": 7784 }, { "epoch": 2.979334098737084, "grad_norm": 0.49119409918785095, "learning_rate": 7.413206365685801e-06, "loss": 0.5964, "step": 7785 }, { "epoch": 2.979716800612323, "grad_norm": 0.5292212963104248, "learning_rate": 7.41081185588662e-06, "loss": 0.6541, "step": 7786 }, { "epoch": 2.9800995024875623, "grad_norm": 0.5259398818016052, "learning_rate": 7.408417505200006e-06, "loss": 0.6863, "step": 7787 }, { "epoch": 2.9804822043628016, "grad_norm": 0.5404539704322815, "learning_rate": 7.406023313773097e-06, "loss": 0.6678, "step": 7788 }, { "epoch": 2.9808649062380406, "grad_norm": 0.5409314036369324, "learning_rate": 7.4036292817530245e-06, "loss": 0.6897, "step": 7789 }, { "epoch": 2.9812476081132795, "grad_norm": 0.742938220500946, "learning_rate": 7.4012354092869125e-06, "loss": 0.6303, "step": 7790 }, { "epoch": 2.981630309988519, "grad_norm": 0.528713047504425, "learning_rate": 7.398841696521861e-06, "loss": 0.5956, "step": 7791 }, { "epoch": 2.9820130118637582, "grad_norm": 0.5271000862121582, "learning_rate": 7.396448143604975e-06, "loss": 0.6407, "step": 7792 }, { "epoch": 2.982395713738997, "grad_norm": 0.5413183569908142, "learning_rate": 7.3940547506833485e-06, "loss": 0.6265, "step": 7793 }, { "epoch": 2.9827784156142365, "grad_norm": 0.5528640747070312, "learning_rate": 7.391661517904054e-06, "loss": 0.5819, "step": 7794 }, { "epoch": 2.983161117489476, "grad_norm": 0.5762086510658264, "learning_rate": 7.389268445414168e-06, "loss": 0.6163, "step": 7795 }, { "epoch": 2.983543819364715, "grad_norm": 0.5182535648345947, "learning_rate": 7.386875533360753e-06, "loss": 0.6361, "step": 7796 }, { "epoch": 2.983926521239954, "grad_norm": 0.5279862284660339, "learning_rate": 7.384482781890852e-06, "loss": 0.6399, "step": 7797 }, { "epoch": 2.9843092231151935, "grad_norm": 0.5267943739891052, "learning_rate": 7.382090191151512e-06, "loss": 0.5864, "step": 7798 }, { "epoch": 2.9846919249904325, "grad_norm": 0.5383817553520203, "learning_rate": 7.379697761289763e-06, "loss": 0.6492, "step": 7799 }, { "epoch": 2.9850746268656714, "grad_norm": 0.5685662627220154, "learning_rate": 7.377305492452629e-06, "loss": 0.6589, "step": 7800 }, { "epoch": 2.9854573287409107, "grad_norm": 0.5225892066955566, "learning_rate": 7.3749133847871144e-06, "loss": 0.6017, "step": 7801 }, { "epoch": 2.98584003061615, "grad_norm": 0.5411970615386963, "learning_rate": 7.372521438440225e-06, "loss": 0.6925, "step": 7802 }, { "epoch": 2.986222732491389, "grad_norm": 0.516453742980957, "learning_rate": 7.370129653558955e-06, "loss": 0.5586, "step": 7803 }, { "epoch": 2.9866054343666284, "grad_norm": 0.5313865542411804, "learning_rate": 7.36773803029028e-06, "loss": 0.6427, "step": 7804 }, { "epoch": 2.9869881362418678, "grad_norm": 0.5565495491027832, "learning_rate": 7.365346568781174e-06, "loss": 0.6958, "step": 7805 }, { "epoch": 2.9873708381171067, "grad_norm": 0.5656079053878784, "learning_rate": 7.362955269178602e-06, "loss": 0.553, "step": 7806 }, { "epoch": 2.987753539992346, "grad_norm": 0.616614818572998, "learning_rate": 7.3605641316295105e-06, "loss": 0.725, "step": 7807 }, { "epoch": 2.9881362418675854, "grad_norm": 0.5870094299316406, "learning_rate": 7.358173156280845e-06, "loss": 0.6477, "step": 7808 }, { "epoch": 2.9885189437428243, "grad_norm": 0.5481748580932617, "learning_rate": 7.35578234327954e-06, "loss": 0.6355, "step": 7809 }, { "epoch": 2.9889016456180633, "grad_norm": 0.533163845539093, "learning_rate": 7.353391692772509e-06, "loss": 0.6298, "step": 7810 }, { "epoch": 2.9892843474933026, "grad_norm": 0.5529732704162598, "learning_rate": 7.351001204906669e-06, "loss": 0.6052, "step": 7811 }, { "epoch": 2.989667049368542, "grad_norm": 0.6018039584159851, "learning_rate": 7.348610879828921e-06, "loss": 0.6516, "step": 7812 }, { "epoch": 2.990049751243781, "grad_norm": 0.576461672782898, "learning_rate": 7.34622071768616e-06, "loss": 0.579, "step": 7813 }, { "epoch": 2.9904324531190203, "grad_norm": 0.5429918766021729, "learning_rate": 7.3438307186252625e-06, "loss": 0.7114, "step": 7814 }, { "epoch": 2.9908151549942597, "grad_norm": 0.5904475450515747, "learning_rate": 7.341440882793104e-06, "loss": 0.6396, "step": 7815 }, { "epoch": 2.9911978568694986, "grad_norm": 0.5590817332267761, "learning_rate": 7.339051210336548e-06, "loss": 0.6631, "step": 7816 }, { "epoch": 2.991580558744738, "grad_norm": 0.5849786996841431, "learning_rate": 7.336661701402439e-06, "loss": 0.6899, "step": 7817 }, { "epoch": 2.9919632606199773, "grad_norm": 0.5845791101455688, "learning_rate": 7.334272356137626e-06, "loss": 0.6792, "step": 7818 }, { "epoch": 2.9923459624952162, "grad_norm": 0.5149524211883545, "learning_rate": 7.33188317468894e-06, "loss": 0.7142, "step": 7819 }, { "epoch": 2.992728664370455, "grad_norm": 0.5036463737487793, "learning_rate": 7.329494157203199e-06, "loss": 0.5678, "step": 7820 }, { "epoch": 2.9931113662456945, "grad_norm": 0.5636181831359863, "learning_rate": 7.327105303827216e-06, "loss": 0.638, "step": 7821 }, { "epoch": 2.993494068120934, "grad_norm": 0.5220511555671692, "learning_rate": 7.324716614707794e-06, "loss": 0.6073, "step": 7822 }, { "epoch": 2.993876769996173, "grad_norm": 0.4996689260005951, "learning_rate": 7.322328089991721e-06, "loss": 0.6499, "step": 7823 }, { "epoch": 2.994259471871412, "grad_norm": 0.4972243010997772, "learning_rate": 7.319939729825781e-06, "loss": 0.575, "step": 7824 }, { "epoch": 2.9946421737466515, "grad_norm": 0.550682783126831, "learning_rate": 7.317551534356744e-06, "loss": 0.5533, "step": 7825 }, { "epoch": 2.9950248756218905, "grad_norm": 0.5162297487258911, "learning_rate": 7.3151635037313775e-06, "loss": 0.755, "step": 7826 }, { "epoch": 2.99540757749713, "grad_norm": 0.5283812880516052, "learning_rate": 7.312775638096419e-06, "loss": 0.6681, "step": 7827 }, { "epoch": 2.995790279372369, "grad_norm": 0.5664196014404297, "learning_rate": 7.310387937598618e-06, "loss": 0.6528, "step": 7828 }, { "epoch": 2.996172981247608, "grad_norm": 0.5507184863090515, "learning_rate": 7.3080004023847085e-06, "loss": 0.6775, "step": 7829 }, { "epoch": 2.996555683122847, "grad_norm": 0.5247641205787659, "learning_rate": 7.305613032601402e-06, "loss": 0.6732, "step": 7830 }, { "epoch": 2.9969383849980864, "grad_norm": 0.5379205346107483, "learning_rate": 7.3032258283954135e-06, "loss": 0.6486, "step": 7831 }, { "epoch": 2.9973210868733258, "grad_norm": 0.5474895238876343, "learning_rate": 7.300838789913445e-06, "loss": 0.6112, "step": 7832 }, { "epoch": 2.9977037887485647, "grad_norm": 0.5551906824111938, "learning_rate": 7.298451917302182e-06, "loss": 0.5667, "step": 7833 }, { "epoch": 2.998086490623804, "grad_norm": 0.5727241635322571, "learning_rate": 7.296065210708305e-06, "loss": 0.6654, "step": 7834 }, { "epoch": 2.9984691924990434, "grad_norm": 0.5356207489967346, "learning_rate": 7.29367867027849e-06, "loss": 0.6411, "step": 7835 }, { "epoch": 2.9988518943742823, "grad_norm": 0.5603981018066406, "learning_rate": 7.291292296159388e-06, "loss": 0.6268, "step": 7836 }, { "epoch": 2.9992345962495217, "grad_norm": 0.543118417263031, "learning_rate": 7.28890608849765e-06, "loss": 0.6514, "step": 7837 }, { "epoch": 2.999617298124761, "grad_norm": 0.515076756477356, "learning_rate": 7.286520047439916e-06, "loss": 0.7084, "step": 7838 }, { "epoch": 3.0, "grad_norm": 0.569162130355835, "learning_rate": 7.284134173132819e-06, "loss": 0.6672, "step": 7839 }, { "epoch": 3.0003827018752394, "grad_norm": 0.5567039251327515, "learning_rate": 7.28174846572297e-06, "loss": 0.6279, "step": 7840 }, { "epoch": 3.0007654037504783, "grad_norm": 0.5425089001655579, "learning_rate": 7.27936292535698e-06, "loss": 0.6654, "step": 7841 }, { "epoch": 3.0011481056257177, "grad_norm": 0.5108500123023987, "learning_rate": 7.276977552181449e-06, "loss": 0.699, "step": 7842 }, { "epoch": 3.0015308075009566, "grad_norm": 0.5818223357200623, "learning_rate": 7.274592346342962e-06, "loss": 0.6311, "step": 7843 }, { "epoch": 3.001913509376196, "grad_norm": 0.5319927930831909, "learning_rate": 7.272207307988095e-06, "loss": 0.6141, "step": 7844 }, { "epoch": 3.0022962112514353, "grad_norm": 0.4985840320587158, "learning_rate": 7.269822437263423e-06, "loss": 0.639, "step": 7845 }, { "epoch": 3.0026789131266742, "grad_norm": 0.5217550992965698, "learning_rate": 7.267437734315493e-06, "loss": 0.6351, "step": 7846 }, { "epoch": 3.0030616150019136, "grad_norm": 0.49167874455451965, "learning_rate": 7.265053199290853e-06, "loss": 0.5599, "step": 7847 }, { "epoch": 3.0034443168771525, "grad_norm": 0.5448040962219238, "learning_rate": 7.262668832336045e-06, "loss": 0.61, "step": 7848 }, { "epoch": 3.003827018752392, "grad_norm": 0.4542577266693115, "learning_rate": 7.260284633597589e-06, "loss": 0.6471, "step": 7849 }, { "epoch": 3.0042097206276313, "grad_norm": 0.5047512650489807, "learning_rate": 7.257900603222002e-06, "loss": 0.6426, "step": 7850 }, { "epoch": 3.00459242250287, "grad_norm": 0.5071938633918762, "learning_rate": 7.255516741355789e-06, "loss": 0.6018, "step": 7851 }, { "epoch": 3.0049751243781095, "grad_norm": 0.500082790851593, "learning_rate": 7.253133048145449e-06, "loss": 0.6242, "step": 7852 }, { "epoch": 3.0053578262533485, "grad_norm": 0.49956145882606506, "learning_rate": 7.250749523737459e-06, "loss": 0.5847, "step": 7853 }, { "epoch": 3.005740528128588, "grad_norm": 0.5196601152420044, "learning_rate": 7.248366168278298e-06, "loss": 0.6186, "step": 7854 }, { "epoch": 3.006123230003827, "grad_norm": 0.5507213473320007, "learning_rate": 7.245982981914431e-06, "loss": 0.645, "step": 7855 }, { "epoch": 3.006505931879066, "grad_norm": 0.5301402807235718, "learning_rate": 7.243599964792305e-06, "loss": 0.664, "step": 7856 }, { "epoch": 3.0068886337543055, "grad_norm": 0.5542935132980347, "learning_rate": 7.241217117058365e-06, "loss": 0.6956, "step": 7857 }, { "epoch": 3.0072713356295444, "grad_norm": 0.5270954370498657, "learning_rate": 7.238834438859048e-06, "loss": 0.5994, "step": 7858 }, { "epoch": 3.0076540375047838, "grad_norm": 0.5521852970123291, "learning_rate": 7.236451930340771e-06, "loss": 0.5975, "step": 7859 }, { "epoch": 3.008036739380023, "grad_norm": 0.5331471562385559, "learning_rate": 7.2340695916499456e-06, "loss": 0.6011, "step": 7860 }, { "epoch": 3.008419441255262, "grad_norm": 0.5355010628700256, "learning_rate": 7.231687422932978e-06, "loss": 0.5846, "step": 7861 }, { "epoch": 3.0088021431305014, "grad_norm": 0.5202947854995728, "learning_rate": 7.229305424336252e-06, "loss": 0.5935, "step": 7862 }, { "epoch": 3.0091848450057403, "grad_norm": 0.503462553024292, "learning_rate": 7.226923596006153e-06, "loss": 0.6844, "step": 7863 }, { "epoch": 3.0095675468809797, "grad_norm": 0.5150624513626099, "learning_rate": 7.2245419380890455e-06, "loss": 0.593, "step": 7864 }, { "epoch": 3.009950248756219, "grad_norm": 0.5560495257377625, "learning_rate": 7.222160450731299e-06, "loss": 0.6501, "step": 7865 }, { "epoch": 3.010332950631458, "grad_norm": 0.5388282537460327, "learning_rate": 7.21977913407925e-06, "loss": 0.5902, "step": 7866 }, { "epoch": 3.0107156525066974, "grad_norm": 0.5082768201828003, "learning_rate": 7.2173979882792435e-06, "loss": 0.5248, "step": 7867 }, { "epoch": 3.0110983543819363, "grad_norm": 0.5983188152313232, "learning_rate": 7.215017013477607e-06, "loss": 0.6613, "step": 7868 }, { "epoch": 3.0114810562571757, "grad_norm": 0.5264302492141724, "learning_rate": 7.212636209820656e-06, "loss": 0.5892, "step": 7869 }, { "epoch": 3.011863758132415, "grad_norm": 0.5318003296852112, "learning_rate": 7.210255577454697e-06, "loss": 0.6209, "step": 7870 }, { "epoch": 3.012246460007654, "grad_norm": 0.532744824886322, "learning_rate": 7.207875116526031e-06, "loss": 0.6078, "step": 7871 }, { "epoch": 3.0126291618828933, "grad_norm": 0.5265753269195557, "learning_rate": 7.2054948271809346e-06, "loss": 0.7027, "step": 7872 }, { "epoch": 3.0130118637581322, "grad_norm": 0.546670138835907, "learning_rate": 7.2031147095656905e-06, "loss": 0.6659, "step": 7873 }, { "epoch": 3.0133945656333716, "grad_norm": 0.5672950148582458, "learning_rate": 7.2007347638265665e-06, "loss": 0.5787, "step": 7874 }, { "epoch": 3.013777267508611, "grad_norm": 0.5033921599388123, "learning_rate": 7.198354990109806e-06, "loss": 0.6451, "step": 7875 }, { "epoch": 3.01415996938385, "grad_norm": 0.5587972402572632, "learning_rate": 7.195975388561658e-06, "loss": 0.6109, "step": 7876 }, { "epoch": 3.0145426712590893, "grad_norm": 0.550089418888092, "learning_rate": 7.193595959328357e-06, "loss": 0.6321, "step": 7877 }, { "epoch": 3.014925373134328, "grad_norm": 0.5023585557937622, "learning_rate": 7.191216702556124e-06, "loss": 0.6031, "step": 7878 }, { "epoch": 3.0153080750095675, "grad_norm": 0.5101593732833862, "learning_rate": 7.188837618391169e-06, "loss": 0.7066, "step": 7879 }, { "epoch": 3.015690776884807, "grad_norm": 0.5184446573257446, "learning_rate": 7.186458706979694e-06, "loss": 0.6609, "step": 7880 }, { "epoch": 3.016073478760046, "grad_norm": 0.5036608576774597, "learning_rate": 7.184079968467894e-06, "loss": 0.6739, "step": 7881 }, { "epoch": 3.016456180635285, "grad_norm": 0.4989452064037323, "learning_rate": 7.181701403001939e-06, "loss": 0.5462, "step": 7882 }, { "epoch": 3.016838882510524, "grad_norm": 0.5678659081459045, "learning_rate": 7.179323010728007e-06, "loss": 0.6507, "step": 7883 }, { "epoch": 3.0172215843857635, "grad_norm": 0.5449352264404297, "learning_rate": 7.176944791792257e-06, "loss": 0.6034, "step": 7884 }, { "epoch": 3.017604286261003, "grad_norm": 0.5329513549804688, "learning_rate": 7.17456674634083e-06, "loss": 0.7142, "step": 7885 }, { "epoch": 3.0179869881362418, "grad_norm": 0.6326792240142822, "learning_rate": 7.172188874519866e-06, "loss": 0.6695, "step": 7886 }, { "epoch": 3.018369690011481, "grad_norm": 0.5194280743598938, "learning_rate": 7.169811176475497e-06, "loss": 0.6606, "step": 7887 }, { "epoch": 3.01875239188672, "grad_norm": 0.5534012913703918, "learning_rate": 7.16743365235383e-06, "loss": 0.6567, "step": 7888 }, { "epoch": 3.0191350937619594, "grad_norm": 0.5435893535614014, "learning_rate": 7.165056302300975e-06, "loss": 0.6121, "step": 7889 }, { "epoch": 3.019517795637199, "grad_norm": 0.5465348362922668, "learning_rate": 7.162679126463025e-06, "loss": 0.524, "step": 7890 }, { "epoch": 3.0199004975124377, "grad_norm": 0.50992751121521, "learning_rate": 7.160302124986073e-06, "loss": 0.5654, "step": 7891 }, { "epoch": 3.020283199387677, "grad_norm": 0.539661705493927, "learning_rate": 7.157925298016177e-06, "loss": 0.6775, "step": 7892 }, { "epoch": 3.020665901262916, "grad_norm": 0.5626011490821838, "learning_rate": 7.155548645699406e-06, "loss": 0.5843, "step": 7893 }, { "epoch": 3.0210486031381554, "grad_norm": 0.5163406729698181, "learning_rate": 7.1531721681818175e-06, "loss": 0.6479, "step": 7894 }, { "epoch": 3.0214313050133947, "grad_norm": 0.5336521863937378, "learning_rate": 7.150795865609444e-06, "loss": 0.6623, "step": 7895 }, { "epoch": 3.0218140068886337, "grad_norm": 0.519618570804596, "learning_rate": 7.148419738128318e-06, "loss": 0.6287, "step": 7896 }, { "epoch": 3.022196708763873, "grad_norm": 0.5488234758377075, "learning_rate": 7.146043785884463e-06, "loss": 0.6049, "step": 7897 }, { "epoch": 3.022579410639112, "grad_norm": 0.5269317030906677, "learning_rate": 7.143668009023883e-06, "loss": 0.6158, "step": 7898 }, { "epoch": 3.0229621125143513, "grad_norm": 0.5801209807395935, "learning_rate": 7.141292407692575e-06, "loss": 0.6629, "step": 7899 }, { "epoch": 3.0233448143895907, "grad_norm": 0.5503528118133545, "learning_rate": 7.138916982036535e-06, "loss": 0.6032, "step": 7900 }, { "epoch": 3.0237275162648296, "grad_norm": 0.5179775953292847, "learning_rate": 7.136541732201727e-06, "loss": 0.6333, "step": 7901 }, { "epoch": 3.024110218140069, "grad_norm": 0.5585522651672363, "learning_rate": 7.1341666583341225e-06, "loss": 0.5625, "step": 7902 }, { "epoch": 3.024492920015308, "grad_norm": 0.6402282118797302, "learning_rate": 7.131791760579679e-06, "loss": 0.5729, "step": 7903 }, { "epoch": 3.0248756218905473, "grad_norm": 0.47354236245155334, "learning_rate": 7.1294170390843335e-06, "loss": 0.5123, "step": 7904 }, { "epoch": 3.0252583237657866, "grad_norm": 0.559725284576416, "learning_rate": 7.127042493994023e-06, "loss": 0.5473, "step": 7905 }, { "epoch": 3.0256410256410255, "grad_norm": 0.6142787933349609, "learning_rate": 7.12466812545467e-06, "loss": 0.4985, "step": 7906 }, { "epoch": 3.026023727516265, "grad_norm": 0.6210229396820068, "learning_rate": 7.1222939336121874e-06, "loss": 0.6546, "step": 7907 }, { "epoch": 3.026406429391504, "grad_norm": 0.5313820242881775, "learning_rate": 7.119919918612472e-06, "loss": 0.6294, "step": 7908 }, { "epoch": 3.026789131266743, "grad_norm": 0.5091274976730347, "learning_rate": 7.117546080601414e-06, "loss": 0.6642, "step": 7909 }, { "epoch": 3.0271718331419826, "grad_norm": 0.5074041485786438, "learning_rate": 7.115172419724896e-06, "loss": 0.5976, "step": 7910 }, { "epoch": 3.0275545350172215, "grad_norm": 0.5098227262496948, "learning_rate": 7.112798936128781e-06, "loss": 0.645, "step": 7911 }, { "epoch": 3.027937236892461, "grad_norm": 0.5049480199813843, "learning_rate": 7.110425629958925e-06, "loss": 0.6438, "step": 7912 }, { "epoch": 3.0283199387677, "grad_norm": 0.502105712890625, "learning_rate": 7.108052501361183e-06, "loss": 0.5966, "step": 7913 }, { "epoch": 3.028702640642939, "grad_norm": 0.5815326571464539, "learning_rate": 7.105679550481379e-06, "loss": 0.6444, "step": 7914 }, { "epoch": 3.0290853425181785, "grad_norm": 0.5368668437004089, "learning_rate": 7.103306777465343e-06, "loss": 0.5485, "step": 7915 }, { "epoch": 3.0294680443934174, "grad_norm": 0.5296221375465393, "learning_rate": 7.100934182458889e-06, "loss": 0.6526, "step": 7916 }, { "epoch": 3.029850746268657, "grad_norm": 0.5591075420379639, "learning_rate": 7.098561765607817e-06, "loss": 0.5705, "step": 7917 }, { "epoch": 3.0302334481438957, "grad_norm": 0.5059341192245483, "learning_rate": 7.096189527057917e-06, "loss": 0.6023, "step": 7918 }, { "epoch": 3.030616150019135, "grad_norm": 0.49456357955932617, "learning_rate": 7.093817466954973e-06, "loss": 0.5535, "step": 7919 }, { "epoch": 3.0309988518943745, "grad_norm": 0.5420718789100647, "learning_rate": 7.0914455854447584e-06, "loss": 0.5608, "step": 7920 }, { "epoch": 3.0313815537696134, "grad_norm": 0.5651302337646484, "learning_rate": 7.089073882673021e-06, "loss": 0.6895, "step": 7921 }, { "epoch": 3.0317642556448527, "grad_norm": 0.4976652264595032, "learning_rate": 7.0867023587855135e-06, "loss": 0.6083, "step": 7922 }, { "epoch": 3.0321469575200917, "grad_norm": 0.569023847579956, "learning_rate": 7.084331013927974e-06, "loss": 0.6515, "step": 7923 }, { "epoch": 3.032529659395331, "grad_norm": 0.4907795488834381, "learning_rate": 7.081959848246126e-06, "loss": 0.6005, "step": 7924 }, { "epoch": 3.0329123612705704, "grad_norm": 0.5414459705352783, "learning_rate": 7.079588861885685e-06, "loss": 0.6208, "step": 7925 }, { "epoch": 3.0332950631458093, "grad_norm": 0.5285072326660156, "learning_rate": 7.077218054992356e-06, "loss": 0.5922, "step": 7926 }, { "epoch": 3.0336777650210487, "grad_norm": 0.5193989276885986, "learning_rate": 7.074847427711824e-06, "loss": 0.6022, "step": 7927 }, { "epoch": 3.0340604668962876, "grad_norm": 0.5649027824401855, "learning_rate": 7.07247698018978e-06, "loss": 0.688, "step": 7928 }, { "epoch": 3.034443168771527, "grad_norm": 0.5605734586715698, "learning_rate": 7.070106712571892e-06, "loss": 0.6443, "step": 7929 }, { "epoch": 3.0348258706467663, "grad_norm": 0.5229049921035767, "learning_rate": 7.0677366250038145e-06, "loss": 0.5774, "step": 7930 }, { "epoch": 3.0352085725220053, "grad_norm": 0.5838559865951538, "learning_rate": 7.065366717631199e-06, "loss": 0.7123, "step": 7931 }, { "epoch": 3.0355912743972446, "grad_norm": 0.6984310746192932, "learning_rate": 7.0629969905996835e-06, "loss": 0.7151, "step": 7932 }, { "epoch": 3.0359739762724836, "grad_norm": 0.5110427141189575, "learning_rate": 7.0606274440548935e-06, "loss": 0.6685, "step": 7933 }, { "epoch": 3.036356678147723, "grad_norm": 0.5332331657409668, "learning_rate": 7.058258078142443e-06, "loss": 0.5786, "step": 7934 }, { "epoch": 3.0367393800229623, "grad_norm": 0.48807770013809204, "learning_rate": 7.055888893007936e-06, "loss": 0.5993, "step": 7935 }, { "epoch": 3.037122081898201, "grad_norm": 0.5495527982711792, "learning_rate": 7.0535198887969695e-06, "loss": 0.6057, "step": 7936 }, { "epoch": 3.0375047837734406, "grad_norm": 0.5564048886299133, "learning_rate": 7.0511510656551175e-06, "loss": 0.6165, "step": 7937 }, { "epoch": 3.0378874856486795, "grad_norm": 0.5300796627998352, "learning_rate": 7.048782423727953e-06, "loss": 0.6343, "step": 7938 }, { "epoch": 3.038270187523919, "grad_norm": 0.568973958492279, "learning_rate": 7.046413963161043e-06, "loss": 0.63, "step": 7939 }, { "epoch": 3.0386528893991582, "grad_norm": 0.4992087483406067, "learning_rate": 7.044045684099925e-06, "loss": 0.5675, "step": 7940 }, { "epoch": 3.039035591274397, "grad_norm": 0.5667449235916138, "learning_rate": 7.041677586690141e-06, "loss": 0.5946, "step": 7941 }, { "epoch": 3.0394182931496365, "grad_norm": 0.5285758972167969, "learning_rate": 7.03930967107722e-06, "loss": 0.6034, "step": 7942 }, { "epoch": 3.0398009950248754, "grad_norm": 0.546413779258728, "learning_rate": 7.03694193740667e-06, "loss": 0.6315, "step": 7943 }, { "epoch": 3.040183696900115, "grad_norm": 0.5502351522445679, "learning_rate": 7.034574385824e-06, "loss": 0.6623, "step": 7944 }, { "epoch": 3.040566398775354, "grad_norm": 0.5707134008407593, "learning_rate": 7.0322070164746995e-06, "loss": 0.6585, "step": 7945 }, { "epoch": 3.040949100650593, "grad_norm": 0.5832645893096924, "learning_rate": 7.029839829504254e-06, "loss": 0.6153, "step": 7946 }, { "epoch": 3.0413318025258325, "grad_norm": 0.5881050825119019, "learning_rate": 7.0274728250581285e-06, "loss": 0.6844, "step": 7947 }, { "epoch": 3.0417145044010714, "grad_norm": 0.5518855452537537, "learning_rate": 7.025106003281779e-06, "loss": 0.5601, "step": 7948 }, { "epoch": 3.0420972062763108, "grad_norm": 0.5233702063560486, "learning_rate": 7.022739364320665e-06, "loss": 0.567, "step": 7949 }, { "epoch": 3.04247990815155, "grad_norm": 0.4652913212776184, "learning_rate": 7.0203729083202125e-06, "loss": 0.6419, "step": 7950 }, { "epoch": 3.042862610026789, "grad_norm": 0.527899444103241, "learning_rate": 7.018006635425847e-06, "loss": 0.6761, "step": 7951 }, { "epoch": 3.0432453119020284, "grad_norm": 0.5273644328117371, "learning_rate": 7.015640545782989e-06, "loss": 0.6238, "step": 7952 }, { "epoch": 3.0436280137772673, "grad_norm": 0.4988754093647003, "learning_rate": 7.0132746395370336e-06, "loss": 0.6051, "step": 7953 }, { "epoch": 3.0440107156525067, "grad_norm": 0.5570819973945618, "learning_rate": 7.010908916833375e-06, "loss": 0.6551, "step": 7954 }, { "epoch": 3.044393417527746, "grad_norm": 0.5434985160827637, "learning_rate": 7.0085433778173995e-06, "loss": 0.6572, "step": 7955 }, { "epoch": 3.044776119402985, "grad_norm": 0.5613444447517395, "learning_rate": 7.006178022634464e-06, "loss": 0.6657, "step": 7956 }, { "epoch": 3.0451588212782243, "grad_norm": 0.5320538282394409, "learning_rate": 7.003812851429934e-06, "loss": 0.5907, "step": 7957 }, { "epoch": 3.0455415231534633, "grad_norm": 0.5188245177268982, "learning_rate": 7.001447864349152e-06, "loss": 0.6364, "step": 7958 }, { "epoch": 3.0459242250287026, "grad_norm": 0.5378164052963257, "learning_rate": 6.999083061537456e-06, "loss": 0.6707, "step": 7959 }, { "epoch": 3.046306926903942, "grad_norm": 0.5142242312431335, "learning_rate": 6.9967184431401665e-06, "loss": 0.631, "step": 7960 }, { "epoch": 3.046689628779181, "grad_norm": 0.5151417255401611, "learning_rate": 6.994354009302597e-06, "loss": 0.6364, "step": 7961 }, { "epoch": 3.0470723306544203, "grad_norm": 0.5500905513763428, "learning_rate": 6.991989760170051e-06, "loss": 0.6308, "step": 7962 }, { "epoch": 3.047455032529659, "grad_norm": 0.5071748495101929, "learning_rate": 6.989625695887811e-06, "loss": 0.5846, "step": 7963 }, { "epoch": 3.0478377344048986, "grad_norm": 0.5494300127029419, "learning_rate": 6.987261816601161e-06, "loss": 0.634, "step": 7964 }, { "epoch": 3.048220436280138, "grad_norm": 0.4972243309020996, "learning_rate": 6.98489812245537e-06, "loss": 0.6214, "step": 7965 }, { "epoch": 3.048603138155377, "grad_norm": 0.5380309224128723, "learning_rate": 6.982534613595685e-06, "loss": 0.6638, "step": 7966 }, { "epoch": 3.0489858400306162, "grad_norm": 0.5463553071022034, "learning_rate": 6.980171290167355e-06, "loss": 0.6308, "step": 7967 }, { "epoch": 3.049368541905855, "grad_norm": 0.5618302822113037, "learning_rate": 6.977808152315612e-06, "loss": 0.6161, "step": 7968 }, { "epoch": 3.0497512437810945, "grad_norm": 0.5126113891601562, "learning_rate": 6.975445200185678e-06, "loss": 0.5419, "step": 7969 }, { "epoch": 3.050133945656334, "grad_norm": 0.5398526787757874, "learning_rate": 6.97308243392276e-06, "loss": 0.5701, "step": 7970 }, { "epoch": 3.050516647531573, "grad_norm": 0.5007403492927551, "learning_rate": 6.970719853672059e-06, "loss": 0.5613, "step": 7971 }, { "epoch": 3.050899349406812, "grad_norm": 0.5496951937675476, "learning_rate": 6.968357459578763e-06, "loss": 0.6329, "step": 7972 }, { "epoch": 3.051282051282051, "grad_norm": 0.5096003413200378, "learning_rate": 6.965995251788044e-06, "loss": 0.6033, "step": 7973 }, { "epoch": 3.0516647531572905, "grad_norm": 0.5311865210533142, "learning_rate": 6.963633230445066e-06, "loss": 0.6349, "step": 7974 }, { "epoch": 3.05204745503253, "grad_norm": 0.6263351440429688, "learning_rate": 6.961271395694988e-06, "loss": 0.5705, "step": 7975 }, { "epoch": 3.0524301569077688, "grad_norm": 0.5619261264801025, "learning_rate": 6.958909747682943e-06, "loss": 0.6585, "step": 7976 }, { "epoch": 3.052812858783008, "grad_norm": 0.5583419799804688, "learning_rate": 6.956548286554063e-06, "loss": 0.6695, "step": 7977 }, { "epoch": 3.053195560658247, "grad_norm": 0.6191052198410034, "learning_rate": 6.95418701245347e-06, "loss": 0.661, "step": 7978 }, { "epoch": 3.0535782625334864, "grad_norm": 0.5545650124549866, "learning_rate": 6.951825925526266e-06, "loss": 0.5557, "step": 7979 }, { "epoch": 3.0539609644087258, "grad_norm": 0.5309755802154541, "learning_rate": 6.949465025917546e-06, "loss": 0.5628, "step": 7980 }, { "epoch": 3.0543436662839647, "grad_norm": 0.6961756348609924, "learning_rate": 6.947104313772399e-06, "loss": 0.6766, "step": 7981 }, { "epoch": 3.054726368159204, "grad_norm": 0.5293580293655396, "learning_rate": 6.944743789235889e-06, "loss": 0.6435, "step": 7982 }, { "epoch": 3.055109070034443, "grad_norm": 0.5223367810249329, "learning_rate": 6.942383452453083e-06, "loss": 0.7213, "step": 7983 }, { "epoch": 3.0554917719096824, "grad_norm": 0.5463908314704895, "learning_rate": 6.940023303569028e-06, "loss": 0.6553, "step": 7984 }, { "epoch": 3.0558744737849217, "grad_norm": 0.5581600666046143, "learning_rate": 6.937663342728764e-06, "loss": 0.5936, "step": 7985 }, { "epoch": 3.0562571756601606, "grad_norm": 0.5611060261726379, "learning_rate": 6.935303570077312e-06, "loss": 0.6344, "step": 7986 }, { "epoch": 3.0566398775354, "grad_norm": 0.5780735611915588, "learning_rate": 6.932943985759689e-06, "loss": 0.6462, "step": 7987 }, { "epoch": 3.057022579410639, "grad_norm": 0.5449137091636658, "learning_rate": 6.9305845899209e-06, "loss": 0.6279, "step": 7988 }, { "epoch": 3.0574052812858783, "grad_norm": 0.504331111907959, "learning_rate": 6.928225382705931e-06, "loss": 0.6878, "step": 7989 }, { "epoch": 3.0577879831611177, "grad_norm": 0.5634763240814209, "learning_rate": 6.925866364259766e-06, "loss": 0.6341, "step": 7990 }, { "epoch": 3.0581706850363566, "grad_norm": 0.5153359770774841, "learning_rate": 6.923507534727374e-06, "loss": 0.6053, "step": 7991 }, { "epoch": 3.058553386911596, "grad_norm": 0.5280113816261292, "learning_rate": 6.921148894253704e-06, "loss": 0.6745, "step": 7992 }, { "epoch": 3.058936088786835, "grad_norm": 0.5600329041481018, "learning_rate": 6.918790442983706e-06, "loss": 0.6642, "step": 7993 }, { "epoch": 3.0593187906620742, "grad_norm": 0.5879468321800232, "learning_rate": 6.916432181062318e-06, "loss": 0.6277, "step": 7994 }, { "epoch": 3.0597014925373136, "grad_norm": 0.549040675163269, "learning_rate": 6.914074108634452e-06, "loss": 0.5972, "step": 7995 }, { "epoch": 3.0600841944125525, "grad_norm": 0.5419923663139343, "learning_rate": 6.911716225845023e-06, "loss": 0.6797, "step": 7996 }, { "epoch": 3.060466896287792, "grad_norm": 0.53341144323349, "learning_rate": 6.909358532838929e-06, "loss": 0.651, "step": 7997 }, { "epoch": 3.060849598163031, "grad_norm": 0.5726982355117798, "learning_rate": 6.907001029761058e-06, "loss": 0.674, "step": 7998 }, { "epoch": 3.06123230003827, "grad_norm": 0.5573379397392273, "learning_rate": 6.90464371675628e-06, "loss": 0.6023, "step": 7999 }, { "epoch": 3.0616150019135095, "grad_norm": 0.5533918738365173, "learning_rate": 6.902286593969461e-06, "loss": 0.6309, "step": 8000 }, { "epoch": 3.0619977037887485, "grad_norm": 0.4886476993560791, "learning_rate": 6.8999296615454584e-06, "loss": 0.6508, "step": 8001 }, { "epoch": 3.062380405663988, "grad_norm": 0.5772938132286072, "learning_rate": 6.897572919629101e-06, "loss": 0.575, "step": 8002 }, { "epoch": 3.0627631075392268, "grad_norm": 0.5816050171852112, "learning_rate": 6.895216368365221e-06, "loss": 0.634, "step": 8003 }, { "epoch": 3.063145809414466, "grad_norm": 0.5496010780334473, "learning_rate": 6.892860007898643e-06, "loss": 0.663, "step": 8004 }, { "epoch": 3.0635285112897055, "grad_norm": 0.5236322283744812, "learning_rate": 6.890503838374158e-06, "loss": 0.5729, "step": 8005 }, { "epoch": 3.0639112131649444, "grad_norm": 0.5140188932418823, "learning_rate": 6.888147859936569e-06, "loss": 0.6081, "step": 8006 }, { "epoch": 3.064293915040184, "grad_norm": 0.5488952398300171, "learning_rate": 6.885792072730655e-06, "loss": 0.6251, "step": 8007 }, { "epoch": 3.0646766169154227, "grad_norm": 0.5796549916267395, "learning_rate": 6.883436476901182e-06, "loss": 0.661, "step": 8008 }, { "epoch": 3.065059318790662, "grad_norm": 0.6760124564170837, "learning_rate": 6.88108107259291e-06, "loss": 0.5932, "step": 8009 }, { "epoch": 3.0654420206659014, "grad_norm": 0.5020790696144104, "learning_rate": 6.878725859950584e-06, "loss": 0.5879, "step": 8010 }, { "epoch": 3.0658247225411404, "grad_norm": 0.5470761060714722, "learning_rate": 6.876370839118944e-06, "loss": 0.5771, "step": 8011 }, { "epoch": 3.0662074244163797, "grad_norm": 0.5162336826324463, "learning_rate": 6.8740160102427054e-06, "loss": 0.6207, "step": 8012 }, { "epoch": 3.0665901262916186, "grad_norm": 0.5369536280632019, "learning_rate": 6.871661373466578e-06, "loss": 0.6578, "step": 8013 }, { "epoch": 3.066972828166858, "grad_norm": 0.5590762495994568, "learning_rate": 6.869306928935268e-06, "loss": 0.6476, "step": 8014 }, { "epoch": 3.0673555300420974, "grad_norm": 0.5060684680938721, "learning_rate": 6.866952676793455e-06, "loss": 0.5805, "step": 8015 }, { "epoch": 3.0677382319173363, "grad_norm": 0.5335803627967834, "learning_rate": 6.864598617185816e-06, "loss": 0.5841, "step": 8016 }, { "epoch": 3.0681209337925757, "grad_norm": 0.5868942141532898, "learning_rate": 6.862244750257018e-06, "loss": 0.6133, "step": 8017 }, { "epoch": 3.0685036356678146, "grad_norm": 0.7607178092002869, "learning_rate": 6.8598910761517066e-06, "loss": 0.706, "step": 8018 }, { "epoch": 3.068886337543054, "grad_norm": 0.561485230922699, "learning_rate": 6.857537595014525e-06, "loss": 0.6275, "step": 8019 }, { "epoch": 3.0692690394182933, "grad_norm": 0.6125158071517944, "learning_rate": 6.855184306990106e-06, "loss": 0.6127, "step": 8020 }, { "epoch": 3.0696517412935322, "grad_norm": 0.563649594783783, "learning_rate": 6.852831212223053e-06, "loss": 0.6784, "step": 8021 }, { "epoch": 3.0700344431687716, "grad_norm": 0.5375639796257019, "learning_rate": 6.8504783108579785e-06, "loss": 0.5795, "step": 8022 }, { "epoch": 3.0704171450440105, "grad_norm": 0.5208809971809387, "learning_rate": 6.848125603039473e-06, "loss": 0.6685, "step": 8023 }, { "epoch": 3.07079984691925, "grad_norm": 0.6031448841094971, "learning_rate": 6.845773088912118e-06, "loss": 0.6147, "step": 8024 }, { "epoch": 3.0711825487944893, "grad_norm": 0.5716138482093811, "learning_rate": 6.843420768620479e-06, "loss": 0.5947, "step": 8025 }, { "epoch": 3.071565250669728, "grad_norm": 0.6018654704093933, "learning_rate": 6.8410686423091145e-06, "loss": 0.6178, "step": 8026 }, { "epoch": 3.0719479525449676, "grad_norm": 0.6149436831474304, "learning_rate": 6.838716710122568e-06, "loss": 0.6772, "step": 8027 }, { "epoch": 3.0723306544202065, "grad_norm": 0.514400839805603, "learning_rate": 6.836364972205374e-06, "loss": 0.6666, "step": 8028 }, { "epoch": 3.072713356295446, "grad_norm": 0.5525543689727783, "learning_rate": 6.834013428702049e-06, "loss": 0.7035, "step": 8029 }, { "epoch": 3.073096058170685, "grad_norm": 0.5800848007202148, "learning_rate": 6.831662079757109e-06, "loss": 0.6883, "step": 8030 }, { "epoch": 3.073478760045924, "grad_norm": 0.5816313624382019, "learning_rate": 6.8293109255150415e-06, "loss": 0.615, "step": 8031 }, { "epoch": 3.0738614619211635, "grad_norm": 0.5465683341026306, "learning_rate": 6.826959966120335e-06, "loss": 0.5831, "step": 8032 }, { "epoch": 3.0742441637964024, "grad_norm": 0.5382384061813354, "learning_rate": 6.824609201717467e-06, "loss": 0.5917, "step": 8033 }, { "epoch": 3.074626865671642, "grad_norm": 0.5100961327552795, "learning_rate": 6.82225863245089e-06, "loss": 0.6967, "step": 8034 }, { "epoch": 3.075009567546881, "grad_norm": 0.49589988589286804, "learning_rate": 6.819908258465058e-06, "loss": 0.541, "step": 8035 }, { "epoch": 3.07539226942212, "grad_norm": 0.5409738421440125, "learning_rate": 6.817558079904407e-06, "loss": 0.6575, "step": 8036 }, { "epoch": 3.0757749712973594, "grad_norm": 0.5584927201271057, "learning_rate": 6.815208096913366e-06, "loss": 0.6924, "step": 8037 }, { "epoch": 3.0761576731725984, "grad_norm": 0.5452903509140015, "learning_rate": 6.812858309636337e-06, "loss": 0.6639, "step": 8038 }, { "epoch": 3.0765403750478377, "grad_norm": 0.5400851964950562, "learning_rate": 6.810508718217728e-06, "loss": 0.6079, "step": 8039 }, { "epoch": 3.076923076923077, "grad_norm": 0.5144737958908081, "learning_rate": 6.808159322801932e-06, "loss": 0.6447, "step": 8040 }, { "epoch": 3.077305778798316, "grad_norm": 0.5459115505218506, "learning_rate": 6.805810123533316e-06, "loss": 0.5553, "step": 8041 }, { "epoch": 3.0776884806735554, "grad_norm": 0.48675021529197693, "learning_rate": 6.803461120556249e-06, "loss": 0.6506, "step": 8042 }, { "epoch": 3.0780711825487943, "grad_norm": 0.5459500551223755, "learning_rate": 6.801112314015086e-06, "loss": 0.6963, "step": 8043 }, { "epoch": 3.0784538844240337, "grad_norm": 0.6179793477058411, "learning_rate": 6.798763704054162e-06, "loss": 0.5826, "step": 8044 }, { "epoch": 3.078836586299273, "grad_norm": 0.493438184261322, "learning_rate": 6.7964152908178095e-06, "loss": 0.6144, "step": 8045 }, { "epoch": 3.079219288174512, "grad_norm": 0.5936926603317261, "learning_rate": 6.794067074450348e-06, "loss": 0.6754, "step": 8046 }, { "epoch": 3.0796019900497513, "grad_norm": 0.543782114982605, "learning_rate": 6.791719055096073e-06, "loss": 0.5609, "step": 8047 }, { "epoch": 3.0799846919249902, "grad_norm": 0.5295653343200684, "learning_rate": 6.789371232899279e-06, "loss": 0.6001, "step": 8048 }, { "epoch": 3.0803673938002296, "grad_norm": 0.6172785758972168, "learning_rate": 6.787023608004251e-06, "loss": 0.6337, "step": 8049 }, { "epoch": 3.080750095675469, "grad_norm": 0.5173908472061157, "learning_rate": 6.784676180555256e-06, "loss": 0.5484, "step": 8050 }, { "epoch": 3.081132797550708, "grad_norm": 0.5400720238685608, "learning_rate": 6.782328950696545e-06, "loss": 0.6526, "step": 8051 }, { "epoch": 3.0815154994259473, "grad_norm": 0.5599631667137146, "learning_rate": 6.779981918572364e-06, "loss": 0.5809, "step": 8052 }, { "epoch": 3.081898201301186, "grad_norm": 0.5283148288726807, "learning_rate": 6.777635084326947e-06, "loss": 0.6257, "step": 8053 }, { "epoch": 3.0822809031764256, "grad_norm": 0.5734356641769409, "learning_rate": 6.7752884481045075e-06, "loss": 0.5959, "step": 8054 }, { "epoch": 3.082663605051665, "grad_norm": 0.5362812280654907, "learning_rate": 6.772942010049257e-06, "loss": 0.6448, "step": 8055 }, { "epoch": 3.083046306926904, "grad_norm": 0.5182270407676697, "learning_rate": 6.770595770305395e-06, "loss": 0.5866, "step": 8056 }, { "epoch": 3.083429008802143, "grad_norm": 0.554039716720581, "learning_rate": 6.768249729017092e-06, "loss": 0.6092, "step": 8057 }, { "epoch": 3.083811710677382, "grad_norm": 0.49036991596221924, "learning_rate": 6.765903886328527e-06, "loss": 0.6548, "step": 8058 }, { "epoch": 3.0841944125526215, "grad_norm": 0.5635830760002136, "learning_rate": 6.763558242383857e-06, "loss": 0.6324, "step": 8059 }, { "epoch": 3.084577114427861, "grad_norm": 0.5577313303947449, "learning_rate": 6.761212797327228e-06, "loss": 0.6473, "step": 8060 }, { "epoch": 3.0849598163031, "grad_norm": 0.5323535799980164, "learning_rate": 6.758867551302771e-06, "loss": 0.6178, "step": 8061 }, { "epoch": 3.085342518178339, "grad_norm": 0.5124140977859497, "learning_rate": 6.756522504454613e-06, "loss": 0.6071, "step": 8062 }, { "epoch": 3.085725220053578, "grad_norm": 0.5695822238922119, "learning_rate": 6.754177656926859e-06, "loss": 0.5466, "step": 8063 }, { "epoch": 3.0861079219288174, "grad_norm": 0.5757600665092468, "learning_rate": 6.751833008863608e-06, "loss": 0.6306, "step": 8064 }, { "epoch": 3.086490623804057, "grad_norm": 0.5331098437309265, "learning_rate": 6.749488560408945e-06, "loss": 0.5914, "step": 8065 }, { "epoch": 3.0868733256792957, "grad_norm": 0.5125967860221863, "learning_rate": 6.747144311706947e-06, "loss": 0.6225, "step": 8066 }, { "epoch": 3.087256027554535, "grad_norm": 0.5531377196311951, "learning_rate": 6.744800262901665e-06, "loss": 0.6538, "step": 8067 }, { "epoch": 3.087638729429774, "grad_norm": 0.5062844753265381, "learning_rate": 6.7424564141371515e-06, "loss": 0.6006, "step": 8068 }, { "epoch": 3.0880214313050134, "grad_norm": 0.504032552242279, "learning_rate": 6.740112765557445e-06, "loss": 0.6284, "step": 8069 }, { "epoch": 3.0884041331802528, "grad_norm": 0.4856913983821869, "learning_rate": 6.7377693173065654e-06, "loss": 0.642, "step": 8070 }, { "epoch": 3.0887868350554917, "grad_norm": 0.5728809237480164, "learning_rate": 6.7354260695285235e-06, "loss": 0.625, "step": 8071 }, { "epoch": 3.089169536930731, "grad_norm": 0.5752045512199402, "learning_rate": 6.733083022367324e-06, "loss": 0.6039, "step": 8072 }, { "epoch": 3.08955223880597, "grad_norm": 0.5121870040893555, "learning_rate": 6.730740175966945e-06, "loss": 0.5822, "step": 8073 }, { "epoch": 3.0899349406812093, "grad_norm": 0.5343890190124512, "learning_rate": 6.728397530471367e-06, "loss": 0.5874, "step": 8074 }, { "epoch": 3.0903176425564487, "grad_norm": 0.5360329151153564, "learning_rate": 6.72605508602455e-06, "loss": 0.6299, "step": 8075 }, { "epoch": 3.0907003444316876, "grad_norm": 0.5933768153190613, "learning_rate": 6.723712842770447e-06, "loss": 0.5573, "step": 8076 }, { "epoch": 3.091083046306927, "grad_norm": 0.5478858947753906, "learning_rate": 6.721370800852986e-06, "loss": 0.6395, "step": 8077 }, { "epoch": 3.091465748182166, "grad_norm": 0.5485054850578308, "learning_rate": 6.7190289604160986e-06, "loss": 0.6419, "step": 8078 }, { "epoch": 3.0918484500574053, "grad_norm": 0.5526188611984253, "learning_rate": 6.716687321603698e-06, "loss": 0.6471, "step": 8079 }, { "epoch": 3.0922311519326446, "grad_norm": 0.543176531791687, "learning_rate": 6.714345884559679e-06, "loss": 0.6139, "step": 8080 }, { "epoch": 3.0926138538078836, "grad_norm": 0.5772094130516052, "learning_rate": 6.7120046494279334e-06, "loss": 0.6603, "step": 8081 }, { "epoch": 3.092996555683123, "grad_norm": 0.555208683013916, "learning_rate": 6.7096636163523365e-06, "loss": 0.6512, "step": 8082 }, { "epoch": 3.093379257558362, "grad_norm": 0.536002516746521, "learning_rate": 6.707322785476748e-06, "loss": 0.5888, "step": 8083 }, { "epoch": 3.093761959433601, "grad_norm": 0.5198010206222534, "learning_rate": 6.704982156945021e-06, "loss": 0.5739, "step": 8084 }, { "epoch": 3.0941446613088406, "grad_norm": 0.5253860354423523, "learning_rate": 6.702641730900996e-06, "loss": 0.5461, "step": 8085 }, { "epoch": 3.0945273631840795, "grad_norm": 0.5236254334449768, "learning_rate": 6.70030150748849e-06, "loss": 0.6516, "step": 8086 }, { "epoch": 3.094910065059319, "grad_norm": 0.5537950396537781, "learning_rate": 6.697961486851321e-06, "loss": 0.6257, "step": 8087 }, { "epoch": 3.095292766934558, "grad_norm": 0.5651363730430603, "learning_rate": 6.695621669133291e-06, "loss": 0.5937, "step": 8088 }, { "epoch": 3.095675468809797, "grad_norm": 0.5274248123168945, "learning_rate": 6.693282054478188e-06, "loss": 0.6445, "step": 8089 }, { "epoch": 3.0960581706850365, "grad_norm": 0.5064648985862732, "learning_rate": 6.690942643029783e-06, "loss": 0.6252, "step": 8090 }, { "epoch": 3.0964408725602754, "grad_norm": 0.5280523896217346, "learning_rate": 6.688603434931844e-06, "loss": 0.6009, "step": 8091 }, { "epoch": 3.096823574435515, "grad_norm": 0.5046738982200623, "learning_rate": 6.686264430328123e-06, "loss": 0.5938, "step": 8092 }, { "epoch": 3.0972062763107537, "grad_norm": 0.5764283537864685, "learning_rate": 6.68392562936235e-06, "loss": 0.6428, "step": 8093 }, { "epoch": 3.097588978185993, "grad_norm": 0.4941142499446869, "learning_rate": 6.6815870321782565e-06, "loss": 0.6039, "step": 8094 }, { "epoch": 3.0979716800612325, "grad_norm": 0.5031107068061829, "learning_rate": 6.67924863891956e-06, "loss": 0.5459, "step": 8095 }, { "epoch": 3.0983543819364714, "grad_norm": 0.5341030359268188, "learning_rate": 6.676910449729951e-06, "loss": 0.5594, "step": 8096 }, { "epoch": 3.0987370838117108, "grad_norm": 0.5608296990394592, "learning_rate": 6.674572464753122e-06, "loss": 0.5958, "step": 8097 }, { "epoch": 3.0991197856869497, "grad_norm": 0.5384950041770935, "learning_rate": 6.6722346841327515e-06, "loss": 0.6648, "step": 8098 }, { "epoch": 3.099502487562189, "grad_norm": 0.5497645735740662, "learning_rate": 6.6698971080124975e-06, "loss": 0.6586, "step": 8099 }, { "epoch": 3.0998851894374284, "grad_norm": 0.5594947338104248, "learning_rate": 6.667559736536013e-06, "loss": 0.5399, "step": 8100 }, { "epoch": 3.1002678913126673, "grad_norm": 0.5388815402984619, "learning_rate": 6.665222569846934e-06, "loss": 0.6808, "step": 8101 }, { "epoch": 3.1006505931879067, "grad_norm": 0.5905490517616272, "learning_rate": 6.6628856080888915e-06, "loss": 0.6482, "step": 8102 }, { "epoch": 3.1010332950631456, "grad_norm": 0.5474305748939514, "learning_rate": 6.660548851405487e-06, "loss": 0.5318, "step": 8103 }, { "epoch": 3.101415996938385, "grad_norm": 0.5734382271766663, "learning_rate": 6.65821229994033e-06, "loss": 0.626, "step": 8104 }, { "epoch": 3.1017986988136244, "grad_norm": 0.5257936120033264, "learning_rate": 6.655875953837008e-06, "loss": 0.6125, "step": 8105 }, { "epoch": 3.1021814006888633, "grad_norm": 0.5304157733917236, "learning_rate": 6.65353981323909e-06, "loss": 0.6336, "step": 8106 }, { "epoch": 3.1025641025641026, "grad_norm": 0.6011481285095215, "learning_rate": 6.651203878290139e-06, "loss": 0.7281, "step": 8107 }, { "epoch": 3.1029468044393416, "grad_norm": 0.5459532737731934, "learning_rate": 6.64886814913371e-06, "loss": 0.6486, "step": 8108 }, { "epoch": 3.103329506314581, "grad_norm": 0.5097303986549377, "learning_rate": 6.646532625913331e-06, "loss": 0.6335, "step": 8109 }, { "epoch": 3.1037122081898203, "grad_norm": 0.5461519360542297, "learning_rate": 6.644197308772533e-06, "loss": 0.6455, "step": 8110 }, { "epoch": 3.104094910065059, "grad_norm": 0.5412726998329163, "learning_rate": 6.6418621978548314e-06, "loss": 0.688, "step": 8111 }, { "epoch": 3.1044776119402986, "grad_norm": 0.5158270597457886, "learning_rate": 6.639527293303713e-06, "loss": 0.6426, "step": 8112 }, { "epoch": 3.1048603138155375, "grad_norm": 0.5478838086128235, "learning_rate": 6.637192595262672e-06, "loss": 0.6367, "step": 8113 }, { "epoch": 3.105243015690777, "grad_norm": 0.5146200656890869, "learning_rate": 6.634858103875181e-06, "loss": 0.6611, "step": 8114 }, { "epoch": 3.1056257175660162, "grad_norm": 0.6146156787872314, "learning_rate": 6.632523819284698e-06, "loss": 0.6305, "step": 8115 }, { "epoch": 3.106008419441255, "grad_norm": 0.5211783647537231, "learning_rate": 6.630189741634673e-06, "loss": 0.6191, "step": 8116 }, { "epoch": 3.1063911213164945, "grad_norm": 0.49609053134918213, "learning_rate": 6.6278558710685404e-06, "loss": 0.6363, "step": 8117 }, { "epoch": 3.1067738231917335, "grad_norm": 0.5390254855155945, "learning_rate": 6.6255222077297275e-06, "loss": 0.6503, "step": 8118 }, { "epoch": 3.107156525066973, "grad_norm": 0.5641607642173767, "learning_rate": 6.6231887517616375e-06, "loss": 0.5876, "step": 8119 }, { "epoch": 3.107539226942212, "grad_norm": 0.5439266562461853, "learning_rate": 6.620855503307671e-06, "loss": 0.6876, "step": 8120 }, { "epoch": 3.107921928817451, "grad_norm": 0.4933634102344513, "learning_rate": 6.6185224625112155e-06, "loss": 0.6397, "step": 8121 }, { "epoch": 3.1083046306926905, "grad_norm": 0.5476244688034058, "learning_rate": 6.6161896295156345e-06, "loss": 0.6142, "step": 8122 }, { "epoch": 3.1086873325679294, "grad_norm": 0.5793021321296692, "learning_rate": 6.6138570044642924e-06, "loss": 0.6067, "step": 8123 }, { "epoch": 3.1090700344431688, "grad_norm": 0.5439987182617188, "learning_rate": 6.611524587500535e-06, "loss": 0.6016, "step": 8124 }, { "epoch": 3.109452736318408, "grad_norm": 0.5380797386169434, "learning_rate": 6.609192378767695e-06, "loss": 0.6497, "step": 8125 }, { "epoch": 3.109835438193647, "grad_norm": 0.5287267565727234, "learning_rate": 6.60686037840909e-06, "loss": 0.6731, "step": 8126 }, { "epoch": 3.1102181400688864, "grad_norm": 0.533616840839386, "learning_rate": 6.604528586568034e-06, "loss": 0.6521, "step": 8127 }, { "epoch": 3.1106008419441253, "grad_norm": 0.561479926109314, "learning_rate": 6.602197003387816e-06, "loss": 0.6947, "step": 8128 }, { "epoch": 3.1109835438193647, "grad_norm": 0.5318774580955505, "learning_rate": 6.59986562901172e-06, "loss": 0.6025, "step": 8129 }, { "epoch": 3.111366245694604, "grad_norm": 0.6313263177871704, "learning_rate": 6.5975344635830154e-06, "loss": 0.598, "step": 8130 }, { "epoch": 3.111748947569843, "grad_norm": 0.5788143277168274, "learning_rate": 6.595203507244964e-06, "loss": 0.7201, "step": 8131 }, { "epoch": 3.1121316494450824, "grad_norm": 0.5063524842262268, "learning_rate": 6.592872760140798e-06, "loss": 0.5481, "step": 8132 }, { "epoch": 3.1125143513203213, "grad_norm": 0.5358526110649109, "learning_rate": 6.590542222413756e-06, "loss": 0.6503, "step": 8133 }, { "epoch": 3.1128970531955606, "grad_norm": 0.505906879901886, "learning_rate": 6.588211894207055e-06, "loss": 0.655, "step": 8134 }, { "epoch": 3.1132797550708, "grad_norm": 0.5334619879722595, "learning_rate": 6.585881775663896e-06, "loss": 0.615, "step": 8135 }, { "epoch": 3.113662456946039, "grad_norm": 0.5428311228752136, "learning_rate": 6.583551866927475e-06, "loss": 0.6214, "step": 8136 }, { "epoch": 3.1140451588212783, "grad_norm": 0.5670689344406128, "learning_rate": 6.581222168140975e-06, "loss": 0.5946, "step": 8137 }, { "epoch": 3.1144278606965172, "grad_norm": 0.5198513865470886, "learning_rate": 6.578892679447549e-06, "loss": 0.5919, "step": 8138 }, { "epoch": 3.1148105625717566, "grad_norm": 0.5093504190444946, "learning_rate": 6.576563400990361e-06, "loss": 0.5449, "step": 8139 }, { "epoch": 3.115193264446996, "grad_norm": 0.5285903811454773, "learning_rate": 6.5742343329125545e-06, "loss": 0.6045, "step": 8140 }, { "epoch": 3.115575966322235, "grad_norm": 0.5520445108413696, "learning_rate": 6.571905475357247e-06, "loss": 0.6037, "step": 8141 }, { "epoch": 3.1159586681974742, "grad_norm": 0.5224454998970032, "learning_rate": 6.569576828467556e-06, "loss": 0.6095, "step": 8142 }, { "epoch": 3.116341370072713, "grad_norm": 0.5880804061889648, "learning_rate": 6.567248392386587e-06, "loss": 0.6388, "step": 8143 }, { "epoch": 3.1167240719479525, "grad_norm": 0.5673974752426147, "learning_rate": 6.564920167257427e-06, "loss": 0.6653, "step": 8144 }, { "epoch": 3.117106773823192, "grad_norm": 0.553034782409668, "learning_rate": 6.5625921532231494e-06, "loss": 0.6545, "step": 8145 }, { "epoch": 3.117489475698431, "grad_norm": 0.6130812168121338, "learning_rate": 6.560264350426819e-06, "loss": 0.6077, "step": 8146 }, { "epoch": 3.11787217757367, "grad_norm": 0.5668905377388, "learning_rate": 6.557936759011489e-06, "loss": 0.6082, "step": 8147 }, { "epoch": 3.118254879448909, "grad_norm": 0.5087791681289673, "learning_rate": 6.5556093791201844e-06, "loss": 0.6085, "step": 8148 }, { "epoch": 3.1186375813241485, "grad_norm": 0.5491944551467896, "learning_rate": 6.5532822108959415e-06, "loss": 0.5926, "step": 8149 }, { "epoch": 3.119020283199388, "grad_norm": 0.5391613245010376, "learning_rate": 6.550955254481769e-06, "loss": 0.6477, "step": 8150 }, { "epoch": 3.1194029850746268, "grad_norm": 0.574524998664856, "learning_rate": 6.548628510020658e-06, "loss": 0.7053, "step": 8151 }, { "epoch": 3.119785686949866, "grad_norm": 0.5374383926391602, "learning_rate": 6.546301977655597e-06, "loss": 0.5526, "step": 8152 }, { "epoch": 3.120168388825105, "grad_norm": 0.49814629554748535, "learning_rate": 6.5439756575295595e-06, "loss": 0.618, "step": 8153 }, { "epoch": 3.1205510907003444, "grad_norm": 0.5254050493240356, "learning_rate": 6.5416495497855005e-06, "loss": 0.6112, "step": 8154 }, { "epoch": 3.120933792575584, "grad_norm": 0.5126946568489075, "learning_rate": 6.5393236545663676e-06, "loss": 0.6756, "step": 8155 }, { "epoch": 3.1213164944508227, "grad_norm": 0.5442989468574524, "learning_rate": 6.536997972015093e-06, "loss": 0.6275, "step": 8156 }, { "epoch": 3.121699196326062, "grad_norm": 0.5514820218086243, "learning_rate": 6.534672502274601e-06, "loss": 0.6519, "step": 8157 }, { "epoch": 3.122081898201301, "grad_norm": 0.5343855619430542, "learning_rate": 6.532347245487789e-06, "loss": 0.5856, "step": 8158 }, { "epoch": 3.1224646000765404, "grad_norm": 0.5619067549705505, "learning_rate": 6.530022201797553e-06, "loss": 0.6741, "step": 8159 }, { "epoch": 3.1228473019517797, "grad_norm": 0.6211556196212769, "learning_rate": 6.5276973713467805e-06, "loss": 0.6138, "step": 8160 }, { "epoch": 3.1232300038270187, "grad_norm": 0.564154326915741, "learning_rate": 6.525372754278329e-06, "loss": 0.5538, "step": 8161 }, { "epoch": 3.123612705702258, "grad_norm": 0.5783944725990295, "learning_rate": 6.523048350735055e-06, "loss": 0.672, "step": 8162 }, { "epoch": 3.123995407577497, "grad_norm": 0.5240663290023804, "learning_rate": 6.520724160859804e-06, "loss": 0.6281, "step": 8163 }, { "epoch": 3.1243781094527363, "grad_norm": 0.5236664414405823, "learning_rate": 6.5184001847953995e-06, "loss": 0.5933, "step": 8164 }, { "epoch": 3.1247608113279757, "grad_norm": 0.5115962624549866, "learning_rate": 6.516076422684654e-06, "loss": 0.5762, "step": 8165 }, { "epoch": 3.1251435132032146, "grad_norm": 0.5182499289512634, "learning_rate": 6.5137528746703784e-06, "loss": 0.5595, "step": 8166 }, { "epoch": 3.125526215078454, "grad_norm": 0.5185571908950806, "learning_rate": 6.511429540895349e-06, "loss": 0.6289, "step": 8167 }, { "epoch": 3.125908916953693, "grad_norm": 0.519356369972229, "learning_rate": 6.509106421502347e-06, "loss": 0.6501, "step": 8168 }, { "epoch": 3.1262916188289323, "grad_norm": 0.5332748293876648, "learning_rate": 6.5067835166341296e-06, "loss": 0.6761, "step": 8169 }, { "epoch": 3.1266743207041716, "grad_norm": 0.5213454961776733, "learning_rate": 6.504460826433458e-06, "loss": 0.5707, "step": 8170 }, { "epoch": 3.1270570225794105, "grad_norm": 0.8779266476631165, "learning_rate": 6.5021383510430545e-06, "loss": 0.5852, "step": 8171 }, { "epoch": 3.12743972445465, "grad_norm": 0.5426700711250305, "learning_rate": 6.499816090605646e-06, "loss": 0.5395, "step": 8172 }, { "epoch": 3.127822426329889, "grad_norm": 0.5467095375061035, "learning_rate": 6.497494045263946e-06, "loss": 0.5812, "step": 8173 }, { "epoch": 3.128205128205128, "grad_norm": 0.5521432161331177, "learning_rate": 6.495172215160641e-06, "loss": 0.6154, "step": 8174 }, { "epoch": 3.1285878300803676, "grad_norm": 0.5269783139228821, "learning_rate": 6.4928506004384205e-06, "loss": 0.6061, "step": 8175 }, { "epoch": 3.1289705319556065, "grad_norm": 0.5778577327728271, "learning_rate": 6.490529201239958e-06, "loss": 0.7547, "step": 8176 }, { "epoch": 3.129353233830846, "grad_norm": 0.49992844462394714, "learning_rate": 6.488208017707899e-06, "loss": 0.5948, "step": 8177 }, { "epoch": 3.1297359357060848, "grad_norm": 0.5379469394683838, "learning_rate": 6.485887049984892e-06, "loss": 0.5587, "step": 8178 }, { "epoch": 3.130118637581324, "grad_norm": 0.5567007660865784, "learning_rate": 6.483566298213568e-06, "loss": 0.6508, "step": 8179 }, { "epoch": 3.1305013394565635, "grad_norm": 0.5762932300567627, "learning_rate": 6.481245762536539e-06, "loss": 0.6306, "step": 8180 }, { "epoch": 3.1308840413318024, "grad_norm": 0.5079809427261353, "learning_rate": 6.4789254430964135e-06, "loss": 0.5884, "step": 8181 }, { "epoch": 3.131266743207042, "grad_norm": 0.5558332800865173, "learning_rate": 6.476605340035776e-06, "loss": 0.639, "step": 8182 }, { "epoch": 3.1316494450822807, "grad_norm": 0.528820276260376, "learning_rate": 6.474285453497209e-06, "loss": 0.585, "step": 8183 }, { "epoch": 3.13203214695752, "grad_norm": 0.5308411121368408, "learning_rate": 6.471965783623271e-06, "loss": 0.6833, "step": 8184 }, { "epoch": 3.1324148488327594, "grad_norm": 0.5408270955085754, "learning_rate": 6.469646330556514e-06, "loss": 0.5804, "step": 8185 }, { "epoch": 3.1327975507079984, "grad_norm": 0.5667312741279602, "learning_rate": 6.467327094439477e-06, "loss": 0.5982, "step": 8186 }, { "epoch": 3.1331802525832377, "grad_norm": 0.5690663456916809, "learning_rate": 6.465008075414678e-06, "loss": 0.6284, "step": 8187 }, { "epoch": 3.1335629544584767, "grad_norm": 0.514782190322876, "learning_rate": 6.4626892736246295e-06, "loss": 0.6439, "step": 8188 }, { "epoch": 3.133945656333716, "grad_norm": 0.5215024352073669, "learning_rate": 6.460370689211829e-06, "loss": 0.6763, "step": 8189 }, { "epoch": 3.1343283582089554, "grad_norm": 0.5120810866355896, "learning_rate": 6.458052322318758e-06, "loss": 0.6259, "step": 8190 }, { "epoch": 3.1347110600841943, "grad_norm": 0.4853013753890991, "learning_rate": 6.455734173087886e-06, "loss": 0.6254, "step": 8191 }, { "epoch": 3.1350937619594337, "grad_norm": 0.5496759414672852, "learning_rate": 6.453416241661674e-06, "loss": 0.6136, "step": 8192 }, { "epoch": 3.1354764638346726, "grad_norm": 0.5797746181488037, "learning_rate": 6.451098528182558e-06, "loss": 0.6687, "step": 8193 }, { "epoch": 3.135859165709912, "grad_norm": 0.5300953984260559, "learning_rate": 6.4487810327929726e-06, "loss": 0.6173, "step": 8194 }, { "epoch": 3.1362418675851513, "grad_norm": 0.5605597496032715, "learning_rate": 6.446463755635332e-06, "loss": 0.5466, "step": 8195 }, { "epoch": 3.1366245694603903, "grad_norm": 0.5637736320495605, "learning_rate": 6.444146696852045e-06, "loss": 0.6692, "step": 8196 }, { "epoch": 3.1370072713356296, "grad_norm": 0.5114123821258545, "learning_rate": 6.4418298565854935e-06, "loss": 0.5857, "step": 8197 }, { "epoch": 3.1373899732108685, "grad_norm": 0.5181903839111328, "learning_rate": 6.439513234978054e-06, "loss": 0.7255, "step": 8198 }, { "epoch": 3.137772675086108, "grad_norm": 0.582943320274353, "learning_rate": 6.4371968321720946e-06, "loss": 0.6492, "step": 8199 }, { "epoch": 3.1381553769613473, "grad_norm": 0.5655431151390076, "learning_rate": 6.43488064830996e-06, "loss": 0.6389, "step": 8200 }, { "epoch": 3.138538078836586, "grad_norm": 0.5556305050849915, "learning_rate": 6.4325646835339864e-06, "loss": 0.6627, "step": 8201 }, { "epoch": 3.1389207807118256, "grad_norm": 0.5198658108711243, "learning_rate": 6.430248937986501e-06, "loss": 0.5923, "step": 8202 }, { "epoch": 3.1393034825870645, "grad_norm": 0.5284652709960938, "learning_rate": 6.427933411809802e-06, "loss": 0.6344, "step": 8203 }, { "epoch": 3.139686184462304, "grad_norm": 0.5279253721237183, "learning_rate": 6.425618105146194e-06, "loss": 0.619, "step": 8204 }, { "epoch": 3.140068886337543, "grad_norm": 0.5607852935791016, "learning_rate": 6.4233030181379585e-06, "loss": 0.633, "step": 8205 }, { "epoch": 3.140451588212782, "grad_norm": 0.552799642086029, "learning_rate": 6.420988150927358e-06, "loss": 0.635, "step": 8206 }, { "epoch": 3.1408342900880215, "grad_norm": 0.5315119624137878, "learning_rate": 6.4186735036566515e-06, "loss": 0.6997, "step": 8207 }, { "epoch": 3.1412169919632604, "grad_norm": 0.5451604127883911, "learning_rate": 6.416359076468079e-06, "loss": 0.6469, "step": 8208 }, { "epoch": 3.1415996938385, "grad_norm": 0.5226156711578369, "learning_rate": 6.41404486950387e-06, "loss": 0.65, "step": 8209 }, { "epoch": 3.141982395713739, "grad_norm": 0.5848897099494934, "learning_rate": 6.411730882906236e-06, "loss": 0.5676, "step": 8210 }, { "epoch": 3.142365097588978, "grad_norm": 0.554896891117096, "learning_rate": 6.409417116817379e-06, "loss": 0.6121, "step": 8211 }, { "epoch": 3.1427477994642175, "grad_norm": 0.5295591950416565, "learning_rate": 6.407103571379491e-06, "loss": 0.6114, "step": 8212 }, { "epoch": 3.1431305013394564, "grad_norm": 0.5155013799667358, "learning_rate": 6.4047902467347355e-06, "loss": 0.6274, "step": 8213 }, { "epoch": 3.1435132032146957, "grad_norm": 0.5282456278800964, "learning_rate": 6.402477143025276e-06, "loss": 0.6292, "step": 8214 }, { "epoch": 3.143895905089935, "grad_norm": 0.576221764087677, "learning_rate": 6.400164260393268e-06, "loss": 0.6477, "step": 8215 }, { "epoch": 3.144278606965174, "grad_norm": 0.5622438788414001, "learning_rate": 6.397851598980832e-06, "loss": 0.6201, "step": 8216 }, { "epoch": 3.1446613088404134, "grad_norm": 0.5505080223083496, "learning_rate": 6.395539158930093e-06, "loss": 0.6267, "step": 8217 }, { "epoch": 3.1450440107156523, "grad_norm": 0.5307260751724243, "learning_rate": 6.393226940383157e-06, "loss": 0.6227, "step": 8218 }, { "epoch": 3.1454267125908917, "grad_norm": 0.5408504009246826, "learning_rate": 6.390914943482114e-06, "loss": 0.6193, "step": 8219 }, { "epoch": 3.145809414466131, "grad_norm": 0.50961834192276, "learning_rate": 6.388603168369043e-06, "loss": 0.5992, "step": 8220 }, { "epoch": 3.14619211634137, "grad_norm": 0.5675598382949829, "learning_rate": 6.386291615186009e-06, "loss": 0.6588, "step": 8221 }, { "epoch": 3.1465748182166093, "grad_norm": 0.566085934638977, "learning_rate": 6.3839802840750686e-06, "loss": 0.6266, "step": 8222 }, { "epoch": 3.1469575200918483, "grad_norm": 0.5589492917060852, "learning_rate": 6.381669175178249e-06, "loss": 0.6532, "step": 8223 }, { "epoch": 3.1473402219670876, "grad_norm": 0.6134365797042847, "learning_rate": 6.37935828863758e-06, "loss": 0.6927, "step": 8224 }, { "epoch": 3.147722923842327, "grad_norm": 0.5823020935058594, "learning_rate": 6.377047624595073e-06, "loss": 0.7135, "step": 8225 }, { "epoch": 3.148105625717566, "grad_norm": 0.6099754571914673, "learning_rate": 6.37473718319272e-06, "loss": 0.5294, "step": 8226 }, { "epoch": 3.1484883275928053, "grad_norm": 0.5616101622581482, "learning_rate": 6.372426964572506e-06, "loss": 0.6811, "step": 8227 }, { "epoch": 3.148871029468044, "grad_norm": 0.6014809012413025, "learning_rate": 6.370116968876403e-06, "loss": 0.5679, "step": 8228 }, { "epoch": 3.1492537313432836, "grad_norm": 0.5414895415306091, "learning_rate": 6.367807196246363e-06, "loss": 0.6765, "step": 8229 }, { "epoch": 3.149636433218523, "grad_norm": 0.5129470229148865, "learning_rate": 6.365497646824327e-06, "loss": 0.6261, "step": 8230 }, { "epoch": 3.150019135093762, "grad_norm": 0.6140220165252686, "learning_rate": 6.363188320752229e-06, "loss": 0.6912, "step": 8231 }, { "epoch": 3.1504018369690012, "grad_norm": 0.5193066596984863, "learning_rate": 6.360879218171976e-06, "loss": 0.6001, "step": 8232 }, { "epoch": 3.15078453884424, "grad_norm": 0.5104652643203735, "learning_rate": 6.35857033922547e-06, "loss": 0.5475, "step": 8233 }, { "epoch": 3.1511672407194795, "grad_norm": 0.55658358335495, "learning_rate": 6.356261684054599e-06, "loss": 0.7382, "step": 8234 }, { "epoch": 3.151549942594719, "grad_norm": 0.5163302421569824, "learning_rate": 6.35395325280124e-06, "loss": 0.6072, "step": 8235 }, { "epoch": 3.151932644469958, "grad_norm": 0.5227415561676025, "learning_rate": 6.3516450456072465e-06, "loss": 0.6291, "step": 8236 }, { "epoch": 3.152315346345197, "grad_norm": 0.5118275880813599, "learning_rate": 6.349337062614464e-06, "loss": 0.5803, "step": 8237 }, { "epoch": 3.152698048220436, "grad_norm": 0.5223990082740784, "learning_rate": 6.347029303964731e-06, "loss": 0.5364, "step": 8238 }, { "epoch": 3.1530807500956755, "grad_norm": 0.5417377352714539, "learning_rate": 6.3447217697998575e-06, "loss": 0.6353, "step": 8239 }, { "epoch": 3.153463451970915, "grad_norm": 0.5333409905433655, "learning_rate": 6.342414460261652e-06, "loss": 0.5824, "step": 8240 }, { "epoch": 3.1538461538461537, "grad_norm": 0.5120039582252502, "learning_rate": 6.340107375491908e-06, "loss": 0.5671, "step": 8241 }, { "epoch": 3.154228855721393, "grad_norm": 0.5275278687477112, "learning_rate": 6.337800515632394e-06, "loss": 0.5925, "step": 8242 }, { "epoch": 3.154611557596632, "grad_norm": 0.5298250913619995, "learning_rate": 6.335493880824877e-06, "loss": 0.6242, "step": 8243 }, { "epoch": 3.1549942594718714, "grad_norm": 0.5581361055374146, "learning_rate": 6.333187471211107e-06, "loss": 0.6665, "step": 8244 }, { "epoch": 3.1553769613471108, "grad_norm": 0.5447404384613037, "learning_rate": 6.330881286932815e-06, "loss": 0.6392, "step": 8245 }, { "epoch": 3.1557596632223497, "grad_norm": 0.5357295274734497, "learning_rate": 6.3285753281317265e-06, "loss": 0.644, "step": 8246 }, { "epoch": 3.156142365097589, "grad_norm": 0.9330081343650818, "learning_rate": 6.326269594949547e-06, "loss": 0.6166, "step": 8247 }, { "epoch": 3.156525066972828, "grad_norm": 0.5298478603363037, "learning_rate": 6.3239640875279715e-06, "loss": 0.6211, "step": 8248 }, { "epoch": 3.1569077688480673, "grad_norm": 0.5437897443771362, "learning_rate": 6.321658806008677e-06, "loss": 0.5617, "step": 8249 }, { "epoch": 3.1572904707233067, "grad_norm": 0.5426121950149536, "learning_rate": 6.319353750533331e-06, "loss": 0.6242, "step": 8250 }, { "epoch": 3.1576731725985456, "grad_norm": 0.5450882911682129, "learning_rate": 6.3170489212435895e-06, "loss": 0.6053, "step": 8251 }, { "epoch": 3.158055874473785, "grad_norm": 0.5725438594818115, "learning_rate": 6.314744318281081e-06, "loss": 0.7067, "step": 8252 }, { "epoch": 3.158438576349024, "grad_norm": 0.532799243927002, "learning_rate": 6.312439941787434e-06, "loss": 0.6308, "step": 8253 }, { "epoch": 3.1588212782242633, "grad_norm": 0.5076584219932556, "learning_rate": 6.310135791904263e-06, "loss": 0.6206, "step": 8254 }, { "epoch": 3.1592039800995027, "grad_norm": 0.6095708608627319, "learning_rate": 6.307831868773158e-06, "loss": 0.6461, "step": 8255 }, { "epoch": 3.1595866819747416, "grad_norm": 0.5190576910972595, "learning_rate": 6.305528172535702e-06, "loss": 0.6461, "step": 8256 }, { "epoch": 3.159969383849981, "grad_norm": 0.5640829801559448, "learning_rate": 6.3032247033334705e-06, "loss": 0.587, "step": 8257 }, { "epoch": 3.16035208572522, "grad_norm": 0.514771044254303, "learning_rate": 6.300921461308007e-06, "loss": 0.614, "step": 8258 }, { "epoch": 3.1607347876004592, "grad_norm": 0.5780130624771118, "learning_rate": 6.298618446600856e-06, "loss": 0.6707, "step": 8259 }, { "epoch": 3.1611174894756986, "grad_norm": 0.5554496049880981, "learning_rate": 6.296315659353547e-06, "loss": 0.6009, "step": 8260 }, { "epoch": 3.1615001913509375, "grad_norm": 0.5517110228538513, "learning_rate": 6.294013099707596e-06, "loss": 0.6322, "step": 8261 }, { "epoch": 3.161882893226177, "grad_norm": 0.5507238507270813, "learning_rate": 6.291710767804491e-06, "loss": 0.6464, "step": 8262 }, { "epoch": 3.162265595101416, "grad_norm": 0.5256161093711853, "learning_rate": 6.289408663785721e-06, "loss": 0.5791, "step": 8263 }, { "epoch": 3.162648296976655, "grad_norm": 0.5714228749275208, "learning_rate": 6.28710678779276e-06, "loss": 0.613, "step": 8264 }, { "epoch": 3.1630309988518945, "grad_norm": 0.5361559987068176, "learning_rate": 6.284805139967059e-06, "loss": 0.5713, "step": 8265 }, { "epoch": 3.1634137007271335, "grad_norm": 0.514602541923523, "learning_rate": 6.282503720450063e-06, "loss": 0.6237, "step": 8266 }, { "epoch": 3.163796402602373, "grad_norm": 0.5033426284790039, "learning_rate": 6.280202529383203e-06, "loss": 0.552, "step": 8267 }, { "epoch": 3.1641791044776117, "grad_norm": 0.5686526298522949, "learning_rate": 6.277901566907889e-06, "loss": 0.5999, "step": 8268 }, { "epoch": 3.164561806352851, "grad_norm": 0.543281614780426, "learning_rate": 6.275600833165521e-06, "loss": 0.6189, "step": 8269 }, { "epoch": 3.1649445082280905, "grad_norm": 0.5076345801353455, "learning_rate": 6.273300328297493e-06, "loss": 0.6082, "step": 8270 }, { "epoch": 3.1653272101033294, "grad_norm": 0.5196027159690857, "learning_rate": 6.271000052445166e-06, "loss": 0.6445, "step": 8271 }, { "epoch": 3.1657099119785688, "grad_norm": 0.6777898669242859, "learning_rate": 6.268700005749906e-06, "loss": 0.638, "step": 8272 }, { "epoch": 3.1660926138538077, "grad_norm": 0.584240734577179, "learning_rate": 6.2664001883530544e-06, "loss": 0.6373, "step": 8273 }, { "epoch": 3.166475315729047, "grad_norm": 0.5309219360351562, "learning_rate": 6.2641006003959425e-06, "loss": 0.6219, "step": 8274 }, { "epoch": 3.1668580176042864, "grad_norm": 0.5326093435287476, "learning_rate": 6.261801242019884e-06, "loss": 0.5546, "step": 8275 }, { "epoch": 3.1672407194795253, "grad_norm": 0.5989718437194824, "learning_rate": 6.25950211336618e-06, "loss": 0.7275, "step": 8276 }, { "epoch": 3.1676234213547647, "grad_norm": 0.5468898415565491, "learning_rate": 6.257203214576126e-06, "loss": 0.5643, "step": 8277 }, { "epoch": 3.1680061232300036, "grad_norm": 0.562836229801178, "learning_rate": 6.2549045457909855e-06, "loss": 0.5996, "step": 8278 }, { "epoch": 3.168388825105243, "grad_norm": 0.5098142623901367, "learning_rate": 6.252606107152021e-06, "loss": 0.6201, "step": 8279 }, { "epoch": 3.1687715269804824, "grad_norm": 0.5223394632339478, "learning_rate": 6.250307898800481e-06, "loss": 0.5788, "step": 8280 }, { "epoch": 3.1691542288557213, "grad_norm": 0.5714342594146729, "learning_rate": 6.248009920877591e-06, "loss": 0.5898, "step": 8281 }, { "epoch": 3.1695369307309607, "grad_norm": 0.5108129382133484, "learning_rate": 6.245712173524572e-06, "loss": 0.6147, "step": 8282 }, { "epoch": 3.1699196326061996, "grad_norm": 0.5622767806053162, "learning_rate": 6.243414656882627e-06, "loss": 0.637, "step": 8283 }, { "epoch": 3.170302334481439, "grad_norm": 0.5628370046615601, "learning_rate": 6.241117371092941e-06, "loss": 0.6652, "step": 8284 }, { "epoch": 3.1706850363566783, "grad_norm": 0.6148889660835266, "learning_rate": 6.2388203162966935e-06, "loss": 0.5302, "step": 8285 }, { "epoch": 3.1710677382319172, "grad_norm": 0.5779745578765869, "learning_rate": 6.2365234926350405e-06, "loss": 0.5979, "step": 8286 }, { "epoch": 3.1714504401071566, "grad_norm": 0.5636951923370361, "learning_rate": 6.234226900249132e-06, "loss": 0.6826, "step": 8287 }, { "epoch": 3.1718331419823955, "grad_norm": 0.5471473932266235, "learning_rate": 6.231930539280095e-06, "loss": 0.6816, "step": 8288 }, { "epoch": 3.172215843857635, "grad_norm": 0.5098207592964172, "learning_rate": 6.2296344098690476e-06, "loss": 0.6532, "step": 8289 }, { "epoch": 3.1725985457328743, "grad_norm": 0.5313097834587097, "learning_rate": 6.227338512157097e-06, "loss": 0.6378, "step": 8290 }, { "epoch": 3.172981247608113, "grad_norm": 0.5621312856674194, "learning_rate": 6.225042846285329e-06, "loss": 0.6537, "step": 8291 }, { "epoch": 3.1733639494833525, "grad_norm": 0.5653220415115356, "learning_rate": 6.222747412394821e-06, "loss": 0.6418, "step": 8292 }, { "epoch": 3.1737466513585915, "grad_norm": 0.5703030228614807, "learning_rate": 6.220452210626631e-06, "loss": 0.5772, "step": 8293 }, { "epoch": 3.174129353233831, "grad_norm": 0.5356035232543945, "learning_rate": 6.218157241121805e-06, "loss": 0.5496, "step": 8294 }, { "epoch": 3.17451205510907, "grad_norm": 0.5323372483253479, "learning_rate": 6.215862504021377e-06, "loss": 0.699, "step": 8295 }, { "epoch": 3.174894756984309, "grad_norm": 0.49777036905288696, "learning_rate": 6.213567999466369e-06, "loss": 0.5864, "step": 8296 }, { "epoch": 3.1752774588595485, "grad_norm": 0.5888774394989014, "learning_rate": 6.211273727597775e-06, "loss": 0.5947, "step": 8297 }, { "epoch": 3.1756601607347874, "grad_norm": 0.542488157749176, "learning_rate": 6.208979688556589e-06, "loss": 0.6687, "step": 8298 }, { "epoch": 3.1760428626100268, "grad_norm": 0.6458394527435303, "learning_rate": 6.206685882483785e-06, "loss": 0.5325, "step": 8299 }, { "epoch": 3.176425564485266, "grad_norm": 0.5471669435501099, "learning_rate": 6.204392309520327e-06, "loss": 0.5537, "step": 8300 }, { "epoch": 3.176808266360505, "grad_norm": 0.5094649791717529, "learning_rate": 6.202098969807156e-06, "loss": 0.6395, "step": 8301 }, { "epoch": 3.1771909682357444, "grad_norm": 0.4995215833187103, "learning_rate": 6.199805863485205e-06, "loss": 0.6196, "step": 8302 }, { "epoch": 3.1775736701109834, "grad_norm": 0.5940017700195312, "learning_rate": 6.1975129906953965e-06, "loss": 0.5537, "step": 8303 }, { "epoch": 3.1779563719862227, "grad_norm": 0.5466468334197998, "learning_rate": 6.195220351578628e-06, "loss": 0.6201, "step": 8304 }, { "epoch": 3.178339073861462, "grad_norm": 0.5531811714172363, "learning_rate": 6.1929279462757905e-06, "loss": 0.595, "step": 8305 }, { "epoch": 3.178721775736701, "grad_norm": 0.6358805298805237, "learning_rate": 6.190635774927762e-06, "loss": 0.6423, "step": 8306 }, { "epoch": 3.1791044776119404, "grad_norm": 0.5168138146400452, "learning_rate": 6.188343837675395e-06, "loss": 0.6098, "step": 8307 }, { "epoch": 3.1794871794871793, "grad_norm": 0.5726544857025146, "learning_rate": 6.186052134659539e-06, "loss": 0.6047, "step": 8308 }, { "epoch": 3.1798698813624187, "grad_norm": 0.5329588651657104, "learning_rate": 6.183760666021028e-06, "loss": 0.5873, "step": 8309 }, { "epoch": 3.180252583237658, "grad_norm": 0.5229746699333191, "learning_rate": 6.181469431900673e-06, "loss": 0.6588, "step": 8310 }, { "epoch": 3.180635285112897, "grad_norm": 0.5792624354362488, "learning_rate": 6.179178432439281e-06, "loss": 0.7057, "step": 8311 }, { "epoch": 3.1810179869881363, "grad_norm": 0.5730105638504028, "learning_rate": 6.176887667777639e-06, "loss": 0.6416, "step": 8312 }, { "epoch": 3.1814006888633752, "grad_norm": 0.5245574116706848, "learning_rate": 6.174597138056524e-06, "loss": 0.6007, "step": 8313 }, { "epoch": 3.1817833907386146, "grad_norm": 0.5177772045135498, "learning_rate": 6.1723068434166865e-06, "loss": 0.6766, "step": 8314 }, { "epoch": 3.182166092613854, "grad_norm": 0.5633239150047302, "learning_rate": 6.170016783998878e-06, "loss": 0.631, "step": 8315 }, { "epoch": 3.182548794489093, "grad_norm": 0.5060051083564758, "learning_rate": 6.167726959943831e-06, "loss": 0.6765, "step": 8316 }, { "epoch": 3.1829314963643323, "grad_norm": 0.5378186106681824, "learning_rate": 6.165437371392255e-06, "loss": 0.6728, "step": 8317 }, { "epoch": 3.183314198239571, "grad_norm": 0.5500878095626831, "learning_rate": 6.1631480184848525e-06, "loss": 0.6241, "step": 8318 }, { "epoch": 3.1836969001148105, "grad_norm": 0.4832990765571594, "learning_rate": 6.160858901362315e-06, "loss": 0.5877, "step": 8319 }, { "epoch": 3.18407960199005, "grad_norm": 0.5553619861602783, "learning_rate": 6.1585700201653085e-06, "loss": 0.6211, "step": 8320 }, { "epoch": 3.184462303865289, "grad_norm": 0.5637766718864441, "learning_rate": 6.156281375034495e-06, "loss": 0.5828, "step": 8321 }, { "epoch": 3.184845005740528, "grad_norm": 0.5953965783119202, "learning_rate": 6.1539929661105225e-06, "loss": 0.5811, "step": 8322 }, { "epoch": 3.185227707615767, "grad_norm": 0.5717920660972595, "learning_rate": 6.151704793534008e-06, "loss": 0.6314, "step": 8323 }, { "epoch": 3.1856104094910065, "grad_norm": 0.593540370464325, "learning_rate": 6.149416857445574e-06, "loss": 0.5968, "step": 8324 }, { "epoch": 3.185993111366246, "grad_norm": 0.5078893899917603, "learning_rate": 6.147129157985821e-06, "loss": 0.6184, "step": 8325 }, { "epoch": 3.1863758132414848, "grad_norm": 0.484758585691452, "learning_rate": 6.144841695295328e-06, "loss": 0.6141, "step": 8326 }, { "epoch": 3.186758515116724, "grad_norm": 0.5149199366569519, "learning_rate": 6.142554469514671e-06, "loss": 0.6699, "step": 8327 }, { "epoch": 3.187141216991963, "grad_norm": 0.5091167092323303, "learning_rate": 6.140267480784405e-06, "loss": 0.5444, "step": 8328 }, { "epoch": 3.1875239188672024, "grad_norm": 0.7698819041252136, "learning_rate": 6.137980729245072e-06, "loss": 0.6969, "step": 8329 }, { "epoch": 3.187906620742442, "grad_norm": 0.5102490782737732, "learning_rate": 6.135694215037198e-06, "loss": 0.6019, "step": 8330 }, { "epoch": 3.1882893226176807, "grad_norm": 0.5640191435813904, "learning_rate": 6.133407938301296e-06, "loss": 0.6291, "step": 8331 }, { "epoch": 3.18867202449292, "grad_norm": 0.539298415184021, "learning_rate": 6.131121899177868e-06, "loss": 0.5603, "step": 8332 }, { "epoch": 3.189054726368159, "grad_norm": 0.5384029746055603, "learning_rate": 6.128836097807387e-06, "loss": 0.6629, "step": 8333 }, { "epoch": 3.1894374282433984, "grad_norm": 0.5805519819259644, "learning_rate": 6.126550534330329e-06, "loss": 0.6114, "step": 8334 }, { "epoch": 3.1898201301186377, "grad_norm": 0.5919396281242371, "learning_rate": 6.1242652088871475e-06, "loss": 0.6751, "step": 8335 }, { "epoch": 3.1902028319938767, "grad_norm": 0.5806111693382263, "learning_rate": 6.121980121618281e-06, "loss": 0.6384, "step": 8336 }, { "epoch": 3.190585533869116, "grad_norm": 0.5657544136047363, "learning_rate": 6.119695272664153e-06, "loss": 0.6467, "step": 8337 }, { "epoch": 3.190968235744355, "grad_norm": 0.5432100892066956, "learning_rate": 6.117410662165177e-06, "loss": 0.6893, "step": 8338 }, { "epoch": 3.1913509376195943, "grad_norm": 0.6261433959007263, "learning_rate": 6.115126290261746e-06, "loss": 0.6054, "step": 8339 }, { "epoch": 3.1917336394948337, "grad_norm": 0.5320110321044922, "learning_rate": 6.112842157094239e-06, "loss": 0.5319, "step": 8340 }, { "epoch": 3.1921163413700726, "grad_norm": 0.5801515579223633, "learning_rate": 6.1105582628030245e-06, "loss": 0.6343, "step": 8341 }, { "epoch": 3.192499043245312, "grad_norm": 0.5325592756271362, "learning_rate": 6.108274607528458e-06, "loss": 0.6336, "step": 8342 }, { "epoch": 3.192881745120551, "grad_norm": 0.7406520843505859, "learning_rate": 6.105991191410868e-06, "loss": 0.6949, "step": 8343 }, { "epoch": 3.1932644469957903, "grad_norm": 0.5443252325057983, "learning_rate": 6.103708014590581e-06, "loss": 0.6188, "step": 8344 }, { "epoch": 3.1936471488710296, "grad_norm": 0.5518866777420044, "learning_rate": 6.101425077207905e-06, "loss": 0.6509, "step": 8345 }, { "epoch": 3.1940298507462686, "grad_norm": 0.5202893018722534, "learning_rate": 6.099142379403131e-06, "loss": 0.6031, "step": 8346 }, { "epoch": 3.194412552621508, "grad_norm": 0.5333220362663269, "learning_rate": 6.096859921316536e-06, "loss": 0.646, "step": 8347 }, { "epoch": 3.194795254496747, "grad_norm": 0.539306640625, "learning_rate": 6.094577703088387e-06, "loss": 0.6666, "step": 8348 }, { "epoch": 3.195177956371986, "grad_norm": 0.5075886249542236, "learning_rate": 6.092295724858927e-06, "loss": 0.5157, "step": 8349 }, { "epoch": 3.1955606582472256, "grad_norm": 0.5515849590301514, "learning_rate": 6.090013986768395e-06, "loss": 0.7061, "step": 8350 }, { "epoch": 3.1959433601224645, "grad_norm": 0.529930055141449, "learning_rate": 6.08773248895701e-06, "loss": 0.5586, "step": 8351 }, { "epoch": 3.196326061997704, "grad_norm": 0.5522654056549072, "learning_rate": 6.0854512315649695e-06, "loss": 0.6031, "step": 8352 }, { "epoch": 3.196708763872943, "grad_norm": 0.5736112594604492, "learning_rate": 6.083170214732468e-06, "loss": 0.6902, "step": 8353 }, { "epoch": 3.197091465748182, "grad_norm": 0.5534785389900208, "learning_rate": 6.0808894385996795e-06, "loss": 0.6112, "step": 8354 }, { "epoch": 3.1974741676234215, "grad_norm": 0.6228262186050415, "learning_rate": 6.078608903306765e-06, "loss": 0.6904, "step": 8355 }, { "epoch": 3.1978568694986604, "grad_norm": 0.545413076877594, "learning_rate": 6.076328608993866e-06, "loss": 0.5414, "step": 8356 }, { "epoch": 3.1982395713739, "grad_norm": 0.5171567797660828, "learning_rate": 6.074048555801115e-06, "loss": 0.5602, "step": 8357 }, { "epoch": 3.1986222732491387, "grad_norm": 0.49978893995285034, "learning_rate": 6.071768743868632e-06, "loss": 0.5757, "step": 8358 }, { "epoch": 3.199004975124378, "grad_norm": 0.5463864207267761, "learning_rate": 6.069489173336507e-06, "loss": 0.6746, "step": 8359 }, { "epoch": 3.1993876769996175, "grad_norm": 0.5503717064857483, "learning_rate": 6.067209844344832e-06, "loss": 0.5642, "step": 8360 }, { "epoch": 3.1997703788748564, "grad_norm": 0.5127408504486084, "learning_rate": 6.064930757033684e-06, "loss": 0.5982, "step": 8361 }, { "epoch": 3.2001530807500957, "grad_norm": 0.5138292908668518, "learning_rate": 6.062651911543107e-06, "loss": 0.5426, "step": 8362 }, { "epoch": 3.2005357826253347, "grad_norm": 0.5693392753601074, "learning_rate": 6.060373308013147e-06, "loss": 0.599, "step": 8363 }, { "epoch": 3.200918484500574, "grad_norm": 0.5480133891105652, "learning_rate": 6.058094946583835e-06, "loss": 0.6313, "step": 8364 }, { "epoch": 3.2013011863758134, "grad_norm": 0.5571325421333313, "learning_rate": 6.055816827395174e-06, "loss": 0.6173, "step": 8365 }, { "epoch": 3.2016838882510523, "grad_norm": 0.5702278017997742, "learning_rate": 6.0535389505871656e-06, "loss": 0.5943, "step": 8366 }, { "epoch": 3.2020665901262917, "grad_norm": 0.5776106715202332, "learning_rate": 6.051261316299793e-06, "loss": 0.6442, "step": 8367 }, { "epoch": 3.2024492920015306, "grad_norm": 0.5381090641021729, "learning_rate": 6.048983924673022e-06, "loss": 0.6705, "step": 8368 }, { "epoch": 3.20283199387677, "grad_norm": 1.1986337900161743, "learning_rate": 6.046706775846798e-06, "loss": 0.6178, "step": 8369 }, { "epoch": 3.2032146957520093, "grad_norm": 0.5419399738311768, "learning_rate": 6.044429869961066e-06, "loss": 0.6549, "step": 8370 }, { "epoch": 3.2035973976272483, "grad_norm": 0.5762174725532532, "learning_rate": 6.042153207155748e-06, "loss": 0.6329, "step": 8371 }, { "epoch": 3.2039800995024876, "grad_norm": 0.6185474991798401, "learning_rate": 6.0398767875707445e-06, "loss": 0.6405, "step": 8372 }, { "epoch": 3.2043628013777266, "grad_norm": 0.5643362402915955, "learning_rate": 6.0376006113459505e-06, "loss": 0.5741, "step": 8373 }, { "epoch": 3.204745503252966, "grad_norm": 0.507834255695343, "learning_rate": 6.035324678621248e-06, "loss": 0.5484, "step": 8374 }, { "epoch": 3.2051282051282053, "grad_norm": 0.5751644968986511, "learning_rate": 6.033048989536492e-06, "loss": 0.6569, "step": 8375 }, { "epoch": 3.205510907003444, "grad_norm": 0.5705971121788025, "learning_rate": 6.030773544231532e-06, "loss": 0.6963, "step": 8376 }, { "epoch": 3.2058936088786836, "grad_norm": 0.5750172734260559, "learning_rate": 6.0284983428462055e-06, "loss": 0.66, "step": 8377 }, { "epoch": 3.2062763107539225, "grad_norm": 0.5618394017219543, "learning_rate": 6.026223385520321e-06, "loss": 0.6172, "step": 8378 }, { "epoch": 3.206659012629162, "grad_norm": 0.5697727203369141, "learning_rate": 6.023948672393684e-06, "loss": 0.5532, "step": 8379 }, { "epoch": 3.2070417145044012, "grad_norm": 0.5404154062271118, "learning_rate": 6.0216742036060826e-06, "loss": 0.6505, "step": 8380 }, { "epoch": 3.20742441637964, "grad_norm": 0.5502350926399231, "learning_rate": 6.0193999792972945e-06, "loss": 0.5874, "step": 8381 }, { "epoch": 3.2078071182548795, "grad_norm": 0.5777880549430847, "learning_rate": 6.017125999607067e-06, "loss": 0.6326, "step": 8382 }, { "epoch": 3.2081898201301184, "grad_norm": 0.49483171105384827, "learning_rate": 6.014852264675148e-06, "loss": 0.589, "step": 8383 }, { "epoch": 3.208572522005358, "grad_norm": 0.506339430809021, "learning_rate": 6.012578774641264e-06, "loss": 0.5897, "step": 8384 }, { "epoch": 3.208955223880597, "grad_norm": 0.5144838094711304, "learning_rate": 6.0103055296451265e-06, "loss": 0.5946, "step": 8385 }, { "epoch": 3.209337925755836, "grad_norm": 0.5311105251312256, "learning_rate": 6.008032529826431e-06, "loss": 0.6704, "step": 8386 }, { "epoch": 3.2097206276310755, "grad_norm": 0.5478817224502563, "learning_rate": 6.005759775324864e-06, "loss": 0.6349, "step": 8387 }, { "epoch": 3.2101033295063144, "grad_norm": 0.5506625771522522, "learning_rate": 6.003487266280086e-06, "loss": 0.6213, "step": 8388 }, { "epoch": 3.2104860313815538, "grad_norm": 0.5803205966949463, "learning_rate": 6.001215002831753e-06, "loss": 0.6737, "step": 8389 }, { "epoch": 3.210868733256793, "grad_norm": 0.5314289331436157, "learning_rate": 5.998942985119503e-06, "loss": 0.6231, "step": 8390 }, { "epoch": 3.211251435132032, "grad_norm": 0.5891969203948975, "learning_rate": 5.9966712132829515e-06, "loss": 0.6406, "step": 8391 }, { "epoch": 3.2116341370072714, "grad_norm": 0.6041547060012817, "learning_rate": 5.994399687461709e-06, "loss": 0.6482, "step": 8392 }, { "epoch": 3.2120168388825103, "grad_norm": 0.5743964910507202, "learning_rate": 5.992128407795368e-06, "loss": 0.6147, "step": 8393 }, { "epoch": 3.2123995407577497, "grad_norm": 0.5388651490211487, "learning_rate": 5.989857374423504e-06, "loss": 0.6899, "step": 8394 }, { "epoch": 3.212782242632989, "grad_norm": 0.5001206994056702, "learning_rate": 5.9875865874856775e-06, "loss": 0.6559, "step": 8395 }, { "epoch": 3.213164944508228, "grad_norm": 0.5604224801063538, "learning_rate": 5.9853160471214344e-06, "loss": 0.6596, "step": 8396 }, { "epoch": 3.2135476463834673, "grad_norm": 0.526123583316803, "learning_rate": 5.983045753470308e-06, "loss": 0.6471, "step": 8397 }, { "epoch": 3.2139303482587063, "grad_norm": 0.5591220259666443, "learning_rate": 5.980775706671808e-06, "loss": 0.5581, "step": 8398 }, { "epoch": 3.2143130501339456, "grad_norm": 0.6206357479095459, "learning_rate": 5.9785059068654395e-06, "loss": 0.5491, "step": 8399 }, { "epoch": 3.214695752009185, "grad_norm": 0.5526155829429626, "learning_rate": 5.9762363541906875e-06, "loss": 0.6791, "step": 8400 }, { "epoch": 3.215078453884424, "grad_norm": 0.5626634359359741, "learning_rate": 5.97396704878702e-06, "loss": 0.677, "step": 8401 }, { "epoch": 3.2154611557596633, "grad_norm": 0.5236843824386597, "learning_rate": 5.971697990793892e-06, "loss": 0.6487, "step": 8402 }, { "epoch": 3.215843857634902, "grad_norm": 0.5351406931877136, "learning_rate": 5.969429180350747e-06, "loss": 0.5796, "step": 8403 }, { "epoch": 3.2162265595101416, "grad_norm": 0.5564992427825928, "learning_rate": 5.9671606175970074e-06, "loss": 0.6529, "step": 8404 }, { "epoch": 3.216609261385381, "grad_norm": 0.5604808330535889, "learning_rate": 5.96489230267208e-06, "loss": 0.5994, "step": 8405 }, { "epoch": 3.21699196326062, "grad_norm": 0.5029141306877136, "learning_rate": 5.962624235715359e-06, "loss": 0.5907, "step": 8406 }, { "epoch": 3.2173746651358592, "grad_norm": 0.5468887686729431, "learning_rate": 5.9603564168662335e-06, "loss": 0.5716, "step": 8407 }, { "epoch": 3.217757367011098, "grad_norm": 0.5240965485572815, "learning_rate": 5.958088846264052e-06, "loss": 0.593, "step": 8408 }, { "epoch": 3.2181400688863375, "grad_norm": 0.5367223620414734, "learning_rate": 5.955821524048169e-06, "loss": 0.5988, "step": 8409 }, { "epoch": 3.218522770761577, "grad_norm": 0.5377628207206726, "learning_rate": 5.953554450357921e-06, "loss": 0.5692, "step": 8410 }, { "epoch": 3.218905472636816, "grad_norm": 0.5263607501983643, "learning_rate": 5.95128762533262e-06, "loss": 0.5964, "step": 8411 }, { "epoch": 3.219288174512055, "grad_norm": 0.5379920601844788, "learning_rate": 5.949021049111572e-06, "loss": 0.6105, "step": 8412 }, { "epoch": 3.219670876387294, "grad_norm": 0.5488507747650146, "learning_rate": 5.946754721834067e-06, "loss": 0.6947, "step": 8413 }, { "epoch": 3.2200535782625335, "grad_norm": 0.600835919380188, "learning_rate": 5.944488643639367e-06, "loss": 0.6222, "step": 8414 }, { "epoch": 3.220436280137773, "grad_norm": 0.5502333045005798, "learning_rate": 5.942222814666737e-06, "loss": 0.6128, "step": 8415 }, { "epoch": 3.2208189820130118, "grad_norm": 0.5495694875717163, "learning_rate": 5.93995723505542e-06, "loss": 0.588, "step": 8416 }, { "epoch": 3.221201683888251, "grad_norm": 0.5102148652076721, "learning_rate": 5.937691904944636e-06, "loss": 0.6465, "step": 8417 }, { "epoch": 3.22158438576349, "grad_norm": 0.5718217492103577, "learning_rate": 5.9354268244735955e-06, "loss": 0.6601, "step": 8418 }, { "epoch": 3.2219670876387294, "grad_norm": 0.504006028175354, "learning_rate": 5.933161993781498e-06, "loss": 0.5806, "step": 8419 }, { "epoch": 3.2223497895139688, "grad_norm": 0.5338902473449707, "learning_rate": 5.930897413007523e-06, "loss": 0.6564, "step": 8420 }, { "epoch": 3.2227324913892077, "grad_norm": 0.5746241807937622, "learning_rate": 5.928633082290831e-06, "loss": 0.7118, "step": 8421 }, { "epoch": 3.223115193264447, "grad_norm": 0.568056046962738, "learning_rate": 5.926369001770573e-06, "loss": 0.5721, "step": 8422 }, { "epoch": 3.223497895139686, "grad_norm": 0.521208643913269, "learning_rate": 5.92410517158589e-06, "loss": 0.6543, "step": 8423 }, { "epoch": 3.2238805970149254, "grad_norm": 0.5812993049621582, "learning_rate": 5.9218415918758875e-06, "loss": 0.5894, "step": 8424 }, { "epoch": 3.2242632988901647, "grad_norm": 0.6487745642662048, "learning_rate": 5.919578262779675e-06, "loss": 0.6767, "step": 8425 }, { "epoch": 3.2246460007654036, "grad_norm": 0.6026182174682617, "learning_rate": 5.917315184436345e-06, "loss": 0.6186, "step": 8426 }, { "epoch": 3.225028702640643, "grad_norm": 0.5524078011512756, "learning_rate": 5.915052356984962e-06, "loss": 0.6569, "step": 8427 }, { "epoch": 3.225411404515882, "grad_norm": 0.5488671660423279, "learning_rate": 5.912789780564585e-06, "loss": 0.6587, "step": 8428 }, { "epoch": 3.2257941063911213, "grad_norm": 0.5110180974006653, "learning_rate": 5.910527455314259e-06, "loss": 0.6322, "step": 8429 }, { "epoch": 3.2261768082663607, "grad_norm": 0.6000036001205444, "learning_rate": 5.908265381373004e-06, "loss": 0.5938, "step": 8430 }, { "epoch": 3.2265595101415996, "grad_norm": 0.5323085188865662, "learning_rate": 5.906003558879835e-06, "loss": 0.5559, "step": 8431 }, { "epoch": 3.226942212016839, "grad_norm": 0.5393365621566772, "learning_rate": 5.9037419879737455e-06, "loss": 0.6143, "step": 8432 }, { "epoch": 3.227324913892078, "grad_norm": 0.5470940470695496, "learning_rate": 5.901480668793719e-06, "loss": 0.6079, "step": 8433 }, { "epoch": 3.2277076157673172, "grad_norm": 0.5702247619628906, "learning_rate": 5.899219601478712e-06, "loss": 0.6437, "step": 8434 }, { "epoch": 3.2280903176425566, "grad_norm": 0.5387771725654602, "learning_rate": 5.896958786167675e-06, "loss": 0.6615, "step": 8435 }, { "epoch": 3.2284730195177955, "grad_norm": 0.5425986051559448, "learning_rate": 5.89469822299955e-06, "loss": 0.574, "step": 8436 }, { "epoch": 3.228855721393035, "grad_norm": 0.5389093160629272, "learning_rate": 5.892437912113243e-06, "loss": 0.6998, "step": 8437 }, { "epoch": 3.229238423268274, "grad_norm": 0.512358546257019, "learning_rate": 5.8901778536476625e-06, "loss": 0.7234, "step": 8438 }, { "epoch": 3.229621125143513, "grad_norm": 0.5928404927253723, "learning_rate": 5.887918047741697e-06, "loss": 0.6126, "step": 8439 }, { "epoch": 3.2300038270187525, "grad_norm": 0.5901973843574524, "learning_rate": 5.88565849453421e-06, "loss": 0.614, "step": 8440 }, { "epoch": 3.2303865288939915, "grad_norm": 0.5160195827484131, "learning_rate": 5.883399194164062e-06, "loss": 0.6021, "step": 8441 }, { "epoch": 3.230769230769231, "grad_norm": 0.5078070759773254, "learning_rate": 5.881140146770097e-06, "loss": 0.5649, "step": 8442 }, { "epoch": 3.2311519326444698, "grad_norm": 0.5493196249008179, "learning_rate": 5.8788813524911324e-06, "loss": 0.6414, "step": 8443 }, { "epoch": 3.231534634519709, "grad_norm": 0.5430728793144226, "learning_rate": 5.8766228114659795e-06, "loss": 0.6361, "step": 8444 }, { "epoch": 3.2319173363949485, "grad_norm": 0.5482619404792786, "learning_rate": 5.8743645238334315e-06, "loss": 0.6296, "step": 8445 }, { "epoch": 3.2323000382701874, "grad_norm": 0.5543866157531738, "learning_rate": 5.8721064897322695e-06, "loss": 0.5878, "step": 8446 }, { "epoch": 3.232682740145427, "grad_norm": 0.6078193783760071, "learning_rate": 5.869848709301251e-06, "loss": 0.6562, "step": 8447 }, { "epoch": 3.2330654420206657, "grad_norm": 0.6022229194641113, "learning_rate": 5.8675911826791255e-06, "loss": 0.6437, "step": 8448 }, { "epoch": 3.233448143895905, "grad_norm": 0.5250568985939026, "learning_rate": 5.865333910004625e-06, "loss": 0.5993, "step": 8449 }, { "epoch": 3.2338308457711444, "grad_norm": 0.5193833112716675, "learning_rate": 5.8630768914164615e-06, "loss": 0.6029, "step": 8450 }, { "epoch": 3.2342135476463834, "grad_norm": 0.5271419882774353, "learning_rate": 5.860820127053337e-06, "loss": 0.6232, "step": 8451 }, { "epoch": 3.2345962495216227, "grad_norm": 0.557089626789093, "learning_rate": 5.85856361705394e-06, "loss": 0.6257, "step": 8452 }, { "epoch": 3.2349789513968616, "grad_norm": 0.5218263268470764, "learning_rate": 5.85630736155693e-06, "loss": 0.6169, "step": 8453 }, { "epoch": 3.235361653272101, "grad_norm": 0.5085397958755493, "learning_rate": 5.854051360700964e-06, "loss": 0.597, "step": 8454 }, { "epoch": 3.2357443551473404, "grad_norm": 0.5236576199531555, "learning_rate": 5.8517956146246826e-06, "loss": 0.6991, "step": 8455 }, { "epoch": 3.2361270570225793, "grad_norm": 0.6388953924179077, "learning_rate": 5.849540123466704e-06, "loss": 0.6698, "step": 8456 }, { "epoch": 3.2365097588978187, "grad_norm": 0.5618085861206055, "learning_rate": 5.847284887365633e-06, "loss": 0.6048, "step": 8457 }, { "epoch": 3.2368924607730576, "grad_norm": 0.6182373762130737, "learning_rate": 5.845029906460062e-06, "loss": 0.5942, "step": 8458 }, { "epoch": 3.237275162648297, "grad_norm": 0.5135549902915955, "learning_rate": 5.842775180888569e-06, "loss": 0.6034, "step": 8459 }, { "epoch": 3.2376578645235363, "grad_norm": 0.5589584708213806, "learning_rate": 5.840520710789707e-06, "loss": 0.6996, "step": 8460 }, { "epoch": 3.2380405663987752, "grad_norm": 0.5281846523284912, "learning_rate": 5.838266496302023e-06, "loss": 0.623, "step": 8461 }, { "epoch": 3.2384232682740146, "grad_norm": 0.547559916973114, "learning_rate": 5.836012537564042e-06, "loss": 0.6504, "step": 8462 }, { "epoch": 3.2388059701492535, "grad_norm": 0.5494623780250549, "learning_rate": 5.833758834714277e-06, "loss": 0.6703, "step": 8463 }, { "epoch": 3.239188672024493, "grad_norm": 0.5482773780822754, "learning_rate": 5.831505387891223e-06, "loss": 0.6574, "step": 8464 }, { "epoch": 3.2395713738997323, "grad_norm": 0.5580146312713623, "learning_rate": 5.829252197233367e-06, "loss": 0.6233, "step": 8465 }, { "epoch": 3.239954075774971, "grad_norm": 0.5461364388465881, "learning_rate": 5.826999262879164e-06, "loss": 0.6503, "step": 8466 }, { "epoch": 3.2403367776502106, "grad_norm": 0.5434088706970215, "learning_rate": 5.824746584967067e-06, "loss": 0.6084, "step": 8467 }, { "epoch": 3.2407194795254495, "grad_norm": 0.5344303846359253, "learning_rate": 5.8224941636355126e-06, "loss": 0.6921, "step": 8468 }, { "epoch": 3.241102181400689, "grad_norm": 0.5233600735664368, "learning_rate": 5.820241999022911e-06, "loss": 0.5911, "step": 8469 }, { "epoch": 3.241484883275928, "grad_norm": 0.49407970905303955, "learning_rate": 5.817990091267666e-06, "loss": 0.6628, "step": 8470 }, { "epoch": 3.241867585151167, "grad_norm": 0.5590240955352783, "learning_rate": 5.815738440508167e-06, "loss": 0.6588, "step": 8471 }, { "epoch": 3.2422502870264065, "grad_norm": 0.5356194376945496, "learning_rate": 5.813487046882782e-06, "loss": 0.6302, "step": 8472 }, { "epoch": 3.2426329889016454, "grad_norm": 0.5345044136047363, "learning_rate": 5.8112359105298636e-06, "loss": 0.6102, "step": 8473 }, { "epoch": 3.243015690776885, "grad_norm": 0.5244908928871155, "learning_rate": 5.808985031587751e-06, "loss": 0.6666, "step": 8474 }, { "epoch": 3.243398392652124, "grad_norm": 0.5815526843070984, "learning_rate": 5.806734410194772e-06, "loss": 0.6009, "step": 8475 }, { "epoch": 3.243781094527363, "grad_norm": 0.5292102098464966, "learning_rate": 5.804484046489225e-06, "loss": 0.6223, "step": 8476 }, { "epoch": 3.2441637964026024, "grad_norm": 0.549006998538971, "learning_rate": 5.8022339406094055e-06, "loss": 0.6525, "step": 8477 }, { "epoch": 3.2445464982778414, "grad_norm": 0.6051764488220215, "learning_rate": 5.79998409269359e-06, "loss": 0.6609, "step": 8478 }, { "epoch": 3.2449292001530807, "grad_norm": 0.5054174661636353, "learning_rate": 5.797734502880032e-06, "loss": 0.5945, "step": 8479 }, { "epoch": 3.24531190202832, "grad_norm": 0.5722002387046814, "learning_rate": 5.795485171306978e-06, "loss": 0.5963, "step": 8480 }, { "epoch": 3.245694603903559, "grad_norm": 0.520666778087616, "learning_rate": 5.7932360981126555e-06, "loss": 0.5832, "step": 8481 }, { "epoch": 3.2460773057787984, "grad_norm": 0.5688790082931519, "learning_rate": 5.7909872834352765e-06, "loss": 0.6993, "step": 8482 }, { "epoch": 3.2464600076540373, "grad_norm": 0.5182526707649231, "learning_rate": 5.788738727413036e-06, "loss": 0.553, "step": 8483 }, { "epoch": 3.2468427095292767, "grad_norm": 0.5634012222290039, "learning_rate": 5.786490430184115e-06, "loss": 0.6126, "step": 8484 }, { "epoch": 3.247225411404516, "grad_norm": 0.5876265168190002, "learning_rate": 5.78424239188668e-06, "loss": 0.6465, "step": 8485 }, { "epoch": 3.247608113279755, "grad_norm": 0.536705493927002, "learning_rate": 5.781994612658871e-06, "loss": 0.5865, "step": 8486 }, { "epoch": 3.2479908151549943, "grad_norm": 0.5617520809173584, "learning_rate": 5.7797470926388255e-06, "loss": 0.6817, "step": 8487 }, { "epoch": 3.2483735170302332, "grad_norm": 0.5134342908859253, "learning_rate": 5.777499831964662e-06, "loss": 0.6017, "step": 8488 }, { "epoch": 3.2487562189054726, "grad_norm": 0.5423100590705872, "learning_rate": 5.775252830774475e-06, "loss": 0.6293, "step": 8489 }, { "epoch": 3.249138920780712, "grad_norm": 0.5534109473228455, "learning_rate": 5.77300608920635e-06, "loss": 0.6124, "step": 8490 }, { "epoch": 3.249521622655951, "grad_norm": 0.5393233299255371, "learning_rate": 5.770759607398362e-06, "loss": 0.6702, "step": 8491 }, { "epoch": 3.2499043245311903, "grad_norm": 0.6125256419181824, "learning_rate": 5.768513385488548e-06, "loss": 0.6424, "step": 8492 }, { "epoch": 3.250287026406429, "grad_norm": 0.5457475185394287, "learning_rate": 5.76626742361496e-06, "loss": 0.6073, "step": 8493 }, { "epoch": 3.2506697282816686, "grad_norm": 0.5863997936248779, "learning_rate": 5.764021721915616e-06, "loss": 0.6011, "step": 8494 }, { "epoch": 3.251052430156908, "grad_norm": 0.5576493144035339, "learning_rate": 5.761776280528513e-06, "loss": 0.6535, "step": 8495 }, { "epoch": 3.251435132032147, "grad_norm": 0.6027393341064453, "learning_rate": 5.759531099591643e-06, "loss": 0.6548, "step": 8496 }, { "epoch": 3.251817833907386, "grad_norm": 0.5454468727111816, "learning_rate": 5.757286179242981e-06, "loss": 0.6273, "step": 8497 }, { "epoch": 3.2522005357826256, "grad_norm": 0.5817806124687195, "learning_rate": 5.755041519620483e-06, "loss": 0.6611, "step": 8498 }, { "epoch": 3.2525832376578645, "grad_norm": 0.5548540353775024, "learning_rate": 5.752797120862084e-06, "loss": 0.6077, "step": 8499 }, { "epoch": 3.252965939533104, "grad_norm": 0.5241988897323608, "learning_rate": 5.750552983105713e-06, "loss": 0.5258, "step": 8500 }, { "epoch": 3.253348641408343, "grad_norm": 0.5988390445709229, "learning_rate": 5.74830910648928e-06, "loss": 0.6288, "step": 8501 }, { "epoch": 3.253731343283582, "grad_norm": 0.53080815076828, "learning_rate": 5.746065491150668e-06, "loss": 0.6334, "step": 8502 }, { "epoch": 3.254114045158821, "grad_norm": 0.5181780457496643, "learning_rate": 5.743822137227762e-06, "loss": 0.6535, "step": 8503 }, { "epoch": 3.2544967470340604, "grad_norm": 2.0192160606384277, "learning_rate": 5.7415790448584256e-06, "loss": 0.5608, "step": 8504 }, { "epoch": 3.2548794489093, "grad_norm": 0.5829670429229736, "learning_rate": 5.739336214180492e-06, "loss": 0.5753, "step": 8505 }, { "epoch": 3.2552621507845387, "grad_norm": 0.48352015018463135, "learning_rate": 5.737093645331795e-06, "loss": 0.6738, "step": 8506 }, { "epoch": 3.255644852659778, "grad_norm": 0.6204797625541687, "learning_rate": 5.734851338450149e-06, "loss": 0.5605, "step": 8507 }, { "epoch": 3.2560275545350175, "grad_norm": 0.5298004746437073, "learning_rate": 5.7326092936733435e-06, "loss": 0.6496, "step": 8508 }, { "epoch": 3.2564102564102564, "grad_norm": 0.6060826778411865, "learning_rate": 5.73036751113916e-06, "loss": 0.7064, "step": 8509 }, { "epoch": 3.2567929582854958, "grad_norm": 0.5503368973731995, "learning_rate": 5.7281259909853635e-06, "loss": 0.5971, "step": 8510 }, { "epoch": 3.2571756601607347, "grad_norm": 0.5369572639465332, "learning_rate": 5.725884733349706e-06, "loss": 0.6501, "step": 8511 }, { "epoch": 3.257558362035974, "grad_norm": 0.5326817035675049, "learning_rate": 5.723643738369909e-06, "loss": 0.6633, "step": 8512 }, { "epoch": 3.257941063911213, "grad_norm": 0.5619016289710999, "learning_rate": 5.72140300618369e-06, "loss": 0.6797, "step": 8513 }, { "epoch": 3.2583237657864523, "grad_norm": 0.5496311783790588, "learning_rate": 5.719162536928757e-06, "loss": 0.647, "step": 8514 }, { "epoch": 3.2587064676616917, "grad_norm": 0.5059453248977661, "learning_rate": 5.716922330742785e-06, "loss": 0.5195, "step": 8515 }, { "epoch": 3.2590891695369306, "grad_norm": 0.5902653336524963, "learning_rate": 5.7146823877634395e-06, "loss": 0.6361, "step": 8516 }, { "epoch": 3.25947187141217, "grad_norm": 0.5075474977493286, "learning_rate": 5.712442708128378e-06, "loss": 0.6393, "step": 8517 }, { "epoch": 3.2598545732874094, "grad_norm": 0.6719024181365967, "learning_rate": 5.710203291975229e-06, "loss": 0.7424, "step": 8518 }, { "epoch": 3.2602372751626483, "grad_norm": 0.5296468734741211, "learning_rate": 5.707964139441611e-06, "loss": 0.6384, "step": 8519 }, { "epoch": 3.2606199770378876, "grad_norm": 0.598102331161499, "learning_rate": 5.70572525066513e-06, "loss": 0.6867, "step": 8520 }, { "epoch": 3.2610026789131266, "grad_norm": 0.5336624979972839, "learning_rate": 5.7034866257833655e-06, "loss": 0.6405, "step": 8521 }, { "epoch": 3.261385380788366, "grad_norm": 0.5467917323112488, "learning_rate": 5.7012482649338906e-06, "loss": 0.6488, "step": 8522 }, { "epoch": 3.261768082663605, "grad_norm": 0.5797281861305237, "learning_rate": 5.6990101682542586e-06, "loss": 0.6247, "step": 8523 }, { "epoch": 3.262150784538844, "grad_norm": 0.5525241494178772, "learning_rate": 5.696772335882006e-06, "loss": 0.569, "step": 8524 }, { "epoch": 3.2625334864140836, "grad_norm": 0.5409854650497437, "learning_rate": 5.6945347679546534e-06, "loss": 0.681, "step": 8525 }, { "epoch": 3.2629161882893225, "grad_norm": 0.5487058758735657, "learning_rate": 5.692297464609706e-06, "loss": 0.6205, "step": 8526 }, { "epoch": 3.263298890164562, "grad_norm": 0.5199636816978455, "learning_rate": 5.690060425984658e-06, "loss": 0.5932, "step": 8527 }, { "epoch": 3.2636815920398012, "grad_norm": 0.5047903060913086, "learning_rate": 5.687823652216969e-06, "loss": 0.595, "step": 8528 }, { "epoch": 3.26406429391504, "grad_norm": 0.5361440777778625, "learning_rate": 5.685587143444103e-06, "loss": 0.6554, "step": 8529 }, { "epoch": 3.2644469957902795, "grad_norm": 0.531608521938324, "learning_rate": 5.6833508998035e-06, "loss": 0.5772, "step": 8530 }, { "epoch": 3.2648296976655184, "grad_norm": 0.542939305305481, "learning_rate": 5.681114921432578e-06, "loss": 0.6343, "step": 8531 }, { "epoch": 3.265212399540758, "grad_norm": 0.5219190120697021, "learning_rate": 5.678879208468746e-06, "loss": 0.6135, "step": 8532 }, { "epoch": 3.2655951014159967, "grad_norm": 0.4870462417602539, "learning_rate": 5.676643761049396e-06, "loss": 0.5776, "step": 8533 }, { "epoch": 3.265977803291236, "grad_norm": 0.617404043674469, "learning_rate": 5.6744085793119005e-06, "loss": 0.6302, "step": 8534 }, { "epoch": 3.2663605051664755, "grad_norm": 0.5703626275062561, "learning_rate": 5.672173663393618e-06, "loss": 0.6664, "step": 8535 }, { "epoch": 3.2667432070417144, "grad_norm": 0.5253960490226746, "learning_rate": 5.6699390134318975e-06, "loss": 0.6605, "step": 8536 }, { "epoch": 3.2671259089169538, "grad_norm": 0.5630472302436829, "learning_rate": 5.66770462956405e-06, "loss": 0.5795, "step": 8537 }, { "epoch": 3.267508610792193, "grad_norm": 0.5291171669960022, "learning_rate": 5.665470511927393e-06, "loss": 0.6032, "step": 8538 }, { "epoch": 3.267891312667432, "grad_norm": 0.5680221915245056, "learning_rate": 5.663236660659219e-06, "loss": 0.6621, "step": 8539 }, { "epoch": 3.2682740145426714, "grad_norm": 0.516203761100769, "learning_rate": 5.6610030758968045e-06, "loss": 0.6433, "step": 8540 }, { "epoch": 3.2686567164179103, "grad_norm": 0.5493515729904175, "learning_rate": 5.658769757777406e-06, "loss": 0.5669, "step": 8541 }, { "epoch": 3.2690394182931497, "grad_norm": 0.5056105852127075, "learning_rate": 5.656536706438267e-06, "loss": 0.6462, "step": 8542 }, { "epoch": 3.2694221201683886, "grad_norm": 0.5673261880874634, "learning_rate": 5.654303922016618e-06, "loss": 0.6922, "step": 8543 }, { "epoch": 3.269804822043628, "grad_norm": 0.5931646227836609, "learning_rate": 5.652071404649665e-06, "loss": 0.6006, "step": 8544 }, { "epoch": 3.2701875239188674, "grad_norm": 0.5507664084434509, "learning_rate": 5.649839154474605e-06, "loss": 0.6661, "step": 8545 }, { "epoch": 3.2705702257941063, "grad_norm": 0.5780677199363708, "learning_rate": 5.647607171628622e-06, "loss": 0.6283, "step": 8546 }, { "epoch": 3.2709529276693456, "grad_norm": 0.5517998337745667, "learning_rate": 5.645375456248865e-06, "loss": 0.6917, "step": 8547 }, { "epoch": 3.271335629544585, "grad_norm": 0.6381132006645203, "learning_rate": 5.643144008472485e-06, "loss": 0.6482, "step": 8548 }, { "epoch": 3.271718331419824, "grad_norm": 0.5359036922454834, "learning_rate": 5.640912828436616e-06, "loss": 0.6363, "step": 8549 }, { "epoch": 3.2721010332950633, "grad_norm": 0.5934209227561951, "learning_rate": 5.638681916278359e-06, "loss": 0.5967, "step": 8550 }, { "epoch": 3.272483735170302, "grad_norm": 0.5244393348693848, "learning_rate": 5.636451272134815e-06, "loss": 0.6037, "step": 8551 }, { "epoch": 3.2728664370455416, "grad_norm": 0.49806147813796997, "learning_rate": 5.634220896143062e-06, "loss": 0.4911, "step": 8552 }, { "epoch": 3.2732491389207805, "grad_norm": 0.5831845998764038, "learning_rate": 5.6319907884401646e-06, "loss": 0.5621, "step": 8553 }, { "epoch": 3.27363184079602, "grad_norm": 0.5437464118003845, "learning_rate": 5.629760949163168e-06, "loss": 0.5568, "step": 8554 }, { "epoch": 3.2740145426712592, "grad_norm": 0.5971753001213074, "learning_rate": 5.627531378449101e-06, "loss": 0.6415, "step": 8555 }, { "epoch": 3.274397244546498, "grad_norm": 0.5479947328567505, "learning_rate": 5.6253020764349816e-06, "loss": 0.6385, "step": 8556 }, { "epoch": 3.2747799464217375, "grad_norm": 0.5256616473197937, "learning_rate": 5.623073043257799e-06, "loss": 0.6837, "step": 8557 }, { "epoch": 3.275162648296977, "grad_norm": 0.541148841381073, "learning_rate": 5.620844279054536e-06, "loss": 0.581, "step": 8558 }, { "epoch": 3.275545350172216, "grad_norm": 0.5406705141067505, "learning_rate": 5.61861578396216e-06, "loss": 0.5423, "step": 8559 }, { "epoch": 3.275928052047455, "grad_norm": 0.5304831266403198, "learning_rate": 5.616387558117611e-06, "loss": 0.6519, "step": 8560 }, { "epoch": 3.276310753922694, "grad_norm": 0.5315951108932495, "learning_rate": 5.6141596016578234e-06, "loss": 0.5862, "step": 8561 }, { "epoch": 3.2766934557979335, "grad_norm": 0.5180490016937256, "learning_rate": 5.61193191471971e-06, "loss": 0.6478, "step": 8562 }, { "epoch": 3.2770761576731724, "grad_norm": 0.5136643648147583, "learning_rate": 5.60970449744017e-06, "loss": 0.6005, "step": 8563 }, { "epoch": 3.2774588595484118, "grad_norm": 0.5460065007209778, "learning_rate": 5.607477349956083e-06, "loss": 0.6181, "step": 8564 }, { "epoch": 3.277841561423651, "grad_norm": 0.5786851644515991, "learning_rate": 5.605250472404314e-06, "loss": 0.5518, "step": 8565 }, { "epoch": 3.27822426329889, "grad_norm": 0.5939887762069702, "learning_rate": 5.603023864921711e-06, "loss": 0.6031, "step": 8566 }, { "epoch": 3.2786069651741294, "grad_norm": 0.6115535497665405, "learning_rate": 5.6007975276451035e-06, "loss": 0.5919, "step": 8567 }, { "epoch": 3.278989667049369, "grad_norm": 0.5861110687255859, "learning_rate": 5.598571460711304e-06, "loss": 0.5413, "step": 8568 }, { "epoch": 3.2793723689246077, "grad_norm": 0.6443053483963013, "learning_rate": 5.596345664257117e-06, "loss": 0.6884, "step": 8569 }, { "epoch": 3.279755070799847, "grad_norm": 0.5632622241973877, "learning_rate": 5.5941201384193155e-06, "loss": 0.5382, "step": 8570 }, { "epoch": 3.280137772675086, "grad_norm": 0.548905074596405, "learning_rate": 5.591894883334668e-06, "loss": 0.6144, "step": 8571 }, { "epoch": 3.2805204745503254, "grad_norm": 0.5270458459854126, "learning_rate": 5.5896698991399224e-06, "loss": 0.6426, "step": 8572 }, { "epoch": 3.2809031764255643, "grad_norm": 0.7527400851249695, "learning_rate": 5.587445185971809e-06, "loss": 0.6504, "step": 8573 }, { "epoch": 3.2812858783008036, "grad_norm": 0.5201122760772705, "learning_rate": 5.585220743967044e-06, "loss": 0.6195, "step": 8574 }, { "epoch": 3.281668580176043, "grad_norm": 0.7305895686149597, "learning_rate": 5.5829965732623286e-06, "loss": 0.6778, "step": 8575 }, { "epoch": 3.282051282051282, "grad_norm": 0.5401184558868408, "learning_rate": 5.580772673994335e-06, "loss": 0.6413, "step": 8576 }, { "epoch": 3.2824339839265213, "grad_norm": 0.6013079285621643, "learning_rate": 5.578549046299734e-06, "loss": 0.7058, "step": 8577 }, { "epoch": 3.2828166858017607, "grad_norm": 0.5662589073181152, "learning_rate": 5.576325690315172e-06, "loss": 0.6407, "step": 8578 }, { "epoch": 3.2831993876769996, "grad_norm": 0.5636295080184937, "learning_rate": 5.574102606177283e-06, "loss": 0.6656, "step": 8579 }, { "epoch": 3.283582089552239, "grad_norm": 0.5187814831733704, "learning_rate": 5.571879794022677e-06, "loss": 0.5746, "step": 8580 }, { "epoch": 3.283964791427478, "grad_norm": 0.5749329924583435, "learning_rate": 5.569657253987952e-06, "loss": 0.6674, "step": 8581 }, { "epoch": 3.2843474933027172, "grad_norm": 0.5294129848480225, "learning_rate": 5.567434986209692e-06, "loss": 0.6006, "step": 8582 }, { "epoch": 3.284730195177956, "grad_norm": 0.5545613765716553, "learning_rate": 5.56521299082446e-06, "loss": 0.6042, "step": 8583 }, { "epoch": 3.2851128970531955, "grad_norm": 0.5571599006652832, "learning_rate": 5.562991267968805e-06, "loss": 0.5724, "step": 8584 }, { "epoch": 3.285495598928435, "grad_norm": 0.548882782459259, "learning_rate": 5.560769817779259e-06, "loss": 0.6504, "step": 8585 }, { "epoch": 3.285878300803674, "grad_norm": 0.5335469245910645, "learning_rate": 5.558548640392329e-06, "loss": 0.6172, "step": 8586 }, { "epoch": 3.286261002678913, "grad_norm": 0.551396369934082, "learning_rate": 5.556327735944518e-06, "loss": 0.6661, "step": 8587 }, { "epoch": 3.2866437045541526, "grad_norm": 0.5600773692131042, "learning_rate": 5.554107104572311e-06, "loss": 0.5823, "step": 8588 }, { "epoch": 3.2870264064293915, "grad_norm": 0.5288172364234924, "learning_rate": 5.55188674641216e-06, "loss": 0.555, "step": 8589 }, { "epoch": 3.287409108304631, "grad_norm": 0.5638316869735718, "learning_rate": 5.549666661600519e-06, "loss": 0.6716, "step": 8590 }, { "epoch": 3.2877918101798698, "grad_norm": 0.6034201979637146, "learning_rate": 5.547446850273817e-06, "loss": 0.6341, "step": 8591 }, { "epoch": 3.288174512055109, "grad_norm": 0.5912831425666809, "learning_rate": 5.545227312568467e-06, "loss": 0.6608, "step": 8592 }, { "epoch": 3.288557213930348, "grad_norm": 0.5566282868385315, "learning_rate": 5.543008048620868e-06, "loss": 0.6087, "step": 8593 }, { "epoch": 3.2889399158055874, "grad_norm": 0.5389382839202881, "learning_rate": 5.540789058567397e-06, "loss": 0.6568, "step": 8594 }, { "epoch": 3.289322617680827, "grad_norm": 0.5762555003166199, "learning_rate": 5.538570342544423e-06, "loss": 0.6226, "step": 8595 }, { "epoch": 3.2897053195560657, "grad_norm": 0.5608561038970947, "learning_rate": 5.5363519006882825e-06, "loss": 0.5715, "step": 8596 }, { "epoch": 3.290088021431305, "grad_norm": 0.5236302018165588, "learning_rate": 5.53413373313531e-06, "loss": 0.6581, "step": 8597 }, { "epoch": 3.2904707233065444, "grad_norm": 0.5840685367584229, "learning_rate": 5.53191584002182e-06, "loss": 0.6253, "step": 8598 }, { "epoch": 3.2908534251817834, "grad_norm": 0.5280181169509888, "learning_rate": 5.529698221484101e-06, "loss": 0.5947, "step": 8599 }, { "epoch": 3.2912361270570227, "grad_norm": 0.5765677690505981, "learning_rate": 5.527480877658437e-06, "loss": 0.5865, "step": 8600 }, { "epoch": 3.2916188289322617, "grad_norm": 0.5735210180282593, "learning_rate": 5.525263808681092e-06, "loss": 0.5683, "step": 8601 }, { "epoch": 3.292001530807501, "grad_norm": 0.5477118492126465, "learning_rate": 5.523047014688299e-06, "loss": 0.5813, "step": 8602 }, { "epoch": 3.29238423268274, "grad_norm": 0.5962415337562561, "learning_rate": 5.5208304958162974e-06, "loss": 0.6526, "step": 8603 }, { "epoch": 3.2927669345579793, "grad_norm": 0.5738001465797424, "learning_rate": 5.518614252201295e-06, "loss": 0.591, "step": 8604 }, { "epoch": 3.2931496364332187, "grad_norm": 0.541134238243103, "learning_rate": 5.516398283979492e-06, "loss": 0.6587, "step": 8605 }, { "epoch": 3.2935323383084576, "grad_norm": 0.536805272102356, "learning_rate": 5.514182591287054e-06, "loss": 0.5698, "step": 8606 }, { "epoch": 3.293915040183697, "grad_norm": 0.5670372843742371, "learning_rate": 5.511967174260146e-06, "loss": 0.6031, "step": 8607 }, { "epoch": 3.2942977420589363, "grad_norm": 0.600479781627655, "learning_rate": 5.509752033034918e-06, "loss": 0.6851, "step": 8608 }, { "epoch": 3.2946804439341753, "grad_norm": 0.5356311202049255, "learning_rate": 5.507537167747486e-06, "loss": 0.6016, "step": 8609 }, { "epoch": 3.2950631458094146, "grad_norm": 0.5755977034568787, "learning_rate": 5.5053225785339645e-06, "loss": 0.6314, "step": 8610 }, { "epoch": 3.2954458476846535, "grad_norm": 0.6367641687393188, "learning_rate": 5.503108265530449e-06, "loss": 0.6825, "step": 8611 }, { "epoch": 3.295828549559893, "grad_norm": 0.5387645363807678, "learning_rate": 5.500894228873007e-06, "loss": 0.6391, "step": 8612 }, { "epoch": 3.296211251435132, "grad_norm": 0.5535750389099121, "learning_rate": 5.498680468697698e-06, "loss": 0.6639, "step": 8613 }, { "epoch": 3.296593953310371, "grad_norm": 0.5466100573539734, "learning_rate": 5.496466985140575e-06, "loss": 0.5614, "step": 8614 }, { "epoch": 3.2969766551856106, "grad_norm": 0.5641258358955383, "learning_rate": 5.49425377833765e-06, "loss": 0.6883, "step": 8615 }, { "epoch": 3.2973593570608495, "grad_norm": 0.5570525527000427, "learning_rate": 5.492040848424935e-06, "loss": 0.5848, "step": 8616 }, { "epoch": 3.297742058936089, "grad_norm": 0.5469807386398315, "learning_rate": 5.489828195538421e-06, "loss": 0.5629, "step": 8617 }, { "epoch": 3.298124760811328, "grad_norm": 0.5792750716209412, "learning_rate": 5.487615819814085e-06, "loss": 0.6205, "step": 8618 }, { "epoch": 3.298507462686567, "grad_norm": 0.5457613468170166, "learning_rate": 5.4854037213878745e-06, "loss": 0.5802, "step": 8619 }, { "epoch": 3.2988901645618065, "grad_norm": 0.5156266093254089, "learning_rate": 5.483191900395735e-06, "loss": 0.5645, "step": 8620 }, { "epoch": 3.2992728664370454, "grad_norm": 0.5358796119689941, "learning_rate": 5.480980356973591e-06, "loss": 0.6163, "step": 8621 }, { "epoch": 3.299655568312285, "grad_norm": 0.5626704692840576, "learning_rate": 5.4787690912573385e-06, "loss": 0.6491, "step": 8622 }, { "epoch": 3.3000382701875237, "grad_norm": 0.5458387136459351, "learning_rate": 5.476558103382874e-06, "loss": 0.6829, "step": 8623 }, { "epoch": 3.300420972062763, "grad_norm": 0.5747483968734741, "learning_rate": 5.474347393486066e-06, "loss": 0.6694, "step": 8624 }, { "epoch": 3.3008036739380024, "grad_norm": 0.5781671404838562, "learning_rate": 5.472136961702767e-06, "loss": 0.6787, "step": 8625 }, { "epoch": 3.3011863758132414, "grad_norm": 0.5348453521728516, "learning_rate": 5.469926808168818e-06, "loss": 0.6465, "step": 8626 }, { "epoch": 3.3015690776884807, "grad_norm": 0.6115335822105408, "learning_rate": 5.46771693302004e-06, "loss": 0.5934, "step": 8627 }, { "epoch": 3.30195177956372, "grad_norm": 0.5161353349685669, "learning_rate": 5.465507336392227e-06, "loss": 0.6197, "step": 8628 }, { "epoch": 3.302334481438959, "grad_norm": 0.8002517819404602, "learning_rate": 5.463298018421171e-06, "loss": 0.6548, "step": 8629 }, { "epoch": 3.3027171833141984, "grad_norm": 0.5253156423568726, "learning_rate": 5.46108897924264e-06, "loss": 0.6351, "step": 8630 }, { "epoch": 3.3030998851894373, "grad_norm": 0.5391119122505188, "learning_rate": 5.4588802189923884e-06, "loss": 0.6129, "step": 8631 }, { "epoch": 3.3034825870646767, "grad_norm": 0.5429519414901733, "learning_rate": 5.4566717378061425e-06, "loss": 0.5727, "step": 8632 }, { "epoch": 3.3038652889399156, "grad_norm": 0.5165612101554871, "learning_rate": 5.454463535819625e-06, "loss": 0.6062, "step": 8633 }, { "epoch": 3.304247990815155, "grad_norm": 0.5180498361587524, "learning_rate": 5.452255613168536e-06, "loss": 0.6251, "step": 8634 }, { "epoch": 3.3046306926903943, "grad_norm": 0.5449695587158203, "learning_rate": 5.4500479699885545e-06, "loss": 0.5928, "step": 8635 }, { "epoch": 3.3050133945656333, "grad_norm": 0.620904803276062, "learning_rate": 5.447840606415349e-06, "loss": 0.6592, "step": 8636 }, { "epoch": 3.3053960964408726, "grad_norm": 0.5610430836677551, "learning_rate": 5.4456335225845724e-06, "loss": 0.6385, "step": 8637 }, { "epoch": 3.305778798316112, "grad_norm": 0.5157874822616577, "learning_rate": 5.443426718631848e-06, "loss": 0.5683, "step": 8638 }, { "epoch": 3.306161500191351, "grad_norm": 0.5494135022163391, "learning_rate": 5.4412201946927915e-06, "loss": 0.6581, "step": 8639 }, { "epoch": 3.3065442020665903, "grad_norm": 0.5773364305496216, "learning_rate": 5.439013950903005e-06, "loss": 0.5815, "step": 8640 }, { "epoch": 3.306926903941829, "grad_norm": 0.5432609915733337, "learning_rate": 5.43680798739806e-06, "loss": 0.5819, "step": 8641 }, { "epoch": 3.3073096058170686, "grad_norm": 0.5461665391921997, "learning_rate": 5.4346023043135245e-06, "loss": 0.5808, "step": 8642 }, { "epoch": 3.3076923076923075, "grad_norm": 0.571250855922699, "learning_rate": 5.432396901784942e-06, "loss": 0.6216, "step": 8643 }, { "epoch": 3.308075009567547, "grad_norm": 0.5586475133895874, "learning_rate": 5.43019177994784e-06, "loss": 0.6202, "step": 8644 }, { "epoch": 3.308457711442786, "grad_norm": 0.6365706920623779, "learning_rate": 5.42798693893773e-06, "loss": 0.5266, "step": 8645 }, { "epoch": 3.308840413318025, "grad_norm": 0.5220035314559937, "learning_rate": 5.4257823788901054e-06, "loss": 0.5722, "step": 8646 }, { "epoch": 3.3092231151932645, "grad_norm": 0.5374812483787537, "learning_rate": 5.423578099940447e-06, "loss": 0.6059, "step": 8647 }, { "epoch": 3.309605817068504, "grad_norm": 0.588910698890686, "learning_rate": 5.421374102224204e-06, "loss": 0.6082, "step": 8648 }, { "epoch": 3.309988518943743, "grad_norm": 0.4976396858692169, "learning_rate": 5.419170385876822e-06, "loss": 0.655, "step": 8649 }, { "epoch": 3.310371220818982, "grad_norm": 0.5649003982543945, "learning_rate": 5.416966951033731e-06, "loss": 0.604, "step": 8650 }, { "epoch": 3.310753922694221, "grad_norm": 0.5474513173103333, "learning_rate": 5.4147637978303295e-06, "loss": 0.5922, "step": 8651 }, { "epoch": 3.3111366245694605, "grad_norm": 0.5898867249488831, "learning_rate": 5.412560926402011e-06, "loss": 0.6774, "step": 8652 }, { "epoch": 3.3115193264446994, "grad_norm": 0.5112956762313843, "learning_rate": 5.410358336884147e-06, "loss": 0.5842, "step": 8653 }, { "epoch": 3.3119020283199387, "grad_norm": 0.5580829381942749, "learning_rate": 5.408156029412094e-06, "loss": 0.6325, "step": 8654 }, { "epoch": 3.312284730195178, "grad_norm": 0.6433830857276917, "learning_rate": 5.4059540041211876e-06, "loss": 0.6355, "step": 8655 }, { "epoch": 3.312667432070417, "grad_norm": 0.5924918055534363, "learning_rate": 5.403752261146751e-06, "loss": 0.5985, "step": 8656 }, { "epoch": 3.3130501339456564, "grad_norm": 0.5569586753845215, "learning_rate": 5.401550800624089e-06, "loss": 0.6373, "step": 8657 }, { "epoch": 3.3134328358208958, "grad_norm": 0.5333135724067688, "learning_rate": 5.399349622688479e-06, "loss": 0.5869, "step": 8658 }, { "epoch": 3.3138155376961347, "grad_norm": 0.6183821558952332, "learning_rate": 5.397148727475196e-06, "loss": 0.6167, "step": 8659 }, { "epoch": 3.314198239571374, "grad_norm": 0.5417342782020569, "learning_rate": 5.394948115119493e-06, "loss": 0.6276, "step": 8660 }, { "epoch": 3.314580941446613, "grad_norm": 0.566882848739624, "learning_rate": 5.3927477857565945e-06, "loss": 0.5878, "step": 8661 }, { "epoch": 3.3149636433218523, "grad_norm": 0.5517059564590454, "learning_rate": 5.390547739521723e-06, "loss": 0.6325, "step": 8662 }, { "epoch": 3.3153463451970913, "grad_norm": 0.6027379631996155, "learning_rate": 5.388347976550076e-06, "loss": 0.6893, "step": 8663 }, { "epoch": 3.3157290470723306, "grad_norm": 0.5122885704040527, "learning_rate": 5.386148496976835e-06, "loss": 0.6336, "step": 8664 }, { "epoch": 3.31611174894757, "grad_norm": 0.5423863530158997, "learning_rate": 5.383949300937163e-06, "loss": 0.7159, "step": 8665 }, { "epoch": 3.316494450822809, "grad_norm": 0.5560845136642456, "learning_rate": 5.381750388566213e-06, "loss": 0.5775, "step": 8666 }, { "epoch": 3.3168771526980483, "grad_norm": 0.5340280532836914, "learning_rate": 5.3795517599991035e-06, "loss": 0.6605, "step": 8667 }, { "epoch": 3.3172598545732876, "grad_norm": 0.5198853611946106, "learning_rate": 5.377353415370952e-06, "loss": 0.6306, "step": 8668 }, { "epoch": 3.3176425564485266, "grad_norm": 0.5430471301078796, "learning_rate": 5.37515535481685e-06, "loss": 0.6237, "step": 8669 }, { "epoch": 3.318025258323766, "grad_norm": 0.5250962972640991, "learning_rate": 5.372957578471881e-06, "loss": 0.6335, "step": 8670 }, { "epoch": 3.318407960199005, "grad_norm": 0.592801570892334, "learning_rate": 5.370760086471097e-06, "loss": 0.6962, "step": 8671 }, { "epoch": 3.3187906620742442, "grad_norm": 0.5178826451301575, "learning_rate": 5.368562878949539e-06, "loss": 0.616, "step": 8672 }, { "epoch": 3.319173363949483, "grad_norm": 0.5736622214317322, "learning_rate": 5.366365956042236e-06, "loss": 0.656, "step": 8673 }, { "epoch": 3.3195560658247225, "grad_norm": 0.5534983277320862, "learning_rate": 5.364169317884194e-06, "loss": 0.5917, "step": 8674 }, { "epoch": 3.319938767699962, "grad_norm": 0.5084884166717529, "learning_rate": 5.3619729646104e-06, "loss": 0.5893, "step": 8675 }, { "epoch": 3.320321469575201, "grad_norm": 0.5637151002883911, "learning_rate": 5.3597768963558326e-06, "loss": 0.5344, "step": 8676 }, { "epoch": 3.32070417145044, "grad_norm": 0.5464562177658081, "learning_rate": 5.357581113255437e-06, "loss": 0.5969, "step": 8677 }, { "epoch": 3.3210868733256795, "grad_norm": 0.5693392753601074, "learning_rate": 5.355385615444152e-06, "loss": 0.6726, "step": 8678 }, { "epoch": 3.3214695752009185, "grad_norm": 0.5462765693664551, "learning_rate": 5.3531904030569035e-06, "loss": 0.6399, "step": 8679 }, { "epoch": 3.321852277076158, "grad_norm": 0.4997628629207611, "learning_rate": 5.350995476228584e-06, "loss": 0.588, "step": 8680 }, { "epoch": 3.3222349789513967, "grad_norm": 0.6008414626121521, "learning_rate": 5.3488008350940834e-06, "loss": 0.6201, "step": 8681 }, { "epoch": 3.322617680826636, "grad_norm": 0.7146506309509277, "learning_rate": 5.346606479788266e-06, "loss": 0.7255, "step": 8682 }, { "epoch": 3.323000382701875, "grad_norm": 0.572189450263977, "learning_rate": 5.344412410445981e-06, "loss": 0.6177, "step": 8683 }, { "epoch": 3.3233830845771144, "grad_norm": 0.5688552260398865, "learning_rate": 5.3422186272020624e-06, "loss": 0.618, "step": 8684 }, { "epoch": 3.3237657864523538, "grad_norm": 0.546562910079956, "learning_rate": 5.340025130191321e-06, "loss": 0.6683, "step": 8685 }, { "epoch": 3.3241484883275927, "grad_norm": 0.5100232362747192, "learning_rate": 5.337831919548558e-06, "loss": 0.6801, "step": 8686 }, { "epoch": 3.324531190202832, "grad_norm": 0.604843258857727, "learning_rate": 5.3356389954085455e-06, "loss": 0.5918, "step": 8687 }, { "epoch": 3.3249138920780714, "grad_norm": 0.5645681023597717, "learning_rate": 5.3334463579060465e-06, "loss": 0.6231, "step": 8688 }, { "epoch": 3.3252965939533103, "grad_norm": 0.6155034303665161, "learning_rate": 5.33125400717581e-06, "loss": 0.5821, "step": 8689 }, { "epoch": 3.3256792958285497, "grad_norm": 0.5919166207313538, "learning_rate": 5.329061943352553e-06, "loss": 0.701, "step": 8690 }, { "epoch": 3.3260619977037886, "grad_norm": 0.5819544196128845, "learning_rate": 5.326870166570988e-06, "loss": 0.6111, "step": 8691 }, { "epoch": 3.326444699579028, "grad_norm": 0.5707108378410339, "learning_rate": 5.324678676965806e-06, "loss": 0.6352, "step": 8692 }, { "epoch": 3.326827401454267, "grad_norm": 0.5363116264343262, "learning_rate": 5.322487474671678e-06, "loss": 0.6057, "step": 8693 }, { "epoch": 3.3272101033295063, "grad_norm": 0.554015040397644, "learning_rate": 5.320296559823264e-06, "loss": 0.7361, "step": 8694 }, { "epoch": 3.3275928052047457, "grad_norm": 0.5653914213180542, "learning_rate": 5.318105932555195e-06, "loss": 0.6327, "step": 8695 }, { "epoch": 3.3279755070799846, "grad_norm": 0.5047522783279419, "learning_rate": 5.3159155930021e-06, "loss": 0.5748, "step": 8696 }, { "epoch": 3.328358208955224, "grad_norm": 0.5135650038719177, "learning_rate": 5.313725541298571e-06, "loss": 0.6501, "step": 8697 }, { "epoch": 3.3287409108304633, "grad_norm": 0.5260362029075623, "learning_rate": 5.311535777579197e-06, "loss": 0.623, "step": 8698 }, { "epoch": 3.3291236127057022, "grad_norm": 0.5582957863807678, "learning_rate": 5.3093463019785495e-06, "loss": 0.668, "step": 8699 }, { "epoch": 3.3295063145809416, "grad_norm": 0.528303861618042, "learning_rate": 5.30715711463117e-06, "loss": 0.6628, "step": 8700 }, { "epoch": 3.3298890164561805, "grad_norm": 0.5169469118118286, "learning_rate": 5.304968215671591e-06, "loss": 0.6577, "step": 8701 }, { "epoch": 3.33027171833142, "grad_norm": 0.5683587789535522, "learning_rate": 5.30277960523433e-06, "loss": 0.5726, "step": 8702 }, { "epoch": 3.330654420206659, "grad_norm": 0.5576856732368469, "learning_rate": 5.30059128345388e-06, "loss": 0.6927, "step": 8703 }, { "epoch": 3.331037122081898, "grad_norm": 0.5344882011413574, "learning_rate": 5.298403250464721e-06, "loss": 0.6229, "step": 8704 }, { "epoch": 3.3314198239571375, "grad_norm": 0.5647845268249512, "learning_rate": 5.296215506401318e-06, "loss": 0.6168, "step": 8705 }, { "epoch": 3.3318025258323765, "grad_norm": 0.538081705570221, "learning_rate": 5.294028051398104e-06, "loss": 0.5475, "step": 8706 }, { "epoch": 3.332185227707616, "grad_norm": 0.5548336505889893, "learning_rate": 5.291840885589508e-06, "loss": 0.6615, "step": 8707 }, { "epoch": 3.332567929582855, "grad_norm": 0.5880813598632812, "learning_rate": 5.289654009109939e-06, "loss": 0.6717, "step": 8708 }, { "epoch": 3.332950631458094, "grad_norm": 0.49989423155784607, "learning_rate": 5.287467422093789e-06, "loss": 0.5908, "step": 8709 }, { "epoch": 3.3333333333333335, "grad_norm": 0.5381253957748413, "learning_rate": 5.285281124675423e-06, "loss": 0.5928, "step": 8710 }, { "epoch": 3.3337160352085724, "grad_norm": 0.5415118336677551, "learning_rate": 5.283095116989196e-06, "loss": 0.6458, "step": 8711 }, { "epoch": 3.3340987370838118, "grad_norm": 0.5385809540748596, "learning_rate": 5.280909399169452e-06, "loss": 0.6254, "step": 8712 }, { "epoch": 3.3344814389590507, "grad_norm": 0.5992684960365295, "learning_rate": 5.278723971350494e-06, "loss": 0.6439, "step": 8713 }, { "epoch": 3.33486414083429, "grad_norm": 0.4994044899940491, "learning_rate": 5.276538833666637e-06, "loss": 0.6342, "step": 8714 }, { "epoch": 3.3352468427095294, "grad_norm": 0.5128160715103149, "learning_rate": 5.274353986252161e-06, "loss": 0.6248, "step": 8715 }, { "epoch": 3.3356295445847683, "grad_norm": 0.5344101786613464, "learning_rate": 5.272169429241325e-06, "loss": 0.577, "step": 8716 }, { "epoch": 3.3360122464600077, "grad_norm": 0.5448600053787231, "learning_rate": 5.269985162768377e-06, "loss": 0.6149, "step": 8717 }, { "epoch": 3.336394948335247, "grad_norm": 0.5644651651382446, "learning_rate": 5.2678011869675535e-06, "loss": 0.6636, "step": 8718 }, { "epoch": 3.336777650210486, "grad_norm": 0.498042494058609, "learning_rate": 5.265617501973055e-06, "loss": 0.6129, "step": 8719 }, { "epoch": 3.3371603520857254, "grad_norm": 0.5601867437362671, "learning_rate": 5.263434107919081e-06, "loss": 0.5553, "step": 8720 }, { "epoch": 3.3375430539609643, "grad_norm": 0.5561563372612, "learning_rate": 5.261251004939805e-06, "loss": 0.5649, "step": 8721 }, { "epoch": 3.3379257558362037, "grad_norm": 0.563574492931366, "learning_rate": 5.25906819316939e-06, "loss": 0.6309, "step": 8722 }, { "epoch": 3.3383084577114426, "grad_norm": 0.526992917060852, "learning_rate": 5.256885672741966e-06, "loss": 0.5112, "step": 8723 }, { "epoch": 3.338691159586682, "grad_norm": 0.5184522867202759, "learning_rate": 5.254703443791655e-06, "loss": 0.5737, "step": 8724 }, { "epoch": 3.3390738614619213, "grad_norm": 0.5142524242401123, "learning_rate": 5.252521506452576e-06, "loss": 0.5562, "step": 8725 }, { "epoch": 3.3394565633371602, "grad_norm": 0.5233796834945679, "learning_rate": 5.2503398608588e-06, "loss": 0.6393, "step": 8726 }, { "epoch": 3.3398392652123996, "grad_norm": 0.5414186716079712, "learning_rate": 5.248158507144398e-06, "loss": 0.5923, "step": 8727 }, { "epoch": 3.340221967087639, "grad_norm": 0.5229254961013794, "learning_rate": 5.2459774454434265e-06, "loss": 0.6563, "step": 8728 }, { "epoch": 3.340604668962878, "grad_norm": 0.5942581295967102, "learning_rate": 5.2437966758899085e-06, "loss": 0.689, "step": 8729 }, { "epoch": 3.3409873708381173, "grad_norm": 0.5342761874198914, "learning_rate": 5.241616198617863e-06, "loss": 0.5822, "step": 8730 }, { "epoch": 3.341370072713356, "grad_norm": 0.5316049456596375, "learning_rate": 5.239436013761287e-06, "loss": 0.6601, "step": 8731 }, { "epoch": 3.3417527745885955, "grad_norm": 0.5275317430496216, "learning_rate": 5.237256121454155e-06, "loss": 0.5469, "step": 8732 }, { "epoch": 3.3421354764638345, "grad_norm": 0.6207234859466553, "learning_rate": 5.235076521830429e-06, "loss": 0.6783, "step": 8733 }, { "epoch": 3.342518178339074, "grad_norm": 0.5528748035430908, "learning_rate": 5.232897215024049e-06, "loss": 0.6126, "step": 8734 }, { "epoch": 3.342900880214313, "grad_norm": 0.6494544744491577, "learning_rate": 5.2307182011689475e-06, "loss": 0.649, "step": 8735 }, { "epoch": 3.343283582089552, "grad_norm": 0.5920964479446411, "learning_rate": 5.228539480399022e-06, "loss": 0.5877, "step": 8736 }, { "epoch": 3.3436662839647915, "grad_norm": 0.5397937297821045, "learning_rate": 5.226361052848166e-06, "loss": 0.6879, "step": 8737 }, { "epoch": 3.344048985840031, "grad_norm": 0.48665690422058105, "learning_rate": 5.224182918650249e-06, "loss": 0.6236, "step": 8738 }, { "epoch": 3.3444316877152698, "grad_norm": 0.5295740365982056, "learning_rate": 5.222005077939119e-06, "loss": 0.594, "step": 8739 }, { "epoch": 3.344814389590509, "grad_norm": 0.533266007900238, "learning_rate": 5.219827530848613e-06, "loss": 0.6447, "step": 8740 }, { "epoch": 3.345197091465748, "grad_norm": 0.5436145067214966, "learning_rate": 5.217650277512551e-06, "loss": 0.5996, "step": 8741 }, { "epoch": 3.3455797933409874, "grad_norm": 0.5602266192436218, "learning_rate": 5.215473318064722e-06, "loss": 0.6117, "step": 8742 }, { "epoch": 3.3459624952162264, "grad_norm": 0.5436751842498779, "learning_rate": 5.2132966526389126e-06, "loss": 0.6258, "step": 8743 }, { "epoch": 3.3463451970914657, "grad_norm": 0.5222557783126831, "learning_rate": 5.211120281368882e-06, "loss": 0.5689, "step": 8744 }, { "epoch": 3.346727898966705, "grad_norm": 0.5709372162818909, "learning_rate": 5.208944204388377e-06, "loss": 0.6935, "step": 8745 }, { "epoch": 3.347110600841944, "grad_norm": 0.5586922764778137, "learning_rate": 5.206768421831121e-06, "loss": 0.5925, "step": 8746 }, { "epoch": 3.3474933027171834, "grad_norm": 0.5759789347648621, "learning_rate": 5.204592933830827e-06, "loss": 0.6122, "step": 8747 }, { "epoch": 3.3478760045924227, "grad_norm": 0.5626954436302185, "learning_rate": 5.202417740521175e-06, "loss": 0.6797, "step": 8748 }, { "epoch": 3.3482587064676617, "grad_norm": 0.5490573048591614, "learning_rate": 5.200242842035843e-06, "loss": 0.6052, "step": 8749 }, { "epoch": 3.348641408342901, "grad_norm": 0.557408332824707, "learning_rate": 5.198068238508483e-06, "loss": 0.5949, "step": 8750 }, { "epoch": 3.34902411021814, "grad_norm": 0.509990394115448, "learning_rate": 5.195893930072732e-06, "loss": 0.6626, "step": 8751 }, { "epoch": 3.3494068120933793, "grad_norm": 0.602725088596344, "learning_rate": 5.193719916862201e-06, "loss": 0.6522, "step": 8752 }, { "epoch": 3.3497895139686182, "grad_norm": 0.532379686832428, "learning_rate": 5.191546199010495e-06, "loss": 0.6084, "step": 8753 }, { "epoch": 3.3501722158438576, "grad_norm": 0.5456154942512512, "learning_rate": 5.189372776651192e-06, "loss": 0.6133, "step": 8754 }, { "epoch": 3.350554917719097, "grad_norm": 0.5643286108970642, "learning_rate": 5.187199649917856e-06, "loss": 0.6076, "step": 8755 }, { "epoch": 3.350937619594336, "grad_norm": 0.54981529712677, "learning_rate": 5.185026818944031e-06, "loss": 0.6164, "step": 8756 }, { "epoch": 3.3513203214695753, "grad_norm": 0.5342456698417664, "learning_rate": 5.182854283863248e-06, "loss": 0.578, "step": 8757 }, { "epoch": 3.3517030233448146, "grad_norm": 0.5615673661231995, "learning_rate": 5.1806820448090044e-06, "loss": 0.6618, "step": 8758 }, { "epoch": 3.3520857252200535, "grad_norm": 0.5279461741447449, "learning_rate": 5.178510101914798e-06, "loss": 0.5889, "step": 8759 }, { "epoch": 3.352468427095293, "grad_norm": 0.5604521632194519, "learning_rate": 5.176338455314103e-06, "loss": 0.6144, "step": 8760 }, { "epoch": 3.352851128970532, "grad_norm": 0.6380817890167236, "learning_rate": 5.174167105140363e-06, "loss": 0.566, "step": 8761 }, { "epoch": 3.353233830845771, "grad_norm": 0.5087745785713196, "learning_rate": 5.17199605152702e-06, "loss": 0.6902, "step": 8762 }, { "epoch": 3.35361653272101, "grad_norm": 0.5087315440177917, "learning_rate": 5.16982529460749e-06, "loss": 0.6451, "step": 8763 }, { "epoch": 3.3539992345962495, "grad_norm": 0.6143864393234253, "learning_rate": 5.167654834515172e-06, "loss": 0.652, "step": 8764 }, { "epoch": 3.354381936471489, "grad_norm": 0.5552747845649719, "learning_rate": 5.165484671383445e-06, "loss": 0.544, "step": 8765 }, { "epoch": 3.354764638346728, "grad_norm": 0.5746663212776184, "learning_rate": 5.1633148053456746e-06, "loss": 0.7017, "step": 8766 }, { "epoch": 3.355147340221967, "grad_norm": 0.5525409579277039, "learning_rate": 5.161145236535207e-06, "loss": 0.5973, "step": 8767 }, { "epoch": 3.3555300420972065, "grad_norm": 0.5032180547714233, "learning_rate": 5.15897596508536e-06, "loss": 0.4918, "step": 8768 }, { "epoch": 3.3559127439724454, "grad_norm": 0.5563977956771851, "learning_rate": 5.1568069911294435e-06, "loss": 0.6258, "step": 8769 }, { "epoch": 3.356295445847685, "grad_norm": 0.5428351759910583, "learning_rate": 5.154638314800755e-06, "loss": 0.6741, "step": 8770 }, { "epoch": 3.3566781477229237, "grad_norm": 0.550351619720459, "learning_rate": 5.152469936232553e-06, "loss": 0.5823, "step": 8771 }, { "epoch": 3.357060849598163, "grad_norm": 0.5599220395088196, "learning_rate": 5.150301855558097e-06, "loss": 0.6206, "step": 8772 }, { "epoch": 3.357443551473402, "grad_norm": 0.596869707107544, "learning_rate": 5.148134072910622e-06, "loss": 0.6387, "step": 8773 }, { "epoch": 3.3578262533486414, "grad_norm": 0.5685721039772034, "learning_rate": 5.145966588423341e-06, "loss": 0.6429, "step": 8774 }, { "epoch": 3.3582089552238807, "grad_norm": 0.5052478313446045, "learning_rate": 5.143799402229454e-06, "loss": 0.6323, "step": 8775 }, { "epoch": 3.3585916570991197, "grad_norm": 0.5260011553764343, "learning_rate": 5.14163251446214e-06, "loss": 0.562, "step": 8776 }, { "epoch": 3.358974358974359, "grad_norm": 0.5575593113899231, "learning_rate": 5.139465925254565e-06, "loss": 0.6627, "step": 8777 }, { "epoch": 3.3593570608495984, "grad_norm": 0.5390504002571106, "learning_rate": 5.137299634739862e-06, "loss": 0.6273, "step": 8778 }, { "epoch": 3.3597397627248373, "grad_norm": 0.5387511253356934, "learning_rate": 5.13513364305116e-06, "loss": 0.6564, "step": 8779 }, { "epoch": 3.3601224646000767, "grad_norm": 0.5479289889335632, "learning_rate": 5.132967950321568e-06, "loss": 0.6537, "step": 8780 }, { "epoch": 3.3605051664753156, "grad_norm": 0.521805465221405, "learning_rate": 5.130802556684167e-06, "loss": 0.6485, "step": 8781 }, { "epoch": 3.360887868350555, "grad_norm": 0.4917367994785309, "learning_rate": 5.128637462272031e-06, "loss": 0.6042, "step": 8782 }, { "epoch": 3.361270570225794, "grad_norm": 0.5193238854408264, "learning_rate": 5.12647266721821e-06, "loss": 0.5897, "step": 8783 }, { "epoch": 3.3616532721010333, "grad_norm": 0.5629118084907532, "learning_rate": 5.124308171655734e-06, "loss": 0.6431, "step": 8784 }, { "epoch": 3.3620359739762726, "grad_norm": 0.5825820565223694, "learning_rate": 5.122143975717622e-06, "loss": 0.6445, "step": 8785 }, { "epoch": 3.3624186758515116, "grad_norm": 0.5219680666923523, "learning_rate": 5.1199800795368685e-06, "loss": 0.61, "step": 8786 }, { "epoch": 3.362801377726751, "grad_norm": 0.6709839105606079, "learning_rate": 5.117816483246447e-06, "loss": 0.6839, "step": 8787 }, { "epoch": 3.3631840796019903, "grad_norm": 0.5320220589637756, "learning_rate": 5.115653186979318e-06, "loss": 0.6203, "step": 8788 }, { "epoch": 3.363566781477229, "grad_norm": 0.54897141456604, "learning_rate": 5.113490190868422e-06, "loss": 0.6082, "step": 8789 }, { "epoch": 3.3639494833524686, "grad_norm": 0.5283834338188171, "learning_rate": 5.111327495046686e-06, "loss": 0.6561, "step": 8790 }, { "epoch": 3.3643321852277075, "grad_norm": 0.5597741007804871, "learning_rate": 5.109165099647004e-06, "loss": 0.5951, "step": 8791 }, { "epoch": 3.364714887102947, "grad_norm": 0.5860404968261719, "learning_rate": 5.107003004802267e-06, "loss": 0.5551, "step": 8792 }, { "epoch": 3.365097588978186, "grad_norm": 0.5386548638343811, "learning_rate": 5.104841210645339e-06, "loss": 0.5976, "step": 8793 }, { "epoch": 3.365480290853425, "grad_norm": 0.5632630586624146, "learning_rate": 5.102679717309072e-06, "loss": 0.5946, "step": 8794 }, { "epoch": 3.3658629927286645, "grad_norm": 0.5560896396636963, "learning_rate": 5.10051852492629e-06, "loss": 0.6111, "step": 8795 }, { "epoch": 3.3662456946039034, "grad_norm": 0.5320311784744263, "learning_rate": 5.098357633629812e-06, "loss": 0.631, "step": 8796 }, { "epoch": 3.366628396479143, "grad_norm": 0.5177313685417175, "learning_rate": 5.096197043552422e-06, "loss": 0.6426, "step": 8797 }, { "epoch": 3.367011098354382, "grad_norm": 0.5242767333984375, "learning_rate": 5.094036754826897e-06, "loss": 0.6844, "step": 8798 }, { "epoch": 3.367393800229621, "grad_norm": 0.49343517422676086, "learning_rate": 5.091876767585998e-06, "loss": 0.6121, "step": 8799 }, { "epoch": 3.3677765021048605, "grad_norm": 0.5171264410018921, "learning_rate": 5.089717081962453e-06, "loss": 0.5619, "step": 8800 }, { "epoch": 3.3681592039800994, "grad_norm": 0.5552119612693787, "learning_rate": 5.087557698088985e-06, "loss": 0.6064, "step": 8801 }, { "epoch": 3.3685419058553387, "grad_norm": 0.547504186630249, "learning_rate": 5.085398616098294e-06, "loss": 0.5676, "step": 8802 }, { "epoch": 3.3689246077305777, "grad_norm": 0.5607532858848572, "learning_rate": 5.0832398361230595e-06, "loss": 0.6426, "step": 8803 }, { "epoch": 3.369307309605817, "grad_norm": 0.6493342518806458, "learning_rate": 5.081081358295947e-06, "loss": 0.664, "step": 8804 }, { "epoch": 3.3696900114810564, "grad_norm": 0.5693574547767639, "learning_rate": 5.0789231827496e-06, "loss": 0.6619, "step": 8805 }, { "epoch": 3.3700727133562953, "grad_norm": 0.5413281321525574, "learning_rate": 5.076765309616647e-06, "loss": 0.6242, "step": 8806 }, { "epoch": 3.3704554152315347, "grad_norm": 0.5679946541786194, "learning_rate": 5.074607739029689e-06, "loss": 0.5687, "step": 8807 }, { "epoch": 3.370838117106774, "grad_norm": 0.5549412965774536, "learning_rate": 5.072450471121316e-06, "loss": 0.5775, "step": 8808 }, { "epoch": 3.371220818982013, "grad_norm": 0.5409916639328003, "learning_rate": 5.070293506024104e-06, "loss": 0.6478, "step": 8809 }, { "epoch": 3.3716035208572523, "grad_norm": 0.568661630153656, "learning_rate": 5.0681368438705954e-06, "loss": 0.6677, "step": 8810 }, { "epoch": 3.3719862227324913, "grad_norm": 0.5303263664245605, "learning_rate": 5.065980484793328e-06, "loss": 0.6182, "step": 8811 }, { "epoch": 3.3723689246077306, "grad_norm": 0.5550745129585266, "learning_rate": 5.063824428924818e-06, "loss": 0.6517, "step": 8812 }, { "epoch": 3.3727516264829696, "grad_norm": 0.5726884007453918, "learning_rate": 5.061668676397551e-06, "loss": 0.6038, "step": 8813 }, { "epoch": 3.373134328358209, "grad_norm": 0.5670856237411499, "learning_rate": 5.059513227344014e-06, "loss": 0.6455, "step": 8814 }, { "epoch": 3.3735170302334483, "grad_norm": 0.5710092782974243, "learning_rate": 5.057358081896661e-06, "loss": 0.5867, "step": 8815 }, { "epoch": 3.373899732108687, "grad_norm": 0.5256910920143127, "learning_rate": 5.055203240187938e-06, "loss": 0.6289, "step": 8816 }, { "epoch": 3.3742824339839266, "grad_norm": 0.5162184238433838, "learning_rate": 5.053048702350254e-06, "loss": 0.6131, "step": 8817 }, { "epoch": 3.374665135859166, "grad_norm": 0.50974041223526, "learning_rate": 5.0508944685160184e-06, "loss": 0.6234, "step": 8818 }, { "epoch": 3.375047837734405, "grad_norm": 0.5994487404823303, "learning_rate": 5.048740538817617e-06, "loss": 0.6324, "step": 8819 }, { "epoch": 3.3754305396096442, "grad_norm": 0.5519504547119141, "learning_rate": 5.046586913387408e-06, "loss": 0.5684, "step": 8820 }, { "epoch": 3.375813241484883, "grad_norm": 0.5219932198524475, "learning_rate": 5.0444335923577395e-06, "loss": 0.5972, "step": 8821 }, { "epoch": 3.3761959433601225, "grad_norm": 0.547518789768219, "learning_rate": 5.0422805758609424e-06, "loss": 0.6738, "step": 8822 }, { "epoch": 3.3765786452353614, "grad_norm": 0.5441820025444031, "learning_rate": 5.0401278640293206e-06, "loss": 0.6191, "step": 8823 }, { "epoch": 3.376961347110601, "grad_norm": 0.6414895057678223, "learning_rate": 5.0379754569951635e-06, "loss": 0.5495, "step": 8824 }, { "epoch": 3.37734404898584, "grad_norm": 0.5376620888710022, "learning_rate": 5.035823354890751e-06, "loss": 0.5918, "step": 8825 }, { "epoch": 3.377726750861079, "grad_norm": 0.5285441875457764, "learning_rate": 5.033671557848325e-06, "loss": 0.5911, "step": 8826 }, { "epoch": 3.3781094527363185, "grad_norm": 0.5721601247787476, "learning_rate": 5.031520066000126e-06, "loss": 0.6769, "step": 8827 }, { "epoch": 3.378492154611558, "grad_norm": 0.5502384901046753, "learning_rate": 5.029368879478364e-06, "loss": 0.702, "step": 8828 }, { "epoch": 3.3788748564867968, "grad_norm": 0.5466519594192505, "learning_rate": 5.027217998415244e-06, "loss": 0.5656, "step": 8829 }, { "epoch": 3.379257558362036, "grad_norm": 0.4854561984539032, "learning_rate": 5.025067422942933e-06, "loss": 0.6398, "step": 8830 }, { "epoch": 3.379640260237275, "grad_norm": 0.5177948474884033, "learning_rate": 5.022917153193594e-06, "loss": 0.6155, "step": 8831 }, { "epoch": 3.3800229621125144, "grad_norm": 0.6464068293571472, "learning_rate": 5.020767189299369e-06, "loss": 0.6656, "step": 8832 }, { "epoch": 3.3804056639877533, "grad_norm": 0.5649464130401611, "learning_rate": 5.018617531392376e-06, "loss": 0.6048, "step": 8833 }, { "epoch": 3.3807883658629927, "grad_norm": 0.5248705744743347, "learning_rate": 5.016468179604712e-06, "loss": 0.6047, "step": 8834 }, { "epoch": 3.381171067738232, "grad_norm": 0.5927045941352844, "learning_rate": 5.014319134068475e-06, "loss": 0.6425, "step": 8835 }, { "epoch": 3.381553769613471, "grad_norm": 0.5518525838851929, "learning_rate": 5.012170394915716e-06, "loss": 0.6235, "step": 8836 }, { "epoch": 3.3819364714887103, "grad_norm": 0.5218704342842102, "learning_rate": 5.0100219622784885e-06, "loss": 0.596, "step": 8837 }, { "epoch": 3.3823191733639497, "grad_norm": 0.4997512102127075, "learning_rate": 5.007873836288819e-06, "loss": 0.5646, "step": 8838 }, { "epoch": 3.3827018752391886, "grad_norm": 0.5572733283042908, "learning_rate": 5.005726017078711e-06, "loss": 0.6218, "step": 8839 }, { "epoch": 3.383084577114428, "grad_norm": 0.49161943793296814, "learning_rate": 5.003578504780155e-06, "loss": 0.5741, "step": 8840 }, { "epoch": 3.383467278989667, "grad_norm": 0.5467098951339722, "learning_rate": 5.001431299525123e-06, "loss": 0.6594, "step": 8841 }, { "epoch": 3.3838499808649063, "grad_norm": 0.5324609875679016, "learning_rate": 4.999284401445571e-06, "loss": 0.6011, "step": 8842 }, { "epoch": 3.384232682740145, "grad_norm": 0.5715609788894653, "learning_rate": 4.997137810673422e-06, "loss": 0.5649, "step": 8843 }, { "epoch": 3.3846153846153846, "grad_norm": 0.5884891152381897, "learning_rate": 4.994991527340596e-06, "loss": 0.6247, "step": 8844 }, { "epoch": 3.384998086490624, "grad_norm": 0.5538039207458496, "learning_rate": 4.992845551578986e-06, "loss": 0.6567, "step": 8845 }, { "epoch": 3.385380788365863, "grad_norm": 0.5813185572624207, "learning_rate": 4.990699883520468e-06, "loss": 0.715, "step": 8846 }, { "epoch": 3.3857634902411022, "grad_norm": 0.5360687971115112, "learning_rate": 4.988554523296899e-06, "loss": 0.6295, "step": 8847 }, { "epoch": 3.3861461921163416, "grad_norm": 0.5593990087509155, "learning_rate": 4.986409471040123e-06, "loss": 0.6377, "step": 8848 }, { "epoch": 3.3865288939915805, "grad_norm": 0.53931725025177, "learning_rate": 4.9842647268819485e-06, "loss": 0.5867, "step": 8849 }, { "epoch": 3.38691159586682, "grad_norm": 0.5469239354133606, "learning_rate": 4.9821202909541824e-06, "loss": 0.6534, "step": 8850 }, { "epoch": 3.387294297742059, "grad_norm": 0.5578554272651672, "learning_rate": 4.979976163388609e-06, "loss": 0.6895, "step": 8851 }, { "epoch": 3.387676999617298, "grad_norm": 0.5063869953155518, "learning_rate": 4.977832344316982e-06, "loss": 0.6351, "step": 8852 }, { "epoch": 3.388059701492537, "grad_norm": 0.5502596497535706, "learning_rate": 4.97568883387105e-06, "loss": 0.5951, "step": 8853 }, { "epoch": 3.3884424033677765, "grad_norm": 0.536761462688446, "learning_rate": 4.973545632182537e-06, "loss": 0.5872, "step": 8854 }, { "epoch": 3.388825105243016, "grad_norm": 0.5215675830841064, "learning_rate": 4.971402739383148e-06, "loss": 0.6739, "step": 8855 }, { "epoch": 3.3892078071182548, "grad_norm": 0.6460384130477905, "learning_rate": 4.969260155604569e-06, "loss": 0.7155, "step": 8856 }, { "epoch": 3.389590508993494, "grad_norm": 0.5688791275024414, "learning_rate": 4.96711788097847e-06, "loss": 0.6605, "step": 8857 }, { "epoch": 3.3899732108687335, "grad_norm": 0.5973365902900696, "learning_rate": 4.964975915636502e-06, "loss": 0.6413, "step": 8858 }, { "epoch": 3.3903559127439724, "grad_norm": 0.6486764550209045, "learning_rate": 4.962834259710286e-06, "loss": 0.6411, "step": 8859 }, { "epoch": 3.3907386146192118, "grad_norm": 0.5535121560096741, "learning_rate": 4.960692913331436e-06, "loss": 0.6784, "step": 8860 }, { "epoch": 3.3911213164944507, "grad_norm": 0.5835902094841003, "learning_rate": 4.95855187663155e-06, "loss": 0.7584, "step": 8861 }, { "epoch": 3.39150401836969, "grad_norm": 0.6009443998336792, "learning_rate": 4.956411149742191e-06, "loss": 0.612, "step": 8862 }, { "epoch": 3.391886720244929, "grad_norm": 0.5589229464530945, "learning_rate": 4.954270732794916e-06, "loss": 0.6953, "step": 8863 }, { "epoch": 3.3922694221201684, "grad_norm": 0.5288258194923401, "learning_rate": 4.95213062592126e-06, "loss": 0.7009, "step": 8864 }, { "epoch": 3.3926521239954077, "grad_norm": 0.5262620449066162, "learning_rate": 4.949990829252737e-06, "loss": 0.577, "step": 8865 }, { "epoch": 3.3930348258706466, "grad_norm": 0.570956826210022, "learning_rate": 4.9478513429208455e-06, "loss": 0.5873, "step": 8866 }, { "epoch": 3.393417527745886, "grad_norm": 0.5657492876052856, "learning_rate": 4.945712167057062e-06, "loss": 0.5598, "step": 8867 }, { "epoch": 3.3938002296211254, "grad_norm": 0.5603811740875244, "learning_rate": 4.9435733017928466e-06, "loss": 0.6344, "step": 8868 }, { "epoch": 3.3941829314963643, "grad_norm": 0.5477356910705566, "learning_rate": 4.941434747259632e-06, "loss": 0.6257, "step": 8869 }, { "epoch": 3.3945656333716037, "grad_norm": 0.5183456540107727, "learning_rate": 4.939296503588843e-06, "loss": 0.5609, "step": 8870 }, { "epoch": 3.3949483352468426, "grad_norm": 0.569968581199646, "learning_rate": 4.937158570911883e-06, "loss": 0.6811, "step": 8871 }, { "epoch": 3.395331037122082, "grad_norm": 0.5540138483047485, "learning_rate": 4.935020949360126e-06, "loss": 0.6821, "step": 8872 }, { "epoch": 3.395713738997321, "grad_norm": 0.532691240310669, "learning_rate": 4.932883639064938e-06, "loss": 0.547, "step": 8873 }, { "epoch": 3.3960964408725602, "grad_norm": 0.5454957485198975, "learning_rate": 4.930746640157664e-06, "loss": 0.6967, "step": 8874 }, { "epoch": 3.3964791427477996, "grad_norm": 0.5686455368995667, "learning_rate": 4.928609952769627e-06, "loss": 0.6277, "step": 8875 }, { "epoch": 3.3968618446230385, "grad_norm": 0.5864038467407227, "learning_rate": 4.926473577032133e-06, "loss": 0.6892, "step": 8876 }, { "epoch": 3.397244546498278, "grad_norm": 0.595224142074585, "learning_rate": 4.924337513076472e-06, "loss": 0.5641, "step": 8877 }, { "epoch": 3.3976272483735173, "grad_norm": 0.5392104983329773, "learning_rate": 4.9222017610339025e-06, "loss": 0.5899, "step": 8878 }, { "epoch": 3.398009950248756, "grad_norm": 0.5340175032615662, "learning_rate": 4.920066321035676e-06, "loss": 0.5571, "step": 8879 }, { "epoch": 3.3983926521239955, "grad_norm": 0.5071137547492981, "learning_rate": 4.91793119321302e-06, "loss": 0.5605, "step": 8880 }, { "epoch": 3.3987753539992345, "grad_norm": 0.7216305136680603, "learning_rate": 4.915796377697151e-06, "loss": 0.5734, "step": 8881 }, { "epoch": 3.399158055874474, "grad_norm": 1.181982398033142, "learning_rate": 4.9136618746192485e-06, "loss": 0.6759, "step": 8882 }, { "epoch": 3.3995407577497128, "grad_norm": 0.5335919260978699, "learning_rate": 4.911527684110487e-06, "loss": 0.646, "step": 8883 }, { "epoch": 3.399923459624952, "grad_norm": 0.5294303297996521, "learning_rate": 4.909393806302022e-06, "loss": 0.6292, "step": 8884 }, { "epoch": 3.4003061615001915, "grad_norm": 0.5236298441886902, "learning_rate": 4.907260241324982e-06, "loss": 0.529, "step": 8885 }, { "epoch": 3.4006888633754304, "grad_norm": 0.5252600908279419, "learning_rate": 4.905126989310481e-06, "loss": 0.5947, "step": 8886 }, { "epoch": 3.40107156525067, "grad_norm": 0.5974651575088501, "learning_rate": 4.902994050389619e-06, "loss": 0.7504, "step": 8887 }, { "epoch": 3.401454267125909, "grad_norm": 0.5359563231468201, "learning_rate": 4.900861424693461e-06, "loss": 0.6395, "step": 8888 }, { "epoch": 3.401836969001148, "grad_norm": 0.530234694480896, "learning_rate": 4.898729112353068e-06, "loss": 0.594, "step": 8889 }, { "epoch": 3.4022196708763874, "grad_norm": 0.5578190684318542, "learning_rate": 4.896597113499479e-06, "loss": 0.6948, "step": 8890 }, { "epoch": 3.4026023727516264, "grad_norm": 0.540435791015625, "learning_rate": 4.894465428263704e-06, "loss": 0.5659, "step": 8891 }, { "epoch": 3.4029850746268657, "grad_norm": 0.5276878476142883, "learning_rate": 4.892334056776744e-06, "loss": 0.6344, "step": 8892 }, { "epoch": 3.4033677765021046, "grad_norm": 0.557384192943573, "learning_rate": 4.8902029991695774e-06, "loss": 0.6915, "step": 8893 }, { "epoch": 3.403750478377344, "grad_norm": 0.543910801410675, "learning_rate": 4.888072255573165e-06, "loss": 0.6407, "step": 8894 }, { "epoch": 3.4041331802525834, "grad_norm": 0.5261417627334595, "learning_rate": 4.885941826118446e-06, "loss": 0.6065, "step": 8895 }, { "epoch": 3.4045158821278223, "grad_norm": 0.5480362772941589, "learning_rate": 4.883811710936339e-06, "loss": 0.6573, "step": 8896 }, { "epoch": 3.4048985840030617, "grad_norm": 0.5046383738517761, "learning_rate": 4.8816819101577514e-06, "loss": 0.5419, "step": 8897 }, { "epoch": 3.405281285878301, "grad_norm": 0.5653195381164551, "learning_rate": 4.879552423913557e-06, "loss": 0.5785, "step": 8898 }, { "epoch": 3.40566398775354, "grad_norm": 0.5406000018119812, "learning_rate": 4.877423252334623e-06, "loss": 0.6951, "step": 8899 }, { "epoch": 3.4060466896287793, "grad_norm": 0.5393122434616089, "learning_rate": 4.875294395551795e-06, "loss": 0.6899, "step": 8900 }, { "epoch": 3.4064293915040182, "grad_norm": 0.5147521495819092, "learning_rate": 4.87316585369589e-06, "loss": 0.601, "step": 8901 }, { "epoch": 3.4068120933792576, "grad_norm": 0.5200279355049133, "learning_rate": 4.871037626897716e-06, "loss": 0.5763, "step": 8902 }, { "epoch": 3.4071947952544965, "grad_norm": 0.5503559112548828, "learning_rate": 4.86890971528806e-06, "loss": 0.6906, "step": 8903 }, { "epoch": 3.407577497129736, "grad_norm": 0.5369661450386047, "learning_rate": 4.866782118997686e-06, "loss": 0.5552, "step": 8904 }, { "epoch": 3.4079601990049753, "grad_norm": 0.5469724535942078, "learning_rate": 4.864654838157341e-06, "loss": 0.6157, "step": 8905 }, { "epoch": 3.408342900880214, "grad_norm": 0.5289632081985474, "learning_rate": 4.862527872897752e-06, "loss": 0.6618, "step": 8906 }, { "epoch": 3.4087256027554536, "grad_norm": 0.5287640690803528, "learning_rate": 4.8604012233496315e-06, "loss": 0.5501, "step": 8907 }, { "epoch": 3.409108304630693, "grad_norm": 0.5483160614967346, "learning_rate": 4.858274889643658e-06, "loss": 0.6609, "step": 8908 }, { "epoch": 3.409491006505932, "grad_norm": 0.5341436266899109, "learning_rate": 4.856148871910508e-06, "loss": 0.6188, "step": 8909 }, { "epoch": 3.409873708381171, "grad_norm": 0.6623152494430542, "learning_rate": 4.854023170280831e-06, "loss": 0.7387, "step": 8910 }, { "epoch": 3.41025641025641, "grad_norm": 0.555190920829773, "learning_rate": 4.851897784885251e-06, "loss": 0.6326, "step": 8911 }, { "epoch": 3.4106391121316495, "grad_norm": 0.5380598306655884, "learning_rate": 4.849772715854383e-06, "loss": 0.5553, "step": 8912 }, { "epoch": 3.4110218140068884, "grad_norm": 0.5499697923660278, "learning_rate": 4.847647963318818e-06, "loss": 0.5801, "step": 8913 }, { "epoch": 3.411404515882128, "grad_norm": 0.5373635292053223, "learning_rate": 4.845523527409126e-06, "loss": 0.6468, "step": 8914 }, { "epoch": 3.411787217757367, "grad_norm": 0.4926687777042389, "learning_rate": 4.843399408255861e-06, "loss": 0.6025, "step": 8915 }, { "epoch": 3.412169919632606, "grad_norm": 0.5196709036827087, "learning_rate": 4.841275605989561e-06, "loss": 0.5753, "step": 8916 }, { "epoch": 3.4125526215078454, "grad_norm": 0.532550036907196, "learning_rate": 4.839152120740728e-06, "loss": 0.5352, "step": 8917 }, { "epoch": 3.412935323383085, "grad_norm": 0.521373450756073, "learning_rate": 4.837028952639863e-06, "loss": 0.5611, "step": 8918 }, { "epoch": 3.4133180252583237, "grad_norm": 0.5570446848869324, "learning_rate": 4.8349061018174385e-06, "loss": 0.685, "step": 8919 }, { "epoch": 3.413700727133563, "grad_norm": 0.5670595765113831, "learning_rate": 4.832783568403914e-06, "loss": 0.7096, "step": 8920 }, { "epoch": 3.414083429008802, "grad_norm": 0.6581287384033203, "learning_rate": 4.830661352529717e-06, "loss": 0.6861, "step": 8921 }, { "epoch": 3.4144661308840414, "grad_norm": 0.6003056168556213, "learning_rate": 4.828539454325267e-06, "loss": 0.615, "step": 8922 }, { "epoch": 3.4148488327592803, "grad_norm": 0.5978861451148987, "learning_rate": 4.826417873920965e-06, "loss": 0.6085, "step": 8923 }, { "epoch": 3.4152315346345197, "grad_norm": 0.5676347017288208, "learning_rate": 4.824296611447177e-06, "loss": 0.5402, "step": 8924 }, { "epoch": 3.415614236509759, "grad_norm": 0.5899574756622314, "learning_rate": 4.822175667034268e-06, "loss": 0.5996, "step": 8925 }, { "epoch": 3.415996938384998, "grad_norm": 0.5588691234588623, "learning_rate": 4.82005504081258e-06, "loss": 0.6078, "step": 8926 }, { "epoch": 3.4163796402602373, "grad_norm": 0.4880218207836151, "learning_rate": 4.817934732912422e-06, "loss": 0.5179, "step": 8927 }, { "epoch": 3.4167623421354767, "grad_norm": 0.5855790972709656, "learning_rate": 4.815814743464095e-06, "loss": 0.6649, "step": 8928 }, { "epoch": 3.4171450440107156, "grad_norm": 0.7915591597557068, "learning_rate": 4.8136950725978825e-06, "loss": 0.6684, "step": 8929 }, { "epoch": 3.417527745885955, "grad_norm": 0.5106220841407776, "learning_rate": 4.811575720444038e-06, "loss": 0.6076, "step": 8930 }, { "epoch": 3.417910447761194, "grad_norm": 0.5932120680809021, "learning_rate": 4.809456687132804e-06, "loss": 0.6395, "step": 8931 }, { "epoch": 3.4182931496364333, "grad_norm": 0.5746772289276123, "learning_rate": 4.8073379727944015e-06, "loss": 0.6151, "step": 8932 }, { "epoch": 3.418675851511672, "grad_norm": 0.5325120091438293, "learning_rate": 4.805219577559034e-06, "loss": 0.6731, "step": 8933 }, { "epoch": 3.4190585533869116, "grad_norm": 0.5425044298171997, "learning_rate": 4.803101501556872e-06, "loss": 0.6555, "step": 8934 }, { "epoch": 3.419441255262151, "grad_norm": 0.53909832239151, "learning_rate": 4.800983744918086e-06, "loss": 0.5839, "step": 8935 }, { "epoch": 3.41982395713739, "grad_norm": 0.5521237254142761, "learning_rate": 4.798866307772822e-06, "loss": 0.6582, "step": 8936 }, { "epoch": 3.420206659012629, "grad_norm": 0.5837801098823547, "learning_rate": 4.796749190251192e-06, "loss": 0.6532, "step": 8937 }, { "epoch": 3.4205893608878686, "grad_norm": 0.5565385222434998, "learning_rate": 4.794632392483301e-06, "loss": 0.5716, "step": 8938 }, { "epoch": 3.4209720627631075, "grad_norm": 0.5550360083580017, "learning_rate": 4.79251591459924e-06, "loss": 0.6725, "step": 8939 }, { "epoch": 3.421354764638347, "grad_norm": 0.5634509325027466, "learning_rate": 4.79039975672906e-06, "loss": 0.5814, "step": 8940 }, { "epoch": 3.421737466513586, "grad_norm": 0.5875561833381653, "learning_rate": 4.7882839190028115e-06, "loss": 0.5986, "step": 8941 }, { "epoch": 3.422120168388825, "grad_norm": 0.5820155143737793, "learning_rate": 4.7861684015505215e-06, "loss": 0.6395, "step": 8942 }, { "epoch": 3.422502870264064, "grad_norm": 0.5268713235855103, "learning_rate": 4.784053204502186e-06, "loss": 0.6297, "step": 8943 }, { "epoch": 3.4228855721393034, "grad_norm": 0.5265344977378845, "learning_rate": 4.781938327987793e-06, "loss": 0.5975, "step": 8944 }, { "epoch": 3.423268274014543, "grad_norm": 0.5380074977874756, "learning_rate": 4.7798237721373055e-06, "loss": 0.6566, "step": 8945 }, { "epoch": 3.4236509758897817, "grad_norm": 0.5014985799789429, "learning_rate": 4.777709537080678e-06, "loss": 0.5789, "step": 8946 }, { "epoch": 3.424033677765021, "grad_norm": 0.5550546646118164, "learning_rate": 4.775595622947825e-06, "loss": 0.6314, "step": 8947 }, { "epoch": 3.4244163796402605, "grad_norm": 0.5562630891799927, "learning_rate": 4.773482029868657e-06, "loss": 0.6442, "step": 8948 }, { "epoch": 3.4247990815154994, "grad_norm": 0.5636210441589355, "learning_rate": 4.771368757973062e-06, "loss": 0.6862, "step": 8949 }, { "epoch": 3.4251817833907388, "grad_norm": 0.5391055941581726, "learning_rate": 4.7692558073909e-06, "loss": 0.6204, "step": 8950 }, { "epoch": 3.4255644852659777, "grad_norm": 0.5157000422477722, "learning_rate": 4.767143178252021e-06, "loss": 0.6678, "step": 8951 }, { "epoch": 3.425947187141217, "grad_norm": 0.618902862071991, "learning_rate": 4.765030870686257e-06, "loss": 0.6388, "step": 8952 }, { "epoch": 3.426329889016456, "grad_norm": 0.531413197517395, "learning_rate": 4.762918884823404e-06, "loss": 0.5864, "step": 8953 }, { "epoch": 3.4267125908916953, "grad_norm": 0.5452973246574402, "learning_rate": 4.760807220793255e-06, "loss": 0.5607, "step": 8954 }, { "epoch": 3.4270952927669347, "grad_norm": 0.5181421637535095, "learning_rate": 4.758695878725579e-06, "loss": 0.5769, "step": 8955 }, { "epoch": 3.4274779946421736, "grad_norm": 0.5826393365859985, "learning_rate": 4.756584858750121e-06, "loss": 0.5703, "step": 8956 }, { "epoch": 3.427860696517413, "grad_norm": 0.6708411574363708, "learning_rate": 4.754474160996609e-06, "loss": 0.5631, "step": 8957 }, { "epoch": 3.4282433983926524, "grad_norm": 0.5326642394065857, "learning_rate": 4.752363785594757e-06, "loss": 0.5559, "step": 8958 }, { "epoch": 3.4286261002678913, "grad_norm": 0.5357866287231445, "learning_rate": 4.750253732674243e-06, "loss": 0.5963, "step": 8959 }, { "epoch": 3.4290088021431306, "grad_norm": 0.6029236316680908, "learning_rate": 4.748144002364742e-06, "loss": 0.6329, "step": 8960 }, { "epoch": 3.4293915040183696, "grad_norm": 0.5388456583023071, "learning_rate": 4.7460345947959e-06, "loss": 0.6461, "step": 8961 }, { "epoch": 3.429774205893609, "grad_norm": 0.6534820199012756, "learning_rate": 4.74392551009735e-06, "loss": 0.7113, "step": 8962 }, { "epoch": 3.430156907768848, "grad_norm": 0.5222404599189758, "learning_rate": 4.741816748398695e-06, "loss": 0.5528, "step": 8963 }, { "epoch": 3.430539609644087, "grad_norm": 0.5835378766059875, "learning_rate": 4.739708309829527e-06, "loss": 0.6748, "step": 8964 }, { "epoch": 3.4309223115193266, "grad_norm": 0.5187358260154724, "learning_rate": 4.737600194519415e-06, "loss": 0.5632, "step": 8965 }, { "epoch": 3.4313050133945655, "grad_norm": 0.5452395677566528, "learning_rate": 4.735492402597909e-06, "loss": 0.6634, "step": 8966 }, { "epoch": 3.431687715269805, "grad_norm": 0.5411945581436157, "learning_rate": 4.733384934194537e-06, "loss": 0.6396, "step": 8967 }, { "epoch": 3.4320704171450442, "grad_norm": 0.5606086850166321, "learning_rate": 4.731277789438813e-06, "loss": 0.6165, "step": 8968 }, { "epoch": 3.432453119020283, "grad_norm": 0.48900359869003296, "learning_rate": 4.72917096846022e-06, "loss": 0.6146, "step": 8969 }, { "epoch": 3.4328358208955225, "grad_norm": 0.5317371487617493, "learning_rate": 4.72706447138823e-06, "loss": 0.5741, "step": 8970 }, { "epoch": 3.4332185227707614, "grad_norm": 0.5581058859825134, "learning_rate": 4.7249582983522994e-06, "loss": 0.5878, "step": 8971 }, { "epoch": 3.433601224646001, "grad_norm": 0.5566272139549255, "learning_rate": 4.722852449481848e-06, "loss": 0.6493, "step": 8972 }, { "epoch": 3.4339839265212397, "grad_norm": 0.5118004083633423, "learning_rate": 4.720746924906291e-06, "loss": 0.567, "step": 8973 }, { "epoch": 3.434366628396479, "grad_norm": 0.5477319359779358, "learning_rate": 4.718641724755017e-06, "loss": 0.6447, "step": 8974 }, { "epoch": 3.4347493302717185, "grad_norm": 0.5355347394943237, "learning_rate": 4.7165368491574e-06, "loss": 0.5866, "step": 8975 }, { "epoch": 3.4351320321469574, "grad_norm": 0.5473859310150146, "learning_rate": 4.714432298242784e-06, "loss": 0.5987, "step": 8976 }, { "epoch": 3.4355147340221968, "grad_norm": 0.5388268232345581, "learning_rate": 4.712328072140505e-06, "loss": 0.5775, "step": 8977 }, { "epoch": 3.435897435897436, "grad_norm": 0.5808721780776978, "learning_rate": 4.710224170979876e-06, "loss": 0.5919, "step": 8978 }, { "epoch": 3.436280137772675, "grad_norm": 0.5152506828308105, "learning_rate": 4.708120594890176e-06, "loss": 0.5615, "step": 8979 }, { "epoch": 3.4366628396479144, "grad_norm": 0.5441275835037231, "learning_rate": 4.706017344000684e-06, "loss": 0.5561, "step": 8980 }, { "epoch": 3.4370455415231533, "grad_norm": 0.5642613768577576, "learning_rate": 4.703914418440652e-06, "loss": 0.5847, "step": 8981 }, { "epoch": 3.4374282433983927, "grad_norm": 0.566473662853241, "learning_rate": 4.7018118183393035e-06, "loss": 0.6194, "step": 8982 }, { "epoch": 3.4378109452736316, "grad_norm": 0.5738710165023804, "learning_rate": 4.699709543825851e-06, "loss": 0.659, "step": 8983 }, { "epoch": 3.438193647148871, "grad_norm": 0.5138322710990906, "learning_rate": 4.697607595029487e-06, "loss": 0.6168, "step": 8984 }, { "epoch": 3.4385763490241104, "grad_norm": 0.5460842251777649, "learning_rate": 4.695505972079382e-06, "loss": 0.6869, "step": 8985 }, { "epoch": 3.4389590508993493, "grad_norm": 0.5281000137329102, "learning_rate": 4.693404675104684e-06, "loss": 0.6327, "step": 8986 }, { "epoch": 3.4393417527745886, "grad_norm": 0.6526610851287842, "learning_rate": 4.691303704234526e-06, "loss": 0.532, "step": 8987 }, { "epoch": 3.439724454649828, "grad_norm": 0.5445467829704285, "learning_rate": 4.68920305959802e-06, "loss": 0.5813, "step": 8988 }, { "epoch": 3.440107156525067, "grad_norm": 0.5255109667778015, "learning_rate": 4.6871027413242485e-06, "loss": 0.602, "step": 8989 }, { "epoch": 3.4404898584003063, "grad_norm": 0.5441023111343384, "learning_rate": 4.685002749542289e-06, "loss": 0.6476, "step": 8990 }, { "epoch": 3.4408725602755452, "grad_norm": 0.5499892234802246, "learning_rate": 4.682903084381192e-06, "loss": 0.6502, "step": 8991 }, { "epoch": 3.4412552621507846, "grad_norm": 0.5310178995132446, "learning_rate": 4.68080374596998e-06, "loss": 0.575, "step": 8992 }, { "epoch": 3.4416379640260235, "grad_norm": 0.5469740033149719, "learning_rate": 4.678704734437669e-06, "loss": 0.6385, "step": 8993 }, { "epoch": 3.442020665901263, "grad_norm": 0.5235070586204529, "learning_rate": 4.676606049913247e-06, "loss": 0.5472, "step": 8994 }, { "epoch": 3.4424033677765022, "grad_norm": 0.5447333455085754, "learning_rate": 4.674507692525685e-06, "loss": 0.6369, "step": 8995 }, { "epoch": 3.442786069651741, "grad_norm": 0.5663597583770752, "learning_rate": 4.672409662403933e-06, "loss": 0.6358, "step": 8996 }, { "epoch": 3.4431687715269805, "grad_norm": 0.5332692265510559, "learning_rate": 4.670311959676922e-06, "loss": 0.5816, "step": 8997 }, { "epoch": 3.44355147340222, "grad_norm": 0.543953001499176, "learning_rate": 4.668214584473556e-06, "loss": 0.625, "step": 8998 }, { "epoch": 3.443934175277459, "grad_norm": 0.5799727439880371, "learning_rate": 4.666117536922727e-06, "loss": 0.5281, "step": 8999 }, { "epoch": 3.444316877152698, "grad_norm": 0.5459615588188171, "learning_rate": 4.664020817153306e-06, "loss": 0.6229, "step": 9000 }, { "epoch": 3.444699579027937, "grad_norm": 0.5530115962028503, "learning_rate": 4.661924425294144e-06, "loss": 0.5791, "step": 9001 }, { "epoch": 3.4450822809031765, "grad_norm": 0.5774402022361755, "learning_rate": 4.659828361474063e-06, "loss": 0.6045, "step": 9002 }, { "epoch": 3.4454649827784154, "grad_norm": 0.5479900240898132, "learning_rate": 4.657732625821876e-06, "loss": 0.6093, "step": 9003 }, { "epoch": 3.4458476846536548, "grad_norm": 0.5790103077888489, "learning_rate": 4.6556372184663705e-06, "loss": 0.6032, "step": 9004 }, { "epoch": 3.446230386528894, "grad_norm": 0.5267292261123657, "learning_rate": 4.653542139536316e-06, "loss": 0.5742, "step": 9005 }, { "epoch": 3.446613088404133, "grad_norm": 0.58527010679245, "learning_rate": 4.6514473891604584e-06, "loss": 0.6655, "step": 9006 }, { "epoch": 3.4469957902793724, "grad_norm": 0.5237046480178833, "learning_rate": 4.6493529674675335e-06, "loss": 0.5851, "step": 9007 }, { "epoch": 3.447378492154612, "grad_norm": 0.5034927725791931, "learning_rate": 4.647258874586238e-06, "loss": 0.5976, "step": 9008 }, { "epoch": 3.4477611940298507, "grad_norm": 0.5626207590103149, "learning_rate": 4.645165110645265e-06, "loss": 0.6918, "step": 9009 }, { "epoch": 3.44814389590509, "grad_norm": 0.5659066438674927, "learning_rate": 4.643071675773286e-06, "loss": 0.6176, "step": 9010 }, { "epoch": 3.448526597780329, "grad_norm": 0.5522670745849609, "learning_rate": 4.64097857009894e-06, "loss": 0.6516, "step": 9011 }, { "epoch": 3.4489092996555684, "grad_norm": 0.566267728805542, "learning_rate": 4.638885793750858e-06, "loss": 0.6099, "step": 9012 }, { "epoch": 3.4492920015308073, "grad_norm": 0.5603529810905457, "learning_rate": 4.636793346857647e-06, "loss": 0.6272, "step": 9013 }, { "epoch": 3.4496747034060466, "grad_norm": 0.5555288791656494, "learning_rate": 4.634701229547892e-06, "loss": 0.6484, "step": 9014 }, { "epoch": 3.450057405281286, "grad_norm": 0.542690634727478, "learning_rate": 4.632609441950161e-06, "loss": 0.6077, "step": 9015 }, { "epoch": 3.450440107156525, "grad_norm": 0.5288034081459045, "learning_rate": 4.630517984192998e-06, "loss": 0.6035, "step": 9016 }, { "epoch": 3.4508228090317643, "grad_norm": 0.548115074634552, "learning_rate": 4.628426856404935e-06, "loss": 0.5747, "step": 9017 }, { "epoch": 3.4512055109070037, "grad_norm": 0.5350794196128845, "learning_rate": 4.626336058714468e-06, "loss": 0.6183, "step": 9018 }, { "epoch": 3.4515882127822426, "grad_norm": 0.5567619204521179, "learning_rate": 4.624245591250087e-06, "loss": 0.5668, "step": 9019 }, { "epoch": 3.451970914657482, "grad_norm": 0.5558483600616455, "learning_rate": 4.622155454140259e-06, "loss": 0.6355, "step": 9020 }, { "epoch": 3.452353616532721, "grad_norm": 0.5145325064659119, "learning_rate": 4.620065647513423e-06, "loss": 0.5718, "step": 9021 }, { "epoch": 3.4527363184079602, "grad_norm": 0.5680913925170898, "learning_rate": 4.617976171498007e-06, "loss": 0.6453, "step": 9022 }, { "epoch": 3.453119020283199, "grad_norm": 0.5900663137435913, "learning_rate": 4.615887026222415e-06, "loss": 0.6295, "step": 9023 }, { "epoch": 3.4535017221584385, "grad_norm": 0.5865111947059631, "learning_rate": 4.613798211815025e-06, "loss": 0.6218, "step": 9024 }, { "epoch": 3.453884424033678, "grad_norm": 0.5954302549362183, "learning_rate": 4.611709728404207e-06, "loss": 0.6446, "step": 9025 }, { "epoch": 3.454267125908917, "grad_norm": 0.5488958954811096, "learning_rate": 4.609621576118302e-06, "loss": 0.7053, "step": 9026 }, { "epoch": 3.454649827784156, "grad_norm": 0.5570497512817383, "learning_rate": 4.607533755085636e-06, "loss": 0.6264, "step": 9027 }, { "epoch": 3.4550325296593956, "grad_norm": 0.5588634014129639, "learning_rate": 4.605446265434505e-06, "loss": 0.638, "step": 9028 }, { "epoch": 3.4554152315346345, "grad_norm": 0.5747649073600769, "learning_rate": 4.603359107293192e-06, "loss": 0.7207, "step": 9029 }, { "epoch": 3.455797933409874, "grad_norm": 0.5655427575111389, "learning_rate": 4.601272280789964e-06, "loss": 0.6076, "step": 9030 }, { "epoch": 3.4561806352851128, "grad_norm": 0.5159870386123657, "learning_rate": 4.599185786053055e-06, "loss": 0.6118, "step": 9031 }, { "epoch": 3.456563337160352, "grad_norm": 0.5111024379730225, "learning_rate": 4.597099623210687e-06, "loss": 0.6221, "step": 9032 }, { "epoch": 3.456946039035591, "grad_norm": 0.5731357336044312, "learning_rate": 4.595013792391067e-06, "loss": 0.6732, "step": 9033 }, { "epoch": 3.4573287409108304, "grad_norm": 0.5277714133262634, "learning_rate": 4.592928293722362e-06, "loss": 0.6499, "step": 9034 }, { "epoch": 3.45771144278607, "grad_norm": 0.5061019062995911, "learning_rate": 4.590843127332744e-06, "loss": 0.6756, "step": 9035 }, { "epoch": 3.4580941446613087, "grad_norm": 0.5759908556938171, "learning_rate": 4.588758293350349e-06, "loss": 0.6319, "step": 9036 }, { "epoch": 3.458476846536548, "grad_norm": 0.5311617851257324, "learning_rate": 4.586673791903292e-06, "loss": 0.6505, "step": 9037 }, { "epoch": 3.4588595484117874, "grad_norm": 0.4830232560634613, "learning_rate": 4.584589623119674e-06, "loss": 0.5884, "step": 9038 }, { "epoch": 3.4592422502870264, "grad_norm": 0.5338715314865112, "learning_rate": 4.582505787127571e-06, "loss": 0.6699, "step": 9039 }, { "epoch": 3.4596249521622657, "grad_norm": 0.5909689664840698, "learning_rate": 4.580422284055046e-06, "loss": 0.5842, "step": 9040 }, { "epoch": 3.4600076540375047, "grad_norm": 0.5057592391967773, "learning_rate": 4.578339114030127e-06, "loss": 0.6062, "step": 9041 }, { "epoch": 3.460390355912744, "grad_norm": 0.5230502486228943, "learning_rate": 4.576256277180834e-06, "loss": 0.5926, "step": 9042 }, { "epoch": 3.460773057787983, "grad_norm": 0.5084019303321838, "learning_rate": 4.574173773635168e-06, "loss": 0.5631, "step": 9043 }, { "epoch": 3.4611557596632223, "grad_norm": 0.4776516556739807, "learning_rate": 4.572091603521096e-06, "loss": 0.6027, "step": 9044 }, { "epoch": 3.4615384615384617, "grad_norm": 0.5880131721496582, "learning_rate": 4.570009766966574e-06, "loss": 0.6385, "step": 9045 }, { "epoch": 3.4619211634137006, "grad_norm": 0.543931245803833, "learning_rate": 4.567928264099547e-06, "loss": 0.6898, "step": 9046 }, { "epoch": 3.46230386528894, "grad_norm": 0.53757643699646, "learning_rate": 4.565847095047917e-06, "loss": 0.6076, "step": 9047 }, { "epoch": 3.4626865671641793, "grad_norm": 0.5447434186935425, "learning_rate": 4.563766259939581e-06, "loss": 0.6268, "step": 9048 }, { "epoch": 3.4630692690394183, "grad_norm": 0.5532149076461792, "learning_rate": 4.561685758902418e-06, "loss": 0.6817, "step": 9049 }, { "epoch": 3.4634519709146576, "grad_norm": 0.5292133092880249, "learning_rate": 4.5596055920642715e-06, "loss": 0.556, "step": 9050 }, { "epoch": 3.4638346727898965, "grad_norm": 0.5370820164680481, "learning_rate": 4.557525759552975e-06, "loss": 0.7271, "step": 9051 }, { "epoch": 3.464217374665136, "grad_norm": 0.573383629322052, "learning_rate": 4.555446261496343e-06, "loss": 0.6953, "step": 9052 }, { "epoch": 3.464600076540375, "grad_norm": 0.5450612902641296, "learning_rate": 4.553367098022169e-06, "loss": 0.5524, "step": 9053 }, { "epoch": 3.464982778415614, "grad_norm": 0.5217882990837097, "learning_rate": 4.551288269258215e-06, "loss": 0.5721, "step": 9054 }, { "epoch": 3.4653654802908536, "grad_norm": 0.5111291408538818, "learning_rate": 4.549209775332235e-06, "loss": 0.6701, "step": 9055 }, { "epoch": 3.4657481821660925, "grad_norm": 0.5424990057945251, "learning_rate": 4.547131616371957e-06, "loss": 0.7005, "step": 9056 }, { "epoch": 3.466130884041332, "grad_norm": 0.5290237069129944, "learning_rate": 4.545053792505091e-06, "loss": 0.5546, "step": 9057 }, { "epoch": 3.466513585916571, "grad_norm": 0.5395734906196594, "learning_rate": 4.542976303859323e-06, "loss": 0.5777, "step": 9058 }, { "epoch": 3.46689628779181, "grad_norm": 0.5480394959449768, "learning_rate": 4.540899150562327e-06, "loss": 0.6458, "step": 9059 }, { "epoch": 3.4672789896670495, "grad_norm": 0.5274195075035095, "learning_rate": 4.538822332741739e-06, "loss": 0.5945, "step": 9060 }, { "epoch": 3.4676616915422884, "grad_norm": 0.5856204032897949, "learning_rate": 4.53674585052519e-06, "loss": 0.6682, "step": 9061 }, { "epoch": 3.468044393417528, "grad_norm": 0.5325820446014404, "learning_rate": 4.53466970404029e-06, "loss": 0.5764, "step": 9062 }, { "epoch": 3.4684270952927667, "grad_norm": 0.5130486488342285, "learning_rate": 4.532593893414615e-06, "loss": 0.6712, "step": 9063 }, { "epoch": 3.468809797168006, "grad_norm": 0.5533526539802551, "learning_rate": 4.530518418775734e-06, "loss": 0.6372, "step": 9064 }, { "epoch": 3.4691924990432454, "grad_norm": 0.5338895916938782, "learning_rate": 4.528443280251189e-06, "loss": 0.6236, "step": 9065 }, { "epoch": 3.4695752009184844, "grad_norm": 0.5342287421226501, "learning_rate": 4.526368477968504e-06, "loss": 0.637, "step": 9066 }, { "epoch": 3.4699579027937237, "grad_norm": 0.5238841772079468, "learning_rate": 4.524294012055181e-06, "loss": 0.5981, "step": 9067 }, { "epoch": 3.470340604668963, "grad_norm": 0.5714090466499329, "learning_rate": 4.5222198826387005e-06, "loss": 0.6102, "step": 9068 }, { "epoch": 3.470723306544202, "grad_norm": 0.5662676095962524, "learning_rate": 4.520146089846529e-06, "loss": 0.6145, "step": 9069 }, { "epoch": 3.4711060084194414, "grad_norm": 0.5394870042800903, "learning_rate": 4.518072633806098e-06, "loss": 0.6724, "step": 9070 }, { "epoch": 3.4714887102946803, "grad_norm": 0.5605854392051697, "learning_rate": 4.515999514644831e-06, "loss": 0.6761, "step": 9071 }, { "epoch": 3.4718714121699197, "grad_norm": 0.5557815432548523, "learning_rate": 4.513926732490129e-06, "loss": 0.6447, "step": 9072 }, { "epoch": 3.4722541140451586, "grad_norm": 0.5323126912117004, "learning_rate": 4.511854287469365e-06, "loss": 0.5573, "step": 9073 }, { "epoch": 3.472636815920398, "grad_norm": 0.5853457450866699, "learning_rate": 4.509782179709899e-06, "loss": 0.652, "step": 9074 }, { "epoch": 3.4730195177956373, "grad_norm": 0.5544766187667847, "learning_rate": 4.5077104093390665e-06, "loss": 0.7081, "step": 9075 }, { "epoch": 3.4734022196708763, "grad_norm": 0.534389078617096, "learning_rate": 4.505638976484186e-06, "loss": 0.5892, "step": 9076 }, { "epoch": 3.4737849215461156, "grad_norm": 0.5468797087669373, "learning_rate": 4.503567881272549e-06, "loss": 0.6386, "step": 9077 }, { "epoch": 3.474167623421355, "grad_norm": 0.5413761138916016, "learning_rate": 4.501497123831433e-06, "loss": 0.6037, "step": 9078 }, { "epoch": 3.474550325296594, "grad_norm": 0.504989743232727, "learning_rate": 4.499426704288094e-06, "loss": 0.588, "step": 9079 }, { "epoch": 3.4749330271718333, "grad_norm": 0.5365326404571533, "learning_rate": 4.497356622769757e-06, "loss": 0.5776, "step": 9080 }, { "epoch": 3.475315729047072, "grad_norm": 0.5420194268226624, "learning_rate": 4.495286879403639e-06, "loss": 0.6404, "step": 9081 }, { "epoch": 3.4756984309223116, "grad_norm": 0.5065795183181763, "learning_rate": 4.493217474316934e-06, "loss": 0.5135, "step": 9082 }, { "epoch": 3.4760811327975505, "grad_norm": 0.5248684287071228, "learning_rate": 4.491148407636805e-06, "loss": 0.5942, "step": 9083 }, { "epoch": 3.47646383467279, "grad_norm": 0.5476508736610413, "learning_rate": 4.489079679490407e-06, "loss": 0.7068, "step": 9084 }, { "epoch": 3.476846536548029, "grad_norm": 0.5472608208656311, "learning_rate": 4.487011290004867e-06, "loss": 0.6565, "step": 9085 }, { "epoch": 3.477229238423268, "grad_norm": 0.5121553540229797, "learning_rate": 4.484943239307293e-06, "loss": 0.5814, "step": 9086 }, { "epoch": 3.4776119402985075, "grad_norm": 0.5369265079498291, "learning_rate": 4.482875527524774e-06, "loss": 0.5558, "step": 9087 }, { "epoch": 3.477994642173747, "grad_norm": 0.5948525667190552, "learning_rate": 4.48080815478438e-06, "loss": 0.6558, "step": 9088 }, { "epoch": 3.478377344048986, "grad_norm": 0.5271750092506409, "learning_rate": 4.478741121213147e-06, "loss": 0.5206, "step": 9089 }, { "epoch": 3.478760045924225, "grad_norm": 0.548861026763916, "learning_rate": 4.476674426938104e-06, "loss": 0.6684, "step": 9090 }, { "epoch": 3.479142747799464, "grad_norm": 0.5336481928825378, "learning_rate": 4.474608072086258e-06, "loss": 0.5246, "step": 9091 }, { "epoch": 3.4795254496747035, "grad_norm": 0.6002518534660339, "learning_rate": 4.472542056784593e-06, "loss": 0.6648, "step": 9092 }, { "epoch": 3.4799081515499424, "grad_norm": 0.5908930897712708, "learning_rate": 4.470476381160065e-06, "loss": 0.5746, "step": 9093 }, { "epoch": 3.4802908534251817, "grad_norm": 0.5865207314491272, "learning_rate": 4.468411045339617e-06, "loss": 0.6683, "step": 9094 }, { "epoch": 3.480673555300421, "grad_norm": 0.6228679418563843, "learning_rate": 4.466346049450171e-06, "loss": 0.6309, "step": 9095 }, { "epoch": 3.48105625717566, "grad_norm": 0.6996047496795654, "learning_rate": 4.464281393618627e-06, "loss": 0.6426, "step": 9096 }, { "epoch": 3.4814389590508994, "grad_norm": 0.5432799458503723, "learning_rate": 4.462217077971863e-06, "loss": 0.6226, "step": 9097 }, { "epoch": 3.4818216609261388, "grad_norm": 0.5519952178001404, "learning_rate": 4.4601531026367405e-06, "loss": 0.6839, "step": 9098 }, { "epoch": 3.4822043628013777, "grad_norm": 0.5847206115722656, "learning_rate": 4.458089467740089e-06, "loss": 0.6611, "step": 9099 }, { "epoch": 3.482587064676617, "grad_norm": 0.5307133197784424, "learning_rate": 4.4560261734087274e-06, "loss": 0.6172, "step": 9100 }, { "epoch": 3.482969766551856, "grad_norm": 0.5710432529449463, "learning_rate": 4.453963219769455e-06, "loss": 0.6336, "step": 9101 }, { "epoch": 3.4833524684270953, "grad_norm": 1.0180174112319946, "learning_rate": 4.451900606949039e-06, "loss": 0.5807, "step": 9102 }, { "epoch": 3.4837351703023343, "grad_norm": 0.5362034440040588, "learning_rate": 4.449838335074235e-06, "loss": 0.718, "step": 9103 }, { "epoch": 3.4841178721775736, "grad_norm": 0.5084841251373291, "learning_rate": 4.447776404271775e-06, "loss": 0.5929, "step": 9104 }, { "epoch": 3.484500574052813, "grad_norm": 0.5462340116500854, "learning_rate": 4.445714814668373e-06, "loss": 0.6031, "step": 9105 }, { "epoch": 3.484883275928052, "grad_norm": 0.7242419719696045, "learning_rate": 4.443653566390716e-06, "loss": 0.6058, "step": 9106 }, { "epoch": 3.4852659778032913, "grad_norm": 0.5354925394058228, "learning_rate": 4.441592659565475e-06, "loss": 0.595, "step": 9107 }, { "epoch": 3.4856486796785306, "grad_norm": 0.5723905563354492, "learning_rate": 4.4395320943193e-06, "loss": 0.6054, "step": 9108 }, { "epoch": 3.4860313815537696, "grad_norm": 0.6077447533607483, "learning_rate": 4.437471870778813e-06, "loss": 0.623, "step": 9109 }, { "epoch": 3.486414083429009, "grad_norm": 0.5507403016090393, "learning_rate": 4.435411989070624e-06, "loss": 0.6269, "step": 9110 }, { "epoch": 3.486796785304248, "grad_norm": 0.560336709022522, "learning_rate": 4.43335244932132e-06, "loss": 0.6289, "step": 9111 }, { "epoch": 3.4871794871794872, "grad_norm": 0.5451111793518066, "learning_rate": 4.431293251657459e-06, "loss": 0.6557, "step": 9112 }, { "epoch": 3.487562189054726, "grad_norm": 0.513099730014801, "learning_rate": 4.42923439620559e-06, "loss": 0.6806, "step": 9113 }, { "epoch": 3.4879448909299655, "grad_norm": 0.5368736386299133, "learning_rate": 4.4271758830922305e-06, "loss": 0.5677, "step": 9114 }, { "epoch": 3.488327592805205, "grad_norm": 0.5549760460853577, "learning_rate": 4.425117712443886e-06, "loss": 0.5689, "step": 9115 }, { "epoch": 3.488710294680444, "grad_norm": 0.49858370423316956, "learning_rate": 4.4230598843870346e-06, "loss": 0.5791, "step": 9116 }, { "epoch": 3.489092996555683, "grad_norm": 0.5804941058158875, "learning_rate": 4.421002399048136e-06, "loss": 0.6369, "step": 9117 }, { "epoch": 3.4894756984309225, "grad_norm": 0.5565260052680969, "learning_rate": 4.418945256553631e-06, "loss": 0.7056, "step": 9118 }, { "epoch": 3.4898584003061615, "grad_norm": 0.574406623840332, "learning_rate": 4.41688845702993e-06, "loss": 0.6338, "step": 9119 }, { "epoch": 3.490241102181401, "grad_norm": 0.5399852991104126, "learning_rate": 4.414832000603432e-06, "loss": 0.6246, "step": 9120 }, { "epoch": 3.4906238040566397, "grad_norm": 0.523627519607544, "learning_rate": 4.412775887400516e-06, "loss": 0.6292, "step": 9121 }, { "epoch": 3.491006505931879, "grad_norm": 0.5163648128509521, "learning_rate": 4.410720117547528e-06, "loss": 0.6504, "step": 9122 }, { "epoch": 3.491389207807118, "grad_norm": 0.5153742432594299, "learning_rate": 4.408664691170803e-06, "loss": 0.5528, "step": 9123 }, { "epoch": 3.4917719096823574, "grad_norm": 0.5319254994392395, "learning_rate": 4.406609608396656e-06, "loss": 0.6549, "step": 9124 }, { "epoch": 3.4921546115575968, "grad_norm": 0.5345211029052734, "learning_rate": 4.404554869351374e-06, "loss": 0.6489, "step": 9125 }, { "epoch": 3.4925373134328357, "grad_norm": 0.5430510640144348, "learning_rate": 4.402500474161227e-06, "loss": 0.5262, "step": 9126 }, { "epoch": 3.492920015308075, "grad_norm": 0.5566636919975281, "learning_rate": 4.400446422952468e-06, "loss": 0.565, "step": 9127 }, { "epoch": 3.4933027171833144, "grad_norm": 0.5478867292404175, "learning_rate": 4.3983927158513164e-06, "loss": 0.687, "step": 9128 }, { "epoch": 3.4936854190585533, "grad_norm": 0.5421234965324402, "learning_rate": 4.396339352983979e-06, "loss": 0.5481, "step": 9129 }, { "epoch": 3.4940681209337927, "grad_norm": 0.5649966597557068, "learning_rate": 4.394286334476644e-06, "loss": 0.6363, "step": 9130 }, { "epoch": 3.4944508228090316, "grad_norm": 0.5660368800163269, "learning_rate": 4.392233660455477e-06, "loss": 0.6422, "step": 9131 }, { "epoch": 3.494833524684271, "grad_norm": 0.5738133788108826, "learning_rate": 4.390181331046613e-06, "loss": 0.5749, "step": 9132 }, { "epoch": 3.49521622655951, "grad_norm": 0.503998339176178, "learning_rate": 4.388129346376177e-06, "loss": 0.5605, "step": 9133 }, { "epoch": 3.4955989284347493, "grad_norm": 0.5359538793563843, "learning_rate": 4.386077706570269e-06, "loss": 0.5959, "step": 9134 }, { "epoch": 3.4959816303099887, "grad_norm": 0.6117579340934753, "learning_rate": 4.384026411754967e-06, "loss": 0.6385, "step": 9135 }, { "epoch": 3.4963643321852276, "grad_norm": 0.538139283657074, "learning_rate": 4.38197546205633e-06, "loss": 0.6136, "step": 9136 }, { "epoch": 3.496747034060467, "grad_norm": 0.5439583659172058, "learning_rate": 4.379924857600397e-06, "loss": 0.6263, "step": 9137 }, { "epoch": 3.4971297359357063, "grad_norm": 0.4872678220272064, "learning_rate": 4.377874598513176e-06, "loss": 0.4978, "step": 9138 }, { "epoch": 3.4975124378109452, "grad_norm": 0.5956224203109741, "learning_rate": 4.375824684920665e-06, "loss": 0.6483, "step": 9139 }, { "epoch": 3.4978951396861846, "grad_norm": 0.5791977047920227, "learning_rate": 4.37377511694884e-06, "loss": 0.6547, "step": 9140 }, { "epoch": 3.4982778415614235, "grad_norm": 0.5514206290245056, "learning_rate": 4.371725894723645e-06, "loss": 0.6776, "step": 9141 }, { "epoch": 3.498660543436663, "grad_norm": 0.5337388515472412, "learning_rate": 4.369677018371015e-06, "loss": 0.6253, "step": 9142 }, { "epoch": 3.499043245311902, "grad_norm": 0.518513560295105, "learning_rate": 4.367628488016856e-06, "loss": 0.5879, "step": 9143 }, { "epoch": 3.499425947187141, "grad_norm": 0.5204852223396301, "learning_rate": 4.365580303787063e-06, "loss": 0.6013, "step": 9144 }, { "epoch": 3.4998086490623805, "grad_norm": 0.5431310534477234, "learning_rate": 4.363532465807489e-06, "loss": 0.6459, "step": 9145 }, { "epoch": 3.5001913509376195, "grad_norm": 0.5617052316665649, "learning_rate": 4.361484974203991e-06, "loss": 0.6344, "step": 9146 }, { "epoch": 3.500574052812859, "grad_norm": 0.5502060055732727, "learning_rate": 4.3594378291023945e-06, "loss": 0.552, "step": 9147 }, { "epoch": 3.500956754688098, "grad_norm": 0.5501947999000549, "learning_rate": 4.357391030628491e-06, "loss": 0.6703, "step": 9148 }, { "epoch": 3.501339456563337, "grad_norm": 0.5274763107299805, "learning_rate": 4.355344578908069e-06, "loss": 0.6423, "step": 9149 }, { "epoch": 3.5017221584385765, "grad_norm": 0.5117822885513306, "learning_rate": 4.353298474066889e-06, "loss": 0.5586, "step": 9150 }, { "epoch": 3.5021048603138154, "grad_norm": 0.49452269077301025, "learning_rate": 4.351252716230685e-06, "loss": 0.616, "step": 9151 }, { "epoch": 3.5024875621890548, "grad_norm": 0.5132919549942017, "learning_rate": 4.349207305525176e-06, "loss": 0.5993, "step": 9152 }, { "epoch": 3.5028702640642937, "grad_norm": 0.5549650192260742, "learning_rate": 4.347162242076063e-06, "loss": 0.6341, "step": 9153 }, { "epoch": 3.503252965939533, "grad_norm": 0.505520761013031, "learning_rate": 4.345117526009013e-06, "loss": 0.6274, "step": 9154 }, { "epoch": 3.5036356678147724, "grad_norm": 0.5582035183906555, "learning_rate": 4.343073157449683e-06, "loss": 0.6447, "step": 9155 }, { "epoch": 3.5040183696900113, "grad_norm": 0.5639969110488892, "learning_rate": 4.341029136523701e-06, "loss": 0.5995, "step": 9156 }, { "epoch": 3.5044010715652507, "grad_norm": 0.5226112008094788, "learning_rate": 4.338985463356689e-06, "loss": 0.5983, "step": 9157 }, { "epoch": 3.50478377344049, "grad_norm": 0.6846259236335754, "learning_rate": 4.336942138074224e-06, "loss": 0.636, "step": 9158 }, { "epoch": 3.505166475315729, "grad_norm": 0.5047906041145325, "learning_rate": 4.334899160801879e-06, "loss": 0.6063, "step": 9159 }, { "epoch": 3.5055491771909684, "grad_norm": 0.5436668992042542, "learning_rate": 4.3328565316652025e-06, "loss": 0.6825, "step": 9160 }, { "epoch": 3.5059318790662073, "grad_norm": 0.6903054118156433, "learning_rate": 4.330814250789714e-06, "loss": 0.6192, "step": 9161 }, { "epoch": 3.5063145809414467, "grad_norm": 0.5755972266197205, "learning_rate": 4.32877231830092e-06, "loss": 0.7005, "step": 9162 }, { "epoch": 3.5066972828166856, "grad_norm": 0.5995773673057556, "learning_rate": 4.3267307343243055e-06, "loss": 0.6126, "step": 9163 }, { "epoch": 3.507079984691925, "grad_norm": 0.5465055108070374, "learning_rate": 4.324689498985326e-06, "loss": 0.6034, "step": 9164 }, { "epoch": 3.5074626865671643, "grad_norm": 0.536560595035553, "learning_rate": 4.322648612409423e-06, "loss": 0.5778, "step": 9165 }, { "epoch": 3.5078453884424032, "grad_norm": 0.5374481678009033, "learning_rate": 4.320608074722016e-06, "loss": 0.6042, "step": 9166 }, { "epoch": 3.5082280903176426, "grad_norm": 0.5320728421211243, "learning_rate": 4.318567886048499e-06, "loss": 0.667, "step": 9167 }, { "epoch": 3.508610792192882, "grad_norm": 0.5425442457199097, "learning_rate": 4.316528046514249e-06, "loss": 0.6264, "step": 9168 }, { "epoch": 3.508993494068121, "grad_norm": 0.5500946640968323, "learning_rate": 4.314488556244623e-06, "loss": 0.6476, "step": 9169 }, { "epoch": 3.5093761959433603, "grad_norm": 0.5639225840568542, "learning_rate": 4.312449415364946e-06, "loss": 0.6457, "step": 9170 }, { "epoch": 3.509758897818599, "grad_norm": 0.5520843267440796, "learning_rate": 4.310410624000531e-06, "loss": 0.6461, "step": 9171 }, { "epoch": 3.5101415996938385, "grad_norm": 0.5110095739364624, "learning_rate": 4.3083721822766674e-06, "loss": 0.622, "step": 9172 }, { "epoch": 3.5105243015690775, "grad_norm": 0.6068160533905029, "learning_rate": 4.306334090318629e-06, "loss": 0.563, "step": 9173 }, { "epoch": 3.510907003444317, "grad_norm": 0.7644041776657104, "learning_rate": 4.304296348251653e-06, "loss": 0.6353, "step": 9174 }, { "epoch": 3.511289705319556, "grad_norm": 0.532951295375824, "learning_rate": 4.302258956200968e-06, "loss": 0.5633, "step": 9175 }, { "epoch": 3.511672407194795, "grad_norm": 0.5535467863082886, "learning_rate": 4.300221914291775e-06, "loss": 0.6493, "step": 9176 }, { "epoch": 3.5120551090700345, "grad_norm": 0.5186949372291565, "learning_rate": 4.298185222649261e-06, "loss": 0.6066, "step": 9177 }, { "epoch": 3.512437810945274, "grad_norm": 0.5529534816741943, "learning_rate": 4.296148881398581e-06, "loss": 0.7101, "step": 9178 }, { "epoch": 3.5128205128205128, "grad_norm": 0.6235147714614868, "learning_rate": 4.2941128906648786e-06, "loss": 0.6168, "step": 9179 }, { "epoch": 3.513203214695752, "grad_norm": 0.5328839421272278, "learning_rate": 4.292077250573265e-06, "loss": 0.5788, "step": 9180 }, { "epoch": 3.513585916570991, "grad_norm": 0.5185196399688721, "learning_rate": 4.2900419612488384e-06, "loss": 0.5553, "step": 9181 }, { "epoch": 3.5139686184462304, "grad_norm": 0.5627520084381104, "learning_rate": 4.288007022816678e-06, "loss": 0.6667, "step": 9182 }, { "epoch": 3.5143513203214694, "grad_norm": 0.5686626434326172, "learning_rate": 4.285972435401826e-06, "loss": 0.6208, "step": 9183 }, { "epoch": 3.5147340221967087, "grad_norm": 0.5737216472625732, "learning_rate": 4.283938199129319e-06, "loss": 0.6676, "step": 9184 }, { "epoch": 3.515116724071948, "grad_norm": 0.5946042537689209, "learning_rate": 4.2819043141241655e-06, "loss": 0.7293, "step": 9185 }, { "epoch": 3.515499425947187, "grad_norm": 0.5835551619529724, "learning_rate": 4.279870780511355e-06, "loss": 0.6562, "step": 9186 }, { "epoch": 3.5158821278224264, "grad_norm": 0.5240269303321838, "learning_rate": 4.27783759841585e-06, "loss": 0.5784, "step": 9187 }, { "epoch": 3.5162648296976657, "grad_norm": 0.5930378437042236, "learning_rate": 4.275804767962599e-06, "loss": 0.6767, "step": 9188 }, { "epoch": 3.5166475315729047, "grad_norm": 0.5445623993873596, "learning_rate": 4.273772289276525e-06, "loss": 0.6182, "step": 9189 }, { "epoch": 3.517030233448144, "grad_norm": 0.5542327165603638, "learning_rate": 4.271740162482525e-06, "loss": 0.6313, "step": 9190 }, { "epoch": 3.517412935323383, "grad_norm": 0.5339009761810303, "learning_rate": 4.26970838770548e-06, "loss": 0.6289, "step": 9191 }, { "epoch": 3.5177956371986223, "grad_norm": 0.5522938370704651, "learning_rate": 4.267676965070253e-06, "loss": 0.6759, "step": 9192 }, { "epoch": 3.5181783390738612, "grad_norm": 0.5478227734565735, "learning_rate": 4.265645894701672e-06, "loss": 0.6732, "step": 9193 }, { "epoch": 3.5185610409491006, "grad_norm": 0.514248251914978, "learning_rate": 4.263615176724557e-06, "loss": 0.6, "step": 9194 }, { "epoch": 3.51894374282434, "grad_norm": 0.5763166546821594, "learning_rate": 4.2615848112637e-06, "loss": 0.6307, "step": 9195 }, { "epoch": 3.519326444699579, "grad_norm": 0.5758410096168518, "learning_rate": 4.259554798443872e-06, "loss": 0.6541, "step": 9196 }, { "epoch": 3.5197091465748183, "grad_norm": 0.5089911818504333, "learning_rate": 4.257525138389824e-06, "loss": 0.5666, "step": 9197 }, { "epoch": 3.5200918484500576, "grad_norm": 0.591132640838623, "learning_rate": 4.255495831226283e-06, "loss": 0.6527, "step": 9198 }, { "epoch": 3.5204745503252965, "grad_norm": 0.5334320664405823, "learning_rate": 4.25346687707796e-06, "loss": 0.6168, "step": 9199 }, { "epoch": 3.520857252200536, "grad_norm": 0.5714334845542908, "learning_rate": 4.25143827606953e-06, "loss": 0.5954, "step": 9200 }, { "epoch": 3.521239954075775, "grad_norm": 0.5454012751579285, "learning_rate": 4.249410028325662e-06, "loss": 0.6079, "step": 9201 }, { "epoch": 3.521622655951014, "grad_norm": 0.5899356603622437, "learning_rate": 4.247382133971002e-06, "loss": 0.6481, "step": 9202 }, { "epoch": 3.522005357826253, "grad_norm": 0.5461914539337158, "learning_rate": 4.2453545931301584e-06, "loss": 0.5711, "step": 9203 }, { "epoch": 3.5223880597014925, "grad_norm": 0.5695574283599854, "learning_rate": 4.2433274059277364e-06, "loss": 0.6661, "step": 9204 }, { "epoch": 3.522770761576732, "grad_norm": 0.5630292892456055, "learning_rate": 4.24130057248831e-06, "loss": 0.5906, "step": 9205 }, { "epoch": 3.523153463451971, "grad_norm": 0.5172450542449951, "learning_rate": 4.239274092936434e-06, "loss": 0.6218, "step": 9206 }, { "epoch": 3.52353616532721, "grad_norm": 0.5716785788536072, "learning_rate": 4.237247967396641e-06, "loss": 0.6017, "step": 9207 }, { "epoch": 3.5239188672024495, "grad_norm": 0.5503019094467163, "learning_rate": 4.235222195993446e-06, "loss": 0.6501, "step": 9208 }, { "epoch": 3.5243015690776884, "grad_norm": 0.5931400060653687, "learning_rate": 4.2331967788513295e-06, "loss": 0.6648, "step": 9209 }, { "epoch": 3.524684270952928, "grad_norm": 0.5438958406448364, "learning_rate": 4.231171716094764e-06, "loss": 0.5815, "step": 9210 }, { "epoch": 3.5250669728281667, "grad_norm": 0.5728680491447449, "learning_rate": 4.229147007848196e-06, "loss": 0.6986, "step": 9211 }, { "epoch": 3.525449674703406, "grad_norm": 0.5586576461791992, "learning_rate": 4.2271226542360486e-06, "loss": 0.5829, "step": 9212 }, { "epoch": 3.525832376578645, "grad_norm": 0.545095682144165, "learning_rate": 4.225098655382721e-06, "loss": 0.5864, "step": 9213 }, { "epoch": 3.5262150784538844, "grad_norm": 0.5250848531723022, "learning_rate": 4.223075011412595e-06, "loss": 0.583, "step": 9214 }, { "epoch": 3.5265977803291237, "grad_norm": 0.5570597052574158, "learning_rate": 4.221051722450029e-06, "loss": 0.5801, "step": 9215 }, { "epoch": 3.5269804822043627, "grad_norm": 0.5136033296585083, "learning_rate": 4.219028788619359e-06, "loss": 0.5677, "step": 9216 }, { "epoch": 3.527363184079602, "grad_norm": 0.5422559380531311, "learning_rate": 4.217006210044902e-06, "loss": 0.5585, "step": 9217 }, { "epoch": 3.5277458859548414, "grad_norm": 0.5571998953819275, "learning_rate": 4.214983986850951e-06, "loss": 0.6523, "step": 9218 }, { "epoch": 3.5281285878300803, "grad_norm": 0.5408032536506653, "learning_rate": 4.212962119161773e-06, "loss": 0.5716, "step": 9219 }, { "epoch": 3.5285112897053197, "grad_norm": 0.5768899321556091, "learning_rate": 4.2109406071016205e-06, "loss": 0.6223, "step": 9220 }, { "epoch": 3.5288939915805586, "grad_norm": 0.5090261697769165, "learning_rate": 4.2089194507947216e-06, "loss": 0.6331, "step": 9221 }, { "epoch": 3.529276693455798, "grad_norm": 0.5397464632987976, "learning_rate": 4.206898650365277e-06, "loss": 0.6104, "step": 9222 }, { "epoch": 3.529659395331037, "grad_norm": 0.5062885880470276, "learning_rate": 4.204878205937475e-06, "loss": 0.6788, "step": 9223 }, { "epoch": 3.5300420972062763, "grad_norm": 0.4838072955608368, "learning_rate": 4.202858117635474e-06, "loss": 0.5758, "step": 9224 }, { "epoch": 3.5304247990815156, "grad_norm": 0.5328428149223328, "learning_rate": 4.200838385583417e-06, "loss": 0.5578, "step": 9225 }, { "epoch": 3.5308075009567546, "grad_norm": 0.5351399779319763, "learning_rate": 4.198819009905421e-06, "loss": 0.6265, "step": 9226 }, { "epoch": 3.531190202831994, "grad_norm": 0.6320289373397827, "learning_rate": 4.19679999072558e-06, "loss": 0.612, "step": 9227 }, { "epoch": 3.5315729047072333, "grad_norm": 0.5135018825531006, "learning_rate": 4.1947813281679735e-06, "loss": 0.611, "step": 9228 }, { "epoch": 3.531955606582472, "grad_norm": 0.7667892575263977, "learning_rate": 4.192763022356647e-06, "loss": 0.5918, "step": 9229 }, { "epoch": 3.5323383084577116, "grad_norm": 0.6204013824462891, "learning_rate": 4.190745073415634e-06, "loss": 0.6797, "step": 9230 }, { "epoch": 3.5327210103329505, "grad_norm": 0.4763709306716919, "learning_rate": 4.188727481468946e-06, "loss": 0.5749, "step": 9231 }, { "epoch": 3.53310371220819, "grad_norm": 0.521600604057312, "learning_rate": 4.186710246640563e-06, "loss": 0.6001, "step": 9232 }, { "epoch": 3.533486414083429, "grad_norm": 0.5475045442581177, "learning_rate": 4.184693369054452e-06, "loss": 0.6618, "step": 9233 }, { "epoch": 3.533869115958668, "grad_norm": 0.5541495680809021, "learning_rate": 4.182676848834557e-06, "loss": 0.5571, "step": 9234 }, { "epoch": 3.5342518178339075, "grad_norm": 0.5772644877433777, "learning_rate": 4.180660686104797e-06, "loss": 0.6569, "step": 9235 }, { "epoch": 3.5346345197091464, "grad_norm": 0.5220925807952881, "learning_rate": 4.178644880989072e-06, "loss": 0.523, "step": 9236 }, { "epoch": 3.535017221584386, "grad_norm": 0.549428403377533, "learning_rate": 4.176629433611258e-06, "loss": 0.6013, "step": 9237 }, { "epoch": 3.535399923459625, "grad_norm": 0.5684159398078918, "learning_rate": 4.1746143440952135e-06, "loss": 0.6272, "step": 9238 }, { "epoch": 3.535782625334864, "grad_norm": 0.5401646494865417, "learning_rate": 4.172599612564764e-06, "loss": 0.6905, "step": 9239 }, { "epoch": 3.5361653272101035, "grad_norm": 0.9791266322135925, "learning_rate": 4.170585239143723e-06, "loss": 0.6136, "step": 9240 }, { "epoch": 3.5365480290853424, "grad_norm": 0.5260521173477173, "learning_rate": 4.168571223955883e-06, "loss": 0.6332, "step": 9241 }, { "epoch": 3.5369307309605817, "grad_norm": 0.5641314387321472, "learning_rate": 4.166557567125005e-06, "loss": 0.6437, "step": 9242 }, { "epoch": 3.5373134328358207, "grad_norm": 0.520857572555542, "learning_rate": 4.164544268774835e-06, "loss": 0.593, "step": 9243 }, { "epoch": 3.53769613471106, "grad_norm": 0.49554792046546936, "learning_rate": 4.162531329029101e-06, "loss": 0.5971, "step": 9244 }, { "epoch": 3.5380788365862994, "grad_norm": 0.49542152881622314, "learning_rate": 4.160518748011491e-06, "loss": 0.608, "step": 9245 }, { "epoch": 3.5384615384615383, "grad_norm": 0.5935623049736023, "learning_rate": 4.1585065258456946e-06, "loss": 0.6177, "step": 9246 }, { "epoch": 3.5388442403367777, "grad_norm": 0.5237728357315063, "learning_rate": 4.15649466265537e-06, "loss": 0.6055, "step": 9247 }, { "epoch": 3.539226942212017, "grad_norm": 0.5369744300842285, "learning_rate": 4.154483158564143e-06, "loss": 0.7019, "step": 9248 }, { "epoch": 3.539609644087256, "grad_norm": 0.5189865231513977, "learning_rate": 4.15247201369563e-06, "loss": 0.6565, "step": 9249 }, { "epoch": 3.5399923459624953, "grad_norm": 0.5217751264572144, "learning_rate": 4.150461228173421e-06, "loss": 0.6075, "step": 9250 }, { "epoch": 3.5403750478377343, "grad_norm": 0.5420924425125122, "learning_rate": 4.148450802121088e-06, "loss": 0.5605, "step": 9251 }, { "epoch": 3.5407577497129736, "grad_norm": 0.5194869637489319, "learning_rate": 4.1464407356621696e-06, "loss": 0.5987, "step": 9252 }, { "epoch": 3.5411404515882126, "grad_norm": 0.5525397062301636, "learning_rate": 4.144431028920194e-06, "loss": 0.686, "step": 9253 }, { "epoch": 3.541523153463452, "grad_norm": 0.5664142966270447, "learning_rate": 4.142421682018666e-06, "loss": 0.6041, "step": 9254 }, { "epoch": 3.5419058553386913, "grad_norm": 0.5391342639923096, "learning_rate": 4.140412695081058e-06, "loss": 0.6259, "step": 9255 }, { "epoch": 3.54228855721393, "grad_norm": 0.5345112681388855, "learning_rate": 4.138404068230828e-06, "loss": 0.5836, "step": 9256 }, { "epoch": 3.5426712590891696, "grad_norm": 0.5450584888458252, "learning_rate": 4.136395801591423e-06, "loss": 0.6137, "step": 9257 }, { "epoch": 3.543053960964409, "grad_norm": 0.5336347818374634, "learning_rate": 4.134387895286243e-06, "loss": 0.6465, "step": 9258 }, { "epoch": 3.543436662839648, "grad_norm": 0.5655457377433777, "learning_rate": 4.132380349438687e-06, "loss": 0.6634, "step": 9259 }, { "epoch": 3.5438193647148872, "grad_norm": 0.5454888343811035, "learning_rate": 4.130373164172125e-06, "loss": 0.5859, "step": 9260 }, { "epoch": 3.544202066590126, "grad_norm": 0.5395885109901428, "learning_rate": 4.128366339609896e-06, "loss": 0.5861, "step": 9261 }, { "epoch": 3.5445847684653655, "grad_norm": 0.5651029944419861, "learning_rate": 4.126359875875331e-06, "loss": 0.6368, "step": 9262 }, { "epoch": 3.5449674703406044, "grad_norm": 0.5541322231292725, "learning_rate": 4.124353773091729e-06, "loss": 0.6024, "step": 9263 }, { "epoch": 3.545350172215844, "grad_norm": 0.538139283657074, "learning_rate": 4.122348031382378e-06, "loss": 0.6108, "step": 9264 }, { "epoch": 3.545732874091083, "grad_norm": 0.5128725171089172, "learning_rate": 4.120342650870527e-06, "loss": 0.6203, "step": 9265 }, { "epoch": 3.546115575966322, "grad_norm": 0.5327793955802917, "learning_rate": 4.118337631679411e-06, "loss": 0.6039, "step": 9266 }, { "epoch": 3.5464982778415615, "grad_norm": 0.5591983795166016, "learning_rate": 4.116332973932256e-06, "loss": 0.6786, "step": 9267 }, { "epoch": 3.546880979716801, "grad_norm": 0.5260587334632874, "learning_rate": 4.114328677752244e-06, "loss": 0.6473, "step": 9268 }, { "epoch": 3.5472636815920398, "grad_norm": 0.5625166296958923, "learning_rate": 4.1123247432625456e-06, "loss": 0.6595, "step": 9269 }, { "epoch": 3.547646383467279, "grad_norm": 0.7247753739356995, "learning_rate": 4.110321170586313e-06, "loss": 0.6588, "step": 9270 }, { "epoch": 3.548029085342518, "grad_norm": 0.5451138615608215, "learning_rate": 4.1083179598466625e-06, "loss": 0.6745, "step": 9271 }, { "epoch": 3.5484117872177574, "grad_norm": 0.5955342054367065, "learning_rate": 4.106315111166702e-06, "loss": 0.6369, "step": 9272 }, { "epoch": 3.5487944890929963, "grad_norm": 0.5923752188682556, "learning_rate": 4.104312624669514e-06, "loss": 0.6419, "step": 9273 }, { "epoch": 3.5491771909682357, "grad_norm": 0.5378561615943909, "learning_rate": 4.10231050047815e-06, "loss": 0.5076, "step": 9274 }, { "epoch": 3.549559892843475, "grad_norm": 0.5356446504592896, "learning_rate": 4.100308738715652e-06, "loss": 0.5915, "step": 9275 }, { "epoch": 3.549942594718714, "grad_norm": 0.5057463645935059, "learning_rate": 4.09830733950503e-06, "loss": 0.6415, "step": 9276 }, { "epoch": 3.5503252965939534, "grad_norm": 0.6292493343353271, "learning_rate": 4.096306302969277e-06, "loss": 0.6839, "step": 9277 }, { "epoch": 3.5507079984691927, "grad_norm": 0.5906907916069031, "learning_rate": 4.094305629231362e-06, "loss": 0.5934, "step": 9278 }, { "epoch": 3.5510907003444316, "grad_norm": 0.5890095233917236, "learning_rate": 4.092305318414231e-06, "loss": 0.6292, "step": 9279 }, { "epoch": 3.551473402219671, "grad_norm": 0.5700520277023315, "learning_rate": 4.090305370640814e-06, "loss": 0.6267, "step": 9280 }, { "epoch": 3.55185610409491, "grad_norm": 0.5444921851158142, "learning_rate": 4.088305786034004e-06, "loss": 0.574, "step": 9281 }, { "epoch": 3.5522388059701493, "grad_norm": 0.5592206120491028, "learning_rate": 4.0863065647166846e-06, "loss": 0.5986, "step": 9282 }, { "epoch": 3.552621507845388, "grad_norm": 0.5310786962509155, "learning_rate": 4.084307706811719e-06, "loss": 0.6481, "step": 9283 }, { "epoch": 3.5530042097206276, "grad_norm": 0.551369845867157, "learning_rate": 4.082309212441932e-06, "loss": 0.6124, "step": 9284 }, { "epoch": 3.553386911595867, "grad_norm": 0.5282934904098511, "learning_rate": 4.080311081730143e-06, "loss": 0.6959, "step": 9285 }, { "epoch": 3.553769613471106, "grad_norm": 0.4877776801586151, "learning_rate": 4.07831331479914e-06, "loss": 0.5797, "step": 9286 }, { "epoch": 3.5541523153463452, "grad_norm": 0.5036528706550598, "learning_rate": 4.076315911771694e-06, "loss": 0.6743, "step": 9287 }, { "epoch": 3.5545350172215846, "grad_norm": 0.5347640514373779, "learning_rate": 4.0743188727705475e-06, "loss": 0.5962, "step": 9288 }, { "epoch": 3.5549177190968235, "grad_norm": 0.5492874383926392, "learning_rate": 4.072322197918426e-06, "loss": 0.6066, "step": 9289 }, { "epoch": 3.555300420972063, "grad_norm": 0.550675630569458, "learning_rate": 4.070325887338033e-06, "loss": 0.6595, "step": 9290 }, { "epoch": 3.555683122847302, "grad_norm": 0.5348733067512512, "learning_rate": 4.068329941152043e-06, "loss": 0.6143, "step": 9291 }, { "epoch": 3.556065824722541, "grad_norm": 0.5764672160148621, "learning_rate": 4.0663343594831105e-06, "loss": 0.6032, "step": 9292 }, { "epoch": 3.55644852659778, "grad_norm": 0.5336082577705383, "learning_rate": 4.0643391424538785e-06, "loss": 0.5955, "step": 9293 }, { "epoch": 3.5568312284730195, "grad_norm": 0.5353018045425415, "learning_rate": 4.062344290186947e-06, "loss": 0.6209, "step": 9294 }, { "epoch": 3.557213930348259, "grad_norm": 0.5901408791542053, "learning_rate": 4.060349802804912e-06, "loss": 0.6923, "step": 9295 }, { "epoch": 3.5575966322234978, "grad_norm": 0.5504863262176514, "learning_rate": 4.058355680430337e-06, "loss": 0.6387, "step": 9296 }, { "epoch": 3.557979334098737, "grad_norm": 0.519006073474884, "learning_rate": 4.056361923185768e-06, "loss": 0.6934, "step": 9297 }, { "epoch": 3.5583620359739765, "grad_norm": 0.555903434753418, "learning_rate": 4.054368531193726e-06, "loss": 0.6537, "step": 9298 }, { "epoch": 3.5587447378492154, "grad_norm": 0.5487158894538879, "learning_rate": 4.052375504576715e-06, "loss": 0.6756, "step": 9299 }, { "epoch": 3.5591274397244548, "grad_norm": 0.5295811891555786, "learning_rate": 4.050382843457202e-06, "loss": 0.6914, "step": 9300 }, { "epoch": 3.5595101415996937, "grad_norm": 0.5234153866767883, "learning_rate": 4.048390547957649e-06, "loss": 0.6164, "step": 9301 }, { "epoch": 3.559892843474933, "grad_norm": 0.4842701256275177, "learning_rate": 4.046398618200486e-06, "loss": 0.5585, "step": 9302 }, { "epoch": 3.560275545350172, "grad_norm": 0.5474865436553955, "learning_rate": 4.044407054308124e-06, "loss": 0.6893, "step": 9303 }, { "epoch": 3.5606582472254114, "grad_norm": 0.4759216606616974, "learning_rate": 4.042415856402945e-06, "loss": 0.6176, "step": 9304 }, { "epoch": 3.5610409491006507, "grad_norm": 0.5383132100105286, "learning_rate": 4.040425024607317e-06, "loss": 0.6568, "step": 9305 }, { "epoch": 3.5614236509758896, "grad_norm": 0.5287265777587891, "learning_rate": 4.038434559043581e-06, "loss": 0.5715, "step": 9306 }, { "epoch": 3.561806352851129, "grad_norm": 0.5532228946685791, "learning_rate": 4.036444459834058e-06, "loss": 0.6342, "step": 9307 }, { "epoch": 3.5621890547263684, "grad_norm": 0.5432168245315552, "learning_rate": 4.034454727101043e-06, "loss": 0.6821, "step": 9308 }, { "epoch": 3.5625717566016073, "grad_norm": 0.5528867244720459, "learning_rate": 4.032465360966817e-06, "loss": 0.6378, "step": 9309 }, { "epoch": 3.5629544584768467, "grad_norm": 0.5695577263832092, "learning_rate": 4.0304763615536215e-06, "loss": 0.5876, "step": 9310 }, { "epoch": 3.5633371603520856, "grad_norm": 0.5221908688545227, "learning_rate": 4.02848772898369e-06, "loss": 0.6605, "step": 9311 }, { "epoch": 3.563719862227325, "grad_norm": 0.5328943133354187, "learning_rate": 4.026499463379233e-06, "loss": 0.6199, "step": 9312 }, { "epoch": 3.564102564102564, "grad_norm": 0.5382272005081177, "learning_rate": 4.02451156486243e-06, "loss": 0.6711, "step": 9313 }, { "epoch": 3.5644852659778032, "grad_norm": 0.5530247092247009, "learning_rate": 4.022524033555443e-06, "loss": 0.6972, "step": 9314 }, { "epoch": 3.5648679678530426, "grad_norm": 0.6440438032150269, "learning_rate": 4.020536869580414e-06, "loss": 0.6125, "step": 9315 }, { "epoch": 3.5652506697282815, "grad_norm": 0.592586100101471, "learning_rate": 4.018550073059456e-06, "loss": 0.6123, "step": 9316 }, { "epoch": 3.565633371603521, "grad_norm": 0.5415182113647461, "learning_rate": 4.0165636441146676e-06, "loss": 0.6653, "step": 9317 }, { "epoch": 3.5660160734787603, "grad_norm": 0.5470125675201416, "learning_rate": 4.0145775828681154e-06, "loss": 0.6117, "step": 9318 }, { "epoch": 3.566398775353999, "grad_norm": 0.585585355758667, "learning_rate": 4.012591889441856e-06, "loss": 0.6547, "step": 9319 }, { "epoch": 3.5667814772292386, "grad_norm": 0.5010002255439758, "learning_rate": 4.010606563957905e-06, "loss": 0.5849, "step": 9320 }, { "epoch": 3.5671641791044775, "grad_norm": 0.6009742021560669, "learning_rate": 4.008621606538272e-06, "loss": 0.7009, "step": 9321 }, { "epoch": 3.567546880979717, "grad_norm": 0.5099114179611206, "learning_rate": 4.006637017304941e-06, "loss": 0.6341, "step": 9322 }, { "epoch": 3.5679295828549558, "grad_norm": 0.6282520294189453, "learning_rate": 4.004652796379862e-06, "loss": 0.7279, "step": 9323 }, { "epoch": 3.568312284730195, "grad_norm": 0.5220016837120056, "learning_rate": 4.002668943884975e-06, "loss": 0.6277, "step": 9324 }, { "epoch": 3.5686949866054345, "grad_norm": 0.547821581363678, "learning_rate": 4.000685459942193e-06, "loss": 0.6513, "step": 9325 }, { "epoch": 3.5690776884806734, "grad_norm": 0.5908491015434265, "learning_rate": 3.998702344673408e-06, "loss": 0.5827, "step": 9326 }, { "epoch": 3.569460390355913, "grad_norm": 0.5583266019821167, "learning_rate": 3.996719598200485e-06, "loss": 0.6299, "step": 9327 }, { "epoch": 3.569843092231152, "grad_norm": 0.56971675157547, "learning_rate": 3.9947372206452705e-06, "loss": 0.632, "step": 9328 }, { "epoch": 3.570225794106391, "grad_norm": 0.5420271158218384, "learning_rate": 3.992755212129591e-06, "loss": 0.6976, "step": 9329 }, { "epoch": 3.5706084959816304, "grad_norm": 0.5972867608070374, "learning_rate": 3.990773572775239e-06, "loss": 0.6353, "step": 9330 }, { "epoch": 3.5709911978568694, "grad_norm": 0.5521243214607239, "learning_rate": 3.988792302703993e-06, "loss": 0.6602, "step": 9331 }, { "epoch": 3.5713738997321087, "grad_norm": 0.5388076305389404, "learning_rate": 3.9868114020376135e-06, "loss": 0.6052, "step": 9332 }, { "epoch": 3.5717566016073476, "grad_norm": 0.5228607058525085, "learning_rate": 3.984830870897824e-06, "loss": 0.598, "step": 9333 }, { "epoch": 3.572139303482587, "grad_norm": 0.5693979859352112, "learning_rate": 3.982850709406338e-06, "loss": 0.6408, "step": 9334 }, { "epoch": 3.5725220053578264, "grad_norm": 0.554241418838501, "learning_rate": 3.98087091768484e-06, "loss": 0.6844, "step": 9335 }, { "epoch": 3.5729047072330653, "grad_norm": 0.573101818561554, "learning_rate": 3.978891495854995e-06, "loss": 0.6537, "step": 9336 }, { "epoch": 3.5732874091083047, "grad_norm": 0.5545898675918579, "learning_rate": 3.976912444038443e-06, "loss": 0.665, "step": 9337 }, { "epoch": 3.573670110983544, "grad_norm": 0.5143846273422241, "learning_rate": 3.974933762356806e-06, "loss": 0.5536, "step": 9338 }, { "epoch": 3.574052812858783, "grad_norm": 0.5397096872329712, "learning_rate": 3.972955450931672e-06, "loss": 0.5853, "step": 9339 }, { "epoch": 3.5744355147340223, "grad_norm": 0.5359756350517273, "learning_rate": 3.970977509884617e-06, "loss": 0.6909, "step": 9340 }, { "epoch": 3.5748182166092612, "grad_norm": 0.5818163156509399, "learning_rate": 3.968999939337191e-06, "loss": 0.6116, "step": 9341 }, { "epoch": 3.5752009184845006, "grad_norm": 0.6482685804367065, "learning_rate": 3.967022739410924e-06, "loss": 0.5428, "step": 9342 }, { "epoch": 3.5755836203597395, "grad_norm": 0.5562599897384644, "learning_rate": 3.965045910227314e-06, "loss": 0.7161, "step": 9343 }, { "epoch": 3.575966322234979, "grad_norm": 0.5538987517356873, "learning_rate": 3.963069451907846e-06, "loss": 0.665, "step": 9344 }, { "epoch": 3.5763490241102183, "grad_norm": 0.5613912343978882, "learning_rate": 3.961093364573978e-06, "loss": 0.6163, "step": 9345 }, { "epoch": 3.576731725985457, "grad_norm": 0.5420278310775757, "learning_rate": 3.959117648347146e-06, "loss": 0.5635, "step": 9346 }, { "epoch": 3.5771144278606966, "grad_norm": 0.5421245098114014, "learning_rate": 3.9571423033487646e-06, "loss": 0.5738, "step": 9347 }, { "epoch": 3.577497129735936, "grad_norm": 0.5328360199928284, "learning_rate": 3.955167329700225e-06, "loss": 0.5726, "step": 9348 }, { "epoch": 3.577879831611175, "grad_norm": 0.539503812789917, "learning_rate": 3.953192727522889e-06, "loss": 0.5984, "step": 9349 }, { "epoch": 3.578262533486414, "grad_norm": 0.5666580200195312, "learning_rate": 3.951218496938104e-06, "loss": 0.6994, "step": 9350 }, { "epoch": 3.578645235361653, "grad_norm": 0.5127447843551636, "learning_rate": 3.9492446380671966e-06, "loss": 0.6224, "step": 9351 }, { "epoch": 3.5790279372368925, "grad_norm": 0.5569789409637451, "learning_rate": 3.9472711510314575e-06, "loss": 0.7239, "step": 9352 }, { "epoch": 3.5794106391121314, "grad_norm": 0.6328333616256714, "learning_rate": 3.9452980359521674e-06, "loss": 0.736, "step": 9353 }, { "epoch": 3.579793340987371, "grad_norm": 0.5900084376335144, "learning_rate": 3.943325292950579e-06, "loss": 0.682, "step": 9354 }, { "epoch": 3.58017604286261, "grad_norm": 0.6189733147621155, "learning_rate": 3.941352922147925e-06, "loss": 0.6419, "step": 9355 }, { "epoch": 3.580558744737849, "grad_norm": 0.6939754486083984, "learning_rate": 3.939380923665405e-06, "loss": 0.6321, "step": 9356 }, { "epoch": 3.5809414466130884, "grad_norm": 0.5287783741950989, "learning_rate": 3.9374092976242115e-06, "loss": 0.6129, "step": 9357 }, { "epoch": 3.581324148488328, "grad_norm": 0.5532200932502747, "learning_rate": 3.9354380441455075e-06, "loss": 0.6121, "step": 9358 }, { "epoch": 3.5817068503635667, "grad_norm": 0.5913769006729126, "learning_rate": 3.933467163350424e-06, "loss": 0.6589, "step": 9359 }, { "epoch": 3.582089552238806, "grad_norm": 0.5693351030349731, "learning_rate": 3.931496655360082e-06, "loss": 0.6485, "step": 9360 }, { "epoch": 3.582472254114045, "grad_norm": 0.527981162071228, "learning_rate": 3.929526520295575e-06, "loss": 0.6116, "step": 9361 }, { "epoch": 3.5828549559892844, "grad_norm": 0.8105064034461975, "learning_rate": 3.92755675827797e-06, "loss": 0.6456, "step": 9362 }, { "epoch": 3.5832376578645233, "grad_norm": 0.5555248260498047, "learning_rate": 3.925587369428316e-06, "loss": 0.6268, "step": 9363 }, { "epoch": 3.5836203597397627, "grad_norm": 0.5876550078392029, "learning_rate": 3.9236183538676396e-06, "loss": 0.634, "step": 9364 }, { "epoch": 3.584003061615002, "grad_norm": 0.6118975281715393, "learning_rate": 3.921649711716936e-06, "loss": 0.6962, "step": 9365 }, { "epoch": 3.584385763490241, "grad_norm": 0.5814709663391113, "learning_rate": 3.919681443097184e-06, "loss": 0.6291, "step": 9366 }, { "epoch": 3.5847684653654803, "grad_norm": 0.5412062406539917, "learning_rate": 3.917713548129345e-06, "loss": 0.6215, "step": 9367 }, { "epoch": 3.5851511672407197, "grad_norm": 0.5741699934005737, "learning_rate": 3.915746026934353e-06, "loss": 0.6511, "step": 9368 }, { "epoch": 3.5855338691159586, "grad_norm": 0.5189167857170105, "learning_rate": 3.913778879633109e-06, "loss": 0.5906, "step": 9369 }, { "epoch": 3.585916570991198, "grad_norm": 0.5877575874328613, "learning_rate": 3.911812106346502e-06, "loss": 0.6915, "step": 9370 }, { "epoch": 3.586299272866437, "grad_norm": 0.5549004077911377, "learning_rate": 3.909845707195402e-06, "loss": 0.5988, "step": 9371 }, { "epoch": 3.5866819747416763, "grad_norm": 0.5365760922431946, "learning_rate": 3.90787968230064e-06, "loss": 0.6143, "step": 9372 }, { "epoch": 3.587064676616915, "grad_norm": 0.5166588425636292, "learning_rate": 3.905914031783039e-06, "loss": 0.6096, "step": 9373 }, { "epoch": 3.5874473784921546, "grad_norm": 0.6291145086288452, "learning_rate": 3.903948755763395e-06, "loss": 0.6393, "step": 9374 }, { "epoch": 3.587830080367394, "grad_norm": 0.5465793013572693, "learning_rate": 3.9019838543624735e-06, "loss": 0.5893, "step": 9375 }, { "epoch": 3.588212782242633, "grad_norm": 0.5159702301025391, "learning_rate": 3.900019327701027e-06, "loss": 0.6203, "step": 9376 }, { "epoch": 3.588595484117872, "grad_norm": 0.5198609232902527, "learning_rate": 3.89805517589978e-06, "loss": 0.5656, "step": 9377 }, { "epoch": 3.5889781859931116, "grad_norm": 0.5862793326377869, "learning_rate": 3.896091399079436e-06, "loss": 0.6356, "step": 9378 }, { "epoch": 3.5893608878683505, "grad_norm": 0.5184786915779114, "learning_rate": 3.894127997360671e-06, "loss": 0.5905, "step": 9379 }, { "epoch": 3.58974358974359, "grad_norm": 0.5335460305213928, "learning_rate": 3.892164970864151e-06, "loss": 0.6538, "step": 9380 }, { "epoch": 3.590126291618829, "grad_norm": 0.49684783816337585, "learning_rate": 3.890202319710496e-06, "loss": 0.6115, "step": 9381 }, { "epoch": 3.590508993494068, "grad_norm": 0.5405381917953491, "learning_rate": 3.888240044020324e-06, "loss": 0.6174, "step": 9382 }, { "epoch": 3.590891695369307, "grad_norm": 0.539320170879364, "learning_rate": 3.886278143914219e-06, "loss": 0.6221, "step": 9383 }, { "epoch": 3.5912743972445464, "grad_norm": 0.6023665070533752, "learning_rate": 3.884316619512751e-06, "loss": 0.6018, "step": 9384 }, { "epoch": 3.591657099119786, "grad_norm": 0.6192318201065063, "learning_rate": 3.882355470936453e-06, "loss": 0.6655, "step": 9385 }, { "epoch": 3.5920398009950247, "grad_norm": 0.5255956649780273, "learning_rate": 3.880394698305846e-06, "loss": 0.5669, "step": 9386 }, { "epoch": 3.592422502870264, "grad_norm": 0.5291840434074402, "learning_rate": 3.878434301741425e-06, "loss": 0.6273, "step": 9387 }, { "epoch": 3.5928052047455035, "grad_norm": 0.5836756229400635, "learning_rate": 3.876474281363663e-06, "loss": 0.6127, "step": 9388 }, { "epoch": 3.5931879066207424, "grad_norm": 0.559683084487915, "learning_rate": 3.874514637293005e-06, "loss": 0.6208, "step": 9389 }, { "epoch": 3.5935706084959818, "grad_norm": 0.5435411930084229, "learning_rate": 3.872555369649883e-06, "loss": 0.631, "step": 9390 }, { "epoch": 3.5939533103712207, "grad_norm": 0.5472005605697632, "learning_rate": 3.870596478554691e-06, "loss": 0.5519, "step": 9391 }, { "epoch": 3.59433601224646, "grad_norm": 0.5177951455116272, "learning_rate": 3.868637964127813e-06, "loss": 0.5749, "step": 9392 }, { "epoch": 3.594718714121699, "grad_norm": 0.5538347363471985, "learning_rate": 3.866679826489606e-06, "loss": 0.5923, "step": 9393 }, { "epoch": 3.5951014159969383, "grad_norm": 0.5379197001457214, "learning_rate": 3.864722065760399e-06, "loss": 0.62, "step": 9394 }, { "epoch": 3.5954841178721777, "grad_norm": 0.5290416479110718, "learning_rate": 3.862764682060503e-06, "loss": 0.5933, "step": 9395 }, { "epoch": 3.5958668197474166, "grad_norm": 0.5655488967895508, "learning_rate": 3.860807675510204e-06, "loss": 0.6327, "step": 9396 }, { "epoch": 3.596249521622656, "grad_norm": 0.540692150592804, "learning_rate": 3.858851046229767e-06, "loss": 0.5841, "step": 9397 }, { "epoch": 3.5966322234978954, "grad_norm": 0.5862171053886414, "learning_rate": 3.8568947943394305e-06, "loss": 0.6543, "step": 9398 }, { "epoch": 3.5970149253731343, "grad_norm": 0.564038097858429, "learning_rate": 3.8549389199594134e-06, "loss": 0.6395, "step": 9399 }, { "epoch": 3.5973976272483736, "grad_norm": 0.5916633605957031, "learning_rate": 3.852983423209911e-06, "loss": 0.6933, "step": 9400 }, { "epoch": 3.5977803291236126, "grad_norm": 0.47377660870552063, "learning_rate": 3.851028304211088e-06, "loss": 0.5846, "step": 9401 }, { "epoch": 3.598163030998852, "grad_norm": 0.5297030210494995, "learning_rate": 3.849073563083095e-06, "loss": 0.5899, "step": 9402 }, { "epoch": 3.598545732874091, "grad_norm": 0.561050295829773, "learning_rate": 3.8471191999460586e-06, "loss": 0.6653, "step": 9403 }, { "epoch": 3.59892843474933, "grad_norm": 0.5248869061470032, "learning_rate": 3.8451652149200746e-06, "loss": 0.6608, "step": 9404 }, { "epoch": 3.5993111366245696, "grad_norm": 0.507759153842926, "learning_rate": 3.8432116081252225e-06, "loss": 0.5621, "step": 9405 }, { "epoch": 3.5996938384998085, "grad_norm": 0.58323734998703, "learning_rate": 3.841258379681557e-06, "loss": 0.6808, "step": 9406 }, { "epoch": 3.600076540375048, "grad_norm": 0.6054151654243469, "learning_rate": 3.83930552970911e-06, "loss": 0.7181, "step": 9407 }, { "epoch": 3.6004592422502872, "grad_norm": 0.537646472454071, "learning_rate": 3.837353058327889e-06, "loss": 0.6528, "step": 9408 }, { "epoch": 3.600841944125526, "grad_norm": 0.5663532614707947, "learning_rate": 3.8354009656578775e-06, "loss": 0.7167, "step": 9409 }, { "epoch": 3.6012246460007655, "grad_norm": 0.5054287314414978, "learning_rate": 3.8334492518190415e-06, "loss": 0.5817, "step": 9410 }, { "epoch": 3.6016073478760045, "grad_norm": 0.5710376501083374, "learning_rate": 3.831497916931311e-06, "loss": 0.6318, "step": 9411 }, { "epoch": 3.601990049751244, "grad_norm": 0.6879833340644836, "learning_rate": 3.829546961114608e-06, "loss": 0.6754, "step": 9412 }, { "epoch": 3.6023727516264827, "grad_norm": 0.5239124298095703, "learning_rate": 3.827596384488822e-06, "loss": 0.6385, "step": 9413 }, { "epoch": 3.602755453501722, "grad_norm": 0.5739216208457947, "learning_rate": 3.825646187173818e-06, "loss": 0.6545, "step": 9414 }, { "epoch": 3.6031381553769615, "grad_norm": 0.5375769734382629, "learning_rate": 3.823696369289442e-06, "loss": 0.662, "step": 9415 }, { "epoch": 3.6035208572522004, "grad_norm": 0.5476676821708679, "learning_rate": 3.8217469309555175e-06, "loss": 0.546, "step": 9416 }, { "epoch": 3.6039035591274398, "grad_norm": 0.5577735304832458, "learning_rate": 3.819797872291843e-06, "loss": 0.6181, "step": 9417 }, { "epoch": 3.604286261002679, "grad_norm": 0.5319965481758118, "learning_rate": 3.8178491934181905e-06, "loss": 0.6931, "step": 9418 }, { "epoch": 3.604668962877918, "grad_norm": 0.5692721605300903, "learning_rate": 3.815900894454317e-06, "loss": 0.6369, "step": 9419 }, { "epoch": 3.6050516647531574, "grad_norm": 0.5237733125686646, "learning_rate": 3.8139529755199444e-06, "loss": 0.629, "step": 9420 }, { "epoch": 3.6054343666283963, "grad_norm": 0.644747793674469, "learning_rate": 3.8120054367347802e-06, "loss": 0.5948, "step": 9421 }, { "epoch": 3.6058170685036357, "grad_norm": 0.5495641231536865, "learning_rate": 3.810058278218506e-06, "loss": 0.6284, "step": 9422 }, { "epoch": 3.6061997703788746, "grad_norm": 0.600850522518158, "learning_rate": 3.808111500090782e-06, "loss": 0.6859, "step": 9423 }, { "epoch": 3.606582472254114, "grad_norm": 0.5325748324394226, "learning_rate": 3.8061651024712386e-06, "loss": 0.6651, "step": 9424 }, { "epoch": 3.6069651741293534, "grad_norm": 0.5430147051811218, "learning_rate": 3.8042190854794893e-06, "loss": 0.6248, "step": 9425 }, { "epoch": 3.6073478760045923, "grad_norm": 0.5638641119003296, "learning_rate": 3.802273449235121e-06, "loss": 0.6649, "step": 9426 }, { "epoch": 3.6077305778798316, "grad_norm": 0.7676005959510803, "learning_rate": 3.800328193857701e-06, "loss": 0.6224, "step": 9427 }, { "epoch": 3.608113279755071, "grad_norm": 0.5862032771110535, "learning_rate": 3.7983833194667673e-06, "loss": 0.6554, "step": 9428 }, { "epoch": 3.60849598163031, "grad_norm": 0.5037775039672852, "learning_rate": 3.796438826181843e-06, "loss": 0.5694, "step": 9429 }, { "epoch": 3.6088786835055493, "grad_norm": 0.61687171459198, "learning_rate": 3.7944947141224153e-06, "loss": 0.6821, "step": 9430 }, { "epoch": 3.6092613853807882, "grad_norm": 0.5026047825813293, "learning_rate": 3.7925509834079587e-06, "loss": 0.5883, "step": 9431 }, { "epoch": 3.6096440872560276, "grad_norm": 0.5487895607948303, "learning_rate": 3.7906076341579235e-06, "loss": 0.576, "step": 9432 }, { "epoch": 3.6100267891312665, "grad_norm": 0.5095998048782349, "learning_rate": 3.7886646664917282e-06, "loss": 0.5592, "step": 9433 }, { "epoch": 3.610409491006506, "grad_norm": 0.5177849531173706, "learning_rate": 3.7867220805287753e-06, "loss": 0.6376, "step": 9434 }, { "epoch": 3.6107921928817452, "grad_norm": 0.5719823837280273, "learning_rate": 3.7847798763884413e-06, "loss": 0.6239, "step": 9435 }, { "epoch": 3.611174894756984, "grad_norm": 0.5383319854736328, "learning_rate": 3.782838054190082e-06, "loss": 0.6271, "step": 9436 }, { "epoch": 3.6115575966322235, "grad_norm": 0.5443828701972961, "learning_rate": 3.7808966140530267e-06, "loss": 0.5434, "step": 9437 }, { "epoch": 3.611940298507463, "grad_norm": 0.5064697861671448, "learning_rate": 3.778955556096582e-06, "loss": 0.6157, "step": 9438 }, { "epoch": 3.612323000382702, "grad_norm": 0.5554393529891968, "learning_rate": 3.7770148804400342e-06, "loss": 0.5948, "step": 9439 }, { "epoch": 3.612705702257941, "grad_norm": 0.5472335815429688, "learning_rate": 3.7750745872026373e-06, "loss": 0.561, "step": 9440 }, { "epoch": 3.61308840413318, "grad_norm": 0.5223163962364197, "learning_rate": 3.773134676503629e-06, "loss": 0.5359, "step": 9441 }, { "epoch": 3.6134711060084195, "grad_norm": 0.5096899271011353, "learning_rate": 3.7711951484622278e-06, "loss": 0.5945, "step": 9442 }, { "epoch": 3.6138538078836584, "grad_norm": 0.5478146076202393, "learning_rate": 3.7692560031976143e-06, "loss": 0.6679, "step": 9443 }, { "epoch": 3.6142365097588978, "grad_norm": 0.5642886757850647, "learning_rate": 3.767317240828959e-06, "loss": 0.5917, "step": 9444 }, { "epoch": 3.614619211634137, "grad_norm": 0.5712054967880249, "learning_rate": 3.7653788614754026e-06, "loss": 0.6741, "step": 9445 }, { "epoch": 3.615001913509376, "grad_norm": 0.5989255309104919, "learning_rate": 3.763440865256065e-06, "loss": 0.673, "step": 9446 }, { "epoch": 3.6153846153846154, "grad_norm": 0.5071102380752563, "learning_rate": 3.7615032522900396e-06, "loss": 0.5679, "step": 9447 }, { "epoch": 3.615767317259855, "grad_norm": 0.5212725400924683, "learning_rate": 3.7595660226964e-06, "loss": 0.6203, "step": 9448 }, { "epoch": 3.6161500191350937, "grad_norm": 0.5457295775413513, "learning_rate": 3.7576291765941964e-06, "loss": 0.6387, "step": 9449 }, { "epoch": 3.616532721010333, "grad_norm": 0.5490075945854187, "learning_rate": 3.7556927141024456e-06, "loss": 0.6315, "step": 9450 }, { "epoch": 3.616915422885572, "grad_norm": 0.5201722979545593, "learning_rate": 3.7537566353401543e-06, "loss": 0.5879, "step": 9451 }, { "epoch": 3.6172981247608114, "grad_norm": 0.545558512210846, "learning_rate": 3.7518209404263006e-06, "loss": 0.6356, "step": 9452 }, { "epoch": 3.6176808266360503, "grad_norm": 0.5646321177482605, "learning_rate": 3.7498856294798325e-06, "loss": 0.6562, "step": 9453 }, { "epoch": 3.6180635285112897, "grad_norm": 0.5531801581382751, "learning_rate": 3.747950702619684e-06, "loss": 0.6913, "step": 9454 }, { "epoch": 3.618446230386529, "grad_norm": 0.5816057920455933, "learning_rate": 3.746016159964764e-06, "loss": 0.649, "step": 9455 }, { "epoch": 3.618828932261768, "grad_norm": 0.5636683106422424, "learning_rate": 3.7440820016339454e-06, "loss": 0.6047, "step": 9456 }, { "epoch": 3.6192116341370073, "grad_norm": 0.5628538131713867, "learning_rate": 3.7421482277460974e-06, "loss": 0.5812, "step": 9457 }, { "epoch": 3.6195943360122467, "grad_norm": 0.5308190584182739, "learning_rate": 3.7402148384200555e-06, "loss": 0.5748, "step": 9458 }, { "epoch": 3.6199770378874856, "grad_norm": 0.5395619869232178, "learning_rate": 3.7382818337746264e-06, "loss": 0.7127, "step": 9459 }, { "epoch": 3.620359739762725, "grad_norm": 0.5624816417694092, "learning_rate": 3.7363492139285995e-06, "loss": 0.6043, "step": 9460 }, { "epoch": 3.620742441637964, "grad_norm": 0.5852784514427185, "learning_rate": 3.734416979000741e-06, "loss": 0.6033, "step": 9461 }, { "epoch": 3.6211251435132032, "grad_norm": 0.5760985016822815, "learning_rate": 3.7324851291097954e-06, "loss": 0.6964, "step": 9462 }, { "epoch": 3.621507845388442, "grad_norm": 0.5500501990318298, "learning_rate": 3.7305536643744724e-06, "loss": 0.6388, "step": 9463 }, { "epoch": 3.6218905472636815, "grad_norm": 0.6137940287590027, "learning_rate": 3.7286225849134695e-06, "loss": 0.6054, "step": 9464 }, { "epoch": 3.622273249138921, "grad_norm": 0.5173502564430237, "learning_rate": 3.72669189084546e-06, "loss": 0.6076, "step": 9465 }, { "epoch": 3.62265595101416, "grad_norm": 0.5696431994438171, "learning_rate": 3.72476158228908e-06, "loss": 0.6139, "step": 9466 }, { "epoch": 3.623038652889399, "grad_norm": 0.6154253482818604, "learning_rate": 3.7228316593629633e-06, "loss": 0.5741, "step": 9467 }, { "epoch": 3.6234213547646386, "grad_norm": 0.5386055707931519, "learning_rate": 3.720902122185708e-06, "loss": 0.6522, "step": 9468 }, { "epoch": 3.6238040566398775, "grad_norm": 0.6268584132194519, "learning_rate": 3.7189729708758828e-06, "loss": 0.6024, "step": 9469 }, { "epoch": 3.624186758515117, "grad_norm": 0.5785117745399475, "learning_rate": 3.7170442055520418e-06, "loss": 0.5931, "step": 9470 }, { "epoch": 3.6245694603903558, "grad_norm": 0.5217722058296204, "learning_rate": 3.715115826332717e-06, "loss": 0.5735, "step": 9471 }, { "epoch": 3.624952162265595, "grad_norm": 0.5570452809333801, "learning_rate": 3.7131878333364056e-06, "loss": 0.6798, "step": 9472 }, { "epoch": 3.625334864140834, "grad_norm": 0.5444090366363525, "learning_rate": 3.711260226681591e-06, "loss": 0.6286, "step": 9473 }, { "epoch": 3.6257175660160734, "grad_norm": 0.543280303478241, "learning_rate": 3.7093330064867304e-06, "loss": 0.6237, "step": 9474 }, { "epoch": 3.626100267891313, "grad_norm": 0.5044546127319336, "learning_rate": 3.7074061728702605e-06, "loss": 0.5794, "step": 9475 }, { "epoch": 3.6264829697665517, "grad_norm": 0.5729519724845886, "learning_rate": 3.7054797259505814e-06, "loss": 0.68, "step": 9476 }, { "epoch": 3.626865671641791, "grad_norm": 0.5552915334701538, "learning_rate": 3.7035536658460813e-06, "loss": 0.5838, "step": 9477 }, { "epoch": 3.6272483735170304, "grad_norm": 0.5810418128967285, "learning_rate": 3.70162799267513e-06, "loss": 0.6134, "step": 9478 }, { "epoch": 3.6276310753922694, "grad_norm": 0.5154591202735901, "learning_rate": 3.6997027065560553e-06, "loss": 0.6044, "step": 9479 }, { "epoch": 3.6280137772675087, "grad_norm": 0.5589325428009033, "learning_rate": 3.6977778076071757e-06, "loss": 0.6527, "step": 9480 }, { "epoch": 3.6283964791427477, "grad_norm": 0.545913577079773, "learning_rate": 3.695853295946783e-06, "loss": 0.5918, "step": 9481 }, { "epoch": 3.628779181017987, "grad_norm": 0.5505867600440979, "learning_rate": 3.6939291716931393e-06, "loss": 0.6345, "step": 9482 }, { "epoch": 3.629161882893226, "grad_norm": 0.5316153764724731, "learning_rate": 3.6920054349644885e-06, "loss": 0.64, "step": 9483 }, { "epoch": 3.6295445847684653, "grad_norm": 0.5415337681770325, "learning_rate": 3.690082085879053e-06, "loss": 0.5792, "step": 9484 }, { "epoch": 3.6299272866437047, "grad_norm": 0.5391581654548645, "learning_rate": 3.688159124555022e-06, "loss": 0.6469, "step": 9485 }, { "epoch": 3.6303099885189436, "grad_norm": 0.5433171391487122, "learning_rate": 3.6862365511105692e-06, "loss": 0.576, "step": 9486 }, { "epoch": 3.630692690394183, "grad_norm": 0.5037914514541626, "learning_rate": 3.6843143656638425e-06, "loss": 0.6189, "step": 9487 }, { "epoch": 3.6310753922694223, "grad_norm": 0.5737419128417969, "learning_rate": 3.6823925683329655e-06, "loss": 0.6127, "step": 9488 }, { "epoch": 3.6314580941446613, "grad_norm": 0.49432826042175293, "learning_rate": 3.6804711592360366e-06, "loss": 0.5268, "step": 9489 }, { "epoch": 3.6318407960199006, "grad_norm": 0.5019313097000122, "learning_rate": 3.6785501384911326e-06, "loss": 0.5979, "step": 9490 }, { "epoch": 3.6322234978951395, "grad_norm": 0.5399207472801208, "learning_rate": 3.6766295062163083e-06, "loss": 0.6441, "step": 9491 }, { "epoch": 3.632606199770379, "grad_norm": 0.5516269207000732, "learning_rate": 3.6747092625295846e-06, "loss": 0.5792, "step": 9492 }, { "epoch": 3.632988901645618, "grad_norm": 0.5945339798927307, "learning_rate": 3.67278940754897e-06, "loss": 0.6477, "step": 9493 }, { "epoch": 3.633371603520857, "grad_norm": 0.5241367816925049, "learning_rate": 3.6708699413924486e-06, "loss": 0.585, "step": 9494 }, { "epoch": 3.6337543053960966, "grad_norm": 0.5165520906448364, "learning_rate": 3.668950864177968e-06, "loss": 0.6341, "step": 9495 }, { "epoch": 3.6341370072713355, "grad_norm": 0.5578262805938721, "learning_rate": 3.667032176023465e-06, "loss": 0.655, "step": 9496 }, { "epoch": 3.634519709146575, "grad_norm": 0.5327039361000061, "learning_rate": 3.6651138770468487e-06, "loss": 0.6107, "step": 9497 }, { "epoch": 3.634902411021814, "grad_norm": 0.6421614289283752, "learning_rate": 3.663195967366003e-06, "loss": 0.6033, "step": 9498 }, { "epoch": 3.635285112897053, "grad_norm": 0.5735668540000916, "learning_rate": 3.6612784470987893e-06, "loss": 0.6765, "step": 9499 }, { "epoch": 3.6356678147722925, "grad_norm": 0.5629181861877441, "learning_rate": 3.6593613163630437e-06, "loss": 0.6815, "step": 9500 }, { "epoch": 3.6360505166475314, "grad_norm": 0.5352336764335632, "learning_rate": 3.657444575276583e-06, "loss": 0.6771, "step": 9501 }, { "epoch": 3.636433218522771, "grad_norm": 0.652988851070404, "learning_rate": 3.6555282239571887e-06, "loss": 0.6666, "step": 9502 }, { "epoch": 3.6368159203980097, "grad_norm": 0.5267829298973083, "learning_rate": 3.653612262522629e-06, "loss": 0.622, "step": 9503 }, { "epoch": 3.637198622273249, "grad_norm": 0.583365261554718, "learning_rate": 3.651696691090649e-06, "loss": 0.5982, "step": 9504 }, { "epoch": 3.6375813241484884, "grad_norm": 0.5562605261802673, "learning_rate": 3.649781509778959e-06, "loss": 0.6306, "step": 9505 }, { "epoch": 3.6379640260237274, "grad_norm": 0.49458596110343933, "learning_rate": 3.647866718705254e-06, "loss": 0.6169, "step": 9506 }, { "epoch": 3.6383467278989667, "grad_norm": 0.5815884470939636, "learning_rate": 3.6459523179872047e-06, "loss": 0.6585, "step": 9507 }, { "epoch": 3.638729429774206, "grad_norm": 0.5751127004623413, "learning_rate": 3.6440383077424556e-06, "loss": 0.602, "step": 9508 }, { "epoch": 3.639112131649445, "grad_norm": 0.4958820641040802, "learning_rate": 3.642124688088627e-06, "loss": 0.6006, "step": 9509 }, { "epoch": 3.6394948335246844, "grad_norm": 0.5661231875419617, "learning_rate": 3.6402114591433214e-06, "loss": 0.608, "step": 9510 }, { "epoch": 3.6398775353999233, "grad_norm": 0.5926946997642517, "learning_rate": 3.6382986210241035e-06, "loss": 0.5764, "step": 9511 }, { "epoch": 3.6402602372751627, "grad_norm": 0.508571982383728, "learning_rate": 3.636386173848525e-06, "loss": 0.6696, "step": 9512 }, { "epoch": 3.6406429391504016, "grad_norm": 0.5718238353729248, "learning_rate": 3.6344741177341126e-06, "loss": 0.6411, "step": 9513 }, { "epoch": 3.641025641025641, "grad_norm": 0.6035239100456238, "learning_rate": 3.6325624527983704e-06, "loss": 0.6661, "step": 9514 }, { "epoch": 3.6414083429008803, "grad_norm": 0.5743820667266846, "learning_rate": 3.6306511791587673e-06, "loss": 0.6278, "step": 9515 }, { "epoch": 3.6417910447761193, "grad_norm": 0.5321818590164185, "learning_rate": 3.62874029693276e-06, "loss": 0.5899, "step": 9516 }, { "epoch": 3.6421737466513586, "grad_norm": 0.5469914078712463, "learning_rate": 3.626829806237779e-06, "loss": 0.6256, "step": 9517 }, { "epoch": 3.642556448526598, "grad_norm": 0.5840620398521423, "learning_rate": 3.624919707191227e-06, "loss": 0.6566, "step": 9518 }, { "epoch": 3.642939150401837, "grad_norm": 0.5099475383758545, "learning_rate": 3.623009999910486e-06, "loss": 0.6006, "step": 9519 }, { "epoch": 3.6433218522770763, "grad_norm": 0.6042795777320862, "learning_rate": 3.621100684512916e-06, "loss": 0.6626, "step": 9520 }, { "epoch": 3.643704554152315, "grad_norm": 0.5720909237861633, "learning_rate": 3.6191917611158424e-06, "loss": 0.6234, "step": 9521 }, { "epoch": 3.6440872560275546, "grad_norm": 0.5542675256729126, "learning_rate": 3.6172832298365756e-06, "loss": 0.5971, "step": 9522 }, { "epoch": 3.6444699579027935, "grad_norm": 0.5304833650588989, "learning_rate": 3.615375090792406e-06, "loss": 0.6202, "step": 9523 }, { "epoch": 3.644852659778033, "grad_norm": 0.5485625267028809, "learning_rate": 3.6134673441005864e-06, "loss": 0.6638, "step": 9524 }, { "epoch": 3.645235361653272, "grad_norm": 0.5714985132217407, "learning_rate": 3.611559989878355e-06, "loss": 0.6195, "step": 9525 }, { "epoch": 3.645618063528511, "grad_norm": 0.5447081327438354, "learning_rate": 3.609653028242924e-06, "loss": 0.6673, "step": 9526 }, { "epoch": 3.6460007654037505, "grad_norm": 0.5398667454719543, "learning_rate": 3.607746459311483e-06, "loss": 0.5573, "step": 9527 }, { "epoch": 3.64638346727899, "grad_norm": 0.5338885188102722, "learning_rate": 3.6058402832011953e-06, "loss": 0.6132, "step": 9528 }, { "epoch": 3.646766169154229, "grad_norm": 0.52582186460495, "learning_rate": 3.603934500029199e-06, "loss": 0.5947, "step": 9529 }, { "epoch": 3.647148871029468, "grad_norm": 0.6348018050193787, "learning_rate": 3.602029109912615e-06, "loss": 0.6455, "step": 9530 }, { "epoch": 3.647531572904707, "grad_norm": 0.5246788263320923, "learning_rate": 3.600124112968526e-06, "loss": 0.6513, "step": 9531 }, { "epoch": 3.6479142747799465, "grad_norm": 0.5322864651679993, "learning_rate": 3.5982195093140036e-06, "loss": 0.5588, "step": 9532 }, { "epoch": 3.6482969766551854, "grad_norm": 0.5642610788345337, "learning_rate": 3.5963152990660943e-06, "loss": 0.6202, "step": 9533 }, { "epoch": 3.6486796785304247, "grad_norm": 0.5465490221977234, "learning_rate": 3.5944114823418098e-06, "loss": 0.6822, "step": 9534 }, { "epoch": 3.649062380405664, "grad_norm": 0.5632639527320862, "learning_rate": 3.5925080592581486e-06, "loss": 0.6524, "step": 9535 }, { "epoch": 3.649445082280903, "grad_norm": 0.5198510885238647, "learning_rate": 3.5906050299320805e-06, "loss": 0.5332, "step": 9536 }, { "epoch": 3.6498277841561424, "grad_norm": 0.5416030883789062, "learning_rate": 3.5887023944805523e-06, "loss": 0.6394, "step": 9537 }, { "epoch": 3.6502104860313818, "grad_norm": 0.5015885829925537, "learning_rate": 3.5868001530204856e-06, "loss": 0.5565, "step": 9538 }, { "epoch": 3.6505931879066207, "grad_norm": 0.5349140167236328, "learning_rate": 3.584898305668778e-06, "loss": 0.6088, "step": 9539 }, { "epoch": 3.65097588978186, "grad_norm": 0.5230055451393127, "learning_rate": 3.5829968525423076e-06, "loss": 0.6623, "step": 9540 }, { "epoch": 3.651358591657099, "grad_norm": 0.5654947757720947, "learning_rate": 3.5810957937579172e-06, "loss": 0.6594, "step": 9541 }, { "epoch": 3.6517412935323383, "grad_norm": 0.5033594369888306, "learning_rate": 3.5791951294324336e-06, "loss": 0.5804, "step": 9542 }, { "epoch": 3.6521239954075773, "grad_norm": 0.4766111671924591, "learning_rate": 3.5772948596826617e-06, "loss": 0.5588, "step": 9543 }, { "epoch": 3.6525066972828166, "grad_norm": 0.5088559985160828, "learning_rate": 3.5753949846253734e-06, "loss": 0.5491, "step": 9544 }, { "epoch": 3.652889399158056, "grad_norm": 0.5954936146736145, "learning_rate": 3.5734955043773214e-06, "loss": 0.6627, "step": 9545 }, { "epoch": 3.653272101033295, "grad_norm": 0.5615391731262207, "learning_rate": 3.5715964190552368e-06, "loss": 0.6975, "step": 9546 }, { "epoch": 3.6536548029085343, "grad_norm": 0.5290510058403015, "learning_rate": 3.5696977287758205e-06, "loss": 0.6665, "step": 9547 }, { "epoch": 3.6540375047837736, "grad_norm": 0.5714266896247864, "learning_rate": 3.567799433655754e-06, "loss": 0.5845, "step": 9548 }, { "epoch": 3.6544202066590126, "grad_norm": 0.5845206379890442, "learning_rate": 3.5659015338116964e-06, "loss": 0.7117, "step": 9549 }, { "epoch": 3.654802908534252, "grad_norm": 0.5284110903739929, "learning_rate": 3.5640040293602717e-06, "loss": 0.561, "step": 9550 }, { "epoch": 3.655185610409491, "grad_norm": 0.48644986748695374, "learning_rate": 3.562106920418089e-06, "loss": 0.5625, "step": 9551 }, { "epoch": 3.6555683122847302, "grad_norm": 0.5677610039710999, "learning_rate": 3.560210207101731e-06, "loss": 0.6268, "step": 9552 }, { "epoch": 3.655951014159969, "grad_norm": 0.6040377616882324, "learning_rate": 3.55831388952776e-06, "loss": 0.7141, "step": 9553 }, { "epoch": 3.6563337160352085, "grad_norm": 0.5575169920921326, "learning_rate": 3.556417967812703e-06, "loss": 0.628, "step": 9554 }, { "epoch": 3.656716417910448, "grad_norm": 0.5684942007064819, "learning_rate": 3.5545224420730717e-06, "loss": 0.6553, "step": 9555 }, { "epoch": 3.657099119785687, "grad_norm": 0.5293998718261719, "learning_rate": 3.552627312425353e-06, "loss": 0.5106, "step": 9556 }, { "epoch": 3.657481821660926, "grad_norm": 0.5043238997459412, "learning_rate": 3.550732578986006e-06, "loss": 0.5668, "step": 9557 }, { "epoch": 3.6578645235361655, "grad_norm": 0.5969059467315674, "learning_rate": 3.5488382418714685e-06, "loss": 0.5995, "step": 9558 }, { "epoch": 3.6582472254114045, "grad_norm": 0.56044602394104, "learning_rate": 3.546944301198154e-06, "loss": 0.5876, "step": 9559 }, { "epoch": 3.658629927286644, "grad_norm": 0.5717214941978455, "learning_rate": 3.5450507570824465e-06, "loss": 0.6103, "step": 9560 }, { "epoch": 3.6590126291618827, "grad_norm": 0.5372283458709717, "learning_rate": 3.5431576096407106e-06, "loss": 0.5832, "step": 9561 }, { "epoch": 3.659395331037122, "grad_norm": 0.5304058194160461, "learning_rate": 3.5412648589892897e-06, "loss": 0.6392, "step": 9562 }, { "epoch": 3.659778032912361, "grad_norm": 0.5592396855354309, "learning_rate": 3.5393725052444906e-06, "loss": 0.6615, "step": 9563 }, { "epoch": 3.6601607347876004, "grad_norm": 0.5284397006034851, "learning_rate": 3.5374805485226072e-06, "loss": 0.5891, "step": 9564 }, { "epoch": 3.6605434366628398, "grad_norm": 0.59660804271698, "learning_rate": 3.5355889889399054e-06, "loss": 0.6964, "step": 9565 }, { "epoch": 3.6609261385380787, "grad_norm": 0.5781922340393066, "learning_rate": 3.5336978266126277e-06, "loss": 0.6236, "step": 9566 }, { "epoch": 3.661308840413318, "grad_norm": 0.5739726424217224, "learning_rate": 3.53180706165699e-06, "loss": 0.6301, "step": 9567 }, { "epoch": 3.6616915422885574, "grad_norm": 0.5396193265914917, "learning_rate": 3.5299166941891836e-06, "loss": 0.6582, "step": 9568 }, { "epoch": 3.6620742441637963, "grad_norm": 0.5395243763923645, "learning_rate": 3.528026724325383e-06, "loss": 0.6321, "step": 9569 }, { "epoch": 3.6624569460390357, "grad_norm": 0.5396141409873962, "learning_rate": 3.5261371521817247e-06, "loss": 0.5713, "step": 9570 }, { "epoch": 3.6628396479142746, "grad_norm": 0.5491786003112793, "learning_rate": 3.524247977874329e-06, "loss": 0.5548, "step": 9571 }, { "epoch": 3.663222349789514, "grad_norm": 0.5872480273246765, "learning_rate": 3.522359201519295e-06, "loss": 0.618, "step": 9572 }, { "epoch": 3.663605051664753, "grad_norm": 0.5294017195701599, "learning_rate": 3.5204708232326877e-06, "loss": 0.5707, "step": 9573 }, { "epoch": 3.6639877535399923, "grad_norm": 0.5900763273239136, "learning_rate": 3.5185828431305547e-06, "loss": 0.6771, "step": 9574 }, { "epoch": 3.6643704554152317, "grad_norm": 0.5402776598930359, "learning_rate": 3.516695261328923e-06, "loss": 0.5822, "step": 9575 }, { "epoch": 3.6647531572904706, "grad_norm": 0.5423122644424438, "learning_rate": 3.51480807794378e-06, "loss": 0.6693, "step": 9576 }, { "epoch": 3.66513585916571, "grad_norm": 0.5546485185623169, "learning_rate": 3.5129212930911004e-06, "loss": 0.6121, "step": 9577 }, { "epoch": 3.6655185610409493, "grad_norm": 0.5193621516227722, "learning_rate": 3.5110349068868377e-06, "loss": 0.6587, "step": 9578 }, { "epoch": 3.6659012629161882, "grad_norm": 0.5628952980041504, "learning_rate": 3.5091489194469163e-06, "loss": 0.6753, "step": 9579 }, { "epoch": 3.6662839647914276, "grad_norm": 0.5492024421691895, "learning_rate": 3.5072633308872285e-06, "loss": 0.6318, "step": 9580 }, { "epoch": 3.6666666666666665, "grad_norm": 0.5121034979820251, "learning_rate": 3.505378141323651e-06, "loss": 0.6556, "step": 9581 }, { "epoch": 3.667049368541906, "grad_norm": 0.5541493892669678, "learning_rate": 3.503493350872038e-06, "loss": 0.6842, "step": 9582 }, { "epoch": 3.667432070417145, "grad_norm": 0.6101325750350952, "learning_rate": 3.5016089596482085e-06, "loss": 0.6027, "step": 9583 }, { "epoch": 3.667814772292384, "grad_norm": 0.5902135372161865, "learning_rate": 3.4997249677679656e-06, "loss": 0.6064, "step": 9584 }, { "epoch": 3.6681974741676235, "grad_norm": 0.5966252684593201, "learning_rate": 3.4978413753470896e-06, "loss": 0.6467, "step": 9585 }, { "epoch": 3.6685801760428625, "grad_norm": 0.5835254192352295, "learning_rate": 3.4959581825013256e-06, "loss": 0.6209, "step": 9586 }, { "epoch": 3.668962877918102, "grad_norm": 0.5486626029014587, "learning_rate": 3.4940753893464053e-06, "loss": 0.6291, "step": 9587 }, { "epoch": 3.669345579793341, "grad_norm": 0.5392060279846191, "learning_rate": 3.4921929959980296e-06, "loss": 0.5945, "step": 9588 }, { "epoch": 3.66972828166858, "grad_norm": 0.5237155556678772, "learning_rate": 3.490311002571878e-06, "loss": 0.5976, "step": 9589 }, { "epoch": 3.6701109835438195, "grad_norm": 0.5598379969596863, "learning_rate": 3.488429409183602e-06, "loss": 0.6303, "step": 9590 }, { "epoch": 3.6704936854190584, "grad_norm": 0.5513355731964111, "learning_rate": 3.4865482159488375e-06, "loss": 0.542, "step": 9591 }, { "epoch": 3.6708763872942978, "grad_norm": 0.553986132144928, "learning_rate": 3.484667422983179e-06, "loss": 0.6481, "step": 9592 }, { "epoch": 3.6712590891695367, "grad_norm": 0.5138723254203796, "learning_rate": 3.4827870304022116e-06, "loss": 0.6897, "step": 9593 }, { "epoch": 3.671641791044776, "grad_norm": 0.6150749325752258, "learning_rate": 3.4809070383214883e-06, "loss": 0.6565, "step": 9594 }, { "epoch": 3.6720244929200154, "grad_norm": 0.5580517053604126, "learning_rate": 3.4790274468565454e-06, "loss": 0.6229, "step": 9595 }, { "epoch": 3.6724071947952543, "grad_norm": 0.5672194361686707, "learning_rate": 3.477148256122881e-06, "loss": 0.6672, "step": 9596 }, { "epoch": 3.6727898966704937, "grad_norm": 0.5723077058792114, "learning_rate": 3.4752694662359787e-06, "loss": 0.59, "step": 9597 }, { "epoch": 3.673172598545733, "grad_norm": 0.5865790843963623, "learning_rate": 3.473391077311298e-06, "loss": 0.6664, "step": 9598 }, { "epoch": 3.673555300420972, "grad_norm": 0.5978556871414185, "learning_rate": 3.4715130894642677e-06, "loss": 0.5626, "step": 9599 }, { "epoch": 3.6739380022962114, "grad_norm": 0.505406379699707, "learning_rate": 3.469635502810298e-06, "loss": 0.6369, "step": 9600 }, { "epoch": 3.6743207041714503, "grad_norm": 0.530310332775116, "learning_rate": 3.467758317464772e-06, "loss": 0.6432, "step": 9601 }, { "epoch": 3.6747034060466897, "grad_norm": 0.5823054909706116, "learning_rate": 3.465881533543045e-06, "loss": 0.6554, "step": 9602 }, { "epoch": 3.6750861079219286, "grad_norm": 0.5305204391479492, "learning_rate": 3.46400515116045e-06, "loss": 0.5766, "step": 9603 }, { "epoch": 3.675468809797168, "grad_norm": 0.7641539573669434, "learning_rate": 3.462129170432301e-06, "loss": 0.6345, "step": 9604 }, { "epoch": 3.6758515116724073, "grad_norm": 0.5555742979049683, "learning_rate": 3.4602535914738764e-06, "loss": 0.6076, "step": 9605 }, { "epoch": 3.6762342135476462, "grad_norm": 0.5408474206924438, "learning_rate": 3.458378414400436e-06, "loss": 0.6383, "step": 9606 }, { "epoch": 3.6766169154228856, "grad_norm": 0.5564239621162415, "learning_rate": 3.4565036393272165e-06, "loss": 0.685, "step": 9607 }, { "epoch": 3.676999617298125, "grad_norm": 0.5596725344657898, "learning_rate": 3.454629266369428e-06, "loss": 0.5994, "step": 9608 }, { "epoch": 3.677382319173364, "grad_norm": 0.5472362041473389, "learning_rate": 3.452755295642255e-06, "loss": 0.6878, "step": 9609 }, { "epoch": 3.6777650210486033, "grad_norm": 0.5224126577377319, "learning_rate": 3.4508817272608575e-06, "loss": 0.64, "step": 9610 }, { "epoch": 3.678147722923842, "grad_norm": 0.5912081599235535, "learning_rate": 3.4490085613403755e-06, "loss": 0.6504, "step": 9611 }, { "epoch": 3.6785304247990815, "grad_norm": 0.550175666809082, "learning_rate": 3.447135797995913e-06, "loss": 0.6045, "step": 9612 }, { "epoch": 3.6789131266743205, "grad_norm": 0.5889111757278442, "learning_rate": 3.445263437342561e-06, "loss": 0.6044, "step": 9613 }, { "epoch": 3.67929582854956, "grad_norm": 0.5545201897621155, "learning_rate": 3.4433914794953826e-06, "loss": 0.6364, "step": 9614 }, { "epoch": 3.679678530424799, "grad_norm": 0.5390269160270691, "learning_rate": 3.4415199245694084e-06, "loss": 0.639, "step": 9615 }, { "epoch": 3.680061232300038, "grad_norm": 0.5617865920066833, "learning_rate": 3.4396487726796545e-06, "loss": 0.5632, "step": 9616 }, { "epoch": 3.6804439341752775, "grad_norm": 0.6166203618049622, "learning_rate": 3.4377780239411086e-06, "loss": 0.6594, "step": 9617 }, { "epoch": 3.680826636050517, "grad_norm": 0.5717048048973083, "learning_rate": 3.4359076784687316e-06, "loss": 0.6766, "step": 9618 }, { "epoch": 3.6812093379257558, "grad_norm": 0.5872711539268494, "learning_rate": 3.4340377363774626e-06, "loss": 0.5877, "step": 9619 }, { "epoch": 3.681592039800995, "grad_norm": 0.5382428169250488, "learning_rate": 3.432168197782213e-06, "loss": 0.6133, "step": 9620 }, { "epoch": 3.681974741676234, "grad_norm": 0.5493345856666565, "learning_rate": 3.430299062797876e-06, "loss": 0.5804, "step": 9621 }, { "epoch": 3.6823574435514734, "grad_norm": 0.5246384739875793, "learning_rate": 3.4284303315393085e-06, "loss": 0.5901, "step": 9622 }, { "epoch": 3.6827401454267124, "grad_norm": 0.5443353652954102, "learning_rate": 3.4265620041213508e-06, "loss": 0.6584, "step": 9623 }, { "epoch": 3.6831228473019517, "grad_norm": 0.5650492906570435, "learning_rate": 3.4246940806588202e-06, "loss": 0.6053, "step": 9624 }, { "epoch": 3.683505549177191, "grad_norm": 0.5361307263374329, "learning_rate": 3.4228265612665e-06, "loss": 0.6182, "step": 9625 }, { "epoch": 3.68388825105243, "grad_norm": 0.5477315783500671, "learning_rate": 3.4209594460591576e-06, "loss": 0.5965, "step": 9626 }, { "epoch": 3.6842709529276694, "grad_norm": 0.545179009437561, "learning_rate": 3.4190927351515313e-06, "loss": 0.5759, "step": 9627 }, { "epoch": 3.6846536548029087, "grad_norm": 0.5443210005760193, "learning_rate": 3.4172264286583367e-06, "loss": 0.6518, "step": 9628 }, { "epoch": 3.6850363566781477, "grad_norm": 0.5643564462661743, "learning_rate": 3.4153605266942614e-06, "loss": 0.6557, "step": 9629 }, { "epoch": 3.685419058553387, "grad_norm": 0.596716582775116, "learning_rate": 3.4134950293739753e-06, "loss": 0.665, "step": 9630 }, { "epoch": 3.685801760428626, "grad_norm": 0.5683025121688843, "learning_rate": 3.411629936812111e-06, "loss": 0.7413, "step": 9631 }, { "epoch": 3.6861844623038653, "grad_norm": 0.5065093040466309, "learning_rate": 3.409765249123287e-06, "loss": 0.6386, "step": 9632 }, { "epoch": 3.6865671641791042, "grad_norm": 0.5092499256134033, "learning_rate": 3.4079009664220917e-06, "loss": 0.5912, "step": 9633 }, { "epoch": 3.6869498660543436, "grad_norm": 0.6057722568511963, "learning_rate": 3.4060370888230966e-06, "loss": 0.696, "step": 9634 }, { "epoch": 3.687332567929583, "grad_norm": 0.590476930141449, "learning_rate": 3.4041736164408323e-06, "loss": 0.6653, "step": 9635 }, { "epoch": 3.687715269804822, "grad_norm": 0.5683693885803223, "learning_rate": 3.40231054938982e-06, "loss": 0.6843, "step": 9636 }, { "epoch": 3.6880979716800613, "grad_norm": 0.5666566491127014, "learning_rate": 3.400447887784548e-06, "loss": 0.6384, "step": 9637 }, { "epoch": 3.6884806735553006, "grad_norm": 0.5158955454826355, "learning_rate": 3.398585631739484e-06, "loss": 0.5912, "step": 9638 }, { "epoch": 3.6888633754305395, "grad_norm": 0.5076144933700562, "learning_rate": 3.396723781369067e-06, "loss": 0.5494, "step": 9639 }, { "epoch": 3.689246077305779, "grad_norm": 0.5068195462226868, "learning_rate": 3.3948623367877166e-06, "loss": 0.6071, "step": 9640 }, { "epoch": 3.689628779181018, "grad_norm": 0.5768494606018066, "learning_rate": 3.393001298109817e-06, "loss": 0.6245, "step": 9641 }, { "epoch": 3.690011481056257, "grad_norm": 0.5807752013206482, "learning_rate": 3.391140665449737e-06, "loss": 0.6042, "step": 9642 }, { "epoch": 3.690394182931496, "grad_norm": 0.5630356073379517, "learning_rate": 3.3892804389218216e-06, "loss": 0.6111, "step": 9643 }, { "epoch": 3.6907768848067355, "grad_norm": 0.5877439975738525, "learning_rate": 3.387420618640379e-06, "loss": 0.6824, "step": 9644 }, { "epoch": 3.691159586681975, "grad_norm": 0.5217148065567017, "learning_rate": 3.385561204719704e-06, "loss": 0.5739, "step": 9645 }, { "epoch": 3.691542288557214, "grad_norm": 0.5449119806289673, "learning_rate": 3.3837021972740623e-06, "loss": 0.621, "step": 9646 }, { "epoch": 3.691924990432453, "grad_norm": 0.5438744425773621, "learning_rate": 3.3818435964176966e-06, "loss": 0.6087, "step": 9647 }, { "epoch": 3.6923076923076925, "grad_norm": 0.5030414462089539, "learning_rate": 3.3799854022648202e-06, "loss": 0.5597, "step": 9648 }, { "epoch": 3.6926903941829314, "grad_norm": 0.4961245357990265, "learning_rate": 3.378127614929626e-06, "loss": 0.6281, "step": 9649 }, { "epoch": 3.693073096058171, "grad_norm": 0.5184893608093262, "learning_rate": 3.3762702345262823e-06, "loss": 0.5148, "step": 9650 }, { "epoch": 3.6934557979334097, "grad_norm": 0.7346920967102051, "learning_rate": 3.374413261168924e-06, "loss": 0.6334, "step": 9651 }, { "epoch": 3.693838499808649, "grad_norm": 0.5528405904769897, "learning_rate": 3.3725566949716717e-06, "loss": 0.6343, "step": 9652 }, { "epoch": 3.694221201683888, "grad_norm": 0.5765792727470398, "learning_rate": 3.3707005360486167e-06, "loss": 0.6347, "step": 9653 }, { "epoch": 3.6946039035591274, "grad_norm": 0.5370559692382812, "learning_rate": 3.3688447845138204e-06, "loss": 0.6413, "step": 9654 }, { "epoch": 3.6949866054343667, "grad_norm": 0.5392832159996033, "learning_rate": 3.3669894404813277e-06, "loss": 0.5744, "step": 9655 }, { "epoch": 3.6953693073096057, "grad_norm": 0.529382586479187, "learning_rate": 3.3651345040651516e-06, "loss": 0.6148, "step": 9656 }, { "epoch": 3.695752009184845, "grad_norm": 0.540462851524353, "learning_rate": 3.3632799753792856e-06, "loss": 0.6755, "step": 9657 }, { "epoch": 3.6961347110600844, "grad_norm": 0.5660855770111084, "learning_rate": 3.3614258545376953e-06, "loss": 0.6343, "step": 9658 }, { "epoch": 3.6965174129353233, "grad_norm": 0.5241846442222595, "learning_rate": 3.359572141654319e-06, "loss": 0.57, "step": 9659 }, { "epoch": 3.6969001148105627, "grad_norm": 0.565778911113739, "learning_rate": 3.357718836843079e-06, "loss": 0.6193, "step": 9660 }, { "epoch": 3.6972828166858016, "grad_norm": 0.4865502417087555, "learning_rate": 3.3558659402178573e-06, "loss": 0.5496, "step": 9661 }, { "epoch": 3.697665518561041, "grad_norm": 0.5323857069015503, "learning_rate": 3.3540134518925226e-06, "loss": 0.6356, "step": 9662 }, { "epoch": 3.69804822043628, "grad_norm": 0.522785484790802, "learning_rate": 3.352161371980919e-06, "loss": 0.5664, "step": 9663 }, { "epoch": 3.6984309223115193, "grad_norm": 0.5146846771240234, "learning_rate": 3.3503097005968554e-06, "loss": 0.6258, "step": 9664 }, { "epoch": 3.6988136241867586, "grad_norm": 0.6357548832893372, "learning_rate": 3.348458437854124e-06, "loss": 0.5511, "step": 9665 }, { "epoch": 3.6991963260619976, "grad_norm": 0.5107144713401794, "learning_rate": 3.3466075838664926e-06, "loss": 0.6286, "step": 9666 }, { "epoch": 3.699579027937237, "grad_norm": 0.5362825989723206, "learning_rate": 3.3447571387476975e-06, "loss": 0.6414, "step": 9667 }, { "epoch": 3.6999617298124763, "grad_norm": 0.5422275066375732, "learning_rate": 3.342907102611457e-06, "loss": 0.6399, "step": 9668 }, { "epoch": 3.700344431687715, "grad_norm": 0.5070762634277344, "learning_rate": 3.341057475571462e-06, "loss": 0.5897, "step": 9669 }, { "epoch": 3.7007271335629546, "grad_norm": 0.5107502341270447, "learning_rate": 3.339208257741371e-06, "loss": 0.6382, "step": 9670 }, { "epoch": 3.7011098354381935, "grad_norm": 0.5910830497741699, "learning_rate": 3.337359449234826e-06, "loss": 0.6872, "step": 9671 }, { "epoch": 3.701492537313433, "grad_norm": 0.5612546801567078, "learning_rate": 3.335511050165443e-06, "loss": 0.622, "step": 9672 }, { "epoch": 3.701875239188672, "grad_norm": 0.5700890421867371, "learning_rate": 3.3336630606468136e-06, "loss": 0.5368, "step": 9673 }, { "epoch": 3.702257941063911, "grad_norm": 0.5166764259338379, "learning_rate": 3.331815480792494e-06, "loss": 0.5917, "step": 9674 }, { "epoch": 3.7026406429391505, "grad_norm": 0.5834545493125916, "learning_rate": 3.3299683107160274e-06, "loss": 0.6357, "step": 9675 }, { "epoch": 3.7030233448143894, "grad_norm": 0.5609895586967468, "learning_rate": 3.3281215505309304e-06, "loss": 0.6401, "step": 9676 }, { "epoch": 3.703406046689629, "grad_norm": 0.5701138377189636, "learning_rate": 3.3262752003506814e-06, "loss": 0.6346, "step": 9677 }, { "epoch": 3.703788748564868, "grad_norm": 0.5769045352935791, "learning_rate": 3.324429260288754e-06, "loss": 0.5961, "step": 9678 }, { "epoch": 3.704171450440107, "grad_norm": 0.517967939376831, "learning_rate": 3.322583730458585e-06, "loss": 0.5731, "step": 9679 }, { "epoch": 3.7045541523153465, "grad_norm": 0.5477491617202759, "learning_rate": 3.3207386109735817e-06, "loss": 0.6599, "step": 9680 }, { "epoch": 3.7049368541905854, "grad_norm": 0.5457311868667603, "learning_rate": 3.3188939019471344e-06, "loss": 0.7149, "step": 9681 }, { "epoch": 3.7053195560658247, "grad_norm": 0.5877906084060669, "learning_rate": 3.317049603492609e-06, "loss": 0.6965, "step": 9682 }, { "epoch": 3.7057022579410637, "grad_norm": 0.5377223491668701, "learning_rate": 3.3152057157233353e-06, "loss": 0.6584, "step": 9683 }, { "epoch": 3.706084959816303, "grad_norm": 0.5666875243186951, "learning_rate": 3.3133622387526298e-06, "loss": 0.5224, "step": 9684 }, { "epoch": 3.7064676616915424, "grad_norm": 0.5825419425964355, "learning_rate": 3.311519172693778e-06, "loss": 0.5997, "step": 9685 }, { "epoch": 3.7068503635667813, "grad_norm": 0.5603167414665222, "learning_rate": 3.309676517660044e-06, "loss": 0.6043, "step": 9686 }, { "epoch": 3.7072330654420207, "grad_norm": 0.5310230255126953, "learning_rate": 3.3078342737646595e-06, "loss": 0.6423, "step": 9687 }, { "epoch": 3.70761576731726, "grad_norm": 0.5442038774490356, "learning_rate": 3.3059924411208333e-06, "loss": 0.653, "step": 9688 }, { "epoch": 3.707998469192499, "grad_norm": 0.5489985942840576, "learning_rate": 3.3041510198417614e-06, "loss": 0.654, "step": 9689 }, { "epoch": 3.7083811710677383, "grad_norm": 0.5924104452133179, "learning_rate": 3.3023100100405947e-06, "loss": 0.59, "step": 9690 }, { "epoch": 3.7087638729429773, "grad_norm": 0.48822057247161865, "learning_rate": 3.3004694118304714e-06, "loss": 0.5928, "step": 9691 }, { "epoch": 3.7091465748182166, "grad_norm": 0.536491870880127, "learning_rate": 3.2986292253245043e-06, "loss": 0.7056, "step": 9692 }, { "epoch": 3.7095292766934556, "grad_norm": 0.5440041422843933, "learning_rate": 3.296789450635771e-06, "loss": 0.6453, "step": 9693 }, { "epoch": 3.709911978568695, "grad_norm": 0.5102075338363647, "learning_rate": 3.294950087877333e-06, "loss": 0.6062, "step": 9694 }, { "epoch": 3.7102946804439343, "grad_norm": 0.4998311400413513, "learning_rate": 3.2931111371622304e-06, "loss": 0.5631, "step": 9695 }, { "epoch": 3.710677382319173, "grad_norm": 0.5574980974197388, "learning_rate": 3.2912725986034623e-06, "loss": 0.6178, "step": 9696 }, { "epoch": 3.7110600841944126, "grad_norm": 0.5174526572227478, "learning_rate": 3.289434472314015e-06, "loss": 0.6502, "step": 9697 }, { "epoch": 3.711442786069652, "grad_norm": 0.5663344264030457, "learning_rate": 3.2875967584068447e-06, "loss": 0.6251, "step": 9698 }, { "epoch": 3.711825487944891, "grad_norm": 0.5919713377952576, "learning_rate": 3.285759456994894e-06, "loss": 0.637, "step": 9699 }, { "epoch": 3.7122081898201302, "grad_norm": 0.5325832962989807, "learning_rate": 3.283922568191057e-06, "loss": 0.6356, "step": 9700 }, { "epoch": 3.712590891695369, "grad_norm": 0.5520521402359009, "learning_rate": 3.282086092108222e-06, "loss": 0.6144, "step": 9701 }, { "epoch": 3.7129735935706085, "grad_norm": 1.281793475151062, "learning_rate": 3.2802500288592477e-06, "loss": 0.5669, "step": 9702 }, { "epoch": 3.7133562954458474, "grad_norm": 0.505564272403717, "learning_rate": 3.2784143785569577e-06, "loss": 0.5657, "step": 9703 }, { "epoch": 3.713738997321087, "grad_norm": 0.6035811305046082, "learning_rate": 3.276579141314162e-06, "loss": 0.6325, "step": 9704 }, { "epoch": 3.714121699196326, "grad_norm": 0.54965740442276, "learning_rate": 3.2747443172436443e-06, "loss": 0.5669, "step": 9705 }, { "epoch": 3.714504401071565, "grad_norm": 0.5528252720832825, "learning_rate": 3.2729099064581514e-06, "loss": 0.609, "step": 9706 }, { "epoch": 3.7148871029468045, "grad_norm": 0.5589099526405334, "learning_rate": 3.271075909070418e-06, "loss": 0.6343, "step": 9707 }, { "epoch": 3.715269804822044, "grad_norm": 0.5698517560958862, "learning_rate": 3.269242325193147e-06, "loss": 0.6226, "step": 9708 }, { "epoch": 3.7156525066972828, "grad_norm": 0.5394648313522339, "learning_rate": 3.2674091549390163e-06, "loss": 0.6777, "step": 9709 }, { "epoch": 3.716035208572522, "grad_norm": 0.6040518283843994, "learning_rate": 3.2655763984206813e-06, "loss": 0.6563, "step": 9710 }, { "epoch": 3.716417910447761, "grad_norm": 0.5539823770523071, "learning_rate": 3.2637440557507672e-06, "loss": 0.6183, "step": 9711 }, { "epoch": 3.7168006123230004, "grad_norm": 0.5447090268135071, "learning_rate": 3.261912127041882e-06, "loss": 0.5755, "step": 9712 }, { "epoch": 3.7171833141982393, "grad_norm": 0.5636189579963684, "learning_rate": 3.260080612406595e-06, "loss": 0.6173, "step": 9713 }, { "epoch": 3.7175660160734787, "grad_norm": 2.3031387329101562, "learning_rate": 3.2582495119574608e-06, "loss": 0.6458, "step": 9714 }, { "epoch": 3.717948717948718, "grad_norm": 0.5160519480705261, "learning_rate": 3.2564188258070096e-06, "loss": 0.6094, "step": 9715 }, { "epoch": 3.718331419823957, "grad_norm": 0.5618635416030884, "learning_rate": 3.2545885540677346e-06, "loss": 0.5538, "step": 9716 }, { "epoch": 3.7187141216991964, "grad_norm": 0.5048941969871521, "learning_rate": 3.252758696852114e-06, "loss": 0.6544, "step": 9717 }, { "epoch": 3.7190968235744357, "grad_norm": 0.5429862141609192, "learning_rate": 3.2509292542725978e-06, "loss": 0.6525, "step": 9718 }, { "epoch": 3.7194795254496746, "grad_norm": 0.5261406898498535, "learning_rate": 3.2491002264416105e-06, "loss": 0.5878, "step": 9719 }, { "epoch": 3.719862227324914, "grad_norm": 0.5410823225975037, "learning_rate": 3.247271613471551e-06, "loss": 0.6887, "step": 9720 }, { "epoch": 3.720244929200153, "grad_norm": 0.5174698233604431, "learning_rate": 3.245443415474795e-06, "loss": 0.5779, "step": 9721 }, { "epoch": 3.7206276310753923, "grad_norm": 0.48121118545532227, "learning_rate": 3.2436156325636836e-06, "loss": 0.5894, "step": 9722 }, { "epoch": 3.721010332950631, "grad_norm": 0.5765323042869568, "learning_rate": 3.241788264850543e-06, "loss": 0.6517, "step": 9723 }, { "epoch": 3.7213930348258706, "grad_norm": 0.5497322082519531, "learning_rate": 3.239961312447668e-06, "loss": 0.6749, "step": 9724 }, { "epoch": 3.72177573670111, "grad_norm": 0.5269626379013062, "learning_rate": 3.2381347754673365e-06, "loss": 0.6079, "step": 9725 }, { "epoch": 3.722158438576349, "grad_norm": 0.5642813444137573, "learning_rate": 3.2363086540217837e-06, "loss": 0.6413, "step": 9726 }, { "epoch": 3.7225411404515882, "grad_norm": 0.5542829036712646, "learning_rate": 3.234482948223235e-06, "loss": 0.7125, "step": 9727 }, { "epoch": 3.7229238423268276, "grad_norm": 0.5637125372886658, "learning_rate": 3.2326576581838853e-06, "loss": 0.5915, "step": 9728 }, { "epoch": 3.7233065442020665, "grad_norm": 0.554226279258728, "learning_rate": 3.2308327840159026e-06, "loss": 0.6332, "step": 9729 }, { "epoch": 3.723689246077306, "grad_norm": 0.5385833382606506, "learning_rate": 3.229008325831431e-06, "loss": 0.5862, "step": 9730 }, { "epoch": 3.724071947952545, "grad_norm": 0.5947968363761902, "learning_rate": 3.2271842837425917e-06, "loss": 0.6476, "step": 9731 }, { "epoch": 3.724454649827784, "grad_norm": 0.5636001825332642, "learning_rate": 3.225360657861468e-06, "loss": 0.6555, "step": 9732 }, { "epoch": 3.724837351703023, "grad_norm": 0.5656930804252625, "learning_rate": 3.2235374483001335e-06, "loss": 0.6211, "step": 9733 }, { "epoch": 3.7252200535782625, "grad_norm": 0.5716999769210815, "learning_rate": 3.22171465517063e-06, "loss": 0.6436, "step": 9734 }, { "epoch": 3.725602755453502, "grad_norm": 0.5206620097160339, "learning_rate": 3.2198922785849686e-06, "loss": 0.6175, "step": 9735 }, { "epoch": 3.7259854573287408, "grad_norm": 0.539728045463562, "learning_rate": 3.218070318655141e-06, "loss": 0.6194, "step": 9736 }, { "epoch": 3.72636815920398, "grad_norm": 0.5470936894416809, "learning_rate": 3.2162487754931115e-06, "loss": 0.5403, "step": 9737 }, { "epoch": 3.7267508610792195, "grad_norm": 0.5678439140319824, "learning_rate": 3.2144276492108196e-06, "loss": 0.7091, "step": 9738 }, { "epoch": 3.7271335629544584, "grad_norm": 0.4943036437034607, "learning_rate": 3.2126069399201775e-06, "loss": 0.5878, "step": 9739 }, { "epoch": 3.727516264829698, "grad_norm": 0.49609512090682983, "learning_rate": 3.210786647733074e-06, "loss": 0.6345, "step": 9740 }, { "epoch": 3.7278989667049367, "grad_norm": 0.5366259813308716, "learning_rate": 3.2089667727613728e-06, "loss": 0.5891, "step": 9741 }, { "epoch": 3.728281668580176, "grad_norm": 0.5211608409881592, "learning_rate": 3.207147315116904e-06, "loss": 0.5465, "step": 9742 }, { "epoch": 3.728664370455415, "grad_norm": 0.5290776491165161, "learning_rate": 3.2053282749114823e-06, "loss": 0.6673, "step": 9743 }, { "epoch": 3.7290470723306544, "grad_norm": 0.5194268226623535, "learning_rate": 3.2035096522568943e-06, "loss": 0.5698, "step": 9744 }, { "epoch": 3.7294297742058937, "grad_norm": 0.5384404063224792, "learning_rate": 3.201691447264894e-06, "loss": 0.5926, "step": 9745 }, { "epoch": 3.7298124760811326, "grad_norm": 0.5349627137184143, "learning_rate": 3.199873660047218e-06, "loss": 0.6485, "step": 9746 }, { "epoch": 3.730195177956372, "grad_norm": 0.5269471406936646, "learning_rate": 3.1980562907155743e-06, "loss": 0.6339, "step": 9747 }, { "epoch": 3.7305778798316114, "grad_norm": 0.5511407852172852, "learning_rate": 3.1962393393816447e-06, "loss": 0.6679, "step": 9748 }, { "epoch": 3.7309605817068503, "grad_norm": 0.5564678311347961, "learning_rate": 3.194422806157086e-06, "loss": 0.5819, "step": 9749 }, { "epoch": 3.7313432835820897, "grad_norm": 0.5380440354347229, "learning_rate": 3.1926066911535293e-06, "loss": 0.6054, "step": 9750 }, { "epoch": 3.7317259854573286, "grad_norm": 0.6290005445480347, "learning_rate": 3.190790994482582e-06, "loss": 0.6567, "step": 9751 }, { "epoch": 3.732108687332568, "grad_norm": 0.5370422601699829, "learning_rate": 3.188975716255818e-06, "loss": 0.6497, "step": 9752 }, { "epoch": 3.732491389207807, "grad_norm": 0.601036012172699, "learning_rate": 3.1871608565847933e-06, "loss": 0.5609, "step": 9753 }, { "epoch": 3.7328740910830462, "grad_norm": 0.5505651235580444, "learning_rate": 3.185346415581041e-06, "loss": 0.6556, "step": 9754 }, { "epoch": 3.7332567929582856, "grad_norm": 0.5284082889556885, "learning_rate": 3.1835323933560547e-06, "loss": 0.5984, "step": 9755 }, { "epoch": 3.7336394948335245, "grad_norm": 0.5589686632156372, "learning_rate": 3.1817187900213166e-06, "loss": 0.5143, "step": 9756 }, { "epoch": 3.734022196708764, "grad_norm": 0.5245158076286316, "learning_rate": 3.179905605688275e-06, "loss": 0.6266, "step": 9757 }, { "epoch": 3.7344048985840033, "grad_norm": 0.5318672060966492, "learning_rate": 3.1780928404683554e-06, "loss": 0.5765, "step": 9758 }, { "epoch": 3.734787600459242, "grad_norm": 0.697074294090271, "learning_rate": 3.176280494472959e-06, "loss": 0.5314, "step": 9759 }, { "epoch": 3.7351703023344816, "grad_norm": 0.5442189574241638, "learning_rate": 3.174468567813461e-06, "loss": 0.5845, "step": 9760 }, { "epoch": 3.7355530042097205, "grad_norm": 0.5509704351425171, "learning_rate": 3.172657060601203e-06, "loss": 0.6199, "step": 9761 }, { "epoch": 3.73593570608496, "grad_norm": 0.5077142119407654, "learning_rate": 3.1708459729475094e-06, "loss": 0.6233, "step": 9762 }, { "epoch": 3.7363184079601988, "grad_norm": 0.5479790568351746, "learning_rate": 3.169035304963678e-06, "loss": 0.6889, "step": 9763 }, { "epoch": 3.736701109835438, "grad_norm": 0.5510374903678894, "learning_rate": 3.167225056760982e-06, "loss": 0.6348, "step": 9764 }, { "epoch": 3.7370838117106775, "grad_norm": 0.5622367262840271, "learning_rate": 3.1654152284506588e-06, "loss": 0.6406, "step": 9765 }, { "epoch": 3.7374665135859164, "grad_norm": 0.5539929866790771, "learning_rate": 3.16360582014393e-06, "loss": 0.6573, "step": 9766 }, { "epoch": 3.737849215461156, "grad_norm": 0.5560228228569031, "learning_rate": 3.161796831951991e-06, "loss": 0.5807, "step": 9767 }, { "epoch": 3.738231917336395, "grad_norm": 0.5292220711708069, "learning_rate": 3.1599882639860058e-06, "loss": 0.6002, "step": 9768 }, { "epoch": 3.738614619211634, "grad_norm": 0.5630899667739868, "learning_rate": 3.1581801163571192e-06, "loss": 0.6543, "step": 9769 }, { "epoch": 3.7389973210868734, "grad_norm": 0.5368460416793823, "learning_rate": 3.1563723891764464e-06, "loss": 0.701, "step": 9770 }, { "epoch": 3.7393800229621124, "grad_norm": 0.5715883374214172, "learning_rate": 3.1545650825550734e-06, "loss": 0.6482, "step": 9771 }, { "epoch": 3.7397627248373517, "grad_norm": 0.5267295241355896, "learning_rate": 3.1527581966040667e-06, "loss": 0.7258, "step": 9772 }, { "epoch": 3.7401454267125906, "grad_norm": 0.5607187747955322, "learning_rate": 3.1509517314344664e-06, "loss": 0.6307, "step": 9773 }, { "epoch": 3.74052812858783, "grad_norm": 0.5542261600494385, "learning_rate": 3.149145687157279e-06, "loss": 0.6625, "step": 9774 }, { "epoch": 3.7409108304630694, "grad_norm": 0.5683253407478333, "learning_rate": 3.147340063883494e-06, "loss": 0.6298, "step": 9775 }, { "epoch": 3.7412935323383083, "grad_norm": 0.5146757364273071, "learning_rate": 3.1455348617240712e-06, "loss": 0.5623, "step": 9776 }, { "epoch": 3.7416762342135477, "grad_norm": 0.5380454063415527, "learning_rate": 3.1437300807899462e-06, "loss": 0.6376, "step": 9777 }, { "epoch": 3.742058936088787, "grad_norm": 0.5067238807678223, "learning_rate": 3.141925721192026e-06, "loss": 0.5905, "step": 9778 }, { "epoch": 3.742441637964026, "grad_norm": 0.5765577554702759, "learning_rate": 3.1401217830411956e-06, "loss": 0.6309, "step": 9779 }, { "epoch": 3.7428243398392653, "grad_norm": 0.5434549450874329, "learning_rate": 3.1383182664483134e-06, "loss": 0.6604, "step": 9780 }, { "epoch": 3.7432070417145042, "grad_norm": 0.5268422961235046, "learning_rate": 3.1365151715242037e-06, "loss": 0.586, "step": 9781 }, { "epoch": 3.7435897435897436, "grad_norm": 0.5248993039131165, "learning_rate": 3.1347124983796762e-06, "loss": 0.5634, "step": 9782 }, { "epoch": 3.7439724454649825, "grad_norm": 0.594125509262085, "learning_rate": 3.132910247125512e-06, "loss": 0.681, "step": 9783 }, { "epoch": 3.744355147340222, "grad_norm": 0.5708572864532471, "learning_rate": 3.1311084178724595e-06, "loss": 0.6833, "step": 9784 }, { "epoch": 3.7447378492154613, "grad_norm": 0.5227465033531189, "learning_rate": 3.129307010731246e-06, "loss": 0.6264, "step": 9785 }, { "epoch": 3.7451205510907, "grad_norm": 0.671461820602417, "learning_rate": 3.127506025812579e-06, "loss": 0.7041, "step": 9786 }, { "epoch": 3.7455032529659396, "grad_norm": 0.5611217021942139, "learning_rate": 3.125705463227127e-06, "loss": 0.5983, "step": 9787 }, { "epoch": 3.745885954841179, "grad_norm": 0.540473997592926, "learning_rate": 3.1239053230855374e-06, "loss": 0.6083, "step": 9788 }, { "epoch": 3.746268656716418, "grad_norm": 0.5436522960662842, "learning_rate": 3.122105605498442e-06, "loss": 0.66, "step": 9789 }, { "epoch": 3.746651358591657, "grad_norm": 0.5114380717277527, "learning_rate": 3.1203063105764388e-06, "loss": 0.6228, "step": 9790 }, { "epoch": 3.747034060466896, "grad_norm": 0.49162837862968445, "learning_rate": 3.1185074384300907e-06, "loss": 0.6199, "step": 9791 }, { "epoch": 3.7474167623421355, "grad_norm": 0.5173079371452332, "learning_rate": 3.1167089891699475e-06, "loss": 0.668, "step": 9792 }, { "epoch": 3.7477994642173744, "grad_norm": 0.6168041229248047, "learning_rate": 3.114910962906532e-06, "loss": 0.6362, "step": 9793 }, { "epoch": 3.748182166092614, "grad_norm": 0.5341818928718567, "learning_rate": 3.1131133597503316e-06, "loss": 0.5613, "step": 9794 }, { "epoch": 3.748564867967853, "grad_norm": 0.49309471249580383, "learning_rate": 3.1113161798118164e-06, "loss": 0.5679, "step": 9795 }, { "epoch": 3.748947569843092, "grad_norm": 0.5629945993423462, "learning_rate": 3.1095194232014324e-06, "loss": 0.6261, "step": 9796 }, { "epoch": 3.7493302717183314, "grad_norm": 0.5191208124160767, "learning_rate": 3.1077230900295873e-06, "loss": 0.5933, "step": 9797 }, { "epoch": 3.749712973593571, "grad_norm": 0.5623129606246948, "learning_rate": 3.1059271804066716e-06, "loss": 0.5902, "step": 9798 }, { "epoch": 3.7500956754688097, "grad_norm": 0.5867291688919067, "learning_rate": 3.1041316944430576e-06, "loss": 0.7128, "step": 9799 }, { "epoch": 3.750478377344049, "grad_norm": 0.595055103302002, "learning_rate": 3.1023366322490733e-06, "loss": 0.6641, "step": 9800 }, { "epoch": 3.750861079219288, "grad_norm": 0.5399697422981262, "learning_rate": 3.1005419939350333e-06, "loss": 0.616, "step": 9801 }, { "epoch": 3.7512437810945274, "grad_norm": 0.528954029083252, "learning_rate": 3.0987477796112265e-06, "loss": 0.6498, "step": 9802 }, { "epoch": 3.7516264829697663, "grad_norm": 0.5366382002830505, "learning_rate": 3.096953989387905e-06, "loss": 0.5776, "step": 9803 }, { "epoch": 3.7520091848450057, "grad_norm": 0.5824870467185974, "learning_rate": 3.095160623375307e-06, "loss": 0.6675, "step": 9804 }, { "epoch": 3.752391886720245, "grad_norm": 0.5205615758895874, "learning_rate": 3.0933676816836376e-06, "loss": 0.6064, "step": 9805 }, { "epoch": 3.752774588595484, "grad_norm": 0.5750672221183777, "learning_rate": 3.091575164423083e-06, "loss": 0.6701, "step": 9806 }, { "epoch": 3.7531572904707233, "grad_norm": 0.53006911277771, "learning_rate": 3.08978307170379e-06, "loss": 0.5564, "step": 9807 }, { "epoch": 3.7535399923459627, "grad_norm": 0.6192997694015503, "learning_rate": 3.0879914036358915e-06, "loss": 0.64, "step": 9808 }, { "epoch": 3.7539226942212016, "grad_norm": 0.5334147810935974, "learning_rate": 3.08620016032949e-06, "loss": 0.6764, "step": 9809 }, { "epoch": 3.754305396096441, "grad_norm": 0.5205471515655518, "learning_rate": 3.084409341894663e-06, "loss": 0.5799, "step": 9810 }, { "epoch": 3.75468809797168, "grad_norm": 0.5304623246192932, "learning_rate": 3.0826189484414605e-06, "loss": 0.6579, "step": 9811 }, { "epoch": 3.7550707998469193, "grad_norm": 0.5566868782043457, "learning_rate": 3.0808289800799107e-06, "loss": 0.6234, "step": 9812 }, { "epoch": 3.755453501722158, "grad_norm": 0.5360240936279297, "learning_rate": 3.079039436920004e-06, "loss": 0.6385, "step": 9813 }, { "epoch": 3.7558362035973976, "grad_norm": 0.5215380787849426, "learning_rate": 3.0772503190717175e-06, "loss": 0.6095, "step": 9814 }, { "epoch": 3.756218905472637, "grad_norm": 0.5445026159286499, "learning_rate": 3.075461626645e-06, "loss": 0.6221, "step": 9815 }, { "epoch": 3.756601607347876, "grad_norm": 0.5613710284233093, "learning_rate": 3.073673359749766e-06, "loss": 0.5837, "step": 9816 }, { "epoch": 3.756984309223115, "grad_norm": 0.5608628392219543, "learning_rate": 3.0718855184959106e-06, "loss": 0.5719, "step": 9817 }, { "epoch": 3.7573670110983546, "grad_norm": 0.5067806839942932, "learning_rate": 3.0700981029933017e-06, "loss": 0.5354, "step": 9818 }, { "epoch": 3.7577497129735935, "grad_norm": 0.568947970867157, "learning_rate": 3.068311113351783e-06, "loss": 0.7127, "step": 9819 }, { "epoch": 3.758132414848833, "grad_norm": 0.5602158308029175, "learning_rate": 3.066524549681168e-06, "loss": 0.6096, "step": 9820 }, { "epoch": 3.758515116724072, "grad_norm": 0.6020223498344421, "learning_rate": 3.064738412091245e-06, "loss": 0.5962, "step": 9821 }, { "epoch": 3.758897818599311, "grad_norm": 0.5223006010055542, "learning_rate": 3.0629527006917825e-06, "loss": 0.6866, "step": 9822 }, { "epoch": 3.75928052047455, "grad_norm": 0.5689187049865723, "learning_rate": 3.0611674155925085e-06, "loss": 0.6567, "step": 9823 }, { "epoch": 3.7596632223497894, "grad_norm": 0.5579220652580261, "learning_rate": 3.059382556903139e-06, "loss": 0.6195, "step": 9824 }, { "epoch": 3.760045924225029, "grad_norm": 0.5288358926773071, "learning_rate": 3.0575981247333596e-06, "loss": 0.6077, "step": 9825 }, { "epoch": 3.7604286261002677, "grad_norm": 0.6156314611434937, "learning_rate": 3.0558141191928238e-06, "loss": 0.5927, "step": 9826 }, { "epoch": 3.760811327975507, "grad_norm": 0.5247079133987427, "learning_rate": 3.054030540391165e-06, "loss": 0.5967, "step": 9827 }, { "epoch": 3.7611940298507465, "grad_norm": 0.531792938709259, "learning_rate": 3.052247388437991e-06, "loss": 0.5945, "step": 9828 }, { "epoch": 3.7615767317259854, "grad_norm": 0.5153583884239197, "learning_rate": 3.0504646634428793e-06, "loss": 0.6243, "step": 9829 }, { "epoch": 3.7619594336012248, "grad_norm": 0.521467387676239, "learning_rate": 3.048682365515384e-06, "loss": 0.5806, "step": 9830 }, { "epoch": 3.7623421354764637, "grad_norm": 0.577603280544281, "learning_rate": 3.0469004947650316e-06, "loss": 0.6675, "step": 9831 }, { "epoch": 3.762724837351703, "grad_norm": 0.5472520589828491, "learning_rate": 3.045119051301326e-06, "loss": 0.6953, "step": 9832 }, { "epoch": 3.763107539226942, "grad_norm": 0.5401227474212646, "learning_rate": 3.043338035233737e-06, "loss": 0.6052, "step": 9833 }, { "epoch": 3.7634902411021813, "grad_norm": 0.5386372208595276, "learning_rate": 3.0415574466717135e-06, "loss": 0.674, "step": 9834 }, { "epoch": 3.7638729429774207, "grad_norm": 0.5228644609451294, "learning_rate": 3.039777285724683e-06, "loss": 0.5946, "step": 9835 }, { "epoch": 3.7642556448526596, "grad_norm": 0.5786600708961487, "learning_rate": 3.0379975525020334e-06, "loss": 0.593, "step": 9836 }, { "epoch": 3.764638346727899, "grad_norm": 0.5248952507972717, "learning_rate": 3.0362182471131374e-06, "loss": 0.5578, "step": 9837 }, { "epoch": 3.7650210486031384, "grad_norm": 0.6269221305847168, "learning_rate": 3.034439369667338e-06, "loss": 0.784, "step": 9838 }, { "epoch": 3.7654037504783773, "grad_norm": 0.5433233976364136, "learning_rate": 3.032660920273952e-06, "loss": 0.6674, "step": 9839 }, { "epoch": 3.7657864523536166, "grad_norm": 0.5362517833709717, "learning_rate": 3.030882899042271e-06, "loss": 0.621, "step": 9840 }, { "epoch": 3.7661691542288556, "grad_norm": 0.546634316444397, "learning_rate": 3.029105306081561e-06, "loss": 0.5877, "step": 9841 }, { "epoch": 3.766551856104095, "grad_norm": 0.5231983065605164, "learning_rate": 3.0273281415010536e-06, "loss": 0.6409, "step": 9842 }, { "epoch": 3.766934557979334, "grad_norm": 0.5062413215637207, "learning_rate": 3.0255514054099645e-06, "loss": 0.5383, "step": 9843 }, { "epoch": 3.767317259854573, "grad_norm": 0.5248929858207703, "learning_rate": 3.0237750979174785e-06, "loss": 0.7428, "step": 9844 }, { "epoch": 3.7676999617298126, "grad_norm": 0.5526524782180786, "learning_rate": 3.0219992191327574e-06, "loss": 0.6225, "step": 9845 }, { "epoch": 3.7680826636050515, "grad_norm": 0.5405586361885071, "learning_rate": 3.0202237691649263e-06, "loss": 0.6426, "step": 9846 }, { "epoch": 3.768465365480291, "grad_norm": 0.543852686882019, "learning_rate": 3.018448748123097e-06, "loss": 0.6795, "step": 9847 }, { "epoch": 3.7688480673555302, "grad_norm": 0.6007512807846069, "learning_rate": 3.016674156116347e-06, "loss": 0.6562, "step": 9848 }, { "epoch": 3.769230769230769, "grad_norm": 0.5090292096138, "learning_rate": 3.0148999932537317e-06, "loss": 0.6293, "step": 9849 }, { "epoch": 3.7696134711060085, "grad_norm": 0.5577221512794495, "learning_rate": 3.0131262596442766e-06, "loss": 0.6054, "step": 9850 }, { "epoch": 3.7699961729812475, "grad_norm": 0.5510316491127014, "learning_rate": 3.0113529553969868e-06, "loss": 0.5985, "step": 9851 }, { "epoch": 3.770378874856487, "grad_norm": 0.558601438999176, "learning_rate": 3.0095800806208287e-06, "loss": 0.6153, "step": 9852 }, { "epoch": 3.7707615767317257, "grad_norm": 0.5127334594726562, "learning_rate": 3.0078076354247553e-06, "loss": 0.6431, "step": 9853 }, { "epoch": 3.771144278606965, "grad_norm": 0.5578764081001282, "learning_rate": 3.0060356199176897e-06, "loss": 0.6454, "step": 9854 }, { "epoch": 3.7715269804822045, "grad_norm": 0.5232360363006592, "learning_rate": 3.0042640342085217e-06, "loss": 0.7553, "step": 9855 }, { "epoch": 3.7719096823574434, "grad_norm": 0.52305668592453, "learning_rate": 3.0024928784061225e-06, "loss": 0.5947, "step": 9856 }, { "epoch": 3.7722923842326828, "grad_norm": 0.532434344291687, "learning_rate": 3.000722152619334e-06, "loss": 0.6414, "step": 9857 }, { "epoch": 3.772675086107922, "grad_norm": 0.5004011392593384, "learning_rate": 2.9989518569569743e-06, "loss": 0.6082, "step": 9858 }, { "epoch": 3.773057787983161, "grad_norm": 0.6452011466026306, "learning_rate": 2.99718199152783e-06, "loss": 0.6172, "step": 9859 }, { "epoch": 3.7734404898584004, "grad_norm": 0.5649629235267639, "learning_rate": 2.9954125564406655e-06, "loss": 0.577, "step": 9860 }, { "epoch": 3.7738231917336393, "grad_norm": 0.5227235555648804, "learning_rate": 2.9936435518042206e-06, "loss": 0.6184, "step": 9861 }, { "epoch": 3.7742058936088787, "grad_norm": 0.5577107071876526, "learning_rate": 2.9918749777271992e-06, "loss": 0.5806, "step": 9862 }, { "epoch": 3.7745885954841176, "grad_norm": 0.5633190870285034, "learning_rate": 2.990106834318287e-06, "loss": 0.5994, "step": 9863 }, { "epoch": 3.774971297359357, "grad_norm": 0.561862587928772, "learning_rate": 2.9883391216861447e-06, "loss": 0.7158, "step": 9864 }, { "epoch": 3.7753539992345964, "grad_norm": 0.5331745743751526, "learning_rate": 2.986571839939397e-06, "loss": 0.6351, "step": 9865 }, { "epoch": 3.7757367011098353, "grad_norm": 0.5985412001609802, "learning_rate": 2.9848049891866526e-06, "loss": 0.6082, "step": 9866 }, { "epoch": 3.7761194029850746, "grad_norm": 0.5260043740272522, "learning_rate": 2.983038569536487e-06, "loss": 0.6336, "step": 9867 }, { "epoch": 3.776502104860314, "grad_norm": 0.5324889421463013, "learning_rate": 2.9812725810974517e-06, "loss": 0.6198, "step": 9868 }, { "epoch": 3.776884806735553, "grad_norm": 0.5446351170539856, "learning_rate": 2.9795070239780734e-06, "loss": 0.5888, "step": 9869 }, { "epoch": 3.7772675086107923, "grad_norm": 0.5603732466697693, "learning_rate": 2.9777418982868477e-06, "loss": 0.6494, "step": 9870 }, { "epoch": 3.7776502104860312, "grad_norm": 0.5679568648338318, "learning_rate": 2.975977204132251e-06, "loss": 0.6995, "step": 9871 }, { "epoch": 3.7780329123612706, "grad_norm": 0.5357484817504883, "learning_rate": 2.9742129416227218e-06, "loss": 0.6465, "step": 9872 }, { "epoch": 3.7784156142365095, "grad_norm": 0.5263817310333252, "learning_rate": 2.972449110866683e-06, "loss": 0.6104, "step": 9873 }, { "epoch": 3.778798316111749, "grad_norm": 0.5692196488380432, "learning_rate": 2.9706857119725274e-06, "loss": 0.5568, "step": 9874 }, { "epoch": 3.7791810179869882, "grad_norm": 0.5358734130859375, "learning_rate": 2.968922745048617e-06, "loss": 0.5929, "step": 9875 }, { "epoch": 3.779563719862227, "grad_norm": 0.5674314498901367, "learning_rate": 2.9671602102032926e-06, "loss": 0.7521, "step": 9876 }, { "epoch": 3.7799464217374665, "grad_norm": 0.6555472612380981, "learning_rate": 2.9653981075448667e-06, "loss": 0.6563, "step": 9877 }, { "epoch": 3.780329123612706, "grad_norm": 0.5094373822212219, "learning_rate": 2.963636437181626e-06, "loss": 0.5132, "step": 9878 }, { "epoch": 3.780711825487945, "grad_norm": 0.5692169666290283, "learning_rate": 2.9618751992218285e-06, "loss": 0.6679, "step": 9879 }, { "epoch": 3.781094527363184, "grad_norm": 0.5347950458526611, "learning_rate": 2.960114393773712e-06, "loss": 0.6152, "step": 9880 }, { "epoch": 3.781477229238423, "grad_norm": 0.5226945877075195, "learning_rate": 2.9583540209454743e-06, "loss": 0.6159, "step": 9881 }, { "epoch": 3.7818599311136625, "grad_norm": 0.5487821698188782, "learning_rate": 2.9565940808453e-06, "loss": 0.6898, "step": 9882 }, { "epoch": 3.7822426329889014, "grad_norm": 0.5643488168716431, "learning_rate": 2.954834573581341e-06, "loss": 0.6611, "step": 9883 }, { "epoch": 3.7826253348641408, "grad_norm": 0.5419975519180298, "learning_rate": 2.9530754992617284e-06, "loss": 0.667, "step": 9884 }, { "epoch": 3.78300803673938, "grad_norm": 0.518548309803009, "learning_rate": 2.951316857994554e-06, "loss": 0.5469, "step": 9885 }, { "epoch": 3.783390738614619, "grad_norm": 0.569868803024292, "learning_rate": 2.9495586498878946e-06, "loss": 0.5963, "step": 9886 }, { "epoch": 3.7837734404898584, "grad_norm": 0.5898333787918091, "learning_rate": 2.947800875049801e-06, "loss": 0.6019, "step": 9887 }, { "epoch": 3.784156142365098, "grad_norm": 0.5143133997917175, "learning_rate": 2.946043533588283e-06, "loss": 0.5462, "step": 9888 }, { "epoch": 3.7845388442403367, "grad_norm": 0.5416970252990723, "learning_rate": 2.9442866256113422e-06, "loss": 0.6272, "step": 9889 }, { "epoch": 3.784921546115576, "grad_norm": 0.505152702331543, "learning_rate": 2.9425301512269476e-06, "loss": 0.5944, "step": 9890 }, { "epoch": 3.785304247990815, "grad_norm": 0.5872811079025269, "learning_rate": 2.940774110543032e-06, "loss": 0.5648, "step": 9891 }, { "epoch": 3.7856869498660544, "grad_norm": 0.5547801852226257, "learning_rate": 2.939018503667511e-06, "loss": 0.5831, "step": 9892 }, { "epoch": 3.7860696517412933, "grad_norm": 0.5446993112564087, "learning_rate": 2.9372633307082754e-06, "loss": 0.6498, "step": 9893 }, { "epoch": 3.7864523536165327, "grad_norm": 0.5235533118247986, "learning_rate": 2.9355085917731798e-06, "loss": 0.6231, "step": 9894 }, { "epoch": 3.786835055491772, "grad_norm": 0.5277474522590637, "learning_rate": 2.9337542869700596e-06, "loss": 0.6455, "step": 9895 }, { "epoch": 3.787217757367011, "grad_norm": 0.5056071281433105, "learning_rate": 2.932000416406722e-06, "loss": 0.624, "step": 9896 }, { "epoch": 3.7876004592422503, "grad_norm": 0.4991721510887146, "learning_rate": 2.9302469801909493e-06, "loss": 0.6794, "step": 9897 }, { "epoch": 3.7879831611174897, "grad_norm": 0.5362375378608704, "learning_rate": 2.9284939784304867e-06, "loss": 0.6196, "step": 9898 }, { "epoch": 3.7883658629927286, "grad_norm": 0.5373159646987915, "learning_rate": 2.9267414112330694e-06, "loss": 0.5937, "step": 9899 }, { "epoch": 3.788748564867968, "grad_norm": 0.5179824829101562, "learning_rate": 2.924989278706398e-06, "loss": 0.6164, "step": 9900 }, { "epoch": 3.789131266743207, "grad_norm": 0.5708467960357666, "learning_rate": 2.9232375809581406e-06, "loss": 0.641, "step": 9901 }, { "epoch": 3.7895139686184462, "grad_norm": 0.5055057406425476, "learning_rate": 2.921486318095944e-06, "loss": 0.5797, "step": 9902 }, { "epoch": 3.789896670493685, "grad_norm": 0.49698111414909363, "learning_rate": 2.919735490227432e-06, "loss": 0.5719, "step": 9903 }, { "epoch": 3.7902793723689245, "grad_norm": 0.5377119183540344, "learning_rate": 2.917985097460193e-06, "loss": 0.5615, "step": 9904 }, { "epoch": 3.790662074244164, "grad_norm": 0.5649453997612, "learning_rate": 2.9162351399017964e-06, "loss": 0.5963, "step": 9905 }, { "epoch": 3.791044776119403, "grad_norm": 0.5518119931221008, "learning_rate": 2.9144856176597824e-06, "loss": 0.6635, "step": 9906 }, { "epoch": 3.791427477994642, "grad_norm": 0.5361660718917847, "learning_rate": 2.91273653084166e-06, "loss": 0.6002, "step": 9907 }, { "epoch": 3.7918101798698816, "grad_norm": 0.5950743556022644, "learning_rate": 2.9109878795549175e-06, "loss": 0.6167, "step": 9908 }, { "epoch": 3.7921928817451205, "grad_norm": 0.5887429118156433, "learning_rate": 2.909239663907012e-06, "loss": 0.6617, "step": 9909 }, { "epoch": 3.79257558362036, "grad_norm": 0.5640369057655334, "learning_rate": 2.9074918840053832e-06, "loss": 0.6832, "step": 9910 }, { "epoch": 3.7929582854955988, "grad_norm": 0.6091558337211609, "learning_rate": 2.9057445399574304e-06, "loss": 0.61, "step": 9911 }, { "epoch": 3.793340987370838, "grad_norm": 0.5299991965293884, "learning_rate": 2.903997631870533e-06, "loss": 0.6096, "step": 9912 }, { "epoch": 3.793723689246077, "grad_norm": 0.5437654852867126, "learning_rate": 2.9022511598520488e-06, "loss": 0.6498, "step": 9913 }, { "epoch": 3.7941063911213164, "grad_norm": 0.64975506067276, "learning_rate": 2.9005051240092953e-06, "loss": 0.6421, "step": 9914 }, { "epoch": 3.794489092996556, "grad_norm": 0.5620417594909668, "learning_rate": 2.8987595244495736e-06, "loss": 0.5723, "step": 9915 }, { "epoch": 3.7948717948717947, "grad_norm": 0.511301577091217, "learning_rate": 2.8970143612801615e-06, "loss": 0.5748, "step": 9916 }, { "epoch": 3.795254496747034, "grad_norm": 0.5666773319244385, "learning_rate": 2.8952696346082944e-06, "loss": 0.6706, "step": 9917 }, { "epoch": 3.7956371986222734, "grad_norm": 0.5073747038841248, "learning_rate": 2.893525344541196e-06, "loss": 0.6156, "step": 9918 }, { "epoch": 3.7960199004975124, "grad_norm": 0.5425633192062378, "learning_rate": 2.8917814911860553e-06, "loss": 0.5742, "step": 9919 }, { "epoch": 3.7964026023727517, "grad_norm": 0.563490092754364, "learning_rate": 2.890038074650039e-06, "loss": 0.6259, "step": 9920 }, { "epoch": 3.7967853042479907, "grad_norm": 0.5867006778717041, "learning_rate": 2.8882950950402845e-06, "loss": 0.6515, "step": 9921 }, { "epoch": 3.79716800612323, "grad_norm": 0.5393668413162231, "learning_rate": 2.8865525524639016e-06, "loss": 0.7103, "step": 9922 }, { "epoch": 3.797550707998469, "grad_norm": 0.5130787491798401, "learning_rate": 2.884810447027977e-06, "loss": 0.6517, "step": 9923 }, { "epoch": 3.7979334098737083, "grad_norm": 0.5655619502067566, "learning_rate": 2.883068778839563e-06, "loss": 0.5851, "step": 9924 }, { "epoch": 3.7983161117489477, "grad_norm": 0.5364257097244263, "learning_rate": 2.8813275480056915e-06, "loss": 0.6251, "step": 9925 }, { "epoch": 3.7986988136241866, "grad_norm": 0.5082932114601135, "learning_rate": 2.879586754633369e-06, "loss": 0.5595, "step": 9926 }, { "epoch": 3.799081515499426, "grad_norm": 0.6096019148826599, "learning_rate": 2.877846398829567e-06, "loss": 0.708, "step": 9927 }, { "epoch": 3.7994642173746653, "grad_norm": 0.6959862112998962, "learning_rate": 2.8761064807012375e-06, "loss": 0.5536, "step": 9928 }, { "epoch": 3.7998469192499043, "grad_norm": 0.5798264741897583, "learning_rate": 2.8743670003553025e-06, "loss": 0.595, "step": 9929 }, { "epoch": 3.8002296211251436, "grad_norm": 0.5487147569656372, "learning_rate": 2.8726279578986595e-06, "loss": 0.5573, "step": 9930 }, { "epoch": 3.8006123230003825, "grad_norm": 0.5323843359947205, "learning_rate": 2.8708893534381745e-06, "loss": 0.5672, "step": 9931 }, { "epoch": 3.800995024875622, "grad_norm": 0.5583842992782593, "learning_rate": 2.869151187080695e-06, "loss": 0.7111, "step": 9932 }, { "epoch": 3.801377726750861, "grad_norm": 0.5502694845199585, "learning_rate": 2.8674134589330294e-06, "loss": 0.5796, "step": 9933 }, { "epoch": 3.8017604286261, "grad_norm": 0.5543885231018066, "learning_rate": 2.8656761691019673e-06, "loss": 0.5969, "step": 9934 }, { "epoch": 3.8021431305013396, "grad_norm": 0.5189929008483887, "learning_rate": 2.863939317694271e-06, "loss": 0.6901, "step": 9935 }, { "epoch": 3.8025258323765785, "grad_norm": 0.5889059901237488, "learning_rate": 2.8622029048166777e-06, "loss": 0.6428, "step": 9936 }, { "epoch": 3.802908534251818, "grad_norm": 0.504173755645752, "learning_rate": 2.8604669305758893e-06, "loss": 0.6296, "step": 9937 }, { "epoch": 3.803291236127057, "grad_norm": 0.5548140406608582, "learning_rate": 2.8587313950785876e-06, "loss": 0.5787, "step": 9938 }, { "epoch": 3.803673938002296, "grad_norm": 0.5882185101509094, "learning_rate": 2.856996298431427e-06, "loss": 0.5887, "step": 9939 }, { "epoch": 3.8040566398775355, "grad_norm": 0.5483806133270264, "learning_rate": 2.855261640741033e-06, "loss": 0.6091, "step": 9940 }, { "epoch": 3.8044393417527744, "grad_norm": 0.4763704240322113, "learning_rate": 2.853527422114005e-06, "loss": 0.6378, "step": 9941 }, { "epoch": 3.804822043628014, "grad_norm": 0.5084008574485779, "learning_rate": 2.85179364265692e-06, "loss": 0.5861, "step": 9942 }, { "epoch": 3.8052047455032527, "grad_norm": 0.5204588174819946, "learning_rate": 2.8500603024763152e-06, "loss": 0.6086, "step": 9943 }, { "epoch": 3.805587447378492, "grad_norm": 0.5083126425743103, "learning_rate": 2.8483274016787145e-06, "loss": 0.5593, "step": 9944 }, { "epoch": 3.8059701492537314, "grad_norm": 0.553295373916626, "learning_rate": 2.8465949403706094e-06, "loss": 0.6173, "step": 9945 }, { "epoch": 3.8063528511289704, "grad_norm": 0.5147071480751038, "learning_rate": 2.844862918658461e-06, "loss": 0.6713, "step": 9946 }, { "epoch": 3.8067355530042097, "grad_norm": 0.563506007194519, "learning_rate": 2.8431313366487066e-06, "loss": 0.648, "step": 9947 }, { "epoch": 3.807118254879449, "grad_norm": 0.5494356155395508, "learning_rate": 2.84140019444776e-06, "loss": 0.6769, "step": 9948 }, { "epoch": 3.807500956754688, "grad_norm": 0.5118331909179688, "learning_rate": 2.8396694921620026e-06, "loss": 0.6238, "step": 9949 }, { "epoch": 3.8078836586299274, "grad_norm": 0.5375686287879944, "learning_rate": 2.8379392298977913e-06, "loss": 0.6275, "step": 9950 }, { "epoch": 3.8082663605051663, "grad_norm": 0.573313295841217, "learning_rate": 2.8362094077614542e-06, "loss": 0.6217, "step": 9951 }, { "epoch": 3.8086490623804057, "grad_norm": 0.483283132314682, "learning_rate": 2.8344800258592984e-06, "loss": 0.6238, "step": 9952 }, { "epoch": 3.8090317642556446, "grad_norm": 0.5634105801582336, "learning_rate": 2.8327510842975913e-06, "loss": 0.6343, "step": 9953 }, { "epoch": 3.809414466130884, "grad_norm": 0.5489015579223633, "learning_rate": 2.8310225831825855e-06, "loss": 0.6433, "step": 9954 }, { "epoch": 3.8097971680061233, "grad_norm": 0.5107267498970032, "learning_rate": 2.829294522620505e-06, "loss": 0.5745, "step": 9955 }, { "epoch": 3.8101798698813623, "grad_norm": 0.575377345085144, "learning_rate": 2.827566902717536e-06, "loss": 0.6316, "step": 9956 }, { "epoch": 3.8105625717566016, "grad_norm": 0.5539364218711853, "learning_rate": 2.8258397235798494e-06, "loss": 0.6519, "step": 9957 }, { "epoch": 3.810945273631841, "grad_norm": 0.5321628451347351, "learning_rate": 2.8241129853135853e-06, "loss": 0.6008, "step": 9958 }, { "epoch": 3.81132797550708, "grad_norm": 0.547227144241333, "learning_rate": 2.8223866880248565e-06, "loss": 0.6289, "step": 9959 }, { "epoch": 3.8117106773823193, "grad_norm": 0.5491920113563538, "learning_rate": 2.820660831819748e-06, "loss": 0.6284, "step": 9960 }, { "epoch": 3.812093379257558, "grad_norm": 0.5340823531150818, "learning_rate": 2.8189354168043183e-06, "loss": 0.6162, "step": 9961 }, { "epoch": 3.8124760811327976, "grad_norm": 0.49928516149520874, "learning_rate": 2.8172104430846027e-06, "loss": 0.5647, "step": 9962 }, { "epoch": 3.8128587830080365, "grad_norm": 0.5520774126052856, "learning_rate": 2.815485910766599e-06, "loss": 0.6813, "step": 9963 }, { "epoch": 3.813241484883276, "grad_norm": 0.5712342262268066, "learning_rate": 2.813761819956287e-06, "loss": 0.7051, "step": 9964 }, { "epoch": 3.8136241867585152, "grad_norm": 0.5370113253593445, "learning_rate": 2.81203817075962e-06, "loss": 0.6937, "step": 9965 }, { "epoch": 3.814006888633754, "grad_norm": 0.5327613949775696, "learning_rate": 2.8103149632825145e-06, "loss": 0.6555, "step": 9966 }, { "epoch": 3.8143895905089935, "grad_norm": 0.5264140963554382, "learning_rate": 2.8085921976308694e-06, "loss": 0.6309, "step": 9967 }, { "epoch": 3.814772292384233, "grad_norm": 0.5441619753837585, "learning_rate": 2.8068698739105537e-06, "loss": 0.5387, "step": 9968 }, { "epoch": 3.815154994259472, "grad_norm": 0.5170334577560425, "learning_rate": 2.8051479922274085e-06, "loss": 0.6334, "step": 9969 }, { "epoch": 3.815537696134711, "grad_norm": 0.5536555647850037, "learning_rate": 2.8034265526872473e-06, "loss": 0.6253, "step": 9970 }, { "epoch": 3.81592039800995, "grad_norm": 0.5585379600524902, "learning_rate": 2.8017055553958627e-06, "loss": 0.5906, "step": 9971 }, { "epoch": 3.8163030998851895, "grad_norm": 0.6003852486610413, "learning_rate": 2.7999850004590056e-06, "loss": 0.6036, "step": 9972 }, { "epoch": 3.8166858017604284, "grad_norm": 0.5326415300369263, "learning_rate": 2.7982648879824127e-06, "loss": 0.573, "step": 9973 }, { "epoch": 3.8170685036356677, "grad_norm": 0.5198793411254883, "learning_rate": 2.7965452180717913e-06, "loss": 0.5922, "step": 9974 }, { "epoch": 3.817451205510907, "grad_norm": 0.5826728940010071, "learning_rate": 2.79482599083282e-06, "loss": 0.6783, "step": 9975 }, { "epoch": 3.817833907386146, "grad_norm": 0.5502220392227173, "learning_rate": 2.793107206371145e-06, "loss": 0.6264, "step": 9976 }, { "epoch": 3.8182166092613854, "grad_norm": 0.5596006512641907, "learning_rate": 2.7913888647923937e-06, "loss": 0.707, "step": 9977 }, { "epoch": 3.8185993111366248, "grad_norm": 0.5999387502670288, "learning_rate": 2.7896709662021627e-06, "loss": 0.6133, "step": 9978 }, { "epoch": 3.8189820130118637, "grad_norm": 0.523986279964447, "learning_rate": 2.787953510706022e-06, "loss": 0.6178, "step": 9979 }, { "epoch": 3.819364714887103, "grad_norm": 0.5335853099822998, "learning_rate": 2.7862364984095126e-06, "loss": 0.6465, "step": 9980 }, { "epoch": 3.819747416762342, "grad_norm": 0.589469313621521, "learning_rate": 2.7845199294181526e-06, "loss": 0.6233, "step": 9981 }, { "epoch": 3.8201301186375813, "grad_norm": 0.5151971578598022, "learning_rate": 2.782803803837425e-06, "loss": 0.5491, "step": 9982 }, { "epoch": 3.8205128205128203, "grad_norm": 0.561114490032196, "learning_rate": 2.7810881217727913e-06, "loss": 0.6235, "step": 9983 }, { "epoch": 3.8208955223880596, "grad_norm": 0.521107017993927, "learning_rate": 2.7793728833296906e-06, "loss": 0.6433, "step": 9984 }, { "epoch": 3.821278224263299, "grad_norm": 0.5459619164466858, "learning_rate": 2.777658088613521e-06, "loss": 0.5728, "step": 9985 }, { "epoch": 3.821660926138538, "grad_norm": 0.5347748398780823, "learning_rate": 2.775943737729664e-06, "loss": 0.681, "step": 9986 }, { "epoch": 3.8220436280137773, "grad_norm": 0.5568072199821472, "learning_rate": 2.7742298307834712e-06, "loss": 0.6026, "step": 9987 }, { "epoch": 3.8224263298890166, "grad_norm": 0.4995124936103821, "learning_rate": 2.772516367880269e-06, "loss": 0.6328, "step": 9988 }, { "epoch": 3.8228090317642556, "grad_norm": 0.5756936073303223, "learning_rate": 2.7708033491253515e-06, "loss": 0.5942, "step": 9989 }, { "epoch": 3.823191733639495, "grad_norm": 0.5509454011917114, "learning_rate": 2.7690907746239894e-06, "loss": 0.6322, "step": 9990 }, { "epoch": 3.823574435514734, "grad_norm": 0.5529312491416931, "learning_rate": 2.7673786444814277e-06, "loss": 0.6533, "step": 9991 }, { "epoch": 3.8239571373899732, "grad_norm": 0.5537627935409546, "learning_rate": 2.765666958802876e-06, "loss": 0.6235, "step": 9992 }, { "epoch": 3.824339839265212, "grad_norm": 0.49169066548347473, "learning_rate": 2.763955717693525e-06, "loss": 0.5104, "step": 9993 }, { "epoch": 3.8247225411404515, "grad_norm": 0.5830194354057312, "learning_rate": 2.7622449212585367e-06, "loss": 0.6526, "step": 9994 }, { "epoch": 3.825105243015691, "grad_norm": 0.5214808583259583, "learning_rate": 2.7605345696030405e-06, "loss": 0.5532, "step": 9995 }, { "epoch": 3.82548794489093, "grad_norm": 0.5627797245979309, "learning_rate": 2.7588246628321426e-06, "loss": 0.6822, "step": 9996 }, { "epoch": 3.825870646766169, "grad_norm": 0.5724940896034241, "learning_rate": 2.757115201050926e-06, "loss": 0.6514, "step": 9997 }, { "epoch": 3.8262533486414085, "grad_norm": 0.5880692601203918, "learning_rate": 2.755406184364432e-06, "loss": 0.6554, "step": 9998 }, { "epoch": 3.8266360505166475, "grad_norm": 0.5137600898742676, "learning_rate": 2.7536976128776936e-06, "loss": 0.6434, "step": 9999 }, { "epoch": 3.827018752391887, "grad_norm": 0.5581413507461548, "learning_rate": 2.751989486695704e-06, "loss": 0.6683, "step": 10000 }, { "epoch": 3.8274014542671257, "grad_norm": 0.547593355178833, "learning_rate": 2.750281805923435e-06, "loss": 0.5947, "step": 10001 }, { "epoch": 3.827784156142365, "grad_norm": 0.5596176981925964, "learning_rate": 2.748574570665823e-06, "loss": 0.6501, "step": 10002 }, { "epoch": 3.828166858017604, "grad_norm": 0.4916166663169861, "learning_rate": 2.746867781027784e-06, "loss": 0.498, "step": 10003 }, { "epoch": 3.8285495598928434, "grad_norm": 0.5650556087493896, "learning_rate": 2.7451614371142086e-06, "loss": 0.7156, "step": 10004 }, { "epoch": 3.8289322617680828, "grad_norm": 0.5424352884292603, "learning_rate": 2.743455539029949e-06, "loss": 0.6491, "step": 10005 }, { "epoch": 3.8293149636433217, "grad_norm": 0.5715173482894897, "learning_rate": 2.741750086879842e-06, "loss": 0.6823, "step": 10006 }, { "epoch": 3.829697665518561, "grad_norm": 0.5204979777336121, "learning_rate": 2.740045080768694e-06, "loss": 0.5605, "step": 10007 }, { "epoch": 3.8300803673938004, "grad_norm": 0.6594555377960205, "learning_rate": 2.7383405208012768e-06, "loss": 0.6083, "step": 10008 }, { "epoch": 3.8304630692690393, "grad_norm": 0.5359405279159546, "learning_rate": 2.7366364070823403e-06, "loss": 0.6038, "step": 10009 }, { "epoch": 3.8308457711442787, "grad_norm": 0.5129268765449524, "learning_rate": 2.734932739716616e-06, "loss": 0.5672, "step": 10010 }, { "epoch": 3.8312284730195176, "grad_norm": 0.5588179230690002, "learning_rate": 2.733229518808788e-06, "loss": 0.5642, "step": 10011 }, { "epoch": 3.831611174894757, "grad_norm": 0.48288556933403015, "learning_rate": 2.7315267444635287e-06, "loss": 0.5942, "step": 10012 }, { "epoch": 3.831993876769996, "grad_norm": 0.5283138751983643, "learning_rate": 2.7298244167854805e-06, "loss": 0.6773, "step": 10013 }, { "epoch": 3.8323765786452353, "grad_norm": 0.5208426713943481, "learning_rate": 2.7281225358792506e-06, "loss": 0.6013, "step": 10014 }, { "epoch": 3.8327592805204747, "grad_norm": 0.5858639478683472, "learning_rate": 2.7264211018494267e-06, "loss": 0.5399, "step": 10015 }, { "epoch": 3.8331419823957136, "grad_norm": 0.6053136587142944, "learning_rate": 2.7247201148005663e-06, "loss": 0.6774, "step": 10016 }, { "epoch": 3.833524684270953, "grad_norm": 0.5309439897537231, "learning_rate": 2.7230195748372024e-06, "loss": 0.6223, "step": 10017 }, { "epoch": 3.8339073861461923, "grad_norm": 0.5327920913696289, "learning_rate": 2.7213194820638322e-06, "loss": 0.6607, "step": 10018 }, { "epoch": 3.8342900880214312, "grad_norm": 0.518094539642334, "learning_rate": 2.719619836584935e-06, "loss": 0.6098, "step": 10019 }, { "epoch": 3.8346727898966706, "grad_norm": 0.5134308934211731, "learning_rate": 2.717920638504956e-06, "loss": 0.5441, "step": 10020 }, { "epoch": 3.8350554917719095, "grad_norm": 0.5530874729156494, "learning_rate": 2.7162218879283174e-06, "loss": 0.5881, "step": 10021 }, { "epoch": 3.835438193647149, "grad_norm": 0.4828595519065857, "learning_rate": 2.714523584959412e-06, "loss": 0.5179, "step": 10022 }, { "epoch": 3.835820895522388, "grad_norm": 0.6935729384422302, "learning_rate": 2.7128257297026085e-06, "loss": 0.5459, "step": 10023 }, { "epoch": 3.836203597397627, "grad_norm": 0.5523639917373657, "learning_rate": 2.7111283222622365e-06, "loss": 0.6287, "step": 10024 }, { "epoch": 3.8365862992728665, "grad_norm": 0.5827748775482178, "learning_rate": 2.7094313627426106e-06, "loss": 0.6777, "step": 10025 }, { "epoch": 3.8369690011481055, "grad_norm": 0.533406138420105, "learning_rate": 2.7077348512480174e-06, "loss": 0.6375, "step": 10026 }, { "epoch": 3.837351703023345, "grad_norm": 0.5637115240097046, "learning_rate": 2.7060387878827043e-06, "loss": 0.5572, "step": 10027 }, { "epoch": 3.837734404898584, "grad_norm": 0.526939868927002, "learning_rate": 2.7043431727509028e-06, "loss": 0.6186, "step": 10028 }, { "epoch": 3.838117106773823, "grad_norm": 0.5515013337135315, "learning_rate": 2.702648005956814e-06, "loss": 0.6415, "step": 10029 }, { "epoch": 3.8384998086490625, "grad_norm": 0.5284570455551147, "learning_rate": 2.7009532876046084e-06, "loss": 0.7047, "step": 10030 }, { "epoch": 3.8388825105243014, "grad_norm": 0.5905161499977112, "learning_rate": 2.6992590177984336e-06, "loss": 0.6184, "step": 10031 }, { "epoch": 3.8392652123995408, "grad_norm": 0.5651587843894958, "learning_rate": 2.6975651966424044e-06, "loss": 0.6732, "step": 10032 }, { "epoch": 3.8396479142747797, "grad_norm": 0.6298454999923706, "learning_rate": 2.6958718242406147e-06, "loss": 0.5772, "step": 10033 }, { "epoch": 3.840030616150019, "grad_norm": 0.5398114323616028, "learning_rate": 2.694178900697121e-06, "loss": 0.6741, "step": 10034 }, { "epoch": 3.8404133180252584, "grad_norm": 0.5527653098106384, "learning_rate": 2.692486426115961e-06, "loss": 0.6105, "step": 10035 }, { "epoch": 3.8407960199004973, "grad_norm": 0.5566855072975159, "learning_rate": 2.6907944006011445e-06, "loss": 0.6328, "step": 10036 }, { "epoch": 3.8411787217757367, "grad_norm": 0.6245619654655457, "learning_rate": 2.6891028242566454e-06, "loss": 0.6803, "step": 10037 }, { "epoch": 3.841561423650976, "grad_norm": 0.5011248588562012, "learning_rate": 2.6874116971864183e-06, "loss": 0.648, "step": 10038 }, { "epoch": 3.841944125526215, "grad_norm": 0.5395399332046509, "learning_rate": 2.6857210194943874e-06, "loss": 0.603, "step": 10039 }, { "epoch": 3.8423268274014544, "grad_norm": 0.5176140666007996, "learning_rate": 2.68403079128445e-06, "loss": 0.5886, "step": 10040 }, { "epoch": 3.8427095292766933, "grad_norm": 0.5565984845161438, "learning_rate": 2.682341012660474e-06, "loss": 0.7397, "step": 10041 }, { "epoch": 3.8430922311519327, "grad_norm": 0.5952370762825012, "learning_rate": 2.680651683726303e-06, "loss": 0.665, "step": 10042 }, { "epoch": 3.8434749330271716, "grad_norm": 0.5292149186134338, "learning_rate": 2.6789628045857507e-06, "loss": 0.5393, "step": 10043 }, { "epoch": 3.843857634902411, "grad_norm": 0.4892589747905731, "learning_rate": 2.6772743753426e-06, "loss": 0.5718, "step": 10044 }, { "epoch": 3.8442403367776503, "grad_norm": 0.496612548828125, "learning_rate": 2.675586396100611e-06, "loss": 0.6029, "step": 10045 }, { "epoch": 3.8446230386528892, "grad_norm": 0.5285893678665161, "learning_rate": 2.673898866963518e-06, "loss": 0.5437, "step": 10046 }, { "epoch": 3.8450057405281286, "grad_norm": 0.5904894471168518, "learning_rate": 2.6722117880350185e-06, "loss": 0.6439, "step": 10047 }, { "epoch": 3.845388442403368, "grad_norm": 0.5509406924247742, "learning_rate": 2.670525159418791e-06, "loss": 0.6288, "step": 10048 }, { "epoch": 3.845771144278607, "grad_norm": 0.49840468168258667, "learning_rate": 2.6688389812184824e-06, "loss": 0.5735, "step": 10049 }, { "epoch": 3.8461538461538463, "grad_norm": 0.5415095686912537, "learning_rate": 2.6671532535377132e-06, "loss": 0.6312, "step": 10050 }, { "epoch": 3.846536548029085, "grad_norm": 0.5571936368942261, "learning_rate": 2.6654679764800763e-06, "loss": 0.5508, "step": 10051 }, { "epoch": 3.8469192499043245, "grad_norm": 1.172694206237793, "learning_rate": 2.6637831501491395e-06, "loss": 0.6691, "step": 10052 }, { "epoch": 3.8473019517795635, "grad_norm": 0.5316779017448425, "learning_rate": 2.662098774648434e-06, "loss": 0.6076, "step": 10053 }, { "epoch": 3.847684653654803, "grad_norm": 0.49257364869117737, "learning_rate": 2.660414850081472e-06, "loss": 0.6759, "step": 10054 }, { "epoch": 3.848067355530042, "grad_norm": 0.560536801815033, "learning_rate": 2.6587313765517343e-06, "loss": 0.6019, "step": 10055 }, { "epoch": 3.848450057405281, "grad_norm": 0.5274465084075928, "learning_rate": 2.6570483541626803e-06, "loss": 0.5415, "step": 10056 }, { "epoch": 3.8488327592805205, "grad_norm": 0.5404607653617859, "learning_rate": 2.6553657830177272e-06, "loss": 0.586, "step": 10057 }, { "epoch": 3.84921546115576, "grad_norm": 0.5412017703056335, "learning_rate": 2.6536836632202788e-06, "loss": 0.5602, "step": 10058 }, { "epoch": 3.8495981630309988, "grad_norm": 0.5236424207687378, "learning_rate": 2.652001994873705e-06, "loss": 0.5579, "step": 10059 }, { "epoch": 3.849980864906238, "grad_norm": 0.5625947117805481, "learning_rate": 2.650320778081349e-06, "loss": 0.6828, "step": 10060 }, { "epoch": 3.850363566781477, "grad_norm": 0.5219811201095581, "learning_rate": 2.648640012946526e-06, "loss": 0.6547, "step": 10061 }, { "epoch": 3.8507462686567164, "grad_norm": 0.5172786116600037, "learning_rate": 2.6469596995725268e-06, "loss": 0.5368, "step": 10062 }, { "epoch": 3.8511289705319554, "grad_norm": 0.5291547775268555, "learning_rate": 2.6452798380626044e-06, "loss": 0.6339, "step": 10063 }, { "epoch": 3.8515116724071947, "grad_norm": 1.0239362716674805, "learning_rate": 2.6436004285199946e-06, "loss": 0.5268, "step": 10064 }, { "epoch": 3.851894374282434, "grad_norm": 0.5321683883666992, "learning_rate": 2.6419214710479037e-06, "loss": 0.5931, "step": 10065 }, { "epoch": 3.852277076157673, "grad_norm": 0.500929594039917, "learning_rate": 2.640242965749503e-06, "loss": 0.6026, "step": 10066 }, { "epoch": 3.8526597780329124, "grad_norm": 0.538779079914093, "learning_rate": 2.6385649127279454e-06, "loss": 0.6215, "step": 10067 }, { "epoch": 3.8530424799081517, "grad_norm": 0.5602166652679443, "learning_rate": 2.6368873120863493e-06, "loss": 0.6538, "step": 10068 }, { "epoch": 3.8534251817833907, "grad_norm": 0.5263581275939941, "learning_rate": 2.6352101639278085e-06, "loss": 0.5997, "step": 10069 }, { "epoch": 3.85380788365863, "grad_norm": 0.563442587852478, "learning_rate": 2.6335334683553893e-06, "loss": 0.6247, "step": 10070 }, { "epoch": 3.854190585533869, "grad_norm": 0.5148174166679382, "learning_rate": 2.6318572254721276e-06, "loss": 0.5794, "step": 10071 }, { "epoch": 3.8545732874091083, "grad_norm": 0.5535804629325867, "learning_rate": 2.6301814353810373e-06, "loss": 0.5985, "step": 10072 }, { "epoch": 3.8549559892843472, "grad_norm": 0.5559884905815125, "learning_rate": 2.628506098185093e-06, "loss": 0.6757, "step": 10073 }, { "epoch": 3.8553386911595866, "grad_norm": 0.5418714880943298, "learning_rate": 2.6268312139872534e-06, "loss": 0.6022, "step": 10074 }, { "epoch": 3.855721393034826, "grad_norm": 0.5616790056228638, "learning_rate": 2.6251567828904468e-06, "loss": 0.7062, "step": 10075 }, { "epoch": 3.856104094910065, "grad_norm": 0.5364934206008911, "learning_rate": 2.6234828049975645e-06, "loss": 0.5649, "step": 10076 }, { "epoch": 3.8564867967853043, "grad_norm": 0.5148260593414307, "learning_rate": 2.6218092804114812e-06, "loss": 0.5886, "step": 10077 }, { "epoch": 3.8568694986605436, "grad_norm": 0.516871452331543, "learning_rate": 2.620136209235039e-06, "loss": 0.6355, "step": 10078 }, { "epoch": 3.8572522005357825, "grad_norm": 0.6482405066490173, "learning_rate": 2.618463591571052e-06, "loss": 0.6223, "step": 10079 }, { "epoch": 3.857634902411022, "grad_norm": 0.6142717003822327, "learning_rate": 2.6167914275223093e-06, "loss": 0.704, "step": 10080 }, { "epoch": 3.858017604286261, "grad_norm": 0.523193359375, "learning_rate": 2.6151197171915667e-06, "loss": 0.6127, "step": 10081 }, { "epoch": 3.8584003061615, "grad_norm": 0.5953977704048157, "learning_rate": 2.6134484606815603e-06, "loss": 0.6208, "step": 10082 }, { "epoch": 3.858783008036739, "grad_norm": 0.5812308192253113, "learning_rate": 2.6117776580949873e-06, "loss": 0.6883, "step": 10083 }, { "epoch": 3.8591657099119785, "grad_norm": 0.5822384357452393, "learning_rate": 2.610107309534525e-06, "loss": 0.5281, "step": 10084 }, { "epoch": 3.859548411787218, "grad_norm": 0.5471153259277344, "learning_rate": 2.6084374151028257e-06, "loss": 0.6059, "step": 10085 }, { "epoch": 3.859931113662457, "grad_norm": 0.5378099679946899, "learning_rate": 2.606767974902501e-06, "loss": 0.583, "step": 10086 }, { "epoch": 3.860313815537696, "grad_norm": 0.5099547505378723, "learning_rate": 2.6050989890361457e-06, "loss": 0.6233, "step": 10087 }, { "epoch": 3.8606965174129355, "grad_norm": 0.5614661574363708, "learning_rate": 2.603430457606324e-06, "loss": 0.65, "step": 10088 }, { "epoch": 3.8610792192881744, "grad_norm": 0.543380856513977, "learning_rate": 2.601762380715571e-06, "loss": 0.5857, "step": 10089 }, { "epoch": 3.861461921163414, "grad_norm": 0.5440600514411926, "learning_rate": 2.600094758466395e-06, "loss": 0.6596, "step": 10090 }, { "epoch": 3.8618446230386527, "grad_norm": 0.5671229362487793, "learning_rate": 2.59842759096128e-06, "loss": 0.6412, "step": 10091 }, { "epoch": 3.862227324913892, "grad_norm": 0.5331782698631287, "learning_rate": 2.59676087830267e-06, "loss": 0.6124, "step": 10092 }, { "epoch": 3.862610026789131, "grad_norm": 0.5540857911109924, "learning_rate": 2.595094620592993e-06, "loss": 0.6959, "step": 10093 }, { "epoch": 3.8629927286643704, "grad_norm": 0.5161728858947754, "learning_rate": 2.5934288179346444e-06, "loss": 0.5454, "step": 10094 }, { "epoch": 3.8633754305396097, "grad_norm": 0.5183088183403015, "learning_rate": 2.5917634704299956e-06, "loss": 0.6057, "step": 10095 }, { "epoch": 3.8637581324148487, "grad_norm": 0.5926685929298401, "learning_rate": 2.5900985781813804e-06, "loss": 0.603, "step": 10096 }, { "epoch": 3.864140834290088, "grad_norm": 0.5673673152923584, "learning_rate": 2.588434141291115e-06, "loss": 0.61, "step": 10097 }, { "epoch": 3.8645235361653274, "grad_norm": 0.5645519495010376, "learning_rate": 2.5867701598614827e-06, "loss": 0.671, "step": 10098 }, { "epoch": 3.8649062380405663, "grad_norm": 0.5184718370437622, "learning_rate": 2.5851066339947397e-06, "loss": 0.5987, "step": 10099 }, { "epoch": 3.8652889399158057, "grad_norm": 0.5556961894035339, "learning_rate": 2.5834435637931132e-06, "loss": 0.6801, "step": 10100 }, { "epoch": 3.8656716417910446, "grad_norm": 0.5258594155311584, "learning_rate": 2.5817809493588076e-06, "loss": 0.5817, "step": 10101 }, { "epoch": 3.866054343666284, "grad_norm": 0.553171694278717, "learning_rate": 2.5801187907939895e-06, "loss": 0.5803, "step": 10102 }, { "epoch": 3.866437045541523, "grad_norm": 0.5399090051651001, "learning_rate": 2.578457088200804e-06, "loss": 0.6559, "step": 10103 }, { "epoch": 3.8668197474167623, "grad_norm": 0.5691872835159302, "learning_rate": 2.576795841681372e-06, "loss": 0.6042, "step": 10104 }, { "epoch": 3.8672024492920016, "grad_norm": 0.6146008372306824, "learning_rate": 2.5751350513377738e-06, "loss": 0.5906, "step": 10105 }, { "epoch": 3.8675851511672406, "grad_norm": 0.5691215991973877, "learning_rate": 2.573474717272075e-06, "loss": 0.6748, "step": 10106 }, { "epoch": 3.86796785304248, "grad_norm": 0.5938581824302673, "learning_rate": 2.571814839586305e-06, "loss": 0.6223, "step": 10107 }, { "epoch": 3.8683505549177193, "grad_norm": 0.5586518049240112, "learning_rate": 2.570155418382473e-06, "loss": 0.5341, "step": 10108 }, { "epoch": 3.868733256792958, "grad_norm": 0.5103022456169128, "learning_rate": 2.5684964537625433e-06, "loss": 0.5817, "step": 10109 }, { "epoch": 3.8691159586681976, "grad_norm": 0.5722428560256958, "learning_rate": 2.5668379458284753e-06, "loss": 0.6098, "step": 10110 }, { "epoch": 3.8694986605434365, "grad_norm": 0.5549942851066589, "learning_rate": 2.565179894682187e-06, "loss": 0.6191, "step": 10111 }, { "epoch": 3.869881362418676, "grad_norm": 0.5429159998893738, "learning_rate": 2.563522300425565e-06, "loss": 0.6144, "step": 10112 }, { "epoch": 3.870264064293915, "grad_norm": 0.5460123419761658, "learning_rate": 2.5618651631604762e-06, "loss": 0.6552, "step": 10113 }, { "epoch": 3.870646766169154, "grad_norm": 0.5000735521316528, "learning_rate": 2.5602084829887574e-06, "loss": 0.5718, "step": 10114 }, { "epoch": 3.8710294680443935, "grad_norm": 0.7676375508308411, "learning_rate": 2.5585522600122125e-06, "loss": 0.6618, "step": 10115 }, { "epoch": 3.8714121699196324, "grad_norm": 0.5879810452461243, "learning_rate": 2.556896494332621e-06, "loss": 0.6404, "step": 10116 }, { "epoch": 3.871794871794872, "grad_norm": 0.5106772184371948, "learning_rate": 2.55524118605174e-06, "loss": 0.6258, "step": 10117 }, { "epoch": 3.872177573670111, "grad_norm": 0.53461092710495, "learning_rate": 2.5535863352712853e-06, "loss": 0.5626, "step": 10118 }, { "epoch": 3.87256027554535, "grad_norm": 0.5683826804161072, "learning_rate": 2.5519319420929556e-06, "loss": 0.6168, "step": 10119 }, { "epoch": 3.8729429774205895, "grad_norm": 0.6277991533279419, "learning_rate": 2.550278006618413e-06, "loss": 0.625, "step": 10120 }, { "epoch": 3.8733256792958284, "grad_norm": 0.5668351054191589, "learning_rate": 2.548624528949307e-06, "loss": 0.6277, "step": 10121 }, { "epoch": 3.8737083811710677, "grad_norm": 0.5555065274238586, "learning_rate": 2.546971509187238e-06, "loss": 0.5795, "step": 10122 }, { "epoch": 3.8740910830463067, "grad_norm": 0.5369688272476196, "learning_rate": 2.5453189474337938e-06, "loss": 0.5694, "step": 10123 }, { "epoch": 3.874473784921546, "grad_norm": 0.5300976037979126, "learning_rate": 2.5436668437905286e-06, "loss": 0.5898, "step": 10124 }, { "epoch": 3.8748564867967854, "grad_norm": 0.5226010680198669, "learning_rate": 2.5420151983589656e-06, "loss": 0.6615, "step": 10125 }, { "epoch": 3.8752391886720243, "grad_norm": 0.6325751543045044, "learning_rate": 2.540364011240604e-06, "loss": 0.6513, "step": 10126 }, { "epoch": 3.8756218905472637, "grad_norm": 0.5311406254768372, "learning_rate": 2.5387132825369164e-06, "loss": 0.6603, "step": 10127 }, { "epoch": 3.876004592422503, "grad_norm": 0.5776282548904419, "learning_rate": 2.53706301234934e-06, "loss": 0.6459, "step": 10128 }, { "epoch": 3.876387294297742, "grad_norm": 0.5481382012367249, "learning_rate": 2.535413200779291e-06, "loss": 0.5985, "step": 10129 }, { "epoch": 3.8767699961729813, "grad_norm": 0.536763072013855, "learning_rate": 2.5337638479281533e-06, "loss": 0.5476, "step": 10130 }, { "epoch": 3.8771526980482207, "grad_norm": 0.5318331718444824, "learning_rate": 2.5321149538972855e-06, "loss": 0.7184, "step": 10131 }, { "epoch": 3.8775353999234596, "grad_norm": 0.5568578243255615, "learning_rate": 2.5304665187880164e-06, "loss": 0.6116, "step": 10132 }, { "epoch": 3.8779181017986986, "grad_norm": 0.5627933740615845, "learning_rate": 2.5288185427016465e-06, "loss": 0.546, "step": 10133 }, { "epoch": 3.878300803673938, "grad_norm": 0.5133364200592041, "learning_rate": 2.5271710257394512e-06, "loss": 0.5548, "step": 10134 }, { "epoch": 3.8786835055491773, "grad_norm": 0.5718116164207458, "learning_rate": 2.525523968002669e-06, "loss": 0.5363, "step": 10135 }, { "epoch": 3.879066207424416, "grad_norm": 0.5123940110206604, "learning_rate": 2.523877369592519e-06, "loss": 0.6226, "step": 10136 }, { "epoch": 3.8794489092996556, "grad_norm": 0.5413663387298584, "learning_rate": 2.5222312306101925e-06, "loss": 0.6357, "step": 10137 }, { "epoch": 3.879831611174895, "grad_norm": 0.5579719543457031, "learning_rate": 2.5205855511568432e-06, "loss": 0.6525, "step": 10138 }, { "epoch": 3.880214313050134, "grad_norm": 0.6088363528251648, "learning_rate": 2.5189403313336048e-06, "loss": 0.6219, "step": 10139 }, { "epoch": 3.8805970149253732, "grad_norm": 0.5489184856414795, "learning_rate": 2.5172955712415803e-06, "loss": 0.7093, "step": 10140 }, { "epoch": 3.8809797168006126, "grad_norm": 0.5206589102745056, "learning_rate": 2.515651270981846e-06, "loss": 0.6045, "step": 10141 }, { "epoch": 3.8813624186758515, "grad_norm": 0.5729591250419617, "learning_rate": 2.5140074306554464e-06, "loss": 0.7058, "step": 10142 }, { "epoch": 3.8817451205510904, "grad_norm": 0.508581280708313, "learning_rate": 2.512364050363405e-06, "loss": 0.6019, "step": 10143 }, { "epoch": 3.88212782242633, "grad_norm": 0.5447705984115601, "learning_rate": 2.510721130206706e-06, "loss": 0.5877, "step": 10144 }, { "epoch": 3.882510524301569, "grad_norm": 0.6045892834663391, "learning_rate": 2.5090786702863126e-06, "loss": 0.6181, "step": 10145 }, { "epoch": 3.882893226176808, "grad_norm": 0.5693555474281311, "learning_rate": 2.5074366707031595e-06, "loss": 0.6768, "step": 10146 }, { "epoch": 3.8832759280520475, "grad_norm": 0.5788046717643738, "learning_rate": 2.505795131558154e-06, "loss": 0.6686, "step": 10147 }, { "epoch": 3.883658629927287, "grad_norm": 0.5662764310836792, "learning_rate": 2.504154052952168e-06, "loss": 0.5761, "step": 10148 }, { "epoch": 3.8840413318025258, "grad_norm": 0.5172243714332581, "learning_rate": 2.5025134349860535e-06, "loss": 0.6623, "step": 10149 }, { "epoch": 3.884424033677765, "grad_norm": 0.5724687576293945, "learning_rate": 2.5008732777606305e-06, "loss": 0.6129, "step": 10150 }, { "epoch": 3.8848067355530045, "grad_norm": 0.5618602633476257, "learning_rate": 2.49923358137669e-06, "loss": 0.5788, "step": 10151 }, { "epoch": 3.8851894374282434, "grad_norm": 0.5015730261802673, "learning_rate": 2.497594345934997e-06, "loss": 0.7029, "step": 10152 }, { "epoch": 3.8855721393034823, "grad_norm": 0.5467812418937683, "learning_rate": 2.4959555715362894e-06, "loss": 0.6155, "step": 10153 }, { "epoch": 3.8859548411787217, "grad_norm": 0.5717893838882446, "learning_rate": 2.4943172582812693e-06, "loss": 0.634, "step": 10154 }, { "epoch": 3.886337543053961, "grad_norm": 0.5056716799736023, "learning_rate": 2.492679406270617e-06, "loss": 0.6289, "step": 10155 }, { "epoch": 3.8867202449292, "grad_norm": 0.5138017535209656, "learning_rate": 2.4910420156049865e-06, "loss": 0.6551, "step": 10156 }, { "epoch": 3.8871029468044394, "grad_norm": 0.5119362473487854, "learning_rate": 2.4894050863849937e-06, "loss": 0.5487, "step": 10157 }, { "epoch": 3.8874856486796787, "grad_norm": 0.5781825184822083, "learning_rate": 2.487768618711236e-06, "loss": 0.6255, "step": 10158 }, { "epoch": 3.8878683505549176, "grad_norm": 0.5267737507820129, "learning_rate": 2.4861326126842776e-06, "loss": 0.6246, "step": 10159 }, { "epoch": 3.888251052430157, "grad_norm": 0.535179853439331, "learning_rate": 2.4844970684046565e-06, "loss": 0.5895, "step": 10160 }, { "epoch": 3.8886337543053964, "grad_norm": 0.6034830808639526, "learning_rate": 2.482861985972881e-06, "loss": 0.5972, "step": 10161 }, { "epoch": 3.8890164561806353, "grad_norm": 0.5203924179077148, "learning_rate": 2.4812273654894314e-06, "loss": 0.6558, "step": 10162 }, { "epoch": 3.889399158055874, "grad_norm": 0.5396326184272766, "learning_rate": 2.479593207054761e-06, "loss": 0.6499, "step": 10163 }, { "epoch": 3.8897818599311136, "grad_norm": 0.5991748571395874, "learning_rate": 2.47795951076929e-06, "loss": 0.6233, "step": 10164 }, { "epoch": 3.890164561806353, "grad_norm": 0.5384154319763184, "learning_rate": 2.476326276733414e-06, "loss": 0.5742, "step": 10165 }, { "epoch": 3.890547263681592, "grad_norm": 0.5016571879386902, "learning_rate": 2.474693505047504e-06, "loss": 0.5987, "step": 10166 }, { "epoch": 3.8909299655568312, "grad_norm": 0.53034508228302, "learning_rate": 2.473061195811892e-06, "loss": 0.6215, "step": 10167 }, { "epoch": 3.8913126674320706, "grad_norm": 0.5920446515083313, "learning_rate": 2.4714293491268904e-06, "loss": 0.6512, "step": 10168 }, { "epoch": 3.8916953693073095, "grad_norm": 0.5272919535636902, "learning_rate": 2.469797965092781e-06, "loss": 0.6722, "step": 10169 }, { "epoch": 3.892078071182549, "grad_norm": 0.5430365800857544, "learning_rate": 2.4681670438098147e-06, "loss": 0.615, "step": 10170 }, { "epoch": 3.8924607730577883, "grad_norm": 0.6058531403541565, "learning_rate": 2.4665365853782198e-06, "loss": 0.6501, "step": 10171 }, { "epoch": 3.892843474933027, "grad_norm": 0.5611199140548706, "learning_rate": 2.4649065898981895e-06, "loss": 0.7155, "step": 10172 }, { "epoch": 3.893226176808266, "grad_norm": 0.5429691672325134, "learning_rate": 2.463277057469895e-06, "loss": 0.6175, "step": 10173 }, { "epoch": 3.8936088786835055, "grad_norm": 0.5746036171913147, "learning_rate": 2.46164798819347e-06, "loss": 0.7305, "step": 10174 }, { "epoch": 3.893991580558745, "grad_norm": 0.5424342155456543, "learning_rate": 2.4600193821690275e-06, "loss": 0.6351, "step": 10175 }, { "epoch": 3.8943742824339838, "grad_norm": 0.5584412217140198, "learning_rate": 2.4583912394966537e-06, "loss": 0.6436, "step": 10176 }, { "epoch": 3.894756984309223, "grad_norm": 0.6115570068359375, "learning_rate": 2.456763560276395e-06, "loss": 0.6784, "step": 10177 }, { "epoch": 3.8951396861844625, "grad_norm": 0.5169993042945862, "learning_rate": 2.455136344608281e-06, "loss": 0.5975, "step": 10178 }, { "epoch": 3.8955223880597014, "grad_norm": 0.5160218477249146, "learning_rate": 2.453509592592307e-06, "loss": 0.6411, "step": 10179 }, { "epoch": 3.895905089934941, "grad_norm": 0.5018174648284912, "learning_rate": 2.4518833043284427e-06, "loss": 0.6329, "step": 10180 }, { "epoch": 3.89628779181018, "grad_norm": 0.5470846891403198, "learning_rate": 2.450257479916628e-06, "loss": 0.6121, "step": 10181 }, { "epoch": 3.896670493685419, "grad_norm": 0.5885950326919556, "learning_rate": 2.448632119456775e-06, "loss": 0.5675, "step": 10182 }, { "epoch": 3.897053195560658, "grad_norm": 0.5285342335700989, "learning_rate": 2.447007223048763e-06, "loss": 0.6285, "step": 10183 }, { "epoch": 3.8974358974358974, "grad_norm": 0.5213053226470947, "learning_rate": 2.4453827907924476e-06, "loss": 0.5411, "step": 10184 }, { "epoch": 3.8978185993111367, "grad_norm": 0.5636323690414429, "learning_rate": 2.4437588227876552e-06, "loss": 0.6367, "step": 10185 }, { "epoch": 3.8982013011863756, "grad_norm": 0.5198363065719604, "learning_rate": 2.4421353191341855e-06, "loss": 0.6509, "step": 10186 }, { "epoch": 3.898584003061615, "grad_norm": 0.6036190390586853, "learning_rate": 2.4405122799318027e-06, "loss": 0.6392, "step": 10187 }, { "epoch": 3.8989667049368544, "grad_norm": 0.61554354429245, "learning_rate": 2.4388897052802475e-06, "loss": 0.6692, "step": 10188 }, { "epoch": 3.8993494068120933, "grad_norm": 0.5652013421058655, "learning_rate": 2.4372675952792335e-06, "loss": 0.7393, "step": 10189 }, { "epoch": 3.8997321086873327, "grad_norm": 0.5124176144599915, "learning_rate": 2.4356459500284426e-06, "loss": 0.5497, "step": 10190 }, { "epoch": 3.900114810562572, "grad_norm": 0.4802270233631134, "learning_rate": 2.4340247696275297e-06, "loss": 0.5518, "step": 10191 }, { "epoch": 3.900497512437811, "grad_norm": 0.5554218292236328, "learning_rate": 2.432404054176124e-06, "loss": 0.6791, "step": 10192 }, { "epoch": 3.90088021431305, "grad_norm": 0.5407744646072388, "learning_rate": 2.430783803773816e-06, "loss": 0.6313, "step": 10193 }, { "epoch": 3.9012629161882892, "grad_norm": 0.5986825823783875, "learning_rate": 2.4291640185201783e-06, "loss": 0.7753, "step": 10194 }, { "epoch": 3.9016456180635286, "grad_norm": 0.5404754281044006, "learning_rate": 2.427544698514753e-06, "loss": 0.6229, "step": 10195 }, { "epoch": 3.9020283199387675, "grad_norm": 0.5479784607887268, "learning_rate": 2.4259258438570476e-06, "loss": 0.6086, "step": 10196 }, { "epoch": 3.902411021814007, "grad_norm": 0.5437963604927063, "learning_rate": 2.424307454646546e-06, "loss": 0.6382, "step": 10197 }, { "epoch": 3.9027937236892463, "grad_norm": 0.4925910532474518, "learning_rate": 2.4226895309827027e-06, "loss": 0.6264, "step": 10198 }, { "epoch": 3.903176425564485, "grad_norm": 0.5247091054916382, "learning_rate": 2.421072072964945e-06, "loss": 0.5265, "step": 10199 }, { "epoch": 3.9035591274397246, "grad_norm": 0.5832870006561279, "learning_rate": 2.419455080692669e-06, "loss": 0.5833, "step": 10200 }, { "epoch": 3.903941829314964, "grad_norm": 0.5007687211036682, "learning_rate": 2.4178385542652427e-06, "loss": 0.6353, "step": 10201 }, { "epoch": 3.904324531190203, "grad_norm": 0.5287690162658691, "learning_rate": 2.41622249378201e-06, "loss": 0.5778, "step": 10202 }, { "epoch": 3.9047072330654418, "grad_norm": 0.49901244044303894, "learning_rate": 2.4146068993422757e-06, "loss": 0.6479, "step": 10203 }, { "epoch": 3.905089934940681, "grad_norm": 0.6702032089233398, "learning_rate": 2.4129917710453255e-06, "loss": 0.6502, "step": 10204 }, { "epoch": 3.9054726368159205, "grad_norm": 0.5705827474594116, "learning_rate": 2.411377108990417e-06, "loss": 0.6104, "step": 10205 }, { "epoch": 3.9058553386911594, "grad_norm": 0.554781973361969, "learning_rate": 2.4097629132767687e-06, "loss": 0.642, "step": 10206 }, { "epoch": 3.906238040566399, "grad_norm": 0.5280086994171143, "learning_rate": 2.40814918400358e-06, "loss": 0.5864, "step": 10207 }, { "epoch": 3.906620742441638, "grad_norm": 0.501406192779541, "learning_rate": 2.406535921270022e-06, "loss": 0.5253, "step": 10208 }, { "epoch": 3.907003444316877, "grad_norm": 0.5261262059211731, "learning_rate": 2.4049231251752257e-06, "loss": 0.5895, "step": 10209 }, { "epoch": 3.9073861461921164, "grad_norm": 0.5816993713378906, "learning_rate": 2.403310795818311e-06, "loss": 0.638, "step": 10210 }, { "epoch": 3.907768848067356, "grad_norm": 0.5463564991950989, "learning_rate": 2.4016989332983564e-06, "loss": 0.5566, "step": 10211 }, { "epoch": 3.9081515499425947, "grad_norm": 0.5832769274711609, "learning_rate": 2.4000875377144173e-06, "loss": 0.6284, "step": 10212 }, { "epoch": 3.9085342518178336, "grad_norm": 0.551167905330658, "learning_rate": 2.398476609165514e-06, "loss": 0.5937, "step": 10213 }, { "epoch": 3.908916953693073, "grad_norm": 0.516435444355011, "learning_rate": 2.396866147750644e-06, "loss": 0.577, "step": 10214 }, { "epoch": 3.9092996555683124, "grad_norm": 0.5169782042503357, "learning_rate": 2.395256153568778e-06, "loss": 0.6392, "step": 10215 }, { "epoch": 3.9096823574435513, "grad_norm": 0.5643632411956787, "learning_rate": 2.3936466267188487e-06, "loss": 0.6651, "step": 10216 }, { "epoch": 3.9100650593187907, "grad_norm": 0.5320153832435608, "learning_rate": 2.3920375672997685e-06, "loss": 0.5643, "step": 10217 }, { "epoch": 3.91044776119403, "grad_norm": 0.5831044316291809, "learning_rate": 2.3904289754104215e-06, "loss": 0.6777, "step": 10218 }, { "epoch": 3.910830463069269, "grad_norm": 0.5489414930343628, "learning_rate": 2.3888208511496536e-06, "loss": 0.5913, "step": 10219 }, { "epoch": 3.9112131649445083, "grad_norm": 0.5144282579421997, "learning_rate": 2.387213194616289e-06, "loss": 0.6163, "step": 10220 }, { "epoch": 3.9115958668197477, "grad_norm": 0.524412989616394, "learning_rate": 2.385606005909131e-06, "loss": 0.6048, "step": 10221 }, { "epoch": 3.9119785686949866, "grad_norm": 0.5419246554374695, "learning_rate": 2.3839992851269366e-06, "loss": 0.5979, "step": 10222 }, { "epoch": 3.9123612705702255, "grad_norm": 0.528031051158905, "learning_rate": 2.3823930323684474e-06, "loss": 0.5916, "step": 10223 }, { "epoch": 3.912743972445465, "grad_norm": 0.5555135011672974, "learning_rate": 2.3807872477323736e-06, "loss": 0.6234, "step": 10224 }, { "epoch": 3.9131266743207043, "grad_norm": 0.605220377445221, "learning_rate": 2.379181931317389e-06, "loss": 0.6438, "step": 10225 }, { "epoch": 3.913509376195943, "grad_norm": 0.5239377021789551, "learning_rate": 2.377577083222149e-06, "loss": 0.5834, "step": 10226 }, { "epoch": 3.9138920780711826, "grad_norm": 0.5615348219871521, "learning_rate": 2.375972703545274e-06, "loss": 0.6817, "step": 10227 }, { "epoch": 3.914274779946422, "grad_norm": 0.5241987705230713, "learning_rate": 2.374368792385361e-06, "loss": 0.5942, "step": 10228 }, { "epoch": 3.914657481821661, "grad_norm": 0.5519990921020508, "learning_rate": 2.37276534984097e-06, "loss": 0.5805, "step": 10229 }, { "epoch": 3.9150401836969, "grad_norm": 0.534440815448761, "learning_rate": 2.371162376010635e-06, "loss": 0.5948, "step": 10230 }, { "epoch": 3.9154228855721396, "grad_norm": 0.5960130095481873, "learning_rate": 2.3695598709928726e-06, "loss": 0.6449, "step": 10231 }, { "epoch": 3.9158055874473785, "grad_norm": 0.6015350222587585, "learning_rate": 2.367957834886152e-06, "loss": 0.7118, "step": 10232 }, { "epoch": 3.9161882893226174, "grad_norm": 0.5265507102012634, "learning_rate": 2.3663562677889263e-06, "loss": 0.5196, "step": 10233 }, { "epoch": 3.916570991197857, "grad_norm": 0.523259162902832, "learning_rate": 2.3647551697996187e-06, "loss": 0.624, "step": 10234 }, { "epoch": 3.916953693073096, "grad_norm": 0.554656445980072, "learning_rate": 2.3631545410166144e-06, "loss": 0.6965, "step": 10235 }, { "epoch": 3.917336394948335, "grad_norm": 0.5366269946098328, "learning_rate": 2.36155438153828e-06, "loss": 0.7089, "step": 10236 }, { "epoch": 3.9177190968235744, "grad_norm": 0.5353883504867554, "learning_rate": 2.3599546914629534e-06, "loss": 0.5921, "step": 10237 }, { "epoch": 3.918101798698814, "grad_norm": 0.6363198757171631, "learning_rate": 2.358355470888932e-06, "loss": 0.6431, "step": 10238 }, { "epoch": 3.9184845005740527, "grad_norm": 0.5739976763725281, "learning_rate": 2.356756719914497e-06, "loss": 0.6804, "step": 10239 }, { "epoch": 3.918867202449292, "grad_norm": 0.4885123372077942, "learning_rate": 2.3551584386378947e-06, "loss": 0.5528, "step": 10240 }, { "epoch": 3.9192499043245315, "grad_norm": 0.5903192162513733, "learning_rate": 2.353560627157343e-06, "loss": 0.5756, "step": 10241 }, { "epoch": 3.9196326061997704, "grad_norm": 0.5232539772987366, "learning_rate": 2.351963285571034e-06, "loss": 0.5809, "step": 10242 }, { "epoch": 3.9200153080750093, "grad_norm": 0.647160530090332, "learning_rate": 2.3503664139771275e-06, "loss": 0.648, "step": 10243 }, { "epoch": 3.9203980099502487, "grad_norm": 0.4922778308391571, "learning_rate": 2.3487700124737577e-06, "loss": 0.594, "step": 10244 }, { "epoch": 3.920780711825488, "grad_norm": 0.5619660019874573, "learning_rate": 2.3471740811590237e-06, "loss": 0.6108, "step": 10245 }, { "epoch": 3.921163413700727, "grad_norm": 0.4995241165161133, "learning_rate": 2.3455786201310004e-06, "loss": 0.5499, "step": 10246 }, { "epoch": 3.9215461155759663, "grad_norm": 0.5198280811309814, "learning_rate": 2.3439836294877382e-06, "loss": 0.6037, "step": 10247 }, { "epoch": 3.9219288174512057, "grad_norm": 0.5036407113075256, "learning_rate": 2.3423891093272465e-06, "loss": 0.6168, "step": 10248 }, { "epoch": 3.9223115193264446, "grad_norm": 0.5356417298316956, "learning_rate": 2.340795059747516e-06, "loss": 0.6358, "step": 10249 }, { "epoch": 3.922694221201684, "grad_norm": 0.5956174731254578, "learning_rate": 2.339201480846506e-06, "loss": 0.5923, "step": 10250 }, { "epoch": 3.9230769230769234, "grad_norm": 0.5501269102096558, "learning_rate": 2.3376083727221453e-06, "loss": 0.5764, "step": 10251 }, { "epoch": 3.9234596249521623, "grad_norm": 0.5315981507301331, "learning_rate": 2.336015735472336e-06, "loss": 0.6752, "step": 10252 }, { "epoch": 3.923842326827401, "grad_norm": 0.5471804738044739, "learning_rate": 2.334423569194948e-06, "loss": 0.621, "step": 10253 }, { "epoch": 3.9242250287026406, "grad_norm": 0.5776287317276001, "learning_rate": 2.3328318739878274e-06, "loss": 0.6718, "step": 10254 }, { "epoch": 3.92460773057788, "grad_norm": 0.5262307524681091, "learning_rate": 2.331240649948784e-06, "loss": 0.608, "step": 10255 }, { "epoch": 3.924990432453119, "grad_norm": 0.5490840673446655, "learning_rate": 2.3296498971756045e-06, "loss": 0.5816, "step": 10256 }, { "epoch": 3.925373134328358, "grad_norm": 0.5545336008071899, "learning_rate": 2.3280596157660485e-06, "loss": 0.6417, "step": 10257 }, { "epoch": 3.9257558362035976, "grad_norm": 0.9159362316131592, "learning_rate": 2.326469805817836e-06, "loss": 0.6563, "step": 10258 }, { "epoch": 3.9261385380788365, "grad_norm": 0.5108888745307922, "learning_rate": 2.3248804674286685e-06, "loss": 0.5932, "step": 10259 }, { "epoch": 3.926521239954076, "grad_norm": 0.5330396294593811, "learning_rate": 2.323291600696217e-06, "loss": 0.5893, "step": 10260 }, { "epoch": 3.9269039418293152, "grad_norm": 0.5537769794464111, "learning_rate": 2.321703205718119e-06, "loss": 0.6474, "step": 10261 }, { "epoch": 3.927286643704554, "grad_norm": 0.5241600275039673, "learning_rate": 2.320115282591986e-06, "loss": 0.6343, "step": 10262 }, { "epoch": 3.927669345579793, "grad_norm": 0.510458767414093, "learning_rate": 2.3185278314154046e-06, "loss": 0.5668, "step": 10263 }, { "epoch": 3.9280520474550324, "grad_norm": 0.5083804130554199, "learning_rate": 2.3169408522859216e-06, "loss": 0.5499, "step": 10264 }, { "epoch": 3.928434749330272, "grad_norm": 0.5063880085945129, "learning_rate": 2.315354345301063e-06, "loss": 0.544, "step": 10265 }, { "epoch": 3.9288174512055107, "grad_norm": 0.5195845365524292, "learning_rate": 2.313768310558324e-06, "loss": 0.5335, "step": 10266 }, { "epoch": 3.92920015308075, "grad_norm": 0.53917396068573, "learning_rate": 2.312182748155175e-06, "loss": 0.6167, "step": 10267 }, { "epoch": 3.9295828549559895, "grad_norm": 0.5208091735839844, "learning_rate": 2.3105976581890465e-06, "loss": 0.5711, "step": 10268 }, { "epoch": 3.9299655568312284, "grad_norm": 0.5651727914810181, "learning_rate": 2.3090130407573496e-06, "loss": 0.5917, "step": 10269 }, { "epoch": 3.9303482587064678, "grad_norm": 0.5384100675582886, "learning_rate": 2.307428895957463e-06, "loss": 0.5493, "step": 10270 }, { "epoch": 3.930730960581707, "grad_norm": 0.5316745638847351, "learning_rate": 2.3058452238867366e-06, "loss": 0.634, "step": 10271 }, { "epoch": 3.931113662456946, "grad_norm": 0.6553912162780762, "learning_rate": 2.304262024642492e-06, "loss": 0.6287, "step": 10272 }, { "epoch": 3.931496364332185, "grad_norm": 0.47809624671936035, "learning_rate": 2.3026792983220225e-06, "loss": 0.5718, "step": 10273 }, { "epoch": 3.9318790662074243, "grad_norm": 0.5250147581100464, "learning_rate": 2.3010970450225866e-06, "loss": 0.5491, "step": 10274 }, { "epoch": 3.9322617680826637, "grad_norm": 0.5742583870887756, "learning_rate": 2.2995152648414197e-06, "loss": 0.6854, "step": 10275 }, { "epoch": 3.9326444699579026, "grad_norm": 0.5345507264137268, "learning_rate": 2.2979339578757307e-06, "loss": 0.6742, "step": 10276 }, { "epoch": 3.933027171833142, "grad_norm": 0.5482451319694519, "learning_rate": 2.296353124222688e-06, "loss": 0.6631, "step": 10277 }, { "epoch": 3.9334098737083814, "grad_norm": 0.5518039464950562, "learning_rate": 2.2947727639794416e-06, "loss": 0.6847, "step": 10278 }, { "epoch": 3.9337925755836203, "grad_norm": 0.5536584854125977, "learning_rate": 2.293192877243109e-06, "loss": 0.6422, "step": 10279 }, { "epoch": 3.9341752774588596, "grad_norm": 0.5698908567428589, "learning_rate": 2.291613464110779e-06, "loss": 0.615, "step": 10280 }, { "epoch": 3.934557979334099, "grad_norm": 0.5635569095611572, "learning_rate": 2.2900345246795097e-06, "loss": 0.7126, "step": 10281 }, { "epoch": 3.934940681209338, "grad_norm": 0.5203247666358948, "learning_rate": 2.288456059046331e-06, "loss": 0.6643, "step": 10282 }, { "epoch": 3.935323383084577, "grad_norm": 0.5367289185523987, "learning_rate": 2.286878067308248e-06, "loss": 0.6358, "step": 10283 }, { "epoch": 3.935706084959816, "grad_norm": 0.5283297300338745, "learning_rate": 2.285300549562226e-06, "loss": 0.6224, "step": 10284 }, { "epoch": 3.9360887868350556, "grad_norm": 0.5270591378211975, "learning_rate": 2.283723505905211e-06, "loss": 0.6289, "step": 10285 }, { "epoch": 3.9364714887102945, "grad_norm": 0.5494409203529358, "learning_rate": 2.2821469364341185e-06, "loss": 0.6557, "step": 10286 }, { "epoch": 3.936854190585534, "grad_norm": 0.5505053400993347, "learning_rate": 2.2805708412458283e-06, "loss": 0.6677, "step": 10287 }, { "epoch": 3.9372368924607732, "grad_norm": 0.47401341795921326, "learning_rate": 2.278995220437198e-06, "loss": 0.584, "step": 10288 }, { "epoch": 3.937619594336012, "grad_norm": 0.5659545063972473, "learning_rate": 2.2774200741050534e-06, "loss": 0.6694, "step": 10289 }, { "epoch": 3.9380022962112515, "grad_norm": 0.6250771284103394, "learning_rate": 2.2758454023461917e-06, "loss": 0.6628, "step": 10290 }, { "epoch": 3.938384998086491, "grad_norm": 0.5038658380508423, "learning_rate": 2.274271205257381e-06, "loss": 0.5393, "step": 10291 }, { "epoch": 3.93876769996173, "grad_norm": 0.5450126528739929, "learning_rate": 2.272697482935361e-06, "loss": 0.5798, "step": 10292 }, { "epoch": 3.9391504018369687, "grad_norm": 0.5481778383255005, "learning_rate": 2.2711242354768415e-06, "loss": 0.6191, "step": 10293 }, { "epoch": 3.939533103712208, "grad_norm": 0.5331355333328247, "learning_rate": 2.2695514629784985e-06, "loss": 0.6187, "step": 10294 }, { "epoch": 3.9399158055874475, "grad_norm": 0.5811300277709961, "learning_rate": 2.2679791655369865e-06, "loss": 0.6419, "step": 10295 }, { "epoch": 3.9402985074626864, "grad_norm": 0.5927801728248596, "learning_rate": 2.26640734324893e-06, "loss": 0.5864, "step": 10296 }, { "epoch": 3.9406812093379258, "grad_norm": 0.5379248857498169, "learning_rate": 2.2648359962109146e-06, "loss": 0.6326, "step": 10297 }, { "epoch": 3.941063911213165, "grad_norm": 0.5234038233757019, "learning_rate": 2.2632651245195083e-06, "loss": 0.6393, "step": 10298 }, { "epoch": 3.941446613088404, "grad_norm": 0.57942134141922, "learning_rate": 2.2616947282712443e-06, "loss": 0.7323, "step": 10299 }, { "epoch": 3.9418293149636434, "grad_norm": 0.7518671751022339, "learning_rate": 2.260124807562628e-06, "loss": 0.6896, "step": 10300 }, { "epoch": 3.942212016838883, "grad_norm": 0.5651615262031555, "learning_rate": 2.2585553624901355e-06, "loss": 0.6324, "step": 10301 }, { "epoch": 3.9425947187141217, "grad_norm": 0.542452335357666, "learning_rate": 2.256986393150217e-06, "loss": 0.6763, "step": 10302 }, { "epoch": 3.9429774205893606, "grad_norm": 0.5451302528381348, "learning_rate": 2.2554178996392827e-06, "loss": 0.5915, "step": 10303 }, { "epoch": 3.9433601224646, "grad_norm": 0.5567128658294678, "learning_rate": 2.2538498820537246e-06, "loss": 0.6199, "step": 10304 }, { "epoch": 3.9437428243398394, "grad_norm": 0.5566142797470093, "learning_rate": 2.252282340489901e-06, "loss": 0.6287, "step": 10305 }, { "epoch": 3.9441255262150783, "grad_norm": 0.5551641583442688, "learning_rate": 2.250715275044144e-06, "loss": 0.6242, "step": 10306 }, { "epoch": 3.9445082280903176, "grad_norm": 0.6449941396713257, "learning_rate": 2.2491486858127485e-06, "loss": 0.6736, "step": 10307 }, { "epoch": 3.944890929965557, "grad_norm": 0.5952624678611755, "learning_rate": 2.247582572891991e-06, "loss": 0.6918, "step": 10308 }, { "epoch": 3.945273631840796, "grad_norm": 0.5364135503768921, "learning_rate": 2.2460169363781094e-06, "loss": 0.6726, "step": 10309 }, { "epoch": 3.9456563337160353, "grad_norm": 0.5462812185287476, "learning_rate": 2.244451776367318e-06, "loss": 0.6166, "step": 10310 }, { "epoch": 3.9460390355912747, "grad_norm": 0.5689333081245422, "learning_rate": 2.2428870929558012e-06, "loss": 0.7054, "step": 10311 }, { "epoch": 3.9464217374665136, "grad_norm": 0.5357646346092224, "learning_rate": 2.241322886239714e-06, "loss": 0.6588, "step": 10312 }, { "epoch": 3.9468044393417525, "grad_norm": 0.5493935942649841, "learning_rate": 2.2397591563151765e-06, "loss": 0.5937, "step": 10313 }, { "epoch": 3.947187141216992, "grad_norm": 0.5732094049453735, "learning_rate": 2.2381959032782864e-06, "loss": 0.6886, "step": 10314 }, { "epoch": 3.9475698430922312, "grad_norm": 0.5419829487800598, "learning_rate": 2.236633127225113e-06, "loss": 0.6629, "step": 10315 }, { "epoch": 3.94795254496747, "grad_norm": 0.5628198385238647, "learning_rate": 2.2350708282516874e-06, "loss": 0.6318, "step": 10316 }, { "epoch": 3.9483352468427095, "grad_norm": 0.5562101602554321, "learning_rate": 2.23350900645402e-06, "loss": 0.6578, "step": 10317 }, { "epoch": 3.948717948717949, "grad_norm": 0.5565147399902344, "learning_rate": 2.231947661928088e-06, "loss": 0.7321, "step": 10318 }, { "epoch": 3.949100650593188, "grad_norm": 0.6169881224632263, "learning_rate": 2.230386794769844e-06, "loss": 0.7254, "step": 10319 }, { "epoch": 3.949483352468427, "grad_norm": 0.580678403377533, "learning_rate": 2.2288264050751997e-06, "loss": 0.7689, "step": 10320 }, { "epoch": 3.9498660543436666, "grad_norm": 0.529898464679718, "learning_rate": 2.2272664929400522e-06, "loss": 0.5874, "step": 10321 }, { "epoch": 3.9502487562189055, "grad_norm": 0.5778708457946777, "learning_rate": 2.2257070584602626e-06, "loss": 0.6552, "step": 10322 }, { "epoch": 3.9506314580941444, "grad_norm": 0.5730776786804199, "learning_rate": 2.224148101731658e-06, "loss": 0.6561, "step": 10323 }, { "epoch": 3.9510141599693838, "grad_norm": 0.48563212156295776, "learning_rate": 2.222589622850041e-06, "loss": 0.573, "step": 10324 }, { "epoch": 3.951396861844623, "grad_norm": 0.564930260181427, "learning_rate": 2.221031621911189e-06, "loss": 0.6479, "step": 10325 }, { "epoch": 3.951779563719862, "grad_norm": 0.5016940236091614, "learning_rate": 2.219474099010839e-06, "loss": 0.5964, "step": 10326 }, { "epoch": 3.9521622655951014, "grad_norm": 0.572206437587738, "learning_rate": 2.217917054244708e-06, "loss": 0.6701, "step": 10327 }, { "epoch": 3.952544967470341, "grad_norm": 0.5317202210426331, "learning_rate": 2.2163604877084833e-06, "loss": 0.5977, "step": 10328 }, { "epoch": 3.9529276693455797, "grad_norm": 0.528175950050354, "learning_rate": 2.214804399497815e-06, "loss": 0.6479, "step": 10329 }, { "epoch": 3.953310371220819, "grad_norm": 0.5577924847602844, "learning_rate": 2.2132487897083276e-06, "loss": 0.5576, "step": 10330 }, { "epoch": 3.9536930730960584, "grad_norm": 0.5603327751159668, "learning_rate": 2.211693658435624e-06, "loss": 0.6176, "step": 10331 }, { "epoch": 3.9540757749712974, "grad_norm": 0.5377969741821289, "learning_rate": 2.210139005775271e-06, "loss": 0.6144, "step": 10332 }, { "epoch": 3.9544584768465363, "grad_norm": 0.5363202095031738, "learning_rate": 2.208584831822802e-06, "loss": 0.6206, "step": 10333 }, { "epoch": 3.9548411787217757, "grad_norm": 0.5396900773048401, "learning_rate": 2.207031136673725e-06, "loss": 0.5976, "step": 10334 }, { "epoch": 3.955223880597015, "grad_norm": 0.5712380409240723, "learning_rate": 2.2054779204235244e-06, "loss": 0.6838, "step": 10335 }, { "epoch": 3.955606582472254, "grad_norm": 0.5366420745849609, "learning_rate": 2.2039251831676424e-06, "loss": 0.684, "step": 10336 }, { "epoch": 3.9559892843474933, "grad_norm": 0.5090441107749939, "learning_rate": 2.202372925001501e-06, "loss": 0.5549, "step": 10337 }, { "epoch": 3.9563719862227327, "grad_norm": 0.5773622393608093, "learning_rate": 2.2008211460204952e-06, "loss": 0.6074, "step": 10338 }, { "epoch": 3.9567546880979716, "grad_norm": 0.5466211438179016, "learning_rate": 2.19926984631998e-06, "loss": 0.6615, "step": 10339 }, { "epoch": 3.957137389973211, "grad_norm": 0.4842971861362457, "learning_rate": 2.1977190259952887e-06, "loss": 0.5812, "step": 10340 }, { "epoch": 3.9575200918484503, "grad_norm": 0.532318651676178, "learning_rate": 2.196168685141724e-06, "loss": 0.6228, "step": 10341 }, { "epoch": 3.9579027937236892, "grad_norm": 0.5127707123756409, "learning_rate": 2.1946188238545596e-06, "loss": 0.5909, "step": 10342 }, { "epoch": 3.958285495598928, "grad_norm": 0.5810410976409912, "learning_rate": 2.1930694422290366e-06, "loss": 0.5957, "step": 10343 }, { "epoch": 3.9586681974741675, "grad_norm": 0.6064177751541138, "learning_rate": 2.191520540360369e-06, "loss": 0.6365, "step": 10344 }, { "epoch": 3.959050899349407, "grad_norm": 0.5603401064872742, "learning_rate": 2.189972118343745e-06, "loss": 0.6373, "step": 10345 }, { "epoch": 3.959433601224646, "grad_norm": 0.5885182619094849, "learning_rate": 2.188424176274313e-06, "loss": 0.6762, "step": 10346 }, { "epoch": 3.959816303099885, "grad_norm": 0.5229772329330444, "learning_rate": 2.1868767142472013e-06, "loss": 0.6129, "step": 10347 }, { "epoch": 3.9601990049751246, "grad_norm": 0.5604742169380188, "learning_rate": 2.1853297323575084e-06, "loss": 0.5961, "step": 10348 }, { "epoch": 3.9605817068503635, "grad_norm": 0.5496050119400024, "learning_rate": 2.183783230700294e-06, "loss": 0.6044, "step": 10349 }, { "epoch": 3.960964408725603, "grad_norm": 0.5491943955421448, "learning_rate": 2.182237209370599e-06, "loss": 0.6559, "step": 10350 }, { "epoch": 3.961347110600842, "grad_norm": 0.5782939791679382, "learning_rate": 2.1806916684634295e-06, "loss": 0.6218, "step": 10351 }, { "epoch": 3.961729812476081, "grad_norm": 0.5995064973831177, "learning_rate": 2.179146608073763e-06, "loss": 0.6834, "step": 10352 }, { "epoch": 3.96211251435132, "grad_norm": 0.5748842358589172, "learning_rate": 2.177602028296548e-06, "loss": 0.6402, "step": 10353 }, { "epoch": 3.9624952162265594, "grad_norm": 0.6232174038887024, "learning_rate": 2.176057929226707e-06, "loss": 0.5805, "step": 10354 }, { "epoch": 3.962877918101799, "grad_norm": 0.5387597680091858, "learning_rate": 2.1745143109591214e-06, "loss": 0.584, "step": 10355 }, { "epoch": 3.9632606199770377, "grad_norm": 0.516388475894928, "learning_rate": 2.172971173588655e-06, "loss": 0.6123, "step": 10356 }, { "epoch": 3.963643321852277, "grad_norm": 0.518531858921051, "learning_rate": 2.171428517210138e-06, "loss": 0.6519, "step": 10357 }, { "epoch": 3.9640260237275164, "grad_norm": 0.5338418483734131, "learning_rate": 2.1698863419183714e-06, "loss": 0.6186, "step": 10358 }, { "epoch": 3.9644087256027554, "grad_norm": 0.5393376350402832, "learning_rate": 2.1683446478081227e-06, "loss": 0.6966, "step": 10359 }, { "epoch": 3.9647914274779947, "grad_norm": 0.5607839226722717, "learning_rate": 2.166803434974134e-06, "loss": 0.6345, "step": 10360 }, { "epoch": 3.965174129353234, "grad_norm": 0.5554313063621521, "learning_rate": 2.1652627035111198e-06, "loss": 0.6846, "step": 10361 }, { "epoch": 3.965556831228473, "grad_norm": 0.5117478966712952, "learning_rate": 2.163722453513759e-06, "loss": 0.6196, "step": 10362 }, { "epoch": 3.965939533103712, "grad_norm": 0.5842755436897278, "learning_rate": 2.1621826850767056e-06, "loss": 0.597, "step": 10363 }, { "epoch": 3.9663222349789513, "grad_norm": 0.5499684810638428, "learning_rate": 2.160643398294585e-06, "loss": 0.5821, "step": 10364 }, { "epoch": 3.9667049368541907, "grad_norm": 0.5585997104644775, "learning_rate": 2.159104593261986e-06, "loss": 0.6427, "step": 10365 }, { "epoch": 3.9670876387294296, "grad_norm": 0.5433608293533325, "learning_rate": 2.1575662700734735e-06, "loss": 0.597, "step": 10366 }, { "epoch": 3.967470340604669, "grad_norm": 0.554469883441925, "learning_rate": 2.1560284288235854e-06, "loss": 0.5542, "step": 10367 }, { "epoch": 3.9678530424799083, "grad_norm": 0.5330696702003479, "learning_rate": 2.15449106960682e-06, "loss": 0.6659, "step": 10368 }, { "epoch": 3.9682357443551473, "grad_norm": 0.5294312238693237, "learning_rate": 2.1529541925176555e-06, "loss": 0.6655, "step": 10369 }, { "epoch": 3.9686184462303866, "grad_norm": 0.5391156673431396, "learning_rate": 2.1514177976505377e-06, "loss": 0.6095, "step": 10370 }, { "epoch": 3.969001148105626, "grad_norm": 0.5639857053756714, "learning_rate": 2.14988188509988e-06, "loss": 0.6346, "step": 10371 }, { "epoch": 3.969383849980865, "grad_norm": 0.529685914516449, "learning_rate": 2.1483464549600708e-06, "loss": 0.5978, "step": 10372 }, { "epoch": 3.969766551856104, "grad_norm": 0.5133771896362305, "learning_rate": 2.1468115073254646e-06, "loss": 0.5515, "step": 10373 }, { "epoch": 3.970149253731343, "grad_norm": 0.557182252407074, "learning_rate": 2.1452770422903925e-06, "loss": 0.7191, "step": 10374 }, { "epoch": 3.9705319556065826, "grad_norm": 0.5000736117362976, "learning_rate": 2.143743059949144e-06, "loss": 0.5809, "step": 10375 }, { "epoch": 3.9709146574818215, "grad_norm": 0.5554234981536865, "learning_rate": 2.1422095603959904e-06, "loss": 0.7046, "step": 10376 }, { "epoch": 3.971297359357061, "grad_norm": 0.5222229361534119, "learning_rate": 2.140676543725172e-06, "loss": 0.6882, "step": 10377 }, { "epoch": 3.9716800612323, "grad_norm": 0.5830409526824951, "learning_rate": 2.1391440100308915e-06, "loss": 0.7427, "step": 10378 }, { "epoch": 3.972062763107539, "grad_norm": 0.5336783528327942, "learning_rate": 2.1376119594073296e-06, "loss": 0.6462, "step": 10379 }, { "epoch": 3.9724454649827785, "grad_norm": 0.5297459959983826, "learning_rate": 2.136080391948635e-06, "loss": 0.6247, "step": 10380 }, { "epoch": 3.972828166858018, "grad_norm": 0.5823040008544922, "learning_rate": 2.1345493077489265e-06, "loss": 0.651, "step": 10381 }, { "epoch": 3.973210868733257, "grad_norm": 0.596892774105072, "learning_rate": 2.1330187069022934e-06, "loss": 0.6747, "step": 10382 }, { "epoch": 3.9735935706084957, "grad_norm": 0.5568455457687378, "learning_rate": 2.131488589502796e-06, "loss": 0.6308, "step": 10383 }, { "epoch": 3.973976272483735, "grad_norm": 0.6288628578186035, "learning_rate": 2.1299589556444657e-06, "loss": 0.6623, "step": 10384 }, { "epoch": 3.9743589743589745, "grad_norm": 0.5782763361930847, "learning_rate": 2.128429805421297e-06, "loss": 0.6443, "step": 10385 }, { "epoch": 3.9747416762342134, "grad_norm": 0.5654838681221008, "learning_rate": 2.1269011389272644e-06, "loss": 0.6421, "step": 10386 }, { "epoch": 3.9751243781094527, "grad_norm": 0.5046560764312744, "learning_rate": 2.1253729562563106e-06, "loss": 0.6088, "step": 10387 }, { "epoch": 3.975507079984692, "grad_norm": 0.5010104775428772, "learning_rate": 2.1238452575023406e-06, "loss": 0.6354, "step": 10388 }, { "epoch": 3.975889781859931, "grad_norm": 0.5107307434082031, "learning_rate": 2.1223180427592393e-06, "loss": 0.6748, "step": 10389 }, { "epoch": 3.9762724837351704, "grad_norm": 0.5565019845962524, "learning_rate": 2.1207913121208567e-06, "loss": 0.5875, "step": 10390 }, { "epoch": 3.9766551856104098, "grad_norm": 0.49515873193740845, "learning_rate": 2.1192650656810166e-06, "loss": 0.5859, "step": 10391 }, { "epoch": 3.9770378874856487, "grad_norm": 0.5393649339675903, "learning_rate": 2.1177393035335092e-06, "loss": 0.5953, "step": 10392 }, { "epoch": 3.9774205893608876, "grad_norm": 0.6035898327827454, "learning_rate": 2.1162140257721e-06, "loss": 0.6566, "step": 10393 }, { "epoch": 3.977803291236127, "grad_norm": 0.5482913255691528, "learning_rate": 2.1146892324905157e-06, "loss": 0.6323, "step": 10394 }, { "epoch": 3.9781859931113663, "grad_norm": 0.5387911200523376, "learning_rate": 2.1131649237824614e-06, "loss": 0.6686, "step": 10395 }, { "epoch": 3.9785686949866053, "grad_norm": 0.5369036197662354, "learning_rate": 2.1116410997416116e-06, "loss": 0.6264, "step": 10396 }, { "epoch": 3.9789513968618446, "grad_norm": 0.6009455919265747, "learning_rate": 2.1101177604616096e-06, "loss": 0.676, "step": 10397 }, { "epoch": 3.979334098737084, "grad_norm": 0.5525690317153931, "learning_rate": 2.1085949060360654e-06, "loss": 0.6507, "step": 10398 }, { "epoch": 3.979716800612323, "grad_norm": 0.5509178638458252, "learning_rate": 2.1070725365585645e-06, "loss": 0.5848, "step": 10399 }, { "epoch": 3.9800995024875623, "grad_norm": 0.5796674489974976, "learning_rate": 2.10555065212266e-06, "loss": 0.6983, "step": 10400 }, { "epoch": 3.9804822043628016, "grad_norm": 0.5482116937637329, "learning_rate": 2.1040292528218764e-06, "loss": 0.6454, "step": 10401 }, { "epoch": 3.9808649062380406, "grad_norm": 0.49484890699386597, "learning_rate": 2.102508338749708e-06, "loss": 0.5007, "step": 10402 }, { "epoch": 3.9812476081132795, "grad_norm": 0.524399995803833, "learning_rate": 2.1009879099996212e-06, "loss": 0.6233, "step": 10403 }, { "epoch": 3.981630309988519, "grad_norm": 0.5052105784416199, "learning_rate": 2.0994679666650454e-06, "loss": 0.6562, "step": 10404 }, { "epoch": 3.9820130118637582, "grad_norm": 0.5056648850440979, "learning_rate": 2.097948508839387e-06, "loss": 0.5676, "step": 10405 }, { "epoch": 3.982395713738997, "grad_norm": 0.512679398059845, "learning_rate": 2.096429536616025e-06, "loss": 0.625, "step": 10406 }, { "epoch": 3.9827784156142365, "grad_norm": 0.5254321694374084, "learning_rate": 2.0949110500882986e-06, "loss": 0.6348, "step": 10407 }, { "epoch": 3.983161117489476, "grad_norm": 0.52745121717453, "learning_rate": 2.0933930493495237e-06, "loss": 0.6003, "step": 10408 }, { "epoch": 3.983543819364715, "grad_norm": 0.5990690588951111, "learning_rate": 2.0918755344929874e-06, "loss": 0.615, "step": 10409 }, { "epoch": 3.983926521239954, "grad_norm": 0.5213233828544617, "learning_rate": 2.090358505611946e-06, "loss": 0.5643, "step": 10410 }, { "epoch": 3.9843092231151935, "grad_norm": 0.529789388179779, "learning_rate": 2.0888419627996213e-06, "loss": 0.5808, "step": 10411 }, { "epoch": 3.9846919249904325, "grad_norm": 0.579879105091095, "learning_rate": 2.0873259061492134e-06, "loss": 0.5871, "step": 10412 }, { "epoch": 3.9850746268656714, "grad_norm": 0.5155131816864014, "learning_rate": 2.085810335753887e-06, "loss": 0.6294, "step": 10413 }, { "epoch": 3.9854573287409107, "grad_norm": 0.5453059077262878, "learning_rate": 2.084295251706775e-06, "loss": 0.7371, "step": 10414 }, { "epoch": 3.98584003061615, "grad_norm": 0.530784547328949, "learning_rate": 2.0827806541009855e-06, "loss": 0.6543, "step": 10415 }, { "epoch": 3.986222732491389, "grad_norm": 0.5673373341560364, "learning_rate": 2.0812665430295963e-06, "loss": 0.6788, "step": 10416 }, { "epoch": 3.9866054343666284, "grad_norm": 0.5532338619232178, "learning_rate": 2.0797529185856492e-06, "loss": 0.6447, "step": 10417 }, { "epoch": 3.9869881362418678, "grad_norm": 0.561229407787323, "learning_rate": 2.0782397808621624e-06, "loss": 0.6288, "step": 10418 }, { "epoch": 3.9873708381171067, "grad_norm": 0.6070306897163391, "learning_rate": 2.0767271299521264e-06, "loss": 0.7212, "step": 10419 }, { "epoch": 3.987753539992346, "grad_norm": 0.6394069194793701, "learning_rate": 2.0752149659484876e-06, "loss": 0.7077, "step": 10420 }, { "epoch": 3.9881362418675854, "grad_norm": 0.5961258411407471, "learning_rate": 2.073703288944183e-06, "loss": 0.6142, "step": 10421 }, { "epoch": 3.9885189437428243, "grad_norm": 0.574331521987915, "learning_rate": 2.0721920990321043e-06, "loss": 0.6009, "step": 10422 }, { "epoch": 3.9889016456180633, "grad_norm": 0.5392223000526428, "learning_rate": 2.070681396305122e-06, "loss": 0.5869, "step": 10423 }, { "epoch": 3.9892843474933026, "grad_norm": 0.5105207562446594, "learning_rate": 2.069171180856068e-06, "loss": 0.5925, "step": 10424 }, { "epoch": 3.989667049368542, "grad_norm": 0.5497372150421143, "learning_rate": 2.0676614527777495e-06, "loss": 0.6801, "step": 10425 }, { "epoch": 3.990049751243781, "grad_norm": 0.4993487298488617, "learning_rate": 2.0661522121629485e-06, "loss": 0.601, "step": 10426 }, { "epoch": 3.9904324531190203, "grad_norm": 0.5473046898841858, "learning_rate": 2.064643459104405e-06, "loss": 0.5862, "step": 10427 }, { "epoch": 3.9908151549942597, "grad_norm": 0.5263763070106506, "learning_rate": 2.0631351936948396e-06, "loss": 0.5995, "step": 10428 }, { "epoch": 3.9911978568694986, "grad_norm": 0.4969819188117981, "learning_rate": 2.0616274160269422e-06, "loss": 0.5699, "step": 10429 }, { "epoch": 3.991580558744738, "grad_norm": 0.551617443561554, "learning_rate": 2.06012012619336e-06, "loss": 0.6647, "step": 10430 }, { "epoch": 3.9919632606199773, "grad_norm": 0.6751014590263367, "learning_rate": 2.05861332428673e-06, "loss": 0.6248, "step": 10431 }, { "epoch": 3.9923459624952162, "grad_norm": 0.5077188014984131, "learning_rate": 2.057107010399647e-06, "loss": 0.6381, "step": 10432 }, { "epoch": 3.992728664370455, "grad_norm": 0.5652092099189758, "learning_rate": 2.055601184624676e-06, "loss": 0.7453, "step": 10433 }, { "epoch": 3.9931113662456945, "grad_norm": 0.5410761833190918, "learning_rate": 2.054095847054354e-06, "loss": 0.6078, "step": 10434 }, { "epoch": 3.993494068120934, "grad_norm": 0.542711615562439, "learning_rate": 2.052590997781191e-06, "loss": 0.634, "step": 10435 }, { "epoch": 3.993876769996173, "grad_norm": 0.5450778603553772, "learning_rate": 2.0510866368976588e-06, "loss": 0.5896, "step": 10436 }, { "epoch": 3.994259471871412, "grad_norm": 0.5440341830253601, "learning_rate": 2.0495827644962076e-06, "loss": 0.5728, "step": 10437 }, { "epoch": 3.9946421737466515, "grad_norm": 0.5267189145088196, "learning_rate": 2.0480793806692544e-06, "loss": 0.6765, "step": 10438 }, { "epoch": 3.9950248756218905, "grad_norm": 0.5533367991447449, "learning_rate": 2.0465764855091898e-06, "loss": 0.5975, "step": 10439 }, { "epoch": 3.99540757749713, "grad_norm": 0.5694990754127502, "learning_rate": 2.0450740791083633e-06, "loss": 0.6225, "step": 10440 }, { "epoch": 3.995790279372369, "grad_norm": 0.581802248954773, "learning_rate": 2.043572161559102e-06, "loss": 0.6297, "step": 10441 }, { "epoch": 3.996172981247608, "grad_norm": 0.5122140049934387, "learning_rate": 2.0420707329537125e-06, "loss": 0.547, "step": 10442 }, { "epoch": 3.996555683122847, "grad_norm": 0.5403052568435669, "learning_rate": 2.0405697933844526e-06, "loss": 0.6855, "step": 10443 }, { "epoch": 3.9969383849980864, "grad_norm": 0.5239669680595398, "learning_rate": 2.0390693429435626e-06, "loss": 0.6158, "step": 10444 }, { "epoch": 3.9973210868733258, "grad_norm": 0.5393129587173462, "learning_rate": 2.03756938172325e-06, "loss": 0.6295, "step": 10445 }, { "epoch": 3.9977037887485647, "grad_norm": 0.5446615815162659, "learning_rate": 2.036069909815688e-06, "loss": 0.6221, "step": 10446 }, { "epoch": 3.998086490623804, "grad_norm": 0.5302301645278931, "learning_rate": 2.0345709273130245e-06, "loss": 0.5698, "step": 10447 }, { "epoch": 3.9984691924990434, "grad_norm": 0.5231440663337708, "learning_rate": 2.033072434307379e-06, "loss": 0.5869, "step": 10448 }, { "epoch": 3.9988518943742823, "grad_norm": 0.5510360598564148, "learning_rate": 2.031574430890834e-06, "loss": 0.6119, "step": 10449 }, { "epoch": 3.9992345962495217, "grad_norm": 0.5475967526435852, "learning_rate": 2.0300769171554457e-06, "loss": 0.5632, "step": 10450 }, { "epoch": 3.999617298124761, "grad_norm": 0.5371907949447632, "learning_rate": 2.0285798931932397e-06, "loss": 0.6355, "step": 10451 }, { "epoch": 4.0, "grad_norm": 0.5783579349517822, "learning_rate": 2.0270833590962203e-06, "loss": 0.6363, "step": 10452 }, { "epoch": 4.000382701875239, "grad_norm": 0.5167096257209778, "learning_rate": 2.0255873149563443e-06, "loss": 0.6705, "step": 10453 }, { "epoch": 4.000765403750479, "grad_norm": 0.5399066209793091, "learning_rate": 2.024091760865552e-06, "loss": 0.6375, "step": 10454 }, { "epoch": 4.001148105625718, "grad_norm": 0.5440178513526917, "learning_rate": 2.022596696915751e-06, "loss": 0.6149, "step": 10455 }, { "epoch": 4.001530807500957, "grad_norm": 1.0067723989486694, "learning_rate": 2.0211021231988103e-06, "loss": 0.6629, "step": 10456 }, { "epoch": 4.001913509376196, "grad_norm": 0.5320191383361816, "learning_rate": 2.0196080398065797e-06, "loss": 0.6745, "step": 10457 }, { "epoch": 4.002296211251435, "grad_norm": 0.515980064868927, "learning_rate": 2.0181144468308765e-06, "loss": 0.651, "step": 10458 }, { "epoch": 4.002678913126674, "grad_norm": 0.8039026856422424, "learning_rate": 2.016621344363482e-06, "loss": 0.6511, "step": 10459 }, { "epoch": 4.003061615001913, "grad_norm": 0.5522236227989197, "learning_rate": 2.0151287324961534e-06, "loss": 0.6268, "step": 10460 }, { "epoch": 4.003444316877153, "grad_norm": 0.639008641242981, "learning_rate": 2.013636611320615e-06, "loss": 0.6671, "step": 10461 }, { "epoch": 4.003827018752392, "grad_norm": 0.5505332350730896, "learning_rate": 2.012144980928562e-06, "loss": 0.6006, "step": 10462 }, { "epoch": 4.004209720627631, "grad_norm": 0.5007908344268799, "learning_rate": 2.0106538414116595e-06, "loss": 0.5182, "step": 10463 }, { "epoch": 4.004592422502871, "grad_norm": 0.5421581268310547, "learning_rate": 2.009163192861542e-06, "loss": 0.6661, "step": 10464 }, { "epoch": 4.0049751243781095, "grad_norm": 0.5402958393096924, "learning_rate": 2.007673035369815e-06, "loss": 0.6135, "step": 10465 }, { "epoch": 4.0053578262533485, "grad_norm": 0.5606939792633057, "learning_rate": 2.0061833690280496e-06, "loss": 0.5939, "step": 10466 }, { "epoch": 4.005740528128588, "grad_norm": 0.5407587289810181, "learning_rate": 2.004694193927791e-06, "loss": 0.6456, "step": 10467 }, { "epoch": 4.006123230003827, "grad_norm": 0.5840094089508057, "learning_rate": 2.0032055101605564e-06, "loss": 0.643, "step": 10468 }, { "epoch": 4.006505931879066, "grad_norm": 0.6343135237693787, "learning_rate": 2.001717317817824e-06, "loss": 0.6309, "step": 10469 }, { "epoch": 4.006888633754305, "grad_norm": 0.5830726027488708, "learning_rate": 2.0002296169910495e-06, "loss": 0.6691, "step": 10470 }, { "epoch": 4.007271335629545, "grad_norm": 0.5709691643714905, "learning_rate": 1.9987424077716566e-06, "loss": 0.6521, "step": 10471 }, { "epoch": 4.007654037504784, "grad_norm": 0.5558174252510071, "learning_rate": 1.997255690251039e-06, "loss": 0.6222, "step": 10472 }, { "epoch": 4.008036739380023, "grad_norm": 0.5619134902954102, "learning_rate": 1.995769464520557e-06, "loss": 0.6533, "step": 10473 }, { "epoch": 4.0084194412552625, "grad_norm": 0.5951761603355408, "learning_rate": 1.994283730671548e-06, "loss": 0.7264, "step": 10474 }, { "epoch": 4.008802143130501, "grad_norm": 0.5811408758163452, "learning_rate": 1.992798488795308e-06, "loss": 0.6473, "step": 10475 }, { "epoch": 4.00918484500574, "grad_norm": 0.5966352820396423, "learning_rate": 1.9913137389831117e-06, "loss": 0.6782, "step": 10476 }, { "epoch": 4.00956754688098, "grad_norm": 0.5275029540061951, "learning_rate": 1.989829481326202e-06, "loss": 0.6535, "step": 10477 }, { "epoch": 4.009950248756219, "grad_norm": 0.49894699454307556, "learning_rate": 1.9883457159157925e-06, "loss": 0.5853, "step": 10478 }, { "epoch": 4.010332950631458, "grad_norm": 0.5361876487731934, "learning_rate": 1.9868624428430594e-06, "loss": 0.585, "step": 10479 }, { "epoch": 4.010715652506697, "grad_norm": 0.5506048202514648, "learning_rate": 1.9853796621991563e-06, "loss": 0.5444, "step": 10480 }, { "epoch": 4.011098354381937, "grad_norm": 0.5402643084526062, "learning_rate": 1.9838973740752044e-06, "loss": 0.5793, "step": 10481 }, { "epoch": 4.011481056257176, "grad_norm": 0.4819026291370392, "learning_rate": 1.9824155785622946e-06, "loss": 0.5796, "step": 10482 }, { "epoch": 4.011863758132415, "grad_norm": 0.5369727611541748, "learning_rate": 1.980934275751485e-06, "loss": 0.6431, "step": 10483 }, { "epoch": 4.012246460007654, "grad_norm": 0.5260148048400879, "learning_rate": 1.9794534657338117e-06, "loss": 0.5446, "step": 10484 }, { "epoch": 4.012629161882893, "grad_norm": 0.5239769816398621, "learning_rate": 1.9779731486002664e-06, "loss": 0.5836, "step": 10485 }, { "epoch": 4.013011863758132, "grad_norm": 0.5360181927680969, "learning_rate": 1.9764933244418217e-06, "loss": 0.6248, "step": 10486 }, { "epoch": 4.013394565633372, "grad_norm": 0.5321371555328369, "learning_rate": 1.9750139933494206e-06, "loss": 0.604, "step": 10487 }, { "epoch": 4.013777267508611, "grad_norm": 0.5782116651535034, "learning_rate": 1.9735351554139657e-06, "loss": 0.6243, "step": 10488 }, { "epoch": 4.01415996938385, "grad_norm": 0.5230002403259277, "learning_rate": 1.972056810726339e-06, "loss": 0.5805, "step": 10489 }, { "epoch": 4.014542671259089, "grad_norm": 0.5318050384521484, "learning_rate": 1.970578959377388e-06, "loss": 0.6195, "step": 10490 }, { "epoch": 4.014925373134329, "grad_norm": 0.49777573347091675, "learning_rate": 1.969101601457931e-06, "loss": 0.6483, "step": 10491 }, { "epoch": 4.0153080750095675, "grad_norm": 0.528103768825531, "learning_rate": 1.9676247370587564e-06, "loss": 0.6171, "step": 10492 }, { "epoch": 4.0156907768848065, "grad_norm": 0.5139891505241394, "learning_rate": 1.9661483662706194e-06, "loss": 0.5323, "step": 10493 }, { "epoch": 4.016073478760046, "grad_norm": 0.5474322438240051, "learning_rate": 1.9646724891842517e-06, "loss": 0.6812, "step": 10494 }, { "epoch": 4.016456180635285, "grad_norm": 0.5428072810173035, "learning_rate": 1.9631971058903423e-06, "loss": 0.6432, "step": 10495 }, { "epoch": 4.016838882510524, "grad_norm": 0.5068820118904114, "learning_rate": 1.9617222164795624e-06, "loss": 0.6629, "step": 10496 }, { "epoch": 4.017221584385764, "grad_norm": 0.564373791217804, "learning_rate": 1.9602478210425503e-06, "loss": 0.6721, "step": 10497 }, { "epoch": 4.017604286261003, "grad_norm": 0.5536541938781738, "learning_rate": 1.958773919669905e-06, "loss": 0.6071, "step": 10498 }, { "epoch": 4.017986988136242, "grad_norm": 0.5286252498626709, "learning_rate": 1.9573005124522047e-06, "loss": 0.5574, "step": 10499 }, { "epoch": 4.018369690011481, "grad_norm": 0.537168562412262, "learning_rate": 1.9558275994799948e-06, "loss": 0.6428, "step": 10500 }, { "epoch": 4.0187523918867205, "grad_norm": 0.5333830714225769, "learning_rate": 1.95435518084379e-06, "loss": 0.5887, "step": 10501 }, { "epoch": 4.019135093761959, "grad_norm": 0.5332955718040466, "learning_rate": 1.9528832566340726e-06, "loss": 0.5807, "step": 10502 }, { "epoch": 4.019517795637198, "grad_norm": 0.532995343208313, "learning_rate": 1.9514118269412984e-06, "loss": 0.6192, "step": 10503 }, { "epoch": 4.019900497512438, "grad_norm": 0.5651121139526367, "learning_rate": 1.9499408918558924e-06, "loss": 0.6586, "step": 10504 }, { "epoch": 4.020283199387677, "grad_norm": 0.5515106916427612, "learning_rate": 1.948470451468243e-06, "loss": 0.6163, "step": 10505 }, { "epoch": 4.020665901262916, "grad_norm": 0.5385189056396484, "learning_rate": 1.9470005058687146e-06, "loss": 0.5719, "step": 10506 }, { "epoch": 4.021048603138156, "grad_norm": 0.5248370170593262, "learning_rate": 1.9455310551476416e-06, "loss": 0.602, "step": 10507 }, { "epoch": 4.021431305013395, "grad_norm": 0.5839464664459229, "learning_rate": 1.944062099395321e-06, "loss": 0.7405, "step": 10508 }, { "epoch": 4.021814006888634, "grad_norm": 0.5148890018463135, "learning_rate": 1.9425936387020262e-06, "loss": 0.5804, "step": 10509 }, { "epoch": 4.022196708763873, "grad_norm": 0.518677830696106, "learning_rate": 1.9411256731579985e-06, "loss": 0.6598, "step": 10510 }, { "epoch": 4.022579410639112, "grad_norm": 0.5700747966766357, "learning_rate": 1.9396582028534493e-06, "loss": 0.5856, "step": 10511 }, { "epoch": 4.022962112514351, "grad_norm": 0.5112847089767456, "learning_rate": 1.9381912278785565e-06, "loss": 0.6431, "step": 10512 }, { "epoch": 4.02334481438959, "grad_norm": 0.5496962666511536, "learning_rate": 1.9367247483234743e-06, "loss": 0.678, "step": 10513 }, { "epoch": 4.02372751626483, "grad_norm": 0.5175562500953674, "learning_rate": 1.9352587642783137e-06, "loss": 0.5416, "step": 10514 }, { "epoch": 4.024110218140069, "grad_norm": 0.5456050038337708, "learning_rate": 1.9337932758331703e-06, "loss": 0.5397, "step": 10515 }, { "epoch": 4.024492920015308, "grad_norm": 0.49539896845817566, "learning_rate": 1.9323282830780976e-06, "loss": 0.6469, "step": 10516 }, { "epoch": 4.024875621890548, "grad_norm": 0.5613133311271667, "learning_rate": 1.93086378610313e-06, "loss": 0.6698, "step": 10517 }, { "epoch": 4.025258323765787, "grad_norm": 0.5911003351211548, "learning_rate": 1.929399784998257e-06, "loss": 0.6426, "step": 10518 }, { "epoch": 4.0256410256410255, "grad_norm": 0.5525010228157043, "learning_rate": 1.9279362798534486e-06, "loss": 0.6401, "step": 10519 }, { "epoch": 4.0260237275162645, "grad_norm": 0.519534170627594, "learning_rate": 1.926473270758642e-06, "loss": 0.5167, "step": 10520 }, { "epoch": 4.026406429391504, "grad_norm": 0.5242161154747009, "learning_rate": 1.9250107578037412e-06, "loss": 0.6364, "step": 10521 }, { "epoch": 4.026789131266743, "grad_norm": 0.5434394478797913, "learning_rate": 1.9235487410786245e-06, "loss": 0.6529, "step": 10522 }, { "epoch": 4.027171833141982, "grad_norm": 0.5217671394348145, "learning_rate": 1.9220872206731367e-06, "loss": 0.6456, "step": 10523 }, { "epoch": 4.027554535017222, "grad_norm": 0.5459175109863281, "learning_rate": 1.920626196677087e-06, "loss": 0.611, "step": 10524 }, { "epoch": 4.027937236892461, "grad_norm": 0.5421363115310669, "learning_rate": 1.919165669180264e-06, "loss": 0.6491, "step": 10525 }, { "epoch": 4.0283199387677, "grad_norm": 0.5232968330383301, "learning_rate": 1.9177056382724212e-06, "loss": 0.5749, "step": 10526 }, { "epoch": 4.02870264064294, "grad_norm": 0.5048580765724182, "learning_rate": 1.916246104043278e-06, "loss": 0.5727, "step": 10527 }, { "epoch": 4.0290853425181785, "grad_norm": 0.5301661491394043, "learning_rate": 1.9147870665825284e-06, "loss": 0.6556, "step": 10528 }, { "epoch": 4.029468044393417, "grad_norm": 0.5334673523902893, "learning_rate": 1.913328525979834e-06, "loss": 0.5836, "step": 10529 }, { "epoch": 4.029850746268656, "grad_norm": 0.5152831077575684, "learning_rate": 1.9118704823248267e-06, "loss": 0.6144, "step": 10530 }, { "epoch": 4.030233448143896, "grad_norm": 0.5292565226554871, "learning_rate": 1.9104129357071077e-06, "loss": 0.5603, "step": 10531 }, { "epoch": 4.030616150019135, "grad_norm": 0.5702510476112366, "learning_rate": 1.908955886216246e-06, "loss": 0.5759, "step": 10532 }, { "epoch": 4.030998851894374, "grad_norm": 0.5429449081420898, "learning_rate": 1.9074993339417837e-06, "loss": 0.6337, "step": 10533 }, { "epoch": 4.031381553769614, "grad_norm": 0.5213263034820557, "learning_rate": 1.9060432789732253e-06, "loss": 0.5999, "step": 10534 }, { "epoch": 4.031764255644853, "grad_norm": 0.5667542815208435, "learning_rate": 1.904587721400052e-06, "loss": 0.6568, "step": 10535 }, { "epoch": 4.032146957520092, "grad_norm": 0.5461099743843079, "learning_rate": 1.903132661311714e-06, "loss": 0.5823, "step": 10536 }, { "epoch": 4.0325296593953315, "grad_norm": 0.6333910226821899, "learning_rate": 1.9016780987976235e-06, "loss": 0.7065, "step": 10537 }, { "epoch": 4.03291236127057, "grad_norm": 0.5873038172721863, "learning_rate": 1.90022403394717e-06, "loss": 0.5667, "step": 10538 }, { "epoch": 4.033295063145809, "grad_norm": 0.5034006834030151, "learning_rate": 1.898770466849712e-06, "loss": 0.6205, "step": 10539 }, { "epoch": 4.033677765021048, "grad_norm": 0.5329110622406006, "learning_rate": 1.8973173975945703e-06, "loss": 0.6263, "step": 10540 }, { "epoch": 4.034060466896288, "grad_norm": 0.5652744174003601, "learning_rate": 1.895864826271039e-06, "loss": 0.6419, "step": 10541 }, { "epoch": 4.034443168771527, "grad_norm": 0.491362065076828, "learning_rate": 1.8944127529683887e-06, "loss": 0.6121, "step": 10542 }, { "epoch": 4.034825870646766, "grad_norm": 0.5420511364936829, "learning_rate": 1.8929611777758528e-06, "loss": 0.6609, "step": 10543 }, { "epoch": 4.035208572522006, "grad_norm": 0.6061182022094727, "learning_rate": 1.8915101007826297e-06, "loss": 0.5656, "step": 10544 }, { "epoch": 4.035591274397245, "grad_norm": 0.5164005160331726, "learning_rate": 1.890059522077895e-06, "loss": 0.6498, "step": 10545 }, { "epoch": 4.0359739762724836, "grad_norm": 0.5176419019699097, "learning_rate": 1.8886094417507916e-06, "loss": 0.6647, "step": 10546 }, { "epoch": 4.0363566781477225, "grad_norm": 0.5578771233558655, "learning_rate": 1.8871598598904272e-06, "loss": 0.5444, "step": 10547 }, { "epoch": 4.036739380022962, "grad_norm": 0.5423886775970459, "learning_rate": 1.885710776585884e-06, "loss": 0.6148, "step": 10548 }, { "epoch": 4.037122081898201, "grad_norm": 0.5001776218414307, "learning_rate": 1.884262191926215e-06, "loss": 0.5594, "step": 10549 }, { "epoch": 4.03750478377344, "grad_norm": 0.5399124026298523, "learning_rate": 1.8828141060004347e-06, "loss": 0.5916, "step": 10550 }, { "epoch": 4.03788748564868, "grad_norm": 0.5368645191192627, "learning_rate": 1.8813665188975316e-06, "loss": 0.6031, "step": 10551 }, { "epoch": 4.038270187523919, "grad_norm": 0.5427919030189514, "learning_rate": 1.8799194307064716e-06, "loss": 0.5896, "step": 10552 }, { "epoch": 4.038652889399158, "grad_norm": 0.5258603692054749, "learning_rate": 1.8784728415161757e-06, "loss": 0.618, "step": 10553 }, { "epoch": 4.039035591274398, "grad_norm": 0.6026743650436401, "learning_rate": 1.8770267514155405e-06, "loss": 0.6449, "step": 10554 }, { "epoch": 4.0394182931496365, "grad_norm": 0.5446630716323853, "learning_rate": 1.875581160493435e-06, "loss": 0.6016, "step": 10555 }, { "epoch": 4.039800995024875, "grad_norm": 0.5736079812049866, "learning_rate": 1.8741360688386956e-06, "loss": 0.6062, "step": 10556 }, { "epoch": 4.040183696900115, "grad_norm": 0.5646123886108398, "learning_rate": 1.8726914765401217e-06, "loss": 0.6193, "step": 10557 }, { "epoch": 4.040566398775354, "grad_norm": 0.6129902005195618, "learning_rate": 1.8712473836864908e-06, "loss": 0.6081, "step": 10558 }, { "epoch": 4.040949100650593, "grad_norm": 0.5474441647529602, "learning_rate": 1.8698037903665479e-06, "loss": 0.6364, "step": 10559 }, { "epoch": 4.041331802525832, "grad_norm": 0.5006536245346069, "learning_rate": 1.8683606966690026e-06, "loss": 0.5874, "step": 10560 }, { "epoch": 4.041714504401072, "grad_norm": 0.5650124549865723, "learning_rate": 1.8669181026825367e-06, "loss": 0.5985, "step": 10561 }, { "epoch": 4.042097206276311, "grad_norm": 0.5143659710884094, "learning_rate": 1.8654760084958035e-06, "loss": 0.5397, "step": 10562 }, { "epoch": 4.04247990815155, "grad_norm": 0.5583959817886353, "learning_rate": 1.8640344141974232e-06, "loss": 0.5908, "step": 10563 }, { "epoch": 4.0428626100267895, "grad_norm": 0.5363191366195679, "learning_rate": 1.862593319875985e-06, "loss": 0.6327, "step": 10564 }, { "epoch": 4.043245311902028, "grad_norm": 0.5013111233711243, "learning_rate": 1.8611527256200512e-06, "loss": 0.5447, "step": 10565 }, { "epoch": 4.043628013777267, "grad_norm": 0.5212194323539734, "learning_rate": 1.8597126315181435e-06, "loss": 0.5706, "step": 10566 }, { "epoch": 4.044010715652507, "grad_norm": 0.5423014760017395, "learning_rate": 1.8582730376587644e-06, "loss": 0.5796, "step": 10567 }, { "epoch": 4.044393417527746, "grad_norm": 0.5901000499725342, "learning_rate": 1.8568339441303807e-06, "loss": 0.6125, "step": 10568 }, { "epoch": 4.044776119402985, "grad_norm": 0.5832827091217041, "learning_rate": 1.8553953510214285e-06, "loss": 0.7442, "step": 10569 }, { "epoch": 4.045158821278224, "grad_norm": 0.5744256377220154, "learning_rate": 1.8539572584203114e-06, "loss": 0.6596, "step": 10570 }, { "epoch": 4.045541523153464, "grad_norm": 0.5066943168640137, "learning_rate": 1.8525196664154032e-06, "loss": 0.6476, "step": 10571 }, { "epoch": 4.045924225028703, "grad_norm": 0.6044012308120728, "learning_rate": 1.8510825750950512e-06, "loss": 0.719, "step": 10572 }, { "epoch": 4.046306926903942, "grad_norm": 0.5629158020019531, "learning_rate": 1.8496459845475668e-06, "loss": 0.5902, "step": 10573 }, { "epoch": 4.046689628779181, "grad_norm": 0.5577785968780518, "learning_rate": 1.8482098948612314e-06, "loss": 0.7067, "step": 10574 }, { "epoch": 4.04707233065442, "grad_norm": 0.49698564410209656, "learning_rate": 1.8467743061243015e-06, "loss": 0.5091, "step": 10575 }, { "epoch": 4.047455032529659, "grad_norm": 0.5328636765480042, "learning_rate": 1.8453392184249908e-06, "loss": 0.6053, "step": 10576 }, { "epoch": 4.047837734404899, "grad_norm": 0.5526072978973389, "learning_rate": 1.843904631851492e-06, "loss": 0.5852, "step": 10577 }, { "epoch": 4.048220436280138, "grad_norm": 0.5340604186058044, "learning_rate": 1.8424705464919678e-06, "loss": 0.6159, "step": 10578 }, { "epoch": 4.048603138155377, "grad_norm": 0.5704076886177063, "learning_rate": 1.8410369624345415e-06, "loss": 0.6451, "step": 10579 }, { "epoch": 4.048985840030616, "grad_norm": 0.5626938343048096, "learning_rate": 1.8396038797673133e-06, "loss": 0.5853, "step": 10580 }, { "epoch": 4.049368541905856, "grad_norm": 0.5338062047958374, "learning_rate": 1.8381712985783485e-06, "loss": 0.6278, "step": 10581 }, { "epoch": 4.0497512437810945, "grad_norm": 0.555102527141571, "learning_rate": 1.8367392189556843e-06, "loss": 0.5797, "step": 10582 }, { "epoch": 4.0501339456563334, "grad_norm": 0.5545079112052917, "learning_rate": 1.835307640987326e-06, "loss": 0.7078, "step": 10583 }, { "epoch": 4.050516647531573, "grad_norm": 0.5209150910377502, "learning_rate": 1.8338765647612478e-06, "loss": 0.5565, "step": 10584 }, { "epoch": 4.050899349406812, "grad_norm": 0.540971577167511, "learning_rate": 1.8324459903653945e-06, "loss": 0.6589, "step": 10585 }, { "epoch": 4.051282051282051, "grad_norm": 0.5369868278503418, "learning_rate": 1.8310159178876752e-06, "loss": 0.6428, "step": 10586 }, { "epoch": 4.051664753157291, "grad_norm": 0.5619657039642334, "learning_rate": 1.8295863474159725e-06, "loss": 0.6433, "step": 10587 }, { "epoch": 4.05204745503253, "grad_norm": 0.4965663254261017, "learning_rate": 1.828157279038143e-06, "loss": 0.5881, "step": 10588 }, { "epoch": 4.052430156907769, "grad_norm": 0.5412299633026123, "learning_rate": 1.8267287128419986e-06, "loss": 0.6635, "step": 10589 }, { "epoch": 4.052812858783008, "grad_norm": 0.5518292188644409, "learning_rate": 1.8253006489153323e-06, "loss": 0.628, "step": 10590 }, { "epoch": 4.0531955606582475, "grad_norm": 0.4903489351272583, "learning_rate": 1.8238730873459021e-06, "loss": 0.6095, "step": 10591 }, { "epoch": 4.053578262533486, "grad_norm": 0.5516908764839172, "learning_rate": 1.8224460282214362e-06, "loss": 0.603, "step": 10592 }, { "epoch": 4.053960964408725, "grad_norm": 0.5079843997955322, "learning_rate": 1.8210194716296303e-06, "loss": 0.581, "step": 10593 }, { "epoch": 4.054343666283965, "grad_norm": 0.5539417862892151, "learning_rate": 1.8195934176581508e-06, "loss": 0.6263, "step": 10594 }, { "epoch": 4.054726368159204, "grad_norm": 0.5393332242965698, "learning_rate": 1.8181678663946355e-06, "loss": 0.6382, "step": 10595 }, { "epoch": 4.055109070034443, "grad_norm": 0.49971267580986023, "learning_rate": 1.816742817926682e-06, "loss": 0.603, "step": 10596 }, { "epoch": 4.055491771909683, "grad_norm": 0.5297471284866333, "learning_rate": 1.8153182723418672e-06, "loss": 0.65, "step": 10597 }, { "epoch": 4.055874473784922, "grad_norm": 0.5797608494758606, "learning_rate": 1.813894229727734e-06, "loss": 0.6544, "step": 10598 }, { "epoch": 4.056257175660161, "grad_norm": 0.5969451665878296, "learning_rate": 1.8124706901717903e-06, "loss": 0.6634, "step": 10599 }, { "epoch": 4.0566398775354, "grad_norm": 0.540961742401123, "learning_rate": 1.8110476537615195e-06, "loss": 0.6438, "step": 10600 }, { "epoch": 4.057022579410639, "grad_norm": 0.5947835445404053, "learning_rate": 1.8096251205843685e-06, "loss": 0.6459, "step": 10601 }, { "epoch": 4.057405281285878, "grad_norm": 0.5539694428443909, "learning_rate": 1.808203090727758e-06, "loss": 0.555, "step": 10602 }, { "epoch": 4.057787983161117, "grad_norm": 0.5879793763160706, "learning_rate": 1.8067815642790743e-06, "loss": 0.6376, "step": 10603 }, { "epoch": 4.058170685036357, "grad_norm": 0.5253616571426392, "learning_rate": 1.805360541325677e-06, "loss": 0.6461, "step": 10604 }, { "epoch": 4.058553386911596, "grad_norm": 0.5145171284675598, "learning_rate": 1.8039400219548876e-06, "loss": 0.6022, "step": 10605 }, { "epoch": 4.058936088786835, "grad_norm": 0.5042438507080078, "learning_rate": 1.8025200062540015e-06, "loss": 0.577, "step": 10606 }, { "epoch": 4.059318790662075, "grad_norm": 0.5495544672012329, "learning_rate": 1.801100494310284e-06, "loss": 0.6391, "step": 10607 }, { "epoch": 4.059701492537314, "grad_norm": 0.5633994936943054, "learning_rate": 1.799681486210969e-06, "loss": 0.6303, "step": 10608 }, { "epoch": 4.0600841944125525, "grad_norm": 0.5371735692024231, "learning_rate": 1.7982629820432552e-06, "loss": 0.5883, "step": 10609 }, { "epoch": 4.0604668962877914, "grad_norm": 0.5174639821052551, "learning_rate": 1.7968449818943146e-06, "loss": 0.6674, "step": 10610 }, { "epoch": 4.060849598163031, "grad_norm": 0.5165132880210876, "learning_rate": 1.7954274858512877e-06, "loss": 0.6591, "step": 10611 }, { "epoch": 4.06123230003827, "grad_norm": 0.5381743907928467, "learning_rate": 1.7940104940012827e-06, "loss": 0.5521, "step": 10612 }, { "epoch": 4.061615001913509, "grad_norm": 0.5335759520530701, "learning_rate": 1.7925940064313795e-06, "loss": 0.6566, "step": 10613 }, { "epoch": 4.061997703788749, "grad_norm": 0.5615806579589844, "learning_rate": 1.7911780232286247e-06, "loss": 0.6483, "step": 10614 }, { "epoch": 4.062380405663988, "grad_norm": 0.530494213104248, "learning_rate": 1.7897625444800314e-06, "loss": 0.5929, "step": 10615 }, { "epoch": 4.062763107539227, "grad_norm": 0.5681344866752625, "learning_rate": 1.7883475702725871e-06, "loss": 0.5889, "step": 10616 }, { "epoch": 4.063145809414467, "grad_norm": 0.5448426604270935, "learning_rate": 1.7869331006932467e-06, "loss": 0.6739, "step": 10617 }, { "epoch": 4.0635285112897055, "grad_norm": 0.5238112211227417, "learning_rate": 1.7855191358289293e-06, "loss": 0.5639, "step": 10618 }, { "epoch": 4.063911213164944, "grad_norm": 0.584032416343689, "learning_rate": 1.7841056757665298e-06, "loss": 0.6875, "step": 10619 }, { "epoch": 4.064293915040183, "grad_norm": 0.5535657405853271, "learning_rate": 1.782692720592908e-06, "loss": 0.573, "step": 10620 }, { "epoch": 4.064676616915423, "grad_norm": 0.5496698617935181, "learning_rate": 1.781280270394894e-06, "loss": 0.6225, "step": 10621 }, { "epoch": 4.065059318790662, "grad_norm": 0.5779097676277161, "learning_rate": 1.7798683252592875e-06, "loss": 0.5907, "step": 10622 }, { "epoch": 4.065442020665901, "grad_norm": 0.5955173373222351, "learning_rate": 1.7784568852728545e-06, "loss": 0.6101, "step": 10623 }, { "epoch": 4.065824722541141, "grad_norm": 0.5483471751213074, "learning_rate": 1.777045950522337e-06, "loss": 0.557, "step": 10624 }, { "epoch": 4.06620742441638, "grad_norm": 0.5072681903839111, "learning_rate": 1.7756355210944332e-06, "loss": 0.6482, "step": 10625 }, { "epoch": 4.066590126291619, "grad_norm": 0.5386958122253418, "learning_rate": 1.7742255970758216e-06, "loss": 0.5682, "step": 10626 }, { "epoch": 4.066972828166858, "grad_norm": 0.5097566246986389, "learning_rate": 1.7728161785531483e-06, "loss": 0.6052, "step": 10627 }, { "epoch": 4.067355530042097, "grad_norm": 0.5507000088691711, "learning_rate": 1.7714072656130199e-06, "loss": 0.6177, "step": 10628 }, { "epoch": 4.067738231917336, "grad_norm": 0.566310465335846, "learning_rate": 1.7699988583420214e-06, "loss": 0.633, "step": 10629 }, { "epoch": 4.068120933792575, "grad_norm": 0.5569844841957092, "learning_rate": 1.7685909568267034e-06, "loss": 0.6177, "step": 10630 }, { "epoch": 4.068503635667815, "grad_norm": 0.5271821022033691, "learning_rate": 1.7671835611535838e-06, "loss": 0.6603, "step": 10631 }, { "epoch": 4.068886337543054, "grad_norm": 0.5444648265838623, "learning_rate": 1.7657766714091517e-06, "loss": 0.6197, "step": 10632 }, { "epoch": 4.069269039418293, "grad_norm": 0.5607817769050598, "learning_rate": 1.7643702876798652e-06, "loss": 0.6283, "step": 10633 }, { "epoch": 4.069651741293533, "grad_norm": 0.5464797616004944, "learning_rate": 1.7629644100521516e-06, "loss": 0.6142, "step": 10634 }, { "epoch": 4.070034443168772, "grad_norm": 0.5789594650268555, "learning_rate": 1.7615590386124004e-06, "loss": 0.6003, "step": 10635 }, { "epoch": 4.0704171450440105, "grad_norm": 0.5785678625106812, "learning_rate": 1.7601541734469795e-06, "loss": 0.6715, "step": 10636 }, { "epoch": 4.07079984691925, "grad_norm": 0.49239009618759155, "learning_rate": 1.7587498146422245e-06, "loss": 0.6274, "step": 10637 }, { "epoch": 4.071182548794489, "grad_norm": 0.5918996930122375, "learning_rate": 1.7573459622844292e-06, "loss": 0.6615, "step": 10638 }, { "epoch": 4.071565250669728, "grad_norm": 0.5402877926826477, "learning_rate": 1.7559426164598692e-06, "loss": 0.5634, "step": 10639 }, { "epoch": 4.071947952544967, "grad_norm": 0.5708256959915161, "learning_rate": 1.7545397772547846e-06, "loss": 0.6707, "step": 10640 }, { "epoch": 4.072330654420207, "grad_norm": 0.5527368187904358, "learning_rate": 1.7531374447553784e-06, "loss": 0.7058, "step": 10641 }, { "epoch": 4.072713356295446, "grad_norm": 0.5718826055526733, "learning_rate": 1.751735619047833e-06, "loss": 0.6199, "step": 10642 }, { "epoch": 4.073096058170685, "grad_norm": 0.5491909980773926, "learning_rate": 1.750334300218295e-06, "loss": 0.6465, "step": 10643 }, { "epoch": 4.073478760045925, "grad_norm": 0.5808547735214233, "learning_rate": 1.7489334883528742e-06, "loss": 0.6131, "step": 10644 }, { "epoch": 4.0738614619211635, "grad_norm": 0.5922839045524597, "learning_rate": 1.7475331835376575e-06, "loss": 0.6489, "step": 10645 }, { "epoch": 4.074244163796402, "grad_norm": 0.535534143447876, "learning_rate": 1.7461333858586993e-06, "loss": 0.6108, "step": 10646 }, { "epoch": 4.074626865671641, "grad_norm": 0.5540947318077087, "learning_rate": 1.7447340954020164e-06, "loss": 0.6379, "step": 10647 }, { "epoch": 4.075009567546881, "grad_norm": 0.5576249361038208, "learning_rate": 1.7433353122536e-06, "loss": 0.636, "step": 10648 }, { "epoch": 4.07539226942212, "grad_norm": 0.5547431111335754, "learning_rate": 1.7419370364994116e-06, "loss": 0.5647, "step": 10649 }, { "epoch": 4.075774971297359, "grad_norm": 0.5509272217750549, "learning_rate": 1.7405392682253786e-06, "loss": 0.6428, "step": 10650 }, { "epoch": 4.076157673172599, "grad_norm": 0.5360444188117981, "learning_rate": 1.7391420075173915e-06, "loss": 0.5864, "step": 10651 }, { "epoch": 4.076540375047838, "grad_norm": 0.511132001876831, "learning_rate": 1.7377452544613237e-06, "loss": 0.5926, "step": 10652 }, { "epoch": 4.076923076923077, "grad_norm": 0.5733149647712708, "learning_rate": 1.736349009143009e-06, "loss": 0.7043, "step": 10653 }, { "epoch": 4.0773057787983165, "grad_norm": 0.532463014125824, "learning_rate": 1.7349532716482452e-06, "loss": 0.5895, "step": 10654 }, { "epoch": 4.077688480673555, "grad_norm": 0.6168012022972107, "learning_rate": 1.7335580420628073e-06, "loss": 0.6644, "step": 10655 }, { "epoch": 4.078071182548794, "grad_norm": 0.5145155191421509, "learning_rate": 1.7321633204724375e-06, "loss": 0.6174, "step": 10656 }, { "epoch": 4.078453884424034, "grad_norm": 0.5551841259002686, "learning_rate": 1.7307691069628418e-06, "loss": 0.6551, "step": 10657 }, { "epoch": 4.078836586299273, "grad_norm": 0.5851531624794006, "learning_rate": 1.7293754016196983e-06, "loss": 0.6063, "step": 10658 }, { "epoch": 4.079219288174512, "grad_norm": 0.5601384043693542, "learning_rate": 1.7279822045286577e-06, "loss": 0.6474, "step": 10659 }, { "epoch": 4.079601990049751, "grad_norm": 0.5141785144805908, "learning_rate": 1.7265895157753321e-06, "loss": 0.5595, "step": 10660 }, { "epoch": 4.079984691924991, "grad_norm": 0.5133126974105835, "learning_rate": 1.7251973354453055e-06, "loss": 0.5732, "step": 10661 }, { "epoch": 4.08036739380023, "grad_norm": 0.5366511940956116, "learning_rate": 1.7238056636241318e-06, "loss": 0.6578, "step": 10662 }, { "epoch": 4.0807500956754685, "grad_norm": 0.5829437375068665, "learning_rate": 1.722414500397338e-06, "loss": 0.5514, "step": 10663 }, { "epoch": 4.081132797550708, "grad_norm": 0.5475204586982727, "learning_rate": 1.7210238458504091e-06, "loss": 0.624, "step": 10664 }, { "epoch": 4.081515499425947, "grad_norm": 0.535976231098175, "learning_rate": 1.7196337000688056e-06, "loss": 0.6306, "step": 10665 }, { "epoch": 4.081898201301186, "grad_norm": 0.5376891493797302, "learning_rate": 1.7182440631379594e-06, "loss": 0.5785, "step": 10666 }, { "epoch": 4.082280903176426, "grad_norm": 0.5165416598320007, "learning_rate": 1.7168549351432618e-06, "loss": 0.6204, "step": 10667 }, { "epoch": 4.082663605051665, "grad_norm": 0.5908911228179932, "learning_rate": 1.7154663161700801e-06, "loss": 0.6545, "step": 10668 }, { "epoch": 4.083046306926904, "grad_norm": 0.5940994620323181, "learning_rate": 1.7140782063037532e-06, "loss": 0.5916, "step": 10669 }, { "epoch": 4.083429008802143, "grad_norm": 0.5262715816497803, "learning_rate": 1.7126906056295778e-06, "loss": 0.5548, "step": 10670 }, { "epoch": 4.083811710677383, "grad_norm": 0.48628106713294983, "learning_rate": 1.7113035142328283e-06, "loss": 0.5731, "step": 10671 }, { "epoch": 4.0841944125526215, "grad_norm": 0.4970517158508301, "learning_rate": 1.7099169321987452e-06, "loss": 0.533, "step": 10672 }, { "epoch": 4.08457711442786, "grad_norm": 0.5889089107513428, "learning_rate": 1.7085308596125372e-06, "loss": 0.653, "step": 10673 }, { "epoch": 4.0849598163031, "grad_norm": 0.5450440645217896, "learning_rate": 1.7071452965593839e-06, "loss": 0.7146, "step": 10674 }, { "epoch": 4.085342518178339, "grad_norm": 0.5753144025802612, "learning_rate": 1.7057602431244301e-06, "loss": 0.6157, "step": 10675 }, { "epoch": 4.085725220053578, "grad_norm": 0.5381754040718079, "learning_rate": 1.7043756993927952e-06, "loss": 0.5612, "step": 10676 }, { "epoch": 4.086107921928818, "grad_norm": 0.5650796294212341, "learning_rate": 1.702991665449557e-06, "loss": 0.6507, "step": 10677 }, { "epoch": 4.086490623804057, "grad_norm": 0.5136803984642029, "learning_rate": 1.7016081413797703e-06, "loss": 0.5477, "step": 10678 }, { "epoch": 4.086873325679296, "grad_norm": 0.5741363763809204, "learning_rate": 1.7002251272684589e-06, "loss": 0.5296, "step": 10679 }, { "epoch": 4.087256027554535, "grad_norm": 0.5029909610748291, "learning_rate": 1.6988426232006095e-06, "loss": 0.5965, "step": 10680 }, { "epoch": 4.0876387294297745, "grad_norm": 0.5586974024772644, "learning_rate": 1.6974606292611806e-06, "loss": 0.6969, "step": 10681 }, { "epoch": 4.088021431305013, "grad_norm": 0.518599271774292, "learning_rate": 1.6960791455351012e-06, "loss": 0.6627, "step": 10682 }, { "epoch": 4.088404133180252, "grad_norm": 0.5742851495742798, "learning_rate": 1.6946981721072663e-06, "loss": 0.6377, "step": 10683 }, { "epoch": 4.088786835055492, "grad_norm": 0.5164517164230347, "learning_rate": 1.6933177090625419e-06, "loss": 0.6529, "step": 10684 }, { "epoch": 4.089169536930731, "grad_norm": 0.5478144884109497, "learning_rate": 1.691937756485762e-06, "loss": 0.6052, "step": 10685 }, { "epoch": 4.08955223880597, "grad_norm": 0.5279051065444946, "learning_rate": 1.6905583144617244e-06, "loss": 0.5817, "step": 10686 }, { "epoch": 4.08993494068121, "grad_norm": 0.5358412265777588, "learning_rate": 1.6891793830752012e-06, "loss": 0.6562, "step": 10687 }, { "epoch": 4.090317642556449, "grad_norm": 0.5559607744216919, "learning_rate": 1.6878009624109315e-06, "loss": 0.6121, "step": 10688 }, { "epoch": 4.090700344431688, "grad_norm": 0.5606096982955933, "learning_rate": 1.6864230525536263e-06, "loss": 0.5651, "step": 10689 }, { "epoch": 4.0910830463069265, "grad_norm": 0.5340587496757507, "learning_rate": 1.6850456535879555e-06, "loss": 0.6746, "step": 10690 }, { "epoch": 4.091465748182166, "grad_norm": 0.5469401478767395, "learning_rate": 1.6836687655985685e-06, "loss": 0.6183, "step": 10691 }, { "epoch": 4.091848450057405, "grad_norm": 0.5058410167694092, "learning_rate": 1.682292388670077e-06, "loss": 0.5758, "step": 10692 }, { "epoch": 4.092231151932644, "grad_norm": 0.5093255639076233, "learning_rate": 1.6809165228870638e-06, "loss": 0.5676, "step": 10693 }, { "epoch": 4.092613853807884, "grad_norm": 0.6624271273612976, "learning_rate": 1.679541168334079e-06, "loss": 0.6817, "step": 10694 }, { "epoch": 4.092996555683123, "grad_norm": 0.5669341087341309, "learning_rate": 1.678166325095646e-06, "loss": 0.6178, "step": 10695 }, { "epoch": 4.093379257558362, "grad_norm": 0.5277988314628601, "learning_rate": 1.6767919932562449e-06, "loss": 0.6079, "step": 10696 }, { "epoch": 4.093761959433602, "grad_norm": 0.5217770934104919, "learning_rate": 1.675418172900336e-06, "loss": 0.5639, "step": 10697 }, { "epoch": 4.094144661308841, "grad_norm": 0.6043589115142822, "learning_rate": 1.6740448641123474e-06, "loss": 0.6178, "step": 10698 }, { "epoch": 4.0945273631840795, "grad_norm": 0.507972002029419, "learning_rate": 1.672672066976667e-06, "loss": 0.5678, "step": 10699 }, { "epoch": 4.094910065059318, "grad_norm": 0.5012720823287964, "learning_rate": 1.6712997815776599e-06, "loss": 0.6286, "step": 10700 }, { "epoch": 4.095292766934558, "grad_norm": 0.5571919083595276, "learning_rate": 1.6699280079996549e-06, "loss": 0.5982, "step": 10701 }, { "epoch": 4.095675468809797, "grad_norm": 0.5161751508712769, "learning_rate": 1.6685567463269537e-06, "loss": 0.5903, "step": 10702 }, { "epoch": 4.096058170685036, "grad_norm": 0.5776546597480774, "learning_rate": 1.667185996643822e-06, "loss": 0.6328, "step": 10703 }, { "epoch": 4.096440872560276, "grad_norm": 0.5618300437927246, "learning_rate": 1.665815759034497e-06, "loss": 0.5646, "step": 10704 }, { "epoch": 4.096823574435515, "grad_norm": 0.556129515171051, "learning_rate": 1.6644460335831857e-06, "loss": 0.5971, "step": 10705 }, { "epoch": 4.097206276310754, "grad_norm": 0.5313978791236877, "learning_rate": 1.6630768203740565e-06, "loss": 0.6354, "step": 10706 }, { "epoch": 4.0975889781859935, "grad_norm": 0.5564944744110107, "learning_rate": 1.6617081194912544e-06, "loss": 0.6989, "step": 10707 }, { "epoch": 4.0979716800612325, "grad_norm": 0.5471763610839844, "learning_rate": 1.6603399310188917e-06, "loss": 0.6025, "step": 10708 }, { "epoch": 4.098354381936471, "grad_norm": 0.5289596319198608, "learning_rate": 1.6589722550410415e-06, "loss": 0.5667, "step": 10709 }, { "epoch": 4.09873708381171, "grad_norm": 0.5750741362571716, "learning_rate": 1.657605091641754e-06, "loss": 0.6464, "step": 10710 }, { "epoch": 4.09911978568695, "grad_norm": 0.5691887140274048, "learning_rate": 1.6562384409050458e-06, "loss": 0.6953, "step": 10711 }, { "epoch": 4.099502487562189, "grad_norm": 0.5566563010215759, "learning_rate": 1.6548723029149005e-06, "loss": 0.6686, "step": 10712 }, { "epoch": 4.099885189437428, "grad_norm": 0.5439900755882263, "learning_rate": 1.6535066777552723e-06, "loss": 0.5639, "step": 10713 }, { "epoch": 4.100267891312668, "grad_norm": 0.5481577515602112, "learning_rate": 1.6521415655100804e-06, "loss": 0.654, "step": 10714 }, { "epoch": 4.100650593187907, "grad_norm": 0.5161579251289368, "learning_rate": 1.650776966263219e-06, "loss": 0.5828, "step": 10715 }, { "epoch": 4.101033295063146, "grad_norm": 0.5156195163726807, "learning_rate": 1.6494128800985398e-06, "loss": 0.6102, "step": 10716 }, { "epoch": 4.101415996938385, "grad_norm": 0.5172154307365417, "learning_rate": 1.6480493070998738e-06, "loss": 0.6094, "step": 10717 }, { "epoch": 4.101798698813624, "grad_norm": 0.5062513947486877, "learning_rate": 1.646686247351017e-06, "loss": 0.6735, "step": 10718 }, { "epoch": 4.102181400688863, "grad_norm": 0.5351349711418152, "learning_rate": 1.6453237009357293e-06, "loss": 0.5553, "step": 10719 }, { "epoch": 4.102564102564102, "grad_norm": 0.5065281987190247, "learning_rate": 1.6439616679377445e-06, "loss": 0.645, "step": 10720 }, { "epoch": 4.102946804439342, "grad_norm": 0.5393707752227783, "learning_rate": 1.6426001484407638e-06, "loss": 0.571, "step": 10721 }, { "epoch": 4.103329506314581, "grad_norm": 0.5902019143104553, "learning_rate": 1.6412391425284568e-06, "loss": 0.6157, "step": 10722 }, { "epoch": 4.10371220818982, "grad_norm": 0.5661962628364563, "learning_rate": 1.63987865028446e-06, "loss": 0.6925, "step": 10723 }, { "epoch": 4.10409491006506, "grad_norm": 0.5524935722351074, "learning_rate": 1.6385186717923806e-06, "loss": 0.6181, "step": 10724 }, { "epoch": 4.104477611940299, "grad_norm": 0.5303073525428772, "learning_rate": 1.6371592071357911e-06, "loss": 0.6731, "step": 10725 }, { "epoch": 4.1048603138155375, "grad_norm": 0.5695868730545044, "learning_rate": 1.6358002563982344e-06, "loss": 0.6059, "step": 10726 }, { "epoch": 4.105243015690777, "grad_norm": 0.5155102610588074, "learning_rate": 1.6344418196632217e-06, "loss": 0.6341, "step": 10727 }, { "epoch": 4.105625717566016, "grad_norm": 0.550277590751648, "learning_rate": 1.6330838970142372e-06, "loss": 0.6637, "step": 10728 }, { "epoch": 4.106008419441255, "grad_norm": 0.585151731967926, "learning_rate": 1.6317264885347206e-06, "loss": 0.65, "step": 10729 }, { "epoch": 4.106391121316494, "grad_norm": 0.553117573261261, "learning_rate": 1.6303695943080933e-06, "loss": 0.6663, "step": 10730 }, { "epoch": 4.106773823191734, "grad_norm": 0.5406652688980103, "learning_rate": 1.6290132144177396e-06, "loss": 0.5751, "step": 10731 }, { "epoch": 4.107156525066973, "grad_norm": 0.5195037722587585, "learning_rate": 1.6276573489470127e-06, "loss": 0.5751, "step": 10732 }, { "epoch": 4.107539226942212, "grad_norm": 0.5107700824737549, "learning_rate": 1.6263019979792327e-06, "loss": 0.6277, "step": 10733 }, { "epoch": 4.1079219288174516, "grad_norm": 0.5273398756980896, "learning_rate": 1.6249471615976942e-06, "loss": 0.518, "step": 10734 }, { "epoch": 4.1083046306926905, "grad_norm": 0.5557748675346375, "learning_rate": 1.6235928398856494e-06, "loss": 0.6606, "step": 10735 }, { "epoch": 4.108687332567929, "grad_norm": 0.5393115878105164, "learning_rate": 1.6222390329263282e-06, "loss": 0.5754, "step": 10736 }, { "epoch": 4.109070034443169, "grad_norm": 0.5130707025527954, "learning_rate": 1.6208857408029266e-06, "loss": 0.5838, "step": 10737 }, { "epoch": 4.109452736318408, "grad_norm": 0.5640367269515991, "learning_rate": 1.6195329635986045e-06, "loss": 0.6372, "step": 10738 }, { "epoch": 4.109835438193647, "grad_norm": 0.5808382630348206, "learning_rate": 1.6181807013964968e-06, "loss": 0.5512, "step": 10739 }, { "epoch": 4.110218140068886, "grad_norm": 0.5155152082443237, "learning_rate": 1.6168289542797023e-06, "loss": 0.591, "step": 10740 }, { "epoch": 4.110600841944126, "grad_norm": 0.5605766773223877, "learning_rate": 1.6154777223312889e-06, "loss": 0.6481, "step": 10741 }, { "epoch": 4.110983543819365, "grad_norm": 0.5712302327156067, "learning_rate": 1.614127005634295e-06, "loss": 0.593, "step": 10742 }, { "epoch": 4.111366245694604, "grad_norm": 0.5166690945625305, "learning_rate": 1.6127768042717252e-06, "loss": 0.5879, "step": 10743 }, { "epoch": 4.111748947569843, "grad_norm": 0.5688937902450562, "learning_rate": 1.611427118326555e-06, "loss": 0.66, "step": 10744 }, { "epoch": 4.112131649445082, "grad_norm": 0.5175645351409912, "learning_rate": 1.610077947881723e-06, "loss": 0.5531, "step": 10745 }, { "epoch": 4.112514351320321, "grad_norm": 0.5233761072158813, "learning_rate": 1.6087292930201393e-06, "loss": 0.5737, "step": 10746 }, { "epoch": 4.112897053195561, "grad_norm": 0.5836296081542969, "learning_rate": 1.6073811538246863e-06, "loss": 0.6015, "step": 10747 }, { "epoch": 4.1132797550708, "grad_norm": 0.5403249859809875, "learning_rate": 1.606033530378206e-06, "loss": 0.5812, "step": 10748 }, { "epoch": 4.113662456946039, "grad_norm": 0.5257736444473267, "learning_rate": 1.604686422763515e-06, "loss": 0.6692, "step": 10749 }, { "epoch": 4.114045158821278, "grad_norm": 0.539925754070282, "learning_rate": 1.6033398310634008e-06, "loss": 0.5566, "step": 10750 }, { "epoch": 4.114427860696518, "grad_norm": 0.5528910756111145, "learning_rate": 1.6019937553606058e-06, "loss": 0.6118, "step": 10751 }, { "epoch": 4.114810562571757, "grad_norm": 0.5627871751785278, "learning_rate": 1.6006481957378584e-06, "loss": 0.6126, "step": 10752 }, { "epoch": 4.1151932644469955, "grad_norm": 0.5246900320053101, "learning_rate": 1.5993031522778434e-06, "loss": 0.6063, "step": 10753 }, { "epoch": 4.115575966322235, "grad_norm": 0.577713131904602, "learning_rate": 1.5979586250632206e-06, "loss": 0.5632, "step": 10754 }, { "epoch": 4.115958668197474, "grad_norm": 0.5031216144561768, "learning_rate": 1.5966146141766114e-06, "loss": 0.5777, "step": 10755 }, { "epoch": 4.116341370072713, "grad_norm": 0.552051842212677, "learning_rate": 1.5952711197006088e-06, "loss": 0.6102, "step": 10756 }, { "epoch": 4.116724071947953, "grad_norm": 0.5679560303688049, "learning_rate": 1.5939281417177766e-06, "loss": 0.6415, "step": 10757 }, { "epoch": 4.117106773823192, "grad_norm": 0.5494787693023682, "learning_rate": 1.592585680310642e-06, "loss": 0.6542, "step": 10758 }, { "epoch": 4.117489475698431, "grad_norm": 0.5084942579269409, "learning_rate": 1.5912437355617027e-06, "loss": 0.5376, "step": 10759 }, { "epoch": 4.11787217757367, "grad_norm": 0.49909707903862, "learning_rate": 1.5899023075534293e-06, "loss": 0.5474, "step": 10760 }, { "epoch": 4.1182548794489096, "grad_norm": 0.5143803358078003, "learning_rate": 1.588561396368249e-06, "loss": 0.6279, "step": 10761 }, { "epoch": 4.1186375813241485, "grad_norm": 0.534851610660553, "learning_rate": 1.587221002088567e-06, "loss": 0.6565, "step": 10762 }, { "epoch": 4.119020283199387, "grad_norm": 0.5834057927131653, "learning_rate": 1.5858811247967586e-06, "loss": 0.6826, "step": 10763 }, { "epoch": 4.119402985074627, "grad_norm": 0.5132082104682922, "learning_rate": 1.5845417645751582e-06, "loss": 0.603, "step": 10764 }, { "epoch": 4.119785686949866, "grad_norm": 0.5929358601570129, "learning_rate": 1.5832029215060751e-06, "loss": 0.6338, "step": 10765 }, { "epoch": 4.120168388825105, "grad_norm": 0.5594726204872131, "learning_rate": 1.5818645956717827e-06, "loss": 0.5027, "step": 10766 }, { "epoch": 4.120551090700345, "grad_norm": 0.5772356390953064, "learning_rate": 1.5805267871545293e-06, "loss": 0.6394, "step": 10767 }, { "epoch": 4.120933792575584, "grad_norm": 0.5065824389457703, "learning_rate": 1.5791894960365205e-06, "loss": 0.5899, "step": 10768 }, { "epoch": 4.121316494450823, "grad_norm": 0.5363481640815735, "learning_rate": 1.5778527223999408e-06, "loss": 0.5292, "step": 10769 }, { "epoch": 4.121699196326062, "grad_norm": 0.558143138885498, "learning_rate": 1.5765164663269393e-06, "loss": 0.7078, "step": 10770 }, { "epoch": 4.122081898201301, "grad_norm": 0.5177260041236877, "learning_rate": 1.5751807278996278e-06, "loss": 0.5427, "step": 10771 }, { "epoch": 4.12246460007654, "grad_norm": 0.4965040981769562, "learning_rate": 1.5738455072000936e-06, "loss": 0.6686, "step": 10772 }, { "epoch": 4.122847301951779, "grad_norm": 0.5540722012519836, "learning_rate": 1.57251080431039e-06, "loss": 0.6899, "step": 10773 }, { "epoch": 4.123230003827019, "grad_norm": 0.5675886273384094, "learning_rate": 1.5711766193125377e-06, "loss": 0.6387, "step": 10774 }, { "epoch": 4.123612705702258, "grad_norm": 0.5242002606391907, "learning_rate": 1.5698429522885273e-06, "loss": 0.6732, "step": 10775 }, { "epoch": 4.123995407577497, "grad_norm": 0.5485064387321472, "learning_rate": 1.568509803320316e-06, "loss": 0.5564, "step": 10776 }, { "epoch": 4.124378109452737, "grad_norm": 0.5075619220733643, "learning_rate": 1.5671771724898267e-06, "loss": 0.5433, "step": 10777 }, { "epoch": 4.124760811327976, "grad_norm": 0.5833445191383362, "learning_rate": 1.5658450598789542e-06, "loss": 0.6634, "step": 10778 }, { "epoch": 4.125143513203215, "grad_norm": 0.5066182017326355, "learning_rate": 1.5645134655695615e-06, "loss": 0.5363, "step": 10779 }, { "epoch": 4.1255262150784535, "grad_norm": 0.574154794216156, "learning_rate": 1.5631823896434806e-06, "loss": 0.5976, "step": 10780 }, { "epoch": 4.125908916953693, "grad_norm": 0.5639042854309082, "learning_rate": 1.561851832182505e-06, "loss": 0.6406, "step": 10781 }, { "epoch": 4.126291618828932, "grad_norm": 0.5422372221946716, "learning_rate": 1.5605217932684024e-06, "loss": 0.6364, "step": 10782 }, { "epoch": 4.126674320704171, "grad_norm": 0.5503021478652954, "learning_rate": 1.559192272982909e-06, "loss": 0.5619, "step": 10783 }, { "epoch": 4.127057022579411, "grad_norm": 0.5582170486450195, "learning_rate": 1.557863271407727e-06, "loss": 0.5732, "step": 10784 }, { "epoch": 4.12743972445465, "grad_norm": 0.7134414911270142, "learning_rate": 1.5565347886245253e-06, "loss": 0.6949, "step": 10785 }, { "epoch": 4.127822426329889, "grad_norm": 0.5428075194358826, "learning_rate": 1.5552068247149476e-06, "loss": 0.6047, "step": 10786 }, { "epoch": 4.128205128205128, "grad_norm": 0.5801676511764526, "learning_rate": 1.5538793797605944e-06, "loss": 0.5548, "step": 10787 }, { "epoch": 4.128587830080368, "grad_norm": 0.5967167019844055, "learning_rate": 1.5525524538430436e-06, "loss": 0.6865, "step": 10788 }, { "epoch": 4.1289705319556065, "grad_norm": 0.49781671166419983, "learning_rate": 1.5512260470438422e-06, "loss": 0.5527, "step": 10789 }, { "epoch": 4.129353233830845, "grad_norm": 0.5001024007797241, "learning_rate": 1.5499001594444941e-06, "loss": 0.5871, "step": 10790 }, { "epoch": 4.129735935706085, "grad_norm": 0.5059944987297058, "learning_rate": 1.5485747911264826e-06, "loss": 0.5704, "step": 10791 }, { "epoch": 4.130118637581324, "grad_norm": 0.5143744945526123, "learning_rate": 1.547249942171255e-06, "loss": 0.6833, "step": 10792 }, { "epoch": 4.130501339456563, "grad_norm": 0.5645732879638672, "learning_rate": 1.5459256126602262e-06, "loss": 0.6165, "step": 10793 }, { "epoch": 4.130884041331803, "grad_norm": 0.5136183500289917, "learning_rate": 1.5446018026747811e-06, "loss": 0.6338, "step": 10794 }, { "epoch": 4.131266743207042, "grad_norm": 0.5826267004013062, "learning_rate": 1.5432785122962702e-06, "loss": 0.6257, "step": 10795 }, { "epoch": 4.131649445082281, "grad_norm": 0.5583099722862244, "learning_rate": 1.541955741606016e-06, "loss": 0.7044, "step": 10796 }, { "epoch": 4.1320321469575205, "grad_norm": 0.5965427160263062, "learning_rate": 1.5406334906853016e-06, "loss": 0.6154, "step": 10797 }, { "epoch": 4.1324148488327594, "grad_norm": 0.5319274067878723, "learning_rate": 1.5393117596153851e-06, "loss": 0.584, "step": 10798 }, { "epoch": 4.132797550707998, "grad_norm": 0.586959958076477, "learning_rate": 1.5379905484774925e-06, "loss": 0.6486, "step": 10799 }, { "epoch": 4.133180252583237, "grad_norm": 0.5078414678573608, "learning_rate": 1.5366698573528115e-06, "loss": 0.5946, "step": 10800 }, { "epoch": 4.133562954458477, "grad_norm": 0.5111815333366394, "learning_rate": 1.5353496863225048e-06, "loss": 0.6227, "step": 10801 }, { "epoch": 4.133945656333716, "grad_norm": 0.497733473777771, "learning_rate": 1.5340300354677007e-06, "loss": 0.6453, "step": 10802 }, { "epoch": 4.134328358208955, "grad_norm": 0.5600321292877197, "learning_rate": 1.5327109048694943e-06, "loss": 0.6094, "step": 10803 }, { "epoch": 4.134711060084195, "grad_norm": 0.5175899863243103, "learning_rate": 1.5313922946089488e-06, "loss": 0.5659, "step": 10804 }, { "epoch": 4.135093761959434, "grad_norm": 0.538560152053833, "learning_rate": 1.530074204767099e-06, "loss": 0.6178, "step": 10805 }, { "epoch": 4.135476463834673, "grad_norm": 0.5519664287567139, "learning_rate": 1.528756635424946e-06, "loss": 0.6423, "step": 10806 }, { "epoch": 4.1358591657099115, "grad_norm": 0.5181506872177124, "learning_rate": 1.5274395866634527e-06, "loss": 0.5928, "step": 10807 }, { "epoch": 4.136241867585151, "grad_norm": 0.5458213090896606, "learning_rate": 1.5261230585635589e-06, "loss": 0.6043, "step": 10808 }, { "epoch": 4.13662456946039, "grad_norm": 0.5916305184364319, "learning_rate": 1.5248070512061696e-06, "loss": 0.6639, "step": 10809 }, { "epoch": 4.137007271335629, "grad_norm": 0.5570040345191956, "learning_rate": 1.5234915646721537e-06, "loss": 0.7452, "step": 10810 }, { "epoch": 4.137389973210869, "grad_norm": 0.5493572950363159, "learning_rate": 1.522176599042353e-06, "loss": 0.6617, "step": 10811 }, { "epoch": 4.137772675086108, "grad_norm": 0.5460299849510193, "learning_rate": 1.5208621543975766e-06, "loss": 0.5749, "step": 10812 }, { "epoch": 4.138155376961347, "grad_norm": 0.5250301957130432, "learning_rate": 1.5195482308185983e-06, "loss": 0.6087, "step": 10813 }, { "epoch": 4.138538078836587, "grad_norm": 0.5464304089546204, "learning_rate": 1.518234828386166e-06, "loss": 0.6974, "step": 10814 }, { "epoch": 4.138920780711826, "grad_norm": 0.541810929775238, "learning_rate": 1.5169219471809903e-06, "loss": 0.7104, "step": 10815 }, { "epoch": 4.1393034825870645, "grad_norm": 0.5175960659980774, "learning_rate": 1.5156095872837496e-06, "loss": 0.621, "step": 10816 }, { "epoch": 4.139686184462304, "grad_norm": 0.5277795195579529, "learning_rate": 1.514297748775092e-06, "loss": 0.6516, "step": 10817 }, { "epoch": 4.140068886337543, "grad_norm": 0.5857836008071899, "learning_rate": 1.5129864317356347e-06, "loss": 0.5848, "step": 10818 }, { "epoch": 4.140451588212782, "grad_norm": 0.49363890290260315, "learning_rate": 1.511675636245964e-06, "loss": 0.5972, "step": 10819 }, { "epoch": 4.140834290088021, "grad_norm": 0.5463591814041138, "learning_rate": 1.5103653623866266e-06, "loss": 0.6442, "step": 10820 }, { "epoch": 4.141216991963261, "grad_norm": 0.5678899884223938, "learning_rate": 1.5090556102381448e-06, "loss": 0.5818, "step": 10821 }, { "epoch": 4.1415996938385, "grad_norm": 0.5988809466362, "learning_rate": 1.5077463798810076e-06, "loss": 0.5805, "step": 10822 }, { "epoch": 4.141982395713739, "grad_norm": 0.5741158723831177, "learning_rate": 1.5064376713956686e-06, "loss": 0.5635, "step": 10823 }, { "epoch": 4.1423650975889785, "grad_norm": 0.5580189824104309, "learning_rate": 1.5051294848625531e-06, "loss": 0.5934, "step": 10824 }, { "epoch": 4.1427477994642175, "grad_norm": 0.5682191252708435, "learning_rate": 1.5038218203620548e-06, "loss": 0.6371, "step": 10825 }, { "epoch": 4.143130501339456, "grad_norm": 0.541313648223877, "learning_rate": 1.5025146779745281e-06, "loss": 0.7018, "step": 10826 }, { "epoch": 4.143513203214695, "grad_norm": 0.5473852753639221, "learning_rate": 1.5012080577803022e-06, "loss": 0.5796, "step": 10827 }, { "epoch": 4.143895905089935, "grad_norm": 0.5228004455566406, "learning_rate": 1.4999019598596754e-06, "loss": 0.5819, "step": 10828 }, { "epoch": 4.144278606965174, "grad_norm": 0.5133361220359802, "learning_rate": 1.4985963842929063e-06, "loss": 0.5543, "step": 10829 }, { "epoch": 4.144661308840413, "grad_norm": 0.49393853545188904, "learning_rate": 1.4972913311602288e-06, "loss": 0.6144, "step": 10830 }, { "epoch": 4.145044010715653, "grad_norm": 0.5528366565704346, "learning_rate": 1.495986800541842e-06, "loss": 0.5901, "step": 10831 }, { "epoch": 4.145426712590892, "grad_norm": 0.5237122178077698, "learning_rate": 1.4946827925179108e-06, "loss": 0.5797, "step": 10832 }, { "epoch": 4.145809414466131, "grad_norm": 0.6490319967269897, "learning_rate": 1.493379307168573e-06, "loss": 0.6257, "step": 10833 }, { "epoch": 4.14619211634137, "grad_norm": 0.5462831258773804, "learning_rate": 1.4920763445739284e-06, "loss": 0.612, "step": 10834 }, { "epoch": 4.146574818216609, "grad_norm": 0.5313533544540405, "learning_rate": 1.4907739048140524e-06, "loss": 0.5465, "step": 10835 }, { "epoch": 4.146957520091848, "grad_norm": 0.5316413640975952, "learning_rate": 1.4894719879689767e-06, "loss": 0.5465, "step": 10836 }, { "epoch": 4.147340221967088, "grad_norm": 0.5424501299858093, "learning_rate": 1.4881705941187109e-06, "loss": 0.6022, "step": 10837 }, { "epoch": 4.147722923842327, "grad_norm": 0.5837632417678833, "learning_rate": 1.4868697233432316e-06, "loss": 0.609, "step": 10838 }, { "epoch": 4.148105625717566, "grad_norm": 0.5817790031433105, "learning_rate": 1.4855693757224755e-06, "loss": 0.6797, "step": 10839 }, { "epoch": 4.148488327592805, "grad_norm": 0.5550265312194824, "learning_rate": 1.4842695513363548e-06, "loss": 0.5806, "step": 10840 }, { "epoch": 4.148871029468045, "grad_norm": 0.5645151138305664, "learning_rate": 1.4829702502647492e-06, "loss": 0.6414, "step": 10841 }, { "epoch": 4.149253731343284, "grad_norm": 0.524770200252533, "learning_rate": 1.4816714725875014e-06, "loss": 0.6429, "step": 10842 }, { "epoch": 4.1496364332185225, "grad_norm": 0.5372937321662903, "learning_rate": 1.4803732183844266e-06, "loss": 0.5823, "step": 10843 }, { "epoch": 4.150019135093762, "grad_norm": 0.563945472240448, "learning_rate": 1.479075487735304e-06, "loss": 0.6447, "step": 10844 }, { "epoch": 4.150401836969001, "grad_norm": 0.5742523074150085, "learning_rate": 1.477778280719888e-06, "loss": 0.6646, "step": 10845 }, { "epoch": 4.15078453884424, "grad_norm": 0.5280263423919678, "learning_rate": 1.4764815974178892e-06, "loss": 0.5946, "step": 10846 }, { "epoch": 4.151167240719479, "grad_norm": 0.5774083733558655, "learning_rate": 1.4751854379089937e-06, "loss": 0.5707, "step": 10847 }, { "epoch": 4.151549942594719, "grad_norm": 0.5616793632507324, "learning_rate": 1.4738898022728575e-06, "loss": 0.5974, "step": 10848 }, { "epoch": 4.151932644469958, "grad_norm": 0.6228924989700317, "learning_rate": 1.4725946905890965e-06, "loss": 0.638, "step": 10849 }, { "epoch": 4.152315346345197, "grad_norm": 0.5282232165336609, "learning_rate": 1.4713001029372997e-06, "loss": 0.5924, "step": 10850 }, { "epoch": 4.1526980482204365, "grad_norm": 0.5348519086837769, "learning_rate": 1.470006039397024e-06, "loss": 0.6347, "step": 10851 }, { "epoch": 4.1530807500956755, "grad_norm": 0.5759292840957642, "learning_rate": 1.468712500047793e-06, "loss": 0.6425, "step": 10852 }, { "epoch": 4.153463451970914, "grad_norm": 0.5459307432174683, "learning_rate": 1.467419484969098e-06, "loss": 0.6339, "step": 10853 }, { "epoch": 4.153846153846154, "grad_norm": 0.5605271458625793, "learning_rate": 1.466126994240401e-06, "loss": 0.6156, "step": 10854 }, { "epoch": 4.154228855721393, "grad_norm": 0.573951244354248, "learning_rate": 1.464835027941124e-06, "loss": 0.6649, "step": 10855 }, { "epoch": 4.154611557596632, "grad_norm": 0.5605790019035339, "learning_rate": 1.4635435861506643e-06, "loss": 0.7289, "step": 10856 }, { "epoch": 4.154994259471872, "grad_norm": 0.533539891242981, "learning_rate": 1.4622526689483862e-06, "loss": 0.6083, "step": 10857 }, { "epoch": 4.155376961347111, "grad_norm": 0.5615165829658508, "learning_rate": 1.4609622764136155e-06, "loss": 0.6575, "step": 10858 }, { "epoch": 4.15575966322235, "grad_norm": 0.5481427907943726, "learning_rate": 1.4596724086256542e-06, "loss": 0.6822, "step": 10859 }, { "epoch": 4.156142365097589, "grad_norm": 0.5792900919914246, "learning_rate": 1.4583830656637655e-06, "loss": 0.5964, "step": 10860 }, { "epoch": 4.156525066972828, "grad_norm": 0.525493860244751, "learning_rate": 1.4570942476071881e-06, "loss": 0.5918, "step": 10861 }, { "epoch": 4.156907768848067, "grad_norm": 0.6406289935112, "learning_rate": 1.4558059545351144e-06, "loss": 0.602, "step": 10862 }, { "epoch": 4.157290470723306, "grad_norm": 0.5283809900283813, "learning_rate": 1.45451818652672e-06, "loss": 0.6154, "step": 10863 }, { "epoch": 4.157673172598546, "grad_norm": 0.5792129039764404, "learning_rate": 1.4532309436611435e-06, "loss": 0.585, "step": 10864 }, { "epoch": 4.158055874473785, "grad_norm": 0.5635628700256348, "learning_rate": 1.4519442260174844e-06, "loss": 0.5977, "step": 10865 }, { "epoch": 4.158438576349024, "grad_norm": 0.5428827404975891, "learning_rate": 1.4506580336748166e-06, "loss": 0.5917, "step": 10866 }, { "epoch": 4.158821278224264, "grad_norm": 0.5378767251968384, "learning_rate": 1.449372366712183e-06, "loss": 0.6397, "step": 10867 }, { "epoch": 4.159203980099503, "grad_norm": 0.5377917885780334, "learning_rate": 1.4480872252085853e-06, "loss": 0.6059, "step": 10868 }, { "epoch": 4.159586681974742, "grad_norm": 0.5466722846031189, "learning_rate": 1.4468026092430033e-06, "loss": 0.6238, "step": 10869 }, { "epoch": 4.1599693838499805, "grad_norm": 0.5175067186355591, "learning_rate": 1.445518518894381e-06, "loss": 0.5258, "step": 10870 }, { "epoch": 4.16035208572522, "grad_norm": 0.52703857421875, "learning_rate": 1.4442349542416244e-06, "loss": 0.5488, "step": 10871 }, { "epoch": 4.160734787600459, "grad_norm": 0.5787171125411987, "learning_rate": 1.442951915363615e-06, "loss": 0.6019, "step": 10872 }, { "epoch": 4.161117489475698, "grad_norm": 0.5548927187919617, "learning_rate": 1.4416694023391976e-06, "loss": 0.5386, "step": 10873 }, { "epoch": 4.161500191350938, "grad_norm": 0.512133777141571, "learning_rate": 1.4403874152471909e-06, "loss": 0.5695, "step": 10874 }, { "epoch": 4.161882893226177, "grad_norm": 0.520807683467865, "learning_rate": 1.439105954166371e-06, "loss": 0.5813, "step": 10875 }, { "epoch": 4.162265595101416, "grad_norm": 0.5386932492256165, "learning_rate": 1.4378250191754883e-06, "loss": 0.6329, "step": 10876 }, { "epoch": 4.162648296976656, "grad_norm": 0.5349950194358826, "learning_rate": 1.436544610353263e-06, "loss": 0.5664, "step": 10877 }, { "epoch": 4.1630309988518945, "grad_norm": 0.5150017738342285, "learning_rate": 1.4352647277783749e-06, "loss": 0.6538, "step": 10878 }, { "epoch": 4.1634137007271335, "grad_norm": 0.531193196773529, "learning_rate": 1.4339853715294783e-06, "loss": 0.5782, "step": 10879 }, { "epoch": 4.163796402602372, "grad_norm": 0.528308629989624, "learning_rate": 1.432706541685196e-06, "loss": 0.644, "step": 10880 }, { "epoch": 4.164179104477612, "grad_norm": 0.5431950688362122, "learning_rate": 1.4314282383241097e-06, "loss": 0.642, "step": 10881 }, { "epoch": 4.164561806352851, "grad_norm": 0.5097150206565857, "learning_rate": 1.4301504615247774e-06, "loss": 0.5374, "step": 10882 }, { "epoch": 4.16494450822809, "grad_norm": 0.5405341386795044, "learning_rate": 1.4288732113657223e-06, "loss": 0.6427, "step": 10883 }, { "epoch": 4.16532721010333, "grad_norm": 0.5158794522285461, "learning_rate": 1.4275964879254345e-06, "loss": 0.5825, "step": 10884 }, { "epoch": 4.165709911978569, "grad_norm": 0.5862905979156494, "learning_rate": 1.4263202912823726e-06, "loss": 0.6695, "step": 10885 }, { "epoch": 4.166092613853808, "grad_norm": 0.5095664262771606, "learning_rate": 1.4250446215149616e-06, "loss": 0.5748, "step": 10886 }, { "epoch": 4.1664753157290475, "grad_norm": 0.5262332558631897, "learning_rate": 1.4237694787015966e-06, "loss": 0.6032, "step": 10887 }, { "epoch": 4.166858017604286, "grad_norm": 0.5204539895057678, "learning_rate": 1.4224948629206358e-06, "loss": 0.5843, "step": 10888 }, { "epoch": 4.167240719479525, "grad_norm": 0.5413949489593506, "learning_rate": 1.4212207742504092e-06, "loss": 0.5558, "step": 10889 }, { "epoch": 4.167623421354764, "grad_norm": 0.5596081018447876, "learning_rate": 1.4199472127692149e-06, "loss": 0.6347, "step": 10890 }, { "epoch": 4.168006123230004, "grad_norm": 0.5116510987281799, "learning_rate": 1.4186741785553116e-06, "loss": 0.6003, "step": 10891 }, { "epoch": 4.168388825105243, "grad_norm": 0.5110495090484619, "learning_rate": 1.4174016716869344e-06, "loss": 0.6012, "step": 10892 }, { "epoch": 4.168771526980482, "grad_norm": 0.5493763089179993, "learning_rate": 1.4161296922422817e-06, "loss": 0.628, "step": 10893 }, { "epoch": 4.169154228855722, "grad_norm": 0.528553307056427, "learning_rate": 1.4148582402995192e-06, "loss": 0.6712, "step": 10894 }, { "epoch": 4.169536930730961, "grad_norm": 0.5048376321792603, "learning_rate": 1.413587315936782e-06, "loss": 0.5854, "step": 10895 }, { "epoch": 4.1699196326062, "grad_norm": 0.654063880443573, "learning_rate": 1.412316919232174e-06, "loss": 0.6234, "step": 10896 }, { "epoch": 4.170302334481439, "grad_norm": 0.505234956741333, "learning_rate": 1.4110470502637596e-06, "loss": 0.5811, "step": 10897 }, { "epoch": 4.170685036356678, "grad_norm": 0.5401294827461243, "learning_rate": 1.4097777091095787e-06, "loss": 0.648, "step": 10898 }, { "epoch": 4.171067738231917, "grad_norm": 0.5360243320465088, "learning_rate": 1.4085088958476345e-06, "loss": 0.6337, "step": 10899 }, { "epoch": 4.171450440107156, "grad_norm": 0.5346643924713135, "learning_rate": 1.4072406105559011e-06, "loss": 0.674, "step": 10900 }, { "epoch": 4.171833141982396, "grad_norm": 0.5673101544380188, "learning_rate": 1.405972853312315e-06, "loss": 0.6402, "step": 10901 }, { "epoch": 4.172215843857635, "grad_norm": 0.5389459729194641, "learning_rate": 1.4047056241947853e-06, "loss": 0.6045, "step": 10902 }, { "epoch": 4.172598545732874, "grad_norm": 0.49984678626060486, "learning_rate": 1.4034389232811862e-06, "loss": 0.6494, "step": 10903 }, { "epoch": 4.172981247608114, "grad_norm": 0.5682809948921204, "learning_rate": 1.40217275064936e-06, "loss": 0.6446, "step": 10904 }, { "epoch": 4.1733639494833525, "grad_norm": 0.5723304748535156, "learning_rate": 1.4009071063771151e-06, "loss": 0.6365, "step": 10905 }, { "epoch": 4.1737466513585915, "grad_norm": 0.5683809518814087, "learning_rate": 1.3996419905422332e-06, "loss": 0.5972, "step": 10906 }, { "epoch": 4.174129353233831, "grad_norm": 0.5171093940734863, "learning_rate": 1.3983774032224528e-06, "loss": 0.5856, "step": 10907 }, { "epoch": 4.17451205510907, "grad_norm": 0.5148698091506958, "learning_rate": 1.3971133444954887e-06, "loss": 0.6285, "step": 10908 }, { "epoch": 4.174894756984309, "grad_norm": 0.49568015336990356, "learning_rate": 1.3958498144390243e-06, "loss": 0.5204, "step": 10909 }, { "epoch": 4.175277458859548, "grad_norm": 0.5135076642036438, "learning_rate": 1.3945868131307017e-06, "loss": 0.6456, "step": 10910 }, { "epoch": 4.175660160734788, "grad_norm": 0.5341295599937439, "learning_rate": 1.3933243406481378e-06, "loss": 0.5895, "step": 10911 }, { "epoch": 4.176042862610027, "grad_norm": 0.59359210729599, "learning_rate": 1.392062397068914e-06, "loss": 0.7192, "step": 10912 }, { "epoch": 4.176425564485266, "grad_norm": 0.5314785838127136, "learning_rate": 1.3908009824705815e-06, "loss": 0.6304, "step": 10913 }, { "epoch": 4.1768082663605055, "grad_norm": 0.5539320707321167, "learning_rate": 1.389540096930657e-06, "loss": 0.6158, "step": 10914 }, { "epoch": 4.177190968235744, "grad_norm": 0.5388113260269165, "learning_rate": 1.3882797405266257e-06, "loss": 0.7071, "step": 10915 }, { "epoch": 4.177573670110983, "grad_norm": 0.5654579401016235, "learning_rate": 1.3870199133359419e-06, "loss": 0.5972, "step": 10916 }, { "epoch": 4.177956371986223, "grad_norm": 0.5869570374488831, "learning_rate": 1.3857606154360215e-06, "loss": 0.5963, "step": 10917 }, { "epoch": 4.178339073861462, "grad_norm": 0.5295727252960205, "learning_rate": 1.3845018469042525e-06, "loss": 0.6813, "step": 10918 }, { "epoch": 4.178721775736701, "grad_norm": 0.5352165102958679, "learning_rate": 1.3832436078179923e-06, "loss": 0.639, "step": 10919 }, { "epoch": 4.17910447761194, "grad_norm": 0.4888002872467041, "learning_rate": 1.3819858982545598e-06, "loss": 0.5746, "step": 10920 }, { "epoch": 4.17948717948718, "grad_norm": 0.5296195149421692, "learning_rate": 1.3807287182912455e-06, "loss": 0.6687, "step": 10921 }, { "epoch": 4.179869881362419, "grad_norm": 0.5166662335395813, "learning_rate": 1.3794720680053075e-06, "loss": 0.5995, "step": 10922 }, { "epoch": 4.180252583237658, "grad_norm": 0.5492962002754211, "learning_rate": 1.3782159474739697e-06, "loss": 0.5474, "step": 10923 }, { "epoch": 4.180635285112897, "grad_norm": 0.5619085431098938, "learning_rate": 1.376960356774425e-06, "loss": 0.5906, "step": 10924 }, { "epoch": 4.181017986988136, "grad_norm": 0.5025700926780701, "learning_rate": 1.3757052959838314e-06, "loss": 0.648, "step": 10925 }, { "epoch": 4.181400688863375, "grad_norm": 0.5637312531471252, "learning_rate": 1.374450765179318e-06, "loss": 0.666, "step": 10926 }, { "epoch": 4.181783390738615, "grad_norm": 0.5480453968048096, "learning_rate": 1.3731967644379761e-06, "loss": 0.6009, "step": 10927 }, { "epoch": 4.182166092613854, "grad_norm": 0.5350233912467957, "learning_rate": 1.371943293836868e-06, "loss": 0.6364, "step": 10928 }, { "epoch": 4.182548794489093, "grad_norm": 0.5026962757110596, "learning_rate": 1.370690353453027e-06, "loss": 0.5653, "step": 10929 }, { "epoch": 4.182931496364332, "grad_norm": 0.5964048504829407, "learning_rate": 1.3694379433634441e-06, "loss": 0.6898, "step": 10930 }, { "epoch": 4.183314198239572, "grad_norm": 0.5006957650184631, "learning_rate": 1.368186063645085e-06, "loss": 0.6509, "step": 10931 }, { "epoch": 4.1836969001148105, "grad_norm": 0.5306167602539062, "learning_rate": 1.3669347143748812e-06, "loss": 0.6728, "step": 10932 }, { "epoch": 4.1840796019900495, "grad_norm": 0.5434749126434326, "learning_rate": 1.3656838956297324e-06, "loss": 0.6846, "step": 10933 }, { "epoch": 4.184462303865289, "grad_norm": 0.5547255873680115, "learning_rate": 1.364433607486505e-06, "loss": 0.6457, "step": 10934 }, { "epoch": 4.184845005740528, "grad_norm": 0.5844815373420715, "learning_rate": 1.3631838500220328e-06, "loss": 0.6533, "step": 10935 }, { "epoch": 4.185227707615767, "grad_norm": 0.5399032831192017, "learning_rate": 1.3619346233131137e-06, "loss": 0.5656, "step": 10936 }, { "epoch": 4.185610409491007, "grad_norm": 0.5124359130859375, "learning_rate": 1.360685927436518e-06, "loss": 0.5979, "step": 10937 }, { "epoch": 4.185993111366246, "grad_norm": 0.550957977771759, "learning_rate": 1.3594377624689814e-06, "loss": 0.6419, "step": 10938 }, { "epoch": 4.186375813241485, "grad_norm": 0.5523251891136169, "learning_rate": 1.3581901284872101e-06, "loss": 0.6162, "step": 10939 }, { "epoch": 4.186758515116724, "grad_norm": 0.5624990463256836, "learning_rate": 1.3569430255678685e-06, "loss": 0.5782, "step": 10940 }, { "epoch": 4.1871412169919635, "grad_norm": 0.5223588943481445, "learning_rate": 1.3556964537875983e-06, "loss": 0.5726, "step": 10941 }, { "epoch": 4.187523918867202, "grad_norm": 0.5932309031486511, "learning_rate": 1.354450413223003e-06, "loss": 0.623, "step": 10942 }, { "epoch": 4.187906620742441, "grad_norm": 0.590374767780304, "learning_rate": 1.3532049039506568e-06, "loss": 0.7244, "step": 10943 }, { "epoch": 4.188289322617681, "grad_norm": 0.5340643525123596, "learning_rate": 1.3519599260470985e-06, "loss": 0.5655, "step": 10944 }, { "epoch": 4.18867202449292, "grad_norm": 0.5093200206756592, "learning_rate": 1.3507154795888367e-06, "loss": 0.5158, "step": 10945 }, { "epoch": 4.189054726368159, "grad_norm": 0.5280052423477173, "learning_rate": 1.3494715646523438e-06, "loss": 0.6287, "step": 10946 }, { "epoch": 4.189437428243399, "grad_norm": 0.5633461475372314, "learning_rate": 1.348228181314063e-06, "loss": 0.6149, "step": 10947 }, { "epoch": 4.189820130118638, "grad_norm": 0.5594897270202637, "learning_rate": 1.3469853296504055e-06, "loss": 0.6931, "step": 10948 }, { "epoch": 4.190202831993877, "grad_norm": 0.5146785378456116, "learning_rate": 1.3457430097377421e-06, "loss": 0.5785, "step": 10949 }, { "epoch": 4.190585533869116, "grad_norm": 0.545332670211792, "learning_rate": 1.3445012216524223e-06, "loss": 0.6501, "step": 10950 }, { "epoch": 4.190968235744355, "grad_norm": 0.5053755044937134, "learning_rate": 1.3432599654707545e-06, "loss": 0.6154, "step": 10951 }, { "epoch": 4.191350937619594, "grad_norm": 0.5031763911247253, "learning_rate": 1.3420192412690181e-06, "loss": 0.6668, "step": 10952 }, { "epoch": 4.191733639494833, "grad_norm": 0.5246724486351013, "learning_rate": 1.3407790491234585e-06, "loss": 0.4823, "step": 10953 }, { "epoch": 4.192116341370073, "grad_norm": 0.5287572741508484, "learning_rate": 1.3395393891102893e-06, "loss": 0.5607, "step": 10954 }, { "epoch": 4.192499043245312, "grad_norm": 0.5557687282562256, "learning_rate": 1.338300261305694e-06, "loss": 0.6563, "step": 10955 }, { "epoch": 4.192881745120551, "grad_norm": 0.554312527179718, "learning_rate": 1.3370616657858137e-06, "loss": 0.6537, "step": 10956 }, { "epoch": 4.193264446995791, "grad_norm": 0.554293155670166, "learning_rate": 1.335823602626768e-06, "loss": 0.5609, "step": 10957 }, { "epoch": 4.19364714887103, "grad_norm": 0.5346238017082214, "learning_rate": 1.33458607190464e-06, "loss": 0.5571, "step": 10958 }, { "epoch": 4.1940298507462686, "grad_norm": 0.5647938847541809, "learning_rate": 1.3333490736954746e-06, "loss": 0.6129, "step": 10959 }, { "epoch": 4.1944125526215075, "grad_norm": 0.5358499884605408, "learning_rate": 1.3321126080752922e-06, "loss": 0.7424, "step": 10960 }, { "epoch": 4.194795254496747, "grad_norm": 0.5458754897117615, "learning_rate": 1.3308766751200786e-06, "loss": 0.6169, "step": 10961 }, { "epoch": 4.195177956371986, "grad_norm": 0.5265944600105286, "learning_rate": 1.3296412749057774e-06, "loss": 0.6409, "step": 10962 }, { "epoch": 4.195560658247225, "grad_norm": 0.5255309343338013, "learning_rate": 1.3284064075083158e-06, "loss": 0.6256, "step": 10963 }, { "epoch": 4.195943360122465, "grad_norm": 0.5635595917701721, "learning_rate": 1.3271720730035764e-06, "loss": 0.7089, "step": 10964 }, { "epoch": 4.196326061997704, "grad_norm": 0.5250805020332336, "learning_rate": 1.3259382714674151e-06, "loss": 0.6663, "step": 10965 }, { "epoch": 4.196708763872943, "grad_norm": 0.5466090440750122, "learning_rate": 1.324705002975647e-06, "loss": 0.6159, "step": 10966 }, { "epoch": 4.197091465748183, "grad_norm": 0.5404798984527588, "learning_rate": 1.3234722676040634e-06, "loss": 0.6416, "step": 10967 }, { "epoch": 4.1974741676234215, "grad_norm": 0.5582977533340454, "learning_rate": 1.3222400654284195e-06, "loss": 0.6243, "step": 10968 }, { "epoch": 4.19785686949866, "grad_norm": 0.5332407355308533, "learning_rate": 1.3210083965244346e-06, "loss": 0.6, "step": 10969 }, { "epoch": 4.198239571373899, "grad_norm": 0.5703872442245483, "learning_rate": 1.3197772609677995e-06, "loss": 0.5601, "step": 10970 }, { "epoch": 4.198622273249139, "grad_norm": 0.5352997779846191, "learning_rate": 1.3185466588341733e-06, "loss": 0.5787, "step": 10971 }, { "epoch": 4.199004975124378, "grad_norm": 0.5452797412872314, "learning_rate": 1.3173165901991759e-06, "loss": 0.5419, "step": 10972 }, { "epoch": 4.199387676999617, "grad_norm": 0.5302254557609558, "learning_rate": 1.3160870551383975e-06, "loss": 0.6017, "step": 10973 }, { "epoch": 4.199770378874857, "grad_norm": 0.5756043791770935, "learning_rate": 1.3148580537274026e-06, "loss": 0.5957, "step": 10974 }, { "epoch": 4.200153080750096, "grad_norm": 0.5488331317901611, "learning_rate": 1.3136295860417115e-06, "loss": 0.6458, "step": 10975 }, { "epoch": 4.200535782625335, "grad_norm": 0.4811481833457947, "learning_rate": 1.3124016521568184e-06, "loss": 0.5776, "step": 10976 }, { "epoch": 4.2009184845005745, "grad_norm": 0.557675302028656, "learning_rate": 1.3111742521481819e-06, "loss": 0.6041, "step": 10977 }, { "epoch": 4.201301186375813, "grad_norm": 0.5316823720932007, "learning_rate": 1.3099473860912325e-06, "loss": 0.5664, "step": 10978 }, { "epoch": 4.201683888251052, "grad_norm": 0.5500053763389587, "learning_rate": 1.3087210540613603e-06, "loss": 0.6085, "step": 10979 }, { "epoch": 4.202066590126291, "grad_norm": 0.5374147891998291, "learning_rate": 1.3074952561339283e-06, "loss": 0.6039, "step": 10980 }, { "epoch": 4.202449292001531, "grad_norm": 0.6061112880706787, "learning_rate": 1.306269992384267e-06, "loss": 0.6977, "step": 10981 }, { "epoch": 4.20283199387677, "grad_norm": 0.6695296764373779, "learning_rate": 1.3050452628876675e-06, "loss": 0.6817, "step": 10982 }, { "epoch": 4.203214695752009, "grad_norm": 0.5608701705932617, "learning_rate": 1.3038210677193942e-06, "loss": 0.6319, "step": 10983 }, { "epoch": 4.203597397627249, "grad_norm": 0.6229343414306641, "learning_rate": 1.302597406954682e-06, "loss": 0.6401, "step": 10984 }, { "epoch": 4.203980099502488, "grad_norm": 0.5086715221405029, "learning_rate": 1.3013742806687234e-06, "loss": 0.6035, "step": 10985 }, { "epoch": 4.2043628013777266, "grad_norm": 0.5795563459396362, "learning_rate": 1.300151688936684e-06, "loss": 0.6098, "step": 10986 }, { "epoch": 4.204745503252966, "grad_norm": 0.5081731677055359, "learning_rate": 1.2989296318336975e-06, "loss": 0.6182, "step": 10987 }, { "epoch": 4.205128205128205, "grad_norm": 0.552534818649292, "learning_rate": 1.2977081094348576e-06, "loss": 0.5474, "step": 10988 }, { "epoch": 4.205510907003444, "grad_norm": 0.5475320816040039, "learning_rate": 1.296487121815233e-06, "loss": 0.5816, "step": 10989 }, { "epoch": 4.205893608878683, "grad_norm": 0.5605835318565369, "learning_rate": 1.2952666690498572e-06, "loss": 0.5817, "step": 10990 }, { "epoch": 4.206276310753923, "grad_norm": 0.5679904222488403, "learning_rate": 1.2940467512137323e-06, "loss": 0.6231, "step": 10991 }, { "epoch": 4.206659012629162, "grad_norm": 0.5282607078552246, "learning_rate": 1.2928273683818205e-06, "loss": 0.6435, "step": 10992 }, { "epoch": 4.207041714504401, "grad_norm": 0.542420506477356, "learning_rate": 1.2916085206290586e-06, "loss": 0.604, "step": 10993 }, { "epoch": 4.207424416379641, "grad_norm": 0.5854694247245789, "learning_rate": 1.2903902080303477e-06, "loss": 0.7109, "step": 10994 }, { "epoch": 4.2078071182548795, "grad_norm": 0.5511180758476257, "learning_rate": 1.2891724306605568e-06, "loss": 0.6167, "step": 10995 }, { "epoch": 4.208189820130118, "grad_norm": 0.5631760358810425, "learning_rate": 1.287955188594522e-06, "loss": 0.7357, "step": 10996 }, { "epoch": 4.208572522005358, "grad_norm": 0.5450243353843689, "learning_rate": 1.2867384819070472e-06, "loss": 0.6351, "step": 10997 }, { "epoch": 4.208955223880597, "grad_norm": 0.5657713413238525, "learning_rate": 1.2855223106728998e-06, "loss": 0.6096, "step": 10998 }, { "epoch": 4.209337925755836, "grad_norm": 0.6203344464302063, "learning_rate": 1.2843066749668165e-06, "loss": 0.6074, "step": 10999 }, { "epoch": 4.209720627631075, "grad_norm": 0.5422614216804504, "learning_rate": 1.2830915748635054e-06, "loss": 0.6224, "step": 11000 }, { "epoch": 4.210103329506315, "grad_norm": 0.5113427639007568, "learning_rate": 1.2818770104376333e-06, "loss": 0.5744, "step": 11001 }, { "epoch": 4.210486031381554, "grad_norm": 0.5013185739517212, "learning_rate": 1.2806629817638394e-06, "loss": 0.6279, "step": 11002 }, { "epoch": 4.210868733256793, "grad_norm": 0.5618643164634705, "learning_rate": 1.2794494889167297e-06, "loss": 0.5448, "step": 11003 }, { "epoch": 4.2112514351320325, "grad_norm": 0.5746356844902039, "learning_rate": 1.2782365319708767e-06, "loss": 0.7293, "step": 11004 }, { "epoch": 4.211634137007271, "grad_norm": 0.5869371891021729, "learning_rate": 1.2770241110008196e-06, "loss": 0.661, "step": 11005 }, { "epoch": 4.21201683888251, "grad_norm": 0.546631395816803, "learning_rate": 1.2758122260810647e-06, "loss": 0.6372, "step": 11006 }, { "epoch": 4.21239954075775, "grad_norm": 0.7027443647384644, "learning_rate": 1.2746008772860885e-06, "loss": 0.6439, "step": 11007 }, { "epoch": 4.212782242632989, "grad_norm": 0.5549666285514832, "learning_rate": 1.2733900646903275e-06, "loss": 0.5934, "step": 11008 }, { "epoch": 4.213164944508228, "grad_norm": 0.5124642848968506, "learning_rate": 1.2721797883681896e-06, "loss": 0.6398, "step": 11009 }, { "epoch": 4.213547646383467, "grad_norm": 0.5029041767120361, "learning_rate": 1.2709700483940545e-06, "loss": 0.5304, "step": 11010 }, { "epoch": 4.213930348258707, "grad_norm": 0.5467985272407532, "learning_rate": 1.2697608448422571e-06, "loss": 0.6677, "step": 11011 }, { "epoch": 4.214313050133946, "grad_norm": 0.5591344237327576, "learning_rate": 1.26855217778711e-06, "loss": 0.6026, "step": 11012 }, { "epoch": 4.214695752009185, "grad_norm": 0.5257734656333923, "learning_rate": 1.2673440473028885e-06, "loss": 0.6085, "step": 11013 }, { "epoch": 4.215078453884424, "grad_norm": 0.5288127064704895, "learning_rate": 1.2661364534638355e-06, "loss": 0.5954, "step": 11014 }, { "epoch": 4.215461155759663, "grad_norm": 0.5733157396316528, "learning_rate": 1.2649293963441612e-06, "loss": 0.6439, "step": 11015 }, { "epoch": 4.215843857634902, "grad_norm": 0.511562705039978, "learning_rate": 1.263722876018042e-06, "loss": 0.5792, "step": 11016 }, { "epoch": 4.216226559510142, "grad_norm": 0.5089082717895508, "learning_rate": 1.2625168925596244e-06, "loss": 0.5758, "step": 11017 }, { "epoch": 4.216609261385381, "grad_norm": 0.5204033255577087, "learning_rate": 1.2613114460430144e-06, "loss": 0.6528, "step": 11018 }, { "epoch": 4.21699196326062, "grad_norm": 0.6279197335243225, "learning_rate": 1.2601065365422915e-06, "loss": 0.6701, "step": 11019 }, { "epoch": 4.217374665135859, "grad_norm": 0.814069390296936, "learning_rate": 1.2589021641315048e-06, "loss": 0.605, "step": 11020 }, { "epoch": 4.217757367011099, "grad_norm": 0.5006528496742249, "learning_rate": 1.2576983288846612e-06, "loss": 0.5219, "step": 11021 }, { "epoch": 4.2181400688863375, "grad_norm": 0.5551484823226929, "learning_rate": 1.2564950308757407e-06, "loss": 0.6127, "step": 11022 }, { "epoch": 4.2185227707615764, "grad_norm": 0.5692393779754639, "learning_rate": 1.2552922701786896e-06, "loss": 0.6579, "step": 11023 }, { "epoch": 4.218905472636816, "grad_norm": 0.5193941593170166, "learning_rate": 1.2540900468674222e-06, "loss": 0.6446, "step": 11024 }, { "epoch": 4.219288174512055, "grad_norm": 0.6002271771430969, "learning_rate": 1.252888361015816e-06, "loss": 0.6417, "step": 11025 }, { "epoch": 4.219670876387294, "grad_norm": 0.5855181217193604, "learning_rate": 1.251687212697722e-06, "loss": 0.6482, "step": 11026 }, { "epoch": 4.220053578262534, "grad_norm": 0.5024583339691162, "learning_rate": 1.2504866019869487e-06, "loss": 0.5544, "step": 11027 }, { "epoch": 4.220436280137773, "grad_norm": 0.5949772000312805, "learning_rate": 1.2492865289572786e-06, "loss": 0.7112, "step": 11028 }, { "epoch": 4.220818982013012, "grad_norm": 0.6314610242843628, "learning_rate": 1.2480869936824603e-06, "loss": 0.634, "step": 11029 }, { "epoch": 4.221201683888251, "grad_norm": 0.5662582516670227, "learning_rate": 1.2468879962362101e-06, "loss": 0.6255, "step": 11030 }, { "epoch": 4.2215843857634905, "grad_norm": 0.5295080542564392, "learning_rate": 1.245689536692206e-06, "loss": 0.5877, "step": 11031 }, { "epoch": 4.221967087638729, "grad_norm": 0.5527205467224121, "learning_rate": 1.2444916151240982e-06, "loss": 0.6094, "step": 11032 }, { "epoch": 4.222349789513968, "grad_norm": 0.5392138361930847, "learning_rate": 1.243294231605503e-06, "loss": 0.5691, "step": 11033 }, { "epoch": 4.222732491389208, "grad_norm": 0.569677472114563, "learning_rate": 1.2420973862100006e-06, "loss": 0.6214, "step": 11034 }, { "epoch": 4.223115193264447, "grad_norm": 0.6021636724472046, "learning_rate": 1.2409010790111432e-06, "loss": 0.7034, "step": 11035 }, { "epoch": 4.223497895139686, "grad_norm": 0.47263893485069275, "learning_rate": 1.2397053100824463e-06, "loss": 0.5566, "step": 11036 }, { "epoch": 4.223880597014926, "grad_norm": 0.5074910521507263, "learning_rate": 1.2385100794973914e-06, "loss": 0.5753, "step": 11037 }, { "epoch": 4.224263298890165, "grad_norm": 0.5667068362236023, "learning_rate": 1.2373153873294298e-06, "loss": 0.5822, "step": 11038 }, { "epoch": 4.224646000765404, "grad_norm": 0.5033859610557556, "learning_rate": 1.23612123365198e-06, "loss": 0.6045, "step": 11039 }, { "epoch": 4.225028702640643, "grad_norm": 0.5468618869781494, "learning_rate": 1.234927618538423e-06, "loss": 0.6558, "step": 11040 }, { "epoch": 4.225411404515882, "grad_norm": 0.6050553321838379, "learning_rate": 1.2337345420621115e-06, "loss": 0.6385, "step": 11041 }, { "epoch": 4.225794106391121, "grad_norm": 0.5440728068351746, "learning_rate": 1.232542004296362e-06, "loss": 0.63, "step": 11042 }, { "epoch": 4.22617680826636, "grad_norm": 0.537486732006073, "learning_rate": 1.2313500053144601e-06, "loss": 0.636, "step": 11043 }, { "epoch": 4.2265595101416, "grad_norm": 0.5817175507545471, "learning_rate": 1.230158545189658e-06, "loss": 0.6979, "step": 11044 }, { "epoch": 4.226942212016839, "grad_norm": 0.5572187900543213, "learning_rate": 1.228967623995172e-06, "loss": 0.6398, "step": 11045 }, { "epoch": 4.227324913892078, "grad_norm": 0.5797673463821411, "learning_rate": 1.2277772418041922e-06, "loss": 0.6114, "step": 11046 }, { "epoch": 4.227707615767318, "grad_norm": 0.5588760375976562, "learning_rate": 1.2265873986898646e-06, "loss": 0.6135, "step": 11047 }, { "epoch": 4.228090317642557, "grad_norm": 0.5566720962524414, "learning_rate": 1.225398094725312e-06, "loss": 0.6201, "step": 11048 }, { "epoch": 4.2284730195177955, "grad_norm": 0.5485813617706299, "learning_rate": 1.22420932998362e-06, "loss": 0.6468, "step": 11049 }, { "epoch": 4.2288557213930345, "grad_norm": 0.5387700200080872, "learning_rate": 1.2230211045378393e-06, "loss": 0.6929, "step": 11050 }, { "epoch": 4.229238423268274, "grad_norm": 0.5849254131317139, "learning_rate": 1.2218334184609915e-06, "loss": 0.6229, "step": 11051 }, { "epoch": 4.229621125143513, "grad_norm": 0.5516777634620667, "learning_rate": 1.2206462718260615e-06, "loss": 0.5638, "step": 11052 }, { "epoch": 4.230003827018752, "grad_norm": 0.535207986831665, "learning_rate": 1.2194596647060053e-06, "loss": 0.6176, "step": 11053 }, { "epoch": 4.230386528893992, "grad_norm": 0.5548439025878906, "learning_rate": 1.2182735971737403e-06, "loss": 0.6779, "step": 11054 }, { "epoch": 4.230769230769231, "grad_norm": 0.5351312160491943, "learning_rate": 1.2170880693021546e-06, "loss": 0.6489, "step": 11055 }, { "epoch": 4.23115193264447, "grad_norm": 0.5778864026069641, "learning_rate": 1.2159030811641047e-06, "loss": 0.6622, "step": 11056 }, { "epoch": 4.23153463451971, "grad_norm": 0.50702965259552, "learning_rate": 1.2147186328324068e-06, "loss": 0.4965, "step": 11057 }, { "epoch": 4.2319173363949485, "grad_norm": 0.5616050958633423, "learning_rate": 1.2135347243798512e-06, "loss": 0.672, "step": 11058 }, { "epoch": 4.232300038270187, "grad_norm": 0.5139710307121277, "learning_rate": 1.2123513558791934e-06, "loss": 0.6237, "step": 11059 }, { "epoch": 4.232682740145426, "grad_norm": 0.5112733840942383, "learning_rate": 1.2111685274031494e-06, "loss": 0.6105, "step": 11060 }, { "epoch": 4.233065442020666, "grad_norm": 0.5778552293777466, "learning_rate": 1.209986239024411e-06, "loss": 0.6122, "step": 11061 }, { "epoch": 4.233448143895905, "grad_norm": 0.5394092798233032, "learning_rate": 1.2088044908156338e-06, "loss": 0.6431, "step": 11062 }, { "epoch": 4.233830845771144, "grad_norm": 0.5760520100593567, "learning_rate": 1.207623282849436e-06, "loss": 0.6826, "step": 11063 }, { "epoch": 4.234213547646384, "grad_norm": 0.5133363008499146, "learning_rate": 1.20644261519841e-06, "loss": 0.6782, "step": 11064 }, { "epoch": 4.234596249521623, "grad_norm": 0.5439971089363098, "learning_rate": 1.2052624879351105e-06, "loss": 0.6521, "step": 11065 }, { "epoch": 4.234978951396862, "grad_norm": 0.5443016290664673, "learning_rate": 1.2040829011320554e-06, "loss": 0.6594, "step": 11066 }, { "epoch": 4.2353616532721015, "grad_norm": 0.5365907549858093, "learning_rate": 1.2029038548617368e-06, "loss": 0.6098, "step": 11067 }, { "epoch": 4.23574435514734, "grad_norm": 0.5438171029090881, "learning_rate": 1.2017253491966118e-06, "loss": 0.6223, "step": 11068 }, { "epoch": 4.236127057022579, "grad_norm": 0.5069991946220398, "learning_rate": 1.2005473842090976e-06, "loss": 0.6145, "step": 11069 }, { "epoch": 4.236509758897818, "grad_norm": 0.5664775967597961, "learning_rate": 1.199369959971587e-06, "loss": 0.6095, "step": 11070 }, { "epoch": 4.236892460773058, "grad_norm": 0.5510345101356506, "learning_rate": 1.1981930765564353e-06, "loss": 0.6335, "step": 11071 }, { "epoch": 4.237275162648297, "grad_norm": 0.5758383870124817, "learning_rate": 1.1970167340359672e-06, "loss": 0.6182, "step": 11072 }, { "epoch": 4.237657864523536, "grad_norm": 0.5613378882408142, "learning_rate": 1.195840932482465e-06, "loss": 0.6227, "step": 11073 }, { "epoch": 4.238040566398776, "grad_norm": 0.5378267765045166, "learning_rate": 1.1946656719681926e-06, "loss": 0.6317, "step": 11074 }, { "epoch": 4.238423268274015, "grad_norm": 0.5627957582473755, "learning_rate": 1.1934909525653715e-06, "loss": 0.6449, "step": 11075 }, { "epoch": 4.2388059701492535, "grad_norm": 0.5554263591766357, "learning_rate": 1.1923167743461884e-06, "loss": 0.5963, "step": 11076 }, { "epoch": 4.239188672024493, "grad_norm": 0.5184958577156067, "learning_rate": 1.1911431373828009e-06, "loss": 0.6109, "step": 11077 }, { "epoch": 4.239571373899732, "grad_norm": 0.5039195418357849, "learning_rate": 1.1899700417473347e-06, "loss": 0.5443, "step": 11078 }, { "epoch": 4.239954075774971, "grad_norm": 0.5597178339958191, "learning_rate": 1.1887974875118758e-06, "loss": 0.546, "step": 11079 }, { "epoch": 4.24033677765021, "grad_norm": 0.5087544322013855, "learning_rate": 1.1876254747484827e-06, "loss": 0.5293, "step": 11080 }, { "epoch": 4.24071947952545, "grad_norm": 0.5188131332397461, "learning_rate": 1.1864540035291793e-06, "loss": 0.6456, "step": 11081 }, { "epoch": 4.241102181400689, "grad_norm": 0.5548796653747559, "learning_rate": 1.1852830739259546e-06, "loss": 0.669, "step": 11082 }, { "epoch": 4.241484883275928, "grad_norm": 0.5017492771148682, "learning_rate": 1.184112686010762e-06, "loss": 0.5439, "step": 11083 }, { "epoch": 4.241867585151168, "grad_norm": 0.5763781666755676, "learning_rate": 1.182942839855531e-06, "loss": 0.6434, "step": 11084 }, { "epoch": 4.2422502870264065, "grad_norm": 0.5089772343635559, "learning_rate": 1.1817735355321526e-06, "loss": 0.6415, "step": 11085 }, { "epoch": 4.242632988901645, "grad_norm": 0.5574910640716553, "learning_rate": 1.1806047731124771e-06, "loss": 0.6044, "step": 11086 }, { "epoch": 4.243015690776885, "grad_norm": 0.6431347727775574, "learning_rate": 1.1794365526683326e-06, "loss": 0.6354, "step": 11087 }, { "epoch": 4.243398392652124, "grad_norm": 0.5852413177490234, "learning_rate": 1.1782688742715098e-06, "loss": 0.6773, "step": 11088 }, { "epoch": 4.243781094527363, "grad_norm": 0.5196291208267212, "learning_rate": 1.1771017379937621e-06, "loss": 0.557, "step": 11089 }, { "epoch": 4.244163796402602, "grad_norm": 0.5338638424873352, "learning_rate": 1.1759351439068156e-06, "loss": 0.6232, "step": 11090 }, { "epoch": 4.244546498277842, "grad_norm": 0.540123462677002, "learning_rate": 1.1747690920823617e-06, "loss": 0.6148, "step": 11091 }, { "epoch": 4.244929200153081, "grad_norm": 0.54254150390625, "learning_rate": 1.1736035825920544e-06, "loss": 0.6444, "step": 11092 }, { "epoch": 4.24531190202832, "grad_norm": 0.5506824851036072, "learning_rate": 1.1724386155075184e-06, "loss": 0.5657, "step": 11093 }, { "epoch": 4.2456946039035595, "grad_norm": 0.5821827054023743, "learning_rate": 1.1712741909003444e-06, "loss": 0.6034, "step": 11094 }, { "epoch": 4.246077305778798, "grad_norm": 0.5513712167739868, "learning_rate": 1.1701103088420907e-06, "loss": 0.6242, "step": 11095 }, { "epoch": 4.246460007654037, "grad_norm": 0.5332359671592712, "learning_rate": 1.1689469694042799e-06, "loss": 0.5921, "step": 11096 }, { "epoch": 4.246842709529277, "grad_norm": 0.5335699319839478, "learning_rate": 1.1677841726584015e-06, "loss": 0.6211, "step": 11097 }, { "epoch": 4.247225411404516, "grad_norm": 0.5344750285148621, "learning_rate": 1.1666219186759165e-06, "loss": 0.666, "step": 11098 }, { "epoch": 4.247608113279755, "grad_norm": 0.5674877166748047, "learning_rate": 1.1654602075282418e-06, "loss": 0.6476, "step": 11099 }, { "epoch": 4.247990815154994, "grad_norm": 0.5147172212600708, "learning_rate": 1.1642990392867726e-06, "loss": 0.5409, "step": 11100 }, { "epoch": 4.248373517030234, "grad_norm": 0.5501988530158997, "learning_rate": 1.1631384140228662e-06, "loss": 0.6431, "step": 11101 }, { "epoch": 4.248756218905473, "grad_norm": 0.5464019775390625, "learning_rate": 1.1619783318078415e-06, "loss": 0.5446, "step": 11102 }, { "epoch": 4.2491389207807115, "grad_norm": 0.553562343120575, "learning_rate": 1.1608187927129933e-06, "loss": 0.6865, "step": 11103 }, { "epoch": 4.249521622655951, "grad_norm": 0.5095235109329224, "learning_rate": 1.1596597968095746e-06, "loss": 0.5425, "step": 11104 }, { "epoch": 4.24990432453119, "grad_norm": 0.5819931626319885, "learning_rate": 1.158501344168812e-06, "loss": 0.6779, "step": 11105 }, { "epoch": 4.250287026406429, "grad_norm": 0.5037328004837036, "learning_rate": 1.1573434348618951e-06, "loss": 0.6358, "step": 11106 }, { "epoch": 4.250669728281668, "grad_norm": 0.5515223741531372, "learning_rate": 1.1561860689599802e-06, "loss": 0.6221, "step": 11107 }, { "epoch": 4.251052430156908, "grad_norm": 0.5348237752914429, "learning_rate": 1.1550292465341895e-06, "loss": 0.6745, "step": 11108 }, { "epoch": 4.251435132032147, "grad_norm": 0.5654158592224121, "learning_rate": 1.1538729676556126e-06, "loss": 0.5878, "step": 11109 }, { "epoch": 4.251817833907386, "grad_norm": 0.5228031873703003, "learning_rate": 1.1527172323953073e-06, "loss": 0.5962, "step": 11110 }, { "epoch": 4.252200535782626, "grad_norm": 0.5487667918205261, "learning_rate": 1.1515620408242989e-06, "loss": 0.609, "step": 11111 }, { "epoch": 4.2525832376578645, "grad_norm": 0.5346028804779053, "learning_rate": 1.1504073930135706e-06, "loss": 0.6561, "step": 11112 }, { "epoch": 4.252965939533103, "grad_norm": 0.6018579006195068, "learning_rate": 1.1492532890340836e-06, "loss": 0.5822, "step": 11113 }, { "epoch": 4.253348641408343, "grad_norm": 0.5645239353179932, "learning_rate": 1.1480997289567596e-06, "loss": 0.6132, "step": 11114 }, { "epoch": 4.253731343283582, "grad_norm": 0.5324336290359497, "learning_rate": 1.146946712852487e-06, "loss": 0.6153, "step": 11115 }, { "epoch": 4.254114045158821, "grad_norm": 0.5545833706855774, "learning_rate": 1.145794240792123e-06, "loss": 0.7058, "step": 11116 }, { "epoch": 4.254496747034061, "grad_norm": 0.5806849002838135, "learning_rate": 1.1446423128464913e-06, "loss": 0.6236, "step": 11117 }, { "epoch": 4.2548794489093, "grad_norm": 0.5358523726463318, "learning_rate": 1.1434909290863783e-06, "loss": 0.5595, "step": 11118 }, { "epoch": 4.255262150784539, "grad_norm": 0.5577012300491333, "learning_rate": 1.1423400895825398e-06, "loss": 0.5712, "step": 11119 }, { "epoch": 4.255644852659778, "grad_norm": 0.5267889499664307, "learning_rate": 1.1411897944057004e-06, "loss": 0.5784, "step": 11120 }, { "epoch": 4.2560275545350175, "grad_norm": 0.5898492336273193, "learning_rate": 1.1400400436265458e-06, "loss": 0.5843, "step": 11121 }, { "epoch": 4.256410256410256, "grad_norm": 0.50673508644104, "learning_rate": 1.1388908373157325e-06, "loss": 0.5285, "step": 11122 }, { "epoch": 4.256792958285495, "grad_norm": 0.6201563477516174, "learning_rate": 1.1377421755438834e-06, "loss": 0.6332, "step": 11123 }, { "epoch": 4.257175660160735, "grad_norm": 0.5226055979728699, "learning_rate": 1.1365940583815848e-06, "loss": 0.6589, "step": 11124 }, { "epoch": 4.257558362035974, "grad_norm": 0.5444477200508118, "learning_rate": 1.135446485899393e-06, "loss": 0.5556, "step": 11125 }, { "epoch": 4.257941063911213, "grad_norm": 0.5822721123695374, "learning_rate": 1.134299458167829e-06, "loss": 0.5685, "step": 11126 }, { "epoch": 4.258323765786452, "grad_norm": 0.5409423112869263, "learning_rate": 1.1331529752573845e-06, "loss": 0.6152, "step": 11127 }, { "epoch": 4.258706467661692, "grad_norm": 0.5462958216667175, "learning_rate": 1.1320070372385062e-06, "loss": 0.5327, "step": 11128 }, { "epoch": 4.259089169536931, "grad_norm": 0.5327183604240417, "learning_rate": 1.13086164418162e-06, "loss": 0.5654, "step": 11129 }, { "epoch": 4.2594718714121695, "grad_norm": 0.554833710193634, "learning_rate": 1.1297167961571154e-06, "loss": 0.6513, "step": 11130 }, { "epoch": 4.259854573287409, "grad_norm": 0.5571714043617249, "learning_rate": 1.1285724932353415e-06, "loss": 0.6693, "step": 11131 }, { "epoch": 4.260237275162648, "grad_norm": 0.5639869570732117, "learning_rate": 1.1274287354866197e-06, "loss": 0.632, "step": 11132 }, { "epoch": 4.260619977037887, "grad_norm": 0.5349244475364685, "learning_rate": 1.1262855229812387e-06, "loss": 0.5864, "step": 11133 }, { "epoch": 4.261002678913127, "grad_norm": 0.6014665365219116, "learning_rate": 1.1251428557894516e-06, "loss": 0.6023, "step": 11134 }, { "epoch": 4.261385380788366, "grad_norm": 0.5768909454345703, "learning_rate": 1.1240007339814784e-06, "loss": 0.6914, "step": 11135 }, { "epoch": 4.261768082663605, "grad_norm": 0.6067772507667542, "learning_rate": 1.122859157627505e-06, "loss": 0.6433, "step": 11136 }, { "epoch": 4.262150784538845, "grad_norm": 0.5839040279388428, "learning_rate": 1.1217181267976872e-06, "loss": 0.5761, "step": 11137 }, { "epoch": 4.262533486414084, "grad_norm": 0.5206677913665771, "learning_rate": 1.1205776415621394e-06, "loss": 0.5631, "step": 11138 }, { "epoch": 4.2629161882893225, "grad_norm": 0.5429579615592957, "learning_rate": 1.1194377019909507e-06, "loss": 0.7016, "step": 11139 }, { "epoch": 4.263298890164561, "grad_norm": 0.5390485525131226, "learning_rate": 1.1182983081541743e-06, "loss": 0.6227, "step": 11140 }, { "epoch": 4.263681592039801, "grad_norm": 0.5379793643951416, "learning_rate": 1.117159460121825e-06, "loss": 0.5926, "step": 11141 }, { "epoch": 4.26406429391504, "grad_norm": 0.5367494821548462, "learning_rate": 1.1160211579638913e-06, "loss": 0.709, "step": 11142 }, { "epoch": 4.264446995790279, "grad_norm": 0.498310923576355, "learning_rate": 1.114883401750324e-06, "loss": 0.5643, "step": 11143 }, { "epoch": 4.264829697665519, "grad_norm": 0.5060187578201294, "learning_rate": 1.1137461915510417e-06, "loss": 0.5911, "step": 11144 }, { "epoch": 4.265212399540758, "grad_norm": 0.5269771814346313, "learning_rate": 1.1126095274359284e-06, "loss": 0.5754, "step": 11145 }, { "epoch": 4.265595101415997, "grad_norm": 0.5488085746765137, "learning_rate": 1.111473409474837e-06, "loss": 0.6216, "step": 11146 }, { "epoch": 4.265977803291236, "grad_norm": 0.5709197521209717, "learning_rate": 1.1103378377375818e-06, "loss": 0.6697, "step": 11147 }, { "epoch": 4.2663605051664755, "grad_norm": 0.5910073518753052, "learning_rate": 1.1092028122939492e-06, "loss": 0.6848, "step": 11148 }, { "epoch": 4.266743207041714, "grad_norm": 0.5680426359176636, "learning_rate": 1.1080683332136876e-06, "loss": 0.5883, "step": 11149 }, { "epoch": 4.267125908916953, "grad_norm": 0.5764985680580139, "learning_rate": 1.1069344005665183e-06, "loss": 0.6465, "step": 11150 }, { "epoch": 4.267508610792193, "grad_norm": 0.5255790948867798, "learning_rate": 1.1058010144221188e-06, "loss": 0.6246, "step": 11151 }, { "epoch": 4.267891312667432, "grad_norm": 0.5225108861923218, "learning_rate": 1.1046681748501409e-06, "loss": 0.5365, "step": 11152 }, { "epoch": 4.268274014542671, "grad_norm": 0.5344287753105164, "learning_rate": 1.1035358819202002e-06, "loss": 0.62, "step": 11153 }, { "epoch": 4.268656716417911, "grad_norm": 0.5136569738388062, "learning_rate": 1.102404135701881e-06, "loss": 0.5105, "step": 11154 }, { "epoch": 4.26903941829315, "grad_norm": 0.528879702091217, "learning_rate": 1.10127293626473e-06, "loss": 0.5712, "step": 11155 }, { "epoch": 4.269422120168389, "grad_norm": 0.538610577583313, "learning_rate": 1.100142283678266e-06, "loss": 0.6927, "step": 11156 }, { "epoch": 4.269804822043628, "grad_norm": 0.5185073614120483, "learning_rate": 1.0990121780119668e-06, "loss": 0.6063, "step": 11157 }, { "epoch": 4.270187523918867, "grad_norm": 0.5448299646377563, "learning_rate": 1.0978826193352798e-06, "loss": 0.5578, "step": 11158 }, { "epoch": 4.270570225794106, "grad_norm": 0.5285356044769287, "learning_rate": 1.0967536077176245e-06, "loss": 0.651, "step": 11159 }, { "epoch": 4.270952927669345, "grad_norm": 0.5023664236068726, "learning_rate": 1.095625143228377e-06, "loss": 0.5282, "step": 11160 }, { "epoch": 4.271335629544585, "grad_norm": 0.5565369129180908, "learning_rate": 1.0944972259368848e-06, "loss": 0.5951, "step": 11161 }, { "epoch": 4.271718331419824, "grad_norm": 0.534015417098999, "learning_rate": 1.0933698559124629e-06, "loss": 0.5621, "step": 11162 }, { "epoch": 4.272101033295063, "grad_norm": 0.5131295323371887, "learning_rate": 1.0922430332243916e-06, "loss": 0.597, "step": 11163 }, { "epoch": 4.272483735170303, "grad_norm": 0.5600849986076355, "learning_rate": 1.0911167579419158e-06, "loss": 0.6647, "step": 11164 }, { "epoch": 4.272866437045542, "grad_norm": 0.5666799545288086, "learning_rate": 1.0899910301342486e-06, "loss": 0.5923, "step": 11165 }, { "epoch": 4.2732491389207805, "grad_norm": 0.5641816854476929, "learning_rate": 1.0888658498705717e-06, "loss": 0.6331, "step": 11166 }, { "epoch": 4.273631840796019, "grad_norm": 0.5850141048431396, "learning_rate": 1.0877412172200264e-06, "loss": 0.6629, "step": 11167 }, { "epoch": 4.274014542671259, "grad_norm": 0.5338900089263916, "learning_rate": 1.0866171322517261e-06, "loss": 0.5748, "step": 11168 }, { "epoch": 4.274397244546498, "grad_norm": 0.5597543716430664, "learning_rate": 1.0854935950347501e-06, "loss": 0.6748, "step": 11169 }, { "epoch": 4.274779946421737, "grad_norm": 0.5145540833473206, "learning_rate": 1.0843706056381397e-06, "loss": 0.6555, "step": 11170 }, { "epoch": 4.275162648296977, "grad_norm": 0.5201734304428101, "learning_rate": 1.0832481641309067e-06, "loss": 0.6664, "step": 11171 }, { "epoch": 4.275545350172216, "grad_norm": 0.5516719222068787, "learning_rate": 1.0821262705820313e-06, "loss": 0.6589, "step": 11172 }, { "epoch": 4.275928052047455, "grad_norm": 0.5429859757423401, "learning_rate": 1.0810049250604505e-06, "loss": 0.6137, "step": 11173 }, { "epoch": 4.2763107539226946, "grad_norm": 0.5536341071128845, "learning_rate": 1.0798841276350792e-06, "loss": 0.6637, "step": 11174 }, { "epoch": 4.2766934557979335, "grad_norm": 0.5553443431854248, "learning_rate": 1.0787638783747922e-06, "loss": 0.6487, "step": 11175 }, { "epoch": 4.277076157673172, "grad_norm": 0.5199742317199707, "learning_rate": 1.0776441773484348e-06, "loss": 0.584, "step": 11176 }, { "epoch": 4.277458859548412, "grad_norm": 0.567894697189331, "learning_rate": 1.0765250246248093e-06, "loss": 0.6183, "step": 11177 }, { "epoch": 4.277841561423651, "grad_norm": 0.5315114259719849, "learning_rate": 1.0754064202726943e-06, "loss": 0.5819, "step": 11178 }, { "epoch": 4.27822426329889, "grad_norm": 0.5245517492294312, "learning_rate": 1.0742883643608326e-06, "loss": 0.5882, "step": 11179 }, { "epoch": 4.278606965174129, "grad_norm": 0.5781959295272827, "learning_rate": 1.073170856957928e-06, "loss": 0.6364, "step": 11180 }, { "epoch": 4.278989667049369, "grad_norm": 0.5331621170043945, "learning_rate": 1.0720538981326557e-06, "loss": 0.6506, "step": 11181 }, { "epoch": 4.279372368924608, "grad_norm": 0.5626967549324036, "learning_rate": 1.0709374879536582e-06, "loss": 0.6406, "step": 11182 }, { "epoch": 4.279755070799847, "grad_norm": 0.5197425484657288, "learning_rate": 1.0698216264895355e-06, "loss": 0.6442, "step": 11183 }, { "epoch": 4.280137772675086, "grad_norm": 0.5338577628135681, "learning_rate": 1.068706313808867e-06, "loss": 0.6562, "step": 11184 }, { "epoch": 4.280520474550325, "grad_norm": 0.5734832882881165, "learning_rate": 1.0675915499801915e-06, "loss": 0.6648, "step": 11185 }, { "epoch": 4.280903176425564, "grad_norm": 0.5616318583488464, "learning_rate": 1.0664773350720104e-06, "loss": 0.623, "step": 11186 }, { "epoch": 4.281285878300803, "grad_norm": 0.5552297830581665, "learning_rate": 1.065363669152797e-06, "loss": 0.6437, "step": 11187 }, { "epoch": 4.281668580176043, "grad_norm": 0.5588557720184326, "learning_rate": 1.0642505522909885e-06, "loss": 0.6549, "step": 11188 }, { "epoch": 4.282051282051282, "grad_norm": 0.565185546875, "learning_rate": 1.0631379845549916e-06, "loss": 0.5971, "step": 11189 }, { "epoch": 4.282433983926521, "grad_norm": 0.5381927490234375, "learning_rate": 1.0620259660131715e-06, "loss": 0.6425, "step": 11190 }, { "epoch": 4.282816685801761, "grad_norm": 0.5393435955047607, "learning_rate": 1.060914496733868e-06, "loss": 0.627, "step": 11191 }, { "epoch": 4.283199387677, "grad_norm": 0.5091724395751953, "learning_rate": 1.0598035767853853e-06, "loss": 0.5974, "step": 11192 }, { "epoch": 4.2835820895522385, "grad_norm": 0.5694056749343872, "learning_rate": 1.058693206235989e-06, "loss": 0.6856, "step": 11193 }, { "epoch": 4.283964791427478, "grad_norm": 0.5282039642333984, "learning_rate": 1.0575833851539142e-06, "loss": 0.5705, "step": 11194 }, { "epoch": 4.284347493302717, "grad_norm": 0.522997260093689, "learning_rate": 1.0564741136073675e-06, "loss": 0.6456, "step": 11195 }, { "epoch": 4.284730195177956, "grad_norm": 0.5168172121047974, "learning_rate": 1.0553653916645113e-06, "loss": 0.5651, "step": 11196 }, { "epoch": 4.285112897053196, "grad_norm": 0.48557883501052856, "learning_rate": 1.0542572193934819e-06, "loss": 0.5468, "step": 11197 }, { "epoch": 4.285495598928435, "grad_norm": 0.49195849895477295, "learning_rate": 1.0531495968623806e-06, "loss": 0.633, "step": 11198 }, { "epoch": 4.285878300803674, "grad_norm": 0.5464527606964111, "learning_rate": 1.0520425241392695e-06, "loss": 0.6605, "step": 11199 }, { "epoch": 4.286261002678913, "grad_norm": 0.5494336485862732, "learning_rate": 1.0509360012921855e-06, "loss": 0.5955, "step": 11200 }, { "epoch": 4.286643704554153, "grad_norm": 0.5108503103256226, "learning_rate": 1.049830028389125e-06, "loss": 0.5254, "step": 11201 }, { "epoch": 4.2870264064293915, "grad_norm": 0.5872328877449036, "learning_rate": 1.048724605498055e-06, "loss": 0.5953, "step": 11202 }, { "epoch": 4.28740910830463, "grad_norm": 0.5797367691993713, "learning_rate": 1.0476197326869043e-06, "loss": 0.6596, "step": 11203 }, { "epoch": 4.28779181017987, "grad_norm": 0.5308888554573059, "learning_rate": 1.046515410023572e-06, "loss": 0.5741, "step": 11204 }, { "epoch": 4.288174512055109, "grad_norm": 0.6769266128540039, "learning_rate": 1.0454116375759205e-06, "loss": 0.5956, "step": 11205 }, { "epoch": 4.288557213930348, "grad_norm": 0.5274788737297058, "learning_rate": 1.04430841541178e-06, "loss": 0.6038, "step": 11206 }, { "epoch": 4.288939915805587, "grad_norm": 0.5424413084983826, "learning_rate": 1.0432057435989474e-06, "loss": 0.6508, "step": 11207 }, { "epoch": 4.289322617680827, "grad_norm": 0.5195704102516174, "learning_rate": 1.042103622205186e-06, "loss": 0.6509, "step": 11208 }, { "epoch": 4.289705319556066, "grad_norm": 0.5281949639320374, "learning_rate": 1.04100205129822e-06, "loss": 0.6199, "step": 11209 }, { "epoch": 4.290088021431305, "grad_norm": 0.6003187894821167, "learning_rate": 1.0399010309457459e-06, "loss": 0.5982, "step": 11210 }, { "epoch": 4.290470723306544, "grad_norm": 0.5390472412109375, "learning_rate": 1.0388005612154273e-06, "loss": 0.5983, "step": 11211 }, { "epoch": 4.290853425181783, "grad_norm": 0.5288276672363281, "learning_rate": 1.037700642174887e-06, "loss": 0.5584, "step": 11212 }, { "epoch": 4.291236127057022, "grad_norm": 0.5336052775382996, "learning_rate": 1.0366012738917186e-06, "loss": 0.5801, "step": 11213 }, { "epoch": 4.291618828932262, "grad_norm": 0.5251816511154175, "learning_rate": 1.0355024564334815e-06, "loss": 0.5876, "step": 11214 }, { "epoch": 4.292001530807501, "grad_norm": 0.5371766090393066, "learning_rate": 1.0344041898677027e-06, "loss": 0.5745, "step": 11215 }, { "epoch": 4.29238423268274, "grad_norm": 0.5242972373962402, "learning_rate": 1.0333064742618716e-06, "loss": 0.5937, "step": 11216 }, { "epoch": 4.29276693455798, "grad_norm": 0.5653554797172546, "learning_rate": 1.0322093096834461e-06, "loss": 0.6064, "step": 11217 }, { "epoch": 4.293149636433219, "grad_norm": 0.5860525965690613, "learning_rate": 1.031112696199853e-06, "loss": 0.6187, "step": 11218 }, { "epoch": 4.293532338308458, "grad_norm": 0.5201283693313599, "learning_rate": 1.0300166338784777e-06, "loss": 0.5836, "step": 11219 }, { "epoch": 4.2939150401836965, "grad_norm": 0.536653995513916, "learning_rate": 1.028921122786678e-06, "loss": 0.6225, "step": 11220 }, { "epoch": 4.294297742058936, "grad_norm": 0.5881136655807495, "learning_rate": 1.0278261629917775e-06, "loss": 0.5753, "step": 11221 }, { "epoch": 4.294680443934175, "grad_norm": 0.571479320526123, "learning_rate": 1.0267317545610622e-06, "loss": 0.6934, "step": 11222 }, { "epoch": 4.295063145809414, "grad_norm": 0.5933157801628113, "learning_rate": 1.0256378975617854e-06, "loss": 0.6514, "step": 11223 }, { "epoch": 4.295445847684654, "grad_norm": 0.5365381240844727, "learning_rate": 1.0245445920611707e-06, "loss": 0.5944, "step": 11224 }, { "epoch": 4.295828549559893, "grad_norm": 1.85227370262146, "learning_rate": 1.0234518381264025e-06, "loss": 0.6088, "step": 11225 }, { "epoch": 4.296211251435132, "grad_norm": 0.5351013541221619, "learning_rate": 1.0223596358246346e-06, "loss": 0.6231, "step": 11226 }, { "epoch": 4.296593953310371, "grad_norm": 0.5346802473068237, "learning_rate": 1.0212679852229857e-06, "loss": 0.5354, "step": 11227 }, { "epoch": 4.296976655185611, "grad_norm": 0.552497148513794, "learning_rate": 1.0201768863885418e-06, "loss": 0.6823, "step": 11228 }, { "epoch": 4.2973593570608495, "grad_norm": 0.5048446655273438, "learning_rate": 1.019086339388351e-06, "loss": 0.5424, "step": 11229 }, { "epoch": 4.297742058936088, "grad_norm": 0.5362714529037476, "learning_rate": 1.0179963442894315e-06, "loss": 0.5989, "step": 11230 }, { "epoch": 4.298124760811328, "grad_norm": 0.5284846425056458, "learning_rate": 1.0169069011587684e-06, "loss": 0.6404, "step": 11231 }, { "epoch": 4.298507462686567, "grad_norm": 0.5186644196510315, "learning_rate": 1.0158180100633074e-06, "loss": 0.6006, "step": 11232 }, { "epoch": 4.298890164561806, "grad_norm": 0.5216668844223022, "learning_rate": 1.0147296710699661e-06, "loss": 0.5267, "step": 11233 }, { "epoch": 4.299272866437046, "grad_norm": 0.5663781762123108, "learning_rate": 1.0136418842456252e-06, "loss": 0.5856, "step": 11234 }, { "epoch": 4.299655568312285, "grad_norm": 0.5450865030288696, "learning_rate": 1.0125546496571315e-06, "loss": 0.5912, "step": 11235 }, { "epoch": 4.300038270187524, "grad_norm": 0.5387639403343201, "learning_rate": 1.0114679673713002e-06, "loss": 0.6621, "step": 11236 }, { "epoch": 4.3004209720627635, "grad_norm": 0.5788896679878235, "learning_rate": 1.0103818374549113e-06, "loss": 0.6127, "step": 11237 }, { "epoch": 4.3008036739380024, "grad_norm": 0.5164273381233215, "learning_rate": 1.009296259974707e-06, "loss": 0.585, "step": 11238 }, { "epoch": 4.301186375813241, "grad_norm": 0.5246689915657043, "learning_rate": 1.0082112349974017e-06, "loss": 0.5737, "step": 11239 }, { "epoch": 4.30156907768848, "grad_norm": 0.5899128317832947, "learning_rate": 1.0071267625896719e-06, "loss": 0.6809, "step": 11240 }, { "epoch": 4.30195177956372, "grad_norm": 0.5165373682975769, "learning_rate": 1.0060428428181635e-06, "loss": 0.5206, "step": 11241 }, { "epoch": 4.302334481438959, "grad_norm": 0.5288999676704407, "learning_rate": 1.0049594757494829e-06, "loss": 0.7064, "step": 11242 }, { "epoch": 4.302717183314198, "grad_norm": 0.6025933027267456, "learning_rate": 1.003876661450207e-06, "loss": 0.5662, "step": 11243 }, { "epoch": 4.303099885189438, "grad_norm": 0.5802053213119507, "learning_rate": 1.0027943999868784e-06, "loss": 0.5717, "step": 11244 }, { "epoch": 4.303482587064677, "grad_norm": 0.5129358172416687, "learning_rate": 1.0017126914260055e-06, "loss": 0.5539, "step": 11245 }, { "epoch": 4.303865288939916, "grad_norm": 0.5441921949386597, "learning_rate": 1.0006315358340612e-06, "loss": 0.6629, "step": 11246 }, { "epoch": 4.3042479908151545, "grad_norm": 0.5361728072166443, "learning_rate": 9.995509332774877e-07, "loss": 0.6236, "step": 11247 }, { "epoch": 4.304630692690394, "grad_norm": 0.5496836304664612, "learning_rate": 9.984708838226875e-07, "loss": 0.6347, "step": 11248 }, { "epoch": 4.305013394565633, "grad_norm": 0.5419224500656128, "learning_rate": 9.973913875360331e-07, "loss": 0.5809, "step": 11249 }, { "epoch": 4.305396096440872, "grad_norm": 0.585127055644989, "learning_rate": 9.963124444838656e-07, "loss": 0.5616, "step": 11250 }, { "epoch": 4.305778798316112, "grad_norm": 0.5758878588676453, "learning_rate": 9.952340547324845e-07, "loss": 0.6261, "step": 11251 }, { "epoch": 4.306161500191351, "grad_norm": 0.5327380299568176, "learning_rate": 9.941562183481622e-07, "loss": 0.6427, "step": 11252 }, { "epoch": 4.30654420206659, "grad_norm": 0.5261566042900085, "learning_rate": 9.930789353971348e-07, "loss": 0.6007, "step": 11253 }, { "epoch": 4.30692690394183, "grad_norm": 0.5769641995429993, "learning_rate": 9.920022059456035e-07, "loss": 0.573, "step": 11254 }, { "epoch": 4.307309605817069, "grad_norm": 0.5528581738471985, "learning_rate": 9.909260300597367e-07, "loss": 0.7192, "step": 11255 }, { "epoch": 4.3076923076923075, "grad_norm": 0.5459734201431274, "learning_rate": 9.898504078056681e-07, "loss": 0.643, "step": 11256 }, { "epoch": 4.308075009567547, "grad_norm": 0.5359740853309631, "learning_rate": 9.887753392494992e-07, "loss": 0.5803, "step": 11257 }, { "epoch": 4.308457711442786, "grad_norm": 0.5361897349357605, "learning_rate": 9.877008244572927e-07, "loss": 0.6565, "step": 11258 }, { "epoch": 4.308840413318025, "grad_norm": 0.5453000068664551, "learning_rate": 9.866268634950816e-07, "loss": 0.6261, "step": 11259 }, { "epoch": 4.309223115193264, "grad_norm": 0.5624399185180664, "learning_rate": 9.85553456428866e-07, "loss": 0.6697, "step": 11260 }, { "epoch": 4.309605817068504, "grad_norm": 0.5197600722312927, "learning_rate": 9.844806033246069e-07, "loss": 0.6692, "step": 11261 }, { "epoch": 4.309988518943743, "grad_norm": 0.5354188084602356, "learning_rate": 9.834083042482334e-07, "loss": 0.616, "step": 11262 }, { "epoch": 4.310371220818982, "grad_norm": 0.5335478186607361, "learning_rate": 9.823365592656431e-07, "loss": 0.6507, "step": 11263 }, { "epoch": 4.3107539226942215, "grad_norm": 0.5633135437965393, "learning_rate": 9.812653684426975e-07, "loss": 0.5862, "step": 11264 }, { "epoch": 4.3111366245694605, "grad_norm": 0.49891239404678345, "learning_rate": 9.801947318452233e-07, "loss": 0.5899, "step": 11265 }, { "epoch": 4.311519326444699, "grad_norm": 0.5448890924453735, "learning_rate": 9.791246495390139e-07, "loss": 0.6438, "step": 11266 }, { "epoch": 4.311902028319938, "grad_norm": 0.5201904773712158, "learning_rate": 9.780551215898326e-07, "loss": 0.5734, "step": 11267 }, { "epoch": 4.312284730195178, "grad_norm": 0.573647677898407, "learning_rate": 9.76986148063398e-07, "loss": 0.6939, "step": 11268 }, { "epoch": 4.312667432070417, "grad_norm": 0.5633631348609924, "learning_rate": 9.759177290254062e-07, "loss": 0.5565, "step": 11269 }, { "epoch": 4.313050133945656, "grad_norm": 0.6134405136108398, "learning_rate": 9.748498645415139e-07, "loss": 0.6043, "step": 11270 }, { "epoch": 4.313432835820896, "grad_norm": 0.5366398692131042, "learning_rate": 9.737825546773427e-07, "loss": 0.6601, "step": 11271 }, { "epoch": 4.313815537696135, "grad_norm": 0.5473141670227051, "learning_rate": 9.727157994984815e-07, "loss": 0.6362, "step": 11272 }, { "epoch": 4.314198239571374, "grad_norm": 0.5603510141372681, "learning_rate": 9.716495990704856e-07, "loss": 0.6629, "step": 11273 }, { "epoch": 4.314580941446613, "grad_norm": 0.5569769740104675, "learning_rate": 9.705839534588767e-07, "loss": 0.6418, "step": 11274 }, { "epoch": 4.314963643321852, "grad_norm": 0.536586582660675, "learning_rate": 9.695188627291419e-07, "loss": 0.6162, "step": 11275 }, { "epoch": 4.315346345197091, "grad_norm": 0.5519245266914368, "learning_rate": 9.684543269467351e-07, "loss": 0.5986, "step": 11276 }, { "epoch": 4.315729047072331, "grad_norm": 0.5831440091133118, "learning_rate": 9.673903461770705e-07, "loss": 0.6905, "step": 11277 }, { "epoch": 4.31611174894757, "grad_norm": 0.6083848476409912, "learning_rate": 9.663269204855364e-07, "loss": 0.6229, "step": 11278 }, { "epoch": 4.316494450822809, "grad_norm": 0.5615477561950684, "learning_rate": 9.652640499374832e-07, "loss": 0.6494, "step": 11279 }, { "epoch": 4.316877152698048, "grad_norm": 0.5790256261825562, "learning_rate": 9.642017345982235e-07, "loss": 0.6354, "step": 11280 }, { "epoch": 4.317259854573288, "grad_norm": 0.5847644805908203, "learning_rate": 9.631399745330417e-07, "loss": 0.643, "step": 11281 }, { "epoch": 4.317642556448527, "grad_norm": 0.5102486610412598, "learning_rate": 9.620787698071864e-07, "loss": 0.6325, "step": 11282 }, { "epoch": 4.3180252583237655, "grad_norm": 0.5926884412765503, "learning_rate": 9.610181204858715e-07, "loss": 0.7002, "step": 11283 }, { "epoch": 4.318407960199005, "grad_norm": 0.57903653383255, "learning_rate": 9.599580266342755e-07, "loss": 0.7507, "step": 11284 }, { "epoch": 4.318790662074244, "grad_norm": 0.5231698155403137, "learning_rate": 9.588984883175445e-07, "loss": 0.585, "step": 11285 }, { "epoch": 4.319173363949483, "grad_norm": 0.542397677898407, "learning_rate": 9.578395056007929e-07, "loss": 0.5938, "step": 11286 }, { "epoch": 4.319556065824723, "grad_norm": 0.5529583692550659, "learning_rate": 9.567810785490928e-07, "loss": 0.5796, "step": 11287 }, { "epoch": 4.319938767699962, "grad_norm": 0.5466506481170654, "learning_rate": 9.557232072274902e-07, "loss": 0.5554, "step": 11288 }, { "epoch": 4.320321469575201, "grad_norm": 0.5240030288696289, "learning_rate": 9.546658917009953e-07, "loss": 0.6175, "step": 11289 }, { "epoch": 4.32070417145044, "grad_norm": 0.6031405925750732, "learning_rate": 9.536091320345798e-07, "loss": 0.6371, "step": 11290 }, { "epoch": 4.3210868733256795, "grad_norm": 0.5771393775939941, "learning_rate": 9.525529282931867e-07, "loss": 0.6714, "step": 11291 }, { "epoch": 4.3214695752009185, "grad_norm": 0.5490373373031616, "learning_rate": 9.514972805417233e-07, "loss": 0.6116, "step": 11292 }, { "epoch": 4.321852277076157, "grad_norm": 0.5703131556510925, "learning_rate": 9.504421888450599e-07, "loss": 0.5882, "step": 11293 }, { "epoch": 4.322234978951397, "grad_norm": 0.5519819259643555, "learning_rate": 9.493876532680324e-07, "loss": 0.7251, "step": 11294 }, { "epoch": 4.322617680826636, "grad_norm": 0.5228085517883301, "learning_rate": 9.4833367387545e-07, "loss": 0.5666, "step": 11295 }, { "epoch": 4.323000382701875, "grad_norm": 0.5554666519165039, "learning_rate": 9.472802507320833e-07, "loss": 0.6472, "step": 11296 }, { "epoch": 4.323383084577115, "grad_norm": 0.5786156058311462, "learning_rate": 9.462273839026625e-07, "loss": 0.6039, "step": 11297 }, { "epoch": 4.323765786452354, "grad_norm": 0.5522854328155518, "learning_rate": 9.451750734518928e-07, "loss": 0.6009, "step": 11298 }, { "epoch": 4.324148488327593, "grad_norm": 0.5315338373184204, "learning_rate": 9.44123319444441e-07, "loss": 0.6853, "step": 11299 }, { "epoch": 4.324531190202832, "grad_norm": 0.5654141902923584, "learning_rate": 9.430721219449391e-07, "loss": 0.6265, "step": 11300 }, { "epoch": 4.324913892078071, "grad_norm": 0.5423840284347534, "learning_rate": 9.420214810179861e-07, "loss": 0.5717, "step": 11301 }, { "epoch": 4.32529659395331, "grad_norm": 0.533583402633667, "learning_rate": 9.409713967281497e-07, "loss": 0.7008, "step": 11302 }, { "epoch": 4.325679295828549, "grad_norm": 0.580827534198761, "learning_rate": 9.399218691399559e-07, "loss": 0.6384, "step": 11303 }, { "epoch": 4.326061997703789, "grad_norm": 0.5888300538063049, "learning_rate": 9.388728983179018e-07, "loss": 0.6204, "step": 11304 }, { "epoch": 4.326444699579028, "grad_norm": 0.5811976790428162, "learning_rate": 9.378244843264528e-07, "loss": 0.6151, "step": 11305 }, { "epoch": 4.326827401454267, "grad_norm": 0.5427747964859009, "learning_rate": 9.367766272300328e-07, "loss": 0.6408, "step": 11306 }, { "epoch": 4.327210103329507, "grad_norm": 0.5213712453842163, "learning_rate": 9.357293270930379e-07, "loss": 0.5155, "step": 11307 }, { "epoch": 4.327592805204746, "grad_norm": 0.5707231760025024, "learning_rate": 9.34682583979828e-07, "loss": 0.6152, "step": 11308 }, { "epoch": 4.327975507079985, "grad_norm": 0.5450393557548523, "learning_rate": 9.33636397954728e-07, "loss": 0.6275, "step": 11309 }, { "epoch": 4.3283582089552235, "grad_norm": 0.5538814067840576, "learning_rate": 9.325907690820258e-07, "loss": 0.6484, "step": 11310 }, { "epoch": 4.328740910830463, "grad_norm": 0.48799028992652893, "learning_rate": 9.315456974259818e-07, "loss": 0.5667, "step": 11311 }, { "epoch": 4.329123612705702, "grad_norm": 0.5389369130134583, "learning_rate": 9.305011830508171e-07, "loss": 0.7213, "step": 11312 }, { "epoch": 4.329506314580941, "grad_norm": 0.5382900238037109, "learning_rate": 9.294572260207191e-07, "loss": 0.6386, "step": 11313 }, { "epoch": 4.329889016456181, "grad_norm": 0.5489630103111267, "learning_rate": 9.284138263998422e-07, "loss": 0.6104, "step": 11314 }, { "epoch": 4.33027171833142, "grad_norm": 0.5536631941795349, "learning_rate": 9.27370984252306e-07, "loss": 0.6526, "step": 11315 }, { "epoch": 4.330654420206659, "grad_norm": 0.5643709897994995, "learning_rate": 9.263286996421971e-07, "loss": 0.6273, "step": 11316 }, { "epoch": 4.331037122081899, "grad_norm": 0.5353527665138245, "learning_rate": 9.252869726335656e-07, "loss": 0.6126, "step": 11317 }, { "epoch": 4.3314198239571375, "grad_norm": 0.543228268623352, "learning_rate": 9.242458032904311e-07, "loss": 0.5741, "step": 11318 }, { "epoch": 4.3318025258323765, "grad_norm": 0.5956726670265198, "learning_rate": 9.232051916767715e-07, "loss": 0.5947, "step": 11319 }, { "epoch": 4.332185227707615, "grad_norm": 0.534945011138916, "learning_rate": 9.221651378565377e-07, "loss": 0.599, "step": 11320 }, { "epoch": 4.332567929582855, "grad_norm": 0.5337009429931641, "learning_rate": 9.211256418936432e-07, "loss": 0.6545, "step": 11321 }, { "epoch": 4.332950631458094, "grad_norm": 0.5250556468963623, "learning_rate": 9.200867038519701e-07, "loss": 0.6272, "step": 11322 }, { "epoch": 4.333333333333333, "grad_norm": 0.5594605803489685, "learning_rate": 9.190483237953618e-07, "loss": 0.5887, "step": 11323 }, { "epoch": 4.333716035208573, "grad_norm": 0.5635792016983032, "learning_rate": 9.180105017876284e-07, "loss": 0.6376, "step": 11324 }, { "epoch": 4.334098737083812, "grad_norm": 0.5023397207260132, "learning_rate": 9.169732378925489e-07, "loss": 0.5536, "step": 11325 }, { "epoch": 4.334481438959051, "grad_norm": 0.5654516816139221, "learning_rate": 9.159365321738655e-07, "loss": 0.6844, "step": 11326 }, { "epoch": 4.3348641408342905, "grad_norm": 0.5162226557731628, "learning_rate": 9.149003846952853e-07, "loss": 0.5689, "step": 11327 }, { "epoch": 4.335246842709529, "grad_norm": 0.5804110765457153, "learning_rate": 9.138647955204872e-07, "loss": 0.6562, "step": 11328 }, { "epoch": 4.335629544584768, "grad_norm": 0.5463897585868835, "learning_rate": 9.128297647131046e-07, "loss": 0.7048, "step": 11329 }, { "epoch": 4.336012246460007, "grad_norm": 0.5753100514411926, "learning_rate": 9.117952923367446e-07, "loss": 0.6914, "step": 11330 }, { "epoch": 4.336394948335247, "grad_norm": 0.5935587286949158, "learning_rate": 9.107613784549818e-07, "loss": 0.6205, "step": 11331 }, { "epoch": 4.336777650210486, "grad_norm": 0.6074502468109131, "learning_rate": 9.097280231313488e-07, "loss": 0.7562, "step": 11332 }, { "epoch": 4.337160352085725, "grad_norm": 0.5945861339569092, "learning_rate": 9.086952264293502e-07, "loss": 0.6627, "step": 11333 }, { "epoch": 4.337543053960965, "grad_norm": 0.5104286074638367, "learning_rate": 9.076629884124521e-07, "loss": 0.5875, "step": 11334 }, { "epoch": 4.337925755836204, "grad_norm": 0.5814955830574036, "learning_rate": 9.066313091440915e-07, "loss": 0.6118, "step": 11335 }, { "epoch": 4.338308457711443, "grad_norm": 0.5294062495231628, "learning_rate": 9.056001886876653e-07, "loss": 0.5953, "step": 11336 }, { "epoch": 4.338691159586682, "grad_norm": 0.5508163571357727, "learning_rate": 9.045696271065396e-07, "loss": 0.6128, "step": 11337 }, { "epoch": 4.339073861461921, "grad_norm": 0.5441054701805115, "learning_rate": 9.035396244640471e-07, "loss": 0.623, "step": 11338 }, { "epoch": 4.33945656333716, "grad_norm": 0.5732150077819824, "learning_rate": 9.025101808234804e-07, "loss": 0.6133, "step": 11339 }, { "epoch": 4.339839265212399, "grad_norm": 0.6253705620765686, "learning_rate": 9.014812962481035e-07, "loss": 0.6188, "step": 11340 }, { "epoch": 4.340221967087639, "grad_norm": 0.53536456823349, "learning_rate": 9.004529708011455e-07, "loss": 0.6624, "step": 11341 }, { "epoch": 4.340604668962878, "grad_norm": 0.5637244582176208, "learning_rate": 8.994252045457963e-07, "loss": 0.6726, "step": 11342 }, { "epoch": 4.340987370838117, "grad_norm": 0.5263952016830444, "learning_rate": 8.983979975452173e-07, "loss": 0.6223, "step": 11343 }, { "epoch": 4.341370072713357, "grad_norm": 0.579846978187561, "learning_rate": 8.973713498625325e-07, "loss": 0.6465, "step": 11344 }, { "epoch": 4.3417527745885955, "grad_norm": 0.5064054727554321, "learning_rate": 8.963452615608325e-07, "loss": 0.6427, "step": 11345 }, { "epoch": 4.3421354764638345, "grad_norm": 0.6170699596405029, "learning_rate": 8.953197327031726e-07, "loss": 0.6849, "step": 11346 }, { "epoch": 4.342518178339074, "grad_norm": 0.5249903202056885, "learning_rate": 8.942947633525756e-07, "loss": 0.6439, "step": 11347 }, { "epoch": 4.342900880214313, "grad_norm": 0.5643290281295776, "learning_rate": 8.932703535720289e-07, "loss": 0.5764, "step": 11348 }, { "epoch": 4.343283582089552, "grad_norm": 0.524678111076355, "learning_rate": 8.922465034244832e-07, "loss": 0.6376, "step": 11349 }, { "epoch": 4.343666283964791, "grad_norm": 0.5591107606887817, "learning_rate": 8.912232129728571e-07, "loss": 0.6445, "step": 11350 }, { "epoch": 4.344048985840031, "grad_norm": 0.5000340938568115, "learning_rate": 8.902004822800369e-07, "loss": 0.5974, "step": 11351 }, { "epoch": 4.34443168771527, "grad_norm": 0.5023794770240784, "learning_rate": 8.891783114088681e-07, "loss": 0.6077, "step": 11352 }, { "epoch": 4.344814389590509, "grad_norm": 0.5332596302032471, "learning_rate": 8.881567004221691e-07, "loss": 0.6179, "step": 11353 }, { "epoch": 4.3451970914657485, "grad_norm": 0.5198609232902527, "learning_rate": 8.871356493827199e-07, "loss": 0.6096, "step": 11354 }, { "epoch": 4.345579793340987, "grad_norm": 0.5593064427375793, "learning_rate": 8.861151583532657e-07, "loss": 0.5998, "step": 11355 }, { "epoch": 4.345962495216226, "grad_norm": 0.5138819217681885, "learning_rate": 8.850952273965197e-07, "loss": 0.599, "step": 11356 }, { "epoch": 4.346345197091466, "grad_norm": 0.5076264142990112, "learning_rate": 8.840758565751618e-07, "loss": 0.6549, "step": 11357 }, { "epoch": 4.346727898966705, "grad_norm": 0.5806369185447693, "learning_rate": 8.830570459518296e-07, "loss": 0.5386, "step": 11358 }, { "epoch": 4.347110600841944, "grad_norm": 0.5661466121673584, "learning_rate": 8.820387955891341e-07, "loss": 0.5927, "step": 11359 }, { "epoch": 4.347493302717183, "grad_norm": 0.7012872695922852, "learning_rate": 8.810211055496509e-07, "loss": 0.6641, "step": 11360 }, { "epoch": 4.347876004592423, "grad_norm": 0.4898611009120941, "learning_rate": 8.8000397589592e-07, "loss": 0.6538, "step": 11361 }, { "epoch": 4.348258706467662, "grad_norm": 0.5689818859100342, "learning_rate": 8.789874066904446e-07, "loss": 0.6409, "step": 11362 }, { "epoch": 4.348641408342901, "grad_norm": 0.5240331888198853, "learning_rate": 8.779713979956961e-07, "loss": 0.5832, "step": 11363 }, { "epoch": 4.34902411021814, "grad_norm": 0.5224165320396423, "learning_rate": 8.769559498741109e-07, "loss": 0.5389, "step": 11364 }, { "epoch": 4.349406812093379, "grad_norm": 0.5183570981025696, "learning_rate": 8.759410623880926e-07, "loss": 0.6108, "step": 11365 }, { "epoch": 4.349789513968618, "grad_norm": 0.5454550981521606, "learning_rate": 8.749267356000069e-07, "loss": 0.6614, "step": 11366 }, { "epoch": 4.350172215843858, "grad_norm": 0.5497246384620667, "learning_rate": 8.739129695721893e-07, "loss": 0.5899, "step": 11367 }, { "epoch": 4.350554917719097, "grad_norm": 0.5802149176597595, "learning_rate": 8.728997643669357e-07, "loss": 0.5913, "step": 11368 }, { "epoch": 4.350937619594336, "grad_norm": 0.5616410374641418, "learning_rate": 8.718871200465106e-07, "loss": 0.6111, "step": 11369 }, { "epoch": 4.351320321469575, "grad_norm": 0.558582603931427, "learning_rate": 8.708750366731477e-07, "loss": 0.6941, "step": 11370 }, { "epoch": 4.351703023344815, "grad_norm": 0.5526182651519775, "learning_rate": 8.698635143090362e-07, "loss": 0.6071, "step": 11371 }, { "epoch": 4.3520857252200535, "grad_norm": 0.5008744597434998, "learning_rate": 8.688525530163405e-07, "loss": 0.622, "step": 11372 }, { "epoch": 4.3524684270952925, "grad_norm": 0.6006699204444885, "learning_rate": 8.678421528571857e-07, "loss": 0.6745, "step": 11373 }, { "epoch": 4.352851128970532, "grad_norm": 0.57806396484375, "learning_rate": 8.66832313893664e-07, "loss": 0.7177, "step": 11374 }, { "epoch": 4.353233830845771, "grad_norm": 0.5581571459770203, "learning_rate": 8.658230361878328e-07, "loss": 0.6548, "step": 11375 }, { "epoch": 4.35361653272101, "grad_norm": 0.6419740319252014, "learning_rate": 8.648143198017145e-07, "loss": 0.7056, "step": 11376 }, { "epoch": 4.35399923459625, "grad_norm": 0.5279223322868347, "learning_rate": 8.638061647973006e-07, "loss": 0.6735, "step": 11377 }, { "epoch": 4.354381936471489, "grad_norm": 0.5160274505615234, "learning_rate": 8.627985712365394e-07, "loss": 0.6469, "step": 11378 }, { "epoch": 4.354764638346728, "grad_norm": 0.5295261740684509, "learning_rate": 8.617915391813536e-07, "loss": 0.5175, "step": 11379 }, { "epoch": 4.355147340221967, "grad_norm": 0.5779756307601929, "learning_rate": 8.607850686936292e-07, "loss": 0.6155, "step": 11380 }, { "epoch": 4.3555300420972065, "grad_norm": 0.4986264705657959, "learning_rate": 8.597791598352135e-07, "loss": 0.5898, "step": 11381 }, { "epoch": 4.355912743972445, "grad_norm": 0.5515378713607788, "learning_rate": 8.587738126679223e-07, "loss": 0.5604, "step": 11382 }, { "epoch": 4.356295445847684, "grad_norm": 0.5529900193214417, "learning_rate": 8.577690272535388e-07, "loss": 0.6538, "step": 11383 }, { "epoch": 4.356678147722924, "grad_norm": 0.5625290870666504, "learning_rate": 8.56764803653809e-07, "loss": 0.6152, "step": 11384 }, { "epoch": 4.357060849598163, "grad_norm": 0.5303884148597717, "learning_rate": 8.557611419304446e-07, "loss": 0.6131, "step": 11385 }, { "epoch": 4.357443551473402, "grad_norm": 0.553939938545227, "learning_rate": 8.547580421451251e-07, "loss": 0.6188, "step": 11386 }, { "epoch": 4.357826253348642, "grad_norm": 0.5416896343231201, "learning_rate": 8.537555043594936e-07, "loss": 0.59, "step": 11387 }, { "epoch": 4.358208955223881, "grad_norm": 0.5938543081283569, "learning_rate": 8.527535286351562e-07, "loss": 0.658, "step": 11388 }, { "epoch": 4.35859165709912, "grad_norm": 0.543136715888977, "learning_rate": 8.517521150336883e-07, "loss": 0.6882, "step": 11389 }, { "epoch": 4.358974358974359, "grad_norm": 0.5400126576423645, "learning_rate": 8.507512636166315e-07, "loss": 0.6061, "step": 11390 }, { "epoch": 4.359357060849598, "grad_norm": 0.5301060676574707, "learning_rate": 8.497509744454868e-07, "loss": 0.6388, "step": 11391 }, { "epoch": 4.359739762724837, "grad_norm": 0.5096287727355957, "learning_rate": 8.487512475817261e-07, "loss": 0.558, "step": 11392 }, { "epoch": 4.360122464600076, "grad_norm": 0.5524293780326843, "learning_rate": 8.477520830867891e-07, "loss": 0.6069, "step": 11393 }, { "epoch": 4.360505166475316, "grad_norm": 0.51176917552948, "learning_rate": 8.467534810220701e-07, "loss": 0.5946, "step": 11394 }, { "epoch": 4.360887868350555, "grad_norm": 0.6100890040397644, "learning_rate": 8.457554414489411e-07, "loss": 0.6034, "step": 11395 }, { "epoch": 4.361270570225794, "grad_norm": 0.6370075941085815, "learning_rate": 8.447579644287352e-07, "loss": 0.6376, "step": 11396 }, { "epoch": 4.361653272101034, "grad_norm": 0.5399150252342224, "learning_rate": 8.437610500227455e-07, "loss": 0.6414, "step": 11397 }, { "epoch": 4.362035973976273, "grad_norm": 0.5016889572143555, "learning_rate": 8.427646982922388e-07, "loss": 0.5704, "step": 11398 }, { "epoch": 4.3624186758515116, "grad_norm": 0.5743057727813721, "learning_rate": 8.417689092984404e-07, "loss": 0.5757, "step": 11399 }, { "epoch": 4.3628013777267505, "grad_norm": 0.5574837923049927, "learning_rate": 8.407736831025492e-07, "loss": 0.6313, "step": 11400 }, { "epoch": 4.36318407960199, "grad_norm": 0.5104244351387024, "learning_rate": 8.397790197657185e-07, "loss": 0.6019, "step": 11401 }, { "epoch": 4.363566781477229, "grad_norm": 0.5084320306777954, "learning_rate": 8.387849193490772e-07, "loss": 0.6692, "step": 11402 }, { "epoch": 4.363949483352468, "grad_norm": 0.5361828207969666, "learning_rate": 8.377913819137151e-07, "loss": 0.6029, "step": 11403 }, { "epoch": 4.364332185227708, "grad_norm": 0.5407100319862366, "learning_rate": 8.367984075206847e-07, "loss": 0.6209, "step": 11404 }, { "epoch": 4.364714887102947, "grad_norm": 0.5183334946632385, "learning_rate": 8.358059962310072e-07, "loss": 0.6174, "step": 11405 }, { "epoch": 4.365097588978186, "grad_norm": 0.5174257755279541, "learning_rate": 8.348141481056749e-07, "loss": 0.5677, "step": 11406 }, { "epoch": 4.365480290853426, "grad_norm": 0.5467763543128967, "learning_rate": 8.338228632056333e-07, "loss": 0.5988, "step": 11407 }, { "epoch": 4.3658629927286645, "grad_norm": 0.5139197707176208, "learning_rate": 8.328321415918017e-07, "loss": 0.6263, "step": 11408 }, { "epoch": 4.366245694603903, "grad_norm": 0.5306565165519714, "learning_rate": 8.318419833250646e-07, "loss": 0.6243, "step": 11409 }, { "epoch": 4.366628396479142, "grad_norm": 0.5293470621109009, "learning_rate": 8.308523884662656e-07, "loss": 0.5897, "step": 11410 }, { "epoch": 4.367011098354382, "grad_norm": 0.5295272469520569, "learning_rate": 8.298633570762204e-07, "loss": 0.6624, "step": 11411 }, { "epoch": 4.367393800229621, "grad_norm": 0.5829920768737793, "learning_rate": 8.288748892157061e-07, "loss": 0.6291, "step": 11412 }, { "epoch": 4.36777650210486, "grad_norm": 0.5337051153182983, "learning_rate": 8.278869849454718e-07, "loss": 0.6649, "step": 11413 }, { "epoch": 4.3681592039801, "grad_norm": 0.5318670868873596, "learning_rate": 8.268996443262201e-07, "loss": 0.5801, "step": 11414 }, { "epoch": 4.368541905855339, "grad_norm": 0.5196452140808105, "learning_rate": 8.259128674186268e-07, "loss": 0.512, "step": 11415 }, { "epoch": 4.368924607730578, "grad_norm": 0.6126628518104553, "learning_rate": 8.249266542833379e-07, "loss": 0.5973, "step": 11416 }, { "epoch": 4.3693073096058175, "grad_norm": 0.6430193185806274, "learning_rate": 8.239410049809526e-07, "loss": 0.6028, "step": 11417 }, { "epoch": 4.369690011481056, "grad_norm": 0.5620688796043396, "learning_rate": 8.229559195720449e-07, "loss": 0.6303, "step": 11418 }, { "epoch": 4.370072713356295, "grad_norm": 0.5093744397163391, "learning_rate": 8.219713981171506e-07, "loss": 0.5205, "step": 11419 }, { "epoch": 4.370455415231534, "grad_norm": 0.5407353043556213, "learning_rate": 8.209874406767693e-07, "loss": 0.6728, "step": 11420 }, { "epoch": 4.370838117106774, "grad_norm": 0.5263716578483582, "learning_rate": 8.20004047311369e-07, "loss": 0.6566, "step": 11421 }, { "epoch": 4.371220818982013, "grad_norm": 0.5120761394500732, "learning_rate": 8.190212180813839e-07, "loss": 0.6299, "step": 11422 }, { "epoch": 4.371603520857252, "grad_norm": 0.5626143217086792, "learning_rate": 8.180389530472077e-07, "loss": 0.5977, "step": 11423 }, { "epoch": 4.371986222732492, "grad_norm": 0.5511415004730225, "learning_rate": 8.170572522692055e-07, "loss": 0.6525, "step": 11424 }, { "epoch": 4.372368924607731, "grad_norm": 0.5636019110679626, "learning_rate": 8.160761158077047e-07, "loss": 0.6305, "step": 11425 }, { "epoch": 4.3727516264829696, "grad_norm": 0.5230242013931274, "learning_rate": 8.15095543722999e-07, "loss": 0.5484, "step": 11426 }, { "epoch": 4.373134328358209, "grad_norm": 0.531427264213562, "learning_rate": 8.141155360753472e-07, "loss": 0.5839, "step": 11427 }, { "epoch": 4.373517030233448, "grad_norm": 0.5410900712013245, "learning_rate": 8.131360929249732e-07, "loss": 0.6131, "step": 11428 }, { "epoch": 4.373899732108687, "grad_norm": 0.5200222730636597, "learning_rate": 8.121572143320688e-07, "loss": 0.6413, "step": 11429 }, { "epoch": 4.374282433983926, "grad_norm": 0.5442525148391724, "learning_rate": 8.11178900356785e-07, "loss": 0.5757, "step": 11430 }, { "epoch": 4.374665135859166, "grad_norm": 0.5410154461860657, "learning_rate": 8.102011510592433e-07, "loss": 0.6827, "step": 11431 }, { "epoch": 4.375047837734405, "grad_norm": 0.5795398950576782, "learning_rate": 8.092239664995316e-07, "loss": 0.5969, "step": 11432 }, { "epoch": 4.375430539609644, "grad_norm": 0.5498635172843933, "learning_rate": 8.08247346737695e-07, "loss": 0.5906, "step": 11433 }, { "epoch": 4.375813241484884, "grad_norm": 0.56410151720047, "learning_rate": 8.072712918337533e-07, "loss": 0.5728, "step": 11434 }, { "epoch": 4.3761959433601225, "grad_norm": 0.49575304985046387, "learning_rate": 8.062958018476874e-07, "loss": 0.6048, "step": 11435 }, { "epoch": 4.376578645235361, "grad_norm": 0.5296369791030884, "learning_rate": 8.053208768394428e-07, "loss": 0.6343, "step": 11436 }, { "epoch": 4.376961347110601, "grad_norm": 0.5267423987388611, "learning_rate": 8.043465168689313e-07, "loss": 0.5812, "step": 11437 }, { "epoch": 4.37734404898584, "grad_norm": 0.6032684445381165, "learning_rate": 8.033727219960308e-07, "loss": 0.582, "step": 11438 }, { "epoch": 4.377726750861079, "grad_norm": 0.5437876582145691, "learning_rate": 8.023994922805844e-07, "loss": 0.7295, "step": 11439 }, { "epoch": 4.378109452736318, "grad_norm": 0.5366148352622986, "learning_rate": 8.014268277823977e-07, "loss": 0.5622, "step": 11440 }, { "epoch": 4.378492154611558, "grad_norm": 0.5319567918777466, "learning_rate": 8.00454728561244e-07, "loss": 0.6387, "step": 11441 }, { "epoch": 4.378874856486797, "grad_norm": 0.517776370048523, "learning_rate": 7.994831946768622e-07, "loss": 0.5598, "step": 11442 }, { "epoch": 4.379257558362036, "grad_norm": 0.531887412071228, "learning_rate": 7.985122261889544e-07, "loss": 0.5766, "step": 11443 }, { "epoch": 4.3796402602372755, "grad_norm": 0.5467109084129333, "learning_rate": 7.975418231571908e-07, "loss": 0.6289, "step": 11444 }, { "epoch": 4.380022962112514, "grad_norm": 0.547592282295227, "learning_rate": 7.965719856412036e-07, "loss": 0.612, "step": 11445 }, { "epoch": 4.380405663987753, "grad_norm": 0.5740048885345459, "learning_rate": 7.956027137005929e-07, "loss": 0.6491, "step": 11446 }, { "epoch": 4.380788365862993, "grad_norm": 0.5115635991096497, "learning_rate": 7.946340073949233e-07, "loss": 0.5984, "step": 11447 }, { "epoch": 4.381171067738232, "grad_norm": 0.5765868425369263, "learning_rate": 7.936658667837249e-07, "loss": 0.6115, "step": 11448 }, { "epoch": 4.381553769613471, "grad_norm": 0.5936787128448486, "learning_rate": 7.926982919264902e-07, "loss": 0.6354, "step": 11449 }, { "epoch": 4.38193647148871, "grad_norm": 0.5566381812095642, "learning_rate": 7.917312828826818e-07, "loss": 0.6504, "step": 11450 }, { "epoch": 4.38231917336395, "grad_norm": 0.5723176002502441, "learning_rate": 7.907648397117229e-07, "loss": 0.6268, "step": 11451 }, { "epoch": 4.382701875239189, "grad_norm": 0.575110912322998, "learning_rate": 7.897989624730073e-07, "loss": 0.746, "step": 11452 }, { "epoch": 4.383084577114428, "grad_norm": 0.5070961117744446, "learning_rate": 7.888336512258865e-07, "loss": 0.5035, "step": 11453 }, { "epoch": 4.383467278989667, "grad_norm": 0.6111892461776733, "learning_rate": 7.87868906029684e-07, "loss": 0.6022, "step": 11454 }, { "epoch": 4.383849980864906, "grad_norm": 0.5279174447059631, "learning_rate": 7.869047269436858e-07, "loss": 0.6691, "step": 11455 }, { "epoch": 4.384232682740145, "grad_norm": 0.5788205862045288, "learning_rate": 7.859411140271422e-07, "loss": 0.6542, "step": 11456 }, { "epoch": 4.384615384615385, "grad_norm": 0.5518066883087158, "learning_rate": 7.849780673392715e-07, "loss": 0.6274, "step": 11457 }, { "epoch": 4.384998086490624, "grad_norm": 0.6487661004066467, "learning_rate": 7.840155869392552e-07, "loss": 0.7174, "step": 11458 }, { "epoch": 4.385380788365863, "grad_norm": 0.5503106713294983, "learning_rate": 7.830536728862392e-07, "loss": 0.5753, "step": 11459 }, { "epoch": 4.385763490241102, "grad_norm": 0.48939162492752075, "learning_rate": 7.820923252393353e-07, "loss": 0.5823, "step": 11460 }, { "epoch": 4.386146192116342, "grad_norm": 0.544389545917511, "learning_rate": 7.81131544057624e-07, "loss": 0.5835, "step": 11461 }, { "epoch": 4.3865288939915805, "grad_norm": 0.5644438862800598, "learning_rate": 7.801713294001434e-07, "loss": 0.6365, "step": 11462 }, { "epoch": 4.3869115958668194, "grad_norm": 0.5688590407371521, "learning_rate": 7.792116813259043e-07, "loss": 0.5902, "step": 11463 }, { "epoch": 4.387294297742059, "grad_norm": 0.5454980134963989, "learning_rate": 7.782525998938795e-07, "loss": 0.6086, "step": 11464 }, { "epoch": 4.387676999617298, "grad_norm": 0.5418369770050049, "learning_rate": 7.772940851630051e-07, "loss": 0.5242, "step": 11465 }, { "epoch": 4.388059701492537, "grad_norm": 0.5955923795700073, "learning_rate": 7.763361371921873e-07, "loss": 0.6498, "step": 11466 }, { "epoch": 4.388442403367777, "grad_norm": 0.5050221085548401, "learning_rate": 7.753787560402915e-07, "loss": 0.5993, "step": 11467 }, { "epoch": 4.388825105243016, "grad_norm": 0.5423046350479126, "learning_rate": 7.744219417661558e-07, "loss": 0.6052, "step": 11468 }, { "epoch": 4.389207807118255, "grad_norm": 0.5163379907608032, "learning_rate": 7.734656944285746e-07, "loss": 0.6357, "step": 11469 }, { "epoch": 4.389590508993494, "grad_norm": 0.5795417428016663, "learning_rate": 7.725100140863129e-07, "loss": 0.6724, "step": 11470 }, { "epoch": 4.3899732108687335, "grad_norm": 0.5315183997154236, "learning_rate": 7.715549007981026e-07, "loss": 0.6196, "step": 11471 }, { "epoch": 4.390355912743972, "grad_norm": 0.5091745257377625, "learning_rate": 7.706003546226337e-07, "loss": 0.5594, "step": 11472 }, { "epoch": 4.390738614619211, "grad_norm": 0.5417531132698059, "learning_rate": 7.696463756185679e-07, "loss": 0.6231, "step": 11473 }, { "epoch": 4.391121316494451, "grad_norm": 0.537571370601654, "learning_rate": 7.686929638445295e-07, "loss": 0.6203, "step": 11474 }, { "epoch": 4.39150401836969, "grad_norm": 0.5415813326835632, "learning_rate": 7.677401193591083e-07, "loss": 0.6018, "step": 11475 }, { "epoch": 4.391886720244929, "grad_norm": 0.5694551467895508, "learning_rate": 7.667878422208597e-07, "loss": 0.6857, "step": 11476 }, { "epoch": 4.392269422120169, "grad_norm": 0.5445242524147034, "learning_rate": 7.658361324883023e-07, "loss": 0.6346, "step": 11477 }, { "epoch": 4.392652123995408, "grad_norm": 0.5713156461715698, "learning_rate": 7.64884990219924e-07, "loss": 0.548, "step": 11478 }, { "epoch": 4.393034825870647, "grad_norm": 0.5275543928146362, "learning_rate": 7.639344154741713e-07, "loss": 0.6872, "step": 11479 }, { "epoch": 4.393417527745886, "grad_norm": 0.5946520566940308, "learning_rate": 7.629844083094617e-07, "loss": 0.6818, "step": 11480 }, { "epoch": 4.393800229621125, "grad_norm": 0.5293524861335754, "learning_rate": 7.620349687841766e-07, "loss": 0.5985, "step": 11481 }, { "epoch": 4.394182931496364, "grad_norm": 0.569839358329773, "learning_rate": 7.610860969566591e-07, "loss": 0.6543, "step": 11482 }, { "epoch": 4.394565633371603, "grad_norm": 0.5588036775588989, "learning_rate": 7.601377928852205e-07, "loss": 0.601, "step": 11483 }, { "epoch": 4.394948335246843, "grad_norm": 0.5206298232078552, "learning_rate": 7.591900566281374e-07, "loss": 0.5914, "step": 11484 }, { "epoch": 4.395331037122082, "grad_norm": 0.5579543709754944, "learning_rate": 7.58242888243651e-07, "loss": 0.5965, "step": 11485 }, { "epoch": 4.395713738997321, "grad_norm": 0.5537558197975159, "learning_rate": 7.572962877899681e-07, "loss": 0.5561, "step": 11486 }, { "epoch": 4.396096440872561, "grad_norm": 0.554443359375, "learning_rate": 7.563502553252589e-07, "loss": 0.6058, "step": 11487 }, { "epoch": 4.3964791427478, "grad_norm": 0.5319913029670715, "learning_rate": 7.554047909076579e-07, "loss": 0.6103, "step": 11488 }, { "epoch": 4.3968618446230385, "grad_norm": 0.5283881425857544, "learning_rate": 7.544598945952685e-07, "loss": 0.5506, "step": 11489 }, { "epoch": 4.3972445464982775, "grad_norm": 0.5276660919189453, "learning_rate": 7.53515566446158e-07, "loss": 0.6221, "step": 11490 }, { "epoch": 4.397627248373517, "grad_norm": 0.535922646522522, "learning_rate": 7.525718065183552e-07, "loss": 0.5454, "step": 11491 }, { "epoch": 4.398009950248756, "grad_norm": 0.5205044150352478, "learning_rate": 7.516286148698581e-07, "loss": 0.616, "step": 11492 }, { "epoch": 4.398392652123995, "grad_norm": 0.5795876383781433, "learning_rate": 7.506859915586285e-07, "loss": 0.6348, "step": 11493 }, { "epoch": 4.398775353999235, "grad_norm": 0.5236628651618958, "learning_rate": 7.497439366425918e-07, "loss": 0.5426, "step": 11494 }, { "epoch": 4.399158055874474, "grad_norm": 0.5884067416191101, "learning_rate": 7.488024501796421e-07, "loss": 0.6159, "step": 11495 }, { "epoch": 4.399540757749713, "grad_norm": 0.4955446422100067, "learning_rate": 7.478615322276339e-07, "loss": 0.4757, "step": 11496 }, { "epoch": 4.399923459624953, "grad_norm": 0.504987359046936, "learning_rate": 7.469211828443935e-07, "loss": 0.6417, "step": 11497 }, { "epoch": 4.4003061615001915, "grad_norm": 0.5253503322601318, "learning_rate": 7.459814020877021e-07, "loss": 0.5616, "step": 11498 }, { "epoch": 4.40068886337543, "grad_norm": 0.5603559613227844, "learning_rate": 7.450421900153137e-07, "loss": 0.5966, "step": 11499 }, { "epoch": 4.401071565250669, "grad_norm": 0.5224111080169678, "learning_rate": 7.441035466849489e-07, "loss": 0.6674, "step": 11500 }, { "epoch": 4.401454267125909, "grad_norm": 0.5197374224662781, "learning_rate": 7.431654721542847e-07, "loss": 0.5803, "step": 11501 }, { "epoch": 4.401836969001148, "grad_norm": 0.5306342840194702, "learning_rate": 7.422279664809706e-07, "loss": 0.5641, "step": 11502 }, { "epoch": 4.402219670876387, "grad_norm": 0.5309545993804932, "learning_rate": 7.412910297226217e-07, "loss": 0.5781, "step": 11503 }, { "epoch": 4.402602372751627, "grad_norm": 0.49726220965385437, "learning_rate": 7.403546619368097e-07, "loss": 0.6263, "step": 11504 }, { "epoch": 4.402985074626866, "grad_norm": 0.5142990350723267, "learning_rate": 7.394188631810784e-07, "loss": 0.6018, "step": 11505 }, { "epoch": 4.403367776502105, "grad_norm": 0.5313230752944946, "learning_rate": 7.384836335129375e-07, "loss": 0.6117, "step": 11506 }, { "epoch": 4.4037504783773445, "grad_norm": 0.5224822163581848, "learning_rate": 7.3754897298986e-07, "loss": 0.6146, "step": 11507 }, { "epoch": 4.404133180252583, "grad_norm": 0.5374131798744202, "learning_rate": 7.366148816692808e-07, "loss": 0.5906, "step": 11508 }, { "epoch": 4.404515882127822, "grad_norm": 0.5415740609169006, "learning_rate": 7.356813596086021e-07, "loss": 0.6005, "step": 11509 }, { "epoch": 4.404898584003061, "grad_norm": 0.5702290534973145, "learning_rate": 7.347484068651945e-07, "loss": 0.655, "step": 11510 }, { "epoch": 4.405281285878301, "grad_norm": 0.5164325833320618, "learning_rate": 7.338160234963865e-07, "loss": 0.59, "step": 11511 }, { "epoch": 4.40566398775354, "grad_norm": 0.5453478097915649, "learning_rate": 7.32884209559478e-07, "loss": 0.5782, "step": 11512 }, { "epoch": 4.406046689628779, "grad_norm": 0.5548063516616821, "learning_rate": 7.319529651117319e-07, "loss": 0.5331, "step": 11513 }, { "epoch": 4.406429391504019, "grad_norm": 0.629844069480896, "learning_rate": 7.310222902103725e-07, "loss": 0.6363, "step": 11514 }, { "epoch": 4.406812093379258, "grad_norm": 0.5365163683891296, "learning_rate": 7.30092184912593e-07, "loss": 0.564, "step": 11515 }, { "epoch": 4.4071947952544965, "grad_norm": 0.5170725584030151, "learning_rate": 7.291626492755566e-07, "loss": 0.5484, "step": 11516 }, { "epoch": 4.407577497129736, "grad_norm": 0.5258112549781799, "learning_rate": 7.282336833563786e-07, "loss": 0.5822, "step": 11517 }, { "epoch": 4.407960199004975, "grad_norm": 0.587650716304779, "learning_rate": 7.2730528721215e-07, "loss": 0.7191, "step": 11518 }, { "epoch": 4.408342900880214, "grad_norm": 0.5815141797065735, "learning_rate": 7.263774608999219e-07, "loss": 0.6858, "step": 11519 }, { "epoch": 4.408725602755453, "grad_norm": 0.5146313309669495, "learning_rate": 7.254502044767142e-07, "loss": 0.6247, "step": 11520 }, { "epoch": 4.409108304630693, "grad_norm": 0.5144848227500916, "learning_rate": 7.245235179995058e-07, "loss": 0.6252, "step": 11521 }, { "epoch": 4.409491006505932, "grad_norm": 0.5571010112762451, "learning_rate": 7.235974015252456e-07, "loss": 0.5948, "step": 11522 }, { "epoch": 4.409873708381171, "grad_norm": 0.561020016670227, "learning_rate": 7.226718551108481e-07, "loss": 0.684, "step": 11523 }, { "epoch": 4.410256410256411, "grad_norm": 0.5320772528648376, "learning_rate": 7.217468788131854e-07, "loss": 0.6153, "step": 11524 }, { "epoch": 4.4106391121316495, "grad_norm": 0.5143921375274658, "learning_rate": 7.208224726891044e-07, "loss": 0.5856, "step": 11525 }, { "epoch": 4.411021814006888, "grad_norm": 0.5282148718833923, "learning_rate": 7.198986367954109e-07, "loss": 0.5914, "step": 11526 }, { "epoch": 4.411404515882128, "grad_norm": 0.5296438932418823, "learning_rate": 7.18975371188877e-07, "loss": 0.615, "step": 11527 }, { "epoch": 4.411787217757367, "grad_norm": 0.5627567172050476, "learning_rate": 7.180526759262396e-07, "loss": 0.6295, "step": 11528 }, { "epoch": 4.412169919632606, "grad_norm": 0.5247200727462769, "learning_rate": 7.171305510642024e-07, "loss": 0.5644, "step": 11529 }, { "epoch": 4.412552621507845, "grad_norm": 0.5154531598091125, "learning_rate": 7.162089966594299e-07, "loss": 0.6151, "step": 11530 }, { "epoch": 4.412935323383085, "grad_norm": 0.5751973390579224, "learning_rate": 7.152880127685558e-07, "loss": 0.5541, "step": 11531 }, { "epoch": 4.413318025258324, "grad_norm": 0.5368563532829285, "learning_rate": 7.143675994481758e-07, "loss": 0.5472, "step": 11532 }, { "epoch": 4.413700727133563, "grad_norm": 0.5235212445259094, "learning_rate": 7.134477567548547e-07, "loss": 0.6305, "step": 11533 }, { "epoch": 4.4140834290088025, "grad_norm": 0.5663688778877258, "learning_rate": 7.125284847451152e-07, "loss": 0.6313, "step": 11534 }, { "epoch": 4.414466130884041, "grad_norm": 0.530174732208252, "learning_rate": 7.116097834754509e-07, "loss": 0.6224, "step": 11535 }, { "epoch": 4.41484883275928, "grad_norm": 0.5523028373718262, "learning_rate": 7.106916530023189e-07, "loss": 0.5864, "step": 11536 }, { "epoch": 4.41523153463452, "grad_norm": 0.5614204406738281, "learning_rate": 7.097740933821406e-07, "loss": 0.5905, "step": 11537 }, { "epoch": 4.415614236509759, "grad_norm": 0.5319677591323853, "learning_rate": 7.088571046713022e-07, "loss": 0.5909, "step": 11538 }, { "epoch": 4.415996938384998, "grad_norm": 0.5546190142631531, "learning_rate": 7.079406869261574e-07, "loss": 0.5958, "step": 11539 }, { "epoch": 4.416379640260237, "grad_norm": 0.5701148509979248, "learning_rate": 7.070248402030178e-07, "loss": 0.6109, "step": 11540 }, { "epoch": 4.416762342135477, "grad_norm": 0.552880048751831, "learning_rate": 7.061095645581684e-07, "loss": 0.5995, "step": 11541 }, { "epoch": 4.417145044010716, "grad_norm": 0.6076520681381226, "learning_rate": 7.051948600478542e-07, "loss": 0.6919, "step": 11542 }, { "epoch": 4.4175277458859545, "grad_norm": 0.5679783225059509, "learning_rate": 7.042807267282859e-07, "loss": 0.626, "step": 11543 }, { "epoch": 4.417910447761194, "grad_norm": 0.578410267829895, "learning_rate": 7.033671646556395e-07, "loss": 0.6571, "step": 11544 }, { "epoch": 4.418293149636433, "grad_norm": 0.5463119745254517, "learning_rate": 7.024541738860569e-07, "loss": 0.6314, "step": 11545 }, { "epoch": 4.418675851511672, "grad_norm": 0.5152310132980347, "learning_rate": 7.01541754475642e-07, "loss": 0.6242, "step": 11546 }, { "epoch": 4.419058553386912, "grad_norm": 0.5487401485443115, "learning_rate": 7.006299064804667e-07, "loss": 0.5846, "step": 11547 }, { "epoch": 4.419441255262151, "grad_norm": 0.5551196932792664, "learning_rate": 6.997186299565661e-07, "loss": 0.6451, "step": 11548 }, { "epoch": 4.41982395713739, "grad_norm": 0.5581559538841248, "learning_rate": 6.988079249599433e-07, "loss": 0.6717, "step": 11549 }, { "epoch": 4.420206659012629, "grad_norm": 0.5503594279289246, "learning_rate": 6.97897791546559e-07, "loss": 0.5625, "step": 11550 }, { "epoch": 4.420589360887869, "grad_norm": 0.531947135925293, "learning_rate": 6.969882297723451e-07, "loss": 0.6024, "step": 11551 }, { "epoch": 4.4209720627631075, "grad_norm": 0.537452757358551, "learning_rate": 6.960792396931982e-07, "loss": 0.6573, "step": 11552 }, { "epoch": 4.421354764638346, "grad_norm": 0.5232405662536621, "learning_rate": 6.951708213649766e-07, "loss": 0.6043, "step": 11553 }, { "epoch": 4.421737466513586, "grad_norm": 0.6722478270530701, "learning_rate": 6.942629748435037e-07, "loss": 0.7221, "step": 11554 }, { "epoch": 4.422120168388825, "grad_norm": 0.5643764138221741, "learning_rate": 6.933557001845725e-07, "loss": 0.679, "step": 11555 }, { "epoch": 4.422502870264064, "grad_norm": 0.5384470224380493, "learning_rate": 6.924489974439341e-07, "loss": 0.5963, "step": 11556 }, { "epoch": 4.422885572139304, "grad_norm": 0.5644858479499817, "learning_rate": 6.915428666773106e-07, "loss": 0.6425, "step": 11557 }, { "epoch": 4.423268274014543, "grad_norm": 0.5214495062828064, "learning_rate": 6.90637307940385e-07, "loss": 0.5702, "step": 11558 }, { "epoch": 4.423650975889782, "grad_norm": 0.5713121294975281, "learning_rate": 6.897323212888074e-07, "loss": 0.6392, "step": 11559 }, { "epoch": 4.424033677765021, "grad_norm": 0.5573539137840271, "learning_rate": 6.888279067781888e-07, "loss": 0.6019, "step": 11560 }, { "epoch": 4.4244163796402605, "grad_norm": 0.5432078838348389, "learning_rate": 6.879240644641105e-07, "loss": 0.6714, "step": 11561 }, { "epoch": 4.424799081515499, "grad_norm": 0.5246207118034363, "learning_rate": 6.870207944021168e-07, "loss": 0.6286, "step": 11562 }, { "epoch": 4.425181783390738, "grad_norm": 0.5923270583152771, "learning_rate": 6.861180966477121e-07, "loss": 0.7348, "step": 11563 }, { "epoch": 4.425564485265978, "grad_norm": 0.588011622428894, "learning_rate": 6.852159712563721e-07, "loss": 0.605, "step": 11564 }, { "epoch": 4.425947187141217, "grad_norm": 0.5417081117630005, "learning_rate": 6.84314418283536e-07, "loss": 0.5454, "step": 11565 }, { "epoch": 4.426329889016456, "grad_norm": 0.5871461629867554, "learning_rate": 6.834134377846036e-07, "loss": 0.6334, "step": 11566 }, { "epoch": 4.426712590891696, "grad_norm": 0.5524975657463074, "learning_rate": 6.825130298149452e-07, "loss": 0.6443, "step": 11567 }, { "epoch": 4.427095292766935, "grad_norm": 0.54051673412323, "learning_rate": 6.816131944298943e-07, "loss": 0.5383, "step": 11568 }, { "epoch": 4.427477994642174, "grad_norm": 0.5591880679130554, "learning_rate": 6.807139316847444e-07, "loss": 0.5904, "step": 11569 }, { "epoch": 4.4278606965174125, "grad_norm": 0.5297701954841614, "learning_rate": 6.798152416347603e-07, "loss": 0.6029, "step": 11570 }, { "epoch": 4.428243398392652, "grad_norm": 0.5763322114944458, "learning_rate": 6.789171243351678e-07, "loss": 0.635, "step": 11571 }, { "epoch": 4.428626100267891, "grad_norm": 0.5559956431388855, "learning_rate": 6.780195798411593e-07, "loss": 0.552, "step": 11572 }, { "epoch": 4.42900880214313, "grad_norm": 0.5351948738098145, "learning_rate": 6.771226082078908e-07, "loss": 0.6165, "step": 11573 }, { "epoch": 4.42939150401837, "grad_norm": 0.5904949903488159, "learning_rate": 6.762262094904837e-07, "loss": 0.6409, "step": 11574 }, { "epoch": 4.429774205893609, "grad_norm": 0.5509878396987915, "learning_rate": 6.753303837440239e-07, "loss": 0.6402, "step": 11575 }, { "epoch": 4.430156907768848, "grad_norm": 0.543308675289154, "learning_rate": 6.744351310235631e-07, "loss": 0.6467, "step": 11576 }, { "epoch": 4.430539609644088, "grad_norm": 0.5234894156455994, "learning_rate": 6.73540451384116e-07, "loss": 0.6798, "step": 11577 }, { "epoch": 4.430922311519327, "grad_norm": 0.531318187713623, "learning_rate": 6.726463448806652e-07, "loss": 0.7846, "step": 11578 }, { "epoch": 4.4313050133945655, "grad_norm": 0.5306398272514343, "learning_rate": 6.717528115681537e-07, "loss": 0.5525, "step": 11579 }, { "epoch": 4.431687715269804, "grad_norm": 0.5345569252967834, "learning_rate": 6.708598515014919e-07, "loss": 0.5773, "step": 11580 }, { "epoch": 4.432070417145044, "grad_norm": 0.5379434823989868, "learning_rate": 6.699674647355559e-07, "loss": 0.6202, "step": 11581 }, { "epoch": 4.432453119020283, "grad_norm": 0.733713686466217, "learning_rate": 6.690756513251828e-07, "loss": 0.6258, "step": 11582 }, { "epoch": 4.432835820895522, "grad_norm": 0.5395162105560303, "learning_rate": 6.681844113251779e-07, "loss": 0.587, "step": 11583 }, { "epoch": 4.433218522770762, "grad_norm": 0.607158362865448, "learning_rate": 6.672937447903116e-07, "loss": 0.6814, "step": 11584 }, { "epoch": 4.433601224646001, "grad_norm": 0.49394962191581726, "learning_rate": 6.664036517753158e-07, "loss": 0.6141, "step": 11585 }, { "epoch": 4.43398392652124, "grad_norm": 0.8949559926986694, "learning_rate": 6.655141323348912e-07, "loss": 0.7116, "step": 11586 }, { "epoch": 4.4343666283964795, "grad_norm": 0.5859171152114868, "learning_rate": 6.646251865236997e-07, "loss": 0.7345, "step": 11587 }, { "epoch": 4.4347493302717185, "grad_norm": 0.5352309346199036, "learning_rate": 6.637368143963718e-07, "loss": 0.6027, "step": 11588 }, { "epoch": 4.435132032146957, "grad_norm": 0.6288377642631531, "learning_rate": 6.628490160074963e-07, "loss": 0.6223, "step": 11589 }, { "epoch": 4.435514734022196, "grad_norm": 0.5491818785667419, "learning_rate": 6.619617914116338e-07, "loss": 0.5855, "step": 11590 }, { "epoch": 4.435897435897436, "grad_norm": 0.5176071524620056, "learning_rate": 6.610751406633065e-07, "loss": 0.5542, "step": 11591 }, { "epoch": 4.436280137772675, "grad_norm": 0.5614598393440247, "learning_rate": 6.601890638169995e-07, "loss": 0.6277, "step": 11592 }, { "epoch": 4.436662839647914, "grad_norm": 0.551112949848175, "learning_rate": 6.593035609271658e-07, "loss": 0.6221, "step": 11593 }, { "epoch": 4.437045541523154, "grad_norm": 0.5637988448143005, "learning_rate": 6.584186320482222e-07, "loss": 0.5895, "step": 11594 }, { "epoch": 4.437428243398393, "grad_norm": 0.5334077477455139, "learning_rate": 6.575342772345494e-07, "loss": 0.5474, "step": 11595 }, { "epoch": 4.437810945273632, "grad_norm": 0.5019708275794983, "learning_rate": 6.56650496540494e-07, "loss": 0.5385, "step": 11596 }, { "epoch": 4.438193647148871, "grad_norm": 0.5422314405441284, "learning_rate": 6.55767290020366e-07, "loss": 0.5853, "step": 11597 }, { "epoch": 4.43857634902411, "grad_norm": 0.6926212310791016, "learning_rate": 6.54884657728444e-07, "loss": 0.5742, "step": 11598 }, { "epoch": 4.438959050899349, "grad_norm": 0.5158065557479858, "learning_rate": 6.540025997189625e-07, "loss": 0.5447, "step": 11599 }, { "epoch": 4.439341752774588, "grad_norm": 0.5222346782684326, "learning_rate": 6.531211160461293e-07, "loss": 0.667, "step": 11600 }, { "epoch": 4.439724454649828, "grad_norm": 0.5201216340065002, "learning_rate": 6.522402067641153e-07, "loss": 0.6567, "step": 11601 }, { "epoch": 4.440107156525067, "grad_norm": 0.5355445742607117, "learning_rate": 6.51359871927052e-07, "loss": 0.587, "step": 11602 }, { "epoch": 4.440489858400306, "grad_norm": 0.5465403199195862, "learning_rate": 6.504801115890403e-07, "loss": 0.5728, "step": 11603 }, { "epoch": 4.440872560275546, "grad_norm": 0.6102461814880371, "learning_rate": 6.496009258041436e-07, "loss": 0.5945, "step": 11604 }, { "epoch": 4.441255262150785, "grad_norm": 0.5266895890235901, "learning_rate": 6.487223146263866e-07, "loss": 0.5291, "step": 11605 }, { "epoch": 4.4416379640260235, "grad_norm": 0.5482074618339539, "learning_rate": 6.478442781097672e-07, "loss": 0.6564, "step": 11606 }, { "epoch": 4.442020665901263, "grad_norm": 0.528879702091217, "learning_rate": 6.469668163082433e-07, "loss": 0.6098, "step": 11607 }, { "epoch": 4.442403367776502, "grad_norm": 0.5658349990844727, "learning_rate": 6.460899292757338e-07, "loss": 0.604, "step": 11608 }, { "epoch": 4.442786069651741, "grad_norm": 0.553159236907959, "learning_rate": 6.452136170661272e-07, "loss": 0.5951, "step": 11609 }, { "epoch": 4.44316877152698, "grad_norm": 0.5222615003585815, "learning_rate": 6.443378797332756e-07, "loss": 0.6194, "step": 11610 }, { "epoch": 4.44355147340222, "grad_norm": 0.6011397242546082, "learning_rate": 6.434627173309959e-07, "loss": 0.6655, "step": 11611 }, { "epoch": 4.443934175277459, "grad_norm": 0.5735527873039246, "learning_rate": 6.425881299130677e-07, "loss": 0.7269, "step": 11612 }, { "epoch": 4.444316877152698, "grad_norm": 0.5180725455284119, "learning_rate": 6.417141175332375e-07, "loss": 0.6253, "step": 11613 }, { "epoch": 4.4446995790279376, "grad_norm": 0.5295387506484985, "learning_rate": 6.408406802452171e-07, "loss": 0.6054, "step": 11614 }, { "epoch": 4.4450822809031765, "grad_norm": 0.569581151008606, "learning_rate": 6.39967818102677e-07, "loss": 0.6319, "step": 11615 }, { "epoch": 4.445464982778415, "grad_norm": 0.5250229239463806, "learning_rate": 6.390955311592617e-07, "loss": 0.6045, "step": 11616 }, { "epoch": 4.445847684653655, "grad_norm": 0.5734957456588745, "learning_rate": 6.382238194685752e-07, "loss": 0.643, "step": 11617 }, { "epoch": 4.446230386528894, "grad_norm": 0.5320799946784973, "learning_rate": 6.373526830841847e-07, "loss": 0.5955, "step": 11618 }, { "epoch": 4.446613088404133, "grad_norm": 0.5454009175300598, "learning_rate": 6.36482122059624e-07, "loss": 0.6105, "step": 11619 }, { "epoch": 4.446995790279372, "grad_norm": 0.5858940482139587, "learning_rate": 6.356121364483924e-07, "loss": 0.727, "step": 11620 }, { "epoch": 4.447378492154612, "grad_norm": 0.5515756607055664, "learning_rate": 6.347427263039519e-07, "loss": 0.6434, "step": 11621 }, { "epoch": 4.447761194029851, "grad_norm": 0.49864694476127625, "learning_rate": 6.338738916797304e-07, "loss": 0.5377, "step": 11622 }, { "epoch": 4.44814389590509, "grad_norm": 0.4884408712387085, "learning_rate": 6.330056326291201e-07, "loss": 0.6149, "step": 11623 }, { "epoch": 4.448526597780329, "grad_norm": 0.5091502666473389, "learning_rate": 6.321379492054802e-07, "loss": 0.577, "step": 11624 }, { "epoch": 4.448909299655568, "grad_norm": 0.4993176758289337, "learning_rate": 6.312708414621283e-07, "loss": 0.5938, "step": 11625 }, { "epoch": 4.449292001530807, "grad_norm": 0.5272352695465088, "learning_rate": 6.304043094523504e-07, "loss": 0.5733, "step": 11626 }, { "epoch": 4.449674703406047, "grad_norm": 0.5044870376586914, "learning_rate": 6.295383532294019e-07, "loss": 0.6597, "step": 11627 }, { "epoch": 4.450057405281286, "grad_norm": 0.6351279020309448, "learning_rate": 6.286729728464935e-07, "loss": 0.6392, "step": 11628 }, { "epoch": 4.450440107156525, "grad_norm": 0.5451211929321289, "learning_rate": 6.278081683568072e-07, "loss": 0.5966, "step": 11629 }, { "epoch": 4.450822809031764, "grad_norm": 0.4861851930618286, "learning_rate": 6.26943939813488e-07, "loss": 0.5382, "step": 11630 }, { "epoch": 4.451205510907004, "grad_norm": 0.49030762910842896, "learning_rate": 6.260802872696436e-07, "loss": 0.5422, "step": 11631 }, { "epoch": 4.451588212782243, "grad_norm": 0.5319507718086243, "learning_rate": 6.252172107783483e-07, "loss": 0.6352, "step": 11632 }, { "epoch": 4.4519709146574815, "grad_norm": 0.571259617805481, "learning_rate": 6.243547103926428e-07, "loss": 0.5777, "step": 11633 }, { "epoch": 4.452353616532721, "grad_norm": 0.5220990180969238, "learning_rate": 6.23492786165526e-07, "loss": 0.6004, "step": 11634 }, { "epoch": 4.45273631840796, "grad_norm": 0.562831699848175, "learning_rate": 6.226314381499676e-07, "loss": 0.6104, "step": 11635 }, { "epoch": 4.453119020283199, "grad_norm": 0.5379453897476196, "learning_rate": 6.217706663988988e-07, "loss": 0.5358, "step": 11636 }, { "epoch": 4.453501722158439, "grad_norm": 0.5329179167747498, "learning_rate": 6.209104709652169e-07, "loss": 0.5959, "step": 11637 }, { "epoch": 4.453884424033678, "grad_norm": 0.5553539991378784, "learning_rate": 6.200508519017845e-07, "loss": 0.565, "step": 11638 }, { "epoch": 4.454267125908917, "grad_norm": 0.5655813813209534, "learning_rate": 6.191918092614258e-07, "loss": 0.6891, "step": 11639 }, { "epoch": 4.454649827784156, "grad_norm": 0.5900316834449768, "learning_rate": 6.18333343096933e-07, "loss": 0.685, "step": 11640 }, { "epoch": 4.455032529659396, "grad_norm": 0.5389359593391418, "learning_rate": 6.174754534610594e-07, "loss": 0.5669, "step": 11641 }, { "epoch": 4.4554152315346345, "grad_norm": 0.5775995850563049, "learning_rate": 6.166181404065252e-07, "loss": 0.7068, "step": 11642 }, { "epoch": 4.455797933409873, "grad_norm": 0.5437980890274048, "learning_rate": 6.157614039860149e-07, "loss": 0.6446, "step": 11643 }, { "epoch": 4.456180635285113, "grad_norm": 0.5661171078681946, "learning_rate": 6.149052442521763e-07, "loss": 0.652, "step": 11644 }, { "epoch": 4.456563337160352, "grad_norm": 0.5438652634620667, "learning_rate": 6.140496612576241e-07, "loss": 0.6238, "step": 11645 }, { "epoch": 4.456946039035591, "grad_norm": 0.5584627389907837, "learning_rate": 6.131946550549339e-07, "loss": 0.58, "step": 11646 }, { "epoch": 4.457328740910831, "grad_norm": 0.5624057650566101, "learning_rate": 6.123402256966515e-07, "loss": 0.6456, "step": 11647 }, { "epoch": 4.45771144278607, "grad_norm": 0.4908067584037781, "learning_rate": 6.114863732352805e-07, "loss": 0.5237, "step": 11648 }, { "epoch": 4.458094144661309, "grad_norm": 0.5442347526550293, "learning_rate": 6.106330977232944e-07, "loss": 0.6179, "step": 11649 }, { "epoch": 4.458476846536548, "grad_norm": 0.5033951997756958, "learning_rate": 6.097803992131313e-07, "loss": 0.5473, "step": 11650 }, { "epoch": 4.458859548411787, "grad_norm": 0.5756692886352539, "learning_rate": 6.08928277757187e-07, "loss": 0.6515, "step": 11651 }, { "epoch": 4.459242250287026, "grad_norm": 0.5535922646522522, "learning_rate": 6.080767334078297e-07, "loss": 0.6178, "step": 11652 }, { "epoch": 4.459624952162265, "grad_norm": 0.5398918986320496, "learning_rate": 6.072257662173897e-07, "loss": 0.588, "step": 11653 }, { "epoch": 4.460007654037505, "grad_norm": 0.6488944888114929, "learning_rate": 6.063753762381575e-07, "loss": 0.668, "step": 11654 }, { "epoch": 4.460390355912744, "grad_norm": 0.5297558307647705, "learning_rate": 6.055255635223955e-07, "loss": 0.5944, "step": 11655 }, { "epoch": 4.460773057787983, "grad_norm": 0.5493123531341553, "learning_rate": 6.046763281223255e-07, "loss": 0.58, "step": 11656 }, { "epoch": 4.461155759663223, "grad_norm": 0.5354337096214294, "learning_rate": 6.038276700901346e-07, "loss": 0.6156, "step": 11657 }, { "epoch": 4.461538461538462, "grad_norm": 0.5455252528190613, "learning_rate": 6.029795894779778e-07, "loss": 0.6435, "step": 11658 }, { "epoch": 4.461921163413701, "grad_norm": 0.5480889678001404, "learning_rate": 6.02132086337971e-07, "loss": 0.6176, "step": 11659 }, { "epoch": 4.4623038652889395, "grad_norm": 0.557575523853302, "learning_rate": 6.012851607221926e-07, "loss": 0.6521, "step": 11660 }, { "epoch": 4.462686567164179, "grad_norm": 0.5689406991004944, "learning_rate": 6.00438812682691e-07, "loss": 0.6155, "step": 11661 }, { "epoch": 4.463069269039418, "grad_norm": 0.6123286485671997, "learning_rate": 5.995930422714768e-07, "loss": 0.6109, "step": 11662 }, { "epoch": 4.463451970914657, "grad_norm": 0.516672670841217, "learning_rate": 5.987478495405252e-07, "loss": 0.6471, "step": 11663 }, { "epoch": 4.463834672789897, "grad_norm": 0.5624162554740906, "learning_rate": 5.979032345417735e-07, "loss": 0.5867, "step": 11664 }, { "epoch": 4.464217374665136, "grad_norm": 0.536940336227417, "learning_rate": 5.970591973271256e-07, "loss": 0.6547, "step": 11665 }, { "epoch": 4.464600076540375, "grad_norm": 0.49849772453308105, "learning_rate": 5.962157379484524e-07, "loss": 0.603, "step": 11666 }, { "epoch": 4.464982778415615, "grad_norm": 0.5496699213981628, "learning_rate": 5.953728564575845e-07, "loss": 0.6125, "step": 11667 }, { "epoch": 4.465365480290854, "grad_norm": 0.5411420464515686, "learning_rate": 5.945305529063195e-07, "loss": 0.6326, "step": 11668 }, { "epoch": 4.4657481821660925, "grad_norm": 0.5352087020874023, "learning_rate": 5.936888273464226e-07, "loss": 0.5821, "step": 11669 }, { "epoch": 4.466130884041331, "grad_norm": 0.5765442848205566, "learning_rate": 5.928476798296146e-07, "loss": 0.6963, "step": 11670 }, { "epoch": 4.466513585916571, "grad_norm": 0.5374569892883301, "learning_rate": 5.920071104075898e-07, "loss": 0.5793, "step": 11671 }, { "epoch": 4.46689628779181, "grad_norm": 0.5585675835609436, "learning_rate": 5.911671191320034e-07, "loss": 0.6672, "step": 11672 }, { "epoch": 4.467278989667049, "grad_norm": 0.5265167355537415, "learning_rate": 5.903277060544721e-07, "loss": 0.5291, "step": 11673 }, { "epoch": 4.467661691542289, "grad_norm": 0.5549155473709106, "learning_rate": 5.894888712265834e-07, "loss": 0.5612, "step": 11674 }, { "epoch": 4.468044393417528, "grad_norm": 0.5758666396141052, "learning_rate": 5.886506146998838e-07, "loss": 0.6732, "step": 11675 }, { "epoch": 4.468427095292767, "grad_norm": 0.554671049118042, "learning_rate": 5.878129365258889e-07, "loss": 0.5583, "step": 11676 }, { "epoch": 4.4688097971680065, "grad_norm": 0.5685891509056091, "learning_rate": 5.869758367560729e-07, "loss": 0.6342, "step": 11677 }, { "epoch": 4.4691924990432454, "grad_norm": 0.5440572500228882, "learning_rate": 5.861393154418815e-07, "loss": 0.6033, "step": 11678 }, { "epoch": 4.469575200918484, "grad_norm": 0.5312686562538147, "learning_rate": 5.853033726347201e-07, "loss": 0.5692, "step": 11679 }, { "epoch": 4.469957902793723, "grad_norm": 0.5494184494018555, "learning_rate": 5.844680083859577e-07, "loss": 0.639, "step": 11680 }, { "epoch": 4.470340604668963, "grad_norm": 0.5340695977210999, "learning_rate": 5.836332227469299e-07, "loss": 0.5831, "step": 11681 }, { "epoch": 4.470723306544202, "grad_norm": 0.5658212900161743, "learning_rate": 5.827990157689401e-07, "loss": 0.6391, "step": 11682 }, { "epoch": 4.471106008419441, "grad_norm": 0.5676540732383728, "learning_rate": 5.819653875032482e-07, "loss": 0.6874, "step": 11683 }, { "epoch": 4.471488710294681, "grad_norm": 0.7047023773193359, "learning_rate": 5.811323380010848e-07, "loss": 0.5917, "step": 11684 }, { "epoch": 4.47187141216992, "grad_norm": 0.5145113468170166, "learning_rate": 5.802998673136429e-07, "loss": 0.5956, "step": 11685 }, { "epoch": 4.472254114045159, "grad_norm": 0.5180082321166992, "learning_rate": 5.794679754920796e-07, "loss": 0.594, "step": 11686 }, { "epoch": 4.472636815920398, "grad_norm": 0.5757470726966858, "learning_rate": 5.786366625875184e-07, "loss": 0.5256, "step": 11687 }, { "epoch": 4.473019517795637, "grad_norm": 0.5697283148765564, "learning_rate": 5.77805928651044e-07, "loss": 0.5903, "step": 11688 }, { "epoch": 4.473402219670876, "grad_norm": 0.5233200192451477, "learning_rate": 5.76975773733709e-07, "loss": 0.5427, "step": 11689 }, { "epoch": 4.473784921546115, "grad_norm": 0.5528836250305176, "learning_rate": 5.761461978865269e-07, "loss": 0.6736, "step": 11690 }, { "epoch": 4.474167623421355, "grad_norm": 0.5047960877418518, "learning_rate": 5.75317201160478e-07, "loss": 0.6565, "step": 11691 }, { "epoch": 4.474550325296594, "grad_norm": 0.5203193426132202, "learning_rate": 5.744887836065072e-07, "loss": 0.5781, "step": 11692 }, { "epoch": 4.474933027171833, "grad_norm": 0.4936862587928772, "learning_rate": 5.736609452755215e-07, "loss": 0.6045, "step": 11693 }, { "epoch": 4.475315729047073, "grad_norm": 0.5369796752929688, "learning_rate": 5.728336862183936e-07, "loss": 0.5532, "step": 11694 }, { "epoch": 4.475698430922312, "grad_norm": 0.518889844417572, "learning_rate": 5.720070064859617e-07, "loss": 0.6161, "step": 11695 }, { "epoch": 4.4760811327975505, "grad_norm": 0.5386477708816528, "learning_rate": 5.711809061290285e-07, "loss": 0.5863, "step": 11696 }, { "epoch": 4.47646383467279, "grad_norm": 0.5133368372917175, "learning_rate": 5.703553851983579e-07, "loss": 0.5712, "step": 11697 }, { "epoch": 4.476846536548029, "grad_norm": 0.5448476076126099, "learning_rate": 5.695304437446835e-07, "loss": 0.6826, "step": 11698 }, { "epoch": 4.477229238423268, "grad_norm": 0.5819980502128601, "learning_rate": 5.687060818186974e-07, "loss": 0.6298, "step": 11699 }, { "epoch": 4.477611940298507, "grad_norm": 0.5176756978034973, "learning_rate": 5.678822994710587e-07, "loss": 0.6306, "step": 11700 }, { "epoch": 4.477994642173747, "grad_norm": 0.627921998500824, "learning_rate": 5.670590967523936e-07, "loss": 0.6185, "step": 11701 }, { "epoch": 4.478377344048986, "grad_norm": 0.5336430072784424, "learning_rate": 5.662364737132875e-07, "loss": 0.741, "step": 11702 }, { "epoch": 4.478760045924225, "grad_norm": 0.5607943534851074, "learning_rate": 5.65414430404293e-07, "loss": 0.6741, "step": 11703 }, { "epoch": 4.4791427477994645, "grad_norm": 0.5118253827095032, "learning_rate": 5.645929668759275e-07, "loss": 0.5839, "step": 11704 }, { "epoch": 4.4795254496747035, "grad_norm": 0.5032827854156494, "learning_rate": 5.637720831786731e-07, "loss": 0.6064, "step": 11705 }, { "epoch": 4.479908151549942, "grad_norm": 0.552728533744812, "learning_rate": 5.629517793629746e-07, "loss": 0.571, "step": 11706 }, { "epoch": 4.480290853425182, "grad_norm": 0.5903404355049133, "learning_rate": 5.62132055479242e-07, "loss": 0.6302, "step": 11707 }, { "epoch": 4.480673555300421, "grad_norm": 0.6411042213439941, "learning_rate": 5.613129115778504e-07, "loss": 0.6194, "step": 11708 }, { "epoch": 4.48105625717566, "grad_norm": 0.5667031407356262, "learning_rate": 5.604943477091351e-07, "loss": 0.7173, "step": 11709 }, { "epoch": 4.481438959050899, "grad_norm": 0.5452989935874939, "learning_rate": 5.596763639234026e-07, "loss": 0.5865, "step": 11710 }, { "epoch": 4.481821660926139, "grad_norm": 0.5159440040588379, "learning_rate": 5.588589602709193e-07, "loss": 0.6066, "step": 11711 }, { "epoch": 4.482204362801378, "grad_norm": 0.5974624752998352, "learning_rate": 5.580421368019151e-07, "loss": 0.593, "step": 11712 }, { "epoch": 4.482587064676617, "grad_norm": 0.5336064696311951, "learning_rate": 5.572258935665875e-07, "loss": 0.6017, "step": 11713 }, { "epoch": 4.482969766551856, "grad_norm": 0.6084231734275818, "learning_rate": 5.564102306150987e-07, "loss": 0.6569, "step": 11714 }, { "epoch": 4.483352468427095, "grad_norm": 0.6402245759963989, "learning_rate": 5.555951479975674e-07, "loss": 0.6498, "step": 11715 }, { "epoch": 4.483735170302334, "grad_norm": 0.5077067017555237, "learning_rate": 5.547806457640892e-07, "loss": 0.6472, "step": 11716 }, { "epoch": 4.484117872177574, "grad_norm": 0.5490917563438416, "learning_rate": 5.53966723964714e-07, "loss": 0.6121, "step": 11717 }, { "epoch": 4.484500574052813, "grad_norm": 0.530243992805481, "learning_rate": 5.531533826494617e-07, "loss": 0.5872, "step": 11718 }, { "epoch": 4.484883275928052, "grad_norm": 0.5239179134368896, "learning_rate": 5.523406218683114e-07, "loss": 0.6098, "step": 11719 }, { "epoch": 4.485265977803291, "grad_norm": 0.5719754099845886, "learning_rate": 5.515284416712108e-07, "loss": 0.682, "step": 11720 }, { "epoch": 4.485648679678531, "grad_norm": 0.5332634449005127, "learning_rate": 5.507168421080733e-07, "loss": 0.5886, "step": 11721 }, { "epoch": 4.48603138155377, "grad_norm": 0.5645827054977417, "learning_rate": 5.499058232287691e-07, "loss": 0.6146, "step": 11722 }, { "epoch": 4.4864140834290085, "grad_norm": 0.6010430455207825, "learning_rate": 5.490953850831393e-07, "loss": 0.6675, "step": 11723 }, { "epoch": 4.486796785304248, "grad_norm": 0.6062299609184265, "learning_rate": 5.482855277209886e-07, "loss": 0.6224, "step": 11724 }, { "epoch": 4.487179487179487, "grad_norm": 0.5251789093017578, "learning_rate": 5.474762511920839e-07, "loss": 0.6373, "step": 11725 }, { "epoch": 4.487562189054726, "grad_norm": 0.5522649884223938, "learning_rate": 5.466675555461565e-07, "loss": 0.5539, "step": 11726 }, { "epoch": 4.487944890929966, "grad_norm": 0.5676888227462769, "learning_rate": 5.458594408329065e-07, "loss": 0.6449, "step": 11727 }, { "epoch": 4.488327592805205, "grad_norm": 0.5452454090118408, "learning_rate": 5.4505190710199e-07, "loss": 0.5876, "step": 11728 }, { "epoch": 4.488710294680444, "grad_norm": 0.5336045622825623, "learning_rate": 5.442449544030359e-07, "loss": 0.5336, "step": 11729 }, { "epoch": 4.489092996555683, "grad_norm": 0.5648736953735352, "learning_rate": 5.434385827856315e-07, "loss": 0.6083, "step": 11730 }, { "epoch": 4.4894756984309225, "grad_norm": 0.5417981147766113, "learning_rate": 5.426327922993324e-07, "loss": 0.631, "step": 11731 }, { "epoch": 4.4898584003061615, "grad_norm": 0.5557138919830322, "learning_rate": 5.418275829936537e-07, "loss": 0.6426, "step": 11732 }, { "epoch": 4.4902411021814, "grad_norm": 0.5198363661766052, "learning_rate": 5.410229549180801e-07, "loss": 0.6202, "step": 11733 }, { "epoch": 4.49062380405664, "grad_norm": 0.5840898156166077, "learning_rate": 5.402189081220577e-07, "loss": 0.6824, "step": 11734 }, { "epoch": 4.491006505931879, "grad_norm": 0.5395063757896423, "learning_rate": 5.394154426549969e-07, "loss": 0.5952, "step": 11735 }, { "epoch": 4.491389207807118, "grad_norm": 0.6848244071006775, "learning_rate": 5.386125585662716e-07, "loss": 0.554, "step": 11736 }, { "epoch": 4.491771909682358, "grad_norm": 0.4919540584087372, "learning_rate": 5.378102559052234e-07, "loss": 0.5607, "step": 11737 }, { "epoch": 4.492154611557597, "grad_norm": 0.5485332012176514, "learning_rate": 5.370085347211551e-07, "loss": 0.595, "step": 11738 }, { "epoch": 4.492537313432836, "grad_norm": 0.5224841237068176, "learning_rate": 5.36207395063334e-07, "loss": 0.5321, "step": 11739 }, { "epoch": 4.492920015308075, "grad_norm": 0.5402424335479736, "learning_rate": 5.35406836980994e-07, "loss": 0.5785, "step": 11740 }, { "epoch": 4.493302717183314, "grad_norm": 0.5625516772270203, "learning_rate": 5.346068605233301e-07, "loss": 0.6054, "step": 11741 }, { "epoch": 4.493685419058553, "grad_norm": 0.5760561227798462, "learning_rate": 5.338074657395021e-07, "loss": 0.6037, "step": 11742 }, { "epoch": 4.494068120933792, "grad_norm": 0.5259699821472168, "learning_rate": 5.330086526786371e-07, "loss": 0.6494, "step": 11743 }, { "epoch": 4.494450822809032, "grad_norm": 0.5784645080566406, "learning_rate": 5.322104213898238e-07, "loss": 0.6405, "step": 11744 }, { "epoch": 4.494833524684271, "grad_norm": 0.5759619474411011, "learning_rate": 5.314127719221152e-07, "loss": 0.5591, "step": 11745 }, { "epoch": 4.49521622655951, "grad_norm": 0.5310688018798828, "learning_rate": 5.306157043245286e-07, "loss": 0.5869, "step": 11746 }, { "epoch": 4.49559892843475, "grad_norm": 0.5961280465126038, "learning_rate": 5.29819218646046e-07, "loss": 0.5473, "step": 11747 }, { "epoch": 4.495981630309989, "grad_norm": 0.5301914811134338, "learning_rate": 5.290233149356139e-07, "loss": 0.6443, "step": 11748 }, { "epoch": 4.496364332185228, "grad_norm": 0.4982032775878906, "learning_rate": 5.28227993242143e-07, "loss": 0.5771, "step": 11749 }, { "epoch": 4.4967470340604665, "grad_norm": 0.5124770998954773, "learning_rate": 5.274332536145088e-07, "loss": 0.533, "step": 11750 }, { "epoch": 4.497129735935706, "grad_norm": 0.5579079389572144, "learning_rate": 5.266390961015477e-07, "loss": 0.7017, "step": 11751 }, { "epoch": 4.497512437810945, "grad_norm": 0.5330807566642761, "learning_rate": 5.258455207520641e-07, "loss": 0.5298, "step": 11752 }, { "epoch": 4.497895139686184, "grad_norm": 0.5086737871170044, "learning_rate": 5.250525276148278e-07, "loss": 0.58, "step": 11753 }, { "epoch": 4.498277841561424, "grad_norm": 0.5592315793037415, "learning_rate": 5.242601167385663e-07, "loss": 0.6398, "step": 11754 }, { "epoch": 4.498660543436663, "grad_norm": 0.539566695690155, "learning_rate": 5.234682881719766e-07, "loss": 0.6357, "step": 11755 }, { "epoch": 4.499043245311902, "grad_norm": 0.5505167245864868, "learning_rate": 5.226770419637195e-07, "loss": 0.5724, "step": 11756 }, { "epoch": 4.499425947187142, "grad_norm": 0.578654944896698, "learning_rate": 5.218863781624184e-07, "loss": 0.5832, "step": 11757 }, { "epoch": 4.4998086490623805, "grad_norm": 0.5801931619644165, "learning_rate": 5.210962968166633e-07, "loss": 0.611, "step": 11758 }, { "epoch": 4.5001913509376195, "grad_norm": 0.5482897162437439, "learning_rate": 5.203067979750043e-07, "loss": 0.6474, "step": 11759 }, { "epoch": 4.500574052812858, "grad_norm": 0.5391135811805725, "learning_rate": 5.195178816859625e-07, "loss": 0.5904, "step": 11760 }, { "epoch": 4.500956754688098, "grad_norm": 0.5715667605400085, "learning_rate": 5.187295479980136e-07, "loss": 0.6126, "step": 11761 }, { "epoch": 4.501339456563337, "grad_norm": 0.6280490159988403, "learning_rate": 5.179417969596057e-07, "loss": 0.6537, "step": 11762 }, { "epoch": 4.501722158438576, "grad_norm": 0.5247637033462524, "learning_rate": 5.171546286191498e-07, "loss": 0.5891, "step": 11763 }, { "epoch": 4.502104860313816, "grad_norm": 0.5522745847702026, "learning_rate": 5.163680430250151e-07, "loss": 0.6127, "step": 11764 }, { "epoch": 4.502487562189055, "grad_norm": 0.5418065190315247, "learning_rate": 5.155820402255429e-07, "loss": 0.6082, "step": 11765 }, { "epoch": 4.502870264064294, "grad_norm": 0.565119206905365, "learning_rate": 5.147966202690335e-07, "loss": 0.6625, "step": 11766 }, { "epoch": 4.503252965939533, "grad_norm": 0.5241219997406006, "learning_rate": 5.140117832037539e-07, "loss": 0.5405, "step": 11767 }, { "epoch": 4.503635667814772, "grad_norm": 0.5393630862236023, "learning_rate": 5.132275290779343e-07, "loss": 0.6842, "step": 11768 }, { "epoch": 4.504018369690011, "grad_norm": 0.5437049269676208, "learning_rate": 5.124438579397684e-07, "loss": 0.6157, "step": 11769 }, { "epoch": 4.504401071565251, "grad_norm": 0.6019932627677917, "learning_rate": 5.116607698374176e-07, "loss": 0.6259, "step": 11770 }, { "epoch": 4.50478377344049, "grad_norm": 0.55706387758255, "learning_rate": 5.108782648190013e-07, "loss": 0.6333, "step": 11771 }, { "epoch": 4.505166475315729, "grad_norm": 0.540966272354126, "learning_rate": 5.100963429326089e-07, "loss": 0.5637, "step": 11772 }, { "epoch": 4.505549177190968, "grad_norm": 0.5739874839782715, "learning_rate": 5.093150042262918e-07, "loss": 0.6857, "step": 11773 }, { "epoch": 4.505931879066208, "grad_norm": 0.5149136781692505, "learning_rate": 5.085342487480627e-07, "loss": 0.5352, "step": 11774 }, { "epoch": 4.506314580941447, "grad_norm": 0.5951369404792786, "learning_rate": 5.077540765459021e-07, "loss": 0.6862, "step": 11775 }, { "epoch": 4.506697282816686, "grad_norm": 0.6001572012901306, "learning_rate": 5.069744876677551e-07, "loss": 0.6383, "step": 11776 }, { "epoch": 4.507079984691925, "grad_norm": 0.5398831963539124, "learning_rate": 5.061954821615278e-07, "loss": 0.5878, "step": 11777 }, { "epoch": 4.507462686567164, "grad_norm": 0.5372045040130615, "learning_rate": 5.05417060075094e-07, "loss": 0.6284, "step": 11778 }, { "epoch": 4.507845388442403, "grad_norm": 0.5741170644760132, "learning_rate": 5.046392214562901e-07, "loss": 0.5825, "step": 11779 }, { "epoch": 4.508228090317642, "grad_norm": 0.5211293697357178, "learning_rate": 5.038619663529143e-07, "loss": 0.6189, "step": 11780 }, { "epoch": 4.508610792192882, "grad_norm": 0.5692930221557617, "learning_rate": 5.030852948127318e-07, "loss": 0.6196, "step": 11781 }, { "epoch": 4.508993494068121, "grad_norm": 0.5830888152122498, "learning_rate": 5.023092068834712e-07, "loss": 0.6373, "step": 11782 }, { "epoch": 4.50937619594336, "grad_norm": 0.5404382348060608, "learning_rate": 5.015337026128264e-07, "loss": 0.5816, "step": 11783 }, { "epoch": 4.5097588978186, "grad_norm": 0.5124760866165161, "learning_rate": 5.007587820484517e-07, "loss": 0.5743, "step": 11784 }, { "epoch": 4.5101415996938385, "grad_norm": 0.6014795899391174, "learning_rate": 4.999844452379699e-07, "loss": 0.6271, "step": 11785 }, { "epoch": 4.5105243015690775, "grad_norm": 0.6129798889160156, "learning_rate": 4.992106922289652e-07, "loss": 0.6191, "step": 11786 }, { "epoch": 4.510907003444316, "grad_norm": 0.57005774974823, "learning_rate": 4.984375230689875e-07, "loss": 0.7088, "step": 11787 }, { "epoch": 4.511289705319556, "grad_norm": 0.5448446273803711, "learning_rate": 4.976649378055498e-07, "loss": 0.608, "step": 11788 }, { "epoch": 4.511672407194795, "grad_norm": 0.6024661064147949, "learning_rate": 4.968929364861308e-07, "loss": 0.6468, "step": 11789 }, { "epoch": 4.512055109070035, "grad_norm": 0.5602962970733643, "learning_rate": 4.961215191581692e-07, "loss": 0.6403, "step": 11790 }, { "epoch": 4.512437810945274, "grad_norm": 0.5730224847793579, "learning_rate": 4.953506858690726e-07, "loss": 0.6831, "step": 11791 }, { "epoch": 4.512820512820513, "grad_norm": 0.49685007333755493, "learning_rate": 4.945804366662121e-07, "loss": 0.5286, "step": 11792 }, { "epoch": 4.513203214695752, "grad_norm": 0.7018316984176636, "learning_rate": 4.938107715969187e-07, "loss": 0.5916, "step": 11793 }, { "epoch": 4.5135859165709915, "grad_norm": 0.6051359176635742, "learning_rate": 4.930416907084912e-07, "loss": 0.5717, "step": 11794 }, { "epoch": 4.51396861844623, "grad_norm": 0.5491724014282227, "learning_rate": 4.922731940481929e-07, "loss": 0.6363, "step": 11795 }, { "epoch": 4.514351320321469, "grad_norm": 0.5782783031463623, "learning_rate": 4.915052816632482e-07, "loss": 0.6665, "step": 11796 }, { "epoch": 4.514734022196709, "grad_norm": 0.5531595945358276, "learning_rate": 4.907379536008494e-07, "loss": 0.6952, "step": 11797 }, { "epoch": 4.515116724071948, "grad_norm": 0.5417954921722412, "learning_rate": 4.899712099081488e-07, "loss": 0.6376, "step": 11798 }, { "epoch": 4.515499425947187, "grad_norm": 0.5189024209976196, "learning_rate": 4.892050506322687e-07, "loss": 0.5142, "step": 11799 }, { "epoch": 4.515882127822426, "grad_norm": 0.5621868968009949, "learning_rate": 4.88439475820286e-07, "loss": 0.6053, "step": 11800 }, { "epoch": 4.516264829697666, "grad_norm": 0.57747483253479, "learning_rate": 4.876744855192516e-07, "loss": 0.5827, "step": 11801 }, { "epoch": 4.516647531572905, "grad_norm": 0.49137547612190247, "learning_rate": 4.86910079776175e-07, "loss": 0.5548, "step": 11802 }, { "epoch": 4.517030233448144, "grad_norm": 0.53453129529953, "learning_rate": 4.861462586380305e-07, "loss": 0.5977, "step": 11803 }, { "epoch": 4.517412935323383, "grad_norm": 0.5680879354476929, "learning_rate": 4.853830221517564e-07, "loss": 0.6734, "step": 11804 }, { "epoch": 4.517795637198622, "grad_norm": 0.5121333003044128, "learning_rate": 4.846203703642572e-07, "loss": 0.5568, "step": 11805 }, { "epoch": 4.518178339073861, "grad_norm": 0.54777991771698, "learning_rate": 4.838583033223987e-07, "loss": 0.5678, "step": 11806 }, { "epoch": 4.5185610409491, "grad_norm": 0.558025062084198, "learning_rate": 4.830968210730124e-07, "loss": 0.6136, "step": 11807 }, { "epoch": 4.51894374282434, "grad_norm": 0.5192382335662842, "learning_rate": 4.823359236628944e-07, "loss": 0.622, "step": 11808 }, { "epoch": 4.519326444699579, "grad_norm": 0.5825366377830505, "learning_rate": 4.815756111388037e-07, "loss": 0.6302, "step": 11809 }, { "epoch": 4.519709146574819, "grad_norm": 0.5112608075141907, "learning_rate": 4.808158835474619e-07, "loss": 0.6267, "step": 11810 }, { "epoch": 4.520091848450058, "grad_norm": 0.551100492477417, "learning_rate": 4.800567409355572e-07, "loss": 0.6072, "step": 11811 }, { "epoch": 4.5204745503252965, "grad_norm": 0.570942759513855, "learning_rate": 4.792981833497412e-07, "loss": 0.5755, "step": 11812 }, { "epoch": 4.5208572522005355, "grad_norm": 0.5247594118118286, "learning_rate": 4.785402108366289e-07, "loss": 0.5896, "step": 11813 }, { "epoch": 4.521239954075775, "grad_norm": 0.556790828704834, "learning_rate": 4.777828234427994e-07, "loss": 0.6016, "step": 11814 }, { "epoch": 4.521622655951014, "grad_norm": 0.5680943727493286, "learning_rate": 4.770260212147971e-07, "loss": 0.583, "step": 11815 }, { "epoch": 4.522005357826253, "grad_norm": 0.5775167942047119, "learning_rate": 4.762698041991298e-07, "loss": 0.6054, "step": 11816 }, { "epoch": 4.522388059701493, "grad_norm": 0.5412175059318542, "learning_rate": 4.755141724422674e-07, "loss": 0.5795, "step": 11817 }, { "epoch": 4.522770761576732, "grad_norm": 0.5278676152229309, "learning_rate": 4.747591259906481e-07, "loss": 0.6105, "step": 11818 }, { "epoch": 4.523153463451971, "grad_norm": 0.5151717066764832, "learning_rate": 4.7400466489066823e-07, "loss": 0.555, "step": 11819 }, { "epoch": 4.52353616532721, "grad_norm": 0.5071009993553162, "learning_rate": 4.7325078918869394e-07, "loss": 0.5209, "step": 11820 }, { "epoch": 4.5239188672024495, "grad_norm": 0.5332441329956055, "learning_rate": 4.724974989310527e-07, "loss": 0.6318, "step": 11821 }, { "epoch": 4.524301569077688, "grad_norm": 0.5272001028060913, "learning_rate": 4.7174479416403517e-07, "loss": 0.57, "step": 11822 }, { "epoch": 4.524684270952927, "grad_norm": 0.5291545987129211, "learning_rate": 4.709926749338978e-07, "loss": 0.5585, "step": 11823 }, { "epoch": 4.525066972828167, "grad_norm": 0.560947597026825, "learning_rate": 4.7024114128686017e-07, "loss": 0.6363, "step": 11824 }, { "epoch": 4.525449674703406, "grad_norm": 0.6998535394668579, "learning_rate": 4.694901932691065e-07, "loss": 0.5445, "step": 11825 }, { "epoch": 4.525832376578645, "grad_norm": 0.5963816046714783, "learning_rate": 4.687398309267821e-07, "loss": 0.6016, "step": 11826 }, { "epoch": 4.526215078453884, "grad_norm": 0.5666505694389343, "learning_rate": 4.679900543060012e-07, "loss": 0.6293, "step": 11827 }, { "epoch": 4.526597780329124, "grad_norm": 0.5647178888320923, "learning_rate": 4.672408634528414e-07, "loss": 0.6107, "step": 11828 }, { "epoch": 4.526980482204363, "grad_norm": 0.546783447265625, "learning_rate": 4.66492258413338e-07, "loss": 0.6404, "step": 11829 }, { "epoch": 4.5273631840796025, "grad_norm": 0.5401746034622192, "learning_rate": 4.6574423923349767e-07, "loss": 0.6139, "step": 11830 }, { "epoch": 4.527745885954841, "grad_norm": 0.4986925721168518, "learning_rate": 4.64996805959288e-07, "loss": 0.6102, "step": 11831 }, { "epoch": 4.52812858783008, "grad_norm": 0.5547558665275574, "learning_rate": 4.6424995863664e-07, "loss": 0.6531, "step": 11832 }, { "epoch": 4.528511289705319, "grad_norm": 0.5087123513221741, "learning_rate": 4.635036973114493e-07, "loss": 0.647, "step": 11833 }, { "epoch": 4.528893991580559, "grad_norm": 0.5929021835327148, "learning_rate": 4.6275802202957576e-07, "loss": 0.6669, "step": 11834 }, { "epoch": 4.529276693455798, "grad_norm": 0.5346415042877197, "learning_rate": 4.62012932836845e-07, "loss": 0.6738, "step": 11835 }, { "epoch": 4.529659395331037, "grad_norm": 0.5710256099700928, "learning_rate": 4.6126842977904264e-07, "loss": 0.6244, "step": 11836 }, { "epoch": 4.530042097206277, "grad_norm": 0.5763043761253357, "learning_rate": 4.605245129019187e-07, "loss": 0.5927, "step": 11837 }, { "epoch": 4.530424799081516, "grad_norm": 0.5557389855384827, "learning_rate": 4.597811822511944e-07, "loss": 0.6359, "step": 11838 }, { "epoch": 4.5308075009567546, "grad_norm": 0.5440192222595215, "learning_rate": 4.590384378725443e-07, "loss": 0.6047, "step": 11839 }, { "epoch": 4.5311902028319935, "grad_norm": 0.6033305525779724, "learning_rate": 4.582962798116142e-07, "loss": 0.6216, "step": 11840 }, { "epoch": 4.531572904707233, "grad_norm": 0.6919610500335693, "learning_rate": 4.57554708114013e-07, "loss": 0.6769, "step": 11841 }, { "epoch": 4.531955606582472, "grad_norm": 0.551542341709137, "learning_rate": 4.568137228253089e-07, "loss": 0.6341, "step": 11842 }, { "epoch": 4.532338308457711, "grad_norm": 0.5479455590248108, "learning_rate": 4.560733239910387e-07, "loss": 0.5785, "step": 11843 }, { "epoch": 4.532721010332951, "grad_norm": 0.5478429794311523, "learning_rate": 4.553335116567048e-07, "loss": 0.6147, "step": 11844 }, { "epoch": 4.53310371220819, "grad_norm": 0.5092911720275879, "learning_rate": 4.5459428586776654e-07, "loss": 0.4843, "step": 11845 }, { "epoch": 4.533486414083429, "grad_norm": 0.5648206472396851, "learning_rate": 4.538556466696531e-07, "loss": 0.5931, "step": 11846 }, { "epoch": 4.533869115958668, "grad_norm": 0.5422651171684265, "learning_rate": 4.531175941077537e-07, "loss": 0.6221, "step": 11847 }, { "epoch": 4.5342518178339075, "grad_norm": 0.5287827253341675, "learning_rate": 4.5238012822742874e-07, "loss": 0.5703, "step": 11848 }, { "epoch": 4.534634519709146, "grad_norm": 0.5504177212715149, "learning_rate": 4.5164324907399417e-07, "loss": 0.6105, "step": 11849 }, { "epoch": 4.535017221584386, "grad_norm": 0.5449342727661133, "learning_rate": 4.5090695669273264e-07, "loss": 0.6092, "step": 11850 }, { "epoch": 4.535399923459625, "grad_norm": 0.5862095952033997, "learning_rate": 4.501712511288936e-07, "loss": 0.6469, "step": 11851 }, { "epoch": 4.535782625334864, "grad_norm": 0.5245534181594849, "learning_rate": 4.4943613242768523e-07, "loss": 0.6351, "step": 11852 }, { "epoch": 4.536165327210103, "grad_norm": 0.5224121809005737, "learning_rate": 4.487016006342837e-07, "loss": 0.6432, "step": 11853 }, { "epoch": 4.536548029085343, "grad_norm": 0.6077659130096436, "learning_rate": 4.479676557938306e-07, "loss": 0.6082, "step": 11854 }, { "epoch": 4.536930730960582, "grad_norm": 0.5345128178596497, "learning_rate": 4.4723429795142434e-07, "loss": 0.6194, "step": 11855 }, { "epoch": 4.537313432835821, "grad_norm": 0.545944333076477, "learning_rate": 4.4650152715213444e-07, "loss": 0.6402, "step": 11856 }, { "epoch": 4.5376961347110605, "grad_norm": 0.5813195705413818, "learning_rate": 4.4576934344099045e-07, "loss": 0.6165, "step": 11857 }, { "epoch": 4.538078836586299, "grad_norm": 0.5277384519577026, "learning_rate": 4.450377468629885e-07, "loss": 0.6709, "step": 11858 }, { "epoch": 4.538461538461538, "grad_norm": 0.5739874243736267, "learning_rate": 4.4430673746308605e-07, "loss": 0.6175, "step": 11859 }, { "epoch": 4.538844240336777, "grad_norm": 0.5573253035545349, "learning_rate": 4.4357631528620495e-07, "loss": 0.7041, "step": 11860 }, { "epoch": 4.539226942212017, "grad_norm": 0.5607824921607971, "learning_rate": 4.4284648037723477e-07, "loss": 0.663, "step": 11861 }, { "epoch": 4.539609644087256, "grad_norm": 0.525721549987793, "learning_rate": 4.4211723278102193e-07, "loss": 0.606, "step": 11862 }, { "epoch": 4.539992345962495, "grad_norm": 0.5028517842292786, "learning_rate": 4.4138857254238274e-07, "loss": 0.5409, "step": 11863 }, { "epoch": 4.540375047837735, "grad_norm": 0.5218502283096313, "learning_rate": 4.406604997060959e-07, "loss": 0.6359, "step": 11864 }, { "epoch": 4.540757749712974, "grad_norm": 0.5217921733856201, "learning_rate": 4.3993301431690115e-07, "loss": 0.649, "step": 11865 }, { "epoch": 4.541140451588213, "grad_norm": 0.5023990869522095, "learning_rate": 4.3920611641950494e-07, "loss": 0.5288, "step": 11866 }, { "epoch": 4.5415231534634515, "grad_norm": 0.5402843356132507, "learning_rate": 4.384798060585782e-07, "loss": 0.5735, "step": 11867 }, { "epoch": 4.541905855338691, "grad_norm": 0.5066766738891602, "learning_rate": 4.3775408327875526e-07, "loss": 0.5323, "step": 11868 }, { "epoch": 4.54228855721393, "grad_norm": 0.5480735301971436, "learning_rate": 4.3702894812463146e-07, "loss": 0.6018, "step": 11869 }, { "epoch": 4.54267125908917, "grad_norm": 0.552323579788208, "learning_rate": 4.363044006407724e-07, "loss": 0.578, "step": 11870 }, { "epoch": 4.543053960964409, "grad_norm": 0.5876510739326477, "learning_rate": 4.355804408716979e-07, "loss": 0.6397, "step": 11871 }, { "epoch": 4.543436662839648, "grad_norm": 0.5370407104492188, "learning_rate": 4.3485706886190136e-07, "loss": 0.5691, "step": 11872 }, { "epoch": 4.543819364714887, "grad_norm": 0.5558518767356873, "learning_rate": 4.3413428465583385e-07, "loss": 0.6728, "step": 11873 }, { "epoch": 4.544202066590127, "grad_norm": 0.5739495754241943, "learning_rate": 4.3341208829791426e-07, "loss": 0.5445, "step": 11874 }, { "epoch": 4.5445847684653655, "grad_norm": 0.573823094367981, "learning_rate": 4.3269047983252156e-07, "loss": 0.637, "step": 11875 }, { "epoch": 4.544967470340604, "grad_norm": 0.5423131585121155, "learning_rate": 4.3196945930400025e-07, "loss": 0.6752, "step": 11876 }, { "epoch": 4.545350172215844, "grad_norm": 0.5003639459609985, "learning_rate": 4.312490267566616e-07, "loss": 0.6371, "step": 11877 }, { "epoch": 4.545732874091083, "grad_norm": 0.5288115739822388, "learning_rate": 4.305291822347757e-07, "loss": 0.5614, "step": 11878 }, { "epoch": 4.546115575966322, "grad_norm": 0.5892658233642578, "learning_rate": 4.298099257825805e-07, "loss": 0.5814, "step": 11879 }, { "epoch": 4.546498277841561, "grad_norm": 0.5279377102851868, "learning_rate": 4.290912574442763e-07, "loss": 0.6047, "step": 11880 }, { "epoch": 4.546880979716801, "grad_norm": 0.5573335289955139, "learning_rate": 4.283731772640254e-07, "loss": 0.6019, "step": 11881 }, { "epoch": 4.54726368159204, "grad_norm": 0.5645923614501953, "learning_rate": 4.27655685285957e-07, "loss": 0.6518, "step": 11882 }, { "epoch": 4.547646383467279, "grad_norm": 0.5674377083778381, "learning_rate": 4.269387815541637e-07, "loss": 0.6363, "step": 11883 }, { "epoch": 4.5480290853425185, "grad_norm": 0.8457763195037842, "learning_rate": 4.26222466112699e-07, "loss": 0.6063, "step": 11884 }, { "epoch": 4.548411787217757, "grad_norm": 0.5756105780601501, "learning_rate": 4.2550673900558336e-07, "loss": 0.6536, "step": 11885 }, { "epoch": 4.548794489092996, "grad_norm": 0.5366872549057007, "learning_rate": 4.2479160027680043e-07, "loss": 0.5692, "step": 11886 }, { "epoch": 4.549177190968235, "grad_norm": 0.5673533082008362, "learning_rate": 4.2407704997029843e-07, "loss": 0.6246, "step": 11887 }, { "epoch": 4.549559892843475, "grad_norm": 0.521731436252594, "learning_rate": 4.233630881299855e-07, "loss": 0.5945, "step": 11888 }, { "epoch": 4.549942594718714, "grad_norm": 0.5807080268859863, "learning_rate": 4.2264971479973993e-07, "loss": 0.607, "step": 11889 }, { "epoch": 4.550325296593954, "grad_norm": 0.537246823310852, "learning_rate": 4.219369300233989e-07, "loss": 0.5726, "step": 11890 }, { "epoch": 4.550707998469193, "grad_norm": 0.5640912652015686, "learning_rate": 4.2122473384476504e-07, "loss": 0.545, "step": 11891 }, { "epoch": 4.551090700344432, "grad_norm": 0.5491268634796143, "learning_rate": 4.205131263076034e-07, "loss": 0.609, "step": 11892 }, { "epoch": 4.551473402219671, "grad_norm": 0.5424216389656067, "learning_rate": 4.198021074556469e-07, "loss": 0.6206, "step": 11893 }, { "epoch": 4.55185610409491, "grad_norm": 0.5256964564323425, "learning_rate": 4.1909167733258703e-07, "loss": 0.618, "step": 11894 }, { "epoch": 4.552238805970149, "grad_norm": 0.5385007858276367, "learning_rate": 4.1838183598208235e-07, "loss": 0.5271, "step": 11895 }, { "epoch": 4.552621507845388, "grad_norm": 0.5192835330963135, "learning_rate": 4.1767258344775465e-07, "loss": 0.5858, "step": 11896 }, { "epoch": 4.553004209720628, "grad_norm": 0.5535687208175659, "learning_rate": 4.1696391977319007e-07, "loss": 0.6146, "step": 11897 }, { "epoch": 4.553386911595867, "grad_norm": 0.5802410840988159, "learning_rate": 4.162558450019372e-07, "loss": 0.6485, "step": 11898 }, { "epoch": 4.553769613471106, "grad_norm": 0.5229335427284241, "learning_rate": 4.155483591775089e-07, "loss": 0.586, "step": 11899 }, { "epoch": 4.554152315346345, "grad_norm": 0.5071057677268982, "learning_rate": 4.1484146234338386e-07, "loss": 0.539, "step": 11900 }, { "epoch": 4.554535017221585, "grad_norm": 0.5722752213478088, "learning_rate": 4.1413515454299944e-07, "loss": 0.6326, "step": 11901 }, { "epoch": 4.5549177190968235, "grad_norm": 0.5098156929016113, "learning_rate": 4.1342943581976323e-07, "loss": 0.645, "step": 11902 }, { "epoch": 4.5553004209720624, "grad_norm": 0.5656176805496216, "learning_rate": 4.127243062170427e-07, "loss": 0.6013, "step": 11903 }, { "epoch": 4.555683122847302, "grad_norm": 0.5369493365287781, "learning_rate": 4.120197657781699e-07, "loss": 0.5556, "step": 11904 }, { "epoch": 4.556065824722541, "grad_norm": 0.5577359199523926, "learning_rate": 4.113158145464391e-07, "loss": 0.6884, "step": 11905 }, { "epoch": 4.55644852659778, "grad_norm": 0.5563628673553467, "learning_rate": 4.1061245256511227e-07, "loss": 0.6155, "step": 11906 }, { "epoch": 4.556831228473019, "grad_norm": 0.5728029608726501, "learning_rate": 4.0990967987741157e-07, "loss": 0.6411, "step": 11907 }, { "epoch": 4.557213930348259, "grad_norm": 0.5016283392906189, "learning_rate": 4.092074965265247e-07, "loss": 0.5958, "step": 11908 }, { "epoch": 4.557596632223498, "grad_norm": 0.5940382480621338, "learning_rate": 4.0850590255560375e-07, "loss": 0.5407, "step": 11909 }, { "epoch": 4.557979334098738, "grad_norm": 0.6317160725593567, "learning_rate": 4.07804898007762e-07, "loss": 0.6535, "step": 11910 }, { "epoch": 4.5583620359739765, "grad_norm": 0.5028939247131348, "learning_rate": 4.071044829260773e-07, "loss": 0.6106, "step": 11911 }, { "epoch": 4.558744737849215, "grad_norm": 0.5638496279716492, "learning_rate": 4.064046573535962e-07, "loss": 0.6095, "step": 11912 }, { "epoch": 4.559127439724454, "grad_norm": 0.586836040019989, "learning_rate": 4.0570542133332e-07, "loss": 0.6978, "step": 11913 }, { "epoch": 4.559510141599694, "grad_norm": 0.5325730443000793, "learning_rate": 4.0500677490821985e-07, "loss": 0.696, "step": 11914 }, { "epoch": 4.559892843474933, "grad_norm": 0.5079731345176697, "learning_rate": 4.043087181212313e-07, "loss": 0.5471, "step": 11915 }, { "epoch": 4.560275545350172, "grad_norm": 0.5336204767227173, "learning_rate": 4.0361125101525124e-07, "loss": 0.5808, "step": 11916 }, { "epoch": 4.560658247225412, "grad_norm": 0.5476039052009583, "learning_rate": 4.029143736331387e-07, "loss": 0.5814, "step": 11917 }, { "epoch": 4.561040949100651, "grad_norm": 0.5143474340438843, "learning_rate": 4.022180860177216e-07, "loss": 0.6103, "step": 11918 }, { "epoch": 4.56142365097589, "grad_norm": 0.5431118011474609, "learning_rate": 4.0152238821178803e-07, "loss": 0.5758, "step": 11919 }, { "epoch": 4.561806352851129, "grad_norm": 0.5392035245895386, "learning_rate": 4.0082728025808925e-07, "loss": 0.6005, "step": 11920 }, { "epoch": 4.562189054726368, "grad_norm": 0.5715843439102173, "learning_rate": 4.0013276219934225e-07, "loss": 0.5825, "step": 11921 }, { "epoch": 4.562571756601607, "grad_norm": 0.5626299977302551, "learning_rate": 3.9943883407822846e-07, "loss": 0.6568, "step": 11922 }, { "epoch": 4.562954458476846, "grad_norm": 0.5117465257644653, "learning_rate": 3.987454959373882e-07, "loss": 0.6933, "step": 11923 }, { "epoch": 4.563337160352086, "grad_norm": 0.5268619656562805, "learning_rate": 3.980527478194307e-07, "loss": 0.5491, "step": 11924 }, { "epoch": 4.563719862227325, "grad_norm": 0.539592981338501, "learning_rate": 3.973605897669286e-07, "loss": 0.5772, "step": 11925 }, { "epoch": 4.564102564102564, "grad_norm": 0.5745587944984436, "learning_rate": 3.9666902182241453e-07, "loss": 0.6158, "step": 11926 }, { "epoch": 4.564485265977803, "grad_norm": 0.5407849550247192, "learning_rate": 3.959780440283878e-07, "loss": 0.6069, "step": 11927 }, { "epoch": 4.564867967853043, "grad_norm": 0.5298041701316833, "learning_rate": 3.9528765642731225e-07, "loss": 0.5658, "step": 11928 }, { "epoch": 4.5652506697282815, "grad_norm": 0.5797930359840393, "learning_rate": 3.94597859061614e-07, "loss": 0.6817, "step": 11929 }, { "epoch": 4.565633371603521, "grad_norm": 0.5191001296043396, "learning_rate": 3.939086519736812e-07, "loss": 0.5451, "step": 11930 }, { "epoch": 4.56601607347876, "grad_norm": 0.5604599118232727, "learning_rate": 3.93220035205869e-07, "loss": 0.652, "step": 11931 }, { "epoch": 4.566398775353999, "grad_norm": 0.5251785516738892, "learning_rate": 3.925320088004947e-07, "loss": 0.6489, "step": 11932 }, { "epoch": 4.566781477229238, "grad_norm": 0.5318724513053894, "learning_rate": 3.9184457279983766e-07, "loss": 0.6109, "step": 11933 }, { "epoch": 4.567164179104478, "grad_norm": 0.5630382299423218, "learning_rate": 3.911577272461442e-07, "loss": 0.5861, "step": 11934 }, { "epoch": 4.567546880979717, "grad_norm": 0.5216839909553528, "learning_rate": 3.9047147218162273e-07, "loss": 0.6147, "step": 11935 }, { "epoch": 4.567929582854956, "grad_norm": 0.5305783748626709, "learning_rate": 3.897858076484451e-07, "loss": 0.6421, "step": 11936 }, { "epoch": 4.568312284730196, "grad_norm": 0.5647500157356262, "learning_rate": 3.8910073368874533e-07, "loss": 0.6338, "step": 11937 }, { "epoch": 4.5686949866054345, "grad_norm": 0.5181987881660461, "learning_rate": 3.884162503446276e-07, "loss": 0.6009, "step": 11938 }, { "epoch": 4.569077688480673, "grad_norm": 0.5664381980895996, "learning_rate": 3.8773235765815155e-07, "loss": 0.5823, "step": 11939 }, { "epoch": 4.569460390355912, "grad_norm": 0.6120026111602783, "learning_rate": 3.8704905567134574e-07, "loss": 0.6266, "step": 11940 }, { "epoch": 4.569843092231152, "grad_norm": 0.5514488220214844, "learning_rate": 3.8636634442620004e-07, "loss": 0.6572, "step": 11941 }, { "epoch": 4.570225794106391, "grad_norm": 0.5257810950279236, "learning_rate": 3.8568422396467075e-07, "loss": 0.5454, "step": 11942 }, { "epoch": 4.57060849598163, "grad_norm": 0.5510586500167847, "learning_rate": 3.8500269432867335e-07, "loss": 0.6797, "step": 11943 }, { "epoch": 4.57099119785687, "grad_norm": 0.5892579555511475, "learning_rate": 3.84321755560092e-07, "loss": 0.6282, "step": 11944 }, { "epoch": 4.571373899732109, "grad_norm": 0.5549626350402832, "learning_rate": 3.836414077007711e-07, "loss": 0.6356, "step": 11945 }, { "epoch": 4.571756601607348, "grad_norm": 0.5236932635307312, "learning_rate": 3.829616507925193e-07, "loss": 0.5615, "step": 11946 }, { "epoch": 4.572139303482587, "grad_norm": 0.5465821027755737, "learning_rate": 3.8228248487710893e-07, "loss": 0.5827, "step": 11947 }, { "epoch": 4.572522005357826, "grad_norm": 0.5543435215950012, "learning_rate": 3.8160390999627986e-07, "loss": 0.5821, "step": 11948 }, { "epoch": 4.572904707233065, "grad_norm": 0.5432076454162598, "learning_rate": 3.809259261917297e-07, "loss": 0.7074, "step": 11949 }, { "epoch": 4.573287409108305, "grad_norm": 0.5335464477539062, "learning_rate": 3.80248533505122e-07, "loss": 0.6356, "step": 11950 }, { "epoch": 4.573670110983544, "grad_norm": 0.5292462706565857, "learning_rate": 3.795717319780867e-07, "loss": 0.6635, "step": 11951 }, { "epoch": 4.574052812858783, "grad_norm": 0.520757257938385, "learning_rate": 3.7889552165221277e-07, "loss": 0.6487, "step": 11952 }, { "epoch": 4.574435514734022, "grad_norm": 1.1594294309616089, "learning_rate": 3.782199025690558e-07, "loss": 0.6787, "step": 11953 }, { "epoch": 4.574818216609262, "grad_norm": 0.6147387623786926, "learning_rate": 3.7754487477013493e-07, "loss": 0.7296, "step": 11954 }, { "epoch": 4.575200918484501, "grad_norm": 0.5368598699569702, "learning_rate": 3.768704382969335e-07, "loss": 0.661, "step": 11955 }, { "epoch": 4.5755836203597395, "grad_norm": 0.5395115613937378, "learning_rate": 3.761965931908951e-07, "loss": 0.6033, "step": 11956 }, { "epoch": 4.575966322234979, "grad_norm": 0.5860260128974915, "learning_rate": 3.755233394934299e-07, "loss": 0.6129, "step": 11957 }, { "epoch": 4.576349024110218, "grad_norm": 0.6240185499191284, "learning_rate": 3.748506772459104e-07, "loss": 0.6099, "step": 11958 }, { "epoch": 4.576731725985457, "grad_norm": 0.5283963680267334, "learning_rate": 3.7417860648967573e-07, "loss": 0.5218, "step": 11959 }, { "epoch": 4.577114427860696, "grad_norm": 0.5039244890213013, "learning_rate": 3.7350712726602623e-07, "loss": 0.6093, "step": 11960 }, { "epoch": 4.577497129735936, "grad_norm": 0.5977012515068054, "learning_rate": 3.728362396162255e-07, "loss": 0.6128, "step": 11961 }, { "epoch": 4.577879831611175, "grad_norm": 0.5572035312652588, "learning_rate": 3.721659435814995e-07, "loss": 0.6238, "step": 11962 }, { "epoch": 4.578262533486414, "grad_norm": 0.5199502110481262, "learning_rate": 3.7149623920304187e-07, "loss": 0.6912, "step": 11963 }, { "epoch": 4.578645235361654, "grad_norm": 0.579789400100708, "learning_rate": 3.708271265220087e-07, "loss": 0.7681, "step": 11964 }, { "epoch": 4.5790279372368925, "grad_norm": 0.5527381896972656, "learning_rate": 3.701586055795148e-07, "loss": 0.5515, "step": 11965 }, { "epoch": 4.579410639112131, "grad_norm": 0.5420636534690857, "learning_rate": 3.6949067641664616e-07, "loss": 0.5564, "step": 11966 }, { "epoch": 4.57979334098737, "grad_norm": 0.5714250802993774, "learning_rate": 3.6882333907444777e-07, "loss": 0.649, "step": 11967 }, { "epoch": 4.58017604286261, "grad_norm": 0.5212757587432861, "learning_rate": 3.681565935939291e-07, "loss": 0.6053, "step": 11968 }, { "epoch": 4.580558744737849, "grad_norm": 0.532256543636322, "learning_rate": 3.6749044001606395e-07, "loss": 0.5897, "step": 11969 }, { "epoch": 4.580941446613089, "grad_norm": 0.5700889229774475, "learning_rate": 3.6682487838178847e-07, "loss": 0.7211, "step": 11970 }, { "epoch": 4.581324148488328, "grad_norm": 0.5702705979347229, "learning_rate": 3.6615990873200447e-07, "loss": 0.7612, "step": 11971 }, { "epoch": 4.581706850363567, "grad_norm": 0.6382039785385132, "learning_rate": 3.6549553110757474e-07, "loss": 0.6279, "step": 11972 }, { "epoch": 4.582089552238806, "grad_norm": 0.5180734992027283, "learning_rate": 3.6483174554932664e-07, "loss": 0.6201, "step": 11973 }, { "epoch": 4.5824722541140455, "grad_norm": 0.5503820180892944, "learning_rate": 3.6416855209805423e-07, "loss": 0.658, "step": 11974 }, { "epoch": 4.582854955989284, "grad_norm": 0.5875105857849121, "learning_rate": 3.635059507945105e-07, "loss": 0.7049, "step": 11975 }, { "epoch": 4.583237657864523, "grad_norm": 0.5061069130897522, "learning_rate": 3.6284394167941385e-07, "loss": 0.5649, "step": 11976 }, { "epoch": 4.583620359739763, "grad_norm": 0.511499285697937, "learning_rate": 3.621825247934474e-07, "loss": 0.5567, "step": 11977 }, { "epoch": 4.584003061615002, "grad_norm": 0.5581942200660706, "learning_rate": 3.615217001772564e-07, "loss": 0.6637, "step": 11978 }, { "epoch": 4.584385763490241, "grad_norm": 0.6000387072563171, "learning_rate": 3.608614678714506e-07, "loss": 0.6312, "step": 11979 }, { "epoch": 4.58476846536548, "grad_norm": 0.5322536826133728, "learning_rate": 3.6020182791660195e-07, "loss": 0.6054, "step": 11980 }, { "epoch": 4.58515116724072, "grad_norm": 0.5623868703842163, "learning_rate": 3.595427803532514e-07, "loss": 0.5465, "step": 11981 }, { "epoch": 4.585533869115959, "grad_norm": 0.6041160225868225, "learning_rate": 3.588843252218932e-07, "loss": 0.5845, "step": 11982 }, { "epoch": 4.5859165709911975, "grad_norm": 0.5770783424377441, "learning_rate": 3.58226462562995e-07, "loss": 0.6886, "step": 11983 }, { "epoch": 4.586299272866437, "grad_norm": 0.5168430805206299, "learning_rate": 3.575691924169833e-07, "loss": 0.5437, "step": 11984 }, { "epoch": 4.586681974741676, "grad_norm": 0.567079484462738, "learning_rate": 3.5691251482424914e-07, "loss": 0.6352, "step": 11985 }, { "epoch": 4.587064676616915, "grad_norm": 0.5733622312545776, "learning_rate": 3.5625642982514583e-07, "loss": 0.6386, "step": 11986 }, { "epoch": 4.587447378492154, "grad_norm": 0.5395804643630981, "learning_rate": 3.5560093745999336e-07, "loss": 0.6159, "step": 11987 }, { "epoch": 4.587830080367394, "grad_norm": 0.5903790593147278, "learning_rate": 3.549460377690728e-07, "loss": 0.6594, "step": 11988 }, { "epoch": 4.588212782242633, "grad_norm": 0.6029056310653687, "learning_rate": 3.5429173079262967e-07, "loss": 0.6415, "step": 11989 }, { "epoch": 4.588595484117873, "grad_norm": 0.5846849679946899, "learning_rate": 3.5363801657087415e-07, "loss": 0.6408, "step": 11990 }, { "epoch": 4.588978185993112, "grad_norm": 0.5255994200706482, "learning_rate": 3.529848951439763e-07, "loss": 0.583, "step": 11991 }, { "epoch": 4.5893608878683505, "grad_norm": 0.5818180441856384, "learning_rate": 3.523323665520728e-07, "loss": 0.6245, "step": 11992 }, { "epoch": 4.589743589743589, "grad_norm": 0.563209056854248, "learning_rate": 3.516804308352628e-07, "loss": 0.6147, "step": 11993 }, { "epoch": 4.590126291618829, "grad_norm": 0.5888954997062683, "learning_rate": 3.5102908803361314e-07, "loss": 0.6238, "step": 11994 }, { "epoch": 4.590508993494068, "grad_norm": 0.5403844118118286, "learning_rate": 3.5037833818714616e-07, "loss": 0.6203, "step": 11995 }, { "epoch": 4.590891695369307, "grad_norm": 0.5332226157188416, "learning_rate": 3.497281813358544e-07, "loss": 0.591, "step": 11996 }, { "epoch": 4.591274397244547, "grad_norm": 0.5517668128013611, "learning_rate": 3.490786175196903e-07, "loss": 0.6336, "step": 11997 }, { "epoch": 4.591657099119786, "grad_norm": 0.5319545865058899, "learning_rate": 3.4842964677857303e-07, "loss": 0.6596, "step": 11998 }, { "epoch": 4.592039800995025, "grad_norm": 0.590878963470459, "learning_rate": 3.4778126915238187e-07, "loss": 0.6433, "step": 11999 }, { "epoch": 4.592422502870264, "grad_norm": 0.5839358568191528, "learning_rate": 3.4713348468096377e-07, "loss": 0.6079, "step": 12000 }, { "epoch": 4.5928052047455035, "grad_norm": 0.5994459390640259, "learning_rate": 3.4648629340412463e-07, "loss": 0.6098, "step": 12001 }, { "epoch": 4.593187906620742, "grad_norm": 0.5676131844520569, "learning_rate": 3.458396953616372e-07, "loss": 0.7098, "step": 12002 }, { "epoch": 4.593570608495981, "grad_norm": 0.5741029381752014, "learning_rate": 3.451936905932374e-07, "loss": 0.6087, "step": 12003 }, { "epoch": 4.593953310371221, "grad_norm": 0.5102450847625732, "learning_rate": 3.4454827913862123e-07, "loss": 0.6422, "step": 12004 }, { "epoch": 4.59433601224646, "grad_norm": 0.5877562165260315, "learning_rate": 3.439034610374525e-07, "loss": 0.5973, "step": 12005 }, { "epoch": 4.594718714121699, "grad_norm": 0.5496272444725037, "learning_rate": 3.432592363293563e-07, "loss": 0.6762, "step": 12006 }, { "epoch": 4.595101415996938, "grad_norm": 0.5721208453178406, "learning_rate": 3.426156050539242e-07, "loss": 0.662, "step": 12007 }, { "epoch": 4.595484117872178, "grad_norm": 0.5406104326248169, "learning_rate": 3.419725672507068e-07, "loss": 0.6588, "step": 12008 }, { "epoch": 4.595866819747417, "grad_norm": 0.8787626624107361, "learning_rate": 3.413301229592214e-07, "loss": 0.6081, "step": 12009 }, { "epoch": 4.596249521622656, "grad_norm": 0.5128843784332275, "learning_rate": 3.4068827221894975e-07, "loss": 0.5482, "step": 12010 }, { "epoch": 4.596632223497895, "grad_norm": 0.506075382232666, "learning_rate": 3.4004701506933134e-07, "loss": 0.5844, "step": 12011 }, { "epoch": 4.597014925373134, "grad_norm": 0.5664159059524536, "learning_rate": 3.394063515497759e-07, "loss": 0.5398, "step": 12012 }, { "epoch": 4.597397627248373, "grad_norm": 0.5588523149490356, "learning_rate": 3.38766281699654e-07, "loss": 0.6086, "step": 12013 }, { "epoch": 4.597780329123613, "grad_norm": 0.5382854342460632, "learning_rate": 3.381268055582976e-07, "loss": 0.6023, "step": 12014 }, { "epoch": 4.598163030998852, "grad_norm": 0.5395389795303345, "learning_rate": 3.3748792316500633e-07, "loss": 0.5876, "step": 12015 }, { "epoch": 4.598545732874091, "grad_norm": 0.5260952711105347, "learning_rate": 3.3684963455903994e-07, "loss": 0.5755, "step": 12016 }, { "epoch": 4.598928434749331, "grad_norm": 0.5482215285301208, "learning_rate": 3.3621193977962375e-07, "loss": 0.647, "step": 12017 }, { "epoch": 4.59931113662457, "grad_norm": 0.5785109996795654, "learning_rate": 3.3557483886594524e-07, "loss": 0.6641, "step": 12018 }, { "epoch": 4.5996938384998085, "grad_norm": 0.5135534405708313, "learning_rate": 3.349383318571553e-07, "loss": 0.5456, "step": 12019 }, { "epoch": 4.600076540375047, "grad_norm": 0.4927951693534851, "learning_rate": 3.343024187923727e-07, "loss": 0.5311, "step": 12020 }, { "epoch": 4.600459242250287, "grad_norm": 0.5579778552055359, "learning_rate": 3.3366709971067166e-07, "loss": 0.5681, "step": 12021 }, { "epoch": 4.600841944125526, "grad_norm": 0.5661026835441589, "learning_rate": 3.330323746510955e-07, "loss": 0.6469, "step": 12022 }, { "epoch": 4.601224646000765, "grad_norm": 0.5752442479133606, "learning_rate": 3.323982436526507e-07, "loss": 0.563, "step": 12023 }, { "epoch": 4.601607347876005, "grad_norm": 0.5763504505157471, "learning_rate": 3.3176470675430505e-07, "loss": 0.6578, "step": 12024 }, { "epoch": 4.601990049751244, "grad_norm": 0.5400048494338989, "learning_rate": 3.311317639949929e-07, "loss": 0.6078, "step": 12025 }, { "epoch": 4.602372751626483, "grad_norm": 0.583737850189209, "learning_rate": 3.304994154136076e-07, "loss": 0.6754, "step": 12026 }, { "epoch": 4.602755453501722, "grad_norm": 0.5496630668640137, "learning_rate": 3.2986766104901146e-07, "loss": 0.6743, "step": 12027 }, { "epoch": 4.6031381553769615, "grad_norm": 0.5141679048538208, "learning_rate": 3.292365009400256e-07, "loss": 0.6052, "step": 12028 }, { "epoch": 4.6035208572522, "grad_norm": 0.555976927280426, "learning_rate": 3.286059351254378e-07, "loss": 0.6012, "step": 12029 }, { "epoch": 4.60390355912744, "grad_norm": 0.5615912079811096, "learning_rate": 3.279759636439972e-07, "loss": 0.6853, "step": 12030 }, { "epoch": 4.604286261002679, "grad_norm": 0.5005000233650208, "learning_rate": 3.273465865344172e-07, "loss": 0.5301, "step": 12031 }, { "epoch": 4.604668962877918, "grad_norm": 0.5348353981971741, "learning_rate": 3.267178038353747e-07, "loss": 0.6603, "step": 12032 }, { "epoch": 4.605051664753157, "grad_norm": 0.5171619653701782, "learning_rate": 3.26089615585512e-07, "loss": 0.5934, "step": 12033 }, { "epoch": 4.605434366628397, "grad_norm": 0.6017894744873047, "learning_rate": 3.2546202182342945e-07, "loss": 0.6588, "step": 12034 }, { "epoch": 4.605817068503636, "grad_norm": 0.5511787533760071, "learning_rate": 3.2483502258769617e-07, "loss": 0.73, "step": 12035 }, { "epoch": 4.606199770378875, "grad_norm": 0.565092146396637, "learning_rate": 3.2420861791684464e-07, "loss": 0.6311, "step": 12036 }, { "epoch": 4.606582472254114, "grad_norm": 0.6042028665542603, "learning_rate": 3.2358280784936526e-07, "loss": 0.5915, "step": 12037 }, { "epoch": 4.606965174129353, "grad_norm": 0.5931689739227295, "learning_rate": 3.2295759242371937e-07, "loss": 0.5461, "step": 12038 }, { "epoch": 4.607347876004592, "grad_norm": 0.5423325896263123, "learning_rate": 3.2233297167832636e-07, "loss": 0.6585, "step": 12039 }, { "epoch": 4.607730577879831, "grad_norm": 0.547478199005127, "learning_rate": 3.2170894565157096e-07, "loss": 0.6024, "step": 12040 }, { "epoch": 4.608113279755071, "grad_norm": 0.5033291578292847, "learning_rate": 3.210855143818015e-07, "loss": 0.5851, "step": 12041 }, { "epoch": 4.60849598163031, "grad_norm": 0.5588648319244385, "learning_rate": 3.2046267790733054e-07, "loss": 0.6628, "step": 12042 }, { "epoch": 4.608878683505549, "grad_norm": 0.5340779423713684, "learning_rate": 3.19840436266432e-07, "loss": 0.6309, "step": 12043 }, { "epoch": 4.609261385380789, "grad_norm": 0.5351253747940063, "learning_rate": 3.192187894973431e-07, "loss": 0.6086, "step": 12044 }, { "epoch": 4.609644087256028, "grad_norm": 0.5286898016929626, "learning_rate": 3.185977376382665e-07, "loss": 0.6008, "step": 12045 }, { "epoch": 4.6100267891312665, "grad_norm": 0.5501571297645569, "learning_rate": 3.1797728072736956e-07, "loss": 0.6411, "step": 12046 }, { "epoch": 4.610409491006505, "grad_norm": 0.5587639808654785, "learning_rate": 3.1735741880277726e-07, "loss": 0.6256, "step": 12047 }, { "epoch": 4.610792192881745, "grad_norm": 0.6758572459220886, "learning_rate": 3.1673815190258474e-07, "loss": 0.6227, "step": 12048 }, { "epoch": 4.611174894756984, "grad_norm": 0.5354729294776917, "learning_rate": 3.161194800648482e-07, "loss": 0.6329, "step": 12049 }, { "epoch": 4.611557596632224, "grad_norm": 0.5609530806541443, "learning_rate": 3.1550140332758384e-07, "loss": 0.6752, "step": 12050 }, { "epoch": 4.611940298507463, "grad_norm": 0.5455505847930908, "learning_rate": 3.1488392172877577e-07, "loss": 0.5539, "step": 12051 }, { "epoch": 4.612323000382702, "grad_norm": 0.5600203275680542, "learning_rate": 3.1426703530637146e-07, "loss": 0.6788, "step": 12052 }, { "epoch": 4.612705702257941, "grad_norm": 0.5675562024116516, "learning_rate": 3.136507440982761e-07, "loss": 0.6217, "step": 12053 }, { "epoch": 4.6130884041331806, "grad_norm": 0.5766631960868835, "learning_rate": 3.1303504814236494e-07, "loss": 0.688, "step": 12054 }, { "epoch": 4.6134711060084195, "grad_norm": 0.5643371939659119, "learning_rate": 3.124199474764755e-07, "loss": 0.6047, "step": 12055 }, { "epoch": 4.613853807883658, "grad_norm": 0.5173043608665466, "learning_rate": 3.1180544213840536e-07, "loss": 0.5778, "step": 12056 }, { "epoch": 4.614236509758898, "grad_norm": 0.5507822632789612, "learning_rate": 3.111915321659176e-07, "loss": 0.6086, "step": 12057 }, { "epoch": 4.614619211634137, "grad_norm": 0.5580874681472778, "learning_rate": 3.105782175967376e-07, "loss": 0.5935, "step": 12058 }, { "epoch": 4.615001913509376, "grad_norm": 0.6420934200286865, "learning_rate": 3.099654984685585e-07, "loss": 0.6162, "step": 12059 }, { "epoch": 4.615384615384615, "grad_norm": 0.5685631632804871, "learning_rate": 3.0935337481903137e-07, "loss": 0.579, "step": 12060 }, { "epoch": 4.615767317259855, "grad_norm": 0.49993273615837097, "learning_rate": 3.0874184668577277e-07, "loss": 0.665, "step": 12061 }, { "epoch": 4.616150019135094, "grad_norm": 0.5560086965560913, "learning_rate": 3.0813091410636487e-07, "loss": 0.5867, "step": 12062 }, { "epoch": 4.616532721010333, "grad_norm": 0.5940142273902893, "learning_rate": 3.075205771183476e-07, "loss": 0.7116, "step": 12063 }, { "epoch": 4.616915422885572, "grad_norm": 0.577073335647583, "learning_rate": 3.069108357592299e-07, "loss": 0.6412, "step": 12064 }, { "epoch": 4.617298124760811, "grad_norm": 0.5729652047157288, "learning_rate": 3.063016900664828e-07, "loss": 0.594, "step": 12065 }, { "epoch": 4.61768082663605, "grad_norm": 0.6078553795814514, "learning_rate": 3.056931400775376e-07, "loss": 0.6466, "step": 12066 }, { "epoch": 4.618063528511289, "grad_norm": 0.5781973600387573, "learning_rate": 3.0508518582979317e-07, "loss": 0.5692, "step": 12067 }, { "epoch": 4.618446230386529, "grad_norm": 0.5540727376937866, "learning_rate": 3.044778273606086e-07, "loss": 0.5761, "step": 12068 }, { "epoch": 4.618828932261768, "grad_norm": 0.5752570033073425, "learning_rate": 3.0387106470730954e-07, "loss": 0.6216, "step": 12069 }, { "epoch": 4.619211634137008, "grad_norm": 0.5189759135246277, "learning_rate": 3.032648979071806e-07, "loss": 0.5309, "step": 12070 }, { "epoch": 4.619594336012247, "grad_norm": 0.5315952897071838, "learning_rate": 3.0265932699747534e-07, "loss": 0.6061, "step": 12071 }, { "epoch": 4.619977037887486, "grad_norm": 0.5215435028076172, "learning_rate": 3.0205435201540625e-07, "loss": 0.6161, "step": 12072 }, { "epoch": 4.6203597397627245, "grad_norm": 0.5906978249549866, "learning_rate": 3.0144997299815013e-07, "loss": 0.6426, "step": 12073 }, { "epoch": 4.620742441637964, "grad_norm": 0.48627984523773193, "learning_rate": 3.008461899828474e-07, "loss": 0.5941, "step": 12074 }, { "epoch": 4.621125143513203, "grad_norm": 0.523879885673523, "learning_rate": 3.0024300300660504e-07, "loss": 0.6286, "step": 12075 }, { "epoch": 4.621507845388442, "grad_norm": 0.5286355018615723, "learning_rate": 2.996404121064878e-07, "loss": 0.5706, "step": 12076 }, { "epoch": 4.621890547263682, "grad_norm": 0.5158140659332275, "learning_rate": 2.99038417319526e-07, "loss": 0.5444, "step": 12077 }, { "epoch": 4.622273249138921, "grad_norm": 0.528745174407959, "learning_rate": 2.984370186827157e-07, "loss": 0.6026, "step": 12078 }, { "epoch": 4.62265595101416, "grad_norm": 0.5490230917930603, "learning_rate": 2.9783621623301284e-07, "loss": 0.6083, "step": 12079 }, { "epoch": 4.623038652889399, "grad_norm": 0.5441139340400696, "learning_rate": 2.9723601000734013e-07, "loss": 0.6784, "step": 12080 }, { "epoch": 4.623421354764639, "grad_norm": 0.5304040312767029, "learning_rate": 2.966364000425814e-07, "loss": 0.6462, "step": 12081 }, { "epoch": 4.6238040566398775, "grad_norm": 0.5552332997322083, "learning_rate": 2.960373863755839e-07, "loss": 0.6038, "step": 12082 }, { "epoch": 4.624186758515116, "grad_norm": 0.5848835110664368, "learning_rate": 2.9543896904315693e-07, "loss": 0.5774, "step": 12083 }, { "epoch": 4.624569460390356, "grad_norm": 0.5143131613731384, "learning_rate": 2.9484114808207786e-07, "loss": 0.4827, "step": 12084 }, { "epoch": 4.624952162265595, "grad_norm": 0.5645819306373596, "learning_rate": 2.9424392352908395e-07, "loss": 0.6389, "step": 12085 }, { "epoch": 4.625334864140834, "grad_norm": 0.5689683556556702, "learning_rate": 2.9364729542087354e-07, "loss": 0.5958, "step": 12086 }, { "epoch": 4.625717566016074, "grad_norm": 0.5533234477043152, "learning_rate": 2.9305126379411406e-07, "loss": 0.6784, "step": 12087 }, { "epoch": 4.626100267891313, "grad_norm": 0.5609163045883179, "learning_rate": 2.924558286854307e-07, "loss": 0.6071, "step": 12088 }, { "epoch": 4.626482969766552, "grad_norm": 0.5801453590393066, "learning_rate": 2.9186099013141733e-07, "loss": 0.7041, "step": 12089 }, { "epoch": 4.6268656716417915, "grad_norm": 0.5991806983947754, "learning_rate": 2.91266748168626e-07, "loss": 0.5391, "step": 12090 }, { "epoch": 4.6272483735170304, "grad_norm": 0.5167158842086792, "learning_rate": 2.906731028335763e-07, "loss": 0.6829, "step": 12091 }, { "epoch": 4.627631075392269, "grad_norm": 0.5303935408592224, "learning_rate": 2.9008005416274796e-07, "loss": 0.5447, "step": 12092 }, { "epoch": 4.628013777267508, "grad_norm": 0.5287306904792786, "learning_rate": 2.894876021925852e-07, "loss": 0.5844, "step": 12093 }, { "epoch": 4.628396479142748, "grad_norm": 0.5339670181274414, "learning_rate": 2.888957469594977e-07, "loss": 0.5971, "step": 12094 }, { "epoch": 4.628779181017987, "grad_norm": 0.5334836840629578, "learning_rate": 2.8830448849985535e-07, "loss": 0.6316, "step": 12095 }, { "epoch": 4.629161882893226, "grad_norm": 0.6398550868034363, "learning_rate": 2.877138268499913e-07, "loss": 0.7026, "step": 12096 }, { "epoch": 4.629544584768466, "grad_norm": 0.564466655254364, "learning_rate": 2.871237620462053e-07, "loss": 0.6295, "step": 12097 }, { "epoch": 4.629927286643705, "grad_norm": 0.5517302751541138, "learning_rate": 2.865342941247573e-07, "loss": 0.612, "step": 12098 }, { "epoch": 4.630309988518944, "grad_norm": 0.5474767684936523, "learning_rate": 2.859454231218717e-07, "loss": 0.6226, "step": 12099 }, { "epoch": 4.6306926903941825, "grad_norm": 0.6021904945373535, "learning_rate": 2.853571490737372e-07, "loss": 0.6187, "step": 12100 }, { "epoch": 4.631075392269422, "grad_norm": 0.5018277764320374, "learning_rate": 2.847694720165051e-07, "loss": 0.6072, "step": 12101 }, { "epoch": 4.631458094144661, "grad_norm": 0.4981164336204529, "learning_rate": 2.8418239198628737e-07, "loss": 0.5958, "step": 12102 }, { "epoch": 4.6318407960199, "grad_norm": 0.5296295285224915, "learning_rate": 2.835959090191631e-07, "loss": 0.5825, "step": 12103 }, { "epoch": 4.63222349789514, "grad_norm": 0.5907389521598816, "learning_rate": 2.8301002315117454e-07, "loss": 0.6146, "step": 12104 }, { "epoch": 4.632606199770379, "grad_norm": 0.5531162619590759, "learning_rate": 2.82424734418324e-07, "loss": 0.6014, "step": 12105 }, { "epoch": 4.632988901645618, "grad_norm": 0.552596390247345, "learning_rate": 2.818400428565793e-07, "loss": 0.6841, "step": 12106 }, { "epoch": 4.633371603520858, "grad_norm": 0.5859269499778748, "learning_rate": 2.8125594850187177e-07, "loss": 0.6071, "step": 12107 }, { "epoch": 4.633754305396097, "grad_norm": 0.5526888370513916, "learning_rate": 2.806724513900949e-07, "loss": 0.5857, "step": 12108 }, { "epoch": 4.6341370072713355, "grad_norm": 0.5309843420982361, "learning_rate": 2.8008955155710784e-07, "loss": 0.6546, "step": 12109 }, { "epoch": 4.634519709146575, "grad_norm": 0.5587872266769409, "learning_rate": 2.795072490387296e-07, "loss": 0.643, "step": 12110 }, { "epoch": 4.634902411021814, "grad_norm": 0.49673834443092346, "learning_rate": 2.789255438707461e-07, "loss": 0.6424, "step": 12111 }, { "epoch": 4.635285112897053, "grad_norm": 0.5613114833831787, "learning_rate": 2.7834443608890314e-07, "loss": 0.6107, "step": 12112 }, { "epoch": 4.635667814772292, "grad_norm": 0.54083651304245, "learning_rate": 2.777639257289111e-07, "loss": 0.5706, "step": 12113 }, { "epoch": 4.636050516647532, "grad_norm": 0.5819313526153564, "learning_rate": 2.771840128264447e-07, "loss": 0.6923, "step": 12114 }, { "epoch": 4.636433218522771, "grad_norm": 0.5555955171585083, "learning_rate": 2.7660469741714104e-07, "loss": 0.5561, "step": 12115 }, { "epoch": 4.63681592039801, "grad_norm": 0.6207208633422852, "learning_rate": 2.760259795366005e-07, "loss": 0.6595, "step": 12116 }, { "epoch": 4.6371986222732495, "grad_norm": 0.5354348421096802, "learning_rate": 2.7544785922038575e-07, "loss": 0.6425, "step": 12117 }, { "epoch": 4.6375813241484884, "grad_norm": 0.5723737478256226, "learning_rate": 2.748703365040262e-07, "loss": 0.6056, "step": 12118 }, { "epoch": 4.637964026023727, "grad_norm": 0.5421286225318909, "learning_rate": 2.7429341142301115e-07, "loss": 0.6145, "step": 12119 }, { "epoch": 4.638346727898966, "grad_norm": 0.5648110508918762, "learning_rate": 2.737170840127934e-07, "loss": 0.6751, "step": 12120 }, { "epoch": 4.638729429774206, "grad_norm": 0.5835244059562683, "learning_rate": 2.7314135430879127e-07, "loss": 0.6192, "step": 12121 }, { "epoch": 4.639112131649445, "grad_norm": 0.5509203672409058, "learning_rate": 2.7256622234638207e-07, "loss": 0.618, "step": 12122 }, { "epoch": 4.639494833524684, "grad_norm": 0.5297933220863342, "learning_rate": 2.7199168816091416e-07, "loss": 0.5439, "step": 12123 }, { "epoch": 4.639877535399924, "grad_norm": 0.5558453798294067, "learning_rate": 2.7141775178768926e-07, "loss": 0.6352, "step": 12124 }, { "epoch": 4.640260237275163, "grad_norm": 0.5345826745033264, "learning_rate": 2.7084441326197917e-07, "loss": 0.6839, "step": 12125 }, { "epoch": 4.640642939150402, "grad_norm": 0.5302783846855164, "learning_rate": 2.702716726190169e-07, "loss": 0.6425, "step": 12126 }, { "epoch": 4.641025641025641, "grad_norm": 0.5248739719390869, "learning_rate": 2.6969952989399973e-07, "loss": 0.6381, "step": 12127 }, { "epoch": 4.64140834290088, "grad_norm": 0.5492274165153503, "learning_rate": 2.6912798512208627e-07, "loss": 0.5991, "step": 12128 }, { "epoch": 4.641791044776119, "grad_norm": 0.535831868648529, "learning_rate": 2.6855703833840064e-07, "loss": 0.6472, "step": 12129 }, { "epoch": 4.642173746651359, "grad_norm": 0.5562845468521118, "learning_rate": 2.679866895780292e-07, "loss": 0.6219, "step": 12130 }, { "epoch": 4.642556448526598, "grad_norm": 0.4990120232105255, "learning_rate": 2.674169388760195e-07, "loss": 0.5734, "step": 12131 }, { "epoch": 4.642939150401837, "grad_norm": 0.5390089750289917, "learning_rate": 2.6684778626738464e-07, "loss": 0.6735, "step": 12132 }, { "epoch": 4.643321852277076, "grad_norm": 0.528290331363678, "learning_rate": 2.6627923178710324e-07, "loss": 0.6301, "step": 12133 }, { "epoch": 4.643704554152316, "grad_norm": 0.5001811385154724, "learning_rate": 2.657112754701119e-07, "loss": 0.5834, "step": 12134 }, { "epoch": 4.644087256027555, "grad_norm": 0.5544610023498535, "learning_rate": 2.651439173513126e-07, "loss": 0.6499, "step": 12135 }, { "epoch": 4.6444699579027935, "grad_norm": 0.6440417170524597, "learning_rate": 2.645771574655742e-07, "loss": 0.5651, "step": 12136 }, { "epoch": 4.644852659778033, "grad_norm": 0.5331578254699707, "learning_rate": 2.6401099584772106e-07, "loss": 0.6934, "step": 12137 }, { "epoch": 4.645235361653272, "grad_norm": 0.5152967572212219, "learning_rate": 2.634454325325497e-07, "loss": 0.6433, "step": 12138 }, { "epoch": 4.645618063528511, "grad_norm": 0.540980339050293, "learning_rate": 2.6288046755481244e-07, "loss": 0.6942, "step": 12139 }, { "epoch": 4.64600076540375, "grad_norm": 0.6751289963722229, "learning_rate": 2.623161009492314e-07, "loss": 0.5952, "step": 12140 }, { "epoch": 4.64638346727899, "grad_norm": 0.5189618468284607, "learning_rate": 2.617523327504834e-07, "loss": 0.659, "step": 12141 }, { "epoch": 4.646766169154229, "grad_norm": 0.5659645795822144, "learning_rate": 2.611891629932184e-07, "loss": 0.6441, "step": 12142 }, { "epoch": 4.647148871029468, "grad_norm": 0.5201966762542725, "learning_rate": 2.606265917120421e-07, "loss": 0.6495, "step": 12143 }, { "epoch": 4.6475315729047075, "grad_norm": 0.5366488695144653, "learning_rate": 2.600646189415257e-07, "loss": 0.6326, "step": 12144 }, { "epoch": 4.6479142747799465, "grad_norm": 0.5597273707389832, "learning_rate": 2.595032447162049e-07, "loss": 0.5887, "step": 12145 }, { "epoch": 4.648296976655185, "grad_norm": 0.5634549856185913, "learning_rate": 2.589424690705777e-07, "loss": 0.6799, "step": 12146 }, { "epoch": 4.648679678530425, "grad_norm": 0.600168764591217, "learning_rate": 2.5838229203910435e-07, "loss": 0.5979, "step": 12147 }, { "epoch": 4.649062380405664, "grad_norm": 0.5231002569198608, "learning_rate": 2.5782271365621057e-07, "loss": 0.5989, "step": 12148 }, { "epoch": 4.649445082280903, "grad_norm": 0.5278729796409607, "learning_rate": 2.5726373395628337e-07, "loss": 0.5192, "step": 12149 }, { "epoch": 4.649827784156143, "grad_norm": 0.515262246131897, "learning_rate": 2.5670535297367295e-07, "loss": 0.5563, "step": 12150 }, { "epoch": 4.650210486031382, "grad_norm": 0.5837035179138184, "learning_rate": 2.5614757074269416e-07, "loss": 0.7212, "step": 12151 }, { "epoch": 4.650593187906621, "grad_norm": 0.5211502313613892, "learning_rate": 2.55590387297624e-07, "loss": 0.5747, "step": 12152 }, { "epoch": 4.65097588978186, "grad_norm": 0.5507612824440002, "learning_rate": 2.5503380267270393e-07, "loss": 0.6804, "step": 12153 }, { "epoch": 4.651358591657099, "grad_norm": 0.5627462863922119, "learning_rate": 2.5447781690213446e-07, "loss": 0.6073, "step": 12154 }, { "epoch": 4.651741293532338, "grad_norm": 0.5777851343154907, "learning_rate": 2.5392243002008486e-07, "loss": 0.6343, "step": 12155 }, { "epoch": 4.652123995407577, "grad_norm": 0.6098002195358276, "learning_rate": 2.5336764206068564e-07, "loss": 0.6301, "step": 12156 }, { "epoch": 4.652506697282817, "grad_norm": 0.5702993273735046, "learning_rate": 2.528134530580284e-07, "loss": 0.6673, "step": 12157 }, { "epoch": 4.652889399158056, "grad_norm": 0.5296881794929504, "learning_rate": 2.522598630461692e-07, "loss": 0.7064, "step": 12158 }, { "epoch": 4.653272101033295, "grad_norm": 0.5300864577293396, "learning_rate": 2.517068720591298e-07, "loss": 0.6152, "step": 12159 }, { "epoch": 4.653654802908534, "grad_norm": 0.5717763900756836, "learning_rate": 2.511544801308918e-07, "loss": 0.6605, "step": 12160 }, { "epoch": 4.654037504783774, "grad_norm": 0.5260769724845886, "learning_rate": 2.506026872954004e-07, "loss": 0.6472, "step": 12161 }, { "epoch": 4.654420206659013, "grad_norm": 0.5404273867607117, "learning_rate": 2.500514935865672e-07, "loss": 0.6285, "step": 12162 }, { "epoch": 4.6548029085342515, "grad_norm": 0.5184690952301025, "learning_rate": 2.495008990382619e-07, "loss": 0.6071, "step": 12163 }, { "epoch": 4.655185610409491, "grad_norm": 0.5238134264945984, "learning_rate": 2.4895090368432073e-07, "loss": 0.6446, "step": 12164 }, { "epoch": 4.65556831228473, "grad_norm": 0.5666056275367737, "learning_rate": 2.484015075585422e-07, "loss": 0.6029, "step": 12165 }, { "epoch": 4.655951014159969, "grad_norm": 0.5264044404029846, "learning_rate": 2.478527106946904e-07, "loss": 0.5685, "step": 12166 }, { "epoch": 4.656333716035209, "grad_norm": 0.5701244473457336, "learning_rate": 2.4730451312648617e-07, "loss": 0.6241, "step": 12167 }, { "epoch": 4.656716417910448, "grad_norm": 0.5199837684631348, "learning_rate": 2.4675691488762144e-07, "loss": 0.6202, "step": 12168 }, { "epoch": 4.657099119785687, "grad_norm": 0.5338907837867737, "learning_rate": 2.4620991601174596e-07, "loss": 0.6096, "step": 12169 }, { "epoch": 4.657481821660927, "grad_norm": 0.5147120952606201, "learning_rate": 2.45663516532475e-07, "loss": 0.5939, "step": 12170 }, { "epoch": 4.6578645235361655, "grad_norm": 0.5417539477348328, "learning_rate": 2.451177164833851e-07, "loss": 0.675, "step": 12171 }, { "epoch": 4.6582472254114045, "grad_norm": 0.5423441529273987, "learning_rate": 2.4457251589801945e-07, "loss": 0.6711, "step": 12172 }, { "epoch": 4.658629927286643, "grad_norm": 0.5732550621032715, "learning_rate": 2.4402791480987896e-07, "loss": 0.6105, "step": 12173 }, { "epoch": 4.659012629161883, "grad_norm": 0.5450476408004761, "learning_rate": 2.4348391325243245e-07, "loss": 0.6672, "step": 12174 }, { "epoch": 4.659395331037122, "grad_norm": 0.5805870890617371, "learning_rate": 2.429405112591121e-07, "loss": 0.6576, "step": 12175 }, { "epoch": 4.659778032912361, "grad_norm": 0.5532625913619995, "learning_rate": 2.4239770886330784e-07, "loss": 0.6283, "step": 12176 }, { "epoch": 4.660160734787601, "grad_norm": 0.5006528496742249, "learning_rate": 2.418555060983785e-07, "loss": 0.5757, "step": 12177 }, { "epoch": 4.66054343666284, "grad_norm": 0.550295352935791, "learning_rate": 2.4131390299764303e-07, "loss": 0.6033, "step": 12178 }, { "epoch": 4.660926138538079, "grad_norm": 0.5456677079200745, "learning_rate": 2.4077289959438476e-07, "loss": 0.6388, "step": 12179 }, { "epoch": 4.661308840413318, "grad_norm": 0.6050501465797424, "learning_rate": 2.402324959218505e-07, "loss": 0.5818, "step": 12180 }, { "epoch": 4.661691542288557, "grad_norm": 0.5306825637817383, "learning_rate": 2.396926920132481e-07, "loss": 0.6259, "step": 12181 }, { "epoch": 4.662074244163796, "grad_norm": 0.5216883420944214, "learning_rate": 2.391534879017521e-07, "loss": 0.5822, "step": 12182 }, { "epoch": 4.662456946039035, "grad_norm": 0.5534911751747131, "learning_rate": 2.38614883620496e-07, "loss": 0.607, "step": 12183 }, { "epoch": 4.662839647914275, "grad_norm": 0.5095000863075256, "learning_rate": 2.3807687920257895e-07, "loss": 0.5843, "step": 12184 }, { "epoch": 4.663222349789514, "grad_norm": 0.5816523432731628, "learning_rate": 2.3753947468106443e-07, "loss": 0.6781, "step": 12185 }, { "epoch": 4.663605051664753, "grad_norm": 0.5393242835998535, "learning_rate": 2.370026700889738e-07, "loss": 0.5869, "step": 12186 }, { "epoch": 4.663987753539993, "grad_norm": 0.5112661123275757, "learning_rate": 2.3646646545929852e-07, "loss": 0.5184, "step": 12187 }, { "epoch": 4.664370455415232, "grad_norm": 0.5519946813583374, "learning_rate": 2.3593086082498772e-07, "loss": 0.5928, "step": 12188 }, { "epoch": 4.664753157290471, "grad_norm": 0.573910653591156, "learning_rate": 2.3539585621895733e-07, "loss": 0.6943, "step": 12189 }, { "epoch": 4.66513585916571, "grad_norm": 0.5603654384613037, "learning_rate": 2.3486145167408324e-07, "loss": 0.6578, "step": 12190 }, { "epoch": 4.665518561040949, "grad_norm": 0.5642079710960388, "learning_rate": 2.34327647223207e-07, "loss": 0.6305, "step": 12191 }, { "epoch": 4.665901262916188, "grad_norm": 0.591157853603363, "learning_rate": 2.3379444289913344e-07, "loss": 0.6163, "step": 12192 }, { "epoch": 4.666283964791427, "grad_norm": 0.530184268951416, "learning_rate": 2.3326183873462748e-07, "loss": 0.614, "step": 12193 }, { "epoch": 4.666666666666667, "grad_norm": 0.5621085166931152, "learning_rate": 2.3272983476241962e-07, "loss": 0.632, "step": 12194 }, { "epoch": 4.667049368541906, "grad_norm": 0.5393961071968079, "learning_rate": 2.3219843101520369e-07, "loss": 0.616, "step": 12195 }, { "epoch": 4.667432070417145, "grad_norm": 0.5104389190673828, "learning_rate": 2.3166762752563466e-07, "loss": 0.6256, "step": 12196 }, { "epoch": 4.667814772292385, "grad_norm": 0.547243595123291, "learning_rate": 2.31137424326332e-07, "loss": 0.6495, "step": 12197 }, { "epoch": 4.6681974741676235, "grad_norm": 0.7355848550796509, "learning_rate": 2.306078214498797e-07, "loss": 0.618, "step": 12198 }, { "epoch": 4.6685801760428625, "grad_norm": 0.5285188555717468, "learning_rate": 2.3007881892882055e-07, "loss": 0.6128, "step": 12199 }, { "epoch": 4.668962877918101, "grad_norm": 0.4993334114551544, "learning_rate": 2.2955041679566637e-07, "loss": 0.623, "step": 12200 }, { "epoch": 4.669345579793341, "grad_norm": 0.5561435222625732, "learning_rate": 2.290226150828867e-07, "loss": 0.5887, "step": 12201 }, { "epoch": 4.66972828166858, "grad_norm": 0.6665411591529846, "learning_rate": 2.2849541382291673e-07, "loss": 0.5637, "step": 12202 }, { "epoch": 4.670110983543819, "grad_norm": 0.5467920899391174, "learning_rate": 2.27968813048155e-07, "loss": 0.5761, "step": 12203 }, { "epoch": 4.670493685419059, "grad_norm": 0.5592683553695679, "learning_rate": 2.2744281279096226e-07, "loss": 0.5936, "step": 12204 }, { "epoch": 4.670876387294298, "grad_norm": 0.5956951379776001, "learning_rate": 2.2691741308366267e-07, "loss": 0.6823, "step": 12205 }, { "epoch": 4.671259089169537, "grad_norm": 0.5242606997489929, "learning_rate": 2.2639261395854262e-07, "loss": 0.5942, "step": 12206 }, { "epoch": 4.6716417910447765, "grad_norm": 0.5101072192192078, "learning_rate": 2.258684154478541e-07, "loss": 0.5991, "step": 12207 }, { "epoch": 4.672024492920015, "grad_norm": 0.5480127930641174, "learning_rate": 2.25344817583808e-07, "loss": 0.6532, "step": 12208 }, { "epoch": 4.672407194795254, "grad_norm": 0.5097024440765381, "learning_rate": 2.2482182039858304e-07, "loss": 0.6464, "step": 12209 }, { "epoch": 4.672789896670494, "grad_norm": 0.5445371866226196, "learning_rate": 2.2429942392431903e-07, "loss": 0.6166, "step": 12210 }, { "epoch": 4.673172598545733, "grad_norm": 0.5334567427635193, "learning_rate": 2.2377762819311698e-07, "loss": 0.6278, "step": 12211 }, { "epoch": 4.673555300420972, "grad_norm": 0.5408106446266174, "learning_rate": 2.232564332370435e-07, "loss": 0.6809, "step": 12212 }, { "epoch": 4.673938002296211, "grad_norm": 0.6027951240539551, "learning_rate": 2.2273583908812736e-07, "loss": 0.6202, "step": 12213 }, { "epoch": 4.674320704171451, "grad_norm": 0.5576539039611816, "learning_rate": 2.2221584577836187e-07, "loss": 0.6531, "step": 12214 }, { "epoch": 4.67470340604669, "grad_norm": 0.5825307965278625, "learning_rate": 2.2169645333969815e-07, "loss": 0.6263, "step": 12215 }, { "epoch": 4.675086107921929, "grad_norm": 0.5445646047592163, "learning_rate": 2.211776618040573e-07, "loss": 0.6231, "step": 12216 }, { "epoch": 4.675468809797168, "grad_norm": 0.5370567440986633, "learning_rate": 2.2065947120332055e-07, "loss": 0.5743, "step": 12217 }, { "epoch": 4.675851511672407, "grad_norm": 0.5409052968025208, "learning_rate": 2.2014188156933124e-07, "loss": 0.5711, "step": 12218 }, { "epoch": 4.676234213547646, "grad_norm": 0.5425118207931519, "learning_rate": 2.1962489293389622e-07, "loss": 0.677, "step": 12219 }, { "epoch": 4.676616915422885, "grad_norm": 0.5519376397132874, "learning_rate": 2.191085053287867e-07, "loss": 0.5928, "step": 12220 }, { "epoch": 4.676999617298125, "grad_norm": 0.5404468774795532, "learning_rate": 2.1859271878573618e-07, "loss": 0.6163, "step": 12221 }, { "epoch": 4.677382319173364, "grad_norm": 0.517729640007019, "learning_rate": 2.1807753333644043e-07, "loss": 0.5814, "step": 12222 }, { "epoch": 4.677765021048603, "grad_norm": 0.5408376455307007, "learning_rate": 2.175629490125586e-07, "loss": 0.5789, "step": 12223 }, { "epoch": 4.678147722923843, "grad_norm": 0.5361735820770264, "learning_rate": 2.1704896584571534e-07, "loss": 0.6141, "step": 12224 }, { "epoch": 4.6785304247990815, "grad_norm": 0.5240185856819153, "learning_rate": 2.165355838674943e-07, "loss": 0.5916, "step": 12225 }, { "epoch": 4.6789131266743205, "grad_norm": 0.5329815745353699, "learning_rate": 2.1602280310944468e-07, "loss": 0.5567, "step": 12226 }, { "epoch": 4.67929582854956, "grad_norm": 0.5459856986999512, "learning_rate": 2.1551062360307794e-07, "loss": 0.6142, "step": 12227 }, { "epoch": 4.679678530424799, "grad_norm": 0.5287842154502869, "learning_rate": 2.1499904537987004e-07, "loss": 0.5524, "step": 12228 }, { "epoch": 4.680061232300038, "grad_norm": 0.5809191465377808, "learning_rate": 2.1448806847125802e-07, "loss": 0.6007, "step": 12229 }, { "epoch": 4.680443934175278, "grad_norm": 0.5374624729156494, "learning_rate": 2.1397769290864235e-07, "loss": 0.6319, "step": 12230 }, { "epoch": 4.680826636050517, "grad_norm": 0.5509041547775269, "learning_rate": 2.1346791872338902e-07, "loss": 0.5302, "step": 12231 }, { "epoch": 4.681209337925756, "grad_norm": 0.5551190972328186, "learning_rate": 2.1295874594682187e-07, "loss": 0.5657, "step": 12232 }, { "epoch": 4.681592039800995, "grad_norm": 0.5545428991317749, "learning_rate": 2.1245017461023255e-07, "loss": 0.6238, "step": 12233 }, { "epoch": 4.6819747416762345, "grad_norm": 0.5416950583457947, "learning_rate": 2.1194220474487604e-07, "loss": 0.6175, "step": 12234 }, { "epoch": 4.682357443551473, "grad_norm": 0.5872295498847961, "learning_rate": 2.1143483638196517e-07, "loss": 0.7102, "step": 12235 }, { "epoch": 4.682740145426712, "grad_norm": 0.5540584921836853, "learning_rate": 2.1092806955267942e-07, "loss": 0.6182, "step": 12236 }, { "epoch": 4.683122847301952, "grad_norm": 0.5517016649246216, "learning_rate": 2.104219042881639e-07, "loss": 0.6721, "step": 12237 }, { "epoch": 4.683505549177191, "grad_norm": 0.5276760458946228, "learning_rate": 2.0991634061952038e-07, "loss": 0.6166, "step": 12238 }, { "epoch": 4.68388825105243, "grad_norm": 0.5375874042510986, "learning_rate": 2.0941137857781956e-07, "loss": 0.6207, "step": 12239 }, { "epoch": 4.684270952927669, "grad_norm": 0.5658925771713257, "learning_rate": 2.089070181940922e-07, "loss": 0.7145, "step": 12240 }, { "epoch": 4.684653654802909, "grad_norm": 0.5853984355926514, "learning_rate": 2.0840325949933238e-07, "loss": 0.6861, "step": 12241 }, { "epoch": 4.685036356678148, "grad_norm": 0.5139490365982056, "learning_rate": 2.0790010252449643e-07, "loss": 0.5665, "step": 12242 }, { "epoch": 4.685419058553387, "grad_norm": 0.5683445334434509, "learning_rate": 2.073975473005052e-07, "loss": 0.5712, "step": 12243 }, { "epoch": 4.685801760428626, "grad_norm": 0.5376088619232178, "learning_rate": 2.06895593858244e-07, "loss": 0.6523, "step": 12244 }, { "epoch": 4.686184462303865, "grad_norm": 0.5738924145698547, "learning_rate": 2.0639424222855698e-07, "loss": 0.6548, "step": 12245 }, { "epoch": 4.686567164179104, "grad_norm": 0.5707527995109558, "learning_rate": 2.0589349244225288e-07, "loss": 0.6632, "step": 12246 }, { "epoch": 4.686949866054344, "grad_norm": 0.5383082032203674, "learning_rate": 2.0539334453010707e-07, "loss": 0.5487, "step": 12247 }, { "epoch": 4.687332567929583, "grad_norm": 0.5233001112937927, "learning_rate": 2.0489379852285162e-07, "loss": 0.584, "step": 12248 }, { "epoch": 4.687715269804822, "grad_norm": 0.5507248044013977, "learning_rate": 2.0439485445118867e-07, "loss": 0.5714, "step": 12249 }, { "epoch": 4.688097971680062, "grad_norm": 0.5800592303276062, "learning_rate": 2.03896512345777e-07, "loss": 0.6338, "step": 12250 }, { "epoch": 4.688480673555301, "grad_norm": 0.5529888272285461, "learning_rate": 2.0339877223724215e-07, "loss": 0.6502, "step": 12251 }, { "epoch": 4.6888633754305395, "grad_norm": 0.5427209138870239, "learning_rate": 2.029016341561696e-07, "loss": 0.624, "step": 12252 }, { "epoch": 4.6892460773057785, "grad_norm": 0.5607098340988159, "learning_rate": 2.0240509813311382e-07, "loss": 0.6358, "step": 12253 }, { "epoch": 4.689628779181018, "grad_norm": 0.492958128452301, "learning_rate": 2.0190916419858486e-07, "loss": 0.5401, "step": 12254 }, { "epoch": 4.690011481056257, "grad_norm": 0.5382902026176453, "learning_rate": 2.0141383238305946e-07, "loss": 0.5654, "step": 12255 }, { "epoch": 4.690394182931496, "grad_norm": 0.5542005300521851, "learning_rate": 2.0091910271697768e-07, "loss": 0.5898, "step": 12256 }, { "epoch": 4.690776884806736, "grad_norm": 0.5557591915130615, "learning_rate": 2.0042497523074412e-07, "loss": 0.5427, "step": 12257 }, { "epoch": 4.691159586681975, "grad_norm": 0.5776146650314331, "learning_rate": 1.9993144995472004e-07, "loss": 0.6153, "step": 12258 }, { "epoch": 4.691542288557214, "grad_norm": 0.5086137056350708, "learning_rate": 1.994385269192367e-07, "loss": 0.5652, "step": 12259 }, { "epoch": 4.691924990432453, "grad_norm": 0.5292207598686218, "learning_rate": 1.989462061545866e-07, "loss": 0.5904, "step": 12260 }, { "epoch": 4.6923076923076925, "grad_norm": 0.5881882309913635, "learning_rate": 1.9845448769102105e-07, "loss": 0.6411, "step": 12261 }, { "epoch": 4.692690394182931, "grad_norm": 0.5676940083503723, "learning_rate": 1.979633715587592e-07, "loss": 0.626, "step": 12262 }, { "epoch": 4.69307309605817, "grad_norm": 0.5335917472839355, "learning_rate": 1.9747285778798254e-07, "loss": 0.5896, "step": 12263 }, { "epoch": 4.69345579793341, "grad_norm": 0.5735641121864319, "learning_rate": 1.969829464088313e-07, "loss": 0.6678, "step": 12264 }, { "epoch": 4.693838499808649, "grad_norm": 0.4935328960418701, "learning_rate": 1.9649363745141482e-07, "loss": 0.5973, "step": 12265 }, { "epoch": 4.694221201683888, "grad_norm": 0.5520224571228027, "learning_rate": 1.960049309458012e-07, "loss": 0.5525, "step": 12266 }, { "epoch": 4.694603903559128, "grad_norm": 0.523585855960846, "learning_rate": 1.95516826922022e-07, "loss": 0.6064, "step": 12267 }, { "epoch": 4.694986605434367, "grad_norm": 0.5020290017127991, "learning_rate": 1.9502932541007436e-07, "loss": 0.5771, "step": 12268 }, { "epoch": 4.695369307309606, "grad_norm": 0.5475844144821167, "learning_rate": 1.9454242643991427e-07, "loss": 0.5957, "step": 12269 }, { "epoch": 4.6957520091848455, "grad_norm": 0.8128412365913391, "learning_rate": 1.9405613004146562e-07, "loss": 0.6203, "step": 12270 }, { "epoch": 4.696134711060084, "grad_norm": 0.5584147572517395, "learning_rate": 1.9357043624461115e-07, "loss": 0.5135, "step": 12271 }, { "epoch": 4.696517412935323, "grad_norm": 0.48561710119247437, "learning_rate": 1.9308534507919808e-07, "loss": 0.524, "step": 12272 }, { "epoch": 4.696900114810562, "grad_norm": 0.5346113443374634, "learning_rate": 1.9260085657503812e-07, "loss": 0.6103, "step": 12273 }, { "epoch": 4.697282816685802, "grad_norm": 0.5679559707641602, "learning_rate": 1.9211697076190084e-07, "loss": 0.5839, "step": 12274 }, { "epoch": 4.697665518561041, "grad_norm": 0.5182730555534363, "learning_rate": 1.9163368766952572e-07, "loss": 0.6865, "step": 12275 }, { "epoch": 4.69804822043628, "grad_norm": 0.5481812953948975, "learning_rate": 1.9115100732761126e-07, "loss": 0.7017, "step": 12276 }, { "epoch": 4.69843092231152, "grad_norm": 0.49258309602737427, "learning_rate": 1.9066892976581708e-07, "loss": 0.5926, "step": 12277 }, { "epoch": 4.698813624186759, "grad_norm": 0.5630466341972351, "learning_rate": 1.9018745501377055e-07, "loss": 0.68, "step": 12278 }, { "epoch": 4.6991963260619976, "grad_norm": 0.5515807271003723, "learning_rate": 1.8970658310105917e-07, "loss": 0.6415, "step": 12279 }, { "epoch": 4.6995790279372365, "grad_norm": 0.5244725942611694, "learning_rate": 1.8922631405723256e-07, "loss": 0.6704, "step": 12280 }, { "epoch": 4.699961729812476, "grad_norm": 0.5935258269309998, "learning_rate": 1.887466479118061e-07, "loss": 0.6099, "step": 12281 }, { "epoch": 4.700344431687715, "grad_norm": 0.5143455862998962, "learning_rate": 1.88267584694255e-07, "loss": 0.5373, "step": 12282 }, { "epoch": 4.700727133562954, "grad_norm": 0.575991690158844, "learning_rate": 1.8778912443402242e-07, "loss": 0.6434, "step": 12283 }, { "epoch": 4.701109835438194, "grad_norm": 0.4900639057159424, "learning_rate": 1.8731126716050707e-07, "loss": 0.552, "step": 12284 }, { "epoch": 4.701492537313433, "grad_norm": 0.5109453201293945, "learning_rate": 1.8683401290307546e-07, "loss": 0.545, "step": 12285 }, { "epoch": 4.701875239188672, "grad_norm": 0.5417972207069397, "learning_rate": 1.8635736169105746e-07, "loss": 0.5924, "step": 12286 }, { "epoch": 4.702257941063912, "grad_norm": 0.537179172039032, "learning_rate": 1.8588131355374405e-07, "loss": 0.6385, "step": 12287 }, { "epoch": 4.7026406429391505, "grad_norm": 0.5276868939399719, "learning_rate": 1.8540586852038855e-07, "loss": 0.5923, "step": 12288 }, { "epoch": 4.703023344814389, "grad_norm": 0.561937689781189, "learning_rate": 1.849310266202098e-07, "loss": 0.6499, "step": 12289 }, { "epoch": 4.703406046689629, "grad_norm": 0.5140676498413086, "learning_rate": 1.8445678788238775e-07, "loss": 0.586, "step": 12290 }, { "epoch": 4.703788748564868, "grad_norm": 0.5876363515853882, "learning_rate": 1.8398315233606467e-07, "loss": 0.655, "step": 12291 }, { "epoch": 4.704171450440107, "grad_norm": 0.5346970558166504, "learning_rate": 1.835101200103484e-07, "loss": 0.6189, "step": 12292 }, { "epoch": 4.704554152315346, "grad_norm": 0.5227566957473755, "learning_rate": 1.830376909343068e-07, "loss": 0.5417, "step": 12293 }, { "epoch": 4.704936854190586, "grad_norm": 0.5326010584831238, "learning_rate": 1.8256586513697217e-07, "loss": 0.6475, "step": 12294 }, { "epoch": 4.705319556065825, "grad_norm": 0.5429602265357971, "learning_rate": 1.8209464264733912e-07, "loss": 0.6259, "step": 12295 }, { "epoch": 4.705702257941064, "grad_norm": 0.5180036425590515, "learning_rate": 1.816240234943678e-07, "loss": 0.5806, "step": 12296 }, { "epoch": 4.7060849598163035, "grad_norm": 0.5889097452163696, "learning_rate": 1.811540077069751e-07, "loss": 0.5369, "step": 12297 }, { "epoch": 4.706467661691542, "grad_norm": 0.7053231596946716, "learning_rate": 1.8068459531404791e-07, "loss": 0.6691, "step": 12298 }, { "epoch": 4.706850363566781, "grad_norm": 0.5363121032714844, "learning_rate": 1.8021578634443093e-07, "loss": 0.5475, "step": 12299 }, { "epoch": 4.70723306544202, "grad_norm": 0.5440658926963806, "learning_rate": 1.7974758082693554e-07, "loss": 0.5358, "step": 12300 }, { "epoch": 4.70761576731726, "grad_norm": 0.5337477326393127, "learning_rate": 1.7927997879033209e-07, "loss": 0.6127, "step": 12301 }, { "epoch": 4.707998469192499, "grad_norm": 0.556600034236908, "learning_rate": 1.788129802633598e-07, "loss": 0.5933, "step": 12302 }, { "epoch": 4.708381171067738, "grad_norm": 0.5775076150894165, "learning_rate": 1.7834658527471238e-07, "loss": 0.6246, "step": 12303 }, { "epoch": 4.708763872942978, "grad_norm": 0.5684707760810852, "learning_rate": 1.7788079385305246e-07, "loss": 0.6153, "step": 12304 }, { "epoch": 4.709146574818217, "grad_norm": 0.5496896505355835, "learning_rate": 1.7741560602700715e-07, "loss": 0.6611, "step": 12305 }, { "epoch": 4.709529276693456, "grad_norm": 0.5491759181022644, "learning_rate": 1.7695102182515912e-07, "loss": 0.5764, "step": 12306 }, { "epoch": 4.709911978568695, "grad_norm": 0.6163854002952576, "learning_rate": 1.764870412760611e-07, "loss": 0.6664, "step": 12307 }, { "epoch": 4.710294680443934, "grad_norm": 0.5348997712135315, "learning_rate": 1.760236644082247e-07, "loss": 0.6549, "step": 12308 }, { "epoch": 4.710677382319173, "grad_norm": 0.5657865405082703, "learning_rate": 1.755608912501261e-07, "loss": 0.6346, "step": 12309 }, { "epoch": 4.711060084194413, "grad_norm": 0.5659242868423462, "learning_rate": 1.750987218302047e-07, "loss": 0.6211, "step": 12310 }, { "epoch": 4.711442786069652, "grad_norm": 0.5301674008369446, "learning_rate": 1.7463715617686004e-07, "loss": 0.6202, "step": 12311 }, { "epoch": 4.711825487944891, "grad_norm": 0.4863542914390564, "learning_rate": 1.7417619431845945e-07, "loss": 0.5917, "step": 12312 }, { "epoch": 4.71220818982013, "grad_norm": 0.5478348731994629, "learning_rate": 1.73715836283328e-07, "loss": 0.6543, "step": 12313 }, { "epoch": 4.71259089169537, "grad_norm": 0.5598545670509338, "learning_rate": 1.7325608209975643e-07, "loss": 0.7027, "step": 12314 }, { "epoch": 4.7129735935706085, "grad_norm": 0.5214572548866272, "learning_rate": 1.7279693179599876e-07, "loss": 0.5993, "step": 12315 }, { "epoch": 4.713356295445847, "grad_norm": 0.5574260354042053, "learning_rate": 1.7233838540027025e-07, "loss": 0.6271, "step": 12316 }, { "epoch": 4.713738997321087, "grad_norm": 0.5440186858177185, "learning_rate": 1.7188044294074946e-07, "loss": 0.6443, "step": 12317 }, { "epoch": 4.714121699196326, "grad_norm": 0.5279264450073242, "learning_rate": 1.714231044455783e-07, "loss": 0.5875, "step": 12318 }, { "epoch": 4.714504401071565, "grad_norm": 0.5154576897621155, "learning_rate": 1.7096636994286208e-07, "loss": 0.5998, "step": 12319 }, { "epoch": 4.714887102946804, "grad_norm": 0.5074495077133179, "learning_rate": 1.7051023946066724e-07, "loss": 0.6003, "step": 12320 }, { "epoch": 4.715269804822044, "grad_norm": 0.5475678443908691, "learning_rate": 1.7005471302702582e-07, "loss": 0.6528, "step": 12321 }, { "epoch": 4.715652506697283, "grad_norm": 0.5709872245788574, "learning_rate": 1.6959979066993092e-07, "loss": 0.5804, "step": 12322 }, { "epoch": 4.716035208572522, "grad_norm": 0.524621844291687, "learning_rate": 1.691454724173369e-07, "loss": 0.6581, "step": 12323 }, { "epoch": 4.7164179104477615, "grad_norm": 0.5109123587608337, "learning_rate": 1.686917582971648e-07, "loss": 0.7172, "step": 12324 }, { "epoch": 4.716800612323, "grad_norm": 0.5489553213119507, "learning_rate": 1.6823864833729664e-07, "loss": 0.5488, "step": 12325 }, { "epoch": 4.717183314198239, "grad_norm": 0.5203409790992737, "learning_rate": 1.6778614256557468e-07, "loss": 0.63, "step": 12326 }, { "epoch": 4.717566016073479, "grad_norm": 0.5134336352348328, "learning_rate": 1.6733424100981e-07, "loss": 0.5978, "step": 12327 }, { "epoch": 4.717948717948718, "grad_norm": 0.5344496965408325, "learning_rate": 1.6688294369777037e-07, "loss": 0.6098, "step": 12328 }, { "epoch": 4.718331419823957, "grad_norm": 0.5475597381591797, "learning_rate": 1.6643225065719026e-07, "loss": 0.5901, "step": 12329 }, { "epoch": 4.718714121699197, "grad_norm": 0.5687548518180847, "learning_rate": 1.6598216191576643e-07, "loss": 0.5577, "step": 12330 }, { "epoch": 4.719096823574436, "grad_norm": 0.5156930088996887, "learning_rate": 1.6553267750115898e-07, "loss": 0.5962, "step": 12331 }, { "epoch": 4.719479525449675, "grad_norm": 0.5635525584220886, "learning_rate": 1.65083797440988e-07, "loss": 0.6229, "step": 12332 }, { "epoch": 4.719862227324914, "grad_norm": 0.5495395064353943, "learning_rate": 1.6463552176283814e-07, "loss": 0.6094, "step": 12333 }, { "epoch": 4.720244929200153, "grad_norm": 0.5753505229949951, "learning_rate": 1.6418785049425955e-07, "loss": 0.7583, "step": 12334 }, { "epoch": 4.720627631075392, "grad_norm": 0.5970518589019775, "learning_rate": 1.6374078366276026e-07, "loss": 0.5738, "step": 12335 }, { "epoch": 4.721010332950631, "grad_norm": 0.5299988985061646, "learning_rate": 1.632943212958149e-07, "loss": 0.595, "step": 12336 }, { "epoch": 4.721393034825871, "grad_norm": 0.5373620390892029, "learning_rate": 1.6284846342086046e-07, "loss": 0.5984, "step": 12337 }, { "epoch": 4.72177573670111, "grad_norm": 0.5727149248123169, "learning_rate": 1.62403210065295e-07, "loss": 0.613, "step": 12338 }, { "epoch": 4.722158438576349, "grad_norm": 0.5601971745491028, "learning_rate": 1.6195856125648e-07, "loss": 0.5762, "step": 12339 }, { "epoch": 4.722541140451588, "grad_norm": 0.8122668862342834, "learning_rate": 1.6151451702174137e-07, "loss": 0.6245, "step": 12340 }, { "epoch": 4.722923842326828, "grad_norm": 0.589588463306427, "learning_rate": 1.6107107738836835e-07, "loss": 0.627, "step": 12341 }, { "epoch": 4.7233065442020665, "grad_norm": 0.5396147966384888, "learning_rate": 1.6062824238360807e-07, "loss": 0.6084, "step": 12342 }, { "epoch": 4.7236892460773054, "grad_norm": 0.5597115755081177, "learning_rate": 1.6018601203467654e-07, "loss": 0.6704, "step": 12343 }, { "epoch": 4.724071947952545, "grad_norm": 0.6651238203048706, "learning_rate": 1.5974438636874979e-07, "loss": 0.651, "step": 12344 }, { "epoch": 4.724454649827784, "grad_norm": 0.5403446555137634, "learning_rate": 1.59303365412965e-07, "loss": 0.6033, "step": 12345 }, { "epoch": 4.724837351703023, "grad_norm": 0.5372081995010376, "learning_rate": 1.5886294919442602e-07, "loss": 0.5804, "step": 12346 }, { "epoch": 4.725220053578263, "grad_norm": 0.553449809551239, "learning_rate": 1.5842313774019568e-07, "loss": 0.6384, "step": 12347 }, { "epoch": 4.725602755453502, "grad_norm": 0.5003529191017151, "learning_rate": 1.5798393107730346e-07, "loss": 0.5931, "step": 12348 }, { "epoch": 4.725985457328741, "grad_norm": 0.4898752272129059, "learning_rate": 1.5754532923274e-07, "loss": 0.595, "step": 12349 }, { "epoch": 4.726368159203981, "grad_norm": 0.5480614900588989, "learning_rate": 1.5710733223345598e-07, "loss": 0.5976, "step": 12350 }, { "epoch": 4.7267508610792195, "grad_norm": 0.5280140042304993, "learning_rate": 1.5666994010637205e-07, "loss": 0.5837, "step": 12351 }, { "epoch": 4.727133562954458, "grad_norm": 0.5766247510910034, "learning_rate": 1.5623315287836228e-07, "loss": 0.6011, "step": 12352 }, { "epoch": 4.727516264829697, "grad_norm": 0.5661806464195251, "learning_rate": 1.5579697057627074e-07, "loss": 0.5666, "step": 12353 }, { "epoch": 4.727898966704937, "grad_norm": 0.5591440796852112, "learning_rate": 1.5536139322690158e-07, "loss": 0.6597, "step": 12354 }, { "epoch": 4.728281668580176, "grad_norm": 0.5347779393196106, "learning_rate": 1.5492642085702225e-07, "loss": 0.5522, "step": 12355 }, { "epoch": 4.728664370455415, "grad_norm": 0.5605959892272949, "learning_rate": 1.5449205349336362e-07, "loss": 0.6019, "step": 12356 }, { "epoch": 4.729047072330655, "grad_norm": 0.5320698618888855, "learning_rate": 1.5405829116261872e-07, "loss": 0.7002, "step": 12357 }, { "epoch": 4.729429774205894, "grad_norm": 0.5651136040687561, "learning_rate": 1.5362513389144075e-07, "loss": 0.5881, "step": 12358 }, { "epoch": 4.729812476081133, "grad_norm": 0.5332383513450623, "learning_rate": 1.5319258170645169e-07, "loss": 0.6388, "step": 12359 }, { "epoch": 4.730195177956372, "grad_norm": 0.5390496253967285, "learning_rate": 1.527606346342325e-07, "loss": 0.5999, "step": 12360 }, { "epoch": 4.730577879831611, "grad_norm": 0.5185132622718811, "learning_rate": 1.5232929270132535e-07, "loss": 0.5046, "step": 12361 }, { "epoch": 4.73096058170685, "grad_norm": 0.5288156867027283, "learning_rate": 1.5189855593424007e-07, "loss": 0.5912, "step": 12362 }, { "epoch": 4.731343283582089, "grad_norm": 0.5748713612556458, "learning_rate": 1.5146842435944443e-07, "loss": 0.5945, "step": 12363 }, { "epoch": 4.731725985457329, "grad_norm": 0.4885762333869934, "learning_rate": 1.5103889800337279e-07, "loss": 0.5685, "step": 12364 }, { "epoch": 4.732108687332568, "grad_norm": 0.5419812202453613, "learning_rate": 1.5060997689241964e-07, "loss": 0.6516, "step": 12365 }, { "epoch": 4.732491389207807, "grad_norm": 0.5567833185195923, "learning_rate": 1.5018166105294385e-07, "loss": 0.655, "step": 12366 }, { "epoch": 4.732874091083047, "grad_norm": 0.559183657169342, "learning_rate": 1.4975395051126772e-07, "loss": 0.6304, "step": 12367 }, { "epoch": 4.733256792958286, "grad_norm": 0.6859309673309326, "learning_rate": 1.4932684529367248e-07, "loss": 0.6886, "step": 12368 }, { "epoch": 4.7336394948335245, "grad_norm": 0.5267247557640076, "learning_rate": 1.4890034542640485e-07, "loss": 0.6429, "step": 12369 }, { "epoch": 4.734022196708764, "grad_norm": 0.5952823162078857, "learning_rate": 1.4847445093567836e-07, "loss": 0.6567, "step": 12370 }, { "epoch": 4.734404898584003, "grad_norm": 0.551049530506134, "learning_rate": 1.4804916184766206e-07, "loss": 0.6174, "step": 12371 }, { "epoch": 4.734787600459242, "grad_norm": 0.5461760759353638, "learning_rate": 1.4762447818849168e-07, "loss": 0.6426, "step": 12372 }, { "epoch": 4.735170302334481, "grad_norm": 0.5385497808456421, "learning_rate": 1.4720039998426638e-07, "loss": 0.5265, "step": 12373 }, { "epoch": 4.735553004209721, "grad_norm": 0.6117137670516968, "learning_rate": 1.467769272610453e-07, "loss": 0.6595, "step": 12374 }, { "epoch": 4.73593570608496, "grad_norm": 0.5428680777549744, "learning_rate": 1.4635406004485208e-07, "loss": 0.5961, "step": 12375 }, { "epoch": 4.736318407960199, "grad_norm": 0.5971164107322693, "learning_rate": 1.459317983616737e-07, "loss": 0.6133, "step": 12376 }, { "epoch": 4.736701109835439, "grad_norm": 0.5299938321113586, "learning_rate": 1.4551014223746052e-07, "loss": 0.5891, "step": 12377 }, { "epoch": 4.7370838117106775, "grad_norm": 0.5880628824234009, "learning_rate": 1.450890916981218e-07, "loss": 0.6469, "step": 12378 }, { "epoch": 4.737466513585916, "grad_norm": 0.5262853503227234, "learning_rate": 1.446686467695324e-07, "loss": 0.5804, "step": 12379 }, { "epoch": 4.737849215461155, "grad_norm": 0.507086992263794, "learning_rate": 1.4424880747753277e-07, "loss": 0.561, "step": 12380 }, { "epoch": 4.738231917336395, "grad_norm": 0.5478930473327637, "learning_rate": 1.4382957384792116e-07, "loss": 0.6128, "step": 12381 }, { "epoch": 4.738614619211634, "grad_norm": 0.49974527955055237, "learning_rate": 1.4341094590646033e-07, "loss": 0.6162, "step": 12382 }, { "epoch": 4.738997321086873, "grad_norm": 0.5995707511901855, "learning_rate": 1.429929236788763e-07, "loss": 0.6679, "step": 12383 }, { "epoch": 4.739380022962113, "grad_norm": 0.5055043697357178, "learning_rate": 1.4257550719085855e-07, "loss": 0.5714, "step": 12384 }, { "epoch": 4.739762724837352, "grad_norm": 0.6026327013969421, "learning_rate": 1.4215869646805657e-07, "loss": 0.6211, "step": 12385 }, { "epoch": 4.740145426712591, "grad_norm": 0.5723575353622437, "learning_rate": 1.4174249153608766e-07, "loss": 0.5813, "step": 12386 }, { "epoch": 4.7405281285878305, "grad_norm": 0.5168317556381226, "learning_rate": 1.4132689242052466e-07, "loss": 0.5486, "step": 12387 }, { "epoch": 4.740910830463069, "grad_norm": 0.6122647523880005, "learning_rate": 1.409118991469105e-07, "loss": 0.6046, "step": 12388 }, { "epoch": 4.741293532338308, "grad_norm": 0.5693713426589966, "learning_rate": 1.404975117407459e-07, "loss": 0.6562, "step": 12389 }, { "epoch": 4.741676234213548, "grad_norm": 0.5581436157226562, "learning_rate": 1.4008373022749711e-07, "loss": 0.6425, "step": 12390 }, { "epoch": 4.742058936088787, "grad_norm": 0.5334264636039734, "learning_rate": 1.396705546325916e-07, "loss": 0.5557, "step": 12391 }, { "epoch": 4.742441637964026, "grad_norm": 0.564592182636261, "learning_rate": 1.3925798498141908e-07, "loss": 0.5736, "step": 12392 }, { "epoch": 4.742824339839265, "grad_norm": 0.5131211280822754, "learning_rate": 1.3884602129933588e-07, "loss": 0.5835, "step": 12393 }, { "epoch": 4.743207041714505, "grad_norm": 0.5190221667289734, "learning_rate": 1.3843466361165626e-07, "loss": 0.5268, "step": 12394 }, { "epoch": 4.743589743589744, "grad_norm": 0.5466880798339844, "learning_rate": 1.3802391194365883e-07, "loss": 0.6494, "step": 12395 }, { "epoch": 4.7439724454649825, "grad_norm": 0.565109372138977, "learning_rate": 1.3761376632058675e-07, "loss": 0.6496, "step": 12396 }, { "epoch": 4.744355147340222, "grad_norm": 0.55927973985672, "learning_rate": 1.372042267676432e-07, "loss": 0.5892, "step": 12397 }, { "epoch": 4.744737849215461, "grad_norm": 0.5702834129333496, "learning_rate": 1.3679529330999696e-07, "loss": 0.539, "step": 12398 }, { "epoch": 4.7451205510907, "grad_norm": 0.549924373626709, "learning_rate": 1.3638696597277678e-07, "loss": 0.605, "step": 12399 }, { "epoch": 4.745503252965939, "grad_norm": 0.5208427309989929, "learning_rate": 1.3597924478107594e-07, "loss": 0.6434, "step": 12400 }, { "epoch": 4.745885954841179, "grad_norm": 0.552638590335846, "learning_rate": 1.3557212975994992e-07, "loss": 0.6163, "step": 12401 }, { "epoch": 4.746268656716418, "grad_norm": 0.5472196340560913, "learning_rate": 1.3516562093441766e-07, "loss": 0.6065, "step": 12402 }, { "epoch": 4.746651358591657, "grad_norm": 0.5360415577888489, "learning_rate": 1.3475971832946022e-07, "loss": 0.6142, "step": 12403 }, { "epoch": 4.747034060466897, "grad_norm": 0.5361657738685608, "learning_rate": 1.3435442197002102e-07, "loss": 0.6414, "step": 12404 }, { "epoch": 4.7474167623421355, "grad_norm": 0.5411206483840942, "learning_rate": 1.3394973188100567e-07, "loss": 0.5816, "step": 12405 }, { "epoch": 4.747799464217374, "grad_norm": 0.5262181162834167, "learning_rate": 1.3354564808728544e-07, "loss": 0.5361, "step": 12406 }, { "epoch": 4.748182166092614, "grad_norm": 0.5742524862289429, "learning_rate": 1.331421706136904e-07, "loss": 0.6563, "step": 12407 }, { "epoch": 4.748564867967853, "grad_norm": 0.5206157565116882, "learning_rate": 1.327392994850163e-07, "loss": 0.6449, "step": 12408 }, { "epoch": 4.748947569843092, "grad_norm": 0.6023339033126831, "learning_rate": 1.3233703472602e-07, "loss": 0.6477, "step": 12409 }, { "epoch": 4.749330271718332, "grad_norm": 0.5599960088729858, "learning_rate": 1.3193537636142283e-07, "loss": 0.6462, "step": 12410 }, { "epoch": 4.749712973593571, "grad_norm": 0.5698777437210083, "learning_rate": 1.315343244159073e-07, "loss": 0.5868, "step": 12411 }, { "epoch": 4.75009567546881, "grad_norm": 0.523372232913971, "learning_rate": 1.3113387891411922e-07, "loss": 0.5416, "step": 12412 }, { "epoch": 4.750478377344049, "grad_norm": 0.5006179213523865, "learning_rate": 1.307340398806667e-07, "loss": 0.5869, "step": 12413 }, { "epoch": 4.7508610792192885, "grad_norm": 0.5381483435630798, "learning_rate": 1.3033480734012115e-07, "loss": 0.5757, "step": 12414 }, { "epoch": 4.751243781094527, "grad_norm": 0.5108547806739807, "learning_rate": 1.2993618131701635e-07, "loss": 0.5552, "step": 12415 }, { "epoch": 4.751626482969766, "grad_norm": 0.515989363193512, "learning_rate": 1.2953816183584934e-07, "loss": 0.5981, "step": 12416 }, { "epoch": 4.752009184845006, "grad_norm": 0.5468152165412903, "learning_rate": 1.291407489210783e-07, "loss": 0.6736, "step": 12417 }, { "epoch": 4.752391886720245, "grad_norm": 0.5076107978820801, "learning_rate": 1.2874394259712708e-07, "loss": 0.5768, "step": 12418 }, { "epoch": 4.752774588595484, "grad_norm": 0.5194104909896851, "learning_rate": 1.2834774288837948e-07, "loss": 0.5832, "step": 12419 }, { "epoch": 4.753157290470723, "grad_norm": 0.5591374635696411, "learning_rate": 1.2795214981918268e-07, "loss": 0.6443, "step": 12420 }, { "epoch": 4.753539992345963, "grad_norm": 0.5586863160133362, "learning_rate": 1.2755716341384727e-07, "loss": 0.623, "step": 12421 }, { "epoch": 4.753922694221202, "grad_norm": 0.5895813703536987, "learning_rate": 1.2716278369664825e-07, "loss": 0.5986, "step": 12422 }, { "epoch": 4.7543053960964405, "grad_norm": 0.5537276268005371, "learning_rate": 1.2676901069181736e-07, "loss": 0.5832, "step": 12423 }, { "epoch": 4.75468809797168, "grad_norm": 0.5603901147842407, "learning_rate": 1.2637584442355632e-07, "loss": 0.6233, "step": 12424 }, { "epoch": 4.755070799846919, "grad_norm": 0.509869396686554, "learning_rate": 1.2598328491602473e-07, "loss": 0.5172, "step": 12425 }, { "epoch": 4.755453501722158, "grad_norm": 0.5748355388641357, "learning_rate": 1.2559133219334662e-07, "loss": 0.5987, "step": 12426 }, { "epoch": 4.755836203597398, "grad_norm": 0.552153468132019, "learning_rate": 1.251999862796083e-07, "loss": 0.5963, "step": 12427 }, { "epoch": 4.756218905472637, "grad_norm": 0.5491917133331299, "learning_rate": 1.2480924719885934e-07, "loss": 0.6986, "step": 12428 }, { "epoch": 4.756601607347876, "grad_norm": 0.5634970664978027, "learning_rate": 1.2441911497511284e-07, "loss": 0.5991, "step": 12429 }, { "epoch": 4.756984309223116, "grad_norm": 0.5248203277587891, "learning_rate": 1.240295896323407e-07, "loss": 0.6955, "step": 12430 }, { "epoch": 4.757367011098355, "grad_norm": 0.61408931016922, "learning_rate": 1.2364067119448264e-07, "loss": 0.5885, "step": 12431 }, { "epoch": 4.7577497129735935, "grad_norm": 0.5516828894615173, "learning_rate": 1.2325235968543958e-07, "loss": 0.6181, "step": 12432 }, { "epoch": 4.758132414848832, "grad_norm": 0.545154333114624, "learning_rate": 1.228646551290713e-07, "loss": 0.5919, "step": 12433 }, { "epoch": 4.758515116724072, "grad_norm": 0.5511189699172974, "learning_rate": 1.224775575492043e-07, "loss": 0.5886, "step": 12434 }, { "epoch": 4.758897818599311, "grad_norm": 0.5244868397712708, "learning_rate": 1.2209106696962848e-07, "loss": 0.6687, "step": 12435 }, { "epoch": 4.75928052047455, "grad_norm": 0.569420337677002, "learning_rate": 1.2170518341409254e-07, "loss": 0.655, "step": 12436 }, { "epoch": 4.75966322234979, "grad_norm": 0.5355077981948853, "learning_rate": 1.2131990690631203e-07, "loss": 0.6597, "step": 12437 }, { "epoch": 4.760045924225029, "grad_norm": 0.5434831380844116, "learning_rate": 1.2093523746996128e-07, "loss": 0.6545, "step": 12438 }, { "epoch": 4.760428626100268, "grad_norm": 0.5215162634849548, "learning_rate": 1.205511751286803e-07, "loss": 0.5965, "step": 12439 }, { "epoch": 4.760811327975507, "grad_norm": 0.5714865326881409, "learning_rate": 1.2016771990607134e-07, "loss": 0.6464, "step": 12440 }, { "epoch": 4.7611940298507465, "grad_norm": 0.5452125072479248, "learning_rate": 1.1978487182569666e-07, "loss": 0.597, "step": 12441 }, { "epoch": 4.761576731725985, "grad_norm": 0.537205159664154, "learning_rate": 1.1940263091108628e-07, "loss": 0.5802, "step": 12442 }, { "epoch": 4.761959433601224, "grad_norm": 0.5377905964851379, "learning_rate": 1.1902099718572813e-07, "loss": 0.6517, "step": 12443 }, { "epoch": 4.762342135476464, "grad_norm": 0.5689975619316101, "learning_rate": 1.1863997067307453e-07, "loss": 0.6759, "step": 12444 }, { "epoch": 4.762724837351703, "grad_norm": 0.531788170337677, "learning_rate": 1.1825955139654121e-07, "loss": 0.6054, "step": 12445 }, { "epoch": 4.763107539226942, "grad_norm": 0.5234243869781494, "learning_rate": 1.1787973937950503e-07, "loss": 0.591, "step": 12446 }, { "epoch": 4.763490241102182, "grad_norm": 0.5546068549156189, "learning_rate": 1.175005346453073e-07, "loss": 0.6598, "step": 12447 }, { "epoch": 4.763872942977421, "grad_norm": 0.5343483686447144, "learning_rate": 1.1712193721725051e-07, "loss": 0.6294, "step": 12448 }, { "epoch": 4.76425564485266, "grad_norm": 0.5467444062232971, "learning_rate": 1.167439471186016e-07, "loss": 0.6165, "step": 12449 }, { "epoch": 4.764638346727899, "grad_norm": 0.5186666250228882, "learning_rate": 1.1636656437258754e-07, "loss": 0.6275, "step": 12450 }, { "epoch": 4.765021048603138, "grad_norm": 0.5379390120506287, "learning_rate": 1.1598978900240198e-07, "loss": 0.6005, "step": 12451 }, { "epoch": 4.765403750478377, "grad_norm": 0.5546663403511047, "learning_rate": 1.1561362103119533e-07, "loss": 0.683, "step": 12452 }, { "epoch": 4.765786452353616, "grad_norm": 0.5318812727928162, "learning_rate": 1.1523806048208685e-07, "loss": 0.597, "step": 12453 }, { "epoch": 4.766169154228856, "grad_norm": 0.5314708352088928, "learning_rate": 1.1486310737815475e-07, "loss": 0.6425, "step": 12454 }, { "epoch": 4.766551856104095, "grad_norm": 0.5267226099967957, "learning_rate": 1.1448876174244172e-07, "loss": 0.6309, "step": 12455 }, { "epoch": 4.766934557979334, "grad_norm": 0.5460078120231628, "learning_rate": 1.1411502359795157e-07, "loss": 0.5963, "step": 12456 }, { "epoch": 4.767317259854574, "grad_norm": 0.500774621963501, "learning_rate": 1.1374189296765037e-07, "loss": 0.617, "step": 12457 }, { "epoch": 4.767699961729813, "grad_norm": 0.5236737132072449, "learning_rate": 1.1336936987446978e-07, "loss": 0.6308, "step": 12458 }, { "epoch": 4.7680826636050515, "grad_norm": 0.5097300410270691, "learning_rate": 1.1299745434130149e-07, "loss": 0.5775, "step": 12459 }, { "epoch": 4.76846536548029, "grad_norm": 0.5303465127944946, "learning_rate": 1.1262614639100166e-07, "loss": 0.6511, "step": 12460 }, { "epoch": 4.76884806735553, "grad_norm": 0.5015098452568054, "learning_rate": 1.1225544604638761e-07, "loss": 0.6115, "step": 12461 }, { "epoch": 4.769230769230769, "grad_norm": 0.5583244562149048, "learning_rate": 1.1188535333023887e-07, "loss": 0.5778, "step": 12462 }, { "epoch": 4.769613471106008, "grad_norm": 0.5555882453918457, "learning_rate": 1.1151586826529948e-07, "loss": 0.6906, "step": 12463 }, { "epoch": 4.769996172981248, "grad_norm": 0.5694411396980286, "learning_rate": 1.1114699087427682e-07, "loss": 0.6636, "step": 12464 }, { "epoch": 4.770378874856487, "grad_norm": 0.5207274556159973, "learning_rate": 1.1077872117983613e-07, "loss": 0.6172, "step": 12465 }, { "epoch": 4.770761576731726, "grad_norm": 0.5248399376869202, "learning_rate": 1.1041105920461037e-07, "loss": 0.6581, "step": 12466 }, { "epoch": 4.7711442786069655, "grad_norm": 0.5749682188034058, "learning_rate": 1.1004400497119371e-07, "loss": 0.6364, "step": 12467 }, { "epoch": 4.7715269804822045, "grad_norm": 0.5276601910591125, "learning_rate": 1.0967755850214257e-07, "loss": 0.5888, "step": 12468 }, { "epoch": 4.771909682357443, "grad_norm": 0.6145559549331665, "learning_rate": 1.0931171981997557e-07, "loss": 0.6109, "step": 12469 }, { "epoch": 4.772292384232683, "grad_norm": 0.5214796662330627, "learning_rate": 1.0894648894717364e-07, "loss": 0.646, "step": 12470 }, { "epoch": 4.772675086107922, "grad_norm": 0.5534881353378296, "learning_rate": 1.0858186590618324e-07, "loss": 0.6238, "step": 12471 }, { "epoch": 4.773057787983161, "grad_norm": 0.5966793894767761, "learning_rate": 1.0821785071941094e-07, "loss": 0.6099, "step": 12472 }, { "epoch": 4.7734404898584, "grad_norm": 0.5671879053115845, "learning_rate": 1.0785444340922435e-07, "loss": 0.6424, "step": 12473 }, { "epoch": 4.77382319173364, "grad_norm": 0.5558319091796875, "learning_rate": 1.0749164399795897e-07, "loss": 0.6432, "step": 12474 }, { "epoch": 4.774205893608879, "grad_norm": 0.5479616522789001, "learning_rate": 1.0712945250790697e-07, "loss": 0.5492, "step": 12475 }, { "epoch": 4.774588595484118, "grad_norm": 0.7402980923652649, "learning_rate": 1.0676786896132718e-07, "loss": 0.556, "step": 12476 }, { "epoch": 4.774971297359357, "grad_norm": 0.553568422794342, "learning_rate": 1.0640689338044075e-07, "loss": 0.6015, "step": 12477 }, { "epoch": 4.775353999234596, "grad_norm": 0.5809771418571472, "learning_rate": 1.060465257874299e-07, "loss": 0.6461, "step": 12478 }, { "epoch": 4.775736701109835, "grad_norm": 0.5326566100120544, "learning_rate": 1.0568676620443807e-07, "loss": 0.6685, "step": 12479 }, { "epoch": 4.776119402985074, "grad_norm": 0.5584684014320374, "learning_rate": 1.0532761465357644e-07, "loss": 0.6767, "step": 12480 }, { "epoch": 4.776502104860314, "grad_norm": 0.5659473538398743, "learning_rate": 1.0496907115691624e-07, "loss": 0.5962, "step": 12481 }, { "epoch": 4.776884806735553, "grad_norm": 0.5776344537734985, "learning_rate": 1.0461113573648763e-07, "loss": 0.5765, "step": 12482 }, { "epoch": 4.777267508610792, "grad_norm": 0.5675857067108154, "learning_rate": 1.0425380841428967e-07, "loss": 0.5841, "step": 12483 }, { "epoch": 4.777650210486032, "grad_norm": 0.5757749676704407, "learning_rate": 1.0389708921228037e-07, "loss": 0.6222, "step": 12484 }, { "epoch": 4.778032912361271, "grad_norm": 0.5593574047088623, "learning_rate": 1.0354097815237996e-07, "loss": 0.6215, "step": 12485 }, { "epoch": 4.7784156142365095, "grad_norm": 0.5495160818099976, "learning_rate": 1.0318547525647316e-07, "loss": 0.5831, "step": 12486 }, { "epoch": 4.778798316111749, "grad_norm": 0.5513842105865479, "learning_rate": 1.0283058054640693e-07, "loss": 0.5567, "step": 12487 }, { "epoch": 4.779181017986988, "grad_norm": 0.5148898959159851, "learning_rate": 1.0247629404399052e-07, "loss": 0.6963, "step": 12488 }, { "epoch": 4.779563719862227, "grad_norm": 0.5440896153450012, "learning_rate": 1.0212261577099424e-07, "loss": 0.5759, "step": 12489 }, { "epoch": 4.779946421737467, "grad_norm": 0.5660918951034546, "learning_rate": 1.017695457491541e-07, "loss": 0.5368, "step": 12490 }, { "epoch": 4.780329123612706, "grad_norm": 0.5293257832527161, "learning_rate": 1.0141708400016714e-07, "loss": 0.6567, "step": 12491 }, { "epoch": 4.780711825487945, "grad_norm": 0.5657624006271362, "learning_rate": 1.0106523054569273e-07, "loss": 0.7466, "step": 12492 }, { "epoch": 4.781094527363184, "grad_norm": 0.5228883624076843, "learning_rate": 1.0071398540735356e-07, "loss": 0.5706, "step": 12493 }, { "epoch": 4.7814772292384236, "grad_norm": 0.5710503458976746, "learning_rate": 1.0036334860673457e-07, "loss": 0.6198, "step": 12494 }, { "epoch": 4.7818599311136625, "grad_norm": 0.5372796654701233, "learning_rate": 1.00013320165383e-07, "loss": 0.6327, "step": 12495 }, { "epoch": 4.782242632988901, "grad_norm": 0.5360987186431885, "learning_rate": 9.96639001048083e-08, "loss": 0.5427, "step": 12496 }, { "epoch": 4.782625334864141, "grad_norm": 0.5961438417434692, "learning_rate": 9.931508844648552e-08, "loss": 0.6002, "step": 12497 }, { "epoch": 4.78300803673938, "grad_norm": 0.566097617149353, "learning_rate": 9.89668852118475e-08, "loss": 0.6453, "step": 12498 }, { "epoch": 4.783390738614619, "grad_norm": 0.5257608294487, "learning_rate": 9.861929042229379e-08, "loss": 0.5969, "step": 12499 }, { "epoch": 4.783773440489858, "grad_norm": 0.49627017974853516, "learning_rate": 9.827230409918398e-08, "loss": 0.5551, "step": 12500 }, { "epoch": 4.784156142365098, "grad_norm": 0.6127722859382629, "learning_rate": 9.792592626384212e-08, "loss": 0.6293, "step": 12501 }, { "epoch": 4.784538844240337, "grad_norm": 0.5504506230354309, "learning_rate": 9.758015693755451e-08, "loss": 0.5494, "step": 12502 }, { "epoch": 4.784921546115576, "grad_norm": 0.5448653697967529, "learning_rate": 9.723499614156972e-08, "loss": 0.5928, "step": 12503 }, { "epoch": 4.785304247990815, "grad_norm": 0.5653883814811707, "learning_rate": 9.689044389709635e-08, "loss": 0.5852, "step": 12504 }, { "epoch": 4.785686949866054, "grad_norm": 0.631713330745697, "learning_rate": 9.654650022531076e-08, "loss": 0.6498, "step": 12505 }, { "epoch": 4.786069651741293, "grad_norm": 0.5522932410240173, "learning_rate": 9.620316514734828e-08, "loss": 0.5542, "step": 12506 }, { "epoch": 4.786452353616533, "grad_norm": 0.54893559217453, "learning_rate": 9.586043868430761e-08, "loss": 0.676, "step": 12507 }, { "epoch": 4.786835055491772, "grad_norm": 0.5245932340621948, "learning_rate": 9.551832085725076e-08, "loss": 0.5949, "step": 12508 }, { "epoch": 4.787217757367011, "grad_norm": 0.5775803327560425, "learning_rate": 9.517681168720095e-08, "loss": 0.6073, "step": 12509 }, { "epoch": 4.787600459242251, "grad_norm": 0.5475956201553345, "learning_rate": 9.483591119514468e-08, "loss": 0.6428, "step": 12510 }, { "epoch": 4.78798316111749, "grad_norm": 0.5715718865394592, "learning_rate": 9.449561940203189e-08, "loss": 0.5781, "step": 12511 }, { "epoch": 4.788365862992729, "grad_norm": 0.5042867064476013, "learning_rate": 9.415593632877473e-08, "loss": 0.5204, "step": 12512 }, { "epoch": 4.7887485648679675, "grad_norm": 0.5374199151992798, "learning_rate": 9.381686199624762e-08, "loss": 0.5425, "step": 12513 }, { "epoch": 4.789131266743207, "grad_norm": 0.6280125975608826, "learning_rate": 9.347839642528721e-08, "loss": 0.5621, "step": 12514 }, { "epoch": 4.789513968618446, "grad_norm": 0.5325548648834229, "learning_rate": 9.314053963669245e-08, "loss": 0.6564, "step": 12515 }, { "epoch": 4.789896670493685, "grad_norm": 0.5334901809692383, "learning_rate": 9.280329165122671e-08, "loss": 0.6341, "step": 12516 }, { "epoch": 4.790279372368925, "grad_norm": 0.5077374577522278, "learning_rate": 9.24666524896145e-08, "loss": 0.5373, "step": 12517 }, { "epoch": 4.790662074244164, "grad_norm": 0.5072186589241028, "learning_rate": 9.213062217254265e-08, "loss": 0.6012, "step": 12518 }, { "epoch": 4.791044776119403, "grad_norm": 0.5887176990509033, "learning_rate": 9.179520072066128e-08, "loss": 0.5507, "step": 12519 }, { "epoch": 4.791427477994642, "grad_norm": 0.5732893347740173, "learning_rate": 9.14603881545828e-08, "loss": 0.6434, "step": 12520 }, { "epoch": 4.791810179869882, "grad_norm": 0.6102625727653503, "learning_rate": 9.112618449488298e-08, "loss": 0.5817, "step": 12521 }, { "epoch": 4.7921928817451205, "grad_norm": 0.5190370082855225, "learning_rate": 9.07925897620987e-08, "loss": 0.5027, "step": 12522 }, { "epoch": 4.792575583620359, "grad_norm": 0.5434346199035645, "learning_rate": 9.045960397673248e-08, "loss": 0.6116, "step": 12523 }, { "epoch": 4.792958285495599, "grad_norm": 0.5551389455795288, "learning_rate": 9.01272271592446e-08, "loss": 0.637, "step": 12524 }, { "epoch": 4.793340987370838, "grad_norm": 0.5482539534568787, "learning_rate": 8.979545933006095e-08, "loss": 0.5575, "step": 12525 }, { "epoch": 4.793723689246077, "grad_norm": 0.5303285717964172, "learning_rate": 8.946430050957078e-08, "loss": 0.6463, "step": 12526 }, { "epoch": 4.794106391121317, "grad_norm": 0.566405713558197, "learning_rate": 8.913375071812225e-08, "loss": 0.646, "step": 12527 }, { "epoch": 4.794489092996556, "grad_norm": 0.5574449300765991, "learning_rate": 8.880380997603133e-08, "loss": 0.5694, "step": 12528 }, { "epoch": 4.794871794871795, "grad_norm": 0.5384371876716614, "learning_rate": 8.847447830357292e-08, "loss": 0.5679, "step": 12529 }, { "epoch": 4.7952544967470345, "grad_norm": 0.5274690389633179, "learning_rate": 8.814575572098416e-08, "loss": 0.6495, "step": 12530 }, { "epoch": 4.7956371986222734, "grad_norm": 0.5352190136909485, "learning_rate": 8.781764224846778e-08, "loss": 0.523, "step": 12531 }, { "epoch": 4.796019900497512, "grad_norm": 0.5295953750610352, "learning_rate": 8.749013790618543e-08, "loss": 0.5455, "step": 12532 }, { "epoch": 4.796402602372751, "grad_norm": 0.5364496111869812, "learning_rate": 8.716324271426435e-08, "loss": 0.5704, "step": 12533 }, { "epoch": 4.796785304247991, "grad_norm": 0.5475273728370667, "learning_rate": 8.68369566927929e-08, "loss": 0.7023, "step": 12534 }, { "epoch": 4.79716800612323, "grad_norm": 0.5534316301345825, "learning_rate": 8.651127986182172e-08, "loss": 0.6097, "step": 12535 }, { "epoch": 4.797550707998469, "grad_norm": 0.5822980403900146, "learning_rate": 8.618621224136481e-08, "loss": 0.5757, "step": 12536 }, { "epoch": 4.797933409873709, "grad_norm": 0.6123639345169067, "learning_rate": 8.58617538513995e-08, "loss": 0.6762, "step": 12537 }, { "epoch": 4.798316111748948, "grad_norm": 0.5428249835968018, "learning_rate": 8.553790471186207e-08, "loss": 0.6637, "step": 12538 }, { "epoch": 4.798698813624187, "grad_norm": 0.5460563898086548, "learning_rate": 8.521466484265662e-08, "loss": 0.6477, "step": 12539 }, { "epoch": 4.7990815154994255, "grad_norm": 0.5175760984420776, "learning_rate": 8.489203426364501e-08, "loss": 0.5941, "step": 12540 }, { "epoch": 4.799464217374665, "grad_norm": 0.5553536415100098, "learning_rate": 8.457001299465583e-08, "loss": 0.7313, "step": 12541 }, { "epoch": 4.799846919249904, "grad_norm": 0.6292601227760315, "learning_rate": 8.42486010554766e-08, "loss": 0.6738, "step": 12542 }, { "epoch": 4.800229621125143, "grad_norm": 0.5380892753601074, "learning_rate": 8.39277984658593e-08, "loss": 0.6138, "step": 12543 }, { "epoch": 4.800612323000383, "grad_norm": 0.5879014730453491, "learning_rate": 8.360760524551814e-08, "loss": 0.6761, "step": 12544 }, { "epoch": 4.800995024875622, "grad_norm": 0.5769553184509277, "learning_rate": 8.328802141413072e-08, "loss": 0.6328, "step": 12545 }, { "epoch": 4.801377726750861, "grad_norm": 0.607089102268219, "learning_rate": 8.296904699133467e-08, "loss": 0.6447, "step": 12546 }, { "epoch": 4.801760428626101, "grad_norm": 0.5069425702095032, "learning_rate": 8.265068199673321e-08, "loss": 0.5597, "step": 12547 }, { "epoch": 4.80214313050134, "grad_norm": 0.5186064839363098, "learning_rate": 8.233292644988956e-08, "loss": 0.5739, "step": 12548 }, { "epoch": 4.8025258323765785, "grad_norm": 0.5503049492835999, "learning_rate": 8.201578037033143e-08, "loss": 0.61, "step": 12549 }, { "epoch": 4.802908534251818, "grad_norm": 0.5483407974243164, "learning_rate": 8.169924377754879e-08, "loss": 0.6164, "step": 12550 }, { "epoch": 4.803291236127057, "grad_norm": 0.5614001750946045, "learning_rate": 8.138331669099164e-08, "loss": 0.5971, "step": 12551 }, { "epoch": 4.803673938002296, "grad_norm": 0.5358448028564453, "learning_rate": 8.10679991300778e-08, "loss": 0.6013, "step": 12552 }, { "epoch": 4.804056639877535, "grad_norm": 0.5133763551712036, "learning_rate": 8.075329111418173e-08, "loss": 0.5893, "step": 12553 }, { "epoch": 4.804439341752775, "grad_norm": 0.5289050340652466, "learning_rate": 8.043919266264355e-08, "loss": 0.6, "step": 12554 }, { "epoch": 4.804822043628014, "grad_norm": 0.5145845413208008, "learning_rate": 8.01257037947667e-08, "loss": 0.5939, "step": 12555 }, { "epoch": 4.805204745503253, "grad_norm": 0.5917441844940186, "learning_rate": 7.981282452981353e-08, "loss": 0.6116, "step": 12556 }, { "epoch": 4.8055874473784925, "grad_norm": 0.5304750204086304, "learning_rate": 7.950055488701424e-08, "loss": 0.6373, "step": 12557 }, { "epoch": 4.8059701492537314, "grad_norm": 0.5394321084022522, "learning_rate": 7.91888948855557e-08, "loss": 0.5558, "step": 12558 }, { "epoch": 4.80635285112897, "grad_norm": 0.5872263312339783, "learning_rate": 7.887784454459257e-08, "loss": 0.6158, "step": 12559 }, { "epoch": 4.806735553004209, "grad_norm": 0.8311297297477722, "learning_rate": 7.856740388323847e-08, "loss": 0.6692, "step": 12560 }, { "epoch": 4.807118254879449, "grad_norm": 0.5037561058998108, "learning_rate": 7.825757292057146e-08, "loss": 0.6093, "step": 12561 }, { "epoch": 4.807500956754688, "grad_norm": 0.5434562563896179, "learning_rate": 7.794835167563187e-08, "loss": 0.6354, "step": 12562 }, { "epoch": 4.807883658629927, "grad_norm": 0.5149165987968445, "learning_rate": 7.763974016742115e-08, "loss": 0.5665, "step": 12563 }, { "epoch": 4.808266360505167, "grad_norm": 0.5415800213813782, "learning_rate": 7.733173841490416e-08, "loss": 0.6948, "step": 12564 }, { "epoch": 4.808649062380406, "grad_norm": 0.4896146059036255, "learning_rate": 7.702434643701017e-08, "loss": 0.5946, "step": 12565 }, { "epoch": 4.809031764255645, "grad_norm": 0.5494376420974731, "learning_rate": 7.671756425262744e-08, "loss": 0.6221, "step": 12566 }, { "epoch": 4.809414466130884, "grad_norm": 0.5436204671859741, "learning_rate": 7.641139188060864e-08, "loss": 0.6511, "step": 12567 }, { "epoch": 4.809797168006123, "grad_norm": 0.5744156837463379, "learning_rate": 7.610582933977096e-08, "loss": 0.6856, "step": 12568 }, { "epoch": 4.810179869881362, "grad_norm": 0.5556087493896484, "learning_rate": 7.580087664888824e-08, "loss": 0.6474, "step": 12569 }, { "epoch": 4.810562571756602, "grad_norm": 0.5258070826530457, "learning_rate": 7.549653382670441e-08, "loss": 0.59, "step": 12570 }, { "epoch": 4.810945273631841, "grad_norm": 0.5098209381103516, "learning_rate": 7.519280089192005e-08, "loss": 0.6088, "step": 12571 }, { "epoch": 4.81132797550708, "grad_norm": 0.5406017303466797, "learning_rate": 7.488967786320133e-08, "loss": 0.6264, "step": 12572 }, { "epoch": 4.811710677382319, "grad_norm": 0.5452250838279724, "learning_rate": 7.45871647591756e-08, "loss": 0.5671, "step": 12573 }, { "epoch": 4.812093379257559, "grad_norm": 0.5869795680046082, "learning_rate": 7.428526159843353e-08, "loss": 0.6363, "step": 12574 }, { "epoch": 4.812476081132798, "grad_norm": 0.5958783030509949, "learning_rate": 7.398396839952692e-08, "loss": 0.6215, "step": 12575 }, { "epoch": 4.8128587830080365, "grad_norm": 0.5552250742912292, "learning_rate": 7.368328518097212e-08, "loss": 0.6415, "step": 12576 }, { "epoch": 4.813241484883276, "grad_norm": 0.5283420085906982, "learning_rate": 7.338321196124543e-08, "loss": 0.5639, "step": 12577 }, { "epoch": 4.813624186758515, "grad_norm": 0.5458859205245972, "learning_rate": 7.308374875878877e-08, "loss": 0.6212, "step": 12578 }, { "epoch": 4.814006888633754, "grad_norm": 0.5629627704620361, "learning_rate": 7.278489559200407e-08, "loss": 0.6261, "step": 12579 }, { "epoch": 4.814389590508993, "grad_norm": 0.5450494289398193, "learning_rate": 7.248665247925668e-08, "loss": 0.5839, "step": 12580 }, { "epoch": 4.814772292384233, "grad_norm": 0.5342354774475098, "learning_rate": 7.218901943887525e-08, "loss": 0.6892, "step": 12581 }, { "epoch": 4.815154994259472, "grad_norm": 0.535675048828125, "learning_rate": 7.189199648914957e-08, "loss": 0.5953, "step": 12582 }, { "epoch": 4.815537696134711, "grad_norm": 0.5104790329933167, "learning_rate": 7.159558364833175e-08, "loss": 0.5925, "step": 12583 }, { "epoch": 4.8159203980099505, "grad_norm": 0.5195679068565369, "learning_rate": 7.12997809346383e-08, "loss": 0.5526, "step": 12584 }, { "epoch": 4.8163030998851895, "grad_norm": 0.6089479327201843, "learning_rate": 7.10045883662469e-08, "loss": 0.7048, "step": 12585 }, { "epoch": 4.816685801760428, "grad_norm": 0.6412591934204102, "learning_rate": 7.071000596129751e-08, "loss": 0.6114, "step": 12586 }, { "epoch": 4.817068503635668, "grad_norm": 0.5601996183395386, "learning_rate": 7.04160337378923e-08, "loss": 0.5777, "step": 12587 }, { "epoch": 4.817451205510907, "grad_norm": 0.5603938698768616, "learning_rate": 7.012267171409904e-08, "loss": 0.5983, "step": 12588 }, { "epoch": 4.817833907386146, "grad_norm": 0.5536684989929199, "learning_rate": 6.98299199079433e-08, "loss": 0.6918, "step": 12589 }, { "epoch": 4.818216609261386, "grad_norm": 0.5701835751533508, "learning_rate": 6.953777833741626e-08, "loss": 0.7036, "step": 12590 }, { "epoch": 4.818599311136625, "grad_norm": 0.5575446486473083, "learning_rate": 6.924624702047134e-08, "loss": 0.6411, "step": 12591 }, { "epoch": 4.818982013011864, "grad_norm": 0.5099160671234131, "learning_rate": 6.895532597502308e-08, "loss": 0.5825, "step": 12592 }, { "epoch": 4.819364714887103, "grad_norm": 0.5892859697341919, "learning_rate": 6.866501521894942e-08, "loss": 0.6139, "step": 12593 }, { "epoch": 4.819747416762342, "grad_norm": 0.6177361607551575, "learning_rate": 6.837531477009163e-08, "loss": 0.6266, "step": 12594 }, { "epoch": 4.820130118637581, "grad_norm": 0.5784786939620972, "learning_rate": 6.808622464625104e-08, "loss": 0.6389, "step": 12595 }, { "epoch": 4.82051282051282, "grad_norm": 0.544823169708252, "learning_rate": 6.779774486519453e-08, "loss": 0.6064, "step": 12596 }, { "epoch": 4.82089552238806, "grad_norm": 0.5321487784385681, "learning_rate": 6.750987544465015e-08, "loss": 0.6658, "step": 12597 }, { "epoch": 4.821278224263299, "grad_norm": 0.5593995451927185, "learning_rate": 6.722261640230599e-08, "loss": 0.6081, "step": 12598 }, { "epoch": 4.821660926138538, "grad_norm": 0.5271536707878113, "learning_rate": 6.693596775581679e-08, "loss": 0.6785, "step": 12599 }, { "epoch": 4.822043628013777, "grad_norm": 0.5954329967498779, "learning_rate": 6.664992952279736e-08, "loss": 0.6493, "step": 12600 }, { "epoch": 4.822426329889017, "grad_norm": 0.5493499040603638, "learning_rate": 6.636450172082587e-08, "loss": 0.685, "step": 12601 }, { "epoch": 4.822809031764256, "grad_norm": 0.5111834406852722, "learning_rate": 6.607968436744272e-08, "loss": 0.6473, "step": 12602 }, { "epoch": 4.8231917336394945, "grad_norm": 0.5770829319953918, "learning_rate": 6.579547748014948e-08, "loss": 0.5898, "step": 12603 }, { "epoch": 4.823574435514734, "grad_norm": 0.5393608212471008, "learning_rate": 6.551188107641327e-08, "loss": 0.6458, "step": 12604 }, { "epoch": 4.823957137389973, "grad_norm": 0.5644798874855042, "learning_rate": 6.522889517366016e-08, "loss": 0.6087, "step": 12605 }, { "epoch": 4.824339839265212, "grad_norm": 0.5546600222587585, "learning_rate": 6.494651978928179e-08, "loss": 0.5947, "step": 12606 }, { "epoch": 4.824722541140452, "grad_norm": 0.5443123579025269, "learning_rate": 6.466475494062985e-08, "loss": 0.6511, "step": 12607 }, { "epoch": 4.825105243015691, "grad_norm": 0.512640655040741, "learning_rate": 6.438360064501936e-08, "loss": 0.5443, "step": 12608 }, { "epoch": 4.82548794489093, "grad_norm": 0.7669849991798401, "learning_rate": 6.410305691972874e-08, "loss": 0.6285, "step": 12609 }, { "epoch": 4.82587064676617, "grad_norm": 0.5461691617965698, "learning_rate": 6.382312378199862e-08, "loss": 0.5883, "step": 12610 }, { "epoch": 4.8262533486414085, "grad_norm": 0.5320504903793335, "learning_rate": 6.354380124902971e-08, "loss": 0.6205, "step": 12611 }, { "epoch": 4.8266360505166475, "grad_norm": 0.5394220948219299, "learning_rate": 6.326508933798936e-08, "loss": 0.6078, "step": 12612 }, { "epoch": 4.827018752391886, "grad_norm": 0.5048136115074158, "learning_rate": 6.29869880660039e-08, "loss": 0.6944, "step": 12613 }, { "epoch": 4.827401454267126, "grad_norm": 0.5346640944480896, "learning_rate": 6.270949745016408e-08, "loss": 0.657, "step": 12614 }, { "epoch": 4.827784156142365, "grad_norm": 0.5957465171813965, "learning_rate": 6.243261750752183e-08, "loss": 0.6237, "step": 12615 }, { "epoch": 4.828166858017604, "grad_norm": 0.5295772552490234, "learning_rate": 6.215634825509243e-08, "loss": 0.5501, "step": 12616 }, { "epoch": 4.828549559892844, "grad_norm": 0.5763145089149475, "learning_rate": 6.18806897098545e-08, "loss": 0.6965, "step": 12617 }, { "epoch": 4.828932261768083, "grad_norm": 0.5549412369728088, "learning_rate": 6.160564188874562e-08, "loss": 0.5962, "step": 12618 }, { "epoch": 4.829314963643322, "grad_norm": 0.5404839515686035, "learning_rate": 6.133120480866894e-08, "loss": 0.5902, "step": 12619 }, { "epoch": 4.829697665518561, "grad_norm": 0.5017463564872742, "learning_rate": 6.105737848648985e-08, "loss": 0.6272, "step": 12620 }, { "epoch": 4.8300803673938, "grad_norm": 0.5362432599067688, "learning_rate": 6.078416293903711e-08, "loss": 0.5377, "step": 12621 }, { "epoch": 4.830463069269039, "grad_norm": 0.526970624923706, "learning_rate": 6.051155818309839e-08, "loss": 0.6797, "step": 12622 }, { "epoch": 4.830845771144278, "grad_norm": 0.5416781902313232, "learning_rate": 6.023956423542698e-08, "loss": 0.6607, "step": 12623 }, { "epoch": 4.831228473019518, "grad_norm": 0.5766021013259888, "learning_rate": 5.996818111273617e-08, "loss": 0.7119, "step": 12624 }, { "epoch": 4.831611174894757, "grad_norm": 0.6983477473258972, "learning_rate": 5.969740883170595e-08, "loss": 0.5835, "step": 12625 }, { "epoch": 4.831993876769996, "grad_norm": 0.49091053009033203, "learning_rate": 5.94272474089741e-08, "loss": 0.6031, "step": 12626 }, { "epoch": 4.832376578645236, "grad_norm": 0.5281139612197876, "learning_rate": 5.915769686114292e-08, "loss": 0.6424, "step": 12627 }, { "epoch": 4.832759280520475, "grad_norm": 0.5548135042190552, "learning_rate": 5.8888757204776936e-08, "loss": 0.6121, "step": 12628 }, { "epoch": 4.833141982395714, "grad_norm": 0.5592852234840393, "learning_rate": 5.862042845640403e-08, "loss": 0.6163, "step": 12629 }, { "epoch": 4.833524684270953, "grad_norm": 0.5353814959526062, "learning_rate": 5.8352710632513244e-08, "loss": 0.585, "step": 12630 }, { "epoch": 4.833907386146192, "grad_norm": 0.5538852214813232, "learning_rate": 5.808560374955585e-08, "loss": 0.6175, "step": 12631 }, { "epoch": 4.834290088021431, "grad_norm": 0.5113311409950256, "learning_rate": 5.781910782394762e-08, "loss": 0.5908, "step": 12632 }, { "epoch": 4.83467278989667, "grad_norm": 0.5475194454193115, "learning_rate": 5.755322287206544e-08, "loss": 0.649, "step": 12633 }, { "epoch": 4.83505549177191, "grad_norm": 0.5014076828956604, "learning_rate": 5.728794891024625e-08, "loss": 0.5801, "step": 12634 }, { "epoch": 4.835438193647149, "grad_norm": 0.5229318141937256, "learning_rate": 5.7023285954794784e-08, "loss": 0.5941, "step": 12635 }, { "epoch": 4.835820895522388, "grad_norm": 0.5126830339431763, "learning_rate": 5.6759234021973584e-08, "loss": 0.5668, "step": 12636 }, { "epoch": 4.836203597397628, "grad_norm": 0.5831472277641296, "learning_rate": 5.649579312801079e-08, "loss": 0.6712, "step": 12637 }, { "epoch": 4.8365862992728665, "grad_norm": 0.6178779006004333, "learning_rate": 5.6232963289093446e-08, "loss": 0.6264, "step": 12638 }, { "epoch": 4.8369690011481055, "grad_norm": 0.5413645505905151, "learning_rate": 5.59707445213753e-08, "loss": 0.5857, "step": 12639 }, { "epoch": 4.837351703023344, "grad_norm": 0.4748691916465759, "learning_rate": 5.57091368409679e-08, "loss": 0.6158, "step": 12640 }, { "epoch": 4.837734404898584, "grad_norm": 0.5282959342002869, "learning_rate": 5.544814026395062e-08, "loss": 0.6486, "step": 12641 }, { "epoch": 4.838117106773823, "grad_norm": 0.5360713005065918, "learning_rate": 5.518775480636063e-08, "loss": 0.6163, "step": 12642 }, { "epoch": 4.838499808649062, "grad_norm": 0.5704688429832458, "learning_rate": 5.4927980484200674e-08, "loss": 0.6411, "step": 12643 }, { "epoch": 4.838882510524302, "grad_norm": 0.5271190404891968, "learning_rate": 5.4668817313432435e-08, "loss": 0.5818, "step": 12644 }, { "epoch": 4.839265212399541, "grad_norm": 0.5762602090835571, "learning_rate": 5.441026530998428e-08, "loss": 0.6503, "step": 12645 }, { "epoch": 4.83964791427478, "grad_norm": 0.5824402570724487, "learning_rate": 5.41523244897435e-08, "loss": 0.5919, "step": 12646 }, { "epoch": 4.8400306161500195, "grad_norm": 0.5552062392234802, "learning_rate": 5.389499486856187e-08, "loss": 0.7053, "step": 12647 }, { "epoch": 4.840413318025258, "grad_norm": 0.5527291297912598, "learning_rate": 5.363827646225339e-08, "loss": 0.6145, "step": 12648 }, { "epoch": 4.840796019900497, "grad_norm": 0.5587460398674011, "learning_rate": 5.338216928659212e-08, "loss": 0.5886, "step": 12649 }, { "epoch": 4.841178721775737, "grad_norm": 0.5109637975692749, "learning_rate": 5.312667335731991e-08, "loss": 0.6364, "step": 12650 }, { "epoch": 4.841561423650976, "grad_norm": 0.5388289093971252, "learning_rate": 5.2871788690134206e-08, "loss": 0.6355, "step": 12651 }, { "epoch": 4.841944125526215, "grad_norm": 0.5768613219261169, "learning_rate": 5.261751530070136e-08, "loss": 0.6252, "step": 12652 }, { "epoch": 4.842326827401454, "grad_norm": 0.5750800967216492, "learning_rate": 5.2363853204645543e-08, "loss": 0.6343, "step": 12653 }, { "epoch": 4.842709529276694, "grad_norm": 0.542357861995697, "learning_rate": 5.211080241755429e-08, "loss": 0.5782, "step": 12654 }, { "epoch": 4.843092231151933, "grad_norm": 0.5435949563980103, "learning_rate": 5.185836295497959e-08, "loss": 0.5376, "step": 12655 }, { "epoch": 4.843474933027172, "grad_norm": 0.508277416229248, "learning_rate": 5.160653483243461e-08, "loss": 0.5194, "step": 12656 }, { "epoch": 4.843857634902411, "grad_norm": 0.5211808085441589, "learning_rate": 5.1355318065393625e-08, "loss": 0.6382, "step": 12657 }, { "epoch": 4.84424033677765, "grad_norm": 0.5516977310180664, "learning_rate": 5.11047126692954e-08, "loss": 0.553, "step": 12658 }, { "epoch": 4.844623038652889, "grad_norm": 0.5202578902244568, "learning_rate": 5.085471865953984e-08, "loss": 0.5577, "step": 12659 }, { "epoch": 4.845005740528128, "grad_norm": 0.5013253688812256, "learning_rate": 5.06053360514902e-08, "loss": 0.5854, "step": 12660 }, { "epoch": 4.845388442403368, "grad_norm": 0.5199227333068848, "learning_rate": 5.0356564860472024e-08, "loss": 0.5642, "step": 12661 }, { "epoch": 4.845771144278607, "grad_norm": 0.5504307150840759, "learning_rate": 5.010840510177195e-08, "loss": 0.6622, "step": 12662 }, { "epoch": 4.846153846153846, "grad_norm": 0.5490125417709351, "learning_rate": 4.986085679064112e-08, "loss": 0.645, "step": 12663 }, { "epoch": 4.846536548029086, "grad_norm": 0.5332785248756409, "learning_rate": 4.961391994229181e-08, "loss": 0.5596, "step": 12664 }, { "epoch": 4.8469192499043245, "grad_norm": 0.5127111077308655, "learning_rate": 4.9367594571898546e-08, "loss": 0.5878, "step": 12665 }, { "epoch": 4.8473019517795635, "grad_norm": 0.5022981762886047, "learning_rate": 4.912188069459922e-08, "loss": 0.5876, "step": 12666 }, { "epoch": 4.847684653654803, "grad_norm": 0.5276004076004028, "learning_rate": 4.887677832549287e-08, "loss": 0.6005, "step": 12667 }, { "epoch": 4.848067355530042, "grad_norm": 0.5653325319290161, "learning_rate": 4.863228747964188e-08, "loss": 0.6804, "step": 12668 }, { "epoch": 4.848450057405281, "grad_norm": 0.5905620455741882, "learning_rate": 4.838840817207091e-08, "loss": 0.6598, "step": 12669 }, { "epoch": 4.848832759280521, "grad_norm": 0.5487508773803711, "learning_rate": 4.814514041776797e-08, "loss": 0.618, "step": 12670 }, { "epoch": 4.84921546115576, "grad_norm": 0.5318014621734619, "learning_rate": 4.790248423168109e-08, "loss": 0.6386, "step": 12671 }, { "epoch": 4.849598163030999, "grad_norm": 0.6370165348052979, "learning_rate": 4.76604396287228e-08, "loss": 0.6022, "step": 12672 }, { "epoch": 4.849980864906238, "grad_norm": 0.5734788179397583, "learning_rate": 4.741900662376786e-08, "loss": 0.6699, "step": 12673 }, { "epoch": 4.8503635667814775, "grad_norm": 0.5775933861732483, "learning_rate": 4.7178185231651074e-08, "loss": 0.5999, "step": 12674 }, { "epoch": 4.850746268656716, "grad_norm": 0.5612978935241699, "learning_rate": 4.693797546717505e-08, "loss": 0.604, "step": 12675 }, { "epoch": 4.851128970531955, "grad_norm": 0.5780505537986755, "learning_rate": 4.669837734509797e-08, "loss": 0.6492, "step": 12676 }, { "epoch": 4.851511672407195, "grad_norm": 0.5389362573623657, "learning_rate": 4.645939088014473e-08, "loss": 0.6071, "step": 12677 }, { "epoch": 4.851894374282434, "grad_norm": 0.5401656031608582, "learning_rate": 4.6221016087002466e-08, "loss": 0.5745, "step": 12678 }, { "epoch": 4.852277076157673, "grad_norm": 0.6153091192245483, "learning_rate": 4.598325298031947e-08, "loss": 0.5885, "step": 12679 }, { "epoch": 4.852659778032912, "grad_norm": 0.5335890054702759, "learning_rate": 4.574610157470738e-08, "loss": 0.6012, "step": 12680 }, { "epoch": 4.853042479908152, "grad_norm": 0.6071294546127319, "learning_rate": 4.550956188473899e-08, "loss": 0.6767, "step": 12681 }, { "epoch": 4.853425181783391, "grad_norm": 0.5128602981567383, "learning_rate": 4.5273633924950434e-08, "loss": 0.6207, "step": 12682 }, { "epoch": 4.85380788365863, "grad_norm": 0.5310193300247192, "learning_rate": 4.503831770984013e-08, "loss": 0.6242, "step": 12683 }, { "epoch": 4.854190585533869, "grad_norm": 0.5224404335021973, "learning_rate": 4.480361325386984e-08, "loss": 0.6144, "step": 12684 }, { "epoch": 4.854573287409108, "grad_norm": 0.5481545329093933, "learning_rate": 4.4569520571462465e-08, "loss": 0.6465, "step": 12685 }, { "epoch": 4.854955989284347, "grad_norm": 0.7093521952629089, "learning_rate": 4.4336039677002065e-08, "loss": 0.5475, "step": 12686 }, { "epoch": 4.855338691159587, "grad_norm": 0.5519238710403442, "learning_rate": 4.410317058483826e-08, "loss": 0.6169, "step": 12687 }, { "epoch": 4.855721393034826, "grad_norm": 0.5123266577720642, "learning_rate": 4.3870913309280725e-08, "loss": 0.5273, "step": 12688 }, { "epoch": 4.856104094910065, "grad_norm": 0.5084357261657715, "learning_rate": 4.3639267864603594e-08, "loss": 0.589, "step": 12689 }, { "epoch": 4.856486796785305, "grad_norm": 0.5037499666213989, "learning_rate": 4.340823426503993e-08, "loss": 0.5734, "step": 12690 }, { "epoch": 4.856869498660544, "grad_norm": 0.5179377198219299, "learning_rate": 4.3177812524788366e-08, "loss": 0.6551, "step": 12691 }, { "epoch": 4.8572522005357825, "grad_norm": 0.49665194749832153, "learning_rate": 4.2948002658009805e-08, "loss": 0.5629, "step": 12692 }, { "epoch": 4.8576349024110215, "grad_norm": 0.7968802452087402, "learning_rate": 4.2718804678826277e-08, "loss": 0.6283, "step": 12693 }, { "epoch": 4.858017604286261, "grad_norm": 0.5102057456970215, "learning_rate": 4.249021860132208e-08, "loss": 0.531, "step": 12694 }, { "epoch": 4.8584003061615, "grad_norm": 0.5631064772605896, "learning_rate": 4.2262244439544855e-08, "loss": 0.6766, "step": 12695 }, { "epoch": 4.858783008036739, "grad_norm": 0.553795576095581, "learning_rate": 4.20348822075034e-08, "loss": 0.6224, "step": 12696 }, { "epoch": 4.859165709911979, "grad_norm": 0.5443204045295715, "learning_rate": 4.180813191917099e-08, "loss": 0.6093, "step": 12697 }, { "epoch": 4.859548411787218, "grad_norm": 0.5505012273788452, "learning_rate": 4.1581993588482025e-08, "loss": 0.5946, "step": 12698 }, { "epoch": 4.859931113662457, "grad_norm": 0.5724555850028992, "learning_rate": 4.135646722933206e-08, "loss": 0.5923, "step": 12699 }, { "epoch": 4.860313815537696, "grad_norm": 0.5673931241035461, "learning_rate": 4.113155285558113e-08, "loss": 0.5559, "step": 12700 }, { "epoch": 4.8606965174129355, "grad_norm": 0.5808496475219727, "learning_rate": 4.0907250481050377e-08, "loss": 0.686, "step": 12701 }, { "epoch": 4.861079219288174, "grad_norm": 0.5212671756744385, "learning_rate": 4.068356011952324e-08, "loss": 0.6523, "step": 12702 }, { "epoch": 4.861461921163413, "grad_norm": 0.5574178099632263, "learning_rate": 4.0460481784748706e-08, "loss": 0.5737, "step": 12703 }, { "epoch": 4.861844623038653, "grad_norm": 0.5676889419555664, "learning_rate": 4.0238015490432494e-08, "loss": 0.6807, "step": 12704 }, { "epoch": 4.862227324913892, "grad_norm": 0.5278597474098206, "learning_rate": 4.0016161250248096e-08, "loss": 0.5918, "step": 12705 }, { "epoch": 4.862610026789131, "grad_norm": 0.5199499130249023, "learning_rate": 3.979491907782684e-08, "loss": 0.5429, "step": 12706 }, { "epoch": 4.862992728664371, "grad_norm": 0.5250698924064636, "learning_rate": 3.957428898676674e-08, "loss": 0.5604, "step": 12707 }, { "epoch": 4.86337543053961, "grad_norm": 0.5426193475723267, "learning_rate": 3.935427099062472e-08, "loss": 0.5883, "step": 12708 }, { "epoch": 4.863758132414849, "grad_norm": 0.5472418665885925, "learning_rate": 3.913486510292219e-08, "loss": 0.6128, "step": 12709 }, { "epoch": 4.8641408342900885, "grad_norm": 0.5651639699935913, "learning_rate": 3.89160713371417e-08, "loss": 0.6298, "step": 12710 }, { "epoch": 4.864523536165327, "grad_norm": 0.5155590176582336, "learning_rate": 3.869788970672917e-08, "loss": 0.5628, "step": 12711 }, { "epoch": 4.864906238040566, "grad_norm": 0.5317453742027283, "learning_rate": 3.848032022509163e-08, "loss": 0.5928, "step": 12712 }, { "epoch": 4.865288939915805, "grad_norm": 0.5683351755142212, "learning_rate": 3.826336290560062e-08, "loss": 0.6472, "step": 12713 }, { "epoch": 4.865671641791045, "grad_norm": 0.5688779950141907, "learning_rate": 3.80470177615877e-08, "loss": 0.649, "step": 12714 }, { "epoch": 4.866054343666284, "grad_norm": 0.5157376527786255, "learning_rate": 3.783128480634779e-08, "loss": 0.5922, "step": 12715 }, { "epoch": 4.866437045541523, "grad_norm": 0.5592067241668701, "learning_rate": 3.761616405314028e-08, "loss": 0.6126, "step": 12716 }, { "epoch": 4.866819747416763, "grad_norm": 0.5242480635643005, "learning_rate": 3.7401655515181266e-08, "loss": 0.5636, "step": 12717 }, { "epoch": 4.867202449292002, "grad_norm": 0.5407953262329102, "learning_rate": 3.718775920565687e-08, "loss": 0.5836, "step": 12718 }, { "epoch": 4.8675851511672406, "grad_norm": 0.527029812335968, "learning_rate": 3.69744751377088e-08, "loss": 0.6225, "step": 12719 }, { "epoch": 4.8679678530424795, "grad_norm": 0.6000462174415588, "learning_rate": 3.6761803324444346e-08, "loss": 0.6515, "step": 12720 }, { "epoch": 4.868350554917719, "grad_norm": 0.548730194568634, "learning_rate": 3.6549743778934166e-08, "loss": 0.6338, "step": 12721 }, { "epoch": 4.868733256792958, "grad_norm": 0.5064654350280762, "learning_rate": 3.633829651420784e-08, "loss": 0.6056, "step": 12722 }, { "epoch": 4.869115958668197, "grad_norm": 0.5476255416870117, "learning_rate": 3.6127461543261635e-08, "loss": 0.6421, "step": 12723 }, { "epoch": 4.869498660543437, "grad_norm": 0.5551549792289734, "learning_rate": 3.5917238879049634e-08, "loss": 0.5956, "step": 12724 }, { "epoch": 4.869881362418676, "grad_norm": 0.5338268280029297, "learning_rate": 3.570762853449261e-08, "loss": 0.5149, "step": 12725 }, { "epoch": 4.870264064293915, "grad_norm": 0.5212820172309875, "learning_rate": 3.549863052247027e-08, "loss": 0.638, "step": 12726 }, { "epoch": 4.870646766169155, "grad_norm": 0.5369117856025696, "learning_rate": 3.529024485582677e-08, "loss": 0.5895, "step": 12727 }, { "epoch": 4.8710294680443935, "grad_norm": 0.5722334384918213, "learning_rate": 3.508247154736854e-08, "loss": 0.6403, "step": 12728 }, { "epoch": 4.871412169919632, "grad_norm": 0.5716758966445923, "learning_rate": 3.487531060986205e-08, "loss": 0.5936, "step": 12729 }, { "epoch": 4.871794871794872, "grad_norm": 0.608475923538208, "learning_rate": 3.4668762056039304e-08, "loss": 0.6726, "step": 12730 }, { "epoch": 4.872177573670111, "grad_norm": 0.5283510684967041, "learning_rate": 3.44628258985924e-08, "loss": 0.6377, "step": 12731 }, { "epoch": 4.87256027554535, "grad_norm": 0.5135639905929565, "learning_rate": 3.425750215017787e-08, "loss": 0.5336, "step": 12732 }, { "epoch": 4.872942977420589, "grad_norm": 0.563885509967804, "learning_rate": 3.405279082341228e-08, "loss": 0.5758, "step": 12733 }, { "epoch": 4.873325679295829, "grad_norm": 0.5399649143218994, "learning_rate": 3.38486919308767e-08, "loss": 0.6335, "step": 12734 }, { "epoch": 4.873708381171068, "grad_norm": 0.5472825169563293, "learning_rate": 3.3645205485112184e-08, "loss": 0.6318, "step": 12735 }, { "epoch": 4.874091083046307, "grad_norm": 0.5750558972358704, "learning_rate": 3.34423314986243e-08, "loss": 0.6379, "step": 12736 }, { "epoch": 4.8744737849215465, "grad_norm": 0.5446425676345825, "learning_rate": 3.324006998388085e-08, "loss": 0.6276, "step": 12737 }, { "epoch": 4.874856486796785, "grad_norm": 0.5631312131881714, "learning_rate": 3.3038420953310776e-08, "loss": 0.6197, "step": 12738 }, { "epoch": 4.875239188672024, "grad_norm": 0.5744243264198303, "learning_rate": 3.2837384419305285e-08, "loss": 0.6282, "step": 12739 }, { "epoch": 4.875621890547263, "grad_norm": 0.5576966404914856, "learning_rate": 3.263696039421893e-08, "loss": 0.5952, "step": 12740 }, { "epoch": 4.876004592422503, "grad_norm": 0.5547515153884888, "learning_rate": 3.243714889036853e-08, "loss": 0.5752, "step": 12741 }, { "epoch": 4.876387294297742, "grad_norm": 0.5490753650665283, "learning_rate": 3.2237949920034264e-08, "loss": 0.5294, "step": 12742 }, { "epoch": 4.876769996172981, "grad_norm": 0.5295710563659668, "learning_rate": 3.203936349545522e-08, "loss": 0.6101, "step": 12743 }, { "epoch": 4.877152698048221, "grad_norm": 0.5828664302825928, "learning_rate": 3.18413896288372e-08, "loss": 0.5775, "step": 12744 }, { "epoch": 4.87753539992346, "grad_norm": 0.5164065957069397, "learning_rate": 3.164402833234381e-08, "loss": 0.6278, "step": 12745 }, { "epoch": 4.877918101798699, "grad_norm": 0.5259318351745605, "learning_rate": 3.144727961810534e-08, "loss": 0.6634, "step": 12746 }, { "epoch": 4.878300803673938, "grad_norm": 0.5469813942909241, "learning_rate": 3.125114349821212e-08, "loss": 0.605, "step": 12747 }, { "epoch": 4.878683505549177, "grad_norm": 0.5478485226631165, "learning_rate": 3.1055619984716735e-08, "loss": 0.6532, "step": 12748 }, { "epoch": 4.879066207424416, "grad_norm": 0.5563852787017822, "learning_rate": 3.086070908963512e-08, "loss": 0.635, "step": 12749 }, { "epoch": 4.879448909299656, "grad_norm": 0.5081863403320312, "learning_rate": 3.066641082494548e-08, "loss": 0.6274, "step": 12750 }, { "epoch": 4.879831611174895, "grad_norm": 0.5301873683929443, "learning_rate": 3.047272520258715e-08, "loss": 0.6412, "step": 12751 }, { "epoch": 4.880214313050134, "grad_norm": 0.5539125204086304, "learning_rate": 3.027965223446172e-08, "loss": 0.584, "step": 12752 }, { "epoch": 4.880597014925373, "grad_norm": 0.5181247591972351, "learning_rate": 3.008719193243637e-08, "loss": 0.5802, "step": 12753 }, { "epoch": 4.880979716800613, "grad_norm": 0.535348653793335, "learning_rate": 2.989534430833718e-08, "loss": 0.5695, "step": 12754 }, { "epoch": 4.8813624186758515, "grad_norm": 0.5502045154571533, "learning_rate": 2.9704109373953625e-08, "loss": 0.6313, "step": 12755 }, { "epoch": 4.8817451205510904, "grad_norm": 0.5361356735229492, "learning_rate": 2.9513487141037412e-08, "loss": 0.5835, "step": 12756 }, { "epoch": 4.88212782242633, "grad_norm": 0.5976353287696838, "learning_rate": 2.9323477621303607e-08, "loss": 0.6211, "step": 12757 }, { "epoch": 4.882510524301569, "grad_norm": 0.5127227902412415, "learning_rate": 2.9134080826427326e-08, "loss": 0.5615, "step": 12758 }, { "epoch": 4.882893226176808, "grad_norm": 0.5321168303489685, "learning_rate": 2.8945296768048138e-08, "loss": 0.5636, "step": 12759 }, { "epoch": 4.883275928052047, "grad_norm": 0.5438576936721802, "learning_rate": 2.8757125457768985e-08, "loss": 0.598, "step": 12760 }, { "epoch": 4.883658629927287, "grad_norm": 0.5877348184585571, "learning_rate": 2.8569566907150625e-08, "loss": 0.733, "step": 12761 }, { "epoch": 4.884041331802526, "grad_norm": 0.5448612570762634, "learning_rate": 2.8382621127721612e-08, "loss": 0.6844, "step": 12762 }, { "epoch": 4.884424033677765, "grad_norm": 0.5288140773773193, "learning_rate": 2.819628813096831e-08, "loss": 0.5626, "step": 12763 }, { "epoch": 4.8848067355530045, "grad_norm": 0.5597026348114014, "learning_rate": 2.801056792834267e-08, "loss": 0.5944, "step": 12764 }, { "epoch": 4.885189437428243, "grad_norm": 0.5148041844367981, "learning_rate": 2.7825460531257788e-08, "loss": 0.5692, "step": 12765 }, { "epoch": 4.885572139303482, "grad_norm": 0.5552306771278381, "learning_rate": 2.7640965951087895e-08, "loss": 0.6347, "step": 12766 }, { "epoch": 4.885954841178722, "grad_norm": 0.5294095873832703, "learning_rate": 2.74570841991717e-08, "loss": 0.5921, "step": 12767 }, { "epoch": 4.886337543053961, "grad_norm": 0.5246250629425049, "learning_rate": 2.727381528680906e-08, "loss": 0.5351, "step": 12768 }, { "epoch": 4.8867202449292, "grad_norm": 0.5515176057815552, "learning_rate": 2.7091159225260953e-08, "loss": 0.6273, "step": 12769 }, { "epoch": 4.88710294680444, "grad_norm": 0.5389417409896851, "learning_rate": 2.6909116025753967e-08, "loss": 0.6623, "step": 12770 }, { "epoch": 4.887485648679679, "grad_norm": 0.5918572545051575, "learning_rate": 2.6727685699474704e-08, "loss": 0.6901, "step": 12771 }, { "epoch": 4.887868350554918, "grad_norm": 0.5259273648262024, "learning_rate": 2.6546868257570914e-08, "loss": 0.6164, "step": 12772 }, { "epoch": 4.888251052430157, "grad_norm": 0.4905908703804016, "learning_rate": 2.6366663711157036e-08, "loss": 0.5865, "step": 12773 }, { "epoch": 4.888633754305396, "grad_norm": 0.5232177376747131, "learning_rate": 2.618707207130533e-08, "loss": 0.666, "step": 12774 }, { "epoch": 4.889016456180635, "grad_norm": 0.6663294434547424, "learning_rate": 2.600809334905252e-08, "loss": 0.6606, "step": 12775 }, { "epoch": 4.889399158055874, "grad_norm": 0.5261683464050293, "learning_rate": 2.5829727555397587e-08, "loss": 0.5355, "step": 12776 }, { "epoch": 4.889781859931114, "grad_norm": 0.5684863924980164, "learning_rate": 2.5651974701301764e-08, "loss": 0.6379, "step": 12777 }, { "epoch": 4.890164561806353, "grad_norm": 0.5531014204025269, "learning_rate": 2.547483479768742e-08, "loss": 0.6769, "step": 12778 }, { "epoch": 4.890547263681592, "grad_norm": 0.542137861251831, "learning_rate": 2.5298307855440296e-08, "loss": 0.5617, "step": 12779 }, { "epoch": 4.890929965556831, "grad_norm": 0.5326817631721497, "learning_rate": 2.5122393885410602e-08, "loss": 0.6389, "step": 12780 }, { "epoch": 4.891312667432071, "grad_norm": 0.5077548623085022, "learning_rate": 2.4947092898406357e-08, "loss": 0.5951, "step": 12781 }, { "epoch": 4.8916953693073095, "grad_norm": 0.5216836929321289, "learning_rate": 2.4772404905201163e-08, "loss": 0.5869, "step": 12782 }, { "epoch": 4.8920780711825484, "grad_norm": 0.5692434906959534, "learning_rate": 2.4598329916529772e-08, "loss": 0.6554, "step": 12783 }, { "epoch": 4.892460773057788, "grad_norm": 0.529521644115448, "learning_rate": 2.442486794309029e-08, "loss": 0.5534, "step": 12784 }, { "epoch": 4.892843474933027, "grad_norm": 0.5422469973564148, "learning_rate": 2.425201899554197e-08, "loss": 0.5772, "step": 12785 }, { "epoch": 4.893226176808266, "grad_norm": 0.5911493301391602, "learning_rate": 2.4079783084507423e-08, "loss": 0.6994, "step": 12786 }, { "epoch": 4.893608878683506, "grad_norm": 0.5452778339385986, "learning_rate": 2.3908160220569298e-08, "loss": 0.6264, "step": 12787 }, { "epoch": 4.893991580558745, "grad_norm": 0.4927705228328705, "learning_rate": 2.373715041427582e-08, "loss": 0.5888, "step": 12788 }, { "epoch": 4.894374282433984, "grad_norm": 0.5855845808982849, "learning_rate": 2.3566753676135258e-08, "loss": 0.6564, "step": 12789 }, { "epoch": 4.894756984309224, "grad_norm": 0.5456482172012329, "learning_rate": 2.3396970016619225e-08, "loss": 0.6436, "step": 12790 }, { "epoch": 4.8951396861844625, "grad_norm": 0.6504916548728943, "learning_rate": 2.3227799446161602e-08, "loss": 0.6427, "step": 12791 }, { "epoch": 4.895522388059701, "grad_norm": 0.575125515460968, "learning_rate": 2.305924197515963e-08, "loss": 0.6393, "step": 12792 }, { "epoch": 4.89590508993494, "grad_norm": 0.5629921555519104, "learning_rate": 2.2891297613968356e-08, "loss": 0.6441, "step": 12793 }, { "epoch": 4.89628779181018, "grad_norm": 0.5437064170837402, "learning_rate": 2.2723966372910634e-08, "loss": 0.6586, "step": 12794 }, { "epoch": 4.896670493685419, "grad_norm": 0.5492035150527954, "learning_rate": 2.255724826226935e-08, "loss": 0.5641, "step": 12795 }, { "epoch": 4.897053195560658, "grad_norm": 0.5427319407463074, "learning_rate": 2.239114329228964e-08, "loss": 0.61, "step": 12796 }, { "epoch": 4.897435897435898, "grad_norm": 0.5387789607048035, "learning_rate": 2.2225651473177788e-08, "loss": 0.5502, "step": 12797 }, { "epoch": 4.897818599311137, "grad_norm": 0.523002028465271, "learning_rate": 2.2060772815105657e-08, "loss": 0.5852, "step": 12798 }, { "epoch": 4.898201301186376, "grad_norm": 0.562064528465271, "learning_rate": 2.1896507328205143e-08, "loss": 0.6691, "step": 12799 }, { "epoch": 4.898584003061615, "grad_norm": 0.5975819230079651, "learning_rate": 2.1732855022570388e-08, "loss": 0.6166, "step": 12800 }, { "epoch": 4.898966704936854, "grad_norm": 0.5579656362533569, "learning_rate": 2.1569815908256685e-08, "loss": 0.618, "step": 12801 }, { "epoch": 4.899349406812093, "grad_norm": 0.5798674821853638, "learning_rate": 2.140738999528602e-08, "loss": 0.6785, "step": 12802 }, { "epoch": 4.899732108687332, "grad_norm": 0.5367173552513123, "learning_rate": 2.1245577293638187e-08, "loss": 0.5969, "step": 12803 }, { "epoch": 4.900114810562572, "grad_norm": 0.5354813933372498, "learning_rate": 2.1084377813257452e-08, "loss": 0.6414, "step": 12804 }, { "epoch": 4.900497512437811, "grad_norm": 0.5610390901565552, "learning_rate": 2.0923791564050333e-08, "loss": 0.5927, "step": 12805 }, { "epoch": 4.90088021431305, "grad_norm": 0.5122501254081726, "learning_rate": 2.0763818555884497e-08, "loss": 0.6304, "step": 12806 }, { "epoch": 4.90126291618829, "grad_norm": 0.5374773144721985, "learning_rate": 2.0604458798592077e-08, "loss": 0.6239, "step": 12807 }, { "epoch": 4.901645618063529, "grad_norm": 0.5589026212692261, "learning_rate": 2.0445712301964126e-08, "loss": 0.641, "step": 12808 }, { "epoch": 4.9020283199387675, "grad_norm": 0.5501976609230042, "learning_rate": 2.028757907575729e-08, "loss": 0.5997, "step": 12809 }, { "epoch": 4.902411021814007, "grad_norm": 0.567398190498352, "learning_rate": 2.013005912968935e-08, "loss": 0.6344, "step": 12810 }, { "epoch": 4.902793723689246, "grad_norm": 0.5378836989402771, "learning_rate": 1.9973152473439227e-08, "loss": 0.5549, "step": 12811 }, { "epoch": 4.903176425564485, "grad_norm": 0.582241952419281, "learning_rate": 1.981685911665143e-08, "loss": 0.6472, "step": 12812 }, { "epoch": 4.903559127439724, "grad_norm": 0.5163405537605286, "learning_rate": 1.9661179068928283e-08, "loss": 0.615, "step": 12813 }, { "epoch": 4.903941829314964, "grad_norm": 0.5218581557273865, "learning_rate": 1.950611233983879e-08, "loss": 0.5257, "step": 12814 }, { "epoch": 4.904324531190203, "grad_norm": 0.5443541407585144, "learning_rate": 1.935165893891089e-08, "loss": 0.6124, "step": 12815 }, { "epoch": 4.904707233065442, "grad_norm": 0.5725446343421936, "learning_rate": 1.9197818875635876e-08, "loss": 0.6489, "step": 12816 }, { "epoch": 4.905089934940682, "grad_norm": 0.5530892014503479, "learning_rate": 1.904459215946841e-08, "loss": 0.6709, "step": 12817 }, { "epoch": 4.9054726368159205, "grad_norm": 0.5504615306854248, "learning_rate": 1.8891978799825407e-08, "loss": 0.669, "step": 12818 }, { "epoch": 4.905855338691159, "grad_norm": 0.5029222965240479, "learning_rate": 1.8739978806082693e-08, "loss": 0.6948, "step": 12819 }, { "epoch": 4.906238040566398, "grad_norm": 0.5326550602912903, "learning_rate": 1.858859218758391e-08, "loss": 0.5772, "step": 12820 }, { "epoch": 4.906620742441638, "grad_norm": 0.5254983901977539, "learning_rate": 1.8437818953631614e-08, "loss": 0.6098, "step": 12821 }, { "epoch": 4.907003444316877, "grad_norm": 0.5672082901000977, "learning_rate": 1.828765911348951e-08, "loss": 0.6156, "step": 12822 }, { "epoch": 4.907386146192116, "grad_norm": 0.5424613952636719, "learning_rate": 1.8138112676386877e-08, "loss": 0.6462, "step": 12823 }, { "epoch": 4.907768848067356, "grad_norm": 0.6061156988143921, "learning_rate": 1.7989179651514143e-08, "loss": 0.6157, "step": 12824 }, { "epoch": 4.908151549942595, "grad_norm": 0.5369955897331238, "learning_rate": 1.784086004802288e-08, "loss": 0.6583, "step": 12825 }, { "epoch": 4.908534251817834, "grad_norm": 0.5245370864868164, "learning_rate": 1.7693153875026904e-08, "loss": 0.5479, "step": 12826 }, { "epoch": 4.9089169536930735, "grad_norm": 0.5319206714630127, "learning_rate": 1.7546061141603398e-08, "loss": 0.581, "step": 12827 }, { "epoch": 4.909299655568312, "grad_norm": 0.5658556222915649, "learning_rate": 1.7399581856794025e-08, "loss": 0.7318, "step": 12828 }, { "epoch": 4.909682357443551, "grad_norm": 0.5786296129226685, "learning_rate": 1.7253716029597133e-08, "loss": 0.6533, "step": 12829 }, { "epoch": 4.910065059318791, "grad_norm": 0.5570720434188843, "learning_rate": 1.710846366897889e-08, "loss": 0.7206, "step": 12830 }, { "epoch": 4.91044776119403, "grad_norm": 0.6109384298324585, "learning_rate": 1.696382478386327e-08, "loss": 0.6903, "step": 12831 }, { "epoch": 4.910830463069269, "grad_norm": 0.5300262570381165, "learning_rate": 1.6819799383140934e-08, "loss": 0.6123, "step": 12832 }, { "epoch": 4.911213164944508, "grad_norm": 0.5443320870399475, "learning_rate": 1.667638747566147e-08, "loss": 0.6056, "step": 12833 }, { "epoch": 4.911595866819748, "grad_norm": 0.4957999587059021, "learning_rate": 1.653358907023783e-08, "loss": 0.5628, "step": 12834 }, { "epoch": 4.911978568694987, "grad_norm": 0.5241764783859253, "learning_rate": 1.639140417564633e-08, "loss": 0.5939, "step": 12835 }, { "epoch": 4.9123612705702255, "grad_norm": 0.5365626811981201, "learning_rate": 1.624983280062331e-08, "loss": 0.6472, "step": 12836 }, { "epoch": 4.912743972445465, "grad_norm": 0.5630788207054138, "learning_rate": 1.610887495386848e-08, "loss": 0.6622, "step": 12837 }, { "epoch": 4.913126674320704, "grad_norm": 0.5262974500656128, "learning_rate": 1.5968530644046022e-08, "loss": 0.5731, "step": 12838 }, { "epoch": 4.913509376195943, "grad_norm": 0.5625216960906982, "learning_rate": 1.5828799879779034e-08, "loss": 0.6497, "step": 12839 }, { "epoch": 4.913892078071182, "grad_norm": 0.5037396550178528, "learning_rate": 1.5689682669655094e-08, "loss": 0.6018, "step": 12840 }, { "epoch": 4.914274779946422, "grad_norm": 0.5580479502677917, "learning_rate": 1.5551179022221806e-08, "loss": 0.6873, "step": 12841 }, { "epoch": 4.914657481821661, "grad_norm": 0.5723142623901367, "learning_rate": 1.5413288945992365e-08, "loss": 0.6144, "step": 12842 }, { "epoch": 4.9150401836969, "grad_norm": 0.5476597547531128, "learning_rate": 1.527601244943999e-08, "loss": 0.6341, "step": 12843 }, { "epoch": 4.91542288557214, "grad_norm": 0.5641941428184509, "learning_rate": 1.513934954100127e-08, "loss": 0.6272, "step": 12844 }, { "epoch": 4.9158055874473785, "grad_norm": 0.5717250108718872, "learning_rate": 1.5003300229072816e-08, "loss": 0.6035, "step": 12845 }, { "epoch": 4.916188289322617, "grad_norm": 0.5218536853790283, "learning_rate": 1.4867864522015717e-08, "loss": 0.5975, "step": 12846 }, { "epoch": 4.916570991197857, "grad_norm": 0.5555984973907471, "learning_rate": 1.4733042428154431e-08, "loss": 0.7039, "step": 12847 }, { "epoch": 4.916953693073096, "grad_norm": 0.5707993507385254, "learning_rate": 1.4598833955772329e-08, "loss": 0.6012, "step": 12848 }, { "epoch": 4.917336394948335, "grad_norm": 0.5125514268875122, "learning_rate": 1.4465239113117257e-08, "loss": 0.6189, "step": 12849 }, { "epoch": 4.917719096823575, "grad_norm": 0.5583671927452087, "learning_rate": 1.4332257908399316e-08, "loss": 0.6329, "step": 12850 }, { "epoch": 4.918101798698814, "grad_norm": 0.5345807671546936, "learning_rate": 1.4199890349790858e-08, "loss": 0.6106, "step": 12851 }, { "epoch": 4.918484500574053, "grad_norm": 0.4772335886955261, "learning_rate": 1.4068136445425374e-08, "loss": 0.551, "step": 12852 }, { "epoch": 4.918867202449292, "grad_norm": 0.5580195784568787, "learning_rate": 1.3936996203400832e-08, "loss": 0.592, "step": 12853 }, { "epoch": 4.9192499043245315, "grad_norm": 0.5729390382766724, "learning_rate": 1.3806469631775232e-08, "loss": 0.6236, "step": 12854 }, { "epoch": 4.91963260619977, "grad_norm": 0.5325219035148621, "learning_rate": 1.3676556738568825e-08, "loss": 0.5994, "step": 12855 }, { "epoch": 4.920015308075009, "grad_norm": 0.5381340384483337, "learning_rate": 1.3547257531767444e-08, "loss": 0.6306, "step": 12856 }, { "epoch": 4.920398009950249, "grad_norm": 0.5133954286575317, "learning_rate": 1.3418572019314735e-08, "loss": 0.6172, "step": 12857 }, { "epoch": 4.920780711825488, "grad_norm": 0.5465015172958374, "learning_rate": 1.3290500209119927e-08, "loss": 0.6357, "step": 12858 }, { "epoch": 4.921163413700727, "grad_norm": 0.5165016055107117, "learning_rate": 1.3163042109053393e-08, "loss": 0.5746, "step": 12859 }, { "epoch": 4.921546115575966, "grad_norm": 0.5305241346359253, "learning_rate": 1.3036197726947753e-08, "loss": 0.5819, "step": 12860 }, { "epoch": 4.921928817451206, "grad_norm": 0.5395466089248657, "learning_rate": 1.2909967070596774e-08, "loss": 0.5408, "step": 12861 }, { "epoch": 4.922311519326445, "grad_norm": 0.5720908045768738, "learning_rate": 1.2784350147758695e-08, "loss": 0.6128, "step": 12862 }, { "epoch": 4.9226942212016835, "grad_norm": 0.5699158906936646, "learning_rate": 1.2659346966152897e-08, "loss": 0.63, "step": 12863 }, { "epoch": 4.923076923076923, "grad_norm": 0.5428200960159302, "learning_rate": 1.2534957533461012e-08, "loss": 0.6713, "step": 12864 }, { "epoch": 4.923459624952162, "grad_norm": 0.5342931747436523, "learning_rate": 1.2411181857328036e-08, "loss": 0.6359, "step": 12865 }, { "epoch": 4.923842326827401, "grad_norm": 0.5611650943756104, "learning_rate": 1.2288019945358998e-08, "loss": 0.6006, "step": 12866 }, { "epoch": 4.924225028702641, "grad_norm": 0.5461639761924744, "learning_rate": 1.2165471805122286e-08, "loss": 0.6143, "step": 12867 }, { "epoch": 4.92460773057788, "grad_norm": 0.5698415040969849, "learning_rate": 1.2043537444150765e-08, "loss": 0.6464, "step": 12868 }, { "epoch": 4.924990432453119, "grad_norm": 0.5328624248504639, "learning_rate": 1.192221686993511e-08, "loss": 0.6467, "step": 12869 }, { "epoch": 4.925373134328359, "grad_norm": 0.5816605687141418, "learning_rate": 1.1801510089932689e-08, "loss": 0.6194, "step": 12870 }, { "epoch": 4.925755836203598, "grad_norm": 0.5726465582847595, "learning_rate": 1.1681417111560899e-08, "loss": 0.6754, "step": 12871 }, { "epoch": 4.9261385380788365, "grad_norm": 0.5408940315246582, "learning_rate": 1.1561937942199398e-08, "loss": 0.5632, "step": 12872 }, { "epoch": 4.926521239954075, "grad_norm": 0.5313767194747925, "learning_rate": 1.1443072589191195e-08, "loss": 0.6121, "step": 12873 }, { "epoch": 4.926903941829315, "grad_norm": 0.6035058498382568, "learning_rate": 1.1324821059839342e-08, "loss": 0.6459, "step": 12874 }, { "epoch": 4.927286643704554, "grad_norm": 0.5412118434906006, "learning_rate": 1.1207183361412466e-08, "loss": 0.5747, "step": 12875 }, { "epoch": 4.927669345579793, "grad_norm": 0.6096402406692505, "learning_rate": 1.1090159501138121e-08, "loss": 0.5018, "step": 12876 }, { "epoch": 4.928052047455033, "grad_norm": 0.5372272729873657, "learning_rate": 1.0973749486209439e-08, "loss": 0.6186, "step": 12877 }, { "epoch": 4.928434749330272, "grad_norm": 0.539186954498291, "learning_rate": 1.085795332377959e-08, "loss": 0.5746, "step": 12878 }, { "epoch": 4.928817451205511, "grad_norm": 0.5216759443283081, "learning_rate": 1.0742771020962883e-08, "loss": 0.6208, "step": 12879 }, { "epoch": 4.92920015308075, "grad_norm": 0.5200689435005188, "learning_rate": 1.062820258483921e-08, "loss": 0.5292, "step": 12880 }, { "epoch": 4.9295828549559895, "grad_norm": 0.8393425345420837, "learning_rate": 1.0514248022449603e-08, "loss": 0.6029, "step": 12881 }, { "epoch": 4.929965556831228, "grad_norm": 0.5491442680358887, "learning_rate": 1.040090734079624e-08, "loss": 0.7314, "step": 12882 }, { "epoch": 4.930348258706467, "grad_norm": 0.532185971736908, "learning_rate": 1.0288180546843551e-08, "loss": 0.6183, "step": 12883 }, { "epoch": 4.930730960581707, "grad_norm": 0.5452542901039124, "learning_rate": 1.0176067647519327e-08, "loss": 0.6209, "step": 12884 }, { "epoch": 4.931113662456946, "grad_norm": 0.5033364295959473, "learning_rate": 1.0064568649713614e-08, "loss": 0.6064, "step": 12885 }, { "epoch": 4.931496364332185, "grad_norm": 0.5417090058326721, "learning_rate": 9.953683560277594e-09, "loss": 0.6633, "step": 12886 }, { "epoch": 4.931879066207425, "grad_norm": 0.5553765296936035, "learning_rate": 9.843412386025819e-09, "loss": 0.617, "step": 12887 }, { "epoch": 4.932261768082664, "grad_norm": 0.5401895046234131, "learning_rate": 9.73375513373509e-09, "loss": 0.7215, "step": 12888 }, { "epoch": 4.932644469957903, "grad_norm": 0.5525054335594177, "learning_rate": 9.62471181014446e-09, "loss": 0.6555, "step": 12889 }, { "epoch": 4.933027171833142, "grad_norm": 0.5678810477256775, "learning_rate": 9.516282421954126e-09, "loss": 0.6615, "step": 12890 }, { "epoch": 4.933409873708381, "grad_norm": 0.5871012806892395, "learning_rate": 9.408466975826536e-09, "loss": 0.6043, "step": 12891 }, { "epoch": 4.93379257558362, "grad_norm": 0.5240786671638489, "learning_rate": 9.30126547838861e-09, "loss": 0.639, "step": 12892 }, { "epoch": 4.934175277458859, "grad_norm": 0.5061721801757812, "learning_rate": 9.194677936228414e-09, "loss": 0.5863, "step": 12893 }, { "epoch": 4.934557979334099, "grad_norm": 0.558638870716095, "learning_rate": 9.088704355895151e-09, "loss": 0.6483, "step": 12894 }, { "epoch": 4.934940681209338, "grad_norm": 0.5382723808288574, "learning_rate": 8.98334474390139e-09, "loss": 0.6419, "step": 12895 }, { "epoch": 4.935323383084577, "grad_norm": 0.5288630127906799, "learning_rate": 8.878599106723062e-09, "loss": 0.5676, "step": 12896 }, { "epoch": 4.935706084959817, "grad_norm": 0.5322120785713196, "learning_rate": 8.77446745079391e-09, "loss": 0.5928, "step": 12897 }, { "epoch": 4.936088786835056, "grad_norm": 0.5263005495071411, "learning_rate": 8.67094978251659e-09, "loss": 0.6034, "step": 12898 }, { "epoch": 4.9364714887102945, "grad_norm": 0.5696966052055359, "learning_rate": 8.56804610825046e-09, "loss": 0.6099, "step": 12899 }, { "epoch": 4.936854190585533, "grad_norm": 0.5801308155059814, "learning_rate": 8.46575643431935e-09, "loss": 0.7007, "step": 12900 }, { "epoch": 4.937236892460773, "grad_norm": 0.5276888012886047, "learning_rate": 8.364080767009341e-09, "loss": 0.5862, "step": 12901 }, { "epoch": 4.937619594336012, "grad_norm": 0.49935486912727356, "learning_rate": 8.263019112569882e-09, "loss": 0.5589, "step": 12902 }, { "epoch": 4.938002296211251, "grad_norm": 0.5134600400924683, "learning_rate": 8.162571477210445e-09, "loss": 0.5911, "step": 12903 }, { "epoch": 4.938384998086491, "grad_norm": 0.6179021000862122, "learning_rate": 8.062737867103876e-09, "loss": 0.628, "step": 12904 }, { "epoch": 4.93876769996173, "grad_norm": 0.5536340475082397, "learning_rate": 7.963518288385264e-09, "loss": 0.6491, "step": 12905 }, { "epoch": 4.939150401836969, "grad_norm": 0.5179790258407593, "learning_rate": 7.864912747151953e-09, "loss": 0.6003, "step": 12906 }, { "epoch": 4.9395331037122086, "grad_norm": 0.5545229911804199, "learning_rate": 7.766921249463543e-09, "loss": 0.5243, "step": 12907 }, { "epoch": 4.9399158055874475, "grad_norm": 0.5168160200119019, "learning_rate": 7.669543801341883e-09, "loss": 0.7007, "step": 12908 }, { "epoch": 4.940298507462686, "grad_norm": 0.5225627422332764, "learning_rate": 7.572780408772185e-09, "loss": 0.5631, "step": 12909 }, { "epoch": 4.940681209337926, "grad_norm": 0.5677005052566528, "learning_rate": 7.476631077698582e-09, "loss": 0.6256, "step": 12910 }, { "epoch": 4.941063911213165, "grad_norm": 0.6054744124412537, "learning_rate": 7.381095814029682e-09, "loss": 0.6483, "step": 12911 }, { "epoch": 4.941446613088404, "grad_norm": 0.5485429167747498, "learning_rate": 7.286174623639675e-09, "loss": 0.6204, "step": 12912 }, { "epoch": 4.941829314963643, "grad_norm": 0.5485870242118835, "learning_rate": 7.191867512358341e-09, "loss": 0.5808, "step": 12913 }, { "epoch": 4.942212016838883, "grad_norm": 0.5474002361297607, "learning_rate": 7.098174485982157e-09, "loss": 0.6104, "step": 12914 }, { "epoch": 4.942594718714122, "grad_norm": 0.597926139831543, "learning_rate": 7.0050955502687365e-09, "loss": 0.6235, "step": 12915 }, { "epoch": 4.942977420589361, "grad_norm": 0.5299083590507507, "learning_rate": 6.912630710939061e-09, "loss": 0.5868, "step": 12916 }, { "epoch": 4.9433601224646, "grad_norm": 0.6006332635879517, "learning_rate": 6.820779973674141e-09, "loss": 0.6111, "step": 12917 }, { "epoch": 4.943742824339839, "grad_norm": 0.5418727397918701, "learning_rate": 6.72954334411946e-09, "loss": 0.6275, "step": 12918 }, { "epoch": 4.944125526215078, "grad_norm": 0.5472028255462646, "learning_rate": 6.638920827880535e-09, "loss": 0.6486, "step": 12919 }, { "epoch": 4.944508228090317, "grad_norm": 0.518240213394165, "learning_rate": 6.548912430527354e-09, "loss": 0.6677, "step": 12920 }, { "epoch": 4.944890929965557, "grad_norm": 0.5145532488822937, "learning_rate": 6.4595181575910496e-09, "loss": 0.5632, "step": 12921 }, { "epoch": 4.945273631840796, "grad_norm": 0.5427786707878113, "learning_rate": 6.370738014563893e-09, "loss": 0.6376, "step": 12922 }, { "epoch": 4.945656333716035, "grad_norm": 0.5950617790222168, "learning_rate": 6.282572006903742e-09, "loss": 0.6479, "step": 12923 }, { "epoch": 4.946039035591275, "grad_norm": 0.5185495018959045, "learning_rate": 6.195020140027375e-09, "loss": 0.6422, "step": 12924 }, { "epoch": 4.946421737466514, "grad_norm": 0.5088520646095276, "learning_rate": 6.108082419314931e-09, "loss": 0.5945, "step": 12925 }, { "epoch": 4.9468044393417525, "grad_norm": 0.5481089949607849, "learning_rate": 6.021758850108805e-09, "loss": 0.5832, "step": 12926 }, { "epoch": 4.947187141216992, "grad_norm": 0.7647290825843811, "learning_rate": 5.936049437714753e-09, "loss": 0.678, "step": 12927 }, { "epoch": 4.947569843092231, "grad_norm": 0.5391228795051575, "learning_rate": 5.850954187399671e-09, "loss": 0.6534, "step": 12928 }, { "epoch": 4.94795254496747, "grad_norm": 0.5040662288665771, "learning_rate": 5.766473104392712e-09, "loss": 0.6234, "step": 12929 }, { "epoch": 4.94833524684271, "grad_norm": 0.5774620175361633, "learning_rate": 5.682606193885276e-09, "loss": 0.6347, "step": 12930 }, { "epoch": 4.948717948717949, "grad_norm": 0.5395097732543945, "learning_rate": 5.5993534610310205e-09, "loss": 0.6269, "step": 12931 }, { "epoch": 4.949100650593188, "grad_norm": 0.5139246582984924, "learning_rate": 5.516714910946963e-09, "loss": 0.5939, "step": 12932 }, { "epoch": 4.949483352468427, "grad_norm": 0.5645142793655396, "learning_rate": 5.4346905487101534e-09, "loss": 0.5974, "step": 12933 }, { "epoch": 4.9498660543436666, "grad_norm": 0.5102503895759583, "learning_rate": 5.3532803793621134e-09, "loss": 0.6563, "step": 12934 }, { "epoch": 4.9502487562189055, "grad_norm": 0.5587743520736694, "learning_rate": 5.27248440790662e-09, "loss": 0.6168, "step": 12935 }, { "epoch": 4.950631458094144, "grad_norm": 0.5335599780082703, "learning_rate": 5.192302639306368e-09, "loss": 0.6075, "step": 12936 }, { "epoch": 4.951014159969384, "grad_norm": 0.5884761810302734, "learning_rate": 5.112735078489639e-09, "loss": 0.6213, "step": 12937 }, { "epoch": 4.951396861844623, "grad_norm": 0.5177172422409058, "learning_rate": 5.033781730348075e-09, "loss": 0.5643, "step": 12938 }, { "epoch": 4.951779563719862, "grad_norm": 0.5270063281059265, "learning_rate": 4.955442599731131e-09, "loss": 0.6381, "step": 12939 }, { "epoch": 4.952162265595101, "grad_norm": 0.5058334469795227, "learning_rate": 4.877717691453843e-09, "loss": 0.5758, "step": 12940 }, { "epoch": 4.952544967470341, "grad_norm": 0.536816418170929, "learning_rate": 4.800607010292391e-09, "loss": 0.6295, "step": 12941 }, { "epoch": 4.95292766934558, "grad_norm": 0.5273334383964539, "learning_rate": 4.724110560986317e-09, "loss": 0.6365, "step": 12942 }, { "epoch": 4.953310371220819, "grad_norm": 0.5553278923034668, "learning_rate": 4.648228348235195e-09, "loss": 0.648, "step": 12943 }, { "epoch": 4.953693073096058, "grad_norm": 0.5669435262680054, "learning_rate": 4.572960376703073e-09, "loss": 0.6786, "step": 12944 }, { "epoch": 4.954075774971297, "grad_norm": 0.5645465850830078, "learning_rate": 4.4983066510151385e-09, "loss": 0.6551, "step": 12945 }, { "epoch": 4.954458476846536, "grad_norm": 0.5701251029968262, "learning_rate": 4.424267175758834e-09, "loss": 0.6498, "step": 12946 }, { "epoch": 4.954841178721776, "grad_norm": 0.5176231265068054, "learning_rate": 4.350841955484963e-09, "loss": 0.6118, "step": 12947 }, { "epoch": 4.955223880597015, "grad_norm": 0.5634961724281311, "learning_rate": 4.278030994704363e-09, "loss": 0.6474, "step": 12948 }, { "epoch": 4.955606582472254, "grad_norm": 0.550102949142456, "learning_rate": 4.205834297892342e-09, "loss": 0.6685, "step": 12949 }, { "epoch": 4.955989284347494, "grad_norm": 0.5033911466598511, "learning_rate": 4.13425186948535e-09, "loss": 0.5128, "step": 12950 }, { "epoch": 4.956371986222733, "grad_norm": 0.544337809085846, "learning_rate": 4.063283713883203e-09, "loss": 0.619, "step": 12951 }, { "epoch": 4.956754688097972, "grad_norm": 0.5677129030227661, "learning_rate": 3.992929835444637e-09, "loss": 0.6468, "step": 12952 }, { "epoch": 4.9571373899732105, "grad_norm": 0.5622820854187012, "learning_rate": 3.923190238496188e-09, "loss": 0.528, "step": 12953 }, { "epoch": 4.95752009184845, "grad_norm": 0.5304381251335144, "learning_rate": 3.854064927321099e-09, "loss": 0.5947, "step": 12954 }, { "epoch": 4.957902793723689, "grad_norm": 0.6196551322937012, "learning_rate": 3.785553906169303e-09, "loss": 0.595, "step": 12955 }, { "epoch": 4.958285495598928, "grad_norm": 0.5427524447441101, "learning_rate": 3.717657179248546e-09, "loss": 0.5934, "step": 12956 }, { "epoch": 4.958668197474168, "grad_norm": 0.5272826552391052, "learning_rate": 3.6503747507343754e-09, "loss": 0.6499, "step": 12957 }, { "epoch": 4.959050899349407, "grad_norm": 0.5631623864173889, "learning_rate": 3.5837066247590423e-09, "loss": 0.6187, "step": 12958 }, { "epoch": 4.959433601224646, "grad_norm": 0.5859520435333252, "learning_rate": 3.5176528054192695e-09, "loss": 0.5794, "step": 12959 }, { "epoch": 4.959816303099885, "grad_norm": 0.5403589010238647, "learning_rate": 3.4522132967762524e-09, "loss": 0.6433, "step": 12960 }, { "epoch": 4.960199004975125, "grad_norm": 0.602813720703125, "learning_rate": 3.3873881028501087e-09, "loss": 0.6411, "step": 12961 }, { "epoch": 4.9605817068503635, "grad_norm": 0.5528770089149475, "learning_rate": 3.323177227624319e-09, "loss": 0.6328, "step": 12962 }, { "epoch": 4.960964408725602, "grad_norm": 0.6400797367095947, "learning_rate": 3.259580675045726e-09, "loss": 0.5756, "step": 12963 }, { "epoch": 4.961347110600842, "grad_norm": 0.5654847621917725, "learning_rate": 3.1965984490223145e-09, "loss": 0.6269, "step": 12964 }, { "epoch": 4.961729812476081, "grad_norm": 0.6670234203338623, "learning_rate": 3.134230553423212e-09, "loss": 0.6709, "step": 12965 }, { "epoch": 4.96211251435132, "grad_norm": 0.5844568610191345, "learning_rate": 3.072476992083129e-09, "loss": 0.6232, "step": 12966 }, { "epoch": 4.96249521622656, "grad_norm": 0.605331301689148, "learning_rate": 3.011337768795697e-09, "loss": 0.6197, "step": 12967 }, { "epoch": 4.962877918101799, "grad_norm": 0.5598270893096924, "learning_rate": 2.950812887317911e-09, "loss": 0.5884, "step": 12968 }, { "epoch": 4.963260619977038, "grad_norm": 0.5420481562614441, "learning_rate": 2.8909023513701283e-09, "loss": 0.6516, "step": 12969 }, { "epoch": 4.9636433218522775, "grad_norm": 0.5402905344963074, "learning_rate": 2.831606164633849e-09, "loss": 0.5938, "step": 12970 }, { "epoch": 4.9640260237275164, "grad_norm": 0.549258291721344, "learning_rate": 2.772924330751714e-09, "loss": 0.5665, "step": 12971 }, { "epoch": 4.964408725602755, "grad_norm": 0.5298993587493896, "learning_rate": 2.7148568533319485e-09, "loss": 0.6322, "step": 12972 }, { "epoch": 4.964791427477994, "grad_norm": 0.5464409589767456, "learning_rate": 2.6574037359405893e-09, "loss": 0.6734, "step": 12973 }, { "epoch": 4.965174129353234, "grad_norm": 0.5521329045295715, "learning_rate": 2.600564982110365e-09, "loss": 0.5817, "step": 12974 }, { "epoch": 4.965556831228473, "grad_norm": 0.5857043862342834, "learning_rate": 2.544340595332928e-09, "loss": 0.6094, "step": 12975 }, { "epoch": 4.965939533103712, "grad_norm": 0.5321537256240845, "learning_rate": 2.488730579064402e-09, "loss": 0.6606, "step": 12976 }, { "epoch": 4.966322234978952, "grad_norm": 0.5325576663017273, "learning_rate": 2.433734936720944e-09, "loss": 0.6843, "step": 12977 }, { "epoch": 4.966704936854191, "grad_norm": 0.5382782816886902, "learning_rate": 2.3793536716831823e-09, "loss": 0.5265, "step": 12978 }, { "epoch": 4.96708763872943, "grad_norm": 0.5042755603790283, "learning_rate": 2.3255867872928885e-09, "loss": 0.6068, "step": 12979 }, { "epoch": 4.9674703406046685, "grad_norm": 0.581205427646637, "learning_rate": 2.2724342868529757e-09, "loss": 0.6664, "step": 12980 }, { "epoch": 4.967853042479908, "grad_norm": 0.5370177626609802, "learning_rate": 2.2198961736319415e-09, "loss": 0.6705, "step": 12981 }, { "epoch": 4.968235744355147, "grad_norm": 0.5587920546531677, "learning_rate": 2.167972450856093e-09, "loss": 0.5674, "step": 12982 }, { "epoch": 4.968618446230386, "grad_norm": 0.5590201616287231, "learning_rate": 2.116663121718432e-09, "loss": 0.637, "step": 12983 }, { "epoch": 4.969001148105626, "grad_norm": 0.5462186932563782, "learning_rate": 2.0659681893708816e-09, "loss": 0.6349, "step": 12984 }, { "epoch": 4.969383849980865, "grad_norm": 0.5405112504959106, "learning_rate": 2.0158876569287277e-09, "loss": 0.5731, "step": 12985 }, { "epoch": 4.969766551856104, "grad_norm": 0.5245516896247864, "learning_rate": 1.966421527469509e-09, "loss": 0.6201, "step": 12986 }, { "epoch": 4.970149253731344, "grad_norm": 0.5654741525650024, "learning_rate": 1.917569804033015e-09, "loss": 0.5988, "step": 12987 }, { "epoch": 4.970531955606583, "grad_norm": 0.5400444865226746, "learning_rate": 1.8693324896224e-09, "loss": 0.7008, "step": 12988 }, { "epoch": 4.9709146574818215, "grad_norm": 0.5653648972511292, "learning_rate": 1.8217095872008483e-09, "loss": 0.629, "step": 12989 }, { "epoch": 4.971297359357061, "grad_norm": 0.577028214931488, "learning_rate": 1.7747010996949087e-09, "loss": 0.6682, "step": 12990 }, { "epoch": 4.9716800612323, "grad_norm": 0.5616087317466736, "learning_rate": 1.7283070299944916e-09, "loss": 0.5688, "step": 12991 }, { "epoch": 4.972062763107539, "grad_norm": 0.5666642785072327, "learning_rate": 1.6825273809495391e-09, "loss": 0.6331, "step": 12992 }, { "epoch": 4.972445464982778, "grad_norm": 0.5855655670166016, "learning_rate": 1.6373621553733565e-09, "loss": 0.6428, "step": 12993 }, { "epoch": 4.972828166858018, "grad_norm": 0.5872904062271118, "learning_rate": 1.5928113560415016e-09, "loss": 0.6315, "step": 12994 }, { "epoch": 4.973210868733257, "grad_norm": 0.538945198059082, "learning_rate": 1.5488749856928943e-09, "loss": 0.5736, "step": 12995 }, { "epoch": 4.973593570608496, "grad_norm": 0.546280562877655, "learning_rate": 1.5055530470264867e-09, "loss": 0.67, "step": 12996 }, { "epoch": 4.9739762724837355, "grad_norm": 0.49727603793144226, "learning_rate": 1.462845542704594e-09, "loss": 0.5894, "step": 12997 }, { "epoch": 4.9743589743589745, "grad_norm": 1.0400619506835938, "learning_rate": 1.420752475350673e-09, "loss": 0.7047, "step": 12998 }, { "epoch": 4.974741676234213, "grad_norm": 0.564212441444397, "learning_rate": 1.3792738475537636e-09, "loss": 0.6522, "step": 12999 }, { "epoch": 4.975124378109452, "grad_norm": 0.49532148241996765, "learning_rate": 1.3384096618618281e-09, "loss": 0.608, "step": 13000 }, { "epoch": 4.975507079984692, "grad_norm": 0.5278918743133545, "learning_rate": 1.2981599207850803e-09, "loss": 0.5575, "step": 13001 }, { "epoch": 4.975889781859931, "grad_norm": 0.5915699601173401, "learning_rate": 1.2585246267982077e-09, "loss": 0.7467, "step": 13002 }, { "epoch": 4.97627248373517, "grad_norm": 0.5826804637908936, "learning_rate": 1.2195037823370392e-09, "loss": 0.7204, "step": 13003 }, { "epoch": 4.97665518561041, "grad_norm": 0.5376716256141663, "learning_rate": 1.1810973897985467e-09, "loss": 0.5367, "step": 13004 }, { "epoch": 4.977037887485649, "grad_norm": 0.7021266222000122, "learning_rate": 1.1433054515430641e-09, "loss": 0.7042, "step": 13005 }, { "epoch": 4.977420589360888, "grad_norm": 0.5523099303245544, "learning_rate": 1.1061279698942884e-09, "loss": 0.6138, "step": 13006 }, { "epoch": 4.977803291236127, "grad_norm": 0.5706189274787903, "learning_rate": 1.069564947134838e-09, "loss": 0.5941, "step": 13007 }, { "epoch": 4.978185993111366, "grad_norm": 0.5564797520637512, "learning_rate": 1.0336163855129143e-09, "loss": 0.6376, "step": 13008 }, { "epoch": 4.978568694986605, "grad_norm": 0.5459800362586975, "learning_rate": 9.982822872378616e-10, "loss": 0.5797, "step": 13009 }, { "epoch": 4.978951396861845, "grad_norm": 0.531505286693573, "learning_rate": 9.635626544801657e-10, "loss": 0.5579, "step": 13010 }, { "epoch": 4.979334098737084, "grad_norm": 0.5220417976379395, "learning_rate": 9.294574893736752e-10, "loss": 0.5378, "step": 13011 }, { "epoch": 4.979716800612323, "grad_norm": 0.5556370615959167, "learning_rate": 8.959667940144912e-10, "loss": 0.657, "step": 13012 }, { "epoch": 4.980099502487562, "grad_norm": 0.5979492664337158, "learning_rate": 8.630905704609671e-10, "loss": 0.6148, "step": 13013 }, { "epoch": 4.980482204362802, "grad_norm": 0.6897139549255371, "learning_rate": 8.308288207325987e-10, "loss": 0.6604, "step": 13014 }, { "epoch": 4.980864906238041, "grad_norm": 0.5100217461585999, "learning_rate": 7.991815468122443e-10, "loss": 0.6009, "step": 13015 }, { "epoch": 4.9812476081132795, "grad_norm": 0.5324516296386719, "learning_rate": 7.681487506461249e-10, "loss": 0.5844, "step": 13016 }, { "epoch": 4.981630309988519, "grad_norm": 0.5312784314155579, "learning_rate": 7.377304341393832e-10, "loss": 0.5948, "step": 13017 }, { "epoch": 4.982013011863758, "grad_norm": 0.5385386943817139, "learning_rate": 7.079265991616347e-10, "loss": 0.6002, "step": 13018 }, { "epoch": 4.982395713738997, "grad_norm": 0.5580663084983826, "learning_rate": 6.787372475458576e-10, "loss": 0.6318, "step": 13019 }, { "epoch": 4.982778415614236, "grad_norm": 0.5152972936630249, "learning_rate": 6.50162381083952e-10, "loss": 0.5713, "step": 13020 }, { "epoch": 4.983161117489476, "grad_norm": 0.5038657188415527, "learning_rate": 6.222020015322905e-10, "loss": 0.6491, "step": 13021 }, { "epoch": 4.983543819364715, "grad_norm": 0.5408946871757507, "learning_rate": 5.948561106106088e-10, "loss": 0.5775, "step": 13022 }, { "epoch": 4.983926521239954, "grad_norm": 0.528423547744751, "learning_rate": 5.681247099975639e-10, "loss": 0.5974, "step": 13023 }, { "epoch": 4.9843092231151935, "grad_norm": 0.5557752847671509, "learning_rate": 5.420078013373964e-10, "loss": 0.6078, "step": 13024 }, { "epoch": 4.9846919249904325, "grad_norm": 0.5158812999725342, "learning_rate": 5.165053862332681e-10, "loss": 0.6224, "step": 13025 }, { "epoch": 4.985074626865671, "grad_norm": 0.5545287132263184, "learning_rate": 4.916174662539241e-10, "loss": 0.6017, "step": 13026 }, { "epoch": 4.985457328740911, "grad_norm": 0.5719128847122192, "learning_rate": 4.673440429281417e-10, "loss": 0.5923, "step": 13027 }, { "epoch": 4.98584003061615, "grad_norm": 0.5425817370414734, "learning_rate": 4.4368511774806057e-10, "loss": 0.6464, "step": 13028 }, { "epoch": 4.986222732491389, "grad_norm": 0.5642353892326355, "learning_rate": 4.2064069216696255e-10, "loss": 0.6205, "step": 13029 }, { "epoch": 4.986605434366629, "grad_norm": 0.5424538254737854, "learning_rate": 3.9821076760149234e-10, "loss": 0.6003, "step": 13030 }, { "epoch": 4.986988136241868, "grad_norm": 0.557558536529541, "learning_rate": 3.7639534542943667e-10, "loss": 0.6019, "step": 13031 }, { "epoch": 4.987370838117107, "grad_norm": 0.56698077917099, "learning_rate": 3.5519442699194496e-10, "loss": 0.6578, "step": 13032 }, { "epoch": 4.987753539992346, "grad_norm": 0.5221746563911438, "learning_rate": 3.3460801359130877e-10, "loss": 0.5981, "step": 13033 }, { "epoch": 4.988136241867585, "grad_norm": 0.48464229702949524, "learning_rate": 3.146361064931824e-10, "loss": 0.5947, "step": 13034 }, { "epoch": 4.988518943742824, "grad_norm": 0.5523820519447327, "learning_rate": 2.952787069254726e-10, "loss": 0.5908, "step": 13035 }, { "epoch": 4.988901645618063, "grad_norm": 0.5528908967971802, "learning_rate": 2.765358160761178e-10, "loss": 0.4755, "step": 13036 }, { "epoch": 4.989284347493303, "grad_norm": 0.5460435748100281, "learning_rate": 2.584074350986399e-10, "loss": 0.5984, "step": 13037 }, { "epoch": 4.989667049368542, "grad_norm": 0.5495511293411255, "learning_rate": 2.408935651054822e-10, "loss": 0.5845, "step": 13038 }, { "epoch": 4.990049751243781, "grad_norm": 0.5514429211616516, "learning_rate": 2.2399420717356123e-10, "loss": 0.6122, "step": 13039 }, { "epoch": 4.99043245311902, "grad_norm": 0.56214439868927, "learning_rate": 2.0770936234204565e-10, "loss": 0.7268, "step": 13040 }, { "epoch": 4.99081515499426, "grad_norm": 0.570823073387146, "learning_rate": 1.9203903161124639e-10, "loss": 0.6019, "step": 13041 }, { "epoch": 4.991197856869499, "grad_norm": 0.5419581532478333, "learning_rate": 1.7698321594372681e-10, "loss": 0.6237, "step": 13042 }, { "epoch": 4.9915805587447375, "grad_norm": 0.5536218285560608, "learning_rate": 1.6254191626430272e-10, "loss": 0.6268, "step": 13043 }, { "epoch": 4.991963260619977, "grad_norm": 0.5406533479690552, "learning_rate": 1.4871513346226273e-10, "loss": 0.6031, "step": 13044 }, { "epoch": 4.992345962495216, "grad_norm": 0.5352851748466492, "learning_rate": 1.3550286838581728e-10, "loss": 0.5948, "step": 13045 }, { "epoch": 4.992728664370455, "grad_norm": 0.5435356497764587, "learning_rate": 1.2290512184653937e-10, "loss": 0.6308, "step": 13046 }, { "epoch": 4.993111366245695, "grad_norm": 0.508456289768219, "learning_rate": 1.1092189462047487e-10, "loss": 0.6136, "step": 13047 }, { "epoch": 4.993494068120934, "grad_norm": 0.5369634628295898, "learning_rate": 9.955318744259146e-11, "loss": 0.6786, "step": 13048 }, { "epoch": 4.993876769996173, "grad_norm": 0.5388721823692322, "learning_rate": 8.87990010112194e-11, "loss": 0.628, "step": 13049 }, { "epoch": 4.994259471871413, "grad_norm": 0.5360571146011353, "learning_rate": 7.865933598805164e-11, "loss": 0.6393, "step": 13050 }, { "epoch": 4.9946421737466515, "grad_norm": 0.5394847989082336, "learning_rate": 6.913419299592327e-11, "loss": 0.6588, "step": 13051 }, { "epoch": 4.9950248756218905, "grad_norm": 0.5452165603637695, "learning_rate": 6.022357261992184e-11, "loss": 0.634, "step": 13052 }, { "epoch": 4.995407577497129, "grad_norm": 0.5538443922996521, "learning_rate": 5.192747540849752e-11, "loss": 0.6475, "step": 13053 }, { "epoch": 4.995790279372369, "grad_norm": 0.5184859037399292, "learning_rate": 4.424590187124267e-11, "loss": 0.6328, "step": 13054 }, { "epoch": 4.996172981247608, "grad_norm": 0.5636581182479858, "learning_rate": 3.717885247889186e-11, "loss": 0.7055, "step": 13055 }, { "epoch": 4.996555683122847, "grad_norm": 0.5823739767074585, "learning_rate": 3.0726327666652514e-11, "loss": 0.629, "step": 13056 }, { "epoch": 4.996938384998087, "grad_norm": 0.5723466277122498, "learning_rate": 2.488832783198447e-11, "loss": 0.5756, "step": 13057 }, { "epoch": 4.997321086873326, "grad_norm": 0.6142847537994385, "learning_rate": 1.9664853332379552e-11, "loss": 0.6015, "step": 13058 }, { "epoch": 4.997703788748565, "grad_norm": 0.5307672619819641, "learning_rate": 1.5055904488692207e-11, "loss": 0.6505, "step": 13059 }, { "epoch": 4.998086490623804, "grad_norm": 0.5710265636444092, "learning_rate": 1.1061481585139533e-11, "loss": 0.5561, "step": 13060 }, { "epoch": 4.998469192499043, "grad_norm": 0.5701170563697815, "learning_rate": 7.681584867080816e-12, "loss": 0.6378, "step": 13061 }, { "epoch": 4.998851894374282, "grad_norm": 0.5231911540031433, "learning_rate": 4.916214541017539e-12, "loss": 0.5492, "step": 13062 }, { "epoch": 4.999234596249521, "grad_norm": 0.5774722695350647, "learning_rate": 2.765370779034271e-12, "loss": 0.6755, "step": 13063 }, { "epoch": 4.999617298124761, "grad_norm": 0.5533973574638367, "learning_rate": 1.2290537110271062e-12, "loss": 0.6495, "step": 13064 }, { "epoch": 5.0, "grad_norm": 0.5379692316055298, "learning_rate": 3.072634324752244e-13, "loss": 0.608, "step": 13065 } ], "logging_steps": 1.0, "max_steps": 13065, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1609090515844035e+21, "train_batch_size": 1, "trial_name": null, "trial_params": null }