diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4869 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 690, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004347826086956522, + "learning_rate": 0, + "loss": 2.2149, + "step": 1 + }, + { + "epoch": 0.008695652173913044, + "learning_rate": 0, + "loss": 2.2161, + "step": 2 + }, + { + "epoch": 0.013043478260869565, + "learning_rate": 0, + "loss": 2.3121, + "step": 3 + }, + { + "epoch": 0.017391304347826087, + "grad_norm": 3.31545352935791, + "learning_rate": 0.0, + "loss": 2.2694, + "step": 4 + }, + { + "epoch": 0.021739130434782608, + "grad_norm": 3.31545352935791, + "learning_rate": 0.0, + "loss": 2.298, + "step": 5 + }, + { + "epoch": 0.02608695652173913, + "grad_norm": 3.31545352935791, + "learning_rate": 0.0, + "loss": 2.2501, + "step": 6 + }, + { + "epoch": 0.030434782608695653, + "grad_norm": 3.31545352935791, + "learning_rate": 0.0, + "loss": 2.2571, + "step": 7 + }, + { + "epoch": 0.034782608695652174, + "grad_norm": 3.273796796798706, + "learning_rate": 3.2741131089043125e-06, + "loss": 2.286, + "step": 8 + }, + { + "epoch": 0.0391304347826087, + "grad_norm": 3.273796796798706, + "learning_rate": 3.2741131089043125e-06, + "loss": 2.2707, + "step": 9 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 3.273796796798706, + "learning_rate": 3.2741131089043125e-06, + "loss": 2.3606, + "step": 10 + }, + { + "epoch": 0.04782608695652174, + "grad_norm": 3.273796796798706, + "learning_rate": 3.2741131089043125e-06, + "loss": 2.306, + "step": 11 + }, + { + "epoch": 0.05217391304347826, + "grad_norm": 3.272700071334839, + "learning_rate": 5.189346500732899e-06, + "loss": 2.3623, + "step": 12 + }, + { + "epoch": 0.05652173913043478, + "grad_norm": 3.272700071334839, + "learning_rate": 5.189346500732899e-06, + "loss": 2.3027, + "step": 13 + }, + { + "epoch": 0.06086956521739131, + "grad_norm": 3.272700071334839, + "learning_rate": 5.189346500732899e-06, + "loss": 2.3053, + "step": 14 + }, + { + "epoch": 0.06521739130434782, + "grad_norm": 3.272700071334839, + "learning_rate": 5.189346500732899e-06, + "loss": 2.2793, + "step": 15 + }, + { + "epoch": 0.06956521739130435, + "grad_norm": 3.2200253009796143, + "learning_rate": 6.548226217808625e-06, + "loss": 2.2608, + "step": 16 + }, + { + "epoch": 0.07391304347826087, + "grad_norm": 3.2200253009796143, + "learning_rate": 6.548226217808625e-06, + "loss": 2.2949, + "step": 17 + }, + { + "epoch": 0.0782608695652174, + "grad_norm": 3.2200253009796143, + "learning_rate": 6.548226217808625e-06, + "loss": 2.2639, + "step": 18 + }, + { + "epoch": 0.08260869565217391, + "grad_norm": 3.2200253009796143, + "learning_rate": 6.548226217808625e-06, + "loss": 2.2472, + "step": 19 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 3.144207239151001, + "learning_rate": 7.60225521340393e-06, + "loss": 2.1505, + "step": 20 + }, + { + "epoch": 0.09130434782608696, + "grad_norm": 3.144207239151001, + "learning_rate": 7.60225521340393e-06, + "loss": 2.2222, + "step": 21 + }, + { + "epoch": 0.09565217391304348, + "grad_norm": 3.144207239151001, + "learning_rate": 7.60225521340393e-06, + "loss": 2.3522, + "step": 22 + }, + { + "epoch": 0.1, + "grad_norm": 3.144207239151001, + "learning_rate": 7.60225521340393e-06, + "loss": 2.275, + "step": 23 + }, + { + "epoch": 0.10434782608695652, + "grad_norm": 3.0425093173980713, + "learning_rate": 8.463459609637211e-06, + "loss": 2.3877, + "step": 24 + }, + { + "epoch": 0.10869565217391304, + "grad_norm": 3.0425093173980713, + "learning_rate": 8.463459609637211e-06, + "loss": 2.2595, + "step": 25 + }, + { + "epoch": 0.11304347826086956, + "grad_norm": 3.0425093173980713, + "learning_rate": 8.463459609637211e-06, + "loss": 2.213, + "step": 26 + }, + { + "epoch": 0.11739130434782609, + "grad_norm": 3.0425093173980713, + "learning_rate": 8.463459609637211e-06, + "loss": 2.2204, + "step": 27 + }, + { + "epoch": 0.12173913043478261, + "grad_norm": 2.7785863876342773, + "learning_rate": 9.191597551655847e-06, + "loss": 2.2942, + "step": 28 + }, + { + "epoch": 0.12608695652173912, + "grad_norm": 2.7785863876342773, + "learning_rate": 9.191597551655847e-06, + "loss": 2.1182, + "step": 29 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 2.7785863876342773, + "learning_rate": 9.191597551655847e-06, + "loss": 2.0758, + "step": 30 + }, + { + "epoch": 0.13478260869565217, + "grad_norm": 2.7785863876342773, + "learning_rate": 9.191597551655847e-06, + "loss": 1.9933, + "step": 31 + }, + { + "epoch": 0.1391304347826087, + "grad_norm": 2.2910876274108887, + "learning_rate": 9.822339326712938e-06, + "loss": 2.1012, + "step": 32 + }, + { + "epoch": 0.14347826086956522, + "grad_norm": 2.2910876274108887, + "learning_rate": 9.822339326712938e-06, + "loss": 1.953, + "step": 33 + }, + { + "epoch": 0.14782608695652175, + "grad_norm": 2.2910876274108887, + "learning_rate": 9.822339326712938e-06, + "loss": 2.0373, + "step": 34 + }, + { + "epoch": 0.15217391304347827, + "grad_norm": 2.2910876274108887, + "learning_rate": 9.822339326712938e-06, + "loss": 1.9755, + "step": 35 + }, + { + "epoch": 0.1565217391304348, + "grad_norm": 1.7529128789901733, + "learning_rate": 1.0378693001465798e-05, + "loss": 2.0704, + "step": 36 + }, + { + "epoch": 0.1608695652173913, + "grad_norm": 1.7529128789901733, + "learning_rate": 1.0378693001465798e-05, + "loss": 1.9861, + "step": 37 + }, + { + "epoch": 0.16521739130434782, + "grad_norm": 1.7529128789901733, + "learning_rate": 1.0378693001465798e-05, + "loss": 2.0054, + "step": 38 + }, + { + "epoch": 0.16956521739130434, + "grad_norm": 1.7529128789901733, + "learning_rate": 1.0378693001465798e-05, + "loss": 2.098, + "step": 39 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 1.4156919717788696, + "learning_rate": 1.0876368322308244e-05, + "loss": 1.9489, + "step": 40 + }, + { + "epoch": 0.1782608695652174, + "grad_norm": 1.4156919717788696, + "learning_rate": 1.0876368322308244e-05, + "loss": 1.9384, + "step": 41 + }, + { + "epoch": 0.1826086956521739, + "grad_norm": 1.4156919717788696, + "learning_rate": 1.0876368322308244e-05, + "loss": 1.9856, + "step": 42 + }, + { + "epoch": 0.18695652173913044, + "grad_norm": 1.4156919717788696, + "learning_rate": 1.0876368322308244e-05, + "loss": 2.0604, + "step": 43 + }, + { + "epoch": 0.19130434782608696, + "grad_norm": 1.382755160331726, + "learning_rate": 1.1326570411938442e-05, + "loss": 1.9795, + "step": 44 + }, + { + "epoch": 0.1956521739130435, + "grad_norm": 1.382755160331726, + "learning_rate": 1.1326570411938442e-05, + "loss": 2.0085, + "step": 45 + }, + { + "epoch": 0.2, + "grad_norm": 1.382755160331726, + "learning_rate": 1.1326570411938442e-05, + "loss": 1.8687, + "step": 46 + }, + { + "epoch": 0.20434782608695654, + "grad_norm": 1.382755160331726, + "learning_rate": 1.1326570411938442e-05, + "loss": 1.8855, + "step": 47 + }, + { + "epoch": 0.20869565217391303, + "grad_norm": 1.4150187969207764, + "learning_rate": 1.1737572718541526e-05, + "loss": 1.9876, + "step": 48 + }, + { + "epoch": 0.21304347826086956, + "grad_norm": 1.4150187969207764, + "learning_rate": 1.1737572718541526e-05, + "loss": 1.8712, + "step": 49 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 1.4150187969207764, + "learning_rate": 1.1737572718541526e-05, + "loss": 1.8736, + "step": 50 + }, + { + "epoch": 0.2217391304347826, + "grad_norm": 1.4150187969207764, + "learning_rate": 1.1737572718541526e-05, + "loss": 1.9638, + "step": 51 + }, + { + "epoch": 0.22608695652173913, + "grad_norm": 1.1287221908569336, + "learning_rate": 1.2115658189875932e-05, + "loss": 1.9343, + "step": 52 + }, + { + "epoch": 0.23043478260869565, + "grad_norm": 1.1287221908569336, + "learning_rate": 1.2115658189875932e-05, + "loss": 2.0118, + "step": 53 + }, + { + "epoch": 0.23478260869565218, + "grad_norm": 1.1287221908569336, + "learning_rate": 1.2115658189875932e-05, + "loss": 1.9, + "step": 54 + }, + { + "epoch": 0.2391304347826087, + "grad_norm": 1.1287221908569336, + "learning_rate": 1.2115658189875932e-05, + "loss": 1.9386, + "step": 55 + }, + { + "epoch": 0.24347826086956523, + "grad_norm": 1.0455442667007446, + "learning_rate": 1.2465710660560159e-05, + "loss": 1.9017, + "step": 56 + }, + { + "epoch": 0.24782608695652175, + "grad_norm": 1.0455442667007446, + "learning_rate": 1.2465710660560159e-05, + "loss": 1.884, + "step": 57 + }, + { + "epoch": 0.25217391304347825, + "grad_norm": 1.0455442667007446, + "learning_rate": 1.2465710660560159e-05, + "loss": 2.0199, + "step": 58 + }, + { + "epoch": 0.2565217391304348, + "grad_norm": 1.0455442667007446, + "learning_rate": 1.2465710660560159e-05, + "loss": 1.9413, + "step": 59 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.9044898152351379, + "learning_rate": 1.279160171413683e-05, + "loss": 1.8714, + "step": 60 + }, + { + "epoch": 0.26521739130434785, + "grad_norm": 0.9044898152351379, + "learning_rate": 1.279160171413683e-05, + "loss": 1.9318, + "step": 61 + }, + { + "epoch": 0.26956521739130435, + "grad_norm": 0.9044898152351379, + "learning_rate": 1.279160171413683e-05, + "loss": 1.9003, + "step": 62 + }, + { + "epoch": 0.27391304347826084, + "grad_norm": 0.9044898152351379, + "learning_rate": 1.279160171413683e-05, + "loss": 1.8814, + "step": 63 + }, + { + "epoch": 0.2782608695652174, + "grad_norm": 0.7885456681251526, + "learning_rate": 1.309645243561725e-05, + "loss": 1.9581, + "step": 64 + }, + { + "epoch": 0.2826086956521739, + "grad_norm": 0.7885456681251526, + "learning_rate": 1.309645243561725e-05, + "loss": 1.818, + "step": 65 + }, + { + "epoch": 0.28695652173913044, + "grad_norm": 0.7885456681251526, + "learning_rate": 1.309645243561725e-05, + "loss": 1.796, + "step": 66 + }, + { + "epoch": 0.29130434782608694, + "grad_norm": 0.7885456681251526, + "learning_rate": 1.309645243561725e-05, + "loss": 1.8588, + "step": 67 + }, + { + "epoch": 0.2956521739130435, + "grad_norm": 0.6783104538917542, + "learning_rate": 1.3382815670697004e-05, + "loss": 1.8261, + "step": 68 + }, + { + "epoch": 0.3, + "grad_norm": 0.6783104538917542, + "learning_rate": 1.3382815670697004e-05, + "loss": 1.8421, + "step": 69 + }, + { + "epoch": 0.30434782608695654, + "grad_norm": 0.6783104538917542, + "learning_rate": 1.3382815670697004e-05, + "loss": 1.7374, + "step": 70 + }, + { + "epoch": 0.30869565217391304, + "grad_norm": 0.6783104538917542, + "learning_rate": 1.3382815670697004e-05, + "loss": 1.8355, + "step": 71 + }, + { + "epoch": 0.3130434782608696, + "grad_norm": 0.6615838408470154, + "learning_rate": 1.365280611037011e-05, + "loss": 1.877, + "step": 72 + }, + { + "epoch": 0.3173913043478261, + "grad_norm": 0.6615838408470154, + "learning_rate": 1.365280611037011e-05, + "loss": 1.7588, + "step": 73 + }, + { + "epoch": 0.3217391304347826, + "grad_norm": 0.6615838408470154, + "learning_rate": 1.365280611037011e-05, + "loss": 1.859, + "step": 74 + }, + { + "epoch": 0.32608695652173914, + "grad_norm": 0.6615838408470154, + "learning_rate": 1.365280611037011e-05, + "loss": 1.8263, + "step": 75 + }, + { + "epoch": 0.33043478260869563, + "grad_norm": 0.7432522773742676, + "learning_rate": 1.3908195157440944e-05, + "loss": 1.864, + "step": 76 + }, + { + "epoch": 0.3347826086956522, + "grad_norm": 0.7432522773742676, + "learning_rate": 1.3908195157440944e-05, + "loss": 1.8922, + "step": 77 + }, + { + "epoch": 0.3391304347826087, + "grad_norm": 0.7432522773742676, + "learning_rate": 1.3908195157440944e-05, + "loss": 1.8468, + "step": 78 + }, + { + "epoch": 0.34347826086956523, + "grad_norm": 0.7432522773742676, + "learning_rate": 1.3908195157440944e-05, + "loss": 1.7736, + "step": 79 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.6891193389892578, + "learning_rate": 1.4150481431212555e-05, + "loss": 1.8292, + "step": 80 + }, + { + "epoch": 0.3521739130434783, + "grad_norm": 0.6891193389892578, + "learning_rate": 1.4150481431212555e-05, + "loss": 1.7703, + "step": 81 + }, + { + "epoch": 0.3565217391304348, + "grad_norm": 0.6891193389892578, + "learning_rate": 1.4150481431212555e-05, + "loss": 1.812, + "step": 82 + }, + { + "epoch": 0.36086956521739133, + "grad_norm": 0.6891193389892578, + "learning_rate": 1.4150481431212555e-05, + "loss": 1.8104, + "step": 83 + }, + { + "epoch": 0.3652173913043478, + "grad_norm": 0.5785617232322693, + "learning_rate": 1.4380944052388746e-05, + "loss": 1.8025, + "step": 84 + }, + { + "epoch": 0.3695652173913043, + "grad_norm": 0.5785617232322693, + "learning_rate": 1.4380944052388746e-05, + "loss": 1.8167, + "step": 85 + }, + { + "epoch": 0.3739130434782609, + "grad_norm": 0.5785617232322693, + "learning_rate": 1.4380944052388746e-05, + "loss": 1.8114, + "step": 86 + }, + { + "epoch": 0.3782608695652174, + "grad_norm": 0.5785617232322693, + "learning_rate": 1.4380944052388746e-05, + "loss": 1.7922, + "step": 87 + }, + { + "epoch": 0.3826086956521739, + "grad_norm": 0.5551931262016296, + "learning_rate": 1.4600683520842756e-05, + "loss": 1.7535, + "step": 88 + }, + { + "epoch": 0.3869565217391304, + "grad_norm": 0.5551931262016296, + "learning_rate": 1.4600683520842756e-05, + "loss": 1.8034, + "step": 89 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 0.5551931262016296, + "learning_rate": 1.4600683520842756e-05, + "loss": 1.799, + "step": 90 + }, + { + "epoch": 0.39565217391304347, + "grad_norm": 0.5551931262016296, + "learning_rate": 1.4600683520842756e-05, + "loss": 1.7228, + "step": 91 + }, + { + "epoch": 0.4, + "grad_norm": 0.548203706741333, + "learning_rate": 1.48106534992671e-05, + "loss": 1.826, + "step": 92 + }, + { + "epoch": 0.4043478260869565, + "grad_norm": 0.548203706741333, + "learning_rate": 1.48106534992671e-05, + "loss": 1.8175, + "step": 93 + }, + { + "epoch": 0.40869565217391307, + "grad_norm": 0.548203706741333, + "learning_rate": 1.48106534992671e-05, + "loss": 1.7583, + "step": 94 + }, + { + "epoch": 0.41304347826086957, + "grad_norm": 0.548203706741333, + "learning_rate": 1.48106534992671e-05, + "loss": 1.7545, + "step": 95 + }, + { + "epoch": 0.41739130434782606, + "grad_norm": 0.5352587699890137, + "learning_rate": 1.5011685827445838e-05, + "loss": 1.8338, + "step": 96 + }, + { + "epoch": 0.4217391304347826, + "grad_norm": 0.5352587699890137, + "learning_rate": 1.5011685827445838e-05, + "loss": 1.7581, + "step": 97 + }, + { + "epoch": 0.4260869565217391, + "grad_norm": 0.5352587699890137, + "learning_rate": 1.5011685827445838e-05, + "loss": 1.738, + "step": 98 + }, + { + "epoch": 0.43043478260869567, + "grad_norm": 0.5352587699890137, + "learning_rate": 1.5011685827445838e-05, + "loss": 1.7772, + "step": 99 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.516083836555481, + "learning_rate": 1.520451042680786e-05, + "loss": 1.7126, + "step": 100 + }, + { + "epoch": 0.4391304347826087, + "grad_norm": 0.516083836555481, + "learning_rate": 1.520451042680786e-05, + "loss": 1.7873, + "step": 101 + }, + { + "epoch": 0.4434782608695652, + "grad_norm": 0.516083836555481, + "learning_rate": 1.520451042680786e-05, + "loss": 1.7673, + "step": 102 + }, + { + "epoch": 0.44782608695652176, + "grad_norm": 0.516083836555481, + "learning_rate": 1.520451042680786e-05, + "loss": 1.7459, + "step": 103 + }, + { + "epoch": 0.45217391304347826, + "grad_norm": 0.5135278701782227, + "learning_rate": 1.5389771298780244e-05, + "loss": 1.7853, + "step": 104 + }, + { + "epoch": 0.45652173913043476, + "grad_norm": 0.5135278701782227, + "learning_rate": 1.5389771298780244e-05, + "loss": 1.7792, + "step": 105 + }, + { + "epoch": 0.4608695652173913, + "grad_norm": 0.5135278701782227, + "learning_rate": 1.5389771298780244e-05, + "loss": 1.8095, + "step": 106 + }, + { + "epoch": 0.4652173913043478, + "grad_norm": 0.5135278701782227, + "learning_rate": 1.5389771298780244e-05, + "loss": 1.714, + "step": 107 + }, + { + "epoch": 0.46956521739130436, + "grad_norm": 0.47637420892715454, + "learning_rate": 1.5568039502198696e-05, + "loss": 1.757, + "step": 108 + }, + { + "epoch": 0.47391304347826085, + "grad_norm": 0.47637420892715454, + "learning_rate": 1.5568039502198696e-05, + "loss": 1.715, + "step": 109 + }, + { + "epoch": 0.4782608695652174, + "grad_norm": 0.47637420892715454, + "learning_rate": 1.5568039502198696e-05, + "loss": 1.7535, + "step": 110 + }, + { + "epoch": 0.4826086956521739, + "grad_norm": 0.47637420892715454, + "learning_rate": 1.5568039502198696e-05, + "loss": 1.7446, + "step": 111 + }, + { + "epoch": 0.48695652173913045, + "grad_norm": 0.46542423963546753, + "learning_rate": 1.5739823769464473e-05, + "loss": 1.7352, + "step": 112 + }, + { + "epoch": 0.49130434782608695, + "grad_norm": 0.46542423963546753, + "learning_rate": 1.5739823769464473e-05, + "loss": 1.7561, + "step": 113 + }, + { + "epoch": 0.4956521739130435, + "grad_norm": 0.46542423963546753, + "learning_rate": 1.5739823769464473e-05, + "loss": 1.7318, + "step": 114 + }, + { + "epoch": 0.5, + "grad_norm": 0.46542423963546753, + "learning_rate": 1.5739823769464473e-05, + "loss": 1.7682, + "step": 115 + }, + { + "epoch": 0.5043478260869565, + "grad_norm": 0.4574941396713257, + "learning_rate": 1.5905579258955202e-05, + "loss": 1.7042, + "step": 116 + }, + { + "epoch": 0.508695652173913, + "grad_norm": 0.4574941396713257, + "learning_rate": 1.5905579258955202e-05, + "loss": 1.7712, + "step": 117 + }, + { + "epoch": 0.5130434782608696, + "grad_norm": 0.4574941396713257, + "learning_rate": 1.5905579258955202e-05, + "loss": 1.7179, + "step": 118 + }, + { + "epoch": 0.5173913043478261, + "grad_norm": 0.4574941396713257, + "learning_rate": 1.5905579258955202e-05, + "loss": 1.7505, + "step": 119 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.45183202624320984, + "learning_rate": 1.606571482304114e-05, + "loss": 1.75, + "step": 120 + }, + { + "epoch": 0.5260869565217391, + "grad_norm": 0.45183202624320984, + "learning_rate": 1.606571482304114e-05, + "loss": 1.7725, + "step": 121 + }, + { + "epoch": 0.5304347826086957, + "grad_norm": 0.45183202624320984, + "learning_rate": 1.606571482304114e-05, + "loss": 1.7267, + "step": 122 + }, + { + "epoch": 0.5347826086956522, + "grad_norm": 0.45183202624320984, + "learning_rate": 1.606571482304114e-05, + "loss": 1.6849, + "step": 123 + }, + { + "epoch": 0.5391304347826087, + "grad_norm": 0.46187731623649597, + "learning_rate": 1.6220599083923048e-05, + "loss": 1.731, + "step": 124 + }, + { + "epoch": 0.5434782608695652, + "grad_norm": 0.46187731623649597, + "learning_rate": 1.6220599083923048e-05, + "loss": 1.695, + "step": 125 + }, + { + "epoch": 0.5478260869565217, + "grad_norm": 0.46187731623649597, + "learning_rate": 1.6220599083923048e-05, + "loss": 1.7525, + "step": 126 + }, + { + "epoch": 0.5521739130434783, + "grad_norm": 0.46187731623649597, + "learning_rate": 1.6220599083923048e-05, + "loss": 1.7797, + "step": 127 + }, + { + "epoch": 0.5565217391304348, + "grad_norm": 0.44319385290145874, + "learning_rate": 1.6370565544521564e-05, + "loss": 1.7984, + "step": 128 + }, + { + "epoch": 0.5608695652173913, + "grad_norm": 0.44319385290145874, + "learning_rate": 1.6370565544521564e-05, + "loss": 1.6682, + "step": 129 + }, + { + "epoch": 0.5652173913043478, + "grad_norm": 0.44319385290145874, + "learning_rate": 1.6370565544521564e-05, + "loss": 1.6129, + "step": 130 + }, + { + "epoch": 0.5695652173913044, + "grad_norm": 0.44319385290145874, + "learning_rate": 1.6370565544521564e-05, + "loss": 1.7295, + "step": 131 + }, + { + "epoch": 0.5739130434782609, + "grad_norm": 0.43330496549606323, + "learning_rate": 1.651591691267134e-05, + "loss": 1.7513, + "step": 132 + }, + { + "epoch": 0.5782608695652174, + "grad_norm": 0.43330496549606323, + "learning_rate": 1.651591691267134e-05, + "loss": 1.7653, + "step": 133 + }, + { + "epoch": 0.5826086956521739, + "grad_norm": 0.43330496549606323, + "learning_rate": 1.651591691267134e-05, + "loss": 1.6724, + "step": 134 + }, + { + "epoch": 0.5869565217391305, + "grad_norm": 0.43330496549606323, + "learning_rate": 1.651591691267134e-05, + "loss": 1.7435, + "step": 135 + }, + { + "epoch": 0.591304347826087, + "grad_norm": 0.4281637966632843, + "learning_rate": 1.6656928779601318e-05, + "loss": 1.7297, + "step": 136 + }, + { + "epoch": 0.5956521739130435, + "grad_norm": 0.4281637966632843, + "learning_rate": 1.6656928779601318e-05, + "loss": 1.7354, + "step": 137 + }, + { + "epoch": 0.6, + "grad_norm": 0.4281637966632843, + "learning_rate": 1.6656928779601318e-05, + "loss": 1.806, + "step": 138 + }, + { + "epoch": 0.6043478260869565, + "grad_norm": 0.4281637966632843, + "learning_rate": 1.6656928779601318e-05, + "loss": 1.7033, + "step": 139 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 0.459945410490036, + "learning_rate": 1.6793852765059776e-05, + "loss": 1.6826, + "step": 140 + }, + { + "epoch": 0.6130434782608696, + "grad_norm": 0.459945410490036, + "learning_rate": 1.6793852765059776e-05, + "loss": 1.6544, + "step": 141 + }, + { + "epoch": 0.6173913043478261, + "grad_norm": 0.459945410490036, + "learning_rate": 1.6793852765059776e-05, + "loss": 1.7139, + "step": 142 + }, + { + "epoch": 0.6217391304347826, + "grad_norm": 0.459945410490036, + "learning_rate": 1.6793852765059776e-05, + "loss": 1.7592, + "step": 143 + }, + { + "epoch": 0.6260869565217392, + "grad_norm": 0.41728129982948303, + "learning_rate": 1.6926919219274422e-05, + "loss": 1.7499, + "step": 144 + }, + { + "epoch": 0.6304347826086957, + "grad_norm": 0.41728129982948303, + "learning_rate": 1.6926919219274422e-05, + "loss": 1.7006, + "step": 145 + }, + { + "epoch": 0.6347826086956522, + "grad_norm": 0.41728129982948303, + "learning_rate": 1.6926919219274422e-05, + "loss": 1.6757, + "step": 146 + }, + { + "epoch": 0.6391304347826087, + "grad_norm": 0.41728129982948303, + "learning_rate": 1.6926919219274422e-05, + "loss": 1.7343, + "step": 147 + }, + { + "epoch": 0.6434782608695652, + "grad_norm": 0.436886727809906, + "learning_rate": 1.7056339554631436e-05, + "loss": 1.716, + "step": 148 + }, + { + "epoch": 0.6478260869565218, + "grad_norm": 0.436886727809906, + "learning_rate": 1.7056339554631436e-05, + "loss": 1.6438, + "step": 149 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 0.436886727809906, + "learning_rate": 1.7056339554631436e-05, + "loss": 1.6398, + "step": 150 + }, + { + "epoch": 0.6565217391304348, + "grad_norm": 0.436886727809906, + "learning_rate": 1.7056339554631436e-05, + "loss": 1.65, + "step": 151 + }, + { + "epoch": 0.6608695652173913, + "grad_norm": 0.42069295048713684, + "learning_rate": 1.7182308266345256e-05, + "loss": 1.7152, + "step": 152 + }, + { + "epoch": 0.6652173913043479, + "grad_norm": 0.42069295048713684, + "learning_rate": 1.7182308266345256e-05, + "loss": 1.7178, + "step": 153 + }, + { + "epoch": 0.6695652173913044, + "grad_norm": 0.42069295048713684, + "learning_rate": 1.7182308266345256e-05, + "loss": 1.6735, + "step": 154 + }, + { + "epoch": 0.6739130434782609, + "grad_norm": 0.42069295048713684, + "learning_rate": 1.7182308266345256e-05, + "loss": 1.6999, + "step": 155 + }, + { + "epoch": 0.6782608695652174, + "grad_norm": 0.44798803329467773, + "learning_rate": 1.7305004690608827e-05, + "loss": 1.6549, + "step": 156 + }, + { + "epoch": 0.6826086956521739, + "grad_norm": 0.44798803329467773, + "learning_rate": 1.7305004690608827e-05, + "loss": 1.6681, + "step": 157 + }, + { + "epoch": 0.6869565217391305, + "grad_norm": 0.44798803329467773, + "learning_rate": 1.7305004690608827e-05, + "loss": 1.5953, + "step": 158 + }, + { + "epoch": 0.691304347826087, + "grad_norm": 0.44798803329467773, + "learning_rate": 1.7305004690608827e-05, + "loss": 1.6615, + "step": 159 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.4351099133491516, + "learning_rate": 1.7424594540116867e-05, + "loss": 1.6599, + "step": 160 + }, + { + "epoch": 0.7, + "grad_norm": 0.4351099133491516, + "learning_rate": 1.7424594540116867e-05, + "loss": 1.6897, + "step": 161 + }, + { + "epoch": 0.7043478260869566, + "grad_norm": 0.4351099133491516, + "learning_rate": 1.7424594540116867e-05, + "loss": 1.5914, + "step": 162 + }, + { + "epoch": 0.7086956521739131, + "grad_norm": 0.4351099133491516, + "learning_rate": 1.7424594540116867e-05, + "loss": 1.6879, + "step": 163 + }, + { + "epoch": 0.7130434782608696, + "grad_norm": 0.41112539172172546, + "learning_rate": 1.754123124995665e-05, + "loss": 1.6539, + "step": 164 + }, + { + "epoch": 0.717391304347826, + "grad_norm": 0.41112539172172546, + "learning_rate": 1.754123124995665e-05, + "loss": 1.6907, + "step": 165 + }, + { + "epoch": 0.7217391304347827, + "grad_norm": 0.41112539172172546, + "learning_rate": 1.754123124995665e-05, + "loss": 1.6745, + "step": 166 + }, + { + "epoch": 0.7260869565217392, + "grad_norm": 0.41112539172172546, + "learning_rate": 1.754123124995665e-05, + "loss": 1.6728, + "step": 167 + }, + { + "epoch": 0.7304347826086957, + "grad_norm": 0.4084008038043976, + "learning_rate": 1.765505716129306e-05, + "loss": 1.6311, + "step": 168 + }, + { + "epoch": 0.7347826086956522, + "grad_norm": 0.4084008038043976, + "learning_rate": 1.765505716129306e-05, + "loss": 1.6012, + "step": 169 + }, + { + "epoch": 0.7391304347826086, + "grad_norm": 0.4084008038043976, + "learning_rate": 1.765505716129306e-05, + "loss": 1.7379, + "step": 170 + }, + { + "epoch": 0.7434782608695653, + "grad_norm": 0.4084008038043976, + "learning_rate": 1.765505716129306e-05, + "loss": 1.6965, + "step": 171 + }, + { + "epoch": 0.7478260869565218, + "grad_norm": 0.3991777002811432, + "learning_rate": 1.7766204565755586e-05, + "loss": 1.6599, + "step": 172 + }, + { + "epoch": 0.7521739130434782, + "grad_norm": 0.3991777002811432, + "learning_rate": 1.7766204565755586e-05, + "loss": 1.6769, + "step": 173 + }, + { + "epoch": 0.7565217391304347, + "grad_norm": 0.3991777002811432, + "learning_rate": 1.7766204565755586e-05, + "loss": 1.6354, + "step": 174 + }, + { + "epoch": 0.7608695652173914, + "grad_norm": 0.3991777002811432, + "learning_rate": 1.7766204565755586e-05, + "loss": 1.6373, + "step": 175 + }, + { + "epoch": 0.7652173913043478, + "grad_norm": 0.42373913526535034, + "learning_rate": 1.7874796629747068e-05, + "loss": 1.6901, + "step": 176 + }, + { + "epoch": 0.7695652173913043, + "grad_norm": 0.42373913526535034, + "learning_rate": 1.7874796629747068e-05, + "loss": 1.6162, + "step": 177 + }, + { + "epoch": 0.7739130434782608, + "grad_norm": 0.42373913526535034, + "learning_rate": 1.7874796629747068e-05, + "loss": 1.6333, + "step": 178 + }, + { + "epoch": 0.7782608695652173, + "grad_norm": 0.42373913526535034, + "learning_rate": 1.7874796629747068e-05, + "loss": 1.7013, + "step": 179 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 0.3955826461315155, + "learning_rate": 1.7980948214869728e-05, + "loss": 1.6865, + "step": 180 + }, + { + "epoch": 0.7869565217391304, + "grad_norm": 0.3955826461315155, + "learning_rate": 1.7980948214869728e-05, + "loss": 1.6776, + "step": 181 + }, + { + "epoch": 0.7913043478260869, + "grad_norm": 0.3955826461315155, + "learning_rate": 1.7980948214869728e-05, + "loss": 1.6903, + "step": 182 + }, + { + "epoch": 0.7956521739130434, + "grad_norm": 0.3955826461315155, + "learning_rate": 1.7980948214869728e-05, + "loss": 1.6803, + "step": 183 + }, + { + "epoch": 0.8, + "grad_norm": 0.4143446087837219, + "learning_rate": 1.8084766608171415e-05, + "loss": 1.6619, + "step": 184 + }, + { + "epoch": 0.8043478260869565, + "grad_norm": 0.4143446087837219, + "learning_rate": 1.8084766608171415e-05, + "loss": 1.6797, + "step": 185 + }, + { + "epoch": 0.808695652173913, + "grad_norm": 0.4143446087837219, + "learning_rate": 1.8084766608171415e-05, + "loss": 1.6324, + "step": 186 + }, + { + "epoch": 0.8130434782608695, + "grad_norm": 0.4143446087837219, + "learning_rate": 1.8084766608171415e-05, + "loss": 1.6539, + "step": 187 + }, + { + "epoch": 0.8173913043478261, + "grad_norm": 0.41547316312789917, + "learning_rate": 1.8186352173851508e-05, + "loss": 1.7622, + "step": 188 + }, + { + "epoch": 0.8217391304347826, + "grad_norm": 0.41547316312789917, + "learning_rate": 1.8186352173851508e-05, + "loss": 1.6468, + "step": 189 + }, + { + "epoch": 0.8260869565217391, + "grad_norm": 0.41547316312789917, + "learning_rate": 1.8186352173851508e-05, + "loss": 1.6644, + "step": 190 + }, + { + "epoch": 0.8304347826086956, + "grad_norm": 0.41547316312789917, + "learning_rate": 1.8186352173851508e-05, + "loss": 1.6822, + "step": 191 + }, + { + "epoch": 0.8347826086956521, + "grad_norm": 0.42321857810020447, + "learning_rate": 1.828579893635015e-05, + "loss": 1.6321, + "step": 192 + }, + { + "epoch": 0.8391304347826087, + "grad_norm": 0.42321857810020447, + "learning_rate": 1.828579893635015e-05, + "loss": 1.6843, + "step": 193 + }, + { + "epoch": 0.8434782608695652, + "grad_norm": 0.42321857810020447, + "learning_rate": 1.828579893635015e-05, + "loss": 1.63, + "step": 194 + }, + { + "epoch": 0.8478260869565217, + "grad_norm": 0.42321857810020447, + "learning_rate": 1.828579893635015e-05, + "loss": 1.6028, + "step": 195 + }, + { + "epoch": 0.8521739130434782, + "grad_norm": 0.41478654742240906, + "learning_rate": 1.8383195103311694e-05, + "loss": 1.7511, + "step": 196 + }, + { + "epoch": 0.8565217391304348, + "grad_norm": 0.41478654742240906, + "learning_rate": 1.8383195103311694e-05, + "loss": 1.644, + "step": 197 + }, + { + "epoch": 0.8608695652173913, + "grad_norm": 0.41478654742240906, + "learning_rate": 1.8383195103311694e-05, + "loss": 1.6338, + "step": 198 + }, + { + "epoch": 0.8652173913043478, + "grad_norm": 0.41478654742240906, + "learning_rate": 1.8383195103311694e-05, + "loss": 1.6912, + "step": 199 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.41812795400619507, + "learning_rate": 1.8478623535712173e-05, + "loss": 1.6322, + "step": 200 + }, + { + "epoch": 0.8739130434782608, + "grad_norm": 0.41812795400619507, + "learning_rate": 1.8478623535712173e-05, + "loss": 1.7339, + "step": 201 + }, + { + "epoch": 0.8782608695652174, + "grad_norm": 0.41812795400619507, + "learning_rate": 1.8478623535712173e-05, + "loss": 1.617, + "step": 202 + }, + { + "epoch": 0.8826086956521739, + "grad_norm": 0.41812795400619507, + "learning_rate": 1.8478623535712173e-05, + "loss": 1.6637, + "step": 203 + }, + { + "epoch": 0.8869565217391304, + "grad_norm": 0.3970027565956116, + "learning_rate": 1.8572162171429905e-05, + "loss": 1.6093, + "step": 204 + }, + { + "epoch": 0.8913043478260869, + "grad_norm": 0.3970027565956116, + "learning_rate": 1.8572162171429905e-05, + "loss": 1.6211, + "step": 205 + }, + { + "epoch": 0.8956521739130435, + "grad_norm": 0.3970027565956116, + "learning_rate": 1.8572162171429905e-05, + "loss": 1.7275, + "step": 206 + }, + { + "epoch": 0.9, + "grad_norm": 0.3970027565956116, + "learning_rate": 1.8572162171429905e-05, + "loss": 1.6095, + "step": 207 + }, + { + "epoch": 0.9043478260869565, + "grad_norm": 0.4158819913864136, + "learning_rate": 1.866388440768456e-05, + "loss": 1.6403, + "step": 208 + }, + { + "epoch": 0.908695652173913, + "grad_norm": 0.4158819913864136, + "learning_rate": 1.866388440768456e-05, + "loss": 1.5544, + "step": 209 + }, + { + "epoch": 0.9130434782608695, + "grad_norm": 0.4158819913864136, + "learning_rate": 1.866388440768456e-05, + "loss": 1.7182, + "step": 210 + }, + { + "epoch": 0.9173913043478261, + "grad_norm": 0.4158819913864136, + "learning_rate": 1.866388440768456e-05, + "loss": 1.6324, + "step": 211 + }, + { + "epoch": 0.9217391304347826, + "grad_norm": 0.41026684641838074, + "learning_rate": 1.875385944704652e-05, + "loss": 1.6029, + "step": 212 + }, + { + "epoch": 0.9260869565217391, + "grad_norm": 0.41026684641838074, + "learning_rate": 1.875385944704652e-05, + "loss": 1.6479, + "step": 213 + }, + { + "epoch": 0.9304347826086956, + "grad_norm": 0.41026684641838074, + "learning_rate": 1.875385944704652e-05, + "loss": 1.5916, + "step": 214 + }, + { + "epoch": 0.9347826086956522, + "grad_norm": 0.41026684641838074, + "learning_rate": 1.875385944704652e-05, + "loss": 1.6131, + "step": 215 + }, + { + "epoch": 0.9391304347826087, + "grad_norm": 0.3943828046321869, + "learning_rate": 1.8842152611103012e-05, + "loss": 1.653, + "step": 216 + }, + { + "epoch": 0.9434782608695652, + "grad_norm": 0.3943828046321869, + "learning_rate": 1.8842152611103012e-05, + "loss": 1.6321, + "step": 217 + }, + { + "epoch": 0.9478260869565217, + "grad_norm": 0.3943828046321869, + "learning_rate": 1.8842152611103012e-05, + "loss": 1.65, + "step": 218 + }, + { + "epoch": 0.9521739130434783, + "grad_norm": 0.3943828046321869, + "learning_rate": 1.8842152611103012e-05, + "loss": 1.7162, + "step": 219 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 0.4221249520778656, + "learning_rate": 1.8928825625342374e-05, + "loss": 1.6494, + "step": 220 + }, + { + "epoch": 0.9608695652173913, + "grad_norm": 0.4221249520778656, + "learning_rate": 1.8928825625342374e-05, + "loss": 1.6674, + "step": 221 + }, + { + "epoch": 0.9652173913043478, + "grad_norm": 0.4221249520778656, + "learning_rate": 1.8928825625342374e-05, + "loss": 1.6672, + "step": 222 + }, + { + "epoch": 0.9695652173913043, + "grad_norm": 0.4221249520778656, + "learning_rate": 1.8928825625342374e-05, + "loss": 1.5908, + "step": 223 + }, + { + "epoch": 0.9739130434782609, + "grad_norm": 0.43016910552978516, + "learning_rate": 1.901393687836879e-05, + "loss": 1.6331, + "step": 224 + }, + { + "epoch": 0.9782608695652174, + "grad_norm": 0.43016910552978516, + "learning_rate": 1.901393687836879e-05, + "loss": 1.6466, + "step": 225 + }, + { + "epoch": 0.9826086956521739, + "grad_norm": 0.43016910552978516, + "learning_rate": 1.901393687836879e-05, + "loss": 1.5822, + "step": 226 + }, + { + "epoch": 0.9869565217391304, + "grad_norm": 0.43016910552978516, + "learning_rate": 1.901393687836879e-05, + "loss": 1.6973, + "step": 227 + }, + { + "epoch": 0.991304347826087, + "grad_norm": 0.4303857982158661, + "learning_rate": 1.9097541658173843e-05, + "loss": 1.6486, + "step": 228 + }, + { + "epoch": 0.9956521739130435, + "grad_norm": 0.4303857982158661, + "learning_rate": 1.9097541658173843e-05, + "loss": 1.6024, + "step": 229 + }, + { + "epoch": 1.0, + "grad_norm": 0.4303857982158661, + "learning_rate": 1.9097541658173843e-05, + "loss": 1.7058, + "step": 230 + }, + { + "epoch": 1.0043478260869565, + "grad_norm": 0.4303857982158661, + "learning_rate": 1.9097541658173843e-05, + "loss": 1.5432, + "step": 231 + }, + { + "epoch": 1.008695652173913, + "grad_norm": 0.422577828168869, + "learning_rate": 1.9179692367859514e-05, + "loss": 1.5811, + "step": 232 + }, + { + "epoch": 1.0130434782608695, + "grad_norm": 0.422577828168869, + "learning_rate": 1.9179692367859514e-05, + "loss": 1.5742, + "step": 233 + }, + { + "epoch": 1.017391304347826, + "grad_norm": 0.422577828168869, + "learning_rate": 1.9179692367859514e-05, + "loss": 1.5679, + "step": 234 + }, + { + "epoch": 1.0217391304347827, + "grad_norm": 0.422577828168869, + "learning_rate": 1.9179692367859514e-05, + "loss": 1.5029, + "step": 235 + }, + { + "epoch": 1.0260869565217392, + "grad_norm": 0.4109610915184021, + "learning_rate": 1.926043872292045e-05, + "loss": 1.6383, + "step": 236 + }, + { + "epoch": 1.0304347826086957, + "grad_norm": 0.4109610915184021, + "learning_rate": 1.926043872292045e-05, + "loss": 1.6379, + "step": 237 + }, + { + "epoch": 1.0347826086956522, + "grad_norm": 0.4109610915184021, + "learning_rate": 1.926043872292045e-05, + "loss": 1.5574, + "step": 238 + }, + { + "epoch": 1.0391304347826087, + "grad_norm": 0.4109610915184021, + "learning_rate": 1.926043872292045e-05, + "loss": 1.5899, + "step": 239 + }, + { + "epoch": 1.0434782608695652, + "grad_norm": 0.4028691351413727, + "learning_rate": 1.9339827931945454e-05, + "loss": 1.6503, + "step": 240 + }, + { + "epoch": 1.0478260869565217, + "grad_norm": 0.4028691351413727, + "learning_rate": 1.9339827931945454e-05, + "loss": 1.5943, + "step": 241 + }, + { + "epoch": 1.0521739130434782, + "grad_norm": 0.4028691351413727, + "learning_rate": 1.9339827931945454e-05, + "loss": 1.5263, + "step": 242 + }, + { + "epoch": 1.0565217391304347, + "grad_norm": 0.4028691351413727, + "learning_rate": 1.9339827931945454e-05, + "loss": 1.6053, + "step": 243 + }, + { + "epoch": 1.0608695652173914, + "grad_norm": 0.415353387594223, + "learning_rate": 1.941790486238291e-05, + "loss": 1.5856, + "step": 244 + }, + { + "epoch": 1.065217391304348, + "grad_norm": 0.415353387594223, + "learning_rate": 1.941790486238291e-05, + "loss": 1.6749, + "step": 245 + }, + { + "epoch": 1.0695652173913044, + "grad_norm": 0.415353387594223, + "learning_rate": 1.941790486238291e-05, + "loss": 1.5961, + "step": 246 + }, + { + "epoch": 1.0739130434782609, + "grad_norm": 0.415353387594223, + "learning_rate": 1.941790486238291e-05, + "loss": 1.5564, + "step": 247 + }, + { + "epoch": 1.0782608695652174, + "grad_norm": 0.43506088852882385, + "learning_rate": 1.949471219282736e-05, + "loss": 1.5932, + "step": 248 + }, + { + "epoch": 1.0826086956521739, + "grad_norm": 0.43506088852882385, + "learning_rate": 1.949471219282736e-05, + "loss": 1.5567, + "step": 249 + }, + { + "epoch": 1.0869565217391304, + "grad_norm": 0.43506088852882385, + "learning_rate": 1.949471219282736e-05, + "loss": 1.5601, + "step": 250 + }, + { + "epoch": 1.0913043478260869, + "grad_norm": 0.43506088852882385, + "learning_rate": 1.949471219282736e-05, + "loss": 1.6339, + "step": 251 + }, + { + "epoch": 1.0956521739130434, + "grad_norm": 0.4195059835910797, + "learning_rate": 1.9570290553121646e-05, + "loss": 1.5493, + "step": 252 + }, + { + "epoch": 1.1, + "grad_norm": 0.4195059835910797, + "learning_rate": 1.9570290553121646e-05, + "loss": 1.5483, + "step": 253 + }, + { + "epoch": 1.1043478260869566, + "grad_norm": 0.4195059835910797, + "learning_rate": 1.9570290553121646e-05, + "loss": 1.5828, + "step": 254 + }, + { + "epoch": 1.108695652173913, + "grad_norm": 0.4195059835910797, + "learning_rate": 1.9570290553121646e-05, + "loss": 1.5189, + "step": 255 + }, + { + "epoch": 1.1130434782608696, + "grad_norm": 0.41635432839393616, + "learning_rate": 1.9644678653425876e-05, + "loss": 1.5438, + "step": 256 + }, + { + "epoch": 1.117391304347826, + "grad_norm": 0.41635432839393616, + "learning_rate": 1.9644678653425876e-05, + "loss": 1.6092, + "step": 257 + }, + { + "epoch": 1.1217391304347826, + "grad_norm": 0.41635432839393616, + "learning_rate": 1.9644678653425876e-05, + "loss": 1.5977, + "step": 258 + }, + { + "epoch": 1.126086956521739, + "grad_norm": 0.41635432839393616, + "learning_rate": 1.9644678653425876e-05, + "loss": 1.6079, + "step": 259 + }, + { + "epoch": 1.1304347826086956, + "grad_norm": 0.4125351905822754, + "learning_rate": 1.971791340327986e-05, + "loss": 1.553, + "step": 260 + }, + { + "epoch": 1.134782608695652, + "grad_norm": 0.4125351905822754, + "learning_rate": 1.971791340327986e-05, + "loss": 1.5419, + "step": 261 + }, + { + "epoch": 1.1391304347826088, + "grad_norm": 0.4125351905822754, + "learning_rate": 1.971791340327986e-05, + "loss": 1.5396, + "step": 262 + }, + { + "epoch": 1.1434782608695653, + "grad_norm": 0.4125351905822754, + "learning_rate": 1.971791340327986e-05, + "loss": 1.6408, + "step": 263 + }, + { + "epoch": 1.1478260869565218, + "grad_norm": 0.40901288390159607, + "learning_rate": 1.979003002157565e-05, + "loss": 1.5664, + "step": 264 + }, + { + "epoch": 1.1521739130434783, + "grad_norm": 0.40901288390159607, + "learning_rate": 1.979003002157565e-05, + "loss": 1.5708, + "step": 265 + }, + { + "epoch": 1.1565217391304348, + "grad_norm": 0.40901288390159607, + "learning_rate": 1.979003002157565e-05, + "loss": 1.6379, + "step": 266 + }, + { + "epoch": 1.1608695652173913, + "grad_norm": 0.40901288390159607, + "learning_rate": 1.979003002157565e-05, + "loss": 1.5962, + "step": 267 + }, + { + "epoch": 1.1652173913043478, + "grad_norm": 0.4133271872997284, + "learning_rate": 1.9861062138260542e-05, + "loss": 1.6233, + "step": 268 + }, + { + "epoch": 1.1695652173913043, + "grad_norm": 0.4133271872997284, + "learning_rate": 1.9861062138260542e-05, + "loss": 1.5597, + "step": 269 + }, + { + "epoch": 1.1739130434782608, + "grad_norm": 0.4133271872997284, + "learning_rate": 1.9861062138260542e-05, + "loss": 1.5396, + "step": 270 + }, + { + "epoch": 1.1782608695652175, + "grad_norm": 0.4133271872997284, + "learning_rate": 1.9861062138260542e-05, + "loss": 1.6504, + "step": 271 + }, + { + "epoch": 1.182608695652174, + "grad_norm": 0.4083419144153595, + "learning_rate": 1.993104188850563e-05, + "loss": 1.519, + "step": 272 + }, + { + "epoch": 1.1869565217391305, + "grad_norm": 0.4083419144153595, + "learning_rate": 1.993104188850563e-05, + "loss": 1.6034, + "step": 273 + }, + { + "epoch": 1.191304347826087, + "grad_norm": 0.4083419144153595, + "learning_rate": 1.993104188850563e-05, + "loss": 1.5509, + "step": 274 + }, + { + "epoch": 1.1956521739130435, + "grad_norm": 0.4083419144153595, + "learning_rate": 1.993104188850563e-05, + "loss": 1.4744, + "step": 275 + }, + { + "epoch": 1.2, + "grad_norm": 0.40321245789527893, + "learning_rate": 2e-05, + "loss": 1.5611, + "step": 276 + }, + { + "epoch": 1.2043478260869565, + "grad_norm": 0.40321245789527893, + "learning_rate": 2e-05, + "loss": 1.5937, + "step": 277 + }, + { + "epoch": 1.208695652173913, + "grad_norm": 0.40321245789527893, + "learning_rate": 2e-05, + "loss": 1.5801, + "step": 278 + }, + { + "epoch": 1.2130434782608694, + "grad_norm": 0.40321245789527893, + "learning_rate": 2e-05, + "loss": 1.5999, + "step": 279 + }, + { + "epoch": 1.2173913043478262, + "grad_norm": 0.4187948703765869, + "learning_rate": 2e-05, + "loss": 1.5488, + "step": 280 + }, + { + "epoch": 1.2217391304347827, + "grad_norm": 0.4187948703765869, + "learning_rate": 2e-05, + "loss": 1.4954, + "step": 281 + }, + { + "epoch": 1.2260869565217392, + "grad_norm": 0.4187948703765869, + "learning_rate": 2e-05, + "loss": 1.4704, + "step": 282 + }, + { + "epoch": 1.2304347826086957, + "grad_norm": 0.4187948703765869, + "learning_rate": 2e-05, + "loss": 1.5532, + "step": 283 + }, + { + "epoch": 1.2347826086956522, + "grad_norm": 0.39265695214271545, + "learning_rate": 2e-05, + "loss": 1.6158, + "step": 284 + }, + { + "epoch": 1.2391304347826086, + "grad_norm": 0.39265695214271545, + "learning_rate": 2e-05, + "loss": 1.5282, + "step": 285 + }, + { + "epoch": 1.2434782608695651, + "grad_norm": 0.39265695214271545, + "learning_rate": 2e-05, + "loss": 1.6288, + "step": 286 + }, + { + "epoch": 1.2478260869565219, + "grad_norm": 0.39265695214271545, + "learning_rate": 2e-05, + "loss": 1.6232, + "step": 287 + }, + { + "epoch": 1.2521739130434781, + "grad_norm": 0.4242652654647827, + "learning_rate": 2e-05, + "loss": 1.5431, + "step": 288 + }, + { + "epoch": 1.2565217391304349, + "grad_norm": 0.4242652654647827, + "learning_rate": 2e-05, + "loss": 1.5737, + "step": 289 + }, + { + "epoch": 1.2608695652173914, + "grad_norm": 0.4242652654647827, + "learning_rate": 2e-05, + "loss": 1.5198, + "step": 290 + }, + { + "epoch": 1.2652173913043478, + "grad_norm": 0.4242652654647827, + "learning_rate": 2e-05, + "loss": 1.5479, + "step": 291 + }, + { + "epoch": 1.2695652173913043, + "grad_norm": 0.4110427796840668, + "learning_rate": 2e-05, + "loss": 1.6079, + "step": 292 + }, + { + "epoch": 1.2739130434782608, + "grad_norm": 0.4110427796840668, + "learning_rate": 2e-05, + "loss": 1.6189, + "step": 293 + }, + { + "epoch": 1.2782608695652173, + "grad_norm": 0.4110427796840668, + "learning_rate": 2e-05, + "loss": 1.5868, + "step": 294 + }, + { + "epoch": 1.2826086956521738, + "grad_norm": 0.4110427796840668, + "learning_rate": 2e-05, + "loss": 1.5536, + "step": 295 + }, + { + "epoch": 1.2869565217391306, + "grad_norm": 0.4077237546443939, + "learning_rate": 2e-05, + "loss": 1.6191, + "step": 296 + }, + { + "epoch": 1.2913043478260868, + "grad_norm": 0.4077237546443939, + "learning_rate": 2e-05, + "loss": 1.5256, + "step": 297 + }, + { + "epoch": 1.2956521739130435, + "grad_norm": 0.4077237546443939, + "learning_rate": 2e-05, + "loss": 1.4858, + "step": 298 + }, + { + "epoch": 1.3, + "grad_norm": 0.4077237546443939, + "learning_rate": 2e-05, + "loss": 1.6217, + "step": 299 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.4294397234916687, + "learning_rate": 2e-05, + "loss": 1.5448, + "step": 300 + }, + { + "epoch": 1.308695652173913, + "grad_norm": 0.4294397234916687, + "learning_rate": 2e-05, + "loss": 1.5482, + "step": 301 + }, + { + "epoch": 1.3130434782608695, + "grad_norm": 0.4294397234916687, + "learning_rate": 2e-05, + "loss": 1.5587, + "step": 302 + }, + { + "epoch": 1.317391304347826, + "grad_norm": 0.4294397234916687, + "learning_rate": 2e-05, + "loss": 1.5039, + "step": 303 + }, + { + "epoch": 1.3217391304347825, + "grad_norm": 0.42842897772789, + "learning_rate": 2e-05, + "loss": 1.6381, + "step": 304 + }, + { + "epoch": 1.3260869565217392, + "grad_norm": 0.42842897772789, + "learning_rate": 2e-05, + "loss": 1.6137, + "step": 305 + }, + { + "epoch": 1.3304347826086955, + "grad_norm": 0.42842897772789, + "learning_rate": 2e-05, + "loss": 1.5976, + "step": 306 + }, + { + "epoch": 1.3347826086956522, + "grad_norm": 0.42842897772789, + "learning_rate": 2e-05, + "loss": 1.6185, + "step": 307 + }, + { + "epoch": 1.3391304347826087, + "grad_norm": 0.41912516951560974, + "learning_rate": 2e-05, + "loss": 1.5669, + "step": 308 + }, + { + "epoch": 1.3434782608695652, + "grad_norm": 0.41912516951560974, + "learning_rate": 2e-05, + "loss": 1.5439, + "step": 309 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 0.41912516951560974, + "learning_rate": 2e-05, + "loss": 1.5554, + "step": 310 + }, + { + "epoch": 1.3521739130434782, + "grad_norm": 0.41912516951560974, + "learning_rate": 2e-05, + "loss": 1.5461, + "step": 311 + }, + { + "epoch": 1.3565217391304347, + "grad_norm": 0.4236287772655487, + "learning_rate": 2e-05, + "loss": 1.5871, + "step": 312 + }, + { + "epoch": 1.3608695652173912, + "grad_norm": 0.4236287772655487, + "learning_rate": 2e-05, + "loss": 1.4733, + "step": 313 + }, + { + "epoch": 1.365217391304348, + "grad_norm": 0.4236287772655487, + "learning_rate": 2e-05, + "loss": 1.5409, + "step": 314 + }, + { + "epoch": 1.3695652173913042, + "grad_norm": 0.4236287772655487, + "learning_rate": 2e-05, + "loss": 1.5149, + "step": 315 + }, + { + "epoch": 1.373913043478261, + "grad_norm": 0.38583850860595703, + "learning_rate": 2e-05, + "loss": 1.4956, + "step": 316 + }, + { + "epoch": 1.3782608695652174, + "grad_norm": 0.38583850860595703, + "learning_rate": 2e-05, + "loss": 1.5329, + "step": 317 + }, + { + "epoch": 1.382608695652174, + "grad_norm": 0.38583850860595703, + "learning_rate": 2e-05, + "loss": 1.5038, + "step": 318 + }, + { + "epoch": 1.3869565217391304, + "grad_norm": 0.38583850860595703, + "learning_rate": 2e-05, + "loss": 1.5759, + "step": 319 + }, + { + "epoch": 1.391304347826087, + "grad_norm": 0.4223187267780304, + "learning_rate": 2e-05, + "loss": 1.5672, + "step": 320 + }, + { + "epoch": 1.3956521739130434, + "grad_norm": 0.4223187267780304, + "learning_rate": 2e-05, + "loss": 1.5082, + "step": 321 + }, + { + "epoch": 1.4, + "grad_norm": 0.4223187267780304, + "learning_rate": 2e-05, + "loss": 1.5306, + "step": 322 + }, + { + "epoch": 1.4043478260869566, + "grad_norm": 0.4223187267780304, + "learning_rate": 2e-05, + "loss": 1.5861, + "step": 323 + }, + { + "epoch": 1.4086956521739131, + "grad_norm": 0.42702987790107727, + "learning_rate": 2e-05, + "loss": 1.6032, + "step": 324 + }, + { + "epoch": 1.4130434782608696, + "grad_norm": 0.42702987790107727, + "learning_rate": 2e-05, + "loss": 1.5757, + "step": 325 + }, + { + "epoch": 1.4173913043478261, + "grad_norm": 0.42702987790107727, + "learning_rate": 2e-05, + "loss": 1.5847, + "step": 326 + }, + { + "epoch": 1.4217391304347826, + "grad_norm": 0.42702987790107727, + "learning_rate": 2e-05, + "loss": 1.5571, + "step": 327 + }, + { + "epoch": 1.4260869565217391, + "grad_norm": 0.4275616705417633, + "learning_rate": 2e-05, + "loss": 1.4902, + "step": 328 + }, + { + "epoch": 1.4304347826086956, + "grad_norm": 0.4275616705417633, + "learning_rate": 2e-05, + "loss": 1.5888, + "step": 329 + }, + { + "epoch": 1.434782608695652, + "grad_norm": 0.4275616705417633, + "learning_rate": 2e-05, + "loss": 1.5521, + "step": 330 + }, + { + "epoch": 1.4391304347826086, + "grad_norm": 0.4275616705417633, + "learning_rate": 2e-05, + "loss": 1.5895, + "step": 331 + }, + { + "epoch": 1.4434782608695653, + "grad_norm": 0.4133172333240509, + "learning_rate": 2e-05, + "loss": 1.5116, + "step": 332 + }, + { + "epoch": 1.4478260869565218, + "grad_norm": 0.4133172333240509, + "learning_rate": 2e-05, + "loss": 1.5397, + "step": 333 + }, + { + "epoch": 1.4521739130434783, + "grad_norm": 0.4133172333240509, + "learning_rate": 2e-05, + "loss": 1.5809, + "step": 334 + }, + { + "epoch": 1.4565217391304348, + "grad_norm": 0.4133172333240509, + "learning_rate": 2e-05, + "loss": 1.4922, + "step": 335 + }, + { + "epoch": 1.4608695652173913, + "grad_norm": 0.4177166819572449, + "learning_rate": 2e-05, + "loss": 1.5604, + "step": 336 + }, + { + "epoch": 1.4652173913043478, + "grad_norm": 0.4177166819572449, + "learning_rate": 2e-05, + "loss": 1.5536, + "step": 337 + }, + { + "epoch": 1.4695652173913043, + "grad_norm": 0.4177166819572449, + "learning_rate": 2e-05, + "loss": 1.5712, + "step": 338 + }, + { + "epoch": 1.4739130434782608, + "grad_norm": 0.4177166819572449, + "learning_rate": 2e-05, + "loss": 1.4619, + "step": 339 + }, + { + "epoch": 1.4782608695652173, + "grad_norm": 0.4101729393005371, + "learning_rate": 2e-05, + "loss": 1.5609, + "step": 340 + }, + { + "epoch": 1.482608695652174, + "grad_norm": 0.4101729393005371, + "learning_rate": 2e-05, + "loss": 1.5474, + "step": 341 + }, + { + "epoch": 1.4869565217391305, + "grad_norm": 0.4101729393005371, + "learning_rate": 2e-05, + "loss": 1.5178, + "step": 342 + }, + { + "epoch": 1.491304347826087, + "grad_norm": 0.4101729393005371, + "learning_rate": 2e-05, + "loss": 1.4915, + "step": 343 + }, + { + "epoch": 1.4956521739130435, + "grad_norm": 0.41781285405158997, + "learning_rate": 2e-05, + "loss": 1.5977, + "step": 344 + }, + { + "epoch": 1.5, + "grad_norm": 0.41781285405158997, + "learning_rate": 2e-05, + "loss": 1.5584, + "step": 345 + }, + { + "epoch": 1.5043478260869565, + "grad_norm": 0.41781285405158997, + "learning_rate": 2e-05, + "loss": 1.5904, + "step": 346 + }, + { + "epoch": 1.508695652173913, + "grad_norm": 0.41781285405158997, + "learning_rate": 2e-05, + "loss": 1.6015, + "step": 347 + }, + { + "epoch": 1.5130434782608697, + "grad_norm": 0.4429299235343933, + "learning_rate": 2e-05, + "loss": 1.5282, + "step": 348 + }, + { + "epoch": 1.517391304347826, + "grad_norm": 0.4429299235343933, + "learning_rate": 2e-05, + "loss": 1.5595, + "step": 349 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 0.4429299235343933, + "learning_rate": 2e-05, + "loss": 1.5522, + "step": 350 + }, + { + "epoch": 1.526086956521739, + "grad_norm": 0.4429299235343933, + "learning_rate": 2e-05, + "loss": 1.5934, + "step": 351 + }, + { + "epoch": 1.5304347826086957, + "grad_norm": 0.4120587706565857, + "learning_rate": 2e-05, + "loss": 1.4523, + "step": 352 + }, + { + "epoch": 1.5347826086956522, + "grad_norm": 0.4120587706565857, + "learning_rate": 2e-05, + "loss": 1.5408, + "step": 353 + }, + { + "epoch": 1.5391304347826087, + "grad_norm": 0.4120587706565857, + "learning_rate": 2e-05, + "loss": 1.6324, + "step": 354 + }, + { + "epoch": 1.5434782608695652, + "grad_norm": 0.4120587706565857, + "learning_rate": 2e-05, + "loss": 1.4724, + "step": 355 + }, + { + "epoch": 1.5478260869565217, + "grad_norm": 0.4269024431705475, + "learning_rate": 2e-05, + "loss": 1.5582, + "step": 356 + }, + { + "epoch": 1.5521739130434784, + "grad_norm": 0.4269024431705475, + "learning_rate": 2e-05, + "loss": 1.4954, + "step": 357 + }, + { + "epoch": 1.5565217391304347, + "grad_norm": 0.4269024431705475, + "learning_rate": 2e-05, + "loss": 1.5452, + "step": 358 + }, + { + "epoch": 1.5608695652173914, + "grad_norm": 0.4269024431705475, + "learning_rate": 2e-05, + "loss": 1.4923, + "step": 359 + }, + { + "epoch": 1.5652173913043477, + "grad_norm": 0.4282481074333191, + "learning_rate": 2e-05, + "loss": 1.5833, + "step": 360 + }, + { + "epoch": 1.5695652173913044, + "grad_norm": 0.4282481074333191, + "learning_rate": 2e-05, + "loss": 1.589, + "step": 361 + }, + { + "epoch": 1.5739130434782609, + "grad_norm": 0.4282481074333191, + "learning_rate": 2e-05, + "loss": 1.5912, + "step": 362 + }, + { + "epoch": 1.5782608695652174, + "grad_norm": 0.4282481074333191, + "learning_rate": 2e-05, + "loss": 1.5138, + "step": 363 + }, + { + "epoch": 1.5826086956521739, + "grad_norm": 0.40982702374458313, + "learning_rate": 2e-05, + "loss": 1.5423, + "step": 364 + }, + { + "epoch": 1.5869565217391304, + "grad_norm": 0.40982702374458313, + "learning_rate": 2e-05, + "loss": 1.5006, + "step": 365 + }, + { + "epoch": 1.591304347826087, + "grad_norm": 0.40982702374458313, + "learning_rate": 2e-05, + "loss": 1.5404, + "step": 366 + }, + { + "epoch": 1.5956521739130434, + "grad_norm": 0.40982702374458313, + "learning_rate": 2e-05, + "loss": 1.5456, + "step": 367 + }, + { + "epoch": 1.6, + "grad_norm": 0.46088093519210815, + "learning_rate": 2e-05, + "loss": 1.5764, + "step": 368 + }, + { + "epoch": 1.6043478260869564, + "grad_norm": 0.46088093519210815, + "learning_rate": 2e-05, + "loss": 1.5396, + "step": 369 + }, + { + "epoch": 1.608695652173913, + "grad_norm": 0.46088093519210815, + "learning_rate": 2e-05, + "loss": 1.5228, + "step": 370 + }, + { + "epoch": 1.6130434782608696, + "grad_norm": 0.46088093519210815, + "learning_rate": 2e-05, + "loss": 1.5328, + "step": 371 + }, + { + "epoch": 1.617391304347826, + "grad_norm": 0.43352362513542175, + "learning_rate": 2e-05, + "loss": 1.5284, + "step": 372 + }, + { + "epoch": 1.6217391304347826, + "grad_norm": 0.43352362513542175, + "learning_rate": 2e-05, + "loss": 1.5102, + "step": 373 + }, + { + "epoch": 1.626086956521739, + "grad_norm": 0.43352362513542175, + "learning_rate": 2e-05, + "loss": 1.5272, + "step": 374 + }, + { + "epoch": 1.6304347826086958, + "grad_norm": 0.43352362513542175, + "learning_rate": 2e-05, + "loss": 1.5409, + "step": 375 + }, + { + "epoch": 1.634782608695652, + "grad_norm": 0.436289519071579, + "learning_rate": 2e-05, + "loss": 1.495, + "step": 376 + }, + { + "epoch": 1.6391304347826088, + "grad_norm": 0.436289519071579, + "learning_rate": 2e-05, + "loss": 1.5289, + "step": 377 + }, + { + "epoch": 1.643478260869565, + "grad_norm": 0.436289519071579, + "learning_rate": 2e-05, + "loss": 1.5114, + "step": 378 + }, + { + "epoch": 1.6478260869565218, + "grad_norm": 0.436289519071579, + "learning_rate": 2e-05, + "loss": 1.5157, + "step": 379 + }, + { + "epoch": 1.6521739130434783, + "grad_norm": 0.41269221901893616, + "learning_rate": 2e-05, + "loss": 1.5147, + "step": 380 + }, + { + "epoch": 1.6565217391304348, + "grad_norm": 0.41269221901893616, + "learning_rate": 2e-05, + "loss": 1.4763, + "step": 381 + }, + { + "epoch": 1.6608695652173913, + "grad_norm": 0.41269221901893616, + "learning_rate": 2e-05, + "loss": 1.4898, + "step": 382 + }, + { + "epoch": 1.6652173913043478, + "grad_norm": 0.41269221901893616, + "learning_rate": 2e-05, + "loss": 1.5237, + "step": 383 + }, + { + "epoch": 1.6695652173913045, + "grad_norm": 0.42367058992385864, + "learning_rate": 2e-05, + "loss": 1.5789, + "step": 384 + }, + { + "epoch": 1.6739130434782608, + "grad_norm": 0.42367058992385864, + "learning_rate": 2e-05, + "loss": 1.5807, + "step": 385 + }, + { + "epoch": 1.6782608695652175, + "grad_norm": 0.42367058992385864, + "learning_rate": 2e-05, + "loss": 1.5238, + "step": 386 + }, + { + "epoch": 1.6826086956521737, + "grad_norm": 0.42367058992385864, + "learning_rate": 2e-05, + "loss": 1.5146, + "step": 387 + }, + { + "epoch": 1.6869565217391305, + "grad_norm": 0.4358891546726227, + "learning_rate": 2e-05, + "loss": 1.5674, + "step": 388 + }, + { + "epoch": 1.691304347826087, + "grad_norm": 0.4358891546726227, + "learning_rate": 2e-05, + "loss": 1.4984, + "step": 389 + }, + { + "epoch": 1.6956521739130435, + "grad_norm": 0.4358891546726227, + "learning_rate": 2e-05, + "loss": 1.5361, + "step": 390 + }, + { + "epoch": 1.7, + "grad_norm": 0.4358891546726227, + "learning_rate": 2e-05, + "loss": 1.4934, + "step": 391 + }, + { + "epoch": 1.7043478260869565, + "grad_norm": 0.42739376425743103, + "learning_rate": 2e-05, + "loss": 1.5489, + "step": 392 + }, + { + "epoch": 1.7086956521739132, + "grad_norm": 0.42739376425743103, + "learning_rate": 2e-05, + "loss": 1.5327, + "step": 393 + }, + { + "epoch": 1.7130434782608694, + "grad_norm": 0.42739376425743103, + "learning_rate": 2e-05, + "loss": 1.5521, + "step": 394 + }, + { + "epoch": 1.7173913043478262, + "grad_norm": 0.42739376425743103, + "learning_rate": 2e-05, + "loss": 1.586, + "step": 395 + }, + { + "epoch": 1.7217391304347827, + "grad_norm": 0.43897366523742676, + "learning_rate": 2e-05, + "loss": 1.5053, + "step": 396 + }, + { + "epoch": 1.7260869565217392, + "grad_norm": 0.43897366523742676, + "learning_rate": 2e-05, + "loss": 1.5414, + "step": 397 + }, + { + "epoch": 1.7304347826086957, + "grad_norm": 0.43897366523742676, + "learning_rate": 2e-05, + "loss": 1.5572, + "step": 398 + }, + { + "epoch": 1.7347826086956522, + "grad_norm": 0.43897366523742676, + "learning_rate": 2e-05, + "loss": 1.565, + "step": 399 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.4224741756916046, + "learning_rate": 2e-05, + "loss": 1.5816, + "step": 400 + }, + { + "epoch": 1.7434782608695651, + "grad_norm": 0.4224741756916046, + "learning_rate": 2e-05, + "loss": 1.5445, + "step": 401 + }, + { + "epoch": 1.7478260869565219, + "grad_norm": 0.4224741756916046, + "learning_rate": 2e-05, + "loss": 1.5533, + "step": 402 + }, + { + "epoch": 1.7521739130434781, + "grad_norm": 0.4224741756916046, + "learning_rate": 2e-05, + "loss": 1.558, + "step": 403 + }, + { + "epoch": 1.7565217391304349, + "grad_norm": 0.4469275176525116, + "learning_rate": 2e-05, + "loss": 1.5241, + "step": 404 + }, + { + "epoch": 1.7608695652173914, + "grad_norm": 0.4469275176525116, + "learning_rate": 2e-05, + "loss": 1.5289, + "step": 405 + }, + { + "epoch": 1.7652173913043478, + "grad_norm": 0.4469275176525116, + "learning_rate": 2e-05, + "loss": 1.5669, + "step": 406 + }, + { + "epoch": 1.7695652173913043, + "grad_norm": 0.4469275176525116, + "learning_rate": 2e-05, + "loss": 1.5317, + "step": 407 + }, + { + "epoch": 1.7739130434782608, + "grad_norm": 0.44230249524116516, + "learning_rate": 2e-05, + "loss": 1.5305, + "step": 408 + }, + { + "epoch": 1.7782608695652173, + "grad_norm": 0.44230249524116516, + "learning_rate": 2e-05, + "loss": 1.5701, + "step": 409 + }, + { + "epoch": 1.7826086956521738, + "grad_norm": 0.44230249524116516, + "learning_rate": 2e-05, + "loss": 1.5086, + "step": 410 + }, + { + "epoch": 1.7869565217391306, + "grad_norm": 0.44230249524116516, + "learning_rate": 2e-05, + "loss": 1.4514, + "step": 411 + }, + { + "epoch": 1.7913043478260868, + "grad_norm": 0.42857682704925537, + "learning_rate": 2e-05, + "loss": 1.5295, + "step": 412 + }, + { + "epoch": 1.7956521739130435, + "grad_norm": 0.42857682704925537, + "learning_rate": 2e-05, + "loss": 1.4738, + "step": 413 + }, + { + "epoch": 1.8, + "grad_norm": 0.42857682704925537, + "learning_rate": 2e-05, + "loss": 1.5406, + "step": 414 + }, + { + "epoch": 1.8043478260869565, + "grad_norm": 0.42857682704925537, + "learning_rate": 2e-05, + "loss": 1.533, + "step": 415 + }, + { + "epoch": 1.808695652173913, + "grad_norm": 0.44212475419044495, + "learning_rate": 2e-05, + "loss": 1.5497, + "step": 416 + }, + { + "epoch": 1.8130434782608695, + "grad_norm": 0.44212475419044495, + "learning_rate": 2e-05, + "loss": 1.558, + "step": 417 + }, + { + "epoch": 1.8173913043478263, + "grad_norm": 0.44212475419044495, + "learning_rate": 2e-05, + "loss": 1.557, + "step": 418 + }, + { + "epoch": 1.8217391304347825, + "grad_norm": 0.44212475419044495, + "learning_rate": 2e-05, + "loss": 1.5076, + "step": 419 + }, + { + "epoch": 1.8260869565217392, + "grad_norm": 0.42618614435195923, + "learning_rate": 2e-05, + "loss": 1.5906, + "step": 420 + }, + { + "epoch": 1.8304347826086955, + "grad_norm": 0.42618614435195923, + "learning_rate": 2e-05, + "loss": 1.5247, + "step": 421 + }, + { + "epoch": 1.8347826086956522, + "grad_norm": 0.42618614435195923, + "learning_rate": 2e-05, + "loss": 1.4649, + "step": 422 + }, + { + "epoch": 1.8391304347826087, + "grad_norm": 0.42618614435195923, + "learning_rate": 2e-05, + "loss": 1.5779, + "step": 423 + }, + { + "epoch": 1.8434782608695652, + "grad_norm": 0.44451162219047546, + "learning_rate": 2e-05, + "loss": 1.5232, + "step": 424 + }, + { + "epoch": 1.8478260869565217, + "grad_norm": 0.44451162219047546, + "learning_rate": 2e-05, + "loss": 1.5158, + "step": 425 + }, + { + "epoch": 1.8521739130434782, + "grad_norm": 0.44451162219047546, + "learning_rate": 2e-05, + "loss": 1.4625, + "step": 426 + }, + { + "epoch": 1.856521739130435, + "grad_norm": 0.44451162219047546, + "learning_rate": 2e-05, + "loss": 1.5394, + "step": 427 + }, + { + "epoch": 1.8608695652173912, + "grad_norm": 0.41748350858688354, + "learning_rate": 2e-05, + "loss": 1.5336, + "step": 428 + }, + { + "epoch": 1.865217391304348, + "grad_norm": 0.41748350858688354, + "learning_rate": 2e-05, + "loss": 1.5231, + "step": 429 + }, + { + "epoch": 1.8695652173913042, + "grad_norm": 0.41748350858688354, + "learning_rate": 2e-05, + "loss": 1.498, + "step": 430 + }, + { + "epoch": 1.873913043478261, + "grad_norm": 0.41748350858688354, + "learning_rate": 2e-05, + "loss": 1.4854, + "step": 431 + }, + { + "epoch": 1.8782608695652174, + "grad_norm": 0.426869660615921, + "learning_rate": 2e-05, + "loss": 1.5253, + "step": 432 + }, + { + "epoch": 1.882608695652174, + "grad_norm": 0.426869660615921, + "learning_rate": 2e-05, + "loss": 1.5273, + "step": 433 + }, + { + "epoch": 1.8869565217391304, + "grad_norm": 0.426869660615921, + "learning_rate": 2e-05, + "loss": 1.5221, + "step": 434 + }, + { + "epoch": 1.891304347826087, + "grad_norm": 0.426869660615921, + "learning_rate": 2e-05, + "loss": 1.5335, + "step": 435 + }, + { + "epoch": 1.8956521739130436, + "grad_norm": 0.45192039012908936, + "learning_rate": 2e-05, + "loss": 1.425, + "step": 436 + }, + { + "epoch": 1.9, + "grad_norm": 0.45192039012908936, + "learning_rate": 2e-05, + "loss": 1.5333, + "step": 437 + }, + { + "epoch": 1.9043478260869566, + "grad_norm": 0.45192039012908936, + "learning_rate": 2e-05, + "loss": 1.5895, + "step": 438 + }, + { + "epoch": 1.908695652173913, + "grad_norm": 0.45192039012908936, + "learning_rate": 2e-05, + "loss": 1.5203, + "step": 439 + }, + { + "epoch": 1.9130434782608696, + "grad_norm": 0.42814260721206665, + "learning_rate": 2e-05, + "loss": 1.5387, + "step": 440 + }, + { + "epoch": 1.9173913043478261, + "grad_norm": 0.42814260721206665, + "learning_rate": 2e-05, + "loss": 1.492, + "step": 441 + }, + { + "epoch": 1.9217391304347826, + "grad_norm": 0.42814260721206665, + "learning_rate": 2e-05, + "loss": 1.4469, + "step": 442 + }, + { + "epoch": 1.9260869565217391, + "grad_norm": 0.42814260721206665, + "learning_rate": 2e-05, + "loss": 1.4633, + "step": 443 + }, + { + "epoch": 1.9304347826086956, + "grad_norm": 0.42902347445487976, + "learning_rate": 2e-05, + "loss": 1.5168, + "step": 444 + }, + { + "epoch": 1.9347826086956523, + "grad_norm": 0.42902347445487976, + "learning_rate": 2e-05, + "loss": 1.5096, + "step": 445 + }, + { + "epoch": 1.9391304347826086, + "grad_norm": 0.42902347445487976, + "learning_rate": 2e-05, + "loss": 1.5049, + "step": 446 + }, + { + "epoch": 1.9434782608695653, + "grad_norm": 0.42902347445487976, + "learning_rate": 2e-05, + "loss": 1.5848, + "step": 447 + }, + { + "epoch": 1.9478260869565216, + "grad_norm": 0.4723300337791443, + "learning_rate": 2e-05, + "loss": 1.4657, + "step": 448 + }, + { + "epoch": 1.9521739130434783, + "grad_norm": 0.4723300337791443, + "learning_rate": 2e-05, + "loss": 1.5034, + "step": 449 + }, + { + "epoch": 1.9565217391304348, + "grad_norm": 0.4723300337791443, + "learning_rate": 2e-05, + "loss": 1.5047, + "step": 450 + }, + { + "epoch": 1.9608695652173913, + "grad_norm": 0.4723300337791443, + "learning_rate": 2e-05, + "loss": 1.4795, + "step": 451 + }, + { + "epoch": 1.9652173913043478, + "grad_norm": 0.4308329224586487, + "learning_rate": 2e-05, + "loss": 1.5861, + "step": 452 + }, + { + "epoch": 1.9695652173913043, + "grad_norm": 0.4308329224586487, + "learning_rate": 2e-05, + "loss": 1.532, + "step": 453 + }, + { + "epoch": 1.973913043478261, + "grad_norm": 0.4308329224586487, + "learning_rate": 2e-05, + "loss": 1.4708, + "step": 454 + }, + { + "epoch": 1.9782608695652173, + "grad_norm": 0.4308329224586487, + "learning_rate": 2e-05, + "loss": 1.5533, + "step": 455 + }, + { + "epoch": 1.982608695652174, + "grad_norm": 0.4359206557273865, + "learning_rate": 2e-05, + "loss": 1.4912, + "step": 456 + }, + { + "epoch": 1.9869565217391303, + "grad_norm": 0.4359206557273865, + "learning_rate": 2e-05, + "loss": 1.5453, + "step": 457 + }, + { + "epoch": 1.991304347826087, + "grad_norm": 0.4359206557273865, + "learning_rate": 2e-05, + "loss": 1.5074, + "step": 458 + }, + { + "epoch": 1.9956521739130435, + "grad_norm": 0.4359206557273865, + "learning_rate": 2e-05, + "loss": 1.6286, + "step": 459 + }, + { + "epoch": 2.0, + "grad_norm": 0.43849724531173706, + "learning_rate": 2e-05, + "loss": 1.4783, + "step": 460 + }, + { + "epoch": 2.0043478260869567, + "grad_norm": 0.43849724531173706, + "learning_rate": 2e-05, + "loss": 1.4826, + "step": 461 + }, + { + "epoch": 2.008695652173913, + "grad_norm": 0.43849724531173706, + "learning_rate": 2e-05, + "loss": 1.4406, + "step": 462 + }, + { + "epoch": 2.0130434782608697, + "grad_norm": 0.43849724531173706, + "learning_rate": 2e-05, + "loss": 1.4177, + "step": 463 + }, + { + "epoch": 2.017391304347826, + "grad_norm": 0.4389805495738983, + "learning_rate": 2e-05, + "loss": 1.51, + "step": 464 + }, + { + "epoch": 2.0217391304347827, + "grad_norm": 0.4389805495738983, + "learning_rate": 2e-05, + "loss": 1.4149, + "step": 465 + }, + { + "epoch": 2.026086956521739, + "grad_norm": 0.4389805495738983, + "learning_rate": 2e-05, + "loss": 1.4876, + "step": 466 + }, + { + "epoch": 2.0304347826086957, + "grad_norm": 0.4389805495738983, + "learning_rate": 2e-05, + "loss": 1.5165, + "step": 467 + }, + { + "epoch": 2.034782608695652, + "grad_norm": 0.4780814051628113, + "learning_rate": 2e-05, + "loss": 1.4525, + "step": 468 + }, + { + "epoch": 2.0391304347826087, + "grad_norm": 0.4780814051628113, + "learning_rate": 2e-05, + "loss": 1.4272, + "step": 469 + }, + { + "epoch": 2.0434782608695654, + "grad_norm": 0.4780814051628113, + "learning_rate": 2e-05, + "loss": 1.4653, + "step": 470 + }, + { + "epoch": 2.0478260869565217, + "grad_norm": 0.4780814051628113, + "learning_rate": 2e-05, + "loss": 1.4531, + "step": 471 + }, + { + "epoch": 2.0521739130434784, + "grad_norm": 0.4610339105129242, + "learning_rate": 2e-05, + "loss": 1.3922, + "step": 472 + }, + { + "epoch": 2.0565217391304347, + "grad_norm": 0.4610339105129242, + "learning_rate": 2e-05, + "loss": 1.4538, + "step": 473 + }, + { + "epoch": 2.0608695652173914, + "grad_norm": 0.4610339105129242, + "learning_rate": 2e-05, + "loss": 1.5089, + "step": 474 + }, + { + "epoch": 2.0652173913043477, + "grad_norm": 0.4610339105129242, + "learning_rate": 2e-05, + "loss": 1.4111, + "step": 475 + }, + { + "epoch": 2.0695652173913044, + "grad_norm": 0.4437248706817627, + "learning_rate": 2e-05, + "loss": 1.4582, + "step": 476 + }, + { + "epoch": 2.0739130434782607, + "grad_norm": 0.4437248706817627, + "learning_rate": 2e-05, + "loss": 1.4614, + "step": 477 + }, + { + "epoch": 2.0782608695652174, + "grad_norm": 0.4437248706817627, + "learning_rate": 2e-05, + "loss": 1.4708, + "step": 478 + }, + { + "epoch": 2.082608695652174, + "grad_norm": 0.4437248706817627, + "learning_rate": 2e-05, + "loss": 1.4156, + "step": 479 + }, + { + "epoch": 2.0869565217391304, + "grad_norm": 0.41777855157852173, + "learning_rate": 2e-05, + "loss": 1.4923, + "step": 480 + }, + { + "epoch": 2.091304347826087, + "grad_norm": 0.41777855157852173, + "learning_rate": 2e-05, + "loss": 1.4219, + "step": 481 + }, + { + "epoch": 2.0956521739130434, + "grad_norm": 0.41777855157852173, + "learning_rate": 2e-05, + "loss": 1.4431, + "step": 482 + }, + { + "epoch": 2.1, + "grad_norm": 0.41777855157852173, + "learning_rate": 2e-05, + "loss": 1.3807, + "step": 483 + }, + { + "epoch": 2.1043478260869564, + "grad_norm": 0.43318524956703186, + "learning_rate": 2e-05, + "loss": 1.4227, + "step": 484 + }, + { + "epoch": 2.108695652173913, + "grad_norm": 0.43318524956703186, + "learning_rate": 2e-05, + "loss": 1.4109, + "step": 485 + }, + { + "epoch": 2.1130434782608694, + "grad_norm": 0.43318524956703186, + "learning_rate": 2e-05, + "loss": 1.4565, + "step": 486 + }, + { + "epoch": 2.117391304347826, + "grad_norm": 0.43318524956703186, + "learning_rate": 2e-05, + "loss": 1.4458, + "step": 487 + }, + { + "epoch": 2.121739130434783, + "grad_norm": 0.46397483348846436, + "learning_rate": 2e-05, + "loss": 1.4003, + "step": 488 + }, + { + "epoch": 2.126086956521739, + "grad_norm": 0.46397483348846436, + "learning_rate": 2e-05, + "loss": 1.4473, + "step": 489 + }, + { + "epoch": 2.130434782608696, + "grad_norm": 0.46397483348846436, + "learning_rate": 2e-05, + "loss": 1.4112, + "step": 490 + }, + { + "epoch": 2.134782608695652, + "grad_norm": 0.46397483348846436, + "learning_rate": 2e-05, + "loss": 1.4627, + "step": 491 + }, + { + "epoch": 2.139130434782609, + "grad_norm": 0.43796172738075256, + "learning_rate": 2e-05, + "loss": 1.4559, + "step": 492 + }, + { + "epoch": 2.143478260869565, + "grad_norm": 0.43796172738075256, + "learning_rate": 2e-05, + "loss": 1.456, + "step": 493 + }, + { + "epoch": 2.1478260869565218, + "grad_norm": 0.43796172738075256, + "learning_rate": 2e-05, + "loss": 1.4903, + "step": 494 + }, + { + "epoch": 2.1521739130434785, + "grad_norm": 0.43796172738075256, + "learning_rate": 2e-05, + "loss": 1.4208, + "step": 495 + }, + { + "epoch": 2.1565217391304348, + "grad_norm": 0.4605541229248047, + "learning_rate": 2e-05, + "loss": 1.4443, + "step": 496 + }, + { + "epoch": 2.1608695652173915, + "grad_norm": 0.4605541229248047, + "learning_rate": 2e-05, + "loss": 1.4738, + "step": 497 + }, + { + "epoch": 2.1652173913043478, + "grad_norm": 0.4605541229248047, + "learning_rate": 2e-05, + "loss": 1.4085, + "step": 498 + }, + { + "epoch": 2.1695652173913045, + "grad_norm": 0.4605541229248047, + "learning_rate": 2e-05, + "loss": 1.5024, + "step": 499 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.4581138491630554, + "learning_rate": 2e-05, + "loss": 1.4846, + "step": 500 + }, + { + "epoch": 2.1782608695652175, + "grad_norm": 0.4581138491630554, + "learning_rate": 2e-05, + "loss": 1.473, + "step": 501 + }, + { + "epoch": 2.1826086956521737, + "grad_norm": 0.4581138491630554, + "learning_rate": 2e-05, + "loss": 1.4521, + "step": 502 + }, + { + "epoch": 2.1869565217391305, + "grad_norm": 0.4581138491630554, + "learning_rate": 2e-05, + "loss": 1.443, + "step": 503 + }, + { + "epoch": 2.1913043478260867, + "grad_norm": 0.429681658744812, + "learning_rate": 2e-05, + "loss": 1.4399, + "step": 504 + }, + { + "epoch": 2.1956521739130435, + "grad_norm": 0.429681658744812, + "learning_rate": 2e-05, + "loss": 1.469, + "step": 505 + }, + { + "epoch": 2.2, + "grad_norm": 0.429681658744812, + "learning_rate": 2e-05, + "loss": 1.4104, + "step": 506 + }, + { + "epoch": 2.2043478260869565, + "grad_norm": 0.429681658744812, + "learning_rate": 2e-05, + "loss": 1.4559, + "step": 507 + }, + { + "epoch": 2.208695652173913, + "grad_norm": 0.4941965937614441, + "learning_rate": 2e-05, + "loss": 1.4777, + "step": 508 + }, + { + "epoch": 2.2130434782608694, + "grad_norm": 0.4941965937614441, + "learning_rate": 2e-05, + "loss": 1.4692, + "step": 509 + }, + { + "epoch": 2.217391304347826, + "grad_norm": 0.4941965937614441, + "learning_rate": 2e-05, + "loss": 1.48, + "step": 510 + }, + { + "epoch": 2.2217391304347824, + "grad_norm": 0.4941965937614441, + "learning_rate": 2e-05, + "loss": 1.335, + "step": 511 + }, + { + "epoch": 2.226086956521739, + "grad_norm": 0.439325213432312, + "learning_rate": 2e-05, + "loss": 1.4571, + "step": 512 + }, + { + "epoch": 2.230434782608696, + "grad_norm": 0.439325213432312, + "learning_rate": 2e-05, + "loss": 1.3929, + "step": 513 + }, + { + "epoch": 2.234782608695652, + "grad_norm": 0.439325213432312, + "learning_rate": 2e-05, + "loss": 1.4184, + "step": 514 + }, + { + "epoch": 2.239130434782609, + "grad_norm": 0.439325213432312, + "learning_rate": 2e-05, + "loss": 1.4146, + "step": 515 + }, + { + "epoch": 2.243478260869565, + "grad_norm": 0.4552342891693115, + "learning_rate": 2e-05, + "loss": 1.5433, + "step": 516 + }, + { + "epoch": 2.247826086956522, + "grad_norm": 0.4552342891693115, + "learning_rate": 2e-05, + "loss": 1.4631, + "step": 517 + }, + { + "epoch": 2.252173913043478, + "grad_norm": 0.4552342891693115, + "learning_rate": 2e-05, + "loss": 1.4267, + "step": 518 + }, + { + "epoch": 2.256521739130435, + "grad_norm": 0.4552342891693115, + "learning_rate": 2e-05, + "loss": 1.4323, + "step": 519 + }, + { + "epoch": 2.260869565217391, + "grad_norm": 0.4403064548969269, + "learning_rate": 2e-05, + "loss": 1.4965, + "step": 520 + }, + { + "epoch": 2.265217391304348, + "grad_norm": 0.4403064548969269, + "learning_rate": 2e-05, + "loss": 1.4342, + "step": 521 + }, + { + "epoch": 2.269565217391304, + "grad_norm": 0.4403064548969269, + "learning_rate": 2e-05, + "loss": 1.3941, + "step": 522 + }, + { + "epoch": 2.273913043478261, + "grad_norm": 0.4403064548969269, + "learning_rate": 2e-05, + "loss": 1.4012, + "step": 523 + }, + { + "epoch": 2.2782608695652176, + "grad_norm": 0.4627401530742645, + "learning_rate": 2e-05, + "loss": 1.4316, + "step": 524 + }, + { + "epoch": 2.282608695652174, + "grad_norm": 0.4627401530742645, + "learning_rate": 2e-05, + "loss": 1.3794, + "step": 525 + }, + { + "epoch": 2.2869565217391306, + "grad_norm": 0.4627401530742645, + "learning_rate": 2e-05, + "loss": 1.4841, + "step": 526 + }, + { + "epoch": 2.291304347826087, + "grad_norm": 0.4627401530742645, + "learning_rate": 2e-05, + "loss": 1.403, + "step": 527 + }, + { + "epoch": 2.2956521739130435, + "grad_norm": 0.4262266755104065, + "learning_rate": 2e-05, + "loss": 1.4839, + "step": 528 + }, + { + "epoch": 2.3, + "grad_norm": 0.4262266755104065, + "learning_rate": 2e-05, + "loss": 1.4936, + "step": 529 + }, + { + "epoch": 2.3043478260869565, + "grad_norm": 0.4262266755104065, + "learning_rate": 2e-05, + "loss": 1.4565, + "step": 530 + }, + { + "epoch": 2.3086956521739133, + "grad_norm": 0.4262266755104065, + "learning_rate": 2e-05, + "loss": 1.3934, + "step": 531 + }, + { + "epoch": 2.3130434782608695, + "grad_norm": 0.41838669776916504, + "learning_rate": 2e-05, + "loss": 1.4768, + "step": 532 + }, + { + "epoch": 2.3173913043478263, + "grad_norm": 0.41838669776916504, + "learning_rate": 2e-05, + "loss": 1.4235, + "step": 533 + }, + { + "epoch": 2.3217391304347825, + "grad_norm": 0.41838669776916504, + "learning_rate": 2e-05, + "loss": 1.3986, + "step": 534 + }, + { + "epoch": 2.3260869565217392, + "grad_norm": 0.41838669776916504, + "learning_rate": 2e-05, + "loss": 1.4706, + "step": 535 + }, + { + "epoch": 2.3304347826086955, + "grad_norm": 0.44334736466407776, + "learning_rate": 2e-05, + "loss": 1.4141, + "step": 536 + }, + { + "epoch": 2.3347826086956522, + "grad_norm": 0.44334736466407776, + "learning_rate": 2e-05, + "loss": 1.4433, + "step": 537 + }, + { + "epoch": 2.3391304347826085, + "grad_norm": 0.44334736466407776, + "learning_rate": 2e-05, + "loss": 1.4327, + "step": 538 + }, + { + "epoch": 2.3434782608695652, + "grad_norm": 0.44334736466407776, + "learning_rate": 2e-05, + "loss": 1.4567, + "step": 539 + }, + { + "epoch": 2.3478260869565215, + "grad_norm": 0.45214343070983887, + "learning_rate": 2e-05, + "loss": 1.4291, + "step": 540 + }, + { + "epoch": 2.3521739130434782, + "grad_norm": 0.45214343070983887, + "learning_rate": 2e-05, + "loss": 1.4177, + "step": 541 + }, + { + "epoch": 2.356521739130435, + "grad_norm": 0.45214343070983887, + "learning_rate": 2e-05, + "loss": 1.4696, + "step": 542 + }, + { + "epoch": 2.360869565217391, + "grad_norm": 0.45214343070983887, + "learning_rate": 2e-05, + "loss": 1.4217, + "step": 543 + }, + { + "epoch": 2.365217391304348, + "grad_norm": 0.4370884299278259, + "learning_rate": 2e-05, + "loss": 1.447, + "step": 544 + }, + { + "epoch": 2.369565217391304, + "grad_norm": 0.4370884299278259, + "learning_rate": 2e-05, + "loss": 1.4681, + "step": 545 + }, + { + "epoch": 2.373913043478261, + "grad_norm": 0.4370884299278259, + "learning_rate": 2e-05, + "loss": 1.3833, + "step": 546 + }, + { + "epoch": 2.378260869565217, + "grad_norm": 0.4370884299278259, + "learning_rate": 2e-05, + "loss": 1.4899, + "step": 547 + }, + { + "epoch": 2.382608695652174, + "grad_norm": 0.45092737674713135, + "learning_rate": 2e-05, + "loss": 1.4567, + "step": 548 + }, + { + "epoch": 2.3869565217391306, + "grad_norm": 0.45092737674713135, + "learning_rate": 2e-05, + "loss": 1.4025, + "step": 549 + }, + { + "epoch": 2.391304347826087, + "grad_norm": 0.45092737674713135, + "learning_rate": 2e-05, + "loss": 1.4732, + "step": 550 + }, + { + "epoch": 2.3956521739130436, + "grad_norm": 0.45092737674713135, + "learning_rate": 2e-05, + "loss": 1.5067, + "step": 551 + }, + { + "epoch": 2.4, + "grad_norm": 0.4495592713356018, + "learning_rate": 2e-05, + "loss": 1.4597, + "step": 552 + }, + { + "epoch": 2.4043478260869566, + "grad_norm": 0.4495592713356018, + "learning_rate": 2e-05, + "loss": 1.3733, + "step": 553 + }, + { + "epoch": 2.408695652173913, + "grad_norm": 0.4495592713356018, + "learning_rate": 2e-05, + "loss": 1.4127, + "step": 554 + }, + { + "epoch": 2.4130434782608696, + "grad_norm": 0.4495592713356018, + "learning_rate": 2e-05, + "loss": 1.4427, + "step": 555 + }, + { + "epoch": 2.417391304347826, + "grad_norm": 0.44914448261260986, + "learning_rate": 2e-05, + "loss": 1.4285, + "step": 556 + }, + { + "epoch": 2.4217391304347826, + "grad_norm": 0.44914448261260986, + "learning_rate": 2e-05, + "loss": 1.3925, + "step": 557 + }, + { + "epoch": 2.426086956521739, + "grad_norm": 0.44914448261260986, + "learning_rate": 2e-05, + "loss": 1.4277, + "step": 558 + }, + { + "epoch": 2.4304347826086956, + "grad_norm": 0.44914448261260986, + "learning_rate": 2e-05, + "loss": 1.449, + "step": 559 + }, + { + "epoch": 2.4347826086956523, + "grad_norm": 0.4478745758533478, + "learning_rate": 2e-05, + "loss": 1.4359, + "step": 560 + }, + { + "epoch": 2.4391304347826086, + "grad_norm": 0.4478745758533478, + "learning_rate": 2e-05, + "loss": 1.4546, + "step": 561 + }, + { + "epoch": 2.4434782608695653, + "grad_norm": 0.4478745758533478, + "learning_rate": 2e-05, + "loss": 1.3997, + "step": 562 + }, + { + "epoch": 2.4478260869565216, + "grad_norm": 0.4478745758533478, + "learning_rate": 2e-05, + "loss": 1.4591, + "step": 563 + }, + { + "epoch": 2.4521739130434783, + "grad_norm": 0.43071508407592773, + "learning_rate": 2e-05, + "loss": 1.3532, + "step": 564 + }, + { + "epoch": 2.4565217391304346, + "grad_norm": 0.43071508407592773, + "learning_rate": 2e-05, + "loss": 1.343, + "step": 565 + }, + { + "epoch": 2.4608695652173913, + "grad_norm": 0.43071508407592773, + "learning_rate": 2e-05, + "loss": 1.4923, + "step": 566 + }, + { + "epoch": 2.465217391304348, + "grad_norm": 0.43071508407592773, + "learning_rate": 2e-05, + "loss": 1.4101, + "step": 567 + }, + { + "epoch": 2.4695652173913043, + "grad_norm": 0.4356819689273834, + "learning_rate": 2e-05, + "loss": 1.4828, + "step": 568 + }, + { + "epoch": 2.473913043478261, + "grad_norm": 0.4356819689273834, + "learning_rate": 2e-05, + "loss": 1.4608, + "step": 569 + }, + { + "epoch": 2.4782608695652173, + "grad_norm": 0.4356819689273834, + "learning_rate": 2e-05, + "loss": 1.3872, + "step": 570 + }, + { + "epoch": 2.482608695652174, + "grad_norm": 0.4356819689273834, + "learning_rate": 2e-05, + "loss": 1.4487, + "step": 571 + }, + { + "epoch": 2.4869565217391303, + "grad_norm": 0.44995924830436707, + "learning_rate": 2e-05, + "loss": 1.4694, + "step": 572 + }, + { + "epoch": 2.491304347826087, + "grad_norm": 0.44995924830436707, + "learning_rate": 2e-05, + "loss": 1.4029, + "step": 573 + }, + { + "epoch": 2.4956521739130437, + "grad_norm": 0.44995924830436707, + "learning_rate": 2e-05, + "loss": 1.4111, + "step": 574 + }, + { + "epoch": 2.5, + "grad_norm": 0.44995924830436707, + "learning_rate": 2e-05, + "loss": 1.4275, + "step": 575 + }, + { + "epoch": 2.5043478260869563, + "grad_norm": 0.4352399408817291, + "learning_rate": 2e-05, + "loss": 1.443, + "step": 576 + }, + { + "epoch": 2.508695652173913, + "grad_norm": 0.4352399408817291, + "learning_rate": 2e-05, + "loss": 1.4107, + "step": 577 + }, + { + "epoch": 2.5130434782608697, + "grad_norm": 0.4352399408817291, + "learning_rate": 2e-05, + "loss": 1.4341, + "step": 578 + }, + { + "epoch": 2.517391304347826, + "grad_norm": 0.4352399408817291, + "learning_rate": 2e-05, + "loss": 1.3694, + "step": 579 + }, + { + "epoch": 2.5217391304347827, + "grad_norm": 0.45637837052345276, + "learning_rate": 2e-05, + "loss": 1.428, + "step": 580 + }, + { + "epoch": 2.526086956521739, + "grad_norm": 0.45637837052345276, + "learning_rate": 2e-05, + "loss": 1.409, + "step": 581 + }, + { + "epoch": 2.5304347826086957, + "grad_norm": 0.45637837052345276, + "learning_rate": 2e-05, + "loss": 1.3807, + "step": 582 + }, + { + "epoch": 2.534782608695652, + "grad_norm": 0.45637837052345276, + "learning_rate": 2e-05, + "loss": 1.3878, + "step": 583 + }, + { + "epoch": 2.5391304347826087, + "grad_norm": 0.4560960531234741, + "learning_rate": 2e-05, + "loss": 1.327, + "step": 584 + }, + { + "epoch": 2.5434782608695654, + "grad_norm": 0.4560960531234741, + "learning_rate": 2e-05, + "loss": 1.4564, + "step": 585 + }, + { + "epoch": 2.5478260869565217, + "grad_norm": 0.4560960531234741, + "learning_rate": 2e-05, + "loss": 1.3758, + "step": 586 + }, + { + "epoch": 2.5521739130434784, + "grad_norm": 0.4560960531234741, + "learning_rate": 2e-05, + "loss": 1.4642, + "step": 587 + }, + { + "epoch": 2.5565217391304347, + "grad_norm": 0.458340048789978, + "learning_rate": 2e-05, + "loss": 1.4277, + "step": 588 + }, + { + "epoch": 2.5608695652173914, + "grad_norm": 0.458340048789978, + "learning_rate": 2e-05, + "loss": 1.423, + "step": 589 + }, + { + "epoch": 2.5652173913043477, + "grad_norm": 0.458340048789978, + "learning_rate": 2e-05, + "loss": 1.4059, + "step": 590 + }, + { + "epoch": 2.5695652173913044, + "grad_norm": 0.458340048789978, + "learning_rate": 2e-05, + "loss": 1.4327, + "step": 591 + }, + { + "epoch": 2.573913043478261, + "grad_norm": 0.46771275997161865, + "learning_rate": 2e-05, + "loss": 1.4411, + "step": 592 + }, + { + "epoch": 2.5782608695652174, + "grad_norm": 0.46771275997161865, + "learning_rate": 2e-05, + "loss": 1.3755, + "step": 593 + }, + { + "epoch": 2.5826086956521737, + "grad_norm": 0.46771275997161865, + "learning_rate": 2e-05, + "loss": 1.474, + "step": 594 + }, + { + "epoch": 2.5869565217391304, + "grad_norm": 0.46771275997161865, + "learning_rate": 2e-05, + "loss": 1.4165, + "step": 595 + }, + { + "epoch": 2.591304347826087, + "grad_norm": 0.44320788979530334, + "learning_rate": 2e-05, + "loss": 1.402, + "step": 596 + }, + { + "epoch": 2.5956521739130434, + "grad_norm": 0.44320788979530334, + "learning_rate": 2e-05, + "loss": 1.3985, + "step": 597 + }, + { + "epoch": 2.6, + "grad_norm": 0.44320788979530334, + "learning_rate": 2e-05, + "loss": 1.4689, + "step": 598 + }, + { + "epoch": 2.6043478260869564, + "grad_norm": 0.44320788979530334, + "learning_rate": 2e-05, + "loss": 1.4768, + "step": 599 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.4548298418521881, + "learning_rate": 2e-05, + "loss": 1.4092, + "step": 600 + }, + { + "epoch": 2.6130434782608694, + "grad_norm": 0.4548298418521881, + "learning_rate": 2e-05, + "loss": 1.3731, + "step": 601 + }, + { + "epoch": 2.617391304347826, + "grad_norm": 0.4548298418521881, + "learning_rate": 2e-05, + "loss": 1.3681, + "step": 602 + }, + { + "epoch": 2.621739130434783, + "grad_norm": 0.4548298418521881, + "learning_rate": 2e-05, + "loss": 1.3759, + "step": 603 + }, + { + "epoch": 2.626086956521739, + "grad_norm": 0.4255348742008209, + "learning_rate": 2e-05, + "loss": 1.3581, + "step": 604 + }, + { + "epoch": 2.630434782608696, + "grad_norm": 0.4255348742008209, + "learning_rate": 2e-05, + "loss": 1.389, + "step": 605 + }, + { + "epoch": 2.634782608695652, + "grad_norm": 0.4255348742008209, + "learning_rate": 2e-05, + "loss": 1.4142, + "step": 606 + }, + { + "epoch": 2.639130434782609, + "grad_norm": 0.4255348742008209, + "learning_rate": 2e-05, + "loss": 1.4308, + "step": 607 + }, + { + "epoch": 2.643478260869565, + "grad_norm": 0.44347140192985535, + "learning_rate": 2e-05, + "loss": 1.3694, + "step": 608 + }, + { + "epoch": 2.6478260869565218, + "grad_norm": 0.44347140192985535, + "learning_rate": 2e-05, + "loss": 1.3745, + "step": 609 + }, + { + "epoch": 2.6521739130434785, + "grad_norm": 0.44347140192985535, + "learning_rate": 2e-05, + "loss": 1.3965, + "step": 610 + }, + { + "epoch": 2.6565217391304348, + "grad_norm": 0.44347140192985535, + "learning_rate": 2e-05, + "loss": 1.3552, + "step": 611 + }, + { + "epoch": 2.660869565217391, + "grad_norm": 0.43682318925857544, + "learning_rate": 2e-05, + "loss": 1.3977, + "step": 612 + }, + { + "epoch": 2.6652173913043478, + "grad_norm": 0.43682318925857544, + "learning_rate": 2e-05, + "loss": 1.4557, + "step": 613 + }, + { + "epoch": 2.6695652173913045, + "grad_norm": 0.43682318925857544, + "learning_rate": 2e-05, + "loss": 1.4087, + "step": 614 + }, + { + "epoch": 2.6739130434782608, + "grad_norm": 0.43682318925857544, + "learning_rate": 2e-05, + "loss": 1.3819, + "step": 615 + }, + { + "epoch": 2.6782608695652175, + "grad_norm": 0.43287912011146545, + "learning_rate": 2e-05, + "loss": 1.4954, + "step": 616 + }, + { + "epoch": 2.6826086956521737, + "grad_norm": 0.43287912011146545, + "learning_rate": 2e-05, + "loss": 1.4255, + "step": 617 + }, + { + "epoch": 2.6869565217391305, + "grad_norm": 0.43287912011146545, + "learning_rate": 2e-05, + "loss": 1.4469, + "step": 618 + }, + { + "epoch": 2.6913043478260867, + "grad_norm": 0.43287912011146545, + "learning_rate": 2e-05, + "loss": 1.3364, + "step": 619 + }, + { + "epoch": 2.6956521739130435, + "grad_norm": 0.437889039516449, + "learning_rate": 2e-05, + "loss": 1.4392, + "step": 620 + }, + { + "epoch": 2.7, + "grad_norm": 0.437889039516449, + "learning_rate": 2e-05, + "loss": 1.4408, + "step": 621 + }, + { + "epoch": 2.7043478260869565, + "grad_norm": 0.437889039516449, + "learning_rate": 2e-05, + "loss": 1.4241, + "step": 622 + }, + { + "epoch": 2.708695652173913, + "grad_norm": 0.437889039516449, + "learning_rate": 2e-05, + "loss": 1.4606, + "step": 623 + }, + { + "epoch": 2.7130434782608694, + "grad_norm": 0.4409219026565552, + "learning_rate": 2e-05, + "loss": 1.4289, + "step": 624 + }, + { + "epoch": 2.717391304347826, + "grad_norm": 0.4409219026565552, + "learning_rate": 2e-05, + "loss": 1.3626, + "step": 625 + }, + { + "epoch": 2.7217391304347824, + "grad_norm": 0.4409219026565552, + "learning_rate": 2e-05, + "loss": 1.3953, + "step": 626 + }, + { + "epoch": 2.726086956521739, + "grad_norm": 0.4409219026565552, + "learning_rate": 2e-05, + "loss": 1.4515, + "step": 627 + }, + { + "epoch": 2.730434782608696, + "grad_norm": 0.4441738724708557, + "learning_rate": 2e-05, + "loss": 1.3941, + "step": 628 + }, + { + "epoch": 2.734782608695652, + "grad_norm": 0.4441738724708557, + "learning_rate": 2e-05, + "loss": 1.4925, + "step": 629 + }, + { + "epoch": 2.7391304347826084, + "grad_norm": 0.4441738724708557, + "learning_rate": 2e-05, + "loss": 1.4274, + "step": 630 + }, + { + "epoch": 2.743478260869565, + "grad_norm": 0.4441738724708557, + "learning_rate": 2e-05, + "loss": 1.4483, + "step": 631 + }, + { + "epoch": 2.747826086956522, + "grad_norm": 0.44980600476264954, + "learning_rate": 2e-05, + "loss": 1.4143, + "step": 632 + }, + { + "epoch": 2.752173913043478, + "grad_norm": 0.44980600476264954, + "learning_rate": 2e-05, + "loss": 1.4103, + "step": 633 + }, + { + "epoch": 2.756521739130435, + "grad_norm": 0.44980600476264954, + "learning_rate": 2e-05, + "loss": 1.4343, + "step": 634 + }, + { + "epoch": 2.7608695652173916, + "grad_norm": 0.44980600476264954, + "learning_rate": 2e-05, + "loss": 1.4193, + "step": 635 + }, + { + "epoch": 2.765217391304348, + "grad_norm": 0.4380491077899933, + "learning_rate": 2e-05, + "loss": 1.3609, + "step": 636 + }, + { + "epoch": 2.769565217391304, + "grad_norm": 0.4380491077899933, + "learning_rate": 2e-05, + "loss": 1.4065, + "step": 637 + }, + { + "epoch": 2.773913043478261, + "grad_norm": 0.4380491077899933, + "learning_rate": 2e-05, + "loss": 1.3425, + "step": 638 + }, + { + "epoch": 2.7782608695652176, + "grad_norm": 0.4380491077899933, + "learning_rate": 2e-05, + "loss": 1.3964, + "step": 639 + }, + { + "epoch": 2.782608695652174, + "grad_norm": 0.42945510149002075, + "learning_rate": 2e-05, + "loss": 1.3966, + "step": 640 + }, + { + "epoch": 2.7869565217391306, + "grad_norm": 0.42945510149002075, + "learning_rate": 2e-05, + "loss": 1.4238, + "step": 641 + }, + { + "epoch": 2.791304347826087, + "grad_norm": 0.42945510149002075, + "learning_rate": 2e-05, + "loss": 1.3981, + "step": 642 + }, + { + "epoch": 2.7956521739130435, + "grad_norm": 0.42945510149002075, + "learning_rate": 2e-05, + "loss": 1.4368, + "step": 643 + }, + { + "epoch": 2.8, + "grad_norm": 0.4350828528404236, + "learning_rate": 2e-05, + "loss": 1.3879, + "step": 644 + }, + { + "epoch": 2.8043478260869565, + "grad_norm": 0.4350828528404236, + "learning_rate": 2e-05, + "loss": 1.3954, + "step": 645 + }, + { + "epoch": 2.8086956521739133, + "grad_norm": 0.4350828528404236, + "learning_rate": 2e-05, + "loss": 1.4562, + "step": 646 + }, + { + "epoch": 2.8130434782608695, + "grad_norm": 0.4350828528404236, + "learning_rate": 2e-05, + "loss": 1.3741, + "step": 647 + }, + { + "epoch": 2.8173913043478263, + "grad_norm": 0.43145105242729187, + "learning_rate": 2e-05, + "loss": 1.3833, + "step": 648 + }, + { + "epoch": 2.8217391304347825, + "grad_norm": 0.43145105242729187, + "learning_rate": 2e-05, + "loss": 1.4499, + "step": 649 + }, + { + "epoch": 2.8260869565217392, + "grad_norm": 0.43145105242729187, + "learning_rate": 2e-05, + "loss": 1.4488, + "step": 650 + }, + { + "epoch": 2.8304347826086955, + "grad_norm": 0.43145105242729187, + "learning_rate": 2e-05, + "loss": 1.3758, + "step": 651 + }, + { + "epoch": 2.8347826086956522, + "grad_norm": 0.44008344411849976, + "learning_rate": 2e-05, + "loss": 1.4219, + "step": 652 + }, + { + "epoch": 2.839130434782609, + "grad_norm": 0.44008344411849976, + "learning_rate": 2e-05, + "loss": 1.4525, + "step": 653 + }, + { + "epoch": 2.8434782608695652, + "grad_norm": 0.44008344411849976, + "learning_rate": 2e-05, + "loss": 1.4668, + "step": 654 + }, + { + "epoch": 2.8478260869565215, + "grad_norm": 0.44008344411849976, + "learning_rate": 2e-05, + "loss": 1.4025, + "step": 655 + }, + { + "epoch": 2.8521739130434782, + "grad_norm": 0.4309910535812378, + "learning_rate": 2e-05, + "loss": 1.4007, + "step": 656 + }, + { + "epoch": 2.856521739130435, + "grad_norm": 0.4309910535812378, + "learning_rate": 2e-05, + "loss": 1.3499, + "step": 657 + }, + { + "epoch": 2.860869565217391, + "grad_norm": 0.4309910535812378, + "learning_rate": 2e-05, + "loss": 1.4505, + "step": 658 + }, + { + "epoch": 2.865217391304348, + "grad_norm": 0.4309910535812378, + "learning_rate": 2e-05, + "loss": 1.4449, + "step": 659 + }, + { + "epoch": 2.869565217391304, + "grad_norm": 0.44285786151885986, + "learning_rate": 2e-05, + "loss": 1.3851, + "step": 660 + }, + { + "epoch": 2.873913043478261, + "grad_norm": 0.44285786151885986, + "learning_rate": 2e-05, + "loss": 1.4374, + "step": 661 + }, + { + "epoch": 2.878260869565217, + "grad_norm": 0.44285786151885986, + "learning_rate": 2e-05, + "loss": 1.4486, + "step": 662 + }, + { + "epoch": 2.882608695652174, + "grad_norm": 0.44285786151885986, + "learning_rate": 2e-05, + "loss": 1.4333, + "step": 663 + }, + { + "epoch": 2.8869565217391306, + "grad_norm": 0.43612438440322876, + "learning_rate": 2e-05, + "loss": 1.418, + "step": 664 + }, + { + "epoch": 2.891304347826087, + "grad_norm": 0.43612438440322876, + "learning_rate": 2e-05, + "loss": 1.5115, + "step": 665 + }, + { + "epoch": 2.8956521739130436, + "grad_norm": 0.43612438440322876, + "learning_rate": 2e-05, + "loss": 1.4567, + "step": 666 + }, + { + "epoch": 2.9, + "grad_norm": 0.43612438440322876, + "learning_rate": 2e-05, + "loss": 1.4776, + "step": 667 + }, + { + "epoch": 2.9043478260869566, + "grad_norm": 0.4316587448120117, + "learning_rate": 2e-05, + "loss": 1.3331, + "step": 668 + }, + { + "epoch": 2.908695652173913, + "grad_norm": 0.4316587448120117, + "learning_rate": 2e-05, + "loss": 1.4252, + "step": 669 + }, + { + "epoch": 2.9130434782608696, + "grad_norm": 0.4316587448120117, + "learning_rate": 2e-05, + "loss": 1.4541, + "step": 670 + }, + { + "epoch": 2.9173913043478263, + "grad_norm": 0.4316587448120117, + "learning_rate": 2e-05, + "loss": 1.3706, + "step": 671 + }, + { + "epoch": 2.9217391304347826, + "grad_norm": 0.45414403080940247, + "learning_rate": 2e-05, + "loss": 1.4172, + "step": 672 + }, + { + "epoch": 2.926086956521739, + "grad_norm": 0.45414403080940247, + "learning_rate": 2e-05, + "loss": 1.4269, + "step": 673 + }, + { + "epoch": 2.9304347826086956, + "grad_norm": 0.45414403080940247, + "learning_rate": 2e-05, + "loss": 1.4144, + "step": 674 + }, + { + "epoch": 2.9347826086956523, + "grad_norm": 0.45414403080940247, + "learning_rate": 2e-05, + "loss": 1.3762, + "step": 675 + }, + { + "epoch": 2.9391304347826086, + "grad_norm": 0.4472872018814087, + "learning_rate": 2e-05, + "loss": 1.3774, + "step": 676 + }, + { + "epoch": 2.9434782608695653, + "grad_norm": 0.4472872018814087, + "learning_rate": 2e-05, + "loss": 1.4429, + "step": 677 + }, + { + "epoch": 2.9478260869565216, + "grad_norm": 0.4472872018814087, + "learning_rate": 2e-05, + "loss": 1.3774, + "step": 678 + }, + { + "epoch": 2.9521739130434783, + "grad_norm": 0.4472872018814087, + "learning_rate": 2e-05, + "loss": 1.4752, + "step": 679 + }, + { + "epoch": 2.9565217391304346, + "grad_norm": 0.4483736753463745, + "learning_rate": 2e-05, + "loss": 1.3637, + "step": 680 + }, + { + "epoch": 2.9608695652173913, + "grad_norm": 0.4483736753463745, + "learning_rate": 2e-05, + "loss": 1.4605, + "step": 681 + }, + { + "epoch": 2.965217391304348, + "grad_norm": 0.4483736753463745, + "learning_rate": 2e-05, + "loss": 1.369, + "step": 682 + }, + { + "epoch": 2.9695652173913043, + "grad_norm": 0.4483736753463745, + "learning_rate": 2e-05, + "loss": 1.38, + "step": 683 + }, + { + "epoch": 2.973913043478261, + "grad_norm": 0.4401063621044159, + "learning_rate": 2e-05, + "loss": 1.3735, + "step": 684 + }, + { + "epoch": 2.9782608695652173, + "grad_norm": 0.4401063621044159, + "learning_rate": 2e-05, + "loss": 1.4165, + "step": 685 + }, + { + "epoch": 2.982608695652174, + "grad_norm": 0.4401063621044159, + "learning_rate": 2e-05, + "loss": 1.4523, + "step": 686 + }, + { + "epoch": 2.9869565217391303, + "grad_norm": 0.4401063621044159, + "learning_rate": 2e-05, + "loss": 1.3555, + "step": 687 + }, + { + "epoch": 2.991304347826087, + "grad_norm": 0.43969812989234924, + "learning_rate": 2e-05, + "loss": 1.3699, + "step": 688 + }, + { + "epoch": 2.9956521739130437, + "grad_norm": 0.43969812989234924, + "learning_rate": 2e-05, + "loss": 1.3609, + "step": 689 + }, + { + "epoch": 3.0, + "grad_norm": 0.43969812989234924, + "learning_rate": 2e-05, + "loss": 1.3846, + "step": 690 + }, + { + "epoch": 3.0, + "step": 690, + "total_flos": 3.864388766479155e+17, + "train_loss": 1.5982977670171987, + "train_runtime": 425.6486, + "train_samples_per_second": 362.842, + "train_steps_per_second": 1.621 + } + ], + "logging_steps": 1.0, + "max_steps": 690, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.864388766479155e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}