|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 10, |
|
"global_step": 268, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007462686567164179, |
|
"grad_norm": 3.073047161102295, |
|
"learning_rate": 0.0, |
|
"loss": 0.5576, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.014925373134328358, |
|
"grad_norm": 4.13862419128418, |
|
"learning_rate": 2.9411764705882356e-07, |
|
"loss": 0.6964, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.022388059701492536, |
|
"grad_norm": 4.69643497467041, |
|
"learning_rate": 5.882352941176471e-07, |
|
"loss": 0.8567, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.029850746268656716, |
|
"grad_norm": 4.596504211425781, |
|
"learning_rate": 8.823529411764707e-07, |
|
"loss": 0.8257, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.03731343283582089, |
|
"grad_norm": 2.9434754848480225, |
|
"learning_rate": 1.1764705882352942e-06, |
|
"loss": 0.4799, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04477611940298507, |
|
"grad_norm": 6.064290523529053, |
|
"learning_rate": 1.4705882352941177e-06, |
|
"loss": 1.0834, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.05223880597014925, |
|
"grad_norm": 3.5080268383026123, |
|
"learning_rate": 1.7647058823529414e-06, |
|
"loss": 0.6863, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.05970149253731343, |
|
"grad_norm": 3.5126824378967285, |
|
"learning_rate": 2.058823529411765e-06, |
|
"loss": 0.6823, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.06716417910447761, |
|
"grad_norm": 2.0096919536590576, |
|
"learning_rate": 2.3529411764705885e-06, |
|
"loss": 0.5064, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.07462686567164178, |
|
"grad_norm": 2.6150012016296387, |
|
"learning_rate": 2.647058823529412e-06, |
|
"loss": 0.9675, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07462686567164178, |
|
"eval_loss": 0.3130320906639099, |
|
"eval_runtime": 15.9567, |
|
"eval_samples_per_second": 10.717, |
|
"eval_steps_per_second": 1.379, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08208955223880597, |
|
"grad_norm": 2.244316577911377, |
|
"learning_rate": 2.9411764705882355e-06, |
|
"loss": 0.6905, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.08955223880597014, |
|
"grad_norm": 1.4434276819229126, |
|
"learning_rate": 3.2352941176470594e-06, |
|
"loss": 0.6089, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.09701492537313433, |
|
"grad_norm": 2.2414164543151855, |
|
"learning_rate": 3.529411764705883e-06, |
|
"loss": 0.4699, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.1044776119402985, |
|
"grad_norm": 2.0805301666259766, |
|
"learning_rate": 3.8235294117647055e-06, |
|
"loss": 0.6048, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.11194029850746269, |
|
"grad_norm": 2.441056251525879, |
|
"learning_rate": 4.11764705882353e-06, |
|
"loss": 0.6305, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.11940298507462686, |
|
"grad_norm": 2.762195110321045, |
|
"learning_rate": 4.411764705882353e-06, |
|
"loss": 0.7118, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.12686567164179105, |
|
"grad_norm": 2.0519938468933105, |
|
"learning_rate": 4.705882352941177e-06, |
|
"loss": 0.6276, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.13432835820895522, |
|
"grad_norm": 2.0802817344665527, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6074, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1417910447761194, |
|
"grad_norm": 2.1733672618865967, |
|
"learning_rate": 5.294117647058824e-06, |
|
"loss": 0.6163, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.14925373134328357, |
|
"grad_norm": 2.3623249530792236, |
|
"learning_rate": 5.588235294117647e-06, |
|
"loss": 0.8673, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.14925373134328357, |
|
"eval_loss": 0.29671722650527954, |
|
"eval_runtime": 16.4113, |
|
"eval_samples_per_second": 10.42, |
|
"eval_steps_per_second": 1.341, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15671641791044777, |
|
"grad_norm": 2.0744874477386475, |
|
"learning_rate": 5.882352941176471e-06, |
|
"loss": 0.7421, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.16417910447761194, |
|
"grad_norm": 1.4317227602005005, |
|
"learning_rate": 6.176470588235295e-06, |
|
"loss": 0.6758, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.17164179104477612, |
|
"grad_norm": 1.5149955749511719, |
|
"learning_rate": 6.470588235294119e-06, |
|
"loss": 0.7199, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.1791044776119403, |
|
"grad_norm": 1.4495139122009277, |
|
"learning_rate": 6.764705882352942e-06, |
|
"loss": 0.6544, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.1865671641791045, |
|
"grad_norm": 1.240043044090271, |
|
"learning_rate": 7.058823529411766e-06, |
|
"loss": 0.3998, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.19402985074626866, |
|
"grad_norm": 1.3463258743286133, |
|
"learning_rate": 7.352941176470589e-06, |
|
"loss": 0.6238, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.20149253731343283, |
|
"grad_norm": 1.5899564027786255, |
|
"learning_rate": 7.647058823529411e-06, |
|
"loss": 0.7535, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.208955223880597, |
|
"grad_norm": 0.9583171606063843, |
|
"learning_rate": 7.941176470588236e-06, |
|
"loss": 0.4474, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.21641791044776118, |
|
"grad_norm": 1.2705698013305664, |
|
"learning_rate": 8.23529411764706e-06, |
|
"loss": 0.6322, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.22388059701492538, |
|
"grad_norm": 1.2777875661849976, |
|
"learning_rate": 8.529411764705883e-06, |
|
"loss": 0.6081, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.22388059701492538, |
|
"eval_loss": 0.2805997133255005, |
|
"eval_runtime": 16.0973, |
|
"eval_samples_per_second": 10.623, |
|
"eval_steps_per_second": 1.367, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23134328358208955, |
|
"grad_norm": 1.2508858442306519, |
|
"learning_rate": 8.823529411764707e-06, |
|
"loss": 0.4562, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.23880597014925373, |
|
"grad_norm": 1.1761791706085205, |
|
"learning_rate": 9.11764705882353e-06, |
|
"loss": 0.5464, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.2462686567164179, |
|
"grad_norm": 1.298427939414978, |
|
"learning_rate": 9.411764705882354e-06, |
|
"loss": 0.4709, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.2537313432835821, |
|
"grad_norm": 0.7913378477096558, |
|
"learning_rate": 9.705882352941177e-06, |
|
"loss": 0.2938, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.26119402985074625, |
|
"grad_norm": 1.0940293073654175, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4555, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.26865671641791045, |
|
"grad_norm": 1.1584433317184448, |
|
"learning_rate": 9.999939000729718e-06, |
|
"loss": 0.5991, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.27611940298507465, |
|
"grad_norm": 1.0347867012023926, |
|
"learning_rate": 9.99975600440723e-06, |
|
"loss": 0.6246, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2835820895522388, |
|
"grad_norm": 1.4303066730499268, |
|
"learning_rate": 9.999451015497595e-06, |
|
"loss": 0.652, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.291044776119403, |
|
"grad_norm": 0.8853992819786072, |
|
"learning_rate": 9.999024041442455e-06, |
|
"loss": 0.5446, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.29850746268656714, |
|
"grad_norm": 0.7761288285255432, |
|
"learning_rate": 9.99847509265985e-06, |
|
"loss": 0.2384, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.29850746268656714, |
|
"eval_loss": 0.2706838846206665, |
|
"eval_runtime": 15.7024, |
|
"eval_samples_per_second": 10.89, |
|
"eval_steps_per_second": 1.401, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.30597014925373134, |
|
"grad_norm": 1.230162262916565, |
|
"learning_rate": 9.997804182543973e-06, |
|
"loss": 0.7637, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.31343283582089554, |
|
"grad_norm": 1.2055861949920654, |
|
"learning_rate": 9.997011327464832e-06, |
|
"loss": 0.6891, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.3208955223880597, |
|
"grad_norm": 1.5400534868240356, |
|
"learning_rate": 9.99609654676786e-06, |
|
"loss": 0.7425, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.3283582089552239, |
|
"grad_norm": 1.1577208042144775, |
|
"learning_rate": 9.99505986277344e-06, |
|
"loss": 0.5395, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.3358208955223881, |
|
"grad_norm": 1.050628423690796, |
|
"learning_rate": 9.993901300776358e-06, |
|
"loss": 0.4203, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.34328358208955223, |
|
"grad_norm": 1.0624663829803467, |
|
"learning_rate": 9.99262088904519e-06, |
|
"loss": 0.6665, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.35074626865671643, |
|
"grad_norm": 1.544499158859253, |
|
"learning_rate": 9.991218658821609e-06, |
|
"loss": 0.6507, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3582089552238806, |
|
"grad_norm": 1.3652788400650024, |
|
"learning_rate": 9.989694644319618e-06, |
|
"loss": 0.8455, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3656716417910448, |
|
"grad_norm": 1.29508376121521, |
|
"learning_rate": 9.988048882724732e-06, |
|
"loss": 0.5849, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.373134328358209, |
|
"grad_norm": 1.6214385032653809, |
|
"learning_rate": 9.98628141419305e-06, |
|
"loss": 0.5859, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.373134328358209, |
|
"eval_loss": 0.2677931785583496, |
|
"eval_runtime": 16.2049, |
|
"eval_samples_per_second": 10.552, |
|
"eval_steps_per_second": 1.358, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3805970149253731, |
|
"grad_norm": 0.8862821459770203, |
|
"learning_rate": 9.984392281850293e-06, |
|
"loss": 0.4504, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.3880597014925373, |
|
"grad_norm": 1.047014832496643, |
|
"learning_rate": 9.982381531790733e-06, |
|
"loss": 0.4514, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.39552238805970147, |
|
"grad_norm": 1.115302324295044, |
|
"learning_rate": 9.980249213076085e-06, |
|
"loss": 0.6673, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.40298507462686567, |
|
"grad_norm": 1.1105623245239258, |
|
"learning_rate": 9.977995377734307e-06, |
|
"loss": 0.4681, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.41044776119402987, |
|
"grad_norm": 0.9929081201553345, |
|
"learning_rate": 9.975620080758321e-06, |
|
"loss": 0.5058, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.417910447761194, |
|
"grad_norm": 1.0726927518844604, |
|
"learning_rate": 9.97312338010468e-06, |
|
"loss": 0.6072, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.4253731343283582, |
|
"grad_norm": 0.9720289707183838, |
|
"learning_rate": 9.970505336692153e-06, |
|
"loss": 0.5592, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.43283582089552236, |
|
"grad_norm": 1.327181100845337, |
|
"learning_rate": 9.967766014400233e-06, |
|
"loss": 0.5153, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.44029850746268656, |
|
"grad_norm": 1.0509099960327148, |
|
"learning_rate": 9.964905480067585e-06, |
|
"loss": 0.4982, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.44776119402985076, |
|
"grad_norm": 0.8376150727272034, |
|
"learning_rate": 9.961923803490412e-06, |
|
"loss": 0.488, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.44776119402985076, |
|
"eval_loss": 0.2636841833591461, |
|
"eval_runtime": 15.6643, |
|
"eval_samples_per_second": 10.917, |
|
"eval_steps_per_second": 1.404, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4552238805970149, |
|
"grad_norm": 1.2392864227294922, |
|
"learning_rate": 9.958821057420752e-06, |
|
"loss": 0.4786, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.4626865671641791, |
|
"grad_norm": 1.0299177169799805, |
|
"learning_rate": 9.955597317564705e-06, |
|
"loss": 0.6627, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.4701492537313433, |
|
"grad_norm": 1.244341492652893, |
|
"learning_rate": 9.95225266258058e-06, |
|
"loss": 0.8958, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.47761194029850745, |
|
"grad_norm": 0.9261450171470642, |
|
"learning_rate": 9.948787174076982e-06, |
|
"loss": 0.4079, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.48507462686567165, |
|
"grad_norm": 0.7639384269714355, |
|
"learning_rate": 9.945200936610821e-06, |
|
"loss": 0.3676, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4925373134328358, |
|
"grad_norm": 1.1056618690490723, |
|
"learning_rate": 9.941494037685244e-06, |
|
"loss": 0.6575, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.5201542377471924, |
|
"learning_rate": 9.9376665677475e-06, |
|
"loss": 0.5449, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.5074626865671642, |
|
"grad_norm": 0.904944121837616, |
|
"learning_rate": 9.933718620186745e-06, |
|
"loss": 0.4245, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.5149253731343284, |
|
"grad_norm": 0.9076265096664429, |
|
"learning_rate": 9.92965029133174e-06, |
|
"loss": 0.5433, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.5223880597014925, |
|
"grad_norm": 0.8957193493843079, |
|
"learning_rate": 9.925461680448528e-06, |
|
"loss": 0.3874, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5223880597014925, |
|
"eval_loss": 0.2620687484741211, |
|
"eval_runtime": 16.11, |
|
"eval_samples_per_second": 10.615, |
|
"eval_steps_per_second": 1.366, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5298507462686567, |
|
"grad_norm": 1.1844974756240845, |
|
"learning_rate": 9.921152889737985e-06, |
|
"loss": 0.7415, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.5373134328358209, |
|
"grad_norm": 1.1240217685699463, |
|
"learning_rate": 9.91672402433335e-06, |
|
"loss": 0.6783, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.5447761194029851, |
|
"grad_norm": 1.0070396661758423, |
|
"learning_rate": 9.912175192297648e-06, |
|
"loss": 0.3991, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.5522388059701493, |
|
"grad_norm": 0.8779905438423157, |
|
"learning_rate": 9.907506504621052e-06, |
|
"loss": 0.3972, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.5597014925373134, |
|
"grad_norm": 1.2013908624649048, |
|
"learning_rate": 9.902718075218176e-06, |
|
"loss": 0.59, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5671641791044776, |
|
"grad_norm": 1.4874799251556396, |
|
"learning_rate": 9.897810020925301e-06, |
|
"loss": 0.6601, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.5746268656716418, |
|
"grad_norm": 1.2102411985397339, |
|
"learning_rate": 9.892782461497521e-06, |
|
"loss": 0.6659, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.582089552238806, |
|
"grad_norm": 1.09522545337677, |
|
"learning_rate": 9.887635519605816e-06, |
|
"loss": 0.6796, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.5895522388059702, |
|
"grad_norm": 1.020628809928894, |
|
"learning_rate": 9.882369320834068e-06, |
|
"loss": 0.6041, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.5970149253731343, |
|
"grad_norm": 1.0358238220214844, |
|
"learning_rate": 9.87698399367599e-06, |
|
"loss": 0.7233, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5970149253731343, |
|
"eval_loss": 0.2595260739326477, |
|
"eval_runtime": 16.6446, |
|
"eval_samples_per_second": 10.274, |
|
"eval_steps_per_second": 1.322, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6044776119402985, |
|
"grad_norm": 1.099974513053894, |
|
"learning_rate": 9.871479669531988e-06, |
|
"loss": 0.6985, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.6119402985074627, |
|
"grad_norm": 1.3373053073883057, |
|
"learning_rate": 9.865856482705973e-06, |
|
"loss": 0.6532, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.6194029850746269, |
|
"grad_norm": 0.9295642971992493, |
|
"learning_rate": 9.860114570402055e-06, |
|
"loss": 0.6586, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.6268656716417911, |
|
"grad_norm": 0.8639213442802429, |
|
"learning_rate": 9.854254072721222e-06, |
|
"loss": 0.3806, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.6343283582089553, |
|
"grad_norm": 1.1236610412597656, |
|
"learning_rate": 9.848275132657903e-06, |
|
"loss": 0.5605, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.6417910447761194, |
|
"grad_norm": 1.0268014669418335, |
|
"learning_rate": 9.842177896096495e-06, |
|
"loss": 0.6164, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.6492537313432836, |
|
"grad_norm": 1.311312198638916, |
|
"learning_rate": 9.835962511807786e-06, |
|
"loss": 0.6248, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.6567164179104478, |
|
"grad_norm": 1.103390097618103, |
|
"learning_rate": 9.829629131445342e-06, |
|
"loss": 0.5902, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.664179104477612, |
|
"grad_norm": 1.1991970539093018, |
|
"learning_rate": 9.823177909541795e-06, |
|
"loss": 0.7637, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.6716417910447762, |
|
"grad_norm": 1.0043079853057861, |
|
"learning_rate": 9.816609003505073e-06, |
|
"loss": 0.5539, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6716417910447762, |
|
"eval_loss": 0.2582014799118042, |
|
"eval_runtime": 15.6475, |
|
"eval_samples_per_second": 10.928, |
|
"eval_steps_per_second": 1.406, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6791044776119403, |
|
"grad_norm": 0.7138786315917969, |
|
"learning_rate": 9.80992257361457e-06, |
|
"loss": 0.3945, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.6865671641791045, |
|
"grad_norm": 1.0335519313812256, |
|
"learning_rate": 9.803118783017221e-06, |
|
"loss": 0.6189, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.6940298507462687, |
|
"grad_norm": 1.3408584594726562, |
|
"learning_rate": 9.796197797723532e-06, |
|
"loss": 0.7297, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.7014925373134329, |
|
"grad_norm": 1.0914255380630493, |
|
"learning_rate": 9.789159786603524e-06, |
|
"loss": 0.6801, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.7089552238805971, |
|
"grad_norm": 1.1385866403579712, |
|
"learning_rate": 9.782004921382612e-06, |
|
"loss": 0.6327, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.7164179104477612, |
|
"grad_norm": 0.8933367133140564, |
|
"learning_rate": 9.774733376637422e-06, |
|
"loss": 0.5923, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.7238805970149254, |
|
"grad_norm": 1.1315196752548218, |
|
"learning_rate": 9.767345329791523e-06, |
|
"loss": 0.6883, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.7313432835820896, |
|
"grad_norm": 0.9604437947273254, |
|
"learning_rate": 9.759840961111098e-06, |
|
"loss": 0.5106, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.7388059701492538, |
|
"grad_norm": 1.1591802835464478, |
|
"learning_rate": 9.752220453700556e-06, |
|
"loss": 0.6319, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.746268656716418, |
|
"grad_norm": 1.2259401082992554, |
|
"learning_rate": 9.744483993498052e-06, |
|
"loss": 0.4872, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.746268656716418, |
|
"eval_loss": 0.25687727332115173, |
|
"eval_runtime": 15.7529, |
|
"eval_samples_per_second": 10.855, |
|
"eval_steps_per_second": 1.397, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.753731343283582, |
|
"grad_norm": 0.6692157983779907, |
|
"learning_rate": 9.736631769270958e-06, |
|
"loss": 0.2946, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.7611940298507462, |
|
"grad_norm": 1.2447078227996826, |
|
"learning_rate": 9.728663972611253e-06, |
|
"loss": 0.6527, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.7686567164179104, |
|
"grad_norm": 1.0134166479110718, |
|
"learning_rate": 9.720580797930845e-06, |
|
"loss": 0.5679, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.7761194029850746, |
|
"grad_norm": 1.3856276273727417, |
|
"learning_rate": 9.712382442456845e-06, |
|
"loss": 0.7747, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.7835820895522388, |
|
"grad_norm": 1.1679767370224, |
|
"learning_rate": 9.704069106226728e-06, |
|
"loss": 0.6551, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.7910447761194029, |
|
"grad_norm": 1.041477918624878, |
|
"learning_rate": 9.695640992083472e-06, |
|
"loss": 0.6679, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.7985074626865671, |
|
"grad_norm": 1.5636087656021118, |
|
"learning_rate": 9.687098305670606e-06, |
|
"loss": 0.8609, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.8059701492537313, |
|
"grad_norm": 1.0191229581832886, |
|
"learning_rate": 9.67844125542718e-06, |
|
"loss": 0.516, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.8134328358208955, |
|
"grad_norm": 1.1325874328613281, |
|
"learning_rate": 9.669670052582695e-06, |
|
"loss": 0.7177, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.8208955223880597, |
|
"grad_norm": 0.9325621128082275, |
|
"learning_rate": 9.66078491115194e-06, |
|
"loss": 0.3969, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8208955223880597, |
|
"eval_loss": 0.25576016306877136, |
|
"eval_runtime": 15.6749, |
|
"eval_samples_per_second": 10.909, |
|
"eval_steps_per_second": 1.404, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8283582089552238, |
|
"grad_norm": 0.9852689504623413, |
|
"learning_rate": 9.651786047929772e-06, |
|
"loss": 0.4127, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.835820895522388, |
|
"grad_norm": 1.1226204633712769, |
|
"learning_rate": 9.642673682485831e-06, |
|
"loss": 0.7266, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.8432835820895522, |
|
"grad_norm": 1.1815338134765625, |
|
"learning_rate": 9.633448037159167e-06, |
|
"loss": 0.7233, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.8507462686567164, |
|
"grad_norm": 1.067295789718628, |
|
"learning_rate": 9.624109337052839e-06, |
|
"loss": 0.7226, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.8582089552238806, |
|
"grad_norm": 0.7038566470146179, |
|
"learning_rate": 9.614657810028402e-06, |
|
"loss": 0.352, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.8656716417910447, |
|
"grad_norm": 1.0434422492980957, |
|
"learning_rate": 9.605093686700356e-06, |
|
"loss": 0.7496, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.8731343283582089, |
|
"grad_norm": 0.6814462542533875, |
|
"learning_rate": 9.595417200430517e-06, |
|
"loss": 0.3977, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.8805970149253731, |
|
"grad_norm": 1.2506330013275146, |
|
"learning_rate": 9.585628587322329e-06, |
|
"loss": 0.5536, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.8880597014925373, |
|
"grad_norm": 2.40456485748291, |
|
"learning_rate": 9.575728086215093e-06, |
|
"loss": 0.8602, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.8955223880597015, |
|
"grad_norm": 0.9782617092132568, |
|
"learning_rate": 9.565715938678146e-06, |
|
"loss": 0.5878, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.8955223880597015, |
|
"eval_loss": 0.25602081418037415, |
|
"eval_runtime": 16.0769, |
|
"eval_samples_per_second": 10.636, |
|
"eval_steps_per_second": 1.368, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.9029850746268657, |
|
"grad_norm": 0.8370435833930969, |
|
"learning_rate": 9.555592389004967e-06, |
|
"loss": 0.6102, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.9104477611940298, |
|
"grad_norm": 1.0810141563415527, |
|
"learning_rate": 9.54535768420721e-06, |
|
"loss": 0.5595, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.917910447761194, |
|
"grad_norm": 1.0081559419631958, |
|
"learning_rate": 9.535012074008688e-06, |
|
"loss": 0.4607, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.9253731343283582, |
|
"grad_norm": 0.9018644690513611, |
|
"learning_rate": 9.524555810839267e-06, |
|
"loss": 0.4455, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.9328358208955224, |
|
"grad_norm": 0.6649499535560608, |
|
"learning_rate": 9.513989149828718e-06, |
|
"loss": 0.2407, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.9402985074626866, |
|
"grad_norm": 1.1930391788482666, |
|
"learning_rate": 9.503312348800486e-06, |
|
"loss": 0.6361, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.9477611940298507, |
|
"grad_norm": 0.9348613023757935, |
|
"learning_rate": 9.4925256682654e-06, |
|
"loss": 0.4939, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.9552238805970149, |
|
"grad_norm": 1.2714776992797852, |
|
"learning_rate": 9.481629371415315e-06, |
|
"loss": 0.6854, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.9626865671641791, |
|
"grad_norm": 1.0918177366256714, |
|
"learning_rate": 9.470623724116693e-06, |
|
"loss": 0.6523, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.9701492537313433, |
|
"grad_norm": 0.747653603553772, |
|
"learning_rate": 9.459508994904119e-06, |
|
"loss": 0.4212, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.9701492537313433, |
|
"eval_loss": 0.25406235456466675, |
|
"eval_runtime": 16.5104, |
|
"eval_samples_per_second": 10.357, |
|
"eval_steps_per_second": 1.332, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.9776119402985075, |
|
"grad_norm": 0.867660641670227, |
|
"learning_rate": 9.448285454973739e-06, |
|
"loss": 0.4303, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.9850746268656716, |
|
"grad_norm": 0.9838992357254028, |
|
"learning_rate": 9.43695337817665e-06, |
|
"loss": 0.6881, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.9925373134328358, |
|
"grad_norm": 1.149843454360962, |
|
"learning_rate": 9.42551304101222e-06, |
|
"loss": 0.6812, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.6692924499511719, |
|
"learning_rate": 9.413964722621339e-06, |
|
"loss": 0.3432, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.007462686567164, |
|
"grad_norm": 1.6719107627868652, |
|
"learning_rate": 9.4023087047796e-06, |
|
"loss": 0.5406, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.0149253731343284, |
|
"grad_norm": 0.7576178312301636, |
|
"learning_rate": 9.390545271890438e-06, |
|
"loss": 0.4529, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.0223880597014925, |
|
"grad_norm": 1.1865450143814087, |
|
"learning_rate": 9.378674710978185e-06, |
|
"loss": 0.4609, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.0298507462686568, |
|
"grad_norm": 1.0263293981552124, |
|
"learning_rate": 9.366697311681058e-06, |
|
"loss": 0.5971, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.037313432835821, |
|
"grad_norm": 0.838365912437439, |
|
"learning_rate": 9.354613366244108e-06, |
|
"loss": 0.4553, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.044776119402985, |
|
"grad_norm": 0.9639071822166443, |
|
"learning_rate": 9.342423169512072e-06, |
|
"loss": 0.5695, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.044776119402985, |
|
"eval_loss": 0.2554219663143158, |
|
"eval_runtime": 15.7211, |
|
"eval_samples_per_second": 10.877, |
|
"eval_steps_per_second": 1.399, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.0522388059701493, |
|
"grad_norm": 1.9610925912857056, |
|
"learning_rate": 9.330127018922195e-06, |
|
"loss": 0.4975, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.0597014925373134, |
|
"grad_norm": 0.6606642603874207, |
|
"learning_rate": 9.31772521449696e-06, |
|
"loss": 0.1836, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.0671641791044777, |
|
"grad_norm": 1.3162277936935425, |
|
"learning_rate": 9.305218058836778e-06, |
|
"loss": 0.6204, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.0746268656716418, |
|
"grad_norm": 1.4593067169189453, |
|
"learning_rate": 9.292605857112595e-06, |
|
"loss": 0.4129, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.0820895522388059, |
|
"grad_norm": 0.8607202172279358, |
|
"learning_rate": 9.279888917058453e-06, |
|
"loss": 0.4644, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.0895522388059702, |
|
"grad_norm": 1.15462064743042, |
|
"learning_rate": 9.267067548963975e-06, |
|
"loss": 0.4726, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.0970149253731343, |
|
"grad_norm": 2.365873336791992, |
|
"learning_rate": 9.254142065666802e-06, |
|
"loss": 0.716, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.1044776119402986, |
|
"grad_norm": 2.1044762134552, |
|
"learning_rate": 9.241112782544953e-06, |
|
"loss": 0.4993, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.1119402985074627, |
|
"grad_norm": 0.7202847003936768, |
|
"learning_rate": 9.22798001750913e-06, |
|
"loss": 0.2366, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.1194029850746268, |
|
"grad_norm": 0.93817538022995, |
|
"learning_rate": 9.214744090994973e-06, |
|
"loss": 0.4358, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.1194029850746268, |
|
"eval_loss": 0.2571178376674652, |
|
"eval_runtime": 16.0344, |
|
"eval_samples_per_second": 10.665, |
|
"eval_steps_per_second": 1.372, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.126865671641791, |
|
"grad_norm": 1.1024166345596313, |
|
"learning_rate": 9.201405325955222e-06, |
|
"loss": 0.4351, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.1343283582089552, |
|
"grad_norm": 0.8645644187927246, |
|
"learning_rate": 9.187964047851851e-06, |
|
"loss": 0.4179, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.1417910447761195, |
|
"grad_norm": 0.6371698975563049, |
|
"learning_rate": 9.174420584648123e-06, |
|
"loss": 0.1953, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.1492537313432836, |
|
"grad_norm": 1.643067717552185, |
|
"learning_rate": 9.160775266800583e-06, |
|
"loss": 0.4733, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.1567164179104479, |
|
"grad_norm": 0.9567477703094482, |
|
"learning_rate": 9.14702842725101e-06, |
|
"loss": 0.4015, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.164179104477612, |
|
"grad_norm": 0.8631231784820557, |
|
"learning_rate": 9.133180401418271e-06, |
|
"loss": 0.2983, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.171641791044776, |
|
"grad_norm": 0.6879262328147888, |
|
"learning_rate": 9.11923152719016e-06, |
|
"loss": 0.3474, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.1791044776119404, |
|
"grad_norm": 0.9094505906105042, |
|
"learning_rate": 9.10518214491513e-06, |
|
"loss": 0.5146, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.1865671641791045, |
|
"grad_norm": 0.7373756766319275, |
|
"learning_rate": 9.091032597394012e-06, |
|
"loss": 0.3366, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.1940298507462686, |
|
"grad_norm": 1.2869657278060913, |
|
"learning_rate": 9.076783229871636e-06, |
|
"loss": 0.6705, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.1940298507462686, |
|
"eval_loss": 0.2553011476993561, |
|
"eval_runtime": 15.7243, |
|
"eval_samples_per_second": 10.875, |
|
"eval_steps_per_second": 1.399, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.2014925373134329, |
|
"grad_norm": 1.151159405708313, |
|
"learning_rate": 9.062434390028407e-06, |
|
"loss": 0.5876, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.208955223880597, |
|
"grad_norm": 1.174561858177185, |
|
"learning_rate": 9.047986427971832e-06, |
|
"loss": 0.4806, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.2164179104477613, |
|
"grad_norm": 1.2193154096603394, |
|
"learning_rate": 9.033439696227966e-06, |
|
"loss": 0.5455, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.2238805970149254, |
|
"grad_norm": 1.2365609407424927, |
|
"learning_rate": 9.018794549732819e-06, |
|
"loss": 0.4742, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.2313432835820897, |
|
"grad_norm": 1.3054121732711792, |
|
"learning_rate": 9.00405134582369e-06, |
|
"loss": 0.5833, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.2388059701492538, |
|
"grad_norm": 0.5406851172447205, |
|
"learning_rate": 8.98921044423045e-06, |
|
"loss": 0.2561, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.2462686567164178, |
|
"grad_norm": 1.277519702911377, |
|
"learning_rate": 8.974272207066767e-06, |
|
"loss": 0.686, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.2537313432835822, |
|
"grad_norm": 0.9183409810066223, |
|
"learning_rate": 8.959236998821267e-06, |
|
"loss": 0.6249, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.2611940298507462, |
|
"grad_norm": 0.838150143623352, |
|
"learning_rate": 8.944105186348646e-06, |
|
"loss": 0.3115, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.2686567164179103, |
|
"grad_norm": 0.821236252784729, |
|
"learning_rate": 8.928877138860708e-06, |
|
"loss": 0.4498, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.2686567164179103, |
|
"eval_loss": 0.2549591362476349, |
|
"eval_runtime": 15.6822, |
|
"eval_samples_per_second": 10.904, |
|
"eval_steps_per_second": 1.403, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.2761194029850746, |
|
"grad_norm": 1.1852009296417236, |
|
"learning_rate": 8.913553227917366e-06, |
|
"loss": 0.5287, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.2835820895522387, |
|
"grad_norm": 0.8103587627410889, |
|
"learning_rate": 8.89813382741758e-06, |
|
"loss": 0.4295, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.291044776119403, |
|
"grad_norm": 0.7645599246025085, |
|
"learning_rate": 8.882619313590212e-06, |
|
"loss": 0.3835, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.2985074626865671, |
|
"grad_norm": 0.6826095581054688, |
|
"learning_rate": 8.86701006498488e-06, |
|
"loss": 0.2922, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.3059701492537314, |
|
"grad_norm": 0.7371439933776855, |
|
"learning_rate": 8.851306462462689e-06, |
|
"loss": 0.305, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.3134328358208955, |
|
"grad_norm": 1.2219048738479614, |
|
"learning_rate": 8.835508889186957e-06, |
|
"loss": 0.6685, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.3208955223880596, |
|
"grad_norm": 0.6370560526847839, |
|
"learning_rate": 8.819617730613863e-06, |
|
"loss": 0.2816, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.328358208955224, |
|
"grad_norm": 0.9550657272338867, |
|
"learning_rate": 8.803633374483036e-06, |
|
"loss": 0.433, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.335820895522388, |
|
"grad_norm": 0.9216323494911194, |
|
"learning_rate": 8.787556210808101e-06, |
|
"loss": 0.372, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.3432835820895521, |
|
"grad_norm": 0.8051882386207581, |
|
"learning_rate": 8.771386631867158e-06, |
|
"loss": 0.4607, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.3432835820895521, |
|
"eval_loss": 0.255341112613678, |
|
"eval_runtime": 15.7753, |
|
"eval_samples_per_second": 10.84, |
|
"eval_steps_per_second": 1.395, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.3507462686567164, |
|
"grad_norm": 1.3185007572174072, |
|
"learning_rate": 8.755125032193215e-06, |
|
"loss": 0.731, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.3582089552238805, |
|
"grad_norm": 0.8819650411605835, |
|
"learning_rate": 8.738771808564555e-06, |
|
"loss": 0.3844, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.3656716417910448, |
|
"grad_norm": 0.9434659481048584, |
|
"learning_rate": 8.722327359995064e-06, |
|
"loss": 0.4774, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.373134328358209, |
|
"grad_norm": 1.0129953622817993, |
|
"learning_rate": 8.705792087724485e-06, |
|
"loss": 0.6397, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.3805970149253732, |
|
"grad_norm": 0.8750033378601074, |
|
"learning_rate": 8.689166395208638e-06, |
|
"loss": 0.5263, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.3880597014925373, |
|
"grad_norm": 0.8555174469947815, |
|
"learning_rate": 8.672450688109563e-06, |
|
"loss": 0.4434, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.3955223880597014, |
|
"grad_norm": 1.1890219449996948, |
|
"learning_rate": 8.655645374285637e-06, |
|
"loss": 0.4444, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.4029850746268657, |
|
"grad_norm": 0.8571549654006958, |
|
"learning_rate": 8.638750863781614e-06, |
|
"loss": 0.2912, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.4104477611940298, |
|
"grad_norm": 0.8109440207481384, |
|
"learning_rate": 8.621767568818614e-06, |
|
"loss": 0.5321, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.417910447761194, |
|
"grad_norm": 0.5370824933052063, |
|
"learning_rate": 8.60469590378408e-06, |
|
"loss": 0.2494, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.417910447761194, |
|
"eval_loss": 0.2551746368408203, |
|
"eval_runtime": 16.3169, |
|
"eval_samples_per_second": 10.48, |
|
"eval_steps_per_second": 1.348, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.4253731343283582, |
|
"grad_norm": 0.9776721596717834, |
|
"learning_rate": 8.587536285221656e-06, |
|
"loss": 0.4893, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.4328358208955223, |
|
"grad_norm": 0.8991337418556213, |
|
"learning_rate": 8.570289131821025e-06, |
|
"loss": 0.4496, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.4402985074626866, |
|
"grad_norm": 0.8432929515838623, |
|
"learning_rate": 8.552954864407699e-06, |
|
"loss": 0.3647, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.4477611940298507, |
|
"grad_norm": 0.7621536254882812, |
|
"learning_rate": 8.535533905932739e-06, |
|
"loss": 0.3156, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.455223880597015, |
|
"grad_norm": 0.9852966070175171, |
|
"learning_rate": 8.518026681462448e-06, |
|
"loss": 0.3542, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.462686567164179, |
|
"grad_norm": 1.1621863842010498, |
|
"learning_rate": 8.500433618167993e-06, |
|
"loss": 0.6648, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.4701492537313432, |
|
"grad_norm": 0.845145583152771, |
|
"learning_rate": 8.482755145314987e-06, |
|
"loss": 0.2917, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.4776119402985075, |
|
"grad_norm": 0.8889628648757935, |
|
"learning_rate": 8.464991694253001e-06, |
|
"loss": 0.3615, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.4850746268656716, |
|
"grad_norm": 0.9698963165283203, |
|
"learning_rate": 8.44714369840506e-06, |
|
"loss": 0.5225, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.4925373134328357, |
|
"grad_norm": 0.7870234251022339, |
|
"learning_rate": 8.429211593257054e-06, |
|
"loss": 0.4429, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.4925373134328357, |
|
"eval_loss": 0.2533767521381378, |
|
"eval_runtime": 16.4295, |
|
"eval_samples_per_second": 10.408, |
|
"eval_steps_per_second": 1.339, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.8282889127731323, |
|
"learning_rate": 8.41119581634711e-06, |
|
"loss": 0.2888, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.5074626865671643, |
|
"grad_norm": 1.3417259454727173, |
|
"learning_rate": 8.393096807254932e-06, |
|
"loss": 0.8644, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.5149253731343284, |
|
"grad_norm": 0.7999916076660156, |
|
"learning_rate": 8.374915007591053e-06, |
|
"loss": 0.4493, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.5223880597014925, |
|
"grad_norm": 1.507180094718933, |
|
"learning_rate": 8.356650860986083e-06, |
|
"loss": 0.4486, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.5298507462686568, |
|
"grad_norm": 0.8137674927711487, |
|
"learning_rate": 8.338304813079866e-06, |
|
"loss": 0.4833, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.537313432835821, |
|
"grad_norm": 1.2336827516555786, |
|
"learning_rate": 8.319877311510614e-06, |
|
"loss": 0.6646, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.544776119402985, |
|
"grad_norm": 0.9395962953567505, |
|
"learning_rate": 8.301368805903988e-06, |
|
"loss": 0.455, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.5522388059701493, |
|
"grad_norm": 0.9407913684844971, |
|
"learning_rate": 8.282779747862122e-06, |
|
"loss": 0.6132, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.5597014925373134, |
|
"grad_norm": 0.7824919819831848, |
|
"learning_rate": 8.264110590952609e-06, |
|
"loss": 0.5489, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.5671641791044775, |
|
"grad_norm": 0.7193758487701416, |
|
"learning_rate": 8.245361790697425e-06, |
|
"loss": 0.4639, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.5671641791044775, |
|
"eval_loss": 0.2530945837497711, |
|
"eval_runtime": 16.071, |
|
"eval_samples_per_second": 10.64, |
|
"eval_steps_per_second": 1.369, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.5746268656716418, |
|
"grad_norm": 1.141431450843811, |
|
"learning_rate": 8.226533804561828e-06, |
|
"loss": 0.6021, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.582089552238806, |
|
"grad_norm": 0.8864127397537231, |
|
"learning_rate": 8.207627091943178e-06, |
|
"loss": 0.5709, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.5895522388059702, |
|
"grad_norm": 1.565999150276184, |
|
"learning_rate": 8.188642114159748e-06, |
|
"loss": 0.4649, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.5970149253731343, |
|
"grad_norm": 1.0003879070281982, |
|
"learning_rate": 8.169579334439453e-06, |
|
"loss": 0.5816, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.6044776119402986, |
|
"grad_norm": 1.0208603143692017, |
|
"learning_rate": 8.150439217908557e-06, |
|
"loss": 0.4815, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.6119402985074627, |
|
"grad_norm": 0.6908705830574036, |
|
"learning_rate": 8.131222231580313e-06, |
|
"loss": 0.3388, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.6194029850746268, |
|
"grad_norm": 1.2532159090042114, |
|
"learning_rate": 8.11192884434358e-06, |
|
"loss": 0.6774, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.626865671641791, |
|
"grad_norm": 0.8950490951538086, |
|
"learning_rate": 8.092559526951374e-06, |
|
"loss": 0.6011, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.6343283582089554, |
|
"grad_norm": 1.6891182661056519, |
|
"learning_rate": 8.073114752009388e-06, |
|
"loss": 0.5921, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.6417910447761193, |
|
"grad_norm": 1.0266426801681519, |
|
"learning_rate": 8.053594993964453e-06, |
|
"loss": 0.6355, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.6417910447761193, |
|
"eval_loss": 0.25234121084213257, |
|
"eval_runtime": 16.2031, |
|
"eval_samples_per_second": 10.554, |
|
"eval_steps_per_second": 1.358, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.6492537313432836, |
|
"grad_norm": 0.7774348258972168, |
|
"learning_rate": 8.034000729092967e-06, |
|
"loss": 0.4213, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.6567164179104479, |
|
"grad_norm": 0.9359886646270752, |
|
"learning_rate": 8.014332435489276e-06, |
|
"loss": 0.4473, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.664179104477612, |
|
"grad_norm": 0.8109466433525085, |
|
"learning_rate": 7.994590593054001e-06, |
|
"loss": 0.4733, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.671641791044776, |
|
"grad_norm": 0.7832448482513428, |
|
"learning_rate": 7.974775683482337e-06, |
|
"loss": 0.3198, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.6791044776119404, |
|
"grad_norm": 1.1789060831069946, |
|
"learning_rate": 7.954888190252292e-06, |
|
"loss": 0.5846, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.6865671641791045, |
|
"grad_norm": 0.9324489831924438, |
|
"learning_rate": 7.934928598612896e-06, |
|
"loss": 0.3998, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.6940298507462686, |
|
"grad_norm": 1.1701297760009766, |
|
"learning_rate": 7.914897395572362e-06, |
|
"loss": 0.6273, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.7014925373134329, |
|
"grad_norm": 0.9922040700912476, |
|
"learning_rate": 7.894795069886192e-06, |
|
"loss": 0.5599, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.7089552238805972, |
|
"grad_norm": 0.8753315806388855, |
|
"learning_rate": 7.874622112045269e-06, |
|
"loss": 0.4123, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.716417910447761, |
|
"grad_norm": 1.1100213527679443, |
|
"learning_rate": 7.854379014263877e-06, |
|
"loss": 0.3805, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.716417910447761, |
|
"eval_loss": 0.2517262399196625, |
|
"eval_runtime": 15.9604, |
|
"eval_samples_per_second": 10.714, |
|
"eval_steps_per_second": 1.378, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.7238805970149254, |
|
"grad_norm": 0.9963248372077942, |
|
"learning_rate": 7.83406627046769e-06, |
|
"loss": 0.2903, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.7313432835820897, |
|
"grad_norm": 0.7608401775360107, |
|
"learning_rate": 7.81368437628173e-06, |
|
"loss": 0.4172, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.7388059701492538, |
|
"grad_norm": 1.166113018989563, |
|
"learning_rate": 7.793233829018263e-06, |
|
"loss": 0.609, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.7462686567164178, |
|
"grad_norm": 0.7339043617248535, |
|
"learning_rate": 7.772715127664676e-06, |
|
"loss": 0.3509, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.7537313432835822, |
|
"grad_norm": 1.2148951292037964, |
|
"learning_rate": 7.752128772871292e-06, |
|
"loss": 0.4682, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.7611940298507462, |
|
"grad_norm": 2.158393383026123, |
|
"learning_rate": 7.731475266939159e-06, |
|
"loss": 0.4768, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.7686567164179103, |
|
"grad_norm": 1.624114990234375, |
|
"learning_rate": 7.710755113807793e-06, |
|
"loss": 0.5563, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.7761194029850746, |
|
"grad_norm": 0.6288245916366577, |
|
"learning_rate": 7.689968819042884e-06, |
|
"loss": 0.2496, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.783582089552239, |
|
"grad_norm": 0.6780006885528564, |
|
"learning_rate": 7.669116889823955e-06, |
|
"loss": 0.243, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.7910447761194028, |
|
"grad_norm": 1.302188515663147, |
|
"learning_rate": 7.648199834931994e-06, |
|
"loss": 0.5689, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.7910447761194028, |
|
"eval_loss": 0.25215157866477966, |
|
"eval_runtime": 16.1686, |
|
"eval_samples_per_second": 10.576, |
|
"eval_steps_per_second": 1.361, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.7985074626865671, |
|
"grad_norm": 0.9067533016204834, |
|
"learning_rate": 7.627218164737031e-06, |
|
"loss": 0.4587, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.8059701492537314, |
|
"grad_norm": 0.7963538765907288, |
|
"learning_rate": 7.6061723911857e-06, |
|
"loss": 0.4912, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.8134328358208955, |
|
"grad_norm": 0.8649603724479675, |
|
"learning_rate": 7.58506302778873e-06, |
|
"loss": 0.3812, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.8208955223880596, |
|
"grad_norm": 0.5839230418205261, |
|
"learning_rate": 7.563890589608427e-06, |
|
"loss": 0.341, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.828358208955224, |
|
"grad_norm": 0.8967713117599487, |
|
"learning_rate": 7.542655593246103e-06, |
|
"loss": 0.5506, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.835820895522388, |
|
"grad_norm": 1.0193324089050293, |
|
"learning_rate": 7.52135855682947e-06, |
|
"loss": 0.7101, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.8432835820895521, |
|
"grad_norm": 1.0445090532302856, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.4873, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.8507462686567164, |
|
"grad_norm": 1.0798559188842773, |
|
"learning_rate": 7.478580443900247e-06, |
|
"loss": 0.6465, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.8582089552238807, |
|
"grad_norm": 1.5234549045562744, |
|
"learning_rate": 7.457100411161128e-06, |
|
"loss": 0.6318, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.8656716417910446, |
|
"grad_norm": 1.1321910619735718, |
|
"learning_rate": 7.435560425889169e-06, |
|
"loss": 0.5929, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.8656716417910446, |
|
"eval_loss": 0.25326502323150635, |
|
"eval_runtime": 15.7124, |
|
"eval_samples_per_second": 10.883, |
|
"eval_steps_per_second": 1.4, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.873134328358209, |
|
"grad_norm": 0.8814710974693298, |
|
"learning_rate": 7.413961013653725e-06, |
|
"loss": 0.413, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.8805970149253732, |
|
"grad_norm": 0.782132625579834, |
|
"learning_rate": 7.392302701474151e-06, |
|
"loss": 0.4623, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.8880597014925373, |
|
"grad_norm": 1.0612297058105469, |
|
"learning_rate": 7.370586017806942e-06, |
|
"loss": 0.6307, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.8955223880597014, |
|
"grad_norm": 0.7790008783340454, |
|
"learning_rate": 7.34881149253284e-06, |
|
"loss": 0.4426, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.9029850746268657, |
|
"grad_norm": 0.8862120509147644, |
|
"learning_rate": 7.326979656943907e-06, |
|
"loss": 0.56, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.9104477611940298, |
|
"grad_norm": 0.6480329632759094, |
|
"learning_rate": 7.305091043730558e-06, |
|
"loss": 0.3719, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.917910447761194, |
|
"grad_norm": 1.0156606435775757, |
|
"learning_rate": 7.283146186968566e-06, |
|
"loss": 0.4542, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.9253731343283582, |
|
"grad_norm": 0.8710734248161316, |
|
"learning_rate": 7.261145622106033e-06, |
|
"loss": 0.4221, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.9328358208955225, |
|
"grad_norm": 0.9503659009933472, |
|
"learning_rate": 7.239089885950317e-06, |
|
"loss": 0.474, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.9402985074626866, |
|
"grad_norm": 0.9117936491966248, |
|
"learning_rate": 7.216979516654944e-06, |
|
"loss": 0.5434, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.9402985074626866, |
|
"eval_loss": 0.25116848945617676, |
|
"eval_runtime": 15.9835, |
|
"eval_samples_per_second": 10.699, |
|
"eval_steps_per_second": 1.376, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.9477611940298507, |
|
"grad_norm": 0.9242149591445923, |
|
"learning_rate": 7.194815053706471e-06, |
|
"loss": 0.4634, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.955223880597015, |
|
"grad_norm": 1.1205576658248901, |
|
"learning_rate": 7.172597037911323e-06, |
|
"loss": 0.5393, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.962686567164179, |
|
"grad_norm": 0.9420229196548462, |
|
"learning_rate": 7.1503260113826035e-06, |
|
"loss": 0.5668, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.9701492537313432, |
|
"grad_norm": 0.8083785772323608, |
|
"learning_rate": 7.128002517526856e-06, |
|
"loss": 0.3753, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.9776119402985075, |
|
"grad_norm": 0.8978068232536316, |
|
"learning_rate": 7.105627101030816e-06, |
|
"loss": 0.5198, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.9850746268656716, |
|
"grad_norm": 0.7519446611404419, |
|
"learning_rate": 7.083200307848116e-06, |
|
"loss": 0.3855, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.9925373134328357, |
|
"grad_norm": 0.9396398663520813, |
|
"learning_rate": 7.060722685185961e-06, |
|
"loss": 0.4073, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.6269404888153076, |
|
"learning_rate": 7.038194781491785e-06, |
|
"loss": 0.3679, |
|
"step": 268 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 670, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 1.0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.204653518707753e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|