{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 10,
  "global_step": 536,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007462686567164179,
      "grad_norm": 3.073047161102295,
      "learning_rate": 0.0,
      "loss": 0.5576,
      "step": 1
    },
    {
      "epoch": 0.014925373134328358,
      "grad_norm": 4.13862419128418,
      "learning_rate": 2.9411764705882356e-07,
      "loss": 0.6964,
      "step": 2
    },
    {
      "epoch": 0.022388059701492536,
      "grad_norm": 4.69643497467041,
      "learning_rate": 5.882352941176471e-07,
      "loss": 0.8567,
      "step": 3
    },
    {
      "epoch": 0.029850746268656716,
      "grad_norm": 4.596504211425781,
      "learning_rate": 8.823529411764707e-07,
      "loss": 0.8257,
      "step": 4
    },
    {
      "epoch": 0.03731343283582089,
      "grad_norm": 2.9434754848480225,
      "learning_rate": 1.1764705882352942e-06,
      "loss": 0.4799,
      "step": 5
    },
    {
      "epoch": 0.04477611940298507,
      "grad_norm": 6.064290523529053,
      "learning_rate": 1.4705882352941177e-06,
      "loss": 1.0834,
      "step": 6
    },
    {
      "epoch": 0.05223880597014925,
      "grad_norm": 3.5080268383026123,
      "learning_rate": 1.7647058823529414e-06,
      "loss": 0.6863,
      "step": 7
    },
    {
      "epoch": 0.05970149253731343,
      "grad_norm": 3.5126824378967285,
      "learning_rate": 2.058823529411765e-06,
      "loss": 0.6823,
      "step": 8
    },
    {
      "epoch": 0.06716417910447761,
      "grad_norm": 2.0096919536590576,
      "learning_rate": 2.3529411764705885e-06,
      "loss": 0.5064,
      "step": 9
    },
    {
      "epoch": 0.07462686567164178,
      "grad_norm": 2.6150012016296387,
      "learning_rate": 2.647058823529412e-06,
      "loss": 0.9675,
      "step": 10
    },
    {
      "epoch": 0.07462686567164178,
      "eval_loss": 0.3130320906639099,
      "eval_runtime": 15.9567,
      "eval_samples_per_second": 10.717,
      "eval_steps_per_second": 1.379,
      "step": 10
    },
    {
      "epoch": 0.08208955223880597,
      "grad_norm": 2.244316577911377,
      "learning_rate": 2.9411764705882355e-06,
      "loss": 0.6905,
      "step": 11
    },
    {
      "epoch": 0.08955223880597014,
      "grad_norm": 1.4434276819229126,
      "learning_rate": 3.2352941176470594e-06,
      "loss": 0.6089,
      "step": 12
    },
    {
      "epoch": 0.09701492537313433,
      "grad_norm": 2.2414164543151855,
      "learning_rate": 3.529411764705883e-06,
      "loss": 0.4699,
      "step": 13
    },
    {
      "epoch": 0.1044776119402985,
      "grad_norm": 2.0805301666259766,
      "learning_rate": 3.8235294117647055e-06,
      "loss": 0.6048,
      "step": 14
    },
    {
      "epoch": 0.11194029850746269,
      "grad_norm": 2.441056251525879,
      "learning_rate": 4.11764705882353e-06,
      "loss": 0.6305,
      "step": 15
    },
    {
      "epoch": 0.11940298507462686,
      "grad_norm": 2.762195110321045,
      "learning_rate": 4.411764705882353e-06,
      "loss": 0.7118,
      "step": 16
    },
    {
      "epoch": 0.12686567164179105,
      "grad_norm": 2.0519938468933105,
      "learning_rate": 4.705882352941177e-06,
      "loss": 0.6276,
      "step": 17
    },
    {
      "epoch": 0.13432835820895522,
      "grad_norm": 2.0802817344665527,
      "learning_rate": 5e-06,
      "loss": 0.6074,
      "step": 18
    },
    {
      "epoch": 0.1417910447761194,
      "grad_norm": 2.1733672618865967,
      "learning_rate": 5.294117647058824e-06,
      "loss": 0.6163,
      "step": 19
    },
    {
      "epoch": 0.14925373134328357,
      "grad_norm": 2.3623249530792236,
      "learning_rate": 5.588235294117647e-06,
      "loss": 0.8673,
      "step": 20
    },
    {
      "epoch": 0.14925373134328357,
      "eval_loss": 0.29671722650527954,
      "eval_runtime": 16.4113,
      "eval_samples_per_second": 10.42,
      "eval_steps_per_second": 1.341,
      "step": 20
    },
    {
      "epoch": 0.15671641791044777,
      "grad_norm": 2.0744874477386475,
      "learning_rate": 5.882352941176471e-06,
      "loss": 0.7421,
      "step": 21
    },
    {
      "epoch": 0.16417910447761194,
      "grad_norm": 1.4317227602005005,
      "learning_rate": 6.176470588235295e-06,
      "loss": 0.6758,
      "step": 22
    },
    {
      "epoch": 0.17164179104477612,
      "grad_norm": 1.5149955749511719,
      "learning_rate": 6.470588235294119e-06,
      "loss": 0.7199,
      "step": 23
    },
    {
      "epoch": 0.1791044776119403,
      "grad_norm": 1.4495139122009277,
      "learning_rate": 6.764705882352942e-06,
      "loss": 0.6544,
      "step": 24
    },
    {
      "epoch": 0.1865671641791045,
      "grad_norm": 1.240043044090271,
      "learning_rate": 7.058823529411766e-06,
      "loss": 0.3998,
      "step": 25
    },
    {
      "epoch": 0.19402985074626866,
      "grad_norm": 1.3463258743286133,
      "learning_rate": 7.352941176470589e-06,
      "loss": 0.6238,
      "step": 26
    },
    {
      "epoch": 0.20149253731343283,
      "grad_norm": 1.5899564027786255,
      "learning_rate": 7.647058823529411e-06,
      "loss": 0.7535,
      "step": 27
    },
    {
      "epoch": 0.208955223880597,
      "grad_norm": 0.9583171606063843,
      "learning_rate": 7.941176470588236e-06,
      "loss": 0.4474,
      "step": 28
    },
    {
      "epoch": 0.21641791044776118,
      "grad_norm": 1.2705698013305664,
      "learning_rate": 8.23529411764706e-06,
      "loss": 0.6322,
      "step": 29
    },
    {
      "epoch": 0.22388059701492538,
      "grad_norm": 1.2777875661849976,
      "learning_rate": 8.529411764705883e-06,
      "loss": 0.6081,
      "step": 30
    },
    {
      "epoch": 0.22388059701492538,
      "eval_loss": 0.2805997133255005,
      "eval_runtime": 16.0973,
      "eval_samples_per_second": 10.623,
      "eval_steps_per_second": 1.367,
      "step": 30
    },
    {
      "epoch": 0.23134328358208955,
      "grad_norm": 1.2508858442306519,
      "learning_rate": 8.823529411764707e-06,
      "loss": 0.4562,
      "step": 31
    },
    {
      "epoch": 0.23880597014925373,
      "grad_norm": 1.1761791706085205,
      "learning_rate": 9.11764705882353e-06,
      "loss": 0.5464,
      "step": 32
    },
    {
      "epoch": 0.2462686567164179,
      "grad_norm": 1.298427939414978,
      "learning_rate": 9.411764705882354e-06,
      "loss": 0.4709,
      "step": 33
    },
    {
      "epoch": 0.2537313432835821,
      "grad_norm": 0.7913378477096558,
      "learning_rate": 9.705882352941177e-06,
      "loss": 0.2938,
      "step": 34
    },
    {
      "epoch": 0.26119402985074625,
      "grad_norm": 1.0940293073654175,
      "learning_rate": 1e-05,
      "loss": 0.4555,
      "step": 35
    },
    {
      "epoch": 0.26865671641791045,
      "grad_norm": 1.1584433317184448,
      "learning_rate": 9.999939000729718e-06,
      "loss": 0.5991,
      "step": 36
    },
    {
      "epoch": 0.27611940298507465,
      "grad_norm": 1.0347867012023926,
      "learning_rate": 9.99975600440723e-06,
      "loss": 0.6246,
      "step": 37
    },
    {
      "epoch": 0.2835820895522388,
      "grad_norm": 1.4303066730499268,
      "learning_rate": 9.999451015497595e-06,
      "loss": 0.652,
      "step": 38
    },
    {
      "epoch": 0.291044776119403,
      "grad_norm": 0.8853992819786072,
      "learning_rate": 9.999024041442455e-06,
      "loss": 0.5446,
      "step": 39
    },
    {
      "epoch": 0.29850746268656714,
      "grad_norm": 0.7761288285255432,
      "learning_rate": 9.99847509265985e-06,
      "loss": 0.2384,
      "step": 40
    },
    {
      "epoch": 0.29850746268656714,
      "eval_loss": 0.2706838846206665,
      "eval_runtime": 15.7024,
      "eval_samples_per_second": 10.89,
      "eval_steps_per_second": 1.401,
      "step": 40
    },
    {
      "epoch": 0.30597014925373134,
      "grad_norm": 1.230162262916565,
      "learning_rate": 9.997804182543973e-06,
      "loss": 0.7637,
      "step": 41
    },
    {
      "epoch": 0.31343283582089554,
      "grad_norm": 1.2055861949920654,
      "learning_rate": 9.997011327464832e-06,
      "loss": 0.6891,
      "step": 42
    },
    {
      "epoch": 0.3208955223880597,
      "grad_norm": 1.5400534868240356,
      "learning_rate": 9.99609654676786e-06,
      "loss": 0.7425,
      "step": 43
    },
    {
      "epoch": 0.3283582089552239,
      "grad_norm": 1.1577208042144775,
      "learning_rate": 9.99505986277344e-06,
      "loss": 0.5395,
      "step": 44
    },
    {
      "epoch": 0.3358208955223881,
      "grad_norm": 1.050628423690796,
      "learning_rate": 9.993901300776358e-06,
      "loss": 0.4203,
      "step": 45
    },
    {
      "epoch": 0.34328358208955223,
      "grad_norm": 1.0624663829803467,
      "learning_rate": 9.99262088904519e-06,
      "loss": 0.6665,
      "step": 46
    },
    {
      "epoch": 0.35074626865671643,
      "grad_norm": 1.544499158859253,
      "learning_rate": 9.991218658821609e-06,
      "loss": 0.6507,
      "step": 47
    },
    {
      "epoch": 0.3582089552238806,
      "grad_norm": 1.3652788400650024,
      "learning_rate": 9.989694644319618e-06,
      "loss": 0.8455,
      "step": 48
    },
    {
      "epoch": 0.3656716417910448,
      "grad_norm": 1.29508376121521,
      "learning_rate": 9.988048882724732e-06,
      "loss": 0.5849,
      "step": 49
    },
    {
      "epoch": 0.373134328358209,
      "grad_norm": 1.6214385032653809,
      "learning_rate": 9.98628141419305e-06,
      "loss": 0.5859,
      "step": 50
    },
    {
      "epoch": 0.373134328358209,
      "eval_loss": 0.2677931785583496,
      "eval_runtime": 16.2049,
      "eval_samples_per_second": 10.552,
      "eval_steps_per_second": 1.358,
      "step": 50
    },
    {
      "epoch": 0.3805970149253731,
      "grad_norm": 0.8862821459770203,
      "learning_rate": 9.984392281850293e-06,
      "loss": 0.4504,
      "step": 51
    },
    {
      "epoch": 0.3880597014925373,
      "grad_norm": 1.047014832496643,
      "learning_rate": 9.982381531790733e-06,
      "loss": 0.4514,
      "step": 52
    },
    {
      "epoch": 0.39552238805970147,
      "grad_norm": 1.115302324295044,
      "learning_rate": 9.980249213076085e-06,
      "loss": 0.6673,
      "step": 53
    },
    {
      "epoch": 0.40298507462686567,
      "grad_norm": 1.1105623245239258,
      "learning_rate": 9.977995377734307e-06,
      "loss": 0.4681,
      "step": 54
    },
    {
      "epoch": 0.41044776119402987,
      "grad_norm": 0.9929081201553345,
      "learning_rate": 9.975620080758321e-06,
      "loss": 0.5058,
      "step": 55
    },
    {
      "epoch": 0.417910447761194,
      "grad_norm": 1.0726927518844604,
      "learning_rate": 9.97312338010468e-06,
      "loss": 0.6072,
      "step": 56
    },
    {
      "epoch": 0.4253731343283582,
      "grad_norm": 0.9720289707183838,
      "learning_rate": 9.970505336692153e-06,
      "loss": 0.5592,
      "step": 57
    },
    {
      "epoch": 0.43283582089552236,
      "grad_norm": 1.327181100845337,
      "learning_rate": 9.967766014400233e-06,
      "loss": 0.5153,
      "step": 58
    },
    {
      "epoch": 0.44029850746268656,
      "grad_norm": 1.0509099960327148,
      "learning_rate": 9.964905480067585e-06,
      "loss": 0.4982,
      "step": 59
    },
    {
      "epoch": 0.44776119402985076,
      "grad_norm": 0.8376150727272034,
      "learning_rate": 9.961923803490412e-06,
      "loss": 0.488,
      "step": 60
    },
    {
      "epoch": 0.44776119402985076,
      "eval_loss": 0.2636841833591461,
      "eval_runtime": 15.6643,
      "eval_samples_per_second": 10.917,
      "eval_steps_per_second": 1.404,
      "step": 60
    },
    {
      "epoch": 0.4552238805970149,
      "grad_norm": 1.2392864227294922,
      "learning_rate": 9.958821057420752e-06,
      "loss": 0.4786,
      "step": 61
    },
    {
      "epoch": 0.4626865671641791,
      "grad_norm": 1.0299177169799805,
      "learning_rate": 9.955597317564705e-06,
      "loss": 0.6627,
      "step": 62
    },
    {
      "epoch": 0.4701492537313433,
      "grad_norm": 1.244341492652893,
      "learning_rate": 9.95225266258058e-06,
      "loss": 0.8958,
      "step": 63
    },
    {
      "epoch": 0.47761194029850745,
      "grad_norm": 0.9261450171470642,
      "learning_rate": 9.948787174076982e-06,
      "loss": 0.4079,
      "step": 64
    },
    {
      "epoch": 0.48507462686567165,
      "grad_norm": 0.7639384269714355,
      "learning_rate": 9.945200936610821e-06,
      "loss": 0.3676,
      "step": 65
    },
    {
      "epoch": 0.4925373134328358,
      "grad_norm": 1.1056618690490723,
      "learning_rate": 9.941494037685244e-06,
      "loss": 0.6575,
      "step": 66
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.5201542377471924,
      "learning_rate": 9.9376665677475e-06,
      "loss": 0.5449,
      "step": 67
    },
    {
      "epoch": 0.5074626865671642,
      "grad_norm": 0.904944121837616,
      "learning_rate": 9.933718620186745e-06,
      "loss": 0.4245,
      "step": 68
    },
    {
      "epoch": 0.5149253731343284,
      "grad_norm": 0.9076265096664429,
      "learning_rate": 9.92965029133174e-06,
      "loss": 0.5433,
      "step": 69
    },
    {
      "epoch": 0.5223880597014925,
      "grad_norm": 0.8957193493843079,
      "learning_rate": 9.925461680448528e-06,
      "loss": 0.3874,
      "step": 70
    },
    {
      "epoch": 0.5223880597014925,
      "eval_loss": 0.2620687484741211,
      "eval_runtime": 16.11,
      "eval_samples_per_second": 10.615,
      "eval_steps_per_second": 1.366,
      "step": 70
    },
    {
      "epoch": 0.5298507462686567,
      "grad_norm": 1.1844974756240845,
      "learning_rate": 9.921152889737985e-06,
      "loss": 0.7415,
      "step": 71
    },
    {
      "epoch": 0.5373134328358209,
      "grad_norm": 1.1240217685699463,
      "learning_rate": 9.91672402433335e-06,
      "loss": 0.6783,
      "step": 72
    },
    {
      "epoch": 0.5447761194029851,
      "grad_norm": 1.0070396661758423,
      "learning_rate": 9.912175192297648e-06,
      "loss": 0.3991,
      "step": 73
    },
    {
      "epoch": 0.5522388059701493,
      "grad_norm": 0.8779905438423157,
      "learning_rate": 9.907506504621052e-06,
      "loss": 0.3972,
      "step": 74
    },
    {
      "epoch": 0.5597014925373134,
      "grad_norm": 1.2013908624649048,
      "learning_rate": 9.902718075218176e-06,
      "loss": 0.59,
      "step": 75
    },
    {
      "epoch": 0.5671641791044776,
      "grad_norm": 1.4874799251556396,
      "learning_rate": 9.897810020925301e-06,
      "loss": 0.6601,
      "step": 76
    },
    {
      "epoch": 0.5746268656716418,
      "grad_norm": 1.2102411985397339,
      "learning_rate": 9.892782461497521e-06,
      "loss": 0.6659,
      "step": 77
    },
    {
      "epoch": 0.582089552238806,
      "grad_norm": 1.09522545337677,
      "learning_rate": 9.887635519605816e-06,
      "loss": 0.6796,
      "step": 78
    },
    {
      "epoch": 0.5895522388059702,
      "grad_norm": 1.020628809928894,
      "learning_rate": 9.882369320834068e-06,
      "loss": 0.6041,
      "step": 79
    },
    {
      "epoch": 0.5970149253731343,
      "grad_norm": 1.0358238220214844,
      "learning_rate": 9.87698399367599e-06,
      "loss": 0.7233,
      "step": 80
    },
    {
      "epoch": 0.5970149253731343,
      "eval_loss": 0.2595260739326477,
      "eval_runtime": 16.6446,
      "eval_samples_per_second": 10.274,
      "eval_steps_per_second": 1.322,
      "step": 80
    },
    {
      "epoch": 0.6044776119402985,
      "grad_norm": 1.099974513053894,
      "learning_rate": 9.871479669531988e-06,
      "loss": 0.6985,
      "step": 81
    },
    {
      "epoch": 0.6119402985074627,
      "grad_norm": 1.3373053073883057,
      "learning_rate": 9.865856482705973e-06,
      "loss": 0.6532,
      "step": 82
    },
    {
      "epoch": 0.6194029850746269,
      "grad_norm": 0.9295642971992493,
      "learning_rate": 9.860114570402055e-06,
      "loss": 0.6586,
      "step": 83
    },
    {
      "epoch": 0.6268656716417911,
      "grad_norm": 0.8639213442802429,
      "learning_rate": 9.854254072721222e-06,
      "loss": 0.3806,
      "step": 84
    },
    {
      "epoch": 0.6343283582089553,
      "grad_norm": 1.1236610412597656,
      "learning_rate": 9.848275132657903e-06,
      "loss": 0.5605,
      "step": 85
    },
    {
      "epoch": 0.6417910447761194,
      "grad_norm": 1.0268014669418335,
      "learning_rate": 9.842177896096495e-06,
      "loss": 0.6164,
      "step": 86
    },
    {
      "epoch": 0.6492537313432836,
      "grad_norm": 1.311312198638916,
      "learning_rate": 9.835962511807786e-06,
      "loss": 0.6248,
      "step": 87
    },
    {
      "epoch": 0.6567164179104478,
      "grad_norm": 1.103390097618103,
      "learning_rate": 9.829629131445342e-06,
      "loss": 0.5902,
      "step": 88
    },
    {
      "epoch": 0.664179104477612,
      "grad_norm": 1.1991970539093018,
      "learning_rate": 9.823177909541795e-06,
      "loss": 0.7637,
      "step": 89
    },
    {
      "epoch": 0.6716417910447762,
      "grad_norm": 1.0043079853057861,
      "learning_rate": 9.816609003505073e-06,
      "loss": 0.5539,
      "step": 90
    },
    {
      "epoch": 0.6716417910447762,
      "eval_loss": 0.2582014799118042,
      "eval_runtime": 15.6475,
      "eval_samples_per_second": 10.928,
      "eval_steps_per_second": 1.406,
      "step": 90
    },
    {
      "epoch": 0.6791044776119403,
      "grad_norm": 0.7138786315917969,
      "learning_rate": 9.80992257361457e-06,
      "loss": 0.3945,
      "step": 91
    },
    {
      "epoch": 0.6865671641791045,
      "grad_norm": 1.0335519313812256,
      "learning_rate": 9.803118783017221e-06,
      "loss": 0.6189,
      "step": 92
    },
    {
      "epoch": 0.6940298507462687,
      "grad_norm": 1.3408584594726562,
      "learning_rate": 9.796197797723532e-06,
      "loss": 0.7297,
      "step": 93
    },
    {
      "epoch": 0.7014925373134329,
      "grad_norm": 1.0914255380630493,
      "learning_rate": 9.789159786603524e-06,
      "loss": 0.6801,
      "step": 94
    },
    {
      "epoch": 0.7089552238805971,
      "grad_norm": 1.1385866403579712,
      "learning_rate": 9.782004921382612e-06,
      "loss": 0.6327,
      "step": 95
    },
    {
      "epoch": 0.7164179104477612,
      "grad_norm": 0.8933367133140564,
      "learning_rate": 9.774733376637422e-06,
      "loss": 0.5923,
      "step": 96
    },
    {
      "epoch": 0.7238805970149254,
      "grad_norm": 1.1315196752548218,
      "learning_rate": 9.767345329791523e-06,
      "loss": 0.6883,
      "step": 97
    },
    {
      "epoch": 0.7313432835820896,
      "grad_norm": 0.9604437947273254,
      "learning_rate": 9.759840961111098e-06,
      "loss": 0.5106,
      "step": 98
    },
    {
      "epoch": 0.7388059701492538,
      "grad_norm": 1.1591802835464478,
      "learning_rate": 9.752220453700556e-06,
      "loss": 0.6319,
      "step": 99
    },
    {
      "epoch": 0.746268656716418,
      "grad_norm": 1.2259401082992554,
      "learning_rate": 9.744483993498052e-06,
      "loss": 0.4872,
      "step": 100
    },
    {
      "epoch": 0.746268656716418,
      "eval_loss": 0.25687727332115173,
      "eval_runtime": 15.7529,
      "eval_samples_per_second": 10.855,
      "eval_steps_per_second": 1.397,
      "step": 100
    },
    {
      "epoch": 0.753731343283582,
      "grad_norm": 0.6692157983779907,
      "learning_rate": 9.736631769270958e-06,
      "loss": 0.2946,
      "step": 101
    },
    {
      "epoch": 0.7611940298507462,
      "grad_norm": 1.2447078227996826,
      "learning_rate": 9.728663972611253e-06,
      "loss": 0.6527,
      "step": 102
    },
    {
      "epoch": 0.7686567164179104,
      "grad_norm": 1.0134166479110718,
      "learning_rate": 9.720580797930845e-06,
      "loss": 0.5679,
      "step": 103
    },
    {
      "epoch": 0.7761194029850746,
      "grad_norm": 1.3856276273727417,
      "learning_rate": 9.712382442456845e-06,
      "loss": 0.7747,
      "step": 104
    },
    {
      "epoch": 0.7835820895522388,
      "grad_norm": 1.1679767370224,
      "learning_rate": 9.704069106226728e-06,
      "loss": 0.6551,
      "step": 105
    },
    {
      "epoch": 0.7910447761194029,
      "grad_norm": 1.041477918624878,
      "learning_rate": 9.695640992083472e-06,
      "loss": 0.6679,
      "step": 106
    },
    {
      "epoch": 0.7985074626865671,
      "grad_norm": 1.5636087656021118,
      "learning_rate": 9.687098305670606e-06,
      "loss": 0.8609,
      "step": 107
    },
    {
      "epoch": 0.8059701492537313,
      "grad_norm": 1.0191229581832886,
      "learning_rate": 9.67844125542718e-06,
      "loss": 0.516,
      "step": 108
    },
    {
      "epoch": 0.8134328358208955,
      "grad_norm": 1.1325874328613281,
      "learning_rate": 9.669670052582695e-06,
      "loss": 0.7177,
      "step": 109
    },
    {
      "epoch": 0.8208955223880597,
      "grad_norm": 0.9325621128082275,
      "learning_rate": 9.66078491115194e-06,
      "loss": 0.3969,
      "step": 110
    },
    {
      "epoch": 0.8208955223880597,
      "eval_loss": 0.25576016306877136,
      "eval_runtime": 15.6749,
      "eval_samples_per_second": 10.909,
      "eval_steps_per_second": 1.404,
      "step": 110
    },
    {
      "epoch": 0.8283582089552238,
      "grad_norm": 0.9852689504623413,
      "learning_rate": 9.651786047929772e-06,
      "loss": 0.4127,
      "step": 111
    },
    {
      "epoch": 0.835820895522388,
      "grad_norm": 1.1226204633712769,
      "learning_rate": 9.642673682485831e-06,
      "loss": 0.7266,
      "step": 112
    },
    {
      "epoch": 0.8432835820895522,
      "grad_norm": 1.1815338134765625,
      "learning_rate": 9.633448037159167e-06,
      "loss": 0.7233,
      "step": 113
    },
    {
      "epoch": 0.8507462686567164,
      "grad_norm": 1.067295789718628,
      "learning_rate": 9.624109337052839e-06,
      "loss": 0.7226,
      "step": 114
    },
    {
      "epoch": 0.8582089552238806,
      "grad_norm": 0.7038566470146179,
      "learning_rate": 9.614657810028402e-06,
      "loss": 0.352,
      "step": 115
    },
    {
      "epoch": 0.8656716417910447,
      "grad_norm": 1.0434422492980957,
      "learning_rate": 9.605093686700356e-06,
      "loss": 0.7496,
      "step": 116
    },
    {
      "epoch": 0.8731343283582089,
      "grad_norm": 0.6814462542533875,
      "learning_rate": 9.595417200430517e-06,
      "loss": 0.3977,
      "step": 117
    },
    {
      "epoch": 0.8805970149253731,
      "grad_norm": 1.2506330013275146,
      "learning_rate": 9.585628587322329e-06,
      "loss": 0.5536,
      "step": 118
    },
    {
      "epoch": 0.8880597014925373,
      "grad_norm": 2.40456485748291,
      "learning_rate": 9.575728086215093e-06,
      "loss": 0.8602,
      "step": 119
    },
    {
      "epoch": 0.8955223880597015,
      "grad_norm": 0.9782617092132568,
      "learning_rate": 9.565715938678146e-06,
      "loss": 0.5878,
      "step": 120
    },
    {
      "epoch": 0.8955223880597015,
      "eval_loss": 0.25602081418037415,
      "eval_runtime": 16.0769,
      "eval_samples_per_second": 10.636,
      "eval_steps_per_second": 1.368,
      "step": 120
    },
    {
      "epoch": 0.9029850746268657,
      "grad_norm": 0.8370435833930969,
      "learning_rate": 9.555592389004967e-06,
      "loss": 0.6102,
      "step": 121
    },
    {
      "epoch": 0.9104477611940298,
      "grad_norm": 1.0810141563415527,
      "learning_rate": 9.54535768420721e-06,
      "loss": 0.5595,
      "step": 122
    },
    {
      "epoch": 0.917910447761194,
      "grad_norm": 1.0081559419631958,
      "learning_rate": 9.535012074008688e-06,
      "loss": 0.4607,
      "step": 123
    },
    {
      "epoch": 0.9253731343283582,
      "grad_norm": 0.9018644690513611,
      "learning_rate": 9.524555810839267e-06,
      "loss": 0.4455,
      "step": 124
    },
    {
      "epoch": 0.9328358208955224,
      "grad_norm": 0.6649499535560608,
      "learning_rate": 9.513989149828718e-06,
      "loss": 0.2407,
      "step": 125
    },
    {
      "epoch": 0.9402985074626866,
      "grad_norm": 1.1930391788482666,
      "learning_rate": 9.503312348800486e-06,
      "loss": 0.6361,
      "step": 126
    },
    {
      "epoch": 0.9477611940298507,
      "grad_norm": 0.9348613023757935,
      "learning_rate": 9.4925256682654e-06,
      "loss": 0.4939,
      "step": 127
    },
    {
      "epoch": 0.9552238805970149,
      "grad_norm": 1.2714776992797852,
      "learning_rate": 9.481629371415315e-06,
      "loss": 0.6854,
      "step": 128
    },
    {
      "epoch": 0.9626865671641791,
      "grad_norm": 1.0918177366256714,
      "learning_rate": 9.470623724116693e-06,
      "loss": 0.6523,
      "step": 129
    },
    {
      "epoch": 0.9701492537313433,
      "grad_norm": 0.747653603553772,
      "learning_rate": 9.459508994904119e-06,
      "loss": 0.4212,
      "step": 130
    },
    {
      "epoch": 0.9701492537313433,
      "eval_loss": 0.25406235456466675,
      "eval_runtime": 16.5104,
      "eval_samples_per_second": 10.357,
      "eval_steps_per_second": 1.332,
      "step": 130
    },
    {
      "epoch": 0.9776119402985075,
      "grad_norm": 0.867660641670227,
      "learning_rate": 9.448285454973739e-06,
      "loss": 0.4303,
      "step": 131
    },
    {
      "epoch": 0.9850746268656716,
      "grad_norm": 0.9838992357254028,
      "learning_rate": 9.43695337817665e-06,
      "loss": 0.6881,
      "step": 132
    },
    {
      "epoch": 0.9925373134328358,
      "grad_norm": 1.149843454360962,
      "learning_rate": 9.42551304101222e-06,
      "loss": 0.6812,
      "step": 133
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.6692924499511719,
      "learning_rate": 9.413964722621339e-06,
      "loss": 0.3432,
      "step": 134
    },
    {
      "epoch": 1.007462686567164,
      "grad_norm": 1.6719107627868652,
      "learning_rate": 9.4023087047796e-06,
      "loss": 0.5406,
      "step": 135
    },
    {
      "epoch": 1.0149253731343284,
      "grad_norm": 0.7576178312301636,
      "learning_rate": 9.390545271890438e-06,
      "loss": 0.4529,
      "step": 136
    },
    {
      "epoch": 1.0223880597014925,
      "grad_norm": 1.1865450143814087,
      "learning_rate": 9.378674710978185e-06,
      "loss": 0.4609,
      "step": 137
    },
    {
      "epoch": 1.0298507462686568,
      "grad_norm": 1.0263293981552124,
      "learning_rate": 9.366697311681058e-06,
      "loss": 0.5971,
      "step": 138
    },
    {
      "epoch": 1.037313432835821,
      "grad_norm": 0.838365912437439,
      "learning_rate": 9.354613366244108e-06,
      "loss": 0.4553,
      "step": 139
    },
    {
      "epoch": 1.044776119402985,
      "grad_norm": 0.9639071822166443,
      "learning_rate": 9.342423169512072e-06,
      "loss": 0.5695,
      "step": 140
    },
    {
      "epoch": 1.044776119402985,
      "eval_loss": 0.2554219663143158,
      "eval_runtime": 15.7211,
      "eval_samples_per_second": 10.877,
      "eval_steps_per_second": 1.399,
      "step": 140
    },
    {
      "epoch": 1.0522388059701493,
      "grad_norm": 1.9610925912857056,
      "learning_rate": 9.330127018922195e-06,
      "loss": 0.4975,
      "step": 141
    },
    {
      "epoch": 1.0597014925373134,
      "grad_norm": 0.6606642603874207,
      "learning_rate": 9.31772521449696e-06,
      "loss": 0.1836,
      "step": 142
    },
    {
      "epoch": 1.0671641791044777,
      "grad_norm": 1.3162277936935425,
      "learning_rate": 9.305218058836778e-06,
      "loss": 0.6204,
      "step": 143
    },
    {
      "epoch": 1.0746268656716418,
      "grad_norm": 1.4593067169189453,
      "learning_rate": 9.292605857112595e-06,
      "loss": 0.4129,
      "step": 144
    },
    {
      "epoch": 1.0820895522388059,
      "grad_norm": 0.8607202172279358,
      "learning_rate": 9.279888917058453e-06,
      "loss": 0.4644,
      "step": 145
    },
    {
      "epoch": 1.0895522388059702,
      "grad_norm": 1.15462064743042,
      "learning_rate": 9.267067548963975e-06,
      "loss": 0.4726,
      "step": 146
    },
    {
      "epoch": 1.0970149253731343,
      "grad_norm": 2.365873336791992,
      "learning_rate": 9.254142065666802e-06,
      "loss": 0.716,
      "step": 147
    },
    {
      "epoch": 1.1044776119402986,
      "grad_norm": 2.1044762134552,
      "learning_rate": 9.241112782544953e-06,
      "loss": 0.4993,
      "step": 148
    },
    {
      "epoch": 1.1119402985074627,
      "grad_norm": 0.7202847003936768,
      "learning_rate": 9.22798001750913e-06,
      "loss": 0.2366,
      "step": 149
    },
    {
      "epoch": 1.1194029850746268,
      "grad_norm": 0.93817538022995,
      "learning_rate": 9.214744090994973e-06,
      "loss": 0.4358,
      "step": 150
    },
    {
      "epoch": 1.1194029850746268,
      "eval_loss": 0.2571178376674652,
      "eval_runtime": 16.0344,
      "eval_samples_per_second": 10.665,
      "eval_steps_per_second": 1.372,
      "step": 150
    },
    {
      "epoch": 1.126865671641791,
      "grad_norm": 1.1024166345596313,
      "learning_rate": 9.201405325955222e-06,
      "loss": 0.4351,
      "step": 151
    },
    {
      "epoch": 1.1343283582089552,
      "grad_norm": 0.8645644187927246,
      "learning_rate": 9.187964047851851e-06,
      "loss": 0.4179,
      "step": 152
    },
    {
      "epoch": 1.1417910447761195,
      "grad_norm": 0.6371698975563049,
      "learning_rate": 9.174420584648123e-06,
      "loss": 0.1953,
      "step": 153
    },
    {
      "epoch": 1.1492537313432836,
      "grad_norm": 1.643067717552185,
      "learning_rate": 9.160775266800583e-06,
      "loss": 0.4733,
      "step": 154
    },
    {
      "epoch": 1.1567164179104479,
      "grad_norm": 0.9567477703094482,
      "learning_rate": 9.14702842725101e-06,
      "loss": 0.4015,
      "step": 155
    },
    {
      "epoch": 1.164179104477612,
      "grad_norm": 0.8631231784820557,
      "learning_rate": 9.133180401418271e-06,
      "loss": 0.2983,
      "step": 156
    },
    {
      "epoch": 1.171641791044776,
      "grad_norm": 0.6879262328147888,
      "learning_rate": 9.11923152719016e-06,
      "loss": 0.3474,
      "step": 157
    },
    {
      "epoch": 1.1791044776119404,
      "grad_norm": 0.9094505906105042,
      "learning_rate": 9.10518214491513e-06,
      "loss": 0.5146,
      "step": 158
    },
    {
      "epoch": 1.1865671641791045,
      "grad_norm": 0.7373756766319275,
      "learning_rate": 9.091032597394012e-06,
      "loss": 0.3366,
      "step": 159
    },
    {
      "epoch": 1.1940298507462686,
      "grad_norm": 1.2869657278060913,
      "learning_rate": 9.076783229871636e-06,
      "loss": 0.6705,
      "step": 160
    },
    {
      "epoch": 1.1940298507462686,
      "eval_loss": 0.2553011476993561,
      "eval_runtime": 15.7243,
      "eval_samples_per_second": 10.875,
      "eval_steps_per_second": 1.399,
      "step": 160
    },
    {
      "epoch": 1.2014925373134329,
      "grad_norm": 1.151159405708313,
      "learning_rate": 9.062434390028407e-06,
      "loss": 0.5876,
      "step": 161
    },
    {
      "epoch": 1.208955223880597,
      "grad_norm": 1.174561858177185,
      "learning_rate": 9.047986427971832e-06,
      "loss": 0.4806,
      "step": 162
    },
    {
      "epoch": 1.2164179104477613,
      "grad_norm": 1.2193154096603394,
      "learning_rate": 9.033439696227966e-06,
      "loss": 0.5455,
      "step": 163
    },
    {
      "epoch": 1.2238805970149254,
      "grad_norm": 1.2365609407424927,
      "learning_rate": 9.018794549732819e-06,
      "loss": 0.4742,
      "step": 164
    },
    {
      "epoch": 1.2313432835820897,
      "grad_norm": 1.3054121732711792,
      "learning_rate": 9.00405134582369e-06,
      "loss": 0.5833,
      "step": 165
    },
    {
      "epoch": 1.2388059701492538,
      "grad_norm": 0.5406851172447205,
      "learning_rate": 8.98921044423045e-06,
      "loss": 0.2561,
      "step": 166
    },
    {
      "epoch": 1.2462686567164178,
      "grad_norm": 1.277519702911377,
      "learning_rate": 8.974272207066767e-06,
      "loss": 0.686,
      "step": 167
    },
    {
      "epoch": 1.2537313432835822,
      "grad_norm": 0.9183409810066223,
      "learning_rate": 8.959236998821267e-06,
      "loss": 0.6249,
      "step": 168
    },
    {
      "epoch": 1.2611940298507462,
      "grad_norm": 0.838150143623352,
      "learning_rate": 8.944105186348646e-06,
      "loss": 0.3115,
      "step": 169
    },
    {
      "epoch": 1.2686567164179103,
      "grad_norm": 0.821236252784729,
      "learning_rate": 8.928877138860708e-06,
      "loss": 0.4498,
      "step": 170
    },
    {
      "epoch": 1.2686567164179103,
      "eval_loss": 0.2549591362476349,
      "eval_runtime": 15.6822,
      "eval_samples_per_second": 10.904,
      "eval_steps_per_second": 1.403,
      "step": 170
    },
    {
      "epoch": 1.2761194029850746,
      "grad_norm": 1.1852009296417236,
      "learning_rate": 8.913553227917366e-06,
      "loss": 0.5287,
      "step": 171
    },
    {
      "epoch": 1.2835820895522387,
      "grad_norm": 0.8103587627410889,
      "learning_rate": 8.89813382741758e-06,
      "loss": 0.4295,
      "step": 172
    },
    {
      "epoch": 1.291044776119403,
      "grad_norm": 0.7645599246025085,
      "learning_rate": 8.882619313590212e-06,
      "loss": 0.3835,
      "step": 173
    },
    {
      "epoch": 1.2985074626865671,
      "grad_norm": 0.6826095581054688,
      "learning_rate": 8.86701006498488e-06,
      "loss": 0.2922,
      "step": 174
    },
    {
      "epoch": 1.3059701492537314,
      "grad_norm": 0.7371439933776855,
      "learning_rate": 8.851306462462689e-06,
      "loss": 0.305,
      "step": 175
    },
    {
      "epoch": 1.3134328358208955,
      "grad_norm": 1.2219048738479614,
      "learning_rate": 8.835508889186957e-06,
      "loss": 0.6685,
      "step": 176
    },
    {
      "epoch": 1.3208955223880596,
      "grad_norm": 0.6370560526847839,
      "learning_rate": 8.819617730613863e-06,
      "loss": 0.2816,
      "step": 177
    },
    {
      "epoch": 1.328358208955224,
      "grad_norm": 0.9550657272338867,
      "learning_rate": 8.803633374483036e-06,
      "loss": 0.433,
      "step": 178
    },
    {
      "epoch": 1.335820895522388,
      "grad_norm": 0.9216323494911194,
      "learning_rate": 8.787556210808101e-06,
      "loss": 0.372,
      "step": 179
    },
    {
      "epoch": 1.3432835820895521,
      "grad_norm": 0.8051882386207581,
      "learning_rate": 8.771386631867158e-06,
      "loss": 0.4607,
      "step": 180
    },
    {
      "epoch": 1.3432835820895521,
      "eval_loss": 0.255341112613678,
      "eval_runtime": 15.7753,
      "eval_samples_per_second": 10.84,
      "eval_steps_per_second": 1.395,
      "step": 180
    },
    {
      "epoch": 1.3507462686567164,
      "grad_norm": 1.3185007572174072,
      "learning_rate": 8.755125032193215e-06,
      "loss": 0.731,
      "step": 181
    },
    {
      "epoch": 1.3582089552238805,
      "grad_norm": 0.8819650411605835,
      "learning_rate": 8.738771808564555e-06,
      "loss": 0.3844,
      "step": 182
    },
    {
      "epoch": 1.3656716417910448,
      "grad_norm": 0.9434659481048584,
      "learning_rate": 8.722327359995064e-06,
      "loss": 0.4774,
      "step": 183
    },
    {
      "epoch": 1.373134328358209,
      "grad_norm": 1.0129953622817993,
      "learning_rate": 8.705792087724485e-06,
      "loss": 0.6397,
      "step": 184
    },
    {
      "epoch": 1.3805970149253732,
      "grad_norm": 0.8750033378601074,
      "learning_rate": 8.689166395208638e-06,
      "loss": 0.5263,
      "step": 185
    },
    {
      "epoch": 1.3880597014925373,
      "grad_norm": 0.8555174469947815,
      "learning_rate": 8.672450688109563e-06,
      "loss": 0.4434,
      "step": 186
    },
    {
      "epoch": 1.3955223880597014,
      "grad_norm": 1.1890219449996948,
      "learning_rate": 8.655645374285637e-06,
      "loss": 0.4444,
      "step": 187
    },
    {
      "epoch": 1.4029850746268657,
      "grad_norm": 0.8571549654006958,
      "learning_rate": 8.638750863781614e-06,
      "loss": 0.2912,
      "step": 188
    },
    {
      "epoch": 1.4104477611940298,
      "grad_norm": 0.8109440207481384,
      "learning_rate": 8.621767568818614e-06,
      "loss": 0.5321,
      "step": 189
    },
    {
      "epoch": 1.417910447761194,
      "grad_norm": 0.5370824933052063,
      "learning_rate": 8.60469590378408e-06,
      "loss": 0.2494,
      "step": 190
    },
    {
      "epoch": 1.417910447761194,
      "eval_loss": 0.2551746368408203,
      "eval_runtime": 16.3169,
      "eval_samples_per_second": 10.48,
      "eval_steps_per_second": 1.348,
      "step": 190
    },
    {
      "epoch": 1.4253731343283582,
      "grad_norm": 0.9776721596717834,
      "learning_rate": 8.587536285221656e-06,
      "loss": 0.4893,
      "step": 191
    },
    {
      "epoch": 1.4328358208955223,
      "grad_norm": 0.8991337418556213,
      "learning_rate": 8.570289131821025e-06,
      "loss": 0.4496,
      "step": 192
    },
    {
      "epoch": 1.4402985074626866,
      "grad_norm": 0.8432929515838623,
      "learning_rate": 8.552954864407699e-06,
      "loss": 0.3647,
      "step": 193
    },
    {
      "epoch": 1.4477611940298507,
      "grad_norm": 0.7621536254882812,
      "learning_rate": 8.535533905932739e-06,
      "loss": 0.3156,
      "step": 194
    },
    {
      "epoch": 1.455223880597015,
      "grad_norm": 0.9852966070175171,
      "learning_rate": 8.518026681462448e-06,
      "loss": 0.3542,
      "step": 195
    },
    {
      "epoch": 1.462686567164179,
      "grad_norm": 1.1621863842010498,
      "learning_rate": 8.500433618167993e-06,
      "loss": 0.6648,
      "step": 196
    },
    {
      "epoch": 1.4701492537313432,
      "grad_norm": 0.845145583152771,
      "learning_rate": 8.482755145314987e-06,
      "loss": 0.2917,
      "step": 197
    },
    {
      "epoch": 1.4776119402985075,
      "grad_norm": 0.8889628648757935,
      "learning_rate": 8.464991694253001e-06,
      "loss": 0.3615,
      "step": 198
    },
    {
      "epoch": 1.4850746268656716,
      "grad_norm": 0.9698963165283203,
      "learning_rate": 8.44714369840506e-06,
      "loss": 0.5225,
      "step": 199
    },
    {
      "epoch": 1.4925373134328357,
      "grad_norm": 0.7870234251022339,
      "learning_rate": 8.429211593257054e-06,
      "loss": 0.4429,
      "step": 200
    },
    {
      "epoch": 1.4925373134328357,
      "eval_loss": 0.2533767521381378,
      "eval_runtime": 16.4295,
      "eval_samples_per_second": 10.408,
      "eval_steps_per_second": 1.339,
      "step": 200
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.8282889127731323,
      "learning_rate": 8.41119581634711e-06,
      "loss": 0.2888,
      "step": 201
    },
    {
      "epoch": 1.5074626865671643,
      "grad_norm": 1.3417259454727173,
      "learning_rate": 8.393096807254932e-06,
      "loss": 0.8644,
      "step": 202
    },
    {
      "epoch": 1.5149253731343284,
      "grad_norm": 0.7999916076660156,
      "learning_rate": 8.374915007591053e-06,
      "loss": 0.4493,
      "step": 203
    },
    {
      "epoch": 1.5223880597014925,
      "grad_norm": 1.507180094718933,
      "learning_rate": 8.356650860986083e-06,
      "loss": 0.4486,
      "step": 204
    },
    {
      "epoch": 1.5298507462686568,
      "grad_norm": 0.8137674927711487,
      "learning_rate": 8.338304813079866e-06,
      "loss": 0.4833,
      "step": 205
    },
    {
      "epoch": 1.537313432835821,
      "grad_norm": 1.2336827516555786,
      "learning_rate": 8.319877311510614e-06,
      "loss": 0.6646,
      "step": 206
    },
    {
      "epoch": 1.544776119402985,
      "grad_norm": 0.9395962953567505,
      "learning_rate": 8.301368805903988e-06,
      "loss": 0.455,
      "step": 207
    },
    {
      "epoch": 1.5522388059701493,
      "grad_norm": 0.9407913684844971,
      "learning_rate": 8.282779747862122e-06,
      "loss": 0.6132,
      "step": 208
    },
    {
      "epoch": 1.5597014925373134,
      "grad_norm": 0.7824919819831848,
      "learning_rate": 8.264110590952609e-06,
      "loss": 0.5489,
      "step": 209
    },
    {
      "epoch": 1.5671641791044775,
      "grad_norm": 0.7193758487701416,
      "learning_rate": 8.245361790697425e-06,
      "loss": 0.4639,
      "step": 210
    },
    {
      "epoch": 1.5671641791044775,
      "eval_loss": 0.2530945837497711,
      "eval_runtime": 16.071,
      "eval_samples_per_second": 10.64,
      "eval_steps_per_second": 1.369,
      "step": 210
    },
    {
      "epoch": 1.5746268656716418,
      "grad_norm": 1.141431450843811,
      "learning_rate": 8.226533804561828e-06,
      "loss": 0.6021,
      "step": 211
    },
    {
      "epoch": 1.582089552238806,
      "grad_norm": 0.8864127397537231,
      "learning_rate": 8.207627091943178e-06,
      "loss": 0.5709,
      "step": 212
    },
    {
      "epoch": 1.5895522388059702,
      "grad_norm": 1.565999150276184,
      "learning_rate": 8.188642114159748e-06,
      "loss": 0.4649,
      "step": 213
    },
    {
      "epoch": 1.5970149253731343,
      "grad_norm": 1.0003879070281982,
      "learning_rate": 8.169579334439453e-06,
      "loss": 0.5816,
      "step": 214
    },
    {
      "epoch": 1.6044776119402986,
      "grad_norm": 1.0208603143692017,
      "learning_rate": 8.150439217908557e-06,
      "loss": 0.4815,
      "step": 215
    },
    {
      "epoch": 1.6119402985074627,
      "grad_norm": 0.6908705830574036,
      "learning_rate": 8.131222231580313e-06,
      "loss": 0.3388,
      "step": 216
    },
    {
      "epoch": 1.6194029850746268,
      "grad_norm": 1.2532159090042114,
      "learning_rate": 8.11192884434358e-06,
      "loss": 0.6774,
      "step": 217
    },
    {
      "epoch": 1.626865671641791,
      "grad_norm": 0.8950490951538086,
      "learning_rate": 8.092559526951374e-06,
      "loss": 0.6011,
      "step": 218
    },
    {
      "epoch": 1.6343283582089554,
      "grad_norm": 1.6891182661056519,
      "learning_rate": 8.073114752009388e-06,
      "loss": 0.5921,
      "step": 219
    },
    {
      "epoch": 1.6417910447761193,
      "grad_norm": 1.0266426801681519,
      "learning_rate": 8.053594993964453e-06,
      "loss": 0.6355,
      "step": 220
    },
    {
      "epoch": 1.6417910447761193,
      "eval_loss": 0.25234121084213257,
      "eval_runtime": 16.2031,
      "eval_samples_per_second": 10.554,
      "eval_steps_per_second": 1.358,
      "step": 220
    },
    {
      "epoch": 1.6492537313432836,
      "grad_norm": 0.7774348258972168,
      "learning_rate": 8.034000729092967e-06,
      "loss": 0.4213,
      "step": 221
    },
    {
      "epoch": 1.6567164179104479,
      "grad_norm": 0.9359886646270752,
      "learning_rate": 8.014332435489276e-06,
      "loss": 0.4473,
      "step": 222
    },
    {
      "epoch": 1.664179104477612,
      "grad_norm": 0.8109466433525085,
      "learning_rate": 7.994590593054001e-06,
      "loss": 0.4733,
      "step": 223
    },
    {
      "epoch": 1.671641791044776,
      "grad_norm": 0.7832448482513428,
      "learning_rate": 7.974775683482337e-06,
      "loss": 0.3198,
      "step": 224
    },
    {
      "epoch": 1.6791044776119404,
      "grad_norm": 1.1789060831069946,
      "learning_rate": 7.954888190252292e-06,
      "loss": 0.5846,
      "step": 225
    },
    {
      "epoch": 1.6865671641791045,
      "grad_norm": 0.9324489831924438,
      "learning_rate": 7.934928598612896e-06,
      "loss": 0.3998,
      "step": 226
    },
    {
      "epoch": 1.6940298507462686,
      "grad_norm": 1.1701297760009766,
      "learning_rate": 7.914897395572362e-06,
      "loss": 0.6273,
      "step": 227
    },
    {
      "epoch": 1.7014925373134329,
      "grad_norm": 0.9922040700912476,
      "learning_rate": 7.894795069886192e-06,
      "loss": 0.5599,
      "step": 228
    },
    {
      "epoch": 1.7089552238805972,
      "grad_norm": 0.8753315806388855,
      "learning_rate": 7.874622112045269e-06,
      "loss": 0.4123,
      "step": 229
    },
    {
      "epoch": 1.716417910447761,
      "grad_norm": 1.1100213527679443,
      "learning_rate": 7.854379014263877e-06,
      "loss": 0.3805,
      "step": 230
    },
    {
      "epoch": 1.716417910447761,
      "eval_loss": 0.2517262399196625,
      "eval_runtime": 15.9604,
      "eval_samples_per_second": 10.714,
      "eval_steps_per_second": 1.378,
      "step": 230
    },
    {
      "epoch": 1.7238805970149254,
      "grad_norm": 0.9963248372077942,
      "learning_rate": 7.83406627046769e-06,
      "loss": 0.2903,
      "step": 231
    },
    {
      "epoch": 1.7313432835820897,
      "grad_norm": 0.7608401775360107,
      "learning_rate": 7.81368437628173e-06,
      "loss": 0.4172,
      "step": 232
    },
    {
      "epoch": 1.7388059701492538,
      "grad_norm": 1.166113018989563,
      "learning_rate": 7.793233829018263e-06,
      "loss": 0.609,
      "step": 233
    },
    {
      "epoch": 1.7462686567164178,
      "grad_norm": 0.7339043617248535,
      "learning_rate": 7.772715127664676e-06,
      "loss": 0.3509,
      "step": 234
    },
    {
      "epoch": 1.7537313432835822,
      "grad_norm": 1.2148951292037964,
      "learning_rate": 7.752128772871292e-06,
      "loss": 0.4682,
      "step": 235
    },
    {
      "epoch": 1.7611940298507462,
      "grad_norm": 2.158393383026123,
      "learning_rate": 7.731475266939159e-06,
      "loss": 0.4768,
      "step": 236
    },
    {
      "epoch": 1.7686567164179103,
      "grad_norm": 1.624114990234375,
      "learning_rate": 7.710755113807793e-06,
      "loss": 0.5563,
      "step": 237
    },
    {
      "epoch": 1.7761194029850746,
      "grad_norm": 0.6288245916366577,
      "learning_rate": 7.689968819042884e-06,
      "loss": 0.2496,
      "step": 238
    },
    {
      "epoch": 1.783582089552239,
      "grad_norm": 0.6780006885528564,
      "learning_rate": 7.669116889823955e-06,
      "loss": 0.243,
      "step": 239
    },
    {
      "epoch": 1.7910447761194028,
      "grad_norm": 1.302188515663147,
      "learning_rate": 7.648199834931994e-06,
      "loss": 0.5689,
      "step": 240
    },
    {
      "epoch": 1.7910447761194028,
      "eval_loss": 0.25215157866477966,
      "eval_runtime": 16.1686,
      "eval_samples_per_second": 10.576,
      "eval_steps_per_second": 1.361,
      "step": 240
    },
    {
      "epoch": 1.7985074626865671,
      "grad_norm": 0.9067533016204834,
      "learning_rate": 7.627218164737031e-06,
      "loss": 0.4587,
      "step": 241
    },
    {
      "epoch": 1.8059701492537314,
      "grad_norm": 0.7963538765907288,
      "learning_rate": 7.6061723911857e-06,
      "loss": 0.4912,
      "step": 242
    },
    {
      "epoch": 1.8134328358208955,
      "grad_norm": 0.8649603724479675,
      "learning_rate": 7.58506302778873e-06,
      "loss": 0.3812,
      "step": 243
    },
    {
      "epoch": 1.8208955223880596,
      "grad_norm": 0.5839230418205261,
      "learning_rate": 7.563890589608427e-06,
      "loss": 0.341,
      "step": 244
    },
    {
      "epoch": 1.828358208955224,
      "grad_norm": 0.8967713117599487,
      "learning_rate": 7.542655593246103e-06,
      "loss": 0.5506,
      "step": 245
    },
    {
      "epoch": 1.835820895522388,
      "grad_norm": 1.0193324089050293,
      "learning_rate": 7.52135855682947e-06,
      "loss": 0.7101,
      "step": 246
    },
    {
      "epoch": 1.8432835820895521,
      "grad_norm": 1.0445090532302856,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.4873,
      "step": 247
    },
    {
      "epoch": 1.8507462686567164,
      "grad_norm": 1.0798559188842773,
      "learning_rate": 7.478580443900247e-06,
      "loss": 0.6465,
      "step": 248
    },
    {
      "epoch": 1.8582089552238807,
      "grad_norm": 1.5234549045562744,
      "learning_rate": 7.457100411161128e-06,
      "loss": 0.6318,
      "step": 249
    },
    {
      "epoch": 1.8656716417910446,
      "grad_norm": 1.1321910619735718,
      "learning_rate": 7.435560425889169e-06,
      "loss": 0.5929,
      "step": 250
    },
    {
      "epoch": 1.8656716417910446,
      "eval_loss": 0.25326502323150635,
      "eval_runtime": 15.7124,
      "eval_samples_per_second": 10.883,
      "eval_steps_per_second": 1.4,
      "step": 250
    },
    {
      "epoch": 1.873134328358209,
      "grad_norm": 0.8814710974693298,
      "learning_rate": 7.413961013653725e-06,
      "loss": 0.413,
      "step": 251
    },
    {
      "epoch": 1.8805970149253732,
      "grad_norm": 0.782132625579834,
      "learning_rate": 7.392302701474151e-06,
      "loss": 0.4623,
      "step": 252
    },
    {
      "epoch": 1.8880597014925373,
      "grad_norm": 1.0612297058105469,
      "learning_rate": 7.370586017806942e-06,
      "loss": 0.6307,
      "step": 253
    },
    {
      "epoch": 1.8955223880597014,
      "grad_norm": 0.7790008783340454,
      "learning_rate": 7.34881149253284e-06,
      "loss": 0.4426,
      "step": 254
    },
    {
      "epoch": 1.9029850746268657,
      "grad_norm": 0.8862120509147644,
      "learning_rate": 7.326979656943907e-06,
      "loss": 0.56,
      "step": 255
    },
    {
      "epoch": 1.9104477611940298,
      "grad_norm": 0.6480329632759094,
      "learning_rate": 7.305091043730558e-06,
      "loss": 0.3719,
      "step": 256
    },
    {
      "epoch": 1.917910447761194,
      "grad_norm": 1.0156606435775757,
      "learning_rate": 7.283146186968566e-06,
      "loss": 0.4542,
      "step": 257
    },
    {
      "epoch": 1.9253731343283582,
      "grad_norm": 0.8710734248161316,
      "learning_rate": 7.261145622106033e-06,
      "loss": 0.4221,
      "step": 258
    },
    {
      "epoch": 1.9328358208955225,
      "grad_norm": 0.9503659009933472,
      "learning_rate": 7.239089885950317e-06,
      "loss": 0.474,
      "step": 259
    },
    {
      "epoch": 1.9402985074626866,
      "grad_norm": 0.9117936491966248,
      "learning_rate": 7.216979516654944e-06,
      "loss": 0.5434,
      "step": 260
    },
    {
      "epoch": 1.9402985074626866,
      "eval_loss": 0.25116848945617676,
      "eval_runtime": 15.9835,
      "eval_samples_per_second": 10.699,
      "eval_steps_per_second": 1.376,
      "step": 260
    },
    {
      "epoch": 1.9477611940298507,
      "grad_norm": 0.9242149591445923,
      "learning_rate": 7.194815053706471e-06,
      "loss": 0.4634,
      "step": 261
    },
    {
      "epoch": 1.955223880597015,
      "grad_norm": 1.1205576658248901,
      "learning_rate": 7.172597037911323e-06,
      "loss": 0.5393,
      "step": 262
    },
    {
      "epoch": 1.962686567164179,
      "grad_norm": 0.9420229196548462,
      "learning_rate": 7.1503260113826035e-06,
      "loss": 0.5668,
      "step": 263
    },
    {
      "epoch": 1.9701492537313432,
      "grad_norm": 0.8083785772323608,
      "learning_rate": 7.128002517526856e-06,
      "loss": 0.3753,
      "step": 264
    },
    {
      "epoch": 1.9776119402985075,
      "grad_norm": 0.8978068232536316,
      "learning_rate": 7.105627101030816e-06,
      "loss": 0.5198,
      "step": 265
    },
    {
      "epoch": 1.9850746268656716,
      "grad_norm": 0.7519446611404419,
      "learning_rate": 7.083200307848116e-06,
      "loss": 0.3855,
      "step": 266
    },
    {
      "epoch": 1.9925373134328357,
      "grad_norm": 0.9396398663520813,
      "learning_rate": 7.060722685185961e-06,
      "loss": 0.4073,
      "step": 267
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.6269404888153076,
      "learning_rate": 7.038194781491785e-06,
      "loss": 0.3679,
      "step": 268
    },
    {
      "epoch": 2.0074626865671643,
      "grad_norm": 0.6594401001930237,
      "learning_rate": 7.015617146439863e-06,
      "loss": 0.2737,
      "step": 269
    },
    {
      "epoch": 2.014925373134328,
      "grad_norm": 1.2462356090545654,
      "learning_rate": 6.992990330917897e-06,
      "loss": 0.4577,
      "step": 270
    },
    {
      "epoch": 2.014925373134328,
      "eval_loss": 0.2520785927772522,
      "eval_runtime": 16.235,
      "eval_samples_per_second": 10.533,
      "eval_steps_per_second": 1.355,
      "step": 270
    },
    {
      "epoch": 2.0223880597014925,
      "grad_norm": 0.8819834589958191,
      "learning_rate": 6.970314887013585e-06,
      "loss": 0.3055,
      "step": 271
    },
    {
      "epoch": 2.029850746268657,
      "grad_norm": 0.7270020842552185,
      "learning_rate": 6.947591368001138e-06,
      "loss": 0.3549,
      "step": 272
    },
    {
      "epoch": 2.0373134328358207,
      "grad_norm": 0.5658406019210815,
      "learning_rate": 6.924820328327786e-06,
      "loss": 0.1953,
      "step": 273
    },
    {
      "epoch": 2.044776119402985,
      "grad_norm": 1.2108463048934937,
      "learning_rate": 6.902002323600252e-06,
      "loss": 0.4244,
      "step": 274
    },
    {
      "epoch": 2.0522388059701493,
      "grad_norm": 1.1563349962234497,
      "learning_rate": 6.879137910571191e-06,
      "loss": 0.5069,
      "step": 275
    },
    {
      "epoch": 2.0597014925373136,
      "grad_norm": 1.2851577997207642,
      "learning_rate": 6.856227647125607e-06,
      "loss": 0.2914,
      "step": 276
    },
    {
      "epoch": 2.0671641791044775,
      "grad_norm": 1.4853419065475464,
      "learning_rate": 6.833272092267242e-06,
      "loss": 0.4562,
      "step": 277
    },
    {
      "epoch": 2.074626865671642,
      "grad_norm": 0.7328011393547058,
      "learning_rate": 6.810271806104931e-06,
      "loss": 0.4034,
      "step": 278
    },
    {
      "epoch": 2.082089552238806,
      "grad_norm": 1.4539604187011719,
      "learning_rate": 6.787227349838946e-06,
      "loss": 0.4689,
      "step": 279
    },
    {
      "epoch": 2.08955223880597,
      "grad_norm": 0.8328655958175659,
      "learning_rate": 6.764139285747292e-06,
      "loss": 0.3586,
      "step": 280
    },
    {
      "epoch": 2.08955223880597,
      "eval_loss": 0.26068779826164246,
      "eval_runtime": 15.7338,
      "eval_samples_per_second": 10.868,
      "eval_steps_per_second": 1.398,
      "step": 280
    },
    {
      "epoch": 2.0970149253731343,
      "grad_norm": 0.7921857833862305,
      "learning_rate": 6.741008177171995e-06,
      "loss": 0.3717,
      "step": 281
    },
    {
      "epoch": 2.1044776119402986,
      "grad_norm": 0.9007959365844727,
      "learning_rate": 6.71783458850535e-06,
      "loss": 0.4912,
      "step": 282
    },
    {
      "epoch": 2.111940298507463,
      "grad_norm": 1.3404608964920044,
      "learning_rate": 6.694619085176159e-06,
      "loss": 0.391,
      "step": 283
    },
    {
      "epoch": 2.1194029850746268,
      "grad_norm": 1.371519923210144,
      "learning_rate": 6.671362233635926e-06,
      "loss": 0.3481,
      "step": 284
    },
    {
      "epoch": 2.126865671641791,
      "grad_norm": 0.8897073864936829,
      "learning_rate": 6.64806460134504e-06,
      "loss": 0.4761,
      "step": 285
    },
    {
      "epoch": 2.1343283582089554,
      "grad_norm": 0.9109272360801697,
      "learning_rate": 6.624726756758928e-06,
      "loss": 0.4339,
      "step": 286
    },
    {
      "epoch": 2.1417910447761193,
      "grad_norm": 1.2197555303573608,
      "learning_rate": 6.601349269314188e-06,
      "loss": 0.3753,
      "step": 287
    },
    {
      "epoch": 2.1492537313432836,
      "grad_norm": 0.5606504082679749,
      "learning_rate": 6.57793270941469e-06,
      "loss": 0.2435,
      "step": 288
    },
    {
      "epoch": 2.156716417910448,
      "grad_norm": 1.435612678527832,
      "learning_rate": 6.554477648417657e-06,
      "loss": 0.5838,
      "step": 289
    },
    {
      "epoch": 2.1641791044776117,
      "grad_norm": 1.0434784889221191,
      "learning_rate": 6.530984658619735e-06,
      "loss": 0.4453,
      "step": 290
    },
    {
      "epoch": 2.1641791044776117,
      "eval_loss": 0.26290109753608704,
      "eval_runtime": 16.2782,
      "eval_samples_per_second": 10.505,
      "eval_steps_per_second": 1.352,
      "step": 290
    },
    {
      "epoch": 2.171641791044776,
      "grad_norm": 1.0325040817260742,
      "learning_rate": 6.507454313243016e-06,
      "loss": 0.5054,
      "step": 291
    },
    {
      "epoch": 2.1791044776119404,
      "grad_norm": 0.8378118276596069,
      "learning_rate": 6.483887186421058e-06,
      "loss": 0.3621,
      "step": 292
    },
    {
      "epoch": 2.1865671641791047,
      "grad_norm": 1.2645375728607178,
      "learning_rate": 6.46028385318488e-06,
      "loss": 0.3958,
      "step": 293
    },
    {
      "epoch": 2.1940298507462686,
      "grad_norm": 1.0846723318099976,
      "learning_rate": 6.43664488944892e-06,
      "loss": 0.3189,
      "step": 294
    },
    {
      "epoch": 2.201492537313433,
      "grad_norm": 0.8250584006309509,
      "learning_rate": 6.412970871996995e-06,
      "loss": 0.2645,
      "step": 295
    },
    {
      "epoch": 2.208955223880597,
      "grad_norm": 1.2599821090698242,
      "learning_rate": 6.389262378468219e-06,
      "loss": 0.529,
      "step": 296
    },
    {
      "epoch": 2.216417910447761,
      "grad_norm": 0.8289515376091003,
      "learning_rate": 6.365519987342916e-06,
      "loss": 0.3789,
      "step": 297
    },
    {
      "epoch": 2.2238805970149254,
      "grad_norm": 0.641613245010376,
      "learning_rate": 6.3417442779285e-06,
      "loss": 0.2762,
      "step": 298
    },
    {
      "epoch": 2.2313432835820897,
      "grad_norm": 1.0464789867401123,
      "learning_rate": 6.3179358303453386e-06,
      "loss": 0.4239,
      "step": 299
    },
    {
      "epoch": 2.2388059701492535,
      "grad_norm": 1.1553277969360352,
      "learning_rate": 6.294095225512604e-06,
      "loss": 0.5326,
      "step": 300
    },
    {
      "epoch": 2.2388059701492535,
      "eval_loss": 0.2604566514492035,
      "eval_runtime": 16.1917,
      "eval_samples_per_second": 10.561,
      "eval_steps_per_second": 1.359,
      "step": 300
    },
    {
      "epoch": 2.246268656716418,
      "grad_norm": 0.7388222217559814,
      "learning_rate": 6.2702230451340965e-06,
      "loss": 0.3279,
      "step": 301
    },
    {
      "epoch": 2.253731343283582,
      "grad_norm": 0.7281888723373413,
      "learning_rate": 6.246319871684048e-06,
      "loss": 0.337,
      "step": 302
    },
    {
      "epoch": 2.2611940298507465,
      "grad_norm": 0.6409085392951965,
      "learning_rate": 6.222386288392914e-06,
      "loss": 0.2503,
      "step": 303
    },
    {
      "epoch": 2.2686567164179103,
      "grad_norm": 0.7434332370758057,
      "learning_rate": 6.198422879233141e-06,
      "loss": 0.2295,
      "step": 304
    },
    {
      "epoch": 2.2761194029850746,
      "grad_norm": 0.8402727246284485,
      "learning_rate": 6.17443022890492e-06,
      "loss": 0.4885,
      "step": 305
    },
    {
      "epoch": 2.283582089552239,
      "grad_norm": 0.8008245825767517,
      "learning_rate": 6.150408922821911e-06,
      "loss": 0.2593,
      "step": 306
    },
    {
      "epoch": 2.291044776119403,
      "grad_norm": 0.6839753985404968,
      "learning_rate": 6.126359547096975e-06,
      "loss": 0.3719,
      "step": 307
    },
    {
      "epoch": 2.298507462686567,
      "grad_norm": 0.8366461992263794,
      "learning_rate": 6.10228268852786e-06,
      "loss": 0.3398,
      "step": 308
    },
    {
      "epoch": 2.3059701492537314,
      "grad_norm": 1.5120458602905273,
      "learning_rate": 6.0781789345828854e-06,
      "loss": 0.4279,
      "step": 309
    },
    {
      "epoch": 2.3134328358208958,
      "grad_norm": 1.2554453611373901,
      "learning_rate": 6.0540488733866135e-06,
      "loss": 0.301,
      "step": 310
    },
    {
      "epoch": 2.3134328358208958,
      "eval_loss": 0.26177600026130676,
      "eval_runtime": 16.0546,
      "eval_samples_per_second": 10.651,
      "eval_steps_per_second": 1.37,
      "step": 310
    },
    {
      "epoch": 2.3208955223880596,
      "grad_norm": 0.8985257744789124,
      "learning_rate": 6.029893093705492e-06,
      "loss": 0.3436,
      "step": 311
    },
    {
      "epoch": 2.328358208955224,
      "grad_norm": 0.720777153968811,
      "learning_rate": 6.005712184933497e-06,
      "loss": 0.2736,
      "step": 312
    },
    {
      "epoch": 2.3358208955223883,
      "grad_norm": 0.8804069757461548,
      "learning_rate": 5.981506737077744e-06,
      "loss": 0.3795,
      "step": 313
    },
    {
      "epoch": 2.343283582089552,
      "grad_norm": 0.6901626586914062,
      "learning_rate": 5.957277340744094e-06,
      "loss": 0.3386,
      "step": 314
    },
    {
      "epoch": 2.3507462686567164,
      "grad_norm": 0.914481520652771,
      "learning_rate": 5.933024587122745e-06,
      "loss": 0.4116,
      "step": 315
    },
    {
      "epoch": 2.3582089552238807,
      "grad_norm": 1.446872591972351,
      "learning_rate": 5.908749067973809e-06,
      "loss": 0.4929,
      "step": 316
    },
    {
      "epoch": 2.3656716417910446,
      "grad_norm": 0.7975624799728394,
      "learning_rate": 5.884451375612865e-06,
      "loss": 0.327,
      "step": 317
    },
    {
      "epoch": 2.373134328358209,
      "grad_norm": 0.6912096738815308,
      "learning_rate": 5.860132102896516e-06,
      "loss": 0.3441,
      "step": 318
    },
    {
      "epoch": 2.3805970149253732,
      "grad_norm": 0.9542496204376221,
      "learning_rate": 5.835791843207916e-06,
      "loss": 0.2949,
      "step": 319
    },
    {
      "epoch": 2.388059701492537,
      "grad_norm": 0.7698549628257751,
      "learning_rate": 5.8114311904423e-06,
      "loss": 0.4541,
      "step": 320
    },
    {
      "epoch": 2.388059701492537,
      "eval_loss": 0.26203152537345886,
      "eval_runtime": 15.9454,
      "eval_samples_per_second": 10.724,
      "eval_steps_per_second": 1.38,
      "step": 320
    },
{ |
|
"epoch": 2.3955223880597014, |
|
"grad_norm": 1.0755155086517334, |
|
"learning_rate": 5.787050738992483e-06, |
|
"loss": 0.3032, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.4029850746268657, |
|
"grad_norm": 1.0089167356491089, |
|
"learning_rate": 5.762651083734363e-06, |
|
"loss": 0.5431, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.41044776119403, |
|
"grad_norm": 0.7049623131752014, |
|
"learning_rate": 5.738232820012407e-06, |
|
"loss": 0.4098, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.417910447761194, |
|
"grad_norm": 0.8850160837173462, |
|
"learning_rate": 5.713796543625123e-06, |
|
"loss": 0.3144, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.425373134328358, |
|
"grad_norm": 0.9407287836074829, |
|
"learning_rate": 5.689342850810523e-06, |
|
"loss": 0.4177, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.4328358208955225, |
|
"grad_norm": 0.7530587911605835, |
|
"learning_rate": 5.664872338231572e-06, |
|
"loss": 0.4969, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.4402985074626864, |
|
"grad_norm": 0.7545952200889587, |
|
"learning_rate": 5.640385602961634e-06, |
|
"loss": 0.3682, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.4477611940298507, |
|
"grad_norm": 0.5804095268249512, |
|
"learning_rate": 5.615883242469906e-06, |
|
"loss": 0.1917, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.455223880597015, |
|
"grad_norm": 0.7050689458847046, |
|
"learning_rate": 5.591365854606829e-06, |
|
"loss": 0.2552, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.4626865671641793, |
|
"grad_norm": 1.56814444065094, |
|
"learning_rate": 5.5668340375895116e-06, |
|
"loss": 0.5329, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.4626865671641793, |
|
"eval_loss": 0.26006871461868286, |
|
"eval_runtime": 15.6791, |
|
"eval_samples_per_second": 10.906, |
|
"eval_steps_per_second": 1.403, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.470149253731343, |
|
"grad_norm": 0.9624131917953491, |
|
"learning_rate": 5.542288389987128e-06, |
|
"loss": 0.4644, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.4776119402985075, |
|
"grad_norm": 1.158132553100586, |
|
"learning_rate": 5.517729510706316e-06, |
|
"loss": 0.3623, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.485074626865672, |
|
"grad_norm": 0.9126956462860107, |
|
"learning_rate": 5.493157998976559e-06, |
|
"loss": 0.4116, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.4925373134328357, |
|
"grad_norm": 0.9858826994895935, |
|
"learning_rate": 5.4685744543355745e-06, |
|
"loss": 0.3593, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.7516330480575562, |
|
"learning_rate": 5.443979476614674e-06, |
|
"loss": 0.354, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.5074626865671643, |
|
"grad_norm": 1.040735125541687, |
|
"learning_rate": 5.419373665924137e-06, |
|
"loss": 0.5071, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.5149253731343286, |
|
"grad_norm": 0.7374666929244995, |
|
"learning_rate": 5.39475762263856e-06, |
|
"loss": 0.2379, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.5223880597014925, |
|
"grad_norm": 0.8818290829658508, |
|
"learning_rate": 5.370131947382215e-06, |
|
"loss": 0.4322, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.529850746268657, |
|
"grad_norm": 0.8839866518974304, |
|
"learning_rate": 5.34549724101439e-06, |
|
"loss": 0.3252, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.5373134328358207, |
|
"grad_norm": 0.651013195514679, |
|
"learning_rate": 5.320854104614731e-06, |
|
"loss": 0.3228, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.5373134328358207, |
|
"eval_loss": 0.2595307230949402, |
|
"eval_runtime": 16.0858, |
|
"eval_samples_per_second": 10.631, |
|
"eval_steps_per_second": 1.368, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.544776119402985, |
|
"grad_norm": 0.9826297760009766, |
|
"learning_rate": 5.296203139468572e-06, |
|
"loss": 0.3158, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.5522388059701493, |
|
"grad_norm": 0.8578550815582275, |
|
"learning_rate": 5.271544947052267e-06, |
|
"loss": 0.3244, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.5597014925373136, |
|
"grad_norm": 0.9544288516044617, |
|
"learning_rate": 5.246880129018515e-06, |
|
"loss": 0.2803, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.5671641791044775, |
|
"grad_norm": 0.8803668022155762, |
|
"learning_rate": 5.222209287181677e-06, |
|
"loss": 0.4387, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.574626865671642, |
|
"grad_norm": 1.0005831718444824, |
|
"learning_rate": 5.19753302350309e-06, |
|
"loss": 0.5407, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.582089552238806, |
|
"grad_norm": 1.123563528060913, |
|
"learning_rate": 5.172851940076388e-06, |
|
"loss": 0.3194, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.58955223880597, |
|
"grad_norm": 0.7774394750595093, |
|
"learning_rate": 5.148166639112799e-06, |
|
"loss": 0.3315, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.5970149253731343, |
|
"grad_norm": 0.7469123601913452, |
|
"learning_rate": 5.123477722926464e-06, |
|
"loss": 0.1694, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.6044776119402986, |
|
"grad_norm": 0.7881268262863159, |
|
"learning_rate": 5.098785793919733e-06, |
|
"loss": 0.3663, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.611940298507463, |
|
"grad_norm": 0.8902544975280762, |
|
"learning_rate": 5.074091454568464e-06, |
|
"loss": 0.3029, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.611940298507463, |
|
"eval_loss": 0.25970542430877686, |
|
"eval_runtime": 16.1426, |
|
"eval_samples_per_second": 10.593, |
|
"eval_steps_per_second": 1.363, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.6194029850746268, |
|
"grad_norm": 0.8880248069763184, |
|
"learning_rate": 5.049395307407329e-06, |
|
"loss": 0.5019, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.626865671641791, |
|
"grad_norm": 0.8329271078109741, |
|
"learning_rate": 5.024697955015112e-06, |
|
"loss": 0.3595, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.6343283582089554, |
|
"grad_norm": 1.1972229480743408, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3894, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.6417910447761193, |
|
"grad_norm": 0.9930683970451355, |
|
"learning_rate": 4.975302044984889e-06, |
|
"loss": 0.326, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.6492537313432836, |
|
"grad_norm": 1.0038172006607056, |
|
"learning_rate": 4.9506046925926725e-06, |
|
"loss": 0.354, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.656716417910448, |
|
"grad_norm": 0.7919688820838928, |
|
"learning_rate": 4.925908545431537e-06, |
|
"loss": 0.418, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.664179104477612, |
|
"grad_norm": 0.8915368914604187, |
|
"learning_rate": 4.901214206080269e-06, |
|
"loss": 0.5617, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.671641791044776, |
|
"grad_norm": 0.8227723240852356, |
|
"learning_rate": 4.876522277073535e-06, |
|
"loss": 0.3792, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.6791044776119404, |
|
"grad_norm": 0.8071901202201843, |
|
"learning_rate": 4.8518333608872015e-06, |
|
"loss": 0.3388, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.6865671641791042, |
|
"grad_norm": 0.5898212790489197, |
|
"learning_rate": 4.827148059923614e-06, |
|
"loss": 0.2633, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.6865671641791042, |
|
"eval_loss": 0.25960928201675415, |
|
"eval_runtime": 15.6478, |
|
"eval_samples_per_second": 10.928, |
|
"eval_steps_per_second": 1.406, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.6940298507462686, |
|
"grad_norm": 0.8067932724952698, |
|
"learning_rate": 4.802466976496911e-06, |
|
"loss": 0.2719, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.701492537313433, |
|
"grad_norm": 0.7619811296463013, |
|
"learning_rate": 4.777790712818324e-06, |
|
"loss": 0.4567, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.708955223880597, |
|
"grad_norm": 0.7938811779022217, |
|
"learning_rate": 4.753119870981486e-06, |
|
"loss": 0.375, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.716417910447761, |
|
"grad_norm": 0.8780052065849304, |
|
"learning_rate": 4.728455052947732e-06, |
|
"loss": 0.2885, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.7238805970149254, |
|
"grad_norm": 0.8700647950172424, |
|
"learning_rate": 4.703796860531429e-06, |
|
"loss": 0.3558, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.7313432835820897, |
|
"grad_norm": 0.9367011785507202, |
|
"learning_rate": 4.6791458953852695e-06, |
|
"loss": 0.4803, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.7388059701492535, |
|
"grad_norm": 0.9958668947219849, |
|
"learning_rate": 4.654502758985611e-06, |
|
"loss": 0.3288, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.746268656716418, |
|
"grad_norm": 0.8954978585243225, |
|
"learning_rate": 4.629868052617786e-06, |
|
"loss": 0.4357, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.753731343283582, |
|
"grad_norm": 0.7825214862823486, |
|
"learning_rate": 4.605242377361441e-06, |
|
"loss": 0.3466, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.7611940298507465, |
|
"grad_norm": 1.1193434000015259, |
|
"learning_rate": 4.580626334075864e-06, |
|
"loss": 0.5169, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.7611940298507465, |
|
"eval_loss": 0.26192060112953186, |
|
"eval_runtime": 15.6759, |
|
"eval_samples_per_second": 10.908, |
|
"eval_steps_per_second": 1.403, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.7686567164179103, |
|
"grad_norm": 0.9331968426704407, |
|
"learning_rate": 4.556020523385326e-06, |
|
"loss": 0.4555, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.7761194029850746, |
|
"grad_norm": 0.9450701475143433, |
|
"learning_rate": 4.5314255456644255e-06, |
|
"loss": 0.4791, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.783582089552239, |
|
"grad_norm": 0.9654837846755981, |
|
"learning_rate": 4.506842001023442e-06, |
|
"loss": 0.338, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.791044776119403, |
|
"grad_norm": 0.7904189825057983, |
|
"learning_rate": 4.482270489293685e-06, |
|
"loss": 0.3524, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.798507462686567, |
|
"grad_norm": 0.8327943682670593, |
|
"learning_rate": 4.457711610012873e-06, |
|
"loss": 0.396, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.8059701492537314, |
|
"grad_norm": 0.7460355758666992, |
|
"learning_rate": 4.433165962410488e-06, |
|
"loss": 0.4905, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.8134328358208958, |
|
"grad_norm": 0.7441300749778748, |
|
"learning_rate": 4.408634145393172e-06, |
|
"loss": 0.2541, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.8208955223880596, |
|
"grad_norm": 0.8444567918777466, |
|
"learning_rate": 4.384116757530094e-06, |
|
"loss": 0.2268, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.828358208955224, |
|
"grad_norm": 0.9554976224899292, |
|
"learning_rate": 4.3596143970383665e-06, |
|
"loss": 0.1891, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.835820895522388, |
|
"grad_norm": 0.915585458278656, |
|
"learning_rate": 4.335127661768429e-06, |
|
"loss": 0.4994, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.835820895522388, |
|
"eval_loss": 0.26057010889053345, |
|
"eval_runtime": 16.0182, |
|
"eval_samples_per_second": 10.675, |
|
"eval_steps_per_second": 1.373, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.843283582089552, |
|
"grad_norm": 0.8522178530693054, |
|
"learning_rate": 4.310657149189479e-06, |
|
"loss": 0.3007, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.8507462686567164, |
|
"grad_norm": 0.8770399689674377, |
|
"learning_rate": 4.286203456374877e-06, |
|
"loss": 0.4177, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.8582089552238807, |
|
"grad_norm": 0.8985636234283447, |
|
"learning_rate": 4.261767179987595e-06, |
|
"loss": 0.495, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.8656716417910446, |
|
"grad_norm": 0.7080250978469849, |
|
"learning_rate": 4.237348916265638e-06, |
|
"loss": 0.2103, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.873134328358209, |
|
"grad_norm": 0.8011423945426941, |
|
"learning_rate": 4.212949261007519e-06, |
|
"loss": 0.3361, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.8805970149253732, |
|
"grad_norm": 0.5794147849082947, |
|
"learning_rate": 4.188568809557701e-06, |
|
"loss": 0.2513, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.888059701492537, |
|
"grad_norm": 1.1606743335723877, |
|
"learning_rate": 4.1642081567920845e-06, |
|
"loss": 0.5069, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.8955223880597014, |
|
"grad_norm": 0.6556090116500854, |
|
"learning_rate": 4.139867897103484e-06, |
|
"loss": 0.3488, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.9029850746268657, |
|
"grad_norm": 0.8688616156578064, |
|
"learning_rate": 4.115548624387136e-06, |
|
"loss": 0.3247, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.91044776119403, |
|
"grad_norm": 0.8440492749214172, |
|
"learning_rate": 4.0912509320261915e-06, |
|
"loss": 0.483, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.91044776119403, |
|
"eval_loss": 0.2596059739589691, |
|
"eval_runtime": 16.0245, |
|
"eval_samples_per_second": 10.671, |
|
"eval_steps_per_second": 1.373, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.917910447761194, |
|
"grad_norm": 0.6580593585968018, |
|
"learning_rate": 4.0669754128772554e-06, |
|
"loss": 0.235, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.925373134328358, |
|
"grad_norm": 0.7615340948104858, |
|
"learning_rate": 4.042722659255907e-06, |
|
"loss": 0.4621, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.9328358208955225, |
|
"grad_norm": 1.1443612575531006, |
|
"learning_rate": 4.018493262922258e-06, |
|
"loss": 0.4012, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.9402985074626864, |
|
"grad_norm": 0.6693568229675293, |
|
"learning_rate": 3.9942878150665035e-06, |
|
"loss": 0.3198, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.9477611940298507, |
|
"grad_norm": 1.4814910888671875, |
|
"learning_rate": 3.970106906294509e-06, |
|
"loss": 0.5098, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.955223880597015, |
|
"grad_norm": 0.9094993472099304, |
|
"learning_rate": 3.945951126613387e-06, |
|
"loss": 0.4186, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.9626865671641793, |
|
"grad_norm": 0.9584222435951233, |
|
"learning_rate": 3.921821065417116e-06, |
|
"loss": 0.5191, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.970149253731343, |
|
"grad_norm": 0.7663047909736633, |
|
"learning_rate": 3.897717311472141e-06, |
|
"loss": 0.4433, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.9776119402985075, |
|
"grad_norm": 1.081271767616272, |
|
"learning_rate": 3.8736404529030255e-06, |
|
"loss": 0.4271, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.9850746268656714, |
|
"grad_norm": 1.1289445161819458, |
|
"learning_rate": 3.84959107717809e-06, |
|
"loss": 0.3253, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.9850746268656714, |
|
"eval_loss": 0.26036304235458374, |
|
"eval_runtime": 15.6729, |
|
"eval_samples_per_second": 10.911, |
|
"eval_steps_per_second": 1.404, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.9925373134328357, |
|
"grad_norm": 1.1338776350021362, |
|
"learning_rate": 3.825569771095082e-06, |
|
"loss": 0.4307, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.5428889989852905, |
|
"learning_rate": 3.8015771207668593e-06, |
|
"loss": 0.2129, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 3.0074626865671643, |
|
"grad_norm": 0.9923998713493347, |
|
"learning_rate": 3.777613711607087e-06, |
|
"loss": 0.4193, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 3.014925373134328, |
|
"grad_norm": 0.9020700454711914, |
|
"learning_rate": 3.7536801283159523e-06, |
|
"loss": 0.3515, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 3.0223880597014925, |
|
"grad_norm": 0.8282939791679382, |
|
"learning_rate": 3.729776954865905e-06, |
|
"loss": 0.2543, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.029850746268657, |
|
"grad_norm": 1.066192388534546, |
|
"learning_rate": 3.705904774487396e-06, |
|
"loss": 0.335, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 3.0373134328358207, |
|
"grad_norm": 0.9270622730255127, |
|
"learning_rate": 3.682064169654663e-06, |
|
"loss": 0.1641, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 3.044776119402985, |
|
"grad_norm": 0.8690205812454224, |
|
"learning_rate": 3.6582557220715005e-06, |
|
"loss": 0.2455, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 3.0522388059701493, |
|
"grad_norm": 0.7719996571540833, |
|
"learning_rate": 3.6344800126570846e-06, |
|
"loss": 0.2562, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 3.0597014925373136, |
|
"grad_norm": 1.1272475719451904, |
|
"learning_rate": 3.6107376215317813e-06, |
|
"loss": 0.3581, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.0597014925373136, |
|
"eval_loss": 0.27242982387542725, |
|
"eval_runtime": 16.0973, |
|
"eval_samples_per_second": 10.623, |
|
"eval_steps_per_second": 1.367, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.0671641791044775, |
|
"grad_norm": 1.175228238105774, |
|
"learning_rate": 3.587029128003006e-06, |
|
"loss": 0.6137, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 3.074626865671642, |
|
"grad_norm": 0.7905718088150024, |
|
"learning_rate": 3.563355110551081e-06, |
|
"loss": 0.2536, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 3.082089552238806, |
|
"grad_norm": 0.6532904505729675, |
|
"learning_rate": 3.539716146815122e-06, |
|
"loss": 0.2298, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 3.08955223880597, |
|
"grad_norm": 0.8333055377006531, |
|
"learning_rate": 3.5161128135789414e-06, |
|
"loss": 0.2019, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 3.0970149253731343, |
|
"grad_norm": 0.7311452031135559, |
|
"learning_rate": 3.492545686756986e-06, |
|
"loss": 0.2557, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.1044776119402986, |
|
"grad_norm": 0.6547526121139526, |
|
"learning_rate": 3.469015341380266e-06, |
|
"loss": 0.2032, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 3.111940298507463, |
|
"grad_norm": 0.9133289456367493, |
|
"learning_rate": 3.4455223515823442e-06, |
|
"loss": 0.3015, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 3.1194029850746268, |
|
"grad_norm": 0.8621042370796204, |
|
"learning_rate": 3.4220672905853116e-06, |
|
"loss": 0.3477, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 3.126865671641791, |
|
"grad_norm": 0.709885835647583, |
|
"learning_rate": 3.398650730685813e-06, |
|
"loss": 0.3672, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 3.1343283582089554, |
|
"grad_norm": 0.9335821866989136, |
|
"learning_rate": 3.3752732432410716e-06, |
|
"loss": 0.4279, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.1343283582089554, |
|
"eval_loss": 0.2736593186855316, |
|
"eval_runtime": 15.7175, |
|
"eval_samples_per_second": 10.88, |
|
"eval_steps_per_second": 1.4, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.1417910447761193, |
|
"grad_norm": 0.8153016567230225, |
|
"learning_rate": 3.351935398654961e-06, |
|
"loss": 0.5447, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 3.1492537313432836, |
|
"grad_norm": 0.7281360626220703, |
|
"learning_rate": 3.3286377663640753e-06, |
|
"loss": 0.1008, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 3.156716417910448, |
|
"grad_norm": 1.1841518878936768, |
|
"learning_rate": 3.3053809148238426e-06, |
|
"loss": 0.4311, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 3.1641791044776117, |
|
"grad_norm": 0.9218641519546509, |
|
"learning_rate": 3.28216541149465e-06, |
|
"loss": 0.1485, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 3.171641791044776, |
|
"grad_norm": 0.7973741292953491, |
|
"learning_rate": 3.258991822828007e-06, |
|
"loss": 0.2468, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.1791044776119404, |
|
"grad_norm": 1.0541926622390747, |
|
"learning_rate": 3.2358607142527083e-06, |
|
"loss": 0.2994, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 3.1865671641791047, |
|
"grad_norm": 0.6120500564575195, |
|
"learning_rate": 3.2127726501610558e-06, |
|
"loss": 0.1962, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 3.1940298507462686, |
|
"grad_norm": 0.7375171184539795, |
|
"learning_rate": 3.1897281938950693e-06, |
|
"loss": 0.2319, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 3.201492537313433, |
|
"grad_norm": 0.695970356464386, |
|
"learning_rate": 3.16672790773276e-06, |
|
"loss": 0.3057, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 3.208955223880597, |
|
"grad_norm": 0.6763619184494019, |
|
"learning_rate": 3.1437723528743933e-06, |
|
"loss": 0.2628, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.208955223880597, |
|
"eval_loss": 0.2731933891773224, |
|
"eval_runtime": 16.0864, |
|
"eval_samples_per_second": 10.63, |
|
"eval_steps_per_second": 1.368, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.216417910447761, |
|
"grad_norm": 0.8292093276977539, |
|
"learning_rate": 3.1208620894288105e-06, |
|
"loss": 0.2462, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 3.2238805970149254, |
|
"grad_norm": 0.8130122423171997, |
|
"learning_rate": 3.097997676399749e-06, |
|
"loss": 0.3004, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 3.2313432835820897, |
|
"grad_norm": 1.1734981536865234, |
|
"learning_rate": 3.0751796716722158e-06, |
|
"loss": 0.2067, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 3.2388059701492535, |
|
"grad_norm": 0.8040467500686646, |
|
"learning_rate": 3.0524086319988635e-06, |
|
"loss": 0.3013, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 3.246268656716418, |
|
"grad_norm": 1.2771413326263428, |
|
"learning_rate": 3.029685112986417e-06, |
|
"loss": 0.3126, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 3.253731343283582, |
|
"grad_norm": 0.9754287004470825, |
|
"learning_rate": 3.0070096690821037e-06, |
|
"loss": 0.38, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 3.2611940298507465, |
|
"grad_norm": 0.8460696935653687, |
|
"learning_rate": 2.98438285356014e-06, |
|
"loss": 0.2953, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 3.2686567164179103, |
|
"grad_norm": 0.7434819340705872, |
|
"learning_rate": 2.9618052185082158e-06, |
|
"loss": 0.2497, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 3.2761194029850746, |
|
"grad_norm": 0.7721374034881592, |
|
"learning_rate": 2.9392773148140406e-06, |
|
"loss": 0.2162, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 3.283582089552239, |
|
"grad_norm": 0.8082780838012695, |
|
"learning_rate": 2.9167996921518848e-06, |
|
"loss": 0.4641, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.283582089552239, |
|
"eval_loss": 0.2711738049983978, |
|
"eval_runtime": 16.1161, |
|
"eval_samples_per_second": 10.61, |
|
"eval_steps_per_second": 1.365, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.291044776119403, |
|
"grad_norm": 0.5867833495140076, |
|
"learning_rate": 2.894372898969186e-06, |
|
"loss": 0.2053, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 3.298507462686567, |
|
"grad_norm": 0.8754022717475891, |
|
"learning_rate": 2.8719974824731443e-06, |
|
"loss": 0.3102, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 3.3059701492537314, |
|
"grad_norm": 0.7445163726806641, |
|
"learning_rate": 2.8496739886173994e-06, |
|
"loss": 0.3551, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 3.3134328358208958, |
|
"grad_norm": 0.8017621040344238, |
|
"learning_rate": 2.8274029620886773e-06, |
|
"loss": 0.3152, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 3.3208955223880596, |
|
"grad_norm": 0.8486435413360596, |
|
"learning_rate": 2.805184946293532e-06, |
|
"loss": 0.1575, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 3.328358208955224, |
|
"grad_norm": 0.7612962126731873, |
|
"learning_rate": 2.7830204833450577e-06, |
|
"loss": 0.2389, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 3.3358208955223883, |
|
"grad_norm": 0.9732226729393005, |
|
"learning_rate": 2.7609101140496863e-06, |
|
"loss": 0.3763, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 3.343283582089552, |
|
"grad_norm": 0.6321827173233032, |
|
"learning_rate": 2.7388543778939693e-06, |
|
"loss": 0.1987, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 3.3507462686567164, |
|
"grad_norm": 1.0333235263824463, |
|
"learning_rate": 2.716853813031435e-06, |
|
"loss": 0.4943, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 3.3582089552238807, |
|
"grad_norm": 0.8021907806396484, |
|
"learning_rate": 2.6949089562694434e-06, |
|
"loss": 0.3487, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.3582089552238807, |
|
"eval_loss": 0.2735496163368225, |
|
"eval_runtime": 16.1234, |
|
"eval_samples_per_second": 10.606, |
|
"eval_steps_per_second": 1.364, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.3656716417910446, |
|
"grad_norm": 0.702425479888916, |
|
"learning_rate": 2.6730203430560946e-06, |
|
"loss": 0.2221, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 3.373134328358209, |
|
"grad_norm": 0.9559716582298279, |
|
"learning_rate": 2.651188507467161e-06, |
|
"loss": 0.4402, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 3.3805970149253732, |
|
"grad_norm": 0.8092005848884583, |
|
"learning_rate": 2.62941398219306e-06, |
|
"loss": 0.1462, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 3.388059701492537, |
|
"grad_norm": 0.985363245010376, |
|
"learning_rate": 2.60769729852585e-06, |
|
"loss": 0.3575, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 3.3955223880597014, |
|
"grad_norm": 1.0277003049850464, |
|
"learning_rate": 2.5860389863462765e-06, |
|
"loss": 0.2896, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 3.4029850746268657, |
|
"grad_norm": 0.9357238411903381, |
|
"learning_rate": 2.564439574110833e-06, |
|
"loss": 0.3942, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 3.41044776119403, |
|
"grad_norm": 0.9755576848983765, |
|
"learning_rate": 2.542899588838875e-06, |
|
"loss": 0.5156, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 3.417910447761194, |
|
"grad_norm": 1.0265827178955078, |
|
"learning_rate": 2.5214195560997546e-06, |
|
"loss": 0.2858, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 3.425373134328358, |
|
"grad_norm": 0.7597145438194275, |
|
"learning_rate": 2.5000000000000015e-06, |
|
"loss": 0.2822, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 3.4328358208955225, |
|
"grad_norm": 0.9876797199249268, |
|
"learning_rate": 2.47864144317053e-06, |
|
"loss": 0.3247, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.4328358208955225, |
|
"eval_loss": 0.27218928933143616, |
|
"eval_runtime": 15.6426, |
|
"eval_samples_per_second": 10.932, |
|
"eval_steps_per_second": 1.406, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.4402985074626864, |
|
"grad_norm": 0.8439892530441284, |
|
"learning_rate": 2.457344406753899e-06, |
|
"loss": 0.3462, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 3.4477611940298507, |
|
"grad_norm": 0.8776973485946655, |
|
"learning_rate": 2.4361094103915726e-06, |
|
"loss": 0.2259, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 3.455223880597015, |
|
"grad_norm": 0.9893208742141724, |
|
"learning_rate": 2.414936972211272e-06, |
|
"loss": 0.5059, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 3.4626865671641793, |
|
"grad_norm": 0.7548739314079285, |
|
"learning_rate": 2.3938276088143003e-06, |
|
"loss": 0.2269, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 3.470149253731343, |
|
"grad_norm": 0.7064008116722107, |
|
"learning_rate": 2.372781835262971e-06, |
|
"loss": 0.3191, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.4776119402985075, |
|
"grad_norm": 0.7179411053657532, |
|
"learning_rate": 2.3518001650680084e-06, |
|
"loss": 0.2887, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 3.485074626865672, |
|
"grad_norm": 0.8408747315406799, |
|
"learning_rate": 2.330883110176049e-06, |
|
"loss": 0.346, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 3.4925373134328357, |
|
"grad_norm": 1.0033082962036133, |
|
"learning_rate": 2.3100311809571175e-06, |
|
"loss": 0.4948, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.7342469096183777, |
|
"learning_rate": 2.2892448861922075e-06, |
|
"loss": 0.2275, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 3.5074626865671643, |
|
"grad_norm": 0.6741373538970947, |
|
"learning_rate": 2.2685247330608417e-06, |
|
"loss": 0.1756, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.5074626865671643, |
|
"eval_loss": 0.2731238603591919, |
|
"eval_runtime": 15.975, |
|
"eval_samples_per_second": 10.704, |
|
"eval_steps_per_second": 1.377, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.5149253731343286, |
|
"grad_norm": 0.6658123731613159, |
|
"learning_rate": 2.247871227128709e-06, |
|
"loss": 0.3165, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 3.5223880597014925, |
|
"grad_norm": 0.7876918911933899, |
|
"learning_rate": 2.227284872335325e-06, |
|
"loss": 0.3699, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 3.529850746268657, |
|
"grad_norm": 0.6888175010681152, |
|
"learning_rate": 2.2067661709817384e-06, |
|
"loss": 0.2994, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 3.5373134328358207, |
|
"grad_norm": 0.7835707068443298, |
|
"learning_rate": 2.1863156237182727e-06, |
|
"loss": 0.2034, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 3.544776119402985, |
|
"grad_norm": 0.88739413022995, |
|
"learning_rate": 2.1659337295323117e-06, |
|
"loss": 0.2325, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.5522388059701493, |
|
"grad_norm": 0.793242335319519, |
|
"learning_rate": 2.145620985736125e-06, |
|
"loss": 0.2518, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 3.5597014925373136, |
|
"grad_norm": 0.8310137987136841, |
|
"learning_rate": 2.1253778879547323e-06, |
|
"loss": 0.2637, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 3.5671641791044775, |
|
"grad_norm": 0.907107949256897, |
|
"learning_rate": 2.1052049301138095e-06, |
|
"loss": 0.2982, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 3.574626865671642, |
|
"grad_norm": 0.8847762942314148, |
|
"learning_rate": 2.0851026044276405e-06, |
|
"loss": 0.2714, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 3.582089552238806, |
|
"grad_norm": 0.7740510106086731, |
|
"learning_rate": 2.065071401387105e-06, |
|
"loss": 0.1954, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.582089552238806, |
|
"eval_loss": 0.27339252829551697, |
|
"eval_runtime": 15.6675, |
|
"eval_samples_per_second": 10.914, |
|
"eval_steps_per_second": 1.404, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.58955223880597, |
|
"grad_norm": 1.0186398029327393, |
|
"learning_rate": 2.0451118097477095e-06, |
|
"loss": 0.2898, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 3.5970149253731343, |
|
"grad_norm": 0.9219568967819214, |
|
"learning_rate": 2.025224316517663e-06, |
|
"loss": 0.2446, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 3.6044776119402986, |
|
"grad_norm": 0.6918323636054993, |
|
"learning_rate": 2.005409406946e-06, |
|
"loss": 0.4241, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 3.611940298507463, |
|
"grad_norm": 1.0935815572738647, |
|
"learning_rate": 1.9856675645107244e-06, |
|
"loss": 0.2807, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 3.6194029850746268, |
|
"grad_norm": 0.7006062269210815, |
|
"learning_rate": 1.9659992709070346e-06, |
|
"loss": 0.2355, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.626865671641791, |
|
"grad_norm": 1.0123817920684814, |
|
"learning_rate": 1.946405006035548e-06, |
|
"loss": 0.3256, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 3.6343283582089554, |
|
"grad_norm": 0.9404323697090149, |
|
"learning_rate": 1.926885247990615e-06, |
|
"loss": 0.339, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 3.6417910447761193, |
|
"grad_norm": 0.6341800093650818, |
|
"learning_rate": 1.9074404730486264e-06, |
|
"loss": 0.1258, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 3.6492537313432836, |
|
"grad_norm": 0.7687932252883911, |
|
"learning_rate": 1.8880711556564214e-06, |
|
"loss": 0.3796, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 3.656716417910448, |
|
"grad_norm": 0.7422243356704712, |
|
"learning_rate": 1.8687777684196883e-06, |
|
"loss": 0.2946, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.656716417910448, |
|
"eval_loss": 0.27211612462997437, |
|
"eval_runtime": 15.9735, |
|
"eval_samples_per_second": 10.705, |
|
"eval_steps_per_second": 1.377, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.664179104477612, |
|
"grad_norm": 0.7336458563804626, |
|
"learning_rate": 1.8495607820914451e-06, |
|
"loss": 0.326, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 3.671641791044776, |
|
"grad_norm": 0.8988629579544067, |
|
"learning_rate": 1.8304206655605477e-06, |
|
"loss": 0.349, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 3.6791044776119404, |
|
"grad_norm": 0.9680550694465637, |
|
"learning_rate": 1.811357885840254e-06, |
|
"loss": 0.3199, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 3.6865671641791042, |
|
"grad_norm": 0.9406099319458008, |
|
"learning_rate": 1.7923729080568242e-06, |
|
"loss": 0.4716, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 3.6940298507462686, |
|
"grad_norm": 0.7466757893562317, |
|
"learning_rate": 1.7734661954381754e-06, |
|
"loss": 0.2834, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.701492537313433, |
|
"grad_norm": 0.7727675437927246, |
|
"learning_rate": 1.754638209302576e-06, |
|
"loss": 0.2138, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 3.708955223880597, |
|
"grad_norm": 0.9192438721656799, |
|
"learning_rate": 1.7358894090473928e-06, |
|
"loss": 0.2163, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 3.716417910447761, |
|
"grad_norm": 0.824645459651947, |
|
"learning_rate": 1.7172202521378794e-06, |
|
"loss": 0.3145, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 3.7238805970149254, |
|
"grad_norm": 0.8433907628059387, |
|
"learning_rate": 1.6986311940960148e-06, |
|
"loss": 0.2697, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 3.7313432835820897, |
|
"grad_norm": 0.8727117776870728, |
|
"learning_rate": 1.6801226884893895e-06, |
|
"loss": 0.1878, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.7313432835820897, |
|
"eval_loss": 0.273702472448349, |
|
"eval_runtime": 16.1843, |
|
"eval_samples_per_second": 10.566, |
|
"eval_steps_per_second": 1.359, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.7388059701492535, |
|
"grad_norm": 0.626703143119812, |
|
"learning_rate": 1.661695186920138e-06, |
|
"loss": 0.1891, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 3.746268656716418, |
|
"grad_norm": 0.8871873021125793, |
|
"learning_rate": 1.6433491390139179e-06, |
|
"loss": 0.1811, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 3.753731343283582, |
|
"grad_norm": 0.7437407970428467, |
|
"learning_rate": 1.6250849924089485e-06, |
|
"loss": 0.3126, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 3.7611940298507465, |
|
"grad_norm": 0.8359528183937073, |
|
"learning_rate": 1.6069031927450696e-06, |
|
"loss": 0.3954, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 3.7686567164179103, |
|
"grad_norm": 2.4655566215515137, |
|
"learning_rate": 1.5888041836528917e-06, |
|
"loss": 0.1926, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 3.7761194029850746, |
|
"grad_norm": 0.6973621845245361, |
|
"learning_rate": 1.5707884067429474e-06, |
|
"loss": 0.2028, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 3.783582089552239, |
|
"grad_norm": 0.8303217887878418, |
|
"learning_rate": 1.5528563015949421e-06, |
|
"loss": 0.4122, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 3.791044776119403, |
|
"grad_norm": 0.8819155693054199, |
|
"learning_rate": 1.535008305747e-06, |
|
"loss": 0.4774, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 3.798507462686567, |
|
"grad_norm": 0.7322707176208496, |
|
"learning_rate": 1.5172448546850166e-06, |
|
"loss": 0.3824, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 3.8059701492537314, |
|
"grad_norm": 0.6984463334083557, |
|
"learning_rate": 1.4995663818320071e-06, |
|
"loss": 0.2717, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.8059701492537314, |
|
"eval_loss": 0.2731057405471802, |
|
"eval_runtime": 15.9516, |
|
"eval_samples_per_second": 10.72, |
|
"eval_steps_per_second": 1.379, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.8134328358208958, |
|
"grad_norm": 0.5827436447143555, |
|
"learning_rate": 1.4819733185375535e-06, |
|
"loss": 0.1136, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 3.8208955223880596, |
|
"grad_norm": 0.7589297294616699, |
|
"learning_rate": 1.4644660940672628e-06, |
|
"loss": 0.3596, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 3.828358208955224, |
|
"grad_norm": 0.6961478590965271, |
|
"learning_rate": 1.4470451355923026e-06, |
|
"loss": 0.2654, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 3.835820895522388, |
|
"grad_norm": 0.707029402256012, |
|
"learning_rate": 1.4297108681789752e-06, |
|
"loss": 0.2746, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 3.843283582089552, |
|
"grad_norm": 0.7735282182693481, |
|
"learning_rate": 1.4124637147783431e-06, |
|
"loss": 0.2864, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 3.8507462686567164, |
|
"grad_norm": 0.7417818903923035, |
|
"learning_rate": 1.3953040962159208e-06, |
|
"loss": 0.2795, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 3.8582089552238807, |
|
"grad_norm": 0.7439370155334473, |
|
"learning_rate": 1.378232431181386e-06, |
|
"loss": 0.3834, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 3.8656716417910446, |
|
"grad_norm": 0.8369311690330505, |
|
"learning_rate": 1.3612491362183887e-06, |
|
"loss": 0.2352, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 3.873134328358209, |
|
"grad_norm": 0.7087696194648743, |
|
"learning_rate": 1.3443546257143624e-06, |
|
"loss": 0.1658, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 3.8805970149253732, |
|
"grad_norm": 0.8520712852478027, |
|
"learning_rate": 1.3275493118904388e-06, |
|
"loss": 0.2889, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.8805970149253732, |
|
"eval_loss": 0.2740533947944641, |
|
"eval_runtime": 15.7052, |
|
"eval_samples_per_second": 10.888, |
|
"eval_steps_per_second": 1.401, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.888059701492537, |
|
"grad_norm": 1.036145806312561, |
|
"learning_rate": 1.3108336047913633e-06, |
|
"loss": 0.3731, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 3.8955223880597014, |
|
"grad_norm": 0.6921767592430115, |
|
"learning_rate": 1.2942079122755162e-06, |
|
"loss": 0.1973, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 3.9029850746268657, |
|
"grad_norm": 0.8576136231422424, |
|
"learning_rate": 1.277672640004936e-06, |
|
"loss": 0.1866, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 3.91044776119403, |
|
"grad_norm": 0.7842735052108765, |
|
"learning_rate": 1.2612281914354452e-06, |
|
"loss": 0.3662, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 3.917910447761194, |
|
"grad_norm": 0.6634000539779663, |
|
"learning_rate": 1.2448749678067856e-06, |
|
"loss": 0.1707, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.925373134328358, |
|
"grad_norm": 0.9016363024711609, |
|
"learning_rate": 1.228613368132842e-06, |
|
"loss": 0.3192, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 3.9328358208955225, |
|
"grad_norm": 1.0403859615325928, |
|
"learning_rate": 1.2124437891918995e-06, |
|
"loss": 0.4336, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 3.9402985074626864, |
|
"grad_norm": 0.8219823837280273, |
|
"learning_rate": 1.1963666255169648e-06, |
|
"loss": 0.4064, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 3.9477611940298507, |
|
"grad_norm": 0.8923879265785217, |
|
"learning_rate": 1.1803822693861377e-06, |
|
"loss": 0.261, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 3.955223880597015, |
|
"grad_norm": 0.844815731048584, |
|
"learning_rate": 1.1644911108130436e-06, |
|
"loss": 0.2351, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.955223880597015, |
|
"eval_loss": 0.27234381437301636, |
|
"eval_runtime": 16.0487, |
|
"eval_samples_per_second": 10.655, |
|
"eval_steps_per_second": 1.371, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.9626865671641793, |
|
"grad_norm": 0.8358920812606812, |
|
"learning_rate": 1.1486935375373127e-06, |
|
"loss": 0.3608, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 3.970149253731343, |
|
"grad_norm": 0.7876161932945251, |
|
"learning_rate": 1.1329899350151212e-06, |
|
"loss": 0.3178, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 3.9776119402985075, |
|
"grad_norm": 1.1506562232971191, |
|
"learning_rate": 1.1173806864097885e-06, |
|
"loss": 0.3036, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 3.9850746268656714, |
|
"grad_norm": 0.7582001686096191, |
|
"learning_rate": 1.1018661725824231e-06, |
|
"loss": 0.2453, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 3.9925373134328357, |
|
"grad_norm": 0.7635594606399536, |
|
"learning_rate": 1.0864467720826343e-06, |
|
"loss": 0.3319, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.9543821811676025, |
|
"learning_rate": 1.0711228611392937e-06, |
|
"loss": 0.4539, |
|
"step": 536 |
|
} |
|
  ],
  "logging_steps": 1.0,
  "max_steps": 670,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 1.0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.4114625650674893e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}