|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 100, |
|
"global_step": 2460, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0040650406504065045, |
|
"grad_norm": 8.289212226867676, |
|
"learning_rate": 2.0325203252032523e-06, |
|
"loss": 3.078, |
|
"mean_token_accuracy": 0.449470280110836, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.008130081300813009, |
|
"grad_norm": 5.227666854858398, |
|
"learning_rate": 4.0650406504065046e-06, |
|
"loss": 2.9676, |
|
"mean_token_accuracy": 0.4515411153435707, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.012195121951219513, |
|
"grad_norm": 4.098899841308594, |
|
"learning_rate": 6.0975609756097564e-06, |
|
"loss": 2.6865, |
|
"mean_token_accuracy": 0.46663461327552797, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.016260162601626018, |
|
"grad_norm": 2.25898814201355, |
|
"learning_rate": 8.130081300813009e-06, |
|
"loss": 2.372, |
|
"mean_token_accuracy": 0.5245550125837326, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02032520325203252, |
|
"grad_norm": 1.3737112283706665, |
|
"learning_rate": 1.016260162601626e-05, |
|
"loss": 2.1568, |
|
"mean_token_accuracy": 0.5585430264472961, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.024390243902439025, |
|
"grad_norm": 1.2044790983200073, |
|
"learning_rate": 1.2195121951219513e-05, |
|
"loss": 2.0289, |
|
"mean_token_accuracy": 0.5695936948060989, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.028455284552845527, |
|
"grad_norm": 0.5751954913139343, |
|
"learning_rate": 1.4227642276422764e-05, |
|
"loss": 1.9646, |
|
"mean_token_accuracy": 0.5815719872713089, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.032520325203252036, |
|
"grad_norm": 0.5788729190826416, |
|
"learning_rate": 1.6260162601626018e-05, |
|
"loss": 1.9203, |
|
"mean_token_accuracy": 0.5884384959936142, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.036585365853658534, |
|
"grad_norm": 0.5332703590393066, |
|
"learning_rate": 1.8292682926829268e-05, |
|
"loss": 1.8852, |
|
"mean_token_accuracy": 0.5897065699100494, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.04065040650406504, |
|
"grad_norm": 0.47794440388679504, |
|
"learning_rate": 2.032520325203252e-05, |
|
"loss": 1.9142, |
|
"mean_token_accuracy": 0.5859858870506287, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.044715447154471545, |
|
"grad_norm": 0.4653078317642212, |
|
"learning_rate": 2.2357723577235773e-05, |
|
"loss": 1.9044, |
|
"mean_token_accuracy": 0.5898473501205445, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.04878048780487805, |
|
"grad_norm": 0.5005607008934021, |
|
"learning_rate": 2.4390243902439026e-05, |
|
"loss": 1.8355, |
|
"mean_token_accuracy": 0.5983660519123077, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.052845528455284556, |
|
"grad_norm": 0.6820327043533325, |
|
"learning_rate": 2.642276422764228e-05, |
|
"loss": 1.8238, |
|
"mean_token_accuracy": 0.6029254138469696, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.056910569105691054, |
|
"grad_norm": 0.6085386276245117, |
|
"learning_rate": 2.8455284552845528e-05, |
|
"loss": 1.8338, |
|
"mean_token_accuracy": 0.6012787193059921, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.06097560975609756, |
|
"grad_norm": 0.47377073764801025, |
|
"learning_rate": 3.048780487804878e-05, |
|
"loss": 1.8109, |
|
"mean_token_accuracy": 0.6033449083566665, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.06504065040650407, |
|
"grad_norm": 0.45995065569877625, |
|
"learning_rate": 3.2520325203252037e-05, |
|
"loss": 1.8453, |
|
"mean_token_accuracy": 0.5993239343166351, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06910569105691057, |
|
"grad_norm": 0.4574694335460663, |
|
"learning_rate": 3.4552845528455286e-05, |
|
"loss": 1.8308, |
|
"mean_token_accuracy": 0.6006569236516952, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.07317073170731707, |
|
"grad_norm": 0.4918266534805298, |
|
"learning_rate": 3.6585365853658535e-05, |
|
"loss": 1.8487, |
|
"mean_token_accuracy": 0.5970274031162262, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07723577235772358, |
|
"grad_norm": 0.4782467782497406, |
|
"learning_rate": 3.861788617886179e-05, |
|
"loss": 1.802, |
|
"mean_token_accuracy": 0.6060092628002167, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.08130081300813008, |
|
"grad_norm": 0.438230037689209, |
|
"learning_rate": 4.065040650406504e-05, |
|
"loss": 1.7958, |
|
"mean_token_accuracy": 0.6061566352844239, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08536585365853659, |
|
"grad_norm": 0.4993686378002167, |
|
"learning_rate": 4.26829268292683e-05, |
|
"loss": 1.8424, |
|
"mean_token_accuracy": 0.6013251960277557, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.08943089430894309, |
|
"grad_norm": 0.4643951654434204, |
|
"learning_rate": 4.4715447154471546e-05, |
|
"loss": 1.7742, |
|
"mean_token_accuracy": 0.6113761276006698, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.09349593495934959, |
|
"grad_norm": 0.4359968900680542, |
|
"learning_rate": 4.6747967479674795e-05, |
|
"loss": 1.822, |
|
"mean_token_accuracy": 0.6046584963798523, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.0975609756097561, |
|
"grad_norm": 0.4505017399787903, |
|
"learning_rate": 4.878048780487805e-05, |
|
"loss": 1.7852, |
|
"mean_token_accuracy": 0.6076123654842377, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.1016260162601626, |
|
"grad_norm": 0.441800594329834, |
|
"learning_rate": 5.081300813008131e-05, |
|
"loss": 1.7878, |
|
"mean_token_accuracy": 0.6079567730426788, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.10569105691056911, |
|
"grad_norm": 0.45622989535331726, |
|
"learning_rate": 5.284552845528456e-05, |
|
"loss": 1.8014, |
|
"mean_token_accuracy": 0.6061283946037292, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.10975609756097561, |
|
"grad_norm": 0.5284667015075684, |
|
"learning_rate": 5.487804878048781e-05, |
|
"loss": 1.791, |
|
"mean_token_accuracy": 0.6052844613790512, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.11382113821138211, |
|
"grad_norm": 0.4371776878833771, |
|
"learning_rate": 5.6910569105691056e-05, |
|
"loss": 1.8117, |
|
"mean_token_accuracy": 0.6030213892459869, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.11788617886178862, |
|
"grad_norm": 0.43666890263557434, |
|
"learning_rate": 5.894308943089432e-05, |
|
"loss": 1.8047, |
|
"mean_token_accuracy": 0.6026217639446259, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.12195121951219512, |
|
"grad_norm": 0.45028433203697205, |
|
"learning_rate": 6.097560975609756e-05, |
|
"loss": 1.7743, |
|
"mean_token_accuracy": 0.6108780682086945, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.12601626016260162, |
|
"grad_norm": 0.4556964933872223, |
|
"learning_rate": 6.300813008130082e-05, |
|
"loss": 1.7671, |
|
"mean_token_accuracy": 0.6109252899885178, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.13008130081300814, |
|
"grad_norm": 0.4461067318916321, |
|
"learning_rate": 6.504065040650407e-05, |
|
"loss": 1.782, |
|
"mean_token_accuracy": 0.6054368048906327, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.13414634146341464, |
|
"grad_norm": 0.49761253595352173, |
|
"learning_rate": 6.707317073170732e-05, |
|
"loss": 1.7556, |
|
"mean_token_accuracy": 0.6134343713521957, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.13821138211382114, |
|
"grad_norm": 0.555749773979187, |
|
"learning_rate": 6.910569105691057e-05, |
|
"loss": 1.7879, |
|
"mean_token_accuracy": 0.6054145842790604, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.14227642276422764, |
|
"grad_norm": 0.417368620634079, |
|
"learning_rate": 7.113821138211383e-05, |
|
"loss": 1.78, |
|
"mean_token_accuracy": 0.6087132960557937, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.14634146341463414, |
|
"grad_norm": 0.43470966815948486, |
|
"learning_rate": 7.317073170731707e-05, |
|
"loss": 1.7526, |
|
"mean_token_accuracy": 0.6102898091077804, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.15040650406504066, |
|
"grad_norm": 0.4223913550376892, |
|
"learning_rate": 7.520325203252033e-05, |
|
"loss": 1.7626, |
|
"mean_token_accuracy": 0.6112073600292206, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.15447154471544716, |
|
"grad_norm": 0.43716931343078613, |
|
"learning_rate": 7.723577235772358e-05, |
|
"loss": 1.7921, |
|
"mean_token_accuracy": 0.6047958940267563, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.15853658536585366, |
|
"grad_norm": 0.4590209424495697, |
|
"learning_rate": 7.926829268292683e-05, |
|
"loss": 1.7753, |
|
"mean_token_accuracy": 0.6097242563962937, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.16260162601626016, |
|
"grad_norm": 0.40387144684791565, |
|
"learning_rate": 8.130081300813008e-05, |
|
"loss": 1.7687, |
|
"mean_token_accuracy": 0.609487646818161, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.16666666666666666, |
|
"grad_norm": 0.38079211115837097, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 1.7674, |
|
"mean_token_accuracy": 0.6086906433105469, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.17073170731707318, |
|
"grad_norm": 0.42580336332321167, |
|
"learning_rate": 8.53658536585366e-05, |
|
"loss": 1.8098, |
|
"mean_token_accuracy": 0.6029835999011993, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.17479674796747968, |
|
"grad_norm": 0.4392237663269043, |
|
"learning_rate": 8.739837398373984e-05, |
|
"loss": 1.7597, |
|
"mean_token_accuracy": 0.6129997968673706, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.17886178861788618, |
|
"grad_norm": 0.38361164927482605, |
|
"learning_rate": 8.943089430894309e-05, |
|
"loss": 1.7643, |
|
"mean_token_accuracy": 0.6115556955337524, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.18292682926829268, |
|
"grad_norm": 0.3871498107910156, |
|
"learning_rate": 9.146341463414635e-05, |
|
"loss": 1.7619, |
|
"mean_token_accuracy": 0.6097814708948135, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.18699186991869918, |
|
"grad_norm": 0.422797828912735, |
|
"learning_rate": 9.349593495934959e-05, |
|
"loss": 1.7847, |
|
"mean_token_accuracy": 0.6089305311441422, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.1910569105691057, |
|
"grad_norm": 0.4390835464000702, |
|
"learning_rate": 9.552845528455285e-05, |
|
"loss": 1.7441, |
|
"mean_token_accuracy": 0.6139709919691085, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.1951219512195122, |
|
"grad_norm": 0.3742270767688751, |
|
"learning_rate": 9.75609756097561e-05, |
|
"loss": 1.7726, |
|
"mean_token_accuracy": 0.6077946543693542, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.1991869918699187, |
|
"grad_norm": 0.3881438076496124, |
|
"learning_rate": 9.959349593495935e-05, |
|
"loss": 1.7552, |
|
"mean_token_accuracy": 0.6142873078584671, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.2032520325203252, |
|
"grad_norm": 0.37677496671676636, |
|
"learning_rate": 9.999919461536915e-05, |
|
"loss": 1.7699, |
|
"mean_token_accuracy": 0.6085665196180343, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.2073170731707317, |
|
"grad_norm": 0.3732423186302185, |
|
"learning_rate": 9.999592278477388e-05, |
|
"loss": 1.726, |
|
"mean_token_accuracy": 0.6151001572608947, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.21138211382113822, |
|
"grad_norm": 0.39346984028816223, |
|
"learning_rate": 9.999013433624042e-05, |
|
"loss": 1.7407, |
|
"mean_token_accuracy": 0.6134092509746552, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.21544715447154472, |
|
"grad_norm": 0.41036558151245117, |
|
"learning_rate": 9.998182956113883e-05, |
|
"loss": 1.7635, |
|
"mean_token_accuracy": 0.6121825218200684, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.21951219512195122, |
|
"grad_norm": 0.3847576379776001, |
|
"learning_rate": 9.997100887750215e-05, |
|
"loss": 1.7479, |
|
"mean_token_accuracy": 0.6147123813629151, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.22357723577235772, |
|
"grad_norm": 0.3685747981071472, |
|
"learning_rate": 9.995767283000526e-05, |
|
"loss": 1.7344, |
|
"mean_token_accuracy": 0.6126172572374344, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.22764227642276422, |
|
"grad_norm": 0.36466580629348755, |
|
"learning_rate": 9.994182208993765e-05, |
|
"loss": 1.7475, |
|
"mean_token_accuracy": 0.6147434413433075, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.23170731707317074, |
|
"grad_norm": 0.3779489994049072, |
|
"learning_rate": 9.992345745516953e-05, |
|
"loss": 1.7573, |
|
"mean_token_accuracy": 0.6114851921796799, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.23577235772357724, |
|
"grad_norm": 0.38093748688697815, |
|
"learning_rate": 9.990257985011167e-05, |
|
"loss": 1.7271, |
|
"mean_token_accuracy": 0.6138428241014481, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.23983739837398374, |
|
"grad_norm": 0.36460766196250916, |
|
"learning_rate": 9.987919032566885e-05, |
|
"loss": 1.7372, |
|
"mean_token_accuracy": 0.6160049825906754, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.24390243902439024, |
|
"grad_norm": 0.3707461655139923, |
|
"learning_rate": 9.985329005918702e-05, |
|
"loss": 1.7743, |
|
"mean_token_accuracy": 0.6077301442623139, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.24796747967479674, |
|
"grad_norm": 0.36816197633743286, |
|
"learning_rate": 9.982488035439401e-05, |
|
"loss": 1.7483, |
|
"mean_token_accuracy": 0.6115146845579147, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.25203252032520324, |
|
"grad_norm": 0.36961716413497925, |
|
"learning_rate": 9.979396264133389e-05, |
|
"loss": 1.7521, |
|
"mean_token_accuracy": 0.6138442009687424, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.25609756097560976, |
|
"grad_norm": 0.3750387132167816, |
|
"learning_rate": 9.976053847629496e-05, |
|
"loss": 1.7469, |
|
"mean_token_accuracy": 0.6126538842916489, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.2601626016260163, |
|
"grad_norm": 0.3860867917537689, |
|
"learning_rate": 9.97246095417315e-05, |
|
"loss": 1.7478, |
|
"mean_token_accuracy": 0.6144095450639725, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.26422764227642276, |
|
"grad_norm": 0.4011945426464081, |
|
"learning_rate": 9.9686177646179e-05, |
|
"loss": 1.7793, |
|
"mean_token_accuracy": 0.6059734374284744, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.2682926829268293, |
|
"grad_norm": 0.3953823745250702, |
|
"learning_rate": 9.964524472416319e-05, |
|
"loss": 1.72, |
|
"mean_token_accuracy": 0.6176005303859711, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.27235772357723576, |
|
"grad_norm": 0.36000069975852966, |
|
"learning_rate": 9.96018128361026e-05, |
|
"loss": 1.7291, |
|
"mean_token_accuracy": 0.6150078058242798, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.2764227642276423, |
|
"grad_norm": 0.4328393340110779, |
|
"learning_rate": 9.955588416820483e-05, |
|
"loss": 1.7201, |
|
"mean_token_accuracy": 0.6150764495134353, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2804878048780488, |
|
"grad_norm": 0.37596067786216736, |
|
"learning_rate": 9.950746103235663e-05, |
|
"loss": 1.7233, |
|
"mean_token_accuracy": 0.6179824382066726, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.2845528455284553, |
|
"grad_norm": 0.3459499776363373, |
|
"learning_rate": 9.94565458660074e-05, |
|
"loss": 1.732, |
|
"mean_token_accuracy": 0.6165819704532624, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.2886178861788618, |
|
"grad_norm": 0.3702346980571747, |
|
"learning_rate": 9.940314123204655e-05, |
|
"loss": 1.7224, |
|
"mean_token_accuracy": 0.6161389201879501, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.2926829268292683, |
|
"grad_norm": 0.3743409216403961, |
|
"learning_rate": 9.934724981867446e-05, |
|
"loss": 1.7228, |
|
"mean_token_accuracy": 0.6177256584167481, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.2967479674796748, |
|
"grad_norm": 0.351957768201828, |
|
"learning_rate": 9.928887443926725e-05, |
|
"loss": 1.7265, |
|
"mean_token_accuracy": 0.6170883119106293, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.3008130081300813, |
|
"grad_norm": 0.3649267852306366, |
|
"learning_rate": 9.922801803223505e-05, |
|
"loss": 1.7399, |
|
"mean_token_accuracy": 0.6103106021881104, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.3048780487804878, |
|
"grad_norm": 0.348762184381485, |
|
"learning_rate": 9.916468366087417e-05, |
|
"loss": 1.7275, |
|
"mean_token_accuracy": 0.6168751239776611, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.3089430894308943, |
|
"grad_norm": 0.3591652810573578, |
|
"learning_rate": 9.909887451321288e-05, |
|
"loss": 1.7339, |
|
"mean_token_accuracy": 0.6146889984607696, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.3130081300813008, |
|
"grad_norm": 0.35048526525497437, |
|
"learning_rate": 9.903059390185094e-05, |
|
"loss": 1.7119, |
|
"mean_token_accuracy": 0.6178350031375885, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.3170731707317073, |
|
"grad_norm": 0.3357296586036682, |
|
"learning_rate": 9.895984526379281e-05, |
|
"loss": 1.7121, |
|
"mean_token_accuracy": 0.6225362658500672, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.32113821138211385, |
|
"grad_norm": 0.3489939868450165, |
|
"learning_rate": 9.888663216027476e-05, |
|
"loss": 1.7043, |
|
"mean_token_accuracy": 0.6186534225940704, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.3252032520325203, |
|
"grad_norm": 0.352334588766098, |
|
"learning_rate": 9.881095827658548e-05, |
|
"loss": 1.7602, |
|
"mean_token_accuracy": 0.612470856308937, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.32926829268292684, |
|
"grad_norm": 0.3514755368232727, |
|
"learning_rate": 9.873282742188066e-05, |
|
"loss": 1.7009, |
|
"mean_token_accuracy": 0.6202522426843643, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.3693011999130249, |
|
"learning_rate": 9.865224352899119e-05, |
|
"loss": 1.7074, |
|
"mean_token_accuracy": 0.618286868929863, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.33739837398373984, |
|
"grad_norm": 0.35541000962257385, |
|
"learning_rate": 9.856921065422526e-05, |
|
"loss": 1.7106, |
|
"mean_token_accuracy": 0.620040899515152, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.34146341463414637, |
|
"grad_norm": 0.3638647198677063, |
|
"learning_rate": 9.848373297716414e-05, |
|
"loss": 1.7289, |
|
"mean_token_accuracy": 0.615748155117035, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.34552845528455284, |
|
"grad_norm": 0.33716803789138794, |
|
"learning_rate": 9.83958148004518e-05, |
|
"loss": 1.7289, |
|
"mean_token_accuracy": 0.6158941626548767, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.34959349593495936, |
|
"grad_norm": 0.33820581436157227, |
|
"learning_rate": 9.830546054957828e-05, |
|
"loss": 1.7053, |
|
"mean_token_accuracy": 0.6199358224868774, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.35365853658536583, |
|
"grad_norm": 0.3723635673522949, |
|
"learning_rate": 9.821267477265705e-05, |
|
"loss": 1.7264, |
|
"mean_token_accuracy": 0.6148888647556305, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.35772357723577236, |
|
"grad_norm": 0.3225667178630829, |
|
"learning_rate": 9.8117462140196e-05, |
|
"loss": 1.709, |
|
"mean_token_accuracy": 0.6211889505386352, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.3617886178861789, |
|
"grad_norm": 0.3565698564052582, |
|
"learning_rate": 9.801982744486229e-05, |
|
"loss": 1.7228, |
|
"mean_token_accuracy": 0.6147832721471786, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.36585365853658536, |
|
"grad_norm": 0.33813104033470154, |
|
"learning_rate": 9.791977560124119e-05, |
|
"loss": 1.7354, |
|
"mean_token_accuracy": 0.6131374180316925, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3699186991869919, |
|
"grad_norm": 0.3350490927696228, |
|
"learning_rate": 9.781731164558869e-05, |
|
"loss": 1.7297, |
|
"mean_token_accuracy": 0.6181742399930954, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.37398373983739835, |
|
"grad_norm": 0.36659350991249084, |
|
"learning_rate": 9.771244073557793e-05, |
|
"loss": 1.726, |
|
"mean_token_accuracy": 0.6187894672155381, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.3780487804878049, |
|
"grad_norm": 0.3454345166683197, |
|
"learning_rate": 9.760516815003964e-05, |
|
"loss": 1.7224, |
|
"mean_token_accuracy": 0.6168391436338425, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.3821138211382114, |
|
"grad_norm": 0.3488418459892273, |
|
"learning_rate": 9.749549928869637e-05, |
|
"loss": 1.7026, |
|
"mean_token_accuracy": 0.6172319799661636, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.3861788617886179, |
|
"grad_norm": 0.3551413416862488, |
|
"learning_rate": 9.738343967189079e-05, |
|
"loss": 1.7064, |
|
"mean_token_accuracy": 0.6189182132482529, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.3902439024390244, |
|
"grad_norm": 0.3523434102535248, |
|
"learning_rate": 9.726899494030768e-05, |
|
"loss": 1.7237, |
|
"mean_token_accuracy": 0.6148646742105484, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.3943089430894309, |
|
"grad_norm": 0.33508190512657166, |
|
"learning_rate": 9.715217085469008e-05, |
|
"loss": 1.7173, |
|
"mean_token_accuracy": 0.6163572788238525, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.3983739837398374, |
|
"grad_norm": 0.3474932610988617, |
|
"learning_rate": 9.70329732955493e-05, |
|
"loss": 1.6878, |
|
"mean_token_accuracy": 0.6266204327344894, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.4024390243902439, |
|
"grad_norm": 0.33261817693710327, |
|
"learning_rate": 9.691140826286893e-05, |
|
"loss": 1.7003, |
|
"mean_token_accuracy": 0.6210538297891617, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.4065040650406504, |
|
"grad_norm": 0.33157244324684143, |
|
"learning_rate": 9.678748187580279e-05, |
|
"loss": 1.7242, |
|
"mean_token_accuracy": 0.6156573951244354, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4105691056910569, |
|
"grad_norm": 0.3394560217857361, |
|
"learning_rate": 9.666120037236691e-05, |
|
"loss": 1.708, |
|
"mean_token_accuracy": 0.6201102405786514, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.4146341463414634, |
|
"grad_norm": 0.34017136693000793, |
|
"learning_rate": 9.653257010912559e-05, |
|
"loss": 1.7066, |
|
"mean_token_accuracy": 0.6183117419481278, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.4186991869918699, |
|
"grad_norm": 0.3640463054180145, |
|
"learning_rate": 9.640159756087135e-05, |
|
"loss": 1.7038, |
|
"mean_token_accuracy": 0.6196237355470657, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.42276422764227645, |
|
"grad_norm": 0.3380732536315918, |
|
"learning_rate": 9.626828932029908e-05, |
|
"loss": 1.6918, |
|
"mean_token_accuracy": 0.6234209835529327, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.4268292682926829, |
|
"grad_norm": 0.34295347332954407, |
|
"learning_rate": 9.613265209767417e-05, |
|
"loss": 1.6951, |
|
"mean_token_accuracy": 0.6198685437440872, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.43089430894308944, |
|
"grad_norm": 0.3326297998428345, |
|
"learning_rate": 9.599469272049468e-05, |
|
"loss": 1.7041, |
|
"mean_token_accuracy": 0.6202775299549103, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.4349593495934959, |
|
"grad_norm": 0.37368568778038025, |
|
"learning_rate": 9.58544181331478e-05, |
|
"loss": 1.6886, |
|
"mean_token_accuracy": 0.6205744206905365, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.43902439024390244, |
|
"grad_norm": 0.3321199417114258, |
|
"learning_rate": 9.571183539656011e-05, |
|
"loss": 1.7284, |
|
"mean_token_accuracy": 0.6155181050300598, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.44308943089430897, |
|
"grad_norm": 0.34314289689064026, |
|
"learning_rate": 9.556695168784236e-05, |
|
"loss": 1.7131, |
|
"mean_token_accuracy": 0.6226642936468124, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.44715447154471544, |
|
"grad_norm": 0.33733630180358887, |
|
"learning_rate": 9.541977429992802e-05, |
|
"loss": 1.7291, |
|
"mean_token_accuracy": 0.6157074451446534, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.45121951219512196, |
|
"grad_norm": 0.3492962718009949, |
|
"learning_rate": 9.527031064120632e-05, |
|
"loss": 1.6977, |
|
"mean_token_accuracy": 0.6195837169885635, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.45528455284552843, |
|
"grad_norm": 0.3503949046134949, |
|
"learning_rate": 9.511856823514923e-05, |
|
"loss": 1.7114, |
|
"mean_token_accuracy": 0.6151729881763458, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.45934959349593496, |
|
"grad_norm": 0.33603164553642273, |
|
"learning_rate": 9.496455471993282e-05, |
|
"loss": 1.7079, |
|
"mean_token_accuracy": 0.6181686371564865, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.4634146341463415, |
|
"grad_norm": 0.3251311779022217, |
|
"learning_rate": 9.480827784805278e-05, |
|
"loss": 1.6961, |
|
"mean_token_accuracy": 0.6220617443323135, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.46747967479674796, |
|
"grad_norm": 0.35011976957321167, |
|
"learning_rate": 9.464974548593415e-05, |
|
"loss": 1.7145, |
|
"mean_token_accuracy": 0.6178438782691955, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.4715447154471545, |
|
"grad_norm": 0.3231695890426636, |
|
"learning_rate": 9.448896561353536e-05, |
|
"loss": 1.6805, |
|
"mean_token_accuracy": 0.6222502171993256, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.47560975609756095, |
|
"grad_norm": 0.38622918725013733, |
|
"learning_rate": 9.43259463239466e-05, |
|
"loss": 1.6982, |
|
"mean_token_accuracy": 0.6209219604730606, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.4796747967479675, |
|
"grad_norm": 0.3395847976207733, |
|
"learning_rate": 9.416069582298234e-05, |
|
"loss": 1.6764, |
|
"mean_token_accuracy": 0.6220153927803039, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.483739837398374, |
|
"grad_norm": 0.33217620849609375, |
|
"learning_rate": 9.399322242876843e-05, |
|
"loss": 1.6484, |
|
"mean_token_accuracy": 0.6280394732952118, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.4878048780487805, |
|
"grad_norm": 0.33441200852394104, |
|
"learning_rate": 9.382353457132317e-05, |
|
"loss": 1.7172, |
|
"mean_token_accuracy": 0.6168171554803848, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.491869918699187, |
|
"grad_norm": 0.33297812938690186, |
|
"learning_rate": 9.365164079213329e-05, |
|
"loss": 1.6896, |
|
"mean_token_accuracy": 0.6211982607841492, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.4959349593495935, |
|
"grad_norm": 0.3371240496635437, |
|
"learning_rate": 9.347754974372365e-05, |
|
"loss": 1.7222, |
|
"mean_token_accuracy": 0.6185583353042603, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.33934640884399414, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 1.688, |
|
"mean_token_accuracy": 0.6209723204374313, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.5040650406504065, |
|
"grad_norm": 0.3468833863735199, |
|
"learning_rate": 9.312281100191753e-05, |
|
"loss": 1.6865, |
|
"mean_token_accuracy": 0.621919909119606, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.508130081300813, |
|
"grad_norm": 0.3532845675945282, |
|
"learning_rate": 9.294218116481475e-05, |
|
"loss": 1.6877, |
|
"mean_token_accuracy": 0.6208325117826462, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.5121951219512195, |
|
"grad_norm": 0.3370439112186432, |
|
"learning_rate": 9.275938977018081e-05, |
|
"loss": 1.7244, |
|
"mean_token_accuracy": 0.6144997507333756, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.516260162601626, |
|
"grad_norm": 0.33375120162963867, |
|
"learning_rate": 9.257444601908806e-05, |
|
"loss": 1.7089, |
|
"mean_token_accuracy": 0.6193614333868027, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.5203252032520326, |
|
"grad_norm": 0.37465041875839233, |
|
"learning_rate": 9.238735922095083e-05, |
|
"loss": 1.725, |
|
"mean_token_accuracy": 0.6159345209598541, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.524390243902439, |
|
"grad_norm": 0.3242790997028351, |
|
"learning_rate": 9.219813879305692e-05, |
|
"loss": 1.6763, |
|
"mean_token_accuracy": 0.6275732070207596, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.5284552845528455, |
|
"grad_norm": 0.32216328382492065, |
|
"learning_rate": 9.200679426009346e-05, |
|
"loss": 1.7092, |
|
"mean_token_accuracy": 0.620816308259964, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.532520325203252, |
|
"grad_norm": 0.33364465832710266, |
|
"learning_rate": 9.181333525366756e-05, |
|
"loss": 1.729, |
|
"mean_token_accuracy": 0.6153968364000321, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.5365853658536586, |
|
"grad_norm": 0.32350727915763855, |
|
"learning_rate": 9.161777151182136e-05, |
|
"loss": 1.6851, |
|
"mean_token_accuracy": 0.6232678264379501, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.540650406504065, |
|
"grad_norm": 0.33794844150543213, |
|
"learning_rate": 9.142011287854206e-05, |
|
"loss": 1.686, |
|
"mean_token_accuracy": 0.619168734550476, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.5447154471544715, |
|
"grad_norm": 0.3288458287715912, |
|
"learning_rate": 9.122036930326619e-05, |
|
"loss": 1.6991, |
|
"mean_token_accuracy": 0.6192776530981063, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.5487804878048781, |
|
"grad_norm": 0.33086031675338745, |
|
"learning_rate": 9.101855084037894e-05, |
|
"loss": 1.7325, |
|
"mean_token_accuracy": 0.6119826316833497, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.5528455284552846, |
|
"grad_norm": 0.3207758665084839, |
|
"learning_rate": 9.081466764870794e-05, |
|
"loss": 1.6639, |
|
"mean_token_accuracy": 0.6272410124540329, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.556910569105691, |
|
"grad_norm": 0.3329343795776367, |
|
"learning_rate": 9.060872999101207e-05, |
|
"loss": 1.6996, |
|
"mean_token_accuracy": 0.6233417153358459, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.5609756097560976, |
|
"grad_norm": 0.33917635679244995, |
|
"learning_rate": 9.040074823346465e-05, |
|
"loss": 1.7122, |
|
"mean_token_accuracy": 0.6233748435974121, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.5650406504065041, |
|
"grad_norm": 0.33958280086517334, |
|
"learning_rate": 9.019073284513184e-05, |
|
"loss": 1.6622, |
|
"mean_token_accuracy": 0.6255071312189102, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.5691056910569106, |
|
"grad_norm": 0.3197284936904907, |
|
"learning_rate": 8.997869439744555e-05, |
|
"loss": 1.7242, |
|
"mean_token_accuracy": 0.616931426525116, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.573170731707317, |
|
"grad_norm": 0.34112757444381714, |
|
"learning_rate": 8.976464356367134e-05, |
|
"loss": 1.6808, |
|
"mean_token_accuracy": 0.6218210339546204, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.5772357723577236, |
|
"grad_norm": 0.3305584490299225, |
|
"learning_rate": 8.954859111837115e-05, |
|
"loss": 1.6707, |
|
"mean_token_accuracy": 0.6246382296085358, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.5813008130081301, |
|
"grad_norm": 0.31623315811157227, |
|
"learning_rate": 8.933054793686102e-05, |
|
"loss": 1.692, |
|
"mean_token_accuracy": 0.6215147316455841, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.5853658536585366, |
|
"grad_norm": 0.34332701563835144, |
|
"learning_rate": 8.911052499466357e-05, |
|
"loss": 1.6665, |
|
"mean_token_accuracy": 0.628682142496109, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.5894308943089431, |
|
"grad_norm": 0.3359994888305664, |
|
"learning_rate": 8.888853336695558e-05, |
|
"loss": 1.6603, |
|
"mean_token_accuracy": 0.6243226587772369, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.5934959349593496, |
|
"grad_norm": 0.3836466670036316, |
|
"learning_rate": 8.866458422801047e-05, |
|
"loss": 1.6948, |
|
"mean_token_accuracy": 0.6235097616910934, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.5975609756097561, |
|
"grad_norm": 0.3324276804924011, |
|
"learning_rate": 8.843868885063593e-05, |
|
"loss": 1.6734, |
|
"mean_token_accuracy": 0.6232392281293869, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.6016260162601627, |
|
"grad_norm": 0.3357491195201874, |
|
"learning_rate": 8.821085860560633e-05, |
|
"loss": 1.7242, |
|
"mean_token_accuracy": 0.6178576499223709, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.6056910569105691, |
|
"grad_norm": 0.3113623559474945, |
|
"learning_rate": 8.798110496109046e-05, |
|
"loss": 1.6802, |
|
"mean_token_accuracy": 0.6193406730890274, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.6097560975609756, |
|
"grad_norm": 0.3288104236125946, |
|
"learning_rate": 8.774943948207426e-05, |
|
"loss": 1.7061, |
|
"mean_token_accuracy": 0.620333781838417, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.6138211382113821, |
|
"grad_norm": 0.32581302523612976, |
|
"learning_rate": 8.751587382977861e-05, |
|
"loss": 1.6981, |
|
"mean_token_accuracy": 0.6199865221977234, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.6178861788617886, |
|
"grad_norm": 0.3411175608634949, |
|
"learning_rate": 8.728041976107246e-05, |
|
"loss": 1.6874, |
|
"mean_token_accuracy": 0.6265356034040451, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.6219512195121951, |
|
"grad_norm": 0.32322341203689575, |
|
"learning_rate": 8.70430891278809e-05, |
|
"loss": 1.7027, |
|
"mean_token_accuracy": 0.6214174151420593, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.6260162601626016, |
|
"grad_norm": 0.3652247488498688, |
|
"learning_rate": 8.680389387658866e-05, |
|
"loss": 1.6611, |
|
"mean_token_accuracy": 0.6265343695878982, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.6300813008130082, |
|
"grad_norm": 0.3320503532886505, |
|
"learning_rate": 8.656284604743876e-05, |
|
"loss": 1.6674, |
|
"mean_token_accuracy": 0.624911779165268, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.6341463414634146, |
|
"grad_norm": 0.3469725549221039, |
|
"learning_rate": 8.631995777392645e-05, |
|
"loss": 1.6816, |
|
"mean_token_accuracy": 0.6228331983089447, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.6382113821138211, |
|
"grad_norm": 0.32927247881889343, |
|
"learning_rate": 8.607524128218842e-05, |
|
"loss": 1.6815, |
|
"mean_token_accuracy": 0.6234974592924118, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.6422764227642277, |
|
"grad_norm": 0.35349494218826294, |
|
"learning_rate": 8.582870889038738e-05, |
|
"loss": 1.7307, |
|
"mean_token_accuracy": 0.6128053724765777, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.6463414634146342, |
|
"grad_norm": 0.32908809185028076, |
|
"learning_rate": 8.558037300809208e-05, |
|
"loss": 1.7138, |
|
"mean_token_accuracy": 0.6193376958370209, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.6504065040650406, |
|
"grad_norm": 0.34228646755218506, |
|
"learning_rate": 8.533024613565256e-05, |
|
"loss": 1.6827, |
|
"mean_token_accuracy": 0.6244657814502717, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.6544715447154471, |
|
"grad_norm": 0.3343624174594879, |
|
"learning_rate": 8.507834086357099e-05, |
|
"loss": 1.6971, |
|
"mean_token_accuracy": 0.6201623141765594, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.6585365853658537, |
|
"grad_norm": 0.3272092044353485, |
|
"learning_rate": 8.482466987186785e-05, |
|
"loss": 1.6661, |
|
"mean_token_accuracy": 0.6262331873178482, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.6626016260162602, |
|
"grad_norm": 0.31395092606544495, |
|
"learning_rate": 8.456924592944377e-05, |
|
"loss": 1.6822, |
|
"mean_token_accuracy": 0.6245481789112091, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.32309862971305847, |
|
"learning_rate": 8.43120818934367e-05, |
|
"loss": 1.6853, |
|
"mean_token_accuracy": 0.6223058104515076, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.6707317073170732, |
|
"grad_norm": 0.3374616205692291, |
|
"learning_rate": 8.405319070857467e-05, |
|
"loss": 1.7206, |
|
"mean_token_accuracy": 0.6199805110692977, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.6747967479674797, |
|
"grad_norm": 0.3281794786453247, |
|
"learning_rate": 8.379258540652438e-05, |
|
"loss": 1.6786, |
|
"mean_token_accuracy": 0.6231010109186172, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.6788617886178862, |
|
"grad_norm": 0.33553647994995117, |
|
"learning_rate": 8.353027910523507e-05, |
|
"loss": 1.6807, |
|
"mean_token_accuracy": 0.6234080284833908, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.6829268292682927, |
|
"grad_norm": 0.3125993013381958, |
|
"learning_rate": 8.326628500827826e-05, |
|
"loss": 1.6836, |
|
"mean_token_accuracy": 0.6244439870119095, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.6869918699186992, |
|
"grad_norm": 0.3228364884853363, |
|
"learning_rate": 8.30006164041832e-05, |
|
"loss": 1.6904, |
|
"mean_token_accuracy": 0.6231540441513062, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.6910569105691057, |
|
"grad_norm": 0.3349599540233612, |
|
"learning_rate": 8.273328666576783e-05, |
|
"loss": 1.7077, |
|
"mean_token_accuracy": 0.621891450881958, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.6951219512195121, |
|
"grad_norm": 0.32438069581985474, |
|
"learning_rate": 8.246430924946575e-05, |
|
"loss": 1.6859, |
|
"mean_token_accuracy": 0.618968591094017, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.6991869918699187, |
|
"grad_norm": 0.3320821523666382, |
|
"learning_rate": 8.219369769464883e-05, |
|
"loss": 1.6569, |
|
"mean_token_accuracy": 0.6251401513814926, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.7032520325203252, |
|
"grad_norm": 0.3155499994754791, |
|
"learning_rate": 8.192146562294571e-05, |
|
"loss": 1.7052, |
|
"mean_token_accuracy": 0.6208123952150345, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.7073170731707317, |
|
"grad_norm": 0.32664182782173157, |
|
"learning_rate": 8.16476267375561e-05, |
|
"loss": 1.6755, |
|
"mean_token_accuracy": 0.6208460986614227, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.7113821138211383, |
|
"grad_norm": 0.3416995406150818, |
|
"learning_rate": 8.137219482256102e-05, |
|
"loss": 1.6895, |
|
"mean_token_accuracy": 0.6255667060613632, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.7154471544715447, |
|
"grad_norm": 0.3330952525138855, |
|
"learning_rate": 8.109518374222901e-05, |
|
"loss": 1.7053, |
|
"mean_token_accuracy": 0.6207927197217942, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.7195121951219512, |
|
"grad_norm": 0.3635248839855194, |
|
"learning_rate": 8.081660744031819e-05, |
|
"loss": 1.6823, |
|
"mean_token_accuracy": 0.62436144053936, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.7235772357723578, |
|
"grad_norm": 0.3320518136024475, |
|
"learning_rate": 8.053647993937436e-05, |
|
"loss": 1.6785, |
|
"mean_token_accuracy": 0.6251044809818268, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.7276422764227642, |
|
"grad_norm": 0.31727686524391174, |
|
"learning_rate": 8.025481534002524e-05, |
|
"loss": 1.6947, |
|
"mean_token_accuracy": 0.6196910262107849, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.7317073170731707, |
|
"grad_norm": 0.3138638138771057, |
|
"learning_rate": 7.997162782027061e-05, |
|
"loss": 1.6746, |
|
"mean_token_accuracy": 0.6235388338565826, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.7357723577235772, |
|
"grad_norm": 0.3528794050216675, |
|
"learning_rate": 7.968693163476873e-05, |
|
"loss": 1.6937, |
|
"mean_token_accuracy": 0.6207994610071182, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.7398373983739838, |
|
"grad_norm": 0.33244994282722473, |
|
"learning_rate": 7.940074111411869e-05, |
|
"loss": 1.6603, |
|
"mean_token_accuracy": 0.627124959230423, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.7439024390243902, |
|
"grad_norm": 0.336577832698822, |
|
"learning_rate": 7.911307066413919e-05, |
|
"loss": 1.6736, |
|
"mean_token_accuracy": 0.6213417708873749, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.7479674796747967, |
|
"grad_norm": 0.34156832098960876, |
|
"learning_rate": 7.882393476514329e-05, |
|
"loss": 1.6685, |
|
"mean_token_accuracy": 0.6237036377191544, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.7520325203252033, |
|
"grad_norm": 0.3253253400325775, |
|
"learning_rate": 7.853334797120961e-05, |
|
"loss": 1.6527, |
|
"mean_token_accuracy": 0.6282201766967773, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.7560975609756098, |
|
"grad_norm": 0.3326328992843628, |
|
"learning_rate": 7.824132490944967e-05, |
|
"loss": 1.6935, |
|
"mean_token_accuracy": 0.6244556874036788, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.7601626016260162, |
|
"grad_norm": 0.33001771569252014, |
|
"learning_rate": 7.794788027927164e-05, |
|
"loss": 1.6557, |
|
"mean_token_accuracy": 0.6246199995279312, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.7642276422764228, |
|
"grad_norm": 0.3541715741157532, |
|
"learning_rate": 7.765302885164038e-05, |
|
"loss": 1.6579, |
|
"mean_token_accuracy": 0.6265397042036056, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.7682926829268293, |
|
"grad_norm": 0.32156965136528015, |
|
"learning_rate": 7.735678546833402e-05, |
|
"loss": 1.6858, |
|
"mean_token_accuracy": 0.6219491124153137, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.7723577235772358, |
|
"grad_norm": 0.32490742206573486, |
|
"learning_rate": 7.70591650411968e-05, |
|
"loss": 1.6991, |
|
"mean_token_accuracy": 0.6211956650018692, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.7764227642276422, |
|
"grad_norm": 0.32863402366638184, |
|
"learning_rate": 7.676018255138841e-05, |
|
"loss": 1.6576, |
|
"mean_token_accuracy": 0.6267116487026214, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.7804878048780488, |
|
"grad_norm": 0.32754477858543396, |
|
"learning_rate": 7.645985304863003e-05, |
|
"loss": 1.6647, |
|
"mean_token_accuracy": 0.6256497412919998, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.7845528455284553, |
|
"grad_norm": 0.3257920742034912, |
|
"learning_rate": 7.61581916504467e-05, |
|
"loss": 1.6538, |
|
"mean_token_accuracy": 0.6261578530073166, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.7886178861788617, |
|
"grad_norm": 0.34073394536972046, |
|
"learning_rate": 7.585521354140638e-05, |
|
"loss": 1.6483, |
|
"mean_token_accuracy": 0.6276870489120483, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.7926829268292683, |
|
"grad_norm": 0.33769065141677856, |
|
"learning_rate": 7.555093397235552e-05, |
|
"loss": 1.6562, |
|
"mean_token_accuracy": 0.6260519325733185, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.7967479674796748, |
|
"grad_norm": 0.32589268684387207, |
|
"learning_rate": 7.524536825965153e-05, |
|
"loss": 1.6456, |
|
"mean_token_accuracy": 0.6263998299837112, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.8008130081300813, |
|
"grad_norm": 0.3267214298248291, |
|
"learning_rate": 7.493853178439177e-05, |
|
"loss": 1.6835, |
|
"mean_token_accuracy": 0.6221833378076553, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.8048780487804879, |
|
"grad_norm": 0.3289288580417633, |
|
"learning_rate": 7.463043999163919e-05, |
|
"loss": 1.6975, |
|
"mean_token_accuracy": 0.6209107756614685, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.8089430894308943, |
|
"grad_norm": 0.3326919376850128, |
|
"learning_rate": 7.432110838964507e-05, |
|
"loss": 1.6648, |
|
"mean_token_accuracy": 0.628352826833725, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.8130081300813008, |
|
"grad_norm": 0.3397887945175171, |
|
"learning_rate": 7.401055254906829e-05, |
|
"loss": 1.675, |
|
"mean_token_accuracy": 0.6245883584022522, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.8170731707317073, |
|
"grad_norm": 0.3190455138683319, |
|
"learning_rate": 7.369878810219154e-05, |
|
"loss": 1.6654, |
|
"mean_token_accuracy": 0.6273934066295623, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.8211382113821138, |
|
"grad_norm": 0.3284507095813751, |
|
"learning_rate": 7.33858307421345e-05, |
|
"loss": 1.6618, |
|
"mean_token_accuracy": 0.6257373452186584, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.8252032520325203, |
|
"grad_norm": 0.3523695766925812, |
|
"learning_rate": 7.307169622206387e-05, |
|
"loss": 1.6777, |
|
"mean_token_accuracy": 0.6224261462688446, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.8292682926829268, |
|
"grad_norm": 0.32716819643974304, |
|
"learning_rate": 7.275640035440045e-05, |
|
"loss": 1.6679, |
|
"mean_token_accuracy": 0.6271101534366608, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.34479984641075134, |
|
"learning_rate": 7.243995901002312e-05, |
|
"loss": 1.6657, |
|
"mean_token_accuracy": 0.6211905151605606, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.8373983739837398, |
|
"grad_norm": 0.3566465973854065, |
|
"learning_rate": 7.212238811747003e-05, |
|
"loss": 1.666, |
|
"mean_token_accuracy": 0.624170771241188, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.8414634146341463, |
|
"grad_norm": 0.3507501184940338, |
|
"learning_rate": 7.180370366213684e-05, |
|
"loss": 1.6691, |
|
"mean_token_accuracy": 0.6274985104799271, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.8455284552845529, |
|
"grad_norm": 0.3325272500514984, |
|
"learning_rate": 7.148392168547191e-05, |
|
"loss": 1.6778, |
|
"mean_token_accuracy": 0.6261927515268326, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.8495934959349594, |
|
"grad_norm": 0.3192001283168793, |
|
"learning_rate": 7.116305828416907e-05, |
|
"loss": 1.6719, |
|
"mean_token_accuracy": 0.6235435485839844, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.8536585365853658, |
|
"grad_norm": 0.3275107443332672, |
|
"learning_rate": 7.084112960935716e-05, |
|
"loss": 1.6568, |
|
"mean_token_accuracy": 0.6283035606145859, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.8577235772357723, |
|
"grad_norm": 0.34391120076179504, |
|
"learning_rate": 7.051815186578711e-05, |
|
"loss": 1.6433, |
|
"mean_token_accuracy": 0.6293984144926071, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.8617886178861789, |
|
"grad_norm": 0.3307419717311859, |
|
"learning_rate": 7.019414131101634e-05, |
|
"loss": 1.6667, |
|
"mean_token_accuracy": 0.6279453337192535, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.8658536585365854, |
|
"grad_norm": 0.319294810295105, |
|
"learning_rate": 6.986911425459028e-05, |
|
"loss": 1.6634, |
|
"mean_token_accuracy": 0.6289093613624572, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.8699186991869918, |
|
"grad_norm": 0.3250874876976013, |
|
"learning_rate": 6.954308705722143e-05, |
|
"loss": 1.6765, |
|
"mean_token_accuracy": 0.6236967951059341, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.8739837398373984, |
|
"grad_norm": 0.35234498977661133, |
|
"learning_rate": 6.921607612996591e-05, |
|
"loss": 1.6337, |
|
"mean_token_accuracy": 0.6328418493270874, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.8780487804878049, |
|
"grad_norm": 0.3295537531375885, |
|
"learning_rate": 6.88880979333973e-05, |
|
"loss": 1.6631, |
|
"mean_token_accuracy": 0.623921737074852, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.8821138211382114, |
|
"grad_norm": 0.3322238028049469, |
|
"learning_rate": 6.855916897677806e-05, |
|
"loss": 1.6703, |
|
"mean_token_accuracy": 0.6248054713010788, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.8861788617886179, |
|
"grad_norm": 0.3290652930736542, |
|
"learning_rate": 6.822930581722864e-05, |
|
"loss": 1.658, |
|
"mean_token_accuracy": 0.628606528043747, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.8902439024390244, |
|
"grad_norm": 0.35497719049453735, |
|
"learning_rate": 6.789852505889383e-05, |
|
"loss": 1.6599, |
|
"mean_token_accuracy": 0.6224384754896164, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.8943089430894309, |
|
"grad_norm": 0.32751455903053284, |
|
"learning_rate": 6.756684335210723e-05, |
|
"loss": 1.6749, |
|
"mean_token_accuracy": 0.6237338185310364, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.8983739837398373, |
|
"grad_norm": 0.3361451029777527, |
|
"learning_rate": 6.723427739255291e-05, |
|
"loss": 1.6742, |
|
"mean_token_accuracy": 0.6254635989665985, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.9024390243902439, |
|
"grad_norm": 0.3348131775856018, |
|
"learning_rate": 6.690084392042513e-05, |
|
"loss": 1.6564, |
|
"mean_token_accuracy": 0.6265616714954376, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.9065040650406504, |
|
"grad_norm": 0.317259818315506, |
|
"learning_rate": 6.656655971958569e-05, |
|
"loss": 1.6537, |
|
"mean_token_accuracy": 0.6279567360877991, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.9105691056910569, |
|
"grad_norm": 0.3306538760662079, |
|
"learning_rate": 6.623144161671899e-05, |
|
"loss": 1.6663, |
|
"mean_token_accuracy": 0.6219373792409897, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.9146341463414634, |
|
"grad_norm": 0.32421913743019104, |
|
"learning_rate": 6.589550648048517e-05, |
|
"loss": 1.6731, |
|
"mean_token_accuracy": 0.6240474134683609, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.9186991869918699, |
|
"grad_norm": 0.33657172322273254, |
|
"learning_rate": 6.555877122067093e-05, |
|
"loss": 1.674, |
|
"mean_token_accuracy": 0.6280266672372818, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.9227642276422764, |
|
"grad_norm": 0.329629510641098, |
|
"learning_rate": 6.522125278733836e-05, |
|
"loss": 1.674, |
|
"mean_token_accuracy": 0.6285534679889679, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.926829268292683, |
|
"grad_norm": 0.3389008045196533, |
|
"learning_rate": 6.488296816997173e-05, |
|
"loss": 1.6653, |
|
"mean_token_accuracy": 0.6280644208192825, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.9308943089430894, |
|
"grad_norm": 0.3208218216896057, |
|
"learning_rate": 6.45439343966223e-05, |
|
"loss": 1.6614, |
|
"mean_token_accuracy": 0.6285478830337524, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.9349593495934959, |
|
"grad_norm": 0.3256186842918396, |
|
"learning_rate": 6.42041685330512e-05, |
|
"loss": 1.6556, |
|
"mean_token_accuracy": 0.6266997307538986, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.9390243902439024, |
|
"grad_norm": 0.334493488073349, |
|
"learning_rate": 6.38636876818704e-05, |
|
"loss": 1.6814, |
|
"mean_token_accuracy": 0.6224779695272445, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.943089430894309, |
|
"grad_norm": 0.3306882679462433, |
|
"learning_rate": 6.35225089816818e-05, |
|
"loss": 1.6875, |
|
"mean_token_accuracy": 0.6235768556594848, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.9471544715447154, |
|
"grad_norm": 0.3267136812210083, |
|
"learning_rate": 6.318064960621457e-05, |
|
"loss": 1.6791, |
|
"mean_token_accuracy": 0.6255802571773529, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.9512195121951219, |
|
"grad_norm": 0.3262345492839813, |
|
"learning_rate": 6.283812676346063e-05, |
|
"loss": 1.6595, |
|
"mean_token_accuracy": 0.6266104191541672, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.9552845528455285, |
|
"grad_norm": 0.3797209858894348, |
|
"learning_rate": 6.249495769480855e-05, |
|
"loss": 1.6734, |
|
"mean_token_accuracy": 0.6262861758470535, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.959349593495935, |
|
"grad_norm": 0.31889474391937256, |
|
"learning_rate": 6.21511596741756e-05, |
|
"loss": 1.6804, |
|
"mean_token_accuracy": 0.6243892908096313, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.9634146341463414, |
|
"grad_norm": 0.32399213314056396, |
|
"learning_rate": 6.180675000713825e-05, |
|
"loss": 1.6321, |
|
"mean_token_accuracy": 0.6283869951963424, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.967479674796748, |
|
"grad_norm": 0.3354453444480896, |
|
"learning_rate": 6.146174603006109e-05, |
|
"loss": 1.6321, |
|
"mean_token_accuracy": 0.6299521565437317, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.9715447154471545, |
|
"grad_norm": 0.3296736180782318, |
|
"learning_rate": 6.111616510922426e-05, |
|
"loss": 1.6837, |
|
"mean_token_accuracy": 0.620937955379486, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.975609756097561, |
|
"grad_norm": 0.3506980836391449, |
|
"learning_rate": 6.0770024639949074e-05, |
|
"loss": 1.6168, |
|
"mean_token_accuracy": 0.6322689533233643, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.9796747967479674, |
|
"grad_norm": 0.32376107573509216, |
|
"learning_rate": 6.042334204572261e-05, |
|
"loss": 1.6294, |
|
"mean_token_accuracy": 0.6312895059585572, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.983739837398374, |
|
"grad_norm": 0.3357873260974884, |
|
"learning_rate": 6.0076134777320616e-05, |
|
"loss": 1.6499, |
|
"mean_token_accuracy": 0.6290972381830215, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.9878048780487805, |
|
"grad_norm": 0.33059611916542053, |
|
"learning_rate": 5.9728420311929014e-05, |
|
"loss": 1.67, |
|
"mean_token_accuracy": 0.6243463218212127, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.991869918699187, |
|
"grad_norm": 0.3249407708644867, |
|
"learning_rate": 5.938021615226431e-05, |
|
"loss": 1.6295, |
|
"mean_token_accuracy": 0.6341723084449769, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.9959349593495935, |
|
"grad_norm": 0.33168989419937134, |
|
"learning_rate": 5.9031539825692425e-05, |
|
"loss": 1.6314, |
|
"mean_token_accuracy": 0.6284798830747604, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.34545496106147766, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 1.6563, |
|
"mean_token_accuracy": 0.6242235869169235, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.6696199178695679, |
|
"eval_mean_token_accuracy": 0.6275491353942126, |
|
"eval_runtime": 386.5388, |
|
"eval_samples_per_second": 50.913, |
|
"eval_steps_per_second": 3.182, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.0040650406504066, |
|
"grad_norm": 0.3021821081638336, |
|
"learning_rate": 5.8332840899243504e-05, |
|
"loss": 1.5474, |
|
"mean_token_accuracy": 0.6429942309856415, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 1.008130081300813, |
|
"grad_norm": 0.31666696071624756, |
|
"learning_rate": 5.798285346939942e-05, |
|
"loss": 1.5409, |
|
"mean_token_accuracy": 0.6427715480327606, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.0121951219512195, |
|
"grad_norm": 0.3398872911930084, |
|
"learning_rate": 5.7632464210943726e-05, |
|
"loss": 1.5152, |
|
"mean_token_accuracy": 0.6504542022943497, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 1.016260162601626, |
|
"grad_norm": 0.385241836309433, |
|
"learning_rate": 5.728169076123251e-05, |
|
"loss": 1.528, |
|
"mean_token_accuracy": 0.6502102971076965, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.0203252032520325, |
|
"grad_norm": 0.33868077397346497, |
|
"learning_rate": 5.6930550776960686e-05, |
|
"loss": 1.5226, |
|
"mean_token_accuracy": 0.6435141801834107, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 1.024390243902439, |
|
"grad_norm": 0.35628053545951843, |
|
"learning_rate": 5.657906193327325e-05, |
|
"loss": 1.5179, |
|
"mean_token_accuracy": 0.6479565531015397, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.0284552845528456, |
|
"grad_norm": 0.33298298716545105, |
|
"learning_rate": 5.6227241922875486e-05, |
|
"loss": 1.5183, |
|
"mean_token_accuracy": 0.6497620791196823, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 1.032520325203252, |
|
"grad_norm": 0.3421778976917267, |
|
"learning_rate": 5.587510845514249e-05, |
|
"loss": 1.5566, |
|
"mean_token_accuracy": 0.6448631703853607, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.0365853658536586, |
|
"grad_norm": 0.3357449471950531, |
|
"learning_rate": 5.5522679255227695e-05, |
|
"loss": 1.5358, |
|
"mean_token_accuracy": 0.6459738403558731, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 1.040650406504065, |
|
"grad_norm": 0.34018760919570923, |
|
"learning_rate": 5.5169972063170605e-05, |
|
"loss": 1.5423, |
|
"mean_token_accuracy": 0.6445542246103286, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.0447154471544715, |
|
"grad_norm": 0.3494982421398163, |
|
"learning_rate": 5.48170046330039e-05, |
|
"loss": 1.5386, |
|
"mean_token_accuracy": 0.6470185041427612, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 1.048780487804878, |
|
"grad_norm": 0.3506191670894623, |
|
"learning_rate": 5.446379473185972e-05, |
|
"loss": 1.5143, |
|
"mean_token_accuracy": 0.6470712095499038, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.0528455284552845, |
|
"grad_norm": 0.34973621368408203, |
|
"learning_rate": 5.4110360139075336e-05, |
|
"loss": 1.5104, |
|
"mean_token_accuracy": 0.6478220403194428, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 1.056910569105691, |
|
"grad_norm": 0.3722323477268219, |
|
"learning_rate": 5.375671864529816e-05, |
|
"loss": 1.5436, |
|
"mean_token_accuracy": 0.6473211497068405, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.0609756097560976, |
|
"grad_norm": 0.35045626759529114, |
|
"learning_rate": 5.340288805159036e-05, |
|
"loss": 1.5458, |
|
"mean_token_accuracy": 0.6453307747840882, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 1.065040650406504, |
|
"grad_norm": 0.3494359254837036, |
|
"learning_rate": 5.304888616853264e-05, |
|
"loss": 1.4917, |
|
"mean_token_accuracy": 0.6537877678871155, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.0691056910569106, |
|
"grad_norm": 0.3889128863811493, |
|
"learning_rate": 5.2694730815327844e-05, |
|
"loss": 1.5313, |
|
"mean_token_accuracy": 0.6451919347047805, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 1.0731707317073171, |
|
"grad_norm": 0.36031612753868103, |
|
"learning_rate": 5.234043981890394e-05, |
|
"loss": 1.5279, |
|
"mean_token_accuracy": 0.6495060652494431, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.0772357723577235, |
|
"grad_norm": 0.3679180443286896, |
|
"learning_rate": 5.19860310130167e-05, |
|
"loss": 1.5389, |
|
"mean_token_accuracy": 0.6498639941215515, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 1.08130081300813, |
|
"grad_norm": 0.34714043140411377, |
|
"learning_rate": 5.163152223735206e-05, |
|
"loss": 1.5485, |
|
"mean_token_accuracy": 0.6419304549694062, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.0853658536585367, |
|
"grad_norm": 0.3595254719257355, |
|
"learning_rate": 5.127693133662801e-05, |
|
"loss": 1.5258, |
|
"mean_token_accuracy": 0.6485728055238724, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 1.089430894308943, |
|
"grad_norm": 0.3535126745700836, |
|
"learning_rate": 5.092227615969643e-05, |
|
"loss": 1.5355, |
|
"mean_token_accuracy": 0.6457217365503312, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.0934959349593496, |
|
"grad_norm": 0.3507843613624573, |
|
"learning_rate": 5.056757455864469e-05, |
|
"loss": 1.5089, |
|
"mean_token_accuracy": 0.6506163209676743, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 1.0975609756097562, |
|
"grad_norm": 0.3525923490524292, |
|
"learning_rate": 5.021284438789694e-05, |
|
"loss": 1.5539, |
|
"mean_token_accuracy": 0.6417040884494781, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.1016260162601625, |
|
"grad_norm": 0.34911075234413147, |
|
"learning_rate": 4.985810350331544e-05, |
|
"loss": 1.5212, |
|
"mean_token_accuracy": 0.6477119773626328, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 1.1056910569105691, |
|
"grad_norm": 0.35920506715774536, |
|
"learning_rate": 4.950336976130177e-05, |
|
"loss": 1.5358, |
|
"mean_token_accuracy": 0.6480906993150711, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.1097560975609757, |
|
"grad_norm": 0.35570406913757324, |
|
"learning_rate": 4.914866101789792e-05, |
|
"loss": 1.5024, |
|
"mean_token_accuracy": 0.6500779658555984, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 1.113821138211382, |
|
"grad_norm": 0.3491937518119812, |
|
"learning_rate": 4.8793995127887617e-05, |
|
"loss": 1.5209, |
|
"mean_token_accuracy": 0.6504259049892426, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.1178861788617886, |
|
"grad_norm": 0.34110239148139954, |
|
"learning_rate": 4.843938994389743e-05, |
|
"loss": 1.5395, |
|
"mean_token_accuracy": 0.6453883588314057, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 1.1219512195121952, |
|
"grad_norm": 0.35667678713798523, |
|
"learning_rate": 4.8084863315498234e-05, |
|
"loss": 1.5376, |
|
"mean_token_accuracy": 0.6465921342372895, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.1260162601626016, |
|
"grad_norm": 0.342572420835495, |
|
"learning_rate": 4.77304330883067e-05, |
|
"loss": 1.5157, |
|
"mean_token_accuracy": 0.6498256474733353, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 1.1300813008130082, |
|
"grad_norm": 0.34750455617904663, |
|
"learning_rate": 4.7376117103086974e-05, |
|
"loss": 1.5456, |
|
"mean_token_accuracy": 0.6450578838586807, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.1341463414634148, |
|
"grad_norm": 0.36636075377464294, |
|
"learning_rate": 4.702193319485271e-05, |
|
"loss": 1.5528, |
|
"mean_token_accuracy": 0.6424712836742401, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 1.1382113821138211, |
|
"grad_norm": 0.37054094672203064, |
|
"learning_rate": 4.666789919196922e-05, |
|
"loss": 1.5419, |
|
"mean_token_accuracy": 0.6446193605661392, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.1422764227642277, |
|
"grad_norm": 0.3575651943683624, |
|
"learning_rate": 4.6314032915256144e-05, |
|
"loss": 1.5001, |
|
"mean_token_accuracy": 0.6497335374355316, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 1.146341463414634, |
|
"grad_norm": 0.38506269454956055, |
|
"learning_rate": 4.5960352177090395e-05, |
|
"loss": 1.5412, |
|
"mean_token_accuracy": 0.6469351649284363, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.1504065040650406, |
|
"grad_norm": 0.3651314675807953, |
|
"learning_rate": 4.5606874780509474e-05, |
|
"loss": 1.52, |
|
"mean_token_accuracy": 0.6479903131723403, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 1.1544715447154472, |
|
"grad_norm": 0.36033448576927185, |
|
"learning_rate": 4.525361851831545e-05, |
|
"loss": 1.5192, |
|
"mean_token_accuracy": 0.650235790014267, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.1585365853658536, |
|
"grad_norm": 0.3604031801223755, |
|
"learning_rate": 4.4900601172179244e-05, |
|
"loss": 1.5073, |
|
"mean_token_accuracy": 0.6518182456493378, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.1626016260162602, |
|
"grad_norm": 0.3583182096481323, |
|
"learning_rate": 4.454784051174556e-05, |
|
"loss": 1.5276, |
|
"mean_token_accuracy": 0.6453711301088333, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.1666666666666667, |
|
"grad_norm": 0.3655434250831604, |
|
"learning_rate": 4.4195354293738484e-05, |
|
"loss": 1.5284, |
|
"mean_token_accuracy": 0.6483315199613571, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 1.170731707317073, |
|
"grad_norm": 0.3583129942417145, |
|
"learning_rate": 4.3843160261067655e-05, |
|
"loss": 1.5008, |
|
"mean_token_accuracy": 0.6525300085544586, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.1747967479674797, |
|
"grad_norm": 0.37054041028022766, |
|
"learning_rate": 4.34912761419351e-05, |
|
"loss": 1.5162, |
|
"mean_token_accuracy": 0.6502859503030777, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 1.1788617886178863, |
|
"grad_norm": 0.36705902218818665, |
|
"learning_rate": 4.313971964894289e-05, |
|
"loss": 1.5391, |
|
"mean_token_accuracy": 0.6450930207967758, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.1829268292682926, |
|
"grad_norm": 0.35790133476257324, |
|
"learning_rate": 4.2788508478201606e-05, |
|
"loss": 1.5179, |
|
"mean_token_accuracy": 0.6487129151821136, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 1.1869918699186992, |
|
"grad_norm": 0.3542655408382416, |
|
"learning_rate": 4.2437660308439464e-05, |
|
"loss": 1.5246, |
|
"mean_token_accuracy": 0.6480981141328812, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.1910569105691058, |
|
"grad_norm": 0.3644402325153351, |
|
"learning_rate": 4.2087192800112544e-05, |
|
"loss": 1.5448, |
|
"mean_token_accuracy": 0.646998131275177, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 1.1951219512195121, |
|
"grad_norm": 0.36414816975593567, |
|
"learning_rate": 4.1737123594515756e-05, |
|
"loss": 1.4962, |
|
"mean_token_accuracy": 0.6519364267587662, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.1991869918699187, |
|
"grad_norm": 0.36013391613960266, |
|
"learning_rate": 4.138747031289485e-05, |
|
"loss": 1.5284, |
|
"mean_token_accuracy": 0.6466194182634354, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.203252032520325, |
|
"grad_norm": 0.34742000699043274, |
|
"learning_rate": 4.1038250555559464e-05, |
|
"loss": 1.5318, |
|
"mean_token_accuracy": 0.6492583066225052, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.2073170731707317, |
|
"grad_norm": 0.3612314760684967, |
|
"learning_rate": 4.06894819009971e-05, |
|
"loss": 1.5118, |
|
"mean_token_accuracy": 0.6507156074047089, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 1.2113821138211383, |
|
"grad_norm": 0.37525928020477295, |
|
"learning_rate": 4.034118190498843e-05, |
|
"loss": 1.5078, |
|
"mean_token_accuracy": 0.6511429727077485, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.2154471544715446, |
|
"grad_norm": 0.3531548082828522, |
|
"learning_rate": 3.9993368099723427e-05, |
|
"loss": 1.5109, |
|
"mean_token_accuracy": 0.6511172980070115, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 1.2195121951219512, |
|
"grad_norm": 0.3843003511428833, |
|
"learning_rate": 3.964605799291897e-05, |
|
"loss": 1.5312, |
|
"mean_token_accuracy": 0.6471881330013275, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.2235772357723578, |
|
"grad_norm": 0.368274450302124, |
|
"learning_rate": 3.9299269066937565e-05, |
|
"loss": 1.5462, |
|
"mean_token_accuracy": 0.6470870286226272, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 1.2276422764227641, |
|
"grad_norm": 0.36539050936698914, |
|
"learning_rate": 3.895301877790728e-05, |
|
"loss": 1.5091, |
|
"mean_token_accuracy": 0.6507299602031708, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.2317073170731707, |
|
"grad_norm": 0.3680725395679474, |
|
"learning_rate": 3.8607324554843136e-05, |
|
"loss": 1.5169, |
|
"mean_token_accuracy": 0.6476110547780991, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 1.2357723577235773, |
|
"grad_norm": 0.3924301564693451, |
|
"learning_rate": 3.826220379876974e-05, |
|
"loss": 1.522, |
|
"mean_token_accuracy": 0.6474580347537995, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.2398373983739837, |
|
"grad_norm": 0.40176910161972046, |
|
"learning_rate": 3.7917673881845375e-05, |
|
"loss": 1.5285, |
|
"mean_token_accuracy": 0.6465384632349014, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 1.2439024390243902, |
|
"grad_norm": 0.3745909035205841, |
|
"learning_rate": 3.757375214648764e-05, |
|
"loss": 1.5088, |
|
"mean_token_accuracy": 0.6503387361764907, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.2479674796747968, |
|
"grad_norm": 0.362008273601532, |
|
"learning_rate": 3.7230455904500384e-05, |
|
"loss": 1.5187, |
|
"mean_token_accuracy": 0.6497048884630203, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 1.2520325203252032, |
|
"grad_norm": 0.36746859550476074, |
|
"learning_rate": 3.6887802436202305e-05, |
|
"loss": 1.4965, |
|
"mean_token_accuracy": 0.6500811547040939, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.2560975609756098, |
|
"grad_norm": 0.35370299220085144, |
|
"learning_rate": 3.6545808989557205e-05, |
|
"loss": 1.5186, |
|
"mean_token_accuracy": 0.6499067515134811, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 1.2601626016260163, |
|
"grad_norm": 0.3644496500492096, |
|
"learning_rate": 3.620449277930568e-05, |
|
"loss": 1.5136, |
|
"mean_token_accuracy": 0.65036840736866, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.2642276422764227, |
|
"grad_norm": 0.37275806069374084, |
|
"learning_rate": 3.586387098609865e-05, |
|
"loss": 1.5475, |
|
"mean_token_accuracy": 0.6439596533775329, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 1.2682926829268293, |
|
"grad_norm": 0.37953129410743713, |
|
"learning_rate": 3.5523960755632574e-05, |
|
"loss": 1.5134, |
|
"mean_token_accuracy": 0.6512956976890564, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.2723577235772359, |
|
"grad_norm": 0.36998462677001953, |
|
"learning_rate": 3.518477919778631e-05, |
|
"loss": 1.5007, |
|
"mean_token_accuracy": 0.6512791901826859, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 1.2764227642276422, |
|
"grad_norm": 0.37512996792793274, |
|
"learning_rate": 3.484634338575995e-05, |
|
"loss": 1.5248, |
|
"mean_token_accuracy": 0.6483827620744705, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.2804878048780488, |
|
"grad_norm": 0.35395580530166626, |
|
"learning_rate": 3.450867035521536e-05, |
|
"loss": 1.5037, |
|
"mean_token_accuracy": 0.6528991967439651, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 1.2845528455284554, |
|
"grad_norm": 0.3677853047847748, |
|
"learning_rate": 3.417177710341868e-05, |
|
"loss": 1.5439, |
|
"mean_token_accuracy": 0.6433329194784164, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.2886178861788617, |
|
"grad_norm": 0.36992302536964417, |
|
"learning_rate": 3.3835680588384766e-05, |
|
"loss": 1.4969, |
|
"mean_token_accuracy": 0.6538380652666091, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 1.2926829268292683, |
|
"grad_norm": 0.3684805631637573, |
|
"learning_rate": 3.350039772802354e-05, |
|
"loss": 1.4974, |
|
"mean_token_accuracy": 0.6528685718774796, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.296747967479675, |
|
"grad_norm": 0.37707433104515076, |
|
"learning_rate": 3.316594539928845e-05, |
|
"loss": 1.5342, |
|
"mean_token_accuracy": 0.6468271166086197, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 1.3008130081300813, |
|
"grad_norm": 0.38442346453666687, |
|
"learning_rate": 3.283234043732689e-05, |
|
"loss": 1.5385, |
|
"mean_token_accuracy": 0.6482626140117645, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.3048780487804879, |
|
"grad_norm": 0.4277592897415161, |
|
"learning_rate": 3.249959963463283e-05, |
|
"loss": 1.537, |
|
"mean_token_accuracy": 0.6459639608860016, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 1.3089430894308944, |
|
"grad_norm": 0.3657115399837494, |
|
"learning_rate": 3.216773974020152e-05, |
|
"loss": 1.5061, |
|
"mean_token_accuracy": 0.6512133955955506, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.3130081300813008, |
|
"grad_norm": 0.367895245552063, |
|
"learning_rate": 3.183677745868636e-05, |
|
"loss": 1.5197, |
|
"mean_token_accuracy": 0.6486563175916672, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 1.3170731707317074, |
|
"grad_norm": 0.3654331862926483, |
|
"learning_rate": 3.1506729449558184e-05, |
|
"loss": 1.5237, |
|
"mean_token_accuracy": 0.6474861919879913, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.321138211382114, |
|
"grad_norm": 0.3650592565536499, |
|
"learning_rate": 3.1177612326266484e-05, |
|
"loss": 1.5061, |
|
"mean_token_accuracy": 0.6528548806905746, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 1.3252032520325203, |
|
"grad_norm": 0.3572981357574463, |
|
"learning_rate": 3.0849442655403315e-05, |
|
"loss": 1.4948, |
|
"mean_token_accuracy": 0.6514863818883896, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.329268292682927, |
|
"grad_norm": 0.36876747012138367, |
|
"learning_rate": 3.052223695586929e-05, |
|
"loss": 1.5031, |
|
"mean_token_accuracy": 0.6546044528484345, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.3717840015888214, |
|
"learning_rate": 3.019601169804216e-05, |
|
"loss": 1.5204, |
|
"mean_token_accuracy": 0.6511706113815308, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.3373983739837398, |
|
"grad_norm": 0.36498016119003296, |
|
"learning_rate": 2.9870783302947668e-05, |
|
"loss": 1.522, |
|
"mean_token_accuracy": 0.6496157437562943, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 1.3414634146341464, |
|
"grad_norm": 0.36414778232574463, |
|
"learning_rate": 2.9546568141433006e-05, |
|
"loss": 1.512, |
|
"mean_token_accuracy": 0.6521206349134445, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.3455284552845528, |
|
"grad_norm": 0.3626897931098938, |
|
"learning_rate": 2.9223382533342826e-05, |
|
"loss": 1.5163, |
|
"mean_token_accuracy": 0.6505622148513794, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 1.3495934959349594, |
|
"grad_norm": 0.3667406141757965, |
|
"learning_rate": 2.8901242746697638e-05, |
|
"loss": 1.5277, |
|
"mean_token_accuracy": 0.6480904757976532, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.3536585365853657, |
|
"grad_norm": 0.381999135017395, |
|
"learning_rate": 2.858016499687503e-05, |
|
"loss": 1.5193, |
|
"mean_token_accuracy": 0.6478623539209366, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 1.3577235772357723, |
|
"grad_norm": 0.38897019624710083, |
|
"learning_rate": 2.8260165445793418e-05, |
|
"loss": 1.5284, |
|
"mean_token_accuracy": 0.6481962502002716, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.3617886178861789, |
|
"grad_norm": 0.3818219006061554, |
|
"learning_rate": 2.7941260201098514e-05, |
|
"loss": 1.5105, |
|
"mean_token_accuracy": 0.6506875902414322, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 1.3658536585365852, |
|
"grad_norm": 0.3766973614692688, |
|
"learning_rate": 2.762346531535246e-05, |
|
"loss": 1.497, |
|
"mean_token_accuracy": 0.6518009305000305, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.3699186991869918, |
|
"grad_norm": 0.39048492908477783, |
|
"learning_rate": 2.730679678522592e-05, |
|
"loss": 1.5164, |
|
"mean_token_accuracy": 0.6483979940414428, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 1.3739837398373984, |
|
"grad_norm": 0.3916437327861786, |
|
"learning_rate": 2.6991270550692794e-05, |
|
"loss": 1.5322, |
|
"mean_token_accuracy": 0.6441838830709458, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.3780487804878048, |
|
"grad_norm": 0.3581782579421997, |
|
"learning_rate": 2.6676902494227795e-05, |
|
"loss": 1.5068, |
|
"mean_token_accuracy": 0.6516738593578338, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 1.3821138211382114, |
|
"grad_norm": 0.3775046765804291, |
|
"learning_rate": 2.6363708440007133e-05, |
|
"loss": 1.5539, |
|
"mean_token_accuracy": 0.644507572054863, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.386178861788618, |
|
"grad_norm": 0.37033286690711975, |
|
"learning_rate": 2.6051704153111845e-05, |
|
"loss": 1.542, |
|
"mean_token_accuracy": 0.647097697854042, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 1.3902439024390243, |
|
"grad_norm": 0.37243974208831787, |
|
"learning_rate": 2.574090533873431e-05, |
|
"loss": 1.5082, |
|
"mean_token_accuracy": 0.6518228441476822, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.3943089430894309, |
|
"grad_norm": 0.37527644634246826, |
|
"learning_rate": 2.543132764138768e-05, |
|
"loss": 1.5117, |
|
"mean_token_accuracy": 0.6485153377056122, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 1.3983739837398375, |
|
"grad_norm": 0.3756648004055023, |
|
"learning_rate": 2.5122986644118407e-05, |
|
"loss": 1.5182, |
|
"mean_token_accuracy": 0.6499085307121277, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.4024390243902438, |
|
"grad_norm": 0.3746618628501892, |
|
"learning_rate": 2.4815897867721784e-05, |
|
"loss": 1.4912, |
|
"mean_token_accuracy": 0.6532864809036255, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 1.4065040650406504, |
|
"grad_norm": 0.37365466356277466, |
|
"learning_rate": 2.451007676996078e-05, |
|
"loss": 1.5155, |
|
"mean_token_accuracy": 0.6480746299028397, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.410569105691057, |
|
"grad_norm": 0.3754975199699402, |
|
"learning_rate": 2.4205538744787903e-05, |
|
"loss": 1.5161, |
|
"mean_token_accuracy": 0.6481755256652832, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 1.4146341463414633, |
|
"grad_norm": 0.35210585594177246, |
|
"learning_rate": 2.3902299121570333e-05, |
|
"loss": 1.4837, |
|
"mean_token_accuracy": 0.6545587033033371, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.41869918699187, |
|
"grad_norm": 0.37373027205467224, |
|
"learning_rate": 2.360037316431823e-05, |
|
"loss": 1.5127, |
|
"mean_token_accuracy": 0.6513051509857177, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 1.4227642276422765, |
|
"grad_norm": 0.3772600591182709, |
|
"learning_rate": 2.3299776070916517e-05, |
|
"loss": 1.5163, |
|
"mean_token_accuracy": 0.6458168059587479, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.4268292682926829, |
|
"grad_norm": 0.3841407597064972, |
|
"learning_rate": 2.30005229723598e-05, |
|
"loss": 1.4891, |
|
"mean_token_accuracy": 0.6541343659162522, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 1.4308943089430894, |
|
"grad_norm": 0.35385993123054504, |
|
"learning_rate": 2.27026289319907e-05, |
|
"loss": 1.5384, |
|
"mean_token_accuracy": 0.6474617034196853, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.434959349593496, |
|
"grad_norm": 0.39387115836143494, |
|
"learning_rate": 2.2406108944741695e-05, |
|
"loss": 1.5422, |
|
"mean_token_accuracy": 0.6462881207466126, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 1.4390243902439024, |
|
"grad_norm": 0.35977375507354736, |
|
"learning_rate": 2.211097793638029e-05, |
|
"loss": 1.5234, |
|
"mean_token_accuracy": 0.6480515390634537, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.443089430894309, |
|
"grad_norm": 0.37408530712127686, |
|
"learning_rate": 2.1817250762757657e-05, |
|
"loss": 1.502, |
|
"mean_token_accuracy": 0.6521117597818374, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 1.4471544715447155, |
|
"grad_norm": 0.3536774516105652, |
|
"learning_rate": 2.1524942209060945e-05, |
|
"loss": 1.5278, |
|
"mean_token_accuracy": 0.6484600365161896, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.451219512195122, |
|
"grad_norm": 0.3701123297214508, |
|
"learning_rate": 2.1234066989068972e-05, |
|
"loss": 1.5088, |
|
"mean_token_accuracy": 0.6488320469856262, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 1.4552845528455285, |
|
"grad_norm": 0.37546491622924805, |
|
"learning_rate": 2.0944639744411625e-05, |
|
"loss": 1.5285, |
|
"mean_token_accuracy": 0.6473397225141525, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.459349593495935, |
|
"grad_norm": 0.3779871165752411, |
|
"learning_rate": 2.065667504383276e-05, |
|
"loss": 1.5022, |
|
"mean_token_accuracy": 0.6534674495458603, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 1.4634146341463414, |
|
"grad_norm": 0.3659263551235199, |
|
"learning_rate": 2.0370187382457068e-05, |
|
"loss": 1.4921, |
|
"mean_token_accuracy": 0.6546839594841003, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.467479674796748, |
|
"grad_norm": 0.37984707951545715, |
|
"learning_rate": 2.0085191181060175e-05, |
|
"loss": 1.5032, |
|
"mean_token_accuracy": 0.6512989819049835, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 1.4715447154471546, |
|
"grad_norm": 0.39589378237724304, |
|
"learning_rate": 1.980170078534297e-05, |
|
"loss": 1.5252, |
|
"mean_token_accuracy": 0.6502268701791764, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.475609756097561, |
|
"grad_norm": 0.3680674731731415, |
|
"learning_rate": 1.9519730465209384e-05, |
|
"loss": 1.5063, |
|
"mean_token_accuracy": 0.649566239118576, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 1.4796747967479675, |
|
"grad_norm": 0.36867427825927734, |
|
"learning_rate": 1.9239294414048144e-05, |
|
"loss": 1.5129, |
|
"mean_token_accuracy": 0.6540564984083176, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.4837398373983741, |
|
"grad_norm": 0.3737272322177887, |
|
"learning_rate": 1.8960406748018227e-05, |
|
"loss": 1.5382, |
|
"mean_token_accuracy": 0.6441423952579498, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 1.4878048780487805, |
|
"grad_norm": 0.3629378080368042, |
|
"learning_rate": 1.868308150533847e-05, |
|
"loss": 1.5239, |
|
"mean_token_accuracy": 0.6493414402008056, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.491869918699187, |
|
"grad_norm": 0.37252160906791687, |
|
"learning_rate": 1.8407332645580804e-05, |
|
"loss": 1.4978, |
|
"mean_token_accuracy": 0.6547783106565476, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 1.4959349593495934, |
|
"grad_norm": 0.3749361038208008, |
|
"learning_rate": 1.8133174048967598e-05, |
|
"loss": 1.5111, |
|
"mean_token_accuracy": 0.6501660078763962, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.3704673945903778, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 1.4988, |
|
"mean_token_accuracy": 0.6547404527664185, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 1.5040650406504064, |
|
"grad_norm": 0.36430564522743225, |
|
"learning_rate": 1.7589682765128425e-05, |
|
"loss": 1.5097, |
|
"mean_token_accuracy": 0.6511308133602143, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.5081300813008132, |
|
"grad_norm": 0.3664015531539917, |
|
"learning_rate": 1.7320377435331558e-05, |
|
"loss": 1.5139, |
|
"mean_token_accuracy": 0.6519527226686478, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 1.5121951219512195, |
|
"grad_norm": 0.37101101875305176, |
|
"learning_rate": 1.7052717082160346e-05, |
|
"loss": 1.5164, |
|
"mean_token_accuracy": 0.6514888644218445, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.5162601626016259, |
|
"grad_norm": 0.3765888214111328, |
|
"learning_rate": 1.678671517869037e-05, |
|
"loss": 1.5032, |
|
"mean_token_accuracy": 0.6533948063850403, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 1.5203252032520327, |
|
"grad_norm": 0.3664512038230896, |
|
"learning_rate": 1.652238511451668e-05, |
|
"loss": 1.5174, |
|
"mean_token_accuracy": 0.6536636203527451, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.524390243902439, |
|
"grad_norm": 0.3818027377128601, |
|
"learning_rate": 1.6259740195079903e-05, |
|
"loss": 1.5371, |
|
"mean_token_accuracy": 0.6448280185461044, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 1.5284552845528454, |
|
"grad_norm": 0.36696919798851013, |
|
"learning_rate": 1.599879364099642e-05, |
|
"loss": 1.51, |
|
"mean_token_accuracy": 0.6512410163879394, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.532520325203252, |
|
"grad_norm": 0.3844444751739502, |
|
"learning_rate": 1.573955858739289e-05, |
|
"loss": 1.5026, |
|
"mean_token_accuracy": 0.6513481706380844, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 1.5365853658536586, |
|
"grad_norm": 0.36545833945274353, |
|
"learning_rate": 1.5482048083245114e-05, |
|
"loss": 1.5417, |
|
"mean_token_accuracy": 0.64716956615448, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.540650406504065, |
|
"grad_norm": 0.37220799922943115, |
|
"learning_rate": 1.5226275090721181e-05, |
|
"loss": 1.4983, |
|
"mean_token_accuracy": 0.6524787664413452, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 1.5447154471544715, |
|
"grad_norm": 0.3821048438549042, |
|
"learning_rate": 1.4972252484528937e-05, |
|
"loss": 1.5407, |
|
"mean_token_accuracy": 0.6467223703861237, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.548780487804878, |
|
"grad_norm": 0.368897944688797, |
|
"learning_rate": 1.4719993051268022e-05, |
|
"loss": 1.5303, |
|
"mean_token_accuracy": 0.6483209669589997, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 1.5528455284552845, |
|
"grad_norm": 0.3617345094680786, |
|
"learning_rate": 1.4469509488786165e-05, |
|
"loss": 1.494, |
|
"mean_token_accuracy": 0.6526229411363602, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.556910569105691, |
|
"grad_norm": 0.36573946475982666, |
|
"learning_rate": 1.4220814405540067e-05, |
|
"loss": 1.4986, |
|
"mean_token_accuracy": 0.6525467038154602, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 1.5609756097560976, |
|
"grad_norm": 0.375692218542099, |
|
"learning_rate": 1.3973920319960655e-05, |
|
"loss": 1.5278, |
|
"mean_token_accuracy": 0.6468215733766556, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.565040650406504, |
|
"grad_norm": 0.3687947690486908, |
|
"learning_rate": 1.3728839659823045e-05, |
|
"loss": 1.4991, |
|
"mean_token_accuracy": 0.6529325067996978, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 1.5691056910569106, |
|
"grad_norm": 0.3606472313404083, |
|
"learning_rate": 1.348558476162094e-05, |
|
"loss": 1.5135, |
|
"mean_token_accuracy": 0.6529798805713654, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.5731707317073171, |
|
"grad_norm": 0.3836788833141327, |
|
"learning_rate": 1.3244167869945589e-05, |
|
"loss": 1.5119, |
|
"mean_token_accuracy": 0.6502353459596634, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 1.5772357723577235, |
|
"grad_norm": 0.3860268294811249, |
|
"learning_rate": 1.3004601136869554e-05, |
|
"loss": 1.525, |
|
"mean_token_accuracy": 0.6472799718379975, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.58130081300813, |
|
"grad_norm": 0.37033963203430176, |
|
"learning_rate": 1.2766896621334929e-05, |
|
"loss": 1.4882, |
|
"mean_token_accuracy": 0.6510943710803986, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 1.5853658536585367, |
|
"grad_norm": 0.3663535714149475, |
|
"learning_rate": 1.253106628854635e-05, |
|
"loss": 1.4859, |
|
"mean_token_accuracy": 0.6542395889759064, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.589430894308943, |
|
"grad_norm": 0.36218035221099854, |
|
"learning_rate": 1.2297122009368738e-05, |
|
"loss": 1.4916, |
|
"mean_token_accuracy": 0.6534947186708451, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 1.5934959349593496, |
|
"grad_norm": 0.373336523771286, |
|
"learning_rate": 1.206507555972975e-05, |
|
"loss": 1.5036, |
|
"mean_token_accuracy": 0.6506408721208572, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.5975609756097562, |
|
"grad_norm": 0.3867935240268707, |
|
"learning_rate": 1.183493862002702e-05, |
|
"loss": 1.4974, |
|
"mean_token_accuracy": 0.6530610293149948, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 1.6016260162601625, |
|
"grad_norm": 0.3730136752128601, |
|
"learning_rate": 1.1606722774540146e-05, |
|
"loss": 1.5286, |
|
"mean_token_accuracy": 0.6463487088680268, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.6056910569105691, |
|
"grad_norm": 0.3694207966327667, |
|
"learning_rate": 1.1380439510847756e-05, |
|
"loss": 1.4613, |
|
"mean_token_accuracy": 0.65815409719944, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 1.6097560975609757, |
|
"grad_norm": 0.38110825419425964, |
|
"learning_rate": 1.1156100219249022e-05, |
|
"loss": 1.4843, |
|
"mean_token_accuracy": 0.6552929699420929, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.613821138211382, |
|
"grad_norm": 0.38159748911857605, |
|
"learning_rate": 1.0933716192190502e-05, |
|
"loss": 1.516, |
|
"mean_token_accuracy": 0.6525053381919861, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 1.6178861788617886, |
|
"grad_norm": 0.3856993019580841, |
|
"learning_rate": 1.0713298623697655e-05, |
|
"loss": 1.5262, |
|
"mean_token_accuracy": 0.6481319755315781, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.6219512195121952, |
|
"grad_norm": 0.3722607493400574, |
|
"learning_rate": 1.0494858608811326e-05, |
|
"loss": 1.5144, |
|
"mean_token_accuracy": 0.6509221941232681, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 1.6260162601626016, |
|
"grad_norm": 0.37401559948921204, |
|
"learning_rate": 1.0278407143029345e-05, |
|
"loss": 1.5211, |
|
"mean_token_accuracy": 0.6487148553133011, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.6300813008130082, |
|
"grad_norm": 0.38588014245033264, |
|
"learning_rate": 1.0063955121753e-05, |
|
"loss": 1.5062, |
|
"mean_token_accuracy": 0.6512324333190918, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 1.6341463414634148, |
|
"grad_norm": 0.3976629078388214, |
|
"learning_rate": 9.851513339738628e-06, |
|
"loss": 1.517, |
|
"mean_token_accuracy": 0.6514996409416198, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.6382113821138211, |
|
"grad_norm": 0.37725597620010376, |
|
"learning_rate": 9.641092490554193e-06, |
|
"loss": 1.5141, |
|
"mean_token_accuracy": 0.6503479897975921, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 1.6422764227642277, |
|
"grad_norm": 0.3505396544933319, |
|
"learning_rate": 9.432703166041086e-06, |
|
"loss": 1.4817, |
|
"mean_token_accuracy": 0.6531630903482437, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.6463414634146343, |
|
"grad_norm": 0.37985649704933167, |
|
"learning_rate": 9.226355855780921e-06, |
|
"loss": 1.5201, |
|
"mean_token_accuracy": 0.6483439028263092, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 1.6504065040650406, |
|
"grad_norm": 0.3680102825164795, |
|
"learning_rate": 9.022060946567513e-06, |
|
"loss": 1.4834, |
|
"mean_token_accuracy": 0.6544080674648285, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.654471544715447, |
|
"grad_norm": 0.37355953454971313, |
|
"learning_rate": 8.819828721884093e-06, |
|
"loss": 1.5259, |
|
"mean_token_accuracy": 0.6490114808082581, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 1.6585365853658538, |
|
"grad_norm": 0.3733120560646057, |
|
"learning_rate": 8.619669361385663e-06, |
|
"loss": 1.5235, |
|
"mean_token_accuracy": 0.6490246951580048, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.6626016260162602, |
|
"grad_norm": 0.37461936473846436, |
|
"learning_rate": 8.421592940386514e-06, |
|
"loss": 1.5048, |
|
"mean_token_accuracy": 0.6523201078176498, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.37563464045524597, |
|
"learning_rate": 8.225609429353187e-06, |
|
"loss": 1.4882, |
|
"mean_token_accuracy": 0.6549115240573883, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.6707317073170733, |
|
"grad_norm": 0.387526273727417, |
|
"learning_rate": 8.031728693402502e-06, |
|
"loss": 1.5324, |
|
"mean_token_accuracy": 0.6482354193925858, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 1.6747967479674797, |
|
"grad_norm": 0.3791826069355011, |
|
"learning_rate": 7.83996049180505e-06, |
|
"loss": 1.526, |
|
"mean_token_accuracy": 0.6454828530550003, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.678861788617886, |
|
"grad_norm": 0.37179654836654663, |
|
"learning_rate": 7.650314477493875e-06, |
|
"loss": 1.4918, |
|
"mean_token_accuracy": 0.6567697405815125, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 1.6829268292682928, |
|
"grad_norm": 0.3635365068912506, |
|
"learning_rate": 7.462800196578662e-06, |
|
"loss": 1.4822, |
|
"mean_token_accuracy": 0.6493952751159668, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.6869918699186992, |
|
"grad_norm": 0.37180209159851074, |
|
"learning_rate": 7.277427087865124e-06, |
|
"loss": 1.5132, |
|
"mean_token_accuracy": 0.6481129467487335, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 1.6910569105691056, |
|
"grad_norm": 0.36996957659721375, |
|
"learning_rate": 7.094204482379985e-06, |
|
"loss": 1.5311, |
|
"mean_token_accuracy": 0.6489183723926544, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.6951219512195121, |
|
"grad_norm": 0.3729800581932068, |
|
"learning_rate": 6.913141602901213e-06, |
|
"loss": 1.5113, |
|
"mean_token_accuracy": 0.649386289715767, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 1.6991869918699187, |
|
"grad_norm": 0.369079053401947, |
|
"learning_rate": 6.734247563493828e-06, |
|
"loss": 1.5302, |
|
"mean_token_accuracy": 0.6484282493591309, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.703252032520325, |
|
"grad_norm": 0.3800000250339508, |
|
"learning_rate": 6.55753136905109e-06, |
|
"loss": 1.4997, |
|
"mean_token_accuracy": 0.6552641421556473, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 1.7073170731707317, |
|
"grad_norm": 0.3704945147037506, |
|
"learning_rate": 6.3830019148412525e-06, |
|
"loss": 1.5183, |
|
"mean_token_accuracy": 0.65414277613163, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.7113821138211383, |
|
"grad_norm": 0.39623135328292847, |
|
"learning_rate": 6.21066798605982e-06, |
|
"loss": 1.5003, |
|
"mean_token_accuracy": 0.6531934112310409, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 1.7154471544715446, |
|
"grad_norm": 0.37291619181632996, |
|
"learning_rate": 6.040538257387268e-06, |
|
"loss": 1.5234, |
|
"mean_token_accuracy": 0.647585466504097, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.7195121951219512, |
|
"grad_norm": 0.36963507533073425, |
|
"learning_rate": 5.872621292552477e-06, |
|
"loss": 1.5052, |
|
"mean_token_accuracy": 0.6535750359296799, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 1.7235772357723578, |
|
"grad_norm": 0.3857491910457611, |
|
"learning_rate": 5.706925543901609e-06, |
|
"loss": 1.5175, |
|
"mean_token_accuracy": 0.6459192246198654, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.7276422764227641, |
|
"grad_norm": 0.37364619970321655, |
|
"learning_rate": 5.543459351972635e-06, |
|
"loss": 1.5155, |
|
"mean_token_accuracy": 0.6489588230848312, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 1.7317073170731707, |
|
"grad_norm": 0.38043534755706787, |
|
"learning_rate": 5.382230945075556e-06, |
|
"loss": 1.4989, |
|
"mean_token_accuracy": 0.6491947323083878, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.7357723577235773, |
|
"grad_norm": 0.3708847761154175, |
|
"learning_rate": 5.223248438878176e-06, |
|
"loss": 1.4897, |
|
"mean_token_accuracy": 0.6540365964174271, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 1.7398373983739837, |
|
"grad_norm": 0.36768320202827454, |
|
"learning_rate": 5.066519835997613e-06, |
|
"loss": 1.4793, |
|
"mean_token_accuracy": 0.6576759397983551, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.7439024390243902, |
|
"grad_norm": 0.3785724639892578, |
|
"learning_rate": 4.912053025597429e-06, |
|
"loss": 1.5215, |
|
"mean_token_accuracy": 0.650973778963089, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 1.7479674796747968, |
|
"grad_norm": 0.36418020725250244, |
|
"learning_rate": 4.759855782990591e-06, |
|
"loss": 1.5007, |
|
"mean_token_accuracy": 0.6525840789079667, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.7520325203252032, |
|
"grad_norm": 0.38290679454803467, |
|
"learning_rate": 4.609935769248025e-06, |
|
"loss": 1.5164, |
|
"mean_token_accuracy": 0.6524971485137939, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 1.7560975609756098, |
|
"grad_norm": 0.37019696831703186, |
|
"learning_rate": 4.462300530813024e-06, |
|
"loss": 1.515, |
|
"mean_token_accuracy": 0.651192557811737, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.7601626016260163, |
|
"grad_norm": 0.384054034948349, |
|
"learning_rate": 4.316957499121377e-06, |
|
"loss": 1.5096, |
|
"mean_token_accuracy": 0.6498452335596084, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 1.7642276422764227, |
|
"grad_norm": 0.38844171166419983, |
|
"learning_rate": 4.173913990227251e-06, |
|
"loss": 1.5294, |
|
"mean_token_accuracy": 0.6471045762300491, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.7682926829268293, |
|
"grad_norm": 0.38293302059173584, |
|
"learning_rate": 4.033177204435024e-06, |
|
"loss": 1.4815, |
|
"mean_token_accuracy": 0.6569824278354645, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 1.7723577235772359, |
|
"grad_norm": 0.36921030282974243, |
|
"learning_rate": 3.894754225936753e-06, |
|
"loss": 1.5163, |
|
"mean_token_accuracy": 0.6482936680316925, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.7764227642276422, |
|
"grad_norm": 0.3771874010562897, |
|
"learning_rate": 3.7586520224556444e-06, |
|
"loss": 1.4938, |
|
"mean_token_accuracy": 0.6513696044683457, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 1.7804878048780488, |
|
"grad_norm": 0.37873375415802, |
|
"learning_rate": 3.6248774448952695e-06, |
|
"loss": 1.4998, |
|
"mean_token_accuracy": 0.653925022482872, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.7845528455284554, |
|
"grad_norm": 0.36933666467666626, |
|
"learning_rate": 3.4934372269947614e-06, |
|
"loss": 1.481, |
|
"mean_token_accuracy": 0.655310583114624, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 1.7886178861788617, |
|
"grad_norm": 0.37470993399620056, |
|
"learning_rate": 3.364337984989846e-06, |
|
"loss": 1.4909, |
|
"mean_token_accuracy": 0.6524089992046356, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.7926829268292683, |
|
"grad_norm": 0.37907570600509644, |
|
"learning_rate": 3.2375862172797864e-06, |
|
"loss": 1.4918, |
|
"mean_token_accuracy": 0.6546390771865844, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 1.796747967479675, |
|
"grad_norm": 0.38741055130958557, |
|
"learning_rate": 3.113188304100306e-06, |
|
"loss": 1.5131, |
|
"mean_token_accuracy": 0.6526689112186432, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.8008130081300813, |
|
"grad_norm": 0.38547375798225403, |
|
"learning_rate": 2.991150507202417e-06, |
|
"loss": 1.5267, |
|
"mean_token_accuracy": 0.6478798180818558, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 1.8048780487804879, |
|
"grad_norm": 0.3654401898384094, |
|
"learning_rate": 2.871478969537206e-06, |
|
"loss": 1.4988, |
|
"mean_token_accuracy": 0.652685621380806, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.8089430894308944, |
|
"grad_norm": 0.3701361417770386, |
|
"learning_rate": 2.754179714946653e-06, |
|
"loss": 1.4904, |
|
"mean_token_accuracy": 0.6520592629909515, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 1.8130081300813008, |
|
"grad_norm": 0.3756057918071747, |
|
"learning_rate": 2.6392586478603986e-06, |
|
"loss": 1.4888, |
|
"mean_token_accuracy": 0.6553830564022064, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.8170731707317072, |
|
"grad_norm": 0.380594402551651, |
|
"learning_rate": 2.5267215529985342e-06, |
|
"loss": 1.5033, |
|
"mean_token_accuracy": 0.6523683369159698, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 1.821138211382114, |
|
"grad_norm": 0.3656398057937622, |
|
"learning_rate": 2.4165740950804038e-06, |
|
"loss": 1.492, |
|
"mean_token_accuracy": 0.6539373815059661, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.8252032520325203, |
|
"grad_norm": 0.37416109442710876, |
|
"learning_rate": 2.3088218185395193e-06, |
|
"loss": 1.5243, |
|
"mean_token_accuracy": 0.6486186146736145, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 1.8292682926829267, |
|
"grad_norm": 0.36181768774986267, |
|
"learning_rate": 2.203470147244385e-06, |
|
"loss": 1.4839, |
|
"mean_token_accuracy": 0.655050304532051, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.8333333333333335, |
|
"grad_norm": 0.3828342854976654, |
|
"learning_rate": 2.100524384225555e-06, |
|
"loss": 1.5528, |
|
"mean_token_accuracy": 0.6463040858507156, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 1.8373983739837398, |
|
"grad_norm": 0.373202919960022, |
|
"learning_rate": 1.999989711408662e-06, |
|
"loss": 1.5023, |
|
"mean_token_accuracy": 0.6541062653064728, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.8414634146341462, |
|
"grad_norm": 0.3628922402858734, |
|
"learning_rate": 1.901871189353599e-06, |
|
"loss": 1.5174, |
|
"mean_token_accuracy": 0.6499102711677551, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 1.845528455284553, |
|
"grad_norm": 0.3882359266281128, |
|
"learning_rate": 1.8061737569997405e-06, |
|
"loss": 1.5342, |
|
"mean_token_accuracy": 0.6476192146539688, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.8495934959349594, |
|
"grad_norm": 0.3723077178001404, |
|
"learning_rate": 1.7129022314174014e-06, |
|
"loss": 1.4868, |
|
"mean_token_accuracy": 0.6531976610422134, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 1.8536585365853657, |
|
"grad_norm": 0.39124155044555664, |
|
"learning_rate": 1.6220613075653202e-06, |
|
"loss": 1.5192, |
|
"mean_token_accuracy": 0.6473666697740554, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.8577235772357723, |
|
"grad_norm": 0.36488935351371765, |
|
"learning_rate": 1.5336555580543255e-06, |
|
"loss": 1.492, |
|
"mean_token_accuracy": 0.6551815241575241, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 1.8617886178861789, |
|
"grad_norm": 0.3742678165435791, |
|
"learning_rate": 1.4476894329172042e-06, |
|
"loss": 1.4897, |
|
"mean_token_accuracy": 0.6530262529850006, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.8658536585365852, |
|
"grad_norm": 0.3836856484413147, |
|
"learning_rate": 1.3641672593846632e-06, |
|
"loss": 1.5057, |
|
"mean_token_accuracy": 0.6508428394794464, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 1.8699186991869918, |
|
"grad_norm": 0.36993157863616943, |
|
"learning_rate": 1.2830932416675323e-06, |
|
"loss": 1.5363, |
|
"mean_token_accuracy": 0.6465236663818359, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.8739837398373984, |
|
"grad_norm": 0.3744891881942749, |
|
"learning_rate": 1.2044714607451434e-06, |
|
"loss": 1.4918, |
|
"mean_token_accuracy": 0.6547666847705841, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 1.8780487804878048, |
|
"grad_norm": 0.37403053045272827, |
|
"learning_rate": 1.128305874159896e-06, |
|
"loss": 1.4939, |
|
"mean_token_accuracy": 0.649156105518341, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.8821138211382114, |
|
"grad_norm": 0.39576706290245056, |
|
"learning_rate": 1.0546003158180496e-06, |
|
"loss": 1.5251, |
|
"mean_token_accuracy": 0.6455378264188767, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 1.886178861788618, |
|
"grad_norm": 0.38050803542137146, |
|
"learning_rate": 9.83358495796749e-07, |
|
"loss": 1.5142, |
|
"mean_token_accuracy": 0.64997338950634, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.8902439024390243, |
|
"grad_norm": 0.38642528653144836, |
|
"learning_rate": 9.145840001572537e-07, |
|
"loss": 1.5159, |
|
"mean_token_accuracy": 0.6485857903957367, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 1.8943089430894309, |
|
"grad_norm": 0.38081294298171997, |
|
"learning_rate": 8.482802907644527e-07, |
|
"loss": 1.496, |
|
"mean_token_accuracy": 0.6498279809951782, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.8983739837398375, |
|
"grad_norm": 0.38391977548599243, |
|
"learning_rate": 7.844507051125937e-07, |
|
"loss": 1.492, |
|
"mean_token_accuracy": 0.6563570469617843, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 1.9024390243902438, |
|
"grad_norm": 0.38170570135116577, |
|
"learning_rate": 7.230984561572729e-07, |
|
"loss": 1.4996, |
|
"mean_token_accuracy": 0.6526839107275009, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.9065040650406504, |
|
"grad_norm": 0.3678451478481293, |
|
"learning_rate": 6.642266321537249e-07, |
|
"loss": 1.5152, |
|
"mean_token_accuracy": 0.6528177976608276, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 1.910569105691057, |
|
"grad_norm": 0.38829123973846436, |
|
"learning_rate": 6.078381965013646e-07, |
|
"loss": 1.5029, |
|
"mean_token_accuracy": 0.6528663575649262, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.9146341463414633, |
|
"grad_norm": 0.3702267110347748, |
|
"learning_rate": 5.53935987594617e-07, |
|
"loss": 1.4898, |
|
"mean_token_accuracy": 0.6556541055440903, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 1.91869918699187, |
|
"grad_norm": 0.3718504309654236, |
|
"learning_rate": 5.025227186800652e-07, |
|
"loss": 1.493, |
|
"mean_token_accuracy": 0.6525381654500961, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.9227642276422765, |
|
"grad_norm": 0.363751620054245, |
|
"learning_rate": 4.5360097771982023e-07, |
|
"loss": 1.4687, |
|
"mean_token_accuracy": 0.6579646080732345, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 1.9268292682926829, |
|
"grad_norm": 0.38055574893951416, |
|
"learning_rate": 4.071732272613149e-07, |
|
"loss": 1.4911, |
|
"mean_token_accuracy": 0.6529185950756073, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.9308943089430894, |
|
"grad_norm": 0.39423316717147827, |
|
"learning_rate": 3.632418043133079e-07, |
|
"loss": 1.5252, |
|
"mean_token_accuracy": 0.6489736258983612, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 1.934959349593496, |
|
"grad_norm": 0.3870586156845093, |
|
"learning_rate": 3.21808920228267e-07, |
|
"loss": 1.5149, |
|
"mean_token_accuracy": 0.651541343331337, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.9390243902439024, |
|
"grad_norm": 0.3715446889400482, |
|
"learning_rate": 2.828766605910471e-07, |
|
"loss": 1.4959, |
|
"mean_token_accuracy": 0.6530480951070785, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 1.943089430894309, |
|
"grad_norm": 0.37665361166000366, |
|
"learning_rate": 2.464469851139073e-07, |
|
"loss": 1.4895, |
|
"mean_token_accuracy": 0.6550248712301254, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.9471544715447155, |
|
"grad_norm": 0.3780977725982666, |
|
"learning_rate": 2.1252172753787324e-07, |
|
"loss": 1.5185, |
|
"mean_token_accuracy": 0.6447298586368561, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 1.951219512195122, |
|
"grad_norm": 0.3623560667037964, |
|
"learning_rate": 1.811025955404333e-07, |
|
"loss": 1.4761, |
|
"mean_token_accuracy": 0.6539519965648651, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.9552845528455285, |
|
"grad_norm": 0.37359219789505005, |
|
"learning_rate": 1.5219117064957932e-07, |
|
"loss": 1.4972, |
|
"mean_token_accuracy": 0.6537485092878341, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 1.959349593495935, |
|
"grad_norm": 0.3895682096481323, |
|
"learning_rate": 1.257889081641872e-07, |
|
"loss": 1.5007, |
|
"mean_token_accuracy": 0.6517164409160614, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.9634146341463414, |
|
"grad_norm": 0.37818899750709534, |
|
"learning_rate": 1.0189713708078085e-07, |
|
"loss": 1.5182, |
|
"mean_token_accuracy": 0.651473867893219, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 1.967479674796748, |
|
"grad_norm": 0.3663657307624817, |
|
"learning_rate": 8.051706002661919e-08, |
|
"loss": 1.5082, |
|
"mean_token_accuracy": 0.6531074911355972, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.9715447154471546, |
|
"grad_norm": 0.37170493602752686, |
|
"learning_rate": 6.164975319917222e-08, |
|
"loss": 1.503, |
|
"mean_token_accuracy": 0.6515372186899185, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 1.975609756097561, |
|
"grad_norm": 0.3812311887741089, |
|
"learning_rate": 4.529616631193112e-08, |
|
"loss": 1.4928, |
|
"mean_token_accuracy": 0.6511427491903305, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.9796747967479673, |
|
"grad_norm": 0.36720356345176697, |
|
"learning_rate": 3.1457122546635354e-08, |
|
"loss": 1.4966, |
|
"mean_token_accuracy": 0.6511392682790756, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 1.9837398373983741, |
|
"grad_norm": 0.38245534896850586, |
|
"learning_rate": 2.0133318511800224e-08, |
|
"loss": 1.4962, |
|
"mean_token_accuracy": 0.6507306933403015, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.9878048780487805, |
|
"grad_norm": 0.3702641725540161, |
|
"learning_rate": 1.1325324207667188e-08, |
|
"loss": 1.4909, |
|
"mean_token_accuracy": 0.654961907863617, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 1.9918699186991868, |
|
"grad_norm": 0.394258052110672, |
|
"learning_rate": 5.033582997526765e-09, |
|
"loss": 1.4929, |
|
"mean_token_accuracy": 0.655028885602951, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.9959349593495936, |
|
"grad_norm": 0.3731299936771393, |
|
"learning_rate": 1.2584115853808699e-09, |
|
"loss": 1.5097, |
|
"mean_token_accuracy": 0.6516744375228882, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.37480729818344116, |
|
"learning_rate": 0.0, |
|
"loss": 1.5041, |
|
"mean_token_accuracy": 0.6515572875738144, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.6576701402664185, |
|
"eval_mean_token_accuracy": 0.6307092761121145, |
|
"eval_runtime": 386.1757, |
|
"eval_samples_per_second": 50.961, |
|
"eval_steps_per_second": 3.185, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 2460, |
|
"total_flos": 7.766431165631693e+17, |
|
"train_loss": 1.623661061418735, |
|
"train_runtime": 10718.087, |
|
"train_samples_per_second": 14.689, |
|
"train_steps_per_second": 0.23 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2460, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 3000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.766431165631693e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|