|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 368, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01358695652173913, |
|
"grad_norm": 20.125, |
|
"learning_rate": 2.702702702702703e-06, |
|
"loss": 1.708, |
|
"mean_token_accuracy": 0.6925400972366333, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.02717391304347826, |
|
"grad_norm": 12.75, |
|
"learning_rate": 5.405405405405406e-06, |
|
"loss": 1.691, |
|
"mean_token_accuracy": 0.6875800609588623, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04076086956521739, |
|
"grad_norm": 13.125, |
|
"learning_rate": 8.108108108108109e-06, |
|
"loss": 1.549, |
|
"mean_token_accuracy": 0.7068500399589539, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.05434782608695652, |
|
"grad_norm": 11.375, |
|
"learning_rate": 1.0810810810810812e-05, |
|
"loss": 1.3324, |
|
"mean_token_accuracy": 0.7429514169692993, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06793478260869565, |
|
"grad_norm": 14.875, |
|
"learning_rate": 1.3513513513513515e-05, |
|
"loss": 0.9671, |
|
"mean_token_accuracy": 0.8036805391311646, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.08152173913043478, |
|
"grad_norm": 13.5, |
|
"learning_rate": 1.6216216216216218e-05, |
|
"loss": 0.9606, |
|
"mean_token_accuracy": 0.8269321084022522, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09510869565217392, |
|
"grad_norm": 13.625, |
|
"learning_rate": 1.891891891891892e-05, |
|
"loss": 0.6205, |
|
"mean_token_accuracy": 0.8943289399147034, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.10869565217391304, |
|
"grad_norm": 11.4375, |
|
"learning_rate": 1.9995946530314384e-05, |
|
"loss": 0.3571, |
|
"mean_token_accuracy": 0.934650433063507, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.12228260869565218, |
|
"grad_norm": 8.75, |
|
"learning_rate": 1.9971187226043746e-05, |
|
"loss": 0.4531, |
|
"mean_token_accuracy": 0.9206116795539856, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1358695652173913, |
|
"grad_norm": 8.875, |
|
"learning_rate": 1.9923976226947417e-05, |
|
"loss": 0.4687, |
|
"mean_token_accuracy": 0.916878092288971, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.14945652173913043, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.985441983600819e-05, |
|
"loss": 0.3238, |
|
"mean_token_accuracy": 0.9440895915031433, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.16304347826086957, |
|
"grad_norm": 16.375, |
|
"learning_rate": 1.9762674670369757e-05, |
|
"loss": 0.2045, |
|
"mean_token_accuracy": 0.9566964030265808, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1766304347826087, |
|
"grad_norm": 9.25, |
|
"learning_rate": 1.9648947308688594e-05, |
|
"loss": 0.2272, |
|
"mean_token_accuracy": 0.9550906538963317, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.19021739130434784, |
|
"grad_norm": 13.75, |
|
"learning_rate": 1.9513493825989664e-05, |
|
"loss": 0.4506, |
|
"mean_token_accuracy": 0.9193356156349182, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.20380434782608695, |
|
"grad_norm": 12.1875, |
|
"learning_rate": 1.9356619217073252e-05, |
|
"loss": 0.4804, |
|
"mean_token_accuracy": 0.9193973302841186, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.21739130434782608, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 1.917867670977126e-05, |
|
"loss": 0.4535, |
|
"mean_token_accuracy": 0.9251804232597352, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.23097826086956522, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 1.8980066969599216e-05, |
|
"loss": 0.5313, |
|
"mean_token_accuracy": 0.9139753341674804, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.24456521739130435, |
|
"grad_norm": 4.75, |
|
"learning_rate": 1.8761237197594945e-05, |
|
"loss": 0.2025, |
|
"mean_token_accuracy": 0.9598217248916626, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.25815217391304346, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.852268012337514e-05, |
|
"loss": 0.4308, |
|
"mean_token_accuracy": 0.9284366369247437, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2717391304347826, |
|
"grad_norm": 9.25, |
|
"learning_rate": 1.8264932895677195e-05, |
|
"loss": 0.4234, |
|
"mean_token_accuracy": 0.930481493473053, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.28532608695652173, |
|
"grad_norm": 7.5, |
|
"learning_rate": 1.798857587288445e-05, |
|
"loss": 0.3343, |
|
"mean_token_accuracy": 0.9406642079353332, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.29891304347826086, |
|
"grad_norm": 12.5, |
|
"learning_rate": 1.769423131625808e-05, |
|
"loss": 0.198, |
|
"mean_token_accuracy": 0.9616581201553345, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 6.875, |
|
"learning_rate": 1.738256198881809e-05, |
|
"loss": 0.2704, |
|
"mean_token_accuracy": 0.9502018094062805, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.32608695652173914, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.7054269663028232e-05, |
|
"loss": 0.208, |
|
"mean_token_accuracy": 0.9611244797706604, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.33967391304347827, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 1.6710093540645056e-05, |
|
"loss": 0.364, |
|
"mean_token_accuracy": 0.9366228461265564, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3532608695652174, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 1.6350808588288964e-05, |
|
"loss": 0.1251, |
|
"mean_token_accuracy": 0.9757622718811035, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.36684782608695654, |
|
"grad_norm": 7.1875, |
|
"learning_rate": 1.597722379248512e-05, |
|
"loss": 0.2729, |
|
"mean_token_accuracy": 0.9491044163703919, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.3804347826086957, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 1.559018033810316e-05, |
|
"loss": 0.3411, |
|
"mean_token_accuracy": 0.9376911044120788, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.39402173913043476, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.5190549714297303e-05, |
|
"loss": 0.2513, |
|
"mean_token_accuracy": 0.9505762934684754, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.4076086956521739, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.4779231752211546e-05, |
|
"loss": 0.1996, |
|
"mean_token_accuracy": 0.9632942318916321, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.421195652173913, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 1.4357152598868478e-05, |
|
"loss": 0.2669, |
|
"mean_token_accuracy": 0.9502842426300049, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.43478260869565216, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.3925262631803722e-05, |
|
"loss": 0.1767, |
|
"mean_token_accuracy": 0.962754237651825, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4483695652173913, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.3484534319141592e-05, |
|
"loss": 0.1936, |
|
"mean_token_accuracy": 0.9623476982116699, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.46195652173913043, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 1.303596002993028e-05, |
|
"loss": 0.2875, |
|
"mean_token_accuracy": 0.948505699634552, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.47554347826086957, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.2580549799667034e-05, |
|
"loss": 0.1843, |
|
"mean_token_accuracy": 0.9624204635620117, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.4891304347826087, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.2119329056044533e-05, |
|
"loss": 0.1725, |
|
"mean_token_accuracy": 0.9649592399597168, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5027173913043478, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 1.165333631003928e-05, |
|
"loss": 0.3102, |
|
"mean_token_accuracy": 0.9432898998260498, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5163043478260869, |
|
"grad_norm": 11.6875, |
|
"learning_rate": 1.1183620817540985e-05, |
|
"loss": 0.2023, |
|
"mean_token_accuracy": 0.9577706575393676, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.529891304347826, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 1.0711240216788036e-05, |
|
"loss": 0.3165, |
|
"mean_token_accuracy": 0.9471179962158203, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5434782608695652, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 1.0237258146928849e-05, |
|
"loss": 0.1964, |
|
"mean_token_accuracy": 0.9606888651847839, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5570652173913043, |
|
"grad_norm": 5.875, |
|
"learning_rate": 9.762741853071153e-06, |
|
"loss": 0.2211, |
|
"mean_token_accuracy": 0.9585472226142884, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5706521739130435, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 9.288759783211967e-06, |
|
"loss": 0.2484, |
|
"mean_token_accuracy": 0.9550203323364258, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5842391304347826, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 8.81637918245902e-06, |
|
"loss": 0.3283, |
|
"mean_token_accuracy": 0.9439280152320861, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5978260869565217, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 8.346663689960724e-06, |
|
"loss": 0.2486, |
|
"mean_token_accuracy": 0.953123664855957, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6114130434782609, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 7.880670943955467e-06, |
|
"loss": 0.2411, |
|
"mean_token_accuracy": 0.9570030689239502, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 7.419450200332965e-06, |
|
"loss": 0.2028, |
|
"mean_token_accuracy": 0.9596728563308716, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6385869565217391, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 6.964039970069722e-06, |
|
"loss": 0.2145, |
|
"mean_token_accuracy": 0.9589278101921082, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6521739130434783, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 6.515465680858412e-06, |
|
"loss": 0.2646, |
|
"mean_token_accuracy": 0.9482977151870727, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6657608695652174, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 6.074737368196279e-06, |
|
"loss": 0.2332, |
|
"mean_token_accuracy": 0.9552296161651611, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6793478260869565, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 5.642847401131526e-06, |
|
"loss": 0.2629, |
|
"mean_token_accuracy": 0.953015148639679, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6929347826086957, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 5.220768247788458e-06, |
|
"loss": 0.2226, |
|
"mean_token_accuracy": 0.9579466819763184, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.7065217391304348, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.809450285702697e-06, |
|
"loss": 0.1219, |
|
"mean_token_accuracy": 0.9744248032569885, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.720108695652174, |
|
"grad_norm": 4.5, |
|
"learning_rate": 4.409819661896839e-06, |
|
"loss": 0.1909, |
|
"mean_token_accuracy": 0.9654230594635009, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.7336956521739131, |
|
"grad_norm": 8.375, |
|
"learning_rate": 4.022776207514885e-06, |
|
"loss": 0.183, |
|
"mean_token_accuracy": 0.9614866137504577, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7472826086956522, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 3.6491914117110405e-06, |
|
"loss": 0.2841, |
|
"mean_token_accuracy": 0.9495038151741028, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.7608695652173914, |
|
"grad_norm": 3.5, |
|
"learning_rate": 3.2899064593549477e-06, |
|
"loss": 0.147, |
|
"mean_token_accuracy": 0.9706088781356812, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7744565217391305, |
|
"grad_norm": 9.375, |
|
"learning_rate": 2.945730336971767e-06, |
|
"loss": 0.2865, |
|
"mean_token_accuracy": 0.9473696708679199, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.7880434782608695, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 2.6174380111819144e-06, |
|
"loss": 0.1647, |
|
"mean_token_accuracy": 0.9646413207054139, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8016304347826086, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 2.3057686837419246e-06, |
|
"loss": 0.127, |
|
"mean_token_accuracy": 0.9736526370048523, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.8152173913043478, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 2.011424127115552e-06, |
|
"loss": 0.1714, |
|
"mean_token_accuracy": 0.963629424571991, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8288043478260869, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.7350671043228072e-06, |
|
"loss": 0.2071, |
|
"mean_token_accuracy": 0.9631360173225403, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.842391304347826, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.4773198766248642e-06, |
|
"loss": 0.1566, |
|
"mean_token_accuracy": 0.971684205532074, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8559782608695652, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.2387628024050557e-06, |
|
"loss": 0.2517, |
|
"mean_token_accuracy": 0.9517509698867798, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 6.5625, |
|
"learning_rate": 1.0199330304007858e-06, |
|
"loss": 0.247, |
|
"mean_token_accuracy": 0.9540893912315369, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8831521739130435, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 8.213232902287438e-07, |
|
"loss": 0.1369, |
|
"mean_token_accuracy": 0.9701100349426269, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8967391304347826, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 6.433807829267491e-07, |
|
"loss": 0.1382, |
|
"mean_token_accuracy": 0.9745123624801636, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.9103260869565217, |
|
"grad_norm": 5.125, |
|
"learning_rate": 4.865061740103361e-07, |
|
"loss": 0.1746, |
|
"mean_token_accuracy": 0.9617828845977783, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.9239130434782609, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 3.510526913114065e-07, |
|
"loss": 0.2696, |
|
"mean_token_accuracy": 0.9500232100486755, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 3.5, |
|
"learning_rate": 2.3732532963024468e-07, |
|
"loss": 0.2069, |
|
"mean_token_accuracy": 0.9611208081245423, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.9510869565217391, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 1.4558016399181086e-07, |
|
"loss": 0.1823, |
|
"mean_token_accuracy": 0.965398907661438, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9646739130434783, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 7.602377305258479e-08, |
|
"loss": 0.1987, |
|
"mean_token_accuracy": 0.9622387170791626, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.9782608695652174, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 2.8812773956256034e-08, |
|
"loss": 0.1906, |
|
"mean_token_accuracy": 0.9647317886352539, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9918478260869565, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 4.053469685617595e-09, |
|
"loss": 0.2464, |
|
"mean_token_accuracy": 0.9541746735572815, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"mean_token_accuracy": 0.9636613726615906, |
|
"step": 368, |
|
"total_flos": 2.6349158744595456e+16, |
|
"train_loss": 0.35062233002289483, |
|
"train_runtime": 5984.3116, |
|
"train_samples_per_second": 0.061, |
|
"train_steps_per_second": 0.061 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 368, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.6349158744595456e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|