{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 349,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014352350197344816,
      "grad_norm": 11.071812629699707,
      "learning_rate": 3.6363636363636366e-06,
      "loss": 0.7414,
      "step": 5
    },
    {
      "epoch": 0.02870470039468963,
      "grad_norm": 7.1317901611328125,
      "learning_rate": 8.181818181818183e-06,
      "loss": 0.7038,
      "step": 10
    },
    {
      "epoch": 0.04305705059203445,
      "grad_norm": 4.095168113708496,
      "learning_rate": 9.998056338091415e-06,
      "loss": 0.7009,
      "step": 15
    },
    {
      "epoch": 0.05740940078937926,
      "grad_norm": 2.938896894454956,
      "learning_rate": 9.986183876164412e-06,
      "loss": 0.6751,
      "step": 20
    },
    {
      "epoch": 0.07176175098672408,
      "grad_norm": 3.5328586101531982,
      "learning_rate": 9.96354437049027e-06,
      "loss": 0.6547,
      "step": 25
    },
    {
      "epoch": 0.0861141011840689,
      "grad_norm": 2.831934690475464,
      "learning_rate": 9.930186708264902e-06,
      "loss": 0.6616,
      "step": 30
    },
    {
      "epoch": 0.1004664513814137,
      "grad_norm": 2.707149028778076,
      "learning_rate": 9.88618292120984e-06,
      "loss": 0.6828,
      "step": 35
    },
    {
      "epoch": 0.11481880157875853,
      "grad_norm": 2.4383013248443604,
      "learning_rate": 9.831628030028698e-06,
      "loss": 0.6486,
      "step": 40
    },
    {
      "epoch": 0.12917115177610333,
      "grad_norm": 2.7486937046051025,
      "learning_rate": 9.76663983922178e-06,
      "loss": 0.6592,
      "step": 45
    },
    {
      "epoch": 0.14352350197344815,
      "grad_norm": 2.4958226680755615,
      "learning_rate": 9.691358682701927e-06,
      "loss": 0.6616,
      "step": 50
    },
    {
      "epoch": 0.15787585217079297,
      "grad_norm": 2.9137589931488037,
      "learning_rate": 9.605947120760878e-06,
      "loss": 0.6479,
      "step": 55
    },
    {
      "epoch": 0.1722282023681378,
      "grad_norm": 2.6844582557678223,
      "learning_rate": 9.510589589040554e-06,
      "loss": 0.6677,
      "step": 60
    },
    {
      "epoch": 0.1865805525654826,
      "grad_norm": 2.127028703689575,
      "learning_rate": 9.405492000267228e-06,
      "loss": 0.6507,
      "step": 65
    },
    {
      "epoch": 0.2009329027628274,
      "grad_norm": 2.487703800201416,
      "learning_rate": 9.29088129960862e-06,
      "loss": 0.6615,
      "step": 70
    },
    {
      "epoch": 0.21528525296017223,
      "grad_norm": 2.5370140075683594,
      "learning_rate": 9.16700497461403e-06,
      "loss": 0.6522,
      "step": 75
    },
    {
      "epoch": 0.22963760315751705,
      "grad_norm": 2.203188896179199,
      "learning_rate": 9.034130520795774e-06,
      "loss": 0.6521,
      "step": 80
    },
    {
      "epoch": 0.24398995335486187,
      "grad_norm": 2.402724504470825,
      "learning_rate": 8.892544864005899e-06,
      "loss": 0.6496,
      "step": 85
    },
    {
      "epoch": 0.25834230355220666,
      "grad_norm": 2.3624093532562256,
      "learning_rate": 8.742553740855507e-06,
      "loss": 0.6378,
      "step": 90
    },
    {
      "epoch": 0.2726946537495515,
      "grad_norm": 2.36360239982605,
      "learning_rate": 8.584481038514573e-06,
      "loss": 0.6422,
      "step": 95
    },
    {
      "epoch": 0.2870470039468963,
      "grad_norm": 2.0742623805999756,
      "learning_rate": 8.418668095317912e-06,
      "loss": 0.6441,
      "step": 100
    },
    {
      "epoch": 0.3013993541442411,
      "grad_norm": 2.300394058227539,
      "learning_rate": 8.245472963687484e-06,
      "loss": 0.6333,
      "step": 105
    },
    {
      "epoch": 0.31575170434158595,
      "grad_norm": 2.1633546352386475,
      "learning_rate": 8.065269636962765e-06,
      "loss": 0.6393,
      "step": 110
    },
    {
      "epoch": 0.33010405453893077,
      "grad_norm": 2.291600227355957,
      "learning_rate": 7.878447241808634e-06,
      "loss": 0.647,
      "step": 115
    },
    {
      "epoch": 0.3444564047362756,
      "grad_norm": 2.0962560176849365,
      "learning_rate": 7.685409197944768e-06,
      "loss": 0.6226,
      "step": 120
    },
    {
      "epoch": 0.35880875493362036,
      "grad_norm": 2.4263839721679688,
      "learning_rate": 7.486572347010937e-06,
      "loss": 0.6442,
      "step": 125
    },
    {
      "epoch": 0.3731611051309652,
      "grad_norm": 2.1769137382507324,
      "learning_rate": 7.282366052449351e-06,
      "loss": 0.6434,
      "step": 130
    },
    {
      "epoch": 0.38751345532831,
      "grad_norm": 2.181265115737915,
      "learning_rate": 7.073231272347714e-06,
      "loss": 0.628,
      "step": 135
    },
    {
      "epoch": 0.4018658055256548,
      "grad_norm": 2.1604084968566895,
      "learning_rate": 6.859619607245102e-06,
      "loss": 0.6349,
      "step": 140
    },
    {
      "epoch": 0.41621815572299964,
      "grad_norm": 2.0021893978118896,
      "learning_rate": 6.641992324956776e-06,
      "loss": 0.6159,
      "step": 145
    },
    {
      "epoch": 0.43057050592034446,
      "grad_norm": 2.3016304969787598,
      "learning_rate": 6.4208193645237314e-06,
      "loss": 0.6397,
      "step": 150
    },
    {
      "epoch": 0.4449228561176893,
      "grad_norm": 2.1002519130706787,
      "learning_rate": 6.1965783214377895e-06,
      "loss": 0.6268,
      "step": 155
    },
    {
      "epoch": 0.4592752063150341,
      "grad_norm": 4.018091201782227,
      "learning_rate": 5.9697534163335645e-06,
      "loss": 0.6302,
      "step": 160
    },
    {
      "epoch": 0.4736275565123789,
      "grad_norm": 1.9025262594223022,
      "learning_rate": 5.740834449374237e-06,
      "loss": 0.6163,
      "step": 165
    },
    {
      "epoch": 0.48797990670972374,
      "grad_norm": 1.9070066213607788,
      "learning_rate": 5.510315742589042e-06,
      "loss": 0.6243,
      "step": 170
    },
    {
      "epoch": 0.5023322569070685,
      "grad_norm": 2.002209186553955,
      "learning_rate": 5.278695072446342e-06,
      "loss": 0.6245,
      "step": 175
    },
    {
      "epoch": 0.5166846071044133,
      "grad_norm": 1.9884896278381348,
      "learning_rate": 5.046472594967279e-06,
      "loss": 0.607,
      "step": 180
    },
    {
      "epoch": 0.5310369573017582,
      "grad_norm": 2.062976837158203,
      "learning_rate": 4.814149765701059e-06,
      "loss": 0.609,
      "step": 185
    },
    {
      "epoch": 0.545389307499103,
      "grad_norm": 1.9803415536880493,
      "learning_rate": 4.582228256894093e-06,
      "loss": 0.6119,
      "step": 190
    },
    {
      "epoch": 0.5597416576964478,
      "grad_norm": 1.8966432809829712,
      "learning_rate": 4.351208874191192e-06,
      "loss": 0.6291,
      "step": 195
    },
    {
      "epoch": 0.5740940078937926,
      "grad_norm": 2.038236618041992,
      "learning_rate": 4.121590475208071e-06,
      "loss": 0.6255,
      "step": 200
    },
    {
      "epoch": 0.5884463580911374,
      "grad_norm": 2.072604179382324,
      "learning_rate": 3.8938688923104015e-06,
      "loss": 0.6269,
      "step": 205
    },
    {
      "epoch": 0.6027987082884823,
      "grad_norm": 2.125389814376831,
      "learning_rate": 3.668535861925509e-06,
      "loss": 0.6278,
      "step": 210
    },
    {
      "epoch": 0.6171510584858271,
      "grad_norm": 2.0525448322296143,
      "learning_rate": 3.4460779626987186e-06,
      "loss": 0.6,
      "step": 215
    },
    {
      "epoch": 0.6315034086831719,
      "grad_norm": 1.934063196182251,
      "learning_rate": 3.226975564787322e-06,
      "loss": 0.5925,
      "step": 220
    },
    {
      "epoch": 0.6458557588805167,
      "grad_norm": 1.8446972370147705,
      "learning_rate": 3.0117017925609802e-06,
      "loss": 0.6035,
      "step": 225
    },
    {
      "epoch": 0.6602081090778615,
      "grad_norm": 2.0047950744628906,
      "learning_rate": 2.800721502948506e-06,
      "loss": 0.6022,
      "step": 230
    },
    {
      "epoch": 0.6745604592752064,
      "grad_norm": 1.9434928894042969,
      "learning_rate": 2.5944902816371573e-06,
      "loss": 0.5966,
      "step": 235
    },
    {
      "epoch": 0.6889128094725512,
      "grad_norm": 1.9444347620010376,
      "learning_rate": 2.3934534592920416e-06,
      "loss": 0.5839,
      "step": 240
    },
    {
      "epoch": 0.703265159669896,
      "grad_norm": 1.9846431016921997,
      "learning_rate": 2.1980451499199262e-06,
      "loss": 0.6063,
      "step": 245
    },
    {
      "epoch": 0.7176175098672407,
      "grad_norm": 1.8537969589233398,
      "learning_rate": 2.0086873134540626e-06,
      "loss": 0.6068,
      "step": 250
    },
    {
      "epoch": 0.7319698600645855,
      "grad_norm": 1.9193341732025146,
      "learning_rate": 1.8257888445842026e-06,
      "loss": 0.5948,
      "step": 255
    },
    {
      "epoch": 0.7463222102619304,
      "grad_norm": 1.9606044292449951,
      "learning_rate": 1.6497446897993885e-06,
      "loss": 0.5932,
      "step": 260
    },
    {
      "epoch": 0.7606745604592752,
      "grad_norm": 1.9930768013000488,
      "learning_rate": 1.4809349945501422e-06,
      "loss": 0.5998,
      "step": 265
    },
    {
      "epoch": 0.77502691065662,
      "grad_norm": 1.9653593301773071,
      "learning_rate": 1.319724282371664e-06,
      "loss": 0.6135,
      "step": 270
    },
    {
      "epoch": 0.7893792608539648,
      "grad_norm": 1.7707090377807617,
      "learning_rate": 1.1664606677406025e-06,
      "loss": 0.6069,
      "step": 275
    },
    {
      "epoch": 0.8037316110513096,
      "grad_norm": 1.9296361207962036,
      "learning_rate": 1.0214751043651582e-06,
      "loss": 0.5974,
      "step": 280
    },
    {
      "epoch": 0.8180839612486545,
      "grad_norm": 1.89194917678833,
      "learning_rate": 8.850806705317183e-07,
      "loss": 0.5973,
      "step": 285
    },
    {
      "epoch": 0.8324363114459993,
      "grad_norm": 1.991350769996643,
      "learning_rate": 7.575718930512516e-07,
      "loss": 0.5867,
      "step": 290
    },
    {
      "epoch": 0.8467886616433441,
      "grad_norm": 1.789543867111206,
      "learning_rate": 6.392241112653031e-07,
      "loss": 0.61,
      "step": 295
    },
    {
      "epoch": 0.8611410118406889,
      "grad_norm": 1.8926385641098022,
      "learning_rate": 5.302928824849335e-07,
      "loss": 0.6084,
      "step": 300
    },
    {
      "epoch": 0.8754933620380337,
      "grad_norm": 1.8285890817642212,
      "learning_rate": 4.3101343014651356e-07,
      "loss": 0.599,
      "step": 305
    },
    {
      "epoch": 0.8898457122353786,
      "grad_norm": 1.9200646877288818,
      "learning_rate": 3.416001358759635e-07,
      "loss": 0.5911,
      "step": 310
    },
    {
      "epoch": 0.9041980624327234,
      "grad_norm": 1.8193855285644531,
      "learning_rate": 2.6224607655831236e-07,
      "loss": 0.5886,
      "step": 315
    },
    {
      "epoch": 0.9185504126300682,
      "grad_norm": 1.7819342613220215,
      "learning_rate": 1.9312260741218114e-07,
      "loss": 0.5917,
      "step": 320
    },
    {
      "epoch": 0.932902762827413,
      "grad_norm": 1.8590822219848633,
      "learning_rate": 1.3437899196950765e-07,
      "loss": 0.5799,
      "step": 325
    },
    {
      "epoch": 0.9472551130247578,
      "grad_norm": 1.7508701086044312,
      "learning_rate": 8.614207975952083e-08,
      "loss": 0.6015,
      "step": 330
    },
    {
      "epoch": 0.9616074632221027,
      "grad_norm": 1.8058841228485107,
      "learning_rate": 4.851603239296065e-08,
      "loss": 0.5848,
      "step": 335
    },
    {
      "epoch": 0.9759598134194475,
      "grad_norm": 1.788930892944336,
      "learning_rate": 2.158209863804217e-08,
      "loss": 0.5879,
      "step": 340
    },
    {
      "epoch": 0.9903121636167922,
      "grad_norm": 1.9363088607788086,
      "learning_rate": 5.398438973845954e-09,
      "loss": 0.5836,
      "step": 345
    }
  ],
  "logging_steps": 5,
  "max_steps": 349,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.871785990877872e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}