|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9984162179284131, |
|
"eval_steps": 500, |
|
"global_step": 394, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.025340513145391194, |
|
"grad_norm": 7.014422278343604, |
|
"learning_rate": 4.5e-06, |
|
"loss": 1.1276, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05068102629078239, |
|
"grad_norm": 3.950172585471577, |
|
"learning_rate": 9.5e-06, |
|
"loss": 0.9532, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07602153943617358, |
|
"grad_norm": 2.8852145535313203, |
|
"learning_rate": 9.985718470743916e-06, |
|
"loss": 0.9247, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10136205258156478, |
|
"grad_norm": 4.372398694120704, |
|
"learning_rate": 9.936454954953108e-06, |
|
"loss": 0.9131, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.12670256572695598, |
|
"grad_norm": 2.678600773280688, |
|
"learning_rate": 9.852380451890723e-06, |
|
"loss": 0.9038, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.15204307887234716, |
|
"grad_norm": 3.1616554933826584, |
|
"learning_rate": 9.734087839742152e-06, |
|
"loss": 0.9032, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.17738359201773837, |
|
"grad_norm": 2.711129850639537, |
|
"learning_rate": 9.58241129660755e-06, |
|
"loss": 0.8988, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.20272410516312955, |
|
"grad_norm": 2.816882812285983, |
|
"learning_rate": 9.398420418028789e-06, |
|
"loss": 0.9, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.22806461830852076, |
|
"grad_norm": 2.4491670100971437, |
|
"learning_rate": 9.183412674395193e-06, |
|
"loss": 0.889, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.25340513145391197, |
|
"grad_norm": 2.3671858961630132, |
|
"learning_rate": 8.938904261417088e-06, |
|
"loss": 0.8862, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2787456445993031, |
|
"grad_norm": 25.362434554086317, |
|
"learning_rate": 8.666619408187953e-06, |
|
"loss": 0.8874, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.30408615774469433, |
|
"grad_norm": 2.435185075586954, |
|
"learning_rate": 8.368478218232787e-06, |
|
"loss": 0.8809, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.32942667089008554, |
|
"grad_norm": 2.3423241237120522, |
|
"learning_rate": 8.046583129285422e-06, |
|
"loss": 0.8759, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.35476718403547675, |
|
"grad_norm": 2.295983766372591, |
|
"learning_rate": 7.703204087277989e-06, |
|
"loss": 0.8684, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3801076971808679, |
|
"grad_norm": 2.8544179633447255, |
|
"learning_rate": 7.340762539092858e-06, |
|
"loss": 0.8676, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4054482103262591, |
|
"grad_norm": 2.2374422127818963, |
|
"learning_rate": 6.961814356957308e-06, |
|
"loss": 0.8718, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4307887234716503, |
|
"grad_norm": 2.1472864212507976, |
|
"learning_rate": 6.569031814894962e-06, |
|
"loss": 0.8613, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4561292366170415, |
|
"grad_norm": 2.2467067956901183, |
|
"learning_rate": 6.165184744332824e-06, |
|
"loss": 0.8587, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4814697497624327, |
|
"grad_norm": 2.2528338735220905, |
|
"learning_rate": 5.753121001751161e-06, |
|
"loss": 0.8496, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5068102629078239, |
|
"grad_norm": 2.4179129846566636, |
|
"learning_rate": 5.335746386114814e-06, |
|
"loss": 0.8507, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.532150776053215, |
|
"grad_norm": 16.529801921641155, |
|
"learning_rate": 4.9160041477046e-06, |
|
"loss": 0.8465, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5574912891986062, |
|
"grad_norm": 2.1966500837147556, |
|
"learning_rate": 4.4968542328488e-06, |
|
"loss": 0.8543, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5828318023439975, |
|
"grad_norm": 2.2370934180241986, |
|
"learning_rate": 4.081252410917148e-06, |
|
"loss": 0.841, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6081723154893887, |
|
"grad_norm": 2.086887732081978, |
|
"learning_rate": 3.6721294307699786e-06, |
|
"loss": 0.8358, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6335128286347799, |
|
"grad_norm": 2.263852568221778, |
|
"learning_rate": 3.272370353647465e-06, |
|
"loss": 0.8242, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6588533417801711, |
|
"grad_norm": 2.24355503619341, |
|
"learning_rate": 2.8847942082397112e-06, |
|
"loss": 0.83, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6841938549255623, |
|
"grad_norm": 2.2286503006725806, |
|
"learning_rate": 2.512134111406422e-06, |
|
"loss": 0.8256, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7095343680709535, |
|
"grad_norm": 2.1171293835945155, |
|
"learning_rate": 2.1570179947312674e-06, |
|
"loss": 0.8219, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7348748812163446, |
|
"grad_norm": 2.087191484034938, |
|
"learning_rate": 1.8219500728237849e-06, |
|
"loss": 0.8211, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7602153943617358, |
|
"grad_norm": 2.3605396760707866, |
|
"learning_rate": 1.509293184050995e-06, |
|
"loss": 0.8124, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.785555907507127, |
|
"grad_norm": 2.1652486052680384, |
|
"learning_rate": 1.2212521282287093e-06, |
|
"loss": 0.8189, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8108964206525182, |
|
"grad_norm": 2.04980952594426, |
|
"learning_rate": 9.59858118772105e-07, |
|
"loss": 0.8151, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8362369337979094, |
|
"grad_norm": 2.0755635014559606, |
|
"learning_rate": 7.269544589461968e-07, |
|
"loss": 0.8189, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8615774469433006, |
|
"grad_norm": 2.0129993073524095, |
|
"learning_rate": 5.241835432246888e-07, |
|
"loss": 0.8107, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8869179600886918, |
|
"grad_norm": 2.0409156226901315, |
|
"learning_rate": 3.5297527542127675e-07, |
|
"loss": 0.8062, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.912258473234083, |
|
"grad_norm": 2.3050144167741613, |
|
"learning_rate": 2.1453698526664513e-07, |
|
"loss": 0.8155, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9375989863794741, |
|
"grad_norm": 2.0906040167387188, |
|
"learning_rate": 1.0984491453762402e-07, |
|
"loss": 0.8078, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9629394995248653, |
|
"grad_norm": 1.94641588069489, |
|
"learning_rate": 3.963733277679904e-08, |
|
"loss": 0.802, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9882800126702566, |
|
"grad_norm": 2.333717554003802, |
|
"learning_rate": 4.409331149256013e-09, |
|
"loss": 0.8076, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9984162179284131, |
|
"step": 394, |
|
"total_flos": 318229560426496.0, |
|
"train_loss": 0.8606352945269667, |
|
"train_runtime": 15262.586, |
|
"train_samples_per_second": 1.655, |
|
"train_steps_per_second": 0.026 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 394, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 318229560426496.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|