|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.606060606060606, |
|
"eval_steps": 500, |
|
"global_step": 32, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.12121212121212122, |
|
"grad_norm": 2.50050950050354, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.8407, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.24242424242424243, |
|
"grad_norm": 2.2383944988250732, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7878, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": 2.1805531978607178, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.8026, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.48484848484848486, |
|
"grad_norm": 2.0964150428771973, |
|
"learning_rate": 1e-05, |
|
"loss": 0.8133, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.6060606060606061, |
|
"grad_norm": 1.5487000942230225, |
|
"learning_rate": 9.968561049466214e-06, |
|
"loss": 0.7608, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 1.7998461723327637, |
|
"learning_rate": 9.874639560909118e-06, |
|
"loss": 0.7272, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.8484848484848485, |
|
"grad_norm": 1.9523899555206299, |
|
"learning_rate": 9.719416651541839e-06, |
|
"loss": 0.6923, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.9696969696969697, |
|
"grad_norm": 4.303847312927246, |
|
"learning_rate": 9.504844339512096e-06, |
|
"loss": 0.6853, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 4.303847312927246, |
|
"learning_rate": 9.233620996141421e-06, |
|
"loss": 0.6532, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 1.121212121212121, |
|
"grad_norm": 2.6484241485595703, |
|
"learning_rate": 8.90915741234015e-06, |
|
"loss": 0.6334, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 1.2424242424242424, |
|
"grad_norm": 1.0852004289627075, |
|
"learning_rate": 8.535533905932739e-06, |
|
"loss": 0.6171, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 87.41819763183594, |
|
"learning_rate": 8.117449009293668e-06, |
|
"loss": 0.5768, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 1.4848484848484849, |
|
"grad_norm": 1.8918662071228027, |
|
"learning_rate": 7.660160382576683e-06, |
|
"loss": 0.5852, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 1.606060606060606, |
|
"grad_norm": 1.3301656246185303, |
|
"learning_rate": 7.169418695587791e-06, |
|
"loss": 0.5752, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 1.7272727272727273, |
|
"grad_norm": 0.9969548583030701, |
|
"learning_rate": 6.651395309775837e-06, |
|
"loss": 0.5949, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 1.8484848484848486, |
|
"grad_norm": 0.9531848430633545, |
|
"learning_rate": 6.112604669781572e-06, |
|
"loss": 0.5568, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 1.9696969696969697, |
|
"grad_norm": 0.8378441333770752, |
|
"learning_rate": 5.559822380516539e-06, |
|
"loss": 0.5506, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.8378441333770752, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5339, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 2.121212121212121, |
|
"grad_norm": 1.51701021194458, |
|
"learning_rate": 4.4401776194834615e-06, |
|
"loss": 0.5415, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 2.242424242424242, |
|
"grad_norm": 0.8446238040924072, |
|
"learning_rate": 3.887395330218429e-06, |
|
"loss": 0.5077, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 2.3636363636363638, |
|
"grad_norm": 0.6361896395683289, |
|
"learning_rate": 3.3486046902241663e-06, |
|
"loss": 0.5249, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 2.484848484848485, |
|
"grad_norm": 0.5378800630569458, |
|
"learning_rate": 2.83058130441221e-06, |
|
"loss": 0.5052, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 2.606060606060606, |
|
"grad_norm": 0.49560996890068054, |
|
"learning_rate": 2.339839617423318e-06, |
|
"loss": 0.5087, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 0.5335824489593506, |
|
"learning_rate": 1.8825509907063328e-06, |
|
"loss": 0.4993, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 2.8484848484848486, |
|
"grad_norm": 0.4428861141204834, |
|
"learning_rate": 1.4644660940672628e-06, |
|
"loss": 0.4946, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 2.9696969696969697, |
|
"grad_norm": 0.47002682089805603, |
|
"learning_rate": 1.0908425876598512e-06, |
|
"loss": 0.4855, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.47002682089805603, |
|
"learning_rate": 7.663790038585794e-07, |
|
"loss": 0.5004, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 3.121212121212121, |
|
"grad_norm": 0.7940795421600342, |
|
"learning_rate": 4.951556604879049e-07, |
|
"loss": 0.4744, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 3.242424242424242, |
|
"grad_norm": 0.4210236966609955, |
|
"learning_rate": 2.8058334845816214e-07, |
|
"loss": 0.4723, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 3.3636363636363638, |
|
"grad_norm": 0.41588640213012695, |
|
"learning_rate": 1.253604390908819e-07, |
|
"loss": 0.5029, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 3.484848484848485, |
|
"grad_norm": 0.42177480459213257, |
|
"learning_rate": 3.143895053378698e-08, |
|
"loss": 0.4881, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 3.606060606060606, |
|
"grad_norm": 0.43024787306785583, |
|
"learning_rate": 0.0, |
|
"loss": 0.5004, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 3.606060606060606, |
|
"step": 32, |
|
"total_flos": 80439221420032.0, |
|
"train_loss": 0.5935216061770916, |
|
"train_runtime": 4011.9078, |
|
"train_samples_per_second": 0.518, |
|
"train_steps_per_second": 0.008 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 32, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 8, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 80439221420032.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|