|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 0, |
|
"global_step": 20000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.025, |
|
"grad_norm": 1.52798593044281, |
|
"learning_rate": 9.756378189094549e-06, |
|
"loss": 0.1999, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.1572341918945312, |
|
"learning_rate": 9.506253126563282e-06, |
|
"loss": 0.0279, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"grad_norm": 1.0278618335723877, |
|
"learning_rate": 9.256128064032017e-06, |
|
"loss": 0.0234, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.9378705024719238, |
|
"learning_rate": 9.006003001500752e-06, |
|
"loss": 0.0203, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 1.0253329277038574, |
|
"learning_rate": 8.755877938969486e-06, |
|
"loss": 0.0179, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.588853359222412, |
|
"learning_rate": 8.50575287643822e-06, |
|
"loss": 0.0171, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"grad_norm": 2.365065336227417, |
|
"learning_rate": 8.255627813906954e-06, |
|
"loss": 0.0153, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.9309738874435425, |
|
"learning_rate": 8.005502751375689e-06, |
|
"loss": 0.015, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.225, |
|
"grad_norm": 0.4729306697845459, |
|
"learning_rate": 7.755377688844424e-06, |
|
"loss": 0.0143, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.5990219116210938, |
|
"learning_rate": 7.505252626313157e-06, |
|
"loss": 0.014, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.275, |
|
"grad_norm": 0.32973700761795044, |
|
"learning_rate": 7.255627813906953e-06, |
|
"loss": 0.0128, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7276681661605835, |
|
"learning_rate": 7.0055027513756875e-06, |
|
"loss": 0.013, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.325, |
|
"grad_norm": 1.137271761894226, |
|
"learning_rate": 6.7553776888444225e-06, |
|
"loss": 0.0129, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.2594708800315857, |
|
"learning_rate": 6.505252626313157e-06, |
|
"loss": 0.0124, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.2336331158876419, |
|
"learning_rate": 6.255127563781892e-06, |
|
"loss": 0.012, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.19649972021579742, |
|
"learning_rate": 6.005002501250627e-06, |
|
"loss": 0.0121, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.425, |
|
"grad_norm": 0.5051653981208801, |
|
"learning_rate": 5.754877438719361e-06, |
|
"loss": 0.0115, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.3455007076263428, |
|
"learning_rate": 5.504752376188095e-06, |
|
"loss": 0.0113, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.475, |
|
"grad_norm": 0.4421294331550598, |
|
"learning_rate": 5.254627313656829e-06, |
|
"loss": 0.0106, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.2041834592819214, |
|
"learning_rate": 5.004502251125563e-06, |
|
"loss": 0.0107, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.525, |
|
"grad_norm": 1.119016408920288, |
|
"learning_rate": 4.754377188594297e-06, |
|
"loss": 0.011, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.9011972546577454, |
|
"learning_rate": 4.5042521260630315e-06, |
|
"loss": 0.0108, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.575, |
|
"grad_norm": 0.4205041229724884, |
|
"learning_rate": 4.2546273136568285e-06, |
|
"loss": 0.0103, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.22211983799934387, |
|
"learning_rate": 4.0045022511255635e-06, |
|
"loss": 0.0097, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.26337993144989014, |
|
"learning_rate": 3.7543771885942976e-06, |
|
"loss": 0.01, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.526462197303772, |
|
"learning_rate": 3.5042521260630318e-06, |
|
"loss": 0.0104, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.675, |
|
"grad_norm": 0.12982240319252014, |
|
"learning_rate": 3.2541270635317664e-06, |
|
"loss": 0.0096, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.8299207091331482, |
|
"learning_rate": 3.0040020010005005e-06, |
|
"loss": 0.0096, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.725, |
|
"grad_norm": 0.18750137090682983, |
|
"learning_rate": 2.753876938469235e-06, |
|
"loss": 0.0097, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.37416380643844604, |
|
"learning_rate": 2.5037518759379692e-06, |
|
"loss": 0.0097, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.775, |
|
"grad_norm": 0.47138354182243347, |
|
"learning_rate": 2.2536268134067034e-06, |
|
"loss": 0.0089, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.859859824180603, |
|
"learning_rate": 2.003501750875438e-06, |
|
"loss": 0.0089, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.825, |
|
"grad_norm": null,
|
"learning_rate": 1.7538769384692347e-06, |
|
"loss": 0.0091, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.4002706706523895, |
|
"learning_rate": 1.503751875937969e-06, |
|
"loss": 0.0085, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.29968053102493286, |
|
"learning_rate": 1.2536268134067034e-06, |
|
"loss": 0.0084, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.422002911567688, |
|
"learning_rate": 1.0035017508754378e-06, |
|
"loss": 0.0089, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.925, |
|
"grad_norm": 0.272173672914505, |
|
"learning_rate": 7.533766883441721e-07, |
|
"loss": 0.0087, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.2766880691051483, |
|
"learning_rate": 5.03751875937969e-07, |
|
"loss": 0.0087, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.975, |
|
"grad_norm": 0.17756783962249756, |
|
"learning_rate": 2.5362681340670335e-07, |
|
"loss": 0.0088, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5026212334632874, |
|
"learning_rate": 3.501750875437719e-09, |
|
"loss": 0.0089, |
|
"step": 20000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 20000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 5000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 128, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|