{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9974424552429667,
  "eval_steps": 500,
  "global_step": 195,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02557544757033248,
      "grad_norm": 142.70468406748688,
      "learning_rate": 5e-06,
      "loss": 3.5042,
      "step": 5
    },
    {
      "epoch": 0.05115089514066496,
      "grad_norm": 39.35096392339812,
      "learning_rate": 1e-05,
      "loss": 1.8402,
      "step": 10
    },
    {
      "epoch": 0.07672634271099744,
      "grad_norm": 2.7883188777304038,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.8528,
      "step": 15
    },
    {
      "epoch": 0.10230179028132992,
      "grad_norm": 1.9363912255406037,
      "learning_rate": 2e-05,
      "loss": 0.5077,
      "step": 20
    },
    {
      "epoch": 0.1278772378516624,
      "grad_norm": 2.479337233395153,
      "learning_rate": 1.9959742939952393e-05,
      "loss": 0.4772,
      "step": 25
    },
    {
      "epoch": 0.1534526854219949,
      "grad_norm": 2.437727400797437,
      "learning_rate": 1.98392958859863e-05,
      "loss": 0.4575,
      "step": 30
    },
    {
      "epoch": 0.17902813299232737,
      "grad_norm": 2.064035897127794,
      "learning_rate": 1.9639628606958535e-05,
      "loss": 0.4183,
      "step": 35
    },
    {
      "epoch": 0.20460358056265984,
      "grad_norm": 2.0573078173417567,
      "learning_rate": 1.9362348706397374e-05,
      "loss": 0.3801,
      "step": 40
    },
    {
      "epoch": 0.23017902813299232,
      "grad_norm": 1.3860822147023304,
      "learning_rate": 1.900968867902419e-05,
      "loss": 0.3071,
      "step": 45
    },
    {
      "epoch": 0.2557544757033248,
      "grad_norm": 1.1223824377196536,
      "learning_rate": 1.8584487936018663e-05,
      "loss": 0.2866,
      "step": 50
    },
    {
      "epoch": 0.2813299232736573,
      "grad_norm": 1.3072552349963746,
      "learning_rate": 1.8090169943749477e-05,
      "loss": 0.3053,
      "step": 55
    },
    {
      "epoch": 0.3069053708439898,
      "grad_norm": 1.1007498150237207,
      "learning_rate": 1.7530714660036112e-05,
      "loss": 0.2824,
      "step": 60
    },
    {
      "epoch": 0.33248081841432225,
      "grad_norm": 1.0572125214610735,
      "learning_rate": 1.691062648986865e-05,
      "loss": 0.2796,
      "step": 65
    },
    {
      "epoch": 0.35805626598465473,
      "grad_norm": 1.0088193429142867,
      "learning_rate": 1.6234898018587336e-05,
      "loss": 0.2592,
      "step": 70
    },
    {
      "epoch": 0.3836317135549872,
      "grad_norm": 0.9536544858527741,
      "learning_rate": 1.5508969814521026e-05,
      "loss": 0.2729,
      "step": 75
    },
    {
      "epoch": 0.4092071611253197,
      "grad_norm": 1.0177774752547195,
      "learning_rate": 1.4738686624729987e-05,
      "loss": 0.2578,
      "step": 80
    },
    {
      "epoch": 0.43478260869565216,
      "grad_norm": 0.9024655077486329,
      "learning_rate": 1.3930250316539237e-05,
      "loss": 0.2537,
      "step": 85
    },
    {
      "epoch": 0.46035805626598464,
      "grad_norm": 0.9603153657835733,
      "learning_rate": 1.3090169943749475e-05,
      "loss": 0.2658,
      "step": 90
    },
    {
      "epoch": 0.4859335038363171,
      "grad_norm": 0.9976253438540076,
      "learning_rate": 1.2225209339563144e-05,
      "loss": 0.2961,
      "step": 95
    },
    {
      "epoch": 0.5115089514066496,
      "grad_norm": 0.8607729610780752,
      "learning_rate": 1.1342332658176556e-05,
      "loss": 0.2087,
      "step": 100
    },
    {
      "epoch": 0.5370843989769821,
      "grad_norm": 0.8346737121409833,
      "learning_rate": 1.044864830350515e-05,
      "loss": 0.2335,
      "step": 105
    },
    {
      "epoch": 0.5626598465473146,
      "grad_norm": 0.9276457383456489,
      "learning_rate": 9.551351696494854e-06,
      "loss": 0.2321,
      "step": 110
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 0.8201591451858892,
      "learning_rate": 8.657667341823449e-06,
      "loss": 0.1986,
      "step": 115
    },
    {
      "epoch": 0.6138107416879796,
      "grad_norm": 0.9324949471856788,
      "learning_rate": 7.774790660436857e-06,
      "loss": 0.2346,
      "step": 120
    },
    {
      "epoch": 0.639386189258312,
      "grad_norm": 0.8224133465787747,
      "learning_rate": 6.909830056250527e-06,
      "loss": 0.2273,
      "step": 125
    },
    {
      "epoch": 0.6649616368286445,
      "grad_norm": 0.7394741748745267,
      "learning_rate": 6.069749683460765e-06,
      "loss": 0.2032,
      "step": 130
    },
    {
      "epoch": 0.690537084398977,
      "grad_norm": 0.8165965679312597,
      "learning_rate": 5.2613133752700145e-06,
      "loss": 0.219,
      "step": 135
    },
    {
      "epoch": 0.7161125319693095,
      "grad_norm": 1.117117835940415,
      "learning_rate": 4.491030185478976e-06,
      "loss": 0.2337,
      "step": 140
    },
    {
      "epoch": 0.7416879795396419,
      "grad_norm": 0.8321965045011145,
      "learning_rate": 3.7651019814126656e-06,
      "loss": 0.2199,
      "step": 145
    },
    {
      "epoch": 0.7672634271099744,
      "grad_norm": 0.9442319309396315,
      "learning_rate": 3.089373510131354e-06,
      "loss": 0.1942,
      "step": 150
    },
    {
      "epoch": 0.7928388746803069,
      "grad_norm": 0.8339668845503085,
      "learning_rate": 2.469285339963892e-06,
      "loss": 0.2021,
      "step": 155
    },
    {
      "epoch": 0.8184143222506394,
      "grad_norm": 0.8898393855240705,
      "learning_rate": 1.9098300562505266e-06,
      "loss": 0.2428,
      "step": 160
    },
    {
      "epoch": 0.8439897698209718,
      "grad_norm": 0.8213143519432727,
      "learning_rate": 1.4155120639813392e-06,
      "loss": 0.1893,
      "step": 165
    },
    {
      "epoch": 0.8695652173913043,
      "grad_norm": 0.9728880507025741,
      "learning_rate": 9.903113209758098e-07,
      "loss": 0.1854,
      "step": 170
    },
    {
      "epoch": 0.8951406649616368,
      "grad_norm": 0.7472973685735516,
      "learning_rate": 6.37651293602628e-07,
      "loss": 0.2287,
      "step": 175
    },
    {
      "epoch": 0.9207161125319693,
      "grad_norm": 0.818513207281356,
      "learning_rate": 3.603713930414676e-07,
      "loss": 0.1873,
      "step": 180
    },
    {
      "epoch": 0.9462915601023018,
      "grad_norm": 0.809589496670413,
      "learning_rate": 1.6070411401370335e-07,
      "loss": 0.1982,
      "step": 185
    },
    {
      "epoch": 0.9718670076726342,
      "grad_norm": 0.759402066262018,
      "learning_rate": 4.025706004760932e-08,
      "loss": 0.2064,
      "step": 190
    },
    {
      "epoch": 0.9974424552429667,
      "grad_norm": 0.8249592638910015,
      "learning_rate": 0.0,
      "loss": 0.1977,
      "step": 195
    },
    {
      "epoch": 0.9974424552429667,
      "step": 195,
      "total_flos": 77502516822016.0,
      "train_loss": 0.4037646987499335,
      "train_runtime": 4195.4221,
      "train_samples_per_second": 1.491,
      "train_steps_per_second": 0.046
    }
  ],
  "logging_steps": 5,
  "max_steps": 195,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 77502516822016.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}