{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 320,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03125,
      "grad_norm": 2.2531771659851074,
      "learning_rate": 6.25e-06,
      "loss": 2.2918,
      "step": 1
    },
    {
      "epoch": 0.15625,
      "grad_norm": 2.1257269382476807,
      "learning_rate": 3.125e-05,
      "loss": 2.284,
      "step": 5
    },
    {
      "epoch": 0.3125,
      "grad_norm": 1.5896614789962769,
      "learning_rate": 6.25e-05,
      "loss": 2.2225,
      "step": 10
    },
    {
      "epoch": 0.46875,
      "grad_norm": 0.7286556363105774,
      "learning_rate": 9.375e-05,
      "loss": 2.1021,
      "step": 15
    },
    {
      "epoch": 0.625,
      "grad_norm": 0.6398385763168335,
      "learning_rate": 0.000125,
      "loss": 1.9996,
      "step": 20
    },
    {
      "epoch": 0.78125,
      "grad_norm": 0.4788893461227417,
      "learning_rate": 0.00015625,
      "loss": 1.9178,
      "step": 25
    },
    {
      "epoch": 0.9375,
      "grad_norm": 0.3934371769428253,
      "learning_rate": 0.0001875,
      "loss": 1.845,
      "step": 30
    },
    {
      "epoch": 1.0,
      "eval_loss": 2.684882402420044,
      "eval_runtime": 0.8301,
      "eval_samples_per_second": 7.228,
      "eval_steps_per_second": 1.205,
      "step": 32
    },
    {
      "epoch": 1.09375,
      "grad_norm": 0.28933560848236084,
      "learning_rate": 0.00019994645874763658,
      "loss": 1.786,
      "step": 35
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.20734435319900513,
      "learning_rate": 0.00019961946980917456,
      "loss": 1.7402,
      "step": 40
    },
    {
      "epoch": 1.40625,
      "grad_norm": 0.16501550376415253,
      "learning_rate": 0.00019899620837148077,
      "loss": 1.7072,
      "step": 45
    },
    {
      "epoch": 1.5625,
      "grad_norm": 0.13184033334255219,
      "learning_rate": 0.00019807852804032305,
      "loss": 1.6846,
      "step": 50
    },
    {
      "epoch": 1.71875,
      "grad_norm": 0.11590086668729782,
      "learning_rate": 0.00019686915803565934,
      "loss": 1.665,
      "step": 55
    },
    {
      "epoch": 1.875,
      "grad_norm": 0.11442063748836517,
      "learning_rate": 0.0001953716950748227,
      "loss": 1.6455,
      "step": 60
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.612865686416626,
      "eval_runtime": 0.83,
      "eval_samples_per_second": 7.229,
      "eval_steps_per_second": 1.205,
      "step": 64
    },
    {
      "epoch": 2.03125,
      "grad_norm": 0.09601253271102905,
      "learning_rate": 0.0001935905926757326,
      "loss": 1.6364,
      "step": 65
    },
    {
      "epoch": 2.1875,
      "grad_norm": 0.10327372699975967,
      "learning_rate": 0.00019153114791194473,
      "loss": 1.6227,
      "step": 70
    },
    {
      "epoch": 2.34375,
      "grad_norm": 0.09720147401094437,
      "learning_rate": 0.00018919948565893142,
      "loss": 1.611,
      "step": 75
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.09051357209682465,
      "learning_rate": 0.00018660254037844388,
      "loss": 1.6067,
      "step": 80
    },
    {
      "epoch": 2.65625,
      "grad_norm": 0.10204631090164185,
      "learning_rate": 0.0001837480354951308,
      "loss": 1.6009,
      "step": 85
    },
    {
      "epoch": 2.8125,
      "grad_norm": 0.11083294451236725,
      "learning_rate": 0.00018064446042674828,
      "loss": 1.5825,
      "step": 90
    },
    {
      "epoch": 2.96875,
      "grad_norm": 0.1064315065741539,
      "learning_rate": 0.0001773010453362737,
      "loss": 1.5798,
      "step": 95
    },
    {
      "epoch": 3.0,
      "eval_loss": 2.6013729572296143,
      "eval_runtime": 0.8292,
      "eval_samples_per_second": 7.236,
      "eval_steps_per_second": 1.206,
      "step": 96
    },
    {
      "epoch": 3.125,
      "grad_norm": 0.0996951013803482,
      "learning_rate": 0.0001737277336810124,
      "loss": 1.5716,
      "step": 100
    },
    {
      "epoch": 3.28125,
      "grad_norm": 0.10399119555950165,
      "learning_rate": 0.00016993515264033672,
      "loss": 1.5691,
      "step": 105
    },
    {
      "epoch": 3.4375,
      "grad_norm": 0.0989290326833725,
      "learning_rate": 0.00016593458151000688,
      "loss": 1.5641,
      "step": 110
    },
    {
      "epoch": 3.59375,
      "grad_norm": 0.108305424451828,
      "learning_rate": 0.00016173791815707051,
      "loss": 1.5539,
      "step": 115
    },
    {
      "epoch": 3.75,
      "grad_norm": 0.11103138327598572,
      "learning_rate": 0.0001573576436351046,
      "loss": 1.5511,
      "step": 120
    },
    {
      "epoch": 3.90625,
      "grad_norm": 0.11553769558668137,
      "learning_rate": 0.0001528067850650368,
      "loss": 1.546,
      "step": 125
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.597513437271118,
      "eval_runtime": 0.8293,
      "eval_samples_per_second": 7.235,
      "eval_steps_per_second": 1.206,
      "step": 128
    },
    {
      "epoch": 4.0625,
      "grad_norm": 0.11389793455600739,
      "learning_rate": 0.00014809887689193877,
      "loss": 1.5361,
      "step": 130
    },
    {
      "epoch": 4.21875,
      "grad_norm": 0.1214594691991806,
      "learning_rate": 0.00014324792063301662,
      "loss": 1.5346,
      "step": 135
    },
    {
      "epoch": 4.375,
      "grad_norm": 0.11472882330417633,
      "learning_rate": 0.000138268343236509,
      "loss": 1.5339,
      "step": 140
    },
    {
      "epoch": 4.53125,
      "grad_norm": 0.10563024878501892,
      "learning_rate": 0.00013317495417533524,
      "loss": 1.5314,
      "step": 145
    },
    {
      "epoch": 4.6875,
      "grad_norm": 0.10731685161590576,
      "learning_rate": 0.00012798290140309923,
      "loss": 1.5284,
      "step": 150
    },
    {
      "epoch": 4.84375,
      "grad_norm": 0.10968785732984543,
      "learning_rate": 0.00012270762630343734,
      "loss": 1.5233,
      "step": 155
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.11469796299934387,
      "learning_rate": 0.00011736481776669306,
      "loss": 1.5244,
      "step": 160
    },
    {
      "epoch": 5.0,
      "eval_loss": 2.5962610244750977,
      "eval_runtime": 0.8296,
      "eval_samples_per_second": 7.233,
      "eval_steps_per_second": 1.205,
      "step": 160
    },
    {
      "epoch": 5.15625,
      "grad_norm": 0.10656954348087311,
      "learning_rate": 0.00011197036553049625,
      "loss": 1.5151,
      "step": 165
    },
    {
      "epoch": 5.3125,
      "grad_norm": 0.12216157466173172,
      "learning_rate": 0.00010654031292301432,
      "loss": 1.5119,
      "step": 170
    },
    {
      "epoch": 5.46875,
      "grad_norm": 0.11443573981523514,
      "learning_rate": 0.00010109080914941824,
      "loss": 1.5107,
      "step": 175
    },
    {
      "epoch": 5.625,
      "grad_norm": 0.11616319417953491,
      "learning_rate": 9.563806126346642e-05,
      "loss": 1.5165,
      "step": 180
    },
    {
      "epoch": 5.78125,
      "grad_norm": 0.1112031489610672,
      "learning_rate": 9.019828596704394e-05,
      "loss": 1.509,
      "step": 185
    },
    {
      "epoch": 5.9375,
      "grad_norm": 0.11148490756750107,
      "learning_rate": 8.478766138100834e-05,
      "loss": 1.5072,
      "step": 190
    },
    {
      "epoch": 6.0,
      "eval_loss": 2.594440460205078,
      "eval_runtime": 0.8285,
      "eval_samples_per_second": 7.242,
      "eval_steps_per_second": 1.207,
      "step": 192
    },
    {
      "epoch": 6.09375,
      "grad_norm": 0.11521880328655243,
      "learning_rate": 7.942227893077652e-05,
      "loss": 1.5015,
      "step": 195
    },
    {
      "epoch": 6.25,
      "grad_norm": 0.11160895973443985,
      "learning_rate": 7.411809548974792e-05,
      "loss": 1.4991,
      "step": 200
    },
    {
      "epoch": 6.40625,
      "grad_norm": 0.11868108808994293,
      "learning_rate": 6.889088592289093e-05,
      "loss": 1.5008,
      "step": 205
    },
    {
      "epoch": 6.5625,
      "grad_norm": 0.11169662326574326,
      "learning_rate": 6.375619617162985e-05,
      "loss": 1.5006,
      "step": 210
    },
    {
      "epoch": 6.71875,
      "grad_norm": 0.11667145788669586,
      "learning_rate": 5.872929701956054e-05,
      "loss": 1.4989,
      "step": 215
    },
    {
      "epoch": 6.875,
      "grad_norm": 0.10703334957361221,
      "learning_rate": 5.382513867649663e-05,
      "loss": 1.501,
      "step": 220
    },
    {
      "epoch": 7.0,
      "eval_loss": 2.594101667404175,
      "eval_runtime": 0.8304,
      "eval_samples_per_second": 7.225,
      "eval_steps_per_second": 1.204,
      "step": 224
    },
    {
      "epoch": 7.03125,
      "grad_norm": 0.11598368734121323,
      "learning_rate": 4.9058306315915826e-05,
      "loss": 1.5005,
      "step": 225
    },
    {
      "epoch": 7.1875,
      "grad_norm": 0.11565965414047241,
      "learning_rate": 4.444297669803981e-05,
      "loss": 1.4975,
      "step": 230
    },
    {
      "epoch": 7.34375,
      "grad_norm": 0.10369732230901718,
      "learning_rate": 3.999287600755192e-05,
      "loss": 1.4969,
      "step": 235
    },
    {
      "epoch": 7.5,
      "grad_norm": 0.10466954857110977,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 1.4949,
      "step": 240
    },
    {
      "epoch": 7.65625,
      "grad_norm": 0.09984403848648071,
      "learning_rate": 3.164076979771287e-05,
      "loss": 1.4901,
      "step": 245
    },
    {
      "epoch": 7.8125,
      "grad_norm": 0.10001492500305176,
      "learning_rate": 2.776360379402445e-05,
      "loss": 1.492,
      "step": 250
    },
    {
      "epoch": 7.96875,
      "grad_norm": 0.10096081346273422,
      "learning_rate": 2.4101271875283817e-05,
      "loss": 1.4858,
      "step": 255
    },
    {
      "epoch": 8.0,
      "eval_loss": 2.5943994522094727,
      "eval_runtime": 0.8348,
      "eval_samples_per_second": 7.187,
      "eval_steps_per_second": 1.198,
      "step": 256
    },
    {
      "epoch": 8.125,
      "grad_norm": 0.09807378798723221,
      "learning_rate": 2.0664665970876496e-05,
      "loss": 1.4889,
      "step": 260
    },
    {
      "epoch": 8.28125,
      "grad_norm": 0.1029752716422081,
      "learning_rate": 1.7464006691513623e-05,
      "loss": 1.4893,
      "step": 265
    },
    {
      "epoch": 8.4375,
      "grad_norm": 0.0935431718826294,
      "learning_rate": 1.4508812932705363e-05,
      "loss": 1.4883,
      "step": 270
    },
    {
      "epoch": 8.59375,
      "grad_norm": 0.09739290177822113,
      "learning_rate": 1.1807873565164506e-05,
      "loss": 1.4858,
      "step": 275
    },
    {
      "epoch": 8.75,
      "grad_norm": 0.10472334176301956,
      "learning_rate": 9.369221296335006e-06,
      "loss": 1.4933,
      "step": 280
    },
    {
      "epoch": 8.90625,
      "grad_norm": 0.09686731547117233,
      "learning_rate": 7.200108780781556e-06,
      "loss": 1.4917,
      "step": 285
    },
    {
      "epoch": 9.0,
      "eval_loss": 2.5955541133880615,
      "eval_runtime": 0.8323,
      "eval_samples_per_second": 7.209,
      "eval_steps_per_second": 1.201,
      "step": 288
    },
    {
      "epoch": 9.0625,
      "grad_norm": 0.09799981117248535,
      "learning_rate": 5.306987050489442e-06,
      "loss": 1.4904,
      "step": 290
    },
    {
      "epoch": 9.21875,
      "grad_norm": 0.10055091232061386,
      "learning_rate": 3.6954863292237297e-06,
      "loss": 1.4813,
      "step": 295
    },
    {
      "epoch": 9.375,
      "grad_norm": 0.09550856798887253,
      "learning_rate": 2.3703992880066638e-06,
      "loss": 1.4873,
      "step": 300
    },
    {
      "epoch": 9.53125,
      "grad_norm": 0.09861158579587936,
      "learning_rate": 1.3356667915121025e-06,
      "loss": 1.4911,
      "step": 305
    },
    {
      "epoch": 9.6875,
      "grad_norm": 0.10015583783388138,
      "learning_rate": 5.943661777680354e-07,
      "loss": 1.4933,
      "step": 310
    },
    {
      "epoch": 9.84375,
      "grad_norm": 0.09337816387414932,
      "learning_rate": 1.487021060236904e-07,
      "loss": 1.49,
      "step": 315
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.08882776647806168,
      "learning_rate": 0.0,
      "loss": 1.4886,
      "step": 320
    },
    {
      "epoch": 10.0,
      "eval_loss": 2.595360517501831,
      "eval_runtime": 0.827,
      "eval_samples_per_second": 7.255,
      "eval_steps_per_second": 1.209,
      "step": 320
    },
    {
      "epoch": 10.0,
      "step": 320,
      "total_flos": 9.830314275137126e+17,
      "train_loss": 1.5905956603586673,
      "train_runtime": 1497.0202,
      "train_samples_per_second": 109.25,
      "train_steps_per_second": 0.214
    }
  ],
  "logging_steps": 5,
  "max_steps": 320,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.830314275137126e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}