|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.654545454545454, |
|
"eval_steps": 500, |
|
"global_step": 270, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03636363636363636, |
|
"grad_norm": 3.691458225250244, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 2.6618, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.18181818181818182, |
|
"grad_norm": 3.446274518966675, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 2.6408, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": 2.032472848892212, |
|
"learning_rate": 7.407407407407407e-05, |
|
"loss": 2.5317, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.5454545454545454, |
|
"grad_norm": 0.9463324546813965, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 2.36, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 0.7080094218254089, |
|
"learning_rate": 0.00014814814814814815, |
|
"loss": 2.2174, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 0.5037975311279297, |
|
"learning_rate": 0.0001851851851851852, |
|
"loss": 2.1182, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 2.782132625579834, |
|
"eval_runtime": 0.8292, |
|
"eval_samples_per_second": 12.06, |
|
"eval_steps_per_second": 1.206, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.0727272727272728, |
|
"grad_norm": 0.41888633370399475, |
|
"learning_rate": 0.00019992479525042303, |
|
"loss": 2.0277, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.2545454545454544, |
|
"grad_norm": 0.27915704250335693, |
|
"learning_rate": 0.00019946562024066014, |
|
"loss": 1.9587, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.4363636363636363, |
|
"grad_norm": 0.20056034624576569, |
|
"learning_rate": 0.00019859096633447965, |
|
"loss": 1.9087, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.6181818181818182, |
|
"grad_norm": 0.16737522184848785, |
|
"learning_rate": 0.00019730448705798239, |
|
"loss": 1.8766, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.15048423409461975, |
|
"learning_rate": 0.00019561155617738797, |
|
"loss": 1.8481, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.981818181818182, |
|
"grad_norm": 0.1224176436662674, |
|
"learning_rate": 0.000193519245252219, |
|
"loss": 1.8354, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.737755537033081, |
|
"eval_runtime": 0.829, |
|
"eval_samples_per_second": 12.063, |
|
"eval_steps_per_second": 1.206, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 2.1454545454545455, |
|
"grad_norm": 0.1324545294046402, |
|
"learning_rate": 0.0001910362940966147, |
|
"loss": 1.8118, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.327272727272727, |
|
"grad_norm": 0.11611360311508179, |
|
"learning_rate": 0.0001881730742721608, |
|
"loss": 1.7937, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.509090909090909, |
|
"grad_norm": 0.1148991584777832, |
|
"learning_rate": 0.00018494154576472976, |
|
"loss": 1.7791, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.690909090909091, |
|
"grad_norm": 0.11438702791929245, |
|
"learning_rate": 0.00018135520702629675, |
|
"loss": 1.7654, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.8727272727272726, |
|
"grad_norm": 0.11716635525226593, |
|
"learning_rate": 0.00017742903859041325, |
|
"loss": 1.7604, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 2.7259373664855957, |
|
"eval_runtime": 0.8303, |
|
"eval_samples_per_second": 12.044, |
|
"eval_steps_per_second": 1.204, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 3.036363636363636, |
|
"grad_norm": 0.1302882581949234, |
|
"learning_rate": 0.00017317944049686124, |
|
"loss": 1.7453, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 3.2181818181818183, |
|
"grad_norm": 0.12489154189825058, |
|
"learning_rate": 0.0001686241637868734, |
|
"loss": 1.7396, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.10804688185453415, |
|
"learning_rate": 0.0001637822363550706, |
|
"loss": 1.7272, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 3.581818181818182, |
|
"grad_norm": 0.1448238343000412, |
|
"learning_rate": 0.0001586738834678418, |
|
"loss": 1.7231, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.7636363636363637, |
|
"grad_norm": 0.12403673678636551, |
|
"learning_rate": 0.00015332044328016914, |
|
"loss": 1.7101, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.9454545454545453, |
|
"grad_norm": 0.11520184576511383, |
|
"learning_rate": 0.0001477442777037949, |
|
"loss": 1.7035, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.724990129470825, |
|
"eval_runtime": 0.8296, |
|
"eval_samples_per_second": 12.053, |
|
"eval_steps_per_second": 1.205, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 4.109090909090909, |
|
"grad_norm": 0.11850611865520477, |
|
"learning_rate": 0.0001419686789990429, |
|
"loss": 1.6998, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 4.290909090909091, |
|
"grad_norm": 0.141310453414917, |
|
"learning_rate": 0.00013601777248047105, |
|
"loss": 1.6942, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 4.472727272727273, |
|
"grad_norm": 0.14388997852802277, |
|
"learning_rate": 0.00012991641574276418, |
|
"loss": 1.6887, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 4.654545454545454, |
|
"grad_norm": 0.11356977373361588, |
|
"learning_rate": 0.00012369009482781192, |
|
"loss": 1.6845, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.836363636363636, |
|
"grad_norm": 0.13505423069000244, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 1.6801, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.18071481585502625, |
|
"learning_rate": 0.00011096700594125318, |
|
"loss": 1.6822, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 2.7262730598449707, |
|
"eval_runtime": 0.8327, |
|
"eval_samples_per_second": 12.009, |
|
"eval_steps_per_second": 1.201, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 5.181818181818182, |
|
"grad_norm": 0.12405228614807129, |
|
"learning_rate": 0.00010452338371907064, |
|
"loss": 1.671, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 5.363636363636363, |
|
"grad_norm": 0.15709254145622253, |
|
"learning_rate": 9.806086682281758e-05, |
|
"loss": 1.6697, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 5.545454545454545, |
|
"grad_norm": 0.1405353993177414, |
|
"learning_rate": 9.160644990030931e-05, |
|
"loss": 1.6707, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 5.7272727272727275, |
|
"grad_norm": 0.13487176597118378, |
|
"learning_rate": 8.518709376487515e-05, |
|
"loss": 1.6619, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.909090909090909, |
|
"grad_norm": 0.12394227087497711, |
|
"learning_rate": 7.882961277705895e-05, |
|
"loss": 1.6619, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.7253997325897217, |
|
"eval_runtime": 0.8321, |
|
"eval_samples_per_second": 12.017, |
|
"eval_steps_per_second": 1.202, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 6.072727272727272, |
|
"grad_norm": 0.11816684156656265, |
|
"learning_rate": 7.256056283806986e-05, |
|
"loss": 1.6573, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 6.254545454545455, |
|
"grad_norm": 0.14117498695850372, |
|
"learning_rate": 6.640613046284581e-05, |
|
"loss": 1.6622, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 6.4363636363636365, |
|
"grad_norm": 0.1342514008283615, |
|
"learning_rate": 6.039202339608432e-05, |
|
"loss": 1.6535, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 6.618181818181818, |
|
"grad_norm": 0.13483189046382904, |
|
"learning_rate": 5.4543363228149946e-05, |
|
"loss": 1.6532, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"grad_norm": 0.1636153757572174, |
|
"learning_rate": 4.888458045941269e-05, |
|
"loss": 1.6482, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 6.9818181818181815, |
|
"grad_norm": 0.1563912183046341, |
|
"learning_rate": 4.343931245134616e-05, |
|
"loss": 1.6471, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 2.7240517139434814, |
|
"eval_runtime": 0.8312, |
|
"eval_samples_per_second": 12.031, |
|
"eval_steps_per_second": 1.203, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 7.1454545454545455, |
|
"grad_norm": 0.11320989578962326, |
|
"learning_rate": 3.8230304690654304e-05, |
|
"loss": 1.6472, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 7.327272727272727, |
|
"grad_norm": 0.111383818089962, |
|
"learning_rate": 3.3279315778858036e-05, |
|
"loss": 1.6488, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 7.509090909090909, |
|
"grad_norm": 0.10844731330871582, |
|
"learning_rate": 2.8607026544210114e-05, |
|
"loss": 1.6458, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 7.690909090909091, |
|
"grad_norm": 0.10823339223861694, |
|
"learning_rate": 2.423295365558821e-05, |
|
"loss": 1.6456, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 7.872727272727273, |
|
"grad_norm": 0.10790830850601196, |
|
"learning_rate": 2.01753680992107e-05, |
|
"loss": 1.6458, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.7252650260925293, |
|
"eval_runtime": 0.8302, |
|
"eval_samples_per_second": 12.045, |
|
"eval_steps_per_second": 1.204, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 8.036363636363637, |
|
"grad_norm": 0.11462420970201492, |
|
"learning_rate": 1.6451218858706374e-05, |
|
"loss": 1.643, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 8.218181818181819, |
|
"grad_norm": 0.10164881497621536, |
|
"learning_rate": 1.307606211733522e-05, |
|
"loss": 1.6435, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"grad_norm": 0.11715802550315857, |
|
"learning_rate": 1.0063996278090704e-05, |
|
"loss": 1.6436, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 8.581818181818182, |
|
"grad_norm": 0.1077931597828865, |
|
"learning_rate": 7.427603073110967e-06, |
|
"loss": 1.6437, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 8.763636363636364, |
|
"grad_norm": 0.09881118685007095, |
|
"learning_rate": 5.177895008392353e-06, |
|
"loss": 1.6415, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 8.945454545454545, |
|
"grad_norm": 0.0973580852150917, |
|
"learning_rate": 3.3242693633337983e-06, |
|
"loss": 1.641, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 2.725569009780884, |
|
"eval_runtime": 0.8306, |
|
"eval_samples_per_second": 12.039, |
|
"eval_steps_per_second": 1.204, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 9.10909090909091, |
|
"grad_norm": 0.10264136642217636, |
|
"learning_rate": 1.874468937261531e-06, |
|
"loss": 1.6464, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 9.290909090909091, |
|
"grad_norm": 0.1021399274468422, |
|
"learning_rate": 8.345497068998897e-07, |
|
"loss": 1.6443, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 9.472727272727273, |
|
"grad_norm": 0.10423731058835983, |
|
"learning_rate": 2.088555298867978e-07, |
|
"loss": 1.6436, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 9.654545454545454, |
|
"grad_norm": 0.09860274940729141, |
|
"learning_rate": 0.0, |
|
"loss": 1.6383, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 9.654545454545454, |
|
"eval_loss": 2.725593328475952, |
|
"eval_runtime": 0.8317, |
|
"eval_samples_per_second": 12.024, |
|
"eval_steps_per_second": 1.202, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 9.654545454545454, |
|
"step": 270, |
|
"total_flos": 8.156088875152835e+17, |
|
"train_loss": 1.7710220513520418, |
|
"train_runtime": 1245.0854, |
|
"train_samples_per_second": 112.233, |
|
"train_steps_per_second": 0.217 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 270, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.156088875152835e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|