llama3-1b-summarize-gpt4o-128k / trainer_state.json
chansung's picture
Model save
e4e88e3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.654545454545454,
"eval_steps": 500,
"global_step": 270,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03636363636363636,
"grad_norm": 3.691458225250244,
"learning_rate": 7.4074074074074075e-06,
"loss": 2.6618,
"step": 1
},
{
"epoch": 0.18181818181818182,
"grad_norm": 3.446274518966675,
"learning_rate": 3.7037037037037037e-05,
"loss": 2.6408,
"step": 5
},
{
"epoch": 0.36363636363636365,
"grad_norm": 2.032472848892212,
"learning_rate": 7.407407407407407e-05,
"loss": 2.5317,
"step": 10
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.9463324546813965,
"learning_rate": 0.00011111111111111112,
"loss": 2.36,
"step": 15
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.7080094218254089,
"learning_rate": 0.00014814814814814815,
"loss": 2.2174,
"step": 20
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.5037975311279297,
"learning_rate": 0.0001851851851851852,
"loss": 2.1182,
"step": 25
},
{
"epoch": 1.0,
"eval_loss": 2.782132625579834,
"eval_runtime": 0.8292,
"eval_samples_per_second": 12.06,
"eval_steps_per_second": 1.206,
"step": 28
},
{
"epoch": 1.0727272727272728,
"grad_norm": 0.41888633370399475,
"learning_rate": 0.00019992479525042303,
"loss": 2.0277,
"step": 30
},
{
"epoch": 1.2545454545454544,
"grad_norm": 0.27915704250335693,
"learning_rate": 0.00019946562024066014,
"loss": 1.9587,
"step": 35
},
{
"epoch": 1.4363636363636363,
"grad_norm": 0.20056034624576569,
"learning_rate": 0.00019859096633447965,
"loss": 1.9087,
"step": 40
},
{
"epoch": 1.6181818181818182,
"grad_norm": 0.16737522184848785,
"learning_rate": 0.00019730448705798239,
"loss": 1.8766,
"step": 45
},
{
"epoch": 1.8,
"grad_norm": 0.15048423409461975,
"learning_rate": 0.00019561155617738797,
"loss": 1.8481,
"step": 50
},
{
"epoch": 1.981818181818182,
"grad_norm": 0.1224176436662674,
"learning_rate": 0.000193519245252219,
"loss": 1.8354,
"step": 55
},
{
"epoch": 2.0,
"eval_loss": 2.737755537033081,
"eval_runtime": 0.829,
"eval_samples_per_second": 12.063,
"eval_steps_per_second": 1.206,
"step": 56
},
{
"epoch": 2.1454545454545455,
"grad_norm": 0.1324545294046402,
"learning_rate": 0.0001910362940966147,
"loss": 1.8118,
"step": 60
},
{
"epoch": 2.327272727272727,
"grad_norm": 0.11611360311508179,
"learning_rate": 0.0001881730742721608,
"loss": 1.7937,
"step": 65
},
{
"epoch": 2.509090909090909,
"grad_norm": 0.1148991584777832,
"learning_rate": 0.00018494154576472976,
"loss": 1.7791,
"step": 70
},
{
"epoch": 2.690909090909091,
"grad_norm": 0.11438702791929245,
"learning_rate": 0.00018135520702629675,
"loss": 1.7654,
"step": 75
},
{
"epoch": 2.8727272727272726,
"grad_norm": 0.11716635525226593,
"learning_rate": 0.00017742903859041325,
"loss": 1.7604,
"step": 80
},
{
"epoch": 3.0,
"eval_loss": 2.7259373664855957,
"eval_runtime": 0.8303,
"eval_samples_per_second": 12.044,
"eval_steps_per_second": 1.204,
"step": 84
},
{
"epoch": 3.036363636363636,
"grad_norm": 0.1302882581949234,
"learning_rate": 0.00017317944049686124,
"loss": 1.7453,
"step": 85
},
{
"epoch": 3.2181818181818183,
"grad_norm": 0.12489154189825058,
"learning_rate": 0.0001686241637868734,
"loss": 1.7396,
"step": 90
},
{
"epoch": 3.4,
"grad_norm": 0.10804688185453415,
"learning_rate": 0.0001637822363550706,
"loss": 1.7272,
"step": 95
},
{
"epoch": 3.581818181818182,
"grad_norm": 0.1448238343000412,
"learning_rate": 0.0001586738834678418,
"loss": 1.7231,
"step": 100
},
{
"epoch": 3.7636363636363637,
"grad_norm": 0.12403673678636551,
"learning_rate": 0.00015332044328016914,
"loss": 1.7101,
"step": 105
},
{
"epoch": 3.9454545454545453,
"grad_norm": 0.11520184576511383,
"learning_rate": 0.0001477442777037949,
"loss": 1.7035,
"step": 110
},
{
"epoch": 4.0,
"eval_loss": 2.724990129470825,
"eval_runtime": 0.8296,
"eval_samples_per_second": 12.053,
"eval_steps_per_second": 1.205,
"step": 112
},
{
"epoch": 4.109090909090909,
"grad_norm": 0.11850611865520477,
"learning_rate": 0.0001419686789990429,
"loss": 1.6998,
"step": 115
},
{
"epoch": 4.290909090909091,
"grad_norm": 0.141310453414917,
"learning_rate": 0.00013601777248047105,
"loss": 1.6942,
"step": 120
},
{
"epoch": 4.472727272727273,
"grad_norm": 0.14388997852802277,
"learning_rate": 0.00012991641574276418,
"loss": 1.6887,
"step": 125
},
{
"epoch": 4.654545454545454,
"grad_norm": 0.11356977373361588,
"learning_rate": 0.00012369009482781192,
"loss": 1.6845,
"step": 130
},
{
"epoch": 4.836363636363636,
"grad_norm": 0.13505423069000244,
"learning_rate": 0.00011736481776669306,
"loss": 1.6801,
"step": 135
},
{
"epoch": 5.0,
"grad_norm": 0.18071481585502625,
"learning_rate": 0.00011096700594125318,
"loss": 1.6822,
"step": 140
},
{
"epoch": 5.0,
"eval_loss": 2.7262730598449707,
"eval_runtime": 0.8327,
"eval_samples_per_second": 12.009,
"eval_steps_per_second": 1.201,
"step": 140
},
{
"epoch": 5.181818181818182,
"grad_norm": 0.12405228614807129,
"learning_rate": 0.00010452338371907064,
"loss": 1.671,
"step": 145
},
{
"epoch": 5.363636363636363,
"grad_norm": 0.15709254145622253,
"learning_rate": 9.806086682281758e-05,
"loss": 1.6697,
"step": 150
},
{
"epoch": 5.545454545454545,
"grad_norm": 0.1405353993177414,
"learning_rate": 9.160644990030931e-05,
"loss": 1.6707,
"step": 155
},
{
"epoch": 5.7272727272727275,
"grad_norm": 0.13487176597118378,
"learning_rate": 8.518709376487515e-05,
"loss": 1.6619,
"step": 160
},
{
"epoch": 5.909090909090909,
"grad_norm": 0.12394227087497711,
"learning_rate": 7.882961277705895e-05,
"loss": 1.6619,
"step": 165
},
{
"epoch": 6.0,
"eval_loss": 2.7253997325897217,
"eval_runtime": 0.8321,
"eval_samples_per_second": 12.017,
"eval_steps_per_second": 1.202,
"step": 168
},
{
"epoch": 6.072727272727272,
"grad_norm": 0.11816684156656265,
"learning_rate": 7.256056283806986e-05,
"loss": 1.6573,
"step": 170
},
{
"epoch": 6.254545454545455,
"grad_norm": 0.14117498695850372,
"learning_rate": 6.640613046284581e-05,
"loss": 1.6622,
"step": 175
},
{
"epoch": 6.4363636363636365,
"grad_norm": 0.1342514008283615,
"learning_rate": 6.039202339608432e-05,
"loss": 1.6535,
"step": 180
},
{
"epoch": 6.618181818181818,
"grad_norm": 0.13483189046382904,
"learning_rate": 5.4543363228149946e-05,
"loss": 1.6532,
"step": 185
},
{
"epoch": 6.8,
"grad_norm": 0.1636153757572174,
"learning_rate": 4.888458045941269e-05,
"loss": 1.6482,
"step": 190
},
{
"epoch": 6.9818181818181815,
"grad_norm": 0.1563912183046341,
"learning_rate": 4.343931245134616e-05,
"loss": 1.6471,
"step": 195
},
{
"epoch": 7.0,
"eval_loss": 2.7240517139434814,
"eval_runtime": 0.8312,
"eval_samples_per_second": 12.031,
"eval_steps_per_second": 1.203,
"step": 196
},
{
"epoch": 7.1454545454545455,
"grad_norm": 0.11320989578962326,
"learning_rate": 3.8230304690654304e-05,
"loss": 1.6472,
"step": 200
},
{
"epoch": 7.327272727272727,
"grad_norm": 0.111383818089962,
"learning_rate": 3.3279315778858036e-05,
"loss": 1.6488,
"step": 205
},
{
"epoch": 7.509090909090909,
"grad_norm": 0.10844731330871582,
"learning_rate": 2.8607026544210114e-05,
"loss": 1.6458,
"step": 210
},
{
"epoch": 7.690909090909091,
"grad_norm": 0.10823339223861694,
"learning_rate": 2.423295365558821e-05,
"loss": 1.6456,
"step": 215
},
{
"epoch": 7.872727272727273,
"grad_norm": 0.10790830850601196,
"learning_rate": 2.01753680992107e-05,
"loss": 1.6458,
"step": 220
},
{
"epoch": 8.0,
"eval_loss": 2.7252650260925293,
"eval_runtime": 0.8302,
"eval_samples_per_second": 12.045,
"eval_steps_per_second": 1.204,
"step": 224
},
{
"epoch": 8.036363636363637,
"grad_norm": 0.11462420970201492,
"learning_rate": 1.6451218858706374e-05,
"loss": 1.643,
"step": 225
},
{
"epoch": 8.218181818181819,
"grad_norm": 0.10164881497621536,
"learning_rate": 1.307606211733522e-05,
"loss": 1.6435,
"step": 230
},
{
"epoch": 8.4,
"grad_norm": 0.11715802550315857,
"learning_rate": 1.0063996278090704e-05,
"loss": 1.6436,
"step": 235
},
{
"epoch": 8.581818181818182,
"grad_norm": 0.1077931597828865,
"learning_rate": 7.427603073110967e-06,
"loss": 1.6437,
"step": 240
},
{
"epoch": 8.763636363636364,
"grad_norm": 0.09881118685007095,
"learning_rate": 5.177895008392353e-06,
"loss": 1.6415,
"step": 245
},
{
"epoch": 8.945454545454545,
"grad_norm": 0.0973580852150917,
"learning_rate": 3.3242693633337983e-06,
"loss": 1.641,
"step": 250
},
{
"epoch": 9.0,
"eval_loss": 2.725569009780884,
"eval_runtime": 0.8306,
"eval_samples_per_second": 12.039,
"eval_steps_per_second": 1.204,
"step": 252
},
{
"epoch": 9.10909090909091,
"grad_norm": 0.10264136642217636,
"learning_rate": 1.874468937261531e-06,
"loss": 1.6464,
"step": 255
},
{
"epoch": 9.290909090909091,
"grad_norm": 0.1021399274468422,
"learning_rate": 8.345497068998897e-07,
"loss": 1.6443,
"step": 260
},
{
"epoch": 9.472727272727273,
"grad_norm": 0.10423731058835983,
"learning_rate": 2.088555298867978e-07,
"loss": 1.6436,
"step": 265
},
{
"epoch": 9.654545454545454,
"grad_norm": 0.09860274940729141,
"learning_rate": 0.0,
"loss": 1.6383,
"step": 270
},
{
"epoch": 9.654545454545454,
"eval_loss": 2.725593328475952,
"eval_runtime": 0.8317,
"eval_samples_per_second": 12.024,
"eval_steps_per_second": 1.202,
"step": 270
},
{
"epoch": 9.654545454545454,
"step": 270,
"total_flos": 8.156088875152835e+17,
"train_loss": 1.7710220513520418,
"train_runtime": 1245.0854,
"train_samples_per_second": 112.233,
"train_steps_per_second": 0.217
}
],
"logging_steps": 5,
"max_steps": 270,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.156088875152835e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}