|
{ |
|
"best_global_step": 675, |
|
"best_metric": 0.6912277936935425, |
|
"best_model_checkpoint": "checkpoints/rft-finetune-llama-3.1-8b-gsm8k/gsm8k/finetune-llama-3.1-8b-gsm8k-step-1/checkpoint-675", |
|
"epoch": 1.0, |
|
"eval_steps": 75, |
|
"global_step": 744, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006720430107526882, |
|
"grad_norm": 14.59550096035499, |
|
"learning_rate": 5.333333333333333e-09, |
|
"loss": 1.1893, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.013440860215053764, |
|
"grad_norm": 15.552214878649293, |
|
"learning_rate": 1.1999999999999998e-08, |
|
"loss": 1.1733, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.020161290322580645, |
|
"grad_norm": 16.885206147000304, |
|
"learning_rate": 1.8666666666666665e-08, |
|
"loss": 1.1934, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.026881720430107527, |
|
"grad_norm": 15.221916137414837, |
|
"learning_rate": 2.5333333333333335e-08, |
|
"loss": 1.2152, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.033602150537634407, |
|
"grad_norm": 15.887269098975871, |
|
"learning_rate": 3.2e-08, |
|
"loss": 1.2051, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04032258064516129, |
|
"grad_norm": 14.254790289709463, |
|
"learning_rate": 3.866666666666666e-08, |
|
"loss": 1.1997, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04704301075268817, |
|
"grad_norm": 14.583670649351852, |
|
"learning_rate": 4.533333333333333e-08, |
|
"loss": 1.1812, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.053763440860215055, |
|
"grad_norm": 14.695116879291863, |
|
"learning_rate": 5.2e-08, |
|
"loss": 1.195, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.06048387096774194, |
|
"grad_norm": 14.808886364711231, |
|
"learning_rate": 5.866666666666666e-08, |
|
"loss": 1.19, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.06720430107526881, |
|
"grad_norm": 14.279020954727395, |
|
"learning_rate": 6.533333333333332e-08, |
|
"loss": 1.1758, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0739247311827957, |
|
"grad_norm": 15.710193271146736, |
|
"learning_rate": 7.2e-08, |
|
"loss": 1.1654, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.08064516129032258, |
|
"grad_norm": 12.823166713617123, |
|
"learning_rate": 7.866666666666666e-08, |
|
"loss": 1.0956, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08736559139784947, |
|
"grad_norm": 13.123484893071117, |
|
"learning_rate": 8.533333333333333e-08, |
|
"loss": 1.0939, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.09408602150537634, |
|
"grad_norm": 13.804028711760814, |
|
"learning_rate": 9.2e-08, |
|
"loss": 1.0806, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.10080645161290322, |
|
"grad_norm": 14.362479314024831, |
|
"learning_rate": 9.866666666666666e-08, |
|
"loss": 1.0428, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.10080645161290322, |
|
"eval_loss": 1.0135635137557983, |
|
"eval_runtime": 64.0958, |
|
"eval_samples_per_second": 128.277, |
|
"eval_steps_per_second": 2.013, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.10752688172043011, |
|
"grad_norm": 12.804236982581402, |
|
"learning_rate": 9.940209267563527e-08, |
|
"loss": 0.9585, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.11424731182795698, |
|
"grad_norm": 11.669278429338862, |
|
"learning_rate": 9.865470852017936e-08, |
|
"loss": 0.9112, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.12096774193548387, |
|
"grad_norm": 11.081808260451195, |
|
"learning_rate": 9.790732436472347e-08, |
|
"loss": 0.877, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.12768817204301075, |
|
"grad_norm": 16.49147962749734, |
|
"learning_rate": 9.715994020926755e-08, |
|
"loss": 0.8504, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.13440860215053763, |
|
"grad_norm": 13.95628735293844, |
|
"learning_rate": 9.641255605381165e-08, |
|
"loss": 0.7931, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.14112903225806453, |
|
"grad_norm": 8.601123066406517, |
|
"learning_rate": 9.566517189835575e-08, |
|
"loss": 0.711, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.1478494623655914, |
|
"grad_norm": 3.7294133148506337, |
|
"learning_rate": 9.491778774289984e-08, |
|
"loss": 0.6981, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.15456989247311828, |
|
"grad_norm": 2.723138779265988, |
|
"learning_rate": 9.417040358744395e-08, |
|
"loss": 0.6953, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.16129032258064516, |
|
"grad_norm": 2.2560223539756423, |
|
"learning_rate": 9.342301943198804e-08, |
|
"loss": 0.6846, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.16801075268817203, |
|
"grad_norm": 2.2521211316559433, |
|
"learning_rate": 9.267563527653213e-08, |
|
"loss": 0.6696, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.17473118279569894, |
|
"grad_norm": 2.2818251781816588, |
|
"learning_rate": 9.192825112107622e-08, |
|
"loss": 0.6809, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.1814516129032258, |
|
"grad_norm": 2.44710559044971, |
|
"learning_rate": 9.118086696562033e-08, |
|
"loss": 0.6841, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.1881720430107527, |
|
"grad_norm": 2.350544704349094, |
|
"learning_rate": 9.043348281016442e-08, |
|
"loss": 0.6766, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.19489247311827956, |
|
"grad_norm": 1.9640158046302565, |
|
"learning_rate": 8.968609865470852e-08, |
|
"loss": 0.6878, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.20161290322580644, |
|
"grad_norm": 2.0782198781659353, |
|
"learning_rate": 8.893871449925261e-08, |
|
"loss": 0.6636, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.20161290322580644, |
|
"eval_loss": 0.6971947550773621, |
|
"eval_runtime": 56.1638, |
|
"eval_samples_per_second": 146.393, |
|
"eval_steps_per_second": 2.297, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.20833333333333334, |
|
"grad_norm": 2.1288692138731458, |
|
"learning_rate": 8.819133034379672e-08, |
|
"loss": 0.6866, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.21505376344086022, |
|
"grad_norm": 2.2152810121981568, |
|
"learning_rate": 8.74439461883408e-08, |
|
"loss": 0.6884, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2217741935483871, |
|
"grad_norm": 1.9700569516052628, |
|
"learning_rate": 8.66965620328849e-08, |
|
"loss": 0.6793, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.22849462365591397, |
|
"grad_norm": 2.0535897043325724, |
|
"learning_rate": 8.5949177877429e-08, |
|
"loss": 0.6777, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.23521505376344087, |
|
"grad_norm": 2.011966313411605, |
|
"learning_rate": 8.520179372197309e-08, |
|
"loss": 0.6596, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.24193548387096775, |
|
"grad_norm": 2.0907303106285164, |
|
"learning_rate": 8.445440956651718e-08, |
|
"loss": 0.6713, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.24865591397849462, |
|
"grad_norm": 1.9870643151130227, |
|
"learning_rate": 8.370702541106129e-08, |
|
"loss": 0.6835, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.2553763440860215, |
|
"grad_norm": 2.1110324099341797, |
|
"learning_rate": 8.295964125560538e-08, |
|
"loss": 0.6691, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2620967741935484, |
|
"grad_norm": 2.337028306260816, |
|
"learning_rate": 8.221225710014947e-08, |
|
"loss": 0.6706, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.26881720430107525, |
|
"grad_norm": 1.9987914444654455, |
|
"learning_rate": 8.146487294469356e-08, |
|
"loss": 0.6769, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.27553763440860213, |
|
"grad_norm": 2.1558155606241884, |
|
"learning_rate": 8.071748878923767e-08, |
|
"loss": 0.6709, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.28225806451612906, |
|
"grad_norm": 2.09004359899187, |
|
"learning_rate": 7.997010463378176e-08, |
|
"loss": 0.6784, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.28897849462365593, |
|
"grad_norm": 2.285145499206384, |
|
"learning_rate": 7.922272047832586e-08, |
|
"loss": 0.6715, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.2956989247311828, |
|
"grad_norm": 2.027191135310369, |
|
"learning_rate": 7.847533632286996e-08, |
|
"loss": 0.6612, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3024193548387097, |
|
"grad_norm": 2.0151475678114705, |
|
"learning_rate": 7.772795216741404e-08, |
|
"loss": 0.6676, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.3024193548387097, |
|
"eval_loss": 0.6939067244529724, |
|
"eval_runtime": 56.1322, |
|
"eval_samples_per_second": 146.476, |
|
"eval_steps_per_second": 2.298, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.30913978494623656, |
|
"grad_norm": 1.9995401054673094, |
|
"learning_rate": 7.698056801195815e-08, |
|
"loss": 0.6593, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.31586021505376344, |
|
"grad_norm": 2.0110294954158454, |
|
"learning_rate": 7.623318385650224e-08, |
|
"loss": 0.6563, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.3225806451612903, |
|
"grad_norm": 2.059981924661176, |
|
"learning_rate": 7.548579970104633e-08, |
|
"loss": 0.6764, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3293010752688172, |
|
"grad_norm": 2.149345633891912, |
|
"learning_rate": 7.473841554559043e-08, |
|
"loss": 0.6645, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.33602150537634407, |
|
"grad_norm": 2.101850865821983, |
|
"learning_rate": 7.399103139013453e-08, |
|
"loss": 0.6612, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.34274193548387094, |
|
"grad_norm": 2.1254515490062245, |
|
"learning_rate": 7.324364723467862e-08, |
|
"loss": 0.6773, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.34946236559139787, |
|
"grad_norm": 2.08043259609194, |
|
"learning_rate": 7.249626307922272e-08, |
|
"loss": 0.6826, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.35618279569892475, |
|
"grad_norm": 1.972159359225326, |
|
"learning_rate": 7.174887892376681e-08, |
|
"loss": 0.6736, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.3629032258064516, |
|
"grad_norm": 2.020214538372761, |
|
"learning_rate": 7.100149476831092e-08, |
|
"loss": 0.6628, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.3696236559139785, |
|
"grad_norm": 2.119471507884024, |
|
"learning_rate": 7.0254110612855e-08, |
|
"loss": 0.6628, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.3763440860215054, |
|
"grad_norm": 2.2339027083129803, |
|
"learning_rate": 6.95067264573991e-08, |
|
"loss": 0.6461, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.38306451612903225, |
|
"grad_norm": 2.179556694440606, |
|
"learning_rate": 6.87593423019432e-08, |
|
"loss": 0.6685, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.3897849462365591, |
|
"grad_norm": 2.0979269829823477, |
|
"learning_rate": 6.801195814648729e-08, |
|
"loss": 0.6688, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.396505376344086, |
|
"grad_norm": 2.145759818638723, |
|
"learning_rate": 6.72645739910314e-08, |
|
"loss": 0.6582, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.4032258064516129, |
|
"grad_norm": 2.0232579108607998, |
|
"learning_rate": 6.651718983557549e-08, |
|
"loss": 0.6849, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4032258064516129, |
|
"eval_loss": 0.6928321123123169, |
|
"eval_runtime": 56.1826, |
|
"eval_samples_per_second": 146.344, |
|
"eval_steps_per_second": 2.296, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4099462365591398, |
|
"grad_norm": 2.215763186449215, |
|
"learning_rate": 6.576980568011958e-08, |
|
"loss": 0.6772, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 1.9274901065614445, |
|
"learning_rate": 6.502242152466367e-08, |
|
"loss": 0.691, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.42338709677419356, |
|
"grad_norm": 2.108143183803596, |
|
"learning_rate": 6.427503736920778e-08, |
|
"loss": 0.6628, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.43010752688172044, |
|
"grad_norm": 2.207966702792586, |
|
"learning_rate": 6.352765321375186e-08, |
|
"loss": 0.6768, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.4368279569892473, |
|
"grad_norm": 2.096647992540451, |
|
"learning_rate": 6.278026905829596e-08, |
|
"loss": 0.6667, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.4435483870967742, |
|
"grad_norm": 2.1336095512755113, |
|
"learning_rate": 6.203288490284006e-08, |
|
"loss": 0.6581, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.45026881720430106, |
|
"grad_norm": 2.0159699059441976, |
|
"learning_rate": 6.128550074738415e-08, |
|
"loss": 0.6743, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.45698924731182794, |
|
"grad_norm": 1.9979710525534193, |
|
"learning_rate": 6.053811659192824e-08, |
|
"loss": 0.6638, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.4637096774193548, |
|
"grad_norm": 2.049384366870693, |
|
"learning_rate": 5.979073243647235e-08, |
|
"loss": 0.679, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.47043010752688175, |
|
"grad_norm": 2.147547124197116, |
|
"learning_rate": 5.9043348281016435e-08, |
|
"loss": 0.6719, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4771505376344086, |
|
"grad_norm": 1.994604705245644, |
|
"learning_rate": 5.8295964125560534e-08, |
|
"loss": 0.6622, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.4838709677419355, |
|
"grad_norm": 2.035462459844569, |
|
"learning_rate": 5.754857997010463e-08, |
|
"loss": 0.6464, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4905913978494624, |
|
"grad_norm": 2.042123968621037, |
|
"learning_rate": 5.6801195814648727e-08, |
|
"loss": 0.6627, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.49731182795698925, |
|
"grad_norm": 2.14484478856972, |
|
"learning_rate": 5.605381165919282e-08, |
|
"loss": 0.6522, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5040322580645161, |
|
"grad_norm": 2.0812193752637462, |
|
"learning_rate": 5.530642750373692e-08, |
|
"loss": 0.6858, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.5040322580645161, |
|
"eval_loss": 0.6924068927764893, |
|
"eval_runtime": 56.1089, |
|
"eval_samples_per_second": 146.536, |
|
"eval_steps_per_second": 2.299, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.510752688172043, |
|
"grad_norm": 2.1209544150897757, |
|
"learning_rate": 5.455904334828101e-08, |
|
"loss": 0.6451, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5174731182795699, |
|
"grad_norm": 2.045170736604748, |
|
"learning_rate": 5.381165919282511e-08, |
|
"loss": 0.6624, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.5241935483870968, |
|
"grad_norm": 2.10236284260544, |
|
"learning_rate": 5.306427503736921e-08, |
|
"loss": 0.6741, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5309139784946236, |
|
"grad_norm": 2.1314029429521244, |
|
"learning_rate": 5.2316890881913303e-08, |
|
"loss": 0.6642, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.5376344086021505, |
|
"grad_norm": 2.186951067665247, |
|
"learning_rate": 5.15695067264574e-08, |
|
"loss": 0.6836, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5443548387096774, |
|
"grad_norm": 2.052806350954309, |
|
"learning_rate": 5.082212257100149e-08, |
|
"loss": 0.672, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.5510752688172043, |
|
"grad_norm": 2.071189416663902, |
|
"learning_rate": 5.0074738415545595e-08, |
|
"loss": 0.6677, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.5577956989247311, |
|
"grad_norm": 2.0118486297022833, |
|
"learning_rate": 4.932735426008968e-08, |
|
"loss": 0.6639, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.5645161290322581, |
|
"grad_norm": 2.0632350448343333, |
|
"learning_rate": 4.8579970104633774e-08, |
|
"loss": 0.6754, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.571236559139785, |
|
"grad_norm": 2.251163063469807, |
|
"learning_rate": 4.7832585949177874e-08, |
|
"loss": 0.7003, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.5779569892473119, |
|
"grad_norm": 2.0661342085354697, |
|
"learning_rate": 4.708520179372197e-08, |
|
"loss": 0.6904, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5846774193548387, |
|
"grad_norm": 2.1557028238109517, |
|
"learning_rate": 4.6337817638266066e-08, |
|
"loss": 0.6318, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.5913978494623656, |
|
"grad_norm": 2.1516536049922452, |
|
"learning_rate": 4.5590433482810165e-08, |
|
"loss": 0.6552, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5981182795698925, |
|
"grad_norm": 2.086935790084074, |
|
"learning_rate": 4.484304932735426e-08, |
|
"loss": 0.6778, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.6048387096774194, |
|
"grad_norm": 2.146673007914857, |
|
"learning_rate": 4.409566517189836e-08, |
|
"loss": 0.6536, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6048387096774194, |
|
"eval_loss": 0.6919218897819519, |
|
"eval_runtime": 56.1387, |
|
"eval_samples_per_second": 146.459, |
|
"eval_steps_per_second": 2.298, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6115591397849462, |
|
"grad_norm": 2.1253216816067066, |
|
"learning_rate": 4.334828101644245e-08, |
|
"loss": 0.6511, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.6182795698924731, |
|
"grad_norm": 2.071737757767298, |
|
"learning_rate": 4.260089686098654e-08, |
|
"loss": 0.6592, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 2.0932212158820396, |
|
"learning_rate": 4.185351270553064e-08, |
|
"loss": 0.6692, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.6317204301075269, |
|
"grad_norm": 2.0910146959652307, |
|
"learning_rate": 4.1106128550074736e-08, |
|
"loss": 0.6683, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.6384408602150538, |
|
"grad_norm": 2.062557270254667, |
|
"learning_rate": 4.0358744394618835e-08, |
|
"loss": 0.6656, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.6451612903225806, |
|
"grad_norm": 2.017329078129843, |
|
"learning_rate": 3.961136023916293e-08, |
|
"loss": 0.6719, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.6518817204301075, |
|
"grad_norm": 2.2036353665493755, |
|
"learning_rate": 3.886397608370702e-08, |
|
"loss": 0.682, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.6586021505376344, |
|
"grad_norm": 2.007622155160972, |
|
"learning_rate": 3.811659192825112e-08, |
|
"loss": 0.6486, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.6653225806451613, |
|
"grad_norm": 2.2339142859626553, |
|
"learning_rate": 3.736920777279521e-08, |
|
"loss": 0.6542, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.6720430107526881, |
|
"grad_norm": 2.1968231695887823, |
|
"learning_rate": 3.662182361733931e-08, |
|
"loss": 0.6701, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.678763440860215, |
|
"grad_norm": 2.311102123656589, |
|
"learning_rate": 3.5874439461883405e-08, |
|
"loss": 0.6541, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.6854838709677419, |
|
"grad_norm": 2.162411566728306, |
|
"learning_rate": 3.51270553064275e-08, |
|
"loss": 0.6609, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6922043010752689, |
|
"grad_norm": 2.0777352845523156, |
|
"learning_rate": 3.43796711509716e-08, |
|
"loss": 0.6654, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.6989247311827957, |
|
"grad_norm": 2.0466624810725613, |
|
"learning_rate": 3.36322869955157e-08, |
|
"loss": 0.6637, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.7056451612903226, |
|
"grad_norm": 2.046164435882464, |
|
"learning_rate": 3.288490284005979e-08, |
|
"loss": 0.6672, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.7056451612903226, |
|
"eval_loss": 0.691673994064331, |
|
"eval_runtime": 56.2018, |
|
"eval_samples_per_second": 146.294, |
|
"eval_steps_per_second": 2.295, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.7123655913978495, |
|
"grad_norm": 2.098676297617592, |
|
"learning_rate": 3.213751868460389e-08, |
|
"loss": 0.6581, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.7190860215053764, |
|
"grad_norm": 1.9986286959461719, |
|
"learning_rate": 3.139013452914798e-08, |
|
"loss": 0.6658, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.7258064516129032, |
|
"grad_norm": 2.1195333643509935, |
|
"learning_rate": 3.0642750373692075e-08, |
|
"loss": 0.669, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.7325268817204301, |
|
"grad_norm": 2.055327535910759, |
|
"learning_rate": 2.9895366218236174e-08, |
|
"loss": 0.6663, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.739247311827957, |
|
"grad_norm": 2.074614293697511, |
|
"learning_rate": 2.9147982062780267e-08, |
|
"loss": 0.6627, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.7459677419354839, |
|
"grad_norm": 2.0360906767754, |
|
"learning_rate": 2.8400597907324363e-08, |
|
"loss": 0.6619, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.7526881720430108, |
|
"grad_norm": 2.098550310165075, |
|
"learning_rate": 2.765321375186846e-08, |
|
"loss": 0.6791, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.7594086021505376, |
|
"grad_norm": 2.056390058629133, |
|
"learning_rate": 2.6905829596412556e-08, |
|
"loss": 0.6569, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.7661290322580645, |
|
"grad_norm": 2.2413055374611632, |
|
"learning_rate": 2.6158445440956652e-08, |
|
"loss": 0.6704, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.7728494623655914, |
|
"grad_norm": 2.054426280985883, |
|
"learning_rate": 2.5411061285500745e-08, |
|
"loss": 0.6563, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.7795698924731183, |
|
"grad_norm": 2.0006598598366345, |
|
"learning_rate": 2.466367713004484e-08, |
|
"loss": 0.6567, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.7862903225806451, |
|
"grad_norm": 2.2751507511393925, |
|
"learning_rate": 2.3916292974588937e-08, |
|
"loss": 0.6693, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.793010752688172, |
|
"grad_norm": 2.1645437231829456, |
|
"learning_rate": 2.3168908819133033e-08, |
|
"loss": 0.6643, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.7997311827956989, |
|
"grad_norm": 2.0220952097349176, |
|
"learning_rate": 2.242152466367713e-08, |
|
"loss": 0.66, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.8064516129032258, |
|
"grad_norm": 2.078077165465023, |
|
"learning_rate": 2.1674140508221225e-08, |
|
"loss": 0.6559, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8064516129032258, |
|
"eval_loss": 0.6913867592811584, |
|
"eval_runtime": 56.1429, |
|
"eval_samples_per_second": 146.448, |
|
"eval_steps_per_second": 2.298, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8131720430107527, |
|
"grad_norm": 2.3106172558361884, |
|
"learning_rate": 2.092675635276532e-08, |
|
"loss": 0.6828, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.8198924731182796, |
|
"grad_norm": 2.094033510896202, |
|
"learning_rate": 2.0179372197309417e-08, |
|
"loss": 0.6635, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.8266129032258065, |
|
"grad_norm": 2.0658322655908665, |
|
"learning_rate": 1.943198804185351e-08, |
|
"loss": 0.6753, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 2.17250427254849, |
|
"learning_rate": 1.8684603886397606e-08, |
|
"loss": 0.6727, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.8400537634408602, |
|
"grad_norm": 2.259179736932614, |
|
"learning_rate": 1.7937219730941703e-08, |
|
"loss": 0.6959, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.8467741935483871, |
|
"grad_norm": 2.0612328204646815, |
|
"learning_rate": 1.71898355754858e-08, |
|
"loss": 0.6584, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.853494623655914, |
|
"grad_norm": 2.1679992601621154, |
|
"learning_rate": 1.6442451420029895e-08, |
|
"loss": 0.6508, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.8602150537634409, |
|
"grad_norm": 1.9886841572370688, |
|
"learning_rate": 1.569506726457399e-08, |
|
"loss": 0.6729, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.8669354838709677, |
|
"grad_norm": 2.1506038261656184, |
|
"learning_rate": 1.4947683109118087e-08, |
|
"loss": 0.6608, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.8736559139784946, |
|
"grad_norm": 1.9864785884472476, |
|
"learning_rate": 1.4200298953662182e-08, |
|
"loss": 0.6628, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8803763440860215, |
|
"grad_norm": 2.1120434621533284, |
|
"learning_rate": 1.3452914798206278e-08, |
|
"loss": 0.6838, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.8870967741935484, |
|
"grad_norm": 1.9981627858324142, |
|
"learning_rate": 1.2705530642750372e-08, |
|
"loss": 0.6561, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.8938172043010753, |
|
"grad_norm": 2.050666914131394, |
|
"learning_rate": 1.1958146487294468e-08, |
|
"loss": 0.6491, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.9005376344086021, |
|
"grad_norm": 2.159702446356377, |
|
"learning_rate": 1.1210762331838565e-08, |
|
"loss": 0.6783, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.907258064516129, |
|
"grad_norm": 2.0164611126323697, |
|
"learning_rate": 1.046337817638266e-08, |
|
"loss": 0.6472, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.907258064516129, |
|
"eval_loss": 0.6912277936935425, |
|
"eval_runtime": 56.1813, |
|
"eval_samples_per_second": 146.348, |
|
"eval_steps_per_second": 2.296, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.9139784946236559, |
|
"grad_norm": 2.1472859721487683, |
|
"learning_rate": 9.715994020926755e-09, |
|
"loss": 0.6621, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.9206989247311828, |
|
"grad_norm": 2.138066510167175, |
|
"learning_rate": 8.968609865470851e-09, |
|
"loss": 0.6396, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.9274193548387096, |
|
"grad_norm": 2.2998888062886538, |
|
"learning_rate": 8.221225710014947e-09, |
|
"loss": 0.6603, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.9341397849462365, |
|
"grad_norm": 2.0384421650226527, |
|
"learning_rate": 7.473841554559044e-09, |
|
"loss": 0.6384, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.9408602150537635, |
|
"grad_norm": 2.0927886382588827, |
|
"learning_rate": 6.726457399103139e-09, |
|
"loss": 0.6557, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.9475806451612904, |
|
"grad_norm": 2.02850385366728, |
|
"learning_rate": 5.979073243647234e-09, |
|
"loss": 0.6663, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.9543010752688172, |
|
"grad_norm": 2.2433567010088797, |
|
"learning_rate": 5.23168908819133e-09, |
|
"loss": 0.6796, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.9610215053763441, |
|
"grad_norm": 2.057838483040146, |
|
"learning_rate": 4.484304932735426e-09, |
|
"loss": 0.6626, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.967741935483871, |
|
"grad_norm": 2.1280821159120786, |
|
"learning_rate": 3.736920777279522e-09, |
|
"loss": 0.633, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.9744623655913979, |
|
"grad_norm": 2.019967451723735, |
|
"learning_rate": 2.989536621823617e-09, |
|
"loss": 0.6718, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.9811827956989247, |
|
"grad_norm": 2.1683941089274477, |
|
"learning_rate": 2.242152466367713e-09, |
|
"loss": 0.6578, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.9879032258064516, |
|
"grad_norm": 2.015824272032049, |
|
"learning_rate": 1.4947683109118085e-09, |
|
"loss": 0.6781, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.9946236559139785, |
|
"grad_norm": 2.1171098020664325, |
|
"learning_rate": 7.473841554559043e-10, |
|
"loss": 0.6829, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 744, |
|
"total_flos": 1582562992455680.0, |
|
"train_loss": 0.7245069735793657, |
|
"train_runtime": 3975.4106, |
|
"train_samples_per_second": 11.971, |
|
"train_steps_per_second": 0.187 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 744, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 75, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1582562992455680.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|