{ "best_global_step": 675, "best_metric": 0.6912277936935425, "best_model_checkpoint": "checkpoints/rft-finetune-llama-3.1-8b-gsm8k/gsm8k/finetune-llama-3.1-8b-gsm8k-step-1/checkpoint-675", "epoch": 1.0, "eval_steps": 75, "global_step": 744, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006720430107526882, "grad_norm": 14.59550096035499, "learning_rate": 5.333333333333333e-09, "loss": 1.1893, "step": 5 }, { "epoch": 0.013440860215053764, "grad_norm": 15.552214878649293, "learning_rate": 1.1999999999999998e-08, "loss": 1.1733, "step": 10 }, { "epoch": 0.020161290322580645, "grad_norm": 16.885206147000304, "learning_rate": 1.8666666666666665e-08, "loss": 1.1934, "step": 15 }, { "epoch": 0.026881720430107527, "grad_norm": 15.221916137414837, "learning_rate": 2.5333333333333335e-08, "loss": 1.2152, "step": 20 }, { "epoch": 0.033602150537634407, "grad_norm": 15.887269098975871, "learning_rate": 3.2e-08, "loss": 1.2051, "step": 25 }, { "epoch": 0.04032258064516129, "grad_norm": 14.254790289709463, "learning_rate": 3.866666666666666e-08, "loss": 1.1997, "step": 30 }, { "epoch": 0.04704301075268817, "grad_norm": 14.583670649351852, "learning_rate": 4.533333333333333e-08, "loss": 1.1812, "step": 35 }, { "epoch": 0.053763440860215055, "grad_norm": 14.695116879291863, "learning_rate": 5.2e-08, "loss": 1.195, "step": 40 }, { "epoch": 0.06048387096774194, "grad_norm": 14.808886364711231, "learning_rate": 5.866666666666666e-08, "loss": 1.19, "step": 45 }, { "epoch": 0.06720430107526881, "grad_norm": 14.279020954727395, "learning_rate": 6.533333333333332e-08, "loss": 1.1758, "step": 50 }, { "epoch": 0.0739247311827957, "grad_norm": 15.710193271146736, "learning_rate": 7.2e-08, "loss": 1.1654, "step": 55 }, { "epoch": 0.08064516129032258, "grad_norm": 12.823166713617123, "learning_rate": 7.866666666666666e-08, "loss": 1.0956, "step": 60 }, { "epoch": 0.08736559139784947, "grad_norm": 13.123484893071117, "learning_rate": 8.533333333333333e-08, "loss": 1.0939, "step": 65 }, { "epoch": 0.09408602150537634, "grad_norm": 13.804028711760814, "learning_rate": 9.2e-08, "loss": 1.0806, "step": 70 }, { "epoch": 0.10080645161290322, "grad_norm": 14.362479314024831, "learning_rate": 9.866666666666666e-08, "loss": 1.0428, "step": 75 }, { "epoch": 0.10080645161290322, "eval_loss": 1.0135635137557983, "eval_runtime": 64.0958, "eval_samples_per_second": 128.277, "eval_steps_per_second": 2.013, "step": 75 }, { "epoch": 0.10752688172043011, "grad_norm": 12.804236982581402, "learning_rate": 9.940209267563527e-08, "loss": 0.9585, "step": 80 }, { "epoch": 0.11424731182795698, "grad_norm": 11.669278429338862, "learning_rate": 9.865470852017936e-08, "loss": 0.9112, "step": 85 }, { "epoch": 0.12096774193548387, "grad_norm": 11.081808260451195, "learning_rate": 9.790732436472347e-08, "loss": 0.877, "step": 90 }, { "epoch": 0.12768817204301075, "grad_norm": 16.49147962749734, "learning_rate": 9.715994020926755e-08, "loss": 0.8504, "step": 95 }, { "epoch": 0.13440860215053763, "grad_norm": 13.95628735293844, "learning_rate": 9.641255605381165e-08, "loss": 0.7931, "step": 100 }, { "epoch": 0.14112903225806453, "grad_norm": 8.601123066406517, "learning_rate": 9.566517189835575e-08, "loss": 0.711, "step": 105 }, { "epoch": 0.1478494623655914, "grad_norm": 3.7294133148506337, "learning_rate": 9.491778774289984e-08, "loss": 0.6981, "step": 110 }, { "epoch": 0.15456989247311828, "grad_norm": 2.723138779265988, "learning_rate": 9.417040358744395e-08, "loss": 0.6953, "step": 115 }, { "epoch": 0.16129032258064516, "grad_norm": 2.2560223539756423, "learning_rate": 9.342301943198804e-08, "loss": 0.6846, "step": 120 }, { "epoch": 0.16801075268817203, "grad_norm": 2.2521211316559433, "learning_rate": 9.267563527653213e-08, "loss": 0.6696, "step": 125 }, { "epoch": 0.17473118279569894, "grad_norm": 2.2818251781816588, "learning_rate": 9.192825112107622e-08, "loss": 0.6809, "step": 130 }, { "epoch": 0.1814516129032258, "grad_norm": 2.44710559044971, "learning_rate": 9.118086696562033e-08, "loss": 0.6841, "step": 135 }, { "epoch": 0.1881720430107527, "grad_norm": 2.350544704349094, "learning_rate": 9.043348281016442e-08, "loss": 0.6766, "step": 140 }, { "epoch": 0.19489247311827956, "grad_norm": 1.9640158046302565, "learning_rate": 8.968609865470852e-08, "loss": 0.6878, "step": 145 }, { "epoch": 0.20161290322580644, "grad_norm": 2.0782198781659353, "learning_rate": 8.893871449925261e-08, "loss": 0.6636, "step": 150 }, { "epoch": 0.20161290322580644, "eval_loss": 0.6971947550773621, "eval_runtime": 56.1638, "eval_samples_per_second": 146.393, "eval_steps_per_second": 2.297, "step": 150 }, { "epoch": 0.20833333333333334, "grad_norm": 2.1288692138731458, "learning_rate": 8.819133034379672e-08, "loss": 0.6866, "step": 155 }, { "epoch": 0.21505376344086022, "grad_norm": 2.2152810121981568, "learning_rate": 8.74439461883408e-08, "loss": 0.6884, "step": 160 }, { "epoch": 0.2217741935483871, "grad_norm": 1.9700569516052628, "learning_rate": 8.66965620328849e-08, "loss": 0.6793, "step": 165 }, { "epoch": 0.22849462365591397, "grad_norm": 2.0535897043325724, "learning_rate": 8.5949177877429e-08, "loss": 0.6777, "step": 170 }, { "epoch": 0.23521505376344087, "grad_norm": 2.011966313411605, "learning_rate": 8.520179372197309e-08, "loss": 0.6596, "step": 175 }, { "epoch": 0.24193548387096775, "grad_norm": 2.0907303106285164, "learning_rate": 8.445440956651718e-08, "loss": 0.6713, "step": 180 }, { "epoch": 0.24865591397849462, "grad_norm": 1.9870643151130227, "learning_rate": 8.370702541106129e-08, "loss": 0.6835, "step": 185 }, { "epoch": 0.2553763440860215, "grad_norm": 2.1110324099341797, "learning_rate": 8.295964125560538e-08, "loss": 0.6691, "step": 190 }, { "epoch": 0.2620967741935484, "grad_norm": 2.337028306260816, "learning_rate": 8.221225710014947e-08, "loss": 0.6706, "step": 195 }, { "epoch": 0.26881720430107525, "grad_norm": 1.9987914444654455, "learning_rate": 8.146487294469356e-08, "loss": 0.6769, "step": 200 }, { "epoch": 0.27553763440860213, "grad_norm": 2.1558155606241884, "learning_rate": 8.071748878923767e-08, "loss": 0.6709, "step": 205 }, { "epoch": 0.28225806451612906, "grad_norm": 2.09004359899187, "learning_rate": 7.997010463378176e-08, "loss": 0.6784, "step": 210 }, { "epoch": 0.28897849462365593, "grad_norm": 2.285145499206384, "learning_rate": 7.922272047832586e-08, "loss": 0.6715, "step": 215 }, { "epoch": 0.2956989247311828, "grad_norm": 2.027191135310369, "learning_rate": 7.847533632286996e-08, "loss": 0.6612, "step": 220 }, { "epoch": 0.3024193548387097, "grad_norm": 2.0151475678114705, "learning_rate": 7.772795216741404e-08, "loss": 0.6676, "step": 225 }, { "epoch": 0.3024193548387097, "eval_loss": 0.6939067244529724, "eval_runtime": 56.1322, "eval_samples_per_second": 146.476, "eval_steps_per_second": 2.298, "step": 225 }, { "epoch": 0.30913978494623656, "grad_norm": 1.9995401054673094, "learning_rate": 7.698056801195815e-08, "loss": 0.6593, "step": 230 }, { "epoch": 0.31586021505376344, "grad_norm": 2.0110294954158454, "learning_rate": 7.623318385650224e-08, "loss": 0.6563, "step": 235 }, { "epoch": 0.3225806451612903, "grad_norm": 2.059981924661176, "learning_rate": 7.548579970104633e-08, "loss": 0.6764, "step": 240 }, { "epoch": 0.3293010752688172, "grad_norm": 2.149345633891912, "learning_rate": 7.473841554559043e-08, "loss": 0.6645, "step": 245 }, { "epoch": 0.33602150537634407, "grad_norm": 2.101850865821983, "learning_rate": 7.399103139013453e-08, "loss": 0.6612, "step": 250 }, { "epoch": 0.34274193548387094, "grad_norm": 2.1254515490062245, "learning_rate": 7.324364723467862e-08, "loss": 0.6773, "step": 255 }, { "epoch": 0.34946236559139787, "grad_norm": 2.08043259609194, "learning_rate": 7.249626307922272e-08, "loss": 0.6826, "step": 260 }, { "epoch": 0.35618279569892475, "grad_norm": 1.972159359225326, "learning_rate": 7.174887892376681e-08, "loss": 0.6736, "step": 265 }, { "epoch": 0.3629032258064516, "grad_norm": 2.020214538372761, "learning_rate": 7.100149476831092e-08, "loss": 0.6628, "step": 270 }, { "epoch": 0.3696236559139785, "grad_norm": 2.119471507884024, "learning_rate": 7.0254110612855e-08, "loss": 0.6628, "step": 275 }, { "epoch": 0.3763440860215054, "grad_norm": 2.2339027083129803, "learning_rate": 6.95067264573991e-08, "loss": 0.6461, "step": 280 }, { "epoch": 0.38306451612903225, "grad_norm": 2.179556694440606, "learning_rate": 6.87593423019432e-08, "loss": 0.6685, "step": 285 }, { "epoch": 0.3897849462365591, "grad_norm": 2.0979269829823477, "learning_rate": 6.801195814648729e-08, "loss": 0.6688, "step": 290 }, { "epoch": 0.396505376344086, "grad_norm": 2.145759818638723, "learning_rate": 6.72645739910314e-08, "loss": 0.6582, "step": 295 }, { "epoch": 0.4032258064516129, "grad_norm": 2.0232579108607998, "learning_rate": 6.651718983557549e-08, "loss": 0.6849, "step": 300 }, { "epoch": 0.4032258064516129, "eval_loss": 0.6928321123123169, "eval_runtime": 56.1826, "eval_samples_per_second": 146.344, "eval_steps_per_second": 2.296, "step": 300 }, { "epoch": 0.4099462365591398, "grad_norm": 2.215763186449215, "learning_rate": 6.576980568011958e-08, "loss": 0.6772, "step": 305 }, { "epoch": 0.4166666666666667, "grad_norm": 1.9274901065614445, "learning_rate": 6.502242152466367e-08, "loss": 0.691, "step": 310 }, { "epoch": 0.42338709677419356, "grad_norm": 2.108143183803596, "learning_rate": 6.427503736920778e-08, "loss": 0.6628, "step": 315 }, { "epoch": 0.43010752688172044, "grad_norm": 2.207966702792586, "learning_rate": 6.352765321375186e-08, "loss": 0.6768, "step": 320 }, { "epoch": 0.4368279569892473, "grad_norm": 2.096647992540451, "learning_rate": 6.278026905829596e-08, "loss": 0.6667, "step": 325 }, { "epoch": 0.4435483870967742, "grad_norm": 2.1336095512755113, "learning_rate": 6.203288490284006e-08, "loss": 0.6581, "step": 330 }, { "epoch": 0.45026881720430106, "grad_norm": 2.0159699059441976, "learning_rate": 6.128550074738415e-08, "loss": 0.6743, "step": 335 }, { "epoch": 0.45698924731182794, "grad_norm": 1.9979710525534193, "learning_rate": 6.053811659192824e-08, "loss": 0.6638, "step": 340 }, { "epoch": 0.4637096774193548, "grad_norm": 2.049384366870693, "learning_rate": 5.979073243647235e-08, "loss": 0.679, "step": 345 }, { "epoch": 0.47043010752688175, "grad_norm": 2.147547124197116, "learning_rate": 5.9043348281016435e-08, "loss": 0.6719, "step": 350 }, { "epoch": 0.4771505376344086, "grad_norm": 1.994604705245644, "learning_rate": 5.8295964125560534e-08, "loss": 0.6622, "step": 355 }, { "epoch": 0.4838709677419355, "grad_norm": 2.035462459844569, "learning_rate": 5.754857997010463e-08, "loss": 0.6464, "step": 360 }, { "epoch": 0.4905913978494624, "grad_norm": 2.042123968621037, "learning_rate": 5.6801195814648727e-08, "loss": 0.6627, "step": 365 }, { "epoch": 0.49731182795698925, "grad_norm": 2.14484478856972, "learning_rate": 5.605381165919282e-08, "loss": 0.6522, "step": 370 }, { "epoch": 0.5040322580645161, "grad_norm": 2.0812193752637462, "learning_rate": 5.530642750373692e-08, "loss": 0.6858, "step": 375 }, { "epoch": 0.5040322580645161, "eval_loss": 0.6924068927764893, "eval_runtime": 56.1089, "eval_samples_per_second": 146.536, "eval_steps_per_second": 2.299, "step": 375 }, { "epoch": 0.510752688172043, "grad_norm": 2.1209544150897757, "learning_rate": 5.455904334828101e-08, "loss": 0.6451, "step": 380 }, { "epoch": 0.5174731182795699, "grad_norm": 2.045170736604748, "learning_rate": 5.381165919282511e-08, "loss": 0.6624, "step": 385 }, { "epoch": 0.5241935483870968, "grad_norm": 2.10236284260544, "learning_rate": 5.306427503736921e-08, "loss": 0.6741, "step": 390 }, { "epoch": 0.5309139784946236, "grad_norm": 2.1314029429521244, "learning_rate": 5.2316890881913303e-08, "loss": 0.6642, "step": 395 }, { "epoch": 0.5376344086021505, "grad_norm": 2.186951067665247, "learning_rate": 5.15695067264574e-08, "loss": 0.6836, "step": 400 }, { "epoch": 0.5443548387096774, "grad_norm": 2.052806350954309, "learning_rate": 5.082212257100149e-08, "loss": 0.672, "step": 405 }, { "epoch": 0.5510752688172043, "grad_norm": 2.071189416663902, "learning_rate": 5.0074738415545595e-08, "loss": 0.6677, "step": 410 }, { "epoch": 0.5577956989247311, "grad_norm": 2.0118486297022833, "learning_rate": 4.932735426008968e-08, "loss": 0.6639, "step": 415 }, { "epoch": 0.5645161290322581, "grad_norm": 2.0632350448343333, "learning_rate": 4.8579970104633774e-08, "loss": 0.6754, "step": 420 }, { "epoch": 0.571236559139785, "grad_norm": 2.251163063469807, "learning_rate": 4.7832585949177874e-08, "loss": 0.7003, "step": 425 }, { "epoch": 0.5779569892473119, "grad_norm": 2.0661342085354697, "learning_rate": 4.708520179372197e-08, "loss": 0.6904, "step": 430 }, { "epoch": 0.5846774193548387, "grad_norm": 2.1557028238109517, "learning_rate": 4.6337817638266066e-08, "loss": 0.6318, "step": 435 }, { "epoch": 0.5913978494623656, "grad_norm": 2.1516536049922452, "learning_rate": 4.5590433482810165e-08, "loss": 0.6552, "step": 440 }, { "epoch": 0.5981182795698925, "grad_norm": 2.086935790084074, "learning_rate": 4.484304932735426e-08, "loss": 0.6778, "step": 445 }, { "epoch": 0.6048387096774194, "grad_norm": 2.146673007914857, "learning_rate": 4.409566517189836e-08, "loss": 0.6536, "step": 450 }, { "epoch": 0.6048387096774194, "eval_loss": 0.6919218897819519, "eval_runtime": 56.1387, "eval_samples_per_second": 146.459, "eval_steps_per_second": 2.298, "step": 450 }, { "epoch": 0.6115591397849462, "grad_norm": 2.1253216816067066, "learning_rate": 4.334828101644245e-08, "loss": 0.6511, "step": 455 }, { "epoch": 0.6182795698924731, "grad_norm": 2.071737757767298, "learning_rate": 4.260089686098654e-08, "loss": 0.6592, "step": 460 }, { "epoch": 0.625, "grad_norm": 2.0932212158820396, "learning_rate": 4.185351270553064e-08, "loss": 0.6692, "step": 465 }, { "epoch": 0.6317204301075269, "grad_norm": 2.0910146959652307, "learning_rate": 4.1106128550074736e-08, "loss": 0.6683, "step": 470 }, { "epoch": 0.6384408602150538, "grad_norm": 2.062557270254667, "learning_rate": 4.0358744394618835e-08, "loss": 0.6656, "step": 475 }, { "epoch": 0.6451612903225806, "grad_norm": 2.017329078129843, "learning_rate": 3.961136023916293e-08, "loss": 0.6719, "step": 480 }, { "epoch": 0.6518817204301075, "grad_norm": 2.2036353665493755, "learning_rate": 3.886397608370702e-08, "loss": 0.682, "step": 485 }, { "epoch": 0.6586021505376344, "grad_norm": 2.007622155160972, "learning_rate": 3.811659192825112e-08, "loss": 0.6486, "step": 490 }, { "epoch": 0.6653225806451613, "grad_norm": 2.2339142859626553, "learning_rate": 3.736920777279521e-08, "loss": 0.6542, "step": 495 }, { "epoch": 0.6720430107526881, "grad_norm": 2.1968231695887823, "learning_rate": 3.662182361733931e-08, "loss": 0.6701, "step": 500 }, { "epoch": 0.678763440860215, "grad_norm": 2.311102123656589, "learning_rate": 3.5874439461883405e-08, "loss": 0.6541, "step": 505 }, { "epoch": 0.6854838709677419, "grad_norm": 2.162411566728306, "learning_rate": 3.51270553064275e-08, "loss": 0.6609, "step": 510 }, { "epoch": 0.6922043010752689, "grad_norm": 2.0777352845523156, "learning_rate": 3.43796711509716e-08, "loss": 0.6654, "step": 515 }, { "epoch": 0.6989247311827957, "grad_norm": 2.0466624810725613, "learning_rate": 3.36322869955157e-08, "loss": 0.6637, "step": 520 }, { "epoch": 0.7056451612903226, "grad_norm": 2.046164435882464, "learning_rate": 3.288490284005979e-08, "loss": 0.6672, "step": 525 }, { "epoch": 0.7056451612903226, "eval_loss": 0.691673994064331, "eval_runtime": 56.2018, "eval_samples_per_second": 146.294, "eval_steps_per_second": 2.295, "step": 525 }, { "epoch": 0.7123655913978495, "grad_norm": 2.098676297617592, "learning_rate": 3.213751868460389e-08, "loss": 0.6581, "step": 530 }, { "epoch": 0.7190860215053764, "grad_norm": 1.9986286959461719, "learning_rate": 3.139013452914798e-08, "loss": 0.6658, "step": 535 }, { "epoch": 0.7258064516129032, "grad_norm": 2.1195333643509935, "learning_rate": 3.0642750373692075e-08, "loss": 0.669, "step": 540 }, { "epoch": 0.7325268817204301, "grad_norm": 2.055327535910759, "learning_rate": 2.9895366218236174e-08, "loss": 0.6663, "step": 545 }, { "epoch": 0.739247311827957, "grad_norm": 2.074614293697511, "learning_rate": 2.9147982062780267e-08, "loss": 0.6627, "step": 550 }, { "epoch": 0.7459677419354839, "grad_norm": 2.0360906767754, "learning_rate": 2.8400597907324363e-08, "loss": 0.6619, "step": 555 }, { "epoch": 0.7526881720430108, "grad_norm": 2.098550310165075, "learning_rate": 2.765321375186846e-08, "loss": 0.6791, "step": 560 }, { "epoch": 0.7594086021505376, "grad_norm": 2.056390058629133, "learning_rate": 2.6905829596412556e-08, "loss": 0.6569, "step": 565 }, { "epoch": 0.7661290322580645, "grad_norm": 2.2413055374611632, "learning_rate": 2.6158445440956652e-08, "loss": 0.6704, "step": 570 }, { "epoch": 0.7728494623655914, "grad_norm": 2.054426280985883, "learning_rate": 2.5411061285500745e-08, "loss": 0.6563, "step": 575 }, { "epoch": 0.7795698924731183, "grad_norm": 2.0006598598366345, "learning_rate": 2.466367713004484e-08, "loss": 0.6567, "step": 580 }, { "epoch": 0.7862903225806451, "grad_norm": 2.2751507511393925, "learning_rate": 2.3916292974588937e-08, "loss": 0.6693, "step": 585 }, { "epoch": 0.793010752688172, "grad_norm": 2.1645437231829456, "learning_rate": 2.3168908819133033e-08, "loss": 0.6643, "step": 590 }, { "epoch": 0.7997311827956989, "grad_norm": 2.0220952097349176, "learning_rate": 2.242152466367713e-08, "loss": 0.66, "step": 595 }, { "epoch": 0.8064516129032258, "grad_norm": 2.078077165465023, "learning_rate": 2.1674140508221225e-08, "loss": 0.6559, "step": 600 }, { "epoch": 0.8064516129032258, "eval_loss": 0.6913867592811584, "eval_runtime": 56.1429, "eval_samples_per_second": 146.448, "eval_steps_per_second": 2.298, "step": 600 }, { "epoch": 0.8131720430107527, "grad_norm": 2.3106172558361884, "learning_rate": 2.092675635276532e-08, "loss": 0.6828, "step": 605 }, { "epoch": 0.8198924731182796, "grad_norm": 2.094033510896202, "learning_rate": 2.0179372197309417e-08, "loss": 0.6635, "step": 610 }, { "epoch": 0.8266129032258065, "grad_norm": 2.0658322655908665, "learning_rate": 1.943198804185351e-08, "loss": 0.6753, "step": 615 }, { "epoch": 0.8333333333333334, "grad_norm": 2.17250427254849, "learning_rate": 1.8684603886397606e-08, "loss": 0.6727, "step": 620 }, { "epoch": 0.8400537634408602, "grad_norm": 2.259179736932614, "learning_rate": 1.7937219730941703e-08, "loss": 0.6959, "step": 625 }, { "epoch": 0.8467741935483871, "grad_norm": 2.0612328204646815, "learning_rate": 1.71898355754858e-08, "loss": 0.6584, "step": 630 }, { "epoch": 0.853494623655914, "grad_norm": 2.1679992601621154, "learning_rate": 1.6442451420029895e-08, "loss": 0.6508, "step": 635 }, { "epoch": 0.8602150537634409, "grad_norm": 1.9886841572370688, "learning_rate": 1.569506726457399e-08, "loss": 0.6729, "step": 640 }, { "epoch": 0.8669354838709677, "grad_norm": 2.1506038261656184, "learning_rate": 1.4947683109118087e-08, "loss": 0.6608, "step": 645 }, { "epoch": 0.8736559139784946, "grad_norm": 1.9864785884472476, "learning_rate": 1.4200298953662182e-08, "loss": 0.6628, "step": 650 }, { "epoch": 0.8803763440860215, "grad_norm": 2.1120434621533284, "learning_rate": 1.3452914798206278e-08, "loss": 0.6838, "step": 655 }, { "epoch": 0.8870967741935484, "grad_norm": 1.9981627858324142, "learning_rate": 1.2705530642750372e-08, "loss": 0.6561, "step": 660 }, { "epoch": 0.8938172043010753, "grad_norm": 2.050666914131394, "learning_rate": 1.1958146487294468e-08, "loss": 0.6491, "step": 665 }, { "epoch": 0.9005376344086021, "grad_norm": 2.159702446356377, "learning_rate": 1.1210762331838565e-08, "loss": 0.6783, "step": 670 }, { "epoch": 0.907258064516129, "grad_norm": 2.0164611126323697, "learning_rate": 1.046337817638266e-08, "loss": 0.6472, "step": 675 }, { "epoch": 0.907258064516129, "eval_loss": 0.6912277936935425, "eval_runtime": 56.1813, "eval_samples_per_second": 146.348, "eval_steps_per_second": 2.296, "step": 675 }, { "epoch": 0.9139784946236559, "grad_norm": 2.1472859721487683, "learning_rate": 9.715994020926755e-09, "loss": 0.6621, "step": 680 }, { "epoch": 0.9206989247311828, "grad_norm": 2.138066510167175, "learning_rate": 8.968609865470851e-09, "loss": 0.6396, "step": 685 }, { "epoch": 0.9274193548387096, "grad_norm": 2.2998888062886538, "learning_rate": 8.221225710014947e-09, "loss": 0.6603, "step": 690 }, { "epoch": 0.9341397849462365, "grad_norm": 2.0384421650226527, "learning_rate": 7.473841554559044e-09, "loss": 0.6384, "step": 695 }, { "epoch": 0.9408602150537635, "grad_norm": 2.0927886382588827, "learning_rate": 6.726457399103139e-09, "loss": 0.6557, "step": 700 }, { "epoch": 0.9475806451612904, "grad_norm": 2.02850385366728, "learning_rate": 5.979073243647234e-09, "loss": 0.6663, "step": 705 }, { "epoch": 0.9543010752688172, "grad_norm": 2.2433567010088797, "learning_rate": 5.23168908819133e-09, "loss": 0.6796, "step": 710 }, { "epoch": 0.9610215053763441, "grad_norm": 2.057838483040146, "learning_rate": 4.484304932735426e-09, "loss": 0.6626, "step": 715 }, { "epoch": 0.967741935483871, "grad_norm": 2.1280821159120786, "learning_rate": 3.736920777279522e-09, "loss": 0.633, "step": 720 }, { "epoch": 0.9744623655913979, "grad_norm": 2.019967451723735, "learning_rate": 2.989536621823617e-09, "loss": 0.6718, "step": 725 }, { "epoch": 0.9811827956989247, "grad_norm": 2.1683941089274477, "learning_rate": 2.242152466367713e-09, "loss": 0.6578, "step": 730 }, { "epoch": 0.9879032258064516, "grad_norm": 2.015824272032049, "learning_rate": 1.4947683109118085e-09, "loss": 0.6781, "step": 735 }, { "epoch": 0.9946236559139785, "grad_norm": 2.1171098020664325, "learning_rate": 7.473841554559043e-10, "loss": 0.6829, "step": 740 }, { "epoch": 1.0, "step": 744, "total_flos": 1582562992455680.0, "train_loss": 0.7245069735793657, "train_runtime": 3975.4106, "train_samples_per_second": 11.971, "train_steps_per_second": 0.187 } ], "logging_steps": 5, "max_steps": 744, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 75, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1582562992455680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }