rft-llama-3.1-8b-gsm8k / trainer_state.json
{
"best_global_step": 675,
"best_metric": 0.6912277936935425,
"best_model_checkpoint": "checkpoints/rft-finetune-llama-3.1-8b-gsm8k/gsm8k/finetune-llama-3.1-8b-gsm8k-step-1/checkpoint-675",
"epoch": 1.0,
"eval_steps": 75,
"global_step": 744,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006720430107526882,
"grad_norm": 14.59550096035499,
"learning_rate": 5.333333333333333e-09,
"loss": 1.1893,
"step": 5
},
{
"epoch": 0.013440860215053764,
"grad_norm": 15.552214878649293,
"learning_rate": 1.1999999999999998e-08,
"loss": 1.1733,
"step": 10
},
{
"epoch": 0.020161290322580645,
"grad_norm": 16.885206147000304,
"learning_rate": 1.8666666666666665e-08,
"loss": 1.1934,
"step": 15
},
{
"epoch": 0.026881720430107527,
"grad_norm": 15.221916137414837,
"learning_rate": 2.5333333333333335e-08,
"loss": 1.2152,
"step": 20
},
{
"epoch": 0.033602150537634407,
"grad_norm": 15.887269098975871,
"learning_rate": 3.2e-08,
"loss": 1.2051,
"step": 25
},
{
"epoch": 0.04032258064516129,
"grad_norm": 14.254790289709463,
"learning_rate": 3.866666666666666e-08,
"loss": 1.1997,
"step": 30
},
{
"epoch": 0.04704301075268817,
"grad_norm": 14.583670649351852,
"learning_rate": 4.533333333333333e-08,
"loss": 1.1812,
"step": 35
},
{
"epoch": 0.053763440860215055,
"grad_norm": 14.695116879291863,
"learning_rate": 5.2e-08,
"loss": 1.195,
"step": 40
},
{
"epoch": 0.06048387096774194,
"grad_norm": 14.808886364711231,
"learning_rate": 5.866666666666666e-08,
"loss": 1.19,
"step": 45
},
{
"epoch": 0.06720430107526881,
"grad_norm": 14.279020954727395,
"learning_rate": 6.533333333333332e-08,
"loss": 1.1758,
"step": 50
},
{
"epoch": 0.0739247311827957,
"grad_norm": 15.710193271146736,
"learning_rate": 7.2e-08,
"loss": 1.1654,
"step": 55
},
{
"epoch": 0.08064516129032258,
"grad_norm": 12.823166713617123,
"learning_rate": 7.866666666666666e-08,
"loss": 1.0956,
"step": 60
},
{
"epoch": 0.08736559139784947,
"grad_norm": 13.123484893071117,
"learning_rate": 8.533333333333333e-08,
"loss": 1.0939,
"step": 65
},
{
"epoch": 0.09408602150537634,
"grad_norm": 13.804028711760814,
"learning_rate": 9.2e-08,
"loss": 1.0806,
"step": 70
},
{
"epoch": 0.10080645161290322,
"grad_norm": 14.362479314024831,
"learning_rate": 9.866666666666666e-08,
"loss": 1.0428,
"step": 75
},
{
"epoch": 0.10080645161290322,
"eval_loss": 1.0135635137557983,
"eval_runtime": 64.0958,
"eval_samples_per_second": 128.277,
"eval_steps_per_second": 2.013,
"step": 75
},
{
"epoch": 0.10752688172043011,
"grad_norm": 12.804236982581402,
"learning_rate": 9.940209267563527e-08,
"loss": 0.9585,
"step": 80
},
{
"epoch": 0.11424731182795698,
"grad_norm": 11.669278429338862,
"learning_rate": 9.865470852017936e-08,
"loss": 0.9112,
"step": 85
},
{
"epoch": 0.12096774193548387,
"grad_norm": 11.081808260451195,
"learning_rate": 9.790732436472347e-08,
"loss": 0.877,
"step": 90
},
{
"epoch": 0.12768817204301075,
"grad_norm": 16.49147962749734,
"learning_rate": 9.715994020926755e-08,
"loss": 0.8504,
"step": 95
},
{
"epoch": 0.13440860215053763,
"grad_norm": 13.95628735293844,
"learning_rate": 9.641255605381165e-08,
"loss": 0.7931,
"step": 100
},
{
"epoch": 0.14112903225806453,
"grad_norm": 8.601123066406517,
"learning_rate": 9.566517189835575e-08,
"loss": 0.711,
"step": 105
},
{
"epoch": 0.1478494623655914,
"grad_norm": 3.7294133148506337,
"learning_rate": 9.491778774289984e-08,
"loss": 0.6981,
"step": 110
},
{
"epoch": 0.15456989247311828,
"grad_norm": 2.723138779265988,
"learning_rate": 9.417040358744395e-08,
"loss": 0.6953,
"step": 115
},
{
"epoch": 0.16129032258064516,
"grad_norm": 2.2560223539756423,
"learning_rate": 9.342301943198804e-08,
"loss": 0.6846,
"step": 120
},
{
"epoch": 0.16801075268817203,
"grad_norm": 2.2521211316559433,
"learning_rate": 9.267563527653213e-08,
"loss": 0.6696,
"step": 125
},
{
"epoch": 0.17473118279569894,
"grad_norm": 2.2818251781816588,
"learning_rate": 9.192825112107622e-08,
"loss": 0.6809,
"step": 130
},
{
"epoch": 0.1814516129032258,
"grad_norm": 2.44710559044971,
"learning_rate": 9.118086696562033e-08,
"loss": 0.6841,
"step": 135
},
{
"epoch": 0.1881720430107527,
"grad_norm": 2.350544704349094,
"learning_rate": 9.043348281016442e-08,
"loss": 0.6766,
"step": 140
},
{
"epoch": 0.19489247311827956,
"grad_norm": 1.9640158046302565,
"learning_rate": 8.968609865470852e-08,
"loss": 0.6878,
"step": 145
},
{
"epoch": 0.20161290322580644,
"grad_norm": 2.0782198781659353,
"learning_rate": 8.893871449925261e-08,
"loss": 0.6636,
"step": 150
},
{
"epoch": 0.20161290322580644,
"eval_loss": 0.6971947550773621,
"eval_runtime": 56.1638,
"eval_samples_per_second": 146.393,
"eval_steps_per_second": 2.297,
"step": 150
},
{
"epoch": 0.20833333333333334,
"grad_norm": 2.1288692138731458,
"learning_rate": 8.819133034379672e-08,
"loss": 0.6866,
"step": 155
},
{
"epoch": 0.21505376344086022,
"grad_norm": 2.2152810121981568,
"learning_rate": 8.74439461883408e-08,
"loss": 0.6884,
"step": 160
},
{
"epoch": 0.2217741935483871,
"grad_norm": 1.9700569516052628,
"learning_rate": 8.66965620328849e-08,
"loss": 0.6793,
"step": 165
},
{
"epoch": 0.22849462365591397,
"grad_norm": 2.0535897043325724,
"learning_rate": 8.5949177877429e-08,
"loss": 0.6777,
"step": 170
},
{
"epoch": 0.23521505376344087,
"grad_norm": 2.011966313411605,
"learning_rate": 8.520179372197309e-08,
"loss": 0.6596,
"step": 175
},
{
"epoch": 0.24193548387096775,
"grad_norm": 2.0907303106285164,
"learning_rate": 8.445440956651718e-08,
"loss": 0.6713,
"step": 180
},
{
"epoch": 0.24865591397849462,
"grad_norm": 1.9870643151130227,
"learning_rate": 8.370702541106129e-08,
"loss": 0.6835,
"step": 185
},
{
"epoch": 0.2553763440860215,
"grad_norm": 2.1110324099341797,
"learning_rate": 8.295964125560538e-08,
"loss": 0.6691,
"step": 190
},
{
"epoch": 0.2620967741935484,
"grad_norm": 2.337028306260816,
"learning_rate": 8.221225710014947e-08,
"loss": 0.6706,
"step": 195
},
{
"epoch": 0.26881720430107525,
"grad_norm": 1.9987914444654455,
"learning_rate": 8.146487294469356e-08,
"loss": 0.6769,
"step": 200
},
{
"epoch": 0.27553763440860213,
"grad_norm": 2.1558155606241884,
"learning_rate": 8.071748878923767e-08,
"loss": 0.6709,
"step": 205
},
{
"epoch": 0.28225806451612906,
"grad_norm": 2.09004359899187,
"learning_rate": 7.997010463378176e-08,
"loss": 0.6784,
"step": 210
},
{
"epoch": 0.28897849462365593,
"grad_norm": 2.285145499206384,
"learning_rate": 7.922272047832586e-08,
"loss": 0.6715,
"step": 215
},
{
"epoch": 0.2956989247311828,
"grad_norm": 2.027191135310369,
"learning_rate": 7.847533632286996e-08,
"loss": 0.6612,
"step": 220
},
{
"epoch": 0.3024193548387097,
"grad_norm": 2.0151475678114705,
"learning_rate": 7.772795216741404e-08,
"loss": 0.6676,
"step": 225
},
{
"epoch": 0.3024193548387097,
"eval_loss": 0.6939067244529724,
"eval_runtime": 56.1322,
"eval_samples_per_second": 146.476,
"eval_steps_per_second": 2.298,
"step": 225
},
{
"epoch": 0.30913978494623656,
"grad_norm": 1.9995401054673094,
"learning_rate": 7.698056801195815e-08,
"loss": 0.6593,
"step": 230
},
{
"epoch": 0.31586021505376344,
"grad_norm": 2.0110294954158454,
"learning_rate": 7.623318385650224e-08,
"loss": 0.6563,
"step": 235
},
{
"epoch": 0.3225806451612903,
"grad_norm": 2.059981924661176,
"learning_rate": 7.548579970104633e-08,
"loss": 0.6764,
"step": 240
},
{
"epoch": 0.3293010752688172,
"grad_norm": 2.149345633891912,
"learning_rate": 7.473841554559043e-08,
"loss": 0.6645,
"step": 245
},
{
"epoch": 0.33602150537634407,
"grad_norm": 2.101850865821983,
"learning_rate": 7.399103139013453e-08,
"loss": 0.6612,
"step": 250
},
{
"epoch": 0.34274193548387094,
"grad_norm": 2.1254515490062245,
"learning_rate": 7.324364723467862e-08,
"loss": 0.6773,
"step": 255
},
{
"epoch": 0.34946236559139787,
"grad_norm": 2.08043259609194,
"learning_rate": 7.249626307922272e-08,
"loss": 0.6826,
"step": 260
},
{
"epoch": 0.35618279569892475,
"grad_norm": 1.972159359225326,
"learning_rate": 7.174887892376681e-08,
"loss": 0.6736,
"step": 265
},
{
"epoch": 0.3629032258064516,
"grad_norm": 2.020214538372761,
"learning_rate": 7.100149476831092e-08,
"loss": 0.6628,
"step": 270
},
{
"epoch": 0.3696236559139785,
"grad_norm": 2.119471507884024,
"learning_rate": 7.0254110612855e-08,
"loss": 0.6628,
"step": 275
},
{
"epoch": 0.3763440860215054,
"grad_norm": 2.2339027083129803,
"learning_rate": 6.95067264573991e-08,
"loss": 0.6461,
"step": 280
},
{
"epoch": 0.38306451612903225,
"grad_norm": 2.179556694440606,
"learning_rate": 6.87593423019432e-08,
"loss": 0.6685,
"step": 285
},
{
"epoch": 0.3897849462365591,
"grad_norm": 2.0979269829823477,
"learning_rate": 6.801195814648729e-08,
"loss": 0.6688,
"step": 290
},
{
"epoch": 0.396505376344086,
"grad_norm": 2.145759818638723,
"learning_rate": 6.72645739910314e-08,
"loss": 0.6582,
"step": 295
},
{
"epoch": 0.4032258064516129,
"grad_norm": 2.0232579108607998,
"learning_rate": 6.651718983557549e-08,
"loss": 0.6849,
"step": 300
},
{
"epoch": 0.4032258064516129,
"eval_loss": 0.6928321123123169,
"eval_runtime": 56.1826,
"eval_samples_per_second": 146.344,
"eval_steps_per_second": 2.296,
"step": 300
},
{
"epoch": 0.4099462365591398,
"grad_norm": 2.215763186449215,
"learning_rate": 6.576980568011958e-08,
"loss": 0.6772,
"step": 305
},
{
"epoch": 0.4166666666666667,
"grad_norm": 1.9274901065614445,
"learning_rate": 6.502242152466367e-08,
"loss": 0.691,
"step": 310
},
{
"epoch": 0.42338709677419356,
"grad_norm": 2.108143183803596,
"learning_rate": 6.427503736920778e-08,
"loss": 0.6628,
"step": 315
},
{
"epoch": 0.43010752688172044,
"grad_norm": 2.207966702792586,
"learning_rate": 6.352765321375186e-08,
"loss": 0.6768,
"step": 320
},
{
"epoch": 0.4368279569892473,
"grad_norm": 2.096647992540451,
"learning_rate": 6.278026905829596e-08,
"loss": 0.6667,
"step": 325
},
{
"epoch": 0.4435483870967742,
"grad_norm": 2.1336095512755113,
"learning_rate": 6.203288490284006e-08,
"loss": 0.6581,
"step": 330
},
{
"epoch": 0.45026881720430106,
"grad_norm": 2.0159699059441976,
"learning_rate": 6.128550074738415e-08,
"loss": 0.6743,
"step": 335
},
{
"epoch": 0.45698924731182794,
"grad_norm": 1.9979710525534193,
"learning_rate": 6.053811659192824e-08,
"loss": 0.6638,
"step": 340
},
{
"epoch": 0.4637096774193548,
"grad_norm": 2.049384366870693,
"learning_rate": 5.979073243647235e-08,
"loss": 0.679,
"step": 345
},
{
"epoch": 0.47043010752688175,
"grad_norm": 2.147547124197116,
"learning_rate": 5.9043348281016435e-08,
"loss": 0.6719,
"step": 350
},
{
"epoch": 0.4771505376344086,
"grad_norm": 1.994604705245644,
"learning_rate": 5.8295964125560534e-08,
"loss": 0.6622,
"step": 355
},
{
"epoch": 0.4838709677419355,
"grad_norm": 2.035462459844569,
"learning_rate": 5.754857997010463e-08,
"loss": 0.6464,
"step": 360
},
{
"epoch": 0.4905913978494624,
"grad_norm": 2.042123968621037,
"learning_rate": 5.6801195814648727e-08,
"loss": 0.6627,
"step": 365
},
{
"epoch": 0.49731182795698925,
"grad_norm": 2.14484478856972,
"learning_rate": 5.605381165919282e-08,
"loss": 0.6522,
"step": 370
},
{
"epoch": 0.5040322580645161,
"grad_norm": 2.0812193752637462,
"learning_rate": 5.530642750373692e-08,
"loss": 0.6858,
"step": 375
},
{
"epoch": 0.5040322580645161,
"eval_loss": 0.6924068927764893,
"eval_runtime": 56.1089,
"eval_samples_per_second": 146.536,
"eval_steps_per_second": 2.299,
"step": 375
},
{
"epoch": 0.510752688172043,
"grad_norm": 2.1209544150897757,
"learning_rate": 5.455904334828101e-08,
"loss": 0.6451,
"step": 380
},
{
"epoch": 0.5174731182795699,
"grad_norm": 2.045170736604748,
"learning_rate": 5.381165919282511e-08,
"loss": 0.6624,
"step": 385
},
{
"epoch": 0.5241935483870968,
"grad_norm": 2.10236284260544,
"learning_rate": 5.306427503736921e-08,
"loss": 0.6741,
"step": 390
},
{
"epoch": 0.5309139784946236,
"grad_norm": 2.1314029429521244,
"learning_rate": 5.2316890881913303e-08,
"loss": 0.6642,
"step": 395
},
{
"epoch": 0.5376344086021505,
"grad_norm": 2.186951067665247,
"learning_rate": 5.15695067264574e-08,
"loss": 0.6836,
"step": 400
},
{
"epoch": 0.5443548387096774,
"grad_norm": 2.052806350954309,
"learning_rate": 5.082212257100149e-08,
"loss": 0.672,
"step": 405
},
{
"epoch": 0.5510752688172043,
"grad_norm": 2.071189416663902,
"learning_rate": 5.0074738415545595e-08,
"loss": 0.6677,
"step": 410
},
{
"epoch": 0.5577956989247311,
"grad_norm": 2.0118486297022833,
"learning_rate": 4.932735426008968e-08,
"loss": 0.6639,
"step": 415
},
{
"epoch": 0.5645161290322581,
"grad_norm": 2.0632350448343333,
"learning_rate": 4.8579970104633774e-08,
"loss": 0.6754,
"step": 420
},
{
"epoch": 0.571236559139785,
"grad_norm": 2.251163063469807,
"learning_rate": 4.7832585949177874e-08,
"loss": 0.7003,
"step": 425
},
{
"epoch": 0.5779569892473119,
"grad_norm": 2.0661342085354697,
"learning_rate": 4.708520179372197e-08,
"loss": 0.6904,
"step": 430
},
{
"epoch": 0.5846774193548387,
"grad_norm": 2.1557028238109517,
"learning_rate": 4.6337817638266066e-08,
"loss": 0.6318,
"step": 435
},
{
"epoch": 0.5913978494623656,
"grad_norm": 2.1516536049922452,
"learning_rate": 4.5590433482810165e-08,
"loss": 0.6552,
"step": 440
},
{
"epoch": 0.5981182795698925,
"grad_norm": 2.086935790084074,
"learning_rate": 4.484304932735426e-08,
"loss": 0.6778,
"step": 445
},
{
"epoch": 0.6048387096774194,
"grad_norm": 2.146673007914857,
"learning_rate": 4.409566517189836e-08,
"loss": 0.6536,
"step": 450
},
{
"epoch": 0.6048387096774194,
"eval_loss": 0.6919218897819519,
"eval_runtime": 56.1387,
"eval_samples_per_second": 146.459,
"eval_steps_per_second": 2.298,
"step": 450
},
{
"epoch": 0.6115591397849462,
"grad_norm": 2.1253216816067066,
"learning_rate": 4.334828101644245e-08,
"loss": 0.6511,
"step": 455
},
{
"epoch": 0.6182795698924731,
"grad_norm": 2.071737757767298,
"learning_rate": 4.260089686098654e-08,
"loss": 0.6592,
"step": 460
},
{
"epoch": 0.625,
"grad_norm": 2.0932212158820396,
"learning_rate": 4.185351270553064e-08,
"loss": 0.6692,
"step": 465
},
{
"epoch": 0.6317204301075269,
"grad_norm": 2.0910146959652307,
"learning_rate": 4.1106128550074736e-08,
"loss": 0.6683,
"step": 470
},
{
"epoch": 0.6384408602150538,
"grad_norm": 2.062557270254667,
"learning_rate": 4.0358744394618835e-08,
"loss": 0.6656,
"step": 475
},
{
"epoch": 0.6451612903225806,
"grad_norm": 2.017329078129843,
"learning_rate": 3.961136023916293e-08,
"loss": 0.6719,
"step": 480
},
{
"epoch": 0.6518817204301075,
"grad_norm": 2.2036353665493755,
"learning_rate": 3.886397608370702e-08,
"loss": 0.682,
"step": 485
},
{
"epoch": 0.6586021505376344,
"grad_norm": 2.007622155160972,
"learning_rate": 3.811659192825112e-08,
"loss": 0.6486,
"step": 490
},
{
"epoch": 0.6653225806451613,
"grad_norm": 2.2339142859626553,
"learning_rate": 3.736920777279521e-08,
"loss": 0.6542,
"step": 495
},
{
"epoch": 0.6720430107526881,
"grad_norm": 2.1968231695887823,
"learning_rate": 3.662182361733931e-08,
"loss": 0.6701,
"step": 500
},
{
"epoch": 0.678763440860215,
"grad_norm": 2.311102123656589,
"learning_rate": 3.5874439461883405e-08,
"loss": 0.6541,
"step": 505
},
{
"epoch": 0.6854838709677419,
"grad_norm": 2.162411566728306,
"learning_rate": 3.51270553064275e-08,
"loss": 0.6609,
"step": 510
},
{
"epoch": 0.6922043010752689,
"grad_norm": 2.0777352845523156,
"learning_rate": 3.43796711509716e-08,
"loss": 0.6654,
"step": 515
},
{
"epoch": 0.6989247311827957,
"grad_norm": 2.0466624810725613,
"learning_rate": 3.36322869955157e-08,
"loss": 0.6637,
"step": 520
},
{
"epoch": 0.7056451612903226,
"grad_norm": 2.046164435882464,
"learning_rate": 3.288490284005979e-08,
"loss": 0.6672,
"step": 525
},
{
"epoch": 0.7056451612903226,
"eval_loss": 0.691673994064331,
"eval_runtime": 56.2018,
"eval_samples_per_second": 146.294,
"eval_steps_per_second": 2.295,
"step": 525
},
{
"epoch": 0.7123655913978495,
"grad_norm": 2.098676297617592,
"learning_rate": 3.213751868460389e-08,
"loss": 0.6581,
"step": 530
},
{
"epoch": 0.7190860215053764,
"grad_norm": 1.9986286959461719,
"learning_rate": 3.139013452914798e-08,
"loss": 0.6658,
"step": 535
},
{
"epoch": 0.7258064516129032,
"grad_norm": 2.1195333643509935,
"learning_rate": 3.0642750373692075e-08,
"loss": 0.669,
"step": 540
},
{
"epoch": 0.7325268817204301,
"grad_norm": 2.055327535910759,
"learning_rate": 2.9895366218236174e-08,
"loss": 0.6663,
"step": 545
},
{
"epoch": 0.739247311827957,
"grad_norm": 2.074614293697511,
"learning_rate": 2.9147982062780267e-08,
"loss": 0.6627,
"step": 550
},
{
"epoch": 0.7459677419354839,
"grad_norm": 2.0360906767754,
"learning_rate": 2.8400597907324363e-08,
"loss": 0.6619,
"step": 555
},
{
"epoch": 0.7526881720430108,
"grad_norm": 2.098550310165075,
"learning_rate": 2.765321375186846e-08,
"loss": 0.6791,
"step": 560
},
{
"epoch": 0.7594086021505376,
"grad_norm": 2.056390058629133,
"learning_rate": 2.6905829596412556e-08,
"loss": 0.6569,
"step": 565
},
{
"epoch": 0.7661290322580645,
"grad_norm": 2.2413055374611632,
"learning_rate": 2.6158445440956652e-08,
"loss": 0.6704,
"step": 570
},
{
"epoch": 0.7728494623655914,
"grad_norm": 2.054426280985883,
"learning_rate": 2.5411061285500745e-08,
"loss": 0.6563,
"step": 575
},
{
"epoch": 0.7795698924731183,
"grad_norm": 2.0006598598366345,
"learning_rate": 2.466367713004484e-08,
"loss": 0.6567,
"step": 580
},
{
"epoch": 0.7862903225806451,
"grad_norm": 2.2751507511393925,
"learning_rate": 2.3916292974588937e-08,
"loss": 0.6693,
"step": 585
},
{
"epoch": 0.793010752688172,
"grad_norm": 2.1645437231829456,
"learning_rate": 2.3168908819133033e-08,
"loss": 0.6643,
"step": 590
},
{
"epoch": 0.7997311827956989,
"grad_norm": 2.0220952097349176,
"learning_rate": 2.242152466367713e-08,
"loss": 0.66,
"step": 595
},
{
"epoch": 0.8064516129032258,
"grad_norm": 2.078077165465023,
"learning_rate": 2.1674140508221225e-08,
"loss": 0.6559,
"step": 600
},
{
"epoch": 0.8064516129032258,
"eval_loss": 0.6913867592811584,
"eval_runtime": 56.1429,
"eval_samples_per_second": 146.448,
"eval_steps_per_second": 2.298,
"step": 600
},
{
"epoch": 0.8131720430107527,
"grad_norm": 2.3106172558361884,
"learning_rate": 2.092675635276532e-08,
"loss": 0.6828,
"step": 605
},
{
"epoch": 0.8198924731182796,
"grad_norm": 2.094033510896202,
"learning_rate": 2.0179372197309417e-08,
"loss": 0.6635,
"step": 610
},
{
"epoch": 0.8266129032258065,
"grad_norm": 2.0658322655908665,
"learning_rate": 1.943198804185351e-08,
"loss": 0.6753,
"step": 615
},
{
"epoch": 0.8333333333333334,
"grad_norm": 2.17250427254849,
"learning_rate": 1.8684603886397606e-08,
"loss": 0.6727,
"step": 620
},
{
"epoch": 0.8400537634408602,
"grad_norm": 2.259179736932614,
"learning_rate": 1.7937219730941703e-08,
"loss": 0.6959,
"step": 625
},
{
"epoch": 0.8467741935483871,
"grad_norm": 2.0612328204646815,
"learning_rate": 1.71898355754858e-08,
"loss": 0.6584,
"step": 630
},
{
"epoch": 0.853494623655914,
"grad_norm": 2.1679992601621154,
"learning_rate": 1.6442451420029895e-08,
"loss": 0.6508,
"step": 635
},
{
"epoch": 0.8602150537634409,
"grad_norm": 1.9886841572370688,
"learning_rate": 1.569506726457399e-08,
"loss": 0.6729,
"step": 640
},
{
"epoch": 0.8669354838709677,
"grad_norm": 2.1506038261656184,
"learning_rate": 1.4947683109118087e-08,
"loss": 0.6608,
"step": 645
},
{
"epoch": 0.8736559139784946,
"grad_norm": 1.9864785884472476,
"learning_rate": 1.4200298953662182e-08,
"loss": 0.6628,
"step": 650
},
{
"epoch": 0.8803763440860215,
"grad_norm": 2.1120434621533284,
"learning_rate": 1.3452914798206278e-08,
"loss": 0.6838,
"step": 655
},
{
"epoch": 0.8870967741935484,
"grad_norm": 1.9981627858324142,
"learning_rate": 1.2705530642750372e-08,
"loss": 0.6561,
"step": 660
},
{
"epoch": 0.8938172043010753,
"grad_norm": 2.050666914131394,
"learning_rate": 1.1958146487294468e-08,
"loss": 0.6491,
"step": 665
},
{
"epoch": 0.9005376344086021,
"grad_norm": 2.159702446356377,
"learning_rate": 1.1210762331838565e-08,
"loss": 0.6783,
"step": 670
},
{
"epoch": 0.907258064516129,
"grad_norm": 2.0164611126323697,
"learning_rate": 1.046337817638266e-08,
"loss": 0.6472,
"step": 675
},
{
"epoch": 0.907258064516129,
"eval_loss": 0.6912277936935425,
"eval_runtime": 56.1813,
"eval_samples_per_second": 146.348,
"eval_steps_per_second": 2.296,
"step": 675
},
{
"epoch": 0.9139784946236559,
"grad_norm": 2.1472859721487683,
"learning_rate": 9.715994020926755e-09,
"loss": 0.6621,
"step": 680
},
{
"epoch": 0.9206989247311828,
"grad_norm": 2.138066510167175,
"learning_rate": 8.968609865470851e-09,
"loss": 0.6396,
"step": 685
},
{
"epoch": 0.9274193548387096,
"grad_norm": 2.2998888062886538,
"learning_rate": 8.221225710014947e-09,
"loss": 0.6603,
"step": 690
},
{
"epoch": 0.9341397849462365,
"grad_norm": 2.0384421650226527,
"learning_rate": 7.473841554559044e-09,
"loss": 0.6384,
"step": 695
},
{
"epoch": 0.9408602150537635,
"grad_norm": 2.0927886382588827,
"learning_rate": 6.726457399103139e-09,
"loss": 0.6557,
"step": 700
},
{
"epoch": 0.9475806451612904,
"grad_norm": 2.02850385366728,
"learning_rate": 5.979073243647234e-09,
"loss": 0.6663,
"step": 705
},
{
"epoch": 0.9543010752688172,
"grad_norm": 2.2433567010088797,
"learning_rate": 5.23168908819133e-09,
"loss": 0.6796,
"step": 710
},
{
"epoch": 0.9610215053763441,
"grad_norm": 2.057838483040146,
"learning_rate": 4.484304932735426e-09,
"loss": 0.6626,
"step": 715
},
{
"epoch": 0.967741935483871,
"grad_norm": 2.1280821159120786,
"learning_rate": 3.736920777279522e-09,
"loss": 0.633,
"step": 720
},
{
"epoch": 0.9744623655913979,
"grad_norm": 2.019967451723735,
"learning_rate": 2.989536621823617e-09,
"loss": 0.6718,
"step": 725
},
{
"epoch": 0.9811827956989247,
"grad_norm": 2.1683941089274477,
"learning_rate": 2.242152466367713e-09,
"loss": 0.6578,
"step": 730
},
{
"epoch": 0.9879032258064516,
"grad_norm": 2.015824272032049,
"learning_rate": 1.4947683109118085e-09,
"loss": 0.6781,
"step": 735
},
{
"epoch": 0.9946236559139785,
"grad_norm": 2.1171098020664325,
"learning_rate": 7.473841554559043e-10,
"loss": 0.6829,
"step": 740
},
{
"epoch": 1.0,
"step": 744,
"total_flos": 1582562992455680.0,
"train_loss": 0.7245069735793657,
"train_runtime": 3975.4106,
"train_samples_per_second": 11.971,
"train_steps_per_second": 0.187
}
],
"logging_steps": 5,
"max_steps": 744,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 75,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1582562992455680.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
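
For reference, a minimal sketch of how this trainer_state.json can be parsed to recover the evaluation curve and the best checkpoint. Python with the standard library only; the filename and working directory are assumptions for illustration, not part of the file itself.

import json

# Load the trainer state written by the Hugging Face Trainer.
# Assumes the file is saved as "trainer_state.json" in the current directory.
with open("trainer_state.json") as f:
    state = json.load(f)

# Best checkpoint as recorded above (selected on eval_loss, lower is better).
print("best step:", state["best_global_step"])
print("best eval_loss:", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])

# log_history mixes training logs (key "loss") and evaluation logs (key "eval_loss").
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]
for e in eval_logs:
    print(f'step {e["step"]:>4}: eval_loss={e["eval_loss"]:.4f}')

Run as-is, this prints one line per evaluation, i.e. every 75 optimizer steps per the "eval_steps" setting above.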