rft-llama-3.2-1b-instruct-gsm8k / trainer_state.json
JakeOh's picture
Upload folder using huggingface_hub
cead756 verified
{
"best_metric": 0.588302731513977,
"best_model_checkpoint": "checkpoints/rft-llama-3.2-1b-gsm8k/gsm8k/llama-3.2-1b-instruct-step-1/checkpoint-603",
"epoch": 1.0,
"eval_steps": 67,
"global_step": 664,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007530120481927711,
"grad_norm": 69.0,
"learning_rate": 7.462686567164179e-07,
"loss": 1.987,
"step": 5
},
{
"epoch": 0.015060240963855422,
"grad_norm": 58.5,
"learning_rate": 1.4925373134328358e-06,
"loss": 1.9009,
"step": 10
},
{
"epoch": 0.022590361445783132,
"grad_norm": 41.5,
"learning_rate": 2.238805970149254e-06,
"loss": 1.752,
"step": 15
},
{
"epoch": 0.030120481927710843,
"grad_norm": 24.375,
"learning_rate": 2.9850746268656716e-06,
"loss": 1.5653,
"step": 20
},
{
"epoch": 0.03765060240963856,
"grad_norm": 19.0,
"learning_rate": 3.73134328358209e-06,
"loss": 1.3427,
"step": 25
},
{
"epoch": 0.045180722891566265,
"grad_norm": 13.5625,
"learning_rate": 4.477611940298508e-06,
"loss": 1.0619,
"step": 30
},
{
"epoch": 0.05271084337349398,
"grad_norm": 8.5,
"learning_rate": 5.2238805970149255e-06,
"loss": 0.8756,
"step": 35
},
{
"epoch": 0.060240963855421686,
"grad_norm": 5.375,
"learning_rate": 5.970149253731343e-06,
"loss": 0.7862,
"step": 40
},
{
"epoch": 0.0677710843373494,
"grad_norm": 3.140625,
"learning_rate": 6.7164179104477625e-06,
"loss": 0.7085,
"step": 45
},
{
"epoch": 0.07530120481927711,
"grad_norm": 2.90625,
"learning_rate": 7.46268656716418e-06,
"loss": 0.6669,
"step": 50
},
{
"epoch": 0.08283132530120482,
"grad_norm": 2.421875,
"learning_rate": 8.208955223880599e-06,
"loss": 0.6517,
"step": 55
},
{
"epoch": 0.09036144578313253,
"grad_norm": 2.359375,
"learning_rate": 8.955223880597016e-06,
"loss": 0.6513,
"step": 60
},
{
"epoch": 0.09789156626506024,
"grad_norm": 2.15625,
"learning_rate": 9.701492537313434e-06,
"loss": 0.6549,
"step": 65
},
{
"epoch": 0.10090361445783133,
"eval_loss": 0.675618052482605,
"eval_runtime": 3.6002,
"eval_samples_per_second": 1722.964,
"eval_steps_per_second": 26.943,
"step": 67
},
{
"epoch": 0.10542168674698796,
"grad_norm": 2.171875,
"learning_rate": 9.949748743718594e-06,
"loss": 0.6582,
"step": 70
},
{
"epoch": 0.11295180722891567,
"grad_norm": 2.125,
"learning_rate": 9.865996649916248e-06,
"loss": 0.6411,
"step": 75
},
{
"epoch": 0.12048192771084337,
"grad_norm": 2.15625,
"learning_rate": 9.782244556113903e-06,
"loss": 0.6239,
"step": 80
},
{
"epoch": 0.1280120481927711,
"grad_norm": 2.234375,
"learning_rate": 9.698492462311559e-06,
"loss": 0.6404,
"step": 85
},
{
"epoch": 0.1355421686746988,
"grad_norm": 2.328125,
"learning_rate": 9.614740368509213e-06,
"loss": 0.6233,
"step": 90
},
{
"epoch": 0.1430722891566265,
"grad_norm": 2.6875,
"learning_rate": 9.530988274706869e-06,
"loss": 0.6253,
"step": 95
},
{
"epoch": 0.15060240963855423,
"grad_norm": 3.046875,
"learning_rate": 9.447236180904523e-06,
"loss": 0.6121,
"step": 100
},
{
"epoch": 0.15813253012048192,
"grad_norm": 3.40625,
"learning_rate": 9.363484087102177e-06,
"loss": 0.622,
"step": 105
},
{
"epoch": 0.16566265060240964,
"grad_norm": 4.0625,
"learning_rate": 9.279731993299833e-06,
"loss": 0.59,
"step": 110
},
{
"epoch": 0.17319277108433734,
"grad_norm": 3.46875,
"learning_rate": 9.195979899497488e-06,
"loss": 0.5997,
"step": 115
},
{
"epoch": 0.18072289156626506,
"grad_norm": 2.359375,
"learning_rate": 9.112227805695144e-06,
"loss": 0.5906,
"step": 120
},
{
"epoch": 0.18825301204819278,
"grad_norm": 2.40625,
"learning_rate": 9.028475711892798e-06,
"loss": 0.5887,
"step": 125
},
{
"epoch": 0.19578313253012047,
"grad_norm": 2.0,
"learning_rate": 8.944723618090452e-06,
"loss": 0.5751,
"step": 130
},
{
"epoch": 0.20180722891566266,
"eval_loss": 0.6029960513114929,
"eval_runtime": 3.5961,
"eval_samples_per_second": 1724.903,
"eval_steps_per_second": 26.973,
"step": 134
},
{
"epoch": 0.2033132530120482,
"grad_norm": 1.8984375,
"learning_rate": 8.860971524288108e-06,
"loss": 0.571,
"step": 135
},
{
"epoch": 0.21084337349397592,
"grad_norm": 2.0,
"learning_rate": 8.777219430485762e-06,
"loss": 0.5661,
"step": 140
},
{
"epoch": 0.2183734939759036,
"grad_norm": 1.625,
"learning_rate": 8.693467336683418e-06,
"loss": 0.5578,
"step": 145
},
{
"epoch": 0.22590361445783133,
"grad_norm": 1.90625,
"learning_rate": 8.609715242881073e-06,
"loss": 0.5639,
"step": 150
},
{
"epoch": 0.23343373493975902,
"grad_norm": 1.7578125,
"learning_rate": 8.525963149078727e-06,
"loss": 0.5549,
"step": 155
},
{
"epoch": 0.24096385542168675,
"grad_norm": 1.90625,
"learning_rate": 8.442211055276383e-06,
"loss": 0.5524,
"step": 160
},
{
"epoch": 0.24849397590361447,
"grad_norm": 2.078125,
"learning_rate": 8.358458961474037e-06,
"loss": 0.5786,
"step": 165
},
{
"epoch": 0.2560240963855422,
"grad_norm": 1.734375,
"learning_rate": 8.274706867671693e-06,
"loss": 0.564,
"step": 170
},
{
"epoch": 0.2635542168674699,
"grad_norm": 1.8984375,
"learning_rate": 8.190954773869347e-06,
"loss": 0.5666,
"step": 175
},
{
"epoch": 0.2710843373493976,
"grad_norm": 1.765625,
"learning_rate": 8.107202680067002e-06,
"loss": 0.5531,
"step": 180
},
{
"epoch": 0.2786144578313253,
"grad_norm": 1.7890625,
"learning_rate": 8.023450586264658e-06,
"loss": 0.5663,
"step": 185
},
{
"epoch": 0.286144578313253,
"grad_norm": 1.84375,
"learning_rate": 7.939698492462312e-06,
"loss": 0.5447,
"step": 190
},
{
"epoch": 0.2936746987951807,
"grad_norm": 1.84375,
"learning_rate": 7.855946398659968e-06,
"loss": 0.5554,
"step": 195
},
{
"epoch": 0.30120481927710846,
"grad_norm": 1.734375,
"learning_rate": 7.772194304857622e-06,
"loss": 0.5854,
"step": 200
},
{
"epoch": 0.30271084337349397,
"eval_loss": 0.5955631136894226,
"eval_runtime": 3.5892,
"eval_samples_per_second": 1728.235,
"eval_steps_per_second": 27.025,
"step": 201
},
{
"epoch": 0.30873493975903615,
"grad_norm": 1.890625,
"learning_rate": 7.688442211055276e-06,
"loss": 0.5483,
"step": 205
},
{
"epoch": 0.31626506024096385,
"grad_norm": 1.90625,
"learning_rate": 7.604690117252932e-06,
"loss": 0.5649,
"step": 210
},
{
"epoch": 0.32379518072289154,
"grad_norm": 1.9375,
"learning_rate": 7.520938023450587e-06,
"loss": 0.5584,
"step": 215
},
{
"epoch": 0.3313253012048193,
"grad_norm": 1.7109375,
"learning_rate": 7.437185929648242e-06,
"loss": 0.551,
"step": 220
},
{
"epoch": 0.338855421686747,
"grad_norm": 1.796875,
"learning_rate": 7.353433835845897e-06,
"loss": 0.5505,
"step": 225
},
{
"epoch": 0.3463855421686747,
"grad_norm": 1.7734375,
"learning_rate": 7.269681742043552e-06,
"loss": 0.5501,
"step": 230
},
{
"epoch": 0.3539156626506024,
"grad_norm": 1.8359375,
"learning_rate": 7.185929648241206e-06,
"loss": 0.5471,
"step": 235
},
{
"epoch": 0.3614457831325301,
"grad_norm": 1.8046875,
"learning_rate": 7.102177554438861e-06,
"loss": 0.5542,
"step": 240
},
{
"epoch": 0.3689759036144578,
"grad_norm": 1.6640625,
"learning_rate": 7.0184254606365165e-06,
"loss": 0.5382,
"step": 245
},
{
"epoch": 0.37650602409638556,
"grad_norm": 1.8125,
"learning_rate": 6.934673366834172e-06,
"loss": 0.5598,
"step": 250
},
{
"epoch": 0.38403614457831325,
"grad_norm": 1.78125,
"learning_rate": 6.850921273031827e-06,
"loss": 0.5627,
"step": 255
},
{
"epoch": 0.39156626506024095,
"grad_norm": 1.7578125,
"learning_rate": 6.767169179229481e-06,
"loss": 0.5505,
"step": 260
},
{
"epoch": 0.3990963855421687,
"grad_norm": 1.7890625,
"learning_rate": 6.683417085427136e-06,
"loss": 0.5524,
"step": 265
},
{
"epoch": 0.4036144578313253,
"eval_loss": 0.5925902724266052,
"eval_runtime": 3.5903,
"eval_samples_per_second": 1727.697,
"eval_steps_per_second": 27.017,
"step": 268
},
{
"epoch": 0.4066265060240964,
"grad_norm": 1.7578125,
"learning_rate": 6.599664991624791e-06,
"loss": 0.5574,
"step": 270
},
{
"epoch": 0.4141566265060241,
"grad_norm": 1.796875,
"learning_rate": 6.515912897822446e-06,
"loss": 0.5494,
"step": 275
},
{
"epoch": 0.42168674698795183,
"grad_norm": 1.7734375,
"learning_rate": 6.4321608040201015e-06,
"loss": 0.5602,
"step": 280
},
{
"epoch": 0.4292168674698795,
"grad_norm": 1.6953125,
"learning_rate": 6.348408710217756e-06,
"loss": 0.5534,
"step": 285
},
{
"epoch": 0.4367469879518072,
"grad_norm": 1.6953125,
"learning_rate": 6.264656616415411e-06,
"loss": 0.5446,
"step": 290
},
{
"epoch": 0.4442771084337349,
"grad_norm": 1.78125,
"learning_rate": 6.180904522613066e-06,
"loss": 0.5474,
"step": 295
},
{
"epoch": 0.45180722891566266,
"grad_norm": 1.6484375,
"learning_rate": 6.097152428810721e-06,
"loss": 0.5516,
"step": 300
},
{
"epoch": 0.45933734939759036,
"grad_norm": 1.6953125,
"learning_rate": 6.013400335008376e-06,
"loss": 0.5547,
"step": 305
},
{
"epoch": 0.46686746987951805,
"grad_norm": 1.6875,
"learning_rate": 5.9296482412060305e-06,
"loss": 0.5807,
"step": 310
},
{
"epoch": 0.4743975903614458,
"grad_norm": 1.8984375,
"learning_rate": 5.845896147403686e-06,
"loss": 0.559,
"step": 315
},
{
"epoch": 0.4819277108433735,
"grad_norm": 1.6875,
"learning_rate": 5.762144053601341e-06,
"loss": 0.5426,
"step": 320
},
{
"epoch": 0.4894578313253012,
"grad_norm": 1.8828125,
"learning_rate": 5.678391959798996e-06,
"loss": 0.5428,
"step": 325
},
{
"epoch": 0.49698795180722893,
"grad_norm": 1.90625,
"learning_rate": 5.59463986599665e-06,
"loss": 0.5416,
"step": 330
},
{
"epoch": 0.5045180722891566,
"grad_norm": 1.7734375,
"learning_rate": 5.510887772194305e-06,
"loss": 0.556,
"step": 335
},
{
"epoch": 0.5045180722891566,
"eval_loss": 0.5903183817863464,
"eval_runtime": 3.5762,
"eval_samples_per_second": 1734.545,
"eval_steps_per_second": 27.124,
"step": 335
},
{
"epoch": 0.5120481927710844,
"grad_norm": 1.875,
"learning_rate": 5.42713567839196e-06,
"loss": 0.5486,
"step": 340
},
{
"epoch": 0.5195783132530121,
"grad_norm": 1.90625,
"learning_rate": 5.3433835845896155e-06,
"loss": 0.5599,
"step": 345
},
{
"epoch": 0.5271084337349398,
"grad_norm": 1.78125,
"learning_rate": 5.259631490787271e-06,
"loss": 0.5544,
"step": 350
},
{
"epoch": 0.5346385542168675,
"grad_norm": 1.7578125,
"learning_rate": 5.175879396984925e-06,
"loss": 0.5309,
"step": 355
},
{
"epoch": 0.5421686746987951,
"grad_norm": 1.921875,
"learning_rate": 5.09212730318258e-06,
"loss": 0.5532,
"step": 360
},
{
"epoch": 0.5496987951807228,
"grad_norm": 1.7109375,
"learning_rate": 5.008375209380235e-06,
"loss": 0.5515,
"step": 365
},
{
"epoch": 0.5572289156626506,
"grad_norm": 1.84375,
"learning_rate": 4.92462311557789e-06,
"loss": 0.5429,
"step": 370
},
{
"epoch": 0.5647590361445783,
"grad_norm": 1.8203125,
"learning_rate": 4.840871021775545e-06,
"loss": 0.5585,
"step": 375
},
{
"epoch": 0.572289156626506,
"grad_norm": 1.859375,
"learning_rate": 4.7571189279732e-06,
"loss": 0.5551,
"step": 380
},
{
"epoch": 0.5798192771084337,
"grad_norm": 1.7890625,
"learning_rate": 4.673366834170855e-06,
"loss": 0.5318,
"step": 385
},
{
"epoch": 0.5873493975903614,
"grad_norm": 1.7265625,
"learning_rate": 4.58961474036851e-06,
"loss": 0.559,
"step": 390
},
{
"epoch": 0.5948795180722891,
"grad_norm": 1.7265625,
"learning_rate": 4.505862646566165e-06,
"loss": 0.5335,
"step": 395
},
{
"epoch": 0.6024096385542169,
"grad_norm": 1.6640625,
"learning_rate": 4.42211055276382e-06,
"loss": 0.5508,
"step": 400
},
{
"epoch": 0.6054216867469879,
"eval_loss": 0.5893600583076477,
"eval_runtime": 3.5911,
"eval_samples_per_second": 1727.335,
"eval_steps_per_second": 27.011,
"step": 402
},
{
"epoch": 0.6099397590361446,
"grad_norm": 1.78125,
"learning_rate": 4.338358458961474e-06,
"loss": 0.5425,
"step": 405
},
{
"epoch": 0.6174698795180723,
"grad_norm": 1.71875,
"learning_rate": 4.254606365159129e-06,
"loss": 0.535,
"step": 410
},
{
"epoch": 0.625,
"grad_norm": 1.7421875,
"learning_rate": 4.170854271356784e-06,
"loss": 0.5483,
"step": 415
},
{
"epoch": 0.6325301204819277,
"grad_norm": 1.765625,
"learning_rate": 4.087102177554439e-06,
"loss": 0.5447,
"step": 420
},
{
"epoch": 0.6400602409638554,
"grad_norm": 1.7109375,
"learning_rate": 4.003350083752094e-06,
"loss": 0.5443,
"step": 425
},
{
"epoch": 0.6475903614457831,
"grad_norm": 1.6640625,
"learning_rate": 3.919597989949749e-06,
"loss": 0.5469,
"step": 430
},
{
"epoch": 0.6551204819277109,
"grad_norm": 1.765625,
"learning_rate": 3.8358458961474034e-06,
"loss": 0.5525,
"step": 435
},
{
"epoch": 0.6626506024096386,
"grad_norm": 1.734375,
"learning_rate": 3.7520938023450586e-06,
"loss": 0.5524,
"step": 440
},
{
"epoch": 0.6701807228915663,
"grad_norm": 1.890625,
"learning_rate": 3.6683417085427137e-06,
"loss": 0.5452,
"step": 445
},
{
"epoch": 0.677710843373494,
"grad_norm": 1.78125,
"learning_rate": 3.5845896147403684e-06,
"loss": 0.5524,
"step": 450
},
{
"epoch": 0.6852409638554217,
"grad_norm": 1.7421875,
"learning_rate": 3.5008375209380235e-06,
"loss": 0.5508,
"step": 455
},
{
"epoch": 0.6927710843373494,
"grad_norm": 1.7109375,
"learning_rate": 3.4170854271356786e-06,
"loss": 0.5393,
"step": 460
},
{
"epoch": 0.7003012048192772,
"grad_norm": 1.7578125,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.5595,
"step": 465
},
{
"epoch": 0.7063253012048193,
"eval_loss": 0.588641345500946,
"eval_runtime": 3.5901,
"eval_samples_per_second": 1727.809,
"eval_steps_per_second": 27.019,
"step": 469
},
{
"epoch": 0.7078313253012049,
"grad_norm": 1.7890625,
"learning_rate": 3.2495812395309884e-06,
"loss": 0.5581,
"step": 470
},
{
"epoch": 0.7153614457831325,
"grad_norm": 1.7421875,
"learning_rate": 3.165829145728643e-06,
"loss": 0.5469,
"step": 475
},
{
"epoch": 0.7228915662650602,
"grad_norm": 1.8203125,
"learning_rate": 3.0820770519262983e-06,
"loss": 0.5565,
"step": 480
},
{
"epoch": 0.7304216867469879,
"grad_norm": 1.7421875,
"learning_rate": 2.9983249581239534e-06,
"loss": 0.5511,
"step": 485
},
{
"epoch": 0.7379518072289156,
"grad_norm": 1.6953125,
"learning_rate": 2.914572864321608e-06,
"loss": 0.5355,
"step": 490
},
{
"epoch": 0.7454819277108434,
"grad_norm": 1.7265625,
"learning_rate": 2.830820770519263e-06,
"loss": 0.5285,
"step": 495
},
{
"epoch": 0.7530120481927711,
"grad_norm": 1.7890625,
"learning_rate": 2.747068676716918e-06,
"loss": 0.5587,
"step": 500
},
{
"epoch": 0.7605421686746988,
"grad_norm": 1.703125,
"learning_rate": 2.663316582914573e-06,
"loss": 0.5516,
"step": 505
},
{
"epoch": 0.7680722891566265,
"grad_norm": 1.703125,
"learning_rate": 2.5795644891122277e-06,
"loss": 0.5484,
"step": 510
},
{
"epoch": 0.7756024096385542,
"grad_norm": 1.859375,
"learning_rate": 2.495812395309883e-06,
"loss": 0.5419,
"step": 515
},
{
"epoch": 0.7831325301204819,
"grad_norm": 1.75,
"learning_rate": 2.412060301507538e-06,
"loss": 0.5501,
"step": 520
},
{
"epoch": 0.7906626506024096,
"grad_norm": 1.6484375,
"learning_rate": 2.3283082077051927e-06,
"loss": 0.5378,
"step": 525
},
{
"epoch": 0.7981927710843374,
"grad_norm": 1.6796875,
"learning_rate": 2.2445561139028478e-06,
"loss": 0.5258,
"step": 530
},
{
"epoch": 0.8057228915662651,
"grad_norm": 1.6953125,
"learning_rate": 2.1608040201005025e-06,
"loss": 0.5488,
"step": 535
},
{
"epoch": 0.8072289156626506,
"eval_loss": 0.5883608460426331,
"eval_runtime": 3.5937,
"eval_samples_per_second": 1726.072,
"eval_steps_per_second": 26.992,
"step": 536
},
{
"epoch": 0.8132530120481928,
"grad_norm": 1.8125,
"learning_rate": 2.0770519262981576e-06,
"loss": 0.553,
"step": 540
},
{
"epoch": 0.8207831325301205,
"grad_norm": 1.6796875,
"learning_rate": 1.9932998324958123e-06,
"loss": 0.5254,
"step": 545
},
{
"epoch": 0.8283132530120482,
"grad_norm": 1.7890625,
"learning_rate": 1.9095477386934674e-06,
"loss": 0.5468,
"step": 550
},
{
"epoch": 0.8358433734939759,
"grad_norm": 1.7265625,
"learning_rate": 1.8257956448911223e-06,
"loss": 0.5407,
"step": 555
},
{
"epoch": 0.8433734939759037,
"grad_norm": 1.765625,
"learning_rate": 1.7420435510887772e-06,
"loss": 0.5445,
"step": 560
},
{
"epoch": 0.8509036144578314,
"grad_norm": 1.7734375,
"learning_rate": 1.6582914572864323e-06,
"loss": 0.5459,
"step": 565
},
{
"epoch": 0.858433734939759,
"grad_norm": 1.7890625,
"learning_rate": 1.5745393634840873e-06,
"loss": 0.5479,
"step": 570
},
{
"epoch": 0.8659638554216867,
"grad_norm": 1.7265625,
"learning_rate": 1.4907872696817422e-06,
"loss": 0.5485,
"step": 575
},
{
"epoch": 0.8734939759036144,
"grad_norm": 1.765625,
"learning_rate": 1.407035175879397e-06,
"loss": 0.5581,
"step": 580
},
{
"epoch": 0.8810240963855421,
"grad_norm": 1.6953125,
"learning_rate": 1.323283082077052e-06,
"loss": 0.533,
"step": 585
},
{
"epoch": 0.8885542168674698,
"grad_norm": 1.671875,
"learning_rate": 1.2395309882747069e-06,
"loss": 0.5246,
"step": 590
},
{
"epoch": 0.8960843373493976,
"grad_norm": 1.7890625,
"learning_rate": 1.155778894472362e-06,
"loss": 0.551,
"step": 595
},
{
"epoch": 0.9036144578313253,
"grad_norm": 1.7265625,
"learning_rate": 1.072026800670017e-06,
"loss": 0.5425,
"step": 600
},
{
"epoch": 0.9081325301204819,
"eval_loss": 0.588302731513977,
"eval_runtime": 3.591,
"eval_samples_per_second": 1727.393,
"eval_steps_per_second": 27.012,
"step": 603
},
{
"epoch": 0.911144578313253,
"grad_norm": 1.640625,
"learning_rate": 9.882747068676718e-07,
"loss": 0.5317,
"step": 605
},
{
"epoch": 0.9186746987951807,
"grad_norm": 1.7734375,
"learning_rate": 9.045226130653267e-07,
"loss": 0.54,
"step": 610
},
{
"epoch": 0.9262048192771084,
"grad_norm": 1.75,
"learning_rate": 8.207705192629816e-07,
"loss": 0.5274,
"step": 615
},
{
"epoch": 0.9337349397590361,
"grad_norm": 1.859375,
"learning_rate": 7.370184254606367e-07,
"loss": 0.5478,
"step": 620
},
{
"epoch": 0.9412650602409639,
"grad_norm": 1.765625,
"learning_rate": 6.532663316582916e-07,
"loss": 0.5633,
"step": 625
},
{
"epoch": 0.9487951807228916,
"grad_norm": 1.875,
"learning_rate": 5.695142378559465e-07,
"loss": 0.5438,
"step": 630
},
{
"epoch": 0.9563253012048193,
"grad_norm": 1.8046875,
"learning_rate": 4.857621440536014e-07,
"loss": 0.54,
"step": 635
},
{
"epoch": 0.963855421686747,
"grad_norm": 1.7734375,
"learning_rate": 4.0201005025125634e-07,
"loss": 0.5439,
"step": 640
},
{
"epoch": 0.9713855421686747,
"grad_norm": 1.734375,
"learning_rate": 3.1825795644891125e-07,
"loss": 0.5337,
"step": 645
},
{
"epoch": 0.9789156626506024,
"grad_norm": 1.671875,
"learning_rate": 2.3450586264656616e-07,
"loss": 0.5338,
"step": 650
},
{
"epoch": 0.9864457831325302,
"grad_norm": 1.703125,
"learning_rate": 1.5075376884422112e-07,
"loss": 0.5374,
"step": 655
},
{
"epoch": 0.9939759036144579,
"grad_norm": 1.78125,
"learning_rate": 6.700167504187606e-08,
"loss": 0.5403,
"step": 660
},
{
"epoch": 1.0,
"step": 664,
"total_flos": 7.546108123702886e+16,
"train_loss": 0.6115179532263653,
"train_runtime": 216.1794,
"train_samples_per_second": 196.448,
"train_steps_per_second": 3.072
}
],
"logging_steps": 5,
"max_steps": 664,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 67,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.546108123702886e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}