|
{ |
|
"best_metric": 0.588302731513977, |
|
"best_model_checkpoint": "checkpoints/rft-llama-3.2-1b-gsm8k/gsm8k/llama-3.2-1b-instruct-step-1/checkpoint-603", |
|
"epoch": 1.0, |
|
"eval_steps": 67, |
|
"global_step": 664, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007530120481927711, |
|
"grad_norm": 69.0, |
|
"learning_rate": 7.462686567164179e-07, |
|
"loss": 1.987, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.015060240963855422, |
|
"grad_norm": 58.5, |
|
"learning_rate": 1.4925373134328358e-06, |
|
"loss": 1.9009, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.022590361445783132, |
|
"grad_norm": 41.5, |
|
"learning_rate": 2.238805970149254e-06, |
|
"loss": 1.752, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.030120481927710843, |
|
"grad_norm": 24.375, |
|
"learning_rate": 2.9850746268656716e-06, |
|
"loss": 1.5653, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03765060240963856, |
|
"grad_norm": 19.0, |
|
"learning_rate": 3.73134328358209e-06, |
|
"loss": 1.3427, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.045180722891566265, |
|
"grad_norm": 13.5625, |
|
"learning_rate": 4.477611940298508e-06, |
|
"loss": 1.0619, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05271084337349398, |
|
"grad_norm": 8.5, |
|
"learning_rate": 5.2238805970149255e-06, |
|
"loss": 0.8756, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.060240963855421686, |
|
"grad_norm": 5.375, |
|
"learning_rate": 5.970149253731343e-06, |
|
"loss": 0.7862, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0677710843373494, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 6.7164179104477625e-06, |
|
"loss": 0.7085, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.07530120481927711, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 7.46268656716418e-06, |
|
"loss": 0.6669, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08283132530120482, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 8.208955223880599e-06, |
|
"loss": 0.6517, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.09036144578313253, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 8.955223880597016e-06, |
|
"loss": 0.6513, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.09789156626506024, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 9.701492537313434e-06, |
|
"loss": 0.6549, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.10090361445783133, |
|
"eval_loss": 0.675618052482605, |
|
"eval_runtime": 3.6002, |
|
"eval_samples_per_second": 1722.964, |
|
"eval_steps_per_second": 26.943, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.10542168674698796, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 9.949748743718594e-06, |
|
"loss": 0.6582, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.11295180722891567, |
|
"grad_norm": 2.125, |
|
"learning_rate": 9.865996649916248e-06, |
|
"loss": 0.6411, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.12048192771084337, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 9.782244556113903e-06, |
|
"loss": 0.6239, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1280120481927711, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 9.698492462311559e-06, |
|
"loss": 0.6404, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.1355421686746988, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 9.614740368509213e-06, |
|
"loss": 0.6233, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1430722891566265, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 9.530988274706869e-06, |
|
"loss": 0.6253, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.15060240963855423, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 9.447236180904523e-06, |
|
"loss": 0.6121, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.15813253012048192, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 9.363484087102177e-06, |
|
"loss": 0.622, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.16566265060240964, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 9.279731993299833e-06, |
|
"loss": 0.59, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.17319277108433734, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 9.195979899497488e-06, |
|
"loss": 0.5997, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.18072289156626506, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 9.112227805695144e-06, |
|
"loss": 0.5906, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.18825301204819278, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 9.028475711892798e-06, |
|
"loss": 0.5887, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.19578313253012047, |
|
"grad_norm": 2.0, |
|
"learning_rate": 8.944723618090452e-06, |
|
"loss": 0.5751, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.20180722891566266, |
|
"eval_loss": 0.6029960513114929, |
|
"eval_runtime": 3.5961, |
|
"eval_samples_per_second": 1724.903, |
|
"eval_steps_per_second": 26.973, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.2033132530120482, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 8.860971524288108e-06, |
|
"loss": 0.571, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.21084337349397592, |
|
"grad_norm": 2.0, |
|
"learning_rate": 8.777219430485762e-06, |
|
"loss": 0.5661, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2183734939759036, |
|
"grad_norm": 1.625, |
|
"learning_rate": 8.693467336683418e-06, |
|
"loss": 0.5578, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.22590361445783133, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 8.609715242881073e-06, |
|
"loss": 0.5639, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.23343373493975902, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 8.525963149078727e-06, |
|
"loss": 0.5549, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.24096385542168675, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 8.442211055276383e-06, |
|
"loss": 0.5524, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.24849397590361447, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 8.358458961474037e-06, |
|
"loss": 0.5786, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.2560240963855422, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 8.274706867671693e-06, |
|
"loss": 0.564, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2635542168674699, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 8.190954773869347e-06, |
|
"loss": 0.5666, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.2710843373493976, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 8.107202680067002e-06, |
|
"loss": 0.5531, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2786144578313253, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 8.023450586264658e-06, |
|
"loss": 0.5663, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.286144578313253, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 7.939698492462312e-06, |
|
"loss": 0.5447, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2936746987951807, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 7.855946398659968e-06, |
|
"loss": 0.5554, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.30120481927710846, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 7.772194304857622e-06, |
|
"loss": 0.5854, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.30271084337349397, |
|
"eval_loss": 0.5955631136894226, |
|
"eval_runtime": 3.5892, |
|
"eval_samples_per_second": 1728.235, |
|
"eval_steps_per_second": 27.025, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.30873493975903615, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 7.688442211055276e-06, |
|
"loss": 0.5483, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.31626506024096385, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 7.604690117252932e-06, |
|
"loss": 0.5649, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.32379518072289154, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 7.520938023450587e-06, |
|
"loss": 0.5584, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.3313253012048193, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 7.437185929648242e-06, |
|
"loss": 0.551, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.338855421686747, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 7.353433835845897e-06, |
|
"loss": 0.5505, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.3463855421686747, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 7.269681742043552e-06, |
|
"loss": 0.5501, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.3539156626506024, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 7.185929648241206e-06, |
|
"loss": 0.5471, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.3614457831325301, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 7.102177554438861e-06, |
|
"loss": 0.5542, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3689759036144578, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 7.0184254606365165e-06, |
|
"loss": 0.5382, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.37650602409638556, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 6.934673366834172e-06, |
|
"loss": 0.5598, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.38403614457831325, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 6.850921273031827e-06, |
|
"loss": 0.5627, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.39156626506024095, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 6.767169179229481e-06, |
|
"loss": 0.5505, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3990963855421687, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 6.683417085427136e-06, |
|
"loss": 0.5524, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.4036144578313253, |
|
"eval_loss": 0.5925902724266052, |
|
"eval_runtime": 3.5903, |
|
"eval_samples_per_second": 1727.697, |
|
"eval_steps_per_second": 27.017, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.4066265060240964, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 6.599664991624791e-06, |
|
"loss": 0.5574, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4141566265060241, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 6.515912897822446e-06, |
|
"loss": 0.5494, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.42168674698795183, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 6.4321608040201015e-06, |
|
"loss": 0.5602, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4292168674698795, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 6.348408710217756e-06, |
|
"loss": 0.5534, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.4367469879518072, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 6.264656616415411e-06, |
|
"loss": 0.5446, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4442771084337349, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 6.180904522613066e-06, |
|
"loss": 0.5474, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.45180722891566266, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 6.097152428810721e-06, |
|
"loss": 0.5516, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.45933734939759036, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 6.013400335008376e-06, |
|
"loss": 0.5547, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.46686746987951805, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 5.9296482412060305e-06, |
|
"loss": 0.5807, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.4743975903614458, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 5.845896147403686e-06, |
|
"loss": 0.559, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.4819277108433735, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 5.762144053601341e-06, |
|
"loss": 0.5426, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.4894578313253012, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 5.678391959798996e-06, |
|
"loss": 0.5428, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.49698795180722893, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 5.59463986599665e-06, |
|
"loss": 0.5416, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5045180722891566, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 5.510887772194305e-06, |
|
"loss": 0.556, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.5045180722891566, |
|
"eval_loss": 0.5903183817863464, |
|
"eval_runtime": 3.5762, |
|
"eval_samples_per_second": 1734.545, |
|
"eval_steps_per_second": 27.124, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.5120481927710844, |
|
"grad_norm": 1.875, |
|
"learning_rate": 5.42713567839196e-06, |
|
"loss": 0.5486, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5195783132530121, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 5.3433835845896155e-06, |
|
"loss": 0.5599, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.5271084337349398, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 5.259631490787271e-06, |
|
"loss": 0.5544, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5346385542168675, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 5.175879396984925e-06, |
|
"loss": 0.5309, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.5421686746987951, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 5.09212730318258e-06, |
|
"loss": 0.5532, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5496987951807228, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 5.008375209380235e-06, |
|
"loss": 0.5515, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.5572289156626506, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 4.92462311557789e-06, |
|
"loss": 0.5429, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5647590361445783, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 4.840871021775545e-06, |
|
"loss": 0.5585, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.572289156626506, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 4.7571189279732e-06, |
|
"loss": 0.5551, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5798192771084337, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 4.673366834170855e-06, |
|
"loss": 0.5318, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.5873493975903614, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.58961474036851e-06, |
|
"loss": 0.559, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5948795180722891, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.505862646566165e-06, |
|
"loss": 0.5335, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.6024096385542169, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 4.42211055276382e-06, |
|
"loss": 0.5508, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6054216867469879, |
|
"eval_loss": 0.5893600583076477, |
|
"eval_runtime": 3.5911, |
|
"eval_samples_per_second": 1727.335, |
|
"eval_steps_per_second": 27.011, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.6099397590361446, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 4.338358458961474e-06, |
|
"loss": 0.5425, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.6174698795180723, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.254606365159129e-06, |
|
"loss": 0.535, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 4.170854271356784e-06, |
|
"loss": 0.5483, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.6325301204819277, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 4.087102177554439e-06, |
|
"loss": 0.5447, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6400602409638554, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 4.003350083752094e-06, |
|
"loss": 0.5443, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.6475903614457831, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.919597989949749e-06, |
|
"loss": 0.5469, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6551204819277109, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 3.8358458961474034e-06, |
|
"loss": 0.5525, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.6626506024096386, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.7520938023450586e-06, |
|
"loss": 0.5524, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6701807228915663, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 3.6683417085427137e-06, |
|
"loss": 0.5452, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.677710843373494, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.5845896147403684e-06, |
|
"loss": 0.5524, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6852409638554217, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.5008375209380235e-06, |
|
"loss": 0.5508, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.6927710843373494, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 3.4170854271356786e-06, |
|
"loss": 0.5393, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7003012048192772, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.5595, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.7063253012048193, |
|
"eval_loss": 0.588641345500946, |
|
"eval_runtime": 3.5901, |
|
"eval_samples_per_second": 1727.809, |
|
"eval_steps_per_second": 27.019, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.7078313253012049, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 3.2495812395309884e-06, |
|
"loss": 0.5581, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7153614457831325, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.165829145728643e-06, |
|
"loss": 0.5469, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.7228915662650602, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 3.0820770519262983e-06, |
|
"loss": 0.5565, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7304216867469879, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 2.9983249581239534e-06, |
|
"loss": 0.5511, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.7379518072289156, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.914572864321608e-06, |
|
"loss": 0.5355, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7454819277108434, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 2.830820770519263e-06, |
|
"loss": 0.5285, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.7530120481927711, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 2.747068676716918e-06, |
|
"loss": 0.5587, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7605421686746988, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.663316582914573e-06, |
|
"loss": 0.5516, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.7680722891566265, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.5795644891122277e-06, |
|
"loss": 0.5484, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.7756024096385542, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 2.495812395309883e-06, |
|
"loss": 0.5419, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.7831325301204819, |
|
"grad_norm": 1.75, |
|
"learning_rate": 2.412060301507538e-06, |
|
"loss": 0.5501, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.7906626506024096, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.3283082077051927e-06, |
|
"loss": 0.5378, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.7981927710843374, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.2445561139028478e-06, |
|
"loss": 0.5258, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.8057228915662651, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.1608040201005025e-06, |
|
"loss": 0.5488, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.8072289156626506, |
|
"eval_loss": 0.5883608460426331, |
|
"eval_runtime": 3.5937, |
|
"eval_samples_per_second": 1726.072, |
|
"eval_steps_per_second": 26.992, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.8132530120481928, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 2.0770519262981576e-06, |
|
"loss": 0.553, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8207831325301205, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.9932998324958123e-06, |
|
"loss": 0.5254, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.8283132530120482, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.9095477386934674e-06, |
|
"loss": 0.5468, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8358433734939759, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.8257956448911223e-06, |
|
"loss": 0.5407, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.8433734939759037, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.7420435510887772e-06, |
|
"loss": 0.5445, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8509036144578314, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.6582914572864323e-06, |
|
"loss": 0.5459, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.858433734939759, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.5745393634840873e-06, |
|
"loss": 0.5479, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.8659638554216867, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.4907872696817422e-06, |
|
"loss": 0.5485, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.8734939759036144, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.407035175879397e-06, |
|
"loss": 0.5581, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.8810240963855421, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.323283082077052e-06, |
|
"loss": 0.533, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.8885542168674698, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.2395309882747069e-06, |
|
"loss": 0.5246, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.8960843373493976, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.155778894472362e-06, |
|
"loss": 0.551, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.9036144578313253, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.072026800670017e-06, |
|
"loss": 0.5425, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9081325301204819, |
|
"eval_loss": 0.588302731513977, |
|
"eval_runtime": 3.591, |
|
"eval_samples_per_second": 1727.393, |
|
"eval_steps_per_second": 27.012, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.911144578313253, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 9.882747068676718e-07, |
|
"loss": 0.5317, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.9186746987951807, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 9.045226130653267e-07, |
|
"loss": 0.54, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9262048192771084, |
|
"grad_norm": 1.75, |
|
"learning_rate": 8.207705192629816e-07, |
|
"loss": 0.5274, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.9337349397590361, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 7.370184254606367e-07, |
|
"loss": 0.5478, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9412650602409639, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 6.532663316582916e-07, |
|
"loss": 0.5633, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.9487951807228916, |
|
"grad_norm": 1.875, |
|
"learning_rate": 5.695142378559465e-07, |
|
"loss": 0.5438, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9563253012048193, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 4.857621440536014e-07, |
|
"loss": 0.54, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.963855421686747, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 4.0201005025125634e-07, |
|
"loss": 0.5439, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9713855421686747, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.1825795644891125e-07, |
|
"loss": 0.5337, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.9789156626506024, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.3450586264656616e-07, |
|
"loss": 0.5338, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.9864457831325302, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.5075376884422112e-07, |
|
"loss": 0.5374, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.9939759036144579, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 6.700167504187606e-08, |
|
"loss": 0.5403, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 664, |
|
"total_flos": 7.546108123702886e+16, |
|
"train_loss": 0.6115179532263653, |
|
"train_runtime": 216.1794, |
|
"train_samples_per_second": 196.448, |
|
"train_steps_per_second": 3.072 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 664, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 67, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.546108123702886e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|