|
{ |
|
"best_global_step": 1500, |
|
"best_metric": 0.8288407325744629, |
|
"best_model_checkpoint": "output/reasoning-model_v2/checkpoint-1500", |
|
"epoch": 0.13859373556315255, |
|
"eval_steps": 500, |
|
"global_step": 1500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0004619791185438418, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 2.4615384615384616e-07, |
|
"loss": 0.9733, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0009239582370876836, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 5.53846153846154e-07, |
|
"loss": 1.0462, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0013859373556315254, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 8.615384615384616e-07, |
|
"loss": 0.91, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0018479164741753672, |
|
"grad_norm": 9.5, |
|
"learning_rate": 1.1692307692307693e-06, |
|
"loss": 0.9938, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.002309895592719209, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 1.476923076923077e-06, |
|
"loss": 0.922, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0027718747112630508, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 1.7846153846153846e-06, |
|
"loss": 0.9497, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0032338538298068925, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 2.0923076923076926e-06, |
|
"loss": 0.9382, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0036958329483507343, |
|
"grad_norm": 5.75, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.9658, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.004157812066894576, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 2.7076923076923076e-06, |
|
"loss": 0.9687, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.004619791185438418, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 3.0153846153846154e-06, |
|
"loss": 0.9775, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.00508177030398226, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 3.323076923076923e-06, |
|
"loss": 0.9061, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.0055437494225261015, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 3.630769230769231e-06, |
|
"loss": 0.9917, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.006005728541069943, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 3.938461538461539e-06, |
|
"loss": 0.7981, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.006467707659613785, |
|
"grad_norm": 5.65625, |
|
"learning_rate": 4.246153846153846e-06, |
|
"loss": 0.9878, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.006929686778157627, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 4.553846153846154e-06, |
|
"loss": 0.9152, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.007391665896701469, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 4.861538461538462e-06, |
|
"loss": 1.0513, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.007853645015245311, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 5.16923076923077e-06, |
|
"loss": 0.9603, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.008315624133789152, |
|
"grad_norm": 5.625, |
|
"learning_rate": 5.476923076923077e-06, |
|
"loss": 0.9008, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.008777603252332995, |
|
"grad_norm": 6.0, |
|
"learning_rate": 5.784615384615385e-06, |
|
"loss": 0.9536, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.009239582370876836, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 6.092307692307693e-06, |
|
"loss": 0.9726, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.009701561489420679, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 1.0662, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.01016354060796452, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 6.707692307692308e-06, |
|
"loss": 0.8922, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.010625519726508362, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 7.015384615384616e-06, |
|
"loss": 0.8976, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.011087498845052203, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 7.323076923076924e-06, |
|
"loss": 0.7963, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.011549477963596046, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 7.630769230769232e-06, |
|
"loss": 0.8124, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.012011457082139887, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 7.93846153846154e-06, |
|
"loss": 0.8409, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.01247343620068373, |
|
"grad_norm": 5.375, |
|
"learning_rate": 8.246153846153848e-06, |
|
"loss": 0.9, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.01293541531922757, |
|
"grad_norm": 5.625, |
|
"learning_rate": 8.553846153846156e-06, |
|
"loss": 0.8444, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.013397394437771413, |
|
"grad_norm": 5.25, |
|
"learning_rate": 8.861538461538463e-06, |
|
"loss": 0.8523, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.013859373556315254, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 9.169230769230771e-06, |
|
"loss": 0.9402, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.014321352674859096, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 9.476923076923079e-06, |
|
"loss": 0.7411, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.014783331793402937, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 9.784615384615387e-06, |
|
"loss": 0.8616, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.01524531091194678, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.0092307692307693e-05, |
|
"loss": 0.8655, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.015707290030490623, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.04e-05, |
|
"loss": 0.8105, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.016169269149034465, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 1.0707692307692308e-05, |
|
"loss": 0.8538, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.016631248267578305, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 1.1015384615384616e-05, |
|
"loss": 0.8744, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.017093227386122147, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.1323076923076924e-05, |
|
"loss": 0.8101, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.01755520650466599, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.1630769230769231e-05, |
|
"loss": 0.831, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.018017185623209832, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 1.1938461538461539e-05, |
|
"loss": 0.835, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.01847916474175367, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.2246153846153847e-05, |
|
"loss": 0.7559, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.018941143860297514, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.2553846153846155e-05, |
|
"loss": 0.7849, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.019403122978841357, |
|
"grad_norm": 3.625, |
|
"learning_rate": 1.2861538461538462e-05, |
|
"loss": 0.9112, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.0198651020973852, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.316923076923077e-05, |
|
"loss": 1.0893, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.02032708121592904, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 1.3476923076923078e-05, |
|
"loss": 0.7646, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.02078906033447288, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 1.3784615384615386e-05, |
|
"loss": 0.8373, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.021251039453016724, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 1.4092307692307693e-05, |
|
"loss": 0.8159, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.021713018571560567, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.4400000000000001e-05, |
|
"loss": 0.8434, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.022174997690104406, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.4707692307692309e-05, |
|
"loss": 0.8048, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.02263697680864825, |
|
"grad_norm": 5.375, |
|
"learning_rate": 1.5015384615384617e-05, |
|
"loss": 0.8276, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.02309895592719209, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 1.5323076923076926e-05, |
|
"loss": 0.9429, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.023560935045735934, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 1.5630769230769232e-05, |
|
"loss": 0.8995, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.024022914164279773, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.593846153846154e-05, |
|
"loss": 0.8191, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.024484893282823616, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.6246153846153848e-05, |
|
"loss": 0.9168, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.02494687240136746, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.6553846153846157e-05, |
|
"loss": 0.8813, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.0254088515199113, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 1.6861538461538463e-05, |
|
"loss": 0.8089, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.02587083063845514, |
|
"grad_norm": 7.625, |
|
"learning_rate": 1.7169230769230772e-05, |
|
"loss": 0.7353, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.026332809756998983, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.747692307692308e-05, |
|
"loss": 0.7367, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.026794788875542826, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.7784615384615388e-05, |
|
"loss": 0.7641, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.02725676799408667, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 1.8092307692307694e-05, |
|
"loss": 0.7952, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.027718747112630508, |
|
"grad_norm": 4.75, |
|
"learning_rate": 1.8400000000000003e-05, |
|
"loss": 0.8539, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.02818072623117435, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.870769230769231e-05, |
|
"loss": 0.8364, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.028642705349718193, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 1.901538461538462e-05, |
|
"loss": 0.9327, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.029104684468262036, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.9323076923076925e-05, |
|
"loss": 0.7912, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.029566663586805875, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.9630769230769234e-05, |
|
"loss": 1.0159, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.030028642705349717, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.993846153846154e-05, |
|
"loss": 0.8499, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.03049062182389356, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 1.9999992835654137e-05, |
|
"loss": 0.8496, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.030952600942437403, |
|
"grad_norm": 3.90625, |
|
"learning_rate": 1.999996373051665e-05, |
|
"loss": 0.8142, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.031414580060981245, |
|
"grad_norm": 4.0, |
|
"learning_rate": 1.9999912236881042e-05, |
|
"loss": 0.8672, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.03187655917952509, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 1.99998383548626e-05, |
|
"loss": 0.9359, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.03233853829806893, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.9999742084626726e-05, |
|
"loss": 0.9701, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.032800517416612766, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 1.9999623426388963e-05, |
|
"loss": 0.7881, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.03326249653515661, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 1.9999482380414973e-05, |
|
"loss": 0.7965, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.03372447565370045, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.9999318947020527e-05, |
|
"loss": 0.7915, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.034186454772244294, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 1.999913312657154e-05, |
|
"loss": 0.9149, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.03464843389078814, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 1.9998924919484034e-05, |
|
"loss": 0.7844, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.03511041300933198, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.9998694326224154e-05, |
|
"loss": 0.7716, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.03557239212787582, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.9998441347308164e-05, |
|
"loss": 0.7758, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.036034371246419665, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.999816598330245e-05, |
|
"loss": 0.8478, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.0364963503649635, |
|
"grad_norm": 5.0, |
|
"learning_rate": 1.9997868234823513e-05, |
|
"loss": 0.7166, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.03695832948350734, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.999754810253797e-05, |
|
"loss": 0.8396, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.037420308602051186, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.999720558716255e-05, |
|
"loss": 0.9007, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.03788228772059503, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.99968406894641e-05, |
|
"loss": 0.8649, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.03834426683913887, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.999645341025957e-05, |
|
"loss": 0.8454, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.038806245957682714, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.9996043750416026e-05, |
|
"loss": 0.8146, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.03926822507622656, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.9995611710850634e-05, |
|
"loss": 0.7879, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.0397302041947704, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.9995157292530672e-05, |
|
"loss": 0.8767, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.040192183313314235, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.999468049647352e-05, |
|
"loss": 0.7827, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.04065416243185808, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.9994181323746652e-05, |
|
"loss": 0.9704, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.04111614155040192, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.9993659775467648e-05, |
|
"loss": 0.7169, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.04157812066894576, |
|
"grad_norm": 7.40625, |
|
"learning_rate": 1.999311585280418e-05, |
|
"loss": 1.0719, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.042040099787489606, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.9992549556974015e-05, |
|
"loss": 0.8138, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.04250207890603345, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 1.9991960889245005e-05, |
|
"loss": 0.724, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.04296405802457729, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.99913498509351e-05, |
|
"loss": 0.7338, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.043426037143121134, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 1.999071644341232e-05, |
|
"loss": 0.7862, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.04388801626166497, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 1.9990060668094778e-05, |
|
"loss": 1.0113, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.04434999538020881, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 1.998938252645066e-05, |
|
"loss": 0.7534, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.044811974498752655, |
|
"grad_norm": 5.5, |
|
"learning_rate": 1.9988682019998236e-05, |
|
"loss": 0.8548, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.0452739536172965, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.9987959150305834e-05, |
|
"loss": 0.9163, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.04573593273584034, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.9987213918991855e-05, |
|
"loss": 0.7051, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.04619791185438418, |
|
"grad_norm": 7.71875, |
|
"learning_rate": 1.998644632772477e-05, |
|
"loss": 0.7463, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04619791185438418, |
|
"eval_loss": 0.8494759798049927, |
|
"eval_runtime": 442.9436, |
|
"eval_samples_per_second": 20.578, |
|
"eval_steps_per_second": 2.574, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.046659890972928025, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.998565637822311e-05, |
|
"loss": 0.8329, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.04712187009147187, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.998484407225545e-05, |
|
"loss": 0.7516, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.04758384921001571, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.9984009411640433e-05, |
|
"loss": 0.6933, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.048045828328559546, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.9983152398246747e-05, |
|
"loss": 0.6367, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.04850780744710339, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.998227303399312e-05, |
|
"loss": 0.8222, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.04896978656564723, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.9981371320848327e-05, |
|
"loss": 0.7939, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.049431765684191074, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.9980447260831177e-05, |
|
"loss": 0.8247, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.04989374480273492, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.99795008560105e-05, |
|
"loss": 0.7723, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.05035572392127876, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 1.997853210850517e-05, |
|
"loss": 0.7777, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.0508177030398226, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 1.9977541020484078e-05, |
|
"loss": 0.9256, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.051279682158366445, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.9976527594166116e-05, |
|
"loss": 0.841, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.05174166127691028, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.9975491831820216e-05, |
|
"loss": 0.8144, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.05220364039545412, |
|
"grad_norm": 6.125, |
|
"learning_rate": 1.9974433735765297e-05, |
|
"loss": 0.8988, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.052665619513997966, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.9973353308370282e-05, |
|
"loss": 0.9735, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.05312759863254181, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.99722505520541e-05, |
|
"loss": 0.7992, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.05358957775108565, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.9971125469285663e-05, |
|
"loss": 0.7968, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.054051556869629494, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.996997806258387e-05, |
|
"loss": 0.7315, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.05451353598817334, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 1.9968808334517607e-05, |
|
"loss": 0.8133, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.05497551510671718, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 1.9967616287705724e-05, |
|
"loss": 0.8487, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.055437494225261015, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 1.9966401924817042e-05, |
|
"loss": 0.8386, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.05589947334380486, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.9965165248570357e-05, |
|
"loss": 0.911, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.0563614524623487, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 1.9963906261734404e-05, |
|
"loss": 0.8836, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.05682343158089254, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.9962624967127877e-05, |
|
"loss": 0.7977, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.057285410699436386, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 1.9961321367619413e-05, |
|
"loss": 0.8145, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.05774738981798023, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 1.9959995466127582e-05, |
|
"loss": 0.7705, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.05820936893652407, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.99586472656209e-05, |
|
"loss": 0.7896, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.058671348055067914, |
|
"grad_norm": 6.625, |
|
"learning_rate": 1.9957276769117785e-05, |
|
"loss": 0.8064, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.05913332717361175, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.9955883979686587e-05, |
|
"loss": 0.7744, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.05959530629215559, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 1.9954468900445567e-05, |
|
"loss": 0.8338, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.060057285410699435, |
|
"grad_norm": 3.96875, |
|
"learning_rate": 1.9953031534562884e-05, |
|
"loss": 0.7888, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.06051926452924328, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.9951571885256594e-05, |
|
"loss": 0.9061, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.06098124364778712, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 1.995008995579465e-05, |
|
"loss": 0.8619, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.06144322276633096, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.9948585749494877e-05, |
|
"loss": 0.7711, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.061905201884874805, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.9947059269724983e-05, |
|
"loss": 0.7418, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.06236718100341865, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 1.9945510519902533e-05, |
|
"loss": 0.9288, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.06282916012196249, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.994393950349496e-05, |
|
"loss": 0.8096, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.06329113924050633, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.9942346224019557e-05, |
|
"loss": 0.852, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.06375311835905018, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 1.9940730685043435e-05, |
|
"loss": 0.942, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.06421509747759402, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 1.9939092890183562e-05, |
|
"loss": 0.8851, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.06467707659613786, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.9937432843106733e-05, |
|
"loss": 0.7966, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.06513905571468169, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.9935750547529547e-05, |
|
"loss": 0.9728, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.06560103483322553, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.9934046007218437e-05, |
|
"loss": 0.8777, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.06606301395176938, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.993231922598962e-05, |
|
"loss": 0.8269, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.06652499307031322, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.993057020770911e-05, |
|
"loss": 0.7792, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.06698697218885706, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.9928798956292722e-05, |
|
"loss": 0.8521, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.0674489513074009, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.9927005475706024e-05, |
|
"loss": 0.7401, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.06791093042594475, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.9925189769964374e-05, |
|
"loss": 0.7713, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.06837290954448859, |
|
"grad_norm": 8.375, |
|
"learning_rate": 1.992335184313287e-05, |
|
"loss": 0.8873, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.06883488866303243, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.992149169932638e-05, |
|
"loss": 0.7554, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.06929686778157627, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 1.9919609342709493e-05, |
|
"loss": 0.8749, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.06975884690012012, |
|
"grad_norm": 4.75, |
|
"learning_rate": 1.991770477749654e-05, |
|
"loss": 0.8489, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.07022082601866396, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.9915778007951572e-05, |
|
"loss": 0.7187, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.0706828051372078, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.9913829038388355e-05, |
|
"loss": 0.8325, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.07114478425575164, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.9911857873170352e-05, |
|
"loss": 0.8896, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.07160676337429549, |
|
"grad_norm": 7.1875, |
|
"learning_rate": 1.9909864516710724e-05, |
|
"loss": 0.7469, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.07206874249283933, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.9907848973472307e-05, |
|
"loss": 0.9805, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.07253072161138316, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 1.9905811247967623e-05, |
|
"loss": 0.806, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.072992700729927, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.990375134475885e-05, |
|
"loss": 0.8426, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.07345467984847084, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.9901669268457814e-05, |
|
"loss": 0.7435, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.07391665896701469, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.9899565023725992e-05, |
|
"loss": 0.8943, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07437863808555853, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 1.989743861527448e-05, |
|
"loss": 0.9149, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.07484061720410237, |
|
"grad_norm": 5.375, |
|
"learning_rate": 1.989529004786402e-05, |
|
"loss": 0.8323, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.07530259632264621, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 1.9893119326304938e-05, |
|
"loss": 0.7721, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.07576457544119006, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.9890926455457172e-05, |
|
"loss": 0.7716, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.0762265545597339, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 1.9888711440230258e-05, |
|
"loss": 0.7043, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.07668853367827774, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.9886474285583283e-05, |
|
"loss": 0.777, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.07715051279682159, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.9884214996524935e-05, |
|
"loss": 0.9524, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.07761249191536543, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.988193357811343e-05, |
|
"loss": 0.7875, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.07807447103390927, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.987963003545655e-05, |
|
"loss": 0.8051, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.07853645015245311, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.9877304373711588e-05, |
|
"loss": 0.8002, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.07899842927099696, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.9874956598085378e-05, |
|
"loss": 0.7691, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.0794604083895408, |
|
"grad_norm": 5.625, |
|
"learning_rate": 1.9872586713834253e-05, |
|
"loss": 0.8483, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.07992238750808464, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.987019472626405e-05, |
|
"loss": 0.8258, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.08038436662662847, |
|
"grad_norm": 6.75, |
|
"learning_rate": 1.986778064073009e-05, |
|
"loss": 0.9104, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.08084634574517231, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.9865344462637163e-05, |
|
"loss": 0.765, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.08130832486371616, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 1.9862886197439525e-05, |
|
"loss": 0.8766, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.08177030398226, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 1.9860405850640888e-05, |
|
"loss": 0.7801, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.08223228310080384, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.9857903427794393e-05, |
|
"loss": 0.8898, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.08269426221934768, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 1.985537893450261e-05, |
|
"loss": 0.7725, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.08315624133789153, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.985283237641752e-05, |
|
"loss": 0.8464, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.08361822045643537, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.9850263759240507e-05, |
|
"loss": 0.7433, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.08408019957497921, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 1.9847673088722337e-05, |
|
"loss": 0.7768, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.08454217869352305, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.9845060370663157e-05, |
|
"loss": 0.9128, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.0850041578120669, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 1.9842425610912467e-05, |
|
"loss": 0.7274, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.08546613693061074, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 1.983976881536912e-05, |
|
"loss": 0.7639, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.08592811604915458, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 1.9837089989981307e-05, |
|
"loss": 0.7655, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.08639009516769842, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 1.983438914074654e-05, |
|
"loss": 0.7869, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.08685207428624227, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 1.9831666273711628e-05, |
|
"loss": 0.9739, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.08731405340478611, |
|
"grad_norm": 7.21875, |
|
"learning_rate": 1.982892139497269e-05, |
|
"loss": 1.0108, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.08777603252332994, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.9826154510675118e-05, |
|
"loss": 0.8337, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.08823801164187378, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 1.9823365627013573e-05, |
|
"loss": 0.8139, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.08869999076041762, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.9820554750231968e-05, |
|
"loss": 0.7524, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.08916196987896147, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.981772188662346e-05, |
|
"loss": 0.8152, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.08962394899750531, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.981486704253042e-05, |
|
"loss": 0.8497, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.09008592811604915, |
|
"grad_norm": 7.1875, |
|
"learning_rate": 1.981199022434445e-05, |
|
"loss": 0.8954, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.090547907234593, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.9809091438506333e-05, |
|
"loss": 0.8133, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.09100988635313684, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.980617069150603e-05, |
|
"loss": 0.7652, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.09147186547168068, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.9803227989882693e-05, |
|
"loss": 0.7811, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.09193384459022452, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.9800263340224603e-05, |
|
"loss": 0.8253, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.09239582370876837, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.9797276749169192e-05, |
|
"loss": 0.8832, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.09239582370876837, |
|
"eval_loss": 0.8367779850959778, |
|
"eval_runtime": 442.761, |
|
"eval_samples_per_second": 20.587, |
|
"eval_steps_per_second": 2.575, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.09285780282731221, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.9794268223403012e-05, |
|
"loss": 1.0233, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.09331978194585605, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 1.9791237769661728e-05, |
|
"loss": 0.8371, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.0937817610643999, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 1.978818539473009e-05, |
|
"loss": 1.0483, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.09424374018294374, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.9785111105441942e-05, |
|
"loss": 0.7854, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.09470571930148758, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 1.9782014908680167e-05, |
|
"loss": 0.6771, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.09516769842003142, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 1.977889681137672e-05, |
|
"loss": 0.7874, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.09562967753857525, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 1.9775756820512574e-05, |
|
"loss": 0.9448, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.09609165665711909, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 1.9772594943117723e-05, |
|
"loss": 0.8937, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.09655363577566294, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 1.9769411186271162e-05, |
|
"loss": 0.9173, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.09701561489420678, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 1.976620555710087e-05, |
|
"loss": 0.8327, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.09747759401275062, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.9762978062783793e-05, |
|
"loss": 0.7447, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.09793957313129446, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.9759728710545836e-05, |
|
"loss": 0.7932, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.0984015522498383, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.9756457507661833e-05, |
|
"loss": 0.8749, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.09886353136838215, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 1.9753164461455548e-05, |
|
"loss": 0.7447, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.09932551048692599, |
|
"grad_norm": 5.375, |
|
"learning_rate": 1.974984957929964e-05, |
|
"loss": 0.7091, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.09978748960546983, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.9746512868615656e-05, |
|
"loss": 0.9072, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.10024946872401368, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.9743154336874024e-05, |
|
"loss": 0.7947, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.10071144784255752, |
|
"grad_norm": 4.75, |
|
"learning_rate": 1.9739773991594017e-05, |
|
"loss": 0.9017, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.10117342696110136, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.9736371840343745e-05, |
|
"loss": 0.7268, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.1016354060796452, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.9732947890740143e-05, |
|
"loss": 0.7386, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.10209738519818905, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 1.972950215044895e-05, |
|
"loss": 0.7445, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.10255936431673289, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.9726034627184685e-05, |
|
"loss": 0.8159, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.10302134343527672, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 1.9722545328710643e-05, |
|
"loss": 0.7683, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.10348332255382056, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 1.971903426283887e-05, |
|
"loss": 0.9848, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.1039453016723644, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.971550143743014e-05, |
|
"loss": 0.7591, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.10440728079090825, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 1.971194686039394e-05, |
|
"loss": 0.7353, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.10486925990945209, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.9708370539688476e-05, |
|
"loss": 0.8047, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.10533123902799593, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.9704772483320616e-05, |
|
"loss": 0.8734, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.10579321814653977, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.9701152699345898e-05, |
|
"loss": 1.0071, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.10625519726508362, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 1.9697511195868504e-05, |
|
"loss": 0.9231, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.10671717638362746, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.969384798104124e-05, |
|
"loss": 0.7658, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.1071791555021713, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.9690163063065532e-05, |
|
"loss": 0.8602, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.10764113462071515, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 1.9686456450191372e-05, |
|
"loss": 0.8779, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.10810311373925899, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 1.968272815071736e-05, |
|
"loss": 0.8708, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.10856509285780283, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 1.9678978172990612e-05, |
|
"loss": 0.8517, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.10902707197634667, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 1.9675206525406803e-05, |
|
"loss": 0.8727, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.10948905109489052, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.9671413216410116e-05, |
|
"loss": 0.7547, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.10995103021343436, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 1.966759825449323e-05, |
|
"loss": 0.7722, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.1104130093319782, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 1.9663761648197302e-05, |
|
"loss": 0.9483, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.11087498845052203, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 1.965990340611195e-05, |
|
"loss": 0.846, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.11133696756906587, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.9656023536875227e-05, |
|
"loss": 0.8802, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.11179894668760972, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 1.965212204917361e-05, |
|
"loss": 0.8081, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.11226092580615356, |
|
"grad_norm": 5.0, |
|
"learning_rate": 1.964819895174198e-05, |
|
"loss": 0.8051, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.1127229049246974, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 1.964425425336359e-05, |
|
"loss": 0.8133, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.11318488404324124, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 1.9640287962870063e-05, |
|
"loss": 0.8061, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.11364686316178509, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 1.9636300089141355e-05, |
|
"loss": 0.7362, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.11410884228032893, |
|
"grad_norm": 7.625, |
|
"learning_rate": 1.9632290641105754e-05, |
|
"loss": 1.0105, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.11457082139887277, |
|
"grad_norm": 4.75, |
|
"learning_rate": 1.962825962773984e-05, |
|
"loss": 0.6403, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.11503280051741661, |
|
"grad_norm": 5.75, |
|
"learning_rate": 1.962420705806848e-05, |
|
"loss": 0.8769, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.11549477963596046, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.9620132941164806e-05, |
|
"loss": 0.8211, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.1159567587545043, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.961603728615018e-05, |
|
"loss": 0.8711, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.11641873787304814, |
|
"grad_norm": 6.375, |
|
"learning_rate": 1.96119201021942e-05, |
|
"loss": 0.8252, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.11688071699159198, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.9607781398514646e-05, |
|
"loss": 0.8147, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.11734269611013583, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.9603621184377498e-05, |
|
"loss": 0.7746, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.11780467522867967, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.9599439469096876e-05, |
|
"loss": 0.965, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.1182666543472235, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.9595236262035057e-05, |
|
"loss": 0.711, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.11872863346576734, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 1.959101157260241e-05, |
|
"loss": 0.7464, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.11919061258431118, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.9586765410257424e-05, |
|
"loss": 0.7712, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.11965259170285503, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 1.958249778450665e-05, |
|
"loss": 0.8344, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.12011457082139887, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 1.95782087049047e-05, |
|
"loss": 0.7442, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.12057654993994271, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 1.957389818105421e-05, |
|
"loss": 0.818, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.12103852905848655, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.9569566222605832e-05, |
|
"loss": 0.6755, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.1215005081770304, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.9565212839258204e-05, |
|
"loss": 0.8302, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.12196248729557424, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 1.9560838040757933e-05, |
|
"loss": 0.7821, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.12242446641411808, |
|
"grad_norm": 5.75, |
|
"learning_rate": 1.955644183689957e-05, |
|
"loss": 0.8591, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.12288644553266193, |
|
"grad_norm": 4.0, |
|
"learning_rate": 1.9552024237525597e-05, |
|
"loss": 0.7422, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.12334842465120577, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 1.9547585252526388e-05, |
|
"loss": 0.7762, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.12381040376974961, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.9543124891840196e-05, |
|
"loss": 0.8118, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.12427238288829345, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.9538643165453138e-05, |
|
"loss": 0.7443, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.1247343620068373, |
|
"grad_norm": 5.375, |
|
"learning_rate": 1.9534140083399165e-05, |
|
"loss": 0.864, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.12519634112538114, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 1.9529615655760034e-05, |
|
"loss": 0.7972, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.12565832024392498, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 1.9525069892665295e-05, |
|
"loss": 0.8191, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.12612029936246882, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.952050280429227e-05, |
|
"loss": 0.7575, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.12658227848101267, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.9515914400866022e-05, |
|
"loss": 0.817, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.1270442575995565, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.951130469265933e-05, |
|
"loss": 0.9624, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.12750623671810035, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.9506673689992673e-05, |
|
"loss": 0.8555, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.1279682158366442, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 1.950202140323422e-05, |
|
"loss": 0.803, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.12843019495518804, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.9497347842799767e-05, |
|
"loss": 0.8625, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.12889217407373188, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 1.9492653019152762e-05, |
|
"loss": 0.7429, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.12935415319227572, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 1.9487936942804237e-05, |
|
"loss": 0.8089, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.12981613231081954, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 1.948319962431283e-05, |
|
"loss": 0.8077, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.13027811142936338, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.9478441074284713e-05, |
|
"loss": 0.8612, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.13074009054790722, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.947366130337361e-05, |
|
"loss": 0.7953, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.13120206966645107, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.9468860322280746e-05, |
|
"loss": 0.8029, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.1316640487849949, |
|
"grad_norm": 6.0, |
|
"learning_rate": 1.946403814175484e-05, |
|
"loss": 0.9268, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.13212602790353875, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 1.9459194772592062e-05, |
|
"loss": 0.8112, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.1325880070220826, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.9454330225636035e-05, |
|
"loss": 0.6757, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.13304998614062644, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.944944451177778e-05, |
|
"loss": 0.784, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.13351196525917028, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.9444537641955725e-05, |
|
"loss": 0.9202, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.13397394437771412, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.943960962715565e-05, |
|
"loss": 0.8239, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.13443592349625796, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.9434660478410676e-05, |
|
"loss": 0.8025, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.1348979026148018, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 1.9429690206801255e-05, |
|
"loss": 0.8056, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.13535988173334565, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 1.942469882345511e-05, |
|
"loss": 0.9176, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.1358218608518895, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.941968633954724e-05, |
|
"loss": 0.8589, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.13628383997043333, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.9414652766299887e-05, |
|
"loss": 0.9654, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.13674581908897718, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.9409598114982503e-05, |
|
"loss": 0.7178, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.13720779820752102, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.9404522396911742e-05, |
|
"loss": 0.8832, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.13766977732606486, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.9399425623451405e-05, |
|
"loss": 0.8601, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.1381317564446087, |
|
"grad_norm": 4.0, |
|
"learning_rate": 1.9394307806012454e-05, |
|
"loss": 0.6866, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.13859373556315255, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.9389168956052945e-05, |
|
"loss": 0.7372, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.13859373556315255, |
|
"eval_loss": 0.8288407325744629, |
|
"eval_runtime": 443.0014, |
|
"eval_samples_per_second": 20.576, |
|
"eval_steps_per_second": 2.573, |
|
"step": 1500 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 10823, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.01 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 1 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.4949569191936e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|