|
{ |
|
"best_metric": 0.010564341209828854, |
|
"best_model_checkpoint": "/home/paperspace/Data/models/relianceV2/llm3br256/checkpoint-145", |
|
"epoch": 4.957264957264957, |
|
"eval_steps": 5, |
|
"global_step": 145, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03418803418803419, |
|
"grad_norm": 0.14103831350803375, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.0642, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.06837606837606838, |
|
"grad_norm": 0.14193229377269745, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.0655, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.10256410256410256, |
|
"grad_norm": 0.14674556255340576, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0667, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.13675213675213677, |
|
"grad_norm": 0.12595196068286896, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 0.0607, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.17094017094017094, |
|
"grad_norm": 0.0885230153799057, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.0529, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.17094017094017094, |
|
"eval_loss": 0.04289048910140991, |
|
"eval_runtime": 19.6377, |
|
"eval_samples_per_second": 2.546, |
|
"eval_steps_per_second": 0.662, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.20512820512820512, |
|
"grad_norm": 0.06661484390497208, |
|
"learning_rate": 4e-05, |
|
"loss": 0.0417, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.23931623931623933, |
|
"grad_norm": 0.08302908390760422, |
|
"learning_rate": 4.666666666666667e-05, |
|
"loss": 0.0405, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.27350427350427353, |
|
"grad_norm": 0.07343365997076035, |
|
"learning_rate": 5.333333333333333e-05, |
|
"loss": 0.0381, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 0.05457036942243576, |
|
"learning_rate": 6e-05, |
|
"loss": 0.0378, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.3418803418803419, |
|
"grad_norm": 0.04456571117043495, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.0333, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3418803418803419, |
|
"eval_loss": 0.031967516988515854, |
|
"eval_runtime": 20.2255, |
|
"eval_samples_per_second": 2.472, |
|
"eval_steps_per_second": 0.643, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.37606837606837606, |
|
"grad_norm": 0.04081442579627037, |
|
"learning_rate": 7.333333333333333e-05, |
|
"loss": 0.03, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.41025641025641024, |
|
"grad_norm": 0.03535589948296547, |
|
"learning_rate": 8e-05, |
|
"loss": 0.0308, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.03202543035149574, |
|
"learning_rate": 8.666666666666667e-05, |
|
"loss": 0.0273, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.47863247863247865, |
|
"grad_norm": 0.031043237075209618, |
|
"learning_rate": 9.333333333333334e-05, |
|
"loss": 0.0232, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.5128205128205128, |
|
"grad_norm": 0.027209093794226646, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0247, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.5128205128205128, |
|
"eval_loss": 0.025287916883826256, |
|
"eval_runtime": 20.2717, |
|
"eval_samples_per_second": 2.466, |
|
"eval_steps_per_second": 0.641, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.5470085470085471, |
|
"grad_norm": 0.0389866903424263, |
|
"learning_rate": 9.998540070400966e-05, |
|
"loss": 0.0256, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.5811965811965812, |
|
"grad_norm": 0.02687215991318226, |
|
"learning_rate": 9.994161134161634e-05, |
|
"loss": 0.0215, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 0.026792502030730247, |
|
"learning_rate": 9.986865748457457e-05, |
|
"loss": 0.0268, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.6495726495726496, |
|
"grad_norm": 0.028072144836187363, |
|
"learning_rate": 9.976658173588244e-05, |
|
"loss": 0.0216, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.6837606837606838, |
|
"grad_norm": 0.024400506168603897, |
|
"learning_rate": 9.96354437049027e-05, |
|
"loss": 0.0228, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.6837606837606838, |
|
"eval_loss": 0.02183925360441208, |
|
"eval_runtime": 20.441, |
|
"eval_samples_per_second": 2.446, |
|
"eval_steps_per_second": 0.636, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.717948717948718, |
|
"grad_norm": 0.023090992122888565, |
|
"learning_rate": 9.947531997255256e-05, |
|
"loss": 0.0211, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.7521367521367521, |
|
"grad_norm": 0.021273907274007797, |
|
"learning_rate": 9.928630404658255e-05, |
|
"loss": 0.0191, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.7863247863247863, |
|
"grad_norm": 0.021417921409010887, |
|
"learning_rate": 9.906850630697068e-05, |
|
"loss": 0.02, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.8205128205128205, |
|
"grad_norm": 0.02200082130730152, |
|
"learning_rate": 9.882205394146361e-05, |
|
"loss": 0.0232, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.8547008547008547, |
|
"grad_norm": 0.02200504206120968, |
|
"learning_rate": 9.85470908713026e-05, |
|
"loss": 0.0196, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8547008547008547, |
|
"eval_loss": 0.019536755979061127, |
|
"eval_runtime": 20.402, |
|
"eval_samples_per_second": 2.451, |
|
"eval_steps_per_second": 0.637, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.02056378312408924, |
|
"learning_rate": 9.824377766717759e-05, |
|
"loss": 0.0208, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 0.018094003200531006, |
|
"learning_rate": 9.791229145545831e-05, |
|
"loss": 0.0167, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.9572649572649573, |
|
"grad_norm": 0.017437463626265526, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 0.0178, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.9914529914529915, |
|
"grad_norm": 0.016863780096173286, |
|
"learning_rate": 9.716559066288715e-05, |
|
"loss": 0.0163, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"grad_norm": 0.043561145663261414, |
|
"learning_rate": 9.675081213427076e-05, |
|
"loss": 0.0342, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"eval_loss": 0.018172133713960648, |
|
"eval_runtime": 20.3342, |
|
"eval_samples_per_second": 2.459, |
|
"eval_steps_per_second": 0.639, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.0598290598290598, |
|
"grad_norm": 0.024272920563817024, |
|
"learning_rate": 9.630873244788883e-05, |
|
"loss": 0.016, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.0940170940170941, |
|
"grad_norm": 0.025347765535116196, |
|
"learning_rate": 9.583960976582913e-05, |
|
"loss": 0.016, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.1282051282051282, |
|
"grad_norm": 0.026027293875813484, |
|
"learning_rate": 9.534371804252728e-05, |
|
"loss": 0.0178, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.1623931623931625, |
|
"grad_norm": 0.021461905911564827, |
|
"learning_rate": 9.482134686478519e-05, |
|
"loss": 0.0166, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.1965811965811965, |
|
"grad_norm": 0.025416361168026924, |
|
"learning_rate": 9.42728012826605e-05, |
|
"loss": 0.014, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.1965811965811965, |
|
"eval_loss": 0.016842123121023178, |
|
"eval_runtime": 20.4307, |
|
"eval_samples_per_second": 2.447, |
|
"eval_steps_per_second": 0.636, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 0.021278902888298035, |
|
"learning_rate": 9.36984016313259e-05, |
|
"loss": 0.0147, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.264957264957265, |
|
"grad_norm": 0.023418180644512177, |
|
"learning_rate": 9.309848334400246e-05, |
|
"loss": 0.0155, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.2991452991452992, |
|
"grad_norm": 0.021139057353138924, |
|
"learning_rate": 9.247339675607605e-05, |
|
"loss": 0.015, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.021048279479146004, |
|
"learning_rate": 9.182350690051133e-05, |
|
"loss": 0.0148, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.3675213675213675, |
|
"grad_norm": 0.022858263924717903, |
|
"learning_rate": 9.114919329468282e-05, |
|
"loss": 0.0142, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.3675213675213675, |
|
"eval_loss": 0.01600724086165428, |
|
"eval_runtime": 20.1138, |
|
"eval_samples_per_second": 2.486, |
|
"eval_steps_per_second": 0.646, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.4017094017094016, |
|
"grad_norm": 0.020670941099524498, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 0.0157, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.435897435897436, |
|
"grad_norm": 0.01937521994113922, |
|
"learning_rate": 8.972888398568772e-05, |
|
"loss": 0.0181, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.4700854700854702, |
|
"grad_norm": 0.02165701799094677, |
|
"learning_rate": 8.898371770316111e-05, |
|
"loss": 0.014, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.5042735042735043, |
|
"grad_norm": 0.01963304542005062, |
|
"learning_rate": 8.821578602729242e-05, |
|
"loss": 0.0154, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 0.016389695927500725, |
|
"learning_rate": 8.742553740855506e-05, |
|
"loss": 0.0133, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"eval_loss": 0.014650734141469002, |
|
"eval_runtime": 20.2255, |
|
"eval_samples_per_second": 2.472, |
|
"eval_steps_per_second": 0.643, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.5726495726495726, |
|
"grad_norm": 0.02393350377678871, |
|
"learning_rate": 8.661343332988869e-05, |
|
"loss": 0.015, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.606837606837607, |
|
"grad_norm": 0.01642770506441593, |
|
"learning_rate": 8.577994803720606e-05, |
|
"loss": 0.0149, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.641025641025641, |
|
"grad_norm": 0.018430177122354507, |
|
"learning_rate": 8.492556826244687e-05, |
|
"loss": 0.0142, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.6752136752136753, |
|
"grad_norm": 0.021527277305722237, |
|
"learning_rate": 8.405079293933986e-05, |
|
"loss": 0.0125, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.7094017094017095, |
|
"grad_norm": 0.01738244667649269, |
|
"learning_rate": 8.315613291203976e-05, |
|
"loss": 0.0122, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.7094017094017095, |
|
"eval_loss": 0.013878260739147663, |
|
"eval_runtime": 20.266, |
|
"eval_samples_per_second": 2.467, |
|
"eval_steps_per_second": 0.641, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.7435897435897436, |
|
"grad_norm": 0.016872745007276535, |
|
"learning_rate": 8.224211063680853e-05, |
|
"loss": 0.0136, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 0.01582753099501133, |
|
"learning_rate": 8.130925987691569e-05, |
|
"loss": 0.0125, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.811965811965812, |
|
"grad_norm": 0.01693882793188095, |
|
"learning_rate": 8.035812539093557e-05, |
|
"loss": 0.0105, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"grad_norm": 0.01629294827580452, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 0.0122, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.8803418803418803, |
|
"grad_norm": 0.02483530156314373, |
|
"learning_rate": 7.840323733655778e-05, |
|
"loss": 0.0136, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.8803418803418803, |
|
"eval_loss": 0.013426054269075394, |
|
"eval_runtime": 20.4254, |
|
"eval_samples_per_second": 2.448, |
|
"eval_steps_per_second": 0.636, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.9145299145299144, |
|
"grad_norm": 0.017920587211847305, |
|
"learning_rate": 7.740062536773352e-05, |
|
"loss": 0.0121, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.9487179487179487, |
|
"grad_norm": 0.018872473388910294, |
|
"learning_rate": 7.638201220530665e-05, |
|
"loss": 0.0124, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.982905982905983, |
|
"grad_norm": 0.018264025449752808, |
|
"learning_rate": 7.534799269067953e-05, |
|
"loss": 0.0125, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 2.017094017094017, |
|
"grad_norm": 0.042249832302331924, |
|
"learning_rate": 7.42991706621303e-05, |
|
"loss": 0.0197, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 2.051282051282051, |
|
"grad_norm": 0.025319905951619148, |
|
"learning_rate": 7.323615860218843e-05, |
|
"loss": 0.011, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.051282051282051, |
|
"eval_loss": 0.01329517550766468, |
|
"eval_runtime": 20.8923, |
|
"eval_samples_per_second": 2.393, |
|
"eval_steps_per_second": 0.622, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.0854700854700856, |
|
"grad_norm": 0.01969628967344761, |
|
"learning_rate": 7.215957727996207e-05, |
|
"loss": 0.0107, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 2.1196581196581197, |
|
"grad_norm": 0.021322010084986687, |
|
"learning_rate": 7.107005538862646e-05, |
|
"loss": 0.0107, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"grad_norm": 0.015889568254351616, |
|
"learning_rate": 6.996822917828477e-05, |
|
"loss": 0.0115, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.1880341880341883, |
|
"grad_norm": 0.014772447757422924, |
|
"learning_rate": 6.885474208441603e-05, |
|
"loss": 0.0105, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 0.02105395495891571, |
|
"learning_rate": 6.773024435212678e-05, |
|
"loss": 0.0098, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"eval_loss": 0.012847676873207092, |
|
"eval_runtime": 20.8651, |
|
"eval_samples_per_second": 2.396, |
|
"eval_steps_per_second": 0.623, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.2564102564102564, |
|
"grad_norm": 0.017252441495656967, |
|
"learning_rate": 6.659539265642643e-05, |
|
"loss": 0.0105, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.2905982905982905, |
|
"grad_norm": 0.015464423224329948, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 0.0083, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.324786324786325, |
|
"grad_norm": 0.022285865619778633, |
|
"learning_rate": 6.429728391993446e-05, |
|
"loss": 0.0116, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.358974358974359, |
|
"grad_norm": 0.022020753473043442, |
|
"learning_rate": 6.313536890992935e-05, |
|
"loss": 0.0091, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.393162393162393, |
|
"grad_norm": 0.022549916058778763, |
|
"learning_rate": 6.19657832143779e-05, |
|
"loss": 0.011, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.393162393162393, |
|
"eval_loss": 0.012613310478627682, |
|
"eval_runtime": 20.423, |
|
"eval_samples_per_second": 2.448, |
|
"eval_steps_per_second": 0.637, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.427350427350427, |
|
"grad_norm": 0.021075539290905, |
|
"learning_rate": 6.078920983839031e-05, |
|
"loss": 0.0105, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 0.02389822155237198, |
|
"learning_rate": 5.960633586768543e-05, |
|
"loss": 0.0105, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.4957264957264957, |
|
"grad_norm": 0.01924612931907177, |
|
"learning_rate": 5.841785206735192e-05, |
|
"loss": 0.0106, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.52991452991453, |
|
"grad_norm": 0.01965448632836342, |
|
"learning_rate": 5.7224452478461064e-05, |
|
"loss": 0.0108, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.564102564102564, |
|
"grad_norm": 0.015871981158852577, |
|
"learning_rate": 5.602683401276615e-05, |
|
"loss": 0.0087, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.564102564102564, |
|
"eval_loss": 0.012018020264804363, |
|
"eval_runtime": 20.4309, |
|
"eval_samples_per_second": 2.447, |
|
"eval_steps_per_second": 0.636, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.5982905982905984, |
|
"grad_norm": 0.017729448154568672, |
|
"learning_rate": 5.482569604572576e-05, |
|
"loss": 0.0085, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.6324786324786325, |
|
"grad_norm": 0.019757023081183434, |
|
"learning_rate": 5.3621740008088126e-05, |
|
"loss": 0.0111, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.016380244866013527, |
|
"learning_rate": 5.2415668976275355e-05, |
|
"loss": 0.0081, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.700854700854701, |
|
"grad_norm": 0.017829637974500656, |
|
"learning_rate": 5.1208187261806615e-05, |
|
"loss": 0.0112, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.735042735042735, |
|
"grad_norm": 0.019623294472694397, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0086, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.735042735042735, |
|
"eval_loss": 0.011721721850335598, |
|
"eval_runtime": 20.2095, |
|
"eval_samples_per_second": 2.474, |
|
"eval_steps_per_second": 0.643, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"grad_norm": 0.01674460805952549, |
|
"learning_rate": 4.87918127381934e-05, |
|
"loss": 0.009, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.8034188034188032, |
|
"grad_norm": 0.01912359707057476, |
|
"learning_rate": 4.758433102372466e-05, |
|
"loss": 0.01, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.8376068376068377, |
|
"grad_norm": 0.021485494449734688, |
|
"learning_rate": 4.6378259991911886e-05, |
|
"loss": 0.008, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.871794871794872, |
|
"grad_norm": 0.02604834921658039, |
|
"learning_rate": 4.5174303954274244e-05, |
|
"loss": 0.0093, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.905982905982906, |
|
"grad_norm": 0.01696549728512764, |
|
"learning_rate": 4.397316598723385e-05, |
|
"loss": 0.0091, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.905982905982906, |
|
"eval_loss": 0.01138822827488184, |
|
"eval_runtime": 20.1788, |
|
"eval_samples_per_second": 2.478, |
|
"eval_steps_per_second": 0.644, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.9401709401709404, |
|
"grad_norm": 0.02111894078552723, |
|
"learning_rate": 4.277554752153895e-05, |
|
"loss": 0.0098, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.9743589743589745, |
|
"grad_norm": 0.021606622263789177, |
|
"learning_rate": 4.1582147932648074e-05, |
|
"loss": 0.01, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 3.0085470085470085, |
|
"grad_norm": 0.04893641173839569, |
|
"learning_rate": 4.039366413231458e-05, |
|
"loss": 0.0134, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 3.0427350427350426, |
|
"grad_norm": 0.0206940695643425, |
|
"learning_rate": 3.92107901616097e-05, |
|
"loss": 0.0079, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 0.030171874910593033, |
|
"learning_rate": 3.803421678562213e-05, |
|
"loss": 0.009, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"eval_loss": 0.011335253715515137, |
|
"eval_runtime": 20.4524, |
|
"eval_samples_per_second": 2.445, |
|
"eval_steps_per_second": 0.636, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 3.111111111111111, |
|
"grad_norm": 0.02197062224149704, |
|
"learning_rate": 3.6864631090070655e-05, |
|
"loss": 0.0077, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 3.1452991452991452, |
|
"grad_norm": 0.018409984186291695, |
|
"learning_rate": 3.570271608006555e-05, |
|
"loss": 0.0076, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 3.1794871794871793, |
|
"grad_norm": 0.02173592895269394, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 0.007, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 3.213675213675214, |
|
"grad_norm": 0.030453743413090706, |
|
"learning_rate": 3.340460734357359e-05, |
|
"loss": 0.0072, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 3.247863247863248, |
|
"grad_norm": 0.01577071100473404, |
|
"learning_rate": 3.226975564787322e-05, |
|
"loss": 0.0062, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 3.247863247863248, |
|
"eval_loss": 0.011275053024291992, |
|
"eval_runtime": 11.7759, |
|
"eval_samples_per_second": 4.246, |
|
"eval_steps_per_second": 1.104, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 3.282051282051282, |
|
"grad_norm": 0.020528519526124, |
|
"learning_rate": 3.114525791558398e-05, |
|
"loss": 0.0077, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.316239316239316, |
|
"grad_norm": 0.028798261657357216, |
|
"learning_rate": 3.003177082171523e-05, |
|
"loss": 0.0076, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.3504273504273505, |
|
"grad_norm": 0.023175662383437157, |
|
"learning_rate": 2.8929944611373554e-05, |
|
"loss": 0.0072, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.3846153846153846, |
|
"grad_norm": 0.020334038883447647, |
|
"learning_rate": 2.784042272003794e-05, |
|
"loss": 0.0069, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.4188034188034186, |
|
"grad_norm": 0.024305541068315506, |
|
"learning_rate": 2.6763841397811573e-05, |
|
"loss": 0.0069, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.4188034188034186, |
|
"eval_loss": 0.011122054420411587, |
|
"eval_runtime": 9.713, |
|
"eval_samples_per_second": 5.148, |
|
"eval_steps_per_second": 1.338, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.452991452991453, |
|
"grad_norm": 0.022912999615073204, |
|
"learning_rate": 2.57008293378697e-05, |
|
"loss": 0.0094, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.4871794871794872, |
|
"grad_norm": 0.021239478141069412, |
|
"learning_rate": 2.4652007309320498e-05, |
|
"loss": 0.0064, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.5213675213675213, |
|
"grad_norm": 0.017512433230876923, |
|
"learning_rate": 2.361798779469336e-05, |
|
"loss": 0.0074, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.5555555555555554, |
|
"grad_norm": 0.018754741176962852, |
|
"learning_rate": 2.259937463226651e-05, |
|
"loss": 0.0063, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.58974358974359, |
|
"grad_norm": 0.019357949495315552, |
|
"learning_rate": 2.1596762663442218e-05, |
|
"loss": 0.0071, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.58974358974359, |
|
"eval_loss": 0.01076994277536869, |
|
"eval_runtime": 9.7069, |
|
"eval_samples_per_second": 5.151, |
|
"eval_steps_per_second": 1.339, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.623931623931624, |
|
"grad_norm": 0.02121426723897457, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 0.0078, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.658119658119658, |
|
"grad_norm": 0.020560307428240776, |
|
"learning_rate": 1.9641874609064443e-05, |
|
"loss": 0.0079, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.6923076923076925, |
|
"grad_norm": 0.024254970252513885, |
|
"learning_rate": 1.8690740123084316e-05, |
|
"loss": 0.0067, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.7264957264957266, |
|
"grad_norm": 0.020139144733548164, |
|
"learning_rate": 1.7757889363191483e-05, |
|
"loss": 0.0066, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.7606837606837606, |
|
"grad_norm": 0.01883375644683838, |
|
"learning_rate": 1.684386708796025e-05, |
|
"loss": 0.006, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.7606837606837606, |
|
"eval_loss": 0.010681088082492352, |
|
"eval_runtime": 9.7636, |
|
"eval_samples_per_second": 5.121, |
|
"eval_steps_per_second": 1.331, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.7948717948717947, |
|
"grad_norm": 0.017693698406219482, |
|
"learning_rate": 1.5949207060660138e-05, |
|
"loss": 0.0061, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.8290598290598292, |
|
"grad_norm": 0.020463695749640465, |
|
"learning_rate": 1.5074431737553157e-05, |
|
"loss": 0.0072, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.8632478632478633, |
|
"grad_norm": 0.020722465589642525, |
|
"learning_rate": 1.422005196279395e-05, |
|
"loss": 0.0091, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.8974358974358974, |
|
"grad_norm": 0.020328683778643608, |
|
"learning_rate": 1.338656667011134e-05, |
|
"loss": 0.0071, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.931623931623932, |
|
"grad_norm": 0.0204046331346035, |
|
"learning_rate": 1.257446259144494e-05, |
|
"loss": 0.0065, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.931623931623932, |
|
"eval_loss": 0.010614048689603806, |
|
"eval_runtime": 9.7105, |
|
"eval_samples_per_second": 5.149, |
|
"eval_steps_per_second": 1.339, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.965811965811966, |
|
"grad_norm": 0.018496442586183548, |
|
"learning_rate": 1.178421397270758e-05, |
|
"loss": 0.0059, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.04243512079119682, |
|
"learning_rate": 1.1016282296838887e-05, |
|
"loss": 0.0121, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 4.034188034188034, |
|
"grad_norm": 0.01636839285492897, |
|
"learning_rate": 1.0271116014312293e-05, |
|
"loss": 0.005, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 4.068376068376068, |
|
"grad_norm": 0.016252394765615463, |
|
"learning_rate": 9.549150281252633e-06, |
|
"loss": 0.0053, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 4.102564102564102, |
|
"grad_norm": 0.016149073839187622, |
|
"learning_rate": 8.850806705317183e-06, |
|
"loss": 0.0076, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 4.102564102564102, |
|
"eval_loss": 0.010566945187747478, |
|
"eval_runtime": 9.7295, |
|
"eval_samples_per_second": 5.139, |
|
"eval_steps_per_second": 1.336, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 4.136752136752137, |
|
"grad_norm": 0.01633269712328911, |
|
"learning_rate": 8.176493099488663e-06, |
|
"loss": 0.0072, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 4.170940170940171, |
|
"grad_norm": 0.018178582191467285, |
|
"learning_rate": 7.526603243923957e-06, |
|
"loss": 0.0061, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 4.205128205128205, |
|
"grad_norm": 0.017663611099123955, |
|
"learning_rate": 6.901516655997536e-06, |
|
"loss": 0.006, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 4.239316239316239, |
|
"grad_norm": 0.01672632433474064, |
|
"learning_rate": 6.301598368674105e-06, |
|
"loss": 0.0059, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 4.273504273504273, |
|
"grad_norm": 0.01625417172908783, |
|
"learning_rate": 5.727198717339511e-06, |
|
"loss": 0.0057, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 4.273504273504273, |
|
"eval_loss": 0.010598141700029373, |
|
"eval_runtime": 9.7399, |
|
"eval_samples_per_second": 5.133, |
|
"eval_steps_per_second": 1.335, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 4.3076923076923075, |
|
"grad_norm": 0.016820693388581276, |
|
"learning_rate": 5.178653135214812e-06, |
|
"loss": 0.0051, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 4.3418803418803416, |
|
"grad_norm": 0.017715787515044212, |
|
"learning_rate": 4.65628195747273e-06, |
|
"loss": 0.0058, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 4.3760683760683765, |
|
"grad_norm": 0.01836553029716015, |
|
"learning_rate": 4.16039023417088e-06, |
|
"loss": 0.0051, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 4.410256410256411, |
|
"grad_norm": 0.02160499058663845, |
|
"learning_rate": 3.691267552111183e-06, |
|
"loss": 0.0054, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 0.019846314564347267, |
|
"learning_rate": 3.249187865729264e-06, |
|
"loss": 0.0071, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"eval_loss": 0.010587843134999275, |
|
"eval_runtime": 9.707, |
|
"eval_samples_per_second": 5.151, |
|
"eval_steps_per_second": 1.339, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.478632478632479, |
|
"grad_norm": 0.01771419309079647, |
|
"learning_rate": 2.8344093371128424e-06, |
|
"loss": 0.005, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 4.512820512820513, |
|
"grad_norm": 0.015574207529425621, |
|
"learning_rate": 2.4471741852423237e-06, |
|
"loss": 0.0048, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 4.547008547008547, |
|
"grad_norm": 0.01756260357797146, |
|
"learning_rate": 2.087708544541689e-06, |
|
"loss": 0.0047, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 4.581196581196581, |
|
"grad_norm": 0.02206473983824253, |
|
"learning_rate": 1.7562223328224325e-06, |
|
"loss": 0.0053, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 0.017151422798633575, |
|
"learning_rate": 1.4529091286973995e-06, |
|
"loss": 0.0055, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"eval_loss": 0.010594615712761879, |
|
"eval_runtime": 9.7136, |
|
"eval_samples_per_second": 5.147, |
|
"eval_steps_per_second": 1.338, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.64957264957265, |
|
"grad_norm": 0.018389733508229256, |
|
"learning_rate": 1.1779460585363944e-06, |
|
"loss": 0.0053, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.683760683760684, |
|
"grad_norm": 0.016087768599390984, |
|
"learning_rate": 9.314936930293283e-07, |
|
"loss": 0.0048, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 4.717948717948718, |
|
"grad_norm": 0.019610892981290817, |
|
"learning_rate": 7.136959534174592e-07, |
|
"loss": 0.0056, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 4.752136752136752, |
|
"grad_norm": 0.018921220675110817, |
|
"learning_rate": 5.246800274474439e-07, |
|
"loss": 0.0065, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 4.786324786324786, |
|
"grad_norm": 0.020553983747959137, |
|
"learning_rate": 3.6455629509730136e-07, |
|
"loss": 0.0053, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.786324786324786, |
|
"eval_loss": 0.010573416948318481, |
|
"eval_runtime": 9.7174, |
|
"eval_samples_per_second": 5.145, |
|
"eval_steps_per_second": 1.338, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.82051282051282, |
|
"grad_norm": 0.015205741859972477, |
|
"learning_rate": 2.334182641175686e-07, |
|
"loss": 0.0038, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 4.854700854700854, |
|
"grad_norm": 0.017062170431017876, |
|
"learning_rate": 1.3134251542544774e-07, |
|
"loss": 0.006, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 4.888888888888889, |
|
"grad_norm": 0.01831440068781376, |
|
"learning_rate": 5.838865838366792e-08, |
|
"loss": 0.0063, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 4.923076923076923, |
|
"grad_norm": 0.019888985902071, |
|
"learning_rate": 1.4599295990352924e-08, |
|
"loss": 0.0068, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 4.957264957264957, |
|
"grad_norm": 0.01834353245794773, |
|
"learning_rate": 0.0, |
|
"loss": 0.0074, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 4.957264957264957, |
|
"eval_loss": 0.010564341209828854, |
|
"eval_runtime": 9.7196, |
|
"eval_samples_per_second": 5.144, |
|
"eval_steps_per_second": 1.338, |
|
"step": 145 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 145, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.332736073022669e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|