{
"best_metric": 0.010564341209828854,
"best_model_checkpoint": "/home/paperspace/Data/models/relianceV2/llm3br256/checkpoint-145",
"epoch": 4.957264957264957,
"eval_steps": 5,
"global_step": 145,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03418803418803419,
"grad_norm": 0.14103831350803375,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0642,
"step": 1
},
{
"epoch": 0.06837606837606838,
"grad_norm": 0.14193229377269745,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.0655,
"step": 2
},
{
"epoch": 0.10256410256410256,
"grad_norm": 0.14674556255340576,
"learning_rate": 2e-05,
"loss": 0.0667,
"step": 3
},
{
"epoch": 0.13675213675213677,
"grad_norm": 0.12595196068286896,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.0607,
"step": 4
},
{
"epoch": 0.17094017094017094,
"grad_norm": 0.0885230153799057,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.0529,
"step": 5
},
{
"epoch": 0.17094017094017094,
"eval_loss": 0.04289048910140991,
"eval_runtime": 19.6377,
"eval_samples_per_second": 2.546,
"eval_steps_per_second": 0.662,
"step": 5
},
{
"epoch": 0.20512820512820512,
"grad_norm": 0.06661484390497208,
"learning_rate": 4e-05,
"loss": 0.0417,
"step": 6
},
{
"epoch": 0.23931623931623933,
"grad_norm": 0.08302908390760422,
"learning_rate": 4.666666666666667e-05,
"loss": 0.0405,
"step": 7
},
{
"epoch": 0.27350427350427353,
"grad_norm": 0.07343365997076035,
"learning_rate": 5.333333333333333e-05,
"loss": 0.0381,
"step": 8
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.05457036942243576,
"learning_rate": 6e-05,
"loss": 0.0378,
"step": 9
},
{
"epoch": 0.3418803418803419,
"grad_norm": 0.04456571117043495,
"learning_rate": 6.666666666666667e-05,
"loss": 0.0333,
"step": 10
},
{
"epoch": 0.3418803418803419,
"eval_loss": 0.031967516988515854,
"eval_runtime": 20.2255,
"eval_samples_per_second": 2.472,
"eval_steps_per_second": 0.643,
"step": 10
},
{
"epoch": 0.37606837606837606,
"grad_norm": 0.04081442579627037,
"learning_rate": 7.333333333333333e-05,
"loss": 0.03,
"step": 11
},
{
"epoch": 0.41025641025641024,
"grad_norm": 0.03535589948296547,
"learning_rate": 8e-05,
"loss": 0.0308,
"step": 12
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.03202543035149574,
"learning_rate": 8.666666666666667e-05,
"loss": 0.0273,
"step": 13
},
{
"epoch": 0.47863247863247865,
"grad_norm": 0.031043237075209618,
"learning_rate": 9.333333333333334e-05,
"loss": 0.0232,
"step": 14
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.027209093794226646,
"learning_rate": 0.0001,
"loss": 0.0247,
"step": 15
},
{
"epoch": 0.5128205128205128,
"eval_loss": 0.025287916883826256,
"eval_runtime": 20.2717,
"eval_samples_per_second": 2.466,
"eval_steps_per_second": 0.641,
"step": 15
},
{
"epoch": 0.5470085470085471,
"grad_norm": 0.0389866903424263,
"learning_rate": 9.998540070400966e-05,
"loss": 0.0256,
"step": 16
},
{
"epoch": 0.5811965811965812,
"grad_norm": 0.02687215991318226,
"learning_rate": 9.994161134161634e-05,
"loss": 0.0215,
"step": 17
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.026792502030730247,
"learning_rate": 9.986865748457457e-05,
"loss": 0.0268,
"step": 18
},
{
"epoch": 0.6495726495726496,
"grad_norm": 0.028072144836187363,
"learning_rate": 9.976658173588244e-05,
"loss": 0.0216,
"step": 19
},
{
"epoch": 0.6837606837606838,
"grad_norm": 0.024400506168603897,
"learning_rate": 9.96354437049027e-05,
"loss": 0.0228,
"step": 20
},
{
"epoch": 0.6837606837606838,
"eval_loss": 0.02183925360441208,
"eval_runtime": 20.441,
"eval_samples_per_second": 2.446,
"eval_steps_per_second": 0.636,
"step": 20
},
{
"epoch": 0.717948717948718,
"grad_norm": 0.023090992122888565,
"learning_rate": 9.947531997255256e-05,
"loss": 0.0211,
"step": 21
},
{
"epoch": 0.7521367521367521,
"grad_norm": 0.021273907274007797,
"learning_rate": 9.928630404658255e-05,
"loss": 0.0191,
"step": 22
},
{
"epoch": 0.7863247863247863,
"grad_norm": 0.021417921409010887,
"learning_rate": 9.906850630697068e-05,
"loss": 0.02,
"step": 23
},
{
"epoch": 0.8205128205128205,
"grad_norm": 0.02200082130730152,
"learning_rate": 9.882205394146361e-05,
"loss": 0.0232,
"step": 24
},
{
"epoch": 0.8547008547008547,
"grad_norm": 0.02200504206120968,
"learning_rate": 9.85470908713026e-05,
"loss": 0.0196,
"step": 25
},
{
"epoch": 0.8547008547008547,
"eval_loss": 0.019536755979061127,
"eval_runtime": 20.402,
"eval_samples_per_second": 2.451,
"eval_steps_per_second": 0.637,
"step": 25
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.02056378312408924,
"learning_rate": 9.824377766717759e-05,
"loss": 0.0208,
"step": 26
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.018094003200531006,
"learning_rate": 9.791229145545831e-05,
"loss": 0.0167,
"step": 27
},
{
"epoch": 0.9572649572649573,
"grad_norm": 0.017437463626265526,
"learning_rate": 9.755282581475769e-05,
"loss": 0.0178,
"step": 28
},
{
"epoch": 0.9914529914529915,
"grad_norm": 0.016863780096173286,
"learning_rate": 9.716559066288715e-05,
"loss": 0.0163,
"step": 29
},
{
"epoch": 1.0256410256410255,
"grad_norm": 0.043561145663261414,
"learning_rate": 9.675081213427076e-05,
"loss": 0.0342,
"step": 30
},
{
"epoch": 1.0256410256410255,
"eval_loss": 0.018172133713960648,
"eval_runtime": 20.3342,
"eval_samples_per_second": 2.459,
"eval_steps_per_second": 0.639,
"step": 30
},
{
"epoch": 1.0598290598290598,
"grad_norm": 0.024272920563817024,
"learning_rate": 9.630873244788883e-05,
"loss": 0.016,
"step": 31
},
{
"epoch": 1.0940170940170941,
"grad_norm": 0.025347765535116196,
"learning_rate": 9.583960976582913e-05,
"loss": 0.016,
"step": 32
},
{
"epoch": 1.1282051282051282,
"grad_norm": 0.026027293875813484,
"learning_rate": 9.534371804252728e-05,
"loss": 0.0178,
"step": 33
},
{
"epoch": 1.1623931623931625,
"grad_norm": 0.021461905911564827,
"learning_rate": 9.482134686478519e-05,
"loss": 0.0166,
"step": 34
},
{
"epoch": 1.1965811965811965,
"grad_norm": 0.025416361168026924,
"learning_rate": 9.42728012826605e-05,
"loss": 0.014,
"step": 35
},
{
"epoch": 1.1965811965811965,
"eval_loss": 0.016842123121023178,
"eval_runtime": 20.4307,
"eval_samples_per_second": 2.447,
"eval_steps_per_second": 0.636,
"step": 35
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.021278902888298035,
"learning_rate": 9.36984016313259e-05,
"loss": 0.0147,
"step": 36
},
{
"epoch": 1.264957264957265,
"grad_norm": 0.023418180644512177,
"learning_rate": 9.309848334400246e-05,
"loss": 0.0155,
"step": 37
},
{
"epoch": 1.2991452991452992,
"grad_norm": 0.021139057353138924,
"learning_rate": 9.247339675607605e-05,
"loss": 0.015,
"step": 38
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.021048279479146004,
"learning_rate": 9.182350690051133e-05,
"loss": 0.0148,
"step": 39
},
{
"epoch": 1.3675213675213675,
"grad_norm": 0.022858263924717903,
"learning_rate": 9.114919329468282e-05,
"loss": 0.0142,
"step": 40
},
{
"epoch": 1.3675213675213675,
"eval_loss": 0.01600724086165428,
"eval_runtime": 20.1138,
"eval_samples_per_second": 2.486,
"eval_steps_per_second": 0.646,
"step": 40
},
{
"epoch": 1.4017094017094016,
"grad_norm": 0.020670941099524498,
"learning_rate": 9.045084971874738e-05,
"loss": 0.0157,
"step": 41
},
{
"epoch": 1.435897435897436,
"grad_norm": 0.01937521994113922,
"learning_rate": 8.972888398568772e-05,
"loss": 0.0181,
"step": 42
},
{
"epoch": 1.4700854700854702,
"grad_norm": 0.02165701799094677,
"learning_rate": 8.898371770316111e-05,
"loss": 0.014,
"step": 43
},
{
"epoch": 1.5042735042735043,
"grad_norm": 0.01963304542005062,
"learning_rate": 8.821578602729242e-05,
"loss": 0.0154,
"step": 44
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.016389695927500725,
"learning_rate": 8.742553740855506e-05,
"loss": 0.0133,
"step": 45
},
{
"epoch": 1.5384615384615383,
"eval_loss": 0.014650734141469002,
"eval_runtime": 20.2255,
"eval_samples_per_second": 2.472,
"eval_steps_per_second": 0.643,
"step": 45
},
{
"epoch": 1.5726495726495726,
"grad_norm": 0.02393350377678871,
"learning_rate": 8.661343332988869e-05,
"loss": 0.015,
"step": 46
},
{
"epoch": 1.606837606837607,
"grad_norm": 0.01642770506441593,
"learning_rate": 8.577994803720606e-05,
"loss": 0.0149,
"step": 47
},
{
"epoch": 1.641025641025641,
"grad_norm": 0.018430177122354507,
"learning_rate": 8.492556826244687e-05,
"loss": 0.0142,
"step": 48
},
{
"epoch": 1.6752136752136753,
"grad_norm": 0.021527277305722237,
"learning_rate": 8.405079293933986e-05,
"loss": 0.0125,
"step": 49
},
{
"epoch": 1.7094017094017095,
"grad_norm": 0.01738244667649269,
"learning_rate": 8.315613291203976e-05,
"loss": 0.0122,
"step": 50
},
{
"epoch": 1.7094017094017095,
"eval_loss": 0.013878260739147663,
"eval_runtime": 20.266,
"eval_samples_per_second": 2.467,
"eval_steps_per_second": 0.641,
"step": 50
},
{
"epoch": 1.7435897435897436,
"grad_norm": 0.016872745007276535,
"learning_rate": 8.224211063680853e-05,
"loss": 0.0136,
"step": 51
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.01582753099501133,
"learning_rate": 8.130925987691569e-05,
"loss": 0.0125,
"step": 52
},
{
"epoch": 1.811965811965812,
"grad_norm": 0.01693882793188095,
"learning_rate": 8.035812539093557e-05,
"loss": 0.0105,
"step": 53
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.01629294827580452,
"learning_rate": 7.938926261462366e-05,
"loss": 0.0122,
"step": 54
},
{
"epoch": 1.8803418803418803,
"grad_norm": 0.02483530156314373,
"learning_rate": 7.840323733655778e-05,
"loss": 0.0136,
"step": 55
},
{
"epoch": 1.8803418803418803,
"eval_loss": 0.013426054269075394,
"eval_runtime": 20.4254,
"eval_samples_per_second": 2.448,
"eval_steps_per_second": 0.636,
"step": 55
},
{
"epoch": 1.9145299145299144,
"grad_norm": 0.017920587211847305,
"learning_rate": 7.740062536773352e-05,
"loss": 0.0121,
"step": 56
},
{
"epoch": 1.9487179487179487,
"grad_norm": 0.018872473388910294,
"learning_rate": 7.638201220530665e-05,
"loss": 0.0124,
"step": 57
},
{
"epoch": 1.982905982905983,
"grad_norm": 0.018264025449752808,
"learning_rate": 7.534799269067953e-05,
"loss": 0.0125,
"step": 58
},
{
"epoch": 2.017094017094017,
"grad_norm": 0.042249832302331924,
"learning_rate": 7.42991706621303e-05,
"loss": 0.0197,
"step": 59
},
{
"epoch": 2.051282051282051,
"grad_norm": 0.025319905951619148,
"learning_rate": 7.323615860218843e-05,
"loss": 0.011,
"step": 60
},
{
"epoch": 2.051282051282051,
"eval_loss": 0.01329517550766468,
"eval_runtime": 20.8923,
"eval_samples_per_second": 2.393,
"eval_steps_per_second": 0.622,
"step": 60
},
{
"epoch": 2.0854700854700856,
"grad_norm": 0.01969628967344761,
"learning_rate": 7.215957727996207e-05,
"loss": 0.0107,
"step": 61
},
{
"epoch": 2.1196581196581197,
"grad_norm": 0.021322010084986687,
"learning_rate": 7.107005538862646e-05,
"loss": 0.0107,
"step": 62
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.015889568254351616,
"learning_rate": 6.996822917828477e-05,
"loss": 0.0115,
"step": 63
},
{
"epoch": 2.1880341880341883,
"grad_norm": 0.014772447757422924,
"learning_rate": 6.885474208441603e-05,
"loss": 0.0105,
"step": 64
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.02105395495891571,
"learning_rate": 6.773024435212678e-05,
"loss": 0.0098,
"step": 65
},
{
"epoch": 2.2222222222222223,
"eval_loss": 0.012847676873207092,
"eval_runtime": 20.8651,
"eval_samples_per_second": 2.396,
"eval_steps_per_second": 0.623,
"step": 65
},
{
"epoch": 2.2564102564102564,
"grad_norm": 0.017252441495656967,
"learning_rate": 6.659539265642643e-05,
"loss": 0.0105,
"step": 66
},
{
"epoch": 2.2905982905982905,
"grad_norm": 0.015464423224329948,
"learning_rate": 6.545084971874738e-05,
"loss": 0.0083,
"step": 67
},
{
"epoch": 2.324786324786325,
"grad_norm": 0.022285865619778633,
"learning_rate": 6.429728391993446e-05,
"loss": 0.0116,
"step": 68
},
{
"epoch": 2.358974358974359,
"grad_norm": 0.022020753473043442,
"learning_rate": 6.313536890992935e-05,
"loss": 0.0091,
"step": 69
},
{
"epoch": 2.393162393162393,
"grad_norm": 0.022549916058778763,
"learning_rate": 6.19657832143779e-05,
"loss": 0.011,
"step": 70
},
{
"epoch": 2.393162393162393,
"eval_loss": 0.012613310478627682,
"eval_runtime": 20.423,
"eval_samples_per_second": 2.448,
"eval_steps_per_second": 0.637,
"step": 70
},
{
"epoch": 2.427350427350427,
"grad_norm": 0.021075539290905,
"learning_rate": 6.078920983839031e-05,
"loss": 0.0105,
"step": 71
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.02389822155237198,
"learning_rate": 5.960633586768543e-05,
"loss": 0.0105,
"step": 72
},
{
"epoch": 2.4957264957264957,
"grad_norm": 0.01924612931907177,
"learning_rate": 5.841785206735192e-05,
"loss": 0.0106,
"step": 73
},
{
"epoch": 2.52991452991453,
"grad_norm": 0.01965448632836342,
"learning_rate": 5.7224452478461064e-05,
"loss": 0.0108,
"step": 74
},
{
"epoch": 2.564102564102564,
"grad_norm": 0.015871981158852577,
"learning_rate": 5.602683401276615e-05,
"loss": 0.0087,
"step": 75
},
{
"epoch": 2.564102564102564,
"eval_loss": 0.012018020264804363,
"eval_runtime": 20.4309,
"eval_samples_per_second": 2.447,
"eval_steps_per_second": 0.636,
"step": 75
},
{
"epoch": 2.5982905982905984,
"grad_norm": 0.017729448154568672,
"learning_rate": 5.482569604572576e-05,
"loss": 0.0085,
"step": 76
},
{
"epoch": 2.6324786324786325,
"grad_norm": 0.019757023081183434,
"learning_rate": 5.3621740008088126e-05,
"loss": 0.0111,
"step": 77
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.016380244866013527,
"learning_rate": 5.2415668976275355e-05,
"loss": 0.0081,
"step": 78
},
{
"epoch": 2.700854700854701,
"grad_norm": 0.017829637974500656,
"learning_rate": 5.1208187261806615e-05,
"loss": 0.0112,
"step": 79
},
{
"epoch": 2.735042735042735,
"grad_norm": 0.019623294472694397,
"learning_rate": 5e-05,
"loss": 0.0086,
"step": 80
},
{
"epoch": 2.735042735042735,
"eval_loss": 0.011721721850335598,
"eval_runtime": 20.2095,
"eval_samples_per_second": 2.474,
"eval_steps_per_second": 0.643,
"step": 80
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.01674460805952549,
"learning_rate": 4.87918127381934e-05,
"loss": 0.009,
"step": 81
},
{
"epoch": 2.8034188034188032,
"grad_norm": 0.01912359707057476,
"learning_rate": 4.758433102372466e-05,
"loss": 0.01,
"step": 82
},
{
"epoch": 2.8376068376068377,
"grad_norm": 0.021485494449734688,
"learning_rate": 4.6378259991911886e-05,
"loss": 0.008,
"step": 83
},
{
"epoch": 2.871794871794872,
"grad_norm": 0.02604834921658039,
"learning_rate": 4.5174303954274244e-05,
"loss": 0.0093,
"step": 84
},
{
"epoch": 2.905982905982906,
"grad_norm": 0.01696549728512764,
"learning_rate": 4.397316598723385e-05,
"loss": 0.0091,
"step": 85
},
{
"epoch": 2.905982905982906,
"eval_loss": 0.01138822827488184,
"eval_runtime": 20.1788,
"eval_samples_per_second": 2.478,
"eval_steps_per_second": 0.644,
"step": 85
},
{
"epoch": 2.9401709401709404,
"grad_norm": 0.02111894078552723,
"learning_rate": 4.277554752153895e-05,
"loss": 0.0098,
"step": 86
},
{
"epoch": 2.9743589743589745,
"grad_norm": 0.021606622263789177,
"learning_rate": 4.1582147932648074e-05,
"loss": 0.01,
"step": 87
},
{
"epoch": 3.0085470085470085,
"grad_norm": 0.04893641173839569,
"learning_rate": 4.039366413231458e-05,
"loss": 0.0134,
"step": 88
},
{
"epoch": 3.0427350427350426,
"grad_norm": 0.0206940695643425,
"learning_rate": 3.92107901616097e-05,
"loss": 0.0079,
"step": 89
},
{
"epoch": 3.076923076923077,
"grad_norm": 0.030171874910593033,
"learning_rate": 3.803421678562213e-05,
"loss": 0.009,
"step": 90
},
{
"epoch": 3.076923076923077,
"eval_loss": 0.011335253715515137,
"eval_runtime": 20.4524,
"eval_samples_per_second": 2.445,
"eval_steps_per_second": 0.636,
"step": 90
},
{
"epoch": 3.111111111111111,
"grad_norm": 0.02197062224149704,
"learning_rate": 3.6864631090070655e-05,
"loss": 0.0077,
"step": 91
},
{
"epoch": 3.1452991452991452,
"grad_norm": 0.018409984186291695,
"learning_rate": 3.570271608006555e-05,
"loss": 0.0076,
"step": 92
},
{
"epoch": 3.1794871794871793,
"grad_norm": 0.02173592895269394,
"learning_rate": 3.4549150281252636e-05,
"loss": 0.007,
"step": 93
},
{
"epoch": 3.213675213675214,
"grad_norm": 0.030453743413090706,
"learning_rate": 3.340460734357359e-05,
"loss": 0.0072,
"step": 94
},
{
"epoch": 3.247863247863248,
"grad_norm": 0.01577071100473404,
"learning_rate": 3.226975564787322e-05,
"loss": 0.0062,
"step": 95
},
{
"epoch": 3.247863247863248,
"eval_loss": 0.011275053024291992,
"eval_runtime": 11.7759,
"eval_samples_per_second": 4.246,
"eval_steps_per_second": 1.104,
"step": 95
},
{
"epoch": 3.282051282051282,
"grad_norm": 0.020528519526124,
"learning_rate": 3.114525791558398e-05,
"loss": 0.0077,
"step": 96
},
{
"epoch": 3.316239316239316,
"grad_norm": 0.028798261657357216,
"learning_rate": 3.003177082171523e-05,
"loss": 0.0076,
"step": 97
},
{
"epoch": 3.3504273504273505,
"grad_norm": 0.023175662383437157,
"learning_rate": 2.8929944611373554e-05,
"loss": 0.0072,
"step": 98
},
{
"epoch": 3.3846153846153846,
"grad_norm": 0.020334038883447647,
"learning_rate": 2.784042272003794e-05,
"loss": 0.0069,
"step": 99
},
{
"epoch": 3.4188034188034186,
"grad_norm": 0.024305541068315506,
"learning_rate": 2.6763841397811573e-05,
"loss": 0.0069,
"step": 100
},
{
"epoch": 3.4188034188034186,
"eval_loss": 0.011122054420411587,
"eval_runtime": 9.713,
"eval_samples_per_second": 5.148,
"eval_steps_per_second": 1.338,
"step": 100
},
{
"epoch": 3.452991452991453,
"grad_norm": 0.022912999615073204,
"learning_rate": 2.57008293378697e-05,
"loss": 0.0094,
"step": 101
},
{
"epoch": 3.4871794871794872,
"grad_norm": 0.021239478141069412,
"learning_rate": 2.4652007309320498e-05,
"loss": 0.0064,
"step": 102
},
{
"epoch": 3.5213675213675213,
"grad_norm": 0.017512433230876923,
"learning_rate": 2.361798779469336e-05,
"loss": 0.0074,
"step": 103
},
{
"epoch": 3.5555555555555554,
"grad_norm": 0.018754741176962852,
"learning_rate": 2.259937463226651e-05,
"loss": 0.0063,
"step": 104
},
{
"epoch": 3.58974358974359,
"grad_norm": 0.019357949495315552,
"learning_rate": 2.1596762663442218e-05,
"loss": 0.0071,
"step": 105
},
{
"epoch": 3.58974358974359,
"eval_loss": 0.01076994277536869,
"eval_runtime": 9.7069,
"eval_samples_per_second": 5.151,
"eval_steps_per_second": 1.339,
"step": 105
},
{
"epoch": 3.623931623931624,
"grad_norm": 0.02121426723897457,
"learning_rate": 2.061073738537635e-05,
"loss": 0.0078,
"step": 106
},
{
"epoch": 3.658119658119658,
"grad_norm": 0.020560307428240776,
"learning_rate": 1.9641874609064443e-05,
"loss": 0.0079,
"step": 107
},
{
"epoch": 3.6923076923076925,
"grad_norm": 0.024254970252513885,
"learning_rate": 1.8690740123084316e-05,
"loss": 0.0067,
"step": 108
},
{
"epoch": 3.7264957264957266,
"grad_norm": 0.020139144733548164,
"learning_rate": 1.7757889363191483e-05,
"loss": 0.0066,
"step": 109
},
{
"epoch": 3.7606837606837606,
"grad_norm": 0.01883375644683838,
"learning_rate": 1.684386708796025e-05,
"loss": 0.006,
"step": 110
},
{
"epoch": 3.7606837606837606,
"eval_loss": 0.010681088082492352,
"eval_runtime": 9.7636,
"eval_samples_per_second": 5.121,
"eval_steps_per_second": 1.331,
"step": 110
},
{
"epoch": 3.7948717948717947,
"grad_norm": 0.017693698406219482,
"learning_rate": 1.5949207060660138e-05,
"loss": 0.0061,
"step": 111
},
{
"epoch": 3.8290598290598292,
"grad_norm": 0.020463695749640465,
"learning_rate": 1.5074431737553157e-05,
"loss": 0.0072,
"step": 112
},
{
"epoch": 3.8632478632478633,
"grad_norm": 0.020722465589642525,
"learning_rate": 1.422005196279395e-05,
"loss": 0.0091,
"step": 113
},
{
"epoch": 3.8974358974358974,
"grad_norm": 0.020328683778643608,
"learning_rate": 1.338656667011134e-05,
"loss": 0.0071,
"step": 114
},
{
"epoch": 3.931623931623932,
"grad_norm": 0.0204046331346035,
"learning_rate": 1.257446259144494e-05,
"loss": 0.0065,
"step": 115
},
{
"epoch": 3.931623931623932,
"eval_loss": 0.010614048689603806,
"eval_runtime": 9.7105,
"eval_samples_per_second": 5.149,
"eval_steps_per_second": 1.339,
"step": 115
},
{
"epoch": 3.965811965811966,
"grad_norm": 0.018496442586183548,
"learning_rate": 1.178421397270758e-05,
"loss": 0.0059,
"step": 116
},
{
"epoch": 4.0,
"grad_norm": 0.04243512079119682,
"learning_rate": 1.1016282296838887e-05,
"loss": 0.0121,
"step": 117
},
{
"epoch": 4.034188034188034,
"grad_norm": 0.01636839285492897,
"learning_rate": 1.0271116014312293e-05,
"loss": 0.005,
"step": 118
},
{
"epoch": 4.068376068376068,
"grad_norm": 0.016252394765615463,
"learning_rate": 9.549150281252633e-06,
"loss": 0.0053,
"step": 119
},
{
"epoch": 4.102564102564102,
"grad_norm": 0.016149073839187622,
"learning_rate": 8.850806705317183e-06,
"loss": 0.0076,
"step": 120
},
{
"epoch": 4.102564102564102,
"eval_loss": 0.010566945187747478,
"eval_runtime": 9.7295,
"eval_samples_per_second": 5.139,
"eval_steps_per_second": 1.336,
"step": 120
},
{
"epoch": 4.136752136752137,
"grad_norm": 0.01633269712328911,
"learning_rate": 8.176493099488663e-06,
"loss": 0.0072,
"step": 121
},
{
"epoch": 4.170940170940171,
"grad_norm": 0.018178582191467285,
"learning_rate": 7.526603243923957e-06,
"loss": 0.0061,
"step": 122
},
{
"epoch": 4.205128205128205,
"grad_norm": 0.017663611099123955,
"learning_rate": 6.901516655997536e-06,
"loss": 0.006,
"step": 123
},
{
"epoch": 4.239316239316239,
"grad_norm": 0.01672632433474064,
"learning_rate": 6.301598368674105e-06,
"loss": 0.0059,
"step": 124
},
{
"epoch": 4.273504273504273,
"grad_norm": 0.01625417172908783,
"learning_rate": 5.727198717339511e-06,
"loss": 0.0057,
"step": 125
},
{
"epoch": 4.273504273504273,
"eval_loss": 0.010598141700029373,
"eval_runtime": 9.7399,
"eval_samples_per_second": 5.133,
"eval_steps_per_second": 1.335,
"step": 125
},
{
"epoch": 4.3076923076923075,
"grad_norm": 0.016820693388581276,
"learning_rate": 5.178653135214812e-06,
"loss": 0.0051,
"step": 126
},
{
"epoch": 4.3418803418803416,
"grad_norm": 0.017715787515044212,
"learning_rate": 4.65628195747273e-06,
"loss": 0.0058,
"step": 127
},
{
"epoch": 4.3760683760683765,
"grad_norm": 0.01836553029716015,
"learning_rate": 4.16039023417088e-06,
"loss": 0.0051,
"step": 128
},
{
"epoch": 4.410256410256411,
"grad_norm": 0.02160499058663845,
"learning_rate": 3.691267552111183e-06,
"loss": 0.0054,
"step": 129
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.019846314564347267,
"learning_rate": 3.249187865729264e-06,
"loss": 0.0071,
"step": 130
},
{
"epoch": 4.444444444444445,
"eval_loss": 0.010587843134999275,
"eval_runtime": 9.707,
"eval_samples_per_second": 5.151,
"eval_steps_per_second": 1.339,
"step": 130
},
{
"epoch": 4.478632478632479,
"grad_norm": 0.01771419309079647,
"learning_rate": 2.8344093371128424e-06,
"loss": 0.005,
"step": 131
},
{
"epoch": 4.512820512820513,
"grad_norm": 0.015574207529425621,
"learning_rate": 2.4471741852423237e-06,
"loss": 0.0048,
"step": 132
},
{
"epoch": 4.547008547008547,
"grad_norm": 0.01756260357797146,
"learning_rate": 2.087708544541689e-06,
"loss": 0.0047,
"step": 133
},
{
"epoch": 4.581196581196581,
"grad_norm": 0.02206473983824253,
"learning_rate": 1.7562223328224325e-06,
"loss": 0.0053,
"step": 134
},
{
"epoch": 4.615384615384615,
"grad_norm": 0.017151422798633575,
"learning_rate": 1.4529091286973995e-06,
"loss": 0.0055,
"step": 135
},
{
"epoch": 4.615384615384615,
"eval_loss": 0.010594615712761879,
"eval_runtime": 9.7136,
"eval_samples_per_second": 5.147,
"eval_steps_per_second": 1.338,
"step": 135
},
{
"epoch": 4.64957264957265,
"grad_norm": 0.018389733508229256,
"learning_rate": 1.1779460585363944e-06,
"loss": 0.0053,
"step": 136
},
{
"epoch": 4.683760683760684,
"grad_norm": 0.016087768599390984,
"learning_rate": 9.314936930293283e-07,
"loss": 0.0048,
"step": 137
},
{
"epoch": 4.717948717948718,
"grad_norm": 0.019610892981290817,
"learning_rate": 7.136959534174592e-07,
"loss": 0.0056,
"step": 138
},
{
"epoch": 4.752136752136752,
"grad_norm": 0.018921220675110817,
"learning_rate": 5.246800274474439e-07,
"loss": 0.0065,
"step": 139
},
{
"epoch": 4.786324786324786,
"grad_norm": 0.020553983747959137,
"learning_rate": 3.6455629509730136e-07,
"loss": 0.0053,
"step": 140
},
{
"epoch": 4.786324786324786,
"eval_loss": 0.010573416948318481,
"eval_runtime": 9.7174,
"eval_samples_per_second": 5.145,
"eval_steps_per_second": 1.338,
"step": 140
},
{
"epoch": 4.82051282051282,
"grad_norm": 0.015205741859972477,
"learning_rate": 2.334182641175686e-07,
"loss": 0.0038,
"step": 141
},
{
"epoch": 4.854700854700854,
"grad_norm": 0.017062170431017876,
"learning_rate": 1.3134251542544774e-07,
"loss": 0.006,
"step": 142
},
{
"epoch": 4.888888888888889,
"grad_norm": 0.01831440068781376,
"learning_rate": 5.838865838366792e-08,
"loss": 0.0063,
"step": 143
},
{
"epoch": 4.923076923076923,
"grad_norm": 0.019888985902071,
"learning_rate": 1.4599295990352924e-08,
"loss": 0.0068,
"step": 144
},
{
"epoch": 4.957264957264957,
"grad_norm": 0.01834353245794773,
"learning_rate": 0.0,
"loss": 0.0074,
"step": 145
},
{
"epoch": 4.957264957264957,
"eval_loss": 0.010564341209828854,
"eval_runtime": 9.7196,
"eval_samples_per_second": 5.144,
"eval_steps_per_second": 1.338,
"step": 145
}
],
"logging_steps": 1,
"max_steps": 145,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.332736073022669e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}