{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9988901220865705,
"eval_steps": 500,
"global_step": 225,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004439511653718091,
"grad_norm": 37.533348083496094,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.824,
"step": 1
},
{
"epoch": 0.008879023307436182,
"grad_norm": 46.24335479736328,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.8786,
"step": 2
},
{
"epoch": 0.013318534961154272,
"grad_norm": 42.045166015625,
"learning_rate": 8.571428571428571e-06,
"loss": 1.7863,
"step": 3
},
{
"epoch": 0.017758046614872364,
"grad_norm": 8.661040306091309,
"learning_rate": 1.1428571428571429e-05,
"loss": 1.2891,
"step": 4
},
{
"epoch": 0.022197558268590455,
"grad_norm": 3.85249400138855,
"learning_rate": 1.4285714285714287e-05,
"loss": 1.2127,
"step": 5
},
{
"epoch": 0.026637069922308545,
"grad_norm": 7.926912307739258,
"learning_rate": 1.7142857142857142e-05,
"loss": 1.2012,
"step": 6
},
{
"epoch": 0.03107658157602664,
"grad_norm": 4.07999324798584,
"learning_rate": 2e-05,
"loss": 1.2275,
"step": 7
},
{
"epoch": 0.03551609322974473,
"grad_norm": 3.668304681777954,
"learning_rate": 1.9998961636899736e-05,
"loss": 1.2301,
"step": 8
},
{
"epoch": 0.03995560488346282,
"grad_norm": 3.8191933631896973,
"learning_rate": 1.9995846763238514e-05,
"loss": 1.18,
"step": 9
},
{
"epoch": 0.04439511653718091,
"grad_norm": 3.0913546085357666,
"learning_rate": 1.9990656025890315e-05,
"loss": 1.1692,
"step": 10
},
{
"epoch": 0.048834628190899,
"grad_norm": 2.7952189445495605,
"learning_rate": 1.9983390502829168e-05,
"loss": 1.2112,
"step": 11
},
{
"epoch": 0.05327413984461709,
"grad_norm": 2.480602979660034,
"learning_rate": 1.997405170290528e-05,
"loss": 1.2294,
"step": 12
},
{
"epoch": 0.05771365149833518,
"grad_norm": 2.486560344696045,
"learning_rate": 1.9962641565531694e-05,
"loss": 1.222,
"step": 13
},
{
"epoch": 0.06215316315205328,
"grad_norm": 2.59287428855896,
"learning_rate": 1.994916246028154e-05,
"loss": 1.1607,
"step": 14
},
{
"epoch": 0.06659267480577137,
"grad_norm": 2.5444483757019043,
"learning_rate": 1.9933617186395917e-05,
"loss": 1.2571,
"step": 15
},
{
"epoch": 0.07103218645948946,
"grad_norm": 2.5133490562438965,
"learning_rate": 1.9916008972202586e-05,
"loss": 1.1338,
"step": 16
},
{
"epoch": 0.07547169811320754,
"grad_norm": 2.2021095752716064,
"learning_rate": 1.9896341474445526e-05,
"loss": 1.2253,
"step": 17
},
{
"epoch": 0.07991120976692564,
"grad_norm": 2.341919183731079,
"learning_rate": 1.987461877752552e-05,
"loss": 1.2575,
"step": 18
},
{
"epoch": 0.08435072142064373,
"grad_norm": 2.27514386177063,
"learning_rate": 1.985084539265195e-05,
"loss": 1.1895,
"step": 19
},
{
"epoch": 0.08879023307436182,
"grad_norm": 2.0287599563598633,
"learning_rate": 1.982502625690595e-05,
"loss": 1.122,
"step": 20
},
{
"epoch": 0.0932297447280799,
"grad_norm": 2.33461594581604,
"learning_rate": 1.9797166732215078e-05,
"loss": 1.2319,
"step": 21
},
{
"epoch": 0.097669256381798,
"grad_norm": 2.374116897583008,
"learning_rate": 1.9767272604239823e-05,
"loss": 1.2025,
"step": 22
},
{
"epoch": 0.10210876803551609,
"grad_norm": 2.2956087589263916,
"learning_rate": 1.973535008117207e-05,
"loss": 1.2078,
"step": 23
},
{
"epoch": 0.10654827968923418,
"grad_norm": 2.1211025714874268,
"learning_rate": 1.9701405792445815e-05,
"loss": 1.1912,
"step": 24
},
{
"epoch": 0.11098779134295228,
"grad_norm": 2.2014005184173584,
"learning_rate": 1.9665446787360444e-05,
"loss": 1.1932,
"step": 25
},
{
"epoch": 0.11542730299667037,
"grad_norm": 2.1209704875946045,
"learning_rate": 1.962748053361675e-05,
"loss": 1.2348,
"step": 26
},
{
"epoch": 0.11986681465038845,
"grad_norm": 2.0373430252075195,
"learning_rate": 1.9587514915766124e-05,
"loss": 1.2503,
"step": 27
},
{
"epoch": 0.12430632630410655,
"grad_norm": 2.3817737102508545,
"learning_rate": 1.9545558233573136e-05,
"loss": 1.2131,
"step": 28
},
{
"epoch": 0.12874583795782463,
"grad_norm": 2.1462056636810303,
"learning_rate": 1.950161920029191e-05,
"loss": 1.1736,
"step": 29
},
{
"epoch": 0.13318534961154274,
"grad_norm": 2.1412036418914795,
"learning_rate": 1.9455706940856602e-05,
"loss": 1.1716,
"step": 30
},
{
"epoch": 0.13762486126526083,
"grad_norm": 2.2089121341705322,
"learning_rate": 1.940783098998643e-05,
"loss": 1.169,
"step": 31
},
{
"epoch": 0.14206437291897892,
"grad_norm": 2.101334571838379,
"learning_rate": 1.9358001290205542e-05,
"loss": 1.2226,
"step": 32
},
{
"epoch": 0.146503884572697,
"grad_norm": 2.436990976333618,
"learning_rate": 1.9306228189778255e-05,
"loss": 1.192,
"step": 33
},
{
"epoch": 0.1509433962264151,
"grad_norm": 2.069035768508911,
"learning_rate": 1.925252244055998e-05,
"loss": 1.2419,
"step": 34
},
{
"epoch": 0.15538290788013318,
"grad_norm": 2.099047899246216,
"learning_rate": 1.9196895195764363e-05,
"loss": 1.2004,
"step": 35
},
{
"epoch": 0.1598224195338513,
"grad_norm": 2.2311224937438965,
"learning_rate": 1.9139358007647085e-05,
"loss": 1.2065,
"step": 36
},
{
"epoch": 0.16426193118756938,
"grad_norm": 2.1131861209869385,
"learning_rate": 1.907992282510675e-05,
"loss": 1.2158,
"step": 37
},
{
"epoch": 0.16870144284128746,
"grad_norm": 1.924771785736084,
"learning_rate": 1.901860199120344e-05,
"loss": 1.2714,
"step": 38
},
{
"epoch": 0.17314095449500555,
"grad_norm": 1.9812465906143188,
"learning_rate": 1.8955408240595396e-05,
"loss": 1.2216,
"step": 39
},
{
"epoch": 0.17758046614872364,
"grad_norm": 1.9320334196090698,
"learning_rate": 1.8890354696894374e-05,
"loss": 1.2164,
"step": 40
},
{
"epoch": 0.18201997780244172,
"grad_norm": 1.9847488403320312,
"learning_rate": 1.8823454869940243e-05,
"loss": 1.1705,
"step": 41
},
{
"epoch": 0.1864594894561598,
"grad_norm": 1.973925232887268,
"learning_rate": 1.8754722652995346e-05,
"loss": 1.1984,
"step": 42
},
{
"epoch": 0.19089900110987792,
"grad_norm": 2.0115818977355957,
"learning_rate": 1.8684172319859258e-05,
"loss": 1.1659,
"step": 43
},
{
"epoch": 0.195338512763596,
"grad_norm": 2.3217947483062744,
"learning_rate": 1.861181852190451e-05,
"loss": 1.2463,
"step": 44
},
{
"epoch": 0.1997780244173141,
"grad_norm": 2.1839258670806885,
"learning_rate": 1.8537676285033886e-05,
"loss": 1.1657,
"step": 45
},
{
"epoch": 0.20421753607103219,
"grad_norm": 1.9523669481277466,
"learning_rate": 1.8461761006559982e-05,
"loss": 1.1766,
"step": 46
},
{
"epoch": 0.20865704772475027,
"grad_norm": 2.1495866775512695,
"learning_rate": 1.838408845200758e-05,
"loss": 1.2138,
"step": 47
},
{
"epoch": 0.21309655937846836,
"grad_norm": 2.060826539993286,
"learning_rate": 1.8304674751839583e-05,
"loss": 1.1817,
"step": 48
},
{
"epoch": 0.21753607103218647,
"grad_norm": 2.0041329860687256,
"learning_rate": 1.8223536398107177e-05,
"loss": 1.2327,
"step": 49
},
{
"epoch": 0.22197558268590456,
"grad_norm": 2.427309989929199,
"learning_rate": 1.8140690241024872e-05,
"loss": 1.2012,
"step": 50
},
{
"epoch": 0.22641509433962265,
"grad_norm": 2.0693812370300293,
"learning_rate": 1.8056153485471167e-05,
"loss": 1.1806,
"step": 51
},
{
"epoch": 0.23085460599334073,
"grad_norm": 2.1914312839508057,
"learning_rate": 1.7969943687415575e-05,
"loss": 1.1908,
"step": 52
},
{
"epoch": 0.23529411764705882,
"grad_norm": 2.1551156044006348,
"learning_rate": 1.788207875027274e-05,
"loss": 1.2127,
"step": 53
},
{
"epoch": 0.2397336293007769,
"grad_norm": 2.090184211730957,
"learning_rate": 1.7792576921184374e-05,
"loss": 1.1944,
"step": 54
},
{
"epoch": 0.244173140954495,
"grad_norm": 1.9871045351028442,
"learning_rate": 1.7701456787229805e-05,
"loss": 1.1736,
"step": 55
},
{
"epoch": 0.2486126526082131,
"grad_norm": 2.1655359268188477,
"learning_rate": 1.7608737271566004e-05,
"loss": 1.2267,
"step": 56
},
{
"epoch": 0.25305216426193117,
"grad_norm": 2.2310895919799805,
"learning_rate": 1.751443762949772e-05,
"loss": 1.232,
"step": 57
},
{
"epoch": 0.25749167591564925,
"grad_norm": 2.07645583152771,
"learning_rate": 1.741857744447869e-05,
"loss": 1.1872,
"step": 58
},
{
"epoch": 0.2619311875693674,
"grad_norm": 2.183539628982544,
"learning_rate": 1.732117662404469e-05,
"loss": 1.253,
"step": 59
},
{
"epoch": 0.2663706992230855,
"grad_norm": 2.078212261199951,
"learning_rate": 1.7222255395679298e-05,
"loss": 1.2586,
"step": 60
},
{
"epoch": 0.27081021087680357,
"grad_norm": 1.9478346109390259,
"learning_rate": 1.712183430261319e-05,
"loss": 1.3174,
"step": 61
},
{
"epoch": 0.27524972253052166,
"grad_norm": 1.9654473066329956,
"learning_rate": 1.7019934199557868e-05,
"loss": 1.2072,
"step": 62
},
{
"epoch": 0.27968923418423974,
"grad_norm": 2.0590426921844482,
"learning_rate": 1.691657624837472e-05,
"loss": 1.2234,
"step": 63
},
{
"epoch": 0.28412874583795783,
"grad_norm": 1.9870823621749878,
"learning_rate": 1.6811781913680273e-05,
"loss": 1.2366,
"step": 64
},
{
"epoch": 0.2885682574916759,
"grad_norm": 2.0088582038879395,
"learning_rate": 1.6705572958388576e-05,
"loss": 1.1597,
"step": 65
},
{
"epoch": 0.293007769145394,
"grad_norm": 2.107692241668701,
"learning_rate": 1.659797143919165e-05,
"loss": 1.1738,
"step": 66
},
{
"epoch": 0.2974472807991121,
"grad_norm": 1.8893859386444092,
"learning_rate": 1.6488999701978905e-05,
"loss": 1.1214,
"step": 67
},
{
"epoch": 0.3018867924528302,
"grad_norm": 1.8780573606491089,
"learning_rate": 1.6378680377196526e-05,
"loss": 1.1799,
"step": 68
},
{
"epoch": 0.30632630410654826,
"grad_norm": 2.0093393325805664,
"learning_rate": 1.6267036375147728e-05,
"loss": 1.187,
"step": 69
},
{
"epoch": 0.31076581576026635,
"grad_norm": 1.988927960395813,
"learning_rate": 1.615409088123493e-05,
"loss": 1.2668,
"step": 70
},
{
"epoch": 0.31520532741398444,
"grad_norm": 1.8668824434280396,
"learning_rate": 1.6039867351144778e-05,
"loss": 1.2241,
"step": 71
},
{
"epoch": 0.3196448390677026,
"grad_norm": 2.02083420753479,
"learning_rate": 1.5924389505977038e-05,
"loss": 1.228,
"step": 72
},
{
"epoch": 0.32408435072142067,
"grad_norm": 2.080263137817383,
"learning_rate": 1.5807681327318372e-05,
"loss": 1.1584,
"step": 73
},
{
"epoch": 0.32852386237513875,
"grad_norm": 1.9211077690124512,
"learning_rate": 1.5689767052262028e-05,
"loss": 1.2324,
"step": 74
},
{
"epoch": 0.33296337402885684,
"grad_norm": 2.1998391151428223,
"learning_rate": 1.557067116837444e-05,
"loss": 1.1829,
"step": 75
},
{
"epoch": 0.3374028856825749,
"grad_norm": 1.973344087600708,
"learning_rate": 1.545041840860986e-05,
"loss": 1.233,
"step": 76
},
{
"epoch": 0.341842397336293,
"grad_norm": 2.0612385272979736,
"learning_rate": 1.5329033746173975e-05,
"loss": 1.2338,
"step": 77
},
{
"epoch": 0.3462819089900111,
"grad_norm": 2.0529823303222656,
"learning_rate": 1.520654238933767e-05,
"loss": 1.1973,
"step": 78
},
{
"epoch": 0.3507214206437292,
"grad_norm": 1.8720424175262451,
"learning_rate": 1.5082969776201948e-05,
"loss": 1.2133,
"step": 79
},
{
"epoch": 0.3551609322974473,
"grad_norm": 1.9517823457717896,
"learning_rate": 1.4958341569415149e-05,
"loss": 1.2102,
"step": 80
},
{
"epoch": 0.35960044395116536,
"grad_norm": 1.9612551927566528,
"learning_rate": 1.483268365084351e-05,
"loss": 1.1888,
"step": 81
},
{
"epoch": 0.36403995560488345,
"grad_norm": 2.1608946323394775,
"learning_rate": 1.4706022116196208e-05,
"loss": 1.2082,
"step": 82
},
{
"epoch": 0.36847946725860153,
"grad_norm": 1.951665997505188,
"learning_rate": 1.4578383269606004e-05,
"loss": 1.2041,
"step": 83
},
{
"epoch": 0.3729189789123196,
"grad_norm": 1.9137777090072632,
"learning_rate": 1.4449793618166594e-05,
"loss": 1.2091,
"step": 84
},
{
"epoch": 0.37735849056603776,
"grad_norm": 1.9383291006088257,
"learning_rate": 1.4320279866427798e-05,
"loss": 1.1901,
"step": 85
},
{
"epoch": 0.38179800221975585,
"grad_norm": 1.8352899551391602,
"learning_rate": 1.4189868910849779e-05,
"loss": 1.1741,
"step": 86
},
{
"epoch": 0.38623751387347394,
"grad_norm": 1.9220095872879028,
"learning_rate": 1.4058587834217356e-05,
"loss": 1.1934,
"step": 87
},
{
"epoch": 0.390677025527192,
"grad_norm": 1.7801581621170044,
"learning_rate": 1.392646390001569e-05,
"loss": 1.1439,
"step": 88
},
{
"epoch": 0.3951165371809101,
"grad_norm": 1.8106623888015747,
"learning_rate": 1.3793524546768358e-05,
"loss": 1.1316,
"step": 89
},
{
"epoch": 0.3995560488346282,
"grad_norm": 1.822784423828125,
"learning_rate": 1.3659797382339162e-05,
"loss": 1.2326,
"step": 90
},
{
"epoch": 0.4039955604883463,
"grad_norm": 2.061378240585327,
"learning_rate": 1.3525310178198707e-05,
"loss": 1.2022,
"step": 91
},
{
"epoch": 0.40843507214206437,
"grad_norm": 1.999387264251709,
"learning_rate": 1.3390090863657048e-05,
"loss": 1.2236,
"step": 92
},
{
"epoch": 0.41287458379578246,
"grad_norm": 1.7549680471420288,
"learning_rate": 1.325416752006351e-05,
"loss": 1.1421,
"step": 93
},
{
"epoch": 0.41731409544950054,
"grad_norm": 1.8363287448883057,
"learning_rate": 1.311756837497499e-05,
"loss": 1.1325,
"step": 94
},
{
"epoch": 0.42175360710321863,
"grad_norm": 1.942063808441162,
"learning_rate": 1.2980321796293838e-05,
"loss": 1.1475,
"step": 95
},
{
"epoch": 0.4261931187569367,
"grad_norm": 2.029550790786743,
"learning_rate": 1.284245628637665e-05,
"loss": 1.1715,
"step": 96
},
{
"epoch": 0.4306326304106548,
"grad_norm": 2.0929226875305176,
"learning_rate": 1.2704000476115079e-05,
"loss": 1.1595,
"step": 97
},
{
"epoch": 0.43507214206437295,
"grad_norm": 2.0529980659484863,
"learning_rate": 1.256498311899001e-05,
"loss": 1.193,
"step": 98
},
{
"epoch": 0.43951165371809103,
"grad_norm": 1.9034109115600586,
"learning_rate": 1.2425433085100224e-05,
"loss": 1.1566,
"step": 99
},
{
"epoch": 0.4439511653718091,
"grad_norm": 2.2665998935699463,
"learning_rate": 1.2285379355166893e-05,
"loss": 1.1709,
"step": 100
},
{
"epoch": 0.4483906770255272,
"grad_norm": 2.0044686794281006,
"learning_rate": 1.2144851014515055e-05,
"loss": 1.1688,
"step": 101
},
{
"epoch": 0.4528301886792453,
"grad_norm": 1.9301621913909912,
"learning_rate": 1.2003877247033411e-05,
"loss": 1.1855,
"step": 102
},
{
"epoch": 0.4572697003329634,
"grad_norm": 1.996848464012146,
"learning_rate": 1.1862487329113606e-05,
"loss": 1.1603,
"step": 103
},
{
"epoch": 0.46170921198668147,
"grad_norm": 2.0279271602630615,
"learning_rate": 1.172071062357035e-05,
"loss": 1.1789,
"step": 104
},
{
"epoch": 0.46614872364039955,
"grad_norm": 1.9275884628295898,
"learning_rate": 1.1578576573543541e-05,
"loss": 1.1339,
"step": 105
},
{
"epoch": 0.47058823529411764,
"grad_norm": 1.9847296476364136,
"learning_rate": 1.1436114696383749e-05,
"loss": 1.186,
"step": 106
},
{
"epoch": 0.4750277469478357,
"grad_norm": 1.9284783601760864,
"learning_rate": 1.1293354577522264e-05,
"loss": 1.1697,
"step": 107
},
{
"epoch": 0.4794672586015538,
"grad_norm": 1.8462920188903809,
"learning_rate": 1.1150325864327003e-05,
"loss": 1.2233,
"step": 108
},
{
"epoch": 0.4839067702552719,
"grad_norm": 1.9206749200820923,
"learning_rate": 1.1007058259945584e-05,
"loss": 1.2148,
"step": 109
},
{
"epoch": 0.48834628190899,
"grad_norm": 1.9734727144241333,
"learning_rate": 1.0863581517136776e-05,
"loss": 1.1367,
"step": 110
},
{
"epoch": 0.49278579356270813,
"grad_norm": 1.83944833278656,
"learning_rate": 1.0719925432091671e-05,
"loss": 1.1834,
"step": 111
},
{
"epoch": 0.4972253052164262,
"grad_norm": 1.9524726867675781,
"learning_rate": 1.0576119838245843e-05,
"loss": 1.1046,
"step": 112
},
{
"epoch": 0.5016648168701443,
"grad_norm": 1.8533103466033936,
"learning_rate": 1.043219460008374e-05,
"loss": 1.1694,
"step": 113
},
{
"epoch": 0.5061043285238623,
"grad_norm": 2.0331192016601562,
"learning_rate": 1.0288179606936666e-05,
"loss": 1.2525,
"step": 114
},
{
"epoch": 0.5105438401775805,
"grad_norm": 1.7501027584075928,
"learning_rate": 1.0144104766775574e-05,
"loss": 1.1787,
"step": 115
},
{
"epoch": 0.5149833518312985,
"grad_norm": 1.8049185276031494,
"learning_rate": 1e-05,
"loss": 1.1388,
"step": 116
},
{
"epoch": 0.5194228634850167,
"grad_norm": 1.9093164205551147,
"learning_rate": 9.855895233224431e-06,
"loss": 1.1042,
"step": 117
},
{
"epoch": 0.5238623751387348,
"grad_norm": 1.7633061408996582,
"learning_rate": 9.711820393063337e-06,
"loss": 1.1286,
"step": 118
},
{
"epoch": 0.5283018867924528,
"grad_norm": 1.8089503049850464,
"learning_rate": 9.56780539991626e-06,
"loss": 1.1385,
"step": 119
},
{
"epoch": 0.532741398446171,
"grad_norm": 1.827928066253662,
"learning_rate": 9.423880161754158e-06,
"loss": 1.1864,
"step": 120
},
{
"epoch": 0.537180910099889,
"grad_norm": 1.9098917245864868,
"learning_rate": 9.28007456790833e-06,
"loss": 1.1412,
"step": 121
},
{
"epoch": 0.5416204217536071,
"grad_norm": 1.7700555324554443,
"learning_rate": 9.13641848286323e-06,
"loss": 1.1733,
"step": 122
},
{
"epoch": 0.5460599334073252,
"grad_norm": 1.829474687576294,
"learning_rate": 8.992941740054418e-06,
"loss": 1.1297,
"step": 123
},
{
"epoch": 0.5504994450610433,
"grad_norm": 1.9246844053268433,
"learning_rate": 8.849674135672999e-06,
"loss": 1.2096,
"step": 124
},
{
"epoch": 0.5549389567147613,
"grad_norm": 1.9404821395874023,
"learning_rate": 8.706645422477739e-06,
"loss": 1.1531,
"step": 125
},
{
"epoch": 0.5593784683684795,
"grad_norm": 1.8768739700317383,
"learning_rate": 8.563885303616253e-06,
"loss": 1.1896,
"step": 126
},
{
"epoch": 0.5638179800221975,
"grad_norm": 1.9277173280715942,
"learning_rate": 8.42142342645646e-06,
"loss": 1.112,
"step": 127
},
{
"epoch": 0.5682574916759157,
"grad_norm": 1.9244784116744995,
"learning_rate": 8.279289376429653e-06,
"loss": 1.1457,
"step": 128
},
{
"epoch": 0.5726970033296337,
"grad_norm": 1.856616497039795,
"learning_rate": 8.137512670886397e-06,
"loss": 1.139,
"step": 129
},
{
"epoch": 0.5771365149833518,
"grad_norm": 1.7110472917556763,
"learning_rate": 7.996122752966596e-06,
"loss": 1.1181,
"step": 130
},
{
"epoch": 0.58157602663707,
"grad_norm": 1.7752193212509155,
"learning_rate": 7.855148985484946e-06,
"loss": 1.1517,
"step": 131
},
{
"epoch": 0.586015538290788,
"grad_norm": 1.789296269416809,
"learning_rate": 7.71462064483311e-06,
"loss": 1.0982,
"step": 132
},
{
"epoch": 0.5904550499445061,
"grad_norm": 2.0241036415100098,
"learning_rate": 7.574566914899779e-06,
"loss": 1.1787,
"step": 133
},
{
"epoch": 0.5948945615982242,
"grad_norm": 2.217411518096924,
"learning_rate": 7.4350168810099955e-06,
"loss": 1.1871,
"step": 134
},
{
"epoch": 0.5993340732519423,
"grad_norm": 1.8291338682174683,
"learning_rate": 7.295999523884921e-06,
"loss": 1.1396,
"step": 135
},
{
"epoch": 0.6037735849056604,
"grad_norm": 1.818039059638977,
"learning_rate": 7.157543713623353e-06,
"loss": 1.1719,
"step": 136
},
{
"epoch": 0.6082130965593785,
"grad_norm": 1.8456182479858398,
"learning_rate": 7.019678203706164e-06,
"loss": 1.1714,
"step": 137
},
{
"epoch": 0.6126526082130965,
"grad_norm": 1.9036660194396973,
"learning_rate": 6.882431625025016e-06,
"loss": 1.1439,
"step": 138
},
{
"epoch": 0.6170921198668147,
"grad_norm": 1.7687510251998901,
"learning_rate": 6.745832479936492e-06,
"loss": 1.1511,
"step": 139
},
{
"epoch": 0.6215316315205327,
"grad_norm": 1.9004747867584229,
"learning_rate": 6.609909136342956e-06,
"loss": 1.172,
"step": 140
},
{
"epoch": 0.6259711431742508,
"grad_norm": 1.736116886138916,
"learning_rate": 6.474689821801295e-06,
"loss": 1.1422,
"step": 141
},
{
"epoch": 0.6304106548279689,
"grad_norm": 2.1278252601623535,
"learning_rate": 6.340202617660842e-06,
"loss": 1.1553,
"step": 142
},
{
"epoch": 0.634850166481687,
"grad_norm": 1.7536156177520752,
"learning_rate": 6.206475453231644e-06,
"loss": 1.2252,
"step": 143
},
{
"epoch": 0.6392896781354052,
"grad_norm": 1.6779848337173462,
"learning_rate": 6.073536099984314e-06,
"loss": 1.193,
"step": 144
},
{
"epoch": 0.6437291897891232,
"grad_norm": 1.8742091655731201,
"learning_rate": 5.941412165782645e-06,
"loss": 1.1921,
"step": 145
},
{
"epoch": 0.6481687014428413,
"grad_norm": 1.8728663921356201,
"learning_rate": 5.810131089150228e-06,
"loss": 1.1906,
"step": 146
},
{
"epoch": 0.6526082130965594,
"grad_norm": 1.6480120420455933,
"learning_rate": 5.6797201335722064e-06,
"loss": 1.1297,
"step": 147
},
{
"epoch": 0.6570477247502775,
"grad_norm": 1.6961437463760376,
"learning_rate": 5.550206381833409e-06,
"loss": 1.213,
"step": 148
},
{
"epoch": 0.6614872364039955,
"grad_norm": 1.7690813541412354,
"learning_rate": 5.421616730394e-06,
"loss": 1.1643,
"step": 149
},
{
"epoch": 0.6659267480577137,
"grad_norm": 1.621023178100586,
"learning_rate": 5.293977883803797e-06,
"loss": 1.1866,
"step": 150
},
{
"epoch": 0.6703662597114317,
"grad_norm": 1.6755465269088745,
"learning_rate": 5.167316349156495e-06,
"loss": 1.1487,
"step": 151
},
{
"epoch": 0.6748057713651499,
"grad_norm": 1.739989995956421,
"learning_rate": 5.041658430584852e-06,
"loss": 1.148,
"step": 152
},
{
"epoch": 0.6792452830188679,
"grad_norm": 1.6027244329452515,
"learning_rate": 4.917030223798057e-06,
"loss": 1.1462,
"step": 153
},
{
"epoch": 0.683684794672586,
"grad_norm": 1.7501620054244995,
"learning_rate": 4.793457610662334e-06,
"loss": 1.1887,
"step": 154
},
{
"epoch": 0.6881243063263041,
"grad_norm": 1.8939120769500732,
"learning_rate": 4.670966253826027e-06,
"loss": 1.0909,
"step": 155
},
{
"epoch": 0.6925638179800222,
"grad_norm": 1.7964297533035278,
"learning_rate": 4.549581591390142e-06,
"loss": 1.1357,
"step": 156
},
{
"epoch": 0.6970033296337403,
"grad_norm": 1.8703395128250122,
"learning_rate": 4.429328831625565e-06,
"loss": 1.1529,
"step": 157
},
{
"epoch": 0.7014428412874584,
"grad_norm": 1.676789402961731,
"learning_rate": 4.310232947737979e-06,
"loss": 1.1435,
"step": 158
},
{
"epoch": 0.7058823529411765,
"grad_norm": 1.8037596940994263,
"learning_rate": 4.192318672681631e-06,
"loss": 1.1878,
"step": 159
},
{
"epoch": 0.7103218645948945,
"grad_norm": 1.6602219343185425,
"learning_rate": 4.0756104940229645e-06,
"loss": 1.1887,
"step": 160
},
{
"epoch": 0.7147613762486127,
"grad_norm": 1.8436843156814575,
"learning_rate": 3.960132648855226e-06,
"loss": 1.1406,
"step": 161
},
{
"epoch": 0.7192008879023307,
"grad_norm": 1.7904314994812012,
"learning_rate": 3.845909118765073e-06,
"loss": 1.149,
"step": 162
},
{
"epoch": 0.7236403995560489,
"grad_norm": 1.7600526809692383,
"learning_rate": 3.732963624852275e-06,
"loss": 1.136,
"step": 163
},
{
"epoch": 0.7280799112097669,
"grad_norm": 1.912719488143921,
"learning_rate": 3.6213196228034796e-06,
"loss": 1.1052,
"step": 164
},
{
"epoch": 0.732519422863485,
"grad_norm": 1.8316577672958374,
"learning_rate": 3.511000298021098e-06,
"loss": 1.115,
"step": 165
},
{
"epoch": 0.7369589345172031,
"grad_norm": 1.9175002574920654,
"learning_rate": 3.402028560808357e-06,
"loss": 1.1997,
"step": 166
},
{
"epoch": 0.7413984461709212,
"grad_norm": 1.7501415014266968,
"learning_rate": 3.2944270416114256e-06,
"loss": 1.1601,
"step": 167
},
{
"epoch": 0.7458379578246392,
"grad_norm": 1.8293687105178833,
"learning_rate": 3.1882180863197308e-06,
"loss": 1.131,
"step": 168
},
{
"epoch": 0.7502774694783574,
"grad_norm": 1.7824769020080566,
"learning_rate": 3.0834237516252817e-06,
"loss": 1.1655,
"step": 169
},
{
"epoch": 0.7547169811320755,
"grad_norm": 1.672102689743042,
"learning_rate": 2.980065800442137e-06,
"loss": 1.1496,
"step": 170
},
{
"epoch": 0.7591564927857936,
"grad_norm": 1.8063626289367676,
"learning_rate": 2.878165697386812e-06,
"loss": 1.1314,
"step": 171
},
{
"epoch": 0.7635960044395117,
"grad_norm": 1.9365928173065186,
"learning_rate": 2.777744604320706e-06,
"loss": 1.2001,
"step": 172
},
{
"epoch": 0.7680355160932297,
"grad_norm": 2.0963966846466064,
"learning_rate": 2.678823375955314e-06,
"loss": 1.1451,
"step": 173
},
{
"epoch": 0.7724750277469479,
"grad_norm": 1.6925102472305298,
"learning_rate": 2.581422555521316e-06,
"loss": 1.1666,
"step": 174
},
{
"epoch": 0.7769145394006659,
"grad_norm": 1.7280200719833374,
"learning_rate": 2.485562370502279e-06,
"loss": 1.1394,
"step": 175
},
{
"epoch": 0.781354051054384,
"grad_norm": 2.9482946395874023,
"learning_rate": 2.391262728433995e-06,
"loss": 1.1453,
"step": 176
},
{
"epoch": 0.7857935627081021,
"grad_norm": 1.7591135501861572,
"learning_rate": 2.2985432127701945e-06,
"loss": 1.1422,
"step": 177
},
{
"epoch": 0.7902330743618202,
"grad_norm": 1.9076871871948242,
"learning_rate": 2.2074230788156325e-06,
"loss": 1.133,
"step": 178
},
{
"epoch": 0.7946725860155383,
"grad_norm": 1.7988766431808472,
"learning_rate": 2.1179212497272582e-06,
"loss": 1.1535,
"step": 179
},
{
"epoch": 0.7991120976692564,
"grad_norm": 1.9537118673324585,
"learning_rate": 2.030056312584424e-06,
"loss": 1.1871,
"step": 180
},
{
"epoch": 0.8035516093229744,
"grad_norm": 1.6785671710968018,
"learning_rate": 1.9438465145288377e-06,
"loss": 1.0986,
"step": 181
},
{
"epoch": 0.8079911209766926,
"grad_norm": 1.8023606538772583,
"learning_rate": 1.8593097589751318e-06,
"loss": 1.118,
"step": 182
},
{
"epoch": 0.8124306326304107,
"grad_norm": 1.6982240676879883,
"learning_rate": 1.7764636018928249e-06,
"loss": 1.1521,
"step": 183
},
{
"epoch": 0.8168701442841287,
"grad_norm": 1.7196639776229858,
"learning_rate": 1.6953252481604198e-06,
"loss": 1.1243,
"step": 184
},
{
"epoch": 0.8213096559378469,
"grad_norm": 1.6617742776870728,
"learning_rate": 1.6159115479924259e-06,
"loss": 1.1496,
"step": 185
},
{
"epoch": 0.8257491675915649,
"grad_norm": 1.8764078617095947,
"learning_rate": 1.5382389934400199e-06,
"loss": 1.1417,
"step": 186
},
{
"epoch": 0.8301886792452831,
"grad_norm": 1.743654489517212,
"learning_rate": 1.462323714966114e-06,
"loss": 1.1322,
"step": 187
},
{
"epoch": 0.8346281908990011,
"grad_norm": 1.691676139831543,
"learning_rate": 1.3881814780954916e-06,
"loss": 1.104,
"step": 188
},
{
"epoch": 0.8390677025527192,
"grad_norm": 1.6910086870193481,
"learning_rate": 1.3158276801407432e-06,
"loss": 1.181,
"step": 189
},
{
"epoch": 0.8435072142064373,
"grad_norm": 1.7811787128448486,
"learning_rate": 1.2452773470046543e-06,
"loss": 1.2015,
"step": 190
},
{
"epoch": 0.8479467258601554,
"grad_norm": 1.8758302927017212,
"learning_rate": 1.1765451300597574e-06,
"loss": 1.1697,
"step": 191
},
{
"epoch": 0.8523862375138734,
"grad_norm": 1.7708605527877808,
"learning_rate": 1.1096453031056265e-06,
"loss": 1.1007,
"step": 192
},
{
"epoch": 0.8568257491675916,
"grad_norm": 1.7193588018417358,
"learning_rate": 1.0445917594046073e-06,
"loss": 1.1135,
"step": 193
},
{
"epoch": 0.8612652608213096,
"grad_norm": 1.6113449335098267,
"learning_rate": 9.813980087965625e-07,
"loss": 1.1501,
"step": 194
},
{
"epoch": 0.8657047724750278,
"grad_norm": 1.726749062538147,
"learning_rate": 9.200771748932513e-07,
"loss": 1.0861,
"step": 195
},
{
"epoch": 0.8701442841287459,
"grad_norm": 1.7274186611175537,
"learning_rate": 8.606419923529175e-07,
"loss": 1.1256,
"step": 196
},
{
"epoch": 0.8745837957824639,
"grad_norm": 1.7909616231918335,
"learning_rate": 8.031048042356393e-07,
"loss": 1.1844,
"step": 197
},
{
"epoch": 0.8790233074361821,
"grad_norm": 1.769081473350525,
"learning_rate": 7.474775594400252e-07,
"loss": 1.1142,
"step": 198
},
{
"epoch": 0.8834628190899001,
"grad_norm": 1.5884616374969482,
"learning_rate": 6.937718102217461e-07,
"loss": 1.1735,
"step": 199
},
{
"epoch": 0.8879023307436182,
"grad_norm": 1.8353629112243652,
"learning_rate": 6.41998709794458e-07,
"loss": 1.1656,
"step": 200
},
{
"epoch": 0.8923418423973363,
"grad_norm": 1.836683988571167,
"learning_rate": 5.921690100135713e-07,
"loss": 1.1665,
"step": 201
},
{
"epoch": 0.8967813540510544,
"grad_norm": 1.7683050632476807,
"learning_rate": 5.442930591433992e-07,
"loss": 1.0818,
"step": 202
},
{
"epoch": 0.9012208657047724,
"grad_norm": 1.7694001197814941,
"learning_rate": 4.983807997080925e-07,
"loss": 1.1625,
"step": 203
},
{
"epoch": 0.9056603773584906,
"grad_norm": 1.639148473739624,
"learning_rate": 4.544417664268652e-07,
"loss": 1.1001,
"step": 204
},
{
"epoch": 0.9100998890122086,
"grad_norm": 1.682405710220337,
"learning_rate": 4.124850842338779e-07,
"loss": 1.1392,
"step": 205
},
{
"epoch": 0.9145394006659268,
"grad_norm": 1.705399751663208,
"learning_rate": 3.725194663832521e-07,
"loss": 1.1308,
"step": 206
},
{
"epoch": 0.9189789123196448,
"grad_norm": 1.77009117603302,
"learning_rate": 3.345532126395579e-07,
"loss": 1.1334,
"step": 207
},
{
"epoch": 0.9234184239733629,
"grad_norm": 1.6911001205444336,
"learning_rate": 2.985942075541848e-07,
"loss": 1.1135,
"step": 208
},
{
"epoch": 0.9278579356270811,
"grad_norm": 1.6834074258804321,
"learning_rate": 2.646499188279328e-07,
"loss": 1.1604,
"step": 209
},
{
"epoch": 0.9322974472807991,
"grad_norm": 1.6770280599594116,
"learning_rate": 2.3272739576017945e-07,
"loss": 1.1296,
"step": 210
},
{
"epoch": 0.9367369589345172,
"grad_norm": 1.8066668510437012,
"learning_rate": 2.028332677849254e-07,
"loss": 1.1261,
"step": 211
},
{
"epoch": 0.9411764705882353,
"grad_norm": 1.8163933753967285,
"learning_rate": 1.7497374309405346e-07,
"loss": 1.174,
"step": 212
},
{
"epoch": 0.9456159822419534,
"grad_norm": 1.6583095788955688,
"learning_rate": 1.49154607348051e-07,
"loss": 1.1626,
"step": 213
},
{
"epoch": 0.9500554938956715,
"grad_norm": 1.7477093935012817,
"learning_rate": 1.2538122247448325e-07,
"loss": 1.1512,
"step": 214
},
{
"epoch": 0.9544950055493896,
"grad_norm": 1.6833035945892334,
"learning_rate": 1.0365852555447642e-07,
"loss": 1.116,
"step": 215
},
{
"epoch": 0.9589345172031076,
"grad_norm": 1.6046544313430786,
"learning_rate": 8.39910277974132e-08,
"loss": 1.1263,
"step": 216
},
{
"epoch": 0.9633740288568258,
"grad_norm": 1.6821755170822144,
"learning_rate": 6.638281360408339e-08,
"loss": 1.1357,
"step": 217
},
{
"epoch": 0.9678135405105438,
"grad_norm": 1.7433803081512451,
"learning_rate": 5.083753971846239e-08,
"loss": 1.0744,
"step": 218
},
{
"epoch": 0.9722530521642619,
"grad_norm": 1.7408146858215332,
"learning_rate": 3.735843446830867e-08,
"loss": 1.1356,
"step": 219
},
{
"epoch": 0.97669256381798,
"grad_norm": 1.7871052026748657,
"learning_rate": 2.5948297094724463e-08,
"loss": 1.1227,
"step": 220
},
{
"epoch": 0.9811320754716981,
"grad_norm": 1.5904314517974854,
"learning_rate": 1.6609497170834154e-08,
"loss": 1.0993,
"step": 221
},
{
"epoch": 0.9855715871254163,
"grad_norm": 1.8538672924041748,
"learning_rate": 9.343974109685684e-09,
"loss": 1.1632,
"step": 222
},
{
"epoch": 0.9900110987791343,
"grad_norm": 1.6794443130493164,
"learning_rate": 4.153236761488266e-09,
"loss": 1.0994,
"step": 223
},
{
"epoch": 0.9944506104328524,
"grad_norm": 1.6343417167663574,
"learning_rate": 1.0383631002686133e-09,
"loss": 1.1072,
"step": 224
},
{
"epoch": 0.9988901220865705,
"grad_norm": 1.9944053888320923,
"learning_rate": 0.0,
"loss": 1.1393,
"step": 225
},
{
"epoch": 0.9988901220865705,
"step": 225,
"total_flos": 8.515440547840655e+17,
"train_loss": 1.1814873923195732,
"train_runtime": 2241.1241,
"train_samples_per_second": 25.717,
"train_steps_per_second": 0.1
}
],
"logging_steps": 1.0,
"max_steps": 225,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.515440547840655e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}