mateoguaman's picture
Upload folder using huggingface_hub
3ea3e46 verified
{
"best_metric": 2.6793947219848633,
"best_model_checkpoint": "data/paligemma2-3b-pt-224-sft-lora-magicsoup_no_cfiphone_no_insta_sub5/checkpoint-3580",
"epoch": 1.0,
"eval_steps": 1790,
"global_step": 7159,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0001396843134515994,
"grad_norm": 11.714253425598145,
"learning_rate": 1.3966480446927375e-07,
"loss": 15.8723,
"mean_token_accuracy": 0.051948051899671555,
"step": 1
},
{
"epoch": 0.0027936862690319877,
"grad_norm": 11.453227996826172,
"learning_rate": 2.7932960893854746e-06,
"loss": 15.6868,
"mean_token_accuracy": 0.04536910433518259,
"step": 20
},
{
"epoch": 0.0055873725380639755,
"grad_norm": 14.288809776306152,
"learning_rate": 5.586592178770949e-06,
"loss": 15.2233,
"mean_token_accuracy": 0.04797077886760235,
"step": 40
},
{
"epoch": 0.008381058807095963,
"grad_norm": 15.974210739135742,
"learning_rate": 8.379888268156424e-06,
"loss": 12.5816,
"mean_token_accuracy": 0.06761363632977009,
"step": 60
},
{
"epoch": 0.011174745076127951,
"grad_norm": 8.255433082580566,
"learning_rate": 1.1173184357541899e-05,
"loss": 8.1704,
"mean_token_accuracy": 0.019724026112817226,
"step": 80
},
{
"epoch": 0.013968431345159939,
"grad_norm": 4.060810565948486,
"learning_rate": 1.3966480446927374e-05,
"loss": 5.8302,
"mean_token_accuracy": 0.0771915590390563,
"step": 100
},
{
"epoch": 0.016762117614191926,
"grad_norm": 5.3958940505981445,
"learning_rate": 1.675977653631285e-05,
"loss": 5.1183,
"mean_token_accuracy": 0.09870129823684692,
"step": 120
},
{
"epoch": 0.019555803883223914,
"grad_norm": 6.21713924407959,
"learning_rate": 1.9553072625698323e-05,
"loss": 4.6938,
"mean_token_accuracy": 0.11225649379193783,
"step": 140
},
{
"epoch": 0.022349490152255902,
"grad_norm": 10.742652893066406,
"learning_rate": 2.2346368715083797e-05,
"loss": 4.375,
"mean_token_accuracy": 0.12938311770558358,
"step": 160
},
{
"epoch": 0.02514317642128789,
"grad_norm": 14.718498229980469,
"learning_rate": 2.5139664804469275e-05,
"loss": 4.1989,
"mean_token_accuracy": 0.16461038812994958,
"step": 180
},
{
"epoch": 0.027936862690319877,
"grad_norm": 10.117100715637207,
"learning_rate": 2.793296089385475e-05,
"loss": 3.8056,
"mean_token_accuracy": 0.2379870109260082,
"step": 200
},
{
"epoch": 0.030730548959351865,
"grad_norm": 6.335511207580566,
"learning_rate": 3.0726256983240227e-05,
"loss": 3.5194,
"mean_token_accuracy": 0.2828733794391155,
"step": 220
},
{
"epoch": 0.03352423522838385,
"grad_norm": 8.714235305786133,
"learning_rate": 3.35195530726257e-05,
"loss": 3.4042,
"mean_token_accuracy": 0.2905844166874886,
"step": 240
},
{
"epoch": 0.03631792149741584,
"grad_norm": 28.925806045532227,
"learning_rate": 3.6312849162011175e-05,
"loss": 3.3014,
"mean_token_accuracy": 0.30211039036512377,
"step": 260
},
{
"epoch": 0.03911160776644783,
"grad_norm": 7.385269641876221,
"learning_rate": 3.9106145251396646e-05,
"loss": 3.2824,
"mean_token_accuracy": 0.2980519443750381,
"step": 280
},
{
"epoch": 0.041905294035479816,
"grad_norm": 26.001882553100586,
"learning_rate": 4.1899441340782123e-05,
"loss": 3.2237,
"mean_token_accuracy": 0.3021103873848915,
"step": 300
},
{
"epoch": 0.044698980304511804,
"grad_norm": 6.193004131317139,
"learning_rate": 4.4692737430167594e-05,
"loss": 3.1911,
"mean_token_accuracy": 0.3050324633717537,
"step": 320
},
{
"epoch": 0.04749266657354379,
"grad_norm": 10.755631446838379,
"learning_rate": 4.748603351955307e-05,
"loss": 3.1985,
"mean_token_accuracy": 0.30081168562173843,
"step": 340
},
{
"epoch": 0.05028635284257578,
"grad_norm": 13.610740661621094,
"learning_rate": 5.027932960893855e-05,
"loss": 3.1074,
"mean_token_accuracy": 0.31006493270397184,
"step": 360
},
{
"epoch": 0.05308003911160777,
"grad_norm": 7.772305011749268,
"learning_rate": 5.307262569832403e-05,
"loss": 3.0981,
"mean_token_accuracy": 0.3107142850756645,
"step": 380
},
{
"epoch": 0.055873725380639755,
"grad_norm": 8.072247505187988,
"learning_rate": 5.58659217877095e-05,
"loss": 3.0943,
"mean_token_accuracy": 0.31095779240131377,
"step": 400
},
{
"epoch": 0.05866741164967174,
"grad_norm": 7.870853900909424,
"learning_rate": 5.8659217877094976e-05,
"loss": 3.0422,
"mean_token_accuracy": 0.31168831288814547,
"step": 420
},
{
"epoch": 0.06146109791870373,
"grad_norm": 19.650827407836914,
"learning_rate": 6.145251396648045e-05,
"loss": 3.085,
"mean_token_accuracy": 0.31087662279605865,
"step": 440
},
{
"epoch": 0.06425478418773571,
"grad_norm": 17.583972930908203,
"learning_rate": 6.424581005586592e-05,
"loss": 3.0207,
"mean_token_accuracy": 0.3114448055624962,
"step": 460
},
{
"epoch": 0.0670484704567677,
"grad_norm": 8.581819534301758,
"learning_rate": 6.70391061452514e-05,
"loss": 3.0396,
"mean_token_accuracy": 0.31160714030265807,
"step": 480
},
{
"epoch": 0.06984215672579969,
"grad_norm": 10.295416831970215,
"learning_rate": 6.983240223463688e-05,
"loss": 3.0311,
"mean_token_accuracy": 0.3094155818223953,
"step": 500
},
{
"epoch": 0.07263584299483168,
"grad_norm": 11.54842472076416,
"learning_rate": 7.262569832402235e-05,
"loss": 2.9806,
"mean_token_accuracy": 0.3171266242861748,
"step": 520
},
{
"epoch": 0.07542952926386366,
"grad_norm": 6.4206743240356445,
"learning_rate": 7.541899441340783e-05,
"loss": 3.0046,
"mean_token_accuracy": 0.3163961052894592,
"step": 540
},
{
"epoch": 0.07822321553289566,
"grad_norm": 6.957503795623779,
"learning_rate": 7.821229050279329e-05,
"loss": 2.9919,
"mean_token_accuracy": 0.31185064762830733,
"step": 560
},
{
"epoch": 0.08101690180192764,
"grad_norm": 4.134042263031006,
"learning_rate": 8.100558659217878e-05,
"loss": 3.0287,
"mean_token_accuracy": 0.3153409093618393,
"step": 580
},
{
"epoch": 0.08381058807095963,
"grad_norm": 5.571300029754639,
"learning_rate": 8.379888268156425e-05,
"loss": 2.9836,
"mean_token_accuracy": 0.31599026173353195,
"step": 600
},
{
"epoch": 0.08660427433999161,
"grad_norm": 5.823517799377441,
"learning_rate": 8.659217877094973e-05,
"loss": 2.9693,
"mean_token_accuracy": 0.3173701331019402,
"step": 620
},
{
"epoch": 0.08939796060902361,
"grad_norm": 5.942770004272461,
"learning_rate": 8.938547486033519e-05,
"loss": 2.957,
"mean_token_accuracy": 0.32094155699014665,
"step": 640
},
{
"epoch": 0.09219164687805559,
"grad_norm": 5.494757175445557,
"learning_rate": 9.217877094972067e-05,
"loss": 2.9391,
"mean_token_accuracy": 0.3202922075986862,
"step": 660
},
{
"epoch": 0.09498533314708758,
"grad_norm": 3.7874789237976074,
"learning_rate": 9.497206703910614e-05,
"loss": 2.9662,
"mean_token_accuracy": 0.318506495654583,
"step": 680
},
{
"epoch": 0.09777901941611956,
"grad_norm": 4.3834733963012695,
"learning_rate": 9.776536312849163e-05,
"loss": 2.9742,
"mean_token_accuracy": 0.315259738266468,
"step": 700
},
{
"epoch": 0.10057270568515156,
"grad_norm": 3.8033530712127686,
"learning_rate": 9.999990489938263e-05,
"loss": 2.9255,
"mean_token_accuracy": 0.3217532455921173,
"step": 720
},
{
"epoch": 0.10336639195418354,
"grad_norm": 5.953765869140625,
"learning_rate": 9.99965764157593e-05,
"loss": 2.8773,
"mean_token_accuracy": 0.32402597516775133,
"step": 740
},
{
"epoch": 0.10616007822321553,
"grad_norm": 8.89194107055664,
"learning_rate": 9.998849326302563e-05,
"loss": 2.9062,
"mean_token_accuracy": 0.3172889605164528,
"step": 760
},
{
"epoch": 0.10895376449224752,
"grad_norm": 4.883932113647461,
"learning_rate": 9.997565620988856e-05,
"loss": 2.889,
"mean_token_accuracy": 0.324594159424305,
"step": 780
},
{
"epoch": 0.11174745076127951,
"grad_norm": 6.9450297355651855,
"learning_rate": 9.995806647715047e-05,
"loss": 2.902,
"mean_token_accuracy": 0.319155840575695,
"step": 800
},
{
"epoch": 0.11454113703031149,
"grad_norm": 3.826904296875,
"learning_rate": 9.99357257375931e-05,
"loss": 2.845,
"mean_token_accuracy": 0.32288961112499237,
"step": 820
},
{
"epoch": 0.11733482329934349,
"grad_norm": 3.4276959896087646,
"learning_rate": 9.99086361158184e-05,
"loss": 2.8776,
"mean_token_accuracy": 0.31899350732564924,
"step": 840
},
{
"epoch": 0.12012850956837547,
"grad_norm": 4.847935199737549,
"learning_rate": 9.987680018804652e-05,
"loss": 2.8714,
"mean_token_accuracy": 0.32126623690128325,
"step": 860
},
{
"epoch": 0.12292219583740746,
"grad_norm": 3.4353716373443604,
"learning_rate": 9.984022098187083e-05,
"loss": 2.8408,
"mean_token_accuracy": 0.3215097412467003,
"step": 880
},
{
"epoch": 0.12571588210643944,
"grad_norm": 4.255288600921631,
"learning_rate": 9.979890197596993e-05,
"loss": 2.887,
"mean_token_accuracy": 0.3237824708223343,
"step": 900
},
{
"epoch": 0.12850956837547142,
"grad_norm": 4.189087867736816,
"learning_rate": 9.97528470997769e-05,
"loss": 2.8593,
"mean_token_accuracy": 0.324756495654583,
"step": 920
},
{
"epoch": 0.13130325464450343,
"grad_norm": 2.641284227371216,
"learning_rate": 9.97020607331056e-05,
"loss": 2.8606,
"mean_token_accuracy": 0.32142857313156126,
"step": 940
},
{
"epoch": 0.1340969409135354,
"grad_norm": 2.688258171081543,
"learning_rate": 9.964654770573408e-05,
"loss": 2.8676,
"mean_token_accuracy": 0.32646103799343107,
"step": 960
},
{
"epoch": 0.1368906271825674,
"grad_norm": 3.801523208618164,
"learning_rate": 9.958631329694537e-05,
"loss": 2.8623,
"mean_token_accuracy": 0.3244318202137947,
"step": 980
},
{
"epoch": 0.13968431345159937,
"grad_norm": 3.528196334838867,
"learning_rate": 9.952136323502536e-05,
"loss": 2.865,
"mean_token_accuracy": 0.32459415644407275,
"step": 1000
},
{
"epoch": 0.14247799972063138,
"grad_norm": 3.970863103866577,
"learning_rate": 9.945170369671802e-05,
"loss": 2.8762,
"mean_token_accuracy": 0.32102272659540176,
"step": 1020
},
{
"epoch": 0.14527168598966336,
"grad_norm": 4.364096641540527,
"learning_rate": 9.937734130663807e-05,
"loss": 2.8521,
"mean_token_accuracy": 0.321185065805912,
"step": 1040
},
{
"epoch": 0.14806537225869534,
"grad_norm": 2.6619839668273926,
"learning_rate": 9.92982831366409e-05,
"loss": 2.8557,
"mean_token_accuracy": 0.32167208194732666,
"step": 1060
},
{
"epoch": 0.15085905852772732,
"grad_norm": 2.8934223651885986,
"learning_rate": 9.921453670515009e-05,
"loss": 2.8138,
"mean_token_accuracy": 0.3263798728585243,
"step": 1080
},
{
"epoch": 0.15365274479675933,
"grad_norm": 3.3745875358581543,
"learning_rate": 9.91261099764424e-05,
"loss": 2.8387,
"mean_token_accuracy": 0.3244318187236786,
"step": 1100
},
{
"epoch": 0.1564464310657913,
"grad_norm": 3.961806297302246,
"learning_rate": 9.903301135989032e-05,
"loss": 2.833,
"mean_token_accuracy": 0.324918831884861,
"step": 1120
},
{
"epoch": 0.1592401173348233,
"grad_norm": 2.1680517196655273,
"learning_rate": 9.893524970916242e-05,
"loss": 2.8295,
"mean_token_accuracy": 0.3221590921282768,
"step": 1140
},
{
"epoch": 0.16203380360385528,
"grad_norm": 2.9997398853302,
"learning_rate": 9.883283432138129e-05,
"loss": 2.8284,
"mean_token_accuracy": 0.32670454382896424,
"step": 1160
},
{
"epoch": 0.16482748987288728,
"grad_norm": 6.048096656799316,
"learning_rate": 9.872577493623945e-05,
"loss": 2.814,
"mean_token_accuracy": 0.3250811696052551,
"step": 1180
},
{
"epoch": 0.16762117614191926,
"grad_norm": 2.815396547317505,
"learning_rate": 9.861408173507304e-05,
"loss": 2.8529,
"mean_token_accuracy": 0.32467532753944395,
"step": 1200
},
{
"epoch": 0.17041486241095125,
"grad_norm": 4.112369537353516,
"learning_rate": 9.849776533989369e-05,
"loss": 2.7942,
"mean_token_accuracy": 0.327272729575634,
"step": 1220
},
{
"epoch": 0.17320854867998323,
"grad_norm": 2.6627895832061768,
"learning_rate": 9.837683681237819e-05,
"loss": 2.8087,
"mean_token_accuracy": 0.3275974065065384,
"step": 1240
},
{
"epoch": 0.17600223494901523,
"grad_norm": 3.526514768600464,
"learning_rate": 9.825130765281668e-05,
"loss": 2.8224,
"mean_token_accuracy": 0.32508117258548735,
"step": 1260
},
{
"epoch": 0.17879592121804722,
"grad_norm": 3.236506938934326,
"learning_rate": 9.812118979901891e-05,
"loss": 2.8373,
"mean_token_accuracy": 0.32394480556249616,
"step": 1280
},
{
"epoch": 0.1815896074870792,
"grad_norm": 3.168816566467285,
"learning_rate": 9.7986495625179e-05,
"loss": 2.8059,
"mean_token_accuracy": 0.3279220789670944,
"step": 1300
},
{
"epoch": 0.18438329375611118,
"grad_norm": 4.702609062194824,
"learning_rate": 9.784723794069852e-05,
"loss": 2.7648,
"mean_token_accuracy": 0.33125000149011613,
"step": 1320
},
{
"epoch": 0.18717698002514319,
"grad_norm": 3.00978684425354,
"learning_rate": 9.770342998896851e-05,
"loss": 2.8123,
"mean_token_accuracy": 0.32670454680919647,
"step": 1340
},
{
"epoch": 0.18997066629417517,
"grad_norm": 5.219450950622559,
"learning_rate": 9.755508544610994e-05,
"loss": 2.7966,
"mean_token_accuracy": 0.3340097412467003,
"step": 1360
},
{
"epoch": 0.19276435256320715,
"grad_norm": 2.8464038372039795,
"learning_rate": 9.740221841967307e-05,
"loss": 2.7738,
"mean_token_accuracy": 0.32849026173353196,
"step": 1380
},
{
"epoch": 0.19555803883223913,
"grad_norm": 3.846038341522217,
"learning_rate": 9.72448434472959e-05,
"loss": 2.7938,
"mean_token_accuracy": 0.3283279240131378,
"step": 1400
},
{
"epoch": 0.19835172510127114,
"grad_norm": 2.432882070541382,
"learning_rate": 9.708297549532157e-05,
"loss": 2.7805,
"mean_token_accuracy": 0.329788963496685,
"step": 1420
},
{
"epoch": 0.20114541137030312,
"grad_norm": 2.405036687850952,
"learning_rate": 9.691662995737516e-05,
"loss": 2.7497,
"mean_token_accuracy": 0.327353897690773,
"step": 1440
},
{
"epoch": 0.2039390976393351,
"grad_norm": 2.1323699951171875,
"learning_rate": 9.674582265289967e-05,
"loss": 2.7859,
"mean_token_accuracy": 0.3321428596973419,
"step": 1460
},
{
"epoch": 0.20673278390836708,
"grad_norm": 4.102099418640137,
"learning_rate": 9.657056982565161e-05,
"loss": 2.8042,
"mean_token_accuracy": 0.3272727280855179,
"step": 1480
},
{
"epoch": 0.2095264701773991,
"grad_norm": 3.1227469444274902,
"learning_rate": 9.639088814215627e-05,
"loss": 2.774,
"mean_token_accuracy": 0.32987013161182405,
"step": 1500
},
{
"epoch": 0.21232015644643107,
"grad_norm": 2.4661705493927,
"learning_rate": 9.620679469012266e-05,
"loss": 2.7377,
"mean_token_accuracy": 0.3315746784210205,
"step": 1520
},
{
"epoch": 0.21511384271546305,
"grad_norm": 3.3351097106933594,
"learning_rate": 9.601830697681853e-05,
"loss": 2.7786,
"mean_token_accuracy": 0.3308441549539566,
"step": 1540
},
{
"epoch": 0.21790752898449503,
"grad_norm": 2.9032342433929443,
"learning_rate": 9.582544292740542e-05,
"loss": 2.7485,
"mean_token_accuracy": 0.33076298981904984,
"step": 1560
},
{
"epoch": 0.22070121525352704,
"grad_norm": 2.4244565963745117,
"learning_rate": 9.562822088323396e-05,
"loss": 2.765,
"mean_token_accuracy": 0.3307629868388176,
"step": 1580
},
{
"epoch": 0.22349490152255902,
"grad_norm": 2.1915383338928223,
"learning_rate": 9.542665960009959e-05,
"loss": 2.7455,
"mean_token_accuracy": 0.33303571343421934,
"step": 1600
},
{
"epoch": 0.226288587791591,
"grad_norm": 4.944774150848389,
"learning_rate": 9.522077824645896e-05,
"loss": 2.7786,
"mean_token_accuracy": 0.3247564971446991,
"step": 1620
},
{
"epoch": 0.22908227406062298,
"grad_norm": 3.094341993331909,
"learning_rate": 9.501059640160696e-05,
"loss": 2.7678,
"mean_token_accuracy": 0.32930195033550264,
"step": 1640
},
{
"epoch": 0.231875960329655,
"grad_norm": 3.0549888610839844,
"learning_rate": 9.479613405381474e-05,
"loss": 2.7392,
"mean_token_accuracy": 0.33157467693090437,
"step": 1660
},
{
"epoch": 0.23466964659868697,
"grad_norm": 2.4436118602752686,
"learning_rate": 9.457741159842875e-05,
"loss": 2.7619,
"mean_token_accuracy": 0.32897727340459826,
"step": 1680
},
{
"epoch": 0.23746333286771895,
"grad_norm": 2.174079179763794,
"learning_rate": 9.435444983593133e-05,
"loss": 2.7419,
"mean_token_accuracy": 0.33214285373687746,
"step": 1700
},
{
"epoch": 0.24025701913675093,
"grad_norm": 2.1228203773498535,
"learning_rate": 9.412726996996242e-05,
"loss": 2.711,
"mean_token_accuracy": 0.3351461037993431,
"step": 1720
},
{
"epoch": 0.24305070540578294,
"grad_norm": 2.5232136249542236,
"learning_rate": 9.389589360530315e-05,
"loss": 2.7503,
"mean_token_accuracy": 0.33506493717432023,
"step": 1740
},
{
"epoch": 0.24584439167481492,
"grad_norm": 2.0878124237060547,
"learning_rate": 9.366034274582125e-05,
"loss": 2.7578,
"mean_token_accuracy": 0.33417208194732667,
"step": 1760
},
{
"epoch": 0.2486380779438469,
"grad_norm": 2.1424570083618164,
"learning_rate": 9.342063979237846e-05,
"loss": 2.7409,
"mean_token_accuracy": 0.3298701301217079,
"step": 1780
},
{
"epoch": 0.2500349210783629,
"eval_loss": 2.7205357551574707,
"eval_mean_token_accuracy": 0.33310290950003457,
"eval_runtime": 1163.4728,
"eval_samples_per_second": 84.514,
"eval_steps_per_second": 1.509,
"step": 1790
},
{
"epoch": 0.2514317642128789,
"grad_norm": 2.787091016769409,
"learning_rate": 9.317680754070017e-05,
"loss": 2.7103,
"mean_token_accuracy": 0.3347402632236481,
"step": 1800
},
{
"epoch": 0.25422545048191086,
"grad_norm": 2.668544292449951,
"learning_rate": 9.29288691792077e-05,
"loss": 2.7083,
"mean_token_accuracy": 0.3347402587532997,
"step": 1820
},
{
"epoch": 0.25701913675094284,
"grad_norm": 1.573211669921875,
"learning_rate": 9.267684828681286e-05,
"loss": 2.7137,
"mean_token_accuracy": 0.33206168860197066,
"step": 1840
},
{
"epoch": 0.2598128230199749,
"grad_norm": 2.3701252937316895,
"learning_rate": 9.242076883067579e-05,
"loss": 2.7062,
"mean_token_accuracy": 0.3372564911842346,
"step": 1860
},
{
"epoch": 0.26260650928900686,
"grad_norm": 2.19620418548584,
"learning_rate": 9.216065516392555e-05,
"loss": 2.7239,
"mean_token_accuracy": 0.33417207896709444,
"step": 1880
},
{
"epoch": 0.26540019555803884,
"grad_norm": 2.424910306930542,
"learning_rate": 9.18965320233443e-05,
"loss": 2.7034,
"mean_token_accuracy": 0.33741882890462876,
"step": 1900
},
{
"epoch": 0.2681938818270708,
"grad_norm": 1.7149094343185425,
"learning_rate": 9.162842452701463e-05,
"loss": 2.7212,
"mean_token_accuracy": 0.3329545482993126,
"step": 1920
},
{
"epoch": 0.2709875680961028,
"grad_norm": 2.663922071456909,
"learning_rate": 9.1356358171931e-05,
"loss": 2.7237,
"mean_token_accuracy": 0.33019480258226397,
"step": 1940
},
{
"epoch": 0.2737812543651348,
"grad_norm": 2.727353811264038,
"learning_rate": 9.10803588315749e-05,
"loss": 2.7267,
"mean_token_accuracy": 0.33165584653615954,
"step": 1960
},
{
"epoch": 0.27657494063416677,
"grad_norm": 2.4968833923339844,
"learning_rate": 9.080045275345429e-05,
"loss": 2.7363,
"mean_token_accuracy": 0.33368506729602815,
"step": 1980
},
{
"epoch": 0.27936862690319875,
"grad_norm": 2.447314977645874,
"learning_rate": 9.051666655660752e-05,
"loss": 2.691,
"mean_token_accuracy": 0.33392857313156127,
"step": 2000
},
{
"epoch": 0.2821623131722308,
"grad_norm": 3.4394643306732178,
"learning_rate": 9.022902722907173e-05,
"loss": 2.7273,
"mean_token_accuracy": 0.33084415793418886,
"step": 2020
},
{
"epoch": 0.28495599944126276,
"grad_norm": 1.788519024848938,
"learning_rate": 8.99375621253165e-05,
"loss": 2.7005,
"mean_token_accuracy": 0.3343344137072563,
"step": 2040
},
{
"epoch": 0.28774968571029474,
"grad_norm": 2.173936367034912,
"learning_rate": 8.964229896364223e-05,
"loss": 2.6749,
"mean_token_accuracy": 0.33271104097366333,
"step": 2060
},
{
"epoch": 0.2905433719793267,
"grad_norm": 2.7031307220458984,
"learning_rate": 8.934326582354426e-05,
"loss": 2.7266,
"mean_token_accuracy": 0.3330357104539871,
"step": 2080
},
{
"epoch": 0.2933370582483587,
"grad_norm": 1.726846694946289,
"learning_rate": 8.904049114304247e-05,
"loss": 2.7049,
"mean_token_accuracy": 0.33628246933221817,
"step": 2100
},
{
"epoch": 0.2961307445173907,
"grad_norm": 2.3227744102478027,
"learning_rate": 8.873400371597685e-05,
"loss": 2.7019,
"mean_token_accuracy": 0.3343344181776047,
"step": 2120
},
{
"epoch": 0.29892443078642267,
"grad_norm": 2.884831190109253,
"learning_rate": 8.842383268926917e-05,
"loss": 2.7492,
"mean_token_accuracy": 0.3277597427368164,
"step": 2140
},
{
"epoch": 0.30171811705545465,
"grad_norm": 2.0531013011932373,
"learning_rate": 8.811000756015115e-05,
"loss": 2.7191,
"mean_token_accuracy": 0.3326298698782921,
"step": 2160
},
{
"epoch": 0.3045118033244867,
"grad_norm": 2.0113601684570312,
"learning_rate": 8.779255817335927e-05,
"loss": 2.6986,
"mean_token_accuracy": 0.3370129883289337,
"step": 2180
},
{
"epoch": 0.30730548959351867,
"grad_norm": 2.0892691612243652,
"learning_rate": 8.74715147182965e-05,
"loss": 2.7063,
"mean_token_accuracy": 0.3331980526447296,
"step": 2200
},
{
"epoch": 0.31009917586255065,
"grad_norm": 1.9228285551071167,
"learning_rate": 8.714690772616134e-05,
"loss": 2.6696,
"mean_token_accuracy": 0.3384740263223648,
"step": 2220
},
{
"epoch": 0.3128928621315826,
"grad_norm": 2.699789524078369,
"learning_rate": 8.681876806704431e-05,
"loss": 2.6917,
"mean_token_accuracy": 0.33693181574344633,
"step": 2240
},
{
"epoch": 0.3156865484006146,
"grad_norm": 2.43989896774292,
"learning_rate": 8.648712694699214e-05,
"loss": 2.6816,
"mean_token_accuracy": 0.3405032455921173,
"step": 2260
},
{
"epoch": 0.3184802346696466,
"grad_norm": 1.8896594047546387,
"learning_rate": 8.615201590504017e-05,
"loss": 2.6781,
"mean_token_accuracy": 0.33904220908880234,
"step": 2280
},
{
"epoch": 0.32127392093867857,
"grad_norm": 2.3167271614074707,
"learning_rate": 8.58134668102129e-05,
"loss": 2.6755,
"mean_token_accuracy": 0.33563312143087387,
"step": 2300
},
{
"epoch": 0.32406760720771055,
"grad_norm": 2.3568389415740967,
"learning_rate": 8.547151185849332e-05,
"loss": 2.6837,
"mean_token_accuracy": 0.3358766257762909,
"step": 2320
},
{
"epoch": 0.3268612934767426,
"grad_norm": 2.21427845954895,
"learning_rate": 8.512618356976103e-05,
"loss": 2.6701,
"mean_token_accuracy": 0.3395292177796364,
"step": 2340
},
{
"epoch": 0.32965497974577457,
"grad_norm": 2.3978805541992188,
"learning_rate": 8.477751478469964e-05,
"loss": 2.7054,
"mean_token_accuracy": 0.3341720774769783,
"step": 2360
},
{
"epoch": 0.33244866601480655,
"grad_norm": 2.3003275394439697,
"learning_rate": 8.442553866167362e-05,
"loss": 2.6831,
"mean_token_accuracy": 0.3353896141052246,
"step": 2380
},
{
"epoch": 0.33524235228383853,
"grad_norm": 2.208958625793457,
"learning_rate": 8.40702886735749e-05,
"loss": 2.7123,
"mean_token_accuracy": 0.33327922224998474,
"step": 2400
},
{
"epoch": 0.3380360385528705,
"grad_norm": 2.9336678981781006,
"learning_rate": 8.371179860463962e-05,
"loss": 2.6517,
"mean_token_accuracy": 0.33806818127632143,
"step": 2420
},
{
"epoch": 0.3408297248219025,
"grad_norm": 2.476029634475708,
"learning_rate": 8.335010254723532e-05,
"loss": 2.6725,
"mean_token_accuracy": 0.3363636344671249,
"step": 2440
},
{
"epoch": 0.34362341109093447,
"grad_norm": 2.220250129699707,
"learning_rate": 8.298523489861864e-05,
"loss": 2.6884,
"mean_token_accuracy": 0.3369318187236786,
"step": 2460
},
{
"epoch": 0.34641709735996645,
"grad_norm": 2.475471258163452,
"learning_rate": 8.261723035766424e-05,
"loss": 2.6657,
"mean_token_accuracy": 0.3395292192697525,
"step": 2480
},
{
"epoch": 0.3492107836289985,
"grad_norm": 2.1862926483154297,
"learning_rate": 8.224612392156492e-05,
"loss": 2.6489,
"mean_token_accuracy": 0.33928571343421937,
"step": 2500
},
{
"epoch": 0.35200446989803047,
"grad_norm": 3.5017824172973633,
"learning_rate": 8.187195088250334e-05,
"loss": 2.6563,
"mean_token_accuracy": 0.3419642835855484,
"step": 2520
},
{
"epoch": 0.35479815616706245,
"grad_norm": 2.0507349967956543,
"learning_rate": 8.149474682429581e-05,
"loss": 2.6305,
"mean_token_accuracy": 0.3375,
"step": 2540
},
{
"epoch": 0.35759184243609443,
"grad_norm": 2.558877468109131,
"learning_rate": 8.111454761900823e-05,
"loss": 2.6551,
"mean_token_accuracy": 0.3370129868388176,
"step": 2560
},
{
"epoch": 0.3603855287051264,
"grad_norm": 2.8872764110565186,
"learning_rate": 8.073138942354468e-05,
"loss": 2.6755,
"mean_token_accuracy": 0.3384740278124809,
"step": 2580
},
{
"epoch": 0.3631792149741584,
"grad_norm": 2.2422327995300293,
"learning_rate": 8.034530867620884e-05,
"loss": 2.6338,
"mean_token_accuracy": 0.34042208045721056,
"step": 2600
},
{
"epoch": 0.3659729012431904,
"grad_norm": 3.2413196563720703,
"learning_rate": 7.995634209323886e-05,
"loss": 2.659,
"mean_token_accuracy": 0.33839286118745804,
"step": 2620
},
{
"epoch": 0.36876658751222235,
"grad_norm": 1.9311487674713135,
"learning_rate": 7.956452666531543e-05,
"loss": 2.6735,
"mean_token_accuracy": 0.3370129868388176,
"step": 2640
},
{
"epoch": 0.3715602737812544,
"grad_norm": 2.613398313522339,
"learning_rate": 7.91698996540442e-05,
"loss": 2.6615,
"mean_token_accuracy": 0.3362824648618698,
"step": 2660
},
{
"epoch": 0.37435396005028637,
"grad_norm": 2.089853048324585,
"learning_rate": 7.877249858841205e-05,
"loss": 2.6745,
"mean_token_accuracy": 0.3379058450460434,
"step": 2680
},
{
"epoch": 0.37714764631931835,
"grad_norm": 2.8007054328918457,
"learning_rate": 7.837236126121813e-05,
"loss": 2.6291,
"mean_token_accuracy": 0.3419642895460129,
"step": 2700
},
{
"epoch": 0.37994133258835033,
"grad_norm": 2.5469701290130615,
"learning_rate": 7.796952572547979e-05,
"loss": 2.6565,
"mean_token_accuracy": 0.33522727340459824,
"step": 2720
},
{
"epoch": 0.3827350188573823,
"grad_norm": 2.698303461074829,
"learning_rate": 7.756403029081371e-05,
"loss": 2.6347,
"mean_token_accuracy": 0.34009740352630613,
"step": 2740
},
{
"epoch": 0.3855287051264143,
"grad_norm": 2.7376692295074463,
"learning_rate": 7.71559135197927e-05,
"loss": 2.6732,
"mean_token_accuracy": 0.33725649267435076,
"step": 2760
},
{
"epoch": 0.3883223913954463,
"grad_norm": 2.4188475608825684,
"learning_rate": 7.674521422427837e-05,
"loss": 2.6648,
"mean_token_accuracy": 0.33855519592761996,
"step": 2780
},
{
"epoch": 0.39111607766447826,
"grad_norm": 2.3172502517700195,
"learning_rate": 7.633197146173011e-05,
"loss": 2.6581,
"mean_token_accuracy": 0.33376623392105104,
"step": 2800
},
{
"epoch": 0.3939097639335103,
"grad_norm": 2.007584810256958,
"learning_rate": 7.591622453149078e-05,
"loss": 2.6422,
"mean_token_accuracy": 0.3401785731315613,
"step": 2820
},
{
"epoch": 0.3967034502025423,
"grad_norm": 2.598574161529541,
"learning_rate": 7.549801297104935e-05,
"loss": 2.6408,
"mean_token_accuracy": 0.33725649267435076,
"step": 2840
},
{
"epoch": 0.39949713647157425,
"grad_norm": 2.0446927547454834,
"learning_rate": 7.50773765522808e-05,
"loss": 2.6269,
"mean_token_accuracy": 0.3426136389374733,
"step": 2860
},
{
"epoch": 0.40229082274060624,
"grad_norm": 2.690002918243408,
"learning_rate": 7.465435527766389e-05,
"loss": 2.5858,
"mean_token_accuracy": 0.34058441370725634,
"step": 2880
},
{
"epoch": 0.4050845090096382,
"grad_norm": 2.8425843715667725,
"learning_rate": 7.422898937647695e-05,
"loss": 2.5586,
"mean_token_accuracy": 0.3442370146512985,
"step": 2900
},
{
"epoch": 0.4078781952786702,
"grad_norm": 2.089576482772827,
"learning_rate": 7.380131930097206e-05,
"loss": 2.6763,
"mean_token_accuracy": 0.333441561460495,
"step": 2920
},
{
"epoch": 0.4106718815477022,
"grad_norm": 2.146989107131958,
"learning_rate": 7.337138572252797e-05,
"loss": 2.5974,
"mean_token_accuracy": 0.3448863625526428,
"step": 2940
},
{
"epoch": 0.41346556781673416,
"grad_norm": 1.8725186586380005,
"learning_rate": 7.293922952778239e-05,
"loss": 2.6124,
"mean_token_accuracy": 0.3443993508815765,
"step": 2960
},
{
"epoch": 0.4162592540857662,
"grad_norm": 2.520397186279297,
"learning_rate": 7.250489181474351e-05,
"loss": 2.6684,
"mean_token_accuracy": 0.3356331169605255,
"step": 2980
},
{
"epoch": 0.4190529403547982,
"grad_norm": 2.551262378692627,
"learning_rate": 7.206841388888183e-05,
"loss": 2.6146,
"mean_token_accuracy": 0.3406655803322792,
"step": 3000
},
{
"epoch": 0.42184662662383016,
"grad_norm": 2.6218693256378174,
"learning_rate": 7.16298372592017e-05,
"loss": 2.6026,
"mean_token_accuracy": 0.3432629868388176,
"step": 3020
},
{
"epoch": 0.42464031289286214,
"grad_norm": 1.9419686794281006,
"learning_rate": 7.118920363429405e-05,
"loss": 2.6163,
"mean_token_accuracy": 0.34115259498357775,
"step": 3040
},
{
"epoch": 0.4274339991618941,
"grad_norm": 2.822739601135254,
"learning_rate": 7.074655491836988e-05,
"loss": 2.6081,
"mean_token_accuracy": 0.34123376458883287,
"step": 3060
},
{
"epoch": 0.4302276854309261,
"grad_norm": 2.17193341255188,
"learning_rate": 7.030193320727508e-05,
"loss": 2.5796,
"mean_token_accuracy": 0.3430194780230522,
"step": 3080
},
{
"epoch": 0.4330213716999581,
"grad_norm": 2.2731964588165283,
"learning_rate": 6.985538078448714e-05,
"loss": 2.6235,
"mean_token_accuracy": 0.3362012967467308,
"step": 3100
},
{
"epoch": 0.43581505796899006,
"grad_norm": 2.6764817237854004,
"learning_rate": 6.940694011709411e-05,
"loss": 2.626,
"mean_token_accuracy": 0.3414772734045982,
"step": 3120
},
{
"epoch": 0.4386087442380221,
"grad_norm": 2.273409366607666,
"learning_rate": 6.895665385175587e-05,
"loss": 2.571,
"mean_token_accuracy": 0.3428571432828903,
"step": 3140
},
{
"epoch": 0.4414024305070541,
"grad_norm": 2.117824077606201,
"learning_rate": 6.850456481064841e-05,
"loss": 2.6061,
"mean_token_accuracy": 0.34269480109214784,
"step": 3160
},
{
"epoch": 0.44419611677608606,
"grad_norm": 2.7593703269958496,
"learning_rate": 6.80507159873916e-05,
"loss": 2.5878,
"mean_token_accuracy": 0.34480519592761993,
"step": 3180
},
{
"epoch": 0.44698980304511804,
"grad_norm": 2.5217463970184326,
"learning_rate": 6.759515054296033e-05,
"loss": 2.5867,
"mean_token_accuracy": 0.3475649327039719,
"step": 3200
},
{
"epoch": 0.44978348931415,
"grad_norm": 2.5164499282836914,
"learning_rate": 6.713791180158004e-05,
"loss": 2.6199,
"mean_token_accuracy": 0.3417207792401314,
"step": 3220
},
{
"epoch": 0.452577175583182,
"grad_norm": 4.4600701332092285,
"learning_rate": 6.667904324660648e-05,
"loss": 2.5562,
"mean_token_accuracy": 0.3468344137072563,
"step": 3240
},
{
"epoch": 0.455370861852214,
"grad_norm": 2.2503256797790527,
"learning_rate": 6.621858851639052e-05,
"loss": 2.598,
"mean_token_accuracy": 0.33928571343421937,
"step": 3260
},
{
"epoch": 0.45816454812124596,
"grad_norm": 2.7074501514434814,
"learning_rate": 6.575659140012813e-05,
"loss": 2.5749,
"mean_token_accuracy": 0.34350649267435074,
"step": 3280
},
{
"epoch": 0.460958234390278,
"grad_norm": 2.3996992111206055,
"learning_rate": 6.529309583369605e-05,
"loss": 2.5655,
"mean_token_accuracy": 0.3461850643157959,
"step": 3300
},
{
"epoch": 0.46375192065931,
"grad_norm": 2.124371290206909,
"learning_rate": 6.482814589547343e-05,
"loss": 2.5734,
"mean_token_accuracy": 0.3435064911842346,
"step": 3320
},
{
"epoch": 0.46654560692834196,
"grad_norm": 2.760023832321167,
"learning_rate": 6.436178580215006e-05,
"loss": 2.5831,
"mean_token_accuracy": 0.34821428507566454,
"step": 3340
},
{
"epoch": 0.46933929319737394,
"grad_norm": 2.2467293739318848,
"learning_rate": 6.389405990452131e-05,
"loss": 2.5386,
"mean_token_accuracy": 0.34837662279605863,
"step": 3360
},
{
"epoch": 0.4721329794664059,
"grad_norm": 2.895301342010498,
"learning_rate": 6.342501268327036e-05,
"loss": 2.608,
"mean_token_accuracy": 0.34366882890462874,
"step": 3380
},
{
"epoch": 0.4749266657354379,
"grad_norm": 2.8437559604644775,
"learning_rate": 6.295468874473824e-05,
"loss": 2.5744,
"mean_token_accuracy": 0.34375,
"step": 3400
},
{
"epoch": 0.4777203520044699,
"grad_norm": 2.2013747692108154,
"learning_rate": 6.248313281668151e-05,
"loss": 2.5572,
"mean_token_accuracy": 0.34269480407238007,
"step": 3420
},
{
"epoch": 0.48051403827350186,
"grad_norm": 2.285017728805542,
"learning_rate": 6.201038974401893e-05,
"loss": 2.5755,
"mean_token_accuracy": 0.3433441549539566,
"step": 3440
},
{
"epoch": 0.4833077245425339,
"grad_norm": 3.0466742515563965,
"learning_rate": 6.15365044845665e-05,
"loss": 2.5605,
"mean_token_accuracy": 0.34318181723356245,
"step": 3460
},
{
"epoch": 0.4861014108115659,
"grad_norm": 2.7282652854919434,
"learning_rate": 6.10615221047621e-05,
"loss": 2.5826,
"mean_token_accuracy": 0.34439935386180875,
"step": 3480
},
{
"epoch": 0.48889509708059786,
"grad_norm": 2.5899875164031982,
"learning_rate": 6.0585487775379634e-05,
"loss": 2.569,
"mean_token_accuracy": 0.3461850628256798,
"step": 3500
},
{
"epoch": 0.49168878334962984,
"grad_norm": 2.2856500148773193,
"learning_rate": 6.0108446767233304e-05,
"loss": 2.5432,
"mean_token_accuracy": 0.3460227265954018,
"step": 3520
},
{
"epoch": 0.4944824696186618,
"grad_norm": 2.4979193210601807,
"learning_rate": 5.963044444687235e-05,
"loss": 2.5524,
"mean_token_accuracy": 0.3460227265954018,
"step": 3540
},
{
"epoch": 0.4972761558876938,
"grad_norm": 3.4747583866119385,
"learning_rate": 5.91515262722667e-05,
"loss": 2.5506,
"mean_token_accuracy": 0.34025973826646805,
"step": 3560
},
{
"epoch": 0.5000698421567258,
"grad_norm": 2.0753846168518066,
"learning_rate": 5.867173778848394e-05,
"loss": 2.5333,
"mean_token_accuracy": 0.34845779091119766,
"step": 3580
},
{
"epoch": 0.5000698421567258,
"eval_loss": 2.6793947219848633,
"eval_mean_token_accuracy": 0.3425768423901875,
"eval_runtime": 994.8268,
"eval_samples_per_second": 98.841,
"eval_steps_per_second": 1.765,
"step": 3580
},
{
"epoch": 0.5028635284257578,
"grad_norm": 2.6756086349487305,
"learning_rate": 5.819112462335792e-05,
"loss": 2.5148,
"mean_token_accuracy": 0.34748376458883284,
"step": 3600
},
{
"epoch": 0.5056572146947897,
"grad_norm": 2.5338103771209717,
"learning_rate": 5.770973248314965e-05,
"loss": 2.5498,
"mean_token_accuracy": 0.3452110379934311,
"step": 3620
},
{
"epoch": 0.5084509009638217,
"grad_norm": 2.2562379837036133,
"learning_rate": 5.722760714820057e-05,
"loss": 2.5533,
"mean_token_accuracy": 0.34634740203619,
"step": 3640
},
{
"epoch": 0.5112445872328537,
"grad_norm": 2.7285821437835693,
"learning_rate": 5.674479446857885e-05,
"loss": 2.5056,
"mean_token_accuracy": 0.3499999985098839,
"step": 3660
},
{
"epoch": 0.5140382735018857,
"grad_norm": 2.5474276542663574,
"learning_rate": 5.626134035971908e-05,
"loss": 2.5642,
"mean_token_accuracy": 0.34456168711185453,
"step": 3680
},
{
"epoch": 0.5168319597709177,
"grad_norm": 2.749859094619751,
"learning_rate": 5.577729079805569e-05,
"loss": 2.5206,
"mean_token_accuracy": 0.34821428507566454,
"step": 3700
},
{
"epoch": 0.5196256460399498,
"grad_norm": 3.221196413040161,
"learning_rate": 5.529269181665064e-05,
"loss": 2.5129,
"mean_token_accuracy": 0.3448051944375038,
"step": 3720
},
{
"epoch": 0.5224193323089817,
"grad_norm": 3.7197160720825195,
"learning_rate": 5.4807589500815606e-05,
"loss": 2.5394,
"mean_token_accuracy": 0.3462662324309349,
"step": 3740
},
{
"epoch": 0.5252130185780137,
"grad_norm": 2.5209450721740723,
"learning_rate": 5.432202998372932e-05,
"loss": 2.521,
"mean_token_accuracy": 0.3460227236151695,
"step": 3760
},
{
"epoch": 0.5280067048470457,
"grad_norm": 2.6848323345184326,
"learning_rate": 5.383605944205033e-05,
"loss": 2.5506,
"mean_token_accuracy": 0.3453733742237091,
"step": 3780
},
{
"epoch": 0.5308003911160777,
"grad_norm": 2.5749640464782715,
"learning_rate": 5.334972409152559e-05,
"loss": 2.5357,
"mean_token_accuracy": 0.34399350732564926,
"step": 3800
},
{
"epoch": 0.5335940773851097,
"grad_norm": 2.7281625270843506,
"learning_rate": 5.286307018259529e-05,
"loss": 2.5287,
"mean_token_accuracy": 0.3456168845295906,
"step": 3820
},
{
"epoch": 0.5363877636541416,
"grad_norm": 2.868173122406006,
"learning_rate": 5.237614399599451e-05,
"loss": 2.5066,
"mean_token_accuracy": 0.3488636389374733,
"step": 3840
},
{
"epoch": 0.5391814499231736,
"grad_norm": 2.461381673812866,
"learning_rate": 5.1888991838351916e-05,
"loss": 2.52,
"mean_token_accuracy": 0.34163961112499236,
"step": 3860
},
{
"epoch": 0.5419751361922056,
"grad_norm": 2.8189141750335693,
"learning_rate": 5.140166003778603e-05,
"loss": 2.5391,
"mean_token_accuracy": 0.34618506133556365,
"step": 3880
},
{
"epoch": 0.5447688224612376,
"grad_norm": 2.760181427001953,
"learning_rate": 5.091419493949929e-05,
"loss": 2.5464,
"mean_token_accuracy": 0.34740259647369387,
"step": 3900
},
{
"epoch": 0.5475625087302696,
"grad_norm": 3.0051541328430176,
"learning_rate": 5.042664290137086e-05,
"loss": 2.5276,
"mean_token_accuracy": 0.3481331169605255,
"step": 3920
},
{
"epoch": 0.5503561949993016,
"grad_norm": 2.959178924560547,
"learning_rate": 4.993905028954778e-05,
"loss": 2.5281,
"mean_token_accuracy": 0.34642857015132905,
"step": 3940
},
{
"epoch": 0.5531498812683335,
"grad_norm": 2.7471230030059814,
"learning_rate": 4.94514634740357e-05,
"loss": 2.5127,
"mean_token_accuracy": 0.34821428507566454,
"step": 3960
},
{
"epoch": 0.5559435675373655,
"grad_norm": 3.817932605743408,
"learning_rate": 4.896392882428901e-05,
"loss": 2.5538,
"mean_token_accuracy": 0.3424512967467308,
"step": 3980
},
{
"epoch": 0.5587372538063975,
"grad_norm": 2.950193166732788,
"learning_rate": 4.847649270480117e-05,
"loss": 2.4831,
"mean_token_accuracy": 0.34951298832893374,
"step": 4000
},
{
"epoch": 0.5615309400754295,
"grad_norm": 3.4689207077026367,
"learning_rate": 4.7989201470695396e-05,
"loss": 2.4955,
"mean_token_accuracy": 0.34439934939146044,
"step": 4020
},
{
"epoch": 0.5643246263444616,
"grad_norm": 2.9318456649780273,
"learning_rate": 4.750210146331632e-05,
"loss": 2.5267,
"mean_token_accuracy": 0.34691558331251143,
"step": 4040
},
{
"epoch": 0.5671183126134935,
"grad_norm": 2.702409505844116,
"learning_rate": 4.701523900582295e-05,
"loss": 2.5095,
"mean_token_accuracy": 0.34829545617103574,
"step": 4060
},
{
"epoch": 0.5699119988825255,
"grad_norm": 2.8087456226348877,
"learning_rate": 4.6528660398783326e-05,
"loss": 2.4989,
"mean_token_accuracy": 0.3536525994539261,
"step": 4080
},
{
"epoch": 0.5727056851515575,
"grad_norm": 2.8514254093170166,
"learning_rate": 4.6042411915771306e-05,
"loss": 2.5446,
"mean_token_accuracy": 0.34764610081911085,
"step": 4100
},
{
"epoch": 0.5754993714205895,
"grad_norm": 2.71073317527771,
"learning_rate": 4.555653979896603e-05,
"loss": 2.503,
"mean_token_accuracy": 0.34829545170068743,
"step": 4120
},
{
"epoch": 0.5782930576896215,
"grad_norm": 2.834017753601074,
"learning_rate": 4.507109025475423e-05,
"loss": 2.4925,
"mean_token_accuracy": 0.34853895753622055,
"step": 4140
},
{
"epoch": 0.5810867439586535,
"grad_norm": 2.7667429447174072,
"learning_rate": 4.4586109449336045e-05,
"loss": 2.4791,
"mean_token_accuracy": 0.34967532455921174,
"step": 4160
},
{
"epoch": 0.5838804302276854,
"grad_norm": 2.7539703845977783,
"learning_rate": 4.410164350433457e-05,
"loss": 2.481,
"mean_token_accuracy": 0.3508928567171097,
"step": 4180
},
{
"epoch": 0.5866741164967174,
"grad_norm": 2.7576351165771484,
"learning_rate": 4.361773849240977e-05,
"loss": 2.5077,
"mean_token_accuracy": 0.3529220804572105,
"step": 4200
},
{
"epoch": 0.5894678027657494,
"grad_norm": 2.8457632064819336,
"learning_rate": 4.313444043287691e-05,
"loss": 2.5038,
"mean_token_accuracy": 0.34740259796380996,
"step": 4220
},
{
"epoch": 0.5922614890347814,
"grad_norm": 3.2245895862579346,
"learning_rate": 4.265179528733017e-05,
"loss": 2.5013,
"mean_token_accuracy": 0.3470779210329056,
"step": 4240
},
{
"epoch": 0.5950551753038134,
"grad_norm": 2.93704891204834,
"learning_rate": 4.2169848955271624e-05,
"loss": 2.4865,
"mean_token_accuracy": 0.3499188289046288,
"step": 4260
},
{
"epoch": 0.5978488615728453,
"grad_norm": 2.9629595279693604,
"learning_rate": 4.1688647269746324e-05,
"loss": 2.5028,
"mean_token_accuracy": 0.3491883099079132,
"step": 4280
},
{
"epoch": 0.6006425478418773,
"grad_norm": 3.764291286468506,
"learning_rate": 4.120823599298349e-05,
"loss": 2.4974,
"mean_token_accuracy": 0.3502435013651848,
"step": 4300
},
{
"epoch": 0.6034362341109093,
"grad_norm": 3.1387903690338135,
"learning_rate": 4.0728660812044536e-05,
"loss": 2.4754,
"mean_token_accuracy": 0.3487012982368469,
"step": 4320
},
{
"epoch": 0.6062299203799413,
"grad_norm": 3.713451385498047,
"learning_rate": 4.0249967334478266e-05,
"loss": 2.4594,
"mean_token_accuracy": 0.35519480258226394,
"step": 4340
},
{
"epoch": 0.6090236066489734,
"grad_norm": 3.5359973907470703,
"learning_rate": 3.9772201083983596e-05,
"loss": 2.4914,
"mean_token_accuracy": 0.34983766078948975,
"step": 4360
},
{
"epoch": 0.6118172929180054,
"grad_norm": 3.2408080101013184,
"learning_rate": 3.929540749608024e-05,
"loss": 2.4659,
"mean_token_accuracy": 0.3521103858947754,
"step": 4380
},
{
"epoch": 0.6146109791870373,
"grad_norm": 3.0888562202453613,
"learning_rate": 3.881963191378778e-05,
"loss": 2.4789,
"mean_token_accuracy": 0.3471590906381607,
"step": 4400
},
{
"epoch": 0.6174046654560693,
"grad_norm": 3.0336287021636963,
"learning_rate": 3.83449195833136e-05,
"loss": 2.4546,
"mean_token_accuracy": 0.34886363446712493,
"step": 4420
},
{
"epoch": 0.6201983517251013,
"grad_norm": 3.150221347808838,
"learning_rate": 3.7871315649749953e-05,
"loss": 2.4682,
"mean_token_accuracy": 0.3517857164144516,
"step": 4440
},
{
"epoch": 0.6229920379941333,
"grad_norm": 2.995313882827759,
"learning_rate": 3.739886515278066e-05,
"loss": 2.4592,
"mean_token_accuracy": 0.35413961112499237,
"step": 4460
},
{
"epoch": 0.6257857242631653,
"grad_norm": 3.334155321121216,
"learning_rate": 3.692761302239779e-05,
"loss": 2.4621,
"mean_token_accuracy": 0.3563311696052551,
"step": 4480
},
{
"epoch": 0.6285794105321972,
"grad_norm": 3.2132041454315186,
"learning_rate": 3.645760407462896e-05,
"loss": 2.4589,
"mean_token_accuracy": 0.35081168860197065,
"step": 4500
},
{
"epoch": 0.6313730968012292,
"grad_norm": 3.1528499126434326,
"learning_rate": 3.598888300727521e-05,
"loss": 2.4256,
"mean_token_accuracy": 0.3550324648618698,
"step": 4520
},
{
"epoch": 0.6341667830702612,
"grad_norm": 3.3648734092712402,
"learning_rate": 3.552149439566029e-05,
"loss": 2.4552,
"mean_token_accuracy": 0.35657467097043993,
"step": 4540
},
{
"epoch": 0.6369604693392932,
"grad_norm": 3.0252137184143066,
"learning_rate": 3.505548268839155e-05,
"loss": 2.4769,
"mean_token_accuracy": 0.3514610409736633,
"step": 4560
},
{
"epoch": 0.6397541556083252,
"grad_norm": 2.9841442108154297,
"learning_rate": 3.45908922031329e-05,
"loss": 2.4483,
"mean_token_accuracy": 0.3573863670229912,
"step": 4580
},
{
"epoch": 0.6425478418773571,
"grad_norm": 3.206582546234131,
"learning_rate": 3.412776712239016e-05,
"loss": 2.4437,
"mean_token_accuracy": 0.3538961037993431,
"step": 4600
},
{
"epoch": 0.6453415281463891,
"grad_norm": 2.9362432956695557,
"learning_rate": 3.3666151489309364e-05,
"loss": 2.4284,
"mean_token_accuracy": 0.35787337720394136,
"step": 4620
},
{
"epoch": 0.6481352144154211,
"grad_norm": 3.4137799739837646,
"learning_rate": 3.32060892034882e-05,
"loss": 2.4621,
"mean_token_accuracy": 0.35308441519737244,
"step": 4640
},
{
"epoch": 0.6509289006844531,
"grad_norm": 3.3628833293914795,
"learning_rate": 3.274762401680124e-05,
"loss": 2.4417,
"mean_token_accuracy": 0.35600649267435075,
"step": 4660
},
{
"epoch": 0.6537225869534852,
"grad_norm": 3.157782793045044,
"learning_rate": 3.229079952923908e-05,
"loss": 2.4363,
"mean_token_accuracy": 0.3574675336480141,
"step": 4680
},
{
"epoch": 0.6565162732225172,
"grad_norm": 3.448489189147949,
"learning_rate": 3.183565918476198e-05,
"loss": 2.4261,
"mean_token_accuracy": 0.3572240278124809,
"step": 4700
},
{
"epoch": 0.6593099594915491,
"grad_norm": 3.607093572616577,
"learning_rate": 3.1382246267168386e-05,
"loss": 2.4279,
"mean_token_accuracy": 0.35633117109537127,
"step": 4720
},
{
"epoch": 0.6621036457605811,
"grad_norm": 3.165947914123535,
"learning_rate": 3.093060389597865e-05,
"loss": 2.4558,
"mean_token_accuracy": 0.3527597412467003,
"step": 4740
},
{
"epoch": 0.6648973320296131,
"grad_norm": 3.2351622581481934,
"learning_rate": 3.048077502233434e-05,
"loss": 2.4079,
"mean_token_accuracy": 0.35860389471054077,
"step": 4760
},
{
"epoch": 0.6676910182986451,
"grad_norm": 3.320864677429199,
"learning_rate": 3.0032802424913563e-05,
"loss": 2.4491,
"mean_token_accuracy": 0.3559253215789795,
"step": 4780
},
{
"epoch": 0.6704847045676771,
"grad_norm": 3.1946163177490234,
"learning_rate": 2.9586728705862813e-05,
"loss": 2.4654,
"mean_token_accuracy": 0.3549513012170792,
"step": 4800
},
{
"epoch": 0.673278390836709,
"grad_norm": 3.2893896102905273,
"learning_rate": 2.914259628674542e-05,
"loss": 2.4384,
"mean_token_accuracy": 0.35592532306909563,
"step": 4820
},
{
"epoch": 0.676072077105741,
"grad_norm": 3.068572521209717,
"learning_rate": 2.870044740450729e-05,
"loss": 2.4351,
"mean_token_accuracy": 0.356168831884861,
"step": 4840
},
{
"epoch": 0.678865763374773,
"grad_norm": 3.2332894802093506,
"learning_rate": 2.8260324107460197e-05,
"loss": 2.4432,
"mean_token_accuracy": 0.35292208194732666,
"step": 4860
},
{
"epoch": 0.681659449643805,
"grad_norm": 3.183605194091797,
"learning_rate": 2.7822268251282975e-05,
"loss": 2.4073,
"mean_token_accuracy": 0.3563311696052551,
"step": 4880
},
{
"epoch": 0.684453135912837,
"grad_norm": 3.625239133834839,
"learning_rate": 2.7386321495041047e-05,
"loss": 2.4071,
"mean_token_accuracy": 0.35738636553287506,
"step": 4900
},
{
"epoch": 0.6872468221818689,
"grad_norm": 3.204867362976074,
"learning_rate": 2.695252529722467e-05,
"loss": 2.415,
"mean_token_accuracy": 0.3589285746216774,
"step": 4920
},
{
"epoch": 0.6900405084509009,
"grad_norm": 3.107151985168457,
"learning_rate": 2.65209209118062e-05,
"loss": 2.4391,
"mean_token_accuracy": 0.35519480854272845,
"step": 4940
},
{
"epoch": 0.6928341947199329,
"grad_norm": 3.3900489807128906,
"learning_rate": 2.6091549384316883e-05,
"loss": 2.4317,
"mean_token_accuracy": 0.3538961037993431,
"step": 4960
},
{
"epoch": 0.6956278809889649,
"grad_norm": 3.0111920833587646,
"learning_rate": 2.566445154794341e-05,
"loss": 2.4185,
"mean_token_accuracy": 0.3533279225230217,
"step": 4980
},
{
"epoch": 0.698421567257997,
"grad_norm": 3.488588809967041,
"learning_rate": 2.523966801964468e-05,
"loss": 2.4098,
"mean_token_accuracy": 0.3501623347401619,
"step": 5000
},
{
"epoch": 0.701215253527029,
"grad_norm": 3.103729724884033,
"learning_rate": 2.481723919628916e-05,
"loss": 2.4228,
"mean_token_accuracy": 0.35275974273681643,
"step": 5020
},
{
"epoch": 0.7040089397960609,
"grad_norm": 3.421889305114746,
"learning_rate": 2.4397205250813104e-05,
"loss": 2.4011,
"mean_token_accuracy": 0.3581980481743813,
"step": 5040
},
{
"epoch": 0.7068026260650929,
"grad_norm": 3.167423725128174,
"learning_rate": 2.3979606128400162e-05,
"loss": 2.4235,
"mean_token_accuracy": 0.3530844137072563,
"step": 5060
},
{
"epoch": 0.7095963123341249,
"grad_norm": 3.6433827877044678,
"learning_rate": 2.3564481542682516e-05,
"loss": 2.4054,
"mean_token_accuracy": 0.3555194824934006,
"step": 5080
},
{
"epoch": 0.7123899986031569,
"grad_norm": 3.3545353412628174,
"learning_rate": 2.3151870971964224e-05,
"loss": 2.431,
"mean_token_accuracy": 0.35308441519737244,
"step": 5100
},
{
"epoch": 0.7151836848721889,
"grad_norm": 3.181476593017578,
"learning_rate": 2.2741813655466758e-05,
"loss": 2.39,
"mean_token_accuracy": 0.3538961037993431,
"step": 5120
},
{
"epoch": 0.7179773711412208,
"grad_norm": 3.6350057125091553,
"learning_rate": 2.2334348589597404e-05,
"loss": 2.4131,
"mean_token_accuracy": 0.35202922075986864,
"step": 5140
},
{
"epoch": 0.7207710574102528,
"grad_norm": 4.146636962890625,
"learning_rate": 2.1929514524240667e-05,
"loss": 2.4173,
"mean_token_accuracy": 0.3597402602434158,
"step": 5160
},
{
"epoch": 0.7235647436792848,
"grad_norm": 4.2176384925842285,
"learning_rate": 2.15273499590732e-05,
"loss": 2.4013,
"mean_token_accuracy": 0.3566558465361595,
"step": 5180
},
{
"epoch": 0.7263584299483168,
"grad_norm": 3.659813165664673,
"learning_rate": 2.112789313990246e-05,
"loss": 2.4012,
"mean_token_accuracy": 0.35600649416446684,
"step": 5200
},
{
"epoch": 0.7291521162173488,
"grad_norm": 3.0530693531036377,
"learning_rate": 2.073118205502957e-05,
"loss": 2.4082,
"mean_token_accuracy": 0.3566558450460434,
"step": 5220
},
{
"epoch": 0.7319458024863807,
"grad_norm": 4.101310729980469,
"learning_rate": 2.0337254431636548e-05,
"loss": 2.4011,
"mean_token_accuracy": 0.3585227280855179,
"step": 5240
},
{
"epoch": 0.7347394887554127,
"grad_norm": 4.217702388763428,
"learning_rate": 1.99461477321986e-05,
"loss": 2.3857,
"mean_token_accuracy": 0.35884740203619003,
"step": 5260
},
{
"epoch": 0.7375331750244447,
"grad_norm": 3.2049005031585693,
"learning_rate": 1.9557899150921317e-05,
"loss": 2.3847,
"mean_token_accuracy": 0.35649350583553313,
"step": 5280
},
{
"epoch": 0.7403268612934767,
"grad_norm": 3.6392228603363037,
"learning_rate": 1.9172545610203575e-05,
"loss": 2.435,
"mean_token_accuracy": 0.35349026024341584,
"step": 5300
},
{
"epoch": 0.7431205475625088,
"grad_norm": 3.226102352142334,
"learning_rate": 1.8790123757126195e-05,
"loss": 2.3908,
"mean_token_accuracy": 0.3583603873848915,
"step": 5320
},
{
"epoch": 0.7459142338315408,
"grad_norm": 3.2937116622924805,
"learning_rate": 1.84106699599668e-05,
"loss": 2.3964,
"mean_token_accuracy": 0.3568181827664375,
"step": 5340
},
{
"epoch": 0.7487079201005727,
"grad_norm": 3.6983494758605957,
"learning_rate": 1.803422030474126e-05,
"loss": 2.3886,
"mean_token_accuracy": 0.35698052048683165,
"step": 5360
},
{
"epoch": 0.7501047632350887,
"eval_loss": 2.7606494426727295,
"eval_mean_token_accuracy": 0.3412951716176516,
"eval_runtime": 989.3775,
"eval_samples_per_second": 99.386,
"eval_steps_per_second": 1.775,
"step": 5370
},
{
"epoch": 0.7515016063696047,
"grad_norm": 3.6565871238708496,
"learning_rate": 1.7660810591771785e-05,
"loss": 2.4133,
"mean_token_accuracy": 0.36103896498680116,
"step": 5380
},
{
"epoch": 0.7542952926386367,
"grad_norm": 3.315748691558838,
"learning_rate": 1.7290476332282468e-05,
"loss": 2.3908,
"mean_token_accuracy": 0.36071428656578064,
"step": 5400
},
{
"epoch": 0.7570889789076687,
"grad_norm": 3.21600604057312,
"learning_rate": 1.6923252745022062e-05,
"loss": 2.3913,
"mean_token_accuracy": 0.3598214253783226,
"step": 5420
},
{
"epoch": 0.7598826651767007,
"grad_norm": 4.019015312194824,
"learning_rate": 1.6559174752914754e-05,
"loss": 2.394,
"mean_token_accuracy": 0.35957791954278945,
"step": 5440
},
{
"epoch": 0.7626763514457326,
"grad_norm": 3.757371664047241,
"learning_rate": 1.6198276979738942e-05,
"loss": 2.4179,
"mean_token_accuracy": 0.35170454531908035,
"step": 5460
},
{
"epoch": 0.7654700377147646,
"grad_norm": 3.8441550731658936,
"learning_rate": 1.5840593746834546e-05,
"loss": 2.3617,
"mean_token_accuracy": 0.3576298698782921,
"step": 5480
},
{
"epoch": 0.7682637239837966,
"grad_norm": 3.4171853065490723,
"learning_rate": 1.5486159069839058e-05,
"loss": 2.3991,
"mean_token_accuracy": 0.3582792207598686,
"step": 5500
},
{
"epoch": 0.7710574102528286,
"grad_norm": 3.999089002609253,
"learning_rate": 1.5135006655452644e-05,
"loss": 2.385,
"mean_token_accuracy": 0.35746753215789795,
"step": 5520
},
{
"epoch": 0.7738510965218606,
"grad_norm": 3.6210405826568604,
"learning_rate": 1.4787169898232618e-05,
"loss": 2.3821,
"mean_token_accuracy": 0.3604707822203636,
"step": 5540
},
{
"epoch": 0.7766447827908926,
"grad_norm": 3.676494836807251,
"learning_rate": 1.4442681877417686e-05,
"loss": 2.3897,
"mean_token_accuracy": 0.35560064762830734,
"step": 5560
},
{
"epoch": 0.7794384690599245,
"grad_norm": 3.83720326423645,
"learning_rate": 1.410157535378206e-05,
"loss": 2.3225,
"mean_token_accuracy": 0.3604707807302475,
"step": 5580
},
{
"epoch": 0.7822321553289565,
"grad_norm": 3.8904194831848145,
"learning_rate": 1.3763882766519926e-05,
"loss": 2.3878,
"mean_token_accuracy": 0.35600649267435075,
"step": 5600
},
{
"epoch": 0.7850258415979885,
"grad_norm": 3.4765756130218506,
"learning_rate": 1.3429636230160498e-05,
"loss": 2.3814,
"mean_token_accuracy": 0.3632305219769478,
"step": 5620
},
{
"epoch": 0.7878195278670206,
"grad_norm": 3.398759126663208,
"learning_rate": 1.3098867531513903e-05,
"loss": 2.3775,
"mean_token_accuracy": 0.3590909048914909,
"step": 5640
},
{
"epoch": 0.7906132141360526,
"grad_norm": 3.791919231414795,
"learning_rate": 1.2771608126648293e-05,
"loss": 2.3942,
"mean_token_accuracy": 0.3641233786940575,
"step": 5660
},
{
"epoch": 0.7934069004050845,
"grad_norm": 3.9098916053771973,
"learning_rate": 1.2447889137898293e-05,
"loss": 2.3438,
"mean_token_accuracy": 0.36055194288492204,
"step": 5680
},
{
"epoch": 0.7962005866741165,
"grad_norm": 3.677560567855835,
"learning_rate": 1.2127741350905397e-05,
"loss": 2.3837,
"mean_token_accuracy": 0.3632305219769478,
"step": 5700
},
{
"epoch": 0.7989942729431485,
"grad_norm": 3.279231548309326,
"learning_rate": 1.1811195211690169e-05,
"loss": 2.3939,
"mean_token_accuracy": 0.3599026009440422,
"step": 5720
},
{
"epoch": 0.8017879592121805,
"grad_norm": 3.5640265941619873,
"learning_rate": 1.1498280823756841e-05,
"loss": 2.3774,
"mean_token_accuracy": 0.3581980511546135,
"step": 5740
},
{
"epoch": 0.8045816454812125,
"grad_norm": 3.7095561027526855,
"learning_rate": 1.1189027945230496e-05,
"loss": 2.3944,
"mean_token_accuracy": 0.35584415346384046,
"step": 5760
},
{
"epoch": 0.8073753317502445,
"grad_norm": 3.6211345195770264,
"learning_rate": 1.0883465986027059e-05,
"loss": 2.3879,
"mean_token_accuracy": 0.3619318217039108,
"step": 5780
},
{
"epoch": 0.8101690180192764,
"grad_norm": 3.5692007541656494,
"learning_rate": 1.0581624005056424e-05,
"loss": 2.3611,
"mean_token_accuracy": 0.36030844151973723,
"step": 5800
},
{
"epoch": 0.8129627042883084,
"grad_norm": 3.9471545219421387,
"learning_rate": 1.0283530707458922e-05,
"loss": 2.4062,
"mean_token_accuracy": 0.356331168115139,
"step": 5820
},
{
"epoch": 0.8157563905573404,
"grad_norm": 4.07960844039917,
"learning_rate": 9.989214441875522e-06,
"loss": 2.3634,
"mean_token_accuracy": 0.36566558480262756,
"step": 5840
},
{
"epoch": 0.8185500768263724,
"grad_norm": 3.9245829582214355,
"learning_rate": 9.698703197751851e-06,
"loss": 2.332,
"mean_token_accuracy": 0.3642857164144516,
"step": 5860
},
{
"epoch": 0.8213437630954044,
"grad_norm": 3.7653911113739014,
"learning_rate": 9.412024602676378e-06,
"loss": 2.3998,
"mean_token_accuracy": 0.35511363595724105,
"step": 5880
},
{
"epoch": 0.8241374493644363,
"grad_norm": 3.9197909832000732,
"learning_rate": 9.129205919753075e-06,
"loss": 2.3641,
"mean_token_accuracy": 0.3629870146512985,
"step": 5900
},
{
"epoch": 0.8269311356334683,
"grad_norm": 3.7501025199890137,
"learning_rate": 8.850274045008666e-06,
"loss": 2.3705,
"mean_token_accuracy": 0.36079545617103576,
"step": 5920
},
{
"epoch": 0.8297248219025003,
"grad_norm": 3.8063013553619385,
"learning_rate": 8.575255504834827e-06,
"loss": 2.3993,
"mean_token_accuracy": 0.36266233772039413,
"step": 5940
},
{
"epoch": 0.8325185081715324,
"grad_norm": 3.9794483184814453,
"learning_rate": 8.304176453465556e-06,
"loss": 2.3816,
"mean_token_accuracy": 0.3581168860197067,
"step": 5960
},
{
"epoch": 0.8353121944405644,
"grad_norm": 3.6001126766204834,
"learning_rate": 8.037062670489842e-06,
"loss": 2.3807,
"mean_token_accuracy": 0.3580357149243355,
"step": 5980
},
{
"epoch": 0.8381058807095964,
"grad_norm": 3.5399417877197266,
"learning_rate": 7.773939558400101e-06,
"loss": 2.3788,
"mean_token_accuracy": 0.3605519488453865,
"step": 6000
},
{
"epoch": 0.8408995669786283,
"grad_norm": 3.753068208694458,
"learning_rate": 7.51483214017637e-06,
"loss": 2.3517,
"mean_token_accuracy": 0.3618506520986557,
"step": 6020
},
{
"epoch": 0.8436932532476603,
"grad_norm": 4.195340156555176,
"learning_rate": 7.259765056906609e-06,
"loss": 2.3818,
"mean_token_accuracy": 0.3603896126151085,
"step": 6040
},
{
"epoch": 0.8464869395166923,
"grad_norm": 4.046124458312988,
"learning_rate": 7.008762565443378e-06,
"loss": 2.3566,
"mean_token_accuracy": 0.3603896081447601,
"step": 6060
},
{
"epoch": 0.8492806257857243,
"grad_norm": 3.6873440742492676,
"learning_rate": 6.76184853609696e-06,
"loss": 2.3539,
"mean_token_accuracy": 0.35795454531908033,
"step": 6080
},
{
"epoch": 0.8520743120547563,
"grad_norm": 3.5705506801605225,
"learning_rate": 6.519046450365346e-06,
"loss": 2.3515,
"mean_token_accuracy": 0.3608766242861748,
"step": 6100
},
{
"epoch": 0.8548679983237882,
"grad_norm": 3.630878448486328,
"learning_rate": 6.280379398701114e-06,
"loss": 2.3596,
"mean_token_accuracy": 0.3606331154704094,
"step": 6120
},
{
"epoch": 0.8576616845928202,
"grad_norm": 3.789283514022827,
"learning_rate": 6.045870078315541e-06,
"loss": 2.3504,
"mean_token_accuracy": 0.36233766227960584,
"step": 6140
},
{
"epoch": 0.8604553708618522,
"grad_norm": 3.9113943576812744,
"learning_rate": 5.8155407910201135e-06,
"loss": 2.3738,
"mean_token_accuracy": 0.36241883486509324,
"step": 6160
},
{
"epoch": 0.8632490571308842,
"grad_norm": 3.61296010017395,
"learning_rate": 5.5894134411055955e-06,
"loss": 2.3907,
"mean_token_accuracy": 0.35900974124670026,
"step": 6180
},
{
"epoch": 0.8660427433999162,
"grad_norm": 3.8162384033203125,
"learning_rate": 5.367509533258969e-06,
"loss": 2.3164,
"mean_token_accuracy": 0.3629870146512985,
"step": 6200
},
{
"epoch": 0.8688364296689481,
"grad_norm": 3.9131879806518555,
"learning_rate": 5.149850170518328e-06,
"loss": 2.3605,
"mean_token_accuracy": 0.360146102309227,
"step": 6220
},
{
"epoch": 0.8716301159379801,
"grad_norm": 3.934457540512085,
"learning_rate": 4.9364560522659365e-06,
"loss": 2.3576,
"mean_token_accuracy": 0.3599025964736938,
"step": 6240
},
{
"epoch": 0.8744238022070121,
"grad_norm": 3.8578693866729736,
"learning_rate": 4.727347472259813e-06,
"loss": 2.3573,
"mean_token_accuracy": 0.36201298981904984,
"step": 6260
},
{
"epoch": 0.8772174884760442,
"grad_norm": 3.8296849727630615,
"learning_rate": 4.522544316703709e-06,
"loss": 2.3431,
"mean_token_accuracy": 0.3612824693322182,
"step": 6280
},
{
"epoch": 0.8800111747450762,
"grad_norm": 3.58001971244812,
"learning_rate": 4.322066062355984e-06,
"loss": 2.3756,
"mean_token_accuracy": 0.3618506506085396,
"step": 6300
},
{
"epoch": 0.8828048610141082,
"grad_norm": 3.9927351474761963,
"learning_rate": 4.125931774677349e-06,
"loss": 2.3874,
"mean_token_accuracy": 0.358766233921051,
"step": 6320
},
{
"epoch": 0.8855985472831401,
"grad_norm": 3.5192744731903076,
"learning_rate": 3.934160106017748e-06,
"loss": 2.3473,
"mean_token_accuracy": 0.36599026024341585,
"step": 6340
},
{
"epoch": 0.8883922335521721,
"grad_norm": 3.695925235748291,
"learning_rate": 3.7467692938425057e-06,
"loss": 2.36,
"mean_token_accuracy": 0.3630681812763214,
"step": 6360
},
{
"epoch": 0.8911859198212041,
"grad_norm": 3.908273696899414,
"learning_rate": 3.563777158997977e-06,
"loss": 2.3429,
"mean_token_accuracy": 0.3662337675690651,
"step": 6380
},
{
"epoch": 0.8939796060902361,
"grad_norm": 3.800147294998169,
"learning_rate": 3.3852011040167607e-06,
"loss": 2.3469,
"mean_token_accuracy": 0.3615259721875191,
"step": 6400
},
{
"epoch": 0.8967732923592681,
"grad_norm": 3.7554497718811035,
"learning_rate": 3.2110581114627225e-06,
"loss": 2.3629,
"mean_token_accuracy": 0.3570616886019707,
"step": 6420
},
{
"epoch": 0.8995669786283,
"grad_norm": 3.9945056438446045,
"learning_rate": 3.041364742315983e-06,
"loss": 2.3897,
"mean_token_accuracy": 0.3580357149243355,
"step": 6440
},
{
"epoch": 0.902360664897332,
"grad_norm": 3.705796003341675,
"learning_rate": 2.8761371343979273e-06,
"loss": 2.3326,
"mean_token_accuracy": 0.3644480526447296,
"step": 6460
},
{
"epoch": 0.905154351166364,
"grad_norm": 3.556757688522339,
"learning_rate": 2.7153910008365368e-06,
"loss": 2.3759,
"mean_token_accuracy": 0.3582792192697525,
"step": 6480
},
{
"epoch": 0.907948037435396,
"grad_norm": 3.859874963760376,
"learning_rate": 2.5591416285720424e-06,
"loss": 2.3236,
"mean_token_accuracy": 0.36079545617103576,
"step": 6500
},
{
"epoch": 0.910741723704428,
"grad_norm": 4.084688186645508,
"learning_rate": 2.4074038769031803e-06,
"loss": 2.3757,
"mean_token_accuracy": 0.36030844002962115,
"step": 6520
},
{
"epoch": 0.9135354099734599,
"grad_norm": 3.861067295074463,
"learning_rate": 2.2601921760740107e-06,
"loss": 2.3407,
"mean_token_accuracy": 0.3633116871118546,
"step": 6540
},
{
"epoch": 0.9163290962424919,
"grad_norm": 4.909350395202637,
"learning_rate": 2.1175205259016563e-06,
"loss": 2.3366,
"mean_token_accuracy": 0.3625811696052551,
"step": 6560
},
{
"epoch": 0.9191227825115239,
"grad_norm": 3.506558895111084,
"learning_rate": 1.979402494444915e-06,
"loss": 2.3384,
"mean_token_accuracy": 0.3619318187236786,
"step": 6580
},
{
"epoch": 0.921916468780556,
"grad_norm": 3.9836270809173584,
"learning_rate": 1.845851216713912e-06,
"loss": 2.3518,
"mean_token_accuracy": 0.3620129883289337,
"step": 6600
},
{
"epoch": 0.924710155049588,
"grad_norm": 3.8005995750427246,
"learning_rate": 1.7168793934209893e-06,
"loss": 2.3505,
"mean_token_accuracy": 0.3646915599703789,
"step": 6620
},
{
"epoch": 0.92750384131862,
"grad_norm": 3.654874801635742,
"learning_rate": 1.5924992897728475e-06,
"loss": 2.3256,
"mean_token_accuracy": 0.36566558480262756,
"step": 6640
},
{
"epoch": 0.9302975275876519,
"grad_norm": 3.704705238342285,
"learning_rate": 1.472722734304144e-06,
"loss": 2.305,
"mean_token_accuracy": 0.3631493508815765,
"step": 6660
},
{
"epoch": 0.9330912138566839,
"grad_norm": 4.282053470611572,
"learning_rate": 1.3575611177525926e-06,
"loss": 2.3599,
"mean_token_accuracy": 0.35868506878614426,
"step": 6680
},
{
"epoch": 0.9358849001257159,
"grad_norm": 4.226362228393555,
"learning_rate": 1.247025391975698e-06,
"loss": 2.3347,
"mean_token_accuracy": 0.36542207896709444,
"step": 6700
},
{
"epoch": 0.9386785863947479,
"grad_norm": 3.9076080322265625,
"learning_rate": 1.1411260689092484e-06,
"loss": 2.3658,
"mean_token_accuracy": 0.35722402632236483,
"step": 6720
},
{
"epoch": 0.9414722726637799,
"grad_norm": 4.407858371734619,
"learning_rate": 1.0398732195676331e-06,
"loss": 2.32,
"mean_token_accuracy": 0.3650974050164223,
"step": 6740
},
{
"epoch": 0.9442659589328118,
"grad_norm": 3.794746160507202,
"learning_rate": 9.432764730860744e-07,
"loss": 2.3404,
"mean_token_accuracy": 0.3632305204868317,
"step": 6760
},
{
"epoch": 0.9470596452018438,
"grad_norm": 3.419100046157837,
"learning_rate": 8.513450158049108e-07,
"loss": 2.2942,
"mean_token_accuracy": 0.3678571432828903,
"step": 6780
},
{
"epoch": 0.9498533314708758,
"grad_norm": 3.609236001968384,
"learning_rate": 7.640875903959732e-07,
"loss": 2.3246,
"mean_token_accuracy": 0.36477272808551786,
"step": 6800
},
{
"epoch": 0.9526470177399078,
"grad_norm": 3.939943313598633,
"learning_rate": 6.815124950311557e-07,
"loss": 2.326,
"mean_token_accuracy": 0.36209415793418886,
"step": 6820
},
{
"epoch": 0.9554407040089398,
"grad_norm": 4.04728364944458,
"learning_rate": 6.036275825932525e-07,
"loss": 2.3391,
"mean_token_accuracy": 0.3649350643157959,
"step": 6840
},
{
"epoch": 0.9582343902779717,
"grad_norm": 4.012794494628906,
"learning_rate": 5.304402599291824e-07,
"loss": 2.3555,
"mean_token_accuracy": 0.3619318202137947,
"step": 6860
},
{
"epoch": 0.9610280765470037,
"grad_norm": 3.5218493938446045,
"learning_rate": 4.61957487145559e-07,
"loss": 2.3352,
"mean_token_accuracy": 0.36607143133878706,
"step": 6880
},
{
"epoch": 0.9638217628160357,
"grad_norm": 3.5637614727020264,
"learning_rate": 3.981857769468023e-07,
"loss": 2.3506,
"mean_token_accuracy": 0.363879868388176,
"step": 6900
},
{
"epoch": 0.9666154490850678,
"grad_norm": 4.030181407928467,
"learning_rate": 3.391311940157904e-07,
"loss": 2.3287,
"mean_token_accuracy": 0.36655843555927276,
"step": 6920
},
{
"epoch": 0.9694091353540998,
"grad_norm": 3.6736109256744385,
"learning_rate": 2.8479935443708197e-07,
"loss": 2.3548,
"mean_token_accuracy": 0.36728896349668505,
"step": 6940
},
{
"epoch": 0.9722028216231318,
"grad_norm": 3.6947691440582275,
"learning_rate": 2.3519542516285965e-07,
"loss": 2.3318,
"mean_token_accuracy": 0.3616071417927742,
"step": 6960
},
{
"epoch": 0.9749965078921637,
"grad_norm": 4.099936008453369,
"learning_rate": 1.9032412352153473e-07,
"loss": 2.3518,
"mean_token_accuracy": 0.3631493508815765,
"step": 6980
},
{
"epoch": 0.9777901941611957,
"grad_norm": 3.852231502532959,
"learning_rate": 1.501897167691224e-07,
"loss": 2.3392,
"mean_token_accuracy": 0.3645292207598686,
"step": 7000
},
{
"epoch": 0.9805838804302277,
"grad_norm": 3.9510109424591064,
"learning_rate": 1.1479602168344983e-07,
"loss": 2.3554,
"mean_token_accuracy": 0.3650162324309349,
"step": 7020
},
{
"epoch": 0.9833775666992597,
"grad_norm": 3.799307107925415,
"learning_rate": 8.414640420116305e-08,
"loss": 2.3089,
"mean_token_accuracy": 0.362662336230278,
"step": 7040
},
{
"epoch": 0.9861712529682917,
"grad_norm": 3.7856054306030273,
"learning_rate": 5.824377909763312e-08,
"loss": 2.3135,
"mean_token_accuracy": 0.3685876622796059,
"step": 7060
},
{
"epoch": 0.9889649392373236,
"grad_norm": 3.702875852584839,
"learning_rate": 3.709060970975564e-08,
"loss": 2.3606,
"mean_token_accuracy": 0.36176948100328443,
"step": 7080
},
{
"epoch": 0.9917586255063556,
"grad_norm": 3.9700822830200195,
"learning_rate": 2.068890770169363e-08,
"loss": 2.3494,
"mean_token_accuracy": 0.3568993493914604,
"step": 7100
},
{
"epoch": 0.9945523117753876,
"grad_norm": 3.895416736602783,
"learning_rate": 9.040232873569477e-09,
"loss": 2.3314,
"mean_token_accuracy": 0.3663149356842041,
"step": 7120
},
{
"epoch": 0.9973459980444196,
"grad_norm": 3.7410225868225098,
"learning_rate": 2.145693013116956e-09,
"loss": 2.3568,
"mean_token_accuracy": 0.3575487032532692,
"step": 7140
},
{
"epoch": 1.0,
"mean_token_accuracy": 0.36799384575141103,
"step": 7159,
"total_flos": 1.6530020885331968e+18,
"train_loss": 2.734565882330451,
"train_runtime": 21227.811,
"train_samples_per_second": 18.884,
"train_steps_per_second": 0.337
}
],
"logging_steps": 20,
"max_steps": 7159,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1790,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6530020885331968e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}