{
  "best_metric": 2.6793947219848633,
  "best_model_checkpoint": "data/paligemma2-3b-pt-224-sft-lora-magicsoup_no_cfiphone_no_insta_sub5/checkpoint-3580",
  "epoch": 1.0,
  "eval_steps": 1790,
  "global_step": 7159,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0001396843134515994,
      "grad_norm": 11.714253425598145,
      "learning_rate": 1.3966480446927375e-07,
      "loss": 15.8723,
      "mean_token_accuracy": 0.051948051899671555,
      "step": 1
    },
    {
      "epoch": 0.0027936862690319877,
      "grad_norm": 11.453227996826172,
      "learning_rate": 2.7932960893854746e-06,
      "loss": 15.6868,
      "mean_token_accuracy": 0.04536910433518259,
      "step": 20
    },
    {
      "epoch": 0.0055873725380639755,
      "grad_norm": 14.288809776306152,
      "learning_rate": 5.586592178770949e-06,
      "loss": 15.2233,
      "mean_token_accuracy": 0.04797077886760235,
      "step": 40
    },
    {
      "epoch": 0.008381058807095963,
      "grad_norm": 15.974210739135742,
      "learning_rate": 8.379888268156424e-06,
      "loss": 12.5816,
      "mean_token_accuracy": 0.06761363632977009,
      "step": 60
    },
    {
      "epoch": 0.011174745076127951,
      "grad_norm": 8.255433082580566,
      "learning_rate": 1.1173184357541899e-05,
      "loss": 8.1704,
      "mean_token_accuracy": 0.019724026112817226,
      "step": 80
    },
    {
      "epoch": 0.013968431345159939,
      "grad_norm": 4.060810565948486,
      "learning_rate": 1.3966480446927374e-05,
      "loss": 5.8302,
      "mean_token_accuracy": 0.0771915590390563,
      "step": 100
    },
    {
      "epoch": 0.016762117614191926,
      "grad_norm": 5.3958940505981445,
      "learning_rate": 1.675977653631285e-05,
      "loss": 5.1183,
      "mean_token_accuracy": 0.09870129823684692,
      "step": 120
    },
    {
      "epoch": 0.019555803883223914,
      "grad_norm": 6.21713924407959,
      "learning_rate": 1.9553072625698323e-05,
      "loss": 4.6938,
      "mean_token_accuracy": 0.11225649379193783,
      "step": 140
    },
    {
      "epoch": 0.022349490152255902,
      "grad_norm": 10.742652893066406,
      "learning_rate": 2.2346368715083797e-05,
      "loss": 4.375,
      "mean_token_accuracy": 0.12938311770558358,
      "step": 160
    },
    {
      "epoch": 0.02514317642128789,
      "grad_norm": 14.718498229980469,
      "learning_rate": 2.5139664804469275e-05,
      "loss": 4.1989,
      "mean_token_accuracy": 0.16461038812994958,
      "step": 180
    },
    {
      "epoch": 0.027936862690319877,
      "grad_norm": 10.117100715637207,
      "learning_rate": 2.793296089385475e-05,
      "loss": 3.8056,
      "mean_token_accuracy": 0.2379870109260082,
      "step": 200
    },
    {
      "epoch": 0.030730548959351865,
      "grad_norm": 6.335511207580566,
      "learning_rate": 3.0726256983240227e-05,
      "loss": 3.5194,
      "mean_token_accuracy": 0.2828733794391155,
      "step": 220
    },
    {
      "epoch": 0.03352423522838385,
      "grad_norm": 8.714235305786133,
      "learning_rate": 3.35195530726257e-05,
      "loss": 3.4042,
      "mean_token_accuracy": 0.2905844166874886,
      "step": 240
    },
    {
      "epoch": 0.03631792149741584,
      "grad_norm": 28.925806045532227,
      "learning_rate": 3.6312849162011175e-05,
      "loss": 3.3014,
      "mean_token_accuracy": 0.30211039036512377,
      "step": 260
    },
    {
      "epoch": 0.03911160776644783,
      "grad_norm": 7.385269641876221,
      "learning_rate": 3.9106145251396646e-05,
      "loss": 3.2824,
      "mean_token_accuracy": 0.2980519443750381,
      "step": 280
    },
    {
      "epoch": 0.041905294035479816,
      "grad_norm": 26.001882553100586,
      "learning_rate": 4.1899441340782123e-05,
      "loss": 3.2237,
      "mean_token_accuracy": 0.3021103873848915,
      "step": 300
    },
    {
      "epoch": 0.044698980304511804,
      "grad_norm": 6.193004131317139,
      "learning_rate": 4.4692737430167594e-05,
      "loss": 3.1911,
      "mean_token_accuracy": 0.3050324633717537,
      "step": 320
    },
    {
      "epoch": 0.04749266657354379,
      "grad_norm": 10.755631446838379,
      "learning_rate": 4.748603351955307e-05,
      "loss": 3.1985,
      "mean_token_accuracy": 0.30081168562173843,
      "step": 340
    },
    {
      "epoch": 0.05028635284257578,
      "grad_norm": 13.610740661621094,
      "learning_rate": 5.027932960893855e-05,
      "loss": 3.1074,
      "mean_token_accuracy": 0.31006493270397184,
      "step": 360
    },
    {
      "epoch": 0.05308003911160777,
      "grad_norm": 7.772305011749268,
      "learning_rate": 5.307262569832403e-05,
      "loss": 3.0981,
      "mean_token_accuracy": 0.3107142850756645,
      "step": 380
    },
    {
      "epoch": 0.055873725380639755,
      "grad_norm": 8.072247505187988,
      "learning_rate": 5.58659217877095e-05,
      "loss": 3.0943,
      "mean_token_accuracy": 0.31095779240131377,
      "step": 400
    },
    {
      "epoch": 0.05866741164967174,
      "grad_norm": 7.870853900909424,
      "learning_rate": 5.8659217877094976e-05,
      "loss": 3.0422,
      "mean_token_accuracy": 0.31168831288814547,
      "step": 420
    },
    {
      "epoch": 0.06146109791870373,
      "grad_norm": 19.650827407836914,
      "learning_rate": 6.145251396648045e-05,
      "loss": 3.085,
      "mean_token_accuracy": 0.31087662279605865,
      "step": 440
    },
    {
      "epoch": 0.06425478418773571,
      "grad_norm": 17.583972930908203,
      "learning_rate": 6.424581005586592e-05,
      "loss": 3.0207,
      "mean_token_accuracy": 0.3114448055624962,
      "step": 460
    },
    {
      "epoch": 0.0670484704567677,
      "grad_norm": 8.581819534301758,
      "learning_rate": 6.70391061452514e-05,
      "loss": 3.0396,
      "mean_token_accuracy": 0.31160714030265807,
      "step": 480
    },
    {
      "epoch": 0.06984215672579969,
      "grad_norm": 10.295416831970215,
      "learning_rate": 6.983240223463688e-05,
      "loss": 3.0311,
      "mean_token_accuracy": 0.3094155818223953,
      "step": 500
    },
    {
      "epoch": 0.07263584299483168,
      "grad_norm": 11.54842472076416,
      "learning_rate": 7.262569832402235e-05,
      "loss": 2.9806,
      "mean_token_accuracy": 0.3171266242861748,
      "step": 520
    },
    {
      "epoch": 0.07542952926386366,
      "grad_norm": 6.4206743240356445,
      "learning_rate": 7.541899441340783e-05,
      "loss": 3.0046,
      "mean_token_accuracy": 0.3163961052894592,
      "step": 540
    },
    {
      "epoch": 0.07822321553289566,
      "grad_norm": 6.957503795623779,
      "learning_rate": 7.821229050279329e-05,
      "loss": 2.9919,
      "mean_token_accuracy": 0.31185064762830733,
      "step": 560
    },
    {
      "epoch": 0.08101690180192764,
      "grad_norm": 4.134042263031006,
      "learning_rate": 8.100558659217878e-05,
      "loss": 3.0287,
      "mean_token_accuracy": 0.3153409093618393,
      "step": 580
    },
    {
      "epoch": 0.08381058807095963,
      "grad_norm": 5.571300029754639,
      "learning_rate": 8.379888268156425e-05,
      "loss": 2.9836,
      "mean_token_accuracy": 0.31599026173353195,
      "step": 600
    },
    {
      "epoch": 0.08660427433999161,
      "grad_norm": 5.823517799377441,
      "learning_rate": 8.659217877094973e-05,
      "loss": 2.9693,
      "mean_token_accuracy": 0.3173701331019402,
      "step": 620
    },
    {
      "epoch": 0.08939796060902361,
      "grad_norm": 5.942770004272461,
      "learning_rate": 8.938547486033519e-05,
      "loss": 2.957,
      "mean_token_accuracy": 0.32094155699014665,
      "step": 640
    },
    {
      "epoch": 0.09219164687805559,
      "grad_norm": 5.494757175445557,
      "learning_rate": 9.217877094972067e-05,
      "loss": 2.9391,
      "mean_token_accuracy": 0.3202922075986862,
      "step": 660
    },
    {
      "epoch": 0.09498533314708758,
      "grad_norm": 3.7874789237976074,
      "learning_rate": 9.497206703910614e-05,
      "loss": 2.9662,
      "mean_token_accuracy": 0.318506495654583,
      "step": 680
    },
    {
      "epoch": 0.09777901941611956,
      "grad_norm": 4.3834733963012695,
      "learning_rate": 9.776536312849163e-05,
      "loss": 2.9742,
      "mean_token_accuracy": 0.315259738266468,
      "step": 700
    },
    {
      "epoch": 0.10057270568515156,
      "grad_norm": 3.8033530712127686,
      "learning_rate": 9.999990489938263e-05,
      "loss": 2.9255,
      "mean_token_accuracy": 0.3217532455921173,
      "step": 720
    },
    {
      "epoch": 0.10336639195418354,
      "grad_norm": 5.953765869140625,
      "learning_rate": 9.99965764157593e-05,
      "loss": 2.8773,
      "mean_token_accuracy": 0.32402597516775133,
      "step": 740
    },
    {
      "epoch": 0.10616007822321553,
      "grad_norm": 8.89194107055664,
      "learning_rate": 9.998849326302563e-05,
      "loss": 2.9062,
      "mean_token_accuracy": 0.3172889605164528,
      "step": 760
    },
    {
      "epoch": 0.10895376449224752,
      "grad_norm": 4.883932113647461,
      "learning_rate": 9.997565620988856e-05,
      "loss": 2.889,
      "mean_token_accuracy": 0.324594159424305,
      "step": 780
    },
    {
      "epoch": 0.11174745076127951,
      "grad_norm": 6.9450297355651855,
      "learning_rate": 9.995806647715047e-05,
      "loss": 2.902,
      "mean_token_accuracy": 0.319155840575695,
      "step": 800
    },
    {
      "epoch": 0.11454113703031149,
      "grad_norm": 3.826904296875,
      "learning_rate": 9.99357257375931e-05,
      "loss": 2.845,
      "mean_token_accuracy": 0.32288961112499237,
      "step": 820
    },
    {
      "epoch": 0.11733482329934349,
      "grad_norm": 3.4276959896087646,
      "learning_rate": 9.99086361158184e-05,
      "loss": 2.8776,
      "mean_token_accuracy": 0.31899350732564924,
      "step": 840
    },
    {
      "epoch": 0.12012850956837547,
      "grad_norm": 4.847935199737549,
      "learning_rate": 9.987680018804652e-05,
      "loss": 2.8714,
      "mean_token_accuracy": 0.32126623690128325,
      "step": 860
    },
    {
      "epoch": 0.12292219583740746,
      "grad_norm": 3.4353716373443604,
      "learning_rate": 9.984022098187083e-05,
      "loss": 2.8408,
      "mean_token_accuracy": 0.3215097412467003,
      "step": 880
    },
    {
      "epoch": 0.12571588210643944,
      "grad_norm": 4.255288600921631,
      "learning_rate": 9.979890197596993e-05,
      "loss": 2.887,
      "mean_token_accuracy": 0.3237824708223343,
      "step": 900
    },
    {
      "epoch": 0.12850956837547142,
      "grad_norm": 4.189087867736816,
      "learning_rate": 9.97528470997769e-05,
      "loss": 2.8593,
      "mean_token_accuracy": 0.324756495654583,
      "step": 920
    },
    {
      "epoch": 0.13130325464450343,
      "grad_norm": 2.641284227371216,
      "learning_rate": 9.97020607331056e-05,
      "loss": 2.8606,
      "mean_token_accuracy": 0.32142857313156126,
      "step": 940
    },
    {
      "epoch": 0.1340969409135354,
      "grad_norm": 2.688258171081543,
      "learning_rate": 9.964654770573408e-05,
      "loss": 2.8676,
      "mean_token_accuracy": 0.32646103799343107,
      "step": 960
    },
    {
      "epoch": 0.1368906271825674,
      "grad_norm": 3.801523208618164,
      "learning_rate": 9.958631329694537e-05,
      "loss": 2.8623,
      "mean_token_accuracy": 0.3244318202137947,
      "step": 980
    },
    {
      "epoch": 0.13968431345159937,
      "grad_norm": 3.528196334838867,
      "learning_rate": 9.952136323502536e-05,
      "loss": 2.865,
      "mean_token_accuracy": 0.32459415644407275,
      "step": 1000
    },
    {
      "epoch": 0.14247799972063138,
      "grad_norm": 3.970863103866577,
      "learning_rate": 9.945170369671802e-05,
      "loss": 2.8762,
      "mean_token_accuracy": 0.32102272659540176,
      "step": 1020
    },
    {
      "epoch": 0.14527168598966336,
      "grad_norm": 4.364096641540527,
      "learning_rate": 9.937734130663807e-05,
      "loss": 2.8521,
      "mean_token_accuracy": 0.321185065805912,
      "step": 1040
    },
    {
      "epoch": 0.14806537225869534,
      "grad_norm": 2.6619839668273926,
      "learning_rate": 9.92982831366409e-05,
      "loss": 2.8557,
      "mean_token_accuracy": 0.32167208194732666,
      "step": 1060
    },
    {
      "epoch": 0.15085905852772732,
      "grad_norm": 2.8934223651885986,
      "learning_rate": 9.921453670515009e-05,
      "loss": 2.8138,
      "mean_token_accuracy": 0.3263798728585243,
      "step": 1080
    },
    {
      "epoch": 0.15365274479675933,
      "grad_norm": 3.3745875358581543,
      "learning_rate": 9.91261099764424e-05,
      "loss": 2.8387,
      "mean_token_accuracy": 0.3244318187236786,
      "step": 1100
    },
    {
      "epoch": 0.1564464310657913,
      "grad_norm": 3.961806297302246,
      "learning_rate": 9.903301135989032e-05,
      "loss": 2.833,
      "mean_token_accuracy": 0.324918831884861,
      "step": 1120
    },
    {
      "epoch": 0.1592401173348233,
      "grad_norm": 2.1680517196655273,
      "learning_rate": 9.893524970916242e-05,
      "loss": 2.8295,
      "mean_token_accuracy": 0.3221590921282768,
      "step": 1140
    },
    {
      "epoch": 0.16203380360385528,
      "grad_norm": 2.9997398853302,
      "learning_rate": 9.883283432138129e-05,
      "loss": 2.8284,
      "mean_token_accuracy": 0.32670454382896424,
      "step": 1160
    },
    {
      "epoch": 0.16482748987288728,
      "grad_norm": 6.048096656799316,
      "learning_rate": 9.872577493623945e-05,
      "loss": 2.814,
      "mean_token_accuracy": 0.3250811696052551,
      "step": 1180
    },
    {
      "epoch": 0.16762117614191926,
      "grad_norm": 2.815396547317505,
      "learning_rate": 9.861408173507304e-05,
      "loss": 2.8529,
      "mean_token_accuracy": 0.32467532753944395,
      "step": 1200
    },
    {
      "epoch": 0.17041486241095125,
      "grad_norm": 4.112369537353516,
      "learning_rate": 9.849776533989369e-05,
      "loss": 2.7942,
      "mean_token_accuracy": 0.327272729575634,
      "step": 1220
    },
    {
      "epoch": 0.17320854867998323,
      "grad_norm": 2.6627895832061768,
      "learning_rate": 9.837683681237819e-05,
      "loss": 2.8087,
      "mean_token_accuracy": 0.3275974065065384,
      "step": 1240
    },
    {
      "epoch": 0.17600223494901523,
      "grad_norm": 3.526514768600464,
      "learning_rate": 9.825130765281668e-05,
      "loss": 2.8224,
      "mean_token_accuracy": 0.32508117258548735,
      "step": 1260
    },
    {
      "epoch": 0.17879592121804722,
      "grad_norm": 3.236506938934326,
      "learning_rate": 9.812118979901891e-05,
      "loss": 2.8373,
      "mean_token_accuracy": 0.32394480556249616,
      "step": 1280
    },
    {
      "epoch": 0.1815896074870792,
      "grad_norm": 3.168816566467285,
      "learning_rate": 9.7986495625179e-05,
      "loss": 2.8059,
      "mean_token_accuracy": 0.3279220789670944,
      "step": 1300
    },
    {
      "epoch": 0.18438329375611118,
      "grad_norm": 4.702609062194824,
      "learning_rate": 9.784723794069852e-05,
      "loss": 2.7648,
      "mean_token_accuracy": 0.33125000149011613,
      "step": 1320
    },
    {
      "epoch": 0.18717698002514319,
      "grad_norm": 3.00978684425354,
      "learning_rate": 9.770342998896851e-05,
      "loss": 2.8123,
      "mean_token_accuracy": 0.32670454680919647,
      "step": 1340
    },
    {
      "epoch": 0.18997066629417517,
      "grad_norm": 5.219450950622559,
      "learning_rate": 9.755508544610994e-05,
      "loss": 2.7966,
      "mean_token_accuracy": 0.3340097412467003,
      "step": 1360
    },
    {
      "epoch": 0.19276435256320715,
      "grad_norm": 2.8464038372039795,
      "learning_rate": 9.740221841967307e-05,
      "loss": 2.7738,
      "mean_token_accuracy": 0.32849026173353196,
      "step": 1380
    },
    {
      "epoch": 0.19555803883223913,
      "grad_norm": 3.846038341522217,
      "learning_rate": 9.72448434472959e-05,
      "loss": 2.7938,
      "mean_token_accuracy": 0.3283279240131378,
      "step": 1400
    },
    {
      "epoch": 0.19835172510127114,
      "grad_norm": 2.432882070541382,
      "learning_rate": 9.708297549532157e-05,
      "loss": 2.7805,
      "mean_token_accuracy": 0.329788963496685,
      "step": 1420
    },
    {
      "epoch": 0.20114541137030312,
      "grad_norm": 2.405036687850952,
      "learning_rate": 9.691662995737516e-05,
      "loss": 2.7497,
      "mean_token_accuracy": 0.327353897690773,
      "step": 1440
    },
    {
      "epoch": 0.2039390976393351,
      "grad_norm": 2.1323699951171875,
      "learning_rate": 9.674582265289967e-05,
      "loss": 2.7859,
      "mean_token_accuracy": 0.3321428596973419,
      "step": 1460
    },
    {
      "epoch": 0.20673278390836708,
      "grad_norm": 4.102099418640137,
      "learning_rate": 9.657056982565161e-05,
      "loss": 2.8042,
      "mean_token_accuracy": 0.3272727280855179,
      "step": 1480
    },
    {
      "epoch": 0.2095264701773991,
      "grad_norm": 3.1227469444274902,
      "learning_rate": 9.639088814215627e-05,
      "loss": 2.774,
      "mean_token_accuracy": 0.32987013161182405,
      "step": 1500
    },
    {
      "epoch": 0.21232015644643107,
      "grad_norm": 2.4661705493927,
      "learning_rate": 9.620679469012266e-05,
      "loss": 2.7377,
      "mean_token_accuracy": 0.3315746784210205,
      "step": 1520
    },
    {
      "epoch": 0.21511384271546305,
      "grad_norm": 3.3351097106933594,
      "learning_rate": 9.601830697681853e-05,
      "loss": 2.7786,
      "mean_token_accuracy": 0.3308441549539566,
      "step": 1540
    },
    {
      "epoch": 0.21790752898449503,
      "grad_norm": 2.9032342433929443,
      "learning_rate": 9.582544292740542e-05,
      "loss": 2.7485,
      "mean_token_accuracy": 0.33076298981904984,
      "step": 1560
    },
    {
      "epoch": 0.22070121525352704,
      "grad_norm": 2.4244565963745117,
      "learning_rate": 9.562822088323396e-05,
      "loss": 2.765,
      "mean_token_accuracy": 0.3307629868388176,
      "step": 1580
    },
    {
      "epoch": 0.22349490152255902,
      "grad_norm": 2.1915383338928223,
      "learning_rate": 9.542665960009959e-05,
      "loss": 2.7455,
      "mean_token_accuracy": 0.33303571343421934,
      "step": 1600
    },
    {
      "epoch": 0.226288587791591,
      "grad_norm": 4.944774150848389,
      "learning_rate": 9.522077824645896e-05,
      "loss": 2.7786,
      "mean_token_accuracy": 0.3247564971446991,
      "step": 1620
    },
    {
      "epoch": 0.22908227406062298,
      "grad_norm": 3.094341993331909,
      "learning_rate": 9.501059640160696e-05,
      "loss": 2.7678,
      "mean_token_accuracy": 0.32930195033550264,
      "step": 1640
    },
    {
      "epoch": 0.231875960329655,
      "grad_norm": 3.0549888610839844,
      "learning_rate": 9.479613405381474e-05,
      "loss": 2.7392,
      "mean_token_accuracy": 0.33157467693090437,
      "step": 1660
    },
    {
      "epoch": 0.23466964659868697,
      "grad_norm": 2.4436118602752686,
      "learning_rate": 9.457741159842875e-05,
      "loss": 2.7619,
      "mean_token_accuracy": 0.32897727340459826,
      "step": 1680
    },
    {
      "epoch": 0.23746333286771895,
      "grad_norm": 2.174079179763794,
      "learning_rate": 9.435444983593133e-05,
      "loss": 2.7419,
      "mean_token_accuracy": 0.33214285373687746,
      "step": 1700
    },
    {
      "epoch": 0.24025701913675093,
      "grad_norm": 2.1228203773498535,
      "learning_rate": 9.412726996996242e-05,
      "loss": 2.711,
      "mean_token_accuracy": 0.3351461037993431,
      "step": 1720
    },
    {
      "epoch": 0.24305070540578294,
      "grad_norm": 2.5232136249542236,
      "learning_rate": 9.389589360530315e-05,
      "loss": 2.7503,
      "mean_token_accuracy": 0.33506493717432023,
      "step": 1740
    },
    {
      "epoch": 0.24584439167481492,
      "grad_norm": 2.0878124237060547,
      "learning_rate": 9.366034274582125e-05,
      "loss": 2.7578,
      "mean_token_accuracy": 0.33417208194732667,
      "step": 1760
    },
    {
      "epoch": 0.2486380779438469,
      "grad_norm": 2.1424570083618164,
      "learning_rate": 9.342063979237846e-05,
      "loss": 2.7409,
      "mean_token_accuracy": 0.3298701301217079,
      "step": 1780
    },
    {
      "epoch": 0.2500349210783629,
      "eval_loss": 2.7205357551574707,
      "eval_mean_token_accuracy": 0.33310290950003457,
      "eval_runtime": 1163.4728,
      "eval_samples_per_second": 84.514,
      "eval_steps_per_second": 1.509,
      "step": 1790
    },
    {
      "epoch": 0.2514317642128789,
      "grad_norm": 2.787091016769409,
      "learning_rate": 9.317680754070017e-05,
      "loss": 2.7103,
      "mean_token_accuracy": 0.3347402632236481,
      "step": 1800
    },
    {
      "epoch": 0.25422545048191086,
      "grad_norm": 2.668544292449951,
      "learning_rate": 9.29288691792077e-05,
      "loss": 2.7083,
      "mean_token_accuracy": 0.3347402587532997,
      "step": 1820
    },
    {
      "epoch": 0.25701913675094284,
      "grad_norm": 1.573211669921875,
      "learning_rate": 9.267684828681286e-05,
      "loss": 2.7137,
      "mean_token_accuracy": 0.33206168860197066,
      "step": 1840
    },
    {
      "epoch": 0.2598128230199749,
      "grad_norm": 2.3701252937316895,
      "learning_rate": 9.242076883067579e-05,
      "loss": 2.7062,
      "mean_token_accuracy": 0.3372564911842346,
      "step": 1860
    },
    {
      "epoch": 0.26260650928900686,
      "grad_norm": 2.19620418548584,
      "learning_rate": 9.216065516392555e-05,
      "loss": 2.7239,
      "mean_token_accuracy": 0.33417207896709444,
      "step": 1880
    },
    {
      "epoch": 0.26540019555803884,
      "grad_norm": 2.424910306930542,
      "learning_rate": 9.18965320233443e-05,
      "loss": 2.7034,
      "mean_token_accuracy": 0.33741882890462876,
      "step": 1900
    },
    {
      "epoch": 0.2681938818270708,
      "grad_norm": 1.7149094343185425,
      "learning_rate": 9.162842452701463e-05,
      "loss": 2.7212,
      "mean_token_accuracy": 0.3329545482993126,
      "step": 1920
    },
    {
      "epoch": 0.2709875680961028,
      "grad_norm": 2.663922071456909,
      "learning_rate": 9.1356358171931e-05,
      "loss": 2.7237,
      "mean_token_accuracy": 0.33019480258226397,
      "step": 1940
    },
    {
      "epoch": 0.2737812543651348,
      "grad_norm": 2.727353811264038,
      "learning_rate": 9.10803588315749e-05,
      "loss": 2.7267,
      "mean_token_accuracy": 0.33165584653615954,
      "step": 1960
    },
    {
      "epoch": 0.27657494063416677,
      "grad_norm": 2.4968833923339844,
      "learning_rate": 9.080045275345429e-05,
      "loss": 2.7363,
      "mean_token_accuracy": 0.33368506729602815,
      "step": 1980
    },
    {
      "epoch": 0.27936862690319875,
      "grad_norm": 2.447314977645874,
      "learning_rate": 9.051666655660752e-05,
      "loss": 2.691,
      "mean_token_accuracy": 0.33392857313156127,
      "step": 2000
    },
    {
      "epoch": 0.2821623131722308,
      "grad_norm": 3.4394643306732178,
      "learning_rate": 9.022902722907173e-05,
      "loss": 2.7273,
      "mean_token_accuracy": 0.33084415793418886,
      "step": 2020
    },
    {
      "epoch": 0.28495599944126276,
      "grad_norm": 1.788519024848938,
      "learning_rate": 8.99375621253165e-05,
      "loss": 2.7005,
      "mean_token_accuracy": 0.3343344137072563,
      "step": 2040
    },
    {
      "epoch": 0.28774968571029474,
      "grad_norm": 2.173936367034912,
      "learning_rate": 8.964229896364223e-05,
      "loss": 2.6749,
      "mean_token_accuracy": 0.33271104097366333,
      "step": 2060
    },
    {
      "epoch": 0.2905433719793267,
      "grad_norm": 2.7031307220458984,
      "learning_rate": 8.934326582354426e-05,
      "loss": 2.7266,
      "mean_token_accuracy": 0.3330357104539871,
      "step": 2080
    },
    {
      "epoch": 0.2933370582483587,
      "grad_norm": 1.726846694946289,
      "learning_rate": 8.904049114304247e-05,
      "loss": 2.7049,
      "mean_token_accuracy": 0.33628246933221817,
      "step": 2100
    },
    {
      "epoch": 0.2961307445173907,
      "grad_norm": 2.3227744102478027,
      "learning_rate": 8.873400371597685e-05,
      "loss": 2.7019,
      "mean_token_accuracy": 0.3343344181776047,
      "step": 2120
    },
    {
      "epoch": 0.29892443078642267,
      "grad_norm": 2.884831190109253,
      "learning_rate": 8.842383268926917e-05,
      "loss": 2.7492,
      "mean_token_accuracy": 0.3277597427368164,
      "step": 2140
    },
    {
      "epoch": 0.30171811705545465,
      "grad_norm": 2.0531013011932373,
      "learning_rate": 8.811000756015115e-05,
      "loss": 2.7191,
      "mean_token_accuracy": 0.3326298698782921,
      "step": 2160
    },
    {
      "epoch": 0.3045118033244867,
      "grad_norm": 2.0113601684570312,
      "learning_rate": 8.779255817335927e-05,
      "loss": 2.6986,
      "mean_token_accuracy": 0.3370129883289337,
      "step": 2180
    },
    {
      "epoch": 0.30730548959351867,
      "grad_norm": 2.0892691612243652,
      "learning_rate": 8.74715147182965e-05,
      "loss": 2.7063,
      "mean_token_accuracy": 0.3331980526447296,
      "step": 2200
    },
    {
      "epoch": 0.31009917586255065,
      "grad_norm": 1.9228285551071167,
      "learning_rate": 8.714690772616134e-05,
      "loss": 2.6696,
      "mean_token_accuracy": 0.3384740263223648,
      "step": 2220
    },
    {
      "epoch": 0.3128928621315826,
      "grad_norm": 2.699789524078369,
      "learning_rate": 8.681876806704431e-05,
      "loss": 2.6917,
      "mean_token_accuracy": 0.33693181574344633,
      "step": 2240
    },
    {
      "epoch": 0.3156865484006146,
      "grad_norm": 2.43989896774292,
      "learning_rate": 8.648712694699214e-05,
      "loss": 2.6816,
      "mean_token_accuracy": 0.3405032455921173,
      "step": 2260
    },
    {
      "epoch": 0.3184802346696466,
      "grad_norm": 1.8896594047546387,
      "learning_rate": 8.615201590504017e-05,
      "loss": 2.6781,
      "mean_token_accuracy": 0.33904220908880234,
      "step": 2280
    },
    {
      "epoch": 0.32127392093867857,
      "grad_norm": 2.3167271614074707,
      "learning_rate": 8.58134668102129e-05,
      "loss": 2.6755,
      "mean_token_accuracy": 0.33563312143087387,
      "step": 2300
    },
    {
      "epoch": 0.32406760720771055,
      "grad_norm": 2.3568389415740967,
      "learning_rate": 8.547151185849332e-05,
      "loss": 2.6837,
      "mean_token_accuracy": 0.3358766257762909,
      "step": 2320
    },
    {
      "epoch": 0.3268612934767426,
      "grad_norm": 2.21427845954895,
      "learning_rate": 8.512618356976103e-05,
      "loss": 2.6701,
      "mean_token_accuracy": 0.3395292177796364,
      "step": 2340
    },
    {
      "epoch": 0.32965497974577457,
      "grad_norm": 2.3978805541992188,
      "learning_rate": 8.477751478469964e-05,
      "loss": 2.7054,
      "mean_token_accuracy": 0.3341720774769783,
      "step": 2360
    },
    {
      "epoch": 0.33244866601480655,
      "grad_norm": 2.3003275394439697,
      "learning_rate": 8.442553866167362e-05,
      "loss": 2.6831,
      "mean_token_accuracy": 0.3353896141052246,
      "step": 2380
    },
    {
      "epoch": 0.33524235228383853,
      "grad_norm": 2.208958625793457,
      "learning_rate": 8.40702886735749e-05,
      "loss": 2.7123,
      "mean_token_accuracy": 0.33327922224998474,
      "step": 2400
    },
    {
      "epoch": 0.3380360385528705,
      "grad_norm": 2.9336678981781006,
      "learning_rate": 8.371179860463962e-05,
      "loss": 2.6517,
      "mean_token_accuracy": 0.33806818127632143,
      "step": 2420
    },
    {
      "epoch": 0.3408297248219025,
      "grad_norm": 2.476029634475708,
      "learning_rate": 8.335010254723532e-05,
      "loss": 2.6725,
      "mean_token_accuracy": 0.3363636344671249,
      "step": 2440
    },
    {
      "epoch": 0.34362341109093447,
      "grad_norm": 2.220250129699707,
      "learning_rate": 8.298523489861864e-05,
      "loss": 2.6884,
      "mean_token_accuracy": 0.3369318187236786,
      "step": 2460
    },
    {
      "epoch": 0.34641709735996645,
      "grad_norm": 2.475471258163452,
      "learning_rate": 8.261723035766424e-05,
      "loss": 2.6657,
      "mean_token_accuracy": 0.3395292192697525,
      "step": 2480
    },
    {
      "epoch": 0.3492107836289985,
      "grad_norm": 2.1862926483154297,
      "learning_rate": 8.224612392156492e-05,
      "loss": 2.6489,
      "mean_token_accuracy": 0.33928571343421937,
      "step": 2500
    },
    {
      "epoch": 0.35200446989803047,
      "grad_norm": 3.5017824172973633,
      "learning_rate": 8.187195088250334e-05,
      "loss": 2.6563,
      "mean_token_accuracy": 0.3419642835855484,
      "step": 2520
    },
    {
      "epoch": 0.35479815616706245,
      "grad_norm": 2.0507349967956543,
      "learning_rate": 8.149474682429581e-05,
      "loss": 2.6305,
      "mean_token_accuracy": 0.3375,
      "step": 2540
    },
    {
      "epoch": 0.35759184243609443,
      "grad_norm": 2.558877468109131,
      "learning_rate": 8.111454761900823e-05,
      "loss": 2.6551,
      "mean_token_accuracy": 0.3370129868388176,
      "step": 2560
    },
    {
      "epoch": 0.3603855287051264,
      "grad_norm": 2.8872764110565186,
      "learning_rate": 8.073138942354468e-05,
      "loss": 2.6755,
      "mean_token_accuracy": 0.3384740278124809,
      "step": 2580
    },
    {
      "epoch": 0.3631792149741584,
      "grad_norm": 2.2422327995300293,
      "learning_rate": 8.034530867620884e-05,
      "loss": 2.6338,
      "mean_token_accuracy": 0.34042208045721056,
      "step": 2600
    },
    {
      "epoch": 0.3659729012431904,
      "grad_norm": 3.2413196563720703,
      "learning_rate": 7.995634209323886e-05,
      "loss": 2.659,
      "mean_token_accuracy": 0.33839286118745804,
      "step": 2620
    },
    {
      "epoch": 0.36876658751222235,
      "grad_norm": 1.9311487674713135,
      "learning_rate": 7.956452666531543e-05,
      "loss": 2.6735,
      "mean_token_accuracy": 0.3370129868388176,
      "step": 2640
    },
    {
      "epoch": 0.3715602737812544,
      "grad_norm": 2.613398313522339,
      "learning_rate": 7.91698996540442e-05,
      "loss": 2.6615,
      "mean_token_accuracy": 0.3362824648618698,
      "step": 2660
    },
    {
      "epoch": 0.37435396005028637,
      "grad_norm": 2.089853048324585,
      "learning_rate": 7.877249858841205e-05,
      "loss": 2.6745,
      "mean_token_accuracy": 0.3379058450460434,
      "step": 2680
    },
    {
      "epoch": 0.37714764631931835,
      "grad_norm": 2.8007054328918457,
      "learning_rate": 7.837236126121813e-05,
      "loss": 2.6291,
      "mean_token_accuracy": 0.3419642895460129,
      "step": 2700
    },
    {
      "epoch": 0.37994133258835033,
      "grad_norm": 2.5469701290130615,
      "learning_rate": 7.796952572547979e-05,
      "loss": 2.6565,
      "mean_token_accuracy": 0.33522727340459824,
      "step": 2720
    },
    {
      "epoch": 0.3827350188573823,
      "grad_norm": 2.698303461074829,
      "learning_rate": 7.756403029081371e-05,
      "loss": 2.6347,
      "mean_token_accuracy": 0.34009740352630613,
      "step": 2740
    },
    {
      "epoch": 0.3855287051264143,
      "grad_norm": 2.7376692295074463,
      "learning_rate": 7.71559135197927e-05,
      "loss": 2.6732,
      "mean_token_accuracy": 0.33725649267435076,
      "step": 2760
    },
    {
      "epoch": 0.3883223913954463,
      "grad_norm": 2.4188475608825684,
      "learning_rate": 7.674521422427837e-05,
      "loss": 2.6648,
      "mean_token_accuracy": 0.33855519592761996,
      "step": 2780
    },
    {
      "epoch": 0.39111607766447826,
      "grad_norm": 2.3172502517700195,
      "learning_rate": 7.633197146173011e-05,
      "loss": 2.6581,
      "mean_token_accuracy": 0.33376623392105104,
      "step": 2800
    },
    {
      "epoch": 0.3939097639335103,
      "grad_norm": 2.007584810256958,
      "learning_rate": 7.591622453149078e-05,
      "loss": 2.6422,
      "mean_token_accuracy": 0.3401785731315613,
      "step": 2820
    },
    {
      "epoch": 0.3967034502025423,
      "grad_norm": 2.598574161529541,
      "learning_rate": 7.549801297104935e-05,
      "loss": 2.6408,
      "mean_token_accuracy": 0.33725649267435076,
      "step": 2840
    },
    {
      "epoch": 0.39949713647157425,
      "grad_norm": 2.0446927547454834,
      "learning_rate": 7.50773765522808e-05,
      "loss": 2.6269,
      "mean_token_accuracy": 0.3426136389374733,
      "step": 2860
    },
    {
      "epoch": 0.40229082274060624,
      "grad_norm": 2.690002918243408,
      "learning_rate": 7.465435527766389e-05,
      "loss": 2.5858,
      "mean_token_accuracy": 0.34058441370725634,
      "step": 2880
    },
    {
      "epoch": 0.4050845090096382,
      "grad_norm": 2.8425843715667725,
      "learning_rate": 7.422898937647695e-05,
      "loss": 2.5586,
      "mean_token_accuracy": 0.3442370146512985,
      "step": 2900
    },
    {
      "epoch": 0.4078781952786702,
      "grad_norm": 2.089576482772827,
      "learning_rate": 7.380131930097206e-05,
      "loss": 2.6763,
      "mean_token_accuracy": 0.333441561460495,
      "step": 2920
    },
    {
      "epoch": 0.4106718815477022,
      "grad_norm": 2.146989107131958,
      "learning_rate": 7.337138572252797e-05,
      "loss": 2.5974,
      "mean_token_accuracy": 0.3448863625526428,
      "step": 2940
    },
    {
      "epoch": 0.41346556781673416,
      "grad_norm": 1.8725186586380005,
      "learning_rate": 7.293922952778239e-05,
      "loss": 2.6124,
      "mean_token_accuracy": 0.3443993508815765,
      "step": 2960
    },
    {
      "epoch": 0.4162592540857662,
      "grad_norm": 2.520397186279297,
      "learning_rate": 7.250489181474351e-05,
      "loss": 2.6684,
      "mean_token_accuracy": 0.3356331169605255,
      "step": 2980
    },
    {
      "epoch": 0.4190529403547982,
      "grad_norm": 2.551262378692627,
      "learning_rate": 7.206841388888183e-05,
      "loss": 2.6146,
      "mean_token_accuracy": 0.3406655803322792,
      "step": 3000
    },
    {
      "epoch": 0.42184662662383016,
      "grad_norm": 2.6218693256378174,
      "learning_rate": 7.16298372592017e-05,
      "loss": 2.6026,
      "mean_token_accuracy": 0.3432629868388176,
      "step": 3020
    },
    {
      "epoch": 0.42464031289286214,
      "grad_norm": 1.9419686794281006,
      "learning_rate": 7.118920363429405e-05,
      "loss": 2.6163,
      "mean_token_accuracy": 0.34115259498357775,
      "step": 3040
    },
    {
      "epoch": 0.4274339991618941,
      "grad_norm": 2.822739601135254,
      "learning_rate": 7.074655491836988e-05,
      "loss": 2.6081,
      "mean_token_accuracy": 0.34123376458883287,
      "step": 3060
    },
    {
      "epoch": 0.4302276854309261,
      "grad_norm": 2.17193341255188,
      "learning_rate": 7.030193320727508e-05,
      "loss": 2.5796,
      "mean_token_accuracy": 0.3430194780230522,
      "step": 3080
    },
    {
      "epoch": 0.4330213716999581,
      "grad_norm": 2.2731964588165283,
      "learning_rate": 6.985538078448714e-05,
      "loss": 2.6235,
      "mean_token_accuracy": 0.3362012967467308,
      "step": 3100
    },
    {
      "epoch": 0.43581505796899006,
      "grad_norm": 2.6764817237854004,
      "learning_rate": 6.940694011709411e-05,
      "loss": 2.626,
      "mean_token_accuracy": 0.3414772734045982,
      "step": 3120
    },
    {
      "epoch": 0.4386087442380221,
      "grad_norm": 2.273409366607666,
      "learning_rate": 6.895665385175587e-05,
      "loss": 2.571,
      "mean_token_accuracy": 0.3428571432828903,
      "step": 3140
    },
    {
      "epoch": 0.4414024305070541,
      "grad_norm": 2.117824077606201,
      "learning_rate": 6.850456481064841e-05,
      "loss": 2.6061,
      "mean_token_accuracy": 0.34269480109214784,
      "step": 3160
    },
    {
      "epoch": 0.44419611677608606,
      "grad_norm": 2.7593703269958496,
      "learning_rate": 6.80507159873916e-05,
      "loss": 2.5878,
      "mean_token_accuracy": 0.34480519592761993,
      "step": 3180
    },
    {
      "epoch": 0.44698980304511804,
      "grad_norm": 2.5217463970184326,
      "learning_rate": 6.759515054296033e-05,
      "loss": 2.5867,
      "mean_token_accuracy": 0.3475649327039719,
      "step": 3200
    },
    {
      "epoch": 0.44978348931415,
      "grad_norm": 2.5164499282836914,
      "learning_rate": 6.713791180158004e-05,
      "loss": 2.6199,
      "mean_token_accuracy": 0.3417207792401314,
      "step": 3220
    },
    {
      "epoch": 0.452577175583182,
      "grad_norm": 4.4600701332092285,
      "learning_rate": 6.667904324660648e-05,
      "loss": 2.5562,
      "mean_token_accuracy": 0.3468344137072563,
      "step": 3240
    },
    {
      "epoch": 0.455370861852214,
      "grad_norm": 2.2503256797790527,
      "learning_rate": 6.621858851639052e-05,
      "loss": 2.598,
      "mean_token_accuracy": 0.33928571343421937,
      "step": 3260
    },
    {
      "epoch": 0.45816454812124596,
      "grad_norm": 2.7074501514434814,
      "learning_rate": 6.575659140012813e-05,
      "loss": 2.5749,
      "mean_token_accuracy": 0.34350649267435074,
      "step": 3280
    },
    {
      "epoch": 0.460958234390278,
      "grad_norm": 2.3996992111206055,
      "learning_rate": 6.529309583369605e-05,
      "loss": 2.5655,
      "mean_token_accuracy": 0.3461850643157959,
      "step": 3300
    },
    {
      "epoch": 0.46375192065931,
      "grad_norm": 2.124371290206909,
      "learning_rate": 6.482814589547343e-05,
      "loss": 2.5734,
      "mean_token_accuracy": 0.3435064911842346,
      "step": 3320
    },
    {
      "epoch": 0.46654560692834196,
      "grad_norm": 2.760023832321167,
      "learning_rate": 6.436178580215006e-05,
      "loss": 2.5831,
      "mean_token_accuracy": 0.34821428507566454,
      "step": 3340
    },
    {
      "epoch": 0.46933929319737394,
      "grad_norm": 2.2467293739318848,
      "learning_rate": 6.389405990452131e-05,
      "loss": 2.5386,
      "mean_token_accuracy": 0.34837662279605863,
      "step": 3360
    },
    {
      "epoch": 0.4721329794664059,
      "grad_norm": 2.895301342010498,
      "learning_rate": 6.342501268327036e-05,
      "loss": 2.608,
      "mean_token_accuracy": 0.34366882890462874,
      "step": 3380
    },
    {
      "epoch": 0.4749266657354379,
      "grad_norm": 2.8437559604644775,
      "learning_rate": 6.295468874473824e-05,
      "loss": 2.5744,
      "mean_token_accuracy": 0.34375,
      "step": 3400
    },
    {
      "epoch": 0.4777203520044699,
      "grad_norm": 2.2013747692108154,
      "learning_rate": 6.248313281668151e-05,
      "loss": 2.5572,
      "mean_token_accuracy": 0.34269480407238007,
      "step": 3420
    },
    {
      "epoch": 0.48051403827350186,
      "grad_norm": 2.285017728805542,
      "learning_rate": 6.201038974401893e-05,
      "loss": 2.5755,
      "mean_token_accuracy": 0.3433441549539566,
      "step": 3440
    },
    {
      "epoch": 0.4833077245425339,
      "grad_norm": 3.0466742515563965,
      "learning_rate": 6.15365044845665e-05,
      "loss": 2.5605,
      "mean_token_accuracy": 0.34318181723356245,
      "step": 3460
    },
    {
      "epoch": 0.4861014108115659,
      "grad_norm": 2.7282652854919434,
      "learning_rate": 6.10615221047621e-05,
      "loss": 2.5826,
      "mean_token_accuracy": 0.34439935386180875,
      "step": 3480
    },
    {
      "epoch": 0.48889509708059786,
      "grad_norm": 2.5899875164031982,
      "learning_rate": 6.0585487775379634e-05,
      "loss": 2.569,
      "mean_token_accuracy": 0.3461850628256798,
      "step": 3500
    },
    {
      "epoch": 0.49168878334962984,
      "grad_norm": 2.2856500148773193,
      "learning_rate": 6.0108446767233304e-05,
      "loss": 2.5432,
      "mean_token_accuracy": 0.3460227265954018,
      "step": 3520
    },
    {
      "epoch": 0.4944824696186618,
      "grad_norm": 2.4979193210601807,
      "learning_rate": 5.963044444687235e-05,
      "loss": 2.5524,
      "mean_token_accuracy": 0.3460227265954018,
      "step": 3540
    },
    {
      "epoch": 0.4972761558876938,
      "grad_norm": 3.4747583866119385,
      "learning_rate": 5.91515262722667e-05,
      "loss": 2.5506,
      "mean_token_accuracy": 0.34025973826646805,
      "step": 3560
    },
    {
      "epoch": 0.5000698421567258,
      "grad_norm": 2.0753846168518066,
      "learning_rate": 5.867173778848394e-05,
      "loss": 2.5333,
      "mean_token_accuracy": 0.34845779091119766,
      "step": 3580
    },
    {
      "epoch": 0.5000698421567258,
      "eval_loss": 2.6793947219848633,
      "eval_mean_token_accuracy": 0.3425768423901875,
      "eval_runtime": 994.8268,
      "eval_samples_per_second": 98.841,
      "eval_steps_per_second": 1.765,
      "step": 3580
    },
    {
      "epoch": 0.5028635284257578,
      "grad_norm": 2.6756086349487305,
      "learning_rate": 5.819112462335792e-05,
      "loss": 2.5148,
      "mean_token_accuracy": 0.34748376458883284,
      "step": 3600
    },
    {
      "epoch": 0.5056572146947897,
      "grad_norm": 2.5338103771209717,
      "learning_rate": 5.770973248314965e-05,
      "loss": 2.5498,
      "mean_token_accuracy": 0.3452110379934311,
      "step": 3620
    },
    {
      "epoch": 0.5084509009638217,
      "grad_norm": 2.2562379837036133,
      "learning_rate": 5.722760714820057e-05,
      "loss": 2.5533,
      "mean_token_accuracy": 0.34634740203619,
      "step": 3640
    },
    {
      "epoch": 0.5112445872328537,
      "grad_norm": 2.7285821437835693,
      "learning_rate": 5.674479446857885e-05,
      "loss": 2.5056,
      "mean_token_accuracy": 0.3499999985098839,
      "step": 3660
    },
    {
      "epoch": 0.5140382735018857,
      "grad_norm": 2.5474276542663574,
      "learning_rate": 5.626134035971908e-05,
      "loss": 2.5642,
      "mean_token_accuracy": 0.34456168711185453,
      "step": 3680
    },
    {
      "epoch": 0.5168319597709177,
      "grad_norm": 2.749859094619751,
      "learning_rate": 5.577729079805569e-05,
      "loss": 2.5206,
      "mean_token_accuracy": 0.34821428507566454,
      "step": 3700
    },
    {
      "epoch": 0.5196256460399498,
      "grad_norm": 3.221196413040161,
      "learning_rate": 5.529269181665064e-05,
      "loss": 2.5129,
      "mean_token_accuracy": 0.3448051944375038,
      "step": 3720
    },
    {
      "epoch": 0.5224193323089817,
      "grad_norm": 3.7197160720825195,
      "learning_rate": 5.4807589500815606e-05,
      "loss": 2.5394,
      "mean_token_accuracy": 0.3462662324309349,
      "step": 3740
    },
    {
      "epoch": 0.5252130185780137,
      "grad_norm": 2.5209450721740723,
      "learning_rate": 5.432202998372932e-05,
      "loss": 2.521,
      "mean_token_accuracy": 0.3460227236151695,
      "step": 3760
    },
    {
      "epoch": 0.5280067048470457,
      "grad_norm": 2.6848323345184326,
      "learning_rate": 5.383605944205033e-05,
      "loss": 2.5506,
      "mean_token_accuracy": 0.3453733742237091,
      "step": 3780
    },
    {
      "epoch": 0.5308003911160777,
      "grad_norm": 2.5749640464782715,
      "learning_rate": 5.334972409152559e-05,
      "loss": 2.5357,
      "mean_token_accuracy": 0.34399350732564926,
      "step": 3800
    },
    {
      "epoch": 0.5335940773851097,
      "grad_norm": 2.7281625270843506,
      "learning_rate": 5.286307018259529e-05,
      "loss": 2.5287,
      "mean_token_accuracy": 0.3456168845295906,
      "step": 3820
    },
    {
      "epoch": 0.5363877636541416,
      "grad_norm": 2.868173122406006,
      "learning_rate": 5.237614399599451e-05,
      "loss": 2.5066,
      "mean_token_accuracy": 0.3488636389374733,
      "step": 3840
    },
    {
      "epoch": 0.5391814499231736,
      "grad_norm": 2.461381673812866,
      "learning_rate": 5.1888991838351916e-05,
      "loss": 2.52,
      "mean_token_accuracy": 0.34163961112499236,
      "step": 3860
    },
    {
      "epoch": 0.5419751361922056,
      "grad_norm": 2.8189141750335693,
      "learning_rate": 5.140166003778603e-05,
      "loss": 2.5391,
      "mean_token_accuracy": 0.34618506133556365,
      "step": 3880
    },
    {
      "epoch": 0.5447688224612376,
      "grad_norm": 2.760181427001953,
      "learning_rate": 5.091419493949929e-05,
      "loss": 2.5464,
      "mean_token_accuracy": 0.34740259647369387,
      "step": 3900
    },
    {
      "epoch": 0.5475625087302696,
      "grad_norm": 3.0051541328430176,
      "learning_rate": 5.042664290137086e-05,
      "loss": 2.5276,
      "mean_token_accuracy": 0.3481331169605255,
      "step": 3920
    },
    {
      "epoch": 0.5503561949993016,
      "grad_norm": 2.959178924560547,
      "learning_rate": 4.993905028954778e-05,
      "loss": 2.5281,
      "mean_token_accuracy": 0.34642857015132905,
      "step": 3940
    },
    {
      "epoch": 0.5531498812683335,
      "grad_norm": 2.7471230030059814,
      "learning_rate": 4.94514634740357e-05,
      "loss": 2.5127,
      "mean_token_accuracy": 0.34821428507566454,
      "step": 3960
    },
    {
      "epoch": 0.5559435675373655,
      "grad_norm": 3.817932605743408,
      "learning_rate": 4.896392882428901e-05,
      "loss": 2.5538,
      "mean_token_accuracy": 0.3424512967467308,
      "step": 3980
    },
    {
      "epoch": 0.5587372538063975,
      "grad_norm": 2.950193166732788,
      "learning_rate": 4.847649270480117e-05,
      "loss": 2.4831,
      "mean_token_accuracy": 0.34951298832893374,
      "step": 4000
    },
    {
      "epoch": 0.5615309400754295,
      "grad_norm": 3.4689207077026367,
      "learning_rate": 4.7989201470695396e-05,
      "loss": 2.4955,
      "mean_token_accuracy": 0.34439934939146044,
      "step": 4020
    },
    {
      "epoch": 0.5643246263444616,
      "grad_norm": 2.9318456649780273,
      "learning_rate": 4.750210146331632e-05,
      "loss": 2.5267,
      "mean_token_accuracy": 0.34691558331251143,
      "step": 4040
    },
    {
      "epoch": 0.5671183126134935,
      "grad_norm": 2.702409505844116,
      "learning_rate": 4.701523900582295e-05,
      "loss": 2.5095,
      "mean_token_accuracy": 0.34829545617103574,
      "step": 4060
    },
    {
      "epoch": 0.5699119988825255,
      "grad_norm": 2.8087456226348877,
      "learning_rate": 4.6528660398783326e-05,
      "loss": 2.4989,
      "mean_token_accuracy": 0.3536525994539261,
      "step": 4080
    },
    {
      "epoch": 0.5727056851515575,
      "grad_norm": 2.8514254093170166,
      "learning_rate": 4.6042411915771306e-05,
      "loss": 2.5446,
      "mean_token_accuracy": 0.34764610081911085,
      "step": 4100
    },
    {
      "epoch": 0.5754993714205895,
      "grad_norm": 2.71073317527771,
      "learning_rate": 4.555653979896603e-05,
      "loss": 2.503,
      "mean_token_accuracy": 0.34829545170068743,
      "step": 4120
    },
    {
      "epoch": 0.5782930576896215,
      "grad_norm": 2.834017753601074,
      "learning_rate": 4.507109025475423e-05,
      "loss": 2.4925,
      "mean_token_accuracy": 0.34853895753622055,
      "step": 4140
    },
    {
      "epoch": 0.5810867439586535,
      "grad_norm": 2.7667429447174072,
      "learning_rate": 4.4586109449336045e-05,
      "loss": 2.4791,
      "mean_token_accuracy": 0.34967532455921174,
      "step": 4160
    },
    {
      "epoch": 0.5838804302276854,
      "grad_norm": 2.7539703845977783,
      "learning_rate": 4.410164350433457e-05,
      "loss": 2.481,
      "mean_token_accuracy": 0.3508928567171097,
      "step": 4180
    },
    {
      "epoch": 0.5866741164967174,
      "grad_norm": 2.7576351165771484,
      "learning_rate": 4.361773849240977e-05,
      "loss": 2.5077,
      "mean_token_accuracy": 0.3529220804572105,
      "step": 4200
    },
    {
      "epoch": 0.5894678027657494,
      "grad_norm": 2.8457632064819336,
      "learning_rate": 4.313444043287691e-05,
      "loss": 2.5038,
      "mean_token_accuracy": 0.34740259796380996,
      "step": 4220
    },
    {
      "epoch": 0.5922614890347814,
      "grad_norm": 3.2245895862579346,
      "learning_rate": 4.265179528733017e-05,
      "loss": 2.5013,
      "mean_token_accuracy": 0.3470779210329056,
      "step": 4240
    },
    {
      "epoch": 0.5950551753038134,
      "grad_norm": 2.93704891204834,
      "learning_rate": 4.2169848955271624e-05,
      "loss": 2.4865,
      "mean_token_accuracy": 0.3499188289046288,
      "step": 4260
    },
    {
      "epoch": 0.5978488615728453,
      "grad_norm": 2.9629595279693604,
      "learning_rate": 4.1688647269746324e-05,
      "loss": 2.5028,
      "mean_token_accuracy": 0.3491883099079132,
      "step": 4280
    },
    {
      "epoch": 0.6006425478418773,
      "grad_norm": 3.764291286468506,
      "learning_rate": 4.120823599298349e-05,
      "loss": 2.4974,
      "mean_token_accuracy": 0.3502435013651848,
      "step": 4300
    },
    {
      "epoch": 0.6034362341109093,
      "grad_norm": 3.1387903690338135,
      "learning_rate": 4.0728660812044536e-05,
      "loss": 2.4754,
      "mean_token_accuracy": 0.3487012982368469,
      "step": 4320
    },
    {
      "epoch": 0.6062299203799413,
      "grad_norm": 3.713451385498047,
      "learning_rate": 4.0249967334478266e-05,
      "loss": 2.4594,
      "mean_token_accuracy": 0.35519480258226394,
      "step": 4340
    },
    {
      "epoch": 0.6090236066489734,
      "grad_norm": 3.5359973907470703,
      "learning_rate": 3.9772201083983596e-05,
      "loss": 2.4914,
      "mean_token_accuracy": 0.34983766078948975,
      "step": 4360
    },
    {
      "epoch": 0.6118172929180054,
      "grad_norm": 3.2408080101013184,
      "learning_rate": 3.929540749608024e-05,
      "loss": 2.4659,
      "mean_token_accuracy": 0.3521103858947754,
      "step": 4380
    },
    {
      "epoch": 0.6146109791870373,
      "grad_norm": 3.0888562202453613,
      "learning_rate": 3.881963191378778e-05,
      "loss": 2.4789,
      "mean_token_accuracy": 0.3471590906381607,
      "step": 4400
    },
    {
      "epoch": 0.6174046654560693,
      "grad_norm": 3.0336287021636963,
      "learning_rate": 3.83449195833136e-05,
      "loss": 2.4546,
      "mean_token_accuracy": 0.34886363446712493,
      "step": 4420
    },
    {
      "epoch": 0.6201983517251013,
      "grad_norm": 3.150221347808838,
      "learning_rate": 3.7871315649749953e-05,
      "loss": 2.4682,
      "mean_token_accuracy": 0.3517857164144516,
      "step": 4440
    },
    {
      "epoch": 0.6229920379941333,
      "grad_norm": 2.995313882827759,
      "learning_rate": 3.739886515278066e-05,
      "loss": 2.4592,
      "mean_token_accuracy": 0.35413961112499237,
      "step": 4460
    },
    {
      "epoch": 0.6257857242631653,
      "grad_norm": 3.334155321121216,
      "learning_rate": 3.692761302239779e-05,
      "loss": 2.4621,
      "mean_token_accuracy": 0.3563311696052551,
      "step": 4480
    },
    {
      "epoch": 0.6285794105321972,
      "grad_norm": 3.2132041454315186,
      "learning_rate": 3.645760407462896e-05,
      "loss": 2.4589,
      "mean_token_accuracy": 0.35081168860197065,
      "step": 4500
    },
    {
      "epoch": 0.6313730968012292,
      "grad_norm": 3.1528499126434326,
      "learning_rate": 3.598888300727521e-05,
      "loss": 2.4256,
      "mean_token_accuracy": 0.3550324648618698,
      "step": 4520
    },
    {
      "epoch": 0.6341667830702612,
      "grad_norm": 3.3648734092712402,
      "learning_rate": 3.552149439566029e-05,
      "loss": 2.4552,
      "mean_token_accuracy": 0.35657467097043993,
      "step": 4540
    },
    {
      "epoch": 0.6369604693392932,
      "grad_norm": 3.0252137184143066,
      "learning_rate": 3.505548268839155e-05,
      "loss": 2.4769,
      "mean_token_accuracy": 0.3514610409736633,
      "step": 4560
    },
    {
      "epoch": 0.6397541556083252,
      "grad_norm": 2.9841442108154297,
      "learning_rate": 3.45908922031329e-05,
      "loss": 2.4483,
      "mean_token_accuracy": 0.3573863670229912,
      "step": 4580
    },
    {
      "epoch": 0.6425478418773571,
      "grad_norm": 3.206582546234131,
      "learning_rate": 3.412776712239016e-05,
      "loss": 2.4437,
      "mean_token_accuracy": 0.3538961037993431,
      "step": 4600
    },
    {
      "epoch": 0.6453415281463891,
      "grad_norm": 2.9362432956695557,
      "learning_rate": 3.3666151489309364e-05,
      "loss": 2.4284,
      "mean_token_accuracy": 0.35787337720394136,
      "step": 4620
    },
    {
      "epoch": 0.6481352144154211,
      "grad_norm": 3.4137799739837646,
      "learning_rate": 3.32060892034882e-05,
      "loss": 2.4621,
      "mean_token_accuracy": 0.35308441519737244,
      "step": 4640
    },
    {
      "epoch": 0.6509289006844531,
      "grad_norm": 3.3628833293914795,
      "learning_rate": 3.274762401680124e-05,
      "loss": 2.4417,
      "mean_token_accuracy": 0.35600649267435075,
      "step": 4660
    },
    {
      "epoch": 0.6537225869534852,
      "grad_norm": 3.157782793045044,
      "learning_rate": 3.229079952923908e-05,
      "loss": 2.4363,
      "mean_token_accuracy": 0.3574675336480141,
      "step": 4680
    },
    {
      "epoch": 0.6565162732225172,
      "grad_norm": 3.448489189147949,
      "learning_rate": 3.183565918476198e-05,
      "loss": 2.4261,
      "mean_token_accuracy": 0.3572240278124809,
      "step": 4700
    },
    {
      "epoch": 0.6593099594915491,
      "grad_norm": 3.607093572616577,
      "learning_rate": 3.1382246267168386e-05,
      "loss": 2.4279,
      "mean_token_accuracy": 0.35633117109537127,
      "step": 4720
    },
    {
      "epoch": 0.6621036457605811,
      "grad_norm": 3.165947914123535,
      "learning_rate": 3.093060389597865e-05,
      "loss": 2.4558,
      "mean_token_accuracy": 0.3527597412467003,
      "step": 4740
    },
    {
      "epoch": 0.6648973320296131,
      "grad_norm": 3.2351622581481934,
      "learning_rate": 3.048077502233434e-05,
      "loss": 2.4079,
      "mean_token_accuracy": 0.35860389471054077,
      "step": 4760
    },
    {
      "epoch": 0.6676910182986451,
      "grad_norm": 3.320864677429199,
      "learning_rate": 3.0032802424913563e-05,
      "loss": 2.4491,
      "mean_token_accuracy": 0.3559253215789795,
      "step": 4780
    },
    {
      "epoch": 0.6704847045676771,
      "grad_norm": 3.1946163177490234,
      "learning_rate": 2.9586728705862813e-05,
      "loss": 2.4654,
      "mean_token_accuracy": 0.3549513012170792,
      "step": 4800
    },
    {
      "epoch": 0.673278390836709,
      "grad_norm": 3.2893896102905273,
      "learning_rate": 2.914259628674542e-05,
      "loss": 2.4384,
      "mean_token_accuracy": 0.35592532306909563,
      "step": 4820
    },
    {
      "epoch": 0.676072077105741,
      "grad_norm": 3.068572521209717,
      "learning_rate": 2.870044740450729e-05,
      "loss": 2.4351,
      "mean_token_accuracy": 0.356168831884861,
      "step": 4840
    },
    {
      "epoch": 0.678865763374773,
      "grad_norm": 3.2332894802093506,
      "learning_rate": 2.8260324107460197e-05,
      "loss": 2.4432,
      "mean_token_accuracy": 0.35292208194732666,
      "step": 4860
    },
    {
      "epoch": 0.681659449643805,
      "grad_norm": 3.183605194091797,
      "learning_rate": 2.7822268251282975e-05,
      "loss": 2.4073,
      "mean_token_accuracy": 0.3563311696052551,
      "step": 4880
    },
    {
      "epoch": 0.684453135912837,
      "grad_norm": 3.625239133834839,
      "learning_rate": 2.7386321495041047e-05,
      "loss": 2.4071,
      "mean_token_accuracy": 0.35738636553287506,
      "step": 4900
    },
    {
      "epoch": 0.6872468221818689,
      "grad_norm": 3.204867362976074,
      "learning_rate": 2.695252529722467e-05,
      "loss": 2.415,
      "mean_token_accuracy": 0.3589285746216774,
      "step": 4920
    },
    {
      "epoch": 0.6900405084509009,
      "grad_norm": 3.107151985168457,
      "learning_rate": 2.65209209118062e-05,
      "loss": 2.4391,
      "mean_token_accuracy": 0.35519480854272845,
      "step": 4940
    },
    {
      "epoch": 0.6928341947199329,
      "grad_norm": 3.3900489807128906,
      "learning_rate": 2.6091549384316883e-05,
      "loss": 2.4317,
      "mean_token_accuracy": 0.3538961037993431,
      "step": 4960
    },
    {
      "epoch": 0.6956278809889649,
      "grad_norm": 3.0111920833587646,
      "learning_rate": 2.566445154794341e-05,
      "loss": 2.4185,
      "mean_token_accuracy": 0.3533279225230217,
      "step": 4980
    },
    {
      "epoch": 0.698421567257997,
      "grad_norm": 3.488588809967041,
      "learning_rate": 2.523966801964468e-05,
      "loss": 2.4098,
      "mean_token_accuracy": 0.3501623347401619,
      "step": 5000
    },
    {
      "epoch": 0.701215253527029,
      "grad_norm": 3.103729724884033,
      "learning_rate": 2.481723919628916e-05,
      "loss": 2.4228,
      "mean_token_accuracy": 0.35275974273681643,
      "step": 5020
    },
    {
      "epoch": 0.7040089397960609,
      "grad_norm": 3.421889305114746,
      "learning_rate": 2.4397205250813104e-05,
      "loss": 2.4011,
      "mean_token_accuracy": 0.3581980481743813,
      "step": 5040
    },
    {
      "epoch": 0.7068026260650929,
      "grad_norm": 3.167423725128174,
      "learning_rate": 2.3979606128400162e-05,
      "loss": 2.4235,
      "mean_token_accuracy": 0.3530844137072563,
      "step": 5060
    },
    {
      "epoch": 0.7095963123341249,
      "grad_norm": 3.6433827877044678,
      "learning_rate": 2.3564481542682516e-05,
      "loss": 2.4054,
      "mean_token_accuracy": 0.3555194824934006,
      "step": 5080
    },
    {
      "epoch": 0.7123899986031569,
      "grad_norm": 3.3545353412628174,
      "learning_rate": 2.3151870971964224e-05,
      "loss": 2.431,
      "mean_token_accuracy": 0.35308441519737244,
      "step": 5100
    },
    {
      "epoch": 0.7151836848721889,
      "grad_norm": 3.181476593017578,
      "learning_rate": 2.2741813655466758e-05,
      "loss": 2.39,
      "mean_token_accuracy": 0.3538961037993431,
      "step": 5120
    },
    {
      "epoch": 0.7179773711412208,
      "grad_norm": 3.6350057125091553,
      "learning_rate": 2.2334348589597404e-05,
      "loss": 2.4131,
      "mean_token_accuracy": 0.35202922075986864,
      "step": 5140
    },
    {
      "epoch": 0.7207710574102528,
      "grad_norm": 4.146636962890625,
      "learning_rate": 2.1929514524240667e-05,
      "loss": 2.4173,
      "mean_token_accuracy": 0.3597402602434158,
      "step": 5160
    },
    {
      "epoch": 0.7235647436792848,
      "grad_norm": 4.2176384925842285,
      "learning_rate": 2.15273499590732e-05,
      "loss": 2.4013,
      "mean_token_accuracy": 0.3566558465361595,
      "step": 5180
    },
    {
      "epoch": 0.7263584299483168,
      "grad_norm": 3.659813165664673,
      "learning_rate": 2.112789313990246e-05,
      "loss": 2.4012,
      "mean_token_accuracy": 0.35600649416446684,
      "step": 5200
    },
    {
      "epoch": 0.7291521162173488,
      "grad_norm": 3.0530693531036377,
      "learning_rate": 2.073118205502957e-05,
      "loss": 2.4082,
      "mean_token_accuracy": 0.3566558450460434,
      "step": 5220
    },
    {
      "epoch": 0.7319458024863807,
      "grad_norm": 4.101310729980469,
      "learning_rate": 2.0337254431636548e-05,
      "loss": 2.4011,
      "mean_token_accuracy": 0.3585227280855179,
      "step": 5240
    },
    {
      "epoch": 0.7347394887554127,
      "grad_norm": 4.217702388763428,
      "learning_rate": 1.99461477321986e-05,
      "loss": 2.3857,
      "mean_token_accuracy": 0.35884740203619003,
      "step": 5260
    },
    {
      "epoch": 0.7375331750244447,
      "grad_norm": 3.2049005031585693,
      "learning_rate": 1.9557899150921317e-05,
      "loss": 2.3847,
      "mean_token_accuracy": 0.35649350583553313,
      "step": 5280
    },
    {
      "epoch": 0.7403268612934767,
      "grad_norm": 3.6392228603363037,
      "learning_rate": 1.9172545610203575e-05,
      "loss": 2.435,
      "mean_token_accuracy": 0.35349026024341584,
      "step": 5300
    },
    {
      "epoch": 0.7431205475625088,
      "grad_norm": 3.226102352142334,
      "learning_rate": 1.8790123757126195e-05,
      "loss": 2.3908,
      "mean_token_accuracy": 0.3583603873848915,
      "step": 5320
    },
    {
      "epoch": 0.7459142338315408,
      "grad_norm": 3.2937116622924805,
      "learning_rate": 1.84106699599668e-05,
      "loss": 2.3964,
      "mean_token_accuracy": 0.3568181827664375,
      "step": 5340
    },
    {
      "epoch": 0.7487079201005727,
      "grad_norm": 3.6983494758605957,
      "learning_rate": 1.803422030474126e-05,
      "loss": 2.3886,
      "mean_token_accuracy": 0.35698052048683165,
      "step": 5360
    },
    {
      "epoch": 0.7501047632350887,
      "eval_loss": 2.7606494426727295,
      "eval_mean_token_accuracy": 0.3412951716176516,
      "eval_runtime": 989.3775,
      "eval_samples_per_second": 99.386,
      "eval_steps_per_second": 1.775,
      "step": 5370
    },
    {
      "epoch": 0.7515016063696047,
      "grad_norm": 3.6565871238708496,
      "learning_rate": 1.7660810591771785e-05,
      "loss": 2.4133,
      "mean_token_accuracy": 0.36103896498680116,
      "step": 5380
    },
    {
      "epoch": 0.7542952926386367,
      "grad_norm": 3.315748691558838,
      "learning_rate": 1.7290476332282468e-05,
      "loss": 2.3908,
      "mean_token_accuracy": 0.36071428656578064,
      "step": 5400
    },
    {
      "epoch": 0.7570889789076687,
      "grad_norm": 3.21600604057312,
      "learning_rate": 1.6923252745022062e-05,
      "loss": 2.3913,
      "mean_token_accuracy": 0.3598214253783226,
      "step": 5420
    },
    {
      "epoch": 0.7598826651767007,
      "grad_norm": 4.019015312194824,
      "learning_rate": 1.6559174752914754e-05,
      "loss": 2.394,
      "mean_token_accuracy": 0.35957791954278945,
      "step": 5440
    },
    {
      "epoch": 0.7626763514457326,
      "grad_norm": 3.757371664047241,
      "learning_rate": 1.6198276979738942e-05,
      "loss": 2.4179,
      "mean_token_accuracy": 0.35170454531908035,
      "step": 5460
    },
    {
      "epoch": 0.7654700377147646,
      "grad_norm": 3.8441550731658936,
      "learning_rate": 1.5840593746834546e-05,
      "loss": 2.3617,
      "mean_token_accuracy": 0.3576298698782921,
      "step": 5480
    },
    {
      "epoch": 0.7682637239837966,
      "grad_norm": 3.4171853065490723,
      "learning_rate": 1.5486159069839058e-05,
      "loss": 2.3991,
      "mean_token_accuracy": 0.3582792207598686,
      "step": 5500
    },
    {
      "epoch": 0.7710574102528286,
      "grad_norm": 3.999089002609253,
      "learning_rate": 1.5135006655452644e-05,
      "loss": 2.385,
      "mean_token_accuracy": 0.35746753215789795,
      "step": 5520
    },
    {
      "epoch": 0.7738510965218606,
      "grad_norm": 3.6210405826568604,
      "learning_rate": 1.4787169898232618e-05,
      "loss": 2.3821,
      "mean_token_accuracy": 0.3604707822203636,
      "step": 5540
    },
    {
      "epoch": 0.7766447827908926,
      "grad_norm": 3.676494836807251,
      "learning_rate": 1.4442681877417686e-05,
      "loss": 2.3897,
      "mean_token_accuracy": 0.35560064762830734,
      "step": 5560
    },
    {
      "epoch": 0.7794384690599245,
      "grad_norm": 3.83720326423645,
      "learning_rate": 1.410157535378206e-05,
      "loss": 2.3225,
      "mean_token_accuracy": 0.3604707807302475,
      "step": 5580
    },
    {
      "epoch": 0.7822321553289565,
      "grad_norm": 3.8904194831848145,
      "learning_rate": 1.3763882766519926e-05,
      "loss": 2.3878,
      "mean_token_accuracy": 0.35600649267435075,
      "step": 5600
    },
    {
      "epoch": 0.7850258415979885,
|
"grad_norm": 3.4765756130218506, |
|
"learning_rate": 1.3429636230160498e-05, |
|
"loss": 2.3814, |
|
"mean_token_accuracy": 0.3632305219769478, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.7878195278670206, |
|
"grad_norm": 3.398759126663208, |
|
"learning_rate": 1.3098867531513903e-05, |
|
"loss": 2.3775, |
|
"mean_token_accuracy": 0.3590909048914909, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.7906132141360526, |
|
"grad_norm": 3.791919231414795, |
|
"learning_rate": 1.2771608126648293e-05, |
|
"loss": 2.3942, |
|
"mean_token_accuracy": 0.3641233786940575, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.7934069004050845, |
|
"grad_norm": 3.9098916053771973, |
|
"learning_rate": 1.2447889137898293e-05, |
|
"loss": 2.3438, |
|
"mean_token_accuracy": 0.36055194288492204, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.7962005866741165, |
|
"grad_norm": 3.677560567855835, |
|
"learning_rate": 1.2127741350905397e-05, |
|
"loss": 2.3837, |
|
"mean_token_accuracy": 0.3632305219769478, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.7989942729431485, |
|
"grad_norm": 3.279231548309326, |
|
"learning_rate": 1.1811195211690169e-05, |
|
"loss": 2.3939, |
|
"mean_token_accuracy": 0.3599026009440422, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.8017879592121805, |
|
"grad_norm": 3.5640265941619873, |
|
"learning_rate": 1.1498280823756841e-05, |
|
"loss": 2.3774, |
|
"mean_token_accuracy": 0.3581980511546135, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.8045816454812125, |
|
"grad_norm": 3.7095561027526855, |
|
"learning_rate": 1.1189027945230496e-05, |
|
"loss": 2.3944, |
|
"mean_token_accuracy": 0.35584415346384046, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.8073753317502445, |
|
"grad_norm": 3.6211345195770264, |
|
"learning_rate": 1.0883465986027059e-05, |
|
"loss": 2.3879, |
|
"mean_token_accuracy": 0.3619318217039108, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.8101690180192764, |
|
"grad_norm": 3.5692007541656494, |
|
"learning_rate": 1.0581624005056424e-05, |
|
"loss": 2.3611, |
|
"mean_token_accuracy": 0.36030844151973723, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.8129627042883084, |
|
"grad_norm": 3.9471545219421387, |
|
"learning_rate": 1.0283530707458922e-05, |
|
"loss": 2.4062, |
|
"mean_token_accuracy": 0.356331168115139, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.8157563905573404, |
|
"grad_norm": 4.07960844039917, |
|
"learning_rate": 9.989214441875522e-06, |
|
"loss": 2.3634, |
|
"mean_token_accuracy": 0.36566558480262756, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.8185500768263724, |
|
"grad_norm": 3.9245829582214355, |
|
"learning_rate": 9.698703197751851e-06, |
|
"loss": 2.332, |
|
"mean_token_accuracy": 0.3642857164144516, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.8213437630954044, |
|
"grad_norm": 3.7653911113739014, |
|
"learning_rate": 9.412024602676378e-06, |
|
"loss": 2.3998, |
|
"mean_token_accuracy": 0.35511363595724105, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.8241374493644363, |
|
"grad_norm": 3.9197909832000732, |
|
"learning_rate": 9.129205919753075e-06, |
|
"loss": 2.3641, |
|
"mean_token_accuracy": 0.3629870146512985, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.8269311356334683, |
|
"grad_norm": 3.7501025199890137, |
|
"learning_rate": 8.850274045008666e-06, |
|
"loss": 2.3705, |
|
"mean_token_accuracy": 0.36079545617103576, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.8297248219025003, |
|
"grad_norm": 3.8063013553619385, |
|
"learning_rate": 8.575255504834827e-06, |
|
"loss": 2.3993, |
|
"mean_token_accuracy": 0.36266233772039413, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.8325185081715324, |
|
"grad_norm": 3.9794483184814453, |
|
"learning_rate": 8.304176453465556e-06, |
|
"loss": 2.3816, |
|
"mean_token_accuracy": 0.3581168860197067, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.8353121944405644, |
|
"grad_norm": 3.6001126766204834, |
|
"learning_rate": 8.037062670489842e-06, |
|
"loss": 2.3807, |
|
"mean_token_accuracy": 0.3580357149243355, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.8381058807095964, |
|
"grad_norm": 3.5399417877197266, |
|
"learning_rate": 7.773939558400101e-06, |
|
"loss": 2.3788, |
|
"mean_token_accuracy": 0.3605519488453865, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.8408995669786283, |
|
"grad_norm": 3.753068208694458, |
|
"learning_rate": 7.51483214017637e-06, |
|
"loss": 2.3517, |
|
"mean_token_accuracy": 0.3618506520986557, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 0.8436932532476603, |
|
"grad_norm": 4.195340156555176, |
|
"learning_rate": 7.259765056906609e-06, |
|
"loss": 2.3818, |
|
"mean_token_accuracy": 0.3603896126151085, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.8464869395166923, |
|
"grad_norm": 4.046124458312988, |
|
"learning_rate": 7.008762565443378e-06, |
|
"loss": 2.3566, |
|
"mean_token_accuracy": 0.3603896081447601, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.8492806257857243, |
|
"grad_norm": 3.6873440742492676, |
|
"learning_rate": 6.76184853609696e-06, |
|
"loss": 2.3539, |
|
"mean_token_accuracy": 0.35795454531908033, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.8520743120547563, |
|
"grad_norm": 3.5705506801605225, |
|
"learning_rate": 6.519046450365346e-06, |
|
"loss": 2.3515, |
|
"mean_token_accuracy": 0.3608766242861748, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.8548679983237882, |
|
"grad_norm": 3.630878448486328, |
|
"learning_rate": 6.280379398701114e-06, |
|
"loss": 2.3596, |
|
"mean_token_accuracy": 0.3606331154704094, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.8576616845928202, |
|
"grad_norm": 3.789283514022827, |
|
"learning_rate": 6.045870078315541e-06, |
|
"loss": 2.3504, |
|
"mean_token_accuracy": 0.36233766227960584, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 0.8604553708618522, |
|
"grad_norm": 3.9113943576812744, |
|
"learning_rate": 5.8155407910201135e-06, |
|
"loss": 2.3738, |
|
"mean_token_accuracy": 0.36241883486509324, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.8632490571308842, |
|
"grad_norm": 3.61296010017395, |
|
"learning_rate": 5.5894134411055955e-06, |
|
"loss": 2.3907, |
|
"mean_token_accuracy": 0.35900974124670026, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.8660427433999162, |
|
"grad_norm": 3.8162384033203125, |
|
"learning_rate": 5.367509533258969e-06, |
|
"loss": 2.3164, |
|
"mean_token_accuracy": 0.3629870146512985, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.8688364296689481, |
|
"grad_norm": 3.9131879806518555, |
|
"learning_rate": 5.149850170518328e-06, |
|
"loss": 2.3605, |
|
"mean_token_accuracy": 0.360146102309227, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 0.8716301159379801, |
|
"grad_norm": 3.934457540512085, |
|
"learning_rate": 4.9364560522659365e-06, |
|
"loss": 2.3576, |
|
"mean_token_accuracy": 0.3599025964736938, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.8744238022070121, |
|
"grad_norm": 3.8578693866729736, |
|
"learning_rate": 4.727347472259813e-06, |
|
"loss": 2.3573, |
|
"mean_token_accuracy": 0.36201298981904984, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 0.8772174884760442, |
|
"grad_norm": 3.8296849727630615, |
|
"learning_rate": 4.522544316703709e-06, |
|
"loss": 2.3431, |
|
"mean_token_accuracy": 0.3612824693322182, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.8800111747450762, |
|
"grad_norm": 3.58001971244812, |
|
"learning_rate": 4.322066062355984e-06, |
|
"loss": 2.3756, |
|
"mean_token_accuracy": 0.3618506506085396, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.8828048610141082, |
|
"grad_norm": 3.9927351474761963, |
|
"learning_rate": 4.125931774677349e-06, |
|
"loss": 2.3874, |
|
"mean_token_accuracy": 0.358766233921051, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.8855985472831401, |
|
"grad_norm": 3.5192744731903076, |
|
"learning_rate": 3.934160106017748e-06, |
|
"loss": 2.3473, |
|
"mean_token_accuracy": 0.36599026024341585, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 0.8883922335521721, |
|
"grad_norm": 3.695925235748291, |
|
"learning_rate": 3.7467692938425057e-06, |
|
"loss": 2.36, |
|
"mean_token_accuracy": 0.3630681812763214, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.8911859198212041, |
|
"grad_norm": 3.908273696899414, |
|
"learning_rate": 3.563777158997977e-06, |
|
"loss": 2.3429, |
|
"mean_token_accuracy": 0.3662337675690651, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 0.8939796060902361, |
|
"grad_norm": 3.800147294998169, |
|
"learning_rate": 3.3852011040167607e-06, |
|
"loss": 2.3469, |
|
"mean_token_accuracy": 0.3615259721875191, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.8967732923592681, |
|
"grad_norm": 3.7554497718811035, |
|
"learning_rate": 3.2110581114627225e-06, |
|
"loss": 2.3629, |
|
"mean_token_accuracy": 0.3570616886019707, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 0.8995669786283, |
|
"grad_norm": 3.9945056438446045, |
|
"learning_rate": 3.041364742315983e-06, |
|
"loss": 2.3897, |
|
"mean_token_accuracy": 0.3580357149243355, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.902360664897332, |
|
"grad_norm": 3.705796003341675, |
|
"learning_rate": 2.8761371343979273e-06, |
|
"loss": 2.3326, |
|
"mean_token_accuracy": 0.3644480526447296, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 0.905154351166364, |
|
"grad_norm": 3.556757688522339, |
|
"learning_rate": 2.7153910008365368e-06, |
|
"loss": 2.3759, |
|
"mean_token_accuracy": 0.3582792192697525, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.907948037435396, |
|
"grad_norm": 3.859874963760376, |
|
"learning_rate": 2.5591416285720424e-06, |
|
"loss": 2.3236, |
|
"mean_token_accuracy": 0.36079545617103576, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.910741723704428, |
|
"grad_norm": 4.084688186645508, |
|
"learning_rate": 2.4074038769031803e-06, |
|
"loss": 2.3757, |
|
"mean_token_accuracy": 0.36030844002962115, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 0.9135354099734599, |
|
"grad_norm": 3.861067295074463, |
|
"learning_rate": 2.2601921760740107e-06, |
|
"loss": 2.3407, |
|
"mean_token_accuracy": 0.3633116871118546, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 0.9163290962424919, |
|
"grad_norm": 4.909350395202637, |
|
"learning_rate": 2.1175205259016563e-06, |
|
"loss": 2.3366, |
|
"mean_token_accuracy": 0.3625811696052551, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 0.9191227825115239, |
|
"grad_norm": 3.506558895111084, |
|
"learning_rate": 1.979402494444915e-06, |
|
"loss": 2.3384, |
|
"mean_token_accuracy": 0.3619318187236786, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 0.921916468780556, |
|
"grad_norm": 3.9836270809173584, |
|
"learning_rate": 1.845851216713912e-06, |
|
"loss": 2.3518, |
|
"mean_token_accuracy": 0.3620129883289337, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.924710155049588, |
|
"grad_norm": 3.8005995750427246, |
|
"learning_rate": 1.7168793934209893e-06, |
|
"loss": 2.3505, |
|
"mean_token_accuracy": 0.3646915599703789, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 0.92750384131862, |
|
"grad_norm": 3.654874801635742, |
|
"learning_rate": 1.5924992897728475e-06, |
|
"loss": 2.3256, |
|
"mean_token_accuracy": 0.36566558480262756, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 0.9302975275876519, |
|
"grad_norm": 3.704705238342285, |
|
"learning_rate": 1.472722734304144e-06, |
|
"loss": 2.305, |
|
"mean_token_accuracy": 0.3631493508815765, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 0.9330912138566839, |
|
"grad_norm": 4.282053470611572, |
|
"learning_rate": 1.3575611177525926e-06, |
|
"loss": 2.3599, |
|
"mean_token_accuracy": 0.35868506878614426, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 0.9358849001257159, |
|
"grad_norm": 4.226362228393555, |
|
"learning_rate": 1.247025391975698e-06, |
|
"loss": 2.3347, |
|
"mean_token_accuracy": 0.36542207896709444, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.9386785863947479, |
|
"grad_norm": 3.9076080322265625, |
|
"learning_rate": 1.1411260689092484e-06, |
|
"loss": 2.3658, |
|
"mean_token_accuracy": 0.35722402632236483, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 0.9414722726637799, |
|
"grad_norm": 4.407858371734619, |
|
"learning_rate": 1.0398732195676331e-06, |
|
"loss": 2.32, |
|
"mean_token_accuracy": 0.3650974050164223, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 0.9442659589328118, |
|
"grad_norm": 3.794746160507202, |
|
"learning_rate": 9.432764730860744e-07, |
|
"loss": 2.3404, |
|
"mean_token_accuracy": 0.3632305204868317, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 0.9470596452018438, |
|
"grad_norm": 3.419100046157837, |
|
"learning_rate": 8.513450158049108e-07, |
|
"loss": 2.2942, |
|
"mean_token_accuracy": 0.3678571432828903, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 0.9498533314708758, |
|
"grad_norm": 3.609236001968384, |
|
"learning_rate": 7.640875903959732e-07, |
|
"loss": 2.3246, |
|
"mean_token_accuracy": 0.36477272808551786, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.9526470177399078, |
|
"grad_norm": 3.939943313598633, |
|
"learning_rate": 6.815124950311557e-07, |
|
"loss": 2.326, |
|
"mean_token_accuracy": 0.36209415793418886, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 0.9554407040089398, |
|
"grad_norm": 4.04728364944458, |
|
"learning_rate": 6.036275825932525e-07, |
|
"loss": 2.3391, |
|
"mean_token_accuracy": 0.3649350643157959, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 0.9582343902779717, |
|
"grad_norm": 4.012794494628906, |
|
"learning_rate": 5.304402599291824e-07, |
|
"loss": 2.3555, |
|
"mean_token_accuracy": 0.3619318202137947, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 0.9610280765470037, |
|
"grad_norm": 3.5218493938446045, |
|
"learning_rate": 4.61957487145559e-07, |
|
"loss": 2.3352, |
|
"mean_token_accuracy": 0.36607143133878706, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 0.9638217628160357, |
|
"grad_norm": 3.5637614727020264, |
|
"learning_rate": 3.981857769468023e-07, |
|
"loss": 2.3506, |
|
"mean_token_accuracy": 0.363879868388176, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.9666154490850678, |
|
"grad_norm": 4.030181407928467, |
|
"learning_rate": 3.391311940157904e-07, |
|
"loss": 2.3287, |
|
"mean_token_accuracy": 0.36655843555927276, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 0.9694091353540998, |
|
"grad_norm": 3.6736109256744385, |
|
"learning_rate": 2.8479935443708197e-07, |
|
"loss": 2.3548, |
|
"mean_token_accuracy": 0.36728896349668505, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 0.9722028216231318, |
|
"grad_norm": 3.6947691440582275, |
|
"learning_rate": 2.3519542516285965e-07, |
|
"loss": 2.3318, |
|
"mean_token_accuracy": 0.3616071417927742, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 0.9749965078921637, |
|
"grad_norm": 4.099936008453369, |
|
"learning_rate": 1.9032412352153473e-07, |
|
"loss": 2.3518, |
|
"mean_token_accuracy": 0.3631493508815765, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 0.9777901941611957, |
|
"grad_norm": 3.852231502532959, |
|
"learning_rate": 1.501897167691224e-07, |
|
"loss": 2.3392, |
|
"mean_token_accuracy": 0.3645292207598686, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.9805838804302277, |
|
"grad_norm": 3.9510109424591064, |
|
"learning_rate": 1.1479602168344983e-07, |
|
"loss": 2.3554, |
|
"mean_token_accuracy": 0.3650162324309349, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 0.9833775666992597, |
|
"grad_norm": 3.799307107925415, |
|
"learning_rate": 8.414640420116305e-08, |
|
"loss": 2.3089, |
|
"mean_token_accuracy": 0.362662336230278, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 0.9861712529682917, |
|
"grad_norm": 3.7856054306030273, |
|
"learning_rate": 5.824377909763312e-08, |
|
"loss": 2.3135, |
|
"mean_token_accuracy": 0.3685876622796059, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 0.9889649392373236, |
|
"grad_norm": 3.702875852584839, |
|
"learning_rate": 3.709060970975564e-08, |
|
"loss": 2.3606, |
|
"mean_token_accuracy": 0.36176948100328443, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 0.9917586255063556, |
|
"grad_norm": 3.9700822830200195, |
|
"learning_rate": 2.068890770169363e-08, |
|
"loss": 2.3494, |
|
"mean_token_accuracy": 0.3568993493914604, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.9945523117753876, |
|
"grad_norm": 3.895416736602783, |
|
"learning_rate": 9.040232873569477e-09, |
|
"loss": 2.3314, |
|
"mean_token_accuracy": 0.3663149356842041, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 0.9973459980444196, |
|
"grad_norm": 3.7410225868225098, |
|
"learning_rate": 2.145693013116956e-09, |
|
"loss": 2.3568, |
|
"mean_token_accuracy": 0.3575487032532692, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"mean_token_accuracy": 0.36799384575141103, |
|
"step": 7159, |
|
"total_flos": 1.6530020885331968e+18, |
|
"train_loss": 2.734565882330451, |
|
"train_runtime": 21227.811, |
|
"train_samples_per_second": 18.884, |
|
"train_steps_per_second": 0.337 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 7159, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1790, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.6530020885331968e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|