diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,5868 +10,5868 @@ "log_history": [ { "epoch": 0.0013655372535631989, - "grad_norm": 2.856154910104226, + "grad_norm": 12.694538866354412, "learning_rate": 2.702702702702703e-07, - "loss": 1.4536, - "mean_token_accuracy": 0.6624585683889566, + "loss": 1.349, + "mean_token_accuracy": 0.6731206055166955, "step": 1 }, { "epoch": 0.0027310745071263977, - "grad_norm": 2.707013388829395, + "grad_norm": 14.28682858099926, "learning_rate": 5.405405405405406e-07, - "loss": 1.4622, - "mean_token_accuracy": 0.6589214926901064, + "loss": 1.3695, + "mean_token_accuracy": 0.6685968866221867, "step": 2 }, { "epoch": 0.004096611760689596, - "grad_norm": 2.44177000356028, + "grad_norm": 12.153556394325475, "learning_rate": 8.108108108108109e-07, - "loss": 1.3364, - "mean_token_accuracy": 0.6842013947429949, + "loss": 1.2534, + "mean_token_accuracy": 0.6932581184973134, "step": 3 }, { "epoch": 0.0054621490142527955, - "grad_norm": 2.3033057563617523, + "grad_norm": 10.014906326374183, "learning_rate": 1.0810810810810812e-06, - "loss": 1.2614, - "mean_token_accuracy": 0.6964783027101024, + "loss": 1.1814, + "mean_token_accuracy": 0.7059116849514391, "step": 4 }, { "epoch": 0.006827686267815994, - "grad_norm": 2.7666430678045035, + "grad_norm": 13.154435035689092, "learning_rate": 1.3513513513513515e-06, - "loss": 1.4063, - "mean_token_accuracy": 0.670296182106088, + "loss": 1.3152, + "mean_token_accuracy": 0.6801386295402803, "step": 5 }, { "epoch": 0.008193223521379193, - "grad_norm": 2.7586755232765463, + "grad_norm": 12.656442197272979, "learning_rate": 1.6216216216216219e-06, - "loss": 1.4497, - "mean_token_accuracy": 0.6626343423613201, + "loss": 1.3519, + "mean_token_accuracy": 0.6725742496834655, "step": 6 }, { "epoch": 0.009558760774942391, - "grad_norm": 2.6639463230370986, + "grad_norm": 9.154457220968185, "learning_rate": 1.8918918918918922e-06, - "loss": 1.4282, - "mean_token_accuracy": 0.6662120309890409, + "loss": 1.3201, + "mean_token_accuracy": 0.6776771426562855, "step": 7 }, { "epoch": 0.010924298028505591, - "grad_norm": 2.76664125205853, + "grad_norm": 12.926796361133281, "learning_rate": 2.1621621621621623e-06, - "loss": 1.4878, - "mean_token_accuracy": 0.6610442223856668, + "loss": 1.3668, + "mean_token_accuracy": 0.6739682822033727, "step": 8 }, { "epoch": 0.01228983528206879, - "grad_norm": 2.8340529756484196, + "grad_norm": 13.702550349182166, "learning_rate": 2.432432432432433e-06, - "loss": 1.4588, - "mean_token_accuracy": 0.6639858735565699, + "loss": 1.3398, + "mean_token_accuracy": 0.675676714000192, "step": 9 }, { "epoch": 0.013655372535631987, - "grad_norm": 2.4214142338634335, + "grad_norm": 12.100638571387348, "learning_rate": 2.702702702702703e-06, - "loss": 1.3559, - "mean_token_accuracy": 0.6743468693146191, + "loss": 1.237, + "mean_token_accuracy": 0.6897704789682599, "step": 10 }, { "epoch": 0.015020909789195187, - "grad_norm": 2.443359659623504, + "grad_norm": 9.02772882068927, "learning_rate": 2.9729729729729736e-06, - "loss": 1.3804, - "mean_token_accuracy": 0.6795881338234414, + "loss": 1.237, + "mean_token_accuracy": 0.6967972223171133, "step": 11 }, { "epoch": 0.016386447042758386, - "grad_norm": 2.459985558416387, + "grad_norm": 7.751826273673437, "learning_rate": 3.2432432432432437e-06, - "loss": 1.3956, - "mean_token_accuracy": 0.66928461420722, + "loss": 1.2499, + "mean_token_accuracy": 0.6873925007889158, "step": 12 }, { "epoch": 0.017751984296321584, - "grad_norm": 2.00474402793384, + "grad_norm": 14.605567306288991, "learning_rate": 3.513513513513514e-06, - "loss": 1.3353, - "mean_token_accuracy": 0.6792316835523171, + "loss": 1.1656, + "mean_token_accuracy": 0.702033566686862, "step": 13 }, { "epoch": 0.019117521549884782, - "grad_norm": 1.7911340094889505, + "grad_norm": 16.69580523349115, "learning_rate": 3.7837837837837844e-06, - "loss": 1.2659, - "mean_token_accuracy": 0.6939364426914091, + "loss": 1.1057, + "mean_token_accuracy": 0.7166825564328566, "step": 14 }, { "epoch": 0.02048305880344798, - "grad_norm": 2.024996387503837, + "grad_norm": 4.277348708917889, "learning_rate": 4.0540540540540545e-06, - "loss": 1.3317, - "mean_token_accuracy": 0.6801057906150351, + "loss": 1.1453, + "mean_token_accuracy": 0.7064730765093167, "step": 15 }, { "epoch": 0.021848596057011182, - "grad_norm": 1.5939241622410736, + "grad_norm": 5.501675262398263, "learning_rate": 4.324324324324325e-06, - "loss": 1.2312, - "mean_token_accuracy": 0.7016105211892338, + "loss": 1.0609, + "mean_token_accuracy": 0.7263837797527011, "step": 16 }, { "epoch": 0.02321413331057438, - "grad_norm": 1.7478459848540069, + "grad_norm": 4.594618515621041, "learning_rate": 4.594594594594596e-06, - "loss": 1.36, - "mean_token_accuracy": 0.66686491420135, + "loss": 1.1575, + "mean_token_accuracy": 0.6986264355169661, "step": 17 }, { "epoch": 0.02457967056413758, - "grad_norm": 1.4704389244200098, + "grad_norm": 4.370254196578077, "learning_rate": 4.864864864864866e-06, - "loss": 1.1762, - "mean_token_accuracy": 0.7032436227848233, + "loss": 0.9763, + "mean_token_accuracy": 0.7371238560229174, "step": 18 }, { "epoch": 0.025945207817700777, - "grad_norm": 1.5815846798102828, + "grad_norm": 12.420426104801459, "learning_rate": 5.135135135135135e-06, - "loss": 1.2157, - "mean_token_accuracy": 0.6963411457707981, + "loss": 0.9947, + "mean_token_accuracy": 0.7336301337158547, "step": 19 }, { "epoch": 0.027310745071263975, - "grad_norm": 1.5915699711213915, + "grad_norm": 7.306932902857305, "learning_rate": 5.405405405405406e-06, - "loss": 1.246, - "mean_token_accuracy": 0.6902431103677444, + "loss": 1.0089, + "mean_token_accuracy": 0.7298717119207753, "step": 20 }, { "epoch": 0.028676282324827173, - "grad_norm": 1.5211456916433848, + "grad_norm": 3.792544000354483, "learning_rate": 5.675675675675676e-06, - "loss": 1.2473, - "mean_token_accuracy": 0.6959266825224263, + "loss": 0.9992, + "mean_token_accuracy": 0.7355598065427686, "step": 21 }, { "epoch": 0.030041819578390375, - "grad_norm": 1.3284099737254742, + "grad_norm": 5.599819351248164, "learning_rate": 5.945945945945947e-06, - "loss": 1.1876, - "mean_token_accuracy": 0.7030144363437105, + "loss": 0.938, + "mean_token_accuracy": 0.7446242908841856, "step": 22 }, { "epoch": 0.03140735683195357, - "grad_norm": 0.9949825140464728, + "grad_norm": 3.2303878160644106, "learning_rate": 6.2162162162162164e-06, - "loss": 1.0487, - "mean_token_accuracy": 0.7259135548105291, + "loss": 0.842, + "mean_token_accuracy": 0.7625513938625904, "step": 23 }, { "epoch": 0.03277289408551677, - "grad_norm": 1.108114596804111, + "grad_norm": 7.560111146519516, "learning_rate": 6.486486486486487e-06, - "loss": 1.0896, - "mean_token_accuracy": 0.7185542185799237, + "loss": 0.8541, + "mean_token_accuracy": 0.7603489427240392, "step": 24 }, { "epoch": 0.03413843133907997, - "grad_norm": 1.0865557128605288, + "grad_norm": 8.994000664594445, "learning_rate": 6.7567567567567575e-06, - "loss": 1.0566, - "mean_token_accuracy": 0.7235604714716861, + "loss": 0.8033, + "mean_token_accuracy": 0.7710385708849239, "step": 25 }, { "epoch": 0.03550396859264317, - "grad_norm": 0.910436820637811, + "grad_norm": 11.174112139759627, "learning_rate": 7.027027027027028e-06, - "loss": 1.0481, - "mean_token_accuracy": 0.7235809194087987, + "loss": 0.7972, + "mean_token_accuracy": 0.7709743153956785, "step": 26 }, { "epoch": 0.036869505846206366, - "grad_norm": 0.9169251933078288, + "grad_norm": 4.4474620188546465, "learning_rate": 7.297297297297298e-06, - "loss": 1.0417, - "mean_token_accuracy": 0.7250386308438928, + "loss": 0.7779, + "mean_token_accuracy": 0.7762167832548105, "step": 27 }, { "epoch": 0.038235043099769564, - "grad_norm": 0.7943883684133052, + "grad_norm": 6.420798005839511, "learning_rate": 7.567567567567569e-06, - "loss": 0.9889, - "mean_token_accuracy": 0.7381367949391253, + "loss": 0.7364, + "mean_token_accuracy": 0.7872430296163124, "step": 28 }, { "epoch": 0.03960058035333276, - "grad_norm": 0.7617948165998126, + "grad_norm": 3.505076403243893, "learning_rate": 7.837837837837838e-06, - "loss": 1.0032, - "mean_token_accuracy": 0.7322491972494538, + "loss": 0.7366, + "mean_token_accuracy": 0.7846892672832243, "step": 29 }, { "epoch": 0.04096611760689596, - "grad_norm": 0.7223752695504578, + "grad_norm": 4.257419834855069, "learning_rate": 8.108108108108109e-06, - "loss": 0.9848, - "mean_token_accuracy": 0.7305062130246437, + "loss": 0.7276, + "mean_token_accuracy": 0.7832414543493696, "step": 30 }, { "epoch": 0.04233165486045916, - "grad_norm": 0.7313790683290118, + "grad_norm": 1.8596248429959361, "learning_rate": 8.378378378378378e-06, - "loss": 0.9516, - "mean_token_accuracy": 0.7426278213329017, + "loss": 0.6917, + "mean_token_accuracy": 0.7949227280414939, "step": 31 }, { "epoch": 0.043697192114022364, - "grad_norm": 0.7975105916078674, + "grad_norm": 5.998640085410845, "learning_rate": 8.64864864864865e-06, - "loss": 1.0129, - "mean_token_accuracy": 0.7272589576535597, + "loss": 0.74, + "mean_token_accuracy": 0.7826185949965834, "step": 32 }, { "epoch": 0.04506272936758556, - "grad_norm": 0.8017681055296577, + "grad_norm": 1.6136645601879687, "learning_rate": 8.91891891891892e-06, - "loss": 0.9371, - "mean_token_accuracy": 0.7443093919782506, + "loss": 0.682, + "mean_token_accuracy": 0.7971229334178439, "step": 33 }, { "epoch": 0.04642826662114876, - "grad_norm": 0.8071760199041964, + "grad_norm": 4.813218130166523, "learning_rate": 9.189189189189191e-06, - "loss": 0.937, - "mean_token_accuracy": 0.7426084467059472, + "loss": 0.6818, + "mean_token_accuracy": 0.7956781396448597, "step": 34 }, { "epoch": 0.04779380387471196, - "grad_norm": 0.5490419389795846, + "grad_norm": 1.6648957199273453, "learning_rate": 9.45945945945946e-06, - "loss": 0.903, - "mean_token_accuracy": 0.7513449536470811, + "loss": 0.6536, + "mean_token_accuracy": 0.8039919273243677, "step": 35 }, { "epoch": 0.04915934112827516, - "grad_norm": 0.5331990089903488, + "grad_norm": 2.727767929908723, "learning_rate": 9.729729729729732e-06, - "loss": 0.8246, - "mean_token_accuracy": 0.7676532142810768, + "loss": 0.5991, + "mean_token_accuracy": 0.8157655755998052, "step": 36 }, { "epoch": 0.050524878381838355, - "grad_norm": 0.5068449057689566, + "grad_norm": 3.307683813511232, "learning_rate": 1e-05, - "loss": 0.8333, - "mean_token_accuracy": 0.7622662873836845, + "loss": 0.6039, + "mean_token_accuracy": 0.8131568470367118, "step": 37 }, { "epoch": 0.05189041563540155, - "grad_norm": 0.574202617800939, + "grad_norm": 1.6551722305811238, "learning_rate": 1.027027027027027e-05, - "loss": 0.8298, - "mean_token_accuracy": 0.7643824666512319, + "loss": 0.6005, + "mean_token_accuracy": 0.8140848694651097, "step": 38 }, { "epoch": 0.05325595288896475, - "grad_norm": 0.5093346199893828, + "grad_norm": 1.447375367061217, "learning_rate": 1.0540540540540541e-05, - "loss": 0.8303, - "mean_token_accuracy": 0.7648299467316163, + "loss": 0.6002, + "mean_token_accuracy": 0.8150523180034766, "step": 39 }, { "epoch": 0.05462149014252795, - "grad_norm": 0.46374414440698, + "grad_norm": 1.072024342448014, "learning_rate": 1.0810810810810812e-05, - "loss": 0.8218, - "mean_token_accuracy": 0.7679458722970103, + "loss": 0.6008, + "mean_token_accuracy": 0.8159713680250943, "step": 40 }, { "epoch": 0.05598702739609115, - "grad_norm": 0.5229350750484023, + "grad_norm": 0.7603023981378255, "learning_rate": 1.1081081081081081e-05, - "loss": 0.8076, - "mean_token_accuracy": 0.769910573544534, + "loss": 0.5885, + "mean_token_accuracy": 0.8169645259893636, "step": 41 }, { "epoch": 0.057352564649654346, - "grad_norm": 0.4007376449556555, + "grad_norm": 0.9581646192295539, "learning_rate": 1.1351351351351352e-05, - "loss": 0.771, - "mean_token_accuracy": 0.7779757255470396, + "loss": 0.561, + "mean_token_accuracy": 0.8244735432096343, "step": 42 }, { "epoch": 0.058718101903217544, - "grad_norm": 0.44538694380473465, + "grad_norm": 1.2036888425863506, "learning_rate": 1.1621621621621622e-05, - "loss": 0.8219, - "mean_token_accuracy": 0.7692275525781592, + "loss": 0.5912, + "mean_token_accuracy": 0.8190549345072387, "step": 43 }, { "epoch": 0.06008363915678075, - "grad_norm": 0.3600165212561372, + "grad_norm": 0.7718687528534887, "learning_rate": 1.1891891891891894e-05, - "loss": 0.7898, - "mean_token_accuracy": 0.7730238501942123, + "loss": 0.5749, + "mean_token_accuracy": 0.819959293524251, "step": 44 }, { "epoch": 0.06144917641034395, - "grad_norm": 0.37878200887180796, + "grad_norm": 1.682073620529749, "learning_rate": 1.2162162162162164e-05, - "loss": 0.7724, - "mean_token_accuracy": 0.7767401804426769, + "loss": 0.5584, + "mean_token_accuracy": 0.825639299275228, "step": 45 }, { "epoch": 0.06281471366390715, - "grad_norm": 0.38864206724912786, + "grad_norm": 0.7001166412928403, "learning_rate": 1.2432432432432433e-05, - "loss": 0.7702, - "mean_token_accuracy": 0.7775697265979846, + "loss": 0.5581, + "mean_token_accuracy": 0.8246738540448362, "step": 46 }, { "epoch": 0.06418025091747034, - "grad_norm": 0.32403711695858023, + "grad_norm": 1.0537050693773666, "learning_rate": 1.2702702702702702e-05, - "loss": 0.7614, - "mean_token_accuracy": 0.7803829810125956, + "loss": 0.5551, + "mean_token_accuracy": 0.8259298934704246, "step": 47 }, { "epoch": 0.06554578817103354, - "grad_norm": 0.32376564934320085, + "grad_norm": 0.7920322967117208, "learning_rate": 1.2972972972972975e-05, - "loss": 0.7504, - "mean_token_accuracy": 0.7826889733039633, + "loss": 0.5507, + "mean_token_accuracy": 0.8265237150428488, "step": 48 }, { "epoch": 0.06691132542459674, - "grad_norm": 0.29401307876470584, + "grad_norm": 0.562181333813763, "learning_rate": 1.3243243243243244e-05, - "loss": 0.7776, - "mean_token_accuracy": 0.7736975481661026, + "loss": 0.5688, + "mean_token_accuracy": 0.8204618404616847, "step": 49 }, { "epoch": 0.06827686267815994, - "grad_norm": 0.26908817652544503, + "grad_norm": 0.5763859640174397, "learning_rate": 1.3513513513513515e-05, - "loss": 0.6951, - "mean_token_accuracy": 0.7953305833923959, + "loss": 0.5127, + "mean_token_accuracy": 0.8361940201766735, "step": 50 }, { "epoch": 0.06964239993172314, - "grad_norm": 0.28512395763856796, + "grad_norm": 0.8441858338744881, "learning_rate": 1.3783783783783784e-05, - "loss": 0.7715, - "mean_token_accuracy": 0.7752406193075859, + "loss": 0.5605, + "mean_token_accuracy": 0.8224741152107983, "step": 51 }, { "epoch": 0.07100793718528634, - "grad_norm": 0.23630648057108264, + "grad_norm": 0.623060068151014, "learning_rate": 1.4054054054054055e-05, - "loss": 0.695, - "mean_token_accuracy": 0.7961802597980893, + "loss": 0.5154, + "mean_token_accuracy": 0.8358351656376076, "step": 52 }, { "epoch": 0.07237347443884953, - "grad_norm": 0.26505996661418807, + "grad_norm": 0.487364049768217, "learning_rate": 1.4324324324324326e-05, - "loss": 0.7531, - "mean_token_accuracy": 0.779132850745663, + "loss": 0.5508, + "mean_token_accuracy": 0.824997923142241, "step": 53 }, { "epoch": 0.07373901169241273, - "grad_norm": 0.2433115552284422, + "grad_norm": 0.4710640541449941, "learning_rate": 1.4594594594594596e-05, - "loss": 0.6949, - "mean_token_accuracy": 0.7944784282290792, + "loss": 0.5121, + "mean_token_accuracy": 0.8361596705359315, "step": 54 }, { "epoch": 0.07510454894597593, - "grad_norm": 0.2811280383821415, + "grad_norm": 0.5307868639543387, "learning_rate": 1.4864864864864865e-05, - "loss": 0.7442, - "mean_token_accuracy": 0.7821906110474989, + "loss": 0.5388, + "mean_token_accuracy": 0.8287927367224052, "step": 55 }, { "epoch": 0.07647008619953913, - "grad_norm": 0.23579260376996747, + "grad_norm": 0.502790636865642, "learning_rate": 1.5135135135135138e-05, - "loss": 0.6781, - "mean_token_accuracy": 0.7993101791273419, + "loss": 0.5002, + "mean_token_accuracy": 0.8393840192332627, "step": 56 }, { "epoch": 0.07783562345310233, - "grad_norm": 0.2319227137133674, + "grad_norm": 0.43346690723829867, "learning_rate": 1.540540540540541e-05, - "loss": 0.6995, - "mean_token_accuracy": 0.7937046213203979, + "loss": 0.5125, + "mean_token_accuracy": 0.8363432033977718, "step": 57 }, { "epoch": 0.07920116070666552, - "grad_norm": 0.20822407667697684, + "grad_norm": 0.4993605510803232, "learning_rate": 1.5675675675675676e-05, - "loss": 0.7022, - "mean_token_accuracy": 0.7906265379824102, + "loss": 0.5188, + "mean_token_accuracy": 0.8324141327632871, "step": 58 }, { "epoch": 0.08056669796022872, - "grad_norm": 0.2047416500776899, + "grad_norm": 0.4277849657264807, "learning_rate": 1.5945945945945947e-05, - "loss": 0.6838, - "mean_token_accuracy": 0.797081515195906, + "loss": 0.5099, + "mean_token_accuracy": 0.8360334578638953, "step": 59 }, { "epoch": 0.08193223521379192, - "grad_norm": 0.21768853923992373, + "grad_norm": 0.4591464651505517, "learning_rate": 1.6216216216216218e-05, - "loss": 0.6821, - "mean_token_accuracy": 0.7980219006962407, + "loss": 0.5046, + "mean_token_accuracy": 0.8377397858533894, "step": 60 }, { "epoch": 0.08329777246735512, - "grad_norm": 0.2003717030080966, + "grad_norm": 0.4510027457654597, "learning_rate": 1.648648648648649e-05, - "loss": 0.6842, - "mean_token_accuracy": 0.7951559513557029, + "loss": 0.511, + "mean_token_accuracy": 0.8336544170905339, "step": 61 }, { "epoch": 0.08466330972091832, - "grad_norm": 0.21025621907907927, + "grad_norm": 0.4050262343002075, "learning_rate": 1.6756756756756757e-05, - "loss": 0.684, - "mean_token_accuracy": 0.7955708166268379, + "loss": 0.5051, + "mean_token_accuracy": 0.8365473990294894, "step": 62 }, { "epoch": 0.08602884697448153, - "grad_norm": 0.19723386922516864, + "grad_norm": 0.44615719951666044, "learning_rate": 1.7027027027027028e-05, - "loss": 0.7091, - "mean_token_accuracy": 0.787690704351035, + "loss": 0.5265, + "mean_token_accuracy": 0.8290703723078953, "step": 63 }, { "epoch": 0.08739438422804473, - "grad_norm": 0.19333734400453562, + "grad_norm": 0.38700566099417927, "learning_rate": 1.72972972972973e-05, - "loss": 0.6578, - "mean_token_accuracy": 0.8031256690142385, + "loss": 0.4888, + "mean_token_accuracy": 0.8415804018657317, "step": 64 }, { "epoch": 0.08875992148160793, - "grad_norm": 0.19256222700878659, + "grad_norm": 0.45966773746839323, "learning_rate": 1.756756756756757e-05, - "loss": 0.6761, - "mean_token_accuracy": 0.7980687923008081, + "loss": 0.5034, + "mean_token_accuracy": 0.8367421080881654, "step": 65 }, { "epoch": 0.09012545873517112, - "grad_norm": 0.18944056982525392, + "grad_norm": 0.3462174672127141, "learning_rate": 1.783783783783784e-05, - "loss": 0.6727, - "mean_token_accuracy": 0.7994536661987552, + "loss": 0.5056, + "mean_token_accuracy": 0.8362969110791204, "step": 66 }, { "epoch": 0.09149099598873432, - "grad_norm": 0.18609048021724375, + "grad_norm": 0.5378368089497744, "learning_rate": 1.8108108108108108e-05, - "loss": 0.6478, - "mean_token_accuracy": 0.8046742500762242, + "loss": 0.4845, + "mean_token_accuracy": 0.841530221609978, "step": 67 }, { "epoch": 0.09285653324229752, - "grad_norm": 0.1959251357860619, + "grad_norm": 0.3927908242857095, "learning_rate": 1.8378378378378383e-05, - "loss": 0.673, - "mean_token_accuracy": 0.7974035655329622, + "loss": 0.5017, + "mean_token_accuracy": 0.8354575522429637, "step": 68 }, { "epoch": 0.09422207049586072, - "grad_norm": 0.18628532950522894, + "grad_norm": 0.48195021375325325, "learning_rate": 1.864864864864865e-05, - "loss": 0.6351, - "mean_token_accuracy": 0.8094816765204671, + "loss": 0.4794, + "mean_token_accuracy": 0.8440971672690992, "step": 69 }, { "epoch": 0.09558760774942392, - "grad_norm": 0.1837976933705762, + "grad_norm": 0.42743054642794015, "learning_rate": 1.891891891891892e-05, - "loss": 0.6702, - "mean_token_accuracy": 0.7990883017280949, + "loss": 0.5054, + "mean_token_accuracy": 0.8351946171032623, "step": 70 }, { "epoch": 0.09695314500298712, - "grad_norm": 0.17895628057577329, + "grad_norm": 0.5073725399698784, "learning_rate": 1.918918918918919e-05, - "loss": 0.6489, - "mean_token_accuracy": 0.8059071259574607, + "loss": 0.4872, + "mean_token_accuracy": 0.8416211405529895, "step": 71 }, { "epoch": 0.09831868225655031, - "grad_norm": 0.17700200744568592, + "grad_norm": 0.32802476428146243, "learning_rate": 1.9459459459459463e-05, - "loss": 0.6481, - "mean_token_accuracy": 0.804891784442944, + "loss": 0.4873, + "mean_token_accuracy": 0.8410772826528182, "step": 72 }, { "epoch": 0.09968421951011351, - "grad_norm": 0.17567627292731625, + "grad_norm": 0.44056399284971365, "learning_rate": 1.972972972972973e-05, - "loss": 0.6217, - "mean_token_accuracy": 0.8105455738036946, + "loss": 0.4698, + "mean_token_accuracy": 0.8444900562879512, "step": 73 }, { "epoch": 0.10104975676367671, - "grad_norm": 0.17651285343354287, + "grad_norm": 0.46074182462916863, "learning_rate": 2e-05, - "loss": 0.6678, - "mean_token_accuracy": 0.799923821985901, + "loss": 0.5052, + "mean_token_accuracy": 0.8356767677438455, "step": 74 }, { "epoch": 0.10241529401723991, - "grad_norm": 0.1672581584178598, + "grad_norm": 0.43060352923187173, "learning_rate": 1.999988602302209e-05, - "loss": 0.6655, - "mean_token_accuracy": 0.7997509737152273, + "loss": 0.5034, + "mean_token_accuracy": 0.8357036382011872, "step": 75 }, { "epoch": 0.1037808312708031, - "grad_norm": 0.17574473556990924, + "grad_norm": 0.5392177840776831, "learning_rate": 1.9999544094686517e-05, - "loss": 0.6141, - "mean_token_accuracy": 0.8130920350717817, + "loss": 0.464, + "mean_token_accuracy": 0.8469077059106851, "step": 76 }, { "epoch": 0.1051463685243663, - "grad_norm": 0.1834036113409049, + "grad_norm": 0.4383769298059935, "learning_rate": 1.999897422278767e-05, - "loss": 0.6299, - "mean_token_accuracy": 0.8092562425645671, + "loss": 0.475, + "mean_token_accuracy": 0.8436229064865175, "step": 77 }, { "epoch": 0.1065119057779295, - "grad_norm": 0.17425310417653078, + "grad_norm": 0.4904266229445523, "learning_rate": 1.9998176420316002e-05, - "loss": 0.639, - "mean_token_accuracy": 0.8067353506748101, + "loss": 0.4853, + "mean_token_accuracy": 0.8408816529254115, "step": 78 }, { "epoch": 0.1078774430314927, - "grad_norm": 0.16111836275221228, + "grad_norm": 0.36607390424605935, "learning_rate": 1.9997150705457738e-05, - "loss": 0.5967, - "mean_token_accuracy": 0.8160743517318982, + "loss": 0.4531, + "mean_token_accuracy": 0.8487558375701372, "step": 79 }, { "epoch": 0.1092429802850559, - "grad_norm": 0.15885220040043938, + "grad_norm": 0.4551168695964319, "learning_rate": 1.9995897101594454e-05, - "loss": 0.6391, - "mean_token_accuracy": 0.8059398761600909, + "loss": 0.4876, + "mean_token_accuracy": 0.8396327622489336, "step": 80 }, { "epoch": 0.1106085175386191, - "grad_norm": 0.1675631906023176, + "grad_norm": 0.42442283856925495, "learning_rate": 1.9994415637302545e-05, - "loss": 0.6117, - "mean_token_accuracy": 0.8146563571963998, + "loss": 0.4633, + "mean_token_accuracy": 0.8483948181491869, "step": 81 }, { "epoch": 0.1119740547921823, - "grad_norm": 0.1575439006084883, + "grad_norm": 0.6393625908948564, "learning_rate": 1.999270634635258e-05, - "loss": 0.6096, - "mean_token_accuracy": 0.8146036099045119, + "loss": 0.4606, + "mean_token_accuracy": 0.8486694050596694, "step": 82 }, { "epoch": 0.1133395920457455, - "grad_norm": 0.16629899798581602, + "grad_norm": 0.3938759824609981, "learning_rate": 1.9990769267708517e-05, - "loss": 0.6195, - "mean_token_accuracy": 0.8095868402132419, + "loss": 0.4703, + "mean_token_accuracy": 0.843921182173024, "step": 83 }, { "epoch": 0.11470512929930869, - "grad_norm": 0.16097940260303528, + "grad_norm": 0.46323080288240137, "learning_rate": 1.998860444552683e-05, - "loss": 0.5991, - "mean_token_accuracy": 0.8159027390778586, + "loss": 0.4579, + "mean_token_accuracy": 0.8487062802081777, "step": 84 }, { "epoch": 0.11607066655287189, - "grad_norm": 0.16505041851549865, + "grad_norm": 0.4100887075069859, "learning_rate": 1.99862119291555e-05, - "loss": 0.6237, - "mean_token_accuracy": 0.8095551569279258, + "loss": 0.4753, + "mean_token_accuracy": 0.8426625139694884, "step": 85 }, { "epoch": 0.11743620380643509, - "grad_norm": 0.15322072145398669, + "grad_norm": 0.460266156523643, "learning_rate": 1.9983591773132885e-05, - "loss": 0.6071, - "mean_token_accuracy": 0.815518238339738, + "loss": 0.4634, + "mean_token_accuracy": 0.8479961635199413, "step": 86 }, { "epoch": 0.11880174105999829, - "grad_norm": 0.1665259137560319, + "grad_norm": 0.4093999187610122, "learning_rate": 1.998074403718647e-05, - "loss": 0.6285, - "mean_token_accuracy": 0.8085059805566861, + "loss": 0.481, + "mean_token_accuracy": 0.8413903297828744, "step": 87 }, { "epoch": 0.1201672783135615, - "grad_norm": 0.15703135897883125, + "grad_norm": 0.4529979351460269, "learning_rate": 1.9977668786231536e-05, - "loss": 0.607, - "mean_token_accuracy": 0.8120820297792923, + "loss": 0.4646, + "mean_token_accuracy": 0.8439730016702047, "step": 88 }, { "epoch": 0.1215328155671247, - "grad_norm": 0.16325786752619545, + "grad_norm": 0.31940153601185445, "learning_rate": 1.997436609036963e-05, - "loss": 0.644, - "mean_token_accuracy": 0.8051820396394342, + "loss": 0.4929, + "mean_token_accuracy": 0.8389086531135729, "step": 89 }, { "epoch": 0.1228983528206879, - "grad_norm": 0.15428441651864241, + "grad_norm": 0.36879956082602044, "learning_rate": 1.997083602488702e-05, - "loss": 0.62, - "mean_token_accuracy": 0.8114857874000874, + "loss": 0.4753, + "mean_token_accuracy": 0.8442029592930739, "step": 90 }, { "epoch": 0.1242638900742511, - "grad_norm": 0.15767115306624271, + "grad_norm": 0.31625144322456566, "learning_rate": 1.9967078670252947e-05, - "loss": 0.6245, - "mean_token_accuracy": 0.8091220945133887, + "loss": 0.4809, + "mean_token_accuracy": 0.8414532739667411, "step": 91 }, { "epoch": 0.1256294273278143, - "grad_norm": 0.16404543113011352, + "grad_norm": 0.37572795515302265, "learning_rate": 1.9963094112117786e-05, - "loss": 0.6171, - "mean_token_accuracy": 0.8146379846595098, + "loss": 0.4731, + "mean_token_accuracy": 0.8458833864797164, "step": 92 }, { "epoch": 0.1269949645813775, - "grad_norm": 0.16483171877910405, + "grad_norm": 0.32729110107644405, "learning_rate": 1.995888244131113e-05, - "loss": 0.616, - "mean_token_accuracy": 0.812057344415893, + "loss": 0.4729, + "mean_token_accuracy": 0.844289710669155, "step": 93 }, { "epoch": 0.1283605018349407, - "grad_norm": 0.16391347638541964, + "grad_norm": 0.4508003392275939, "learning_rate": 1.9954443753839666e-05, - "loss": 0.6289, - "mean_token_accuracy": 0.8083010585611569, + "loss": 0.4844, + "mean_token_accuracy": 0.8410096036799376, "step": 94 }, { "epoch": 0.12972603908850389, - "grad_norm": 0.15332112613311347, + "grad_norm": 0.34367729445854445, "learning_rate": 1.994977815088504e-05, - "loss": 0.5876, - "mean_token_accuracy": 0.8198058278811403, + "loss": 0.4536, + "mean_token_accuracy": 0.8496317958214457, "step": 95 }, { "epoch": 0.13109157634206708, - "grad_norm": 0.15156676060366808, + "grad_norm": 0.39368016078712464, "learning_rate": 1.994488573880152e-05, - "loss": 0.5754, - "mean_token_accuracy": 0.8226112506461656, + "loss": 0.4427, + "mean_token_accuracy": 0.8527215734701595, "step": 96 }, { "epoch": 0.13245711359563028, - "grad_norm": 0.15936267038620247, + "grad_norm": 0.39400450065152637, "learning_rate": 1.9939766629113568e-05, - "loss": 0.6108, - "mean_token_accuracy": 0.8123370446513469, + "loss": 0.4718, + "mean_token_accuracy": 0.8439487535684244, "step": 97 }, { "epoch": 0.13382265084919348, - "grad_norm": 0.14899732682502737, + "grad_norm": 0.3644413048898344, "learning_rate": 1.9934420938513313e-05, - "loss": 0.5798, - "mean_token_accuracy": 0.8207332565330951, + "loss": 0.4451, + "mean_token_accuracy": 0.8515555292296904, "step": 98 }, { "epoch": 0.13518818810275668, - "grad_norm": 0.16226768820701468, + "grad_norm": 0.33820519416057315, "learning_rate": 1.9928848788857887e-05, - "loss": 0.6065, - "mean_token_accuracy": 0.8124493390038909, + "loss": 0.4687, + "mean_token_accuracy": 0.8435826255747995, "step": 99 }, { "epoch": 0.13655372535631988, - "grad_norm": 0.1668244978475933, + "grad_norm": 0.35474640844985367, "learning_rate": 1.9923050307166655e-05, - "loss": 0.6099, - "mean_token_accuracy": 0.8125860600623368, + "loss": 0.4694, + "mean_token_accuracy": 0.8443660349778408, "step": 100 }, { "epoch": 0.13791926260988308, - "grad_norm": 0.15950605291384126, + "grad_norm": 0.3787756690185754, "learning_rate": 1.9917025625618295e-05, - "loss": 0.576, - "mean_token_accuracy": 0.8216079326778184, + "loss": 0.443, + "mean_token_accuracy": 0.8515359704739787, "step": 101 }, { "epoch": 0.13928479986344627, - "grad_norm": 0.15625598092564863, + "grad_norm": 0.3362568869450033, "learning_rate": 1.9910774881547803e-05, - "loss": 0.6083, - "mean_token_accuracy": 0.8132024322490655, + "loss": 0.4697, + "mean_token_accuracy": 0.8443949120016512, "step": 102 }, { "epoch": 0.14065033711700947, - "grad_norm": 0.1649007722156239, + "grad_norm": 0.33478608195399623, "learning_rate": 1.9904298217443366e-05, - "loss": 0.6011, - "mean_token_accuracy": 0.8147066418764657, + "loss": 0.4655, + "mean_token_accuracy": 0.8454621512591025, "step": 103 }, { "epoch": 0.14201587437057267, - "grad_norm": 0.16364829331927225, + "grad_norm": 0.3157906568874303, "learning_rate": 1.9897595780943104e-05, - "loss": 0.5872, - "mean_token_accuracy": 0.8175959020514189, + "loss": 0.4549, + "mean_token_accuracy": 0.8479587393969411, "step": 104 }, { "epoch": 0.14338141162413587, - "grad_norm": 0.1612675923664165, + "grad_norm": 0.33967710856761196, "learning_rate": 1.989066772483171e-05, - "loss": 0.5948, - "mean_token_accuracy": 0.8193178037493722, + "loss": 0.4584, + "mean_token_accuracy": 0.8502480645030208, "step": 105 }, { "epoch": 0.14474694887769907, - "grad_norm": 0.1510386286971166, + "grad_norm": 0.3625458211243482, "learning_rate": 1.988351420703696e-05, - "loss": 0.5737, - "mean_token_accuracy": 0.820582794873434, + "loss": 0.4449, + "mean_token_accuracy": 0.8503154268653385, "step": 106 }, { "epoch": 0.14611248613126226, - "grad_norm": 0.16328098311145345, + "grad_norm": 0.33343733476340787, "learning_rate": 1.9876135390626123e-05, - "loss": 0.5949, - "mean_token_accuracy": 0.8164458977195729, + "loss": 0.4612, + "mean_token_accuracy": 0.8469237180257693, "step": 107 }, { "epoch": 0.14747802338482546, - "grad_norm": 0.1560170221302901, + "grad_norm": 0.3402896753249496, "learning_rate": 1.986853144380224e-05, - "loss": 0.6086, - "mean_token_accuracy": 0.8107480733760564, + "loss": 0.4737, + "mean_token_accuracy": 0.8415288812333515, "step": 108 }, { "epoch": 0.14884356063838866, - "grad_norm": 0.15934477578402553, + "grad_norm": 0.30321741460606055, "learning_rate": 1.9860702539900288e-05, - "loss": 0.5949, - "mean_token_accuracy": 0.8176761641667614, + "loss": 0.4604, + "mean_token_accuracy": 0.8479057193176431, "step": 109 }, { "epoch": 0.15020909789195186, - "grad_norm": 0.15243494652690576, + "grad_norm": 0.34336551349752786, "learning_rate": 1.9852648857383224e-05, - "loss": 0.5905, - "mean_token_accuracy": 0.8169501811664259, + "loss": 0.4588, + "mean_token_accuracy": 0.8476062469011421, "step": 110 }, { "epoch": 0.15157463514551506, - "grad_norm": 0.16639575383595412, + "grad_norm": 0.3096450131783167, "learning_rate": 1.984437057983793e-05, - "loss": 0.5622, - "mean_token_accuracy": 0.8248052249037514, + "loss": 0.4388, + "mean_token_accuracy": 0.8534628120170084, "step": 111 }, { "epoch": 0.15294017239907826, - "grad_norm": 0.15339548038719736, + "grad_norm": 0.36874309540738714, "learning_rate": 1.9835867895971015e-05, - "loss": 0.5675, - "mean_token_accuracy": 0.8232901077627216, + "loss": 0.4403, + "mean_token_accuracy": 0.8526718507556617, "step": 112 }, { "epoch": 0.15430570965264145, - "grad_norm": 0.1641574366974239, + "grad_norm": 0.3605796560605681, "learning_rate": 1.982714099960452e-05, - "loss": 0.5944, - "mean_token_accuracy": 0.8164603114849079, + "loss": 0.4622, + "mean_token_accuracy": 0.846226495136954, "step": 113 }, { "epoch": 0.15567124690620465, - "grad_norm": 0.16555790338361875, + "grad_norm": 0.4627441704849731, "learning_rate": 1.981819008967151e-05, - "loss": 0.6242, - "mean_token_accuracy": 0.8073138528223994, + "loss": 0.4858, + "mean_token_accuracy": 0.8394183495047153, "step": 114 }, { "epoch": 0.15703678415976785, - "grad_norm": 0.16231852304839273, + "grad_norm": 0.34714642141003565, "learning_rate": 1.9809015370211505e-05, - "loss": 0.601, - "mean_token_accuracy": 0.8146554075327814, + "loss": 0.4667, + "mean_token_accuracy": 0.8454976903304268, "step": 115 }, { "epoch": 0.15840232141333105, - "grad_norm": 0.14923726611567986, + "grad_norm": 0.34086378030044884, "learning_rate": 1.979961705036587e-05, - "loss": 0.5763, - "mean_token_accuracy": 0.8209353926759003, + "loss": 0.4483, + "mean_token_accuracy": 0.8508435940254968, "step": 116 }, { "epoch": 0.15976785866689425, - "grad_norm": 0.15237541956608258, + "grad_norm": 0.4140107037865621, "learning_rate": 1.9789995344373027e-05, - "loss": 0.5722, - "mean_token_accuracy": 0.8220480969698282, + "loss": 0.4449, + "mean_token_accuracy": 0.8518490808801514, "step": 117 }, { "epoch": 0.16113339592045745, - "grad_norm": 0.17192560552288916, + "grad_norm": 0.34331795050708586, "learning_rate": 1.978015047156356e-05, - "loss": 0.5918, - "mean_token_accuracy": 0.8143836414443538, + "loss": 0.4604, + "mean_token_accuracy": 0.8451241258921095, "step": 118 }, { "epoch": 0.16249893317402064, - "grad_norm": 0.1499688814801604, + "grad_norm": 0.3380747151761631, "learning_rate": 1.977008265635525e-05, - "loss": 0.5604, - "mean_token_accuracy": 0.8256925507628414, + "loss": 0.4343, + "mean_token_accuracy": 0.8551851359088176, "step": 119 }, { "epoch": 0.16386447042758384, - "grad_norm": 0.15574419566206335, + "grad_norm": 0.39246892074574696, "learning_rate": 1.9759792128247922e-05, - "loss": 0.5862, - "mean_token_accuracy": 0.8170917918219478, + "loss": 0.4566, + "mean_token_accuracy": 0.8475478494544696, "step": 120 }, { "epoch": 0.16523000768114704, - "grad_norm": 0.15722630462678863, + "grad_norm": 0.3077478512701279, "learning_rate": 1.9749279121818235e-05, - "loss": 0.5702, - "mean_token_accuracy": 0.8217694208719626, + "loss": 0.4446, + "mean_token_accuracy": 0.8506487627564587, "step": 121 }, { "epoch": 0.16659554493471024, - "grad_norm": 0.1551599270463081, + "grad_norm": 0.3128694161488127, "learning_rate": 1.9738543876714335e-05, - "loss": 0.5859, - "mean_token_accuracy": 0.8177714941060327, + "loss": 0.4566, + "mean_token_accuracy": 0.8477825790742587, "step": 122 }, { "epoch": 0.16796108218827344, - "grad_norm": 0.14791234185184035, + "grad_norm": 0.33718369881101784, "learning_rate": 1.9727586637650373e-05, - "loss": 0.5615, - "mean_token_accuracy": 0.8242425142330655, + "loss": 0.4381, + "mean_token_accuracy": 0.8531846568153524, "step": 123 }, { "epoch": 0.16932661944183663, - "grad_norm": 0.15746047981788416, + "grad_norm": 0.31111696829985575, "learning_rate": 1.9716407654400954e-05, - "loss": 0.6024, - "mean_token_accuracy": 0.8131880107167692, + "loss": 0.4715, + "mean_token_accuracy": 0.8433811011221168, "step": 124 }, { "epoch": 0.17069215669539983, - "grad_norm": 0.1518230254996991, + "grad_norm": 0.3250655447526398, "learning_rate": 1.9705007181795416e-05, - "loss": 0.5875, - "mean_token_accuracy": 0.8178493291115596, + "loss": 0.4594, + "mean_token_accuracy": 0.847703186503929, "step": 125 }, { "epoch": 0.17205769394896306, - "grad_norm": 0.15897910997197656, + "grad_norm": 0.31423640348811155, "learning_rate": 1.9693385479712047e-05, - "loss": 0.5632, - "mean_token_accuracy": 0.8240134756169244, + "loss": 0.442, + "mean_token_accuracy": 0.8514727417072737, "step": 126 }, { "epoch": 0.17342323120252626, - "grad_norm": 0.15169336827091273, + "grad_norm": 0.35736412097514153, "learning_rate": 1.9681542813072147e-05, - "loss": 0.6102, - "mean_token_accuracy": 0.8112127239293933, + "loss": 0.4776, + "mean_token_accuracy": 0.8411854317148191, "step": 127 }, { "epoch": 0.17478876845608946, - "grad_norm": 0.14843024953560452, + "grad_norm": 0.2902859082859034, "learning_rate": 1.9669479451833976e-05, - "loss": 0.5659, - "mean_token_accuracy": 0.824494102154264, + "loss": 0.4388, + "mean_token_accuracy": 0.8539477536048056, "step": 128 }, { "epoch": 0.17615430570965265, - "grad_norm": 0.15254027099274217, + "grad_norm": 0.31967638765399986, "learning_rate": 1.9657195670986638e-05, - "loss": 0.583, - "mean_token_accuracy": 0.81836887451526, + "loss": 0.4563, + "mean_token_accuracy": 0.8475490129873101, "step": 129 }, { "epoch": 0.17751984296321585, - "grad_norm": 0.1443482590373341, + "grad_norm": 0.3019630526325269, "learning_rate": 1.964469175054377e-05, - "loss": 0.5777, - "mean_token_accuracy": 0.8207191289569673, + "loss": 0.4508, + "mean_token_accuracy": 0.8500409132192751, "step": 130 }, { "epoch": 0.17888538021677905, - "grad_norm": 0.15148906940942358, + "grad_norm": 0.28471378493835015, "learning_rate": 1.963196797553718e-05, - "loss": 0.5855, - "mean_token_accuracy": 0.8175059432926892, + "loss": 0.4592, + "mean_token_accuracy": 0.8465730185761348, "step": 131 }, { "epoch": 0.18025091747034225, - "grad_norm": 0.15583639190954585, + "grad_norm": 0.2976461319068471, "learning_rate": 1.961902463601036e-05, - "loss": 0.5732, - "mean_token_accuracy": 0.8213490752756972, + "loss": 0.4483, + "mean_token_accuracy": 0.8504603311676011, "step": 132 }, { "epoch": 0.18161645472390545, - "grad_norm": 0.15376868253070186, + "grad_norm": 0.3420853043630677, "learning_rate": 1.9605862027011858e-05, - "loss": 0.5708, - "mean_token_accuracy": 0.8233471857890292, + "loss": 0.446, + "mean_token_accuracy": 0.8522397868056077, "step": 133 }, { "epoch": 0.18298199197746864, - "grad_norm": 0.1557961846589519, + "grad_norm": 0.30068683893569925, "learning_rate": 1.959248044858854e-05, - "loss": 0.5796, - "mean_token_accuracy": 0.8185728539916792, + "loss": 0.4538, + "mean_token_accuracy": 0.8479476016652441, "step": 134 }, { "epoch": 0.18434752923103184, - "grad_norm": 0.1547477399190793, + "grad_norm": 0.337968764863526, "learning_rate": 1.9578880205778793e-05, - "loss": 0.5494, - "mean_token_accuracy": 0.8268111439446552, + "loss": 0.432, + "mean_token_accuracy": 0.8544472005484545, "step": 135 }, { "epoch": 0.18571306648459504, - "grad_norm": 0.14990492474348963, + "grad_norm": 0.3187881918567848, "learning_rate": 1.9565061608605526e-05, - "loss": 0.578, - "mean_token_accuracy": 0.8188870102970949, + "loss": 0.4533, + "mean_token_accuracy": 0.8479640485502516, "step": 136 }, { "epoch": 0.18707860373815824, - "grad_norm": 0.16686497175879653, + "grad_norm": 0.35269232923831895, "learning_rate": 1.9551024972069127e-05, - "loss": 0.5741, - "mean_token_accuracy": 0.8220592244306001, + "loss": 0.4509, + "mean_token_accuracy": 0.8508397906934503, "step": 137 }, { "epoch": 0.18844414099172144, - "grad_norm": 0.15321599453816967, + "grad_norm": 0.3058471460955213, "learning_rate": 1.9536770616140277e-05, - "loss": 0.574, - "mean_token_accuracy": 0.8210809002578991, + "loss": 0.4514, + "mean_token_accuracy": 0.8494591928231272, "step": 138 }, { "epoch": 0.18980967824528464, - "grad_norm": 0.14704039610363137, + "grad_norm": 0.2923803706742491, "learning_rate": 1.9522298865752662e-05, - "loss": 0.5726, - "mean_token_accuracy": 0.8217046251572349, + "loss": 0.4495, + "mean_token_accuracy": 0.8497101586020829, "step": 139 }, { "epoch": 0.19117521549884783, - "grad_norm": 0.14947910413011775, + "grad_norm": 0.35715418556836526, "learning_rate": 1.950761005079556e-05, - "loss": 0.608, - "mean_token_accuracy": 0.8113111914702771, + "loss": 0.4779, + "mean_token_accuracy": 0.8411692267142432, "step": 140 }, { "epoch": 0.19254075275241103, - "grad_norm": 0.14006407922714537, + "grad_norm": 0.27357483270279737, "learning_rate": 1.949270450610631e-05, - "loss": 0.5266, - "mean_token_accuracy": 0.8338308730096788, + "loss": 0.4116, + "mean_token_accuracy": 0.8609656246183783, "step": 141 }, { "epoch": 0.19390629000597423, - "grad_norm": 0.15105705759092428, + "grad_norm": 0.3713092433646525, "learning_rate": 1.9477582571462706e-05, - "loss": 0.574, - "mean_token_accuracy": 0.8209034170456223, + "loss": 0.4513, + "mean_token_accuracy": 0.8494811710899883, "step": 142 }, { "epoch": 0.19527182725953743, - "grad_norm": 0.15161327457043544, + "grad_norm": 0.34866780781386153, "learning_rate": 1.9462244591575222e-05, - "loss": 0.5683, - "mean_token_accuracy": 0.8242010058951029, + "loss": 0.4447, + "mean_token_accuracy": 0.8526829842758383, "step": 143 }, { "epoch": 0.19663736451310063, - "grad_norm": 0.14286613394281059, + "grad_norm": 0.2949930109269014, "learning_rate": 1.944669091607919e-05, - "loss": 0.5449, - "mean_token_accuracy": 0.8290125632504537, + "loss": 0.4273, + "mean_token_accuracy": 0.8562389055851354, "step": 144 }, { "epoch": 0.19800290176666382, - "grad_norm": 0.13394746845580938, + "grad_norm": 0.374752731020472, "learning_rate": 1.9430921899526786e-05, - "loss": 0.5698, - "mean_token_accuracy": 0.8213632897447157, + "loss": 0.4478, + "mean_token_accuracy": 0.8499277623582386, "step": 145 }, { "epoch": 0.19936843902022702, - "grad_norm": 0.1596358144360359, + "grad_norm": 0.3651538698048502, "learning_rate": 1.941493790137898e-05, - "loss": 0.5649, - "mean_token_accuracy": 0.8245563770741386, + "loss": 0.4442, + "mean_token_accuracy": 0.8526649985454055, "step": 146 }, { "epoch": 0.20073397627379022, - "grad_norm": 0.15965925954961144, + "grad_norm": 0.36636578170953166, "learning_rate": 1.9398739285997342e-05, - "loss": 0.5701, - "mean_token_accuracy": 0.822431978602767, + "loss": 0.4485, + "mean_token_accuracy": 0.8504179631544849, "step": 147 }, { "epoch": 0.20209951352735342, - "grad_norm": 0.149613654438459, + "grad_norm": 0.40761326956885474, "learning_rate": 1.9382326422635705e-05, - "loss": 0.5888, - "mean_token_accuracy": 0.8162564158817408, + "loss": 0.4642, + "mean_token_accuracy": 0.8452167499746224, "step": 148 }, { "epoch": 0.20346505078091662, - "grad_norm": 0.13603898869696263, + "grad_norm": 0.31422691233021155, "learning_rate": 1.936569968543179e-05, - "loss": 0.5558, - "mean_token_accuracy": 0.8272639044101179, + "loss": 0.4368, + "mean_token_accuracy": 0.8547842065414144, "step": 149 }, { "epoch": 0.20483058803447982, - "grad_norm": 0.14257305363496442, + "grad_norm": 0.35446348871617744, "learning_rate": 1.934885945339865e-05, - "loss": 0.558, - "mean_token_accuracy": 0.8228598752402781, + "loss": 0.4402, + "mean_token_accuracy": 0.8503691031564117, "step": 150 }, { "epoch": 0.20619612528804301, - "grad_norm": 0.14269109052044499, + "grad_norm": 0.32285449024202295, "learning_rate": 1.9331806110416027e-05, - "loss": 0.5742, - "mean_token_accuracy": 0.8219213690158541, + "loss": 0.4531, + "mean_token_accuracy": 0.8497348879856765, "step": 151 }, { "epoch": 0.2075616625416062, - "grad_norm": 0.1379489113395024, + "grad_norm": 0.2974246522701837, "learning_rate": 1.9314540045221628e-05, - "loss": 0.545, - "mean_token_accuracy": 0.8301045701472723, + "loss": 0.4292, + "mean_token_accuracy": 0.8566265295867389, "step": 152 }, { "epoch": 0.2089271997951694, - "grad_norm": 0.13979299837928688, + "grad_norm": 0.34647196924654977, "learning_rate": 1.9297061651402237e-05, - "loss": 0.5522, - "mean_token_accuracy": 0.8255754818361906, + "loss": 0.4326, + "mean_token_accuracy": 0.8539450355140382, "step": 153 }, { "epoch": 0.2102927370487326, - "grad_norm": 0.13499535132303167, + "grad_norm": 0.386258719161046, "learning_rate": 1.927937132738476e-05, - "loss": 0.5593, - "mean_token_accuracy": 0.8261764102787341, + "loss": 0.4421, + "mean_token_accuracy": 0.8532191960011701, "step": 154 }, { "epoch": 0.2116582743022958, - "grad_norm": 0.13357832635148342, + "grad_norm": 0.34454258298504326, "learning_rate": 1.9261469476427122e-05, - "loss": 0.5397, - "mean_token_accuracy": 0.8299966359575962, + "loss": 0.4258, + "mean_token_accuracy": 0.8568923616497963, "step": 155 }, { "epoch": 0.213023811555859, - "grad_norm": 0.15460866354890906, + "grad_norm": 0.3541361157616604, "learning_rate": 1.92433565066091e-05, - "loss": 0.5737, - "mean_token_accuracy": 0.8204832190275533, + "loss": 0.4529, + "mean_token_accuracy": 0.8479736960845543, "step": 156 }, { "epoch": 0.2143893488094222, - "grad_norm": 0.13514237756317554, + "grad_norm": 0.35191182105790164, "learning_rate": 1.922503283082301e-05, - "loss": 0.5312, - "mean_token_accuracy": 0.8314253038819239, + "loss": 0.4202, + "mean_token_accuracy": 0.8578095481186783, "step": 157 }, { "epoch": 0.2157548860629854, - "grad_norm": 0.14517411624870374, + "grad_norm": 0.42204829019945844, "learning_rate": 1.920649886676429e-05, - "loss": 0.6027, - "mean_token_accuracy": 0.8145067781443469, + "loss": 0.4753, + "mean_token_accuracy": 0.8431488494361127, "step": 158 }, { "epoch": 0.2171204233165486, - "grad_norm": 0.1331144491854251, + "grad_norm": 0.282563065166861, "learning_rate": 1.9187755036921976e-05, - "loss": 0.569, - "mean_token_accuracy": 0.8209313751458445, + "loss": 0.4481, + "mean_token_accuracy": 0.8492420629486095, "step": 159 }, { "epoch": 0.2184859605701118, - "grad_norm": 0.13545509088905294, + "grad_norm": 0.3294030087715004, "learning_rate": 1.916880176856909e-05, - "loss": 0.5352, - "mean_token_accuracy": 0.8297160330992364, + "loss": 0.4239, + "mean_token_accuracy": 0.856072747237234, "step": 160 }, { "epoch": 0.219851497823675, - "grad_norm": 0.1353911131873309, + "grad_norm": 0.31721191426888634, "learning_rate": 1.914963949375288e-05, - "loss": 0.556, - "mean_token_accuracy": 0.8260038227559241, + "loss": 0.4393, + "mean_token_accuracy": 0.8534969611431503, "step": 161 }, { "epoch": 0.2212170350772382, - "grad_norm": 0.143550455135651, + "grad_norm": 0.34061976253510645, "learning_rate": 1.9130268649284982e-05, - "loss": 0.5694, - "mean_token_accuracy": 0.8208407328549958, + "loss": 0.4495, + "mean_token_accuracy": 0.8489997065725852, "step": 162 }, { "epoch": 0.2225825723308014, - "grad_norm": 0.13808675635320625, + "grad_norm": 0.319599698534074, "learning_rate": 1.9110689676731454e-05, - "loss": 0.571, - "mean_token_accuracy": 0.8201715175969139, + "loss": 0.4524, + "mean_token_accuracy": 0.8484242719179398, "step": 163 }, { "epoch": 0.2239481095843646, - "grad_norm": 0.1356824603082857, + "grad_norm": 0.38523458399821875, "learning_rate": 1.909090302240273e-05, - "loss": 0.5657, - "mean_token_accuracy": 0.8215189568156395, + "loss": 0.4459, + "mean_token_accuracy": 0.8499845577594263, "step": 164 }, { "epoch": 0.2253136468379278, - "grad_norm": 0.1337600993543247, + "grad_norm": 0.3082646414065079, "learning_rate": 1.907090913734341e-05, - "loss": 0.5341, - "mean_token_accuracy": 0.8301763298412165, + "loss": 0.4228, + "mean_token_accuracy": 0.8565411948866354, "step": 165 }, { "epoch": 0.226679184091491, - "grad_norm": 0.13158179752247454, + "grad_norm": 0.4281504609323445, "learning_rate": 1.905070847732202e-05, - "loss": 0.5357, - "mean_token_accuracy": 0.8299088339028999, + "loss": 0.4231, + "mean_token_accuracy": 0.8567314407107315, "step": 166 }, { "epoch": 0.22804472134505419, - "grad_norm": 0.13262900953882384, + "grad_norm": 0.3122301354550845, "learning_rate": 1.9030301502820597e-05, - "loss": 0.5575, - "mean_token_accuracy": 0.8251422531196677, + "loss": 0.4406, + "mean_token_accuracy": 0.8526059868064779, "step": 167 }, { "epoch": 0.22941025859861738, - "grad_norm": 0.1492931382695929, + "grad_norm": 0.3600671344222034, "learning_rate": 1.900968867902419e-05, - "loss": 0.5574, - "mean_token_accuracy": 0.8233750352955169, + "loss": 0.4423, + "mean_token_accuracy": 0.8509005211327412, "step": 168 }, { "epoch": 0.23077579585218058, - "grad_norm": 0.13005401323572754, + "grad_norm": 0.3335012451792573, "learning_rate": 1.8988870475810284e-05, - "loss": 0.5485, - "mean_token_accuracy": 0.826333597918774, + "loss": 0.4323, + "mean_token_accuracy": 0.8542063476327129, "step": 169 }, { "epoch": 0.23214133310574378, - "grad_norm": 0.1415035467111907, + "grad_norm": 0.3416637272262734, "learning_rate": 1.896784736773805e-05, - "loss": 0.5721, - "mean_token_accuracy": 0.8213436999644876, + "loss": 0.4544, + "mean_token_accuracy": 0.8489287380113998, "step": 170 }, { "epoch": 0.23350687035930698, - "grad_norm": 0.1427296035853988, + "grad_norm": 0.4300545403831784, "learning_rate": 1.894661983403755e-05, - "loss": 0.5561, - "mean_token_accuracy": 0.8268984747352183, + "loss": 0.441, + "mean_token_accuracy": 0.8535810843521906, "step": 171 }, { "epoch": 0.23487240761287018, - "grad_norm": 0.12952647926731686, + "grad_norm": 0.3258898174006088, "learning_rate": 1.8925188358598815e-05, - "loss": 0.552, - "mean_token_accuracy": 0.8251283170409514, + "loss": 0.4376, + "mean_token_accuracy": 0.8517990481723685, "step": 172 }, { "epoch": 0.23623794486643337, - "grad_norm": 0.1345736561036896, + "grad_norm": 0.2988219576519335, "learning_rate": 1.8903553429960803e-05, - "loss": 0.5934, - "mean_token_accuracy": 0.8122446660074, + "loss": 0.4694, + "mean_token_accuracy": 0.8419916214618768, "step": 173 }, { "epoch": 0.23760348211999657, - "grad_norm": 0.13630944168995449, + "grad_norm": 0.32171167299283115, "learning_rate": 1.8881715541300278e-05, - "loss": 0.557, - "mean_token_accuracy": 0.8253395927516096, + "loss": 0.4412, + "mean_token_accuracy": 0.8526490817200958, "step": 174 }, { "epoch": 0.23896901937355977, - "grad_norm": 0.14873507389496698, + "grad_norm": 0.36549760624914024, "learning_rate": 1.885967519042054e-05, - "loss": 0.5765, - "mean_token_accuracy": 0.818127566409884, + "loss": 0.4575, + "mean_token_accuracy": 0.8467209578442905, "step": 175 }, { "epoch": 0.240334556627123, - "grad_norm": 0.13810164588116702, + "grad_norm": 0.3206347251456072, "learning_rate": 1.8837432879740113e-05, - "loss": 0.5363, - "mean_token_accuracy": 0.8293301344276796, + "loss": 0.4253, + "mean_token_accuracy": 0.8562328690343533, "step": 176 }, { "epoch": 0.2417000938806862, - "grad_norm": 0.14501815664944306, + "grad_norm": 0.36277383760815624, "learning_rate": 1.881498911628127e-05, - "loss": 0.545, - "mean_token_accuracy": 0.8275873304548665, + "loss": 0.4317, + "mean_token_accuracy": 0.8544379338426664, "step": 177 }, { "epoch": 0.2430656311342494, - "grad_norm": 0.13328386048720128, + "grad_norm": 0.3634699354146494, "learning_rate": 1.879234441165847e-05, - "loss": 0.5356, - "mean_token_accuracy": 0.8297564037224775, + "loss": 0.4257, + "mean_token_accuracy": 0.8562673113043472, "step": 178 }, { "epoch": 0.2444311683878126, - "grad_norm": 0.13545579038846284, + "grad_norm": 0.3364593897580825, "learning_rate": 1.8769499282066716e-05, - "loss": 0.5415, - "mean_token_accuracy": 0.828067797519634, + "loss": 0.4288, + "mean_token_accuracy": 0.8545978179267162, "step": 179 }, { "epoch": 0.2457967056413758, - "grad_norm": 0.14943071887327852, + "grad_norm": 0.36736450922616337, "learning_rate": 1.8746454248269777e-05, - "loss": 0.5759, - "mean_token_accuracy": 0.8204827673999263, + "loss": 0.4547, + "mean_token_accuracy": 0.8489931015745901, "step": 180 }, { "epoch": 0.247162242894939, - "grad_norm": 0.14377197599594754, + "grad_norm": 0.2842529030222029, "learning_rate": 1.872320983558831e-05, - "loss": 0.5502, - "mean_token_accuracy": 0.8262096409918235, + "loss": 0.4357, + "mean_token_accuracy": 0.8537670434938108, "step": 181 }, { "epoch": 0.2485277801485022, - "grad_norm": 0.13753473688253312, + "grad_norm": 0.3503501258863951, "learning_rate": 1.8699766573887902e-05, - "loss": 0.5397, - "mean_token_accuracy": 0.8283154378152029, + "loss": 0.4275, + "mean_token_accuracy": 0.8555725755068402, "step": 182 }, { "epoch": 0.24989331740206538, - "grad_norm": 0.13788152893603342, + "grad_norm": 0.32115728482149525, "learning_rate": 1.867612499756697e-05, - "loss": 0.5487, - "mean_token_accuracy": 0.8269837704923455, + "loss": 0.4349, + "mean_token_accuracy": 0.8538767592466098, "step": 183 }, { "epoch": 0.2512588546556286, - "grad_norm": 0.15718665969687443, + "grad_norm": 0.37284059913436857, "learning_rate": 1.8652285645544602e-05, - "loss": 0.5313, - "mean_token_accuracy": 0.8307698146448872, + "loss": 0.4229, + "mean_token_accuracy": 0.8565206996350189, "step": 184 }, { "epoch": 0.2526243919091918, - "grad_norm": 0.13557438263303107, + "grad_norm": 0.27914377362605364, "learning_rate": 1.862824906124826e-05, - "loss": 0.5549, - "mean_token_accuracy": 0.8258225094589919, + "loss": 0.4389, + "mean_token_accuracy": 0.8533956082213735, "step": 185 }, { "epoch": 0.253989929162755, - "grad_norm": 0.13830195765753106, + "grad_norm": 0.33047332583495903, "learning_rate": 1.8604015792601395e-05, - "loss": 0.5395, - "mean_token_accuracy": 0.8320858107875397, + "loss": 0.4259, + "mean_token_accuracy": 0.8587885669486044, "step": 186 }, { "epoch": 0.2553554664163182, - "grad_norm": 0.14445995905296166, + "grad_norm": 0.3098565583156071, "learning_rate": 1.8579586392010943e-05, - "loss": 0.5962, - "mean_token_accuracy": 0.8120192317691157, + "loss": 0.4725, + "mean_token_accuracy": 0.841463056808317, "step": 187 }, { "epoch": 0.2567210036698814, - "grad_norm": 0.1434439599799226, + "grad_norm": 0.32188400036801423, "learning_rate": 1.855496141635476e-05, - "loss": 0.5656, - "mean_token_accuracy": 0.8218863154853169, + "loss": 0.4488, + "mean_token_accuracy": 0.8494999156308464, "step": 188 }, { "epoch": 0.2580865409234446, - "grad_norm": 0.13868086713753697, + "grad_norm": 0.3130726973042191, "learning_rate": 1.8530141426968905e-05, - "loss": 0.5416, - "mean_token_accuracy": 0.8300532645478108, + "loss": 0.4278, + "mean_token_accuracy": 0.8566668881226269, "step": 189 }, { "epoch": 0.25945207817700777, - "grad_norm": 0.1238779281127213, + "grad_norm": 0.3957651044446383, "learning_rate": 1.850512698963485e-05, - "loss": 0.5334, - "mean_token_accuracy": 0.8308739755013311, + "loss": 0.4261, + "mean_token_accuracy": 0.8556154103093908, "step": 190 }, { "epoch": 0.26081761543057097, - "grad_norm": 0.13496254997436558, + "grad_norm": 0.3031069025669304, "learning_rate": 1.8479918674566602e-05, - "loss": 0.5459, - "mean_token_accuracy": 0.827372883221416, + "loss": 0.4329, + "mean_token_accuracy": 0.8541440562465102, "step": 191 }, { "epoch": 0.26218315268413417, - "grad_norm": 0.13282003481920548, + "grad_norm": 0.3643754085496684, "learning_rate": 1.8454517056397663e-05, - "loss": 0.5439, - "mean_token_accuracy": 0.8287497510015901, + "loss": 0.4324, + "mean_token_accuracy": 0.855023699771463, "step": 192 }, { "epoch": 0.26354868993769737, - "grad_norm": 0.1310777057405676, + "grad_norm": 0.3003852707955295, "learning_rate": 1.842892271416797e-05, - "loss": 0.5363, - "mean_token_accuracy": 0.8290369900235632, + "loss": 0.4261, + "mean_token_accuracy": 0.8553599004543545, "step": 193 }, { "epoch": 0.26491422719126057, - "grad_norm": 0.13451975484760634, + "grad_norm": 0.34484824975763084, "learning_rate": 1.8403136231310686e-05, - "loss": 0.5365, - "mean_token_accuracy": 0.8300333457414567, + "loss": 0.4252, + "mean_token_accuracy": 0.8566094556118384, "step": 194 }, { "epoch": 0.26627976444482376, - "grad_norm": 0.1272263904792763, + "grad_norm": 0.3248230314864982, "learning_rate": 1.8377158195638877e-05, - "loss": 0.5541, - "mean_token_accuracy": 0.8228543929432492, + "loss": 0.4411, + "mean_token_accuracy": 0.850191582672607, "step": 195 }, { "epoch": 0.26764530169838696, - "grad_norm": 0.13686458359739945, + "grad_norm": 0.3425796212560859, "learning_rate": 1.8350989199332156e-05, - "loss": 0.5614, - "mean_token_accuracy": 0.8215735166911772, + "loss": 0.4467, + "mean_token_accuracy": 0.8491048527905652, "step": 196 }, { "epoch": 0.26901083895195016, - "grad_norm": 0.1323734785913802, + "grad_norm": 0.34768568039417325, "learning_rate": 1.8324629838923132e-05, - "loss": 0.5561, - "mean_token_accuracy": 0.8245785342546282, + "loss": 0.4421, + "mean_token_accuracy": 0.8511677206305359, "step": 197 }, { "epoch": 0.27037637620551336, - "grad_norm": 0.14417178181997775, + "grad_norm": 0.34080656327066544, "learning_rate": 1.829808071528386e-05, - "loss": 0.5379, - "mean_token_accuracy": 0.8308799474140498, + "loss": 0.4281, + "mean_token_accuracy": 0.8569207458025948, "step": 198 }, { "epoch": 0.27174191345907656, - "grad_norm": 0.12947284324247152, + "grad_norm": 0.3367911094729273, "learning_rate": 1.8271342433612114e-05, - "loss": 0.5536, - "mean_token_accuracy": 0.8250832698738962, + "loss": 0.4395, + "mean_token_accuracy": 0.8523639031650138, "step": 199 }, { "epoch": 0.27310745071263975, - "grad_norm": 0.14590997190021188, + "grad_norm": 0.3151512003460036, "learning_rate": 1.8244415603417603e-05, - "loss": 0.5483, - "mean_token_accuracy": 0.8294859396930414, + "loss": 0.4348, + "mean_token_accuracy": 0.8558035079566513, "step": 200 }, { "epoch": 0.27447298796620295, - "grad_norm": 0.12681749937502249, + "grad_norm": 0.3220180875156976, "learning_rate": 1.8217300838508075e-05, - "loss": 0.5058, - "mean_token_accuracy": 0.8389237012622073, + "loss": 0.403, + "mean_token_accuracy": 0.8631310614267218, "step": 201 }, { "epoch": 0.27583852521976615, - "grad_norm": 0.1324745045572296, + "grad_norm": 0.28929076526601605, "learning_rate": 1.8189998756975318e-05, - "loss": 0.5371, - "mean_token_accuracy": 0.8296920842600208, + "loss": 0.4282, + "mean_token_accuracy": 0.8555874931968974, "step": 202 }, { "epoch": 0.27720406247332935, - "grad_norm": 0.14201595338162082, + "grad_norm": 0.36330635065555233, "learning_rate": 1.8162509981181084e-05, - "loss": 0.5653, - "mean_token_accuracy": 0.8219865846459583, + "loss": 0.4479, + "mean_token_accuracy": 0.8496229750925639, "step": 203 }, { "epoch": 0.27856959972689255, - "grad_norm": 0.13123684789745846, + "grad_norm": 0.27638019683036386, "learning_rate": 1.813483513774289e-05, - "loss": 0.5475, - "mean_token_accuracy": 0.8265987826592883, + "loss": 0.436, + "mean_token_accuracy": 0.8529176942517751, "step": 204 }, { "epoch": 0.27993513698045575, - "grad_norm": 0.13156048329455144, + "grad_norm": 0.35506296866931764, "learning_rate": 1.8106974857519737e-05, - "loss": 0.5567, - "mean_token_accuracy": 0.8251586870762427, + "loss": 0.4426, + "mean_token_accuracy": 0.8514268242896852, "step": 205 }, { "epoch": 0.28130067423401894, - "grad_norm": 0.1385452033831207, + "grad_norm": 0.3286137389758072, "learning_rate": 1.807892977559774e-05, - "loss": 0.5568, - "mean_token_accuracy": 0.8242658837647271, + "loss": 0.4417, + "mean_token_accuracy": 0.8510960913771437, "step": 206 }, { "epoch": 0.28266621148758214, - "grad_norm": 0.1290006207819128, + "grad_norm": 0.25518635308883814, "learning_rate": 1.8050700531275632e-05, - "loss": 0.5278, - "mean_token_accuracy": 0.833256919964306, + "loss": 0.418, + "mean_token_accuracy": 0.8594128726528723, "step": 207 }, { "epoch": 0.28403174874114534, - "grad_norm": 0.1340632544778837, + "grad_norm": 0.34031577918112327, "learning_rate": 1.80222877680502e-05, - "loss": 0.5294, - "mean_token_accuracy": 0.8311739550173208, + "loss": 0.421, + "mean_token_accuracy": 0.8569013665570591, "step": 208 }, { "epoch": 0.28539728599470854, - "grad_norm": 0.13745074937644006, + "grad_norm": 0.39038645873424477, "learning_rate": 1.799369213360163e-05, - "loss": 0.5484, - "mean_token_accuracy": 0.8260552978104109, + "loss": 0.4389, + "mean_token_accuracy": 0.851740201076106, "step": 209 }, { "epoch": 0.28676282324827174, - "grad_norm": 0.13273081908857917, + "grad_norm": 0.34488640211714505, "learning_rate": 1.7964914279778716e-05, - "loss": 0.5561, - "mean_token_accuracy": 0.8248778354586159, + "loss": 0.4449, + "mean_token_accuracy": 0.8504230733766642, "step": 210 }, { "epoch": 0.28812836050183493, - "grad_norm": 0.13634420454832463, + "grad_norm": 0.2935740182091683, "learning_rate": 1.7935954862584018e-05, - "loss": 0.5424, - "mean_token_accuracy": 0.8296063617958134, + "loss": 0.4321, + "mean_token_accuracy": 0.8557445828098919, "step": 211 }, { "epoch": 0.28949389775539813, - "grad_norm": 0.14507597898409702, + "grad_norm": 0.3228921123428817, "learning_rate": 1.7906814542158913e-05, - "loss": 0.5429, - "mean_token_accuracy": 0.8272200172131677, + "loss": 0.4305, + "mean_token_accuracy": 0.8537493570872144, "step": 212 }, { "epoch": 0.29085943500896133, - "grad_norm": 0.13549144789577303, + "grad_norm": 0.28556153848386795, "learning_rate": 1.7877493982768527e-05, - "loss": 0.5572, - "mean_token_accuracy": 0.8229477700284591, + "loss": 0.444, + "mean_token_accuracy": 0.8495622520508093, "step": 213 }, { "epoch": 0.29222497226252453, - "grad_norm": 0.14011349648698726, + "grad_norm": 0.31694287799599025, "learning_rate": 1.7847993852786612e-05, - "loss": 0.5592, - "mean_token_accuracy": 0.8236593588368766, + "loss": 0.4461, + "mean_token_accuracy": 0.8494224622463822, "step": 214 }, { "epoch": 0.2935905095160877, - "grad_norm": 0.1268734857451273, + "grad_norm": 0.3092018814084449, "learning_rate": 1.78183148246803e-05, - "loss": 0.5432, - "mean_token_accuracy": 0.8279324505424972, + "loss": 0.4301, + "mean_token_accuracy": 0.8543694459213177, "step": 215 }, { "epoch": 0.2949560467696509, - "grad_norm": 0.13044891490589386, + "grad_norm": 0.2902008097484682, "learning_rate": 1.7788457574994777e-05, - "loss": 0.5212, - "mean_token_accuracy": 0.8333174436738015, + "loss": 0.4151, + "mean_token_accuracy": 0.8582394007580811, "step": 216 }, { "epoch": 0.2963215840232141, - "grad_norm": 0.13181248494960948, + "grad_norm": 0.3328026971189236, "learning_rate": 1.775842278433786e-05, - "loss": 0.5345, - "mean_token_accuracy": 0.830449368618983, + "loss": 0.4255, + "mean_token_accuracy": 0.8564833688598898, "step": 217 }, { "epoch": 0.2976871212767773, - "grad_norm": 0.12905826685555044, + "grad_norm": 0.27435410637394864, "learning_rate": 1.772821113736449e-05, - "loss": 0.537, - "mean_token_accuracy": 0.8300991074681177, + "loss": 0.4262, + "mean_token_accuracy": 0.8559848881104419, "step": 218 }, { "epoch": 0.2990526585303405, - "grad_norm": 0.1283519363537463, + "grad_norm": 0.3632335496079091, "learning_rate": 1.76978233227611e-05, - "loss": 0.5154, - "mean_token_accuracy": 0.8365102500151056, + "loss": 0.4107, + "mean_token_accuracy": 0.8612579363381194, "step": 219 }, { "epoch": 0.3004181957839037, - "grad_norm": 0.1366335089373392, + "grad_norm": 0.3167909582527359, "learning_rate": 1.7667260033229953e-05, - "loss": 0.5452, - "mean_token_accuracy": 0.8278096991565351, + "loss": 0.4342, + "mean_token_accuracy": 0.8534075264867609, "step": 220 }, { "epoch": 0.3017837330374669, - "grad_norm": 0.12957719912103285, + "grad_norm": 0.31022395154637067, "learning_rate": 1.7636521965473324e-05, - "loss": 0.5388, - "mean_token_accuracy": 0.8281610383426196, + "loss": 0.4295, + "mean_token_accuracy": 0.8542319208452556, "step": 221 }, { "epoch": 0.3031492702910301, - "grad_norm": 0.13845354182656133, + "grad_norm": 0.33282321447951396, "learning_rate": 1.760560982017762e-05, - "loss": 0.57, - "mean_token_accuracy": 0.8198283955896564, + "loss": 0.4529, + "mean_token_accuracy": 0.8473886646532048, "step": 222 }, { "epoch": 0.3045148075445933, - "grad_norm": 0.14021868256279998, + "grad_norm": 0.3703839580647503, "learning_rate": 1.7574524301997425e-05, - "loss": 0.5533, - "mean_token_accuracy": 0.8260795486468631, + "loss": 0.4418, + "mean_token_accuracy": 0.8520835295773124, "step": 223 }, { "epoch": 0.3058803447981565, - "grad_norm": 0.13513581606031977, + "grad_norm": 0.29499276522228773, "learning_rate": 1.7543266119539424e-05, - "loss": 0.5297, - "mean_token_accuracy": 0.8320646556472823, + "loss": 0.4222, + "mean_token_accuracy": 0.8572304378636991, "step": 224 }, { "epoch": 0.3072458820517197, - "grad_norm": 0.13524998045930295, + "grad_norm": 0.36404736107381586, "learning_rate": 1.751183598534625e-05, - "loss": 0.5243, - "mean_token_accuracy": 0.8338239484220115, + "loss": 0.417, + "mean_token_accuracy": 0.8592393267568642, "step": 225 }, { "epoch": 0.3086114193052829, - "grad_norm": 0.13747110677070973, + "grad_norm": 0.31900880116039887, "learning_rate": 1.7480234615880247e-05, - "loss": 0.5314, - "mean_token_accuracy": 0.8315549957547392, + "loss": 0.4221, + "mean_token_accuracy": 0.8570558484506429, "step": 226 }, { "epoch": 0.3099769565588461, - "grad_norm": 0.1321016501011579, + "grad_norm": 0.39339547941721115, "learning_rate": 1.7448462731507133e-05, - "loss": 0.5467, - "mean_token_accuracy": 0.8262590160621643, + "loss": 0.4351, + "mean_token_accuracy": 0.8529953241096725, "step": 227 }, { "epoch": 0.3113424938124093, - "grad_norm": 0.14148785578493014, + "grad_norm": 0.38617429480416876, "learning_rate": 1.7416521056479577e-05, - "loss": 0.5461, - "mean_token_accuracy": 0.8264614261715929, + "loss": 0.4352, + "mean_token_accuracy": 0.8529279155103227, "step": 228 }, { "epoch": 0.3127080310659725, - "grad_norm": 0.12955174985551135, + "grad_norm": 0.3239118787173082, "learning_rate": 1.7384410318920698e-05, - "loss": 0.5503, - "mean_token_accuracy": 0.8258825994742341, + "loss": 0.4387, + "mean_token_accuracy": 0.8516904522581532, "step": 229 }, { "epoch": 0.3140735683195357, - "grad_norm": 0.1291967470853804, + "grad_norm": 0.3142116046355029, "learning_rate": 1.7352131250807466e-05, - "loss": 0.5229, - "mean_token_accuracy": 0.8344412680528862, + "loss": 0.4164, + "mean_token_accuracy": 0.8592193212873477, "step": 230 }, { "epoch": 0.3154391055730989, - "grad_norm": 0.13224712785647114, + "grad_norm": 0.3581861836859027, "learning_rate": 1.7319684587954e-05, - "loss": 0.5242, - "mean_token_accuracy": 0.8331331640328826, + "loss": 0.4175, + "mean_token_accuracy": 0.8585542096028473, "step": 231 }, { "epoch": 0.3168046428266621, - "grad_norm": 0.1239558039889443, + "grad_norm": 0.3011586770676647, "learning_rate": 1.728707106999482e-05, - "loss": 0.5054, - "mean_token_accuracy": 0.8383066782904636, + "loss": 0.4034, + "mean_token_accuracy": 0.8631680964286239, "step": 232 }, { "epoch": 0.3181701800802253, - "grad_norm": 0.13111111880762122, + "grad_norm": 0.3215575152723449, "learning_rate": 1.725429144036797e-05, - "loss": 0.5189, - "mean_token_accuracy": 0.8335543578656327, + "loss": 0.4146, + "mean_token_accuracy": 0.8578385115822528, "step": 233 }, { "epoch": 0.3195357173337885, - "grad_norm": 0.13521412273746158, + "grad_norm": 0.34263046362886657, "learning_rate": 1.722134644629807e-05, - "loss": 0.521, - "mean_token_accuracy": 0.8325514563600711, + "loss": 0.4142, + "mean_token_accuracy": 0.8586846673999058, "step": 234 }, { "epoch": 0.3209012545873517, - "grad_norm": 0.13512653975156758, + "grad_norm": 0.33342256217650906, "learning_rate": 1.7188236838779297e-05, - "loss": 0.5519, - "mean_token_accuracy": 0.8256504099616956, + "loss": 0.44, + "mean_token_accuracy": 0.851783206820397, "step": 235 }, { "epoch": 0.3222667918409149, - "grad_norm": 0.14345905813078963, + "grad_norm": 0.3442067446188413, "learning_rate": 1.7154963372558246e-05, - "loss": 0.5528, - "mean_token_accuracy": 0.8227875858666639, + "loss": 0.4415, + "mean_token_accuracy": 0.8494488715634262, "step": 236 }, { "epoch": 0.3236323290944781, - "grad_norm": 0.1463689921220358, + "grad_norm": 0.39286054840274137, "learning_rate": 1.712152680611675e-05, - "loss": 0.5564, - "mean_token_accuracy": 0.8257682915202985, + "loss": 0.4446, + "mean_token_accuracy": 0.8519838146425381, "step": 237 }, { "epoch": 0.3249978663480413, - "grad_norm": 0.12768060963486952, + "grad_norm": 0.3707489334417565, "learning_rate": 1.7087927901654558e-05, - "loss": 0.5051, - "mean_token_accuracy": 0.837880739143873, + "loss": 0.4018, + "mean_token_accuracy": 0.8628199801152938, "step": 238 }, { "epoch": 0.3263634036016045, - "grad_norm": 0.1384557966643483, + "grad_norm": 0.32915317985042997, "learning_rate": 1.7054167425071995e-05, - "loss": 0.5324, - "mean_token_accuracy": 0.832886860766279, + "loss": 0.4253, + "mean_token_accuracy": 0.8583234741171639, "step": 239 }, { "epoch": 0.3277289408551677, - "grad_norm": 0.1380342523807595, + "grad_norm": 0.3582433826817124, "learning_rate": 1.702024614595248e-05, - "loss": 0.5322, - "mean_token_accuracy": 0.8316455665369258, + "loss": 0.4245, + "mean_token_accuracy": 0.8573367967440888, "step": 240 }, { "epoch": 0.3290944781087309, - "grad_norm": 0.13818609305567817, + "grad_norm": 0.33842853808026, "learning_rate": 1.6986164837544987e-05, - "loss": 0.5235, - "mean_token_accuracy": 0.8335774370487075, + "loss": 0.4179, + "mean_token_accuracy": 0.8586180123428547, "step": 241 }, { "epoch": 0.3304600153622941, - "grad_norm": 0.1393335965855472, + "grad_norm": 0.2994720944821025, "learning_rate": 1.6951924276746425e-05, - "loss": 0.5483, - "mean_token_accuracy": 0.824988575113149, + "loss": 0.4362, + "mean_token_accuracy": 0.8523116286165904, "step": 242 }, { "epoch": 0.3318255526158573, - "grad_norm": 0.1386967491911664, + "grad_norm": 0.3387086926170212, "learning_rate": 1.6917525244083918e-05, - "loss": 0.5673, - "mean_token_accuracy": 0.820593391134776, + "loss": 0.4512, + "mean_token_accuracy": 0.8479398316838589, "step": 243 }, { "epoch": 0.3331910898694205, - "grad_norm": 0.1348902659133689, + "grad_norm": 0.28444793823353953, "learning_rate": 1.688296852369703e-05, - "loss": 0.5316, - "mean_token_accuracy": 0.831235738948778, + "loss": 0.4227, + "mean_token_accuracy": 0.857218350086272, "step": 244 }, { "epoch": 0.3345566271229837, - "grad_norm": 0.1409750851896132, + "grad_norm": 0.32234365149946503, "learning_rate": 1.6848254903319866e-05, - "loss": 0.5413, - "mean_token_accuracy": 0.8278208574506054, + "loss": 0.433, + "mean_token_accuracy": 0.8537039316837896, "step": 245 }, { "epoch": 0.3359221643765469, - "grad_norm": 0.1372049694747267, + "grad_norm": 0.3091069266476509, "learning_rate": 1.6813385174263137e-05, - "loss": 0.5246, - "mean_token_accuracy": 0.8329100526013489, + "loss": 0.4195, + "mean_token_accuracy": 0.8581039754478647, "step": 246 }, { "epoch": 0.33728770163011007, - "grad_norm": 0.12964825970588947, + "grad_norm": 0.3640194501208851, "learning_rate": 1.677836013139611e-05, - "loss": 0.536, - "mean_token_accuracy": 0.8300777214742293, + "loss": 0.4255, + "mean_token_accuracy": 0.856087771880514, "step": 247 }, { "epoch": 0.33865323888367327, - "grad_norm": 0.12970311633521314, + "grad_norm": 0.2842912100028283, "learning_rate": 1.6743180573128494e-05, - "loss": 0.5082, - "mean_token_accuracy": 0.8374251569908495, + "loss": 0.4063, + "mean_token_accuracy": 0.8612077361066686, "step": 248 }, { "epoch": 0.34001877613723647, - "grad_norm": 0.1245639001471319, + "grad_norm": 0.33927830284172017, "learning_rate": 1.6707847301392237e-05, - "loss": 0.5437, - "mean_token_accuracy": 0.8280258592791195, + "loss": 0.4328, + "mean_token_accuracy": 0.854018667863303, "step": 249 }, { "epoch": 0.34138431339079967, - "grad_norm": 0.13742349653793295, + "grad_norm": 0.3380064283112177, "learning_rate": 1.6672361121623238e-05, - "loss": 0.5232, - "mean_token_accuracy": 0.8335207184911094, + "loss": 0.4196, + "mean_token_accuracy": 0.858289578175996, "step": 250 }, { "epoch": 0.3427498506443629, - "grad_norm": 0.1310310939151256, + "grad_norm": 0.2921545005723523, "learning_rate": 1.6636722842743013e-05, - "loss": 0.5242, - "mean_token_accuracy": 0.8322599980197863, + "loss": 0.418, + "mean_token_accuracy": 0.8576834083602248, "step": 251 }, { "epoch": 0.3441153878979261, - "grad_norm": 0.14574183206257008, + "grad_norm": 0.30696667285073703, "learning_rate": 1.660093327714022e-05, - "loss": 0.5285, - "mean_token_accuracy": 0.8321282197576302, + "loss": 0.4223, + "mean_token_accuracy": 0.8571891667514102, "step": 252 }, { "epoch": 0.3454809251514893, - "grad_norm": 0.1265831288052668, + "grad_norm": 0.2692563224404543, "learning_rate": 1.656499324065217e-05, - "loss": 0.5319, - "mean_token_accuracy": 0.8311278929692987, + "loss": 0.424, + "mean_token_accuracy": 0.8564762224897754, "step": 253 }, { "epoch": 0.3468464624050525, - "grad_norm": 0.1291730547708423, + "grad_norm": 0.2902797695424939, "learning_rate": 1.6528903552546207e-05, - "loss": 0.5424, - "mean_token_accuracy": 0.8276399440032848, + "loss": 0.4348, + "mean_token_accuracy": 0.8531339042109244, "step": 254 }, { "epoch": 0.3482119996586157, - "grad_norm": 0.1296501360735472, + "grad_norm": 0.3054595022905831, "learning_rate": 1.6492665035501048e-05, - "loss": 0.5404, - "mean_token_accuracy": 0.8294862171172406, + "loss": 0.4304, + "mean_token_accuracy": 0.8550912353650719, "step": 255 }, { "epoch": 0.3495775369121789, - "grad_norm": 0.13208625553079495, + "grad_norm": 0.30207364986816204, "learning_rate": 1.6456278515588023e-05, - "loss": 0.5234, - "mean_token_accuracy": 0.8322679188700899, + "loss": 0.4185, + "mean_token_accuracy": 0.8576527365281009, "step": 256 }, { "epoch": 0.3509430741657421, - "grad_norm": 0.12793442244118794, + "grad_norm": 0.29635598975960087, "learning_rate": 1.6419744822252255e-05, - "loss": 0.5103, - "mean_token_accuracy": 0.8370114623906487, + "loss": 0.4078, + "mean_token_accuracy": 0.8610621124104776, "step": 257 }, { "epoch": 0.3523086114193053, - "grad_norm": 0.13654320647514265, + "grad_norm": 0.31915252694298046, "learning_rate": 1.638306478829373e-05, - "loss": 0.5307, - "mean_token_accuracy": 0.8305309850743029, + "loss": 0.4231, + "mean_token_accuracy": 0.8561889856733563, "step": 258 }, { "epoch": 0.3536741486728685, - "grad_norm": 0.1331023584849128, + "grad_norm": 0.31923467215480067, "learning_rate": 1.634623924984833e-05, - "loss": 0.5186, - "mean_token_accuracy": 0.8341979473966676, + "loss": 0.4135, + "mean_token_accuracy": 0.8592538002988109, "step": 259 }, { "epoch": 0.3550396859264317, - "grad_norm": 0.13408398239780045, + "grad_norm": 0.30692282494391426, "learning_rate": 1.6309269046368777e-05, - "loss": 0.4963, - "mean_token_accuracy": 0.8402015739091936, + "loss": 0.3966, + "mean_token_accuracy": 0.8640884833628784, "step": 260 }, { "epoch": 0.3564052231799949, - "grad_norm": 0.1309100280042118, + "grad_norm": 0.27482122390015884, "learning_rate": 1.627215502060548e-05, - "loss": 0.5279, - "mean_token_accuracy": 0.8308650292135563, + "loss": 0.4216, + "mean_token_accuracy": 0.8559256932307063, "step": 261 }, { "epoch": 0.3577707604335581, - "grad_norm": 0.13824315213318378, + "grad_norm": 0.34179806060032714, "learning_rate": 1.6234898018587336e-05, - "loss": 0.5678, - "mean_token_accuracy": 0.8225358707194258, + "loss": 0.4541, + "mean_token_accuracy": 0.8491678521649152, "step": 262 }, { "epoch": 0.3591362976871213, - "grad_norm": 0.13993255886768752, + "grad_norm": 0.3011617462318224, "learning_rate": 1.619749888960245e-05, - "loss": 0.5298, - "mean_token_accuracy": 0.8306238322090179, + "loss": 0.4261, + "mean_token_accuracy": 0.8554996248647118, "step": 263 }, { "epoch": 0.3605018349406845, - "grad_norm": 0.14434033147079306, + "grad_norm": 0.31807503774612333, "learning_rate": 1.615995848617876e-05, - "loss": 0.5394, - "mean_token_accuracy": 0.8312612031555663, + "loss": 0.4318, + "mean_token_accuracy": 0.8559916893442254, "step": 264 }, { "epoch": 0.3618673721942477, - "grad_norm": 0.13312971403681256, + "grad_norm": 0.3316772571952849, "learning_rate": 1.612227766406461e-05, - "loss": 0.5138, - "mean_token_accuracy": 0.836549181309043, + "loss": 0.4114, + "mean_token_accuracy": 0.8605685266181821, "step": 265 }, { "epoch": 0.3632329094478109, - "grad_norm": 0.13533876168275283, + "grad_norm": 0.2831512502783301, "learning_rate": 1.6084457282209244e-05, - "loss": 0.5209, - "mean_token_accuracy": 0.833533133714167, + "loss": 0.4182, + "mean_token_accuracy": 0.8572316390854738, "step": 266 }, { "epoch": 0.3645984467013741, - "grad_norm": 0.14135377592163673, + "grad_norm": 0.3020965586894018, "learning_rate": 1.6046498202743232e-05, - "loss": 0.5339, - "mean_token_accuracy": 0.8298975967797028, + "loss": 0.4278, + "mean_token_accuracy": 0.8552705974923581, "step": 267 }, { "epoch": 0.3659639839549373, - "grad_norm": 0.1292025016784794, + "grad_norm": 0.34579468143149206, "learning_rate": 1.6008401290958806e-05, - "loss": 0.5302, - "mean_token_accuracy": 0.8321701302649457, + "loss": 0.4224, + "mean_token_accuracy": 0.8574742487392688, "step": 268 }, { "epoch": 0.3673295212085005, - "grad_norm": 0.12796903961128375, + "grad_norm": 0.3213456748377343, "learning_rate": 1.5970167415290142e-05, - "loss": 0.5262, - "mean_token_accuracy": 0.833957803455181, + "loss": 0.4191, + "mean_token_accuracy": 0.8592830241547558, "step": 269 }, { "epoch": 0.3686950584620637, - "grad_norm": 0.13268559783834463, + "grad_norm": 0.27169537891651463, "learning_rate": 1.5931797447293553e-05, - "loss": 0.5317, - "mean_token_accuracy": 0.8311859398869986, + "loss": 0.4257, + "mean_token_accuracy": 0.8565419088799743, "step": 270 }, { "epoch": 0.3700605957156269, - "grad_norm": 0.14473734167669677, + "grad_norm": 0.31485256379338494, "learning_rate": 1.5893292261627644e-05, - "loss": 0.5269, - "mean_token_accuracy": 0.8331144081686328, + "loss": 0.4211, + "mean_token_accuracy": 0.8583665894914854, "step": 271 }, { "epoch": 0.3714261329691901, - "grad_norm": 0.12956186354881885, + "grad_norm": 0.34197779647959986, "learning_rate": 1.5854652736033353e-05, - "loss": 0.5263, - "mean_token_accuracy": 0.8337743696966373, + "loss": 0.4199, + "mean_token_accuracy": 0.8586362735400643, "step": 272 }, { "epoch": 0.3727916702227533, - "grad_norm": 0.12550781561940716, + "grad_norm": 0.3463622744135729, "learning_rate": 1.5815879751313957e-05, - "loss": 0.5185, - "mean_token_accuracy": 0.8350405018692312, + "loss": 0.4153, + "mean_token_accuracy": 0.8590972009223808, "step": 273 }, { "epoch": 0.3741572074763165, - "grad_norm": 0.13990083681296156, + "grad_norm": 0.2854754568932129, "learning_rate": 1.577697419131497e-05, - "loss": 0.5357, - "mean_token_accuracy": 0.8305206475990355, + "loss": 0.4286, + "mean_token_accuracy": 0.8549681291103441, "step": 274 }, { "epoch": 0.3755227447298797, - "grad_norm": 0.12551540985974136, + "grad_norm": 0.3612800362541057, "learning_rate": 1.5737936942904025e-05, - "loss": 0.5207, - "mean_token_accuracy": 0.8339303168994808, + "loss": 0.4164, + "mean_token_accuracy": 0.8589556245901419, "step": 275 }, { "epoch": 0.3768882819834429, - "grad_norm": 0.13374161323482353, + "grad_norm": 0.3095104128297478, "learning_rate": 1.5698768895950644e-05, - "loss": 0.5234, - "mean_token_accuracy": 0.8325287278941546, + "loss": 0.4196, + "mean_token_accuracy": 0.8573392516346519, "step": 276 }, { "epoch": 0.3782538192370061, - "grad_norm": 0.13795675279223626, + "grad_norm": 0.28011182927413847, "learning_rate": 1.5659470943305956e-05, - "loss": 0.5264, - "mean_token_accuracy": 0.8325345315238528, + "loss": 0.4207, + "mean_token_accuracy": 0.8575836185111236, "step": 277 }, { "epoch": 0.37961935649056927, - "grad_norm": 0.13832170843978162, + "grad_norm": 0.38665599972326675, "learning_rate": 1.5620043980782327e-05, - "loss": 0.5278, - "mean_token_accuracy": 0.8330776082591373, + "loss": 0.4228, + "mean_token_accuracy": 0.8577991932053891, "step": 278 }, { "epoch": 0.38098489374413247, - "grad_norm": 0.1348905020133406, + "grad_norm": 0.3045801906729731, "learning_rate": 1.5580488907132972e-05, - "loss": 0.5143, - "mean_token_accuracy": 0.8346165601417777, + "loss": 0.4127, + "mean_token_accuracy": 0.859320001053601, "step": 279 }, { "epoch": 0.38235043099769567, - "grad_norm": 0.13288114126064224, + "grad_norm": 0.3027464188711677, "learning_rate": 1.554080662403144e-05, - "loss": 0.5425, - "mean_token_accuracy": 0.8271110319649874, + "loss": 0.4344, + "mean_token_accuracy": 0.8530298389418411, "step": 280 }, { "epoch": 0.38371596825125887, - "grad_norm": 0.1346590215668793, + "grad_norm": 0.30735481784619834, "learning_rate": 1.5500998036051075e-05, - "loss": 0.5277, - "mean_token_accuracy": 0.8316150563301141, + "loss": 0.4226, + "mean_token_accuracy": 0.8564421780464974, "step": 281 }, { "epoch": 0.38508150550482206, - "grad_norm": 0.15061773684600566, + "grad_norm": 0.28386427155284427, "learning_rate": 1.546106405064438e-05, - "loss": 0.555, - "mean_token_accuracy": 0.8250145920273312, + "loss": 0.4447, + "mean_token_accuracy": 0.8507831934070248, "step": 282 }, { "epoch": 0.38644704275838526, - "grad_norm": 0.12627562685090993, + "grad_norm": 0.3157137742956653, "learning_rate": 1.5421005578122356e-05, - "loss": 0.5324, - "mean_token_accuracy": 0.832031101737563, + "loss": 0.4264, + "mean_token_accuracy": 0.8566646495893814, "step": 283 }, { "epoch": 0.38781258001194846, - "grad_norm": 0.13766063954250282, + "grad_norm": 0.3109770970705045, "learning_rate": 1.5380823531633727e-05, - "loss": 0.532, - "mean_token_accuracy": 0.8327597499222905, + "loss": 0.4252, + "mean_token_accuracy": 0.8575001524672929, "step": 284 }, { "epoch": 0.38917811726551166, - "grad_norm": 0.13610086925215903, + "grad_norm": 0.3467377136844463, "learning_rate": 1.5340518827144145e-05, - "loss": 0.5294, - "mean_token_accuracy": 0.8331457763615328, + "loss": 0.4233, + "mean_token_accuracy": 0.8582951077732086, "step": 285 }, { "epoch": 0.39054365451907486, - "grad_norm": 0.1403431841650869, + "grad_norm": 0.3381885844162203, "learning_rate": 1.5300092383415282e-05, - "loss": 0.5364, - "mean_token_accuracy": 0.8294932707717552, + "loss": 0.4289, + "mean_token_accuracy": 0.8549039858412707, "step": 286 }, { "epoch": 0.39190919177263805, - "grad_norm": 0.14055359264786835, + "grad_norm": 0.3297678208614477, "learning_rate": 1.525954512198392e-05, - "loss": 0.5114, - "mean_token_accuracy": 0.8367588685914181, + "loss": 0.4096, + "mean_token_accuracy": 0.8613219951830501, "step": 287 }, { "epoch": 0.39327472902620125, - "grad_norm": 0.13198529747363136, + "grad_norm": 0.3368037995093397, "learning_rate": 1.5218877967140921e-05, - "loss": 0.5278, - "mean_token_accuracy": 0.8304756425209355, + "loss": 0.4224, + "mean_token_accuracy": 0.8548496161946588, "step": 288 }, { "epoch": 0.39464026627976445, - "grad_norm": 0.13097614836877364, + "grad_norm": 0.3212698070828666, "learning_rate": 1.517809184591017e-05, - "loss": 0.5347, - "mean_token_accuracy": 0.8319387441414573, + "loss": 0.4263, + "mean_token_accuracy": 0.8574614580441292, "step": 289 }, { "epoch": 0.39600580353332765, - "grad_norm": 0.14076185155875176, + "grad_norm": 0.40537049798129476, "learning_rate": 1.5137187688027437e-05, - "loss": 0.5366, - "mean_token_accuracy": 0.8297177979604398, + "loss": 0.43, + "mean_token_accuracy": 0.8550197749735438, "step": 290 }, { "epoch": 0.39737134078689085, - "grad_norm": 0.13037051080567982, + "grad_norm": 0.31835336382101076, "learning_rate": 1.5096166425919176e-05, - "loss": 0.5365, - "mean_token_accuracy": 0.8296294864528059, + "loss": 0.4298, + "mean_token_accuracy": 0.8548674541736617, "step": 291 }, { "epoch": 0.39873687804045405, - "grad_norm": 0.13283518111540674, + "grad_norm": 0.34618990419302037, "learning_rate": 1.5055028994681284e-05, - "loss": 0.5106, - "mean_token_accuracy": 0.8372287828384319, + "loss": 0.409, + "mean_token_accuracy": 0.8607840254015119, "step": 292 }, { "epoch": 0.40010241529401724, - "grad_norm": 0.13466309713461214, + "grad_norm": 0.350507807135204, "learning_rate": 1.5013776332057786e-05, - "loss": 0.5174, - "mean_token_accuracy": 0.8336355354085478, + "loss": 0.4151, + "mean_token_accuracy": 0.8577964045490754, "step": 293 }, { "epoch": 0.40146795254758044, - "grad_norm": 0.1370911905979327, + "grad_norm": 0.30189871647045075, "learning_rate": 1.4972409378419439e-05, - "loss": 0.5118, - "mean_token_accuracy": 0.8355702104063977, + "loss": 0.4094, + "mean_token_accuracy": 0.8607436168222747, "step": 294 }, { "epoch": 0.40283348980114364, - "grad_norm": 0.12870992649712387, + "grad_norm": 0.3016208540740829, "learning_rate": 1.4930929076742317e-05, - "loss": 0.5207, - "mean_token_accuracy": 0.8345088456450936, + "loss": 0.4151, + "mean_token_accuracy": 0.8595533954555142, "step": 295 }, { "epoch": 0.40419902705470684, - "grad_norm": 0.12652682391227446, + "grad_norm": 0.29033881446311255, "learning_rate": 1.4889336372586305e-05, - "loss": 0.4884, - "mean_token_accuracy": 0.8406147215051577, + "loss": 0.3928, + "mean_token_accuracy": 0.8637799769336061, "step": 296 }, { "epoch": 0.40556456430827004, - "grad_norm": 0.1273322818382945, + "grad_norm": 0.3668162312007593, "learning_rate": 1.4847632214073548e-05, - "loss": 0.5502, - "mean_token_accuracy": 0.8246969084417677, + "loss": 0.4426, + "mean_token_accuracy": 0.8499288824377587, "step": 297 }, { "epoch": 0.40693010156183324, - "grad_norm": 0.12578029632091806, + "grad_norm": 0.32449852074777696, "learning_rate": 1.4805817551866839e-05, - "loss": 0.5098, - "mean_token_accuracy": 0.8361054765788098, + "loss": 0.4084, + "mean_token_accuracy": 0.8605696761410078, "step": 298 }, { "epoch": 0.40829563881539643, - "grad_norm": 0.13335147445191292, + "grad_norm": 0.2970564472354985, "learning_rate": 1.4763893339147942e-05, - "loss": 0.5243, - "mean_token_accuracy": 0.8317951337067153, + "loss": 0.418, + "mean_token_accuracy": 0.8573568392411037, "step": 299 }, { "epoch": 0.40966117606895963, - "grad_norm": 0.12476871003248997, + "grad_norm": 0.32582221983564175, "learning_rate": 1.4721860531595868e-05, - "loss": 0.5168, - "mean_token_accuracy": 0.8358361965934906, + "loss": 0.4155, + "mean_token_accuracy": 0.8602134585041119, "step": 300 }, { "epoch": 0.41102671332252283, - "grad_norm": 0.12560165059823447, + "grad_norm": 0.3089086009639669, "learning_rate": 1.4679720087365097e-05, - "loss": 0.5121, - "mean_token_accuracy": 0.8367098302068718, + "loss": 0.4099, + "mean_token_accuracy": 0.8610357533905291, "step": 301 }, { "epoch": 0.41239225057608603, - "grad_norm": 0.1360386881744858, + "grad_norm": 0.32593412647065384, "learning_rate": 1.4637472967063721e-05, - "loss": 0.5484, - "mean_token_accuracy": 0.8274633241768209, + "loss": 0.4383, + "mean_token_accuracy": 0.8527080209938287, "step": 302 }, { "epoch": 0.4137577878296492, - "grad_norm": 0.12687854118253483, + "grad_norm": 0.3277327156387932, "learning_rate": 1.4595120133731564e-05, - "loss": 0.5234, - "mean_token_accuracy": 0.8317635075122475, + "loss": 0.4181, + "mean_token_accuracy": 0.8569046290075678, "step": 303 }, { "epoch": 0.4151233250832124, - "grad_norm": 0.13350907077502042, + "grad_norm": 0.3730986070237096, "learning_rate": 1.4552662552818211e-05, - "loss": 0.5524, - "mean_token_accuracy": 0.824538929978512, + "loss": 0.4433, + "mean_token_accuracy": 0.8504104581423189, "step": 304 }, { "epoch": 0.4164888623367756, - "grad_norm": 0.1310118836769573, + "grad_norm": 0.28719200698202685, "learning_rate": 1.451010119216102e-05, - "loss": 0.5462, - "mean_token_accuracy": 0.8254162477467392, + "loss": 0.4369, + "mean_token_accuracy": 0.8519207939028978, "step": 305 }, { "epoch": 0.4178543995903388, - "grad_norm": 0.13881193250236018, + "grad_norm": 0.2597878117331174, "learning_rate": 1.446743702196304e-05, - "loss": 0.5231, - "mean_token_accuracy": 0.8325016591163955, + "loss": 0.4187, + "mean_token_accuracy": 0.8574448859814668, "step": 306 }, { "epoch": 0.419219936843902, - "grad_norm": 0.142183260555802, + "grad_norm": 0.34783364098922354, "learning_rate": 1.4424671014770906e-05, - "loss": 0.5426, - "mean_token_accuracy": 0.8277582748779967, + "loss": 0.4359, + "mean_token_accuracy": 0.8523371430850553, "step": 307 }, { "epoch": 0.4205854740974652, - "grad_norm": 0.12564549406602085, + "grad_norm": 0.33054277050758557, "learning_rate": 1.4381804145452672e-05, - "loss": 0.5178, - "mean_token_accuracy": 0.8343646234479691, + "loss": 0.4155, + "mean_token_accuracy": 0.8585607772789662, "step": 308 }, { "epoch": 0.4219510113510284, - "grad_norm": 0.131614296336296, + "grad_norm": 0.2495797590927163, "learning_rate": 1.4338837391175582e-05, - "loss": 0.5378, - "mean_token_accuracy": 0.8296358778853029, + "loss": 0.4296, + "mean_token_accuracy": 0.855114414007386, "step": 309 }, { "epoch": 0.4233165486045916, - "grad_norm": 0.1354916208264913, + "grad_norm": 0.35728528385485553, "learning_rate": 1.4295771731383799e-05, - "loss": 0.5419, - "mean_token_accuracy": 0.8286937796926852, + "loss": 0.434, + "mean_token_accuracy": 0.8536842164352073, "step": 310 }, { "epoch": 0.4246820858581548, - "grad_norm": 0.13024030066606623, + "grad_norm": 0.3288240381017802, "learning_rate": 1.4252608147776067e-05, - "loss": 0.5064, - "mean_token_accuracy": 0.8371588149619154, + "loss": 0.4051, + "mean_token_accuracy": 0.8619649344224427, "step": 311 }, { "epoch": 0.426047623111718, - "grad_norm": 0.13786567544872633, + "grad_norm": 0.3883949212610513, "learning_rate": 1.4209347624283352e-05, - "loss": 0.5388, - "mean_token_accuracy": 0.8288873420667994, + "loss": 0.4325, + "mean_token_accuracy": 0.8541025576014111, "step": 312 }, { "epoch": 0.4274131603652812, - "grad_norm": 0.12612482670968025, + "grad_norm": 0.3012394552571007, "learning_rate": 1.4165991147046404e-05, - "loss": 0.5135, - "mean_token_accuracy": 0.8361618123710812, + "loss": 0.4121, + "mean_token_accuracy": 0.8598195582006549, "step": 313 }, { "epoch": 0.4287786976188444, - "grad_norm": 0.13605341847031804, + "grad_norm": 0.3197771206984412, "learning_rate": 1.4122539704393265e-05, - "loss": 0.5293, - "mean_token_accuracy": 0.8317211138785928, + "loss": 0.4237, + "mean_token_accuracy": 0.8560034937668209, "step": 314 }, { "epoch": 0.4301442348724076, - "grad_norm": 0.13624193705017706, + "grad_norm": 0.3057046597401257, "learning_rate": 1.4078994286816768e-05, - "loss": 0.5375, - "mean_token_accuracy": 0.8280321905543968, + "loss": 0.4329, + "mean_token_accuracy": 0.853573226005123, "step": 315 }, { "epoch": 0.4315097721259708, - "grad_norm": 0.1388437047272556, + "grad_norm": 0.3422003725649723, "learning_rate": 1.4035355886951924e-05, - "loss": 0.5418, - "mean_token_accuracy": 0.8272199497561654, + "loss": 0.4327, + "mean_token_accuracy": 0.8533252072862426, "step": 316 }, { "epoch": 0.432875309379534, - "grad_norm": 0.1303163735373356, + "grad_norm": 0.2909937625605861, "learning_rate": 1.3991625499553325e-05, - "loss": 0.5458, - "mean_token_accuracy": 0.8271786427469427, + "loss": 0.4373, + "mean_token_accuracy": 0.8526268972423322, "step": 317 }, { "epoch": 0.4342408466330972, - "grad_norm": 0.13101931330237016, + "grad_norm": 0.2810097508624511, "learning_rate": 1.3947804121472453e-05, - "loss": 0.5257, - "mean_token_accuracy": 0.8333335522196373, + "loss": 0.4209, + "mean_token_accuracy": 0.8580495358481994, "step": 318 }, { "epoch": 0.4356063838866604, - "grad_norm": 0.1272712101768958, + "grad_norm": 0.29587543321646376, "learning_rate": 1.3903892751634949e-05, - "loss": 0.5224, - "mean_token_accuracy": 0.833558794924454, + "loss": 0.4181, + "mean_token_accuracy": 0.8580909047258651, "step": 319 }, { "epoch": 0.4369719211402236, - "grad_norm": 0.12945798987629895, + "grad_norm": 0.2803397206271039, "learning_rate": 1.3859892391017867e-05, - "loss": 0.4961, - "mean_token_accuracy": 0.840295010373554, + "loss": 0.398, + "mean_token_accuracy": 0.8636022295266738, "step": 320 }, { "epoch": 0.4383374583937868, - "grad_norm": 0.13144480212466458, + "grad_norm": 0.3031649571676904, "learning_rate": 1.3815804042626828e-05, - "loss": 0.5387, - "mean_token_accuracy": 0.8287325042225203, + "loss": 0.4316, + "mean_token_accuracy": 0.8540270034472657, "step": 321 }, { "epoch": 0.43970299564735, - "grad_norm": 0.14405221685204356, + "grad_norm": 0.2868053671287889, "learning_rate": 1.3771628711473173e-05, - "loss": 0.5574, - "mean_token_accuracy": 0.8241782976096302, + "loss": 0.4458, + "mean_token_accuracy": 0.8500793417904773, "step": 322 }, { "epoch": 0.4410685329009132, - "grad_norm": 0.1302459258800379, + "grad_norm": 0.2607596548746336, "learning_rate": 1.3727367404551055e-05, - "loss": 0.5217, - "mean_token_accuracy": 0.8332794042038847, + "loss": 0.4194, + "mean_token_accuracy": 0.85756931958572, "step": 323 }, { "epoch": 0.4424340701544764, - "grad_norm": 0.12193833817974467, + "grad_norm": 0.31970960349713734, "learning_rate": 1.368302113081447e-05, - "loss": 0.4954, - "mean_token_accuracy": 0.8406406735713372, + "loss": 0.3988, + "mean_token_accuracy": 0.8640139707624799, "step": 324 }, { "epoch": 0.4437996074080396, - "grad_norm": 0.1306720840851277, + "grad_norm": 0.3073106580181061, "learning_rate": 1.3638590901154276e-05, - "loss": 0.5194, - "mean_token_accuracy": 0.8344783438317134, + "loss": 0.4158, + "mean_token_accuracy": 0.8588068782465295, "step": 325 }, { "epoch": 0.4451651446616028, - "grad_norm": 0.13070814790484123, + "grad_norm": 0.30286141069120004, "learning_rate": 1.3594077728375129e-05, - "loss": 0.5244, - "mean_token_accuracy": 0.8327654377281789, + "loss": 0.4201, + "mean_token_accuracy": 0.8569773919052446, "step": 326 }, { "epoch": 0.446530681915166, - "grad_norm": 0.12981282999883934, + "grad_norm": 0.279965967756229, "learning_rate": 1.3549482627172412e-05, - "loss": 0.5389, - "mean_token_accuracy": 0.827943125036659, + "loss": 0.432, + "mean_token_accuracy": 0.8531155705252534, "step": 327 }, { "epoch": 0.4478962191687292, - "grad_norm": 0.1281772727499796, + "grad_norm": 0.30169596805937576, "learning_rate": 1.3504806614109098e-05, - "loss": 0.5242, - "mean_token_accuracy": 0.8319693088071065, + "loss": 0.4207, + "mean_token_accuracy": 0.8567625209561366, "step": 328 }, { "epoch": 0.4492617564222924, - "grad_norm": 0.12203075013857725, + "grad_norm": 0.3459438236471543, "learning_rate": 1.3460050707592581e-05, - "loss": 0.5398, - "mean_token_accuracy": 0.8282606677333658, + "loss": 0.4336, + "mean_token_accuracy": 0.8529489776321926, "step": 329 }, { "epoch": 0.4506272936758556, - "grad_norm": 0.12075540697813768, + "grad_norm": 0.30726581139747894, "learning_rate": 1.341521592785145e-05, - "loss": 0.4998, - "mean_token_accuracy": 0.8389237719024183, + "loss": 0.399, + "mean_token_accuracy": 0.8630228365259139, "step": 330 }, { "epoch": 0.4519928309294188, - "grad_norm": 0.12195154292538155, + "grad_norm": 0.3053218799713375, "learning_rate": 1.3370303296912248e-05, - "loss": 0.5476, - "mean_token_accuracy": 0.8261179603133416, + "loss": 0.4382, + "mean_token_accuracy": 0.8519843865306225, "step": 331 }, { "epoch": 0.453358368182982, - "grad_norm": 0.12404959831730661, + "grad_norm": 0.34629772646315893, "learning_rate": 1.332531383857616e-05, - "loss": 0.5093, - "mean_token_accuracy": 0.8379276810692828, + "loss": 0.4112, + "mean_token_accuracy": 0.860751373812898, "step": 332 }, { "epoch": 0.4547239054365452, - "grad_norm": 0.12491694954059616, + "grad_norm": 0.35416501044407706, "learning_rate": 1.328024857839569e-05, - "loss": 0.5201, - "mean_token_accuracy": 0.8326835859887554, + "loss": 0.4179, + "mean_token_accuracy": 0.857242064978725, "step": 333 }, { "epoch": 0.45608944269010837, - "grad_norm": 0.11851487904684899, + "grad_norm": 0.30654844603106524, "learning_rate": 1.3235108543651272e-05, - "loss": 0.517, - "mean_token_accuracy": 0.8355242129952429, + "loss": 0.4152, + "mean_token_accuracy": 0.8594253890394505, "step": 334 }, { "epoch": 0.45745497994367157, - "grad_norm": 0.13145408458960237, + "grad_norm": 0.35789751735364733, "learning_rate": 1.3189894763327851e-05, - "loss": 0.5437, - "mean_token_accuracy": 0.8265899295891007, + "loss": 0.4352, + "mean_token_accuracy": 0.8524388902413612, "step": 335 }, { "epoch": 0.45882051719723477, - "grad_norm": 0.13049785926043753, + "grad_norm": 0.27296525421750206, "learning_rate": 1.3144608268091435e-05, - "loss": 0.5139, - "mean_token_accuracy": 0.8356139492918141, + "loss": 0.4125, + "mean_token_accuracy": 0.8600694337472964, "step": 336 }, { "epoch": 0.46018605445079797, - "grad_norm": 0.1197085895490477, + "grad_norm": 0.35167998845661236, "learning_rate": 1.3099250090265599e-05, - "loss": 0.5248, - "mean_token_accuracy": 0.832658521262706, + "loss": 0.4212, + "mean_token_accuracy": 0.8573687132989843, "step": 337 }, { "epoch": 0.46155159170436116, - "grad_norm": 0.1244653956373087, + "grad_norm": 0.3399733274636042, "learning_rate": 1.3053821263807947e-05, - "loss": 0.5146, - "mean_token_accuracy": 0.8345009748898053, + "loss": 0.4133, + "mean_token_accuracy": 0.8588422955112552, "step": 338 }, { "epoch": 0.46291712895792436, - "grad_norm": 0.13270218133148493, + "grad_norm": 0.2843523201867168, "learning_rate": 1.3008322824286554e-05, - "loss": 0.5255, - "mean_token_accuracy": 0.8337560022891326, + "loss": 0.4217, + "mean_token_accuracy": 0.8581068469248813, "step": 339 }, { "epoch": 0.46428266621148756, - "grad_norm": 0.12549771342759378, + "grad_norm": 0.25579241148803916, "learning_rate": 1.2962755808856341e-05, - "loss": 0.5421, - "mean_token_accuracy": 0.8269697928340337, + "loss": 0.4329, + "mean_token_accuracy": 0.8531418182041834, "step": 340 }, { "epoch": 0.46564820346505076, - "grad_norm": 0.11605208324136083, + "grad_norm": 0.2849802262084464, "learning_rate": 1.2917121256235454e-05, - "loss": 0.5039, - "mean_token_accuracy": 0.8384597335722043, + "loss": 0.4063, + "mean_token_accuracy": 0.8619580010780344, "step": 341 }, { "epoch": 0.46701374071861396, - "grad_norm": 0.13033557708645638, + "grad_norm": 0.28376439896546823, "learning_rate": 1.2871420206681573e-05, - "loss": 0.5135, - "mean_token_accuracy": 0.8345709765936421, + "loss": 0.4126, + "mean_token_accuracy": 0.8584155141530699, "step": 342 }, { "epoch": 0.46837927797217715, - "grad_norm": 0.1312073263426736, + "grad_norm": 0.27659846392534093, "learning_rate": 1.2825653701968199e-05, - "loss": 0.5011, - "mean_token_accuracy": 0.8394610380482375, + "loss": 0.4028, + "mean_token_accuracy": 0.8630971886050622, "step": 343 }, { "epoch": 0.46974481522574035, - "grad_norm": 0.1326463605037727, + "grad_norm": 0.2680308561203238, "learning_rate": 1.2779822785360913e-05, - "loss": 0.5114, - "mean_token_accuracy": 0.8353350614674057, + "loss": 0.4106, + "mean_token_accuracy": 0.8594322782022719, "step": 344 }, { "epoch": 0.47111035247930355, - "grad_norm": 0.13266853242085758, + "grad_norm": 0.31082123251549165, "learning_rate": 1.2733928501593587e-05, - "loss": 0.5514, - "mean_token_accuracy": 0.8245865291414911, + "loss": 0.4409, + "mean_token_accuracy": 0.8504463201078798, "step": 345 }, { "epoch": 0.47247588973286675, - "grad_norm": 0.12169495123704742, + "grad_norm": 0.28196867432168504, "learning_rate": 1.2687971896844575e-05, - "loss": 0.5174, - "mean_token_accuracy": 0.8354735378350905, + "loss": 0.4143, + "mean_token_accuracy": 0.8599393422041681, "step": 346 }, { "epoch": 0.47384142698642995, - "grad_norm": 0.13275007373556896, + "grad_norm": 0.30055986798006457, "learning_rate": 1.2641954018712863e-05, - "loss": 0.534, - "mean_token_accuracy": 0.8300335547401384, + "loss": 0.427, + "mean_token_accuracy": 0.8553718131914751, "step": 347 }, { "epoch": 0.47520696423999315, - "grad_norm": 0.12983973745682334, + "grad_norm": 0.3360073322586856, "learning_rate": 1.2595875916194188e-05, - "loss": 0.5393, - "mean_token_accuracy": 0.8259284457158772, + "loss": 0.4325, + "mean_token_accuracy": 0.8513545891221347, "step": 348 }, { "epoch": 0.47657250149355634, - "grad_norm": 0.12942927839384322, + "grad_norm": 0.27942894694642584, "learning_rate": 1.2549738639657117e-05, - "loss": 0.5072, - "mean_token_accuracy": 0.8374654376899043, + "loss": 0.4069, + "mean_token_accuracy": 0.8616844269198048, "step": 349 }, { "epoch": 0.47793803874711954, - "grad_norm": 0.1331286773837372, + "grad_norm": 0.25876338591566067, "learning_rate": 1.2503543240819127e-05, - "loss": 0.5208, - "mean_token_accuracy": 0.8342140807779106, + "loss": 0.4189, + "mean_token_accuracy": 0.8584846388067386, "step": 350 }, { "epoch": 0.4793035760006828, - "grad_norm": 0.12591508579428684, + "grad_norm": 0.38300218448302764, "learning_rate": 1.2457290772722607e-05, - "loss": 0.5242, - "mean_token_accuracy": 0.8322697884876021, + "loss": 0.4207, + "mean_token_accuracy": 0.8563944060284168, "step": 351 }, { "epoch": 0.480669113254246, - "grad_norm": 0.12471785669613117, + "grad_norm": 0.29865375211927264, "learning_rate": 1.2410982289710865e-05, - "loss": 0.5333, - "mean_token_accuracy": 0.8299185043369858, + "loss": 0.4274, + "mean_token_accuracy": 0.8550406509222035, "step": 352 }, { "epoch": 0.4820346505078092, - "grad_norm": 0.12526131742493551, + "grad_norm": 0.29262760726515985, "learning_rate": 1.2364618847404088e-05, - "loss": 0.5195, - "mean_token_accuracy": 0.83249990090411, + "loss": 0.4195, + "mean_token_accuracy": 0.856524809953863, "step": 353 }, { "epoch": 0.4834001877613724, - "grad_norm": 0.13051565626809056, + "grad_norm": 0.33358761083999955, "learning_rate": 1.2318201502675285e-05, - "loss": 0.545, - "mean_token_accuracy": 0.8287826219444361, + "loss": 0.4376, + "mean_token_accuracy": 0.8536589150399464, "step": 354 }, { "epoch": 0.4847657250149356, - "grad_norm": 0.12954210255500737, + "grad_norm": 0.24446626288123202, "learning_rate": 1.227173131362619e-05, - "loss": 0.532, - "mean_token_accuracy": 0.8304259223084238, + "loss": 0.4275, + "mean_token_accuracy": 0.8556876461790799, "step": 355 }, { "epoch": 0.4861312622684988, - "grad_norm": 0.12798318624735636, + "grad_norm": 0.2998978259766614, "learning_rate": 1.2225209339563144e-05, - "loss": 0.5292, - "mean_token_accuracy": 0.8315906612203674, + "loss": 0.4244, + "mean_token_accuracy": 0.8561228530745463, "step": 356 }, { "epoch": 0.487496799522062, - "grad_norm": 0.1348469518980085, + "grad_norm": 0.3206206105210003, "learning_rate": 1.2178636640972954e-05, - "loss": 0.5343, - "mean_token_accuracy": 0.8286647169950767, + "loss": 0.4277, + "mean_token_accuracy": 0.8534380165849528, "step": 357 }, { "epoch": 0.4888623367756252, - "grad_norm": 0.13185607512927003, + "grad_norm": 0.3084930416049475, "learning_rate": 1.2132014279498702e-05, - "loss": 0.5333, - "mean_token_accuracy": 0.8279516590231087, + "loss": 0.4288, + "mean_token_accuracy": 0.8533390096208622, "step": 358 }, { "epoch": 0.4902278740291884, - "grad_norm": 0.12669014190842606, + "grad_norm": 0.28357516576751085, "learning_rate": 1.2085343317915565e-05, - "loss": 0.5271, - "mean_token_accuracy": 0.8320907114002073, + "loss": 0.4243, + "mean_token_accuracy": 0.8562038691159706, "step": 359 }, { "epoch": 0.4915934112827516, - "grad_norm": 0.15080911963297733, + "grad_norm": 0.33187103925787603, "learning_rate": 1.2038624820106572e-05, - "loss": 0.5381, - "mean_token_accuracy": 0.8270864477863897, + "loss": 0.4333, + "mean_token_accuracy": 0.852135547957135, "step": 360 }, { "epoch": 0.4929589485363148, - "grad_norm": 0.13060505380635568, + "grad_norm": 0.2747099173609463, "learning_rate": 1.1991859851038362e-05, - "loss": 0.4982, - "mean_token_accuracy": 0.8399593011308186, + "loss": 0.4007, + "mean_token_accuracy": 0.8638977286823286, "step": 361 }, { "epoch": 0.494324485789878, - "grad_norm": 0.1176053729434065, + "grad_norm": 0.3635110977798377, "learning_rate": 1.1945049476736905e-05, - "loss": 0.4928, - "mean_token_accuracy": 0.8414099586800604, + "loss": 0.3958, + "mean_token_accuracy": 0.8647084604295199, "step": 362 }, { "epoch": 0.4956900230434412, - "grad_norm": 0.12479611398874398, + "grad_norm": 0.28529172664511565, "learning_rate": 1.1898194764263198e-05, - "loss": 0.528, - "mean_token_accuracy": 0.832459221266471, + "loss": 0.4233, + "mean_token_accuracy": 0.8569803575165137, "step": 363 }, { "epoch": 0.4970555602970044, - "grad_norm": 0.1195882007420439, + "grad_norm": 0.27537692377864037, "learning_rate": 1.1851296781688952e-05, - "loss": 0.5065, - "mean_token_accuracy": 0.8363233146305585, + "loss": 0.4073, + "mean_token_accuracy": 0.8600902266025654, "step": 364 }, { "epoch": 0.49842109755056757, - "grad_norm": 0.13615363636056205, + "grad_norm": 0.32325185164148623, "learning_rate": 1.1804356598072223e-05, - "loss": 0.518, - "mean_token_accuracy": 0.8350528306508418, + "loss": 0.4176, + "mean_token_accuracy": 0.8587095805096491, "step": 365 }, { "epoch": 0.49978663480413077, - "grad_norm": 0.11889575553537365, + "grad_norm": 0.2940543368854904, "learning_rate": 1.1757375283433077e-05, - "loss": 0.5232, - "mean_token_accuracy": 0.8325364801944461, + "loss": 0.421, + "mean_token_accuracy": 0.8564938642066868, "step": 366 }, { "epoch": 0.5011521720576939, - "grad_norm": 0.12946265744246763, + "grad_norm": 0.28550857526720214, "learning_rate": 1.1710353908729157e-05, - "loss": 0.5008, - "mean_token_accuracy": 0.8389060818701679, + "loss": 0.4032, + "mean_token_accuracy": 0.8619838649329472, "step": 367 }, { "epoch": 0.5025177093112572, - "grad_norm": 0.1285844955383896, + "grad_norm": 0.29699607198851946, "learning_rate": 1.1663293545831302e-05, - "loss": 0.5168, - "mean_token_accuracy": 0.834698764770248, + "loss": 0.4146, + "mean_token_accuracy": 0.8589313902193905, "step": 368 }, { "epoch": 0.5038832465648203, - "grad_norm": 0.11477318547609064, + "grad_norm": 0.25709714555087276, "learning_rate": 1.1616195267499102e-05, - "loss": 0.5353, - "mean_token_accuracy": 0.8276179863877718, + "loss": 0.4298, + "mean_token_accuracy": 0.8524904329384897, "step": 369 }, { "epoch": 0.5052487838183836, - "grad_norm": 0.11581904693042147, + "grad_norm": 0.2623470396493795, "learning_rate": 1.1569060147356441e-05, - "loss": 0.4996, - "mean_token_accuracy": 0.8383243594659534, + "loss": 0.4004, + "mean_token_accuracy": 0.8624617116263626, "step": 370 }, { "epoch": 0.5066143210719467, - "grad_norm": 0.12112672117965069, + "grad_norm": 0.27349918771238607, "learning_rate": 1.1521889259867032e-05, - "loss": 0.4817, - "mean_token_accuracy": 0.8429841869291526, + "loss": 0.3875, + "mean_token_accuracy": 0.8656588419393155, "step": 371 }, { "epoch": 0.50797985832551, - "grad_norm": 0.12660879585715668, + "grad_norm": 0.2659088919583038, "learning_rate": 1.1474683680309913e-05, - "loss": 0.5236, - "mean_token_accuracy": 0.8329598106853243, + "loss": 0.422, + "mean_token_accuracy": 0.8569432102568713, "step": 372 }, { "epoch": 0.5093453955790731, - "grad_norm": 0.12577821742633621, + "grad_norm": 0.24245036453653618, "learning_rate": 1.1427444484754942e-05, - "loss": 0.5392, - "mean_token_accuracy": 0.8286469267752431, + "loss": 0.4345, + "mean_token_accuracy": 0.8533666506270393, "step": 373 }, { "epoch": 0.5107109328326364, - "grad_norm": 0.12057206683576184, + "grad_norm": 0.3003745935393995, "learning_rate": 1.138017275003827e-05, - "loss": 0.5139, - "mean_token_accuracy": 0.8355391844262096, + "loss": 0.4141, + "mean_token_accuracy": 0.859278335907566, "step": 374 }, { "epoch": 0.5120764700861995, - "grad_norm": 0.12387138353978204, + "grad_norm": 0.26886003162184047, "learning_rate": 1.133286955373779e-05, - "loss": 0.5478, - "mean_token_accuracy": 0.8251846270914437, + "loss": 0.4396, + "mean_token_accuracy": 0.8502673324909059, "step": 375 }, { "epoch": 0.5134420073397628, - "grad_norm": 0.12066234655545796, + "grad_norm": 0.2390413269524559, "learning_rate": 1.1285535974148576e-05, - "loss": 0.4959, - "mean_token_accuracy": 0.841789159468393, + "loss": 0.3997, + "mean_token_accuracy": 0.8639690222498632, "step": 376 }, { "epoch": 0.5148075445933259, - "grad_norm": 0.11790344167481273, + "grad_norm": 0.2544175262225982, "learning_rate": 1.1238173090258292e-05, - "loss": 0.4979, - "mean_token_accuracy": 0.8409620564707888, + "loss": 0.3998, + "mean_token_accuracy": 0.8636630321983972, "step": 377 }, { "epoch": 0.5161730818468891, - "grad_norm": 0.13446008554940664, + "grad_norm": 0.23579375374714043, "learning_rate": 1.1190781981722622e-05, - "loss": 0.5234, - "mean_token_accuracy": 0.832928092648003, + "loss": 0.4213, + "mean_token_accuracy": 0.8574136326780977, "step": 378 }, { "epoch": 0.5175386191004523, - "grad_norm": 0.12028719640187133, + "grad_norm": 0.3009505232094782, "learning_rate": 1.1143363728840626e-05, - "loss": 0.5027, - "mean_token_accuracy": 0.839944369778638, + "loss": 0.4048, + "mean_token_accuracy": 0.8630919807826921, "step": 379 }, { "epoch": 0.5189041563540155, - "grad_norm": 0.13038021174997477, + "grad_norm": 0.3008130036826914, "learning_rate": 1.1095919412530136e-05, - "loss": 0.5168, - "mean_token_accuracy": 0.8335012211586063, + "loss": 0.4162, + "mean_token_accuracy": 0.8576130359385502, "step": 380 }, { "epoch": 0.5202696936075787, - "grad_norm": 0.11937622021666097, + "grad_norm": 0.2952343133789482, "learning_rate": 1.1048450114303111e-05, - "loss": 0.5502, - "mean_token_accuracy": 0.8249481541098718, + "loss": 0.4412, + "mean_token_accuracy": 0.850824829194507, "step": 381 }, { "epoch": 0.5216352308611419, - "grad_norm": 0.11721651007241793, + "grad_norm": 0.258661138214629, "learning_rate": 1.1000956916240985e-05, - "loss": 0.5075, - "mean_token_accuracy": 0.8383965564996401, + "loss": 0.4086, + "mean_token_accuracy": 0.8620580382687907, "step": 382 }, { "epoch": 0.5230007681147051, - "grad_norm": 0.12162492161101192, + "grad_norm": 0.25494116388793103, "learning_rate": 1.0953440900969993e-05, - "loss": 0.5211, - "mean_token_accuracy": 0.8325359297516677, + "loss": 0.4207, + "mean_token_accuracy": 0.8565843185391879, "step": 383 }, { "epoch": 0.5243663053682683, - "grad_norm": 0.12367210143025252, + "grad_norm": 0.27361519707795223, "learning_rate": 1.09059031516365e-05, - "loss": 0.5082, - "mean_token_accuracy": 0.8377869215614929, + "loss": 0.4102, + "mean_token_accuracy": 0.8609546120002327, "step": 384 }, { "epoch": 0.5257318426218315, - "grad_norm": 0.12580667171404072, + "grad_norm": 0.2576222350047768, "learning_rate": 1.0858344751882304e-05, - "loss": 0.5089, - "mean_token_accuracy": 0.8364850763886615, + "loss": 0.4069, + "mean_token_accuracy": 0.860766222892137, "step": 385 }, { "epoch": 0.5270973798753947, - "grad_norm": 0.1219013421078964, + "grad_norm": 0.23779312285209903, "learning_rate": 1.0810766785819947e-05, - "loss": 0.5286, - "mean_token_accuracy": 0.8337185745874656, + "loss": 0.4228, + "mean_token_accuracy": 0.8588812441258473, "step": 386 }, { "epoch": 0.5284629171289579, - "grad_norm": 0.12540119401661845, + "grad_norm": 0.28934124430046615, "learning_rate": 1.0763170338007978e-05, - "loss": 0.5108, - "mean_token_accuracy": 0.8368933982491288, + "loss": 0.4107, + "mean_token_accuracy": 0.8604884551869458, "step": 387 }, { "epoch": 0.5298284543825211, - "grad_norm": 0.11853228557954039, + "grad_norm": 0.2466549449805162, "learning_rate": 1.0715556493426263e-05, - "loss": 0.5303, - "mean_token_accuracy": 0.830896575625901, + "loss": 0.4263, + "mean_token_accuracy": 0.8557152597091608, "step": 388 }, { "epoch": 0.5311939916360843, - "grad_norm": 0.12590678114142295, + "grad_norm": 0.27325712567200283, "learning_rate": 1.0667926337451217e-05, - "loss": 0.5177, - "mean_token_accuracy": 0.8336980904888533, + "loss": 0.4149, + "mean_token_accuracy": 0.8584272502154298, "step": 389 }, { "epoch": 0.5325595288896475, - "grad_norm": 0.12302304116129369, + "grad_norm": 0.25477351099504564, "learning_rate": 1.0620280955831088e-05, - "loss": 0.5148, - "mean_token_accuracy": 0.8356371514760794, + "loss": 0.4138, + "mean_token_accuracy": 0.8593247446112272, "step": 390 }, { "epoch": 0.5339250661432107, - "grad_norm": 0.12738509916842178, + "grad_norm": 0.25978114932856317, "learning_rate": 1.0572621434661201e-05, - "loss": 0.523, - "mean_token_accuracy": 0.8328920732406349, + "loss": 0.4189, + "mean_token_accuracy": 0.8573288866024139, "step": 391 }, { "epoch": 0.5352906033967739, - "grad_norm": 0.12456392800865042, + "grad_norm": 0.26074134187169073, "learning_rate": 1.0524948860359194e-05, - "loss": 0.5328, - "mean_token_accuracy": 0.8314845571385587, + "loss": 0.428, + "mean_token_accuracy": 0.8560287289485933, "step": 392 }, { "epoch": 0.5366561406503371, - "grad_norm": 0.12705186467545376, + "grad_norm": 0.2606859248765302, "learning_rate": 1.0477264319640253e-05, - "loss": 0.5125, - "mean_token_accuracy": 0.8359952299417386, + "loss": 0.4117, + "mean_token_accuracy": 0.8600141371640406, "step": 393 }, { "epoch": 0.5380216779039003, - "grad_norm": 0.12063836380824533, + "grad_norm": 0.2260622200890888, "learning_rate": 1.0429568899492349e-05, - "loss": 0.4939, - "mean_token_accuracy": 0.8424667033992155, + "loss": 0.3984, + "mean_token_accuracy": 0.8646881070534751, "step": 394 }, { "epoch": 0.5393872151574635, - "grad_norm": 0.11876007623833751, + "grad_norm": 0.2270692517916632, "learning_rate": 1.038186368715145e-05, - "loss": 0.4889, - "mean_token_accuracy": 0.8417282676637755, + "loss": 0.3935, + "mean_token_accuracy": 0.8645081124206024, "step": 395 }, { "epoch": 0.5407527524110267, - "grad_norm": 0.7993235634865713, + "grad_norm": 0.25880192379351896, "learning_rate": 1.0334149770076747e-05, - "loss": 0.5458, - "mean_token_accuracy": 0.8287097465686134, + "loss": 0.4238, + "mean_token_accuracy": 0.8556846129868956, "step": 396 }, { "epoch": 0.5421182896645899, - "grad_norm": 0.13241833915099374, + "grad_norm": 0.28051700201569574, "learning_rate": 1.0286428235925849e-05, - "loss": 0.5432, - "mean_token_accuracy": 0.8267386636566856, + "loss": 0.4347, + "mean_token_accuracy": 0.8522094587514498, "step": 397 }, { "epoch": 0.5434838269181531, - "grad_norm": 0.11949134547007755, + "grad_norm": 0.25125772452792333, "learning_rate": 1.0238700172530009e-05, - "loss": 0.5033, - "mean_token_accuracy": 0.8393243651278791, + "loss": 0.4049, + "mean_token_accuracy": 0.8625896021400488, "step": 398 }, { "epoch": 0.5448493641717163, - "grad_norm": 0.12074088118488922, + "grad_norm": 0.268673980738851, "learning_rate": 1.019096666786931e-05, - "loss": 0.5227, - "mean_token_accuracy": 0.832630715562292, + "loss": 0.4203, + "mean_token_accuracy": 0.8571522385547277, "step": 399 }, { "epoch": 0.5462149014252795, - "grad_norm": 0.12296503988534725, + "grad_norm": 0.2890895729277909, "learning_rate": 1.0143228810047877e-05, - "loss": 0.5154, - "mean_token_accuracy": 0.8349590018796671, + "loss": 0.4131, + "mean_token_accuracy": 0.8595778477918344, "step": 400 }, { "epoch": 0.5475804386788428, - "grad_norm": 0.12252629339035333, + "grad_norm": 0.2542593184479374, "learning_rate": 1.0095487687269055e-05, - "loss": 0.5402, - "mean_token_accuracy": 0.82939361089684, + "loss": 0.4331, + "mean_token_accuracy": 0.8549465126076115, "step": 401 }, { "epoch": 0.5489459759324059, - "grad_norm": 0.1360273877361324, + "grad_norm": 0.2899474295442422, "learning_rate": 1.0047744387810632e-05, - "loss": 0.5359, - "mean_token_accuracy": 0.8289603846063814, + "loss": 0.4303, + "mean_token_accuracy": 0.8538087426441767, "step": 402 }, { "epoch": 0.5503115131859692, - "grad_norm": 0.12695204677957533, + "grad_norm": 0.2869992392687126, "learning_rate": 1e-05, - "loss": 0.5287, - "mean_token_accuracy": 0.83196312559923, + "loss": 0.4253, + "mean_token_accuracy": 0.8565849096399576, "step": 403 }, { "epoch": 0.5516770504395323, - "grad_norm": 0.12210863953751606, + "grad_norm": 0.2804072799948344, "learning_rate": 9.95225561218937e-06, - "loss": 0.5129, - "mean_token_accuracy": 0.8351943887435843, + "loss": 0.4108, + "mean_token_accuracy": 0.8595849788206829, "step": 404 }, { "epoch": 0.5530425876930956, - "grad_norm": 0.12108838689682434, + "grad_norm": 0.2643969180366311, "learning_rate": 9.904512312730948e-06, - "loss": 0.5141, - "mean_token_accuracy": 0.8364228435420775, + "loss": 0.4137, + "mean_token_accuracy": 0.8603534103336365, "step": 405 }, { "epoch": 0.5544081249466587, - "grad_norm": 0.12340457140106971, + "grad_norm": 0.27062716368889733, "learning_rate": 9.856771189952127e-06, - "loss": 0.4981, - "mean_token_accuracy": 0.8390686559316481, + "loss": 0.4015, + "mean_token_accuracy": 0.8625163495869607, "step": 406 }, { "epoch": 0.555773662200222, - "grad_norm": 0.12500753694797329, + "grad_norm": 0.2929202457665895, "learning_rate": 9.809033332130694e-06, - "loss": 0.529, - "mean_token_accuracy": 0.8316631237203442, + "loss": 0.4245, + "mean_token_accuracy": 0.8562819218107298, "step": 407 }, { "epoch": 0.5571391994537851, - "grad_norm": 0.13261503268132852, + "grad_norm": 0.2541385240616914, "learning_rate": 9.761299827469993e-06, - "loss": 0.5165, - "mean_token_accuracy": 0.8346349551610374, + "loss": 0.4152, + "mean_token_accuracy": 0.8585790240492156, "step": 408 }, { "epoch": 0.5585047367073483, - "grad_norm": 0.1317993623456919, + "grad_norm": 0.24317703441372845, "learning_rate": 9.713571764074153e-06, - "loss": 0.5158, - "mean_token_accuracy": 0.835416347893269, + "loss": 0.4156, + "mean_token_accuracy": 0.8592119050098765, "step": 409 }, { "epoch": 0.5598702739609115, - "grad_norm": 0.11610323969856036, + "grad_norm": 0.2671919366662445, "learning_rate": 9.665850229923258e-06, - "loss": 0.5209, - "mean_token_accuracy": 0.8336962780599988, + "loss": 0.4166, + "mean_token_accuracy": 0.8583581070445756, "step": 410 }, { "epoch": 0.5612358112144747, - "grad_norm": 0.12188764519986442, + "grad_norm": 0.2366025343403895, "learning_rate": 9.618136312848552e-06, - "loss": 0.5173, - "mean_token_accuracy": 0.8331658096184462, + "loss": 0.4167, + "mean_token_accuracy": 0.8573873504387994, "step": 411 }, { "epoch": 0.5626013484680379, - "grad_norm": 0.1242053085706222, + "grad_norm": 0.2531493471961078, "learning_rate": 9.570431100507653e-06, - "loss": 0.4967, - "mean_token_accuracy": 0.8395032769700385, + "loss": 0.3986, + "mean_token_accuracy": 0.8631825906862007, "step": 412 }, { "epoch": 0.5639668857216011, - "grad_norm": 0.12296591378563972, + "grad_norm": 0.2604649082143078, "learning_rate": 9.522735680359752e-06, - "loss": 0.52, - "mean_token_accuracy": 0.8317547275046719, + "loss": 0.419, + "mean_token_accuracy": 0.8566154458082337, "step": 413 }, { "epoch": 0.5653324229751643, - "grad_norm": 0.130220111834676, + "grad_norm": 0.24389282767116163, "learning_rate": 9.47505113964081e-06, - "loss": 0.527, - "mean_token_accuracy": 0.8320779661263545, + "loss": 0.4239, + "mean_token_accuracy": 0.8564176977638592, "step": 414 }, { "epoch": 0.5666979602287275, - "grad_norm": 0.12558756640167995, + "grad_norm": 0.2838315749775247, "learning_rate": 9.4273785653388e-06, - "loss": 0.5013, - "mean_token_accuracy": 0.8388098750862425, + "loss": 0.4044, + "mean_token_accuracy": 0.8619406759583335, "step": 415 }, { "epoch": 0.5680634974822907, - "grad_norm": 0.12475754793215417, + "grad_norm": 0.24792685107778176, "learning_rate": 9.379719044168914e-06, - "loss": 0.5278, - "mean_token_accuracy": 0.8318882147598874, + "loss": 0.4245, + "mean_token_accuracy": 0.8563493408114575, "step": 416 }, { "epoch": 0.5694290347358539, - "grad_norm": 0.11932650893487058, + "grad_norm": 0.252380204387106, "learning_rate": 9.332073662548785e-06, - "loss": 0.5205, - "mean_token_accuracy": 0.8311158983956543, + "loss": 0.4192, + "mean_token_accuracy": 0.8552596982254129, "step": 417 }, { "epoch": 0.5707945719894171, - "grad_norm": 0.13211956792012094, + "grad_norm": 0.2633944099843147, "learning_rate": 9.28444350657374e-06, - "loss": 0.4889, - "mean_token_accuracy": 0.8421529478002472, + "loss": 0.3925, + "mean_token_accuracy": 0.8656064042939838, "step": 418 }, { "epoch": 0.5721601092429803, - "grad_norm": 0.12778828107368884, + "grad_norm": 0.31236207476517847, "learning_rate": 9.236829661992024e-06, - "loss": 0.5438, - "mean_token_accuracy": 0.8257359492735918, + "loss": 0.4362, + "mean_token_accuracy": 0.8508478194007478, "step": 419 }, { "epoch": 0.5735256464965435, - "grad_norm": 0.12378035696266557, + "grad_norm": 0.25934793423863595, "learning_rate": 9.189233214180057e-06, - "loss": 0.501, - "mean_token_accuracy": 0.8392500142424699, + "loss": 0.4029, + "mean_token_accuracy": 0.8626796404616222, "step": 420 }, { "epoch": 0.5748911837501067, - "grad_norm": 0.11961532842749849, + "grad_norm": 0.2602440450848244, "learning_rate": 9.1416552481177e-06, - "loss": 0.4973, - "mean_token_accuracy": 0.8390577018914754, + "loss": 0.4011, + "mean_token_accuracy": 0.8620878476008459, "step": 421 }, { "epoch": 0.5762567210036699, - "grad_norm": 0.12849390507284664, + "grad_norm": 0.3026174301084034, "learning_rate": 9.094096848363503e-06, - "loss": 0.5152, - "mean_token_accuracy": 0.8345137451327518, + "loss": 0.4132, + "mean_token_accuracy": 0.8589640787061215, "step": 422 }, { "epoch": 0.5776222582572331, - "grad_norm": 0.12186651506244973, + "grad_norm": 0.23000512937699333, "learning_rate": 9.046559099030012e-06, - "loss": 0.5115, - "mean_token_accuracy": 0.8355469709512575, + "loss": 0.411, + "mean_token_accuracy": 0.8593097755828117, "step": 423 }, { "epoch": 0.5789877955107963, - "grad_norm": 0.12664472532878793, + "grad_norm": 0.28198364889742744, "learning_rate": 8.999043083759016e-06, - "loss": 0.5307, - "mean_token_accuracy": 0.8288510436068137, + "loss": 0.426, + "mean_token_accuracy": 0.8541148022282637, "step": 424 }, { "epoch": 0.5803533327643595, - "grad_norm": 0.12625147132785983, + "grad_norm": 0.261123335266971, "learning_rate": 8.951549885696889e-06, - "loss": 0.4962, - "mean_token_accuracy": 0.8410106889280509, + "loss": 0.4008, + "mean_token_accuracy": 0.863803099113423, "step": 425 }, { "epoch": 0.5817188700179227, - "grad_norm": 0.1266258608449764, + "grad_norm": 0.2646118229304491, "learning_rate": 8.904080587469869e-06, - "loss": 0.4814, - "mean_token_accuracy": 0.8435021625614095, + "loss": 0.3884, + "mean_token_accuracy": 0.8656690716827113, "step": 426 }, { "epoch": 0.5830844072714859, - "grad_norm": 0.12695738559126954, + "grad_norm": 0.23877427883247976, "learning_rate": 8.856636271159378e-06, - "loss": 0.53, - "mean_token_accuracy": 0.8300437942240506, + "loss": 0.4252, + "mean_token_accuracy": 0.8549220531025736, "step": 427 }, { "epoch": 0.5844499445250491, - "grad_norm": 0.1252759719871267, + "grad_norm": 0.27389213494804404, "learning_rate": 8.80921801827738e-06, - "loss": 0.5143, - "mean_token_accuracy": 0.8338047723482694, + "loss": 0.4139, + "mean_token_accuracy": 0.8583442992618325, "step": 428 }, { "epoch": 0.5858154817786123, - "grad_norm": 0.11558199505371276, + "grad_norm": 0.3009404820928479, "learning_rate": 8.76182690974171e-06, - "loss": 0.5384, - "mean_token_accuracy": 0.8285350542116635, + "loss": 0.4325, + "mean_token_accuracy": 0.854122146316857, "step": 429 }, { "epoch": 0.5871810190321755, - "grad_norm": 0.12760669348851802, + "grad_norm": 0.2723790212614728, "learning_rate": 8.714464025851428e-06, - "loss": 0.5042, - "mean_token_accuracy": 0.8395589546218177, + "loss": 0.4058, + "mean_token_accuracy": 0.8629916382485093, "step": 430 }, { "epoch": 0.5885465562857387, - "grad_norm": 0.11684861625498712, + "grad_norm": 0.25219631499358963, "learning_rate": 8.667130446262214e-06, - "loss": 0.5235, - "mean_token_accuracy": 0.8321636443374427, + "loss": 0.4198, + "mean_token_accuracy": 0.856574897278309, "step": 431 }, { "epoch": 0.5899120935393019, - "grad_norm": 0.12007357768939671, + "grad_norm": 0.2772798105097925, "learning_rate": 8.619827249961732e-06, - "loss": 0.5039, - "mean_token_accuracy": 0.8373322726883831, + "loss": 0.4047, + "mean_token_accuracy": 0.8614651709337668, "step": 432 }, { "epoch": 0.5912776307928651, - "grad_norm": 0.1129325140619799, + "grad_norm": 0.2561843385637697, "learning_rate": 8.57255551524506e-06, - "loss": 0.5042, - "mean_token_accuracy": 0.8384591425175077, + "loss": 0.4058, + "mean_token_accuracy": 0.8620459681672171, "step": 433 }, { "epoch": 0.5926431680464282, - "grad_norm": 0.12175227256801184, + "grad_norm": 0.24701843749006636, "learning_rate": 8.525316319690092e-06, - "loss": 0.485, - "mean_token_accuracy": 0.8445027346688816, + "loss": 0.389, + "mean_token_accuracy": 0.8674125095943455, "step": 434 }, { "epoch": 0.5940087052999915, - "grad_norm": 0.1311900796293075, + "grad_norm": 0.2916250794706418, "learning_rate": 8.478110740132971e-06, - "loss": 0.5247, - "mean_token_accuracy": 0.8322580174139407, + "loss": 0.4222, + "mean_token_accuracy": 0.8566364817285297, "step": 435 }, { "epoch": 0.5953742425535546, - "grad_norm": 0.12660160484661098, + "grad_norm": 0.2806760751496244, "learning_rate": 8.430939852643559e-06, - "loss": 0.5184, - "mean_token_accuracy": 0.8348346902819516, + "loss": 0.4169, + "mean_token_accuracy": 0.8584203185837236, "step": 436 }, { "epoch": 0.5967397798071179, - "grad_norm": 0.11621856070542548, + "grad_norm": 0.27137099893978883, "learning_rate": 8.383804732500901e-06, - "loss": 0.5119, - "mean_token_accuracy": 0.8347223913368035, + "loss": 0.4103, + "mean_token_accuracy": 0.8591519595394866, "step": 437 }, { "epoch": 0.598105317060681, - "grad_norm": 0.12589159880155412, + "grad_norm": 0.29739700836059735, "learning_rate": 8.336706454168701e-06, - "loss": 0.5104, - "mean_token_accuracy": 0.8372731586475606, + "loss": 0.4091, + "mean_token_accuracy": 0.8611807240702429, "step": 438 }, { "epoch": 0.5994708543142443, - "grad_norm": 0.11593520287089763, + "grad_norm": 0.2571864812668034, "learning_rate": 8.289646091270848e-06, - "loss": 0.5029, - "mean_token_accuracy": 0.8407042589638942, + "loss": 0.4015, + "mean_token_accuracy": 0.8637355095867093, "step": 439 }, { "epoch": 0.6008363915678074, - "grad_norm": 0.11614534102626904, + "grad_norm": 0.23339459771292811, "learning_rate": 8.242624716566928e-06, - "loss": 0.4852, - "mean_token_accuracy": 0.8425194940834334, + "loss": 0.3887, + "mean_token_accuracy": 0.8657256008172974, "step": 440 }, { "epoch": 0.6022019288213707, - "grad_norm": 0.1252214343129348, + "grad_norm": 0.2992953835452048, "learning_rate": 8.195643401927777e-06, - "loss": 0.4868, - "mean_token_accuracy": 0.842466111153692, + "loss": 0.3916, + "mean_token_accuracy": 0.8653041052199545, "step": 441 }, { "epoch": 0.6035674660749338, - "grad_norm": 0.12339579840736846, + "grad_norm": 0.2585326431716542, "learning_rate": 8.148703218311053e-06, - "loss": 0.5172, - "mean_token_accuracy": 0.8348110944829211, + "loss": 0.4155, + "mean_token_accuracy": 0.8596556131686985, "step": 442 }, { "epoch": 0.6049330033284971, - "grad_norm": 0.1282144966539934, + "grad_norm": 0.2503365003487542, "learning_rate": 8.101805235736804e-06, - "loss": 0.5287, - "mean_token_accuracy": 0.8298261865109949, + "loss": 0.4265, + "mean_token_accuracy": 0.8549398314920128, "step": 443 }, { "epoch": 0.6062985405820602, - "grad_norm": 0.12157988253943283, + "grad_norm": 0.2820273966033343, "learning_rate": 8.054950523263097e-06, - "loss": 0.5127, - "mean_token_accuracy": 0.8347732339168258, + "loss": 0.4136, + "mean_token_accuracy": 0.8588294366875941, "step": 444 }, { "epoch": 0.6076640778356235, - "grad_norm": 0.11953500754249576, + "grad_norm": 0.25413507701062166, "learning_rate": 8.008140148961642e-06, - "loss": 0.5033, - "mean_token_accuracy": 0.8389621612944391, + "loss": 0.4044, + "mean_token_accuracy": 0.8625195591348364, "step": 445 }, { "epoch": 0.6090296150891866, - "grad_norm": 0.1252009293567693, + "grad_norm": 0.23238499595613277, "learning_rate": 7.96137517989343e-06, - "loss": 0.5296, - "mean_token_accuracy": 0.8311967533775438, + "loss": 0.4246, + "mean_token_accuracy": 0.8561338059337695, "step": 446 }, { "epoch": 0.6103951523427499, - "grad_norm": 0.12256962656998621, + "grad_norm": 0.23798509908211196, "learning_rate": 7.914656682084436e-06, - "loss": 0.5194, - "mean_token_accuracy": 0.8332216117773507, + "loss": 0.4169, + "mean_token_accuracy": 0.8577032958934101, "step": 447 }, { "epoch": 0.611760689596313, - "grad_norm": 0.12605639174906047, + "grad_norm": 0.25296096161140275, "learning_rate": 7.867985720501301e-06, - "loss": 0.4964, - "mean_token_accuracy": 0.8396787023243677, + "loss": 0.399, + "mean_token_accuracy": 0.863544434056401, "step": 448 }, { "epoch": 0.6131262268498763, - "grad_norm": 0.1113391874223632, + "grad_norm": 0.22798661470070672, "learning_rate": 7.821363359027047e-06, - "loss": 0.5025, - "mean_token_accuracy": 0.8375202863568625, + "loss": 0.4028, + "mean_token_accuracy": 0.8615431059914728, "step": 449 }, { "epoch": 0.6144917641034394, - "grad_norm": 0.1219902065548848, + "grad_norm": 0.2405890996462141, "learning_rate": 7.774790660436857e-06, - "loss": 0.5332, - "mean_token_accuracy": 0.8296327290705735, + "loss": 0.4279, + "mean_token_accuracy": 0.8547220263658766, "step": 450 }, { "epoch": 0.6158573013570027, - "grad_norm": 0.12584455085891816, + "grad_norm": 0.29357731632515965, "learning_rate": 7.728268686373814e-06, - "loss": 0.5069, - "mean_token_accuracy": 0.8381285192539372, + "loss": 0.4077, + "mean_token_accuracy": 0.8615052330629221, "step": 451 }, { "epoch": 0.6172228386105658, - "grad_norm": 0.13210262545854212, + "grad_norm": 0.28038212690179937, "learning_rate": 7.681798497324717e-06, - "loss": 0.5464, - "mean_token_accuracy": 0.8254239458767981, + "loss": 0.4379, + "mean_token_accuracy": 0.8519818024977281, "step": 452 }, { "epoch": 0.6185883758641291, - "grad_norm": 0.12667312895337426, + "grad_norm": 0.25548937081717665, "learning_rate": 7.635381152595916e-06, - "loss": 0.5183, - "mean_token_accuracy": 0.832755127809813, + "loss": 0.4175, + "mean_token_accuracy": 0.857195676423713, "step": 453 }, { "epoch": 0.6199539131176922, - "grad_norm": 0.1146164806721885, + "grad_norm": 0.25398512091441905, "learning_rate": 7.5890177102891395e-06, - "loss": 0.5041, - "mean_token_accuracy": 0.8371342197241901, + "loss": 0.4039, + "mean_token_accuracy": 0.8617371650977873, "step": 454 }, { "epoch": 0.6213194503712555, - "grad_norm": 0.1201885769720261, + "grad_norm": 0.26543560160540447, "learning_rate": 7.542709227277396e-06, - "loss": 0.5195, - "mean_token_accuracy": 0.8326882827983069, + "loss": 0.4164, + "mean_token_accuracy": 0.857181019142036, "step": 455 }, { "epoch": 0.6226849876248186, - "grad_norm": 0.12868756856786553, + "grad_norm": 0.2459878250772752, "learning_rate": 7.496456759180876e-06, - "loss": 0.5266, - "mean_token_accuracy": 0.8330112007706847, + "loss": 0.4228, + "mean_token_accuracy": 0.8573935855915926, "step": 456 }, { "epoch": 0.6240505248783819, - "grad_norm": 0.13788448934196415, + "grad_norm": 0.23786580710281324, "learning_rate": 7.4502613603428875e-06, - "loss": 0.5051, - "mean_token_accuracy": 0.8381098707605368, + "loss": 0.4088, + "mean_token_accuracy": 0.8607801487917315, "step": 457 }, { "epoch": 0.625416062131945, - "grad_norm": 0.12406442203631797, + "grad_norm": 0.29589767964465397, "learning_rate": 7.404124083805819e-06, - "loss": 0.5091, - "mean_token_accuracy": 0.8375944584785348, + "loss": 0.4093, + "mean_token_accuracy": 0.8612231610118724, "step": 458 }, { "epoch": 0.6267815993855083, - "grad_norm": 0.12143031997129315, + "grad_norm": 0.2887590659400021, "learning_rate": 7.358045981287141e-06, - "loss": 0.5256, - "mean_token_accuracy": 0.8313393808795969, + "loss": 0.4209, + "mean_token_accuracy": 0.8560159432518372, "step": 459 }, { "epoch": 0.6281471366390714, - "grad_norm": 0.11226266056918419, + "grad_norm": 0.21799038062254464, "learning_rate": 7.312028103155426e-06, - "loss": 0.4728, - "mean_token_accuracy": 0.8484355444403938, + "loss": 0.3799, + "mean_token_accuracy": 0.8704455922016139, "step": 460 }, { "epoch": 0.6295126738926347, - "grad_norm": 0.12201905182376982, + "grad_norm": 0.24324879715514983, "learning_rate": 7.266071498406417e-06, - "loss": 0.5078, - "mean_token_accuracy": 0.8371311445204457, + "loss": 0.4076, + "mean_token_accuracy": 0.8610840750554581, "step": 461 }, { "epoch": 0.6308782111461978, - "grad_norm": 0.12446112685230139, + "grad_norm": 0.25428302492825916, "learning_rate": 7.220177214639088e-06, - "loss": 0.5116, - "mean_token_accuracy": 0.8342854852698145, + "loss": 0.4109, + "mean_token_accuracy": 0.8587935610875843, "step": 462 }, { "epoch": 0.632243748399761, - "grad_norm": 0.12750090471187656, + "grad_norm": 0.27202575353290637, "learning_rate": 7.1743462980318045e-06, - "loss": 0.5175, - "mean_token_accuracy": 0.8348392404086464, + "loss": 0.4164, + "mean_token_accuracy": 0.8589937783595583, "step": 463 }, { "epoch": 0.6336092856533242, - "grad_norm": 0.11658460085766126, + "grad_norm": 0.23695566303733537, "learning_rate": 7.128579793318429e-06, - "loss": 0.5001, - "mean_token_accuracy": 0.83965411183352, + "loss": 0.4024, + "mean_token_accuracy": 0.8627954415233714, "step": 464 }, { "epoch": 0.6349748229068874, - "grad_norm": 0.13201219209908094, + "grad_norm": 0.25513688392749734, "learning_rate": 7.0828787437645455e-06, - "loss": 0.5097, - "mean_token_accuracy": 0.8358568623874668, + "loss": 0.4093, + "mean_token_accuracy": 0.8600665306531674, "step": 465 }, { "epoch": 0.6363403601604506, - "grad_norm": 0.11811457855521804, + "grad_norm": 0.26912983946640745, "learning_rate": 7.037244191143662e-06, - "loss": 0.4981, - "mean_token_accuracy": 0.837630626428689, + "loss": 0.3995, + "mean_token_accuracy": 0.8619904346664145, "step": 466 }, { "epoch": 0.6377058974140138, - "grad_norm": 0.12660564543775485, + "grad_norm": 0.2525033514112838, "learning_rate": 6.991677175713449e-06, - "loss": 0.5256, - "mean_token_accuracy": 0.8324087056393724, + "loss": 0.4223, + "mean_token_accuracy": 0.8571298722103126, "step": 467 }, { "epoch": 0.639071434667577, - "grad_norm": 0.1265652255605963, + "grad_norm": 0.4126261428152561, "learning_rate": 6.946178736192053e-06, - "loss": 0.551, - "mean_token_accuracy": 0.8238474132884289, + "loss": 0.4422, + "mean_token_accuracy": 0.8495778270375115, "step": 468 }, { "epoch": 0.6404369719211402, - "grad_norm": 0.12544354145825112, + "grad_norm": 0.25938255418921724, "learning_rate": 6.900749909734406e-06, - "loss": 0.5389, - "mean_token_accuracy": 0.8275534325894015, + "loss": 0.433, + "mean_token_accuracy": 0.8528369297456597, "step": 469 }, { "epoch": 0.6418025091747034, - "grad_norm": 0.12095342304047883, + "grad_norm": 0.2505229138650988, "learning_rate": 6.8553917319085676e-06, - "loss": 0.5168, - "mean_token_accuracy": 0.8348751537562868, + "loss": 0.4148, + "mean_token_accuracy": 0.8588444157084706, "step": 470 }, { "epoch": 0.6431680464282666, - "grad_norm": 0.10871049913584206, + "grad_norm": 0.21310658415353784, "learning_rate": 6.810105236672155e-06, - "loss": 0.4788, - "mean_token_accuracy": 0.8453276256567875, + "loss": 0.3849, + "mean_token_accuracy": 0.8671909034888359, "step": 471 }, { "epoch": 0.6445335836818298, - "grad_norm": 0.11971496378559983, + "grad_norm": 0.23204325202817497, "learning_rate": 6.76489145634873e-06, - "loss": 0.5068, - "mean_token_accuracy": 0.8380669892091602, + "loss": 0.4074, + "mean_token_accuracy": 0.8617341122845665, "step": 472 }, { "epoch": 0.645899120935393, - "grad_norm": 0.1305889987084359, + "grad_norm": 0.27377537704030214, "learning_rate": 6.719751421604309e-06, - "loss": 0.5033, - "mean_token_accuracy": 0.8372513996124086, + "loss": 0.4052, + "mean_token_accuracy": 0.8605801437822177, "step": 473 }, { "epoch": 0.6472646581889562, - "grad_norm": 0.12189080396821103, + "grad_norm": 0.26667506542190905, "learning_rate": 6.6746861614238425e-06, - "loss": 0.5192, - "mean_token_accuracy": 0.8343011951051359, + "loss": 0.4172, + "mean_token_accuracy": 0.8587473642719939, "step": 474 }, { "epoch": 0.6486301954425194, - "grad_norm": 0.12047090727393966, + "grad_norm": 0.2359219117164129, "learning_rate": 6.629696703087755e-06, - "loss": 0.4952, - "mean_token_accuracy": 0.8410742811139031, + "loss": 0.3979, + "mean_token_accuracy": 0.8643947766486794, "step": 475 }, { "epoch": 0.6499957326960826, - "grad_norm": 0.1232509556560695, + "grad_norm": 0.2405700433141458, "learning_rate": 6.584784072148554e-06, - "loss": 0.5119, - "mean_token_accuracy": 0.8345750420549545, + "loss": 0.4117, + "mean_token_accuracy": 0.8583265490823113, "step": 476 }, { "epoch": 0.6513612699496458, - "grad_norm": 0.1165352828116907, + "grad_norm": 0.23752686356493705, "learning_rate": 6.5399492924074215e-06, - "loss": 0.5066, - "mean_token_accuracy": 0.8359044537219931, + "loss": 0.4079, + "mean_token_accuracy": 0.8599794679393602, "step": 477 }, { "epoch": 0.652726807203209, - "grad_norm": 0.11713928121543726, + "grad_norm": 0.260987271653807, "learning_rate": 6.495193385890901e-06, - "loss": 0.4942, - "mean_token_accuracy": 0.8408816468649669, + "loss": 0.3978, + "mean_token_accuracy": 0.863796731863899, "step": 478 }, { "epoch": 0.6540923444567722, - "grad_norm": 0.12555364875965588, + "grad_norm": 0.2389148131609978, "learning_rate": 6.450517372827591e-06, - "loss": 0.5075, - "mean_token_accuracy": 0.8379034265627305, + "loss": 0.4048, + "mean_token_accuracy": 0.8625655846917668, "step": 479 }, { "epoch": 0.6554578817103354, - "grad_norm": 0.1219403131731024, + "grad_norm": 0.23754616663334344, "learning_rate": 6.405922271624874e-06, - "loss": 0.4854, - "mean_token_accuracy": 0.8415307072437642, + "loss": 0.3894, + "mean_token_accuracy": 0.8650371535045855, "step": 480 }, { "epoch": 0.6568234189638986, - "grad_norm": 0.11512298466223274, + "grad_norm": 0.24392056114704547, "learning_rate": 6.3614090988457255e-06, - "loss": 0.5079, - "mean_token_accuracy": 0.8371428485552044, + "loss": 0.4078, + "mean_token_accuracy": 0.860617293319343, "step": 481 }, { "epoch": 0.6581889562174618, - "grad_norm": 0.12202824577987749, + "grad_norm": 0.26180580733734304, "learning_rate": 6.3169788691855326e-06, - "loss": 0.5247, - "mean_token_accuracy": 0.832391209883534, + "loss": 0.4226, + "mean_token_accuracy": 0.8568173015759246, "step": 482 }, { "epoch": 0.659554493471025, - "grad_norm": 0.12521282860879976, + "grad_norm": 0.24631836835717166, "learning_rate": 6.2726325954489474e-06, - "loss": 0.5235, - "mean_token_accuracy": 0.8304635817027883, + "loss": 0.4214, + "mean_token_accuracy": 0.8552957608827673, "step": 483 }, { "epoch": 0.6609200307245882, - "grad_norm": 0.12592461968482208, + "grad_norm": 0.2805589427499748, "learning_rate": 6.22837128852683e-06, - "loss": 0.5249, - "mean_token_accuracy": 0.8325043609582593, + "loss": 0.4209, + "mean_token_accuracy": 0.857800855823934, "step": 484 }, { "epoch": 0.6622855679781514, - "grad_norm": 0.1196867976530261, + "grad_norm": 0.232371020464702, "learning_rate": 6.184195957373176e-06, - "loss": 0.5111, - "mean_token_accuracy": 0.8357545967855985, + "loss": 0.4109, + "mean_token_accuracy": 0.8600986670212696, "step": 485 }, { "epoch": 0.6636511052317146, - "grad_norm": 0.12082194567438065, + "grad_norm": 0.24474802604092138, "learning_rate": 6.140107608982137e-06, - "loss": 0.4923, - "mean_token_accuracy": 0.8422647780182342, + "loss": 0.3952, + "mean_token_accuracy": 0.8657601107656259, "step": 486 }, { "epoch": 0.6650166424852778, - "grad_norm": 0.12003480656767873, + "grad_norm": 0.25305102083356706, "learning_rate": 6.0961072483650526e-06, - "loss": 0.504, - "mean_token_accuracy": 0.838872892583594, + "loss": 0.4037, + "mean_token_accuracy": 0.8629620469736944, "step": 487 }, { "epoch": 0.666382179738841, - "grad_norm": 0.11731377583714832, + "grad_norm": 0.22975337882908747, "learning_rate": 6.052195878527551e-06, - "loss": 0.5035, - "mean_token_accuracy": 0.8385560217686393, + "loss": 0.4038, + "mean_token_accuracy": 0.8623166696689788, "step": 488 }, { "epoch": 0.6677477169924042, - "grad_norm": 0.11472197664452484, + "grad_norm": 0.24494072522854304, "learning_rate": 6.008374500446676e-06, - "loss": 0.5243, - "mean_token_accuracy": 0.8329620358816957, + "loss": 0.4203, + "mean_token_accuracy": 0.8575659022852821, "step": 489 }, { "epoch": 0.6691132542459673, - "grad_norm": 0.12246607518776052, + "grad_norm": 0.24564454125360483, "learning_rate": 5.964644113048079e-06, - "loss": 0.5108, - "mean_token_accuracy": 0.8353016894483374, + "loss": 0.4108, + "mean_token_accuracy": 0.8592899012026329, "step": 490 }, { "epoch": 0.6704787914995306, - "grad_norm": 0.12289308176266316, + "grad_norm": 0.2970193563961585, "learning_rate": 5.921005713183236e-06, - "loss": 0.5146, - "mean_token_accuracy": 0.8336913895616518, + "loss": 0.414, + "mean_token_accuracy": 0.8578038401374278, "step": 491 }, { "epoch": 0.6718443287530937, - "grad_norm": 0.12409695871521655, + "grad_norm": 0.22522151812319466, "learning_rate": 5.877460295606739e-06, - "loss": 0.4856, - "mean_token_accuracy": 0.8425300733898616, + "loss": 0.3909, + "mean_token_accuracy": 0.8663197254697791, "step": 492 }, { "epoch": 0.673209866006657, - "grad_norm": 0.1268503095922522, + "grad_norm": 0.24777229494666286, "learning_rate": 5.834008852953603e-06, - "loss": 0.4965, - "mean_token_accuracy": 0.839688126207297, + "loss": 0.3987, + "mean_token_accuracy": 0.8633314691916142, "step": 493 }, { "epoch": 0.6745754032602201, - "grad_norm": 0.11780058060852934, + "grad_norm": 0.27546512967034037, "learning_rate": 5.790652375716653e-06, - "loss": 0.4851, - "mean_token_accuracy": 0.8422088030092447, + "loss": 0.3887, + "mean_token_accuracy": 0.8660309352349229, "step": 494 }, { "epoch": 0.6759409405137834, - "grad_norm": 0.12332165777619618, + "grad_norm": 0.24981866455290716, "learning_rate": 5.74739185222394e-06, - "loss": 0.521, - "mean_token_accuracy": 0.8342648657252381, + "loss": 0.4182, + "mean_token_accuracy": 0.8580747989016917, "step": 495 }, { "epoch": 0.6773064777673465, - "grad_norm": 0.11821151773116689, + "grad_norm": 0.23497798723811336, "learning_rate": 5.704228268616208e-06, - "loss": 0.5025, - "mean_token_accuracy": 0.8388699455363635, + "loss": 0.4034, + "mean_token_accuracy": 0.8624885421549002, "step": 496 }, { "epoch": 0.6786720150209098, - "grad_norm": 0.11497424323490198, + "grad_norm": 0.22626842482811318, "learning_rate": 5.66116260882442e-06, - "loss": 0.466, - "mean_token_accuracy": 0.8499379131096036, + "loss": 0.3743, + "mean_token_accuracy": 0.8722626987396589, "step": 497 }, { "epoch": 0.6800375522744729, - "grad_norm": 0.12194977229804227, + "grad_norm": 0.273018811101812, "learning_rate": 5.618195854547333e-06, - "loss": 0.5123, - "mean_token_accuracy": 0.835231952201664, + "loss": 0.4118, + "mean_token_accuracy": 0.8591764611730132, "step": 498 }, { "epoch": 0.6814030895280362, - "grad_norm": 0.12130349922839018, + "grad_norm": 0.2517060577155918, "learning_rate": 5.575328985229098e-06, - "loss": 0.5362, - "mean_token_accuracy": 0.8288374992994797, + "loss": 0.4307, + "mean_token_accuracy": 0.8540759587427639, "step": 499 }, { "epoch": 0.6827686267815993, - "grad_norm": 0.12198508980868775, + "grad_norm": 0.23433321670856827, "learning_rate": 5.532562978036964e-06, - "loss": 0.4889, - "mean_token_accuracy": 0.8413691451225482, + "loss": 0.3938, + "mean_token_accuracy": 0.8640612399037781, "step": 500 }, { "epoch": 0.6841341640351626, - "grad_norm": 0.12244786061457337, + "grad_norm": 0.2425154734570016, "learning_rate": 5.48989880783898e-06, - "loss": 0.5254, - "mean_token_accuracy": 0.8327281150380043, + "loss": 0.422, + "mean_token_accuracy": 0.8574342653414885, "step": 501 }, { "epoch": 0.6854997012887258, - "grad_norm": 0.121242479275743, + "grad_norm": 0.23347937555277293, "learning_rate": 5.4473374471817906e-06, - "loss": 0.5216, - "mean_token_accuracy": 0.833079684037441, + "loss": 0.4191, + "mean_token_accuracy": 0.8575295591207817, "step": 502 }, { "epoch": 0.686865238542289, - "grad_norm": 0.11964858038035975, + "grad_norm": 0.2545433711957671, "learning_rate": 5.404879866268438e-06, - "loss": 0.497, - "mean_token_accuracy": 0.8411726466996567, + "loss": 0.4002, + "mean_token_accuracy": 0.8639634173808828, "step": 503 }, { "epoch": 0.6882307757958522, - "grad_norm": 0.11867508901974151, + "grad_norm": 0.24243974966286372, "learning_rate": 5.362527032936278e-06, - "loss": 0.5026, - "mean_token_accuracy": 0.837541648994152, + "loss": 0.4028, + "mean_token_accuracy": 0.862244193071149, "step": 504 }, { "epoch": 0.6895963130494154, - "grad_norm": 0.11995185452129488, + "grad_norm": 0.23059100155723, "learning_rate": 5.320279912634907e-06, - "loss": 0.5017, - "mean_token_accuracy": 0.8388557751462181, + "loss": 0.4035, + "mean_token_accuracy": 0.8626255651551141, "step": 505 }, { "epoch": 0.6909618503029786, - "grad_norm": 0.13275536202463695, + "grad_norm": 0.23428705595664845, "learning_rate": 5.278139468404133e-06, - "loss": 0.5085, - "mean_token_accuracy": 0.8357595176434741, + "loss": 0.4083, + "mean_token_accuracy": 0.8603081045639313, "step": 506 }, { "epoch": 0.6923273875565418, - "grad_norm": 0.12355165273185349, + "grad_norm": 0.27274705668530197, "learning_rate": 5.236106660852058e-06, - "loss": 0.5249, - "mean_token_accuracy": 0.8308308513069204, + "loss": 0.4212, + "mean_token_accuracy": 0.8560662849082573, "step": 507 }, { "epoch": 0.693692924810105, - "grad_norm": 0.12002409379310094, + "grad_norm": 0.2845641889872319, "learning_rate": 5.194182448133163e-06, - "loss": 0.4901, - "mean_token_accuracy": 0.840497814407723, + "loss": 0.3944, + "mean_token_accuracy": 0.8640756812761311, "step": 508 }, { "epoch": 0.6950584620636682, - "grad_norm": 0.11538784228737595, + "grad_norm": 0.22664565522900781, "learning_rate": 5.152367785926452e-06, - "loss": 0.5085, - "mean_token_accuracy": 0.8352713653806227, + "loss": 0.4094, + "mean_token_accuracy": 0.8593605320038422, "step": 509 }, { "epoch": 0.6964239993172314, - "grad_norm": 0.12482876497717726, + "grad_norm": 0.22478587370218284, "learning_rate": 5.110663627413695e-06, - "loss": 0.4919, - "mean_token_accuracy": 0.8415822791020865, + "loss": 0.396, + "mean_token_accuracy": 0.8648808615935969, "step": 510 }, { "epoch": 0.6977895365707946, - "grad_norm": 0.12068104490936474, + "grad_norm": 0.21591948807401068, "learning_rate": 5.069070923257685e-06, - "loss": 0.4966, - "mean_token_accuracy": 0.8394127220515218, + "loss": 0.399, + "mean_token_accuracy": 0.8633928472317149, "step": 511 }, { "epoch": 0.6991550738243578, - "grad_norm": 0.1255812007726533, + "grad_norm": 0.26461022257675626, "learning_rate": 5.027590621580563e-06, - "loss": 0.5478, - "mean_token_accuracy": 0.8250435186298053, + "loss": 0.4394, + "mean_token_accuracy": 0.8507808502833459, "step": 512 }, { "epoch": 0.700520611077921, - "grad_norm": 0.12080305760026157, + "grad_norm": 0.21356588082748598, "learning_rate": 4.986223667942213e-06, - "loss": 0.5327, - "mean_token_accuracy": 0.8282018349779968, + "loss": 0.4271, + "mean_token_accuracy": 0.8533322294161114, "step": 513 }, { "epoch": 0.7018861483314842, - "grad_norm": 0.115561291415391, + "grad_norm": 0.21403249722142711, "learning_rate": 4.944971005318716e-06, - "loss": 0.4961, - "mean_token_accuracy": 0.8396779938990382, + "loss": 0.3987, + "mean_token_accuracy": 0.8636949003001929, "step": 514 }, { "epoch": 0.7032516855850474, - "grad_norm": 0.12008369952815327, + "grad_norm": 0.23897074188676798, "learning_rate": 4.903833574080825e-06, - "loss": 0.4883, - "mean_token_accuracy": 0.8407714106489297, + "loss": 0.3925, + "mean_token_accuracy": 0.8641888082172495, "step": 515 }, { "epoch": 0.7046172228386106, - "grad_norm": 0.12420113242283828, + "grad_norm": 0.22004960179660596, "learning_rate": 4.862812311972567e-06, - "loss": 0.5347, - "mean_token_accuracy": 0.8302554130553389, + "loss": 0.4296, + "mean_token_accuracy": 0.8554980900000537, "step": 516 }, { "epoch": 0.7059827600921738, - "grad_norm": 0.12121678310448425, + "grad_norm": 0.21948469336679155, "learning_rate": 4.82190815408983e-06, - "loss": 0.5002, - "mean_token_accuracy": 0.8373392270291371, + "loss": 0.4033, + "mean_token_accuracy": 0.8607597379326859, "step": 517 }, { "epoch": 0.707348297345737, - "grad_norm": 0.12659025813581198, + "grad_norm": 0.23185408625161705, "learning_rate": 4.781122032859079e-06, - "loss": 0.5132, - "mean_token_accuracy": 0.8346509758387376, + "loss": 0.4137, + "mean_token_accuracy": 0.8586254422034981, "step": 518 }, { "epoch": 0.7087138345993002, - "grad_norm": 0.1308180581594544, + "grad_norm": 0.23176435351640776, "learning_rate": 4.740454878016084e-06, - "loss": 0.5124, - "mean_token_accuracy": 0.8350214500020289, + "loss": 0.4109, + "mean_token_accuracy": 0.8592659758915503, "step": 519 }, { "epoch": 0.7100793718528634, - "grad_norm": 0.12255054550009402, + "grad_norm": 0.24913384906741762, "learning_rate": 4.6999076165847214e-06, - "loss": 0.5392, - "mean_token_accuracy": 0.8262938084076187, + "loss": 0.4343, + "mean_token_accuracy": 0.8513752575363313, "step": 520 }, { "epoch": 0.7114449091064265, - "grad_norm": 0.11156002098510587, + "grad_norm": 0.20722173613037567, "learning_rate": 4.659481172855859e-06, - "loss": 0.4998, - "mean_token_accuracy": 0.8391171313425028, + "loss": 0.4005, + "mean_token_accuracy": 0.8633166282089912, "step": 521 }, { "epoch": 0.7128104463599898, - "grad_norm": 0.11151806525494001, + "grad_norm": 0.2181641440286907, "learning_rate": 4.619176468366274e-06, - "loss": 0.4862, - "mean_token_accuracy": 0.8433726101805832, + "loss": 0.3917, + "mean_token_accuracy": 0.8660028930864674, "step": 522 }, { "epoch": 0.714175983613553, - "grad_norm": 0.12469083357493645, + "grad_norm": 0.25367503570810757, "learning_rate": 4.578994421877645e-06, - "loss": 0.5092, - "mean_token_accuracy": 0.8374194187947644, + "loss": 0.4096, + "mean_token_accuracy": 0.8606353223418006, "step": 523 }, { "epoch": 0.7155415208671162, - "grad_norm": 0.11601940964140946, + "grad_norm": 0.21964433917606951, "learning_rate": 4.538935949355623e-06, - "loss": 0.5117, - "mean_token_accuracy": 0.8340426167281046, + "loss": 0.4115, + "mean_token_accuracy": 0.8579058023067518, "step": 524 }, { "epoch": 0.7169070581206793, - "grad_norm": 0.1312609233810232, + "grad_norm": 0.23194379718009556, "learning_rate": 4.499001963948929e-06, - "loss": 0.5243, - "mean_token_accuracy": 0.8327581090765012, + "loss": 0.4226, + "mean_token_accuracy": 0.8567179649132983, "step": 525 }, { "epoch": 0.7182725953742426, - "grad_norm": 0.1278781251310711, + "grad_norm": 0.23794903701170841, "learning_rate": 4.45919337596856e-06, - "loss": 0.5025, - "mean_token_accuracy": 0.8387631347525201, + "loss": 0.4044, + "mean_token_accuracy": 0.8625243196220369, "step": 526 }, { "epoch": 0.7196381326278057, - "grad_norm": 0.1150881960845902, + "grad_norm": 0.24368825515761053, "learning_rate": 4.41951109286703e-06, - "loss": 0.4903, - "mean_token_accuracy": 0.8417572157644675, + "loss": 0.3946, + "mean_token_accuracy": 0.8651203225887669, "step": 527 }, { "epoch": 0.721003669881369, - "grad_norm": 0.12004857896825291, + "grad_norm": 0.2753308025130241, "learning_rate": 4.379956019217675e-06, - "loss": 0.5167, - "mean_token_accuracy": 0.8364056743504106, + "loss": 0.415, + "mean_token_accuracy": 0.8604877721998891, "step": 528 }, { "epoch": 0.7223692071349321, - "grad_norm": 0.11349733440207685, + "grad_norm": 0.21688742696840135, "learning_rate": 4.3405290566940475e-06, - "loss": 0.4934, - "mean_token_accuracy": 0.8410076792691503, + "loss": 0.397, + "mean_token_accuracy": 0.8641772641669326, "step": 529 }, { "epoch": 0.7237347443884954, - "grad_norm": 0.11609813402949515, + "grad_norm": 0.21378940665861854, "learning_rate": 4.301231104049359e-06, - "loss": 0.5189, - "mean_token_accuracy": 0.8346933566004181, + "loss": 0.4174, + "mean_token_accuracy": 0.8589884247304818, "step": 530 }, { "epoch": 0.7251002816420585, - "grad_norm": 0.11733115685903821, + "grad_norm": 0.2014478430774182, "learning_rate": 4.262063057095978e-06, - "loss": 0.4935, - "mean_token_accuracy": 0.8408121825971543, + "loss": 0.3961, + "mean_token_accuracy": 0.864822926034779, "step": 531 }, { "epoch": 0.7264658188956218, - "grad_norm": 0.1130218697819071, + "grad_norm": 0.21961573395687345, "learning_rate": 4.2230258086850375e-06, - "loss": 0.4992, - "mean_token_accuracy": 0.837098869602991, + "loss": 0.4018, + "mean_token_accuracy": 0.8609590482386245, "step": 532 }, { "epoch": 0.7278313561491849, - "grad_norm": 0.12332290379567455, + "grad_norm": 0.22346096554308978, "learning_rate": 4.184120248686048e-06, - "loss": 0.5362, - "mean_token_accuracy": 0.8305396581867253, + "loss": 0.4296, + "mean_token_accuracy": 0.8551864724213167, "step": 533 }, { "epoch": 0.7291968934027482, - "grad_norm": 0.11407042850032323, + "grad_norm": 0.2324113039004406, "learning_rate": 4.145347263966646e-06, - "loss": 0.5109, - "mean_token_accuracy": 0.8375638538964674, + "loss": 0.4106, + "mean_token_accuracy": 0.8610600169630592, "step": 534 }, { "epoch": 0.7305624306563113, - "grad_norm": 0.11805182230417788, + "grad_norm": 0.21693513782961235, "learning_rate": 4.106707738372357e-06, - "loss": 0.5033, - "mean_token_accuracy": 0.838315691202731, + "loss": 0.4047, + "mean_token_accuracy": 0.8620982081151527, "step": 535 }, { "epoch": 0.7319279679098746, - "grad_norm": 0.12246855529675972, + "grad_norm": 0.24610715663770288, "learning_rate": 4.0682025527064486e-06, - "loss": 0.5142, - "mean_token_accuracy": 0.8349068666250341, + "loss": 0.4142, + "mean_token_accuracy": 0.8586329074626934, "step": 536 }, { "epoch": 0.7332935051634377, - "grad_norm": 0.11780385140674156, + "grad_norm": 0.2496537037626066, "learning_rate": 4.029832584709864e-06, - "loss": 0.5132, - "mean_token_accuracy": 0.8352162915246271, + "loss": 0.4133, + "mean_token_accuracy": 0.8591482955360918, "step": 537 }, { "epoch": 0.734659042417001, - "grad_norm": 0.11685168524423908, + "grad_norm": 0.25290330143622763, "learning_rate": 3.991598709041196e-06, - "loss": 0.5202, - "mean_token_accuracy": 0.8333136886203487, + "loss": 0.4185, + "mean_token_accuracy": 0.8575748615139153, "step": 538 }, { "epoch": 0.7360245796705641, - "grad_norm": 0.11778517621846066, + "grad_norm": 0.23375530633596542, "learning_rate": 3.953501797256768e-06, - "loss": 0.5086, - "mean_token_accuracy": 0.8367270899445329, + "loss": 0.4082, + "mean_token_accuracy": 0.8608288259179717, "step": 539 }, { "epoch": 0.7373901169241274, - "grad_norm": 0.11976766505704459, + "grad_norm": 0.23750273636894062, "learning_rate": 3.915542717790759e-06, - "loss": 0.5083, - "mean_token_accuracy": 0.8365840097036213, + "loss": 0.4087, + "mean_token_accuracy": 0.8604965692097658, "step": 540 }, { "epoch": 0.7387556541776905, - "grad_norm": 0.11992148667466371, + "grad_norm": 0.22536670381141075, "learning_rate": 3.877722335935394e-06, - "loss": 0.5343, - "mean_token_accuracy": 0.8284290832782362, + "loss": 0.4274, + "mean_token_accuracy": 0.8535887513090156, "step": 541 }, { "epoch": 0.7401211914312538, - "grad_norm": 0.11741127761439236, + "grad_norm": 0.2337249133077971, "learning_rate": 3.840041513821243e-06, - "loss": 0.5273, - "mean_token_accuracy": 0.8298143190890194, + "loss": 0.4252, + "mean_token_accuracy": 0.8543320104591253, "step": 542 }, { "epoch": 0.7414867286848169, - "grad_norm": 0.12387797192057075, + "grad_norm": 0.23006457934706245, "learning_rate": 3.802501110397553e-06, - "loss": 0.516, - "mean_token_accuracy": 0.8357361078863811, + "loss": 0.4154, + "mean_token_accuracy": 0.8595805110377353, "step": 543 }, { "epoch": 0.7428522659383802, - "grad_norm": 0.11301339603736248, + "grad_norm": 0.2264725327560074, "learning_rate": 3.7651019814126656e-06, - "loss": 0.4947, - "mean_token_accuracy": 0.840862224523019, + "loss": 0.3978, + "mean_token_accuracy": 0.8640107757543986, "step": 544 }, { "epoch": 0.7442178031919433, - "grad_norm": 0.11192991923018139, + "grad_norm": 0.21662115139126908, "learning_rate": 3.727844979394526e-06, - "loss": 0.5198, - "mean_token_accuracy": 0.83368486827743, + "loss": 0.4171, + "mean_token_accuracy": 0.8583177721469002, "step": 545 }, { "epoch": 0.7455833404455066, - "grad_norm": 0.11669088292147, + "grad_norm": 0.24171854514556812, "learning_rate": 3.6907309536312276e-06, - "loss": 0.5188, - "mean_token_accuracy": 0.8319960843202355, + "loss": 0.4157, + "mean_token_accuracy": 0.8568488785351885, "step": 546 }, { "epoch": 0.7469488776990697, - "grad_norm": 0.11266193325419242, + "grad_norm": 0.20105071964248444, "learning_rate": 3.6537607501516716e-06, - "loss": 0.4941, - "mean_token_accuracy": 0.8420980395842147, + "loss": 0.3969, + "mean_token_accuracy": 0.8654952584767601, "step": 547 }, { "epoch": 0.748314414952633, - "grad_norm": 0.12165538127524424, + "grad_norm": 0.22792825494280627, "learning_rate": 3.616935211706275e-06, - "loss": 0.4886, - "mean_token_accuracy": 0.8422889851073869, + "loss": 0.3932, + "mean_token_accuracy": 0.8653115392045422, "step": 548 }, { "epoch": 0.7496799522061961, - "grad_norm": 0.11965371171863708, + "grad_norm": 0.2314969835671945, "learning_rate": 3.5802551777477477e-06, - "loss": 0.491, - "mean_token_accuracy": 0.8416745411101729, + "loss": 0.3936, + "mean_token_accuracy": 0.8653473275464092, "step": 549 }, { "epoch": 0.7510454894597594, - "grad_norm": 0.11909727322352157, + "grad_norm": 0.21855230450778143, "learning_rate": 3.543721484411976e-06, - "loss": 0.5218, - "mean_token_accuracy": 0.8325199317383627, + "loss": 0.4186, + "mean_token_accuracy": 0.8572386847718454, "step": 550 }, { "epoch": 0.7524110267133225, - "grad_norm": 0.11803640716315746, + "grad_norm": 0.2179108335195832, "learning_rate": 3.5073349644989563e-06, - "loss": 0.4898, - "mean_token_accuracy": 0.841949507324, + "loss": 0.3948, + "mean_token_accuracy": 0.8645206681557961, "step": 551 }, { "epoch": 0.7537765639668857, - "grad_norm": 0.12236059543804083, + "grad_norm": 0.2216215829360909, "learning_rate": 3.4710964474537967e-06, - "loss": 0.4953, - "mean_token_accuracy": 0.8408841343249858, + "loss": 0.3988, + "mean_token_accuracy": 0.8638077505445968, "step": 552 }, { "epoch": 0.7551421012204489, - "grad_norm": 0.1248562001171319, + "grad_norm": 0.2693768802921377, "learning_rate": 3.435006759347835e-06, - "loss": 0.5167, - "mean_token_accuracy": 0.8332227001168905, + "loss": 0.4142, + "mean_token_accuracy": 0.8578324828930424, "step": 553 }, { "epoch": 0.7565076384740121, - "grad_norm": 0.11276895581435083, + "grad_norm": 0.2226472320558841, "learning_rate": 3.3990667228597816e-06, - "loss": 0.5016, - "mean_token_accuracy": 0.8375532053821566, + "loss": 0.4028, + "mean_token_accuracy": 0.8616278351864007, "step": 554 }, { "epoch": 0.7578731757275753, - "grad_norm": 0.11456688627866897, + "grad_norm": 0.23046863738461842, "learning_rate": 3.3632771572569878e-06, - "loss": 0.511, - "mean_token_accuracy": 0.8342648821170481, + "loss": 0.4096, + "mean_token_accuracy": 0.858772543145848, "step": 555 }, { "epoch": 0.7592387129811385, - "grad_norm": 0.11618184494474865, + "grad_norm": 0.21268282367594163, "learning_rate": 3.3276388783767644e-06, - "loss": 0.5131, - "mean_token_accuracy": 0.8354638851357399, + "loss": 0.4126, + "mean_token_accuracy": 0.8590454775080085, "step": 556 }, { "epoch": 0.7606042502347017, - "grad_norm": 0.11756884620112286, + "grad_norm": 0.2258472656546357, "learning_rate": 3.292152698607768e-06, - "loss": 0.4796, - "mean_token_accuracy": 0.8422441403680243, + "loss": 0.3868, + "mean_token_accuracy": 0.8656557706456838, "step": 557 }, { "epoch": 0.7619697874882649, - "grad_norm": 0.12189085569173493, + "grad_norm": 0.21882486076601618, "learning_rate": 3.256819426871507e-06, - "loss": 0.5089, - "mean_token_accuracy": 0.8346661572016196, + "loss": 0.4104, + "mean_token_accuracy": 0.8588001715592681, "step": 558 }, { "epoch": 0.7633353247418281, - "grad_norm": 0.11761348875726384, + "grad_norm": 0.20092513277692103, "learning_rate": 3.221639868603893e-06, - "loss": 0.5009, - "mean_token_accuracy": 0.8379905760143328, + "loss": 0.4017, + "mean_token_accuracy": 0.8617123529703908, "step": 559 }, { "epoch": 0.7647008619953913, - "grad_norm": 0.11558888224910148, + "grad_norm": 0.2121347752460375, "learning_rate": 3.1866148257368666e-06, - "loss": 0.4961, - "mean_token_accuracy": 0.8398296939485912, + "loss": 0.3975, + "mean_token_accuracy": 0.8637410178015515, "step": 560 }, { "epoch": 0.7660663992489545, - "grad_norm": 0.11961163553792255, + "grad_norm": 0.22278871306746495, "learning_rate": 3.15174509668014e-06, - "loss": 0.501, - "mean_token_accuracy": 0.8373589623309448, + "loss": 0.4025, + "mean_token_accuracy": 0.8612379444311294, "step": 561 }, { "epoch": 0.7674319365025177, - "grad_norm": 0.1139745464836453, + "grad_norm": 0.23213748039939272, "learning_rate": 3.117031476302975e-06, - "loss": 0.5045, - "mean_token_accuracy": 0.8358571079115035, + "loss": 0.4055, + "mean_token_accuracy": 0.8602274083124468, "step": 562 }, { "epoch": 0.7687974737560809, - "grad_norm": 0.11419308684750709, + "grad_norm": 0.2254901351758214, "learning_rate": 3.082474755916084e-06, - "loss": 0.508, - "mean_token_accuracy": 0.8351294121573015, + "loss": 0.4075, + "mean_token_accuracy": 0.8596426308589397, "step": 563 }, { "epoch": 0.7701630110096441, - "grad_norm": 0.11012799550012738, + "grad_norm": 0.2005034006194305, "learning_rate": 3.0480757232535773e-06, - "loss": 0.4957, - "mean_token_accuracy": 0.8399643159918971, + "loss": 0.3981, + "mean_token_accuracy": 0.8630627269518893, "step": 564 }, { "epoch": 0.7715285482632073, - "grad_norm": 0.1129810067399378, + "grad_norm": 0.19345269846448648, "learning_rate": 3.0138351624550165e-06, - "loss": 0.5029, - "mean_token_accuracy": 0.8376797607623926, + "loss": 0.4049, + "mean_token_accuracy": 0.8611821074357777, "step": 565 }, { "epoch": 0.7728940855167705, - "grad_norm": 0.10982581748446492, + "grad_norm": 0.20062995179443605, "learning_rate": 2.9797538540475223e-06, - "loss": 0.4947, - "mean_token_accuracy": 0.8402974842155376, + "loss": 0.3988, + "mean_token_accuracy": 0.8634990578142336, "step": 566 }, { "epoch": 0.7742596227703337, - "grad_norm": 0.13479390205133246, + "grad_norm": 0.23164980231281976, "learning_rate": 2.945832574928006e-06, - "loss": 0.5235, - "mean_token_accuracy": 0.8315519425090838, + "loss": 0.4215, + "mean_token_accuracy": 0.8557358468375269, "step": 567 }, { "epoch": 0.7756251600238969, - "grad_norm": 0.11787918748373294, + "grad_norm": 0.20342056199797945, "learning_rate": 2.9120720983454465e-06, - "loss": 0.511, - "mean_token_accuracy": 0.8331791928656408, + "loss": 0.4121, + "mean_token_accuracy": 0.8570414758839693, "step": 568 }, { "epoch": 0.7769906972774601, - "grad_norm": 0.11736892948341286, + "grad_norm": 0.22910941122839265, "learning_rate": 2.8784731938832556e-06, - "loss": 0.5084, - "mean_token_accuracy": 0.8353107062002897, + "loss": 0.4087, + "mean_token_accuracy": 0.8596079874910577, "step": 569 }, { "epoch": 0.7783562345310233, - "grad_norm": 0.11262788968600397, + "grad_norm": 0.21416157641581723, "learning_rate": 2.845036627441755e-06, - "loss": 0.4792, - "mean_token_accuracy": 0.8460800279044619, + "loss": 0.3833, + "mean_token_accuracy": 0.8691450521802337, "step": 570 }, { "epoch": 0.7797217717845865, - "grad_norm": 0.11688222818165418, + "grad_norm": 0.1977219313107081, "learning_rate": 2.8117631612207084e-06, - "loss": 0.5132, - "mean_token_accuracy": 0.8374022325550948, + "loss": 0.4121, + "mean_token_accuracy": 0.8610763435801025, "step": 571 }, { "epoch": 0.7810873090381497, - "grad_norm": 0.11460785824882291, + "grad_norm": 0.20321588988831457, "learning_rate": 2.778653553701932e-06, - "loss": 0.5071, - "mean_token_accuracy": 0.8364371647164465, + "loss": 0.4057, + "mean_token_accuracy": 0.8604487017005163, "step": 572 }, { "epoch": 0.7824528462917129, - "grad_norm": 0.11674788128613457, + "grad_norm": 0.21685215914126496, "learning_rate": 2.745708559632032e-06, - "loss": 0.493, - "mean_token_accuracy": 0.8395484350248054, + "loss": 0.3965, + "mean_token_accuracy": 0.8631075593834338, "step": 573 }, { "epoch": 0.7838183835452761, - "grad_norm": 0.10973525866915593, + "grad_norm": 0.21247170342394922, "learning_rate": 2.7129289300051788e-06, - "loss": 0.5074, - "mean_token_accuracy": 0.836805511584643, + "loss": 0.4066, + "mean_token_accuracy": 0.8607861090034046, "step": 574 }, { "epoch": 0.7851839207988393, - "grad_norm": 0.11898153388966362, + "grad_norm": 0.22155579546490167, "learning_rate": 2.6803154120460007e-06, - "loss": 0.5083, - "mean_token_accuracy": 0.8379666968188276, + "loss": 0.4072, + "mean_token_accuracy": 0.8621495738744454, "step": 575 }, { "epoch": 0.7865494580524025, - "grad_norm": 0.11711375259726986, + "grad_norm": 0.20936523040531713, "learning_rate": 2.647868749192536e-06, - "loss": 0.5242, - "mean_token_accuracy": 0.830850717305942, + "loss": 0.4187, + "mean_token_accuracy": 0.8561885820547165, "step": 576 }, { "epoch": 0.7879149953059656, - "grad_norm": 0.11840694396718332, + "grad_norm": 0.2169635727623937, "learning_rate": 2.6155896810793036e-06, - "loss": 0.5152, - "mean_token_accuracy": 0.8346155102504653, + "loss": 0.4118, + "mean_token_accuracy": 0.8589148130153295, "step": 577 }, { "epoch": 0.7892805325595289, - "grad_norm": 0.11538532654552464, + "grad_norm": 0.20681767217525526, "learning_rate": 2.5834789435204245e-06, - "loss": 0.5124, - "mean_token_accuracy": 0.8349452790087847, + "loss": 0.4088, + "mean_token_accuracy": 0.8606342267092991, "step": 578 }, { "epoch": 0.790646069813092, - "grad_norm": 0.11660606783702616, + "grad_norm": 0.2213528976805314, "learning_rate": 2.5515372684928687e-06, - "loss": 0.525, - "mean_token_accuracy": 0.8306076418299736, + "loss": 0.4222, + "mean_token_accuracy": 0.8550210063940413, "step": 579 }, { "epoch": 0.7920116070666553, - "grad_norm": 0.12059875147184261, + "grad_norm": 0.22149304881479331, "learning_rate": 2.5197653841197546e-06, - "loss": 0.5105, - "mean_token_accuracy": 0.834809458566115, + "loss": 0.4114, + "mean_token_accuracy": 0.8591611937568427, "step": 580 }, { "epoch": 0.7933771443202184, - "grad_norm": 0.11347992046856244, + "grad_norm": 0.19889209166062613, "learning_rate": 2.48816401465375e-06, - "loss": 0.5061, - "mean_token_accuracy": 0.8368637398560137, + "loss": 0.4057, + "mean_token_accuracy": 0.861537611573371, "step": 581 }, { "epoch": 0.7947426815737817, - "grad_norm": 0.12036923892295148, + "grad_norm": 0.24049345793108362, "learning_rate": 2.4567338804605756e-06, - "loss": 0.4976, - "mean_token_accuracy": 0.8388082379825385, + "loss": 0.4, + "mean_token_accuracy": 0.86237969255477, "step": 582 }, { "epoch": 0.7961082188273448, - "grad_norm": 0.12069961503530723, + "grad_norm": 0.23885430272558048, "learning_rate": 2.425475698002577e-06, - "loss": 0.5178, - "mean_token_accuracy": 0.8335224317901346, + "loss": 0.4154, + "mean_token_accuracy": 0.8575069291832286, "step": 583 }, { "epoch": 0.7974737560809081, - "grad_norm": 0.11639077541880397, + "grad_norm": 0.22824330369698562, "learning_rate": 2.394390179822382e-06, - "loss": 0.5206, - "mean_token_accuracy": 0.8330568081118227, + "loss": 0.4182, + "mean_token_accuracy": 0.8576008710768586, "step": 584 }, { "epoch": 0.7988392933344712, - "grad_norm": 0.1196751424663686, + "grad_norm": 0.21974915679821652, "learning_rate": 2.3634780345266805e-06, - "loss": 0.4966, - "mean_token_accuracy": 0.8404542399892221, + "loss": 0.3963, + "mean_token_accuracy": 0.8647796993722388, "step": 585 }, { "epoch": 0.8002048305880345, - "grad_norm": 0.12162568785532, + "grad_norm": 0.2310148673403252, "learning_rate": 2.332739966770048e-06, - "loss": 0.4986, - "mean_token_accuracy": 0.8371755384062571, + "loss": 0.4005, + "mean_token_accuracy": 0.8606528012118717, "step": 586 }, { "epoch": 0.8015703678415976, - "grad_norm": 0.11988395331929189, + "grad_norm": 0.21268357727111933, "learning_rate": 2.3021766772388986e-06, - "loss": 0.5093, - "mean_token_accuracy": 0.8352646608742899, + "loss": 0.4104, + "mean_token_accuracy": 0.8596851429925462, "step": 587 }, { "epoch": 0.8029359050951609, - "grad_norm": 0.11882054644862249, + "grad_norm": 0.22436026449039995, "learning_rate": 2.271788862635513e-06, - "loss": 0.517, - "mean_token_accuracy": 0.8330110392874214, + "loss": 0.415, + "mean_token_accuracy": 0.8579554313395271, "step": 588 }, { "epoch": 0.804301442348724, - "grad_norm": 0.11682688459158977, + "grad_norm": 0.2401381634931562, "learning_rate": 2.2415772156621387e-06, - "loss": 0.5316, - "mean_token_accuracy": 0.827527468841931, + "loss": 0.4281, + "mean_token_accuracy": 0.8525374331069293, "step": 589 }, { "epoch": 0.8056669796022873, - "grad_norm": 0.11445211984808887, + "grad_norm": 0.23642180241330024, "learning_rate": 2.211542425005223e-06, - "loss": 0.5317, - "mean_token_accuracy": 0.8296895731511927, + "loss": 0.427, + "mean_token_accuracy": 0.8546586111147729, "step": 590 }, { "epoch": 0.8070325168558504, - "grad_norm": 0.11599146566146142, + "grad_norm": 0.2115384062221628, "learning_rate": 2.1816851753197023e-06, - "loss": 0.5127, - "mean_token_accuracy": 0.8346026160724543, + "loss": 0.4114, + "mean_token_accuracy": 0.8595850270091632, "step": 591 }, { "epoch": 0.8083980541094137, - "grad_norm": 0.11748830758899777, + "grad_norm": 0.21400197301419835, "learning_rate": 2.1520061472133903e-06, - "loss": 0.519, - "mean_token_accuracy": 0.8355540659862308, + "loss": 0.4163, + "mean_token_accuracy": 0.8604726451795677, "step": 592 }, { "epoch": 0.8097635913629768, - "grad_norm": 0.11171334221970475, + "grad_norm": 0.2017117241684196, "learning_rate": 2.1225060172314773e-06, - "loss": 0.4964, - "mean_token_accuracy": 0.8396549091980724, + "loss": 0.3995, + "mean_token_accuracy": 0.8632549469473196, "step": 593 }, { "epoch": 0.8111291286165401, - "grad_norm": 0.11063090997610646, + "grad_norm": 0.19894011888129304, "learning_rate": 2.0931854578410904e-06, - "loss": 0.4986, - "mean_token_accuracy": 0.8407643654980306, + "loss": 0.3991, + "mean_token_accuracy": 0.8642028385916231, "step": 594 }, { "epoch": 0.8124946658701032, - "grad_norm": 0.11729739862861373, + "grad_norm": 0.2298631393329404, "learning_rate": 2.064045137415982e-06, - "loss": 0.5177, - "mean_token_accuracy": 0.833629232409927, + "loss": 0.4166, + "mean_token_accuracy": 0.8576577016298331, "step": 595 }, { "epoch": 0.8138602031236665, - "grad_norm": 0.11559783542365167, + "grad_norm": 0.19264903302284703, "learning_rate": 2.0350857202212883e-06, - "loss": 0.5016, - "mean_token_accuracy": 0.839657608725398, + "loss": 0.4031, + "mean_token_accuracy": 0.8630792968512606, "step": 596 }, { "epoch": 0.8152257403772296, - "grad_norm": 0.11406110487413934, + "grad_norm": 0.24074756197143313, "learning_rate": 2.0063078663983716e-06, - "loss": 0.4794, - "mean_token_accuracy": 0.8443633389839449, + "loss": 0.3853, + "mean_token_accuracy": 0.867487619049804, "step": 597 }, { "epoch": 0.8165912776307929, - "grad_norm": 0.11979346631518473, + "grad_norm": 0.22693604058548678, "learning_rate": 1.977712231949799e-06, - "loss": 0.5015, - "mean_token_accuracy": 0.8373713200653236, + "loss": 0.4025, + "mean_token_accuracy": 0.8613917192076461, "step": 598 }, { "epoch": 0.817956814884356, - "grad_norm": 0.11558793774565151, + "grad_norm": 0.2158385478614588, "learning_rate": 1.9492994687243715e-06, - "loss": 0.516, - "mean_token_accuracy": 0.832604701143526, + "loss": 0.4148, + "mean_token_accuracy": 0.8566573413371225, "step": 599 }, { "epoch": 0.8193223521379193, - "grad_norm": 0.11998008263108416, + "grad_norm": 0.2531283651455049, "learning_rate": 1.9210702244022616e-06, - "loss": 0.526, - "mean_token_accuracy": 0.8303269585033245, + "loss": 0.4223, + "mean_token_accuracy": 0.8552704816161152, "step": 600 }, { "epoch": 0.8206878893914825, - "grad_norm": 0.11632873200164885, + "grad_norm": 0.22124530534828935, "learning_rate": 1.8930251424802648e-06, - "loss": 0.5209, - "mean_token_accuracy": 0.8320041006028278, + "loss": 0.4173, + "mean_token_accuracy": 0.857422562323455, "step": 601 }, { "epoch": 0.8220534266450457, - "grad_norm": 0.12308333277590386, + "grad_norm": 0.22916123571012667, "learning_rate": 1.8651648622571128e-06, - "loss": 0.5086, - "mean_token_accuracy": 0.8355395950991652, + "loss": 0.4093, + "mean_token_accuracy": 0.8592705248451191, "step": 602 }, { "epoch": 0.8234189638986089, - "grad_norm": 0.11697523667295892, + "grad_norm": 0.23653504780930373, "learning_rate": 1.8374900188189172e-06, - "loss": 0.5111, - "mean_token_accuracy": 0.8362037328117217, + "loss": 0.4115, + "mean_token_accuracy": 0.859768333670514, "step": 603 }, { "epoch": 0.8247845011521721, - "grad_norm": 0.11564007588567202, + "grad_norm": 0.2335856356579208, "learning_rate": 1.8100012430246838e-06, - "loss": 0.4971, - "mean_token_accuracy": 0.8388235219823784, + "loss": 0.3975, + "mean_token_accuracy": 0.8634662846156729, "step": 604 }, { "epoch": 0.8261500384057353, - "grad_norm": 0.11214233024693822, + "grad_norm": 0.25602917462585195, "learning_rate": 1.7826991614919264e-06, - "loss": 0.4958, - "mean_token_accuracy": 0.8407072686009562, + "loss": 0.399, + "mean_token_accuracy": 0.8641089744244426, "step": 605 }, { "epoch": 0.8275155756592985, - "grad_norm": 0.11635824570211564, + "grad_norm": 0.2764089716972941, "learning_rate": 1.7555843965823992e-06, - "loss": 0.518, - "mean_token_accuracy": 0.8344412049911978, + "loss": 0.4155, + "mean_token_accuracy": 0.8591620612370429, "step": 606 }, { "epoch": 0.8288811129128617, - "grad_norm": 0.11597654730359575, + "grad_norm": 0.22584056903219576, "learning_rate": 1.728657566387888e-06, - "loss": 0.4852, - "mean_token_accuracy": 0.8426503994278581, + "loss": 0.3899, + "mean_token_accuracy": 0.8659440792321365, "step": 607 }, { "epoch": 0.8302466501664248, - "grad_norm": 0.11870412986446237, + "grad_norm": 0.20560426304959245, "learning_rate": 1.7019192847161425e-06, - "loss": 0.5004, - "mean_token_accuracy": 0.8408751527420388, + "loss": 0.4031, + "mean_token_accuracy": 0.8641318011003372, "step": 608 }, { "epoch": 0.8316121874199881, - "grad_norm": 0.11528641611496719, + "grad_norm": 0.2153106481024899, "learning_rate": 1.6753701610768724e-06, - "loss": 0.4999, - "mean_token_accuracy": 0.8378618191266598, + "loss": 0.4017, + "mean_token_accuracy": 0.8615005117867286, "step": 609 }, { "epoch": 0.8329777246735512, - "grad_norm": 0.1177831931060255, + "grad_norm": 0.21220319167956136, "learning_rate": 1.6490108006678495e-06, - "loss": 0.5288, - "mean_token_accuracy": 0.8313870578742857, + "loss": 0.4252, + "mean_token_accuracy": 0.8564356615328043, "step": 610 }, { "epoch": 0.8343432619271145, - "grad_norm": 0.11470573263848499, + "grad_norm": 0.22552058407128117, "learning_rate": 1.6228418043611227e-06, - "loss": 0.5202, - "mean_token_accuracy": 0.8344051779654352, + "loss": 0.4168, + "mean_token_accuracy": 0.8588175846725294, "step": 611 }, { "epoch": 0.8357087991806776, - "grad_norm": 0.11129408661129161, + "grad_norm": 0.24876240121034418, "learning_rate": 1.5968637686893186e-06, - "loss": 0.5099, - "mean_token_accuracy": 0.8349943980749097, + "loss": 0.4094, + "mean_token_accuracy": 0.8588914171490348, "step": 612 }, { "epoch": 0.8370743364342409, - "grad_norm": 0.11134000261717522, + "grad_norm": 0.21709497476785303, "learning_rate": 1.57107728583203e-06, - "loss": 0.4642, - "mean_token_accuracy": 0.8485976867844193, + "loss": 0.3742, + "mean_token_accuracy": 0.8702544465648172, "step": 613 }, { "epoch": 0.838439873687804, - "grad_norm": 0.11529812819046667, + "grad_norm": 0.2300739934314909, "learning_rate": 1.5454829436023411e-06, - "loss": 0.5001, - "mean_token_accuracy": 0.8390528996851931, + "loss": 0.4011, + "mean_token_accuracy": 0.8626383013253395, "step": 614 }, { "epoch": 0.8398054109413673, - "grad_norm": 0.11316916607937834, + "grad_norm": 0.2233661831188188, "learning_rate": 1.5200813254334013e-06, - "loss": 0.4995, - "mean_token_accuracy": 0.8376760897431939, + "loss": 0.4013, + "mean_token_accuracy": 0.8617884600134901, "step": 615 }, { "epoch": 0.8411709481949304, - "grad_norm": 0.11882350595565981, + "grad_norm": 0.2352272063568746, "learning_rate": 1.4948730103651498e-06, - "loss": 0.4894, - "mean_token_accuracy": 0.8412361716603581, + "loss": 0.3927, + "mean_token_accuracy": 0.8648909153839965, "step": 616 }, { "epoch": 0.8425364854484937, - "grad_norm": 0.10982632788718007, + "grad_norm": 0.2040512264971376, "learning_rate": 1.4698585730311e-06, - "loss": 0.5096, - "mean_token_accuracy": 0.8361300345687833, + "loss": 0.409, + "mean_token_accuracy": 0.8602341351681331, "step": 617 }, { "epoch": 0.8439020227020568, - "grad_norm": 0.11621170622862408, + "grad_norm": 0.2318894554427145, "learning_rate": 1.445038583645243e-06, - "loss": 0.5181, - "mean_token_accuracy": 0.835967541618596, + "loss": 0.4161, + "mean_token_accuracy": 0.8594872435807032, "step": 618 }, { "epoch": 0.8452675599556201, - "grad_norm": 0.1150010070222551, + "grad_norm": 0.20445356255956093, "learning_rate": 1.4204136079890585e-06, - "loss": 0.5029, - "mean_token_accuracy": 0.8400967106993116, + "loss": 0.4031, + "mean_token_accuracy": 0.8638401437613202, "step": 619 }, { "epoch": 0.8466330972091832, - "grad_norm": 0.10915712480301275, + "grad_norm": 0.21641039619872623, "learning_rate": 1.3959842073986085e-06, - "loss": 0.4912, - "mean_token_accuracy": 0.8416240033706907, + "loss": 0.3941, + "mean_token_accuracy": 0.8646075956314934, "step": 620 }, { "epoch": 0.8479986344627465, - "grad_norm": 0.11201163464497914, + "grad_norm": 0.20773725495938888, "learning_rate": 1.3717509387517393e-06, - "loss": 0.5113, - "mean_token_accuracy": 0.8350683521842426, + "loss": 0.4082, + "mean_token_accuracy": 0.860045570440718, "step": 621 }, { "epoch": 0.8493641717163096, - "grad_norm": 0.1099077063188645, + "grad_norm": 0.21271505922050996, "learning_rate": 1.3477143544553994e-06, - "loss": 0.5072, - "mean_token_accuracy": 0.8361682952111748, + "loss": 0.4082, + "mean_token_accuracy": 0.8599915463398872, "step": 622 }, { "epoch": 0.8507297089698729, - "grad_norm": 0.1177480852130544, + "grad_norm": 0.22401954459630313, "learning_rate": 1.3238750024330338e-06, - "loss": 0.5216, - "mean_token_accuracy": 0.8311884203044047, + "loss": 0.4192, + "mean_token_accuracy": 0.8563828512549275, "step": 623 }, { "epoch": 0.852095246223436, - "grad_norm": 0.11298940731199696, + "grad_norm": 0.22260874162883426, "learning_rate": 1.300233426112103e-06, - "loss": 0.5022, - "mean_token_accuracy": 0.8376622325065157, + "loss": 0.4048, + "mean_token_accuracy": 0.8611301206735221, "step": 624 }, { "epoch": 0.8534607834769993, - "grad_norm": 0.14190492793596424, + "grad_norm": 0.21152827385871018, "learning_rate": 1.2767901644116943e-06, - "loss": 0.5179, - "mean_token_accuracy": 0.8356995508191599, + "loss": 0.4146, + "mean_token_accuracy": 0.8595371254035457, "step": 625 }, { "epoch": 0.8548263207305624, - "grad_norm": 0.11512453699797792, + "grad_norm": 0.22819406153754265, "learning_rate": 1.2535457517302262e-06, - "loss": 0.507, - "mean_token_accuracy": 0.8360134229515888, + "loss": 0.4084, + "mean_token_accuracy": 0.860028563771859, "step": 626 }, { "epoch": 0.8561918579841257, - "grad_norm": 0.10742493165304996, + "grad_norm": 0.20005533487863839, "learning_rate": 1.2305007179332851e-06, - "loss": 0.5026, - "mean_token_accuracy": 0.8383856018309209, + "loss": 0.4017, + "mean_token_accuracy": 0.8622309813185363, "step": 627 }, { "epoch": 0.8575573952376888, - "grad_norm": 0.10790894270685043, + "grad_norm": 0.20176762634176584, "learning_rate": 1.2076555883415342e-06, - "loss": 0.5348, - "mean_token_accuracy": 0.828341897981255, + "loss": 0.4277, + "mean_token_accuracy": 0.8536327333990061, "step": 628 }, { "epoch": 0.8589229324912521, - "grad_norm": 0.11139506266424495, + "grad_norm": 0.2011241526651703, "learning_rate": 1.1850108837187336e-06, - "loss": 0.5111, - "mean_token_accuracy": 0.8342853594255236, + "loss": 0.4086, + "mean_token_accuracy": 0.8592316098999795, "step": 629 }, { "epoch": 0.8602884697448152, - "grad_norm": 0.11126208800725737, + "grad_norm": 0.22681886387157124, "learning_rate": 1.1625671202598875e-06, - "loss": 0.517, - "mean_token_accuracy": 0.8347189488130645, + "loss": 0.4146, + "mean_token_accuracy": 0.8590380670131639, "step": 630 }, { "epoch": 0.8616540069983785, - "grad_norm": 0.11386011889991697, + "grad_norm": 0.21912007754709245, "learning_rate": 1.1403248095794629e-06, - "loss": 0.4974, - "mean_token_accuracy": 0.8381483169472407, + "loss": 0.3988, + "mean_token_accuracy": 0.862137042972187, "step": 631 }, { "epoch": 0.8630195442519416, - "grad_norm": 0.11214346488745128, + "grad_norm": 0.1991013202311376, "learning_rate": 1.1182844586997266e-06, - "loss": 0.5089, - "mean_token_accuracy": 0.837121430727933, + "loss": 0.4096, + "mean_token_accuracy": 0.8605257073793695, "step": 632 }, { "epoch": 0.8643850815055049, - "grad_norm": 0.11981293100575567, + "grad_norm": 0.23467672143416735, "learning_rate": 1.0964465700391979e-06, - "loss": 0.5126, - "mean_token_accuracy": 0.8352447284170138, + "loss": 0.4108, + "mean_token_accuracy": 0.8596458375918582, "step": 633 }, { "epoch": 0.865750618759068, - "grad_norm": 0.11390812887343923, + "grad_norm": 0.21912392965364608, "learning_rate": 1.074811641401189e-06, - "loss": 0.5328, - "mean_token_accuracy": 0.8305933498749366, + "loss": 0.4262, + "mean_token_accuracy": 0.8559964604503356, "step": 634 }, { "epoch": 0.8671161560126313, - "grad_norm": 0.11186386777882368, + "grad_norm": 0.20615463939848364, "learning_rate": 1.0533801659624531e-06, - "loss": 0.5207, - "mean_token_accuracy": 0.8324386068365852, + "loss": 0.4204, + "mean_token_accuracy": 0.8560936656792137, "step": 635 }, { "epoch": 0.8684816932661944, - "grad_norm": 0.11078824964803047, + "grad_norm": 0.19348528692872238, "learning_rate": 1.0321526322619536e-06, - "loss": 0.5052, - "mean_token_accuracy": 0.8358015203038585, + "loss": 0.4037, + "mean_token_accuracy": 0.8607088903270385, "step": 636 }, { "epoch": 0.8698472305197577, - "grad_norm": 0.11321950338788017, + "grad_norm": 0.20196629259646406, "learning_rate": 1.0111295241897156e-06, - "loss": 0.5041, - "mean_token_accuracy": 0.8390811938259863, + "loss": 0.4049, + "mean_token_accuracy": 0.8631067104794972, "step": 637 }, { "epoch": 0.8712127677733208, - "grad_norm": 0.11775153761667745, + "grad_norm": 0.2106867654585255, "learning_rate": 9.903113209758098e-07, - "loss": 0.5047, - "mean_token_accuracy": 0.835893830641056, + "loss": 0.4042, + "mean_token_accuracy": 0.8603632609752478, "step": 638 }, { "epoch": 0.872578305026884, - "grad_norm": 0.11033783865348244, + "grad_norm": 0.20073052012465886, "learning_rate": 9.696984971794066e-07, - "loss": 0.496, - "mean_token_accuracy": 0.8388374081104281, + "loss": 0.3981, + "mean_token_accuracy": 0.8624080819096971, "step": 639 }, { "epoch": 0.8739438422804472, - "grad_norm": 0.11065055423649138, + "grad_norm": 0.20533835471618186, "learning_rate": 9.492915226779809e-07, - "loss": 0.4866, - "mean_token_accuracy": 0.841654586282271, + "loss": 0.3915, + "mean_token_accuracy": 0.8646312124436534, "step": 640 }, { "epoch": 0.8753093795340104, - "grad_norm": 0.11484091301092435, + "grad_norm": 0.19390504484310184, "learning_rate": 9.290908626565931e-07, - "loss": 0.5082, - "mean_token_accuracy": 0.8388804422871653, + "loss": 0.4064, + "mean_token_accuracy": 0.8629363178699737, "step": 641 }, { "epoch": 0.8766749167875736, - "grad_norm": 0.11365765069737992, + "grad_norm": 0.21843007849082088, "learning_rate": 9.090969775972736e-07, - "loss": 0.501, - "mean_token_accuracy": 0.8374278913380397, + "loss": 0.4028, + "mean_token_accuracy": 0.8616676869214925, "step": 642 }, { "epoch": 0.8780404540411368, - "grad_norm": 0.11346825635449162, + "grad_norm": 0.2219811855088307, "learning_rate": 8.89310323268544e-07, - "loss": 0.5003, - "mean_token_accuracy": 0.8378310199566601, + "loss": 0.4001, + "mean_token_accuracy": 0.8622636044555403, "step": 643 }, { "epoch": 0.8794059912947, - "grad_norm": 0.1101692402345365, + "grad_norm": 0.24251626425587186, "learning_rate": 8.697313507150184e-07, - "loss": 0.5038, - "mean_token_accuracy": 0.8359207427295466, + "loss": 0.4033, + "mean_token_accuracy": 0.8601248910938767, "step": 644 }, { "epoch": 0.8807715285482632, - "grad_norm": 0.11057021493179596, + "grad_norm": 0.19641237184380816, "learning_rate": 8.503605062471187e-07, - "loss": 0.5037, - "mean_token_accuracy": 0.8356548380625114, + "loss": 0.4061, + "mean_token_accuracy": 0.8596051420978411, "step": 645 }, { "epoch": 0.8821370658018264, - "grad_norm": 0.11424492006269765, + "grad_norm": 0.20131416744818392, "learning_rate": 8.311982314309109e-07, - "loss": 0.5128, - "mean_token_accuracy": 0.8356388965546533, + "loss": 0.4113, + "mean_token_accuracy": 0.8598440955132611, "step": 646 }, { "epoch": 0.8835026030553896, - "grad_norm": 0.11919856526301476, + "grad_norm": 0.21190627535161183, "learning_rate": 8.122449630780238e-07, - "loss": 0.519, - "mean_token_accuracy": 0.8327954123234851, + "loss": 0.416, + "mean_token_accuracy": 0.8575635125425897, "step": 647 }, { "epoch": 0.8848681403089528, - "grad_norm": 0.10886205229555825, + "grad_norm": 0.2167630347166107, "learning_rate": 7.935011332357113e-07, - "loss": 0.5296, - "mean_token_accuracy": 0.8308169729542884, + "loss": 0.4242, + "mean_token_accuracy": 0.8560313013882472, "step": 648 }, { "epoch": 0.886233677562516, - "grad_norm": 0.11083997314202437, + "grad_norm": 0.19473548273128163, "learning_rate": 7.749671691769911e-07, - "loss": 0.4784, - "mean_token_accuracy": 0.844519885116075, + "loss": 0.3845, + "mean_token_accuracy": 0.8675812913768379, "step": 649 }, { "epoch": 0.8875992148160792, - "grad_norm": 0.11126429019921633, + "grad_norm": 0.2060396098320868, "learning_rate": 7.566434933909006e-07, - "loss": 0.506, - "mean_token_accuracy": 0.8355286632834008, + "loss": 0.4075, + "mean_token_accuracy": 0.8595329796977811, "step": 650 }, { "epoch": 0.8889647520696424, - "grad_norm": 0.11071687473778655, + "grad_norm": 0.20145474208727362, "learning_rate": 7.385305235728801e-07, - "loss": 0.5073, - "mean_token_accuracy": 0.8351483707336379, + "loss": 0.4082, + "mean_token_accuracy": 0.8592664341512477, "step": 651 }, { "epoch": 0.8903302893232056, - "grad_norm": 0.10719881397350657, + "grad_norm": 0.19909231314570291, "learning_rate": 7.206286726152434e-07, - "loss": 0.5063, - "mean_token_accuracy": 0.8345491084167596, + "loss": 0.4054, + "mean_token_accuracy": 0.8594394158909082, "step": 652 }, { "epoch": 0.8916958265767688, - "grad_norm": 0.11208421820496033, + "grad_norm": 0.1919793886619024, "learning_rate": 7.029383485977625e-07, - "loss": 0.5042, - "mean_token_accuracy": 0.839051173845506, + "loss": 0.4035, + "mean_token_accuracy": 0.8634067122462318, "step": 653 }, { "epoch": 0.893061363830332, - "grad_norm": 0.11710324217320399, + "grad_norm": 0.20544119012492554, "learning_rate": 6.854599547783736e-07, - "loss": 0.5052, - "mean_token_accuracy": 0.8360346698009601, + "loss": 0.4058, + "mean_token_accuracy": 0.8596657002546876, "step": 654 }, { "epoch": 0.8944269010838952, - "grad_norm": 0.11432931293058372, + "grad_norm": 0.2228496860347481, "learning_rate": 6.681938895839746e-07, - "loss": 0.4989, - "mean_token_accuracy": 0.8383115370138072, + "loss": 0.4002, + "mean_token_accuracy": 0.862631288272368, "step": 655 }, { "epoch": 0.8957924383374584, - "grad_norm": 0.11097700087706944, + "grad_norm": 0.19018327112122752, "learning_rate": 6.511405466013532e-07, - "loss": 0.5208, - "mean_token_accuracy": 0.8323431431340523, + "loss": 0.418, + "mean_token_accuracy": 0.8573043089730195, "step": 656 }, { "epoch": 0.8971579755910216, - "grad_norm": 0.11277952204872761, + "grad_norm": 0.19393215931484176, "learning_rate": 6.343003145682114e-07, - "loss": 0.5023, - "mean_token_accuracy": 0.8378914518462077, + "loss": 0.4026, + "mean_token_accuracy": 0.8614415069733373, "step": 657 }, { "epoch": 0.8985235128445848, - "grad_norm": 0.1116468721910487, + "grad_norm": 0.20746626572888896, "learning_rate": 6.176735773642962e-07, - "loss": 0.5053, - "mean_token_accuracy": 0.836355200870225, + "loss": 0.4043, + "mean_token_accuracy": 0.8609983269079549, "step": 658 }, { "epoch": 0.899889050098148, - "grad_norm": 0.1171076831692857, + "grad_norm": 0.23075626629760573, "learning_rate": 6.012607140026605e-07, - "loss": 0.5125, - "mean_token_accuracy": 0.8352880849088211, + "loss": 0.4122, + "mean_token_accuracy": 0.8590837704293297, "step": 659 }, { "epoch": 0.9012545873517112, - "grad_norm": 0.11198395063209084, + "grad_norm": 0.21081958088922736, "learning_rate": 5.850620986210198e-07, - "loss": 0.4941, - "mean_token_accuracy": 0.8421397675914322, + "loss": 0.3963, + "mean_token_accuracy": 0.8651597082259812, "step": 660 }, { "epoch": 0.9026201246052744, - "grad_norm": 0.10745690514308064, + "grad_norm": 0.21363684414515646, "learning_rate": 5.69078100473216e-07, - "loss": 0.5014, - "mean_token_accuracy": 0.8385299910132467, + "loss": 0.4012, + "mean_token_accuracy": 0.8625379850354203, "step": 661 }, { "epoch": 0.9039856618588376, - "grad_norm": 0.11420887630440482, + "grad_norm": 0.19570623175347346, "learning_rate": 5.533090839208133e-07, - "loss": 0.5154, - "mean_token_accuracy": 0.832754059967286, + "loss": 0.414, + "mean_token_accuracy": 0.8574432460527732, "step": 662 }, { "epoch": 0.9053511991124008, - "grad_norm": 0.11154480390012243, + "grad_norm": 0.20136402481480803, "learning_rate": 5.377554084247772e-07, - "loss": 0.5046, - "mean_token_accuracy": 0.8361881346374621, + "loss": 0.4064, + "mean_token_accuracy": 0.8597524152182047, "step": 663 }, { "epoch": 0.906716736365964, - "grad_norm": 0.11482991482585568, + "grad_norm": 0.20829638794206976, "learning_rate": 5.224174285372973e-07, - "loss": 0.5059, - "mean_token_accuracy": 0.837772663780607, + "loss": 0.4066, + "mean_token_accuracy": 0.8612470283150278, "step": 664 }, { "epoch": 0.9080822736195272, - "grad_norm": 0.1171602146921763, + "grad_norm": 0.20559896049133375, "learning_rate": 5.072954938936925e-07, - "loss": 0.4941, - "mean_token_accuracy": 0.8406731439063213, + "loss": 0.3977, + "mean_token_accuracy": 0.8636562342939346, "step": 665 }, { "epoch": 0.9094478108730903, - "grad_norm": 0.11720042900398887, + "grad_norm": 0.24180013304406187, "learning_rate": 4.923899492044437e-07, - "loss": 0.5079, - "mean_token_accuracy": 0.8382324138906574, + "loss": 0.4071, + "mean_token_accuracy": 0.8618068633494765, "step": 666 }, { "epoch": 0.9108133481266536, - "grad_norm": 0.11373179065032414, + "grad_norm": 0.20544052086394768, "learning_rate": 4.777011342473392e-07, - "loss": 0.5068, - "mean_token_accuracy": 0.8362159975468011, + "loss": 0.4069, + "mean_token_accuracy": 0.8607662403616217, "step": 667 }, { "epoch": 0.9121788853802167, - "grad_norm": 0.11077588522387365, + "grad_norm": 0.20218844698835448, "learning_rate": 4.632293838597246e-07, - "loss": 0.5066, - "mean_token_accuracy": 0.8378361441476436, + "loss": 0.4086, + "mean_token_accuracy": 0.8614379669227725, "step": 668 }, { "epoch": 0.91354442263378, - "grad_norm": 0.11191958943870293, + "grad_norm": 0.19849540081460978, "learning_rate": 4.4897502793087576e-07, - "loss": 0.504, - "mean_token_accuracy": 0.8406458461676389, + "loss": 0.4041, + "mean_token_accuracy": 0.8642634187828342, "step": 669 }, { "epoch": 0.9149099598873431, - "grad_norm": 0.11474595337745339, + "grad_norm": 0.20745329155441244, "learning_rate": 4.3493839139447716e-07, - "loss": 0.5416, - "mean_token_accuracy": 0.8293075174041336, + "loss": 0.4348, + "mean_token_accuracy": 0.8541953811730532, "step": 670 }, { "epoch": 0.9162754971409064, - "grad_norm": 0.11598843401090912, + "grad_norm": 0.2287863594624806, "learning_rate": 4.2111979422120863e-07, - "loss": 0.5062, - "mean_token_accuracy": 0.8366627757644588, + "loss": 0.4053, + "mean_token_accuracy": 0.8608941533244443, "step": 671 }, { "epoch": 0.9176410343944695, - "grad_norm": 0.10817410367026609, + "grad_norm": 0.1925356796364094, "learning_rate": 4.075195514114594e-07, - "loss": 0.4726, - "mean_token_accuracy": 0.8458128662654261, + "loss": 0.3803, + "mean_token_accuracy": 0.8682317326543381, "step": 672 }, { "epoch": 0.9190065716480328, - "grad_norm": 0.1090866323593536, + "grad_norm": 0.18548297023650745, "learning_rate": 3.941379729881456e-07, - "loss": 0.4913, - "mean_token_accuracy": 0.841529020698105, + "loss": 0.3944, + "mean_token_accuracy": 0.864907673080918, "step": 673 }, { "epoch": 0.9203721089015959, - "grad_norm": 0.1131303073662891, + "grad_norm": 0.21719662700717418, "learning_rate": 3.8097536398963965e-07, - "loss": 0.4942, - "mean_token_accuracy": 0.8399036755552375, + "loss": 0.3976, + "mean_token_accuracy": 0.8632850150504787, "step": 674 }, { "epoch": 0.9217376461551592, - "grad_norm": 0.11143888958165932, + "grad_norm": 0.21291688480092102, "learning_rate": 3.6803202446282217e-07, - "loss": 0.5397, - "mean_token_accuracy": 0.8266207918888656, + "loss": 0.4331, + "mean_token_accuracy": 0.8520979656086647, "step": 675 }, { "epoch": 0.9231031834087223, - "grad_norm": 0.10944128693026685, + "grad_norm": 0.19317640877157852, "learning_rate": 3.553082494562354e-07, - "loss": 0.5038, - "mean_token_accuracy": 0.8369626475554812, + "loss": 0.4044, + "mean_token_accuracy": 0.860968928653511, "step": 676 }, { "epoch": 0.9244687206622856, - "grad_norm": 0.11375054904732852, + "grad_norm": 0.20029096615777936, "learning_rate": 3.4280432901336423e-07, - "loss": 0.5393, - "mean_token_accuracy": 0.8268004653334061, + "loss": 0.4327, + "mean_token_accuracy": 0.8523410994278862, "step": 677 }, { "epoch": 0.9258342579158487, - "grad_norm": 0.11099843408182695, + "grad_norm": 0.19089996560484057, "learning_rate": 3.3052054816602455e-07, - "loss": 0.5112, - "mean_token_accuracy": 0.8350002507419866, + "loss": 0.4118, + "mean_token_accuracy": 0.8587402967704104, "step": 678 }, { "epoch": 0.927199795169412, - "grad_norm": 0.11133902569939337, + "grad_norm": 0.21562854865168446, "learning_rate": 3.1845718692785743e-07, - "loss": 0.4952, - "mean_token_accuracy": 0.8400570955751384, + "loss": 0.3998, + "mean_token_accuracy": 0.863486058955801, "step": 679 }, { "epoch": 0.9285653324229751, - "grad_norm": 0.11954261519520169, + "grad_norm": 0.2074181207840677, "learning_rate": 3.0661452028795335e-07, - "loss": 0.5222, - "mean_token_accuracy": 0.8332074136235826, + "loss": 0.418, + "mean_token_accuracy": 0.8581571716176858, "step": 680 }, { "epoch": 0.9299308696765384, - "grad_norm": 0.11132412628670138, + "grad_norm": 0.19559493828044613, "learning_rate": 2.949928182045869e-07, - "loss": 0.4817, - "mean_token_accuracy": 0.843604963711222, + "loss": 0.3859, + "mean_token_accuracy": 0.8668547803361453, "step": 681 }, { "epoch": 0.9312964069301015, - "grad_norm": 0.11013524048193757, + "grad_norm": 0.20748047485633067, "learning_rate": 2.835923455990508e-07, - "loss": 0.5037, - "mean_token_accuracy": 0.8382973927396834, + "loss": 0.4038, + "mean_token_accuracy": 0.862158416990831, "step": 682 }, { "epoch": 0.9326619441836648, - "grad_norm": 0.1120857706474797, + "grad_norm": 0.20156584121794527, "learning_rate": 2.7241336234962943e-07, - "loss": 0.4938, - "mean_token_accuracy": 0.8410198955082097, + "loss": 0.3962, + "mean_token_accuracy": 0.864386738221901, "step": 683 }, { "epoch": 0.9340274814372279, - "grad_norm": 0.10269858963290679, + "grad_norm": 0.18328492764164395, "learning_rate": 2.614561232856672e-07, - "loss": 0.4766, - "mean_token_accuracy": 0.8449651961978727, + "loss": 0.3818, + "mean_token_accuracy": 0.8677611815631928, "step": 684 }, { "epoch": 0.9353930186907912, - "grad_norm": 0.11917730172441471, + "grad_norm": 0.20904145120342477, "learning_rate": 2.507208781817638e-07, - "loss": 0.5345, - "mean_token_accuracy": 0.8290748576003336, + "loss": 0.4285, + "mean_token_accuracy": 0.8544474486773861, "step": 685 }, { "epoch": 0.9367585559443543, - "grad_norm": 0.11534601639379333, + "grad_norm": 0.20813281413241, "learning_rate": 2.402078717520795e-07, - "loss": 0.5142, - "mean_token_accuracy": 0.8336728185105551, + "loss": 0.413, + "mean_token_accuracy": 0.8585056450386133, "step": 686 }, { "epoch": 0.9381240931979176, - "grad_norm": 0.11323913792298873, + "grad_norm": 0.19274067038403314, "learning_rate": 2.2991734364475214e-07, - "loss": 0.5313, - "mean_token_accuracy": 0.8303561043165688, + "loss": 0.4253, + "mean_token_accuracy": 0.8552607489362258, "step": 687 }, { "epoch": 0.9394896304514807, - "grad_norm": 0.10830718670244747, + "grad_norm": 0.18657512512018273, "learning_rate": 2.1984952843644104e-07, - "loss": 0.5045, - "mean_token_accuracy": 0.8368037824233935, + "loss": 0.4048, + "mean_token_accuracy": 0.8605767691236879, "step": 688 }, { "epoch": 0.940855167705044, - "grad_norm": 0.11415126410402644, + "grad_norm": 0.20415038653399858, "learning_rate": 2.1000465562697858e-07, - "loss": 0.5035, - "mean_token_accuracy": 0.837307190939996, + "loss": 0.403, + "mean_token_accuracy": 0.8614550447092127, "step": 689 }, { "epoch": 0.9422207049586071, - "grad_norm": 0.1075543793866937, + "grad_norm": 0.18802302287270456, "learning_rate": 2.0038294963413251e-07, - "loss": 0.509, - "mean_token_accuracy": 0.8343315535721969, + "loss": 0.4102, + "mean_token_accuracy": 0.8589263672995899, "step": 690 }, { "epoch": 0.9435862422121704, - "grad_norm": 0.1073826068957417, + "grad_norm": 0.20800845771223211, "learning_rate": 1.9098462978849875e-07, - "loss": 0.5065, - "mean_token_accuracy": 0.8361526575116471, + "loss": 0.4061, + "mean_token_accuracy": 0.8600364168613337, "step": 691 }, { "epoch": 0.9449517794657335, - "grad_norm": 0.11670435239433453, + "grad_norm": 0.21444000519203277, "learning_rate": 1.8180991032849426e-07, - "loss": 0.5302, - "mean_token_accuracy": 0.829077340166264, + "loss": 0.4256, + "mean_token_accuracy": 0.8541757101430708, "step": 692 }, { "epoch": 0.9463173167192968, - "grad_norm": 0.10859239923506202, + "grad_norm": 0.2049726644452516, "learning_rate": 1.7285900039547997e-07, - "loss": 0.4972, - "mean_token_accuracy": 0.8402639938871189, + "loss": 0.3977, + "mean_token_accuracy": 0.8644176084772834, "step": 693 }, { "epoch": 0.9476828539728599, - "grad_norm": 0.11569282274284483, + "grad_norm": 0.19845365768398393, "learning_rate": 1.6413210402898895e-07, - "loss": 0.5264, - "mean_token_accuracy": 0.8313261442399045, + "loss": 0.422, + "mean_token_accuracy": 0.8564894930291981, "step": 694 }, { "epoch": 0.9490483912264231, - "grad_norm": 0.11126250951873592, + "grad_norm": 0.20134810058434158, "learning_rate": 1.556294201620734e-07, - "loss": 0.4964, - "mean_token_accuracy": 0.8388015350464768, + "loss": 0.3986, + "mean_token_accuracy": 0.8628961715481271, "step": 695 }, { "epoch": 0.9504139284799863, - "grad_norm": 0.11249380860651531, + "grad_norm": 0.20332703044962025, "learning_rate": 1.4735114261677842e-07, - "loss": 0.5179, - "mean_token_accuracy": 0.8357265532703604, + "loss": 0.4159, + "mean_token_accuracy": 0.8602014785454245, "step": 696 }, { "epoch": 0.9517794657335495, - "grad_norm": 0.11262403057740748, + "grad_norm": 0.20262245169618026, "learning_rate": 1.3929746009971434e-07, - "loss": 0.523, - "mean_token_accuracy": 0.8309361338737956, + "loss": 0.4205, + "mean_token_accuracy": 0.8558728368817288, "step": 697 }, { "epoch": 0.9531450029871127, - "grad_norm": 0.11315567105824463, + "grad_norm": 0.1986654702159638, "learning_rate": 1.3146855619776134e-07, - "loss": 0.5195, - "mean_token_accuracy": 0.8335453788413849, + "loss": 0.4164, + "mean_token_accuracy": 0.8583016759076048, "step": 698 }, { "epoch": 0.9545105402406759, - "grad_norm": 0.11275651926260329, + "grad_norm": 0.20538122761572822, "learning_rate": 1.2386460937387824e-07, - "loss": 0.514, - "mean_token_accuracy": 0.8357001378826753, + "loss": 0.4122, + "mean_token_accuracy": 0.8596236188677384, "step": 699 }, { "epoch": 0.9558760774942391, - "grad_norm": 0.10563928645234845, + "grad_norm": 0.18771743571795288, "learning_rate": 1.1648579296304252e-07, - "loss": 0.4853, - "mean_token_accuracy": 0.8425545681752713, + "loss": 0.389, + "mean_token_accuracy": 0.8655003903926352, "step": 700 }, { "epoch": 0.9572416147478023, - "grad_norm": 0.10631148483968364, + "grad_norm": 0.18918786074101446, "learning_rate": 1.0933227516829348e-07, - "loss": 0.4757, - "mean_token_accuracy": 0.8448280583377341, + "loss": 0.3804, + "mean_token_accuracy": 0.8682751397466016, "step": 701 }, { "epoch": 0.9586071520013656, - "grad_norm": 0.1116246116698196, + "grad_norm": 0.21947757366560472, "learning_rate": 1.0240421905689746e-07, - "loss": 0.5136, - "mean_token_accuracy": 0.8345537901047713, + "loss": 0.4128, + "mean_token_accuracy": 0.858528673911104, "step": 702 }, { "epoch": 0.9599726892549287, - "grad_norm": 0.11250893950819854, + "grad_norm": 0.197515434889879, "learning_rate": 9.570178255663532e-08, - "loss": 0.4886, - "mean_token_accuracy": 0.8415111567006566, + "loss": 0.3926, + "mean_token_accuracy": 0.8648129525692759, "step": 703 }, { "epoch": 0.961338226508492, - "grad_norm": 0.11242686840117742, + "grad_norm": 0.1879435454356452, "learning_rate": 8.922511845219972e-08, - "loss": 0.5186, - "mean_token_accuracy": 0.8325617579128456, + "loss": 0.4138, + "mean_token_accuracy": 0.8581702095308102, "step": 704 }, { "epoch": 0.9627037637620551, - "grad_norm": 0.11042162080380281, + "grad_norm": 0.18890815403985045, "learning_rate": 8.297437438170797e-08, - "loss": 0.4878, - "mean_token_accuracy": 0.8420177217115964, + "loss": 0.3907, + "mean_token_accuracy": 0.865506216595041, "step": 705 }, { "epoch": 0.9640693010156184, - "grad_norm": 0.11285413455019924, + "grad_norm": 0.286341814862515, "learning_rate": 7.694969283334575e-08, - "loss": 0.5128, - "mean_token_accuracy": 0.8350998482860418, + "loss": 0.4113, + "mean_token_accuracy": 0.8591874470356847, "step": 706 }, { "epoch": 0.9654348382691815, - "grad_norm": 0.11358901275656513, + "grad_norm": 0.20273945460392515, "learning_rate": 7.115121114211198e-08, - "loss": 0.4904, - "mean_token_accuracy": 0.8426595636980773, + "loss": 0.3943, + "mean_token_accuracy": 0.8657841466466322, "step": 707 }, { "epoch": 0.9668003755227448, - "grad_norm": 0.11380914283906224, + "grad_norm": 0.20652336093906176, "learning_rate": 6.557906148669025e-08, - "loss": 0.5303, - "mean_token_accuracy": 0.8298721092040836, + "loss": 0.4253, + "mean_token_accuracy": 0.8548559094292058, "step": 708 }, { "epoch": 0.9681659127763079, - "grad_norm": 0.10972070785695759, + "grad_norm": 0.19629348230830565, "learning_rate": 6.023337088643665e-08, - "loss": 0.4862, - "mean_token_accuracy": 0.8434361937936683, + "loss": 0.3905, + "mean_token_accuracy": 0.866214795452116, "step": 709 }, { "epoch": 0.9695314500298712, - "grad_norm": 0.11176532733381031, + "grad_norm": 0.18612630221965354, "learning_rate": 5.51142611984834e-08, - "loss": 0.4995, - "mean_token_accuracy": 0.8391750652828646, + "loss": 0.4012, + "mean_token_accuracy": 0.8630792198659271, "step": 710 }, { "epoch": 0.9708969872834343, - "grad_norm": 0.11147890051891436, + "grad_norm": 0.20471539063414793, "learning_rate": 5.022184911495864e-08, - "loss": 0.5087, - "mean_token_accuracy": 0.8376114784434925, + "loss": 0.4088, + "mean_token_accuracy": 0.8615388696106141, "step": 711 }, { "epoch": 0.9722625245369976, - "grad_norm": 0.12108995943509993, + "grad_norm": 0.20615922685602697, "learning_rate": 4.555624616033427e-08, - "loss": 0.5534, - "mean_token_accuracy": 0.8245600027395658, + "loss": 0.4423, + "mean_token_accuracy": 0.8509749138520655, "step": 712 }, { "epoch": 0.9736280617905607, - "grad_norm": 0.11291799754120939, + "grad_norm": 0.1905175337604626, "learning_rate": 4.111755868887346e-08, - "loss": 0.4934, - "mean_token_accuracy": 0.8407968721180822, + "loss": 0.3965, + "mean_token_accuracy": 0.8641722387632514, "step": 713 }, { "epoch": 0.974993599044124, - "grad_norm": 0.11580048386559677, + "grad_norm": 0.21850511626165098, "learning_rate": 3.690588788221372e-08, - "loss": 0.5062, - "mean_token_accuracy": 0.8370589010476339, + "loss": 0.4073, + "mean_token_accuracy": 0.8611413177219913, "step": 714 }, { "epoch": 0.9763591362976871, - "grad_norm": 0.1165646157016916, + "grad_norm": 0.19471127549408462, "learning_rate": 3.2921329747056527e-08, - "loss": 0.5249, - "mean_token_accuracy": 0.8316874151625394, + "loss": 0.4208, + "mean_token_accuracy": 0.8565208481356911, "step": 715 }, { "epoch": 0.9777246735512504, - "grad_norm": 0.1148390599104358, + "grad_norm": 0.20634538207455483, "learning_rate": 2.916397511298019e-08, - "loss": 0.5007, - "mean_token_accuracy": 0.838817563638364, + "loss": 0.4022, + "mean_token_accuracy": 0.8626477917792553, "step": 716 }, { "epoch": 0.9790902108048135, - "grad_norm": 0.10787022557798077, + "grad_norm": 0.17898675224982286, "learning_rate": 2.563390963037149e-08, - "loss": 0.5085, - "mean_token_accuracy": 0.8345250187854458, + "loss": 0.4084, + "mean_token_accuracy": 0.858742060603683, "step": 717 }, { "epoch": 0.9804557480583768, - "grad_norm": 0.11422578023510434, + "grad_norm": 0.20186274942480947, "learning_rate": 2.2331213768468363e-08, - "loss": 0.5118, - "mean_token_accuracy": 0.8366227185937417, + "loss": 0.4112, + "mean_token_accuracy": 0.8604082683839368, "step": 718 }, { "epoch": 0.9818212853119399, - "grad_norm": 0.1168111192741652, + "grad_norm": 0.20068940005862676, "learning_rate": 1.925596281353026e-08, - "loss": 0.5046, - "mean_token_accuracy": 0.8374854044951943, + "loss": 0.4061, + "mean_token_accuracy": 0.8611462253446862, "step": 719 }, { "epoch": 0.9831868225655032, - "grad_norm": 0.10984649971114911, + "grad_norm": 0.1949558079302809, "learning_rate": 1.6408226867118404e-08, - "loss": 0.4996, - "mean_token_accuracy": 0.8384380589187508, + "loss": 0.3991, + "mean_token_accuracy": 0.8624983217087219, "step": 720 }, { "epoch": 0.9845523598190663, - "grad_norm": 0.11526342447978216, + "grad_norm": 0.20269545070379796, "learning_rate": 1.3788070844501511e-08, - "loss": 0.4933, - "mean_token_accuracy": 0.8428332722225506, + "loss": 0.3964, + "mean_token_accuracy": 0.8662190034977201, "step": 721 }, { "epoch": 0.9859178970726296, - "grad_norm": 0.11361085519857161, + "grad_norm": 0.19803360839942985, "learning_rate": 1.1395554473171421e-08, - "loss": 0.527, - "mean_token_accuracy": 0.8311565052971914, + "loss": 0.4237, + "mean_token_accuracy": 0.8558418698658579, "step": 722 }, { "epoch": 0.9872834343261927, - "grad_norm": 0.12554083246906578, + "grad_norm": 0.20911643943570965, "learning_rate": 9.2307322914853e-09, - "loss": 0.5007, - "mean_token_accuracy": 0.8380183632026863, + "loss": 0.4014, + "mean_token_accuracy": 0.8616119983012264, "step": 723 }, { "epoch": 0.988648971579756, - "grad_norm": 0.11211586167271237, + "grad_norm": 0.20281334643579427, "learning_rate": 7.293653647421073e-09, - "loss": 0.4908, - "mean_token_accuracy": 0.841736460853231, + "loss": 0.3948, + "mean_token_accuracy": 0.8643835215948916, "step": 724 }, { "epoch": 0.9900145088333191, - "grad_norm": 0.10901070978840653, + "grad_norm": 0.1901095357986784, "learning_rate": 5.584362697453882e-09, - "loss": 0.5061, - "mean_token_accuracy": 0.8376564587833005, + "loss": 0.4048, + "mean_token_accuracy": 0.8623485947951701, "step": 725 }, { "epoch": 0.9913800460868823, - "grad_norm": 0.11025089502617615, + "grad_norm": 0.20379731214220034, "learning_rate": 4.1028984055457856e-09, - "loss": 0.4976, - "mean_token_accuracy": 0.8422375376246224, + "loss": 0.3983, + "mean_token_accuracy": 0.8660865418772313, "step": 726 }, { "epoch": 0.9927455833404455, - "grad_norm": 0.11514204079963179, + "grad_norm": 0.21708635868498447, "learning_rate": 2.8492945422620157e-09, - "loss": 0.5186, - "mean_token_accuracy": 0.8331608032533112, + "loss": 0.4147, + "mean_token_accuracy": 0.8587614489423447, "step": 727 }, { "epoch": 0.9941111205940087, - "grad_norm": 0.1117081234562899, + "grad_norm": 0.1929220391858935, "learning_rate": 1.8235796839982667e-09, - "loss": 0.5408, - "mean_token_accuracy": 0.8262145936053681, + "loss": 0.435, + "mean_token_accuracy": 0.8512693561141161, "step": 728 }, { "epoch": 0.9954766578475719, - "grad_norm": 0.10869335333321983, + "grad_norm": 0.20428589636781275, "learning_rate": 1.0257772123312137e-09, - "loss": 0.4927, - "mean_token_accuracy": 0.8416734533471927, + "loss": 0.3959, + "mean_token_accuracy": 0.8652902515910991, "step": 729 }, { "epoch": 0.9968421951011351, - "grad_norm": 0.10862799297448121, + "grad_norm": 0.1904705638210679, "learning_rate": 4.5590531348227443e-10, - "loss": 0.5044, - "mean_token_accuracy": 0.8368493539447776, + "loss": 0.4054, + "mean_token_accuracy": 0.8607762718652394, "step": 730 }, { "epoch": 0.9982077323546983, - "grad_norm": 0.11144689793245134, + "grad_norm": 0.19964279194650797, "learning_rate": 1.1397697790793693e-10, - "loss": 0.5154, - "mean_token_accuracy": 0.834050180242974, + "loss": 0.4149, + "mean_token_accuracy": 0.8587917368074317, "step": 731 }, { "epoch": 0.9995732696082615, - "grad_norm": 0.10826741405421844, + "grad_norm": 0.19717881385523706, "learning_rate": 0.0, - "loss": 0.5045, - "mean_token_accuracy": 0.836847565742045, + "loss": 0.4046, + "mean_token_accuracy": 0.8610176941050893, "step": 732 }, { "epoch": 0.9995732696082615, "step": 732, - "total_flos": 5.591154468847616e+18, - "train_loss": 0.5708037986130011, - "train_runtime": 80006.8909, - "train_samples_per_second": 1.172, - "train_steps_per_second": 0.009 + "total_flos": 2.7844372399870968e+19, + "train_loss": 0.4556342206203221, + "train_runtime": 169635.2867, + "train_samples_per_second": 0.553, + "train_steps_per_second": 0.004 } ], "logging_steps": 1, @@ -5891,7 +5891,7 @@ "attributes": {} } }, - "total_flos": 5.591154468847616e+18, + "total_flos": 2.7844372399870968e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null