{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3092, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008085381630012937, "grad_norm": 0.3009789288043976, "learning_rate": 0.00013440860215053763, "loss": 1.7048, "mean_token_accuracy": 0.661682380437851, "step": 25 }, { "epoch": 0.016170763260025874, "grad_norm": 0.4460005462169647, "learning_rate": 0.00026881720430107527, "loss": 1.2718, "mean_token_accuracy": 0.713244981765747, "step": 50 }, { "epoch": 0.02425614489003881, "grad_norm": 0.25601527094841003, "learning_rate": 0.0004032258064516129, "loss": 0.9995, "mean_token_accuracy": 0.7553433787822723, "step": 75 }, { "epoch": 0.03234152652005175, "grad_norm": 0.2627596855163574, "learning_rate": 0.0004999932787358948, "loss": 0.9036, "mean_token_accuracy": 0.7764883422851563, "step": 100 }, { "epoch": 0.04042690815006468, "grad_norm": 0.4143337905406952, "learning_rate": 0.0004998595518201121, "loss": 0.8985, "mean_token_accuracy": 0.7727250349521637, "step": 125 }, { "epoch": 0.04851228978007762, "grad_norm": 0.23922297358512878, "learning_rate": 0.00049955446943686, "loss": 0.7909, "mean_token_accuracy": 0.7968144822120666, "step": 150 }, { "epoch": 0.056597671410090554, "grad_norm": 0.2834464907646179, "learning_rate": 0.0004990782408138185, "loss": 0.822, "mean_token_accuracy": 0.7941457486152649, "step": 175 }, { "epoch": 0.0646830530401035, "grad_norm": 0.29430675506591797, "learning_rate": 0.000498431192551983, "loss": 0.8546, "mean_token_accuracy": 0.7836022305488587, "step": 200 }, { "epoch": 0.07276843467011643, "grad_norm": 0.23746679723262787, "learning_rate": 0.0004976137684016788, "loss": 0.7788, "mean_token_accuracy": 0.8025749707221985, "step": 225 }, { "epoch": 0.08085381630012936, "grad_norm": 0.24229322373867035, "learning_rate": 0.0004966265289582338, "loss": 0.8583, "mean_token_accuracy": 0.7800547671318054, "step": 250 }, { "epoch": 0.08893919793014231, "grad_norm": 0.24889522790908813, "learning_rate": 0.0004954701512775184, "loss": 0.685, "mean_token_accuracy": 0.8196275687217712, "step": 275 }, { "epoch": 0.09702457956015524, "grad_norm": 0.22737586498260498, "learning_rate": 0.0004941454284116157, "loss": 0.8068, "mean_token_accuracy": 0.7950991153717041, "step": 300 }, { "epoch": 0.10510996119016817, "grad_norm": 0.2224387228488922, "learning_rate": 0.0004926532688649407, "loss": 0.8054, "mean_token_accuracy": 0.7932506108283996, "step": 325 }, { "epoch": 0.11319534282018111, "grad_norm": 0.2293468415737152, "learning_rate": 0.0004909946959711816, "loss": 0.7816, "mean_token_accuracy": 0.7965369141101837, "step": 350 }, { "epoch": 0.12128072445019406, "grad_norm": 0.2561907470226288, "learning_rate": 0.0004891708471914897, "loss": 0.7487, "mean_token_accuracy": 0.8107043826580047, "step": 375 }, { "epoch": 0.129366106080207, "grad_norm": 0.2583022117614746, "learning_rate": 0.0004871829733344012, "loss": 0.7786, "mean_token_accuracy": 0.7999367988109589, "step": 400 }, { "epoch": 0.13745148771021992, "grad_norm": 0.2550680339336395, "learning_rate": 0.00048503243769802327, "loss": 0.7576, "mean_token_accuracy": 0.8000009500980377, "step": 425 }, { "epoch": 0.14553686934023286, "grad_norm": 0.24982689321041107, "learning_rate": 0.0004827207151350745, "loss": 0.7897, "mean_token_accuracy": 0.7945795893669129, "step": 450 }, { "epoch": 0.1536222509702458, "grad_norm": 0.2784912884235382, "learning_rate": 0.0004802493910414205, "loss": 0.8013, "mean_token_accuracy": 0.792938197851181, "step": 475 }, { "epoch": 0.16170763260025872, "grad_norm": 0.2220473736524582, "learning_rate": 0.00047762016026879807, "loss": 0.711, "mean_token_accuracy": 0.8124627935886383, "step": 500 }, { "epoch": 0.16979301423027165, "grad_norm": 0.2841341197490692, "learning_rate": 0.00047483482596247353, "loss": 0.7382, "mean_token_accuracy": 0.8129073297977447, "step": 525 }, { "epoch": 0.17787839586028462, "grad_norm": 0.23232664167881012, "learning_rate": 0.00047189529832463296, "loss": 0.7588, "mean_token_accuracy": 0.8059421420097351, "step": 550 }, { "epoch": 0.18596377749029755, "grad_norm": 0.22178468108177185, "learning_rate": 0.00046880359330435216, "loss": 0.8146, "mean_token_accuracy": 0.7915572142601013, "step": 575 }, { "epoch": 0.19404915912031048, "grad_norm": 0.23006677627563477, "learning_rate": 0.0004655618312150437, "loss": 0.7822, "mean_token_accuracy": 0.7942324769496918, "step": 600 }, { "epoch": 0.20213454075032342, "grad_norm": 0.32772719860076904, "learning_rate": 0.00046217223528033146, "loss": 0.7632, "mean_token_accuracy": 0.8043809008598327, "step": 625 }, { "epoch": 0.21021992238033635, "grad_norm": 0.18335266411304474, "learning_rate": 0.0004586371301093476, "loss": 0.7218, "mean_token_accuracy": 0.816128705739975, "step": 650 }, { "epoch": 0.21830530401034928, "grad_norm": 0.22064125537872314, "learning_rate": 0.00045495894010249915, "loss": 0.7364, "mean_token_accuracy": 0.8087958478927613, "step": 675 }, { "epoch": 0.22639068564036222, "grad_norm": 0.21018439531326294, "learning_rate": 0.0004511401877887967, "loss": 0.738, "mean_token_accuracy": 0.8069515633583069, "step": 700 }, { "epoch": 0.23447606727037515, "grad_norm": 0.21225683391094208, "learning_rate": 0.0004471834920958864, "loss": 0.7397, "mean_token_accuracy": 0.8051351046562195, "step": 725 }, { "epoch": 0.2425614489003881, "grad_norm": 0.22577480971813202, "learning_rate": 0.00044309156655397003, "loss": 0.7872, "mean_token_accuracy": 0.7951467871665955, "step": 750 }, { "epoch": 0.25064683053040104, "grad_norm": 0.19751884043216705, "learning_rate": 0.000438867217434847, "loss": 0.7147, "mean_token_accuracy": 0.8134795188903808, "step": 775 }, { "epoch": 0.258732212160414, "grad_norm": 0.2011658251285553, "learning_rate": 0.0004345133418273529, "loss": 0.7923, "mean_token_accuracy": 0.7959077858924866, "step": 800 }, { "epoch": 0.2668175937904269, "grad_norm": 0.2174764722585678, "learning_rate": 0.00043003292565051544, "loss": 0.7576, "mean_token_accuracy": 0.8044932758808137, "step": 825 }, { "epoch": 0.27490297542043984, "grad_norm": 0.20990514755249023, "learning_rate": 0.0004254290416057898, "loss": 0.739, "mean_token_accuracy": 0.8073045027256012, "step": 850 }, { "epoch": 0.2829883570504528, "grad_norm": 0.1976221799850464, "learning_rate": 0.0004207048470697777, "loss": 0.6717, "mean_token_accuracy": 0.824974125623703, "step": 875 }, { "epoch": 0.2910737386804657, "grad_norm": 0.2552309036254883, "learning_rate": 0.0004158635819288762, "loss": 0.7311, "mean_token_accuracy": 0.8078971183300019, "step": 900 }, { "epoch": 0.29915912031047864, "grad_norm": 0.26244303584098816, "learning_rate": 0.00041090856635734067, "loss": 0.7264, "mean_token_accuracy": 0.8127052938938141, "step": 925 }, { "epoch": 0.3072445019404916, "grad_norm": 0.24333028495311737, "learning_rate": 0.000405843198540285, "loss": 0.7184, "mean_token_accuracy": 0.8114526355266571, "step": 950 }, { "epoch": 0.3153298835705045, "grad_norm": 0.19133129715919495, "learning_rate": 0.0004006709523431822, "loss": 0.7538, "mean_token_accuracy": 0.8016650295257568, "step": 975 }, { "epoch": 0.32341526520051744, "grad_norm": 0.25047338008880615, "learning_rate": 0.00039539537492946285, "loss": 0.8019, "mean_token_accuracy": 0.7935136258602142, "step": 1000 }, { "epoch": 0.3315006468305304, "grad_norm": 0.19724124670028687, "learning_rate": 0.0003900200843278449, "loss": 0.6892, "mean_token_accuracy": 0.8166925406455994, "step": 1025 }, { "epoch": 0.3395860284605433, "grad_norm": 0.21111617982387543, "learning_rate": 0.0003845487669510631, "loss": 0.7281, "mean_token_accuracy": 0.8110716784000397, "step": 1050 }, { "epoch": 0.3476714100905563, "grad_norm": 0.2487187534570694, "learning_rate": 0.00037898517506770196, "loss": 0.7962, "mean_token_accuracy": 0.7921491277217865, "step": 1075 }, { "epoch": 0.35575679172056923, "grad_norm": 0.2757723927497864, "learning_rate": 0.0003733331242288622, "loss": 0.7533, "mean_token_accuracy": 0.8056223785877228, "step": 1100 }, { "epoch": 0.36384217335058217, "grad_norm": 0.27630847692489624, "learning_rate": 0.0003675964906514289, "loss": 0.7885, "mean_token_accuracy": 0.7973137283325196, "step": 1125 }, { "epoch": 0.3719275549805951, "grad_norm": 0.2365068644285202, "learning_rate": 0.00036177920855973405, "loss": 0.7275, "mean_token_accuracy": 0.8077067303657531, "step": 1150 }, { "epoch": 0.38001293661060803, "grad_norm": 0.2511255443096161, "learning_rate": 0.00035588526748743754, "loss": 0.8052, "mean_token_accuracy": 0.7931141972541809, "step": 1175 }, { "epoch": 0.38809831824062097, "grad_norm": 0.2489156723022461, "learning_rate": 0.0003499187095414763, "loss": 0.7369, "mean_token_accuracy": 0.8076127851009369, "step": 1200 }, { "epoch": 0.3961836998706339, "grad_norm": 0.23067978024482727, "learning_rate": 0.00034388362662995855, "loss": 0.7393, "mean_token_accuracy": 0.8086310243606567, "step": 1225 }, { "epoch": 0.40426908150064683, "grad_norm": 0.2202920764684677, "learning_rate": 0.000337784157655904, "loss": 0.7578, "mean_token_accuracy": 0.8064273929595948, "step": 1250 }, { "epoch": 0.41235446313065977, "grad_norm": 0.21269731223583221, "learning_rate": 0.0003316244856787544, "loss": 0.7937, "mean_token_accuracy": 0.795601452589035, "step": 1275 }, { "epoch": 0.4204398447606727, "grad_norm": 0.24461720883846283, "learning_rate": 0.0003254088350456017, "loss": 0.6656, "mean_token_accuracy": 0.8226878666877746, "step": 1300 }, { "epoch": 0.42852522639068563, "grad_norm": 0.24032443761825562, "learning_rate": 0.0003191414684941003, "loss": 0.7778, "mean_token_accuracy": 0.7960509729385375, "step": 1325 }, { "epoch": 0.43661060802069857, "grad_norm": 0.25362804532051086, "learning_rate": 0.0003128266842290513, "loss": 0.6967, "mean_token_accuracy": 0.8132575881481171, "step": 1350 }, { "epoch": 0.4446959896507115, "grad_norm": 0.2388894110918045, "learning_rate": 0.0003064688129746629, "loss": 0.716, "mean_token_accuracy": 0.8112483811378479, "step": 1375 }, { "epoch": 0.45278137128072443, "grad_norm": 0.2121650129556656, "learning_rate": 0.0003000722150045085, "loss": 0.6942, "mean_token_accuracy": 0.8156200110912323, "step": 1400 }, { "epoch": 0.46086675291073737, "grad_norm": 0.2317550778388977, "learning_rate": 0.0002936412771512206, "loss": 0.7493, "mean_token_accuracy": 0.8051575112342835, "step": 1425 }, { "epoch": 0.4689521345407503, "grad_norm": 0.23476050794124603, "learning_rate": 0.0002871804097979687, "loss": 0.7136, "mean_token_accuracy": 0.8104170382022857, "step": 1450 }, { "epoch": 0.4770375161707633, "grad_norm": 0.2009560465812683, "learning_rate": 0.00028069404385378736, "loss": 0.7117, "mean_token_accuracy": 0.8178813803195953, "step": 1475 }, { "epoch": 0.4851228978007762, "grad_norm": 0.21633079648017883, "learning_rate": 0.0002741866277148276, "loss": 0.7392, "mean_token_accuracy": 0.8080459308624267, "step": 1500 }, { "epoch": 0.49320827943078915, "grad_norm": 0.2863864004611969, "learning_rate": 0.00026766262421361407, "loss": 0.7429, "mean_token_accuracy": 0.8051086151599884, "step": 1525 }, { "epoch": 0.5012936610608021, "grad_norm": 0.2746196389198303, "learning_rate": 0.0002611265075584034, "loss": 0.7378, "mean_token_accuracy": 0.8071331679821014, "step": 1550 }, { "epoch": 0.509379042690815, "grad_norm": 0.1745605319738388, "learning_rate": 0.0002545827602647397, "loss": 0.7329, "mean_token_accuracy": 0.8071370398998261, "step": 1575 }, { "epoch": 0.517464424320828, "grad_norm": 0.2692930996417999, "learning_rate": 0.0002480358700813135, "loss": 0.6469, "mean_token_accuracy": 0.8244569575786591, "step": 1600 }, { "epoch": 0.5255498059508409, "grad_norm": 0.2551010549068451, "learning_rate": 0.00024149032691223173, "loss": 0.7093, "mean_token_accuracy": 0.8107299065589905, "step": 1625 }, { "epoch": 0.5336351875808538, "grad_norm": 0.19366180896759033, "learning_rate": 0.0002349506197378092, "loss": 0.7518, "mean_token_accuracy": 0.8024619662761688, "step": 1650 }, { "epoch": 0.5417205692108668, "grad_norm": 0.22391283512115479, "learning_rate": 0.00022842123353599369, "loss": 0.6933, "mean_token_accuracy": 0.8174584257602692, "step": 1675 }, { "epoch": 0.5498059508408797, "grad_norm": 0.24723011255264282, "learning_rate": 0.0002219066462065364, "loss": 0.669, "mean_token_accuracy": 0.822363510131836, "step": 1700 }, { "epoch": 0.5578913324708926, "grad_norm": 0.20229819416999817, "learning_rate": 0.00021541132550001584, "loss": 0.6508, "mean_token_accuracy": 0.8255820453166962, "step": 1725 }, { "epoch": 0.5659767141009056, "grad_norm": 0.2818906605243683, "learning_rate": 0.00020893972595382274, "loss": 0.6417, "mean_token_accuracy": 0.830688863992691, "step": 1750 }, { "epoch": 0.5740620957309185, "grad_norm": 0.24256223440170288, "learning_rate": 0.00020249628583720672, "loss": 0.7353, "mean_token_accuracy": 0.8104202616214752, "step": 1775 }, { "epoch": 0.5821474773609314, "grad_norm": 0.24385611712932587, "learning_rate": 0.00019608542410747888, "loss": 0.6876, "mean_token_accuracy": 0.819042581319809, "step": 1800 }, { "epoch": 0.5902328589909444, "grad_norm": 0.15913242101669312, "learning_rate": 0.00018971153737945968, "loss": 0.646, "mean_token_accuracy": 0.8284247839450836, "step": 1825 }, { "epoch": 0.5983182406209573, "grad_norm": 0.17542122304439545, "learning_rate": 0.00018337899691024914, "loss": 0.6216, "mean_token_accuracy": 0.83616614818573, "step": 1850 }, { "epoch": 0.6064036222509702, "grad_norm": 0.22214658558368683, "learning_rate": 0.0001770921456013872, "loss": 0.6947, "mean_token_accuracy": 0.8170740747451782, "step": 1875 }, { "epoch": 0.6144890038809832, "grad_norm": 0.22132755815982819, "learning_rate": 0.00017085529502046073, "loss": 0.6788, "mean_token_accuracy": 0.820680763721466, "step": 1900 }, { "epoch": 0.6225743855109961, "grad_norm": 0.26897531747817993, "learning_rate": 0.00016467272244420029, "loss": 0.6833, "mean_token_accuracy": 0.822064242362976, "step": 1925 }, { "epoch": 0.630659767141009, "grad_norm": 0.25688934326171875, "learning_rate": 0.0001585486679250922, "loss": 0.6945, "mean_token_accuracy": 0.8143122482299805, "step": 1950 }, { "epoch": 0.638745148771022, "grad_norm": 0.21207553148269653, "learning_rate": 0.0001524873313835208, "loss": 0.6596, "mean_token_accuracy": 0.8273860597610474, "step": 1975 }, { "epoch": 0.6468305304010349, "grad_norm": 0.281393438577652, "learning_rate": 0.00014649286972743319, "loss": 0.6767, "mean_token_accuracy": 0.8178416419029236, "step": 2000 }, { "epoch": 0.6549159120310478, "grad_norm": 0.27408191561698914, "learning_rate": 0.00014056939400150143, "loss": 0.6974, "mean_token_accuracy": 0.8189209842681885, "step": 2025 }, { "epoch": 0.6630012936610608, "grad_norm": 0.26886942982673645, "learning_rate": 0.00013472096656773913, "loss": 0.6497, "mean_token_accuracy": 0.8288757252693176, "step": 2050 }, { "epoch": 0.6710866752910737, "grad_norm": 0.21919454634189606, "learning_rate": 0.00012895159831950462, "loss": 0.693, "mean_token_accuracy": 0.8163833570480347, "step": 2075 }, { "epoch": 0.6791720569210866, "grad_norm": 0.21283280849456787, "learning_rate": 0.0001232652459308012, "loss": 0.7117, "mean_token_accuracy": 0.808628898859024, "step": 2100 }, { "epoch": 0.6872574385510997, "grad_norm": 0.229765385389328, "learning_rate": 0.00011766580914276209, "loss": 0.7317, "mean_token_accuracy": 0.8035627353191376, "step": 2125 }, { "epoch": 0.6953428201811126, "grad_norm": 0.21411098539829254, "learning_rate": 0.00011215712808918003, "loss": 0.6469, "mean_token_accuracy": 0.8277445828914642, "step": 2150 }, { "epoch": 0.7034282018111255, "grad_norm": 0.2254790961742401, "learning_rate": 0.00010674298066291601, "loss": 0.6976, "mean_token_accuracy": 0.8171502375602722, "step": 2175 }, { "epoch": 0.7115135834411385, "grad_norm": 0.27148380875587463, "learning_rate": 0.0001014270799249933, "loss": 0.717, "mean_token_accuracy": 0.8086051964759826, "step": 2200 }, { "epoch": 0.7195989650711514, "grad_norm": 0.2047407031059265, "learning_rate": 9.621307155815398e-05, "loss": 0.718, "mean_token_accuracy": 0.8121638822555542, "step": 2225 }, { "epoch": 0.7276843467011643, "grad_norm": 0.22144050896167755, "learning_rate": 9.11045313666231e-05, "loss": 0.6623, "mean_token_accuracy": 0.8254709720611573, "step": 2250 }, { "epoch": 0.7357697283311773, "grad_norm": 0.27873218059539795, "learning_rate": 8.610496282379687e-05, "loss": 0.7073, "mean_token_accuracy": 0.8168034076690673, "step": 2275 }, { "epoch": 0.7438551099611902, "grad_norm": 0.25058674812316895, "learning_rate": 8.121779466953572e-05, "loss": 0.7961, "mean_token_accuracy": 0.7905523943901062, "step": 2300 }, { "epoch": 0.7519404915912031, "grad_norm": 0.2540716826915741, "learning_rate": 7.644637855870959e-05, "loss": 0.7561, "mean_token_accuracy": 0.8036962306499481, "step": 2325 }, { "epoch": 0.7600258732212161, "grad_norm": 0.2069474756717682, "learning_rate": 7.179398676260923e-05, "loss": 0.7163, "mean_token_accuracy": 0.8117474913597107, "step": 2350 }, { "epoch": 0.768111254851229, "grad_norm": 0.23127734661102295, "learning_rate": 6.726380992479941e-05, "loss": 0.6983, "mean_token_accuracy": 0.8151715826988221, "step": 2375 }, { "epoch": 0.7761966364812419, "grad_norm": 0.19075877964496613, "learning_rate": 6.285895487295229e-05, "loss": 0.6644, "mean_token_accuracy": 0.8226857626438141, "step": 2400 }, { "epoch": 0.7842820181112549, "grad_norm": 0.26920101046562195, "learning_rate": 5.858244248816302e-05, "loss": 0.678, "mean_token_accuracy": 0.8184169673919678, "step": 2425 }, { "epoch": 0.7923673997412678, "grad_norm": 0.22957506775856018, "learning_rate": 5.443720563320792e-05, "loss": 0.7125, "mean_token_accuracy": 0.8165527045726776, "step": 2450 }, { "epoch": 0.8004527813712807, "grad_norm": 0.23255349695682526, "learning_rate": 5.042608714116612e-05, "loss": 0.6648, "mean_token_accuracy": 0.8205063927173615, "step": 2475 }, { "epoch": 0.8085381630012937, "grad_norm": 0.21010981500148773, "learning_rate": 4.655183786578426e-05, "loss": 0.6833, "mean_token_accuracy": 0.8196286606788635, "step": 2500 }, { "epoch": 0.8166235446313066, "grad_norm": 0.21943055093288422, "learning_rate": 4.2817114794921677e-05, "loss": 0.6897, "mean_token_accuracy": 0.8164256310462952, "step": 2525 }, { "epoch": 0.8247089262613195, "grad_norm": 0.21263104677200317, "learning_rate": 3.92244792283685e-05, "loss": 0.6553, "mean_token_accuracy": 0.8244921159744263, "step": 2550 }, { "epoch": 0.8327943078913325, "grad_norm": 0.30405542254447937, "learning_rate": 3.577639502128843e-05, "loss": 0.6771, "mean_token_accuracy": 0.8220798122882843, "step": 2575 }, { "epoch": 0.8408796895213454, "grad_norm": 0.2544702887535095, "learning_rate": 3.247522689448923e-05, "loss": 0.6825, "mean_token_accuracy": 0.8175348448753357, "step": 2600 }, { "epoch": 0.8489650711513583, "grad_norm": 0.2168809473514557, "learning_rate": 2.9323238812679982e-05, "loss": 0.7437, "mean_token_accuracy": 0.8062794303894043, "step": 2625 }, { "epoch": 0.8570504527813713, "grad_norm": 0.32856041193008423, "learning_rate": 2.6322592431828136e-05, "loss": 0.6968, "mean_token_accuracy": 0.8184567129611969, "step": 2650 }, { "epoch": 0.8651358344113842, "grad_norm": 0.2495715171098709, "learning_rate": 2.3475345616680327e-05, "loss": 0.6864, "mean_token_accuracy": 0.817262338399887, "step": 2675 }, { "epoch": 0.8732212160413971, "grad_norm": 0.21743454039096832, "learning_rate": 2.0783451029463995e-05, "loss": 0.6554, "mean_token_accuracy": 0.8261248970031738, "step": 2700 }, { "epoch": 0.8813065976714101, "grad_norm": 0.19413378834724426, "learning_rate": 1.8248754790737733e-05, "loss": 0.691, "mean_token_accuracy": 0.8169219958782196, "step": 2725 }, { "epoch": 0.889391979301423, "grad_norm": 0.2755376100540161, "learning_rate": 1.5872995213308566e-05, "loss": 0.6868, "mean_token_accuracy": 0.8156666767597198, "step": 2750 }, { "epoch": 0.8974773609314359, "grad_norm": 0.23579077422618866, "learning_rate": 1.3657801610084563e-05, "loss": 0.669, "mean_token_accuracy": 0.8254081463813782, "step": 2775 }, { "epoch": 0.9055627425614489, "grad_norm": 0.21246632933616638, "learning_rate": 1.1604693176680392e-05, "loss": 0.6519, "mean_token_accuracy": 0.8244655966758728, "step": 2800 }, { "epoch": 0.9136481241914618, "grad_norm": 0.29115888476371765, "learning_rate": 9.715077949542184e-06, "loss": 0.6673, "mean_token_accuracy": 0.825350991487503, "step": 2825 }, { "epoch": 0.9217335058214747, "grad_norm": 0.24020685255527496, "learning_rate": 7.990251840305996e-06, "loss": 0.6349, "mean_token_accuracy": 0.8334643471240998, "step": 2850 }, { "epoch": 0.9298188874514877, "grad_norm": 0.2604895532131195, "learning_rate": 6.431397747052342e-06, "loss": 0.6659, "mean_token_accuracy": 0.8195065236091614, "step": 2875 }, { "epoch": 0.9379042690815006, "grad_norm": 0.19995881617069244, "learning_rate": 5.039584743066344e-06, "loss": 0.706, "mean_token_accuracy": 0.8151924252510071, "step": 2900 }, { "epoch": 0.9459896507115135, "grad_norm": 0.22976571321487427, "learning_rate": 3.815767343659377e-06, "loss": 0.6477, "mean_token_accuracy": 0.8256394731998443, "step": 2925 }, { "epoch": 0.9540750323415266, "grad_norm": 0.22688381373882294, "learning_rate": 2.760784851555953e-06, "loss": 0.672, "mean_token_accuracy": 0.8214276111125947, "step": 2950 }, { "epoch": 0.9621604139715395, "grad_norm": 0.23771828413009644, "learning_rate": 1.875360781293689e-06, "loss": 0.6642, "mean_token_accuracy": 0.8211515319347381, "step": 2975 }, { "epoch": 0.9702457956015524, "grad_norm": 0.22044353187084198, "learning_rate": 1.1601023630319064e-06, "loss": 0.6148, "mean_token_accuracy": 0.838649377822876, "step": 3000 }, { "epoch": 0.9783311772315654, "grad_norm": 0.21916313469409943, "learning_rate": 6.155001261089477e-07, "loss": 0.7242, "mean_token_accuracy": 0.8094534778594971, "step": 3025 }, { "epoch": 0.9864165588615783, "grad_norm": 0.20908524096012115, "learning_rate": 2.4192756263349826e-07, "loss": 0.7293, "mean_token_accuracy": 0.8055576062202454, "step": 3050 }, { "epoch": 0.9945019404915912, "grad_norm": 0.2526475489139557, "learning_rate": 3.9640871341173336e-08, "loss": 0.6961, "mean_token_accuracy": 0.8134376859664917, "step": 3075 } ], "logging_steps": 25, "max_steps": 3092, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2584006622747034e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }