{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999768695209678, "eval_steps": 100, "global_step": 2702, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018504383225776606, "grad_norm": 3.2766246795654297, "learning_rate": 3.690036900369004e-07, "loss": 1.3084, "mean_token_accuracy": 0.6738631311804741, "step": 5 }, { "epoch": 0.003700876645155321, "grad_norm": 3.404125452041626, "learning_rate": 7.380073800738008e-07, "loss": 1.2822, "mean_token_accuracy": 0.6823436709727585, "step": 10 }, { "epoch": 0.005551314967732981, "grad_norm": 3.664184093475342, "learning_rate": 1.1070110701107011e-06, "loss": 1.3204, "mean_token_accuracy": 0.6697103577274166, "step": 15 }, { "epoch": 0.007401753290310642, "grad_norm": 2.8933751583099365, "learning_rate": 1.4760147601476015e-06, "loss": 1.3266, "mean_token_accuracy": 0.6671803351578223, "step": 20 }, { "epoch": 0.009252191612888302, "grad_norm": 2.534046173095703, "learning_rate": 1.845018450184502e-06, "loss": 1.2231, "mean_token_accuracy": 0.6901344745535501, "step": 25 }, { "epoch": 0.011102629935465963, "grad_norm": 2.2067267894744873, "learning_rate": 2.2140221402214023e-06, "loss": 1.2715, "mean_token_accuracy": 0.6754532741145192, "step": 30 }, { "epoch": 0.012953068258043625, "grad_norm": 1.7358348369598389, "learning_rate": 2.5830258302583027e-06, "loss": 1.1998, "mean_token_accuracy": 0.6898413280984148, "step": 35 }, { "epoch": 0.014803506580621285, "grad_norm": 1.7121713161468506, "learning_rate": 2.952029520295203e-06, "loss": 1.1642, "mean_token_accuracy": 0.6958384680047558, "step": 40 }, { "epoch": 0.016653944903198947, "grad_norm": 1.55677330493927, "learning_rate": 3.3210332103321034e-06, "loss": 1.131, "mean_token_accuracy": 0.6990899712188562, "step": 45 }, { "epoch": 0.018504383225776605, "grad_norm": 1.4929264783859253, "learning_rate": 3.690036900369004e-06, "loss": 1.0993, "mean_token_accuracy": 0.7033520375781326, "step": 50 }, { "epoch": 0.020354821548354267, "grad_norm": 1.5617218017578125, "learning_rate": 4.059040590405905e-06, "loss": 1.103, "mean_token_accuracy": 0.7004610115160694, "step": 55 }, { "epoch": 0.022205259870931925, "grad_norm": 1.2556904554367065, "learning_rate": 4.428044280442805e-06, "loss": 1.0614, "mean_token_accuracy": 0.7098527718822583, "step": 60 }, { "epoch": 0.024055698193509587, "grad_norm": 1.5663235187530518, "learning_rate": 4.797047970479705e-06, "loss": 1.0304, "mean_token_accuracy": 0.7172534410927646, "step": 65 }, { "epoch": 0.02590613651608725, "grad_norm": 1.2256712913513184, "learning_rate": 5.166051660516605e-06, "loss": 1.067, "mean_token_accuracy": 0.7091192188708707, "step": 70 }, { "epoch": 0.027756574838664907, "grad_norm": 1.2844516038894653, "learning_rate": 5.535055350553506e-06, "loss": 1.0161, "mean_token_accuracy": 0.7196393054758531, "step": 75 }, { "epoch": 0.02960701316124257, "grad_norm": 1.3047950267791748, "learning_rate": 5.904059040590406e-06, "loss": 0.988, "mean_token_accuracy": 0.7261756622816481, "step": 80 }, { "epoch": 0.03145745148382023, "grad_norm": 1.1991015672683716, "learning_rate": 6.273062730627307e-06, "loss": 1.0304, "mean_token_accuracy": 0.7153248429171789, "step": 85 }, { "epoch": 0.03330788980639789, "grad_norm": 1.422570824623108, "learning_rate": 6.642066420664207e-06, "loss": 0.981, "mean_token_accuracy": 0.7279154357680314, "step": 90 }, { "epoch": 0.03515832812897555, "grad_norm": 1.2546910047531128, "learning_rate": 7.011070110701108e-06, "loss": 1.0018, "mean_token_accuracy": 0.7217563432093013, "step": 95 }, { "epoch": 0.03700876645155321, "grad_norm": 1.2376253604888916, "learning_rate": 7.380073800738008e-06, "loss": 0.9746, "mean_token_accuracy": 0.7268496288420679, "step": 100 }, { "epoch": 0.03700876645155321, "eval_loss": 1.015299916267395, "eval_mean_token_accuracy": 0.7169665389206827, "eval_runtime": 50.7642, "eval_samples_per_second": 10.106, "eval_steps_per_second": 10.106, "step": 100 }, { "epoch": 0.03885920477413087, "grad_norm": 1.1947935819625854, "learning_rate": 7.749077490774908e-06, "loss": 0.9884, "mean_token_accuracy": 0.7237562053639273, "step": 105 }, { "epoch": 0.040709643096708534, "grad_norm": 1.1547764539718628, "learning_rate": 8.11808118081181e-06, "loss": 1.0021, "mean_token_accuracy": 0.7196395141478394, "step": 110 }, { "epoch": 0.042560081419286196, "grad_norm": 1.1664645671844482, "learning_rate": 8.48708487084871e-06, "loss": 0.9492, "mean_token_accuracy": 0.7334516951299597, "step": 115 }, { "epoch": 0.04441051974186385, "grad_norm": 1.3240509033203125, "learning_rate": 8.85608856088561e-06, "loss": 0.9716, "mean_token_accuracy": 0.7275700433987599, "step": 120 }, { "epoch": 0.04626095806444151, "grad_norm": 1.388859748840332, "learning_rate": 9.22509225092251e-06, "loss": 0.987, "mean_token_accuracy": 0.7230461683516555, "step": 125 }, { "epoch": 0.048111396387019174, "grad_norm": 1.271419882774353, "learning_rate": 9.59409594095941e-06, "loss": 0.9604, "mean_token_accuracy": 0.7282658568793423, "step": 130 }, { "epoch": 0.049961834709596836, "grad_norm": 1.343276023864746, "learning_rate": 9.963099630996312e-06, "loss": 0.9697, "mean_token_accuracy": 0.7295744106084986, "step": 135 }, { "epoch": 0.0518122730321745, "grad_norm": 1.2812855243682861, "learning_rate": 1.033210332103321e-05, "loss": 0.9761, "mean_token_accuracy": 0.7243435776946641, "step": 140 }, { "epoch": 0.05366271135475216, "grad_norm": 1.296567440032959, "learning_rate": 1.0701107011070112e-05, "loss": 0.9804, "mean_token_accuracy": 0.7237954107958932, "step": 145 }, { "epoch": 0.055513149677329815, "grad_norm": 1.3086339235305786, "learning_rate": 1.1070110701107012e-05, "loss": 0.9411, "mean_token_accuracy": 0.7346382404393238, "step": 150 }, { "epoch": 0.05736358799990748, "grad_norm": 1.3291301727294922, "learning_rate": 1.1439114391143913e-05, "loss": 0.9385, "mean_token_accuracy": 0.7342997457804343, "step": 155 }, { "epoch": 0.05921402632248514, "grad_norm": 1.1311614513397217, "learning_rate": 1.1808118081180812e-05, "loss": 0.9666, "mean_token_accuracy": 0.7260505604675033, "step": 160 }, { "epoch": 0.0610644646450628, "grad_norm": 1.2835899591445923, "learning_rate": 1.2177121771217713e-05, "loss": 0.977, "mean_token_accuracy": 0.7244513931183378, "step": 165 }, { "epoch": 0.06291490296764046, "grad_norm": 1.2636359930038452, "learning_rate": 1.2546125461254614e-05, "loss": 0.9551, "mean_token_accuracy": 0.7282803329395334, "step": 170 }, { "epoch": 0.06476534129021812, "grad_norm": 1.5195773839950562, "learning_rate": 1.2915129151291515e-05, "loss": 0.9524, "mean_token_accuracy": 0.7309458986105004, "step": 175 }, { "epoch": 0.06661577961279579, "grad_norm": 1.256447196006775, "learning_rate": 1.3284132841328414e-05, "loss": 0.9232, "mean_token_accuracy": 0.7378681775799953, "step": 180 }, { "epoch": 0.06846621793537344, "grad_norm": 1.2798049449920654, "learning_rate": 1.3653136531365315e-05, "loss": 0.9897, "mean_token_accuracy": 0.7213663838416546, "step": 185 }, { "epoch": 0.0703166562579511, "grad_norm": 1.5569769144058228, "learning_rate": 1.4022140221402215e-05, "loss": 0.9585, "mean_token_accuracy": 0.7270874163121077, "step": 190 }, { "epoch": 0.07216709458052877, "grad_norm": 1.1950033903121948, "learning_rate": 1.4391143911439116e-05, "loss": 0.9519, "mean_token_accuracy": 0.7320935494632173, "step": 195 }, { "epoch": 0.07401753290310642, "grad_norm": 1.2576892375946045, "learning_rate": 1.4760147601476015e-05, "loss": 0.9207, "mean_token_accuracy": 0.736455222510665, "step": 200 }, { "epoch": 0.07401753290310642, "eval_loss": 0.9724639058113098, "eval_mean_token_accuracy": 0.7245206744181131, "eval_runtime": 50.6905, "eval_samples_per_second": 10.12, "eval_steps_per_second": 10.12, "step": 200 }, { "epoch": 0.07586797122568409, "grad_norm": 1.32821524143219, "learning_rate": 1.5129151291512916e-05, "loss": 0.9343, "mean_token_accuracy": 0.7352068673658746, "step": 205 }, { "epoch": 0.07771840954826174, "grad_norm": 1.2804689407348633, "learning_rate": 1.5498154981549817e-05, "loss": 0.9867, "mean_token_accuracy": 0.71895514692202, "step": 210 }, { "epoch": 0.0795688478708394, "grad_norm": 1.3231711387634277, "learning_rate": 1.5867158671586716e-05, "loss": 0.9472, "mean_token_accuracy": 0.7307138213343876, "step": 215 }, { "epoch": 0.08141928619341707, "grad_norm": 1.4526623487472534, "learning_rate": 1.623616236162362e-05, "loss": 0.9821, "mean_token_accuracy": 0.7217701773341699, "step": 220 }, { "epoch": 0.08326972451599472, "grad_norm": 1.317412257194519, "learning_rate": 1.6605166051660518e-05, "loss": 0.9396, "mean_token_accuracy": 0.7325251665277629, "step": 225 }, { "epoch": 0.08512016283857239, "grad_norm": 1.4098377227783203, "learning_rate": 1.697416974169742e-05, "loss": 0.9651, "mean_token_accuracy": 0.7255207585860529, "step": 230 }, { "epoch": 0.08697060116115005, "grad_norm": 1.3292772769927979, "learning_rate": 1.734317343173432e-05, "loss": 0.9247, "mean_token_accuracy": 0.7352377124328339, "step": 235 }, { "epoch": 0.0888210394837277, "grad_norm": 1.2886641025543213, "learning_rate": 1.771217712177122e-05, "loss": 0.9342, "mean_token_accuracy": 0.7347689630621619, "step": 240 }, { "epoch": 0.09067147780630537, "grad_norm": 1.4062870740890503, "learning_rate": 1.8081180811808117e-05, "loss": 0.9255, "mean_token_accuracy": 0.7344553522178484, "step": 245 }, { "epoch": 0.09252191612888302, "grad_norm": 1.415042757987976, "learning_rate": 1.845018450184502e-05, "loss": 0.9222, "mean_token_accuracy": 0.738081657358488, "step": 250 }, { "epoch": 0.0943723544514607, "grad_norm": 1.1698943376541138, "learning_rate": 1.8819188191881922e-05, "loss": 0.9294, "mean_token_accuracy": 0.7366268991041479, "step": 255 }, { "epoch": 0.09622279277403835, "grad_norm": 1.2418172359466553, "learning_rate": 1.918819188191882e-05, "loss": 0.9143, "mean_token_accuracy": 0.7391926803693939, "step": 260 }, { "epoch": 0.09807323109661602, "grad_norm": 1.5692613124847412, "learning_rate": 1.955719557195572e-05, "loss": 0.926, "mean_token_accuracy": 0.7368639311244666, "step": 265 }, { "epoch": 0.09992366941919367, "grad_norm": 1.1764543056488037, "learning_rate": 1.9926199261992623e-05, "loss": 0.9358, "mean_token_accuracy": 0.7338136905409297, "step": 270 }, { "epoch": 0.10177410774177133, "grad_norm": 1.240388035774231, "learning_rate": 1.9999866396188624e-05, "loss": 0.9157, "mean_token_accuracy": 0.7374722373906926, "step": 275 }, { "epoch": 0.103624546064349, "grad_norm": 1.2100290060043335, "learning_rate": 1.9999323636823398e-05, "loss": 0.9179, "mean_token_accuracy": 0.7363942630932592, "step": 280 }, { "epoch": 0.10547498438692665, "grad_norm": 1.2169815301895142, "learning_rate": 1.9998363394309497e-05, "loss": 0.9379, "mean_token_accuracy": 0.7341725861043149, "step": 285 }, { "epoch": 0.10732542270950432, "grad_norm": 1.3079049587249756, "learning_rate": 1.9996985708738146e-05, "loss": 0.9318, "mean_token_accuracy": 0.7340476100242543, "step": 290 }, { "epoch": 0.10917586103208198, "grad_norm": 1.3396153450012207, "learning_rate": 1.999519063762928e-05, "loss": 0.9187, "mean_token_accuracy": 0.7370102479106311, "step": 295 }, { "epoch": 0.11102629935465963, "grad_norm": 1.274489402770996, "learning_rate": 1.9992978255929168e-05, "loss": 0.884, "mean_token_accuracy": 0.7474725751891694, "step": 300 }, { "epoch": 0.11102629935465963, "eval_loss": 0.9594149589538574, "eval_mean_token_accuracy": 0.7260321389978355, "eval_runtime": 78.825, "eval_samples_per_second": 6.508, "eval_steps_per_second": 6.508, "step": 300 }, { "epoch": 0.1128767376772373, "grad_norm": 1.2882938385009766, "learning_rate": 1.999034865600726e-05, "loss": 0.8989, "mean_token_accuracy": 0.7411822076550901, "step": 305 }, { "epoch": 0.11472717599981495, "grad_norm": 1.2861721515655518, "learning_rate": 1.9987301947652354e-05, "loss": 0.9024, "mean_token_accuracy": 0.7414158014701427, "step": 310 }, { "epoch": 0.11657761432239262, "grad_norm": 1.4147839546203613, "learning_rate": 1.998383825806799e-05, "loss": 0.9163, "mean_token_accuracy": 0.7390650543887309, "step": 315 }, { "epoch": 0.11842805264497028, "grad_norm": 1.2372761964797974, "learning_rate": 1.9979957731867143e-05, "loss": 0.9133, "mean_token_accuracy": 0.7386950204852115, "step": 320 }, { "epoch": 0.12027849096754793, "grad_norm": 1.2764817476272583, "learning_rate": 1.9975660531066215e-05, "loss": 0.9377, "mean_token_accuracy": 0.7348123426639266, "step": 325 }, { "epoch": 0.1221289292901256, "grad_norm": 1.178112268447876, "learning_rate": 1.9970946835078227e-05, "loss": 0.9341, "mean_token_accuracy": 0.7334139550957592, "step": 330 }, { "epoch": 0.12397936761270326, "grad_norm": 1.2661017179489136, "learning_rate": 1.9965816840705355e-05, "loss": 0.9666, "mean_token_accuracy": 0.723251078215526, "step": 335 }, { "epoch": 0.12582980593528093, "grad_norm": 1.20822274684906, "learning_rate": 1.9960270762130705e-05, "loss": 0.9388, "mean_token_accuracy": 0.7318659879034792, "step": 340 }, { "epoch": 0.1276802442578586, "grad_norm": 1.260669469833374, "learning_rate": 1.9954308830909372e-05, "loss": 0.8983, "mean_token_accuracy": 0.7420506330369535, "step": 345 }, { "epoch": 0.12953068258043623, "grad_norm": 1.22865891456604, "learning_rate": 1.9947931295958778e-05, "loss": 0.8898, "mean_token_accuracy": 0.74350299707088, "step": 350 }, { "epoch": 0.1313811209030139, "grad_norm": 1.306064486503601, "learning_rate": 1.9941138423548266e-05, "loss": 0.9358, "mean_token_accuracy": 0.7305222666688053, "step": 355 }, { "epoch": 0.13323155922559157, "grad_norm": 1.2020059823989868, "learning_rate": 1.9933930497287996e-05, "loss": 0.9269, "mean_token_accuracy": 0.7334965080243132, "step": 360 }, { "epoch": 0.1350819975481692, "grad_norm": 1.274563193321228, "learning_rate": 1.9926307818117098e-05, "loss": 0.9225, "mean_token_accuracy": 0.735283383137161, "step": 365 }, { "epoch": 0.13693243587074688, "grad_norm": 1.2880144119262695, "learning_rate": 1.9918270704291104e-05, "loss": 0.9039, "mean_token_accuracy": 0.7401982686077297, "step": 370 }, { "epoch": 0.13878287419332455, "grad_norm": 1.5040298700332642, "learning_rate": 1.9909819491368677e-05, "loss": 0.9211, "mean_token_accuracy": 0.7366455081833134, "step": 375 }, { "epoch": 0.1406333125159022, "grad_norm": 1.1860566139221191, "learning_rate": 1.990095453219757e-05, "loss": 0.9008, "mean_token_accuracy": 0.7395652659782839, "step": 380 }, { "epoch": 0.14248375083847986, "grad_norm": 1.2104592323303223, "learning_rate": 1.989167619689993e-05, "loss": 0.9402, "mean_token_accuracy": 0.7305055021182448, "step": 385 }, { "epoch": 0.14433418916105753, "grad_norm": 1.305290937423706, "learning_rate": 1.988198487285682e-05, "loss": 0.9542, "mean_token_accuracy": 0.7249303930518376, "step": 390 }, { "epoch": 0.1461846274836352, "grad_norm": 1.2977765798568726, "learning_rate": 1.9871880964692055e-05, "loss": 0.9075, "mean_token_accuracy": 0.7387939424587044, "step": 395 }, { "epoch": 0.14803506580621284, "grad_norm": 1.309501051902771, "learning_rate": 1.9861364894255306e-05, "loss": 0.9207, "mean_token_accuracy": 0.7343055812075197, "step": 400 }, { "epoch": 0.14803506580621284, "eval_loss": 0.9451568126678467, "eval_mean_token_accuracy": 0.72881764551406, "eval_runtime": 79.77, "eval_samples_per_second": 6.431, "eval_steps_per_second": 6.431, "step": 400 }, { "epoch": 0.1498855041287905, "grad_norm": 1.2279026508331299, "learning_rate": 1.985043710060449e-05, "loss": 0.8866, "mean_token_accuracy": 0.7445145321342795, "step": 405 }, { "epoch": 0.15173594245136818, "grad_norm": 1.4455894231796265, "learning_rate": 1.9839098039987435e-05, "loss": 0.9268, "mean_token_accuracy": 0.7340181375673809, "step": 410 }, { "epoch": 0.15358638077394582, "grad_norm": 1.2705790996551514, "learning_rate": 1.9827348185822834e-05, "loss": 0.8996, "mean_token_accuracy": 0.7409869253740823, "step": 415 }, { "epoch": 0.1554368190965235, "grad_norm": 1.206468939781189, "learning_rate": 1.981518802868048e-05, "loss": 0.9142, "mean_token_accuracy": 0.7350925363737065, "step": 420 }, { "epoch": 0.15728725741910116, "grad_norm": 1.1508530378341675, "learning_rate": 1.9802618076260784e-05, "loss": 0.9287, "mean_token_accuracy": 0.7330718566128385, "step": 425 }, { "epoch": 0.1591376957416788, "grad_norm": 1.3081907033920288, "learning_rate": 1.9789638853373563e-05, "loss": 0.9385, "mean_token_accuracy": 0.731924339293188, "step": 430 }, { "epoch": 0.16098813406425647, "grad_norm": 1.292937994003296, "learning_rate": 1.9776250901916168e-05, "loss": 0.8869, "mean_token_accuracy": 0.7459759129187684, "step": 435 }, { "epoch": 0.16283857238683414, "grad_norm": 1.196614384651184, "learning_rate": 1.9762454780850807e-05, "loss": 0.9365, "mean_token_accuracy": 0.7308315683480648, "step": 440 }, { "epoch": 0.1646890107094118, "grad_norm": 1.2768101692199707, "learning_rate": 1.9748251066181247e-05, "loss": 0.9458, "mean_token_accuracy": 0.7291328350939127, "step": 445 }, { "epoch": 0.16653944903198944, "grad_norm": 1.28200364112854, "learning_rate": 1.973364035092875e-05, "loss": 0.9296, "mean_token_accuracy": 0.7317795343677007, "step": 450 }, { "epoch": 0.1683898873545671, "grad_norm": 1.200141429901123, "learning_rate": 1.971862324510732e-05, "loss": 0.9127, "mean_token_accuracy": 0.7369081696753893, "step": 455 }, { "epoch": 0.17024032567714478, "grad_norm": 1.2196623086929321, "learning_rate": 1.9703200375698223e-05, "loss": 0.9347, "mean_token_accuracy": 0.7317238542786955, "step": 460 }, { "epoch": 0.17209076399972242, "grad_norm": 1.1578480005264282, "learning_rate": 1.968737238662382e-05, "loss": 0.9266, "mean_token_accuracy": 0.7334499338025752, "step": 465 }, { "epoch": 0.1739412023223001, "grad_norm": 1.2814745903015137, "learning_rate": 1.9671139938720678e-05, "loss": 0.9161, "mean_token_accuracy": 0.7351944384479114, "step": 470 }, { "epoch": 0.17579164064487776, "grad_norm": 1.2746986150741577, "learning_rate": 1.9654503709711984e-05, "loss": 0.9144, "mean_token_accuracy": 0.738140942384358, "step": 475 }, { "epoch": 0.1776420789674554, "grad_norm": 1.1708147525787354, "learning_rate": 1.963746439417924e-05, "loss": 0.906, "mean_token_accuracy": 0.7386923400408494, "step": 480 }, { "epoch": 0.17949251729003307, "grad_norm": 1.220799207687378, "learning_rate": 1.962002270353328e-05, "loss": 0.9308, "mean_token_accuracy": 0.7312089465302491, "step": 485 }, { "epoch": 0.18134295561261074, "grad_norm": 1.379558801651001, "learning_rate": 1.960217936598454e-05, "loss": 0.8983, "mean_token_accuracy": 0.741063724019383, "step": 490 }, { "epoch": 0.1831933939351884, "grad_norm": 1.2207688093185425, "learning_rate": 1.958393512651269e-05, "loss": 0.9266, "mean_token_accuracy": 0.7335562049097655, "step": 495 }, { "epoch": 0.18504383225776605, "grad_norm": 1.1398905515670776, "learning_rate": 1.956529074683551e-05, "loss": 0.9342, "mean_token_accuracy": 0.7320654199930504, "step": 500 }, { "epoch": 0.18504383225776605, "eval_loss": 0.9367051720619202, "eval_mean_token_accuracy": 0.7303092480029469, "eval_runtime": 66.3213, "eval_samples_per_second": 7.735, "eval_steps_per_second": 7.735, "step": 500 }, { "epoch": 0.18689427058034372, "grad_norm": 1.2544001340866089, "learning_rate": 1.9546247005377065e-05, "loss": 0.8923, "mean_token_accuracy": 0.7418732998401973, "step": 505 }, { "epoch": 0.1887447089029214, "grad_norm": 1.2007049322128296, "learning_rate": 1.952680469723526e-05, "loss": 0.9018, "mean_token_accuracy": 0.7390339266062526, "step": 510 }, { "epoch": 0.19059514722549903, "grad_norm": 1.3158226013183594, "learning_rate": 1.9506964634148597e-05, "loss": 0.8859, "mean_token_accuracy": 0.7457988652681784, "step": 515 }, { "epoch": 0.1924455855480767, "grad_norm": 1.191190242767334, "learning_rate": 1.9486727644462306e-05, "loss": 0.907, "mean_token_accuracy": 0.7398091503885992, "step": 520 }, { "epoch": 0.19429602387065437, "grad_norm": 1.1855626106262207, "learning_rate": 1.9466094573093744e-05, "loss": 0.913, "mean_token_accuracy": 0.7377695160507135, "step": 525 }, { "epoch": 0.19614646219323204, "grad_norm": 1.226499080657959, "learning_rate": 1.9445066281497144e-05, "loss": 0.9199, "mean_token_accuracy": 0.7334141588016627, "step": 530 }, { "epoch": 0.19799690051580968, "grad_norm": 1.1852086782455444, "learning_rate": 1.9423643647627625e-05, "loss": 0.8999, "mean_token_accuracy": 0.7383552793509549, "step": 535 }, { "epoch": 0.19984733883838735, "grad_norm": 1.1205965280532837, "learning_rate": 1.940182756590454e-05, "loss": 0.9184, "mean_token_accuracy": 0.736318010130394, "step": 540 }, { "epoch": 0.201697777160965, "grad_norm": 1.2531499862670898, "learning_rate": 1.9379618947174155e-05, "loss": 0.9443, "mean_token_accuracy": 0.7257061607299684, "step": 545 }, { "epoch": 0.20354821548354265, "grad_norm": 1.2362135648727417, "learning_rate": 1.935701871867158e-05, "loss": 0.8954, "mean_token_accuracy": 0.7395217784179058, "step": 550 }, { "epoch": 0.20539865380612032, "grad_norm": 1.1606743335723877, "learning_rate": 1.9334027823982103e-05, "loss": 0.891, "mean_token_accuracy": 0.743093979178, "step": 555 }, { "epoch": 0.207249092128698, "grad_norm": 1.2009831666946411, "learning_rate": 1.9310647223001752e-05, "loss": 0.9474, "mean_token_accuracy": 0.7266491757165843, "step": 560 }, { "epoch": 0.20909953045127563, "grad_norm": 1.1111478805541992, "learning_rate": 1.9286877891897244e-05, "loss": 0.8886, "mean_token_accuracy": 0.7429377197565084, "step": 565 }, { "epoch": 0.2109499687738533, "grad_norm": 1.199816107749939, "learning_rate": 1.9262720823065217e-05, "loss": 0.9071, "mean_token_accuracy": 0.7380534542801875, "step": 570 }, { "epoch": 0.21280040709643097, "grad_norm": 1.2556802034378052, "learning_rate": 1.923817702509081e-05, "loss": 0.9063, "mean_token_accuracy": 0.7376037582197961, "step": 575 }, { "epoch": 0.21465084541900864, "grad_norm": 1.175026535987854, "learning_rate": 1.9213247522705532e-05, "loss": 0.9126, "mean_token_accuracy": 0.7368979682438305, "step": 580 }, { "epoch": 0.21650128374158628, "grad_norm": 1.1675595045089722, "learning_rate": 1.9187933356744504e-05, "loss": 0.876, "mean_token_accuracy": 0.7452829314493784, "step": 585 }, { "epoch": 0.21835172206416395, "grad_norm": 1.2250741720199585, "learning_rate": 1.9162235584102973e-05, "loss": 0.9056, "mean_token_accuracy": 0.7382800839384349, "step": 590 }, { "epoch": 0.22020216038674162, "grad_norm": 1.2157166004180908, "learning_rate": 1.9136155277692215e-05, "loss": 0.9289, "mean_token_accuracy": 0.7296502066251958, "step": 595 }, { "epoch": 0.22205259870931926, "grad_norm": 1.2010993957519531, "learning_rate": 1.9109693526394722e-05, "loss": 0.9288, "mean_token_accuracy": 0.7325398892814555, "step": 600 }, { "epoch": 0.22205259870931926, "eval_loss": 0.9283245801925659, "eval_mean_token_accuracy": 0.7320316075391832, "eval_runtime": 50.6599, "eval_samples_per_second": 10.126, "eval_steps_per_second": 10.126, "step": 600 }, { "epoch": 0.22390303703189693, "grad_norm": 1.1503756046295166, "learning_rate": 1.9082851435018743e-05, "loss": 0.8932, "mean_token_accuracy": 0.7414583125820019, "step": 605 }, { "epoch": 0.2257534753544746, "grad_norm": 1.2121210098266602, "learning_rate": 1.905563012425216e-05, "loss": 0.91, "mean_token_accuracy": 0.7360977430386327, "step": 610 }, { "epoch": 0.22760391367705224, "grad_norm": 1.2150719165802002, "learning_rate": 1.9028030730615696e-05, "loss": 0.9003, "mean_token_accuracy": 0.738964053140992, "step": 615 }, { "epoch": 0.2294543519996299, "grad_norm": 1.1619393825531006, "learning_rate": 1.9000054406415467e-05, "loss": 0.8475, "mean_token_accuracy": 0.754003586658783, "step": 620 }, { "epoch": 0.23130479032220758, "grad_norm": 1.1259123086929321, "learning_rate": 1.897170231969486e-05, "loss": 0.8836, "mean_token_accuracy": 0.7426343946369743, "step": 625 }, { "epoch": 0.23315522864478525, "grad_norm": 1.2872394323349, "learning_rate": 1.8942975654185788e-05, "loss": 0.9003, "mean_token_accuracy": 0.7400158833605734, "step": 630 }, { "epoch": 0.2350056669673629, "grad_norm": 1.1954014301300049, "learning_rate": 1.8913875609259246e-05, "loss": 0.9009, "mean_token_accuracy": 0.7401640259366237, "step": 635 }, { "epoch": 0.23685610528994056, "grad_norm": 1.3242663145065308, "learning_rate": 1.8884403399875252e-05, "loss": 0.9173, "mean_token_accuracy": 0.7366328368067789, "step": 640 }, { "epoch": 0.23870654361251822, "grad_norm": 1.1375986337661743, "learning_rate": 1.8854560256532098e-05, "loss": 0.9079, "mean_token_accuracy": 0.738805148743965, "step": 645 }, { "epoch": 0.24055698193509586, "grad_norm": 1.1969835758209229, "learning_rate": 1.8824347425215016e-05, "loss": 0.9001, "mean_token_accuracy": 0.7413721409763886, "step": 650 }, { "epoch": 0.24240742025767353, "grad_norm": 1.1655569076538086, "learning_rate": 1.8793766167344115e-05, "loss": 0.8596, "mean_token_accuracy": 0.7499591110003503, "step": 655 }, { "epoch": 0.2442578585802512, "grad_norm": 1.2159533500671387, "learning_rate": 1.8762817759721735e-05, "loss": 0.9074, "mean_token_accuracy": 0.7375846045002021, "step": 660 }, { "epoch": 0.24610829690282887, "grad_norm": 1.1925801038742065, "learning_rate": 1.8731503494479132e-05, "loss": 0.8976, "mean_token_accuracy": 0.7398126959719362, "step": 665 }, { "epoch": 0.2479587352254065, "grad_norm": 1.2198364734649658, "learning_rate": 1.869982467902255e-05, "loss": 0.8633, "mean_token_accuracy": 0.7489514762175022, "step": 670 }, { "epoch": 0.24980917354798418, "grad_norm": 1.0977855920791626, "learning_rate": 1.8667782635978597e-05, "loss": 0.8873, "mean_token_accuracy": 0.7421541213702059, "step": 675 }, { "epoch": 0.25165961187056185, "grad_norm": 1.2008975744247437, "learning_rate": 1.8635378703139066e-05, "loss": 0.9017, "mean_token_accuracy": 0.7381848400310442, "step": 680 }, { "epoch": 0.2535100501931395, "grad_norm": 1.2168858051300049, "learning_rate": 1.8602614233405047e-05, "loss": 0.9115, "mean_token_accuracy": 0.7354954553001021, "step": 685 }, { "epoch": 0.2553604885157172, "grad_norm": 1.3037954568862915, "learning_rate": 1.8569490594730474e-05, "loss": 0.8864, "mean_token_accuracy": 0.7435983539382823, "step": 690 }, { "epoch": 0.25721092683829483, "grad_norm": 1.2055126428604126, "learning_rate": 1.853600917006497e-05, "loss": 0.8766, "mean_token_accuracy": 0.7440811039197984, "step": 695 }, { "epoch": 0.25906136516087247, "grad_norm": 1.1308127641677856, "learning_rate": 1.8502171357296144e-05, "loss": 0.8857, "mean_token_accuracy": 0.7449071002901505, "step": 700 }, { "epoch": 0.25906136516087247, "eval_loss": 0.921642541885376, "eval_mean_token_accuracy": 0.7332576670109641, "eval_runtime": 50.7893, "eval_samples_per_second": 10.101, "eval_steps_per_second": 10.101, "step": 700 }, { "epoch": 0.26091180348345017, "grad_norm": 1.186762809753418, "learning_rate": 1.8467978569191216e-05, "loss": 0.9174, "mean_token_accuracy": 0.7333227510987468, "step": 705 }, { "epoch": 0.2627622418060278, "grad_norm": 1.2207229137420654, "learning_rate": 1.8433432233338027e-05, "loss": 0.8746, "mean_token_accuracy": 0.7463131881651428, "step": 710 }, { "epoch": 0.26461268012860545, "grad_norm": 1.196184754371643, "learning_rate": 1.8398533792085436e-05, "loss": 0.88, "mean_token_accuracy": 0.7460801215672447, "step": 715 }, { "epoch": 0.26646311845118315, "grad_norm": 1.1278691291809082, "learning_rate": 1.8363284702483106e-05, "loss": 0.9095, "mean_token_accuracy": 0.7360591613918188, "step": 720 }, { "epoch": 0.2683135567737608, "grad_norm": 1.1802078485488892, "learning_rate": 1.832768643622067e-05, "loss": 0.9267, "mean_token_accuracy": 0.7342043041294353, "step": 725 }, { "epoch": 0.2701639950963384, "grad_norm": 1.1267465353012085, "learning_rate": 1.8291740479566286e-05, "loss": 0.8988, "mean_token_accuracy": 0.7399242844986644, "step": 730 }, { "epoch": 0.2720144334189161, "grad_norm": 1.0495985746383667, "learning_rate": 1.825544833330457e-05, "loss": 0.8687, "mean_token_accuracy": 0.7469224892009212, "step": 735 }, { "epoch": 0.27386487174149377, "grad_norm": 1.146759271621704, "learning_rate": 1.8218811512673958e-05, "loss": 0.8858, "mean_token_accuracy": 0.744181844578751, "step": 740 }, { "epoch": 0.2757153100640714, "grad_norm": 1.1623131036758423, "learning_rate": 1.818183154730344e-05, "loss": 0.8777, "mean_token_accuracy": 0.7464787908985644, "step": 745 }, { "epoch": 0.2775657483866491, "grad_norm": 1.1983212232589722, "learning_rate": 1.8144509981148675e-05, "loss": 0.8857, "mean_token_accuracy": 0.74380218229527, "step": 750 }, { "epoch": 0.27941618670922674, "grad_norm": 1.202833652496338, "learning_rate": 1.810684837242755e-05, "loss": 0.9102, "mean_token_accuracy": 0.7358472675370848, "step": 755 }, { "epoch": 0.2812666250318044, "grad_norm": 1.2793760299682617, "learning_rate": 1.8068848293555118e-05, "loss": 0.8961, "mean_token_accuracy": 0.7392355986645086, "step": 760 }, { "epoch": 0.2831170633543821, "grad_norm": 1.1184508800506592, "learning_rate": 1.8030511331077945e-05, "loss": 0.8868, "mean_token_accuracy": 0.7439213670336557, "step": 765 }, { "epoch": 0.2849675016769597, "grad_norm": 1.1284053325653076, "learning_rate": 1.799183908560787e-05, "loss": 0.8959, "mean_token_accuracy": 0.7389006136503797, "step": 770 }, { "epoch": 0.28681793999953736, "grad_norm": 1.1895016431808472, "learning_rate": 1.795283317175518e-05, "loss": 0.8829, "mean_token_accuracy": 0.7442627836085055, "step": 775 }, { "epoch": 0.28866837832211506, "grad_norm": 1.0703201293945312, "learning_rate": 1.7913495218061202e-05, "loss": 0.8947, "mean_token_accuracy": 0.7412916509680103, "step": 780 }, { "epoch": 0.2905188166446927, "grad_norm": 1.1624069213867188, "learning_rate": 1.787382686693029e-05, "loss": 0.862, "mean_token_accuracy": 0.7471440251545716, "step": 785 }, { "epoch": 0.2923692549672704, "grad_norm": 1.2251648902893066, "learning_rate": 1.783382977456128e-05, "loss": 0.8923, "mean_token_accuracy": 0.7410979902586569, "step": 790 }, { "epoch": 0.29421969328984804, "grad_norm": 1.143458366394043, "learning_rate": 1.779350561087833e-05, "loss": 0.8853, "mean_token_accuracy": 0.7423898317827671, "step": 795 }, { "epoch": 0.2960701316124257, "grad_norm": 1.1026256084442139, "learning_rate": 1.775285605946119e-05, "loss": 0.8586, "mean_token_accuracy": 0.751576924914566, "step": 800 }, { "epoch": 0.2960701316124257, "eval_loss": 0.9158464670181274, "eval_mean_token_accuracy": 0.7348994079566449, "eval_runtime": 51.1725, "eval_samples_per_second": 10.025, "eval_steps_per_second": 10.025, "step": 800 }, { "epoch": 0.2979205699350034, "grad_norm": 1.1115987300872803, "learning_rate": 1.7711882817474922e-05, "loss": 0.867, "mean_token_accuracy": 0.7480320992990822, "step": 805 }, { "epoch": 0.299771008257581, "grad_norm": 1.2489161491394043, "learning_rate": 1.7670587595599034e-05, "loss": 0.8892, "mean_token_accuracy": 0.7421057530949784, "step": 810 }, { "epoch": 0.30162144658015866, "grad_norm": 1.1725574731826782, "learning_rate": 1.762897211795607e-05, "loss": 0.8758, "mean_token_accuracy": 0.744083576959311, "step": 815 }, { "epoch": 0.30347188490273636, "grad_norm": 1.2797213792800903, "learning_rate": 1.758703812203961e-05, "loss": 0.89, "mean_token_accuracy": 0.7418882377620613, "step": 820 }, { "epoch": 0.305322323225314, "grad_norm": 1.2301912307739258, "learning_rate": 1.7544787358641735e-05, "loss": 0.8787, "mean_token_accuracy": 0.7447037499906078, "step": 825 }, { "epoch": 0.30717276154789164, "grad_norm": 1.285537600517273, "learning_rate": 1.7502221591779932e-05, "loss": 0.88, "mean_token_accuracy": 0.744204531409647, "step": 830 }, { "epoch": 0.30902319987046933, "grad_norm": 1.1410529613494873, "learning_rate": 1.7459342598623438e-05, "loss": 0.8914, "mean_token_accuracy": 0.7435018667641985, "step": 835 }, { "epoch": 0.310873638193047, "grad_norm": 1.1405855417251587, "learning_rate": 1.741615216941905e-05, "loss": 0.893, "mean_token_accuracy": 0.7407222726883036, "step": 840 }, { "epoch": 0.3127240765156246, "grad_norm": 1.0677855014801025, "learning_rate": 1.7372652107416364e-05, "loss": 0.9012, "mean_token_accuracy": 0.7392550453159372, "step": 845 }, { "epoch": 0.3145745148382023, "grad_norm": 1.1801300048828125, "learning_rate": 1.7328844228792513e-05, "loss": 0.8887, "mean_token_accuracy": 0.7418257843405092, "step": 850 }, { "epoch": 0.31642495316077995, "grad_norm": 1.101582407951355, "learning_rate": 1.7284730362576308e-05, "loss": 0.88, "mean_token_accuracy": 0.7453135253974471, "step": 855 }, { "epoch": 0.3182753914833576, "grad_norm": 1.1998122930526733, "learning_rate": 1.7240312350571905e-05, "loss": 0.8864, "mean_token_accuracy": 0.7427352454319888, "step": 860 }, { "epoch": 0.3201258298059353, "grad_norm": 1.13681161403656, "learning_rate": 1.719559204728188e-05, "loss": 0.901, "mean_token_accuracy": 0.7380651694454986, "step": 865 }, { "epoch": 0.32197626812851293, "grad_norm": 1.216718316078186, "learning_rate": 1.715057131982983e-05, "loss": 0.8885, "mean_token_accuracy": 0.7411097870617352, "step": 870 }, { "epoch": 0.32382670645109063, "grad_norm": 1.1352806091308594, "learning_rate": 1.710525204788239e-05, "loss": 0.8727, "mean_token_accuracy": 0.7462809426313219, "step": 875 }, { "epoch": 0.32567714477366827, "grad_norm": 1.2423046827316284, "learning_rate": 1.7059636123570767e-05, "loss": 0.9074, "mean_token_accuracy": 0.7375009900783651, "step": 880 }, { "epoch": 0.3275275830962459, "grad_norm": 1.0711945295333862, "learning_rate": 1.7013725451411757e-05, "loss": 0.8866, "mean_token_accuracy": 0.7420681995170615, "step": 885 }, { "epoch": 0.3293780214188236, "grad_norm": 1.147957444190979, "learning_rate": 1.696752194822819e-05, "loss": 0.8758, "mean_token_accuracy": 0.7470981950404637, "step": 890 }, { "epoch": 0.33122845974140125, "grad_norm": 1.1426104307174683, "learning_rate": 1.692102754306895e-05, "loss": 0.888, "mean_token_accuracy": 0.7420897783947114, "step": 895 }, { "epoch": 0.3330788980639789, "grad_norm": 1.12646484375, "learning_rate": 1.6874244177128395e-05, "loss": 0.891, "mean_token_accuracy": 0.7410190186584571, "step": 900 }, { "epoch": 0.3330788980639789, "eval_loss": 0.9115377068519592, "eval_mean_token_accuracy": 0.7353900755746844, "eval_runtime": 78.6969, "eval_samples_per_second": 6.519, "eval_steps_per_second": 6.519, "step": 900 }, { "epoch": 0.3349293363865566, "grad_norm": 1.1655060052871704, "learning_rate": 1.6827173803665328e-05, "loss": 0.8861, "mean_token_accuracy": 0.7420408174595587, "step": 905 }, { "epoch": 0.3367797747091342, "grad_norm": 1.189650058746338, "learning_rate": 1.677981838792144e-05, "loss": 0.8939, "mean_token_accuracy": 0.741134526619554, "step": 910 }, { "epoch": 0.33863021303171187, "grad_norm": 1.301318645477295, "learning_rate": 1.6732179907039266e-05, "loss": 0.9091, "mean_token_accuracy": 0.7390976253952309, "step": 915 }, { "epoch": 0.34048065135428957, "grad_norm": 1.250311255455017, "learning_rate": 1.6684260349979637e-05, "loss": 0.8621, "mean_token_accuracy": 0.748405369402981, "step": 920 }, { "epoch": 0.3423310896768672, "grad_norm": 1.1436880826950073, "learning_rate": 1.6636061717438626e-05, "loss": 0.8591, "mean_token_accuracy": 0.7498006647906157, "step": 925 }, { "epoch": 0.34418152799944485, "grad_norm": 1.0761877298355103, "learning_rate": 1.6587586021764022e-05, "loss": 0.8765, "mean_token_accuracy": 0.7454029311737017, "step": 930 }, { "epoch": 0.34603196632202254, "grad_norm": 1.1089153289794922, "learning_rate": 1.653883528687133e-05, "loss": 0.8434, "mean_token_accuracy": 0.7529633045613144, "step": 935 }, { "epoch": 0.3478824046446002, "grad_norm": 1.1343047618865967, "learning_rate": 1.6489811548159245e-05, "loss": 0.8618, "mean_token_accuracy": 0.747673238978996, "step": 940 }, { "epoch": 0.3497328429671778, "grad_norm": 1.1737910509109497, "learning_rate": 1.6440516852424678e-05, "loss": 0.8918, "mean_token_accuracy": 0.741031903005348, "step": 945 }, { "epoch": 0.3515832812897555, "grad_norm": 1.3350353240966797, "learning_rate": 1.6390953257777324e-05, "loss": 0.908, "mean_token_accuracy": 0.7361824438406707, "step": 950 }, { "epoch": 0.35343371961233316, "grad_norm": 1.179598331451416, "learning_rate": 1.634112283355369e-05, "loss": 0.9018, "mean_token_accuracy": 0.7387367278989166, "step": 955 }, { "epoch": 0.3552841579349108, "grad_norm": 1.1995991468429565, "learning_rate": 1.6291027660230735e-05, "loss": 0.8618, "mean_token_accuracy": 0.7483973267665744, "step": 960 }, { "epoch": 0.3571345962574885, "grad_norm": 1.1567256450653076, "learning_rate": 1.6240669829338992e-05, "loss": 0.8953, "mean_token_accuracy": 0.736539362307222, "step": 965 }, { "epoch": 0.35898503458006614, "grad_norm": 1.1351323127746582, "learning_rate": 1.6190051443375248e-05, "loss": 0.8847, "mean_token_accuracy": 0.7417752874148211, "step": 970 }, { "epoch": 0.36083547290264384, "grad_norm": 1.1404730081558228, "learning_rate": 1.6139174615714753e-05, "loss": 0.8763, "mean_token_accuracy": 0.7457316258582162, "step": 975 }, { "epoch": 0.3626859112252215, "grad_norm": 1.2140825986862183, "learning_rate": 1.6088041470523005e-05, "loss": 0.8694, "mean_token_accuracy": 0.745424525779393, "step": 980 }, { "epoch": 0.3645363495477991, "grad_norm": 1.0882492065429688, "learning_rate": 1.6036654142667043e-05, "loss": 0.8524, "mean_token_accuracy": 0.7514694551764137, "step": 985 }, { "epoch": 0.3663867878703768, "grad_norm": 1.2724220752716064, "learning_rate": 1.598501477762632e-05, "loss": 0.9048, "mean_token_accuracy": 0.7358279308224847, "step": 990 }, { "epoch": 0.36823722619295446, "grad_norm": 1.059874176979065, "learning_rate": 1.5933125531403135e-05, "loss": 0.8849, "mean_token_accuracy": 0.7424272766741411, "step": 995 }, { "epoch": 0.3700876645155321, "grad_norm": 1.0729840993881226, "learning_rate": 1.5880988570432603e-05, "loss": 0.8809, "mean_token_accuracy": 0.7437243222926472, "step": 1000 }, { "epoch": 0.3700876645155321, "eval_loss": 0.9052200317382812, "eval_mean_token_accuracy": 0.7371176382735689, "eval_runtime": 78.2052, "eval_samples_per_second": 6.56, "eval_steps_per_second": 6.56, "step": 1000 }, { "epoch": 0.3719381028381098, "grad_norm": 1.1344876289367676, "learning_rate": 1.582860607149222e-05, "loss": 0.9101, "mean_token_accuracy": 0.7372656047371745, "step": 1005 }, { "epoch": 0.37378854116068744, "grad_norm": 1.1230980157852173, "learning_rate": 1.5775980221610966e-05, "loss": 0.8475, "mean_token_accuracy": 0.750859103854946, "step": 1010 }, { "epoch": 0.3756389794832651, "grad_norm": 1.2423635721206665, "learning_rate": 1.5723113217978e-05, "loss": 0.8619, "mean_token_accuracy": 0.7491803187809288, "step": 1015 }, { "epoch": 0.3774894178058428, "grad_norm": 1.1783347129821777, "learning_rate": 1.567000726785093e-05, "loss": 0.8685, "mean_token_accuracy": 0.7480539254792099, "step": 1020 }, { "epoch": 0.3793398561284204, "grad_norm": 1.1289883852005005, "learning_rate": 1.561666458846365e-05, "loss": 0.8302, "mean_token_accuracy": 0.7585016645128863, "step": 1025 }, { "epoch": 0.38119029445099806, "grad_norm": 1.1546114683151245, "learning_rate": 1.5563087406933762e-05, "loss": 0.8562, "mean_token_accuracy": 0.7499629460804305, "step": 1030 }, { "epoch": 0.38304073277357575, "grad_norm": 1.1813424825668335, "learning_rate": 1.550927796016961e-05, "loss": 0.8805, "mean_token_accuracy": 0.7437648212454886, "step": 1035 }, { "epoch": 0.3848911710961534, "grad_norm": 1.0651295185089111, "learning_rate": 1.5455238494776876e-05, "loss": 0.897, "mean_token_accuracy": 0.7403527790535602, "step": 1040 }, { "epoch": 0.38674160941873104, "grad_norm": 1.1787759065628052, "learning_rate": 1.5400971266964772e-05, "loss": 0.8274, "mean_token_accuracy": 0.7571744736657067, "step": 1045 }, { "epoch": 0.38859204774130873, "grad_norm": 1.1317715644836426, "learning_rate": 1.5346478542451862e-05, "loss": 0.8492, "mean_token_accuracy": 0.7520489518464395, "step": 1050 }, { "epoch": 0.3904424860638864, "grad_norm": 1.1802644729614258, "learning_rate": 1.529176259637145e-05, "loss": 0.8883, "mean_token_accuracy": 0.7416859799704518, "step": 1055 }, { "epoch": 0.39229292438646407, "grad_norm": 1.1477546691894531, "learning_rate": 1.5236825713176584e-05, "loss": 0.8453, "mean_token_accuracy": 0.7534802458771865, "step": 1060 }, { "epoch": 0.3941433627090417, "grad_norm": 1.0698646306991577, "learning_rate": 1.5181670186544706e-05, "loss": 0.8841, "mean_token_accuracy": 0.7430932063263405, "step": 1065 }, { "epoch": 0.39599380103161935, "grad_norm": 1.133300542831421, "learning_rate": 1.5126298319281859e-05, "loss": 0.8591, "mean_token_accuracy": 0.7490520840317914, "step": 1070 }, { "epoch": 0.39784423935419705, "grad_norm": 1.1771512031555176, "learning_rate": 1.5070712423226552e-05, "loss": 0.8835, "mean_token_accuracy": 0.742742449439321, "step": 1075 }, { "epoch": 0.3996946776767747, "grad_norm": 1.183058500289917, "learning_rate": 1.5014914819153252e-05, "loss": 0.8647, "mean_token_accuracy": 0.7484671674745523, "step": 1080 }, { "epoch": 0.40154511599935233, "grad_norm": 1.1038291454315186, "learning_rate": 1.4958907836675467e-05, "loss": 0.898, "mean_token_accuracy": 0.7388261469205153, "step": 1085 }, { "epoch": 0.40339555432193, "grad_norm": 1.2172880172729492, "learning_rate": 1.490269381414849e-05, "loss": 0.9214, "mean_token_accuracy": 0.7315794003843903, "step": 1090 }, { "epoch": 0.40524599264450767, "grad_norm": 1.1312109231948853, "learning_rate": 1.484627509857178e-05, "loss": 0.8548, "mean_token_accuracy": 0.7498190106815769, "step": 1095 }, { "epoch": 0.4070964309670853, "grad_norm": 1.172813892364502, "learning_rate": 1.4789654045490957e-05, "loss": 0.8523, "mean_token_accuracy": 0.7516241507062726, "step": 1100 }, { "epoch": 0.4070964309670853, "eval_loss": 0.9006803631782532, "eval_mean_token_accuracy": 0.7380126150724604, "eval_runtime": 78.0529, "eval_samples_per_second": 6.572, "eval_steps_per_second": 6.572, "step": 1100 }, { "epoch": 0.408946869289663, "grad_norm": 1.2298842668533325, "learning_rate": 1.4732833018899468e-05, "loss": 0.9118, "mean_token_accuracy": 0.7363720229201065, "step": 1105 }, { "epoch": 0.41079730761224065, "grad_norm": 1.2025749683380127, "learning_rate": 1.4675814391139875e-05, "loss": 0.8789, "mean_token_accuracy": 0.7419271920026719, "step": 1110 }, { "epoch": 0.4126477459348183, "grad_norm": 1.237095832824707, "learning_rate": 1.4618600542804819e-05, "loss": 0.8407, "mean_token_accuracy": 0.7536702060776138, "step": 1115 }, { "epoch": 0.414498184257396, "grad_norm": 1.15714430809021, "learning_rate": 1.4561193862637621e-05, "loss": 0.8755, "mean_token_accuracy": 0.7445992346991004, "step": 1120 }, { "epoch": 0.4163486225799736, "grad_norm": 1.1363252401351929, "learning_rate": 1.4503596747432554e-05, "loss": 0.8493, "mean_token_accuracy": 0.7512692116507991, "step": 1125 }, { "epoch": 0.41819906090255127, "grad_norm": 1.219039797782898, "learning_rate": 1.4445811601934763e-05, "loss": 0.8591, "mean_token_accuracy": 0.7476566205055524, "step": 1130 }, { "epoch": 0.42004949922512896, "grad_norm": 1.061074137687683, "learning_rate": 1.4387840838739875e-05, "loss": 0.8537, "mean_token_accuracy": 0.7499305110493293, "step": 1135 }, { "epoch": 0.4218999375477066, "grad_norm": 1.1493220329284668, "learning_rate": 1.4329686878193271e-05, "loss": 0.8465, "mean_token_accuracy": 0.7527445329146073, "step": 1140 }, { "epoch": 0.42375037587028425, "grad_norm": 1.1958634853363037, "learning_rate": 1.4271352148289025e-05, "loss": 0.8885, "mean_token_accuracy": 0.7418003324662512, "step": 1145 }, { "epoch": 0.42560081419286194, "grad_norm": 1.1059014797210693, "learning_rate": 1.421283908456854e-05, "loss": 0.8861, "mean_token_accuracy": 0.7397866706192172, "step": 1150 }, { "epoch": 0.4274512525154396, "grad_norm": 1.0388813018798828, "learning_rate": 1.4154150130018867e-05, "loss": 0.8482, "mean_token_accuracy": 0.750769571523241, "step": 1155 }, { "epoch": 0.4293016908380173, "grad_norm": 1.25522780418396, "learning_rate": 1.4095287734970678e-05, "loss": 0.8886, "mean_token_accuracy": 0.7420644725182851, "step": 1160 }, { "epoch": 0.4311521291605949, "grad_norm": 1.2089345455169678, "learning_rate": 1.4036254356996004e-05, "loss": 0.8429, "mean_token_accuracy": 0.7526113236357822, "step": 1165 }, { "epoch": 0.43300256748317256, "grad_norm": 1.1411499977111816, "learning_rate": 1.3977052460805597e-05, "loss": 0.8612, "mean_token_accuracy": 0.7494721167484096, "step": 1170 }, { "epoch": 0.43485300580575026, "grad_norm": 1.062648057937622, "learning_rate": 1.3917684518146044e-05, "loss": 0.9042, "mean_token_accuracy": 0.7372648541482008, "step": 1175 }, { "epoch": 0.4367034441283279, "grad_norm": 1.1319317817687988, "learning_rate": 1.3858153007696552e-05, "loss": 0.88, "mean_token_accuracy": 0.7426588688482566, "step": 1180 }, { "epoch": 0.43855388245090554, "grad_norm": 1.2135462760925293, "learning_rate": 1.3798460414965475e-05, "loss": 0.8617, "mean_token_accuracy": 0.7475439228361497, "step": 1185 }, { "epoch": 0.44040432077348324, "grad_norm": 1.1821489334106445, "learning_rate": 1.3738609232186537e-05, "loss": 0.8774, "mean_token_accuracy": 0.7442542768849871, "step": 1190 }, { "epoch": 0.4422547590960609, "grad_norm": 1.1596920490264893, "learning_rate": 1.3678601958214779e-05, "loss": 0.9043, "mean_token_accuracy": 0.737392939549257, "step": 1195 }, { "epoch": 0.4441051974186385, "grad_norm": 1.05031418800354, "learning_rate": 1.3618441098422215e-05, "loss": 0.858, "mean_token_accuracy": 0.7495626407835908, "step": 1200 }, { "epoch": 0.4441051974186385, "eval_loss": 0.8962021470069885, "eval_mean_token_accuracy": 0.7389938838082026, "eval_runtime": 50.4621, "eval_samples_per_second": 10.166, "eval_steps_per_second": 10.166, "step": 1200 }, { "epoch": 0.4459556357412162, "grad_norm": 1.1134624481201172, "learning_rate": 1.3558129164593256e-05, "loss": 0.8615, "mean_token_accuracy": 0.7486654072277983, "step": 1205 }, { "epoch": 0.44780607406379386, "grad_norm": 1.1625741720199585, "learning_rate": 1.349766867481982e-05, "loss": 0.8784, "mean_token_accuracy": 0.7430844560579396, "step": 1210 }, { "epoch": 0.4496565123863715, "grad_norm": 1.2227801084518433, "learning_rate": 1.3437062153396201e-05, "loss": 0.867, "mean_token_accuracy": 0.7453914280475592, "step": 1215 }, { "epoch": 0.4515069507089492, "grad_norm": 1.1665985584259033, "learning_rate": 1.337631213071369e-05, "loss": 0.862, "mean_token_accuracy": 0.7485184284333548, "step": 1220 }, { "epoch": 0.45335738903152684, "grad_norm": 1.1335411071777344, "learning_rate": 1.331542114315491e-05, "loss": 0.8621, "mean_token_accuracy": 0.7484168861668221, "step": 1225 }, { "epoch": 0.4552078273541045, "grad_norm": 1.1079161167144775, "learning_rate": 1.325439173298793e-05, "loss": 0.8932, "mean_token_accuracy": 0.7378822601715721, "step": 1230 }, { "epoch": 0.4570582656766822, "grad_norm": 1.0776060819625854, "learning_rate": 1.3193226448260128e-05, "loss": 0.8516, "mean_token_accuracy": 0.7499922526524231, "step": 1235 }, { "epoch": 0.4589087039992598, "grad_norm": 1.1441576480865479, "learning_rate": 1.3131927842691793e-05, "loss": 0.8608, "mean_token_accuracy": 0.7493913958395965, "step": 1240 }, { "epoch": 0.4607591423218375, "grad_norm": 1.1261563301086426, "learning_rate": 1.3070498475569507e-05, "loss": 0.8751, "mean_token_accuracy": 0.7454891607846024, "step": 1245 }, { "epoch": 0.46260958064441515, "grad_norm": 1.1878074407577515, "learning_rate": 1.3008940911639302e-05, "loss": 0.8354, "mean_token_accuracy": 0.7543122392839603, "step": 1250 }, { "epoch": 0.4644600189669928, "grad_norm": 1.1509381532669067, "learning_rate": 1.2947257720999577e-05, "loss": 0.8474, "mean_token_accuracy": 0.7522147924985935, "step": 1255 }, { "epoch": 0.4663104572895705, "grad_norm": 1.1772030591964722, "learning_rate": 1.2885451478993777e-05, "loss": 0.8666, "mean_token_accuracy": 0.7461336524892079, "step": 1260 }, { "epoch": 0.46816089561214813, "grad_norm": 1.0959590673446655, "learning_rate": 1.282352476610289e-05, "loss": 0.8095, "mean_token_accuracy": 0.7617041479079839, "step": 1265 }, { "epoch": 0.4700113339347258, "grad_norm": 1.0549659729003906, "learning_rate": 1.2761480167837705e-05, "loss": 0.8633, "mean_token_accuracy": 0.7483238275951523, "step": 1270 }, { "epoch": 0.47186177225730347, "grad_norm": 1.1503080129623413, "learning_rate": 1.2699320274630847e-05, "loss": 0.8889, "mean_token_accuracy": 0.7395314122897056, "step": 1275 }, { "epoch": 0.4737122105798811, "grad_norm": 1.1426583528518677, "learning_rate": 1.263704768172864e-05, "loss": 0.9038, "mean_token_accuracy": 0.7375030812712198, "step": 1280 }, { "epoch": 0.47556264890245875, "grad_norm": 1.1978209018707275, "learning_rate": 1.257466498908276e-05, "loss": 0.8658, "mean_token_accuracy": 0.747567498713126, "step": 1285 }, { "epoch": 0.47741308722503645, "grad_norm": 1.0632010698318481, "learning_rate": 1.2512174801241657e-05, "loss": 0.8186, "mean_token_accuracy": 0.7578140334225285, "step": 1290 }, { "epoch": 0.4792635255476141, "grad_norm": 1.0424400568008423, "learning_rate": 1.2449579727241834e-05, "loss": 0.8369, "mean_token_accuracy": 0.7558098493900247, "step": 1295 }, { "epoch": 0.48111396387019173, "grad_norm": 1.134675145149231, "learning_rate": 1.2386882380498918e-05, "loss": 0.8464, "mean_token_accuracy": 0.7537183149776496, "step": 1300 }, { "epoch": 0.48111396387019173, "eval_loss": 0.8923500180244446, "eval_mean_token_accuracy": 0.7399605975475906, "eval_runtime": 50.3806, "eval_samples_per_second": 10.182, "eval_steps_per_second": 10.182, "step": 1300 }, { "epoch": 0.4829644021927694, "grad_norm": 1.1513569355010986, "learning_rate": 1.2324085378698529e-05, "loss": 0.8676, "mean_token_accuracy": 0.748164952205539, "step": 1305 }, { "epoch": 0.48481484051534707, "grad_norm": 1.0311802625656128, "learning_rate": 1.2261191343687e-05, "loss": 0.8518, "mean_token_accuracy": 0.7521871498641179, "step": 1310 }, { "epoch": 0.4866652788379247, "grad_norm": 1.1396102905273438, "learning_rate": 1.219820290136192e-05, "loss": 0.8839, "mean_token_accuracy": 0.7428744239445215, "step": 1315 }, { "epoch": 0.4885157171605024, "grad_norm": 1.1476742029190063, "learning_rate": 1.2135122681562481e-05, "loss": 0.8913, "mean_token_accuracy": 0.7383458968011796, "step": 1320 }, { "epoch": 0.49036615548308005, "grad_norm": 1.2438527345657349, "learning_rate": 1.2071953317959692e-05, "loss": 0.8744, "mean_token_accuracy": 0.7452624657333512, "step": 1325 }, { "epoch": 0.49221659380565774, "grad_norm": 1.1306873559951782, "learning_rate": 1.2008697447946421e-05, "loss": 0.8751, "mean_token_accuracy": 0.7424721560380638, "step": 1330 }, { "epoch": 0.4940670321282354, "grad_norm": 1.1581846475601196, "learning_rate": 1.1945357712527273e-05, "loss": 0.8585, "mean_token_accuracy": 0.7497287108747003, "step": 1335 }, { "epoch": 0.495917470450813, "grad_norm": 1.3308736085891724, "learning_rate": 1.1881936756208329e-05, "loss": 0.8868, "mean_token_accuracy": 0.7392369943901189, "step": 1340 }, { "epoch": 0.4977679087733907, "grad_norm": 1.143922209739685, "learning_rate": 1.1818437226886738e-05, "loss": 0.8286, "mean_token_accuracy": 0.7565615599698585, "step": 1345 }, { "epoch": 0.49961834709596836, "grad_norm": 1.3555456399917603, "learning_rate": 1.1754861775740163e-05, "loss": 0.8799, "mean_token_accuracy": 0.7441001926506251, "step": 1350 }, { "epoch": 0.501468785418546, "grad_norm": 1.0333168506622314, "learning_rate": 1.1691213057116082e-05, "loss": 0.833, "mean_token_accuracy": 0.7563766714351905, "step": 1355 }, { "epoch": 0.5033192237411237, "grad_norm": 1.1685550212860107, "learning_rate": 1.1627493728420978e-05, "loss": 0.8312, "mean_token_accuracy": 0.7566255940877488, "step": 1360 }, { "epoch": 0.5051696620637013, "grad_norm": 1.2081339359283447, "learning_rate": 1.1563706450009391e-05, "loss": 0.8407, "mean_token_accuracy": 0.754443525932113, "step": 1365 }, { "epoch": 0.507020100386279, "grad_norm": 1.0863213539123535, "learning_rate": 1.1499853885072827e-05, "loss": 0.8339, "mean_token_accuracy": 0.7545843545749367, "step": 1370 }, { "epoch": 0.5088705387088567, "grad_norm": 1.1328675746917725, "learning_rate": 1.1435938699528586e-05, "loss": 0.8911, "mean_token_accuracy": 0.7379121479666565, "step": 1375 }, { "epoch": 0.5107209770314344, "grad_norm": 1.1014608144760132, "learning_rate": 1.137196356190845e-05, "loss": 0.8486, "mean_token_accuracy": 0.7507691158185413, "step": 1380 }, { "epoch": 0.512571415354012, "grad_norm": 1.1279276609420776, "learning_rate": 1.1307931143247268e-05, "loss": 0.846, "mean_token_accuracy": 0.7543217557206249, "step": 1385 }, { "epoch": 0.5144218536765897, "grad_norm": 1.2000517845153809, "learning_rate": 1.1243844116971433e-05, "loss": 0.8628, "mean_token_accuracy": 0.7468883900958883, "step": 1390 }, { "epoch": 0.5162722919991674, "grad_norm": 1.4059293270111084, "learning_rate": 1.1179705158787276e-05, "loss": 0.8491, "mean_token_accuracy": 0.7522363707168316, "step": 1395 }, { "epoch": 0.5181227303217449, "grad_norm": 1.1212759017944336, "learning_rate": 1.1115516946569333e-05, "loss": 0.8497, "mean_token_accuracy": 0.7499685858393633, "step": 1400 }, { "epoch": 0.5181227303217449, "eval_loss": 0.8875036239624023, "eval_mean_token_accuracy": 0.7405125948791582, "eval_runtime": 50.4671, "eval_samples_per_second": 10.165, "eval_steps_per_second": 10.165, "step": 1400 }, { "epoch": 0.5199731686443226, "grad_norm": 1.0802546739578247, "learning_rate": 1.105128216024857e-05, "loss": 0.8321, "mean_token_accuracy": 0.7570983833156995, "step": 1405 }, { "epoch": 0.5218236069669003, "grad_norm": 1.1517388820648193, "learning_rate": 1.0987003481700456e-05, "loss": 0.8682, "mean_token_accuracy": 0.7435588169757222, "step": 1410 }, { "epoch": 0.5236740452894779, "grad_norm": 1.335225224494934, "learning_rate": 1.092268359463302e-05, "loss": 0.8709, "mean_token_accuracy": 0.7452917373238905, "step": 1415 }, { "epoch": 0.5255244836120556, "grad_norm": 1.1167656183242798, "learning_rate": 1.0858325184474796e-05, "loss": 0.8927, "mean_token_accuracy": 0.7391699468887288, "step": 1420 }, { "epoch": 0.5273749219346333, "grad_norm": 1.127293348312378, "learning_rate": 1.0793930938262689e-05, "loss": 0.8771, "mean_token_accuracy": 0.7431836462390942, "step": 1425 }, { "epoch": 0.5292253602572109, "grad_norm": 1.219251275062561, "learning_rate": 1.0729503544529814e-05, "loss": 0.8919, "mean_token_accuracy": 0.7401217708520834, "step": 1430 }, { "epoch": 0.5310757985797886, "grad_norm": 1.0866413116455078, "learning_rate": 1.0665045693193226e-05, "loss": 0.8561, "mean_token_accuracy": 0.7487249142355408, "step": 1435 }, { "epoch": 0.5329262369023663, "grad_norm": 1.0730805397033691, "learning_rate": 1.0600560075441617e-05, "loss": 0.8763, "mean_token_accuracy": 0.7433546825854954, "step": 1440 }, { "epoch": 0.5347766752249439, "grad_norm": 1.1865015029907227, "learning_rate": 1.0536049383622966e-05, "loss": 0.873, "mean_token_accuracy": 0.744337296630846, "step": 1445 }, { "epoch": 0.5366271135475216, "grad_norm": 1.058233618736267, "learning_rate": 1.047151631113212e-05, "loss": 0.8574, "mean_token_accuracy": 0.7489177692249259, "step": 1450 }, { "epoch": 0.5384775518700993, "grad_norm": 1.0962151288986206, "learning_rate": 1.0406963552298332e-05, "loss": 0.8577, "mean_token_accuracy": 0.7472454520686317, "step": 1455 }, { "epoch": 0.5403279901926769, "grad_norm": 1.0905910730361938, "learning_rate": 1.034239380227281e-05, "loss": 0.8406, "mean_token_accuracy": 0.7517492389021212, "step": 1460 }, { "epoch": 0.5421784285152546, "grad_norm": 1.0674951076507568, "learning_rate": 1.0277809756916134e-05, "loss": 0.884, "mean_token_accuracy": 0.7403210456441213, "step": 1465 }, { "epoch": 0.5440288668378322, "grad_norm": 0.9999077916145325, "learning_rate": 1.0213214112685747e-05, "loss": 0.8357, "mean_token_accuracy": 0.7536248701632074, "step": 1470 }, { "epoch": 0.5458793051604098, "grad_norm": 1.1569849252700806, "learning_rate": 1.0148609566523358e-05, "loss": 0.8252, "mean_token_accuracy": 0.7566011284124972, "step": 1475 }, { "epoch": 0.5477297434829875, "grad_norm": 1.1695085763931274, "learning_rate": 1.0083998815742335e-05, "loss": 0.8526, "mean_token_accuracy": 0.7503246035344221, "step": 1480 }, { "epoch": 0.5495801818055652, "grad_norm": 1.2002400159835815, "learning_rate": 1.0019384557915099e-05, "loss": 0.8481, "mean_token_accuracy": 0.7512016801534671, "step": 1485 }, { "epoch": 0.5514306201281428, "grad_norm": 1.0182387828826904, "learning_rate": 9.9547694907605e-06, "loss": 0.8598, "mean_token_accuracy": 0.7493889074191754, "step": 1490 }, { "epoch": 0.5532810584507205, "grad_norm": 1.1398547887802124, "learning_rate": 9.890156312031165e-06, "loss": 0.8649, "mean_token_accuracy": 0.7465289067053387, "step": 1495 }, { "epoch": 0.5551314967732982, "grad_norm": 1.0890473127365112, "learning_rate": 9.825547719400889e-06, "loss": 0.8412, "mean_token_accuracy": 0.7527299833547442, "step": 1500 }, { "epoch": 0.5551314967732982, "eval_loss": 0.883466899394989, "eval_mean_token_accuracy": 0.7415391964540806, "eval_runtime": 78.5634, "eval_samples_per_second": 6.53, "eval_steps_per_second": 6.53, "step": 1500 }, { "epoch": 0.5569819350958758, "grad_norm": 1.083844780921936, "learning_rate": 9.760946410351988e-06, "loss": 0.8254, "mean_token_accuracy": 0.7577779781862539, "step": 1505 }, { "epoch": 0.5588323734184535, "grad_norm": 1.12702214717865, "learning_rate": 9.696355082062679e-06, "loss": 0.8684, "mean_token_accuracy": 0.7457827779769488, "step": 1510 }, { "epoch": 0.5606828117410312, "grad_norm": 1.1723198890686035, "learning_rate": 9.631776431294475e-06, "loss": 0.8358, "mean_token_accuracy": 0.7553337331891314, "step": 1515 }, { "epoch": 0.5625332500636088, "grad_norm": 1.0552607774734497, "learning_rate": 9.567213154279582e-06, "loss": 0.8269, "mean_token_accuracy": 0.7580377257670661, "step": 1520 }, { "epoch": 0.5643836883861865, "grad_norm": 1.1943864822387695, "learning_rate": 9.502667946608332e-06, "loss": 0.8834, "mean_token_accuracy": 0.7412889558685449, "step": 1525 }, { "epoch": 0.5662341267087642, "grad_norm": 1.0926896333694458, "learning_rate": 9.43814350311666e-06, "loss": 0.8516, "mean_token_accuracy": 0.7485775551693337, "step": 1530 }, { "epoch": 0.5680845650313417, "grad_norm": 1.1257902383804321, "learning_rate": 9.37364251777355e-06, "loss": 0.8406, "mean_token_accuracy": 0.755104972794591, "step": 1535 }, { "epoch": 0.5699350033539194, "grad_norm": 1.184830904006958, "learning_rate": 9.309167683568597e-06, "loss": 0.9104, "mean_token_accuracy": 0.7347233994686599, "step": 1540 }, { "epoch": 0.5717854416764971, "grad_norm": 1.0849738121032715, "learning_rate": 9.244721692399545e-06, "loss": 0.8368, "mean_token_accuracy": 0.754807061603084, "step": 1545 }, { "epoch": 0.5736358799990747, "grad_norm": 1.211175560951233, "learning_rate": 9.180307234959918e-06, "loss": 0.8697, "mean_token_accuracy": 0.7446549598340149, "step": 1550 }, { "epoch": 0.5754863183216524, "grad_norm": 1.1779320240020752, "learning_rate": 9.115927000626665e-06, "loss": 0.8436, "mean_token_accuracy": 0.7515610439271313, "step": 1555 }, { "epoch": 0.5773367566442301, "grad_norm": 1.085847020149231, "learning_rate": 9.051583677347879e-06, "loss": 0.8828, "mean_token_accuracy": 0.7409961502294228, "step": 1560 }, { "epoch": 0.5791871949668078, "grad_norm": 1.0653111934661865, "learning_rate": 8.987279951530586e-06, "loss": 0.8776, "mean_token_accuracy": 0.7426297316689073, "step": 1565 }, { "epoch": 0.5810376332893854, "grad_norm": 1.0567981004714966, "learning_rate": 8.923018507928564e-06, "loss": 0.8323, "mean_token_accuracy": 0.7550816699978279, "step": 1570 }, { "epoch": 0.5828880716119631, "grad_norm": 1.0699466466903687, "learning_rate": 8.85880202953026e-06, "loss": 0.8056, "mean_token_accuracy": 0.7621896696985008, "step": 1575 }, { "epoch": 0.5847385099345408, "grad_norm": 1.1082216501235962, "learning_rate": 8.79463319744677e-06, "loss": 0.845, "mean_token_accuracy": 0.7515386166583966, "step": 1580 }, { "epoch": 0.5865889482571184, "grad_norm": 1.1345179080963135, "learning_rate": 8.730514690799916e-06, "loss": 0.8686, "mean_token_accuracy": 0.7446916177566473, "step": 1585 }, { "epoch": 0.5884393865796961, "grad_norm": 1.1079617738723755, "learning_rate": 8.666449186610353e-06, "loss": 0.8707, "mean_token_accuracy": 0.7437692859659788, "step": 1590 }, { "epoch": 0.5902898249022738, "grad_norm": 1.1664999723434448, "learning_rate": 8.60243935968585e-06, "loss": 0.8707, "mean_token_accuracy": 0.7435284059896061, "step": 1595 }, { "epoch": 0.5921402632248514, "grad_norm": 1.0710796117782593, "learning_rate": 8.538487882509568e-06, "loss": 0.8377, "mean_token_accuracy": 0.7522821982666854, "step": 1600 }, { "epoch": 0.5921402632248514, "eval_loss": 0.8806459307670593, "eval_mean_token_accuracy": 0.7420680891691662, "eval_runtime": 78.0774, "eval_samples_per_second": 6.57, "eval_steps_per_second": 6.57, "step": 1600 }, { "epoch": 0.5939907015474291, "grad_norm": 1.0592025518417358, "learning_rate": 8.474597425128501e-06, "loss": 0.8578, "mean_token_accuracy": 0.7485613520965848, "step": 1605 }, { "epoch": 0.5958411398700068, "grad_norm": 1.1598796844482422, "learning_rate": 8.410770655042003e-06, "loss": 0.8678, "mean_token_accuracy": 0.7454409843236658, "step": 1610 }, { "epoch": 0.5976915781925843, "grad_norm": 1.1448596715927124, "learning_rate": 8.347010237090408e-06, "loss": 0.8641, "mean_token_accuracy": 0.744198405162078, "step": 1615 }, { "epoch": 0.599542016515162, "grad_norm": 1.1301459074020386, "learning_rate": 8.283318833343773e-06, "loss": 0.8677, "mean_token_accuracy": 0.7457724650975702, "step": 1620 }, { "epoch": 0.6013924548377397, "grad_norm": 1.0771557092666626, "learning_rate": 8.219699102990735e-06, "loss": 0.8159, "mean_token_accuracy": 0.7596411077898149, "step": 1625 }, { "epoch": 0.6032428931603173, "grad_norm": 1.100756049156189, "learning_rate": 8.156153702227484e-06, "loss": 0.8235, "mean_token_accuracy": 0.7558995024863349, "step": 1630 }, { "epoch": 0.605093331482895, "grad_norm": 1.0815366506576538, "learning_rate": 8.092685284146865e-06, "loss": 0.8634, "mean_token_accuracy": 0.7451292739524256, "step": 1635 }, { "epoch": 0.6069437698054727, "grad_norm": 1.01080322265625, "learning_rate": 8.029296498627608e-06, "loss": 0.8689, "mean_token_accuracy": 0.7460930718951544, "step": 1640 }, { "epoch": 0.6087942081280503, "grad_norm": 1.0338646173477173, "learning_rate": 7.965989992223693e-06, "loss": 0.8386, "mean_token_accuracy": 0.7530607004502294, "step": 1645 }, { "epoch": 0.610644646450628, "grad_norm": 1.0714292526245117, "learning_rate": 7.90276840805385e-06, "loss": 0.821, "mean_token_accuracy": 0.759004615934364, "step": 1650 }, { "epoch": 0.6124950847732057, "grad_norm": 1.0714163780212402, "learning_rate": 7.839634385691214e-06, "loss": 0.8522, "mean_token_accuracy": 0.7480340127823741, "step": 1655 }, { "epoch": 0.6143455230957833, "grad_norm": 1.1181557178497314, "learning_rate": 7.776590561053117e-06, "loss": 0.8497, "mean_token_accuracy": 0.7490940834342857, "step": 1660 }, { "epoch": 0.616195961418361, "grad_norm": 1.2162024974822998, "learning_rate": 7.713639566291028e-06, "loss": 0.8385, "mean_token_accuracy": 0.7524457191655578, "step": 1665 }, { "epoch": 0.6180463997409387, "grad_norm": 1.132047176361084, "learning_rate": 7.650784029680662e-06, "loss": 0.8177, "mean_token_accuracy": 0.7589660088987882, "step": 1670 }, { "epoch": 0.6198968380635163, "grad_norm": 1.0826492309570312, "learning_rate": 7.58802657551225e-06, "loss": 0.8475, "mean_token_accuracy": 0.7521249389316524, "step": 1675 }, { "epoch": 0.621747276386094, "grad_norm": 1.0509703159332275, "learning_rate": 7.52536982398097e-06, "loss": 0.8506, "mean_token_accuracy": 0.7483188436822825, "step": 1680 }, { "epoch": 0.6235977147086716, "grad_norm": 1.0793825387954712, "learning_rate": 7.46281639107755e-06, "loss": 0.8725, "mean_token_accuracy": 0.7428339069332885, "step": 1685 }, { "epoch": 0.6254481530312492, "grad_norm": 1.1245028972625732, "learning_rate": 7.400368888479048e-06, "loss": 0.8874, "mean_token_accuracy": 0.738876062207339, "step": 1690 }, { "epoch": 0.6272985913538269, "grad_norm": 1.0940346717834473, "learning_rate": 7.3380299234398076e-06, "loss": 0.8712, "mean_token_accuracy": 0.7450572147899929, "step": 1695 }, { "epoch": 0.6291490296764046, "grad_norm": 1.1423110961914062, "learning_rate": 7.275802098682612e-06, "loss": 0.8464, "mean_token_accuracy": 0.750851552690575, "step": 1700 }, { "epoch": 0.6291490296764046, "eval_loss": 0.8765305280685425, "eval_mean_token_accuracy": 0.7429402834830536, "eval_runtime": 79.7639, "eval_samples_per_second": 6.431, "eval_steps_per_second": 6.431, "step": 1700 }, { "epoch": 0.6309994679989822, "grad_norm": 1.0875543355941772, "learning_rate": 7.213688012290004e-06, "loss": 0.8261, "mean_token_accuracy": 0.7555179092653466, "step": 1705 }, { "epoch": 0.6328499063215599, "grad_norm": 1.1560087203979492, "learning_rate": 7.151690257595826e-06, "loss": 0.8231, "mean_token_accuracy": 0.7575116156195778, "step": 1710 }, { "epoch": 0.6347003446441376, "grad_norm": 1.1857202053070068, "learning_rate": 7.089811423076936e-06, "loss": 0.8271, "mean_token_accuracy": 0.7540097382022544, "step": 1715 }, { "epoch": 0.6365507829667152, "grad_norm": 1.0499897003173828, "learning_rate": 7.028054092245134e-06, "loss": 0.833, "mean_token_accuracy": 0.7516812101912956, "step": 1720 }, { "epoch": 0.6384012212892929, "grad_norm": 1.1187822818756104, "learning_rate": 6.966420843539321e-06, "loss": 0.8359, "mean_token_accuracy": 0.7528163038917939, "step": 1725 }, { "epoch": 0.6402516596118706, "grad_norm": 1.066787838935852, "learning_rate": 6.90491425021781e-06, "loss": 0.8509, "mean_token_accuracy": 0.749689118169066, "step": 1730 }, { "epoch": 0.6421020979344482, "grad_norm": 1.076682209968567, "learning_rate": 6.843536880250914e-06, "loss": 0.8533, "mean_token_accuracy": 0.7507021768711251, "step": 1735 }, { "epoch": 0.6439525362570259, "grad_norm": 1.1074074506759644, "learning_rate": 6.7822912962137225e-06, "loss": 0.8477, "mean_token_accuracy": 0.749668775072545, "step": 1740 }, { "epoch": 0.6458029745796036, "grad_norm": 1.1533567905426025, "learning_rate": 6.721180055179113e-06, "loss": 0.8694, "mean_token_accuracy": 0.7447913980843776, "step": 1745 }, { "epoch": 0.6476534129021813, "grad_norm": 1.170899510383606, "learning_rate": 6.660205708610987e-06, "loss": 0.8558, "mean_token_accuracy": 0.7510611028124827, "step": 1750 }, { "epoch": 0.6495038512247588, "grad_norm": 1.1935901641845703, "learning_rate": 6.599370802257755e-06, "loss": 0.8635, "mean_token_accuracy": 0.7451695366987459, "step": 1755 }, { "epoch": 0.6513542895473365, "grad_norm": 1.1270606517791748, "learning_rate": 6.5386778760460316e-06, "loss": 0.854, "mean_token_accuracy": 0.7491003127984905, "step": 1760 }, { "epoch": 0.6532047278699142, "grad_norm": 1.0685646533966064, "learning_rate": 6.478129463974598e-06, "loss": 0.8602, "mean_token_accuracy": 0.7494648164169282, "step": 1765 }, { "epoch": 0.6550551661924918, "grad_norm": 1.1324131488800049, "learning_rate": 6.417728094008613e-06, "loss": 0.8525, "mean_token_accuracy": 0.7496021909362418, "step": 1770 }, { "epoch": 0.6569056045150695, "grad_norm": 1.2056822776794434, "learning_rate": 6.357476287974051e-06, "loss": 0.8365, "mean_token_accuracy": 0.7528655783760423, "step": 1775 }, { "epoch": 0.6587560428376472, "grad_norm": 1.0940866470336914, "learning_rate": 6.297376561452428e-06, "loss": 0.8206, "mean_token_accuracy": 0.7591131075443316, "step": 1780 }, { "epoch": 0.6606064811602248, "grad_norm": 1.097103476524353, "learning_rate": 6.237431423675764e-06, "loss": 0.8619, "mean_token_accuracy": 0.7475563002250533, "step": 1785 }, { "epoch": 0.6624569194828025, "grad_norm": 1.071036458015442, "learning_rate": 6.177643377421827e-06, "loss": 0.8497, "mean_token_accuracy": 0.7490388522721844, "step": 1790 }, { "epoch": 0.6643073578053802, "grad_norm": 1.1054531335830688, "learning_rate": 6.118014918909633e-06, "loss": 0.851, "mean_token_accuracy": 0.7503596366057965, "step": 1795 }, { "epoch": 0.6661577961279578, "grad_norm": 1.1277004480361938, "learning_rate": 6.058548537695225e-06, "loss": 0.8576, "mean_token_accuracy": 0.7482101706448133, "step": 1800 }, { "epoch": 0.6661577961279578, "eval_loss": 0.8728470206260681, "eval_mean_token_accuracy": 0.7436341718378576, "eval_runtime": 78.5895, "eval_samples_per_second": 6.528, "eval_steps_per_second": 6.528, "step": 1800 }, { "epoch": 0.6680082344505355, "grad_norm": 1.2152836322784424, "learning_rate": 5.999246716567737e-06, "loss": 0.863, "mean_token_accuracy": 0.7482331663974422, "step": 1805 }, { "epoch": 0.6698586727731132, "grad_norm": 1.0343924760818481, "learning_rate": 5.940111931445731e-06, "loss": 0.8596, "mean_token_accuracy": 0.7477439117392353, "step": 1810 }, { "epoch": 0.6717091110956908, "grad_norm": 1.1265689134597778, "learning_rate": 5.881146651273825e-06, "loss": 0.8287, "mean_token_accuracy": 0.7555190617342097, "step": 1815 }, { "epoch": 0.6735595494182685, "grad_norm": 1.0546154975891113, "learning_rate": 5.822353337919616e-06, "loss": 0.8692, "mean_token_accuracy": 0.7420459758922366, "step": 1820 }, { "epoch": 0.6754099877408462, "grad_norm": 1.1439138650894165, "learning_rate": 5.763734446070892e-06, "loss": 0.8767, "mean_token_accuracy": 0.7410674912058572, "step": 1825 }, { "epoch": 0.6772604260634237, "grad_norm": 1.083448052406311, "learning_rate": 5.705292423133133e-06, "loss": 0.8433, "mean_token_accuracy": 0.7535584571887595, "step": 1830 }, { "epoch": 0.6791108643860014, "grad_norm": 1.1094322204589844, "learning_rate": 5.647029709127355e-06, "loss": 0.8708, "mean_token_accuracy": 0.7444246110938043, "step": 1835 }, { "epoch": 0.6809613027085791, "grad_norm": 1.0760412216186523, "learning_rate": 5.5889487365882065e-06, "loss": 0.8201, "mean_token_accuracy": 0.7574787063158566, "step": 1840 }, { "epoch": 0.6828117410311567, "grad_norm": 1.108798861503601, "learning_rate": 5.531051930462437e-06, "loss": 0.8827, "mean_token_accuracy": 0.7418184447228666, "step": 1845 }, { "epoch": 0.6846621793537344, "grad_norm": 1.0983977317810059, "learning_rate": 5.4733417080076325e-06, "loss": 0.826, "mean_token_accuracy": 0.7576316725845673, "step": 1850 }, { "epoch": 0.6865126176763121, "grad_norm": 1.0830743312835693, "learning_rate": 5.415820478691301e-06, "loss": 0.8401, "mean_token_accuracy": 0.7521840490748731, "step": 1855 }, { "epoch": 0.6883630559988897, "grad_norm": 1.115321159362793, "learning_rate": 5.358490644090263e-06, "loss": 0.8533, "mean_token_accuracy": 0.7493569745174777, "step": 1860 }, { "epoch": 0.6902134943214674, "grad_norm": 1.0904319286346436, "learning_rate": 5.3013545977904005e-06, "loss": 0.8768, "mean_token_accuracy": 0.7450953030774816, "step": 1865 }, { "epoch": 0.6920639326440451, "grad_norm": 1.0365054607391357, "learning_rate": 5.244414725286717e-06, "loss": 0.8291, "mean_token_accuracy": 0.7560889898260071, "step": 1870 }, { "epoch": 0.6939143709666227, "grad_norm": 1.0902795791625977, "learning_rate": 5.187673403883721e-06, "loss": 0.8422, "mean_token_accuracy": 0.7514938359735968, "step": 1875 }, { "epoch": 0.6957648092892004, "grad_norm": 1.0894972085952759, "learning_rate": 5.131133002596199e-06, "loss": 0.8533, "mean_token_accuracy": 0.7497225973502779, "step": 1880 }, { "epoch": 0.6976152476117781, "grad_norm": 1.0798072814941406, "learning_rate": 5.074795882050293e-06, "loss": 0.833, "mean_token_accuracy": 0.7528341392472259, "step": 1885 }, { "epoch": 0.6994656859343557, "grad_norm": 1.0907217264175415, "learning_rate": 5.018664394384942e-06, "loss": 0.8508, "mean_token_accuracy": 0.7494421319615481, "step": 1890 }, { "epoch": 0.7013161242569333, "grad_norm": 1.0164741277694702, "learning_rate": 4.9627408831536705e-06, "loss": 0.8341, "mean_token_accuracy": 0.7529435144223482, "step": 1895 }, { "epoch": 0.703166562579511, "grad_norm": 1.142519474029541, "learning_rate": 4.907027683226761e-06, "loss": 0.8249, "mean_token_accuracy": 0.7567480300956836, "step": 1900 }, { "epoch": 0.703166562579511, "eval_loss": 0.8700699806213379, "eval_mean_token_accuracy": 0.7442610694529287, "eval_runtime": 50.1711, "eval_samples_per_second": 10.225, "eval_steps_per_second": 10.225, "step": 1900 }, { "epoch": 0.7050170009020886, "grad_norm": 1.0815749168395996, "learning_rate": 4.85152712069375e-06, "loss": 0.8636, "mean_token_accuracy": 0.7466736465102184, "step": 1905 }, { "epoch": 0.7068674392246663, "grad_norm": 1.016316533088684, "learning_rate": 4.7962415127663265e-06, "loss": 0.8541, "mean_token_accuracy": 0.7478070670678898, "step": 1910 }, { "epoch": 0.708717877547244, "grad_norm": 1.15891432762146, "learning_rate": 4.74117316768158e-06, "loss": 0.8165, "mean_token_accuracy": 0.759968391663386, "step": 1915 }, { "epoch": 0.7105683158698216, "grad_norm": 1.0917257070541382, "learning_rate": 4.686324384605629e-06, "loss": 0.8273, "mean_token_accuracy": 0.7553093824763246, "step": 1920 }, { "epoch": 0.7124187541923993, "grad_norm": 1.0935016870498657, "learning_rate": 4.631697453537623e-06, "loss": 0.8322, "mean_token_accuracy": 0.7561149061155692, "step": 1925 }, { "epoch": 0.714269192514977, "grad_norm": 1.049316167831421, "learning_rate": 4.577294655214144e-06, "loss": 0.8759, "mean_token_accuracy": 0.7433473744990514, "step": 1930 }, { "epoch": 0.7161196308375547, "grad_norm": 1.1667238473892212, "learning_rate": 4.523118261013969e-06, "loss": 0.8357, "mean_token_accuracy": 0.7534449112393112, "step": 1935 }, { "epoch": 0.7179700691601323, "grad_norm": 1.144909381866455, "learning_rate": 4.469170532863254e-06, "loss": 0.8332, "mean_token_accuracy": 0.7536943727299822, "step": 1940 }, { "epoch": 0.71982050748271, "grad_norm": 1.1312308311462402, "learning_rate": 4.415453723141081e-06, "loss": 0.845, "mean_token_accuracy": 0.7519524749356737, "step": 1945 }, { "epoch": 0.7216709458052877, "grad_norm": 1.1640957593917847, "learning_rate": 4.361970074585426e-06, "loss": 0.8243, "mean_token_accuracy": 0.7592206858230149, "step": 1950 }, { "epoch": 0.7235213841278653, "grad_norm": 1.1180189847946167, "learning_rate": 4.308721820199529e-06, "loss": 0.8461, "mean_token_accuracy": 0.7509569768017925, "step": 1955 }, { "epoch": 0.725371822450443, "grad_norm": 1.16630220413208, "learning_rate": 4.255711183158635e-06, "loss": 0.8303, "mean_token_accuracy": 0.7560554269072497, "step": 1960 }, { "epoch": 0.7272222607730207, "grad_norm": 1.0474036931991577, "learning_rate": 4.2029403767172175e-06, "loss": 0.8028, "mean_token_accuracy": 0.7623219816059803, "step": 1965 }, { "epoch": 0.7290726990955982, "grad_norm": 1.136406421661377, "learning_rate": 4.150411604116531e-06, "loss": 0.8422, "mean_token_accuracy": 0.7511481450162742, "step": 1970 }, { "epoch": 0.7309231374181759, "grad_norm": 1.0562430620193481, "learning_rate": 4.098127058492652e-06, "loss": 0.8256, "mean_token_accuracy": 0.7573706945444444, "step": 1975 }, { "epoch": 0.7327735757407536, "grad_norm": 1.1478077173233032, "learning_rate": 4.0460889227849e-06, "loss": 0.8249, "mean_token_accuracy": 0.7569546880750191, "step": 1980 }, { "epoch": 0.7346240140633312, "grad_norm": 1.00325608253479, "learning_rate": 3.9942993696447045e-06, "loss": 0.8385, "mean_token_accuracy": 0.7506293642917294, "step": 1985 }, { "epoch": 0.7364744523859089, "grad_norm": 1.0663394927978516, "learning_rate": 3.942760561344877e-06, "loss": 0.8432, "mean_token_accuracy": 0.7502340169200126, "step": 1990 }, { "epoch": 0.7383248907084866, "grad_norm": 1.130427360534668, "learning_rate": 3.891474649689362e-06, "loss": 0.8118, "mean_token_accuracy": 0.7618501171744374, "step": 1995 }, { "epoch": 0.7401753290310642, "grad_norm": 1.0855380296707153, "learning_rate": 3.840443775923365e-06, "loss": 0.8368, "mean_token_accuracy": 0.7534221021997818, "step": 2000 }, { "epoch": 0.7401753290310642, "eval_loss": 0.8671350479125977, "eval_mean_token_accuracy": 0.7454383935250886, "eval_runtime": 51.5876, "eval_samples_per_second": 9.944, "eval_steps_per_second": 9.944, "step": 2000 }, { "epoch": 0.7420257673536419, "grad_norm": 1.209213137626648, "learning_rate": 3.7896700706439826e-06, "loss": 0.8141, "mean_token_accuracy": 0.7606564087590786, "step": 2005 }, { "epoch": 0.7438762056762196, "grad_norm": 1.148069977760315, "learning_rate": 3.7391556537112282e-06, "loss": 0.7874, "mean_token_accuracy": 0.7669905418548161, "step": 2010 }, { "epoch": 0.7457266439987972, "grad_norm": 1.1433311700820923, "learning_rate": 3.6889026341595378e-06, "loss": 0.8431, "mean_token_accuracy": 0.7535976661109324, "step": 2015 }, { "epoch": 0.7475770823213749, "grad_norm": 1.129437804222107, "learning_rate": 3.6389131101096953e-06, "loss": 0.8576, "mean_token_accuracy": 0.7475487510668153, "step": 2020 }, { "epoch": 0.7494275206439526, "grad_norm": 1.0358542203903198, "learning_rate": 3.5891891686812597e-06, "loss": 0.8469, "mean_token_accuracy": 0.7502852908167374, "step": 2025 }, { "epoch": 0.7512779589665302, "grad_norm": 1.1142020225524902, "learning_rate": 3.5397328859054138e-06, "loss": 0.8551, "mean_token_accuracy": 0.749887645218871, "step": 2030 }, { "epoch": 0.7531283972891079, "grad_norm": 1.1345794200897217, "learning_rate": 3.490546326638273e-06, "loss": 0.8218, "mean_token_accuracy": 0.7597895364859456, "step": 2035 }, { "epoch": 0.7549788356116856, "grad_norm": 1.0666804313659668, "learning_rate": 3.441631544474705e-06, "loss": 0.8366, "mean_token_accuracy": 0.7516444578066305, "step": 2040 }, { "epoch": 0.7568292739342631, "grad_norm": 1.1180936098098755, "learning_rate": 3.3929905816625653e-06, "loss": 0.8673, "mean_token_accuracy": 0.7454598959843268, "step": 2045 }, { "epoch": 0.7586797122568408, "grad_norm": 1.1804002523422241, "learning_rate": 3.344625469017445e-06, "loss": 0.8462, "mean_token_accuracy": 0.7508597223030974, "step": 2050 }, { "epoch": 0.7605301505794185, "grad_norm": 1.0938820838928223, "learning_rate": 3.2965382258378674e-06, "loss": 0.8503, "mean_token_accuracy": 0.7488317085016187, "step": 2055 }, { "epoch": 0.7623805889019961, "grad_norm": 1.055139422416687, "learning_rate": 3.248730859821002e-06, "loss": 0.7933, "mean_token_accuracy": 0.764882704807944, "step": 2060 }, { "epoch": 0.7642310272245738, "grad_norm": 1.1458957195281982, "learning_rate": 3.2012053669788136e-06, "loss": 0.822, "mean_token_accuracy": 0.7570119050077394, "step": 2065 }, { "epoch": 0.7660814655471515, "grad_norm": 1.0989375114440918, "learning_rate": 3.1539637315547524e-06, "loss": 0.843, "mean_token_accuracy": 0.7496361163745409, "step": 2070 }, { "epoch": 0.7679319038697291, "grad_norm": 1.0754046440124512, "learning_rate": 3.1070079259408934e-06, "loss": 0.807, "mean_token_accuracy": 0.7616431298931425, "step": 2075 }, { "epoch": 0.7697823421923068, "grad_norm": 1.02739417552948, "learning_rate": 3.0603399105955966e-06, "loss": 0.8312, "mean_token_accuracy": 0.7544530392919, "step": 2080 }, { "epoch": 0.7716327805148845, "grad_norm": 1.1344877481460571, "learning_rate": 3.0139616339616394e-06, "loss": 0.8406, "mean_token_accuracy": 0.7536931725103136, "step": 2085 }, { "epoch": 0.7734832188374621, "grad_norm": 1.112585186958313, "learning_rate": 2.9678750323848893e-06, "loss": 0.8152, "mean_token_accuracy": 0.7586732704570804, "step": 2090 }, { "epoch": 0.7753336571600398, "grad_norm": 0.986150324344635, "learning_rate": 2.922082030033446e-06, "loss": 0.8169, "mean_token_accuracy": 0.7574742660162402, "step": 2095 }, { "epoch": 0.7771840954826175, "grad_norm": 1.0907959938049316, "learning_rate": 2.8765845388172955e-06, "loss": 0.822, "mean_token_accuracy": 0.7595165878036109, "step": 2100 }, { "epoch": 0.7771840954826175, "eval_loss": 0.865083634853363, "eval_mean_token_accuracy": 0.7459035597008792, "eval_runtime": 50.488, "eval_samples_per_second": 10.161, "eval_steps_per_second": 10.161, "step": 2100 }, { "epoch": 0.779034533805195, "grad_norm": 1.1251463890075684, "learning_rate": 2.831384458308518e-06, "loss": 0.8223, "mean_token_accuracy": 0.7561818495362099, "step": 2105 }, { "epoch": 0.7808849721277727, "grad_norm": 1.0305039882659912, "learning_rate": 2.7864836756619407e-06, "loss": 0.8503, "mean_token_accuracy": 0.748632069328805, "step": 2110 }, { "epoch": 0.7827354104503504, "grad_norm": 1.0406534671783447, "learning_rate": 2.741884065536373e-06, "loss": 0.821, "mean_token_accuracy": 0.7567806612457586, "step": 2115 }, { "epoch": 0.7845858487729281, "grad_norm": 1.117790937423706, "learning_rate": 2.6975874900163223e-06, "loss": 0.8417, "mean_token_accuracy": 0.7546855864916886, "step": 2120 }, { "epoch": 0.7864362870955057, "grad_norm": 1.065445065498352, "learning_rate": 2.6535957985342653e-06, "loss": 0.81, "mean_token_accuracy": 0.7606240695791034, "step": 2125 }, { "epoch": 0.7882867254180834, "grad_norm": 1.116972804069519, "learning_rate": 2.6099108277934105e-06, "loss": 0.8259, "mean_token_accuracy": 0.756568328742729, "step": 2130 }, { "epoch": 0.7901371637406611, "grad_norm": 1.0490765571594238, "learning_rate": 2.5665344016910367e-06, "loss": 0.8272, "mean_token_accuracy": 0.7554207633123468, "step": 2135 }, { "epoch": 0.7919876020632387, "grad_norm": 1.2332850694656372, "learning_rate": 2.523468331242329e-06, "loss": 0.8167, "mean_token_accuracy": 0.7584442633530236, "step": 2140 }, { "epoch": 0.7938380403858164, "grad_norm": 1.1261318922042847, "learning_rate": 2.4807144145047734e-06, "loss": 0.8272, "mean_token_accuracy": 0.7562805852137349, "step": 2145 }, { "epoch": 0.7956884787083941, "grad_norm": 1.0403841733932495, "learning_rate": 2.438274436503074e-06, "loss": 0.8476, "mean_token_accuracy": 0.7497747898973967, "step": 2150 }, { "epoch": 0.7975389170309717, "grad_norm": 1.0887295007705688, "learning_rate": 2.396150169154644e-06, "loss": 0.8612, "mean_token_accuracy": 0.7454726202967417, "step": 2155 }, { "epoch": 0.7993893553535494, "grad_norm": 1.0545668601989746, "learning_rate": 2.3543433711956197e-06, "loss": 0.8161, "mean_token_accuracy": 0.7603535982628888, "step": 2160 }, { "epoch": 0.8012397936761271, "grad_norm": 1.0553205013275146, "learning_rate": 2.3128557881074153e-06, "loss": 0.855, "mean_token_accuracy": 0.7482917388282289, "step": 2165 }, { "epoch": 0.8030902319987047, "grad_norm": 1.0758658647537231, "learning_rate": 2.271689152043873e-06, "loss": 0.82, "mean_token_accuracy": 0.7562723919351871, "step": 2170 }, { "epoch": 0.8049406703212824, "grad_norm": 1.0912736654281616, "learning_rate": 2.230845181758928e-06, "loss": 0.8518, "mean_token_accuracy": 0.7509281602598273, "step": 2175 }, { "epoch": 0.80679110864386, "grad_norm": 1.091189980506897, "learning_rate": 2.1903255825348533e-06, "loss": 0.8233, "mean_token_accuracy": 0.7585578303522367, "step": 2180 }, { "epoch": 0.8086415469664376, "grad_norm": 1.1467081308364868, "learning_rate": 2.150132046111054e-06, "loss": 0.8167, "mean_token_accuracy": 0.7596410948645677, "step": 2185 }, { "epoch": 0.8104919852890153, "grad_norm": 1.0988317728042603, "learning_rate": 2.1102662506134506e-06, "loss": 0.8554, "mean_token_accuracy": 0.7478351620581012, "step": 2190 }, { "epoch": 0.812342423611593, "grad_norm": 1.0135034322738647, "learning_rate": 2.0707298604843964e-06, "loss": 0.82, "mean_token_accuracy": 0.7557867360545079, "step": 2195 }, { "epoch": 0.8141928619341706, "grad_norm": 1.1506068706512451, "learning_rate": 2.03152452641321e-06, "loss": 0.8168, "mean_token_accuracy": 0.7570490789110471, "step": 2200 }, { "epoch": 0.8141928619341706, "eval_loss": 0.8633614182472229, "eval_mean_token_accuracy": 0.7464084312231831, "eval_runtime": 50.5455, "eval_samples_per_second": 10.149, "eval_steps_per_second": 10.149, "step": 2200 }, { "epoch": 0.8160433002567483, "grad_norm": 1.1421040296554565, "learning_rate": 1.9926518852672294e-06, "loss": 0.804, "mean_token_accuracy": 0.7624462261672521, "step": 2205 }, { "epoch": 0.817893738579326, "grad_norm": 1.1922048330307007, "learning_rate": 1.9541135600234917e-06, "loss": 0.8409, "mean_token_accuracy": 0.7499927786124297, "step": 2210 }, { "epoch": 0.8197441769019036, "grad_norm": 1.0640493631362915, "learning_rate": 1.9159111597009584e-06, "loss": 0.8556, "mean_token_accuracy": 0.749026967175465, "step": 2215 }, { "epoch": 0.8215946152244813, "grad_norm": 1.0632871389389038, "learning_rate": 1.8780462792933473e-06, "loss": 0.8311, "mean_token_accuracy": 0.7554019995751211, "step": 2220 }, { "epoch": 0.823445053547059, "grad_norm": 1.0224618911743164, "learning_rate": 1.8405204997025394e-06, "loss": 0.787, "mean_token_accuracy": 0.7662021634887914, "step": 2225 }, { "epoch": 0.8252954918696366, "grad_norm": 1.1255993843078613, "learning_rate": 1.8033353876725578e-06, "loss": 0.8139, "mean_token_accuracy": 0.7588138965092439, "step": 2230 }, { "epoch": 0.8271459301922143, "grad_norm": 1.0906031131744385, "learning_rate": 1.766492495724178e-06, "loss": 0.7866, "mean_token_accuracy": 0.7669799425354095, "step": 2235 }, { "epoch": 0.828996368514792, "grad_norm": 1.0577834844589233, "learning_rate": 1.7299933620900945e-06, "loss": 0.8255, "mean_token_accuracy": 0.7578440794991569, "step": 2240 }, { "epoch": 0.8308468068373696, "grad_norm": 1.1319166421890259, "learning_rate": 1.6938395106507034e-06, "loss": 0.8174, "mean_token_accuracy": 0.7602561091479803, "step": 2245 }, { "epoch": 0.8326972451599473, "grad_norm": 1.0294007062911987, "learning_rate": 1.658032450870467e-06, "loss": 0.8332, "mean_token_accuracy": 0.7548117730645351, "step": 2250 }, { "epoch": 0.834547683482525, "grad_norm": 1.0928364992141724, "learning_rate": 1.622573677734911e-06, "loss": 0.848, "mean_token_accuracy": 0.7495608047356359, "step": 2255 }, { "epoch": 0.8363981218051025, "grad_norm": 1.0050818920135498, "learning_rate": 1.587464671688187e-06, "loss": 0.8525, "mean_token_accuracy": 0.7487124174368831, "step": 2260 }, { "epoch": 0.8382485601276802, "grad_norm": 1.1902474164962769, "learning_rate": 1.552706898571288e-06, "loss": 0.8394, "mean_token_accuracy": 0.7522115625399438, "step": 2265 }, { "epoch": 0.8400989984502579, "grad_norm": 1.0865709781646729, "learning_rate": 1.5183018095608138e-06, "loss": 0.8648, "mean_token_accuracy": 0.746335436420623, "step": 2270 }, { "epoch": 0.8419494367728355, "grad_norm": 1.0647828578948975, "learning_rate": 1.4842508411084145e-06, "loss": 0.8436, "mean_token_accuracy": 0.7510108224281564, "step": 2275 }, { "epoch": 0.8437998750954132, "grad_norm": 1.049154281616211, "learning_rate": 1.4505554148807954e-06, "loss": 0.8406, "mean_token_accuracy": 0.7497810896600887, "step": 2280 }, { "epoch": 0.8456503134179909, "grad_norm": 1.0914316177368164, "learning_rate": 1.4172169377003775e-06, "loss": 0.8119, "mean_token_accuracy": 0.7590832228798032, "step": 2285 }, { "epoch": 0.8475007517405685, "grad_norm": 1.1115086078643799, "learning_rate": 1.3842368014865414e-06, "loss": 0.8289, "mean_token_accuracy": 0.7552640437678644, "step": 2290 }, { "epoch": 0.8493511900631462, "grad_norm": 1.0784647464752197, "learning_rate": 1.3516163831975337e-06, "loss": 0.8279, "mean_token_accuracy": 0.7538963935240591, "step": 2295 }, { "epoch": 0.8512016283857239, "grad_norm": 1.0417414903640747, "learning_rate": 1.3193570447729642e-06, "loss": 0.8291, "mean_token_accuracy": 0.7568542404327636, "step": 2300 }, { "epoch": 0.8512016283857239, "eval_loss": 0.8622255921363831, "eval_mean_token_accuracy": 0.7465137017145697, "eval_runtime": 79.4443, "eval_samples_per_second": 6.457, "eval_steps_per_second": 6.457, "step": 2300 }, { "epoch": 0.8530520667083016, "grad_norm": 1.0113840103149414, "learning_rate": 1.2874601330769488e-06, "loss": 0.8175, "mean_token_accuracy": 0.7557854286956263, "step": 2305 }, { "epoch": 0.8549025050308792, "grad_norm": 1.0227932929992676, "learning_rate": 1.255926979841876e-06, "loss": 0.8179, "mean_token_accuracy": 0.759109213108344, "step": 2310 }, { "epoch": 0.8567529433534569, "grad_norm": 1.0153535604476929, "learning_rate": 1.224758901612796e-06, "loss": 0.8392, "mean_token_accuracy": 0.7530020536038673, "step": 2315 }, { "epoch": 0.8586033816760346, "grad_norm": 1.023781657218933, "learning_rate": 1.1939571996924738e-06, "loss": 0.7914, "mean_token_accuracy": 0.76419716565575, "step": 2320 }, { "epoch": 0.8604538199986121, "grad_norm": 1.0560698509216309, "learning_rate": 1.1635231600870334e-06, "loss": 0.854, "mean_token_accuracy": 0.7493070283055853, "step": 2325 }, { "epoch": 0.8623042583211898, "grad_norm": 0.9805640578269958, "learning_rate": 1.1334580534522932e-06, "loss": 0.8357, "mean_token_accuracy": 0.7544720834621528, "step": 2330 }, { "epoch": 0.8641546966437675, "grad_norm": 1.0332831144332886, "learning_rate": 1.1037631350406874e-06, "loss": 0.7991, "mean_token_accuracy": 0.7650346153959394, "step": 2335 }, { "epoch": 0.8660051349663451, "grad_norm": 1.1042824983596802, "learning_rate": 1.0744396446488781e-06, "loss": 0.8365, "mean_token_accuracy": 0.7516225687482119, "step": 2340 }, { "epoch": 0.8678555732889228, "grad_norm": 1.0638376474380493, "learning_rate": 1.0454888065659775e-06, "loss": 0.836, "mean_token_accuracy": 0.752746712682739, "step": 2345 }, { "epoch": 0.8697060116115005, "grad_norm": 1.018282175064087, "learning_rate": 1.0169118295224488e-06, "loss": 0.8376, "mean_token_accuracy": 0.7523867122596545, "step": 2350 }, { "epoch": 0.8715564499340781, "grad_norm": 1.0781739950180054, "learning_rate": 9.887099066396178e-07, "loss": 0.8172, "mean_token_accuracy": 0.7580928881316813, "step": 2355 }, { "epoch": 0.8734068882566558, "grad_norm": 1.0491783618927002, "learning_rate": 9.608842153798903e-07, "loss": 0.819, "mean_token_accuracy": 0.7578187936876695, "step": 2360 }, { "epoch": 0.8752573265792335, "grad_norm": 1.1399493217468262, "learning_rate": 9.33435917497556e-07, "loss": 0.8229, "mean_token_accuracy": 0.7576256207110993, "step": 2365 }, { "epoch": 0.8771077649018111, "grad_norm": 1.0112969875335693, "learning_rate": 9.063661589903116e-07, "loss": 0.7902, "mean_token_accuracy": 0.7659863187941902, "step": 2370 }, { "epoch": 0.8789582032243888, "grad_norm": 1.130234718322754, "learning_rate": 8.796760700513984e-07, "loss": 0.8418, "mean_token_accuracy": 0.7490887679748288, "step": 2375 }, { "epoch": 0.8808086415469665, "grad_norm": 1.168599247932434, "learning_rate": 8.533667650224253e-07, "loss": 0.8041, "mean_token_accuracy": 0.7634415195741673, "step": 2380 }, { "epoch": 0.8826590798695441, "grad_norm": 1.1895118951797485, "learning_rate": 8.274393423468385e-07, "loss": 0.8365, "mean_token_accuracy": 0.7523803448191964, "step": 2385 }, { "epoch": 0.8845095181921218, "grad_norm": 1.0385193824768066, "learning_rate": 8.018948845240538e-07, "loss": 0.8287, "mean_token_accuracy": 0.7545559225361973, "step": 2390 }, { "epoch": 0.8863599565146995, "grad_norm": 0.9930892586708069, "learning_rate": 7.767344580642821e-07, "loss": 0.8429, "mean_token_accuracy": 0.7506830450011294, "step": 2395 }, { "epoch": 0.888210394837277, "grad_norm": 1.0381356477737427, "learning_rate": 7.519591134439753e-07, "loss": 0.8338, "mean_token_accuracy": 0.7532646629408799, "step": 2400 }, { "epoch": 0.888210394837277, "eval_loss": 0.8614717721939087, "eval_mean_token_accuracy": 0.746557203419715, "eval_runtime": 78.5694, "eval_samples_per_second": 6.529, "eval_steps_per_second": 6.529, "step": 2400 }, { "epoch": 0.8900608331598547, "grad_norm": 1.0127090215682983, "learning_rate": 7.275698850619861e-07, "loss": 0.8342, "mean_token_accuracy": 0.7525232539905243, "step": 2405 }, { "epoch": 0.8919112714824324, "grad_norm": 1.0005366802215576, "learning_rate": 7.035677911963712e-07, "loss": 0.8323, "mean_token_accuracy": 0.7529371812088047, "step": 2410 }, { "epoch": 0.89376170980501, "grad_norm": 1.0537385940551758, "learning_rate": 6.799538339618838e-07, "loss": 0.8355, "mean_token_accuracy": 0.7534756236115644, "step": 2415 }, { "epoch": 0.8956121481275877, "grad_norm": 1.1355481147766113, "learning_rate": 6.567289992681258e-07, "loss": 0.847, "mean_token_accuracy": 0.7503863788178711, "step": 2420 }, { "epoch": 0.8974625864501654, "grad_norm": 1.00087571144104, "learning_rate": 6.33894256778399e-07, "loss": 0.8086, "mean_token_accuracy": 0.7609406991600837, "step": 2425 }, { "epoch": 0.899313024772743, "grad_norm": 1.063475251197815, "learning_rate": 6.114505598692011e-07, "loss": 0.801, "mean_token_accuracy": 0.7642469955916888, "step": 2430 }, { "epoch": 0.9011634630953207, "grad_norm": 1.0210872888565063, "learning_rate": 5.893988455904387e-07, "loss": 0.8469, "mean_token_accuracy": 0.7497207039220398, "step": 2435 }, { "epoch": 0.9030139014178984, "grad_norm": 1.10099196434021, "learning_rate": 5.677400346262918e-07, "loss": 0.8375, "mean_token_accuracy": 0.7533247716063558, "step": 2440 }, { "epoch": 0.904864339740476, "grad_norm": 1.0928220748901367, "learning_rate": 5.464750312567835e-07, "loss": 0.8053, "mean_token_accuracy": 0.7616478061524402, "step": 2445 }, { "epoch": 0.9067147780630537, "grad_norm": 1.1153428554534912, "learning_rate": 5.256047233200201e-07, "loss": 0.8256, "mean_token_accuracy": 0.7550626247557867, "step": 2450 }, { "epoch": 0.9085652163856314, "grad_norm": 1.173052430152893, "learning_rate": 5.051299821751254e-07, "loss": 0.8144, "mean_token_accuracy": 0.7592750603434603, "step": 2455 }, { "epoch": 0.910415654708209, "grad_norm": 1.0116368532180786, "learning_rate": 4.850516626658585e-07, "loss": 0.84, "mean_token_accuracy": 0.754272834810296, "step": 2460 }, { "epoch": 0.9122660930307867, "grad_norm": 1.0587445497512817, "learning_rate": 4.653706030849214e-07, "loss": 0.8268, "mean_token_accuracy": 0.7556583265085172, "step": 2465 }, { "epoch": 0.9141165313533643, "grad_norm": 1.144407868385315, "learning_rate": 4.4608762513896455e-07, "loss": 0.8311, "mean_token_accuracy": 0.7554218387899451, "step": 2470 }, { "epoch": 0.9159669696759419, "grad_norm": 1.0363072156906128, "learning_rate": 4.2720353391427547e-07, "loss": 0.8452, "mean_token_accuracy": 0.7492191739469968, "step": 2475 }, { "epoch": 0.9178174079985196, "grad_norm": 1.108762502670288, "learning_rate": 4.087191178431682e-07, "loss": 0.8436, "mean_token_accuracy": 0.7492986192564005, "step": 2480 }, { "epoch": 0.9196678463210973, "grad_norm": 1.2412290573120117, "learning_rate": 3.9063514867105914e-07, "loss": 0.8517, "mean_token_accuracy": 0.7480052030779112, "step": 2485 }, { "epoch": 0.921518284643675, "grad_norm": 1.16669499874115, "learning_rate": 3.729523814242608e-07, "loss": 0.843, "mean_token_accuracy": 0.752875461916825, "step": 2490 }, { "epoch": 0.9233687229662526, "grad_norm": 1.0161534547805786, "learning_rate": 3.5567155437843725e-07, "loss": 0.8345, "mean_token_accuracy": 0.7507810240759982, "step": 2495 }, { "epoch": 0.9252191612888303, "grad_norm": 1.0766394138336182, "learning_rate": 3.3879338902779945e-07, "loss": 0.8275, "mean_token_accuracy": 0.7544474125593488, "step": 2500 }, { "epoch": 0.9252191612888303, "eval_loss": 0.8608765602111816, "eval_mean_token_accuracy": 0.7467540683531231, "eval_runtime": 80.3695, "eval_samples_per_second": 6.383, "eval_steps_per_second": 6.383, "step": 2500 }, { "epoch": 0.927069599611408, "grad_norm": 1.044554591178894, "learning_rate": 3.223185900549686e-07, "loss": 0.8351, "mean_token_accuracy": 0.753569345529369, "step": 2505 }, { "epoch": 0.9289200379339856, "grad_norm": 1.0807628631591797, "learning_rate": 3.0624784530156384e-07, "loss": 0.8297, "mean_token_accuracy": 0.7554732138827583, "step": 2510 }, { "epoch": 0.9307704762565633, "grad_norm": 1.0551187992095947, "learning_rate": 2.905818257394799e-07, "loss": 0.8208, "mean_token_accuracy": 0.7557148210943935, "step": 2515 }, { "epoch": 0.932620914579141, "grad_norm": 1.0487548112869263, "learning_rate": 2.753211854428728e-07, "loss": 0.8201, "mean_token_accuracy": 0.7570401368075597, "step": 2520 }, { "epoch": 0.9344713529017186, "grad_norm": 1.063825011253357, "learning_rate": 2.604665615608526e-07, "loss": 0.8632, "mean_token_accuracy": 0.7460995869477525, "step": 2525 }, { "epoch": 0.9363217912242963, "grad_norm": 1.0384564399719238, "learning_rate": 2.460185742908816e-07, "loss": 0.8312, "mean_token_accuracy": 0.7550518429016114, "step": 2530 }, { "epoch": 0.938172229546874, "grad_norm": 1.0318973064422607, "learning_rate": 2.3197782685288385e-07, "loss": 0.8246, "mean_token_accuracy": 0.7562569357485306, "step": 2535 }, { "epoch": 0.9400226678694515, "grad_norm": 1.0621023178100586, "learning_rate": 2.1834490546405186e-07, "loss": 0.8316, "mean_token_accuracy": 0.7541125212942166, "step": 2540 }, { "epoch": 0.9418731061920292, "grad_norm": 1.0686606168746948, "learning_rate": 2.0512037931437855e-07, "loss": 0.8402, "mean_token_accuracy": 0.7520761903722292, "step": 2545 }, { "epoch": 0.9437235445146069, "grad_norm": 1.0552746057510376, "learning_rate": 1.9230480054288958e-07, "loss": 0.8163, "mean_token_accuracy": 0.7575913676064547, "step": 2550 }, { "epoch": 0.9455739828371845, "grad_norm": 1.107093334197998, "learning_rate": 1.7989870421459498e-07, "loss": 0.8288, "mean_token_accuracy": 0.7551395264348012, "step": 2555 }, { "epoch": 0.9474244211597622, "grad_norm": 1.0550057888031006, "learning_rate": 1.6790260829814053e-07, "loss": 0.8283, "mean_token_accuracy": 0.75225936122926, "step": 2560 }, { "epoch": 0.9492748594823399, "grad_norm": 1.040071725845337, "learning_rate": 1.5631701364419492e-07, "loss": 0.8351, "mean_token_accuracy": 0.7528520564952128, "step": 2565 }, { "epoch": 0.9511252978049175, "grad_norm": 1.1225743293762207, "learning_rate": 1.4514240396452438e-07, "loss": 0.8297, "mean_token_accuracy": 0.7534344284757897, "step": 2570 }, { "epoch": 0.9529757361274952, "grad_norm": 1.0739967823028564, "learning_rate": 1.3437924581181205e-07, "loss": 0.821, "mean_token_accuracy": 0.7576335268115326, "step": 2575 }, { "epoch": 0.9548261744500729, "grad_norm": 1.108796238899231, "learning_rate": 1.2402798856016474e-07, "loss": 0.8542, "mean_token_accuracy": 0.7467002667880294, "step": 2580 }, { "epoch": 0.9566766127726505, "grad_norm": 1.180294394493103, "learning_rate": 1.1408906438636236e-07, "loss": 0.8843, "mean_token_accuracy": 0.7412087449919116, "step": 2585 }, { "epoch": 0.9585270510952282, "grad_norm": 1.0492500066757202, "learning_rate": 1.045628882518046e-07, "loss": 0.8091, "mean_token_accuracy": 0.7590382032512298, "step": 2590 }, { "epoch": 0.9603774894178059, "grad_norm": 1.1209532022476196, "learning_rate": 9.544985788519589e-08, "loss": 0.8384, "mean_token_accuracy": 0.7516073324809519, "step": 2595 }, { "epoch": 0.9622279277403835, "grad_norm": 1.0509525537490845, "learning_rate": 8.675035376593088e-08, "loss": 0.8496, "mean_token_accuracy": 0.7503324838100137, "step": 2600 }, { "epoch": 0.9622279277403835, "eval_loss": 0.8606518507003784, "eval_mean_token_accuracy": 0.7467864840774796, "eval_runtime": 50.4189, "eval_samples_per_second": 10.175, "eval_steps_per_second": 10.175, "step": 2600 }, { "epoch": 0.9640783660629612, "grad_norm": 1.1271697282791138, "learning_rate": 7.846473910821162e-08, "loss": 0.8575, "mean_token_accuracy": 0.7484827357208227, "step": 2605 }, { "epoch": 0.9659288043855389, "grad_norm": 1.0665255784988403, "learning_rate": 7.059335984588634e-08, "loss": 0.8431, "mean_token_accuracy": 0.7513135573046622, "step": 2610 }, { "epoch": 0.9677792427081164, "grad_norm": 1.1128307580947876, "learning_rate": 6.313654461800322e-08, "loss": 0.8304, "mean_token_accuracy": 0.7541726078785163, "step": 2615 }, { "epoch": 0.9696296810306941, "grad_norm": 1.0116729736328125, "learning_rate": 5.609460475509032e-08, "loss": 0.8228, "mean_token_accuracy": 0.7557024711248536, "step": 2620 }, { "epoch": 0.9714801193532718, "grad_norm": 1.0868538618087769, "learning_rate": 4.9467834266154756e-08, "loss": 0.8236, "mean_token_accuracy": 0.7560558219373958, "step": 2625 }, { "epoch": 0.9733305576758494, "grad_norm": 1.0722366571426392, "learning_rate": 4.325650982641039e-08, "loss": 0.8459, "mean_token_accuracy": 0.7509982117560366, "step": 2630 }, { "epoch": 0.9751809959984271, "grad_norm": 1.075330376625061, "learning_rate": 3.746089076572701e-08, "loss": 0.7883, "mean_token_accuracy": 0.7683721646194278, "step": 2635 }, { "epoch": 0.9770314343210048, "grad_norm": 1.0114487409591675, "learning_rate": 3.208121905779904e-08, "loss": 0.8639, "mean_token_accuracy": 0.746171101860165, "step": 2640 }, { "epoch": 0.9788818726435824, "grad_norm": 1.048780083656311, "learning_rate": 2.711771931004692e-08, "loss": 0.7718, "mean_token_accuracy": 0.7700529319379708, "step": 2645 }, { "epoch": 0.9807323109661601, "grad_norm": 0.9830264449119568, "learning_rate": 2.257059875423795e-08, "loss": 0.8044, "mean_token_accuracy": 0.7627527924691726, "step": 2650 }, { "epoch": 0.9825827492887378, "grad_norm": 1.0851788520812988, "learning_rate": 1.8440047237832105e-08, "loss": 0.825, "mean_token_accuracy": 0.7561628552480948, "step": 2655 }, { "epoch": 0.9844331876113155, "grad_norm": 1.1513216495513916, "learning_rate": 1.472623721606059e-08, "loss": 0.8282, "mean_token_accuracy": 0.7562615035996398, "step": 2660 }, { "epoch": 0.9862836259338931, "grad_norm": 1.0593661069869995, "learning_rate": 1.1429323744720499e-08, "loss": 0.8275, "mean_token_accuracy": 0.7561241815379223, "step": 2665 }, { "epoch": 0.9881340642564708, "grad_norm": 1.0882458686828613, "learning_rate": 8.549444473702207e-09, "loss": 0.8509, "mean_token_accuracy": 0.7492303053916414, "step": 2670 }, { "epoch": 0.9899845025790485, "grad_norm": 1.0708626508712769, "learning_rate": 6.086719641246186e-09, "loss": 0.7946, "mean_token_accuracy": 0.7647034307432874, "step": 2675 }, { "epoch": 0.991834940901626, "grad_norm": 1.0481311082839966, "learning_rate": 4.041252068918145e-09, "loss": 0.838, "mean_token_accuracy": 0.751368344670206, "step": 2680 }, { "epoch": 0.9936853792242037, "grad_norm": 1.0070174932479858, "learning_rate": 2.4131271573191172e-09, "loss": 0.826, "mean_token_accuracy": 0.7566141631737711, "step": 2685 }, { "epoch": 0.9955358175467814, "grad_norm": 1.024017095565796, "learning_rate": 1.2024128825172121e-09, "loss": 0.8202, "mean_token_accuracy": 0.7572512065390945, "step": 2690 }, { "epoch": 0.997386255869359, "grad_norm": 1.0600864887237549, "learning_rate": 4.0915979321320967e-10, "loss": 0.7981, "mean_token_accuracy": 0.7627408443007209, "step": 2695 }, { "epoch": 0.9992366941919367, "grad_norm": 1.0456680059432983, "learning_rate": 3.3401008625588706e-11, "loss": 0.7911, "mean_token_accuracy": 0.766019055501874, "step": 2700 }, { "epoch": 0.9992366941919367, "eval_loss": 0.860656201839447, "eval_mean_token_accuracy": 0.7467079128693144, "eval_runtime": 50.3479, "eval_samples_per_second": 10.189, "eval_steps_per_second": 10.189, "step": 2700 }, { "epoch": 0.9999768695209678, "mean_token_accuracy": 0.7699437678031534, "step": 2702, "total_flos": 76965426954240.0, "train_loss": 0.8771114350247613, "train_runtime": 110619.4732, "train_samples_per_second": 0.782, "train_steps_per_second": 0.024 } ], "logging_steps": 5, "max_steps": 2702, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 76965426954240.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }