{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999953722985793, "eval_steps": 100, "global_step": 10804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00046277014207043364, "grad_norm": 9.375, "learning_rate": 9.250693802035154e-08, "loss": 1.6692, "mean_token_accuracy": 0.6094422700587084, "step": 5 }, { "epoch": 0.0009255402841408673, "grad_norm": 10.0625, "learning_rate": 1.8501387604070307e-07, "loss": 1.7297, "mean_token_accuracy": 0.598312133072407, "step": 10 }, { "epoch": 0.0013883104262113009, "grad_norm": 9.9375, "learning_rate": 2.7752081406105456e-07, "loss": 1.6979, "mean_token_accuracy": 0.6070694716242662, "step": 15 }, { "epoch": 0.0018510805682817346, "grad_norm": 10.1875, "learning_rate": 3.7002775208140615e-07, "loss": 1.5931, "mean_token_accuracy": 0.629549902152642, "step": 20 }, { "epoch": 0.0023138507103521683, "grad_norm": 10.625, "learning_rate": 4.6253469010175763e-07, "loss": 1.6857, "mean_token_accuracy": 0.6097847358121331, "step": 25 }, { "epoch": 0.0027766208524226017, "grad_norm": 9.3125, "learning_rate": 5.550416281221091e-07, "loss": 1.6642, "mean_token_accuracy": 0.6138454011741683, "step": 30 }, { "epoch": 0.003239390994493035, "grad_norm": 12.875, "learning_rate": 6.475485661424607e-07, "loss": 1.7261, "mean_token_accuracy": 0.6058708414872798, "step": 35 }, { "epoch": 0.003702161136563469, "grad_norm": 9.5, "learning_rate": 7.400555041628123e-07, "loss": 1.6074, "mean_token_accuracy": 0.6249266144814091, "step": 40 }, { "epoch": 0.004164931278633903, "grad_norm": 10.375, "learning_rate": 8.325624421831638e-07, "loss": 1.6461, "mean_token_accuracy": 0.6216731898238748, "step": 45 }, { "epoch": 0.0046277014207043365, "grad_norm": 10.375, "learning_rate": 9.250693802035153e-07, "loss": 1.6408, "mean_token_accuracy": 0.6214041095890412, "step": 50 }, { "epoch": 0.0050904715627747696, "grad_norm": 9.8125, "learning_rate": 1.0175763182238669e-06, "loss": 1.6955, "mean_token_accuracy": 0.610004892367906, "step": 55 }, { "epoch": 0.0055532417048452035, "grad_norm": 7.875, "learning_rate": 1.1100832562442182e-06, "loss": 1.6239, "mean_token_accuracy": 0.6232632093933463, "step": 60 }, { "epoch": 0.006016011846915637, "grad_norm": 9.3125, "learning_rate": 1.20259019426457e-06, "loss": 1.6727, "mean_token_accuracy": 0.6091731898238748, "step": 65 }, { "epoch": 0.00647878198898607, "grad_norm": 8.8125, "learning_rate": 1.2950971322849214e-06, "loss": 1.6528, "mean_token_accuracy": 0.6149217221135028, "step": 70 }, { "epoch": 0.006941552131056504, "grad_norm": 8.4375, "learning_rate": 1.3876040703052728e-06, "loss": 1.6744, "mean_token_accuracy": 0.6105185909980431, "step": 75 }, { "epoch": 0.007404322273126938, "grad_norm": 8.9375, "learning_rate": 1.4801110083256246e-06, "loss": 1.7074, "mean_token_accuracy": 0.599192759295499, "step": 80 }, { "epoch": 0.007867092415197372, "grad_norm": 8.5625, "learning_rate": 1.5726179463459762e-06, "loss": 1.6745, "mean_token_accuracy": 0.6085616438356164, "step": 85 }, { "epoch": 0.008329862557267805, "grad_norm": 7.28125, "learning_rate": 1.6651248843663276e-06, "loss": 1.6471, "mean_token_accuracy": 0.6140655577299412, "step": 90 }, { "epoch": 0.008792632699338238, "grad_norm": 8.125, "learning_rate": 1.7576318223866792e-06, "loss": 1.5668, "mean_token_accuracy": 0.6296722113502936, "step": 95 }, { "epoch": 0.009255402841408673, "grad_norm": 8.9375, "learning_rate": 1.8501387604070305e-06, "loss": 1.722, "mean_token_accuracy": 0.6009784735812135, "step": 100 }, { "epoch": 0.009255402841408673, "eval_loss": 1.6742737293243408, "eval_mean_token_accuracy": 0.6102196978962823, "eval_runtime": 39.6932, "eval_samples_per_second": 25.798, "eval_steps_per_second": 6.449, "step": 100 }, { "epoch": 0.009718172983479106, "grad_norm": 7.78125, "learning_rate": 1.942645698427382e-06, "loss": 1.5815, "mean_token_accuracy": 0.6247064579256361, "step": 105 }, { "epoch": 0.010180943125549539, "grad_norm": 7.46875, "learning_rate": 2.0351526364477337e-06, "loss": 1.5911, "mean_token_accuracy": 0.6260273972602739, "step": 110 }, { "epoch": 0.010643713267619974, "grad_norm": 7.0, "learning_rate": 2.1276595744680853e-06, "loss": 1.6116, "mean_token_accuracy": 0.6214285714285716, "step": 115 }, { "epoch": 0.011106483409690407, "grad_norm": 7.59375, "learning_rate": 2.2201665124884365e-06, "loss": 1.6609, "mean_token_accuracy": 0.6101761252446184, "step": 120 }, { "epoch": 0.01156925355176084, "grad_norm": 6.96875, "learning_rate": 2.3126734505087885e-06, "loss": 1.6582, "mean_token_accuracy": 0.6104941291585126, "step": 125 }, { "epoch": 0.012032023693831275, "grad_norm": 7.03125, "learning_rate": 2.40518038852914e-06, "loss": 1.5703, "mean_token_accuracy": 0.6279354207436398, "step": 130 }, { "epoch": 0.012494793835901708, "grad_norm": 6.875, "learning_rate": 2.4976873265494912e-06, "loss": 1.5795, "mean_token_accuracy": 0.6283512720156555, "step": 135 }, { "epoch": 0.01295756397797214, "grad_norm": 6.9375, "learning_rate": 2.590194264569843e-06, "loss": 1.622, "mean_token_accuracy": 0.6183219178082192, "step": 140 }, { "epoch": 0.013420334120042576, "grad_norm": 7.96875, "learning_rate": 2.6827012025901944e-06, "loss": 1.5149, "mean_token_accuracy": 0.6432974559686888, "step": 145 }, { "epoch": 0.013883104262113009, "grad_norm": 6.65625, "learning_rate": 2.7752081406105456e-06, "loss": 1.5702, "mean_token_accuracy": 0.6275684931506849, "step": 150 }, { "epoch": 0.014345874404183442, "grad_norm": 7.3125, "learning_rate": 2.8677150786308976e-06, "loss": 1.6262, "mean_token_accuracy": 0.6151174168297455, "step": 155 }, { "epoch": 0.014808644546253876, "grad_norm": 7.25, "learning_rate": 2.960222016651249e-06, "loss": 1.6005, "mean_token_accuracy": 0.6177837573385518, "step": 160 }, { "epoch": 0.01527141468832431, "grad_norm": 7.15625, "learning_rate": 3.0527289546716004e-06, "loss": 1.571, "mean_token_accuracy": 0.6262230919765168, "step": 165 }, { "epoch": 0.015734184830394744, "grad_norm": 6.9375, "learning_rate": 3.1452358926919524e-06, "loss": 1.599, "mean_token_accuracy": 0.6210861056751467, "step": 170 }, { "epoch": 0.016196954972465177, "grad_norm": 6.53125, "learning_rate": 3.237742830712304e-06, "loss": 1.5976, "mean_token_accuracy": 0.6217465753424658, "step": 175 }, { "epoch": 0.01665972511453561, "grad_norm": 6.125, "learning_rate": 3.330249768732655e-06, "loss": 1.625, "mean_token_accuracy": 0.6125978473581214, "step": 180 }, { "epoch": 0.017122495256606043, "grad_norm": 7.0, "learning_rate": 3.4227567067530067e-06, "loss": 1.5928, "mean_token_accuracy": 0.6207191780821919, "step": 185 }, { "epoch": 0.017585265398676476, "grad_norm": 6.1875, "learning_rate": 3.5152636447733583e-06, "loss": 1.5108, "mean_token_accuracy": 0.6382338551859098, "step": 190 }, { "epoch": 0.01804803554074691, "grad_norm": 6.84375, "learning_rate": 3.60777058279371e-06, "loss": 1.5879, "mean_token_accuracy": 0.618444227005871, "step": 195 }, { "epoch": 0.018510805682817346, "grad_norm": 7.65625, "learning_rate": 3.700277520814061e-06, "loss": 1.5974, "mean_token_accuracy": 0.6173434442270059, "step": 200 }, { "epoch": 0.018510805682817346, "eval_loss": 1.5768176317214966, "eval_mean_token_accuracy": 0.621884937622309, "eval_runtime": 39.8614, "eval_samples_per_second": 25.689, "eval_steps_per_second": 6.422, "step": 200 }, { "epoch": 0.01897357582488778, "grad_norm": 6.8125, "learning_rate": 3.792784458834413e-06, "loss": 1.5797, "mean_token_accuracy": 0.6224559686888453, "step": 205 }, { "epoch": 0.019436345966958212, "grad_norm": 5.625, "learning_rate": 3.885291396854764e-06, "loss": 1.5071, "mean_token_accuracy": 0.6356653620352251, "step": 210 }, { "epoch": 0.019899116109028645, "grad_norm": 6.0625, "learning_rate": 3.977798334875116e-06, "loss": 1.6003, "mean_token_accuracy": 0.6160469667318982, "step": 215 }, { "epoch": 0.020361886251099078, "grad_norm": 6.59375, "learning_rate": 4.0703052728954674e-06, "loss": 1.5964, "mean_token_accuracy": 0.6139921722113503, "step": 220 }, { "epoch": 0.02082465639316951, "grad_norm": 5.84375, "learning_rate": 4.162812210915819e-06, "loss": 1.4985, "mean_token_accuracy": 0.6356409001956946, "step": 225 }, { "epoch": 0.021287426535239948, "grad_norm": 6.25, "learning_rate": 4.255319148936171e-06, "loss": 1.5068, "mean_token_accuracy": 0.6374510763209394, "step": 230 }, { "epoch": 0.02175019667731038, "grad_norm": 5.8125, "learning_rate": 4.347826086956522e-06, "loss": 1.4623, "mean_token_accuracy": 0.6424902152641879, "step": 235 }, { "epoch": 0.022212966819380814, "grad_norm": 5.53125, "learning_rate": 4.440333024976873e-06, "loss": 1.466, "mean_token_accuracy": 0.6434931506849315, "step": 240 }, { "epoch": 0.022675736961451247, "grad_norm": 5.75, "learning_rate": 4.532839962997225e-06, "loss": 1.4955, "mean_token_accuracy": 0.6363258317025441, "step": 245 }, { "epoch": 0.02313850710352168, "grad_norm": 7.09375, "learning_rate": 4.625346901017577e-06, "loss": 1.4298, "mean_token_accuracy": 0.6478962818003914, "step": 250 }, { "epoch": 0.023601277245592113, "grad_norm": 6.78125, "learning_rate": 4.717853839037928e-06, "loss": 1.4405, "mean_token_accuracy": 0.6439823874755382, "step": 255 }, { "epoch": 0.02406404738766255, "grad_norm": 5.9375, "learning_rate": 4.81036077705828e-06, "loss": 1.4726, "mean_token_accuracy": 0.6389432485322896, "step": 260 }, { "epoch": 0.024526817529732983, "grad_norm": 6.28125, "learning_rate": 4.902867715078632e-06, "loss": 1.4316, "mean_token_accuracy": 0.6500978473581212, "step": 265 }, { "epoch": 0.024989587671803416, "grad_norm": 5.625, "learning_rate": 4.9953746530989825e-06, "loss": 1.4278, "mean_token_accuracy": 0.6507093933463797, "step": 270 }, { "epoch": 0.02545235781387385, "grad_norm": 5.75, "learning_rate": 5.087881591119335e-06, "loss": 1.3925, "mean_token_accuracy": 0.6559931506849314, "step": 275 }, { "epoch": 0.02591512795594428, "grad_norm": 5.75, "learning_rate": 5.180388529139686e-06, "loss": 1.4997, "mean_token_accuracy": 0.6297700587084148, "step": 280 }, { "epoch": 0.026377898098014715, "grad_norm": 5.59375, "learning_rate": 5.272895467160037e-06, "loss": 1.4691, "mean_token_accuracy": 0.638307240704501, "step": 285 }, { "epoch": 0.02684066824008515, "grad_norm": 5.40625, "learning_rate": 5.365402405180389e-06, "loss": 1.4792, "mean_token_accuracy": 0.635812133072407, "step": 290 }, { "epoch": 0.027303438382155584, "grad_norm": 5.90625, "learning_rate": 5.4579093432007404e-06, "loss": 1.4882, "mean_token_accuracy": 0.6287915851272017, "step": 295 }, { "epoch": 0.027766208524226017, "grad_norm": 8.5625, "learning_rate": 5.550416281221091e-06, "loss": 1.4375, "mean_token_accuracy": 0.6427592954990216, "step": 300 }, { "epoch": 0.027766208524226017, "eval_loss": 1.4833598136901855, "eval_mean_token_accuracy": 0.6334890227495106, "eval_runtime": 39.9204, "eval_samples_per_second": 25.651, "eval_steps_per_second": 6.413, "step": 300 }, { "epoch": 0.02822897866629645, "grad_norm": 5.875, "learning_rate": 5.642923219241444e-06, "loss": 1.4231, "mean_token_accuracy": 0.6446183953033268, "step": 305 }, { "epoch": 0.028691748808366883, "grad_norm": 5.375, "learning_rate": 5.735430157261795e-06, "loss": 1.4451, "mean_token_accuracy": 0.6396771037181995, "step": 310 }, { "epoch": 0.029154518950437316, "grad_norm": 5.15625, "learning_rate": 5.827937095282147e-06, "loss": 1.4412, "mean_token_accuracy": 0.6390655577299413, "step": 315 }, { "epoch": 0.029617289092507753, "grad_norm": 6.09375, "learning_rate": 5.920444033302498e-06, "loss": 1.3984, "mean_token_accuracy": 0.6502201565557729, "step": 320 }, { "epoch": 0.030080059234578186, "grad_norm": 5.625, "learning_rate": 6.012950971322849e-06, "loss": 1.4551, "mean_token_accuracy": 0.6412915851272015, "step": 325 }, { "epoch": 0.03054282937664862, "grad_norm": 6.125, "learning_rate": 6.105457909343201e-06, "loss": 1.4119, "mean_token_accuracy": 0.6480185909980429, "step": 330 }, { "epoch": 0.031005599518719052, "grad_norm": 5.78125, "learning_rate": 6.197964847363552e-06, "loss": 1.3725, "mean_token_accuracy": 0.6574853228962818, "step": 335 }, { "epoch": 0.03146836966078949, "grad_norm": 6.75, "learning_rate": 6.290471785383905e-06, "loss": 1.367, "mean_token_accuracy": 0.6581213307240704, "step": 340 }, { "epoch": 0.03193113980285992, "grad_norm": 5.875, "learning_rate": 6.382978723404256e-06, "loss": 1.4312, "mean_token_accuracy": 0.6429305283757338, "step": 345 }, { "epoch": 0.032393909944930355, "grad_norm": 5.125, "learning_rate": 6.475485661424608e-06, "loss": 1.4317, "mean_token_accuracy": 0.6415117416829745, "step": 350 }, { "epoch": 0.03285668008700079, "grad_norm": 5.34375, "learning_rate": 6.567992599444959e-06, "loss": 1.4603, "mean_token_accuracy": 0.632999021526419, "step": 355 }, { "epoch": 0.03331945022907122, "grad_norm": 5.75, "learning_rate": 6.66049953746531e-06, "loss": 1.3966, "mean_token_accuracy": 0.6477250489236792, "step": 360 }, { "epoch": 0.033782220371141654, "grad_norm": 5.78125, "learning_rate": 6.753006475485662e-06, "loss": 1.4476, "mean_token_accuracy": 0.6435909980430528, "step": 365 }, { "epoch": 0.03424499051321209, "grad_norm": 4.9375, "learning_rate": 6.8455134135060135e-06, "loss": 1.4297, "mean_token_accuracy": 0.6438600782778865, "step": 370 }, { "epoch": 0.03470776065528252, "grad_norm": 5.28125, "learning_rate": 6.938020351526366e-06, "loss": 1.4577, "mean_token_accuracy": 0.634344422700587, "step": 375 }, { "epoch": 0.03517053079735295, "grad_norm": 5.1875, "learning_rate": 7.030527289546717e-06, "loss": 1.4046, "mean_token_accuracy": 0.6465998043052839, "step": 380 }, { "epoch": 0.035633300939423386, "grad_norm": 5.34375, "learning_rate": 7.123034227567068e-06, "loss": 1.3319, "mean_token_accuracy": 0.6621086105675147, "step": 385 }, { "epoch": 0.03609607108149382, "grad_norm": 5.0, "learning_rate": 7.21554116558742e-06, "loss": 1.3875, "mean_token_accuracy": 0.6484099804305282, "step": 390 }, { "epoch": 0.03655884122356426, "grad_norm": 4.875, "learning_rate": 7.308048103607771e-06, "loss": 1.3852, "mean_token_accuracy": 0.6495596868884541, "step": 395 }, { "epoch": 0.03702161136563469, "grad_norm": 5.1875, "learning_rate": 7.400555041628122e-06, "loss": 1.3498, "mean_token_accuracy": 0.6582191780821918, "step": 400 }, { "epoch": 0.03702161136563469, "eval_loss": 1.4238693714141846, "eval_mean_token_accuracy": 0.6415021862769079, "eval_runtime": 39.8593, "eval_samples_per_second": 25.69, "eval_steps_per_second": 6.423, "step": 400 }, { "epoch": 0.037484381507705125, "grad_norm": 5.25, "learning_rate": 7.493061979648474e-06, "loss": 1.3877, "mean_token_accuracy": 0.6494618395303328, "step": 405 }, { "epoch": 0.03794715164977556, "grad_norm": 4.8125, "learning_rate": 7.585568917668826e-06, "loss": 1.3942, "mean_token_accuracy": 0.6526174168297457, "step": 410 }, { "epoch": 0.03840992179184599, "grad_norm": 5.25, "learning_rate": 7.678075855689177e-06, "loss": 1.3904, "mean_token_accuracy": 0.6495841487279843, "step": 415 }, { "epoch": 0.038872691933916424, "grad_norm": 5.375, "learning_rate": 7.770582793709529e-06, "loss": 1.4529, "mean_token_accuracy": 0.6357632093933465, "step": 420 }, { "epoch": 0.03933546207598686, "grad_norm": 6.0, "learning_rate": 7.86308973172988e-06, "loss": 1.3824, "mean_token_accuracy": 0.646183953033268, "step": 425 }, { "epoch": 0.03979823221805729, "grad_norm": 4.9375, "learning_rate": 7.955596669750232e-06, "loss": 1.4563, "mean_token_accuracy": 0.6317270058708415, "step": 430 }, { "epoch": 0.04026100236012772, "grad_norm": 5.96875, "learning_rate": 8.048103607770583e-06, "loss": 1.3497, "mean_token_accuracy": 0.6577544031311156, "step": 435 }, { "epoch": 0.040723772502198156, "grad_norm": 4.75, "learning_rate": 8.140610545790935e-06, "loss": 1.4242, "mean_token_accuracy": 0.6423189823874755, "step": 440 }, { "epoch": 0.04118654264426859, "grad_norm": 5.25, "learning_rate": 8.233117483811286e-06, "loss": 1.3238, "mean_token_accuracy": 0.6659735812133073, "step": 445 }, { "epoch": 0.04164931278633902, "grad_norm": 5.5625, "learning_rate": 8.325624421831638e-06, "loss": 1.3881, "mean_token_accuracy": 0.6484099804305284, "step": 450 }, { "epoch": 0.04211208292840946, "grad_norm": 4.78125, "learning_rate": 8.41813135985199e-06, "loss": 1.37, "mean_token_accuracy": 0.6557240704500977, "step": 455 }, { "epoch": 0.042574853070479896, "grad_norm": 4.875, "learning_rate": 8.510638297872341e-06, "loss": 1.3906, "mean_token_accuracy": 0.6470156555772995, "step": 460 }, { "epoch": 0.04303762321255033, "grad_norm": 4.90625, "learning_rate": 8.603145235892693e-06, "loss": 1.3766, "mean_token_accuracy": 0.6512964774951077, "step": 465 }, { "epoch": 0.04350039335462076, "grad_norm": 5.125, "learning_rate": 8.695652173913044e-06, "loss": 1.3816, "mean_token_accuracy": 0.6522260273972602, "step": 470 }, { "epoch": 0.043963163496691195, "grad_norm": 6.0625, "learning_rate": 8.788159111933396e-06, "loss": 1.3968, "mean_token_accuracy": 0.646991193737769, "step": 475 }, { "epoch": 0.04442593363876163, "grad_norm": 5.34375, "learning_rate": 8.880666049953746e-06, "loss": 1.3913, "mean_token_accuracy": 0.647945205479452, "step": 480 }, { "epoch": 0.04488870378083206, "grad_norm": 5.8125, "learning_rate": 8.9731729879741e-06, "loss": 1.4849, "mean_token_accuracy": 0.6258317025440313, "step": 485 }, { "epoch": 0.045351473922902494, "grad_norm": 4.875, "learning_rate": 9.06567992599445e-06, "loss": 1.3881, "mean_token_accuracy": 0.6482142857142856, "step": 490 }, { "epoch": 0.04581424406497293, "grad_norm": 4.625, "learning_rate": 9.158186864014802e-06, "loss": 1.2978, "mean_token_accuracy": 0.6695450097847357, "step": 495 }, { "epoch": 0.04627701420704336, "grad_norm": 5.84375, "learning_rate": 9.250693802035154e-06, "loss": 1.3469, "mean_token_accuracy": 0.6593444227005871, "step": 500 }, { "epoch": 0.04627701420704336, "eval_loss": 1.380774974822998, "eval_mean_token_accuracy": 0.6482406586350292, "eval_runtime": 40.6421, "eval_samples_per_second": 25.196, "eval_steps_per_second": 6.299, "step": 500 }, { "epoch": 0.04673978434911379, "grad_norm": 5.09375, "learning_rate": 9.343200740055506e-06, "loss": 1.3312, "mean_token_accuracy": 0.6594178082191781, "step": 505 }, { "epoch": 0.047202554491184226, "grad_norm": 5.375, "learning_rate": 9.435707678075855e-06, "loss": 1.3565, "mean_token_accuracy": 0.6547700587084149, "step": 510 }, { "epoch": 0.04766532463325466, "grad_norm": 4.71875, "learning_rate": 9.528214616096207e-06, "loss": 1.3995, "mean_token_accuracy": 0.6455234833659489, "step": 515 }, { "epoch": 0.0481280947753251, "grad_norm": 5.0, "learning_rate": 9.62072155411656e-06, "loss": 1.3296, "mean_token_accuracy": 0.6602250489236791, "step": 520 }, { "epoch": 0.04859086491739553, "grad_norm": 4.78125, "learning_rate": 9.713228492136912e-06, "loss": 1.3204, "mean_token_accuracy": 0.6600538160469668, "step": 525 }, { "epoch": 0.049053635059465965, "grad_norm": 4.625, "learning_rate": 9.805735430157263e-06, "loss": 1.2955, "mean_token_accuracy": 0.668150684931507, "step": 530 }, { "epoch": 0.0495164052015364, "grad_norm": 4.46875, "learning_rate": 9.898242368177613e-06, "loss": 1.3048, "mean_token_accuracy": 0.6654843444227005, "step": 535 }, { "epoch": 0.04997917534360683, "grad_norm": 4.96875, "learning_rate": 9.990749306197965e-06, "loss": 1.3802, "mean_token_accuracy": 0.6492172211350294, "step": 540 }, { "epoch": 0.050441945485677264, "grad_norm": 4.78125, "learning_rate": 1.0083256244218318e-05, "loss": 1.4444, "mean_token_accuracy": 0.6354941291585127, "step": 545 }, { "epoch": 0.0509047156277477, "grad_norm": 4.84375, "learning_rate": 1.017576318223867e-05, "loss": 1.3561, "mean_token_accuracy": 0.6558463796477496, "step": 550 }, { "epoch": 0.05136748576981813, "grad_norm": 4.875, "learning_rate": 1.0268270120259021e-05, "loss": 1.3745, "mean_token_accuracy": 0.6444471624266145, "step": 555 }, { "epoch": 0.05183025591188856, "grad_norm": 4.71875, "learning_rate": 1.0360777058279371e-05, "loss": 1.2879, "mean_token_accuracy": 0.6686399217221135, "step": 560 }, { "epoch": 0.052293026053958996, "grad_norm": 4.6875, "learning_rate": 1.0453283996299723e-05, "loss": 1.3263, "mean_token_accuracy": 0.6586839530332682, "step": 565 }, { "epoch": 0.05275579619602943, "grad_norm": 5.03125, "learning_rate": 1.0545790934320075e-05, "loss": 1.352, "mean_token_accuracy": 0.6531800391389433, "step": 570 }, { "epoch": 0.05321856633809986, "grad_norm": 5.09375, "learning_rate": 1.0638297872340426e-05, "loss": 1.3639, "mean_token_accuracy": 0.6503913894324852, "step": 575 }, { "epoch": 0.0536813364801703, "grad_norm": 4.71875, "learning_rate": 1.0730804810360778e-05, "loss": 1.2489, "mean_token_accuracy": 0.6782289628180039, "step": 580 }, { "epoch": 0.054144106622240736, "grad_norm": 4.90625, "learning_rate": 1.082331174838113e-05, "loss": 1.357, "mean_token_accuracy": 0.6516634050880626, "step": 585 }, { "epoch": 0.05460687676431117, "grad_norm": 6.75, "learning_rate": 1.0915818686401481e-05, "loss": 1.3258, "mean_token_accuracy": 0.6621575342465753, "step": 590 }, { "epoch": 0.0550696469063816, "grad_norm": 5.25, "learning_rate": 1.1008325624421832e-05, "loss": 1.336, "mean_token_accuracy": 0.6588062622309196, "step": 595 }, { "epoch": 0.055532417048452035, "grad_norm": 4.9375, "learning_rate": 1.1100832562442182e-05, "loss": 1.3435, "mean_token_accuracy": 0.6564334637964774, "step": 600 }, { "epoch": 0.055532417048452035, "eval_loss": 1.3527331352233887, "eval_mean_token_accuracy": 0.652513836227984, "eval_runtime": 39.7965, "eval_samples_per_second": 25.731, "eval_steps_per_second": 6.433, "step": 600 }, { "epoch": 0.05599518719052247, "grad_norm": 4.5, "learning_rate": 1.1193339500462537e-05, "loss": 1.358, "mean_token_accuracy": 0.6475293542074364, "step": 605 }, { "epoch": 0.0564579573325929, "grad_norm": 4.4375, "learning_rate": 1.1285846438482887e-05, "loss": 1.3097, "mean_token_accuracy": 0.662426614481409, "step": 610 }, { "epoch": 0.056920727474663334, "grad_norm": 5.0, "learning_rate": 1.1378353376503239e-05, "loss": 1.2873, "mean_token_accuracy": 0.670156555772994, "step": 615 }, { "epoch": 0.05738349761673377, "grad_norm": 6.0, "learning_rate": 1.147086031452359e-05, "loss": 1.2854, "mean_token_accuracy": 0.6651174168297457, "step": 620 }, { "epoch": 0.0578462677588042, "grad_norm": 4.53125, "learning_rate": 1.1563367252543942e-05, "loss": 1.3268, "mean_token_accuracy": 0.6587084148727985, "step": 625 }, { "epoch": 0.05830903790087463, "grad_norm": 4.53125, "learning_rate": 1.1655874190564294e-05, "loss": 1.2755, "mean_token_accuracy": 0.667490215264188, "step": 630 }, { "epoch": 0.058771808042945066, "grad_norm": 4.625, "learning_rate": 1.1748381128584645e-05, "loss": 1.3931, "mean_token_accuracy": 0.639995107632094, "step": 635 }, { "epoch": 0.059234578185015506, "grad_norm": 4.21875, "learning_rate": 1.1840888066604997e-05, "loss": 1.2808, "mean_token_accuracy": 0.6734833659491195, "step": 640 }, { "epoch": 0.05969734832708594, "grad_norm": 4.53125, "learning_rate": 1.1933395004625348e-05, "loss": 1.2894, "mean_token_accuracy": 0.6705968688845401, "step": 645 }, { "epoch": 0.06016011846915637, "grad_norm": 4.46875, "learning_rate": 1.2025901942645698e-05, "loss": 1.2822, "mean_token_accuracy": 0.6709393346379648, "step": 650 }, { "epoch": 0.060622888611226805, "grad_norm": 4.34375, "learning_rate": 1.211840888066605e-05, "loss": 1.273, "mean_token_accuracy": 0.6692759295499023, "step": 655 }, { "epoch": 0.06108565875329724, "grad_norm": 5.1875, "learning_rate": 1.2210915818686401e-05, "loss": 1.3203, "mean_token_accuracy": 0.6609344422700587, "step": 660 }, { "epoch": 0.06154842889536767, "grad_norm": 4.78125, "learning_rate": 1.2303422756706753e-05, "loss": 1.3446, "mean_token_accuracy": 0.6545009784735811, "step": 665 }, { "epoch": 0.062011199037438104, "grad_norm": 4.5, "learning_rate": 1.2395929694727105e-05, "loss": 1.3419, "mean_token_accuracy": 0.6584148727984345, "step": 670 }, { "epoch": 0.06247396917950854, "grad_norm": 4.59375, "learning_rate": 1.2488436632747456e-05, "loss": 1.2544, "mean_token_accuracy": 0.6741927592954988, "step": 675 }, { "epoch": 0.06293673932157898, "grad_norm": 5.0625, "learning_rate": 1.258094357076781e-05, "loss": 1.2699, "mean_token_accuracy": 0.6747553816046967, "step": 680 }, { "epoch": 0.0633995094636494, "grad_norm": 4.96875, "learning_rate": 1.2673450508788161e-05, "loss": 1.2732, "mean_token_accuracy": 0.6733365949119373, "step": 685 }, { "epoch": 0.06386227960571984, "grad_norm": 4.84375, "learning_rate": 1.2765957446808513e-05, "loss": 1.3232, "mean_token_accuracy": 0.6623043052837575, "step": 690 }, { "epoch": 0.06432504974779027, "grad_norm": 4.40625, "learning_rate": 1.2858464384828864e-05, "loss": 1.2604, "mean_token_accuracy": 0.6704990215264187, "step": 695 }, { "epoch": 0.06478781988986071, "grad_norm": 4.59375, "learning_rate": 1.2950971322849216e-05, "loss": 1.3599, "mean_token_accuracy": 0.6530577299412915, "step": 700 }, { "epoch": 0.06478781988986071, "eval_loss": 1.3315926790237427, "eval_mean_token_accuracy": 0.6561659124266139, "eval_runtime": 39.7849, "eval_samples_per_second": 25.738, "eval_steps_per_second": 6.435, "step": 700 }, { "epoch": 0.06525059003193114, "grad_norm": 4.78125, "learning_rate": 1.3043478260869566e-05, "loss": 1.2559, "mean_token_accuracy": 0.675293542074364, "step": 705 }, { "epoch": 0.06571336017400158, "grad_norm": 4.6875, "learning_rate": 1.3135985198889917e-05, "loss": 1.3584, "mean_token_accuracy": 0.6512964774951077, "step": 710 }, { "epoch": 0.066176130316072, "grad_norm": 4.75, "learning_rate": 1.3228492136910269e-05, "loss": 1.3447, "mean_token_accuracy": 0.6600293542074362, "step": 715 }, { "epoch": 0.06663890045814244, "grad_norm": 4.28125, "learning_rate": 1.332099907493062e-05, "loss": 1.2856, "mean_token_accuracy": 0.665215264187867, "step": 720 }, { "epoch": 0.06710167060021287, "grad_norm": 4.78125, "learning_rate": 1.3413506012950972e-05, "loss": 1.2994, "mean_token_accuracy": 0.6622798434442272, "step": 725 }, { "epoch": 0.06756444074228331, "grad_norm": 4.34375, "learning_rate": 1.3506012950971324e-05, "loss": 1.2373, "mean_token_accuracy": 0.6759784735812133, "step": 730 }, { "epoch": 0.06802721088435375, "grad_norm": 4.75, "learning_rate": 1.3598519888991675e-05, "loss": 1.2889, "mean_token_accuracy": 0.6695205479452054, "step": 735 }, { "epoch": 0.06848998102642417, "grad_norm": 4.78125, "learning_rate": 1.3691026827012027e-05, "loss": 1.2214, "mean_token_accuracy": 0.6832681017612524, "step": 740 }, { "epoch": 0.06895275116849461, "grad_norm": 5.5, "learning_rate": 1.3783533765032377e-05, "loss": 1.25, "mean_token_accuracy": 0.6775195694716242, "step": 745 }, { "epoch": 0.06941552131056504, "grad_norm": 4.65625, "learning_rate": 1.3876040703052732e-05, "loss": 1.2804, "mean_token_accuracy": 0.6680528375733854, "step": 750 }, { "epoch": 0.06987829145263548, "grad_norm": 4.59375, "learning_rate": 1.3968547641073082e-05, "loss": 1.2966, "mean_token_accuracy": 0.6675880626223094, "step": 755 }, { "epoch": 0.0703410615947059, "grad_norm": 4.6875, "learning_rate": 1.4061054579093433e-05, "loss": 1.2941, "mean_token_accuracy": 0.6638454011741681, "step": 760 }, { "epoch": 0.07080383173677635, "grad_norm": 5.25, "learning_rate": 1.4153561517113785e-05, "loss": 1.3782, "mean_token_accuracy": 0.6399706457925637, "step": 765 }, { "epoch": 0.07126660187884677, "grad_norm": 4.40625, "learning_rate": 1.4246068455134136e-05, "loss": 1.2471, "mean_token_accuracy": 0.6729941291585126, "step": 770 }, { "epoch": 0.07172937202091721, "grad_norm": 4.78125, "learning_rate": 1.4338575393154488e-05, "loss": 1.2619, "mean_token_accuracy": 0.6720645792563601, "step": 775 }, { "epoch": 0.07219214216298764, "grad_norm": 4.46875, "learning_rate": 1.443108233117484e-05, "loss": 1.3722, "mean_token_accuracy": 0.6459882583170253, "step": 780 }, { "epoch": 0.07265491230505808, "grad_norm": 6.59375, "learning_rate": 1.4523589269195191e-05, "loss": 1.197, "mean_token_accuracy": 0.6849315068493151, "step": 785 }, { "epoch": 0.07311768244712852, "grad_norm": 4.3125, "learning_rate": 1.4616096207215543e-05, "loss": 1.2391, "mean_token_accuracy": 0.6797455968688848, "step": 790 }, { "epoch": 0.07358045258919894, "grad_norm": 4.28125, "learning_rate": 1.4708603145235893e-05, "loss": 1.308, "mean_token_accuracy": 0.662059686888454, "step": 795 }, { "epoch": 0.07404322273126938, "grad_norm": 5.375, "learning_rate": 1.4801110083256244e-05, "loss": 1.3566, "mean_token_accuracy": 0.6492416829745598, "step": 800 }, { "epoch": 0.07404322273126938, "eval_loss": 1.3135346174240112, "eval_mean_token_accuracy": 0.6590936888454004, "eval_runtime": 39.7961, "eval_samples_per_second": 25.731, "eval_steps_per_second": 6.433, "step": 800 }, { "epoch": 0.07450599287333981, "grad_norm": 5.96875, "learning_rate": 1.4893617021276596e-05, "loss": 1.3293, "mean_token_accuracy": 0.6541095890410957, "step": 805 }, { "epoch": 0.07496876301541025, "grad_norm": 4.9375, "learning_rate": 1.4986123959296947e-05, "loss": 1.2887, "mean_token_accuracy": 0.6652641878669275, "step": 810 }, { "epoch": 0.07543153315748068, "grad_norm": 5.375, "learning_rate": 1.5078630897317299e-05, "loss": 1.3193, "mean_token_accuracy": 0.6605919765166341, "step": 815 }, { "epoch": 0.07589430329955112, "grad_norm": 4.90625, "learning_rate": 1.5171137835337652e-05, "loss": 1.3028, "mean_token_accuracy": 0.6654598825831702, "step": 820 }, { "epoch": 0.07635707344162154, "grad_norm": 4.4375, "learning_rate": 1.5263644773358006e-05, "loss": 1.3194, "mean_token_accuracy": 0.6570450097847358, "step": 825 }, { "epoch": 0.07681984358369198, "grad_norm": 4.5625, "learning_rate": 1.5356151711378354e-05, "loss": 1.2598, "mean_token_accuracy": 0.6688356164383562, "step": 830 }, { "epoch": 0.07728261372576241, "grad_norm": 4.5, "learning_rate": 1.5448658649398705e-05, "loss": 1.2834, "mean_token_accuracy": 0.6653620352250489, "step": 835 }, { "epoch": 0.07774538386783285, "grad_norm": 4.4375, "learning_rate": 1.5541165587419057e-05, "loss": 1.2374, "mean_token_accuracy": 0.6756115459882585, "step": 840 }, { "epoch": 0.07820815400990327, "grad_norm": 4.53125, "learning_rate": 1.563367252543941e-05, "loss": 1.2287, "mean_token_accuracy": 0.680088062622309, "step": 845 }, { "epoch": 0.07867092415197371, "grad_norm": 4.5, "learning_rate": 1.572617946345976e-05, "loss": 1.3121, "mean_token_accuracy": 0.661179060665362, "step": 850 }, { "epoch": 0.07913369429404415, "grad_norm": 5.0625, "learning_rate": 1.5818686401480112e-05, "loss": 1.2912, "mean_token_accuracy": 0.6651908023483365, "step": 855 }, { "epoch": 0.07959646443611458, "grad_norm": 4.125, "learning_rate": 1.5911193339500463e-05, "loss": 1.2994, "mean_token_accuracy": 0.6600538160469667, "step": 860 }, { "epoch": 0.08005923457818502, "grad_norm": 4.40625, "learning_rate": 1.6003700277520815e-05, "loss": 1.2986, "mean_token_accuracy": 0.6603228962818004, "step": 865 }, { "epoch": 0.08052200472025545, "grad_norm": 4.90625, "learning_rate": 1.6096207215541167e-05, "loss": 1.2518, "mean_token_accuracy": 0.6732632093933463, "step": 870 }, { "epoch": 0.08098477486232589, "grad_norm": 4.84375, "learning_rate": 1.6188714153561518e-05, "loss": 1.3248, "mean_token_accuracy": 0.6538649706457925, "step": 875 }, { "epoch": 0.08144754500439631, "grad_norm": 4.53125, "learning_rate": 1.628122109158187e-05, "loss": 1.3246, "mean_token_accuracy": 0.6565802348336593, "step": 880 }, { "epoch": 0.08191031514646675, "grad_norm": 4.65625, "learning_rate": 1.637372802960222e-05, "loss": 1.3062, "mean_token_accuracy": 0.6603962818003912, "step": 885 }, { "epoch": 0.08237308528853718, "grad_norm": 5.125, "learning_rate": 1.6466234967622573e-05, "loss": 1.3155, "mean_token_accuracy": 0.6537671232876712, "step": 890 }, { "epoch": 0.08283585543060762, "grad_norm": 4.3125, "learning_rate": 1.6558741905642925e-05, "loss": 1.2084, "mean_token_accuracy": 0.6805283757338552, "step": 895 }, { "epoch": 0.08329862557267805, "grad_norm": 4.96875, "learning_rate": 1.6651248843663276e-05, "loss": 1.3182, "mean_token_accuracy": 0.6541829745596869, "step": 900 }, { "epoch": 0.08329862557267805, "eval_loss": 1.2987834215164185, "eval_mean_token_accuracy": 0.6613258317025444, "eval_runtime": 39.771, "eval_samples_per_second": 25.747, "eval_steps_per_second": 6.437, "step": 900 }, { "epoch": 0.08376139571474848, "grad_norm": 4.46875, "learning_rate": 1.6743755781683628e-05, "loss": 1.3117, "mean_token_accuracy": 0.6610322896281801, "step": 905 }, { "epoch": 0.08422416585681892, "grad_norm": 4.21875, "learning_rate": 1.683626271970398e-05, "loss": 1.253, "mean_token_accuracy": 0.6714041095890411, "step": 910 }, { "epoch": 0.08468693599888935, "grad_norm": 4.3125, "learning_rate": 1.692876965772433e-05, "loss": 1.2426, "mean_token_accuracy": 0.6736056751467709, "step": 915 }, { "epoch": 0.08514970614095979, "grad_norm": 4.4375, "learning_rate": 1.7021276595744682e-05, "loss": 1.3505, "mean_token_accuracy": 0.6465753424657533, "step": 920 }, { "epoch": 0.08561247628303022, "grad_norm": 4.78125, "learning_rate": 1.7113783533765034e-05, "loss": 1.3269, "mean_token_accuracy": 0.6586350293542074, "step": 925 }, { "epoch": 0.08607524642510066, "grad_norm": 4.3125, "learning_rate": 1.7206290471785386e-05, "loss": 1.2774, "mean_token_accuracy": 0.6664383561643836, "step": 930 }, { "epoch": 0.08653801656717108, "grad_norm": 4.5625, "learning_rate": 1.7298797409805737e-05, "loss": 1.2809, "mean_token_accuracy": 0.665949119373777, "step": 935 }, { "epoch": 0.08700078670924152, "grad_norm": 4.75, "learning_rate": 1.739130434782609e-05, "loss": 1.2796, "mean_token_accuracy": 0.6647749510763209, "step": 940 }, { "epoch": 0.08746355685131195, "grad_norm": 4.3125, "learning_rate": 1.748381128584644e-05, "loss": 1.3196, "mean_token_accuracy": 0.6594667318982387, "step": 945 }, { "epoch": 0.08792632699338239, "grad_norm": 4.53125, "learning_rate": 1.7576318223866792e-05, "loss": 1.2825, "mean_token_accuracy": 0.6619373776908024, "step": 950 }, { "epoch": 0.08838909713545282, "grad_norm": 4.6875, "learning_rate": 1.7668825161887144e-05, "loss": 1.2856, "mean_token_accuracy": 0.6655577299412917, "step": 955 }, { "epoch": 0.08885186727752326, "grad_norm": 4.5625, "learning_rate": 1.7761332099907492e-05, "loss": 1.2666, "mean_token_accuracy": 0.6677103718199608, "step": 960 }, { "epoch": 0.08931463741959368, "grad_norm": 4.46875, "learning_rate": 1.7853839037927847e-05, "loss": 1.2674, "mean_token_accuracy": 0.6667808219178083, "step": 965 }, { "epoch": 0.08977740756166412, "grad_norm": 4.40625, "learning_rate": 1.79463459759482e-05, "loss": 1.2296, "mean_token_accuracy": 0.6755381604696673, "step": 970 }, { "epoch": 0.09024017770373456, "grad_norm": 4.21875, "learning_rate": 1.803885291396855e-05, "loss": 1.3009, "mean_token_accuracy": 0.6604207436399218, "step": 975 }, { "epoch": 0.09070294784580499, "grad_norm": 4.4375, "learning_rate": 1.81313598519889e-05, "loss": 1.2697, "mean_token_accuracy": 0.6683953033268102, "step": 980 }, { "epoch": 0.09116571798787543, "grad_norm": 4.34375, "learning_rate": 1.8223866790009253e-05, "loss": 1.2835, "mean_token_accuracy": 0.665728962818004, "step": 985 }, { "epoch": 0.09162848812994585, "grad_norm": 4.21875, "learning_rate": 1.8316373728029605e-05, "loss": 1.2673, "mean_token_accuracy": 0.6684442270058709, "step": 990 }, { "epoch": 0.0920912582720163, "grad_norm": 4.34375, "learning_rate": 1.8408880666049956e-05, "loss": 1.2494, "mean_token_accuracy": 0.672945205479452, "step": 995 }, { "epoch": 0.09255402841408672, "grad_norm": 4.59375, "learning_rate": 1.8501387604070308e-05, "loss": 1.3511, "mean_token_accuracy": 0.6471135029354207, "step": 1000 }, { "epoch": 0.09255402841408672, "eval_loss": 1.2871110439300537, "eval_mean_token_accuracy": 0.6632063356164377, "eval_runtime": 39.8818, "eval_samples_per_second": 25.676, "eval_steps_per_second": 6.419, "step": 1000 }, { "epoch": 0.09301679855615716, "grad_norm": 4.59375, "learning_rate": 1.859389454209066e-05, "loss": 1.2894, "mean_token_accuracy": 0.6648727984344422, "step": 1005 }, { "epoch": 0.09347956869822759, "grad_norm": 4.5, "learning_rate": 1.868640148011101e-05, "loss": 1.2924, "mean_token_accuracy": 0.6629403131115461, "step": 1010 }, { "epoch": 0.09394233884029803, "grad_norm": 4.34375, "learning_rate": 1.877890841813136e-05, "loss": 1.3312, "mean_token_accuracy": 0.651320939334638, "step": 1015 }, { "epoch": 0.09440510898236845, "grad_norm": 4.28125, "learning_rate": 1.887141535615171e-05, "loss": 1.2347, "mean_token_accuracy": 0.6769324853228962, "step": 1020 }, { "epoch": 0.09486787912443889, "grad_norm": 4.59375, "learning_rate": 1.8963922294172062e-05, "loss": 1.3135, "mean_token_accuracy": 0.6541585127201566, "step": 1025 }, { "epoch": 0.09533064926650932, "grad_norm": 4.65625, "learning_rate": 1.9056429232192414e-05, "loss": 1.2366, "mean_token_accuracy": 0.6753180039138943, "step": 1030 }, { "epoch": 0.09579341940857976, "grad_norm": 4.9375, "learning_rate": 1.914893617021277e-05, "loss": 1.197, "mean_token_accuracy": 0.6845401174168297, "step": 1035 }, { "epoch": 0.0962561895506502, "grad_norm": 4.5625, "learning_rate": 1.924144310823312e-05, "loss": 1.2569, "mean_token_accuracy": 0.6700097847358121, "step": 1040 }, { "epoch": 0.09671895969272062, "grad_norm": 5.90625, "learning_rate": 1.9333950046253472e-05, "loss": 1.2836, "mean_token_accuracy": 0.6629158512720157, "step": 1045 }, { "epoch": 0.09718172983479106, "grad_norm": 4.28125, "learning_rate": 1.9426456984273824e-05, "loss": 1.2609, "mean_token_accuracy": 0.6693003913894324, "step": 1050 }, { "epoch": 0.09764449997686149, "grad_norm": 4.875, "learning_rate": 1.9518963922294175e-05, "loss": 1.2679, "mean_token_accuracy": 0.6632827788649707, "step": 1055 }, { "epoch": 0.09810727011893193, "grad_norm": 4.3125, "learning_rate": 1.9611470860314527e-05, "loss": 1.1324, "mean_token_accuracy": 0.7009784735812132, "step": 1060 }, { "epoch": 0.09857004026100236, "grad_norm": 4.4375, "learning_rate": 1.9703977798334875e-05, "loss": 1.2225, "mean_token_accuracy": 0.6807240704500981, "step": 1065 }, { "epoch": 0.0990328104030728, "grad_norm": 5.46875, "learning_rate": 1.9796484736355227e-05, "loss": 1.2072, "mean_token_accuracy": 0.6842221135029354, "step": 1070 }, { "epoch": 0.09949558054514322, "grad_norm": 5.0, "learning_rate": 1.988899167437558e-05, "loss": 1.1828, "mean_token_accuracy": 0.6865704500978472, "step": 1075 }, { "epoch": 0.09995835068721366, "grad_norm": 4.28125, "learning_rate": 1.998149861239593e-05, "loss": 1.2861, "mean_token_accuracy": 0.6629403131115461, "step": 1080 }, { "epoch": 0.10042112082928409, "grad_norm": 5.25, "learning_rate": 1.999999164802664e-05, "loss": 1.2287, "mean_token_accuracy": 0.6751956947162426, "step": 1085 }, { "epoch": 0.10088389097135453, "grad_norm": 4.59375, "learning_rate": 1.9999957718158776e-05, "loss": 1.1864, "mean_token_accuracy": 0.6863013698630136, "step": 1090 }, { "epoch": 0.10134666111342497, "grad_norm": 4.8125, "learning_rate": 1.9999897688486557e-05, "loss": 1.2136, "mean_token_accuracy": 0.6771037181996086, "step": 1095 }, { "epoch": 0.1018094312554954, "grad_norm": 4.59375, "learning_rate": 1.9999811559166664e-05, "loss": 1.2516, "mean_token_accuracy": 0.6717954990215265, "step": 1100 }, { "epoch": 0.1018094312554954, "eval_loss": 1.2775851488113403, "eval_mean_token_accuracy": 0.664972174657534, "eval_runtime": 40.6927, "eval_samples_per_second": 25.164, "eval_steps_per_second": 6.291, "step": 1100 }, { "epoch": 0.10227220139756583, "grad_norm": 4.40625, "learning_rate": 1.9999699330423892e-05, "loss": 1.2485, "mean_token_accuracy": 0.671917808219178, "step": 1105 }, { "epoch": 0.10273497153963626, "grad_norm": 4.40625, "learning_rate": 1.999956100255116e-05, "loss": 1.2896, "mean_token_accuracy": 0.6611545988258316, "step": 1110 }, { "epoch": 0.1031977416817067, "grad_norm": 4.46875, "learning_rate": 1.9999396575909498e-05, "loss": 1.1777, "mean_token_accuracy": 0.6877446183953033, "step": 1115 }, { "epoch": 0.10366051182377713, "grad_norm": 4.34375, "learning_rate": 1.999920605092806e-05, "loss": 1.2162, "mean_token_accuracy": 0.6811643835616439, "step": 1120 }, { "epoch": 0.10412328196584757, "grad_norm": 4.25, "learning_rate": 1.9998989428104115e-05, "loss": 1.2319, "mean_token_accuracy": 0.6786448140900195, "step": 1125 }, { "epoch": 0.10458605210791799, "grad_norm": 5.3125, "learning_rate": 1.9998746708003044e-05, "loss": 1.2494, "mean_token_accuracy": 0.6738502935420743, "step": 1130 }, { "epoch": 0.10504882224998843, "grad_norm": 4.28125, "learning_rate": 1.9998477891258352e-05, "loss": 1.2701, "mean_token_accuracy": 0.6679305283757337, "step": 1135 }, { "epoch": 0.10551159239205886, "grad_norm": 4.75, "learning_rate": 1.9998182978571643e-05, "loss": 1.2398, "mean_token_accuracy": 0.6742172211350294, "step": 1140 }, { "epoch": 0.1059743625341293, "grad_norm": 4.15625, "learning_rate": 1.9997861970712636e-05, "loss": 1.2642, "mean_token_accuracy": 0.6634784735812134, "step": 1145 }, { "epoch": 0.10643713267619972, "grad_norm": 4.625, "learning_rate": 1.9997514868519156e-05, "loss": 1.2239, "mean_token_accuracy": 0.6788649706457924, "step": 1150 }, { "epoch": 0.10689990281827016, "grad_norm": 4.40625, "learning_rate": 1.9997141672897147e-05, "loss": 1.2336, "mean_token_accuracy": 0.676908023483366, "step": 1155 }, { "epoch": 0.1073626729603406, "grad_norm": 4.34375, "learning_rate": 1.9996742384820637e-05, "loss": 1.2074, "mean_token_accuracy": 0.6842465753424657, "step": 1160 }, { "epoch": 0.10782544310241103, "grad_norm": 4.21875, "learning_rate": 1.9996317005331768e-05, "loss": 1.2694, "mean_token_accuracy": 0.664481409001957, "step": 1165 }, { "epoch": 0.10828821324448147, "grad_norm": 4.34375, "learning_rate": 1.9995865535540776e-05, "loss": 1.2169, "mean_token_accuracy": 0.6759050880626224, "step": 1170 }, { "epoch": 0.1087509833865519, "grad_norm": 4.65625, "learning_rate": 1.999538797662599e-05, "loss": 1.2526, "mean_token_accuracy": 0.6713307240704502, "step": 1175 }, { "epoch": 0.10921375352862234, "grad_norm": 4.25, "learning_rate": 1.9994884329833844e-05, "loss": 1.272, "mean_token_accuracy": 0.6680772994129158, "step": 1180 }, { "epoch": 0.10967652367069276, "grad_norm": 4.1875, "learning_rate": 1.9994354596478843e-05, "loss": 1.2569, "mean_token_accuracy": 0.669251467710372, "step": 1185 }, { "epoch": 0.1101392938127632, "grad_norm": 4.28125, "learning_rate": 1.9993798777943593e-05, "loss": 1.3002, "mean_token_accuracy": 0.6612769080234833, "step": 1190 }, { "epoch": 0.11060206395483363, "grad_norm": 6.34375, "learning_rate": 1.9993216875678765e-05, "loss": 1.3071, "mean_token_accuracy": 0.6554549902152642, "step": 1195 }, { "epoch": 0.11106483409690407, "grad_norm": 4.1875, "learning_rate": 1.9992608891203135e-05, "loss": 1.2588, "mean_token_accuracy": 0.6716487279843445, "step": 1200 }, { "epoch": 0.11106483409690407, "eval_loss": 1.2679208517074585, "eval_mean_token_accuracy": 0.6665889493639917, "eval_runtime": 39.8973, "eval_samples_per_second": 25.666, "eval_steps_per_second": 6.416, "step": 1200 }, { "epoch": 0.1115276042389745, "grad_norm": 4.6875, "learning_rate": 1.9991974826103525e-05, "loss": 1.221, "mean_token_accuracy": 0.6755626223091977, "step": 1205 }, { "epoch": 0.11199037438104494, "grad_norm": 4.71875, "learning_rate": 1.999131468203484e-05, "loss": 1.1843, "mean_token_accuracy": 0.6861545988258317, "step": 1210 }, { "epoch": 0.11245314452311538, "grad_norm": 4.5, "learning_rate": 1.999062846072006e-05, "loss": 1.2228, "mean_token_accuracy": 0.6807729941291586, "step": 1215 }, { "epoch": 0.1129159146651858, "grad_norm": 4.59375, "learning_rate": 1.9989916163950203e-05, "loss": 1.2261, "mean_token_accuracy": 0.6779354207436401, "step": 1220 }, { "epoch": 0.11337868480725624, "grad_norm": 4.40625, "learning_rate": 1.9989177793584367e-05, "loss": 1.2561, "mean_token_accuracy": 0.6670009784735811, "step": 1225 }, { "epoch": 0.11384145494932667, "grad_norm": 5.59375, "learning_rate": 1.9988413351549694e-05, "loss": 1.215, "mean_token_accuracy": 0.6804305283757339, "step": 1230 }, { "epoch": 0.11430422509139711, "grad_norm": 4.28125, "learning_rate": 1.998762283984136e-05, "loss": 1.281, "mean_token_accuracy": 0.6622309197651663, "step": 1235 }, { "epoch": 0.11476699523346753, "grad_norm": 4.15625, "learning_rate": 1.9986806260522608e-05, "loss": 1.263, "mean_token_accuracy": 0.6702544031311154, "step": 1240 }, { "epoch": 0.11522976537553797, "grad_norm": 5.1875, "learning_rate": 1.9985963615724696e-05, "loss": 1.2422, "mean_token_accuracy": 0.6708414872798434, "step": 1245 }, { "epoch": 0.1156925355176084, "grad_norm": 4.21875, "learning_rate": 1.9985094907646916e-05, "loss": 1.2942, "mean_token_accuracy": 0.6594178082191782, "step": 1250 }, { "epoch": 0.11615530565967884, "grad_norm": 4.625, "learning_rate": 1.9984200138556593e-05, "loss": 1.2709, "mean_token_accuracy": 0.6613502935420742, "step": 1255 }, { "epoch": 0.11661807580174927, "grad_norm": 4.34375, "learning_rate": 1.9983279310789068e-05, "loss": 1.2789, "mean_token_accuracy": 0.6713307240704501, "step": 1260 }, { "epoch": 0.1170808459438197, "grad_norm": 5.125, "learning_rate": 1.9982332426747692e-05, "loss": 1.1882, "mean_token_accuracy": 0.6875000000000001, "step": 1265 }, { "epoch": 0.11754361608589013, "grad_norm": 4.65625, "learning_rate": 1.9981359488903818e-05, "loss": 1.2078, "mean_token_accuracy": 0.6867906066536202, "step": 1270 }, { "epoch": 0.11800638622796057, "grad_norm": 4.15625, "learning_rate": 1.9980360499796822e-05, "loss": 1.218, "mean_token_accuracy": 0.6802837573385518, "step": 1275 }, { "epoch": 0.11846915637003101, "grad_norm": 4.3125, "learning_rate": 1.997933546203404e-05, "loss": 1.265, "mean_token_accuracy": 0.6714041095890411, "step": 1280 }, { "epoch": 0.11893192651210144, "grad_norm": 4.3125, "learning_rate": 1.997828437829082e-05, "loss": 1.2558, "mean_token_accuracy": 0.6682729941291584, "step": 1285 }, { "epoch": 0.11939469665417188, "grad_norm": 4.15625, "learning_rate": 1.997720725131048e-05, "loss": 1.1923, "mean_token_accuracy": 0.6816291585127201, "step": 1290 }, { "epoch": 0.1198574667962423, "grad_norm": 4.34375, "learning_rate": 1.997610408390431e-05, "loss": 1.2717, "mean_token_accuracy": 0.6661692759295498, "step": 1295 }, { "epoch": 0.12032023693831274, "grad_norm": 4.5625, "learning_rate": 1.9974974878951574e-05, "loss": 1.2464, "mean_token_accuracy": 0.6693248532289628, "step": 1300 }, { "epoch": 0.12032023693831274, "eval_loss": 1.259621024131775, "eval_mean_token_accuracy": 0.6674164475293541, "eval_runtime": 39.8134, "eval_samples_per_second": 25.72, "eval_steps_per_second": 6.43, "step": 1300 }, { "epoch": 0.12078300708038317, "grad_norm": 4.71875, "learning_rate": 1.997381963939948e-05, "loss": 1.2804, "mean_token_accuracy": 0.6617416829745597, "step": 1305 }, { "epoch": 0.12124577722245361, "grad_norm": 5.0, "learning_rate": 1.9972638368263196e-05, "loss": 1.1766, "mean_token_accuracy": 0.6887720156555773, "step": 1310 }, { "epoch": 0.12170854736452404, "grad_norm": 4.40625, "learning_rate": 1.997143106862583e-05, "loss": 1.253, "mean_token_accuracy": 0.6726761252446184, "step": 1315 }, { "epoch": 0.12217131750659448, "grad_norm": 4.6875, "learning_rate": 1.997019774363843e-05, "loss": 1.1565, "mean_token_accuracy": 0.6972113502935422, "step": 1320 }, { "epoch": 0.1226340876486649, "grad_norm": 4.53125, "learning_rate": 1.9968938396519953e-05, "loss": 1.2905, "mean_token_accuracy": 0.6626223091976517, "step": 1325 }, { "epoch": 0.12309685779073534, "grad_norm": 4.0, "learning_rate": 1.996765303055729e-05, "loss": 1.1257, "mean_token_accuracy": 0.7029843444227006, "step": 1330 }, { "epoch": 0.12355962793280578, "grad_norm": 4.53125, "learning_rate": 1.9966341649105228e-05, "loss": 1.268, "mean_token_accuracy": 0.666756360078278, "step": 1335 }, { "epoch": 0.12402239807487621, "grad_norm": 4.71875, "learning_rate": 1.9965004255586476e-05, "loss": 1.2444, "mean_token_accuracy": 0.6704990215264188, "step": 1340 }, { "epoch": 0.12448516821694665, "grad_norm": 4.625, "learning_rate": 1.9963640853491607e-05, "loss": 1.2044, "mean_token_accuracy": 0.6828033268101762, "step": 1345 }, { "epoch": 0.12494793835901707, "grad_norm": 4.40625, "learning_rate": 1.9962251446379092e-05, "loss": 1.2512, "mean_token_accuracy": 0.6699853228962819, "step": 1350 }, { "epoch": 0.12541070850108751, "grad_norm": 4.46875, "learning_rate": 1.9960836037875274e-05, "loss": 1.1668, "mean_token_accuracy": 0.6954011741682975, "step": 1355 }, { "epoch": 0.12587347864315795, "grad_norm": 4.28125, "learning_rate": 1.9959394631674357e-05, "loss": 1.207, "mean_token_accuracy": 0.6817025440313111, "step": 1360 }, { "epoch": 0.12633624878522837, "grad_norm": 4.4375, "learning_rate": 1.9957927231538395e-05, "loss": 1.1989, "mean_token_accuracy": 0.6853473581213307, "step": 1365 }, { "epoch": 0.1267990189272988, "grad_norm": 4.25, "learning_rate": 1.9956433841297292e-05, "loss": 1.2315, "mean_token_accuracy": 0.6735812133072407, "step": 1370 }, { "epoch": 0.12726178906936925, "grad_norm": 4.34375, "learning_rate": 1.9954914464848787e-05, "loss": 1.2872, "mean_token_accuracy": 0.6588796477495108, "step": 1375 }, { "epoch": 0.1277245592114397, "grad_norm": 4.1875, "learning_rate": 1.9953369106158436e-05, "loss": 1.2713, "mean_token_accuracy": 0.6659735812133072, "step": 1380 }, { "epoch": 0.1281873293535101, "grad_norm": 4.5, "learning_rate": 1.9951797769259615e-05, "loss": 1.2478, "mean_token_accuracy": 0.6707925636007828, "step": 1385 }, { "epoch": 0.12865009949558054, "grad_norm": 4.625, "learning_rate": 1.9950200458253492e-05, "loss": 1.244, "mean_token_accuracy": 0.6756604696673189, "step": 1390 }, { "epoch": 0.12911286963765098, "grad_norm": 4.21875, "learning_rate": 1.9948577177309042e-05, "loss": 1.1667, "mean_token_accuracy": 0.6892123287671234, "step": 1395 }, { "epoch": 0.12957563977972142, "grad_norm": 4.3125, "learning_rate": 1.9946927930663018e-05, "loss": 1.2698, "mean_token_accuracy": 0.6643101761252448, "step": 1400 }, { "epoch": 0.12957563977972142, "eval_loss": 1.2534548044204712, "eval_mean_token_accuracy": 0.668488564090019, "eval_runtime": 39.7229, "eval_samples_per_second": 25.779, "eval_steps_per_second": 6.445, "step": 1400 }, { "epoch": 0.13003840992179186, "grad_norm": 4.5625, "learning_rate": 1.9945252722619934e-05, "loss": 1.2352, "mean_token_accuracy": 0.672504892367906, "step": 1405 }, { "epoch": 0.13050118006386227, "grad_norm": 4.28125, "learning_rate": 1.9943551557552064e-05, "loss": 1.3246, "mean_token_accuracy": 0.653816046966732, "step": 1410 }, { "epoch": 0.1309639502059327, "grad_norm": 4.03125, "learning_rate": 1.9941824439899444e-05, "loss": 1.2006, "mean_token_accuracy": 0.6818737769080235, "step": 1415 }, { "epoch": 0.13142672034800315, "grad_norm": 3.984375, "learning_rate": 1.9940071374169835e-05, "loss": 1.183, "mean_token_accuracy": 0.6869129158512721, "step": 1420 }, { "epoch": 0.1318894904900736, "grad_norm": 4.59375, "learning_rate": 1.9938292364938717e-05, "loss": 1.2497, "mean_token_accuracy": 0.6707925636007828, "step": 1425 }, { "epoch": 0.132352260632144, "grad_norm": 4.5, "learning_rate": 1.9936487416849293e-05, "loss": 1.2226, "mean_token_accuracy": 0.6761252446183954, "step": 1430 }, { "epoch": 0.13281503077421444, "grad_norm": 4.1875, "learning_rate": 1.993465653461246e-05, "loss": 1.2183, "mean_token_accuracy": 0.6760029354207436, "step": 1435 }, { "epoch": 0.13327780091628488, "grad_norm": 4.15625, "learning_rate": 1.9932799723006813e-05, "loss": 1.1974, "mean_token_accuracy": 0.6828767123287669, "step": 1440 }, { "epoch": 0.13374057105835532, "grad_norm": 4.5, "learning_rate": 1.9930916986878605e-05, "loss": 1.2362, "mean_token_accuracy": 0.6776174168297455, "step": 1445 }, { "epoch": 0.13420334120042574, "grad_norm": 4.46875, "learning_rate": 1.9929008331141764e-05, "loss": 1.1778, "mean_token_accuracy": 0.6907778864970646, "step": 1450 }, { "epoch": 0.13466611134249618, "grad_norm": 4.75, "learning_rate": 1.9927073760777865e-05, "loss": 1.2073, "mean_token_accuracy": 0.6796722113502935, "step": 1455 }, { "epoch": 0.13512888148456662, "grad_norm": 4.4375, "learning_rate": 1.9925113280836124e-05, "loss": 1.2903, "mean_token_accuracy": 0.6593688845401176, "step": 1460 }, { "epoch": 0.13559165162663706, "grad_norm": 4.46875, "learning_rate": 1.9923126896433373e-05, "loss": 1.2572, "mean_token_accuracy": 0.6678816046966731, "step": 1465 }, { "epoch": 0.1360544217687075, "grad_norm": 4.3125, "learning_rate": 1.992111461275406e-05, "loss": 1.2647, "mean_token_accuracy": 0.6663405088062622, "step": 1470 }, { "epoch": 0.1365171919107779, "grad_norm": 4.71875, "learning_rate": 1.9919076435050228e-05, "loss": 1.265, "mean_token_accuracy": 0.6662671232876713, "step": 1475 }, { "epoch": 0.13697996205284835, "grad_norm": 4.34375, "learning_rate": 1.9917012368641497e-05, "loss": 1.1426, "mean_token_accuracy": 0.6974804305283757, "step": 1480 }, { "epoch": 0.1374427321949188, "grad_norm": 3.96875, "learning_rate": 1.9914922418915074e-05, "loss": 1.1435, "mean_token_accuracy": 0.6932240704500979, "step": 1485 }, { "epoch": 0.13790550233698923, "grad_norm": 4.9375, "learning_rate": 1.99128065913257e-05, "loss": 1.2748, "mean_token_accuracy": 0.666536203522505, "step": 1490 }, { "epoch": 0.13836827247905964, "grad_norm": 4.15625, "learning_rate": 1.991066489139568e-05, "loss": 1.3009, "mean_token_accuracy": 0.659637964774951, "step": 1495 }, { "epoch": 0.13883104262113008, "grad_norm": 4.125, "learning_rate": 1.9908497324714816e-05, "loss": 1.103, "mean_token_accuracy": 0.7054794520547942, "step": 1500 }, { "epoch": 0.13883104262113008, "eval_loss": 1.2488734722137451, "eval_mean_token_accuracy": 0.6692682852250483, "eval_runtime": 39.8766, "eval_samples_per_second": 25.679, "eval_steps_per_second": 6.42, "step": 1500 }, { "epoch": 0.13929381276320052, "grad_norm": 4.375, "learning_rate": 1.990630389694045e-05, "loss": 1.2076, "mean_token_accuracy": 0.6800146771037182, "step": 1505 }, { "epoch": 0.13975658290527096, "grad_norm": 4.125, "learning_rate": 1.990408461379741e-05, "loss": 1.1935, "mean_token_accuracy": 0.6789872798434441, "step": 1510 }, { "epoch": 0.14021935304734137, "grad_norm": 4.5625, "learning_rate": 1.9901839481077997e-05, "loss": 1.2708, "mean_token_accuracy": 0.6588062622309198, "step": 1515 }, { "epoch": 0.1406821231894118, "grad_norm": 4.21875, "learning_rate": 1.9899568504641993e-05, "loss": 1.2558, "mean_token_accuracy": 0.6655577299412916, "step": 1520 }, { "epoch": 0.14114489333148225, "grad_norm": 4.0625, "learning_rate": 1.9897271690416634e-05, "loss": 1.2519, "mean_token_accuracy": 0.6701810176125247, "step": 1525 }, { "epoch": 0.1416076634735527, "grad_norm": 4.21875, "learning_rate": 1.9894949044396574e-05, "loss": 1.1665, "mean_token_accuracy": 0.6921477495107633, "step": 1530 }, { "epoch": 0.14207043361562313, "grad_norm": 4.9375, "learning_rate": 1.9892600572643905e-05, "loss": 1.215, "mean_token_accuracy": 0.6756604696673192, "step": 1535 }, { "epoch": 0.14253320375769354, "grad_norm": 4.5, "learning_rate": 1.989022628128812e-05, "loss": 1.2807, "mean_token_accuracy": 0.6666585127201565, "step": 1540 }, { "epoch": 0.14299597389976398, "grad_norm": 4.53125, "learning_rate": 1.98878261765261e-05, "loss": 1.1631, "mean_token_accuracy": 0.6943493150684932, "step": 1545 }, { "epoch": 0.14345874404183442, "grad_norm": 4.8125, "learning_rate": 1.988540026462209e-05, "loss": 1.1955, "mean_token_accuracy": 0.6812133072407045, "step": 1550 }, { "epoch": 0.14392151418390486, "grad_norm": 4.21875, "learning_rate": 1.9882948551907705e-05, "loss": 1.1824, "mean_token_accuracy": 0.6856164383561645, "step": 1555 }, { "epoch": 0.14438428432597528, "grad_norm": 4.4375, "learning_rate": 1.9880471044781898e-05, "loss": 1.1677, "mean_token_accuracy": 0.6863747553816049, "step": 1560 }, { "epoch": 0.14484705446804572, "grad_norm": 4.46875, "learning_rate": 1.987796774971093e-05, "loss": 1.2887, "mean_token_accuracy": 0.6609589041095891, "step": 1565 }, { "epoch": 0.14530982461011616, "grad_norm": 4.75, "learning_rate": 1.9875438673228393e-05, "loss": 1.1879, "mean_token_accuracy": 0.6843199608610566, "step": 1570 }, { "epoch": 0.1457725947521866, "grad_norm": 4.375, "learning_rate": 1.987288382193515e-05, "loss": 1.2357, "mean_token_accuracy": 0.6737524461839531, "step": 1575 }, { "epoch": 0.14623536489425704, "grad_norm": 4.375, "learning_rate": 1.9870303202499337e-05, "loss": 1.1717, "mean_token_accuracy": 0.6880626223091977, "step": 1580 }, { "epoch": 0.14669813503632745, "grad_norm": 4.125, "learning_rate": 1.986769682165635e-05, "loss": 1.2605, "mean_token_accuracy": 0.6651663405088062, "step": 1585 }, { "epoch": 0.1471609051783979, "grad_norm": 4.53125, "learning_rate": 1.9865064686208826e-05, "loss": 1.2681, "mean_token_accuracy": 0.6624021526418787, "step": 1590 }, { "epoch": 0.14762367532046833, "grad_norm": 4.40625, "learning_rate": 1.9862406803026612e-05, "loss": 1.2151, "mean_token_accuracy": 0.6765655577299413, "step": 1595 }, { "epoch": 0.14808644546253877, "grad_norm": 4.5, "learning_rate": 1.985972317904676e-05, "loss": 1.2333, "mean_token_accuracy": 0.6714530332681017, "step": 1600 }, { "epoch": 0.14808644546253877, "eval_loss": 1.2439508438110352, "eval_mean_token_accuracy": 0.6705831090998038, "eval_runtime": 39.8966, "eval_samples_per_second": 25.666, "eval_steps_per_second": 6.417, "step": 1600 }, { "epoch": 0.14854921560460918, "grad_norm": 4.34375, "learning_rate": 1.985701382127351e-05, "loss": 1.2186, "mean_token_accuracy": 0.6786937377690802, "step": 1605 }, { "epoch": 0.14901198574667962, "grad_norm": 4.0625, "learning_rate": 1.9854278736778257e-05, "loss": 1.1686, "mean_token_accuracy": 0.6852984344422699, "step": 1610 }, { "epoch": 0.14947475588875006, "grad_norm": 4.40625, "learning_rate": 1.985151793269955e-05, "loss": 1.1864, "mean_token_accuracy": 0.6855185909980431, "step": 1615 }, { "epoch": 0.1499375260308205, "grad_norm": 4.875, "learning_rate": 1.9848731416243066e-05, "loss": 1.1908, "mean_token_accuracy": 0.6853228962818001, "step": 1620 }, { "epoch": 0.1504002961728909, "grad_norm": 4.46875, "learning_rate": 1.984591919468159e-05, "loss": 1.2216, "mean_token_accuracy": 0.6708659491193737, "step": 1625 }, { "epoch": 0.15086306631496135, "grad_norm": 5.375, "learning_rate": 1.9843081275354993e-05, "loss": 1.1569, "mean_token_accuracy": 0.6921966731898238, "step": 1630 }, { "epoch": 0.1513258364570318, "grad_norm": 4.34375, "learning_rate": 1.984021766567022e-05, "loss": 1.1545, "mean_token_accuracy": 0.6923923679060666, "step": 1635 }, { "epoch": 0.15178860659910223, "grad_norm": 4.5625, "learning_rate": 1.983732837310127e-05, "loss": 1.2111, "mean_token_accuracy": 0.6803571428571429, "step": 1640 }, { "epoch": 0.15225137674117267, "grad_norm": 4.5, "learning_rate": 1.983441340518917e-05, "loss": 1.2791, "mean_token_accuracy": 0.664774951076321, "step": 1645 }, { "epoch": 0.15271414688324308, "grad_norm": 4.4375, "learning_rate": 1.9831472769541967e-05, "loss": 1.1672, "mean_token_accuracy": 0.6898727984344422, "step": 1650 }, { "epoch": 0.15317691702531352, "grad_norm": 4.5625, "learning_rate": 1.9828506473834687e-05, "loss": 1.2461, "mean_token_accuracy": 0.6692025440313112, "step": 1655 }, { "epoch": 0.15363968716738396, "grad_norm": 4.0, "learning_rate": 1.9825514525809343e-05, "loss": 1.182, "mean_token_accuracy": 0.681971624266145, "step": 1660 }, { "epoch": 0.1541024573094544, "grad_norm": 4.0, "learning_rate": 1.982249693327489e-05, "loss": 1.311, "mean_token_accuracy": 0.6569960861056752, "step": 1665 }, { "epoch": 0.15456522745152482, "grad_norm": 4.71875, "learning_rate": 1.981945370410722e-05, "loss": 1.1807, "mean_token_accuracy": 0.6848336594911937, "step": 1670 }, { "epoch": 0.15502799759359526, "grad_norm": 4.21875, "learning_rate": 1.9816384846249132e-05, "loss": 1.2897, "mean_token_accuracy": 0.658757338551859, "step": 1675 }, { "epoch": 0.1554907677356657, "grad_norm": 4.34375, "learning_rate": 1.9813290367710323e-05, "loss": 1.2758, "mean_token_accuracy": 0.6623532289628181, "step": 1680 }, { "epoch": 0.15595353787773614, "grad_norm": 4.0625, "learning_rate": 1.9810170276567357e-05, "loss": 1.2056, "mean_token_accuracy": 0.6785714285714286, "step": 1685 }, { "epoch": 0.15641630801980655, "grad_norm": 4.3125, "learning_rate": 1.980702458096364e-05, "loss": 1.2228, "mean_token_accuracy": 0.674779843444227, "step": 1690 }, { "epoch": 0.156879078161877, "grad_norm": 4.75, "learning_rate": 1.9803853289109414e-05, "loss": 1.2346, "mean_token_accuracy": 0.6711594911937377, "step": 1695 }, { "epoch": 0.15734184830394743, "grad_norm": 4.34375, "learning_rate": 1.9800656409281724e-05, "loss": 1.198, "mean_token_accuracy": 0.6809197651663406, "step": 1700 }, { "epoch": 0.15734184830394743, "eval_loss": 1.2401008605957031, "eval_mean_token_accuracy": 0.6709366591242657, "eval_runtime": 39.8123, "eval_samples_per_second": 25.721, "eval_steps_per_second": 6.43, "step": 1700 }, { "epoch": 0.15780461844601787, "grad_norm": 5.4375, "learning_rate": 1.9797433949824398e-05, "loss": 1.2026, "mean_token_accuracy": 0.6789383561643836, "step": 1705 }, { "epoch": 0.1582673885880883, "grad_norm": 4.21875, "learning_rate": 1.9794185919148028e-05, "loss": 1.2445, "mean_token_accuracy": 0.6690802348336593, "step": 1710 }, { "epoch": 0.15873015873015872, "grad_norm": 4.375, "learning_rate": 1.9790912325729945e-05, "loss": 1.2332, "mean_token_accuracy": 0.6724804305283757, "step": 1715 }, { "epoch": 0.15919292887222916, "grad_norm": 4.4375, "learning_rate": 1.9787613178114202e-05, "loss": 1.2096, "mean_token_accuracy": 0.6802837573385518, "step": 1720 }, { "epoch": 0.1596556990142996, "grad_norm": 4.375, "learning_rate": 1.9784288484911546e-05, "loss": 1.2394, "mean_token_accuracy": 0.6690313111545988, "step": 1725 }, { "epoch": 0.16011846915637004, "grad_norm": 4.65625, "learning_rate": 1.9780938254799396e-05, "loss": 1.1383, "mean_token_accuracy": 0.6969422700587085, "step": 1730 }, { "epoch": 0.16058123929844045, "grad_norm": 4.9375, "learning_rate": 1.9777562496521824e-05, "loss": 1.1865, "mean_token_accuracy": 0.6829500978473579, "step": 1735 }, { "epoch": 0.1610440094405109, "grad_norm": 4.1875, "learning_rate": 1.9774161218889534e-05, "loss": 1.2608, "mean_token_accuracy": 0.6699853228962818, "step": 1740 }, { "epoch": 0.16150677958258133, "grad_norm": 4.40625, "learning_rate": 1.977073443077983e-05, "loss": 1.2871, "mean_token_accuracy": 0.658170254403131, "step": 1745 }, { "epoch": 0.16196954972465177, "grad_norm": 4.65625, "learning_rate": 1.9767282141136594e-05, "loss": 1.253, "mean_token_accuracy": 0.6665117416829747, "step": 1750 }, { "epoch": 0.16243231986672219, "grad_norm": 4.25, "learning_rate": 1.976380435897028e-05, "loss": 1.1974, "mean_token_accuracy": 0.6822407045009785, "step": 1755 }, { "epoch": 0.16289509000879263, "grad_norm": 4.65625, "learning_rate": 1.9760301093357868e-05, "loss": 1.1773, "mean_token_accuracy": 0.6848825831702545, "step": 1760 }, { "epoch": 0.16335786015086307, "grad_norm": 4.09375, "learning_rate": 1.9756772353442848e-05, "loss": 1.1438, "mean_token_accuracy": 0.6928082191780822, "step": 1765 }, { "epoch": 0.1638206302929335, "grad_norm": 4.0625, "learning_rate": 1.9753218148435207e-05, "loss": 1.188, "mean_token_accuracy": 0.6820939334637964, "step": 1770 }, { "epoch": 0.16428340043500395, "grad_norm": 5.21875, "learning_rate": 1.9749638487611384e-05, "loss": 1.2193, "mean_token_accuracy": 0.6788649706457925, "step": 1775 }, { "epoch": 0.16474617057707436, "grad_norm": 4.03125, "learning_rate": 1.9746033380314267e-05, "loss": 1.1871, "mean_token_accuracy": 0.6829990215264188, "step": 1780 }, { "epoch": 0.1652089407191448, "grad_norm": 4.3125, "learning_rate": 1.9742402835953155e-05, "loss": 1.2219, "mean_token_accuracy": 0.674412915851272, "step": 1785 }, { "epoch": 0.16567171086121524, "grad_norm": 4.34375, "learning_rate": 1.9738746864003734e-05, "loss": 1.1933, "mean_token_accuracy": 0.6801369863013698, "step": 1790 }, { "epoch": 0.16613448100328568, "grad_norm": 4.53125, "learning_rate": 1.973506547400806e-05, "loss": 1.1775, "mean_token_accuracy": 0.6884540117416831, "step": 1795 }, { "epoch": 0.1665972511453561, "grad_norm": 4.4375, "learning_rate": 1.9731358675574536e-05, "loss": 1.2055, "mean_token_accuracy": 0.6869863013698628, "step": 1800 }, { "epoch": 0.1665972511453561, "eval_loss": 1.236541986465454, "eval_mean_token_accuracy": 0.67160935971135, "eval_runtime": 39.7098, "eval_samples_per_second": 25.787, "eval_steps_per_second": 6.447, "step": 1800 }, { "epoch": 0.16706002128742653, "grad_norm": 4.0, "learning_rate": 1.972762647837787e-05, "loss": 1.0749, "mean_token_accuracy": 0.7116682974559686, "step": 1805 }, { "epoch": 0.16752279142949697, "grad_norm": 4.3125, "learning_rate": 1.9723868892159063e-05, "loss": 1.2015, "mean_token_accuracy": 0.6785714285714286, "step": 1810 }, { "epoch": 0.1679855615715674, "grad_norm": 4.40625, "learning_rate": 1.9720085926725385e-05, "loss": 1.1918, "mean_token_accuracy": 0.6816780821917808, "step": 1815 }, { "epoch": 0.16844833171363785, "grad_norm": 4.34375, "learning_rate": 1.9716277591950344e-05, "loss": 1.1694, "mean_token_accuracy": 0.6858610567514678, "step": 1820 }, { "epoch": 0.16891110185570826, "grad_norm": 4.3125, "learning_rate": 1.9712443897773662e-05, "loss": 1.2039, "mean_token_accuracy": 0.6765900195694716, "step": 1825 }, { "epoch": 0.1693738719977787, "grad_norm": 4.21875, "learning_rate": 1.9708584854201244e-05, "loss": 1.2676, "mean_token_accuracy": 0.6628669275929551, "step": 1830 }, { "epoch": 0.16983664213984914, "grad_norm": 5.0625, "learning_rate": 1.9704700471305166e-05, "loss": 1.1969, "mean_token_accuracy": 0.6820450097847358, "step": 1835 }, { "epoch": 0.17029941228191958, "grad_norm": 4.28125, "learning_rate": 1.9700790759223632e-05, "loss": 1.1753, "mean_token_accuracy": 0.6837328767123287, "step": 1840 }, { "epoch": 0.17076218242399, "grad_norm": 5.0625, "learning_rate": 1.9696855728160955e-05, "loss": 1.2541, "mean_token_accuracy": 0.6695205479452055, "step": 1845 }, { "epoch": 0.17122495256606043, "grad_norm": 4.09375, "learning_rate": 1.9692895388387536e-05, "loss": 1.1385, "mean_token_accuracy": 0.6920254403131115, "step": 1850 }, { "epoch": 0.17168772270813087, "grad_norm": 4.84375, "learning_rate": 1.9688909750239828e-05, "loss": 1.2116, "mean_token_accuracy": 0.6771037181996087, "step": 1855 }, { "epoch": 0.17215049285020131, "grad_norm": 4.375, "learning_rate": 1.9684898824120306e-05, "loss": 1.2386, "mean_token_accuracy": 0.670229941291585, "step": 1860 }, { "epoch": 0.17261326299227173, "grad_norm": 4.4375, "learning_rate": 1.968086262049746e-05, "loss": 1.2375, "mean_token_accuracy": 0.6715508806262231, "step": 1865 }, { "epoch": 0.17307603313434217, "grad_norm": 4.15625, "learning_rate": 1.967680114990575e-05, "loss": 1.1967, "mean_token_accuracy": 0.6807729941291585, "step": 1870 }, { "epoch": 0.1735388032764126, "grad_norm": 4.34375, "learning_rate": 1.967271442294557e-05, "loss": 1.239, "mean_token_accuracy": 0.6718199608610568, "step": 1875 }, { "epoch": 0.17400157341848305, "grad_norm": 4.375, "learning_rate": 1.966860245028325e-05, "loss": 1.209, "mean_token_accuracy": 0.6776908023483366, "step": 1880 }, { "epoch": 0.1744643435605535, "grad_norm": 4.34375, "learning_rate": 1.9664465242650998e-05, "loss": 1.234, "mean_token_accuracy": 0.673679060665362, "step": 1885 }, { "epoch": 0.1749271137026239, "grad_norm": 4.1875, "learning_rate": 1.9660302810846894e-05, "loss": 1.1381, "mean_token_accuracy": 0.6967710371819961, "step": 1890 }, { "epoch": 0.17538988384469434, "grad_norm": 4.5, "learning_rate": 1.965611516573485e-05, "loss": 1.2074, "mean_token_accuracy": 0.67353228962818, "step": 1895 }, { "epoch": 0.17585265398676478, "grad_norm": 4.1875, "learning_rate": 1.9651902318244582e-05, "loss": 1.1599, "mean_token_accuracy": 0.6913649706457926, "step": 1900 }, { "epoch": 0.17585265398676478, "eval_loss": 1.2331769466400146, "eval_mean_token_accuracy": 0.6721826840753423, "eval_runtime": 39.896, "eval_samples_per_second": 25.667, "eval_steps_per_second": 6.417, "step": 1900 }, { "epoch": 0.17631542412883522, "grad_norm": 4.4375, "learning_rate": 1.9647664279371588e-05, "loss": 1.239, "mean_token_accuracy": 0.671404109589041, "step": 1905 }, { "epoch": 0.17677819427090563, "grad_norm": 4.4375, "learning_rate": 1.9643401060177112e-05, "loss": 1.2225, "mean_token_accuracy": 0.672871819960861, "step": 1910 }, { "epoch": 0.17724096441297607, "grad_norm": 4.21875, "learning_rate": 1.963911267178812e-05, "loss": 1.1515, "mean_token_accuracy": 0.6943982387475539, "step": 1915 }, { "epoch": 0.1777037345550465, "grad_norm": 4.5, "learning_rate": 1.963479912539727e-05, "loss": 1.1714, "mean_token_accuracy": 0.6890410958904111, "step": 1920 }, { "epoch": 0.17816650469711695, "grad_norm": 4.28125, "learning_rate": 1.963046043226288e-05, "loss": 1.1234, "mean_token_accuracy": 0.7015900195694718, "step": 1925 }, { "epoch": 0.17862927483918736, "grad_norm": 4.4375, "learning_rate": 1.96260966037089e-05, "loss": 1.1443, "mean_token_accuracy": 0.6961594911937377, "step": 1930 }, { "epoch": 0.1790920449812578, "grad_norm": 4.34375, "learning_rate": 1.962170765112489e-05, "loss": 1.1564, "mean_token_accuracy": 0.6899706457925635, "step": 1935 }, { "epoch": 0.17955481512332824, "grad_norm": 4.15625, "learning_rate": 1.9617293585965975e-05, "loss": 1.1643, "mean_token_accuracy": 0.688747553816047, "step": 1940 }, { "epoch": 0.18001758526539868, "grad_norm": 4.71875, "learning_rate": 1.9612854419752833e-05, "loss": 1.2113, "mean_token_accuracy": 0.6779109589041096, "step": 1945 }, { "epoch": 0.18048035540746912, "grad_norm": 4.15625, "learning_rate": 1.9608390164071643e-05, "loss": 1.1627, "mean_token_accuracy": 0.6896281800391388, "step": 1950 }, { "epoch": 0.18094312554953954, "grad_norm": 4.125, "learning_rate": 1.9603900830574078e-05, "loss": 1.1634, "mean_token_accuracy": 0.6849804305283757, "step": 1955 }, { "epoch": 0.18140589569160998, "grad_norm": 4.375, "learning_rate": 1.9599386430977258e-05, "loss": 1.2325, "mean_token_accuracy": 0.6748776908023484, "step": 1960 }, { "epoch": 0.18186866583368042, "grad_norm": 4.34375, "learning_rate": 1.959484697706373e-05, "loss": 1.1851, "mean_token_accuracy": 0.6844911937377691, "step": 1965 }, { "epoch": 0.18233143597575086, "grad_norm": 4.46875, "learning_rate": 1.9590282480681428e-05, "loss": 1.1552, "mean_token_accuracy": 0.6858610567514677, "step": 1970 }, { "epoch": 0.18279420611782127, "grad_norm": 4.6875, "learning_rate": 1.958569295374365e-05, "loss": 1.1512, "mean_token_accuracy": 0.6905577299412917, "step": 1975 }, { "epoch": 0.1832569762598917, "grad_norm": 4.15625, "learning_rate": 1.958107840822902e-05, "loss": 1.2847, "mean_token_accuracy": 0.6599315068493151, "step": 1980 }, { "epoch": 0.18371974640196215, "grad_norm": 4.0, "learning_rate": 1.957643885618147e-05, "loss": 1.1768, "mean_token_accuracy": 0.6828277886497064, "step": 1985 }, { "epoch": 0.1841825165440326, "grad_norm": 4.125, "learning_rate": 1.9571774309710177e-05, "loss": 1.1908, "mean_token_accuracy": 0.6833904109589042, "step": 1990 }, { "epoch": 0.184645286686103, "grad_norm": 4.34375, "learning_rate": 1.9567084780989582e-05, "loss": 1.1492, "mean_token_accuracy": 0.6919520547945206, "step": 1995 }, { "epoch": 0.18510805682817344, "grad_norm": 4.8125, "learning_rate": 1.956237028225931e-05, "loss": 1.2044, "mean_token_accuracy": 0.6780332681017612, "step": 2000 }, { "epoch": 0.18510805682817344, "eval_loss": 1.2301238775253296, "eval_mean_token_accuracy": 0.673044581702544, "eval_runtime": 39.7843, "eval_samples_per_second": 25.739, "eval_steps_per_second": 6.435, "step": 2000 }, { "epoch": 0.18557082697024388, "grad_norm": 4.375, "learning_rate": 1.9557630825824156e-05, "loss": 1.1047, "mean_token_accuracy": 0.7028375733855187, "step": 2005 }, { "epoch": 0.18603359711231432, "grad_norm": 3.875, "learning_rate": 1.955286642405407e-05, "loss": 1.1196, "mean_token_accuracy": 0.6988502935420743, "step": 2010 }, { "epoch": 0.18649636725438476, "grad_norm": 4.3125, "learning_rate": 1.954807708938409e-05, "loss": 1.2151, "mean_token_accuracy": 0.6745107632093934, "step": 2015 }, { "epoch": 0.18695913739645517, "grad_norm": 4.125, "learning_rate": 1.9543262834314354e-05, "loss": 1.21, "mean_token_accuracy": 0.6770547945205478, "step": 2020 }, { "epoch": 0.1874219075385256, "grad_norm": 4.0625, "learning_rate": 1.9538423671410012e-05, "loss": 1.1642, "mean_token_accuracy": 0.6916095890410958, "step": 2025 }, { "epoch": 0.18788467768059605, "grad_norm": 5.15625, "learning_rate": 1.9533559613301243e-05, "loss": 1.1506, "mean_token_accuracy": 0.693688845401174, "step": 2030 }, { "epoch": 0.1883474478226665, "grad_norm": 5.53125, "learning_rate": 1.95286706726832e-05, "loss": 1.1839, "mean_token_accuracy": 0.6864726027397258, "step": 2035 }, { "epoch": 0.1888102179647369, "grad_norm": 4.1875, "learning_rate": 1.952375686231597e-05, "loss": 1.1747, "mean_token_accuracy": 0.6889677103718199, "step": 2040 }, { "epoch": 0.18927298810680734, "grad_norm": 4.25, "learning_rate": 1.9518818195024565e-05, "loss": 1.2063, "mean_token_accuracy": 0.6751956947162426, "step": 2045 }, { "epoch": 0.18973575824887778, "grad_norm": 4.28125, "learning_rate": 1.9513854683698853e-05, "loss": 1.2321, "mean_token_accuracy": 0.6737035225048924, "step": 2050 }, { "epoch": 0.19019852839094822, "grad_norm": 4.1875, "learning_rate": 1.950886634129357e-05, "loss": 1.2307, "mean_token_accuracy": 0.667270058708415, "step": 2055 }, { "epoch": 0.19066129853301864, "grad_norm": 5.03125, "learning_rate": 1.9503853180828235e-05, "loss": 1.2186, "mean_token_accuracy": 0.6739481409001957, "step": 2060 }, { "epoch": 0.19112406867508908, "grad_norm": 5.625, "learning_rate": 1.9498815215387158e-05, "loss": 1.2363, "mean_token_accuracy": 0.6727495107632093, "step": 2065 }, { "epoch": 0.19158683881715952, "grad_norm": 4.03125, "learning_rate": 1.949375245811939e-05, "loss": 1.1657, "mean_token_accuracy": 0.6924902152641879, "step": 2070 }, { "epoch": 0.19204960895922996, "grad_norm": 4.21875, "learning_rate": 1.9488664922238682e-05, "loss": 1.2021, "mean_token_accuracy": 0.6820205479452055, "step": 2075 }, { "epoch": 0.1925123791013004, "grad_norm": 4.4375, "learning_rate": 1.9483552621023455e-05, "loss": 1.1727, "mean_token_accuracy": 0.6861790606653619, "step": 2080 }, { "epoch": 0.1929751492433708, "grad_norm": 5.40625, "learning_rate": 1.9478415567816777e-05, "loss": 1.2184, "mean_token_accuracy": 0.6777641878669275, "step": 2085 }, { "epoch": 0.19343791938544125, "grad_norm": 4.125, "learning_rate": 1.9473253776026308e-05, "loss": 1.2045, "mean_token_accuracy": 0.6801614481409001, "step": 2090 }, { "epoch": 0.1939006895275117, "grad_norm": 4.21875, "learning_rate": 1.9468067259124286e-05, "loss": 1.2116, "mean_token_accuracy": 0.6786692759295498, "step": 2095 }, { "epoch": 0.19436345966958213, "grad_norm": 4.09375, "learning_rate": 1.946285603064747e-05, "loss": 1.1809, "mean_token_accuracy": 0.6827299412915852, "step": 2100 }, { "epoch": 0.19436345966958213, "eval_loss": 1.227554202079773, "eval_mean_token_accuracy": 0.6730140044031308, "eval_runtime": 39.7917, "eval_samples_per_second": 25.734, "eval_steps_per_second": 6.433, "step": 2100 }, { "epoch": 0.19482622981165254, "grad_norm": 4.15625, "learning_rate": 1.945762010419713e-05, "loss": 1.1658, "mean_token_accuracy": 0.6908512720156554, "step": 2105 }, { "epoch": 0.19528899995372298, "grad_norm": 4.15625, "learning_rate": 1.9452359493438984e-05, "loss": 1.1841, "mean_token_accuracy": 0.6876712328767122, "step": 2110 }, { "epoch": 0.19575177009579342, "grad_norm": 4.0625, "learning_rate": 1.944707421210318e-05, "loss": 1.1427, "mean_token_accuracy": 0.6944716242661448, "step": 2115 }, { "epoch": 0.19621454023786386, "grad_norm": 4.1875, "learning_rate": 1.9441764273984258e-05, "loss": 1.1754, "mean_token_accuracy": 0.6829990215264188, "step": 2120 }, { "epoch": 0.1966773103799343, "grad_norm": 4.28125, "learning_rate": 1.9436429692941115e-05, "loss": 1.2333, "mean_token_accuracy": 0.6692025440313112, "step": 2125 }, { "epoch": 0.1971400805220047, "grad_norm": 4.25, "learning_rate": 1.943107048289696e-05, "loss": 1.2123, "mean_token_accuracy": 0.6787426614481409, "step": 2130 }, { "epoch": 0.19760285066407515, "grad_norm": 4.3125, "learning_rate": 1.9425686657839276e-05, "loss": 1.1469, "mean_token_accuracy": 0.6938111545988258, "step": 2135 }, { "epoch": 0.1980656208061456, "grad_norm": 4.3125, "learning_rate": 1.9420278231819817e-05, "loss": 1.1139, "mean_token_accuracy": 0.6993884540117418, "step": 2140 }, { "epoch": 0.19852839094821603, "grad_norm": 4.15625, "learning_rate": 1.9414845218954523e-05, "loss": 1.184, "mean_token_accuracy": 0.681824853228963, "step": 2145 }, { "epoch": 0.19899116109028644, "grad_norm": 4.78125, "learning_rate": 1.9409387633423505e-05, "loss": 1.2639, "mean_token_accuracy": 0.6666340508806262, "step": 2150 }, { "epoch": 0.19945393123235688, "grad_norm": 4.21875, "learning_rate": 1.940390548947102e-05, "loss": 1.1923, "mean_token_accuracy": 0.6825342465753425, "step": 2155 }, { "epoch": 0.19991670137442732, "grad_norm": 4.0, "learning_rate": 1.9398398801405415e-05, "loss": 1.2008, "mean_token_accuracy": 0.6853718199608612, "step": 2160 }, { "epoch": 0.20037947151649776, "grad_norm": 5.5, "learning_rate": 1.93928675835991e-05, "loss": 1.1856, "mean_token_accuracy": 0.6818003913894324, "step": 2165 }, { "epoch": 0.20084224165856818, "grad_norm": 4.21875, "learning_rate": 1.9387311850488506e-05, "loss": 1.211, "mean_token_accuracy": 0.6749755381604697, "step": 2170 }, { "epoch": 0.20130501180063862, "grad_norm": 4.21875, "learning_rate": 1.938173161657405e-05, "loss": 1.1485, "mean_token_accuracy": 0.6929549902152641, "step": 2175 }, { "epoch": 0.20176778194270906, "grad_norm": 4.8125, "learning_rate": 1.937612689642009e-05, "loss": 1.1715, "mean_token_accuracy": 0.6864236790606654, "step": 2180 }, { "epoch": 0.2022305520847795, "grad_norm": 4.375, "learning_rate": 1.9370497704654907e-05, "loss": 1.2419, "mean_token_accuracy": 0.6695939334637966, "step": 2185 }, { "epoch": 0.20269332222684994, "grad_norm": 4.34375, "learning_rate": 1.9364844055970638e-05, "loss": 1.1867, "mean_token_accuracy": 0.6798923679060664, "step": 2190 }, { "epoch": 0.20315609236892035, "grad_norm": 4.40625, "learning_rate": 1.9359165965123255e-05, "loss": 1.1762, "mean_token_accuracy": 0.6869373776908023, "step": 2195 }, { "epoch": 0.2036188625109908, "grad_norm": 4.375, "learning_rate": 1.9353463446932532e-05, "loss": 1.2021, "mean_token_accuracy": 0.6762720156555773, "step": 2200 }, { "epoch": 0.2036188625109908, "eval_loss": 1.2250375747680664, "eval_mean_token_accuracy": 0.6738051920254395, "eval_runtime": 39.5646, "eval_samples_per_second": 25.882, "eval_steps_per_second": 6.47, "step": 2200 }, { "epoch": 0.20408163265306123, "grad_norm": 4.46875, "learning_rate": 1.934773651628199e-05, "loss": 1.2786, "mean_token_accuracy": 0.6596868884540117, "step": 2205 }, { "epoch": 0.20454440279513167, "grad_norm": 5.625, "learning_rate": 1.934198518811887e-05, "loss": 1.1614, "mean_token_accuracy": 0.6879892367906068, "step": 2210 }, { "epoch": 0.20500717293720208, "grad_norm": 4.15625, "learning_rate": 1.933620947745409e-05, "loss": 1.2162, "mean_token_accuracy": 0.6759540117416828, "step": 2215 }, { "epoch": 0.20546994307927252, "grad_norm": 4.59375, "learning_rate": 1.9330409399362203e-05, "loss": 1.2219, "mean_token_accuracy": 0.6736790606653621, "step": 2220 }, { "epoch": 0.20593271322134296, "grad_norm": 4.25, "learning_rate": 1.9324584968981366e-05, "loss": 1.1338, "mean_token_accuracy": 0.6967465753424656, "step": 2225 }, { "epoch": 0.2063954833634134, "grad_norm": 4.09375, "learning_rate": 1.931873620151329e-05, "loss": 1.1796, "mean_token_accuracy": 0.6855185909980431, "step": 2230 }, { "epoch": 0.2068582535054838, "grad_norm": 4.15625, "learning_rate": 1.9312863112223214e-05, "loss": 1.2492, "mean_token_accuracy": 0.6671232876712329, "step": 2235 }, { "epoch": 0.20732102364755425, "grad_norm": 4.40625, "learning_rate": 1.9306965716439844e-05, "loss": 1.2456, "mean_token_accuracy": 0.6697162426614482, "step": 2240 }, { "epoch": 0.2077837937896247, "grad_norm": 4.1875, "learning_rate": 1.9301044029555335e-05, "loss": 1.1584, "mean_token_accuracy": 0.6909980430528375, "step": 2245 }, { "epoch": 0.20824656393169513, "grad_norm": 4.125, "learning_rate": 1.9295098067025245e-05, "loss": 1.1143, "mean_token_accuracy": 0.6982142857142858, "step": 2250 }, { "epoch": 0.20870933407376557, "grad_norm": 3.953125, "learning_rate": 1.9289127844368475e-05, "loss": 1.166, "mean_token_accuracy": 0.6900929549902155, "step": 2255 }, { "epoch": 0.20917210421583599, "grad_norm": 4.40625, "learning_rate": 1.9283133377167263e-05, "loss": 1.2225, "mean_token_accuracy": 0.6758317025440312, "step": 2260 }, { "epoch": 0.20963487435790643, "grad_norm": 4.03125, "learning_rate": 1.9277114681067114e-05, "loss": 1.159, "mean_token_accuracy": 0.6939090019569472, "step": 2265 }, { "epoch": 0.21009764449997687, "grad_norm": 3.90625, "learning_rate": 1.9271071771776773e-05, "loss": 1.1675, "mean_token_accuracy": 0.6902397260273974, "step": 2270 }, { "epoch": 0.2105604146420473, "grad_norm": 4.21875, "learning_rate": 1.9265004665068187e-05, "loss": 1.2309, "mean_token_accuracy": 0.6719667318982387, "step": 2275 }, { "epoch": 0.21102318478411772, "grad_norm": 3.953125, "learning_rate": 1.9258913376776445e-05, "loss": 1.1761, "mean_token_accuracy": 0.6864970645792564, "step": 2280 }, { "epoch": 0.21148595492618816, "grad_norm": 4.25, "learning_rate": 1.925279792279976e-05, "loss": 1.2178, "mean_token_accuracy": 0.6745841487279843, "step": 2285 }, { "epoch": 0.2119487250682586, "grad_norm": 4.25, "learning_rate": 1.924665831909942e-05, "loss": 1.1223, "mean_token_accuracy": 0.7006115459882584, "step": 2290 }, { "epoch": 0.21241149521032904, "grad_norm": 4.28125, "learning_rate": 1.9240494581699734e-05, "loss": 1.2013, "mean_token_accuracy": 0.6750489236790608, "step": 2295 }, { "epoch": 0.21287426535239945, "grad_norm": 4.3125, "learning_rate": 1.9234306726688007e-05, "loss": 1.159, "mean_token_accuracy": 0.6878669275929549, "step": 2300 }, { "epoch": 0.21287426535239945, "eval_loss": 1.2222580909729004, "eval_mean_token_accuracy": 0.6744492263943239, "eval_runtime": 41.1133, "eval_samples_per_second": 24.907, "eval_steps_per_second": 6.227, "step": 2300 }, { "epoch": 0.2133370354944699, "grad_norm": 5.84375, "learning_rate": 1.922809477021448e-05, "loss": 1.161, "mean_token_accuracy": 0.6889432485322897, "step": 2305 }, { "epoch": 0.21379980563654033, "grad_norm": 4.1875, "learning_rate": 1.9221858728492314e-05, "loss": 1.1712, "mean_token_accuracy": 0.687793542074364, "step": 2310 }, { "epoch": 0.21426257577861077, "grad_norm": 4.09375, "learning_rate": 1.921559861779752e-05, "loss": 1.1951, "mean_token_accuracy": 0.6821183953033269, "step": 2315 }, { "epoch": 0.2147253459206812, "grad_norm": 4.1875, "learning_rate": 1.9209314454468935e-05, "loss": 1.2735, "mean_token_accuracy": 0.6611545988258318, "step": 2320 }, { "epoch": 0.21518811606275162, "grad_norm": 4.15625, "learning_rate": 1.9203006254908178e-05, "loss": 1.2006, "mean_token_accuracy": 0.6759784735812133, "step": 2325 }, { "epoch": 0.21565088620482206, "grad_norm": 4.21875, "learning_rate": 1.919667403557959e-05, "loss": 1.1242, "mean_token_accuracy": 0.6981898238747555, "step": 2330 }, { "epoch": 0.2161136563468925, "grad_norm": 4.71875, "learning_rate": 1.9190317813010203e-05, "loss": 1.2081, "mean_token_accuracy": 0.6782045009784736, "step": 2335 }, { "epoch": 0.21657642648896294, "grad_norm": 4.53125, "learning_rate": 1.9183937603789715e-05, "loss": 1.1759, "mean_token_accuracy": 0.6840753424657533, "step": 2340 }, { "epoch": 0.21703919663103335, "grad_norm": 4.375, "learning_rate": 1.9177533424570414e-05, "loss": 1.2159, "mean_token_accuracy": 0.675587084148728, "step": 2345 }, { "epoch": 0.2175019667731038, "grad_norm": 4.125, "learning_rate": 1.9171105292067152e-05, "loss": 1.2236, "mean_token_accuracy": 0.676394324853229, "step": 2350 }, { "epoch": 0.21796473691517423, "grad_norm": 4.28125, "learning_rate": 1.91646532230573e-05, "loss": 1.1738, "mean_token_accuracy": 0.6844911937377691, "step": 2355 }, { "epoch": 0.21842750705724467, "grad_norm": 5.03125, "learning_rate": 1.9158177234380707e-05, "loss": 1.2077, "mean_token_accuracy": 0.6828522504892367, "step": 2360 }, { "epoch": 0.2188902771993151, "grad_norm": 4.28125, "learning_rate": 1.9151677342939637e-05, "loss": 1.2057, "mean_token_accuracy": 0.6772994129158513, "step": 2365 }, { "epoch": 0.21935304734138553, "grad_norm": 4.0625, "learning_rate": 1.9145153565698766e-05, "loss": 1.1563, "mean_token_accuracy": 0.6888698630136985, "step": 2370 }, { "epoch": 0.21981581748345597, "grad_norm": 4.125, "learning_rate": 1.9138605919685088e-05, "loss": 1.2178, "mean_token_accuracy": 0.6727495107632093, "step": 2375 }, { "epoch": 0.2202785876255264, "grad_norm": 3.96875, "learning_rate": 1.9132034421987903e-05, "loss": 1.1755, "mean_token_accuracy": 0.6862035225048924, "step": 2380 }, { "epoch": 0.22074135776759685, "grad_norm": 4.59375, "learning_rate": 1.912543908975877e-05, "loss": 1.1199, "mean_token_accuracy": 0.6976761252446184, "step": 2385 }, { "epoch": 0.22120412790966726, "grad_norm": 3.90625, "learning_rate": 1.9118819940211443e-05, "loss": 1.2421, "mean_token_accuracy": 0.6676859099804305, "step": 2390 }, { "epoch": 0.2216668980517377, "grad_norm": 4.375, "learning_rate": 1.9112176990621848e-05, "loss": 1.2108, "mean_token_accuracy": 0.6787671232876711, "step": 2395 }, { "epoch": 0.22212966819380814, "grad_norm": 4.84375, "learning_rate": 1.910551025832803e-05, "loss": 1.1949, "mean_token_accuracy": 0.6777397260273974, "step": 2400 }, { "epoch": 0.22212966819380814, "eval_loss": 1.2204567193984985, "eval_mean_token_accuracy": 0.6746154904598822, "eval_runtime": 39.7312, "eval_samples_per_second": 25.773, "eval_steps_per_second": 6.443, "step": 2400 }, { "epoch": 0.22259243833587858, "grad_norm": 4.1875, "learning_rate": 1.90988197607301e-05, "loss": 1.2195, "mean_token_accuracy": 0.6764432485322895, "step": 2405 }, { "epoch": 0.223055208477949, "grad_norm": 4.125, "learning_rate": 1.9092105515290203e-05, "loss": 1.2072, "mean_token_accuracy": 0.6759050880626223, "step": 2410 }, { "epoch": 0.22351797862001943, "grad_norm": 4.21875, "learning_rate": 1.908536753953246e-05, "loss": 1.2361, "mean_token_accuracy": 0.6727495107632093, "step": 2415 }, { "epoch": 0.22398074876208987, "grad_norm": 4.34375, "learning_rate": 1.9078605851042928e-05, "loss": 1.1576, "mean_token_accuracy": 0.6938111545988259, "step": 2420 }, { "epoch": 0.2244435189041603, "grad_norm": 4.3125, "learning_rate": 1.907182046746956e-05, "loss": 1.213, "mean_token_accuracy": 0.6764187866927592, "step": 2425 }, { "epoch": 0.22490628904623075, "grad_norm": 4.59375, "learning_rate": 1.9065011406522145e-05, "loss": 1.1662, "mean_token_accuracy": 0.6906311154598825, "step": 2430 }, { "epoch": 0.22536905918830116, "grad_norm": 4.5, "learning_rate": 1.9058178685972278e-05, "loss": 1.1742, "mean_token_accuracy": 0.6845890410958904, "step": 2435 }, { "epoch": 0.2258318293303716, "grad_norm": 4.40625, "learning_rate": 1.9051322323653304e-05, "loss": 1.2305, "mean_token_accuracy": 0.6687866927592954, "step": 2440 }, { "epoch": 0.22629459947244204, "grad_norm": 3.984375, "learning_rate": 1.904444233746026e-05, "loss": 1.1578, "mean_token_accuracy": 0.6907534246575344, "step": 2445 }, { "epoch": 0.22675736961451248, "grad_norm": 4.15625, "learning_rate": 1.9037538745349863e-05, "loss": 1.1756, "mean_token_accuracy": 0.6841487279843443, "step": 2450 }, { "epoch": 0.2272201397565829, "grad_norm": 4.34375, "learning_rate": 1.903061156534042e-05, "loss": 1.2181, "mean_token_accuracy": 0.6785714285714285, "step": 2455 }, { "epoch": 0.22768290989865334, "grad_norm": 4.21875, "learning_rate": 1.9023660815511818e-05, "loss": 1.1998, "mean_token_accuracy": 0.6794765166340511, "step": 2460 }, { "epoch": 0.22814568004072378, "grad_norm": 4.15625, "learning_rate": 1.9016686514005456e-05, "loss": 1.1031, "mean_token_accuracy": 0.7017123287671232, "step": 2465 }, { "epoch": 0.22860845018279422, "grad_norm": 4.15625, "learning_rate": 1.900968867902419e-05, "loss": 1.2009, "mean_token_accuracy": 0.6785714285714286, "step": 2470 }, { "epoch": 0.22907122032486463, "grad_norm": 4.21875, "learning_rate": 1.900266732883233e-05, "loss": 1.197, "mean_token_accuracy": 0.676614481409002, "step": 2475 }, { "epoch": 0.22953399046693507, "grad_norm": 4.34375, "learning_rate": 1.8995622481755517e-05, "loss": 1.1997, "mean_token_accuracy": 0.6756849315068494, "step": 2480 }, { "epoch": 0.2299967606090055, "grad_norm": 3.90625, "learning_rate": 1.8988554156180753e-05, "loss": 1.138, "mean_token_accuracy": 0.6954256360078279, "step": 2485 }, { "epoch": 0.23045953075107595, "grad_norm": 4.84375, "learning_rate": 1.8981462370556315e-05, "loss": 1.2095, "mean_token_accuracy": 0.6785714285714286, "step": 2490 }, { "epoch": 0.2309223008931464, "grad_norm": 4.25, "learning_rate": 1.8974347143391688e-05, "loss": 1.1392, "mean_token_accuracy": 0.6928082191780822, "step": 2495 }, { "epoch": 0.2313850710352168, "grad_norm": 4.21875, "learning_rate": 1.8967208493257564e-05, "loss": 1.1312, "mean_token_accuracy": 0.6949119373776907, "step": 2500 }, { "epoch": 0.2313850710352168, "eval_loss": 1.2185319662094116, "eval_mean_token_accuracy": 0.6749766848091971, "eval_runtime": 39.7479, "eval_samples_per_second": 25.762, "eval_steps_per_second": 6.441, "step": 2500 }, { "epoch": 0.23184784117728724, "grad_norm": 4.34375, "learning_rate": 1.8960046438785754e-05, "loss": 1.1318, "mean_token_accuracy": 0.6952788649706456, "step": 2505 }, { "epoch": 0.23231061131935768, "grad_norm": 4.96875, "learning_rate": 1.8952860998669165e-05, "loss": 1.1986, "mean_token_accuracy": 0.6803326810176126, "step": 2510 }, { "epoch": 0.23277338146142812, "grad_norm": 4.15625, "learning_rate": 1.894565219166173e-05, "loss": 1.2245, "mean_token_accuracy": 0.6735078277886497, "step": 2515 }, { "epoch": 0.23323615160349853, "grad_norm": 5.53125, "learning_rate": 1.8938420036578372e-05, "loss": 1.2347, "mean_token_accuracy": 0.6704990215264187, "step": 2520 }, { "epoch": 0.23369892174556897, "grad_norm": 4.3125, "learning_rate": 1.893116455229496e-05, "loss": 1.2384, "mean_token_accuracy": 0.6693982387475538, "step": 2525 }, { "epoch": 0.2341616918876394, "grad_norm": 5.375, "learning_rate": 1.892388575774824e-05, "loss": 1.1809, "mean_token_accuracy": 0.6843933463796477, "step": 2530 }, { "epoch": 0.23462446202970985, "grad_norm": 4.375, "learning_rate": 1.8916583671935805e-05, "loss": 1.197, "mean_token_accuracy": 0.6814090019569472, "step": 2535 }, { "epoch": 0.23508723217178026, "grad_norm": 4.125, "learning_rate": 1.8909258313916042e-05, "loss": 1.1432, "mean_token_accuracy": 0.6929060665362035, "step": 2540 }, { "epoch": 0.2355500023138507, "grad_norm": 4.0625, "learning_rate": 1.890190970280807e-05, "loss": 1.1265, "mean_token_accuracy": 0.6984589041095888, "step": 2545 }, { "epoch": 0.23601277245592114, "grad_norm": 4.40625, "learning_rate": 1.8894537857791693e-05, "loss": 1.2086, "mean_token_accuracy": 0.6768101761252445, "step": 2550 }, { "epoch": 0.23647554259799158, "grad_norm": 4.375, "learning_rate": 1.888714279810738e-05, "loss": 1.1856, "mean_token_accuracy": 0.6817514677103719, "step": 2555 }, { "epoch": 0.23693831274006202, "grad_norm": 4.84375, "learning_rate": 1.887972454305616e-05, "loss": 1.2107, "mean_token_accuracy": 0.6729696673189823, "step": 2560 }, { "epoch": 0.23740108288213244, "grad_norm": 4.15625, "learning_rate": 1.887228311199962e-05, "loss": 1.1726, "mean_token_accuracy": 0.6865704500978473, "step": 2565 }, { "epoch": 0.23786385302420288, "grad_norm": 4.5, "learning_rate": 1.8864818524359837e-05, "loss": 1.2431, "mean_token_accuracy": 0.6703522504892369, "step": 2570 }, { "epoch": 0.23832662316627332, "grad_norm": 4.09375, "learning_rate": 1.8857330799619308e-05, "loss": 1.1367, "mean_token_accuracy": 0.6933708414872798, "step": 2575 }, { "epoch": 0.23878939330834376, "grad_norm": 4.0625, "learning_rate": 1.8849819957320942e-05, "loss": 1.2482, "mean_token_accuracy": 0.665508806262231, "step": 2580 }, { "epoch": 0.23925216345041417, "grad_norm": 6.25, "learning_rate": 1.884228601706797e-05, "loss": 1.1786, "mean_token_accuracy": 0.68456457925636, "step": 2585 }, { "epoch": 0.2397149335924846, "grad_norm": 4.75, "learning_rate": 1.8834728998523913e-05, "loss": 1.2563, "mean_token_accuracy": 0.6626956947162426, "step": 2590 }, { "epoch": 0.24017770373455505, "grad_norm": 4.5, "learning_rate": 1.8827148921412522e-05, "loss": 1.1732, "mean_token_accuracy": 0.6846379647749511, "step": 2595 }, { "epoch": 0.2406404738766255, "grad_norm": 4.15625, "learning_rate": 1.8819545805517733e-05, "loss": 1.2013, "mean_token_accuracy": 0.6785958904109589, "step": 2600 }, { "epoch": 0.2406404738766255, "eval_loss": 1.2162375450134277, "eval_mean_token_accuracy": 0.6754506329500974, "eval_runtime": 39.7146, "eval_samples_per_second": 25.784, "eval_steps_per_second": 6.446, "step": 2600 }, { "epoch": 0.2411032440186959, "grad_norm": 4.15625, "learning_rate": 1.881191967068362e-05, "loss": 1.1696, "mean_token_accuracy": 0.6858855185909979, "step": 2605 }, { "epoch": 0.24156601416076634, "grad_norm": 4.03125, "learning_rate": 1.8804270536814323e-05, "loss": 1.1483, "mean_token_accuracy": 0.6892123287671234, "step": 2610 }, { "epoch": 0.24202878430283678, "grad_norm": 4.40625, "learning_rate": 1.879659842387402e-05, "loss": 1.2106, "mean_token_accuracy": 0.6809931506849314, "step": 2615 }, { "epoch": 0.24249155444490722, "grad_norm": 4.09375, "learning_rate": 1.8788903351886855e-05, "loss": 1.1808, "mean_token_accuracy": 0.683757338551859, "step": 2620 }, { "epoch": 0.24295432458697766, "grad_norm": 4.15625, "learning_rate": 1.878118534093691e-05, "loss": 1.2233, "mean_token_accuracy": 0.676394324853229, "step": 2625 }, { "epoch": 0.24341709472904807, "grad_norm": 5.6875, "learning_rate": 1.877344441116812e-05, "loss": 1.2606, "mean_token_accuracy": 0.6672700587084148, "step": 2630 }, { "epoch": 0.2438798648711185, "grad_norm": 4.09375, "learning_rate": 1.876568058278425e-05, "loss": 1.1411, "mean_token_accuracy": 0.6939090019569472, "step": 2635 }, { "epoch": 0.24434263501318895, "grad_norm": 4.53125, "learning_rate": 1.8757893876048823e-05, "loss": 1.1468, "mean_token_accuracy": 0.6916095890410959, "step": 2640 }, { "epoch": 0.2448054051552594, "grad_norm": 3.9375, "learning_rate": 1.8750084311285074e-05, "loss": 1.1273, "mean_token_accuracy": 0.6979941291585127, "step": 2645 }, { "epoch": 0.2452681752973298, "grad_norm": 5.40625, "learning_rate": 1.8742251908875908e-05, "loss": 1.1383, "mean_token_accuracy": 0.6934197651663406, "step": 2650 }, { "epoch": 0.24573094543940024, "grad_norm": 4.15625, "learning_rate": 1.8734396689263822e-05, "loss": 1.2276, "mean_token_accuracy": 0.6710861056751468, "step": 2655 }, { "epoch": 0.24619371558147068, "grad_norm": 5.5625, "learning_rate": 1.872651867295088e-05, "loss": 1.1801, "mean_token_accuracy": 0.6832681017612525, "step": 2660 }, { "epoch": 0.24665648572354112, "grad_norm": 4.15625, "learning_rate": 1.871861788049863e-05, "loss": 1.1813, "mean_token_accuracy": 0.6776663405088064, "step": 2665 }, { "epoch": 0.24711925586561156, "grad_norm": 5.15625, "learning_rate": 1.8710694332528073e-05, "loss": 1.1189, "mean_token_accuracy": 0.697284735812133, "step": 2670 }, { "epoch": 0.24758202600768198, "grad_norm": 5.0625, "learning_rate": 1.8702748049719605e-05, "loss": 1.2094, "mean_token_accuracy": 0.6779354207436399, "step": 2675 }, { "epoch": 0.24804479614975242, "grad_norm": 4.03125, "learning_rate": 1.8694779052812957e-05, "loss": 1.1255, "mean_token_accuracy": 0.6958414872798435, "step": 2680 }, { "epoch": 0.24850756629182286, "grad_norm": 4.5, "learning_rate": 1.868678736260714e-05, "loss": 1.1419, "mean_token_accuracy": 0.6923679060665362, "step": 2685 }, { "epoch": 0.2489703364338933, "grad_norm": 4.625, "learning_rate": 1.8678772999960397e-05, "loss": 1.1841, "mean_token_accuracy": 0.6847847358121332, "step": 2690 }, { "epoch": 0.2494331065759637, "grad_norm": 3.921875, "learning_rate": 1.867073598579015e-05, "loss": 1.1723, "mean_token_accuracy": 0.6853228962818004, "step": 2695 }, { "epoch": 0.24989587671803415, "grad_norm": 5.65625, "learning_rate": 1.8662676341072935e-05, "loss": 1.1902, "mean_token_accuracy": 0.6764187866927592, "step": 2700 }, { "epoch": 0.24989587671803415, "eval_loss": 1.215235948562622, "eval_mean_token_accuracy": 0.6759685359589035, "eval_runtime": 39.7065, "eval_samples_per_second": 25.789, "eval_steps_per_second": 6.447, "step": 2700 }, { "epoch": 0.25035864686010456, "grad_norm": 4.40625, "learning_rate": 1.8654594086844353e-05, "loss": 1.1735, "mean_token_accuracy": 0.6859099804305283, "step": 2705 }, { "epoch": 0.25082141700217503, "grad_norm": 3.90625, "learning_rate": 1.8646489244199024e-05, "loss": 1.1453, "mean_token_accuracy": 0.6912915851272015, "step": 2710 }, { "epoch": 0.25128418714424544, "grad_norm": 5.34375, "learning_rate": 1.8638361834290517e-05, "loss": 1.1441, "mean_token_accuracy": 0.693517612524462, "step": 2715 }, { "epoch": 0.2517469572863159, "grad_norm": 4.46875, "learning_rate": 1.86302118783313e-05, "loss": 1.1916, "mean_token_accuracy": 0.6821673189823876, "step": 2720 }, { "epoch": 0.2522097274283863, "grad_norm": 4.09375, "learning_rate": 1.8622039397592687e-05, "loss": 1.1686, "mean_token_accuracy": 0.6879158512720156, "step": 2725 }, { "epoch": 0.25267249757045673, "grad_norm": 4.1875, "learning_rate": 1.8613844413404792e-05, "loss": 1.1556, "mean_token_accuracy": 0.6869129158512722, "step": 2730 }, { "epoch": 0.2531352677125272, "grad_norm": 4.4375, "learning_rate": 1.8605626947156444e-05, "loss": 1.2346, "mean_token_accuracy": 0.6716976516634051, "step": 2735 }, { "epoch": 0.2535980378545976, "grad_norm": 3.953125, "learning_rate": 1.859738702029516e-05, "loss": 1.1509, "mean_token_accuracy": 0.6926125244618395, "step": 2740 }, { "epoch": 0.2540608079966681, "grad_norm": 4.9375, "learning_rate": 1.8589124654327087e-05, "loss": 1.2064, "mean_token_accuracy": 0.6773972602739727, "step": 2745 }, { "epoch": 0.2545235781387385, "grad_norm": 4.0, "learning_rate": 1.8580839870816928e-05, "loss": 1.1758, "mean_token_accuracy": 0.6852739726027398, "step": 2750 }, { "epoch": 0.2549863482808089, "grad_norm": 4.1875, "learning_rate": 1.8572532691387892e-05, "loss": 1.1897, "mean_token_accuracy": 0.682216242661448, "step": 2755 }, { "epoch": 0.2554491184228794, "grad_norm": 4.15625, "learning_rate": 1.8564203137721647e-05, "loss": 1.1299, "mean_token_accuracy": 0.7009295499021527, "step": 2760 }, { "epoch": 0.2559118885649498, "grad_norm": 4.28125, "learning_rate": 1.8555851231558258e-05, "loss": 1.1835, "mean_token_accuracy": 0.6826076320939336, "step": 2765 }, { "epoch": 0.2563746587070202, "grad_norm": 4.34375, "learning_rate": 1.8547476994696132e-05, "loss": 1.1249, "mean_token_accuracy": 0.695229941291585, "step": 2770 }, { "epoch": 0.25683742884909067, "grad_norm": 4.3125, "learning_rate": 1.8539080448991954e-05, "loss": 1.1742, "mean_token_accuracy": 0.6845156555772994, "step": 2775 }, { "epoch": 0.2573001989911611, "grad_norm": 4.625, "learning_rate": 1.8530661616360636e-05, "loss": 1.2212, "mean_token_accuracy": 0.6729452054794521, "step": 2780 }, { "epoch": 0.25776296913323155, "grad_norm": 4.46875, "learning_rate": 1.8522220518775257e-05, "loss": 1.1262, "mean_token_accuracy": 0.6964530332681017, "step": 2785 }, { "epoch": 0.25822573927530196, "grad_norm": 6.28125, "learning_rate": 1.8513757178267013e-05, "loss": 1.1681, "mean_token_accuracy": 0.6846379647749511, "step": 2790 }, { "epoch": 0.25868850941737237, "grad_norm": 4.25, "learning_rate": 1.8505271616925152e-05, "loss": 1.1509, "mean_token_accuracy": 0.6902152641878667, "step": 2795 }, { "epoch": 0.25915127955944284, "grad_norm": 4.40625, "learning_rate": 1.8496763856896915e-05, "loss": 1.145, "mean_token_accuracy": 0.6912915851272015, "step": 2800 }, { "epoch": 0.25915127955944284, "eval_loss": 1.212759017944336, "eval_mean_token_accuracy": 0.6762074211105668, "eval_runtime": 39.8224, "eval_samples_per_second": 25.714, "eval_steps_per_second": 6.429, "step": 2800 }, { "epoch": 0.25961404970151325, "grad_norm": 4.125, "learning_rate": 1.8488233920387478e-05, "loss": 1.1539, "mean_token_accuracy": 0.690362035225049, "step": 2805 }, { "epoch": 0.2600768198435837, "grad_norm": 5.15625, "learning_rate": 1.847968182965991e-05, "loss": 1.1987, "mean_token_accuracy": 0.6774217221135029, "step": 2810 }, { "epoch": 0.26053958998565413, "grad_norm": 4.3125, "learning_rate": 1.8471107607035092e-05, "loss": 1.1667, "mean_token_accuracy": 0.6860567514677103, "step": 2815 }, { "epoch": 0.26100236012772454, "grad_norm": 4.0625, "learning_rate": 1.8462511274891673e-05, "loss": 1.214, "mean_token_accuracy": 0.6770058708414873, "step": 2820 }, { "epoch": 0.261465130269795, "grad_norm": 4.34375, "learning_rate": 1.8453892855666003e-05, "loss": 1.2333, "mean_token_accuracy": 0.6718933463796478, "step": 2825 }, { "epoch": 0.2619279004118654, "grad_norm": 4.125, "learning_rate": 1.844525237185209e-05, "loss": 1.108, "mean_token_accuracy": 0.7023727984344422, "step": 2830 }, { "epoch": 0.26239067055393583, "grad_norm": 4.375, "learning_rate": 1.8436589846001514e-05, "loss": 1.1477, "mean_token_accuracy": 0.6902397260273974, "step": 2835 }, { "epoch": 0.2628534406960063, "grad_norm": 5.21875, "learning_rate": 1.84279053007234e-05, "loss": 1.1895, "mean_token_accuracy": 0.6826565557729942, "step": 2840 }, { "epoch": 0.2633162108380767, "grad_norm": 4.09375, "learning_rate": 1.841919875868433e-05, "loss": 1.1911, "mean_token_accuracy": 0.6825342465753425, "step": 2845 }, { "epoch": 0.2637789809801472, "grad_norm": 4.125, "learning_rate": 1.8410470242608312e-05, "loss": 1.1217, "mean_token_accuracy": 0.6950587084148728, "step": 2850 }, { "epoch": 0.2642417511222176, "grad_norm": 4.15625, "learning_rate": 1.84017197752767e-05, "loss": 1.1612, "mean_token_accuracy": 0.6881115459882583, "step": 2855 }, { "epoch": 0.264704521264288, "grad_norm": 4.125, "learning_rate": 1.839294737952813e-05, "loss": 1.2191, "mean_token_accuracy": 0.6726761252446184, "step": 2860 }, { "epoch": 0.2651672914063585, "grad_norm": 4.03125, "learning_rate": 1.838415307825849e-05, "loss": 1.1929, "mean_token_accuracy": 0.6791829745596869, "step": 2865 }, { "epoch": 0.2656300615484289, "grad_norm": 4.34375, "learning_rate": 1.8375336894420824e-05, "loss": 1.197, "mean_token_accuracy": 0.6799168297455968, "step": 2870 }, { "epoch": 0.26609283169049935, "grad_norm": 4.875, "learning_rate": 1.8366498851025297e-05, "loss": 1.1496, "mean_token_accuracy": 0.6907778864970646, "step": 2875 }, { "epoch": 0.26655560183256977, "grad_norm": 4.3125, "learning_rate": 1.835763897113913e-05, "loss": 1.1868, "mean_token_accuracy": 0.6835616438356165, "step": 2880 }, { "epoch": 0.2670183719746402, "grad_norm": 4.125, "learning_rate": 1.8348757277886535e-05, "loss": 1.1628, "mean_token_accuracy": 0.6887475538160468, "step": 2885 }, { "epoch": 0.26748114211671065, "grad_norm": 4.375, "learning_rate": 1.8339853794448652e-05, "loss": 1.2156, "mean_token_accuracy": 0.676247553816047, "step": 2890 }, { "epoch": 0.26794391225878106, "grad_norm": 4.8125, "learning_rate": 1.8330928544063496e-05, "loss": 1.2153, "mean_token_accuracy": 0.6800391389432485, "step": 2895 }, { "epoch": 0.26840668240085147, "grad_norm": 4.375, "learning_rate": 1.83219815500259e-05, "loss": 1.1604, "mean_token_accuracy": 0.6883806262230919, "step": 2900 }, { "epoch": 0.26840668240085147, "eval_loss": 1.2116366624832153, "eval_mean_token_accuracy": 0.6761711105675144, "eval_runtime": 39.7605, "eval_samples_per_second": 25.754, "eval_steps_per_second": 6.439, "step": 2900 }, { "epoch": 0.26886945254292194, "grad_norm": 5.125, "learning_rate": 1.8313012835687437e-05, "loss": 1.2234, "mean_token_accuracy": 0.6750244618395304, "step": 2905 }, { "epoch": 0.26933222268499235, "grad_norm": 4.40625, "learning_rate": 1.8304022424456375e-05, "loss": 1.2539, "mean_token_accuracy": 0.6671722113502936, "step": 2910 }, { "epoch": 0.2697949928270628, "grad_norm": 4.125, "learning_rate": 1.8295010339797615e-05, "loss": 1.2066, "mean_token_accuracy": 0.6757827788649704, "step": 2915 }, { "epoch": 0.27025776296913323, "grad_norm": 4.0625, "learning_rate": 1.828597660523262e-05, "loss": 1.1777, "mean_token_accuracy": 0.6836105675146771, "step": 2920 }, { "epoch": 0.27072053311120364, "grad_norm": 3.953125, "learning_rate": 1.8276921244339358e-05, "loss": 1.1224, "mean_token_accuracy": 0.6983121330724071, "step": 2925 }, { "epoch": 0.2711833032532741, "grad_norm": 4.21875, "learning_rate": 1.826784428075224e-05, "loss": 1.212, "mean_token_accuracy": 0.677862035225049, "step": 2930 }, { "epoch": 0.2716460733953445, "grad_norm": 4.03125, "learning_rate": 1.825874573816207e-05, "loss": 1.1133, "mean_token_accuracy": 0.7021771037181995, "step": 2935 }, { "epoch": 0.272108843537415, "grad_norm": 5.5, "learning_rate": 1.8249625640315963e-05, "loss": 1.2569, "mean_token_accuracy": 0.6660958904109588, "step": 2940 }, { "epoch": 0.2725716136794854, "grad_norm": 4.1875, "learning_rate": 1.8240484011017302e-05, "loss": 1.1871, "mean_token_accuracy": 0.6814823874755381, "step": 2945 }, { "epoch": 0.2730343838215558, "grad_norm": 4.28125, "learning_rate": 1.8231320874125656e-05, "loss": 1.2466, "mean_token_accuracy": 0.6678326810176125, "step": 2950 }, { "epoch": 0.2734971539636263, "grad_norm": 4.8125, "learning_rate": 1.8222136253556736e-05, "loss": 1.2359, "mean_token_accuracy": 0.6724070450097848, "step": 2955 }, { "epoch": 0.2739599241056967, "grad_norm": 4.125, "learning_rate": 1.8212930173282323e-05, "loss": 1.1839, "mean_token_accuracy": 0.6829256360078277, "step": 2960 }, { "epoch": 0.2744226942477671, "grad_norm": 4.1875, "learning_rate": 1.8203702657330207e-05, "loss": 1.1345, "mean_token_accuracy": 0.6952544031311156, "step": 2965 }, { "epoch": 0.2748854643898376, "grad_norm": 4.0625, "learning_rate": 1.8194453729784127e-05, "loss": 1.1079, "mean_token_accuracy": 0.7042563600782779, "step": 2970 }, { "epoch": 0.275348234531908, "grad_norm": 4.5625, "learning_rate": 1.8185183414783706e-05, "loss": 1.2185, "mean_token_accuracy": 0.673238747553816, "step": 2975 }, { "epoch": 0.27581100467397845, "grad_norm": 5.78125, "learning_rate": 1.8175891736524382e-05, "loss": 1.1879, "mean_token_accuracy": 0.6845645792563599, "step": 2980 }, { "epoch": 0.27627377481604887, "grad_norm": 4.03125, "learning_rate": 1.8166578719257363e-05, "loss": 1.1601, "mean_token_accuracy": 0.6875, "step": 2985 }, { "epoch": 0.2767365449581193, "grad_norm": 4.5, "learning_rate": 1.8157244387289542e-05, "loss": 1.2179, "mean_token_accuracy": 0.6762720156555774, "step": 2990 }, { "epoch": 0.27719931510018975, "grad_norm": 4.03125, "learning_rate": 1.8147888764983444e-05, "loss": 1.1224, "mean_token_accuracy": 0.6959882583170254, "step": 2995 }, { "epoch": 0.27766208524226016, "grad_norm": 4.0, "learning_rate": 1.813851187675717e-05, "loss": 1.1624, "mean_token_accuracy": 0.6841487279843445, "step": 3000 }, { "epoch": 0.27766208524226016, "eval_loss": 1.2096575498580933, "eval_mean_token_accuracy": 0.6766278589774947, "eval_runtime": 39.6977, "eval_samples_per_second": 25.795, "eval_steps_per_second": 6.449, "step": 3000 }, { "epoch": 0.2781248553843306, "grad_norm": 4.1875, "learning_rate": 1.8129113747084308e-05, "loss": 1.1665, "mean_token_accuracy": 0.6858610567514677, "step": 3005 }, { "epoch": 0.27858762552640104, "grad_norm": 5.8125, "learning_rate": 1.81196944004939e-05, "loss": 1.1744, "mean_token_accuracy": 0.6859833659491192, "step": 3010 }, { "epoch": 0.27905039566847145, "grad_norm": 4.5, "learning_rate": 1.8110253861570367e-05, "loss": 1.1938, "mean_token_accuracy": 0.6789138943248533, "step": 3015 }, { "epoch": 0.2795131658105419, "grad_norm": 4.28125, "learning_rate": 1.8100792154953425e-05, "loss": 1.195, "mean_token_accuracy": 0.6806996086105676, "step": 3020 }, { "epoch": 0.27997593595261233, "grad_norm": 4.0, "learning_rate": 1.8091309305338053e-05, "loss": 1.1441, "mean_token_accuracy": 0.6918052837573385, "step": 3025 }, { "epoch": 0.28043870609468274, "grad_norm": 4.0625, "learning_rate": 1.8081805337474413e-05, "loss": 1.1647, "mean_token_accuracy": 0.689995107632094, "step": 3030 }, { "epoch": 0.2809014762367532, "grad_norm": 4.21875, "learning_rate": 1.807228027616777e-05, "loss": 1.2073, "mean_token_accuracy": 0.6783512720156556, "step": 3035 }, { "epoch": 0.2813642463788236, "grad_norm": 4.28125, "learning_rate": 1.8062734146278452e-05, "loss": 1.234, "mean_token_accuracy": 0.6723091976516633, "step": 3040 }, { "epoch": 0.2818270165208941, "grad_norm": 4.03125, "learning_rate": 1.8053166972721787e-05, "loss": 1.1598, "mean_token_accuracy": 0.6858365949119376, "step": 3045 }, { "epoch": 0.2822897866629645, "grad_norm": 4.21875, "learning_rate": 1.804357878046801e-05, "loss": 1.2195, "mean_token_accuracy": 0.673972602739726, "step": 3050 }, { "epoch": 0.2827525568050349, "grad_norm": 4.0625, "learning_rate": 1.8033969594542223e-05, "loss": 1.1594, "mean_token_accuracy": 0.6867172211350295, "step": 3055 }, { "epoch": 0.2832153269471054, "grad_norm": 4.375, "learning_rate": 1.8024339440024315e-05, "loss": 1.231, "mean_token_accuracy": 0.6700097847358121, "step": 3060 }, { "epoch": 0.2836780970891758, "grad_norm": 4.6875, "learning_rate": 1.801468834204891e-05, "loss": 1.1773, "mean_token_accuracy": 0.6894324853228964, "step": 3065 }, { "epoch": 0.28414086723124626, "grad_norm": 4.03125, "learning_rate": 1.8005016325805292e-05, "loss": 1.1828, "mean_token_accuracy": 0.6853473581213307, "step": 3070 }, { "epoch": 0.2846036373733167, "grad_norm": 4.0625, "learning_rate": 1.7995323416537346e-05, "loss": 1.1031, "mean_token_accuracy": 0.7025195694716243, "step": 3075 }, { "epoch": 0.2850664075153871, "grad_norm": 4.125, "learning_rate": 1.798560963954347e-05, "loss": 1.2051, "mean_token_accuracy": 0.6771771037181995, "step": 3080 }, { "epoch": 0.28552917765745756, "grad_norm": 4.40625, "learning_rate": 1.797587502017655e-05, "loss": 1.18, "mean_token_accuracy": 0.6854207436399218, "step": 3085 }, { "epoch": 0.28599194779952797, "grad_norm": 4.8125, "learning_rate": 1.7966119583843852e-05, "loss": 1.2145, "mean_token_accuracy": 0.677054794520548, "step": 3090 }, { "epoch": 0.28645471794159844, "grad_norm": 4.625, "learning_rate": 1.795634335600699e-05, "loss": 1.1708, "mean_token_accuracy": 0.6886497064579257, "step": 3095 }, { "epoch": 0.28691748808366885, "grad_norm": 4.375, "learning_rate": 1.7946546362181826e-05, "loss": 1.2252, "mean_token_accuracy": 0.6714041095890411, "step": 3100 }, { "epoch": 0.28691748808366885, "eval_loss": 1.2090357542037964, "eval_mean_token_accuracy": 0.6766718138454006, "eval_runtime": 39.7396, "eval_samples_per_second": 25.768, "eval_steps_per_second": 6.442, "step": 3100 }, { "epoch": 0.28738025822573926, "grad_norm": 4.03125, "learning_rate": 1.793672862793844e-05, "loss": 1.1787, "mean_token_accuracy": 0.6844422700587085, "step": 3105 }, { "epoch": 0.28784302836780973, "grad_norm": 4.4375, "learning_rate": 1.7926890178901028e-05, "loss": 1.2587, "mean_token_accuracy": 0.6641634050880627, "step": 3110 }, { "epoch": 0.28830579850988014, "grad_norm": 4.28125, "learning_rate": 1.7917031040747858e-05, "loss": 1.1861, "mean_token_accuracy": 0.6791585127201565, "step": 3115 }, { "epoch": 0.28876856865195055, "grad_norm": 5.6875, "learning_rate": 1.7907151239211194e-05, "loss": 1.117, "mean_token_accuracy": 0.700293542074364, "step": 3120 }, { "epoch": 0.289231338794021, "grad_norm": 4.375, "learning_rate": 1.7897250800077244e-05, "loss": 1.1711, "mean_token_accuracy": 0.6888454011741683, "step": 3125 }, { "epoch": 0.28969410893609143, "grad_norm": 4.15625, "learning_rate": 1.7887329749186063e-05, "loss": 1.1502, "mean_token_accuracy": 0.6929794520547945, "step": 3130 }, { "epoch": 0.2901568790781619, "grad_norm": 4.21875, "learning_rate": 1.7877388112431508e-05, "loss": 1.1052, "mean_token_accuracy": 0.7009540117416829, "step": 3135 }, { "epoch": 0.2906196492202323, "grad_norm": 4.0, "learning_rate": 1.786742591576117e-05, "loss": 1.1987, "mean_token_accuracy": 0.6771771037181997, "step": 3140 }, { "epoch": 0.2910824193623027, "grad_norm": 4.3125, "learning_rate": 1.785744318517629e-05, "loss": 1.22, "mean_token_accuracy": 0.6731164383561643, "step": 3145 }, { "epoch": 0.2915451895043732, "grad_norm": 4.125, "learning_rate": 1.784743994673172e-05, "loss": 1.1722, "mean_token_accuracy": 0.6863502935420744, "step": 3150 }, { "epoch": 0.2920079596464436, "grad_norm": 4.15625, "learning_rate": 1.783741622653582e-05, "loss": 1.1676, "mean_token_accuracy": 0.6850782778864971, "step": 3155 }, { "epoch": 0.29247072978851407, "grad_norm": 4.9375, "learning_rate": 1.7827372050750414e-05, "loss": 1.1334, "mean_token_accuracy": 0.697945205479452, "step": 3160 }, { "epoch": 0.2929334999305845, "grad_norm": 4.3125, "learning_rate": 1.7817307445590724e-05, "loss": 1.1589, "mean_token_accuracy": 0.6912915851272017, "step": 3165 }, { "epoch": 0.2933962700726549, "grad_norm": 4.6875, "learning_rate": 1.780722243732527e-05, "loss": 1.2034, "mean_token_accuracy": 0.678742661448141, "step": 3170 }, { "epoch": 0.29385904021472536, "grad_norm": 4.25, "learning_rate": 1.7797117052275847e-05, "loss": 1.1032, "mean_token_accuracy": 0.7031311154598826, "step": 3175 }, { "epoch": 0.2943218103567958, "grad_norm": 4.5, "learning_rate": 1.7786991316817424e-05, "loss": 1.1974, "mean_token_accuracy": 0.6835371819960863, "step": 3180 }, { "epoch": 0.2947845804988662, "grad_norm": 5.15625, "learning_rate": 1.7776845257378078e-05, "loss": 1.2389, "mean_token_accuracy": 0.6662426614481409, "step": 3185 }, { "epoch": 0.29524735064093666, "grad_norm": 3.921875, "learning_rate": 1.776667890043894e-05, "loss": 1.1848, "mean_token_accuracy": 0.6837084148727985, "step": 3190 }, { "epoch": 0.29571012078300707, "grad_norm": 5.6875, "learning_rate": 1.7756492272534118e-05, "loss": 1.1685, "mean_token_accuracy": 0.6882827788649706, "step": 3195 }, { "epoch": 0.29617289092507754, "grad_norm": 4.03125, "learning_rate": 1.7746285400250622e-05, "loss": 1.093, "mean_token_accuracy": 0.7034735812133072, "step": 3200 }, { "epoch": 0.29617289092507754, "eval_loss": 1.2073590755462646, "eval_mean_token_accuracy": 0.6773082038894319, "eval_runtime": 39.7188, "eval_samples_per_second": 25.781, "eval_steps_per_second": 6.445, "step": 3200 }, { "epoch": 0.29663566106714795, "grad_norm": 4.0, "learning_rate": 1.77360583102283e-05, "loss": 1.1686, "mean_token_accuracy": 0.6851516634050882, "step": 3205 }, { "epoch": 0.29709843120921836, "grad_norm": 3.921875, "learning_rate": 1.7725811029159776e-05, "loss": 1.1823, "mean_token_accuracy": 0.6827788649706455, "step": 3210 }, { "epoch": 0.29756120135128883, "grad_norm": 4.53125, "learning_rate": 1.7715543583790355e-05, "loss": 1.1294, "mean_token_accuracy": 0.6990704500978474, "step": 3215 }, { "epoch": 0.29802397149335924, "grad_norm": 4.4375, "learning_rate": 1.7705256000917988e-05, "loss": 1.1799, "mean_token_accuracy": 0.6830234833659491, "step": 3220 }, { "epoch": 0.2984867416354297, "grad_norm": 4.09375, "learning_rate": 1.7694948307393183e-05, "loss": 1.1653, "mean_token_accuracy": 0.6881849315068493, "step": 3225 }, { "epoch": 0.2989495117775001, "grad_norm": 4.0625, "learning_rate": 1.768462053011893e-05, "loss": 1.1387, "mean_token_accuracy": 0.6925391389432485, "step": 3230 }, { "epoch": 0.29941228191957053, "grad_norm": 4.90625, "learning_rate": 1.767427269605063e-05, "loss": 1.1545, "mean_token_accuracy": 0.6863258317025439, "step": 3235 }, { "epoch": 0.299875052061641, "grad_norm": 4.125, "learning_rate": 1.7663904832196054e-05, "loss": 1.1618, "mean_token_accuracy": 0.6907289628180041, "step": 3240 }, { "epoch": 0.3003378222037114, "grad_norm": 4.125, "learning_rate": 1.7653516965615232e-05, "loss": 1.1446, "mean_token_accuracy": 0.6912181996086106, "step": 3245 }, { "epoch": 0.3008005923457818, "grad_norm": 4.75, "learning_rate": 1.7643109123420407e-05, "loss": 1.1882, "mean_token_accuracy": 0.6817025440313114, "step": 3250 }, { "epoch": 0.3012633624878523, "grad_norm": 4.125, "learning_rate": 1.763268133277596e-05, "loss": 1.156, "mean_token_accuracy": 0.6817514677103718, "step": 3255 }, { "epoch": 0.3017261326299227, "grad_norm": 4.03125, "learning_rate": 1.762223362089834e-05, "loss": 1.0817, "mean_token_accuracy": 0.7055283757338552, "step": 3260 }, { "epoch": 0.3021889027719932, "grad_norm": 4.0625, "learning_rate": 1.7611766015055984e-05, "loss": 1.1854, "mean_token_accuracy": 0.6830724070450097, "step": 3265 }, { "epoch": 0.3026516729140636, "grad_norm": 10.75, "learning_rate": 1.760127854256924e-05, "loss": 1.107, "mean_token_accuracy": 0.6998043052837574, "step": 3270 }, { "epoch": 0.303114443056134, "grad_norm": 4.34375, "learning_rate": 1.7590771230810344e-05, "loss": 1.1407, "mean_token_accuracy": 0.6945450097847357, "step": 3275 }, { "epoch": 0.30357721319820447, "grad_norm": 4.1875, "learning_rate": 1.7580244107203273e-05, "loss": 1.2454, "mean_token_accuracy": 0.6707681017612523, "step": 3280 }, { "epoch": 0.3040399833402749, "grad_norm": 4.28125, "learning_rate": 1.756969719922374e-05, "loss": 1.2214, "mean_token_accuracy": 0.6727984344422702, "step": 3285 }, { "epoch": 0.30450275348234535, "grad_norm": 4.25, "learning_rate": 1.7559130534399078e-05, "loss": 1.2271, "mean_token_accuracy": 0.677128180039139, "step": 3290 }, { "epoch": 0.30496552362441576, "grad_norm": 3.984375, "learning_rate": 1.7548544140308196e-05, "loss": 1.1147, "mean_token_accuracy": 0.6979207436399217, "step": 3295 }, { "epoch": 0.30542829376648617, "grad_norm": 4.21875, "learning_rate": 1.753793804458149e-05, "loss": 1.0894, "mean_token_accuracy": 0.705283757338552, "step": 3300 }, { "epoch": 0.30542829376648617, "eval_loss": 1.2063908576965332, "eval_mean_token_accuracy": 0.6771323844178075, "eval_runtime": 39.6415, "eval_samples_per_second": 25.831, "eval_steps_per_second": 6.458, "step": 3300 }, { "epoch": 0.30589106390855664, "grad_norm": 3.921875, "learning_rate": 1.7527312274900782e-05, "loss": 1.193, "mean_token_accuracy": 0.6769080234833659, "step": 3305 }, { "epoch": 0.30635383405062705, "grad_norm": 4.1875, "learning_rate": 1.751666685899924e-05, "loss": 1.1722, "mean_token_accuracy": 0.6823140900195697, "step": 3310 }, { "epoch": 0.30681660419269746, "grad_norm": 5.15625, "learning_rate": 1.7506001824661307e-05, "loss": 1.2487, "mean_token_accuracy": 0.6652397260273973, "step": 3315 }, { "epoch": 0.30727937433476793, "grad_norm": 4.28125, "learning_rate": 1.7495317199722632e-05, "loss": 1.113, "mean_token_accuracy": 0.6980675146771036, "step": 3320 }, { "epoch": 0.30774214447683834, "grad_norm": 5.03125, "learning_rate": 1.7484613012069997e-05, "loss": 1.1636, "mean_token_accuracy": 0.6848336594911936, "step": 3325 }, { "epoch": 0.3082049146189088, "grad_norm": 4.21875, "learning_rate": 1.7473889289641235e-05, "loss": 1.1117, "mean_token_accuracy": 0.7030577299412915, "step": 3330 }, { "epoch": 0.3086676847609792, "grad_norm": 4.53125, "learning_rate": 1.7463146060425177e-05, "loss": 1.2243, "mean_token_accuracy": 0.674192759295499, "step": 3335 }, { "epoch": 0.30913045490304963, "grad_norm": 5.40625, "learning_rate": 1.7452383352461556e-05, "loss": 1.1779, "mean_token_accuracy": 0.6823140900195694, "step": 3340 }, { "epoch": 0.3095932250451201, "grad_norm": 4.25, "learning_rate": 1.7441601193840947e-05, "loss": 1.1688, "mean_token_accuracy": 0.686179060665362, "step": 3345 }, { "epoch": 0.3100559951871905, "grad_norm": 4.5625, "learning_rate": 1.7430799612704694e-05, "loss": 1.2372, "mean_token_accuracy": 0.6704256360078279, "step": 3350 }, { "epoch": 0.310518765329261, "grad_norm": 4.375, "learning_rate": 1.741997863724483e-05, "loss": 1.1499, "mean_token_accuracy": 0.6957925636007827, "step": 3355 }, { "epoch": 0.3109815354713314, "grad_norm": 4.09375, "learning_rate": 1.7409138295704013e-05, "loss": 1.2618, "mean_token_accuracy": 0.6672211350293541, "step": 3360 }, { "epoch": 0.3114443056134018, "grad_norm": 4.25, "learning_rate": 1.7398278616375436e-05, "loss": 1.2743, "mean_token_accuracy": 0.661912915851272, "step": 3365 }, { "epoch": 0.3119070757554723, "grad_norm": 4.40625, "learning_rate": 1.7387399627602773e-05, "loss": 1.0554, "mean_token_accuracy": 0.715875733855186, "step": 3370 }, { "epoch": 0.3123698458975427, "grad_norm": 4.40625, "learning_rate": 1.7376501357780088e-05, "loss": 1.1199, "mean_token_accuracy": 0.7017123287671233, "step": 3375 }, { "epoch": 0.3128326160396131, "grad_norm": 4.34375, "learning_rate": 1.736558383535178e-05, "loss": 1.252, "mean_token_accuracy": 0.663013698630137, "step": 3380 }, { "epoch": 0.31329538618168357, "grad_norm": 4.09375, "learning_rate": 1.735464708881248e-05, "loss": 1.1465, "mean_token_accuracy": 0.6907534246575341, "step": 3385 }, { "epoch": 0.313758156323754, "grad_norm": 3.984375, "learning_rate": 1.734369114670701e-05, "loss": 1.1478, "mean_token_accuracy": 0.6907778864970646, "step": 3390 }, { "epoch": 0.31422092646582445, "grad_norm": 4.21875, "learning_rate": 1.7332716037630287e-05, "loss": 1.2406, "mean_token_accuracy": 0.6660225048923679, "step": 3395 }, { "epoch": 0.31468369660789486, "grad_norm": 4.53125, "learning_rate": 1.7321721790227246e-05, "loss": 1.183, "mean_token_accuracy": 0.6803571428571429, "step": 3400 }, { "epoch": 0.31468369660789486, "eval_loss": 1.2051206827163696, "eval_mean_token_accuracy": 0.6774343352495099, "eval_runtime": 41.0517, "eval_samples_per_second": 24.944, "eval_steps_per_second": 6.236, "step": 3400 }, { "epoch": 0.31514646674996527, "grad_norm": 4.0625, "learning_rate": 1.731070843319278e-05, "loss": 1.176, "mean_token_accuracy": 0.6824119373776908, "step": 3405 }, { "epoch": 0.31560923689203574, "grad_norm": 4.375, "learning_rate": 1.7299675995271656e-05, "loss": 1.2282, "mean_token_accuracy": 0.6706702544031312, "step": 3410 }, { "epoch": 0.31607200703410615, "grad_norm": 4.125, "learning_rate": 1.728862450525845e-05, "loss": 1.1487, "mean_token_accuracy": 0.6949853228962818, "step": 3415 }, { "epoch": 0.3165347771761766, "grad_norm": 5.53125, "learning_rate": 1.7277553991997452e-05, "loss": 1.1354, "mean_token_accuracy": 0.6942270058708415, "step": 3420 }, { "epoch": 0.31699754731824703, "grad_norm": 3.890625, "learning_rate": 1.7266464484382604e-05, "loss": 1.1158, "mean_token_accuracy": 0.6954011741682975, "step": 3425 }, { "epoch": 0.31746031746031744, "grad_norm": 4.15625, "learning_rate": 1.7255356011357428e-05, "loss": 1.1255, "mean_token_accuracy": 0.695083170254403, "step": 3430 }, { "epoch": 0.3179230876023879, "grad_norm": 5.0625, "learning_rate": 1.724422860191495e-05, "loss": 1.1278, "mean_token_accuracy": 0.6971135029354206, "step": 3435 }, { "epoch": 0.3183858577444583, "grad_norm": 4.4375, "learning_rate": 1.7233082285097606e-05, "loss": 1.1753, "mean_token_accuracy": 0.6888698630136986, "step": 3440 }, { "epoch": 0.31884862788652873, "grad_norm": 4.1875, "learning_rate": 1.722191708999719e-05, "loss": 1.0998, "mean_token_accuracy": 0.703375733855186, "step": 3445 }, { "epoch": 0.3193113980285992, "grad_norm": 4.75, "learning_rate": 1.721073304575477e-05, "loss": 1.1866, "mean_token_accuracy": 0.6826320939334638, "step": 3450 }, { "epoch": 0.3197741681706696, "grad_norm": 4.15625, "learning_rate": 1.7199530181560603e-05, "loss": 1.2514, "mean_token_accuracy": 0.6645303326810177, "step": 3455 }, { "epoch": 0.3202369383127401, "grad_norm": 4.09375, "learning_rate": 1.7188308526654066e-05, "loss": 1.1613, "mean_token_accuracy": 0.6913405088062622, "step": 3460 }, { "epoch": 0.3206997084548105, "grad_norm": 4.34375, "learning_rate": 1.7177068110323586e-05, "loss": 1.199, "mean_token_accuracy": 0.6811888454011743, "step": 3465 }, { "epoch": 0.3211624785968809, "grad_norm": 4.46875, "learning_rate": 1.7165808961906552e-05, "loss": 1.173, "mean_token_accuracy": 0.6830234833659492, "step": 3470 }, { "epoch": 0.3216252487389514, "grad_norm": 4.71875, "learning_rate": 1.715453111078925e-05, "loss": 1.12, "mean_token_accuracy": 0.6985567514677105, "step": 3475 }, { "epoch": 0.3220880188810218, "grad_norm": 4.21875, "learning_rate": 1.714323458640677e-05, "loss": 1.1834, "mean_token_accuracy": 0.6837328767123287, "step": 3480 }, { "epoch": 0.32255078902309225, "grad_norm": 4.3125, "learning_rate": 1.7131919418242944e-05, "loss": 1.1945, "mean_token_accuracy": 0.6803082191780823, "step": 3485 }, { "epoch": 0.32301355916516267, "grad_norm": 4.03125, "learning_rate": 1.7120585635830268e-05, "loss": 1.2318, "mean_token_accuracy": 0.6717221135029353, "step": 3490 }, { "epoch": 0.3234763293072331, "grad_norm": 4.21875, "learning_rate": 1.7109233268749818e-05, "loss": 1.1843, "mean_token_accuracy": 0.6787671232876711, "step": 3495 }, { "epoch": 0.32393909944930355, "grad_norm": 4.21875, "learning_rate": 1.7097862346631168e-05, "loss": 1.1901, "mean_token_accuracy": 0.6815802348336594, "step": 3500 }, { "epoch": 0.32393909944930355, "eval_loss": 1.2039048671722412, "eval_mean_token_accuracy": 0.6775700220156554, "eval_runtime": 39.8611, "eval_samples_per_second": 25.689, "eval_steps_per_second": 6.422, "step": 3500 }, { "epoch": 0.32440186959137396, "grad_norm": 4.3125, "learning_rate": 1.7086472899152338e-05, "loss": 1.108, "mean_token_accuracy": 0.698605675146771, "step": 3505 }, { "epoch": 0.32486463973344437, "grad_norm": 5.625, "learning_rate": 1.707506495603968e-05, "loss": 1.1838, "mean_token_accuracy": 0.6824119373776908, "step": 3510 }, { "epoch": 0.32532740987551484, "grad_norm": 4.03125, "learning_rate": 1.7063638547067834e-05, "loss": 1.1176, "mean_token_accuracy": 0.696404109589041, "step": 3515 }, { "epoch": 0.32579018001758525, "grad_norm": 4.03125, "learning_rate": 1.7052193702059626e-05, "loss": 1.1966, "mean_token_accuracy": 0.6806262230919766, "step": 3520 }, { "epoch": 0.3262529501596557, "grad_norm": 4.15625, "learning_rate": 1.704073045088601e-05, "loss": 1.1866, "mean_token_accuracy": 0.68383072407045, "step": 3525 }, { "epoch": 0.32671572030172613, "grad_norm": 3.90625, "learning_rate": 1.7029248823465964e-05, "loss": 1.1307, "mean_token_accuracy": 0.6948385518590998, "step": 3530 }, { "epoch": 0.32717849044379654, "grad_norm": 4.0625, "learning_rate": 1.7017748849766444e-05, "loss": 1.1618, "mean_token_accuracy": 0.6872064579256361, "step": 3535 }, { "epoch": 0.327641260585867, "grad_norm": 4.1875, "learning_rate": 1.7006230559802285e-05, "loss": 1.1773, "mean_token_accuracy": 0.6849559686888453, "step": 3540 }, { "epoch": 0.3281040307279374, "grad_norm": 4.3125, "learning_rate": 1.699469398363612e-05, "loss": 1.1542, "mean_token_accuracy": 0.6927103718199608, "step": 3545 }, { "epoch": 0.3285668008700079, "grad_norm": 4.25, "learning_rate": 1.6983139151378317e-05, "loss": 1.1644, "mean_token_accuracy": 0.6854696673189824, "step": 3550 }, { "epoch": 0.3290295710120783, "grad_norm": 4.875, "learning_rate": 1.697156609318689e-05, "loss": 1.1671, "mean_token_accuracy": 0.6838551859099804, "step": 3555 }, { "epoch": 0.3294923411541487, "grad_norm": 4.25, "learning_rate": 1.6959974839267426e-05, "loss": 1.172, "mean_token_accuracy": 0.6887964774951076, "step": 3560 }, { "epoch": 0.3299551112962192, "grad_norm": 4.0625, "learning_rate": 1.694836541987299e-05, "loss": 1.0501, "mean_token_accuracy": 0.7159980430528379, "step": 3565 }, { "epoch": 0.3304178814382896, "grad_norm": 5.03125, "learning_rate": 1.6936737865304076e-05, "loss": 1.1969, "mean_token_accuracy": 0.6821428571428569, "step": 3570 }, { "epoch": 0.33088065158036, "grad_norm": 4.34375, "learning_rate": 1.6925092205908498e-05, "loss": 1.1325, "mean_token_accuracy": 0.695963796477495, "step": 3575 }, { "epoch": 0.3313434217224305, "grad_norm": 4.125, "learning_rate": 1.6913428472081328e-05, "loss": 1.1688, "mean_token_accuracy": 0.6841487279843443, "step": 3580 }, { "epoch": 0.3318061918645009, "grad_norm": 4.09375, "learning_rate": 1.6901746694264813e-05, "loss": 1.2333, "mean_token_accuracy": 0.6727739726027397, "step": 3585 }, { "epoch": 0.33226896200657136, "grad_norm": 3.984375, "learning_rate": 1.689004690294829e-05, "loss": 1.1309, "mean_token_accuracy": 0.6956457925636006, "step": 3590 }, { "epoch": 0.33273173214864177, "grad_norm": 4.40625, "learning_rate": 1.6878329128668123e-05, "loss": 1.1803, "mean_token_accuracy": 0.6844178082191782, "step": 3595 }, { "epoch": 0.3331945022907122, "grad_norm": 4.25, "learning_rate": 1.6866593402007593e-05, "loss": 1.1479, "mean_token_accuracy": 0.6895792563600782, "step": 3600 }, { "epoch": 0.3331945022907122, "eval_loss": 1.2029253244400024, "eval_mean_token_accuracy": 0.6778910836594911, "eval_runtime": 39.7973, "eval_samples_per_second": 25.73, "eval_steps_per_second": 6.433, "step": 3600 }, { "epoch": 0.33365727243278265, "grad_norm": 5.84375, "learning_rate": 1.685483975359685e-05, "loss": 1.1582, "mean_token_accuracy": 0.6855919765166341, "step": 3605 }, { "epoch": 0.33412004257485306, "grad_norm": 4.28125, "learning_rate": 1.684306821411282e-05, "loss": 1.1925, "mean_token_accuracy": 0.6806751467710372, "step": 3610 }, { "epoch": 0.33458281271692353, "grad_norm": 4.5, "learning_rate": 1.683127881427912e-05, "loss": 1.2141, "mean_token_accuracy": 0.6763943248532291, "step": 3615 }, { "epoch": 0.33504558285899394, "grad_norm": 5.15625, "learning_rate": 1.6819471584865975e-05, "loss": 1.2217, "mean_token_accuracy": 0.6708414872798435, "step": 3620 }, { "epoch": 0.33550835300106435, "grad_norm": 4.4375, "learning_rate": 1.680764655669016e-05, "loss": 1.1538, "mean_token_accuracy": 0.6862035225048924, "step": 3625 }, { "epoch": 0.3359711231431348, "grad_norm": 4.25, "learning_rate": 1.6795803760614896e-05, "loss": 1.1839, "mean_token_accuracy": 0.6820694716242662, "step": 3630 }, { "epoch": 0.33643389328520523, "grad_norm": 4.46875, "learning_rate": 1.6783943227549782e-05, "loss": 1.1995, "mean_token_accuracy": 0.6791340508806261, "step": 3635 }, { "epoch": 0.3368966634272757, "grad_norm": 4.125, "learning_rate": 1.67720649884507e-05, "loss": 1.1999, "mean_token_accuracy": 0.6770303326810174, "step": 3640 }, { "epoch": 0.3373594335693461, "grad_norm": 5.4375, "learning_rate": 1.6760169074319766e-05, "loss": 1.1261, "mean_token_accuracy": 0.6979941291585126, "step": 3645 }, { "epoch": 0.3378222037114165, "grad_norm": 4.34375, "learning_rate": 1.6748255516205206e-05, "loss": 1.1806, "mean_token_accuracy": 0.6835127201565558, "step": 3650 }, { "epoch": 0.338284973853487, "grad_norm": 4.0625, "learning_rate": 1.6736324345201305e-05, "loss": 1.171, "mean_token_accuracy": 0.6826810176125245, "step": 3655 }, { "epoch": 0.3387477439955574, "grad_norm": 5.53125, "learning_rate": 1.672437559244832e-05, "loss": 1.1896, "mean_token_accuracy": 0.6790851272015657, "step": 3660 }, { "epoch": 0.3392105141376278, "grad_norm": 4.46875, "learning_rate": 1.671240928913239e-05, "loss": 1.2495, "mean_token_accuracy": 0.6627446183953034, "step": 3665 }, { "epoch": 0.3396732842796983, "grad_norm": 4.09375, "learning_rate": 1.670042546648547e-05, "loss": 1.1713, "mean_token_accuracy": 0.6846868884540116, "step": 3670 }, { "epoch": 0.3401360544217687, "grad_norm": 4.28125, "learning_rate": 1.668842415578523e-05, "loss": 1.2459, "mean_token_accuracy": 0.6690557729941292, "step": 3675 }, { "epoch": 0.34059882456383916, "grad_norm": 4.71875, "learning_rate": 1.6676405388354993e-05, "loss": 1.1172, "mean_token_accuracy": 0.7006849315068494, "step": 3680 }, { "epoch": 0.3410615947059096, "grad_norm": 3.84375, "learning_rate": 1.6664369195563635e-05, "loss": 1.1467, "mean_token_accuracy": 0.6896771037181997, "step": 3685 }, { "epoch": 0.34152436484798, "grad_norm": 4.0, "learning_rate": 1.665231560882552e-05, "loss": 1.1173, "mean_token_accuracy": 0.698312133072407, "step": 3690 }, { "epoch": 0.34198713499005046, "grad_norm": 4.15625, "learning_rate": 1.6640244659600403e-05, "loss": 1.1783, "mean_token_accuracy": 0.6835616438356164, "step": 3695 }, { "epoch": 0.34244990513212087, "grad_norm": 4.03125, "learning_rate": 1.6628156379393363e-05, "loss": 1.2426, "mean_token_accuracy": 0.6677592954990217, "step": 3700 }, { "epoch": 0.34244990513212087, "eval_loss": 1.2022091150283813, "eval_mean_token_accuracy": 0.6779503271771032, "eval_runtime": 39.8914, "eval_samples_per_second": 25.67, "eval_steps_per_second": 6.417, "step": 3700 }, { "epoch": 0.34291267527419134, "grad_norm": 4.15625, "learning_rate": 1.6616050799754707e-05, "loss": 1.1787, "mean_token_accuracy": 0.6836350293542074, "step": 3705 }, { "epoch": 0.34337544541626175, "grad_norm": 4.34375, "learning_rate": 1.6603927952279894e-05, "loss": 1.1688, "mean_token_accuracy": 0.687133072407045, "step": 3710 }, { "epoch": 0.34383821555833216, "grad_norm": 4.09375, "learning_rate": 1.6591787868609452e-05, "loss": 1.14, "mean_token_accuracy": 0.6931751467710373, "step": 3715 }, { "epoch": 0.34430098570040263, "grad_norm": 3.953125, "learning_rate": 1.6579630580428895e-05, "loss": 1.1972, "mean_token_accuracy": 0.6795499021526418, "step": 3720 }, { "epoch": 0.34476375584247304, "grad_norm": 4.25, "learning_rate": 1.656745611946864e-05, "loss": 1.1608, "mean_token_accuracy": 0.6844422700587085, "step": 3725 }, { "epoch": 0.34522652598454345, "grad_norm": 4.625, "learning_rate": 1.6555264517503933e-05, "loss": 1.2222, "mean_token_accuracy": 0.6742661448140901, "step": 3730 }, { "epoch": 0.3456892961266139, "grad_norm": 4.28125, "learning_rate": 1.6543055806354743e-05, "loss": 1.1495, "mean_token_accuracy": 0.6913405088062623, "step": 3735 }, { "epoch": 0.34615206626868433, "grad_norm": 4.125, "learning_rate": 1.6530830017885707e-05, "loss": 1.1342, "mean_token_accuracy": 0.6939579256360078, "step": 3740 }, { "epoch": 0.3466148364107548, "grad_norm": 3.984375, "learning_rate": 1.6518587184006028e-05, "loss": 1.1554, "mean_token_accuracy": 0.6903620352250491, "step": 3745 }, { "epoch": 0.3470776065528252, "grad_norm": 4.375, "learning_rate": 1.65063273366694e-05, "loss": 1.1136, "mean_token_accuracy": 0.701687866927593, "step": 3750 }, { "epoch": 0.3475403766948956, "grad_norm": 4.59375, "learning_rate": 1.6494050507873913e-05, "loss": 1.1817, "mean_token_accuracy": 0.6777641878669276, "step": 3755 }, { "epoch": 0.3480031468369661, "grad_norm": 4.28125, "learning_rate": 1.6481756729661992e-05, "loss": 1.1612, "mean_token_accuracy": 0.6873043052837574, "step": 3760 }, { "epoch": 0.3484659169790365, "grad_norm": 4.28125, "learning_rate": 1.6469446034120294e-05, "loss": 1.2305, "mean_token_accuracy": 0.6689823874755383, "step": 3765 }, { "epoch": 0.348928687121107, "grad_norm": 4.3125, "learning_rate": 1.6457118453379624e-05, "loss": 1.1344, "mean_token_accuracy": 0.6950831702544031, "step": 3770 }, { "epoch": 0.3493914572631774, "grad_norm": 3.796875, "learning_rate": 1.644477401961486e-05, "loss": 1.2086, "mean_token_accuracy": 0.6719178082191781, "step": 3775 }, { "epoch": 0.3498542274052478, "grad_norm": 4.375, "learning_rate": 1.643241276504488e-05, "loss": 1.2079, "mean_token_accuracy": 0.6781555772994129, "step": 3780 }, { "epoch": 0.35031699754731827, "grad_norm": 4.5625, "learning_rate": 1.642003472193245e-05, "loss": 1.2256, "mean_token_accuracy": 0.6734589041095891, "step": 3785 }, { "epoch": 0.3507797676893887, "grad_norm": 4.21875, "learning_rate": 1.6407639922584148e-05, "loss": 1.0648, "mean_token_accuracy": 0.7079990215264188, "step": 3790 }, { "epoch": 0.3512425378314591, "grad_norm": 4.34375, "learning_rate": 1.6395228399350304e-05, "loss": 1.2342, "mean_token_accuracy": 0.6712328767123288, "step": 3795 }, { "epoch": 0.35170530797352956, "grad_norm": 4.0625, "learning_rate": 1.638280018462488e-05, "loss": 1.1958, "mean_token_accuracy": 0.6822407045009785, "step": 3800 }, { "epoch": 0.35170530797352956, "eval_loss": 1.201136827468872, "eval_mean_token_accuracy": 0.6782580112524459, "eval_runtime": 39.9908, "eval_samples_per_second": 25.606, "eval_steps_per_second": 6.401, "step": 3800 }, { "epoch": 0.35216807811559997, "grad_norm": 4.1875, "learning_rate": 1.6370355310845417e-05, "loss": 1.178, "mean_token_accuracy": 0.6817514677103718, "step": 3805 }, { "epoch": 0.35263084825767044, "grad_norm": 4.625, "learning_rate": 1.635789381049293e-05, "loss": 1.1346, "mean_token_accuracy": 0.6927592954990216, "step": 3810 }, { "epoch": 0.35309361839974085, "grad_norm": 5.09375, "learning_rate": 1.634541571609182e-05, "loss": 1.1909, "mean_token_accuracy": 0.6781311154598826, "step": 3815 }, { "epoch": 0.35355638854181126, "grad_norm": 4.25, "learning_rate": 1.6332921060209817e-05, "loss": 1.206, "mean_token_accuracy": 0.6751467710371821, "step": 3820 }, { "epoch": 0.35401915868388173, "grad_norm": 4.6875, "learning_rate": 1.6320409875457864e-05, "loss": 1.1902, "mean_token_accuracy": 0.6793786692759296, "step": 3825 }, { "epoch": 0.35448192882595214, "grad_norm": 4.4375, "learning_rate": 1.6307882194490038e-05, "loss": 1.2337, "mean_token_accuracy": 0.6692025440313112, "step": 3830 }, { "epoch": 0.3549446989680226, "grad_norm": 4.09375, "learning_rate": 1.629533805000349e-05, "loss": 1.1976, "mean_token_accuracy": 0.6805528375733855, "step": 3835 }, { "epoch": 0.355407469110093, "grad_norm": 4.59375, "learning_rate": 1.6282777474738324e-05, "loss": 1.2302, "mean_token_accuracy": 0.6700097847358121, "step": 3840 }, { "epoch": 0.35587023925216343, "grad_norm": 4.28125, "learning_rate": 1.627020050147754e-05, "loss": 1.1248, "mean_token_accuracy": 0.6969178082191781, "step": 3845 }, { "epoch": 0.3563330093942339, "grad_norm": 4.53125, "learning_rate": 1.6257607163046925e-05, "loss": 1.0962, "mean_token_accuracy": 0.7045254403131115, "step": 3850 }, { "epoch": 0.3567957795363043, "grad_norm": 4.1875, "learning_rate": 1.624499749231499e-05, "loss": 1.202, "mean_token_accuracy": 0.6787915851272016, "step": 3855 }, { "epoch": 0.3572585496783747, "grad_norm": 4.90625, "learning_rate": 1.6232371522192862e-05, "loss": 1.1437, "mean_token_accuracy": 0.6892857142857143, "step": 3860 }, { "epoch": 0.3577213198204452, "grad_norm": 4.40625, "learning_rate": 1.6219729285634217e-05, "loss": 1.2135, "mean_token_accuracy": 0.6767857142857141, "step": 3865 }, { "epoch": 0.3581840899625156, "grad_norm": 4.21875, "learning_rate": 1.620707081563519e-05, "loss": 1.14, "mean_token_accuracy": 0.6881849315068495, "step": 3870 }, { "epoch": 0.3586468601045861, "grad_norm": 4.4375, "learning_rate": 1.6194396145234273e-05, "loss": 1.2021, "mean_token_accuracy": 0.6778375733855185, "step": 3875 }, { "epoch": 0.3591096302466565, "grad_norm": 4.25, "learning_rate": 1.6181705307512254e-05, "loss": 1.2246, "mean_token_accuracy": 0.6685665362035225, "step": 3880 }, { "epoch": 0.3595724003887269, "grad_norm": 4.09375, "learning_rate": 1.6168998335592104e-05, "loss": 1.2096, "mean_token_accuracy": 0.6749266144814089, "step": 3885 }, { "epoch": 0.36003517053079737, "grad_norm": 4.28125, "learning_rate": 1.6156275262638918e-05, "loss": 1.1565, "mean_token_accuracy": 0.6892612524461839, "step": 3890 }, { "epoch": 0.3604979406728678, "grad_norm": 5.78125, "learning_rate": 1.6143536121859805e-05, "loss": 1.1702, "mean_token_accuracy": 0.6888209393346381, "step": 3895 }, { "epoch": 0.36096071081493825, "grad_norm": 4.375, "learning_rate": 1.613078094650381e-05, "loss": 1.176, "mean_token_accuracy": 0.6833904109589042, "step": 3900 }, { "epoch": 0.36096071081493825, "eval_loss": 1.200246810913086, "eval_mean_token_accuracy": 0.6785886283023482, "eval_runtime": 39.8766, "eval_samples_per_second": 25.679, "eval_steps_per_second": 6.42, "step": 3900 }, { "epoch": 0.36142348095700866, "grad_norm": 4.09375, "learning_rate": 1.611800976986184e-05, "loss": 1.1894, "mean_token_accuracy": 0.6857632093933463, "step": 3905 }, { "epoch": 0.36188625109907907, "grad_norm": 4.15625, "learning_rate": 1.610522262526655e-05, "loss": 1.142, "mean_token_accuracy": 0.6924168297455968, "step": 3910 }, { "epoch": 0.36234902124114954, "grad_norm": 4.15625, "learning_rate": 1.6092419546092282e-05, "loss": 1.1669, "mean_token_accuracy": 0.6890900195694716, "step": 3915 }, { "epoch": 0.36281179138321995, "grad_norm": 4.09375, "learning_rate": 1.6079600565754963e-05, "loss": 1.1152, "mean_token_accuracy": 0.6994129158512721, "step": 3920 }, { "epoch": 0.36327456152529036, "grad_norm": 4.28125, "learning_rate": 1.6066765717712024e-05, "loss": 1.2237, "mean_token_accuracy": 0.6741927592954989, "step": 3925 }, { "epoch": 0.36373733166736083, "grad_norm": 4.96875, "learning_rate": 1.60539150354623e-05, "loss": 1.2182, "mean_token_accuracy": 0.6732387475538161, "step": 3930 }, { "epoch": 0.36420010180943124, "grad_norm": 5.1875, "learning_rate": 1.6041048552545974e-05, "loss": 1.1273, "mean_token_accuracy": 0.6962818003913895, "step": 3935 }, { "epoch": 0.3646628719515017, "grad_norm": 4.1875, "learning_rate": 1.6028166302544455e-05, "loss": 1.1543, "mean_token_accuracy": 0.6913649706457924, "step": 3940 }, { "epoch": 0.3651256420935721, "grad_norm": 4.375, "learning_rate": 1.6015268319080294e-05, "loss": 1.2027, "mean_token_accuracy": 0.6802103718199609, "step": 3945 }, { "epoch": 0.36558841223564253, "grad_norm": 4.8125, "learning_rate": 1.6002354635817132e-05, "loss": 1.1282, "mean_token_accuracy": 0.6963796477495107, "step": 3950 }, { "epoch": 0.366051182377713, "grad_norm": 4.28125, "learning_rate": 1.5989425286459565e-05, "loss": 1.2102, "mean_token_accuracy": 0.6767857142857142, "step": 3955 }, { "epoch": 0.3665139525197834, "grad_norm": 4.5625, "learning_rate": 1.5976480304753083e-05, "loss": 1.1851, "mean_token_accuracy": 0.6797455968688846, "step": 3960 }, { "epoch": 0.3669767226618539, "grad_norm": 4.21875, "learning_rate": 1.5963519724483982e-05, "loss": 1.1647, "mean_token_accuracy": 0.6853473581213306, "step": 3965 }, { "epoch": 0.3674394928039243, "grad_norm": 4.0, "learning_rate": 1.5950543579479268e-05, "loss": 1.1731, "mean_token_accuracy": 0.6839041095890411, "step": 3970 }, { "epoch": 0.3679022629459947, "grad_norm": 4.1875, "learning_rate": 1.593755190360656e-05, "loss": 1.2058, "mean_token_accuracy": 0.67573385518591, "step": 3975 }, { "epoch": 0.3683650330880652, "grad_norm": 4.125, "learning_rate": 1.5924544730774033e-05, "loss": 1.2266, "mean_token_accuracy": 0.6731898238747555, "step": 3980 }, { "epoch": 0.3688278032301356, "grad_norm": 4.09375, "learning_rate": 1.5911522094930284e-05, "loss": 1.1856, "mean_token_accuracy": 0.6786692759295498, "step": 3985 }, { "epoch": 0.369290573372206, "grad_norm": 4.15625, "learning_rate": 1.5898484030064293e-05, "loss": 1.1581, "mean_token_accuracy": 0.6912915851272016, "step": 3990 }, { "epoch": 0.36975334351427647, "grad_norm": 4.15625, "learning_rate": 1.5885430570205288e-05, "loss": 1.2133, "mean_token_accuracy": 0.675440313111546, "step": 3995 }, { "epoch": 0.3702161136563469, "grad_norm": 4.34375, "learning_rate": 1.5872361749422694e-05, "loss": 1.1704, "mean_token_accuracy": 0.6837084148727987, "step": 4000 }, { "epoch": 0.3702161136563469, "eval_loss": 1.1994986534118652, "eval_mean_token_accuracy": 0.6786172945205474, "eval_runtime": 39.8103, "eval_samples_per_second": 25.722, "eval_steps_per_second": 6.43, "step": 4000 }, { "epoch": 0.37067888379841735, "grad_norm": 4.375, "learning_rate": 1.5859277601826023e-05, "loss": 1.1967, "mean_token_accuracy": 0.6802103718199609, "step": 4005 }, { "epoch": 0.37114165394048776, "grad_norm": 4.0625, "learning_rate": 1.584617816156479e-05, "loss": 1.1467, "mean_token_accuracy": 0.6870596868884541, "step": 4010 }, { "epoch": 0.37160442408255817, "grad_norm": 4.09375, "learning_rate": 1.5833063462828416e-05, "loss": 1.1846, "mean_token_accuracy": 0.6823874755381603, "step": 4015 }, { "epoch": 0.37206719422462864, "grad_norm": 3.953125, "learning_rate": 1.5819933539846153e-05, "loss": 1.166, "mean_token_accuracy": 0.6839285714285713, "step": 4020 }, { "epoch": 0.37252996436669905, "grad_norm": 4.375, "learning_rate": 1.5806788426886994e-05, "loss": 1.1069, "mean_token_accuracy": 0.6975048923679062, "step": 4025 }, { "epoch": 0.3729927345087695, "grad_norm": 4.1875, "learning_rate": 1.5793628158259565e-05, "loss": 1.1921, "mean_token_accuracy": 0.6814579256360076, "step": 4030 }, { "epoch": 0.37345550465083993, "grad_norm": 4.28125, "learning_rate": 1.578045276831206e-05, "loss": 1.1902, "mean_token_accuracy": 0.6864726027397259, "step": 4035 }, { "epoch": 0.37391827479291034, "grad_norm": 4.15625, "learning_rate": 1.5767262291432135e-05, "loss": 1.1161, "mean_token_accuracy": 0.6972358121330724, "step": 4040 }, { "epoch": 0.3743810449349808, "grad_norm": 4.34375, "learning_rate": 1.575405676204682e-05, "loss": 1.2313, "mean_token_accuracy": 0.6727739726027397, "step": 4045 }, { "epoch": 0.3748438150770512, "grad_norm": 4.15625, "learning_rate": 1.574083621462243e-05, "loss": 1.1116, "mean_token_accuracy": 0.7001467710371819, "step": 4050 }, { "epoch": 0.37530658521912164, "grad_norm": 5.53125, "learning_rate": 1.5727600683664488e-05, "loss": 1.0904, "mean_token_accuracy": 0.7054549902152643, "step": 4055 }, { "epoch": 0.3757693553611921, "grad_norm": 4.375, "learning_rate": 1.5714350203717614e-05, "loss": 1.2259, "mean_token_accuracy": 0.6711839530332682, "step": 4060 }, { "epoch": 0.3762321255032625, "grad_norm": 4.75, "learning_rate": 1.5701084809365447e-05, "loss": 1.1662, "mean_token_accuracy": 0.6862769080234834, "step": 4065 }, { "epoch": 0.376694895645333, "grad_norm": 4.1875, "learning_rate": 1.568780453523055e-05, "loss": 1.1706, "mean_token_accuracy": 0.6868884540117416, "step": 4070 }, { "epoch": 0.3771576657874034, "grad_norm": 4.625, "learning_rate": 1.567450941597433e-05, "loss": 1.097, "mean_token_accuracy": 0.7032778864970647, "step": 4075 }, { "epoch": 0.3776204359294738, "grad_norm": 3.84375, "learning_rate": 1.566119948629694e-05, "loss": 1.1584, "mean_token_accuracy": 0.6893590998043052, "step": 4080 }, { "epoch": 0.3780832060715443, "grad_norm": 4.1875, "learning_rate": 1.5647874780937173e-05, "loss": 1.1672, "mean_token_accuracy": 0.6925880626223092, "step": 4085 }, { "epoch": 0.3785459762136147, "grad_norm": 4.09375, "learning_rate": 1.5634535334672397e-05, "loss": 1.1927, "mean_token_accuracy": 0.6797211350293542, "step": 4090 }, { "epoch": 0.37900874635568516, "grad_norm": 4.15625, "learning_rate": 1.5621181182318458e-05, "loss": 1.1256, "mean_token_accuracy": 0.6951320939334639, "step": 4095 }, { "epoch": 0.37947151649775557, "grad_norm": 4.25, "learning_rate": 1.5607812358729576e-05, "loss": 1.1266, "mean_token_accuracy": 0.698091976516634, "step": 4100 }, { "epoch": 0.37947151649775557, "eval_loss": 1.198913335800171, "eval_mean_token_accuracy": 0.6786115612769077, "eval_runtime": 41.0029, "eval_samples_per_second": 24.974, "eval_steps_per_second": 6.243, "step": 4100 }, { "epoch": 0.379934286639826, "grad_norm": 3.984375, "learning_rate": 1.5594428898798272e-05, "loss": 1.1516, "mean_token_accuracy": 0.6905332681017613, "step": 4105 }, { "epoch": 0.38039705678189645, "grad_norm": 5.09375, "learning_rate": 1.558103083745526e-05, "loss": 1.1861, "mean_token_accuracy": 0.6829745596868884, "step": 4110 }, { "epoch": 0.38085982692396686, "grad_norm": 4.125, "learning_rate": 1.5567618209669358e-05, "loss": 1.1936, "mean_token_accuracy": 0.6824608610567514, "step": 4115 }, { "epoch": 0.38132259706603727, "grad_norm": 4.3125, "learning_rate": 1.5554191050447422e-05, "loss": 1.2183, "mean_token_accuracy": 0.668101761252446, "step": 4120 }, { "epoch": 0.38178536720810774, "grad_norm": 4.4375, "learning_rate": 1.5540749394834213e-05, "loss": 1.111, "mean_token_accuracy": 0.7018101761252448, "step": 4125 }, { "epoch": 0.38224813735017815, "grad_norm": 4.25, "learning_rate": 1.5527293277912344e-05, "loss": 1.1783, "mean_token_accuracy": 0.6841487279843445, "step": 4130 }, { "epoch": 0.3827109074922486, "grad_norm": 4.03125, "learning_rate": 1.5513822734802163e-05, "loss": 1.1318, "mean_token_accuracy": 0.6955479452054796, "step": 4135 }, { "epoch": 0.38317367763431903, "grad_norm": 3.953125, "learning_rate": 1.550033780066167e-05, "loss": 1.1284, "mean_token_accuracy": 0.6951810176125245, "step": 4140 }, { "epoch": 0.38363644777638944, "grad_norm": 4.125, "learning_rate": 1.5486838510686436e-05, "loss": 1.1476, "mean_token_accuracy": 0.6907534246575342, "step": 4145 }, { "epoch": 0.3840992179184599, "grad_norm": 4.21875, "learning_rate": 1.5473324900109484e-05, "loss": 1.1688, "mean_token_accuracy": 0.685004892367906, "step": 4150 }, { "epoch": 0.3845619880605303, "grad_norm": 4.28125, "learning_rate": 1.5459797004201228e-05, "loss": 1.1326, "mean_token_accuracy": 0.6939090019569473, "step": 4155 }, { "epoch": 0.3850247582026008, "grad_norm": 4.09375, "learning_rate": 1.544625485826936e-05, "loss": 1.1047, "mean_token_accuracy": 0.7015900195694715, "step": 4160 }, { "epoch": 0.3854875283446712, "grad_norm": 4.28125, "learning_rate": 1.5432698497658765e-05, "loss": 1.16, "mean_token_accuracy": 0.6914138943248532, "step": 4165 }, { "epoch": 0.3859502984867416, "grad_norm": 4.1875, "learning_rate": 1.5419127957751426e-05, "loss": 1.1965, "mean_token_accuracy": 0.6807485322896281, "step": 4170 }, { "epoch": 0.3864130686288121, "grad_norm": 4.28125, "learning_rate": 1.540554327396634e-05, "loss": 1.1782, "mean_token_accuracy": 0.6795009784735812, "step": 4175 }, { "epoch": 0.3868758387708825, "grad_norm": 5.09375, "learning_rate": 1.5391944481759412e-05, "loss": 1.0831, "mean_token_accuracy": 0.7051614481409002, "step": 4180 }, { "epoch": 0.3873386089129529, "grad_norm": 4.09375, "learning_rate": 1.537833161662338e-05, "loss": 1.1961, "mean_token_accuracy": 0.6801369863013698, "step": 4185 }, { "epoch": 0.3878013790550234, "grad_norm": 4.0625, "learning_rate": 1.5364704714087697e-05, "loss": 1.1952, "mean_token_accuracy": 0.6788160469667319, "step": 4190 }, { "epoch": 0.3882641491970938, "grad_norm": 4.5, "learning_rate": 1.535106380971847e-05, "loss": 1.1714, "mean_token_accuracy": 0.6888698630136986, "step": 4195 }, { "epoch": 0.38872691933916426, "grad_norm": 5.5625, "learning_rate": 1.533740893911834e-05, "loss": 1.2003, "mean_token_accuracy": 0.6781066536203523, "step": 4200 }, { "epoch": 0.38872691933916426, "eval_loss": 1.197932243347168, "eval_mean_token_accuracy": 0.6786765380381597, "eval_runtime": 39.7476, "eval_samples_per_second": 25.763, "eval_steps_per_second": 6.441, "step": 4200 }, { "epoch": 0.38918968948123467, "grad_norm": 4.78125, "learning_rate": 1.5323740137926403e-05, "loss": 1.2297, "mean_token_accuracy": 0.6704500978473582, "step": 4205 }, { "epoch": 0.3896524596233051, "grad_norm": 4.21875, "learning_rate": 1.5310057441818115e-05, "loss": 1.1424, "mean_token_accuracy": 0.6945205479452055, "step": 4210 }, { "epoch": 0.39011522976537555, "grad_norm": 4.28125, "learning_rate": 1.529636088650519e-05, "loss": 1.1986, "mean_token_accuracy": 0.6741927592954989, "step": 4215 }, { "epoch": 0.39057799990744596, "grad_norm": 4.1875, "learning_rate": 1.5282650507735528e-05, "loss": 1.2247, "mean_token_accuracy": 0.6718199608610568, "step": 4220 }, { "epoch": 0.39104077004951643, "grad_norm": 4.09375, "learning_rate": 1.5268926341293098e-05, "loss": 1.0853, "mean_token_accuracy": 0.7066046966731899, "step": 4225 }, { "epoch": 0.39150354019158684, "grad_norm": 4.1875, "learning_rate": 1.5255188422997855e-05, "loss": 1.1573, "mean_token_accuracy": 0.6914872798434442, "step": 4230 }, { "epoch": 0.39196631033365725, "grad_norm": 4.75, "learning_rate": 1.5241436788705657e-05, "loss": 1.1782, "mean_token_accuracy": 0.6852250489236791, "step": 4235 }, { "epoch": 0.3924290804757277, "grad_norm": 4.34375, "learning_rate": 1.5227671474308143e-05, "loss": 1.2245, "mean_token_accuracy": 0.6752935420743639, "step": 4240 }, { "epoch": 0.39289185061779813, "grad_norm": 4.15625, "learning_rate": 1.5213892515732664e-05, "loss": 1.1736, "mean_token_accuracy": 0.6840264187866929, "step": 4245 }, { "epoch": 0.3933546207598686, "grad_norm": 4.21875, "learning_rate": 1.5200099948942198e-05, "loss": 1.1166, "mean_token_accuracy": 0.6976272015655578, "step": 4250 }, { "epoch": 0.393817390901939, "grad_norm": 4.53125, "learning_rate": 1.5186293809935211e-05, "loss": 1.2036, "mean_token_accuracy": 0.6782778864970647, "step": 4255 }, { "epoch": 0.3942801610440094, "grad_norm": 4.46875, "learning_rate": 1.517247413474562e-05, "loss": 1.1768, "mean_token_accuracy": 0.6811154598825833, "step": 4260 }, { "epoch": 0.3947429311860799, "grad_norm": 4.5, "learning_rate": 1.5158640959442654e-05, "loss": 1.1227, "mean_token_accuracy": 0.6970890410958904, "step": 4265 }, { "epoch": 0.3952057013281503, "grad_norm": 4.3125, "learning_rate": 1.5144794320130784e-05, "loss": 1.198, "mean_token_accuracy": 0.6764677103718199, "step": 4270 }, { "epoch": 0.3956684714702207, "grad_norm": 4.3125, "learning_rate": 1.5130934252949621e-05, "loss": 1.0794, "mean_token_accuracy": 0.706042074363992, "step": 4275 }, { "epoch": 0.3961312416122912, "grad_norm": 4.5, "learning_rate": 1.5117060794073815e-05, "loss": 1.0933, "mean_token_accuracy": 0.7039383561643836, "step": 4280 }, { "epoch": 0.3965940117543616, "grad_norm": 4.1875, "learning_rate": 1.5103173979712987e-05, "loss": 1.135, "mean_token_accuracy": 0.6971624266144816, "step": 4285 }, { "epoch": 0.39705678189643206, "grad_norm": 4.53125, "learning_rate": 1.5089273846111595e-05, "loss": 1.1556, "mean_token_accuracy": 0.6928082191780822, "step": 4290 }, { "epoch": 0.3975195520385025, "grad_norm": 4.375, "learning_rate": 1.5075360429548873e-05, "loss": 1.1596, "mean_token_accuracy": 0.6869618395303325, "step": 4295 }, { "epoch": 0.3979823221805729, "grad_norm": 4.21875, "learning_rate": 1.506143376633872e-05, "loss": 1.1887, "mean_token_accuracy": 0.6817514677103719, "step": 4300 }, { "epoch": 0.3979823221805729, "eval_loss": 1.1971828937530518, "eval_mean_token_accuracy": 0.6789632002201559, "eval_runtime": 39.776, "eval_samples_per_second": 25.744, "eval_steps_per_second": 6.436, "step": 4300 }, { "epoch": 0.39844509232264336, "grad_norm": 4.09375, "learning_rate": 1.5047493892829605e-05, "loss": 1.1185, "mean_token_accuracy": 0.6980675146771038, "step": 4305 }, { "epoch": 0.39890786246471377, "grad_norm": 4.0625, "learning_rate": 1.5033540845404484e-05, "loss": 1.1452, "mean_token_accuracy": 0.6898972602739727, "step": 4310 }, { "epoch": 0.39937063260678424, "grad_norm": 4.4375, "learning_rate": 1.5019574660480685e-05, "loss": 1.1822, "mean_token_accuracy": 0.6843199608610567, "step": 4315 }, { "epoch": 0.39983340274885465, "grad_norm": 4.5, "learning_rate": 1.5005595374509838e-05, "loss": 1.139, "mean_token_accuracy": 0.6946673189823875, "step": 4320 }, { "epoch": 0.40029617289092506, "grad_norm": 4.34375, "learning_rate": 1.4991603023977757e-05, "loss": 1.2851, "mean_token_accuracy": 0.6592465753424659, "step": 4325 }, { "epoch": 0.40075894303299553, "grad_norm": 4.03125, "learning_rate": 1.4977597645404361e-05, "loss": 1.135, "mean_token_accuracy": 0.6938111545988258, "step": 4330 }, { "epoch": 0.40122171317506594, "grad_norm": 4.21875, "learning_rate": 1.4963579275343563e-05, "loss": 1.1443, "mean_token_accuracy": 0.6933463796477496, "step": 4335 }, { "epoch": 0.40168448331713635, "grad_norm": 4.84375, "learning_rate": 1.4949547950383191e-05, "loss": 1.1952, "mean_token_accuracy": 0.6816780821917808, "step": 4340 }, { "epoch": 0.4021472534592068, "grad_norm": 4.40625, "learning_rate": 1.4935503707144885e-05, "loss": 1.1798, "mean_token_accuracy": 0.6851027397260274, "step": 4345 }, { "epoch": 0.40261002360127723, "grad_norm": 4.4375, "learning_rate": 1.4921446582284e-05, "loss": 1.1885, "mean_token_accuracy": 0.6840019569471624, "step": 4350 }, { "epoch": 0.4030727937433477, "grad_norm": 4.125, "learning_rate": 1.4907376612489512e-05, "loss": 1.096, "mean_token_accuracy": 0.7068248532289626, "step": 4355 }, { "epoch": 0.4035355638854181, "grad_norm": 4.5625, "learning_rate": 1.4893293834483921e-05, "loss": 1.2284, "mean_token_accuracy": 0.6726761252446184, "step": 4360 }, { "epoch": 0.4039983340274885, "grad_norm": 4.3125, "learning_rate": 1.4879198285023159e-05, "loss": 1.1507, "mean_token_accuracy": 0.6901663405088063, "step": 4365 }, { "epoch": 0.404461104169559, "grad_norm": 4.0625, "learning_rate": 1.486509000089649e-05, "loss": 1.1342, "mean_token_accuracy": 0.6935176125244616, "step": 4370 }, { "epoch": 0.4049238743116294, "grad_norm": 4.1875, "learning_rate": 1.4850969018926412e-05, "loss": 1.082, "mean_token_accuracy": 0.7097847358121331, "step": 4375 }, { "epoch": 0.4053866444536999, "grad_norm": 4.25, "learning_rate": 1.483683537596857e-05, "loss": 1.1304, "mean_token_accuracy": 0.6954745596868885, "step": 4380 }, { "epoch": 0.4058494145957703, "grad_norm": 4.1875, "learning_rate": 1.4822689108911652e-05, "loss": 1.1472, "mean_token_accuracy": 0.6887720156555772, "step": 4385 }, { "epoch": 0.4063121847378407, "grad_norm": 4.09375, "learning_rate": 1.4808530254677296e-05, "loss": 1.1901, "mean_token_accuracy": 0.6825342465753426, "step": 4390 }, { "epoch": 0.40677495487991117, "grad_norm": 4.125, "learning_rate": 1.4794358850219985e-05, "loss": 1.1188, "mean_token_accuracy": 0.6968444227005872, "step": 4395 }, { "epoch": 0.4072377250219816, "grad_norm": 4.1875, "learning_rate": 1.478017493252697e-05, "loss": 1.1712, "mean_token_accuracy": 0.6821183953033269, "step": 4400 }, { "epoch": 0.4072377250219816, "eval_loss": 1.1967071294784546, "eval_mean_token_accuracy": 0.6790759540117411, "eval_runtime": 39.6182, "eval_samples_per_second": 25.847, "eval_steps_per_second": 6.462, "step": 4400 }, { "epoch": 0.407700495164052, "grad_norm": 4.1875, "learning_rate": 1.4765978538618152e-05, "loss": 1.2021, "mean_token_accuracy": 0.6770792563600783, "step": 4405 }, { "epoch": 0.40816326530612246, "grad_norm": 9.0, "learning_rate": 1.4751769705545997e-05, "loss": 1.16, "mean_token_accuracy": 0.6855430528375732, "step": 4410 }, { "epoch": 0.40862603544819287, "grad_norm": 3.890625, "learning_rate": 1.473754847039544e-05, "loss": 1.2051, "mean_token_accuracy": 0.6745596868884541, "step": 4415 }, { "epoch": 0.40908880559026334, "grad_norm": 4.125, "learning_rate": 1.4723314870283783e-05, "loss": 1.1739, "mean_token_accuracy": 0.6843688845401174, "step": 4420 }, { "epoch": 0.40955157573233375, "grad_norm": 4.1875, "learning_rate": 1.4709068942360602e-05, "loss": 1.1479, "mean_token_accuracy": 0.6886252446183954, "step": 4425 }, { "epoch": 0.41001434587440416, "grad_norm": 4.09375, "learning_rate": 1.4694810723807647e-05, "loss": 1.1958, "mean_token_accuracy": 0.6782045009784736, "step": 4430 }, { "epoch": 0.41047711601647463, "grad_norm": 4.0625, "learning_rate": 1.4680540251838742e-05, "loss": 1.1592, "mean_token_accuracy": 0.684344422700587, "step": 4435 }, { "epoch": 0.41093988615854504, "grad_norm": 4.1875, "learning_rate": 1.4666257563699704e-05, "loss": 1.1457, "mean_token_accuracy": 0.6899461839530333, "step": 4440 }, { "epoch": 0.4114026563006155, "grad_norm": 4.1875, "learning_rate": 1.4651962696668225e-05, "loss": 1.1054, "mean_token_accuracy": 0.7034735812133073, "step": 4445 }, { "epoch": 0.4118654264426859, "grad_norm": 3.921875, "learning_rate": 1.4637655688053785e-05, "loss": 1.2059, "mean_token_accuracy": 0.6777641878669276, "step": 4450 }, { "epoch": 0.41232819658475633, "grad_norm": 4.125, "learning_rate": 1.4623336575197555e-05, "loss": 1.0844, "mean_token_accuracy": 0.702495107632094, "step": 4455 }, { "epoch": 0.4127909667268268, "grad_norm": 4.28125, "learning_rate": 1.4609005395472298e-05, "loss": 1.1393, "mean_token_accuracy": 0.693248532289628, "step": 4460 }, { "epoch": 0.4132537368688972, "grad_norm": 4.46875, "learning_rate": 1.4594662186282271e-05, "loss": 1.1586, "mean_token_accuracy": 0.6886986301369864, "step": 4465 }, { "epoch": 0.4137165070109676, "grad_norm": 4.40625, "learning_rate": 1.4580306985063124e-05, "loss": 1.1519, "mean_token_accuracy": 0.6895303326810176, "step": 4470 }, { "epoch": 0.4141792771530381, "grad_norm": 5.1875, "learning_rate": 1.4565939829281814e-05, "loss": 1.1238, "mean_token_accuracy": 0.6974315068493151, "step": 4475 }, { "epoch": 0.4146420472951085, "grad_norm": 4.03125, "learning_rate": 1.4551560756436494e-05, "loss": 1.1641, "mean_token_accuracy": 0.6848825831702543, "step": 4480 }, { "epoch": 0.415104817437179, "grad_norm": 4.28125, "learning_rate": 1.453716980405642e-05, "loss": 1.1501, "mean_token_accuracy": 0.6910469667318982, "step": 4485 }, { "epoch": 0.4155675875792494, "grad_norm": 4.125, "learning_rate": 1.4522767009701857e-05, "loss": 1.0987, "mean_token_accuracy": 0.7027886497064582, "step": 4490 }, { "epoch": 0.4160303577213198, "grad_norm": 4.25, "learning_rate": 1.4508352410963976e-05, "loss": 1.1655, "mean_token_accuracy": 0.6876712328767123, "step": 4495 }, { "epoch": 0.41649312786339027, "grad_norm": 4.90625, "learning_rate": 1.4493926045464752e-05, "loss": 1.1452, "mean_token_accuracy": 0.6912181996086105, "step": 4500 }, { "epoch": 0.41649312786339027, "eval_loss": 1.195981502532959, "eval_mean_token_accuracy": 0.6790778650929545, "eval_runtime": 39.695, "eval_samples_per_second": 25.797, "eval_steps_per_second": 6.449, "step": 4500 }, { "epoch": 0.4169558980054607, "grad_norm": 5.0, "learning_rate": 1.4479487950856881e-05, "loss": 1.2293, "mean_token_accuracy": 0.6676614481409002, "step": 4505 }, { "epoch": 0.41741866814753115, "grad_norm": 4.875, "learning_rate": 1.4465038164823665e-05, "loss": 1.0922, "mean_token_accuracy": 0.7034980430528376, "step": 4510 }, { "epoch": 0.41788143828960156, "grad_norm": 4.25, "learning_rate": 1.4450576725078923e-05, "loss": 1.1747, "mean_token_accuracy": 0.6855675146771038, "step": 4515 }, { "epoch": 0.41834420843167197, "grad_norm": 4.03125, "learning_rate": 1.443610366936689e-05, "loss": 1.1736, "mean_token_accuracy": 0.6843199608610566, "step": 4520 }, { "epoch": 0.41880697857374244, "grad_norm": 4.125, "learning_rate": 1.4421619035462115e-05, "loss": 1.1231, "mean_token_accuracy": 0.6974559686888454, "step": 4525 }, { "epoch": 0.41926974871581285, "grad_norm": 4.03125, "learning_rate": 1.440712286116937e-05, "loss": 1.1819, "mean_token_accuracy": 0.6809197651663406, "step": 4530 }, { "epoch": 0.41973251885788326, "grad_norm": 4.09375, "learning_rate": 1.4392615184323545e-05, "loss": 1.1088, "mean_token_accuracy": 0.7005381604696674, "step": 4535 }, { "epoch": 0.42019528899995373, "grad_norm": 3.953125, "learning_rate": 1.4378096042789558e-05, "loss": 1.1652, "mean_token_accuracy": 0.6881360078277885, "step": 4540 }, { "epoch": 0.42065805914202414, "grad_norm": 4.65625, "learning_rate": 1.436356547446224e-05, "loss": 1.1588, "mean_token_accuracy": 0.6869618395303327, "step": 4545 }, { "epoch": 0.4211208292840946, "grad_norm": 3.890625, "learning_rate": 1.434902351726625e-05, "loss": 1.1637, "mean_token_accuracy": 0.6872309197651664, "step": 4550 }, { "epoch": 0.421583599426165, "grad_norm": 6.1875, "learning_rate": 1.4334470209155978e-05, "loss": 1.176, "mean_token_accuracy": 0.6808219178082192, "step": 4555 }, { "epoch": 0.42204636956823544, "grad_norm": 4.21875, "learning_rate": 1.4319905588115428e-05, "loss": 1.1496, "mean_token_accuracy": 0.6908512720156554, "step": 4560 }, { "epoch": 0.4225091397103059, "grad_norm": 4.21875, "learning_rate": 1.4305329692158136e-05, "loss": 1.1492, "mean_token_accuracy": 0.6928082191780822, "step": 4565 }, { "epoch": 0.4229719098523763, "grad_norm": 4.40625, "learning_rate": 1.4290742559327072e-05, "loss": 1.1318, "mean_token_accuracy": 0.6970890410958903, "step": 4570 }, { "epoch": 0.4234346799944468, "grad_norm": 4.375, "learning_rate": 1.4276144227694522e-05, "loss": 1.2522, "mean_token_accuracy": 0.6642857142857144, "step": 4575 }, { "epoch": 0.4238974501365172, "grad_norm": 4.21875, "learning_rate": 1.4261534735362013e-05, "loss": 1.2377, "mean_token_accuracy": 0.6699119373776907, "step": 4580 }, { "epoch": 0.4243602202785876, "grad_norm": 4.46875, "learning_rate": 1.4246914120460193e-05, "loss": 1.213, "mean_token_accuracy": 0.6756115459882583, "step": 4585 }, { "epoch": 0.4248229904206581, "grad_norm": 4.78125, "learning_rate": 1.423228242114874e-05, "loss": 1.2573, "mean_token_accuracy": 0.6641144814090019, "step": 4590 }, { "epoch": 0.4252857605627285, "grad_norm": 4.84375, "learning_rate": 1.4217639675616268e-05, "loss": 1.2595, "mean_token_accuracy": 0.6620596868884541, "step": 4595 }, { "epoch": 0.4257485307047989, "grad_norm": 3.96875, "learning_rate": 1.4202985922080212e-05, "loss": 1.1591, "mean_token_accuracy": 0.6869618395303327, "step": 4600 }, { "epoch": 0.4257485307047989, "eval_loss": 1.1954898834228516, "eval_mean_token_accuracy": 0.679412304305283, "eval_runtime": 41.2625, "eval_samples_per_second": 24.817, "eval_steps_per_second": 6.204, "step": 4600 }, { "epoch": 0.42621130084686937, "grad_norm": 4.46875, "learning_rate": 1.4188321198786752e-05, "loss": 1.1797, "mean_token_accuracy": 0.6843199608610566, "step": 4605 }, { "epoch": 0.4266740709889398, "grad_norm": 4.15625, "learning_rate": 1.4173645544010682e-05, "loss": 1.1967, "mean_token_accuracy": 0.6746575342465755, "step": 4610 }, { "epoch": 0.42713684113101025, "grad_norm": 4.40625, "learning_rate": 1.4158958996055343e-05, "loss": 1.1224, "mean_token_accuracy": 0.697358121330724, "step": 4615 }, { "epoch": 0.42759961127308066, "grad_norm": 4.3125, "learning_rate": 1.4144261593252497e-05, "loss": 1.1863, "mean_token_accuracy": 0.6816046966731898, "step": 4620 }, { "epoch": 0.42806238141515107, "grad_norm": 4.21875, "learning_rate": 1.412955337396224e-05, "loss": 1.1544, "mean_token_accuracy": 0.6923189823874755, "step": 4625 }, { "epoch": 0.42852515155722154, "grad_norm": 4.125, "learning_rate": 1.4114834376572898e-05, "loss": 1.1459, "mean_token_accuracy": 0.6884784735812133, "step": 4630 }, { "epoch": 0.42898792169929195, "grad_norm": 4.09375, "learning_rate": 1.4100104639500929e-05, "loss": 1.1666, "mean_token_accuracy": 0.6902397260273972, "step": 4635 }, { "epoch": 0.4294506918413624, "grad_norm": 4.65625, "learning_rate": 1.4085364201190825e-05, "loss": 1.2077, "mean_token_accuracy": 0.6780332681017612, "step": 4640 }, { "epoch": 0.42991346198343283, "grad_norm": 4.125, "learning_rate": 1.4070613100115001e-05, "loss": 1.142, "mean_token_accuracy": 0.6906800391389432, "step": 4645 }, { "epoch": 0.43037623212550324, "grad_norm": 5.46875, "learning_rate": 1.4055851374773703e-05, "loss": 1.1844, "mean_token_accuracy": 0.6786448140900198, "step": 4650 }, { "epoch": 0.4308390022675737, "grad_norm": 4.21875, "learning_rate": 1.4041079063694912e-05, "loss": 1.1213, "mean_token_accuracy": 0.699853228962818, "step": 4655 }, { "epoch": 0.4313017724096441, "grad_norm": 4.9375, "learning_rate": 1.4026296205434228e-05, "loss": 1.1483, "mean_token_accuracy": 0.6896037181996086, "step": 4660 }, { "epoch": 0.43176454255171454, "grad_norm": 4.125, "learning_rate": 1.4011502838574793e-05, "loss": 1.1515, "mean_token_accuracy": 0.6896037181996085, "step": 4665 }, { "epoch": 0.432227312693785, "grad_norm": 4.21875, "learning_rate": 1.3996699001727163e-05, "loss": 1.2112, "mean_token_accuracy": 0.6747309197651663, "step": 4670 }, { "epoch": 0.4326900828358554, "grad_norm": 4.21875, "learning_rate": 1.398188473352923e-05, "loss": 1.1624, "mean_token_accuracy": 0.6913894324853229, "step": 4675 }, { "epoch": 0.4331528529779259, "grad_norm": 4.09375, "learning_rate": 1.3967060072646099e-05, "loss": 1.223, "mean_token_accuracy": 0.6745841487279844, "step": 4680 }, { "epoch": 0.4336156231199963, "grad_norm": 4.09375, "learning_rate": 1.3952225057770022e-05, "loss": 1.1637, "mean_token_accuracy": 0.6843933463796479, "step": 4685 }, { "epoch": 0.4340783932620667, "grad_norm": 4.0625, "learning_rate": 1.3937379727620249e-05, "loss": 1.1534, "mean_token_accuracy": 0.6871575342465754, "step": 4690 }, { "epoch": 0.4345411634041372, "grad_norm": 4.59375, "learning_rate": 1.392252412094297e-05, "loss": 1.1346, "mean_token_accuracy": 0.6974804305283756, "step": 4695 }, { "epoch": 0.4350039335462076, "grad_norm": 4.1875, "learning_rate": 1.3907658276511193e-05, "loss": 1.1231, "mean_token_accuracy": 0.6963551859099805, "step": 4700 }, { "epoch": 0.4350039335462076, "eval_loss": 1.1946104764938354, "eval_mean_token_accuracy": 0.6797161662181994, "eval_runtime": 39.7241, "eval_samples_per_second": 25.778, "eval_steps_per_second": 6.444, "step": 4700 }, { "epoch": 0.43546670368827806, "grad_norm": 4.3125, "learning_rate": 1.3892782233124644e-05, "loss": 1.1878, "mean_token_accuracy": 0.6840508806262231, "step": 4705 }, { "epoch": 0.43592947383034847, "grad_norm": 4.21875, "learning_rate": 1.3877896029609667e-05, "loss": 1.1518, "mean_token_accuracy": 0.6916585127201567, "step": 4710 }, { "epoch": 0.4363922439724189, "grad_norm": 4.59375, "learning_rate": 1.3862999704819134e-05, "loss": 1.2167, "mean_token_accuracy": 0.6705724070450099, "step": 4715 }, { "epoch": 0.43685501411448935, "grad_norm": 4.1875, "learning_rate": 1.3848093297632312e-05, "loss": 1.2684, "mean_token_accuracy": 0.6626223091976517, "step": 4720 }, { "epoch": 0.43731778425655976, "grad_norm": 4.28125, "learning_rate": 1.383317684695481e-05, "loss": 1.1628, "mean_token_accuracy": 0.6880136986301371, "step": 4725 }, { "epoch": 0.4377805543986302, "grad_norm": 4.1875, "learning_rate": 1.381825039171843e-05, "loss": 1.1636, "mean_token_accuracy": 0.6864481409001957, "step": 4730 }, { "epoch": 0.43824332454070064, "grad_norm": 4.46875, "learning_rate": 1.3803313970881093e-05, "loss": 1.1826, "mean_token_accuracy": 0.6852495107632094, "step": 4735 }, { "epoch": 0.43870609468277105, "grad_norm": 5.28125, "learning_rate": 1.3788367623426733e-05, "loss": 1.1231, "mean_token_accuracy": 0.6947407045009785, "step": 4740 }, { "epoch": 0.4391688648248415, "grad_norm": 4.15625, "learning_rate": 1.3773411388365188e-05, "loss": 1.1616, "mean_token_accuracy": 0.6885273972602741, "step": 4745 }, { "epoch": 0.43963163496691193, "grad_norm": 4.84375, "learning_rate": 1.3758445304732104e-05, "loss": 1.126, "mean_token_accuracy": 0.6938600782778864, "step": 4750 }, { "epoch": 0.44009440510898234, "grad_norm": 3.90625, "learning_rate": 1.374346941158883e-05, "loss": 1.1214, "mean_token_accuracy": 0.6966731898238747, "step": 4755 }, { "epoch": 0.4405571752510528, "grad_norm": 4.125, "learning_rate": 1.3728483748022322e-05, "loss": 1.1803, "mean_token_accuracy": 0.681335616438356, "step": 4760 }, { "epoch": 0.4410199453931232, "grad_norm": 4.4375, "learning_rate": 1.371348835314503e-05, "loss": 1.1623, "mean_token_accuracy": 0.6874021526418785, "step": 4765 }, { "epoch": 0.4414827155351937, "grad_norm": 4.0625, "learning_rate": 1.3698483266094814e-05, "loss": 1.0918, "mean_token_accuracy": 0.7014921722113503, "step": 4770 }, { "epoch": 0.4419454856772641, "grad_norm": 4.3125, "learning_rate": 1.3683468526034817e-05, "loss": 1.2065, "mean_token_accuracy": 0.6811888454011741, "step": 4775 }, { "epoch": 0.4424082558193345, "grad_norm": 4.21875, "learning_rate": 1.3668444172153385e-05, "loss": 1.1711, "mean_token_accuracy": 0.6852739726027397, "step": 4780 }, { "epoch": 0.442871025961405, "grad_norm": 4.71875, "learning_rate": 1.3653410243663953e-05, "loss": 1.1934, "mean_token_accuracy": 0.6795988258317026, "step": 4785 }, { "epoch": 0.4433337961034754, "grad_norm": 4.1875, "learning_rate": 1.3638366779804943e-05, "loss": 1.1748, "mean_token_accuracy": 0.6866682974559686, "step": 4790 }, { "epoch": 0.44379656624554586, "grad_norm": 4.21875, "learning_rate": 1.3623313819839674e-05, "loss": 1.1267, "mean_token_accuracy": 0.6943737769080235, "step": 4795 }, { "epoch": 0.4442593363876163, "grad_norm": 4.21875, "learning_rate": 1.3608251403056236e-05, "loss": 1.1865, "mean_token_accuracy": 0.6834637964774951, "step": 4800 }, { "epoch": 0.4442593363876163, "eval_loss": 1.1944069862365723, "eval_mean_token_accuracy": 0.679397015655577, "eval_runtime": 39.6831, "eval_samples_per_second": 25.804, "eval_steps_per_second": 6.451, "step": 4800 }, { "epoch": 0.4447221065296867, "grad_norm": 4.78125, "learning_rate": 1.3593179568767415e-05, "loss": 1.1055, "mean_token_accuracy": 0.7050146771037181, "step": 4805 }, { "epoch": 0.44518487667175716, "grad_norm": 4.1875, "learning_rate": 1.3578098356310565e-05, "loss": 1.1583, "mean_token_accuracy": 0.6878669275929551, "step": 4810 }, { "epoch": 0.44564764681382757, "grad_norm": 4.1875, "learning_rate": 1.3563007805047523e-05, "loss": 1.2158, "mean_token_accuracy": 0.6738992172211351, "step": 4815 }, { "epoch": 0.446110416955898, "grad_norm": 4.1875, "learning_rate": 1.35479079543645e-05, "loss": 1.1971, "mean_token_accuracy": 0.6766144814090019, "step": 4820 }, { "epoch": 0.44657318709796845, "grad_norm": 4.0625, "learning_rate": 1.3532798843671976e-05, "loss": 1.1601, "mean_token_accuracy": 0.6900440313111547, "step": 4825 }, { "epoch": 0.44703595724003886, "grad_norm": 4.5, "learning_rate": 1.3517680512404605e-05, "loss": 1.1489, "mean_token_accuracy": 0.6872064579256361, "step": 4830 }, { "epoch": 0.44749872738210933, "grad_norm": 6.15625, "learning_rate": 1.35025530000211e-05, "loss": 1.1193, "mean_token_accuracy": 0.6989970645792565, "step": 4835 }, { "epoch": 0.44796149752417974, "grad_norm": 4.0, "learning_rate": 1.3487416346004139e-05, "loss": 1.1241, "mean_token_accuracy": 0.6979696673189824, "step": 4840 }, { "epoch": 0.44842426766625015, "grad_norm": 4.15625, "learning_rate": 1.3472270589860259e-05, "loss": 1.1373, "mean_token_accuracy": 0.6918297455968689, "step": 4845 }, { "epoch": 0.4488870378083206, "grad_norm": 4.21875, "learning_rate": 1.3457115771119754e-05, "loss": 1.1949, "mean_token_accuracy": 0.6813600782778866, "step": 4850 }, { "epoch": 0.44934980795039103, "grad_norm": 4.03125, "learning_rate": 1.3441951929336574e-05, "loss": 1.2073, "mean_token_accuracy": 0.67573385518591, "step": 4855 }, { "epoch": 0.4498125780924615, "grad_norm": 4.25, "learning_rate": 1.3426779104088214e-05, "loss": 1.1069, "mean_token_accuracy": 0.7003913894324852, "step": 4860 }, { "epoch": 0.4502753482345319, "grad_norm": 4.28125, "learning_rate": 1.3411597334975624e-05, "loss": 1.1287, "mean_token_accuracy": 0.6946183953033269, "step": 4865 }, { "epoch": 0.4507381183766023, "grad_norm": 4.09375, "learning_rate": 1.3396406661623086e-05, "loss": 1.1281, "mean_token_accuracy": 0.6938356164383563, "step": 4870 }, { "epoch": 0.4512008885186728, "grad_norm": 4.125, "learning_rate": 1.3381207123678129e-05, "loss": 1.1437, "mean_token_accuracy": 0.6912915851272017, "step": 4875 }, { "epoch": 0.4516636586607432, "grad_norm": 4.15625, "learning_rate": 1.336599876081142e-05, "loss": 1.1925, "mean_token_accuracy": 0.6777641878669276, "step": 4880 }, { "epoch": 0.4521264288028136, "grad_norm": 4.21875, "learning_rate": 1.3350781612716655e-05, "loss": 1.1936, "mean_token_accuracy": 0.6781555772994129, "step": 4885 }, { "epoch": 0.4525891989448841, "grad_norm": 4.25, "learning_rate": 1.333555571911046e-05, "loss": 1.268, "mean_token_accuracy": 0.6607142857142857, "step": 4890 }, { "epoch": 0.4530519690869545, "grad_norm": 4.125, "learning_rate": 1.3320321119732289e-05, "loss": 1.2046, "mean_token_accuracy": 0.67866927592955, "step": 4895 }, { "epoch": 0.45351473922902497, "grad_norm": 4.125, "learning_rate": 1.3305077854344313e-05, "loss": 1.1139, "mean_token_accuracy": 0.6973336594911936, "step": 4900 }, { "epoch": 0.45351473922902497, "eval_loss": 1.193894624710083, "eval_mean_token_accuracy": 0.6797027886497056, "eval_runtime": 39.6597, "eval_samples_per_second": 25.82, "eval_steps_per_second": 6.455, "step": 4900 }, { "epoch": 0.4539775093710954, "grad_norm": 3.953125, "learning_rate": 1.3289825962731328e-05, "loss": 1.1983, "mean_token_accuracy": 0.6781555772994129, "step": 4905 }, { "epoch": 0.4544402795131658, "grad_norm": 4.40625, "learning_rate": 1.327456548470064e-05, "loss": 1.185, "mean_token_accuracy": 0.6767123287671233, "step": 4910 }, { "epoch": 0.45490304965523626, "grad_norm": 6.6875, "learning_rate": 1.3259296460081967e-05, "loss": 1.2339, "mean_token_accuracy": 0.6715264187866927, "step": 4915 }, { "epoch": 0.45536581979730667, "grad_norm": 4.375, "learning_rate": 1.3244018928727332e-05, "loss": 1.1588, "mean_token_accuracy": 0.6886007827788649, "step": 4920 }, { "epoch": 0.45582858993937714, "grad_norm": 4.5625, "learning_rate": 1.3228732930510963e-05, "loss": 1.1863, "mean_token_accuracy": 0.6829011741682975, "step": 4925 }, { "epoch": 0.45629136008144755, "grad_norm": 4.3125, "learning_rate": 1.3213438505329184e-05, "loss": 1.164, "mean_token_accuracy": 0.686252446183953, "step": 4930 }, { "epoch": 0.45675413022351796, "grad_norm": 4.5, "learning_rate": 1.3198135693100314e-05, "loss": 1.181, "mean_token_accuracy": 0.6840753424657533, "step": 4935 }, { "epoch": 0.45721690036558843, "grad_norm": 4.4375, "learning_rate": 1.3182824533764565e-05, "loss": 1.1451, "mean_token_accuracy": 0.6896771037181996, "step": 4940 }, { "epoch": 0.45767967050765884, "grad_norm": 4.40625, "learning_rate": 1.3167505067283926e-05, "loss": 1.1449, "mean_token_accuracy": 0.6946183953033269, "step": 4945 }, { "epoch": 0.45814244064972925, "grad_norm": 4.375, "learning_rate": 1.315217733364208e-05, "loss": 1.1286, "mean_token_accuracy": 0.6958659491193738, "step": 4950 }, { "epoch": 0.4586052107917997, "grad_norm": 4.25, "learning_rate": 1.3136841372844277e-05, "loss": 1.157, "mean_token_accuracy": 0.6882583170254402, "step": 4955 }, { "epoch": 0.45906798093387013, "grad_norm": 4.21875, "learning_rate": 1.3121497224917247e-05, "loss": 1.1935, "mean_token_accuracy": 0.6816046966731898, "step": 4960 }, { "epoch": 0.4595307510759406, "grad_norm": 4.1875, "learning_rate": 1.3106144929909085e-05, "loss": 1.1584, "mean_token_accuracy": 0.6863502935420743, "step": 4965 }, { "epoch": 0.459993521218011, "grad_norm": 4.40625, "learning_rate": 1.309078452788915e-05, "loss": 1.177, "mean_token_accuracy": 0.6813845401174168, "step": 4970 }, { "epoch": 0.4604562913600814, "grad_norm": 4.09375, "learning_rate": 1.3075416058947958e-05, "loss": 1.1454, "mean_token_accuracy": 0.6885273972602739, "step": 4975 }, { "epoch": 0.4609190615021519, "grad_norm": 4.15625, "learning_rate": 1.3060039563197082e-05, "loss": 1.1156, "mean_token_accuracy": 0.6951320939334639, "step": 4980 }, { "epoch": 0.4613818316442223, "grad_norm": 5.40625, "learning_rate": 1.3044655080769048e-05, "loss": 1.1468, "mean_token_accuracy": 0.6966731898238747, "step": 4985 }, { "epoch": 0.4618446017862928, "grad_norm": 4.125, "learning_rate": 1.3029262651817218e-05, "loss": 1.0871, "mean_token_accuracy": 0.7053816046966732, "step": 4990 }, { "epoch": 0.4623073719283632, "grad_norm": 4.03125, "learning_rate": 1.301386231651571e-05, "loss": 1.1374, "mean_token_accuracy": 0.6909001956947163, "step": 4995 }, { "epoch": 0.4627701420704336, "grad_norm": 4.21875, "learning_rate": 1.2998454115059258e-05, "loss": 1.2152, "mean_token_accuracy": 0.6750489236790607, "step": 5000 }, { "epoch": 0.4627701420704336, "eval_loss": 1.1934036016464233, "eval_mean_token_accuracy": 0.6798098091976511, "eval_runtime": 39.7488, "eval_samples_per_second": 25.762, "eval_steps_per_second": 6.44, "step": 5000 }, { "epoch": 0.46323291221250407, "grad_norm": 4.09375, "learning_rate": 1.298303808766314e-05, "loss": 1.1511, "mean_token_accuracy": 0.6871575342465752, "step": 5005 }, { "epoch": 0.4636956823545745, "grad_norm": 4.34375, "learning_rate": 1.296761427456306e-05, "loss": 1.1848, "mean_token_accuracy": 0.6776908023483366, "step": 5010 }, { "epoch": 0.4641584524966449, "grad_norm": 5.78125, "learning_rate": 1.295218271601503e-05, "loss": 1.0839, "mean_token_accuracy": 0.7119373776908025, "step": 5015 }, { "epoch": 0.46462122263871536, "grad_norm": 4.25, "learning_rate": 1.2936743452295292e-05, "loss": 1.201, "mean_token_accuracy": 0.678082191780822, "step": 5020 }, { "epoch": 0.46508399278078577, "grad_norm": 4.0625, "learning_rate": 1.2921296523700195e-05, "loss": 1.1937, "mean_token_accuracy": 0.6782045009784736, "step": 5025 }, { "epoch": 0.46554676292285624, "grad_norm": 3.90625, "learning_rate": 1.2905841970546087e-05, "loss": 1.1594, "mean_token_accuracy": 0.6855919765166341, "step": 5030 }, { "epoch": 0.46600953306492665, "grad_norm": 4.1875, "learning_rate": 1.2890379833169222e-05, "loss": 1.1865, "mean_token_accuracy": 0.6809931506849315, "step": 5035 }, { "epoch": 0.46647230320699706, "grad_norm": 4.0, "learning_rate": 1.2874910151925639e-05, "loss": 1.1619, "mean_token_accuracy": 0.6891144814090018, "step": 5040 }, { "epoch": 0.46693507334906753, "grad_norm": 4.46875, "learning_rate": 1.2859432967191086e-05, "loss": 1.1833, "mean_token_accuracy": 0.6801859099804305, "step": 5045 }, { "epoch": 0.46739784349113794, "grad_norm": 4.15625, "learning_rate": 1.2843948319360875e-05, "loss": 1.1988, "mean_token_accuracy": 0.6798189823874756, "step": 5050 }, { "epoch": 0.4678606136332084, "grad_norm": 4.25, "learning_rate": 1.2828456248849808e-05, "loss": 1.1907, "mean_token_accuracy": 0.6830724070450096, "step": 5055 }, { "epoch": 0.4683233837752788, "grad_norm": 4.125, "learning_rate": 1.2812956796092056e-05, "loss": 1.1678, "mean_token_accuracy": 0.6863992172211351, "step": 5060 }, { "epoch": 0.46878615391734924, "grad_norm": 4.09375, "learning_rate": 1.2797450001541058e-05, "loss": 1.1554, "mean_token_accuracy": 0.6872553816046968, "step": 5065 }, { "epoch": 0.4692489240594197, "grad_norm": 4.03125, "learning_rate": 1.2781935905669417e-05, "loss": 1.1607, "mean_token_accuracy": 0.6894324853228962, "step": 5070 }, { "epoch": 0.4697116942014901, "grad_norm": 4.21875, "learning_rate": 1.2766414548968789e-05, "loss": 1.2054, "mean_token_accuracy": 0.6744618395303327, "step": 5075 }, { "epoch": 0.4701744643435605, "grad_norm": 4.125, "learning_rate": 1.2750885971949788e-05, "loss": 1.1983, "mean_token_accuracy": 0.6808463796477495, "step": 5080 }, { "epoch": 0.470637234485631, "grad_norm": 4.1875, "learning_rate": 1.2735350215141857e-05, "loss": 1.2058, "mean_token_accuracy": 0.6760273972602739, "step": 5085 }, { "epoch": 0.4711000046277014, "grad_norm": 4.4375, "learning_rate": 1.27198073190932e-05, "loss": 1.0901, "mean_token_accuracy": 0.7041095890410959, "step": 5090 }, { "epoch": 0.4715627747697719, "grad_norm": 4.28125, "learning_rate": 1.2704257324370634e-05, "loss": 1.1759, "mean_token_accuracy": 0.6846868884540116, "step": 5095 }, { "epoch": 0.4720255449118423, "grad_norm": 4.875, "learning_rate": 1.268870027155952e-05, "loss": 1.1393, "mean_token_accuracy": 0.6939334637964774, "step": 5100 }, { "epoch": 0.4720255449118423, "eval_loss": 1.1932510137557983, "eval_mean_token_accuracy": 0.6797639432485321, "eval_runtime": 39.7136, "eval_samples_per_second": 25.785, "eval_steps_per_second": 6.446, "step": 5100 }, { "epoch": 0.4724883150539127, "grad_norm": 4.03125, "learning_rate": 1.267313620126363e-05, "loss": 1.1133, "mean_token_accuracy": 0.6959637964774951, "step": 5105 }, { "epoch": 0.47295108519598317, "grad_norm": 4.15625, "learning_rate": 1.2657565154105051e-05, "loss": 1.2026, "mean_token_accuracy": 0.6768835616438358, "step": 5110 }, { "epoch": 0.4734138553380536, "grad_norm": 4.09375, "learning_rate": 1.2641987170724088e-05, "loss": 1.0891, "mean_token_accuracy": 0.7059197651663406, "step": 5115 }, { "epoch": 0.47387662548012405, "grad_norm": 4.25, "learning_rate": 1.2626402291779142e-05, "loss": 1.2284, "mean_token_accuracy": 0.6695939334637966, "step": 5120 }, { "epoch": 0.47433939562219446, "grad_norm": 4.625, "learning_rate": 1.2610810557946622e-05, "loss": 1.1481, "mean_token_accuracy": 0.69043542074364, "step": 5125 }, { "epoch": 0.47480216576426487, "grad_norm": 4.21875, "learning_rate": 1.259521200992081e-05, "loss": 1.1642, "mean_token_accuracy": 0.6882827788649706, "step": 5130 }, { "epoch": 0.47526493590633534, "grad_norm": 4.46875, "learning_rate": 1.2579606688413785e-05, "loss": 1.1428, "mean_token_accuracy": 0.6914628180039138, "step": 5135 }, { "epoch": 0.47572770604840575, "grad_norm": 5.03125, "learning_rate": 1.256399463415531e-05, "loss": 1.1886, "mean_token_accuracy": 0.6828767123287671, "step": 5140 }, { "epoch": 0.47619047619047616, "grad_norm": 5.0, "learning_rate": 1.2548375887892705e-05, "loss": 1.1087, "mean_token_accuracy": 0.7013698630136987, "step": 5145 }, { "epoch": 0.47665324633254663, "grad_norm": 4.5, "learning_rate": 1.2532750490390773e-05, "loss": 1.105, "mean_token_accuracy": 0.699853228962818, "step": 5150 }, { "epoch": 0.47711601647461704, "grad_norm": 5.28125, "learning_rate": 1.251711848243166e-05, "loss": 1.1926, "mean_token_accuracy": 0.6806996086105677, "step": 5155 }, { "epoch": 0.4775787866166875, "grad_norm": 4.375, "learning_rate": 1.250147990481478e-05, "loss": 1.09, "mean_token_accuracy": 0.7042074363992172, "step": 5160 }, { "epoch": 0.4780415567587579, "grad_norm": 3.921875, "learning_rate": 1.2485834798356682e-05, "loss": 1.1839, "mean_token_accuracy": 0.6842465753424659, "step": 5165 }, { "epoch": 0.47850432690082834, "grad_norm": 4.21875, "learning_rate": 1.2470183203890959e-05, "loss": 1.1703, "mean_token_accuracy": 0.683463796477495, "step": 5170 }, { "epoch": 0.4789670970428988, "grad_norm": 4.125, "learning_rate": 1.2454525162268149e-05, "loss": 1.1972, "mean_token_accuracy": 0.6787181996086107, "step": 5175 }, { "epoch": 0.4794298671849692, "grad_norm": 4.0625, "learning_rate": 1.2438860714355596e-05, "loss": 1.0912, "mean_token_accuracy": 0.7039628180039139, "step": 5180 }, { "epoch": 0.4798926373270397, "grad_norm": 3.921875, "learning_rate": 1.2423189901037383e-05, "loss": 1.1589, "mean_token_accuracy": 0.6907289628180039, "step": 5185 }, { "epoch": 0.4803554074691101, "grad_norm": 6.1875, "learning_rate": 1.2407512763214196e-05, "loss": 1.1842, "mean_token_accuracy": 0.6793052837573386, "step": 5190 }, { "epoch": 0.4808181776111805, "grad_norm": 4.03125, "learning_rate": 1.239182934180323e-05, "loss": 1.1491, "mean_token_accuracy": 0.6911448140900195, "step": 5195 }, { "epoch": 0.481280947753251, "grad_norm": 4.03125, "learning_rate": 1.2376139677738083e-05, "loss": 1.1385, "mean_token_accuracy": 0.6948630136986302, "step": 5200 }, { "epoch": 0.481280947753251, "eval_loss": 1.1927549839019775, "eval_mean_token_accuracy": 0.6797868762230912, "eval_runtime": 41.117, "eval_samples_per_second": 24.905, "eval_steps_per_second": 6.226, "step": 5200 }, { "epoch": 0.4817437178953214, "grad_norm": 4.21875, "learning_rate": 1.2360443811968645e-05, "loss": 1.144, "mean_token_accuracy": 0.6943737769080236, "step": 5205 }, { "epoch": 0.4822064880373918, "grad_norm": 4.40625, "learning_rate": 1.234474178546099e-05, "loss": 1.2147, "mean_token_accuracy": 0.6761986301369863, "step": 5210 }, { "epoch": 0.48266925817946227, "grad_norm": 4.1875, "learning_rate": 1.2329033639197275e-05, "loss": 1.1795, "mean_token_accuracy": 0.6857142857142857, "step": 5215 }, { "epoch": 0.4831320283215327, "grad_norm": 5.25, "learning_rate": 1.2313319414175626e-05, "loss": 1.1371, "mean_token_accuracy": 0.6904843444227006, "step": 5220 }, { "epoch": 0.48359479846360315, "grad_norm": 4.28125, "learning_rate": 1.2297599151410036e-05, "loss": 1.2033, "mean_token_accuracy": 0.6745596868884539, "step": 5225 }, { "epoch": 0.48405756860567356, "grad_norm": 4.03125, "learning_rate": 1.2281872891930257e-05, "loss": 1.1096, "mean_token_accuracy": 0.6948874755381605, "step": 5230 }, { "epoch": 0.484520338747744, "grad_norm": 4.15625, "learning_rate": 1.2266140676781695e-05, "loss": 1.2242, "mean_token_accuracy": 0.6768590998043053, "step": 5235 }, { "epoch": 0.48498310888981444, "grad_norm": 4.3125, "learning_rate": 1.2250402547025293e-05, "loss": 1.1789, "mean_token_accuracy": 0.6812622309197651, "step": 5240 }, { "epoch": 0.48544587903188485, "grad_norm": 4.3125, "learning_rate": 1.2234658543737439e-05, "loss": 1.1605, "mean_token_accuracy": 0.6849804305283758, "step": 5245 }, { "epoch": 0.4859086491739553, "grad_norm": 4.34375, "learning_rate": 1.2218908708009842e-05, "loss": 1.1307, "mean_token_accuracy": 0.6939334637964776, "step": 5250 }, { "epoch": 0.48637141931602573, "grad_norm": 3.796875, "learning_rate": 1.2203153080949448e-05, "loss": 1.1576, "mean_token_accuracy": 0.6883561643835617, "step": 5255 }, { "epoch": 0.48683418945809614, "grad_norm": 4.15625, "learning_rate": 1.2187391703678301e-05, "loss": 1.1941, "mean_token_accuracy": 0.6805283757338553, "step": 5260 }, { "epoch": 0.4872969596001666, "grad_norm": 4.21875, "learning_rate": 1.2171624617333464e-05, "loss": 1.1451, "mean_token_accuracy": 0.688307240704501, "step": 5265 }, { "epoch": 0.487759729742237, "grad_norm": 4.25, "learning_rate": 1.21558518630669e-05, "loss": 1.1298, "mean_token_accuracy": 0.6941780821917807, "step": 5270 }, { "epoch": 0.48822249988430744, "grad_norm": 4.0625, "learning_rate": 1.2140073482045363e-05, "loss": 1.1397, "mean_token_accuracy": 0.6965019569471624, "step": 5275 }, { "epoch": 0.4886852700263779, "grad_norm": 4.09375, "learning_rate": 1.2124289515450287e-05, "loss": 1.12, "mean_token_accuracy": 0.6941291585127201, "step": 5280 }, { "epoch": 0.4891480401684483, "grad_norm": 3.90625, "learning_rate": 1.2108500004477695e-05, "loss": 1.1327, "mean_token_accuracy": 0.6952544031311154, "step": 5285 }, { "epoch": 0.4896108103105188, "grad_norm": 4.40625, "learning_rate": 1.2092704990338077e-05, "loss": 1.1947, "mean_token_accuracy": 0.6784246575342465, "step": 5290 }, { "epoch": 0.4900735804525892, "grad_norm": 4.78125, "learning_rate": 1.2076904514256285e-05, "loss": 1.1057, "mean_token_accuracy": 0.6995841487279844, "step": 5295 }, { "epoch": 0.4905363505946596, "grad_norm": 4.25, "learning_rate": 1.2061098617471421e-05, "loss": 1.1511, "mean_token_accuracy": 0.6921966731898239, "step": 5300 }, { "epoch": 0.4905363505946596, "eval_loss": 1.1923044919967651, "eval_mean_token_accuracy": 0.6800257613747549, "eval_runtime": 39.7768, "eval_samples_per_second": 25.744, "eval_steps_per_second": 6.436, "step": 5300 }, { "epoch": 0.4909991207367301, "grad_norm": 4.1875, "learning_rate": 1.2045287341236746e-05, "loss": 1.1433, "mean_token_accuracy": 0.6892612524461839, "step": 5305 }, { "epoch": 0.4914618908788005, "grad_norm": 4.46875, "learning_rate": 1.2029470726819555e-05, "loss": 1.1864, "mean_token_accuracy": 0.6788160469667319, "step": 5310 }, { "epoch": 0.49192466102087096, "grad_norm": 4.125, "learning_rate": 1.201364881550108e-05, "loss": 1.1609, "mean_token_accuracy": 0.6906311154598824, "step": 5315 }, { "epoch": 0.49238743116294137, "grad_norm": 4.0625, "learning_rate": 1.1997821648576366e-05, "loss": 1.1522, "mean_token_accuracy": 0.6905332681017612, "step": 5320 }, { "epoch": 0.4928502013050118, "grad_norm": 3.9375, "learning_rate": 1.1981989267354194e-05, "loss": 1.1202, "mean_token_accuracy": 0.6941780821917809, "step": 5325 }, { "epoch": 0.49331297144708225, "grad_norm": 5.09375, "learning_rate": 1.1966151713156939e-05, "loss": 1.145, "mean_token_accuracy": 0.6901908023483367, "step": 5330 }, { "epoch": 0.49377574158915266, "grad_norm": 4.90625, "learning_rate": 1.195030902732048e-05, "loss": 1.1548, "mean_token_accuracy": 0.6919520547945204, "step": 5335 }, { "epoch": 0.49423851173122313, "grad_norm": 4.28125, "learning_rate": 1.1934461251194096e-05, "loss": 1.1493, "mean_token_accuracy": 0.6917563600782778, "step": 5340 }, { "epoch": 0.49470128187329354, "grad_norm": 4.25, "learning_rate": 1.1918608426140346e-05, "loss": 1.2571, "mean_token_accuracy": 0.665655577299413, "step": 5345 }, { "epoch": 0.49516405201536395, "grad_norm": 3.984375, "learning_rate": 1.1902750593534969e-05, "loss": 1.1381, "mean_token_accuracy": 0.6915606653620354, "step": 5350 }, { "epoch": 0.4956268221574344, "grad_norm": 4.90625, "learning_rate": 1.188688779476677e-05, "loss": 1.1166, "mean_token_accuracy": 0.6974070450097848, "step": 5355 }, { "epoch": 0.49608959229950483, "grad_norm": 4.4375, "learning_rate": 1.187102007123752e-05, "loss": 1.1645, "mean_token_accuracy": 0.685958904109589, "step": 5360 }, { "epoch": 0.49655236244157525, "grad_norm": 4.6875, "learning_rate": 1.1855147464361845e-05, "loss": 1.0989, "mean_token_accuracy": 0.7016389432485322, "step": 5365 }, { "epoch": 0.4970151325836457, "grad_norm": 4.3125, "learning_rate": 1.1839270015567105e-05, "loss": 1.2114, "mean_token_accuracy": 0.6709148727984345, "step": 5370 }, { "epoch": 0.4974779027257161, "grad_norm": 4.34375, "learning_rate": 1.182338776629332e-05, "loss": 1.2325, "mean_token_accuracy": 0.6664872798434442, "step": 5375 }, { "epoch": 0.4979406728677866, "grad_norm": 4.09375, "learning_rate": 1.1807500757993012e-05, "loss": 1.1463, "mean_token_accuracy": 0.6886497064579256, "step": 5380 }, { "epoch": 0.498403443009857, "grad_norm": 5.375, "learning_rate": 1.1791609032131143e-05, "loss": 1.1732, "mean_token_accuracy": 0.6824608610567514, "step": 5385 }, { "epoch": 0.4988662131519274, "grad_norm": 5.15625, "learning_rate": 1.1775712630184984e-05, "loss": 1.1245, "mean_token_accuracy": 0.6964285714285715, "step": 5390 }, { "epoch": 0.4993289832939979, "grad_norm": 4.53125, "learning_rate": 1.1759811593644002e-05, "loss": 1.1723, "mean_token_accuracy": 0.681922700587084, "step": 5395 }, { "epoch": 0.4997917534360683, "grad_norm": 4.5, "learning_rate": 1.1743905964009774e-05, "loss": 1.2019, "mean_token_accuracy": 0.67646771037182, "step": 5400 }, { "epoch": 0.4997917534360683, "eval_loss": 1.1920087337493896, "eval_mean_token_accuracy": 0.6800181170499012, "eval_runtime": 39.5109, "eval_samples_per_second": 25.917, "eval_steps_per_second": 6.479, "step": 5400 }, { "epoch": 0.5002545235781387, "grad_norm": 4.125, "learning_rate": 1.172799578279585e-05, "loss": 1.1192, "mean_token_accuracy": 0.6940068493150686, "step": 5405 }, { "epoch": 0.5007172937202091, "grad_norm": 4.0625, "learning_rate": 1.1712081091527675e-05, "loss": 1.1506, "mean_token_accuracy": 0.690362035225049, "step": 5410 }, { "epoch": 0.5011800638622796, "grad_norm": 4.53125, "learning_rate": 1.1696161931742454e-05, "loss": 1.1338, "mean_token_accuracy": 0.6939090019569472, "step": 5415 }, { "epoch": 0.5016428340043501, "grad_norm": 4.1875, "learning_rate": 1.1680238344989056e-05, "loss": 1.1989, "mean_token_accuracy": 0.679721135029354, "step": 5420 }, { "epoch": 0.5021056041464205, "grad_norm": 4.375, "learning_rate": 1.1664310372827911e-05, "loss": 1.0826, "mean_token_accuracy": 0.7077299412915852, "step": 5425 }, { "epoch": 0.5025683742884909, "grad_norm": 4.28125, "learning_rate": 1.1648378056830886e-05, "loss": 1.2044, "mean_token_accuracy": 0.6760763209393346, "step": 5430 }, { "epoch": 0.5030311444305613, "grad_norm": 4.25, "learning_rate": 1.1632441438581195e-05, "loss": 1.2126, "mean_token_accuracy": 0.672431506849315, "step": 5435 }, { "epoch": 0.5034939145726318, "grad_norm": 4.96875, "learning_rate": 1.1616500559673269e-05, "loss": 1.1878, "mean_token_accuracy": 0.6845645792563599, "step": 5440 }, { "epoch": 0.5039566847147022, "grad_norm": 4.375, "learning_rate": 1.1600555461712673e-05, "loss": 1.2492, "mean_token_accuracy": 0.6681017612524461, "step": 5445 }, { "epoch": 0.5044194548567726, "grad_norm": 4.0, "learning_rate": 1.1584606186315971e-05, "loss": 1.1447, "mean_token_accuracy": 0.6918297455968688, "step": 5450 }, { "epoch": 0.504882224998843, "grad_norm": 4.53125, "learning_rate": 1.1568652775110638e-05, "loss": 1.182, "mean_token_accuracy": 0.6812377690802347, "step": 5455 }, { "epoch": 0.5053449951409135, "grad_norm": 4.46875, "learning_rate": 1.1552695269734943e-05, "loss": 1.1788, "mean_token_accuracy": 0.6800636007827788, "step": 5460 }, { "epoch": 0.505807765282984, "grad_norm": 4.46875, "learning_rate": 1.153673371183784e-05, "loss": 1.1701, "mean_token_accuracy": 0.683683953033268, "step": 5465 }, { "epoch": 0.5062705354250544, "grad_norm": 5.59375, "learning_rate": 1.1520768143078853e-05, "loss": 1.2061, "mean_token_accuracy": 0.6744373776908024, "step": 5470 }, { "epoch": 0.5067333055671248, "grad_norm": 4.1875, "learning_rate": 1.1504798605127984e-05, "loss": 1.1774, "mean_token_accuracy": 0.6812377690802348, "step": 5475 }, { "epoch": 0.5071960757091952, "grad_norm": 4.25, "learning_rate": 1.1488825139665592e-05, "loss": 1.1931, "mean_token_accuracy": 0.6753913894324854, "step": 5480 }, { "epoch": 0.5076588458512656, "grad_norm": 4.46875, "learning_rate": 1.1472847788382282e-05, "loss": 1.1241, "mean_token_accuracy": 0.6980675146771038, "step": 5485 }, { "epoch": 0.5081216159933362, "grad_norm": 3.9375, "learning_rate": 1.1456866592978805e-05, "loss": 1.2141, "mean_token_accuracy": 0.6779109589041095, "step": 5490 }, { "epoch": 0.5085843861354066, "grad_norm": 5.40625, "learning_rate": 1.1440881595165951e-05, "loss": 1.176, "mean_token_accuracy": 0.6887475538160469, "step": 5495 }, { "epoch": 0.509047156277477, "grad_norm": 4.34375, "learning_rate": 1.1424892836664418e-05, "loss": 1.1759, "mean_token_accuracy": 0.6815068493150684, "step": 5500 }, { "epoch": 0.509047156277477, "eval_loss": 1.1918017864227295, "eval_mean_token_accuracy": 0.6799818065068488, "eval_runtime": 39.6291, "eval_samples_per_second": 25.84, "eval_steps_per_second": 6.46, "step": 5500 }, { "epoch": 0.5095099264195474, "grad_norm": 4.34375, "learning_rate": 1.140890035920474e-05, "loss": 1.1451, "mean_token_accuracy": 0.6955724070450099, "step": 5505 }, { "epoch": 0.5099726965616178, "grad_norm": 4.125, "learning_rate": 1.1392904204527138e-05, "loss": 1.1396, "mean_token_accuracy": 0.6909735812133075, "step": 5510 }, { "epoch": 0.5104354667036882, "grad_norm": 4.5625, "learning_rate": 1.1376904414381442e-05, "loss": 1.1673, "mean_token_accuracy": 0.6886497064579254, "step": 5515 }, { "epoch": 0.5108982368457587, "grad_norm": 4.4375, "learning_rate": 1.1360901030526972e-05, "loss": 1.1336, "mean_token_accuracy": 0.6937133072407045, "step": 5520 }, { "epoch": 0.5113610069878292, "grad_norm": 4.09375, "learning_rate": 1.1344894094732415e-05, "loss": 1.1908, "mean_token_accuracy": 0.6805528375733856, "step": 5525 }, { "epoch": 0.5118237771298996, "grad_norm": 4.53125, "learning_rate": 1.1328883648775746e-05, "loss": 1.1205, "mean_token_accuracy": 0.6948140900195694, "step": 5530 }, { "epoch": 0.51228654727197, "grad_norm": 4.59375, "learning_rate": 1.1312869734444088e-05, "loss": 1.1262, "mean_token_accuracy": 0.6947407045009785, "step": 5535 }, { "epoch": 0.5127493174140404, "grad_norm": 5.03125, "learning_rate": 1.1296852393533619e-05, "loss": 1.235, "mean_token_accuracy": 0.669716242661448, "step": 5540 }, { "epoch": 0.5132120875561109, "grad_norm": 4.625, "learning_rate": 1.1280831667849465e-05, "loss": 1.1285, "mean_token_accuracy": 0.6935909980430528, "step": 5545 }, { "epoch": 0.5136748576981813, "grad_norm": 4.1875, "learning_rate": 1.1264807599205581e-05, "loss": 1.1121, "mean_token_accuracy": 0.6969667318982388, "step": 5550 }, { "epoch": 0.5141376278402517, "grad_norm": 4.28125, "learning_rate": 1.1248780229424652e-05, "loss": 1.1888, "mean_token_accuracy": 0.6809931506849315, "step": 5555 }, { "epoch": 0.5146003979823222, "grad_norm": 3.984375, "learning_rate": 1.1232749600337973e-05, "loss": 1.1579, "mean_token_accuracy": 0.6859099804305285, "step": 5560 }, { "epoch": 0.5150631681243926, "grad_norm": 4.34375, "learning_rate": 1.1216715753785357e-05, "loss": 1.1955, "mean_token_accuracy": 0.6795988258317026, "step": 5565 }, { "epoch": 0.5155259382664631, "grad_norm": 4.34375, "learning_rate": 1.1200678731614995e-05, "loss": 1.1654, "mean_token_accuracy": 0.6895058708414873, "step": 5570 }, { "epoch": 0.5159887084085335, "grad_norm": 4.3125, "learning_rate": 1.1184638575683388e-05, "loss": 1.1302, "mean_token_accuracy": 0.6927837573385517, "step": 5575 }, { "epoch": 0.5164514785506039, "grad_norm": 4.375, "learning_rate": 1.1168595327855203e-05, "loss": 1.1733, "mean_token_accuracy": 0.6826076320939334, "step": 5580 }, { "epoch": 0.5169142486926743, "grad_norm": 4.1875, "learning_rate": 1.1152549030003177e-05, "loss": 1.151, "mean_token_accuracy": 0.6921966731898238, "step": 5585 }, { "epoch": 0.5173770188347447, "grad_norm": 4.125, "learning_rate": 1.1136499724008017e-05, "loss": 1.1768, "mean_token_accuracy": 0.6839530332681016, "step": 5590 }, { "epoch": 0.5178397889768153, "grad_norm": 4.28125, "learning_rate": 1.112044745175827e-05, "loss": 1.2114, "mean_token_accuracy": 0.6770303326810176, "step": 5595 }, { "epoch": 0.5183025591188857, "grad_norm": 4.40625, "learning_rate": 1.1104392255150228e-05, "loss": 1.1187, "mean_token_accuracy": 0.6988502935420743, "step": 5600 }, { "epoch": 0.5183025591188857, "eval_loss": 1.191306471824646, "eval_mean_token_accuracy": 0.6800257613747547, "eval_runtime": 39.6745, "eval_samples_per_second": 25.81, "eval_steps_per_second": 6.453, "step": 5600 }, { "epoch": 0.5187653292609561, "grad_norm": 4.09375, "learning_rate": 1.1088334176087821e-05, "loss": 1.1579, "mean_token_accuracy": 0.6864481409001959, "step": 5605 }, { "epoch": 0.5192280994030265, "grad_norm": 4.5, "learning_rate": 1.1072273256482498e-05, "loss": 1.1576, "mean_token_accuracy": 0.6840753424657534, "step": 5610 }, { "epoch": 0.5196908695450969, "grad_norm": 4.8125, "learning_rate": 1.105620953825312e-05, "loss": 1.1641, "mean_token_accuracy": 0.6858855185909981, "step": 5615 }, { "epoch": 0.5201536396871674, "grad_norm": 4.21875, "learning_rate": 1.1040143063325854e-05, "loss": 1.1574, "mean_token_accuracy": 0.6874755381604697, "step": 5620 }, { "epoch": 0.5206164098292378, "grad_norm": 4.375, "learning_rate": 1.1024073873634067e-05, "loss": 1.1062, "mean_token_accuracy": 0.697945205479452, "step": 5625 }, { "epoch": 0.5210791799713083, "grad_norm": 4.96875, "learning_rate": 1.1008002011118201e-05, "loss": 1.1559, "mean_token_accuracy": 0.6936154598825831, "step": 5630 }, { "epoch": 0.5215419501133787, "grad_norm": 4.3125, "learning_rate": 1.0991927517725688e-05, "loss": 1.2023, "mean_token_accuracy": 0.6787671232876713, "step": 5635 }, { "epoch": 0.5220047202554491, "grad_norm": 3.984375, "learning_rate": 1.0975850435410813e-05, "loss": 1.1882, "mean_token_accuracy": 0.6810665362035225, "step": 5640 }, { "epoch": 0.5224674903975196, "grad_norm": 4.78125, "learning_rate": 1.0959770806134625e-05, "loss": 1.1897, "mean_token_accuracy": 0.681922700587084, "step": 5645 }, { "epoch": 0.52293026053959, "grad_norm": 4.34375, "learning_rate": 1.0943688671864818e-05, "loss": 1.1912, "mean_token_accuracy": 0.6755626223091978, "step": 5650 }, { "epoch": 0.5233930306816604, "grad_norm": 4.125, "learning_rate": 1.0927604074575626e-05, "loss": 1.1192, "mean_token_accuracy": 0.6976027397260275, "step": 5655 }, { "epoch": 0.5238558008237308, "grad_norm": 4.4375, "learning_rate": 1.0911517056247711e-05, "loss": 1.1671, "mean_token_accuracy": 0.6893101761252446, "step": 5660 }, { "epoch": 0.5243185709658013, "grad_norm": 5.09375, "learning_rate": 1.0895427658868056e-05, "loss": 1.1434, "mean_token_accuracy": 0.6899951076320938, "step": 5665 }, { "epoch": 0.5247813411078717, "grad_norm": 4.4375, "learning_rate": 1.0879335924429848e-05, "loss": 1.1817, "mean_token_accuracy": 0.6804305283757337, "step": 5670 }, { "epoch": 0.5252441112499422, "grad_norm": 4.96875, "learning_rate": 1.0863241894932378e-05, "loss": 1.1018, "mean_token_accuracy": 0.7068003913894324, "step": 5675 }, { "epoch": 0.5257068813920126, "grad_norm": 4.0625, "learning_rate": 1.0847145612380922e-05, "loss": 1.1979, "mean_token_accuracy": 0.6784246575342465, "step": 5680 }, { "epoch": 0.526169651534083, "grad_norm": 4.40625, "learning_rate": 1.0831047118786646e-05, "loss": 1.1129, "mean_token_accuracy": 0.6993150684931507, "step": 5685 }, { "epoch": 0.5266324216761534, "grad_norm": 4.21875, "learning_rate": 1.0814946456166475e-05, "loss": 1.1503, "mean_token_accuracy": 0.6894324853228964, "step": 5690 }, { "epoch": 0.5270951918182238, "grad_norm": 4.34375, "learning_rate": 1.0798843666543008e-05, "loss": 1.1835, "mean_token_accuracy": 0.6857142857142856, "step": 5695 }, { "epoch": 0.5275579619602944, "grad_norm": 4.0625, "learning_rate": 1.078273879194438e-05, "loss": 1.1125, "mean_token_accuracy": 0.697431506849315, "step": 5700 }, { "epoch": 0.5275579619602944, "eval_loss": 1.190914511680603, "eval_mean_token_accuracy": 0.6802436246330718, "eval_runtime": 39.5429, "eval_samples_per_second": 25.896, "eval_steps_per_second": 6.474, "step": 5700 }, { "epoch": 0.5280207321023648, "grad_norm": 4.6875, "learning_rate": 1.0766631874404188e-05, "loss": 1.2239, "mean_token_accuracy": 0.6682485322896283, "step": 5705 }, { "epoch": 0.5284835022444352, "grad_norm": 3.984375, "learning_rate": 1.0750522955961339e-05, "loss": 1.1196, "mean_token_accuracy": 0.6992172211350294, "step": 5710 }, { "epoch": 0.5289462723865056, "grad_norm": 4.09375, "learning_rate": 1.0734412078659975e-05, "loss": 1.188, "mean_token_accuracy": 0.6823630136986301, "step": 5715 }, { "epoch": 0.529409042528576, "grad_norm": 4.3125, "learning_rate": 1.0718299284549352e-05, "loss": 1.1381, "mean_token_accuracy": 0.6931017612524463, "step": 5720 }, { "epoch": 0.5298718126706465, "grad_norm": 4.5, "learning_rate": 1.0702184615683721e-05, "loss": 1.1935, "mean_token_accuracy": 0.6816536203522504, "step": 5725 }, { "epoch": 0.530334582812717, "grad_norm": 4.09375, "learning_rate": 1.0686068114122236e-05, "loss": 1.2219, "mean_token_accuracy": 0.6749266144814091, "step": 5730 }, { "epoch": 0.5307973529547874, "grad_norm": 4.125, "learning_rate": 1.066994982192882e-05, "loss": 1.1408, "mean_token_accuracy": 0.6908757338551859, "step": 5735 }, { "epoch": 0.5312601230968578, "grad_norm": 4.9375, "learning_rate": 1.0653829781172087e-05, "loss": 1.1695, "mean_token_accuracy": 0.6881604696673189, "step": 5740 }, { "epoch": 0.5317228932389282, "grad_norm": 4.40625, "learning_rate": 1.0637708033925201e-05, "loss": 1.1178, "mean_token_accuracy": 0.6942759295499021, "step": 5745 }, { "epoch": 0.5321856633809987, "grad_norm": 3.921875, "learning_rate": 1.0621584622265782e-05, "loss": 1.1196, "mean_token_accuracy": 0.6948140900195695, "step": 5750 }, { "epoch": 0.5326484335230691, "grad_norm": 5.1875, "learning_rate": 1.0605459588275803e-05, "loss": 1.1366, "mean_token_accuracy": 0.6970890410958903, "step": 5755 }, { "epoch": 0.5331112036651395, "grad_norm": 5.90625, "learning_rate": 1.0589332974041461e-05, "loss": 1.2408, "mean_token_accuracy": 0.669324853228963, "step": 5760 }, { "epoch": 0.53357397380721, "grad_norm": 4.40625, "learning_rate": 1.0573204821653087e-05, "loss": 1.1341, "mean_token_accuracy": 0.6927837573385519, "step": 5765 }, { "epoch": 0.5340367439492804, "grad_norm": 4.1875, "learning_rate": 1.0557075173205016e-05, "loss": 1.2625, "mean_token_accuracy": 0.6652641878669278, "step": 5770 }, { "epoch": 0.5344995140913509, "grad_norm": 5.1875, "learning_rate": 1.0540944070795491e-05, "loss": 1.1435, "mean_token_accuracy": 0.6926369863013698, "step": 5775 }, { "epoch": 0.5349622842334213, "grad_norm": 5.25, "learning_rate": 1.052481155652656e-05, "loss": 1.1053, "mean_token_accuracy": 0.6983365949119371, "step": 5780 }, { "epoch": 0.5354250543754917, "grad_norm": 4.40625, "learning_rate": 1.0508677672503942e-05, "loss": 1.2804, "mean_token_accuracy": 0.6584148727984345, "step": 5785 }, { "epoch": 0.5358878245175621, "grad_norm": 4.4375, "learning_rate": 1.0492542460836937e-05, "loss": 1.1435, "mean_token_accuracy": 0.690215264187867, "step": 5790 }, { "epoch": 0.5363505946596325, "grad_norm": 4.40625, "learning_rate": 1.0476405963638307e-05, "loss": 1.1671, "mean_token_accuracy": 0.6843688845401174, "step": 5795 }, { "epoch": 0.5368133648017029, "grad_norm": 4.1875, "learning_rate": 1.046026822302418e-05, "loss": 1.1584, "mean_token_accuracy": 0.6867906066536204, "step": 5800 }, { "epoch": 0.5368133648017029, "eval_loss": 1.1907869577407837, "eval_mean_token_accuracy": 0.6801996697651655, "eval_runtime": 39.7188, "eval_samples_per_second": 25.781, "eval_steps_per_second": 6.445, "step": 5800 }, { "epoch": 0.5372761349437735, "grad_norm": 4.4375, "learning_rate": 1.0444129281113913e-05, "loss": 1.233, "mean_token_accuracy": 0.6700587084148728, "step": 5805 }, { "epoch": 0.5377389050858439, "grad_norm": 4.375, "learning_rate": 1.0427989180030011e-05, "loss": 1.152, "mean_token_accuracy": 0.6899706457925636, "step": 5810 }, { "epoch": 0.5382016752279143, "grad_norm": 4.125, "learning_rate": 1.0411847961898e-05, "loss": 1.178, "mean_token_accuracy": 0.6831702544031311, "step": 5815 }, { "epoch": 0.5386644453699847, "grad_norm": 4.25, "learning_rate": 1.039570566884632e-05, "loss": 1.1807, "mean_token_accuracy": 0.6802348336594913, "step": 5820 }, { "epoch": 0.5391272155120551, "grad_norm": 4.90625, "learning_rate": 1.0379562343006221e-05, "loss": 1.221, "mean_token_accuracy": 0.6692025440313112, "step": 5825 }, { "epoch": 0.5395899856541256, "grad_norm": 3.890625, "learning_rate": 1.0363418026511644e-05, "loss": 1.1577, "mean_token_accuracy": 0.6867172211350293, "step": 5830 }, { "epoch": 0.540052755796196, "grad_norm": 4.1875, "learning_rate": 1.0347272761499118e-05, "loss": 1.1267, "mean_token_accuracy": 0.6940068493150686, "step": 5835 }, { "epoch": 0.5405155259382665, "grad_norm": 4.28125, "learning_rate": 1.0331126590107647e-05, "loss": 1.0607, "mean_token_accuracy": 0.7126223091976517, "step": 5840 }, { "epoch": 0.5409782960803369, "grad_norm": 3.96875, "learning_rate": 1.0314979554478599e-05, "loss": 1.2145, "mean_token_accuracy": 0.6757583170254405, "step": 5845 }, { "epoch": 0.5414410662224073, "grad_norm": 4.3125, "learning_rate": 1.02988316967556e-05, "loss": 1.1584, "mean_token_accuracy": 0.6882827788649706, "step": 5850 }, { "epoch": 0.5419038363644778, "grad_norm": 4.46875, "learning_rate": 1.0282683059084422e-05, "loss": 1.1788, "mean_token_accuracy": 0.6853962818003914, "step": 5855 }, { "epoch": 0.5423666065065482, "grad_norm": 4.34375, "learning_rate": 1.0266533683612871e-05, "loss": 1.1565, "mean_token_accuracy": 0.6896037181996085, "step": 5860 }, { "epoch": 0.5428293766486186, "grad_norm": 4.1875, "learning_rate": 1.0250383612490679e-05, "loss": 1.1992, "mean_token_accuracy": 0.6773238747553816, "step": 5865 }, { "epoch": 0.543292146790689, "grad_norm": 5.5, "learning_rate": 1.0234232887869394e-05, "loss": 1.1943, "mean_token_accuracy": 0.6800391389432486, "step": 5870 }, { "epoch": 0.5437549169327595, "grad_norm": 4.9375, "learning_rate": 1.021808155190227e-05, "loss": 1.1866, "mean_token_accuracy": 0.6803816046966731, "step": 5875 }, { "epoch": 0.54421768707483, "grad_norm": 4.65625, "learning_rate": 1.0201929646744154e-05, "loss": 1.1814, "mean_token_accuracy": 0.6797700587084148, "step": 5880 }, { "epoch": 0.5446804572169004, "grad_norm": 5.65625, "learning_rate": 1.0185777214551383e-05, "loss": 1.1377, "mean_token_accuracy": 0.6920254403131116, "step": 5885 }, { "epoch": 0.5451432273589708, "grad_norm": 4.03125, "learning_rate": 1.0169624297481663e-05, "loss": 1.091, "mean_token_accuracy": 0.7049657534246574, "step": 5890 }, { "epoch": 0.5456059975010412, "grad_norm": 4.1875, "learning_rate": 1.0153470937693979e-05, "loss": 1.116, "mean_token_accuracy": 0.6989970645792563, "step": 5895 }, { "epoch": 0.5460687676431116, "grad_norm": 4.28125, "learning_rate": 1.0137317177348455e-05, "loss": 1.0932, "mean_token_accuracy": 0.7047700587084147, "step": 5900 }, { "epoch": 0.5460687676431116, "eval_loss": 1.1903951168060303, "eval_mean_token_accuracy": 0.6803430008561637, "eval_runtime": 39.6619, "eval_samples_per_second": 25.818, "eval_steps_per_second": 6.455, "step": 5900 }, { "epoch": 0.5465315377851822, "grad_norm": 4.40625, "learning_rate": 1.0121163058606267e-05, "loss": 1.1677, "mean_token_accuracy": 0.6837084148727985, "step": 5905 }, { "epoch": 0.5469943079272526, "grad_norm": 4.21875, "learning_rate": 1.0105008623629533e-05, "loss": 1.1326, "mean_token_accuracy": 0.6931506849315069, "step": 5910 }, { "epoch": 0.547457078069323, "grad_norm": 4.125, "learning_rate": 1.0088853914581188e-05, "loss": 1.1623, "mean_token_accuracy": 0.6819227005870842, "step": 5915 }, { "epoch": 0.5479198482113934, "grad_norm": 4.0, "learning_rate": 1.007269897362488e-05, "loss": 1.1623, "mean_token_accuracy": 0.6871330724070449, "step": 5920 }, { "epoch": 0.5483826183534638, "grad_norm": 4.375, "learning_rate": 1.005654384292487e-05, "loss": 1.2543, "mean_token_accuracy": 0.6652397260273973, "step": 5925 }, { "epoch": 0.5488453884955342, "grad_norm": 4.0, "learning_rate": 1.0040388564645915e-05, "loss": 1.1052, "mean_token_accuracy": 0.7033023483365948, "step": 5930 }, { "epoch": 0.5493081586376047, "grad_norm": 4.3125, "learning_rate": 1.0024233180953151e-05, "loss": 1.1378, "mean_token_accuracy": 0.6952544031311153, "step": 5935 }, { "epoch": 0.5497709287796751, "grad_norm": 4.40625, "learning_rate": 1.000807773401199e-05, "loss": 1.141, "mean_token_accuracy": 0.6919520547945206, "step": 5940 }, { "epoch": 0.5502336989217456, "grad_norm": 6.1875, "learning_rate": 9.991922265988014e-06, "loss": 1.1274, "mean_token_accuracy": 0.6974804305283756, "step": 5945 }, { "epoch": 0.550696469063816, "grad_norm": 4.15625, "learning_rate": 9.975766819046854e-06, "loss": 1.157, "mean_token_accuracy": 0.6898483365949117, "step": 5950 }, { "epoch": 0.5511592392058864, "grad_norm": 4.5625, "learning_rate": 9.959611435354085e-06, "loss": 1.1318, "mean_token_accuracy": 0.6954745596868884, "step": 5955 }, { "epoch": 0.5516220093479569, "grad_norm": 4.46875, "learning_rate": 9.94345615707513e-06, "loss": 1.1233, "mean_token_accuracy": 0.6977250489236791, "step": 5960 }, { "epoch": 0.5520847794900273, "grad_norm": 4.4375, "learning_rate": 9.927301026375122e-06, "loss": 1.2195, "mean_token_accuracy": 0.674926614481409, "step": 5965 }, { "epoch": 0.5525475496320977, "grad_norm": 4.40625, "learning_rate": 9.911146085418817e-06, "loss": 1.1495, "mean_token_accuracy": 0.6930039138943249, "step": 5970 }, { "epoch": 0.5530103197741681, "grad_norm": 4.125, "learning_rate": 9.89499137637047e-06, "loss": 1.147, "mean_token_accuracy": 0.691536203522505, "step": 5975 }, { "epoch": 0.5534730899162386, "grad_norm": 5.5, "learning_rate": 9.878836941393732e-06, "loss": 1.1364, "mean_token_accuracy": 0.6949608610567515, "step": 5980 }, { "epoch": 0.5539358600583091, "grad_norm": 4.09375, "learning_rate": 9.862682822651546e-06, "loss": 1.1465, "mean_token_accuracy": 0.6900684931506849, "step": 5985 }, { "epoch": 0.5543986302003795, "grad_norm": 4.09375, "learning_rate": 9.846529062306023e-06, "loss": 1.1642, "mean_token_accuracy": 0.6848091976516633, "step": 5990 }, { "epoch": 0.5548614003424499, "grad_norm": 4.21875, "learning_rate": 9.830375702518339e-06, "loss": 1.1594, "mean_token_accuracy": 0.6897504892367906, "step": 5995 }, { "epoch": 0.5553241704845203, "grad_norm": 4.1875, "learning_rate": 9.814222785448622e-06, "loss": 1.1283, "mean_token_accuracy": 0.6962084148727985, "step": 6000 }, { "epoch": 0.5553241704845203, "eval_loss": 1.1902129650115967, "eval_mean_token_accuracy": 0.6801423373287666, "eval_runtime": 41.2898, "eval_samples_per_second": 24.8, "eval_steps_per_second": 6.2, "step": 6000 }, { "epoch": 0.5557869406265907, "grad_norm": 3.984375, "learning_rate": 9.798070353255848e-06, "loss": 1.1224, "mean_token_accuracy": 0.6977005870841487, "step": 6005 }, { "epoch": 0.5562497107686613, "grad_norm": 4.4375, "learning_rate": 9.781918448097734e-06, "loss": 1.1545, "mean_token_accuracy": 0.6868150684931507, "step": 6010 }, { "epoch": 0.5567124809107317, "grad_norm": 4.09375, "learning_rate": 9.76576711213061e-06, "loss": 1.1865, "mean_token_accuracy": 0.6804060665362035, "step": 6015 }, { "epoch": 0.5571752510528021, "grad_norm": 4.84375, "learning_rate": 9.749616387509326e-06, "loss": 1.1591, "mean_token_accuracy": 0.6886986301369863, "step": 6020 }, { "epoch": 0.5576380211948725, "grad_norm": 4.0625, "learning_rate": 9.733466316387134e-06, "loss": 1.1181, "mean_token_accuracy": 0.697137964774951, "step": 6025 }, { "epoch": 0.5581007913369429, "grad_norm": 4.625, "learning_rate": 9.717316940915582e-06, "loss": 1.1587, "mean_token_accuracy": 0.6873287671232876, "step": 6030 }, { "epoch": 0.5585635614790134, "grad_norm": 4.125, "learning_rate": 9.701168303244402e-06, "loss": 1.1345, "mean_token_accuracy": 0.6940802348336595, "step": 6035 }, { "epoch": 0.5590263316210838, "grad_norm": 4.25, "learning_rate": 9.685020445521403e-06, "loss": 1.1779, "mean_token_accuracy": 0.6798923679060664, "step": 6040 }, { "epoch": 0.5594891017631543, "grad_norm": 4.15625, "learning_rate": 9.668873409892355e-06, "loss": 1.1396, "mean_token_accuracy": 0.6954011741682973, "step": 6045 }, { "epoch": 0.5599518719052247, "grad_norm": 4.28125, "learning_rate": 9.652727238500882e-06, "loss": 1.1956, "mean_token_accuracy": 0.681482387475538, "step": 6050 }, { "epoch": 0.5604146420472951, "grad_norm": 4.28125, "learning_rate": 9.636581973488358e-06, "loss": 1.176, "mean_token_accuracy": 0.6824363992172211, "step": 6055 }, { "epoch": 0.5608774121893655, "grad_norm": 4.03125, "learning_rate": 9.62043765699378e-06, "loss": 1.1753, "mean_token_accuracy": 0.6819471624266146, "step": 6060 }, { "epoch": 0.561340182331436, "grad_norm": 4.0625, "learning_rate": 9.604294331153681e-06, "loss": 1.1053, "mean_token_accuracy": 0.6975782778864972, "step": 6065 }, { "epoch": 0.5618029524735064, "grad_norm": 4.4375, "learning_rate": 9.588152038102005e-06, "loss": 1.1621, "mean_token_accuracy": 0.6854452054794521, "step": 6070 }, { "epoch": 0.5622657226155768, "grad_norm": 4.15625, "learning_rate": 9.57201081996999e-06, "loss": 1.174, "mean_token_accuracy": 0.6857142857142856, "step": 6075 }, { "epoch": 0.5627284927576472, "grad_norm": 4.3125, "learning_rate": 9.555870718886089e-06, "loss": 1.2057, "mean_token_accuracy": 0.6784246575342465, "step": 6080 }, { "epoch": 0.5631912628997177, "grad_norm": 4.3125, "learning_rate": 9.539731776975824e-06, "loss": 1.11, "mean_token_accuracy": 0.6994618395303327, "step": 6085 }, { "epoch": 0.5636540330417882, "grad_norm": 4.25, "learning_rate": 9.523594036361695e-06, "loss": 1.1402, "mean_token_accuracy": 0.6921232876712328, "step": 6090 }, { "epoch": 0.5641168031838586, "grad_norm": 5.03125, "learning_rate": 9.507457539163069e-06, "loss": 1.133, "mean_token_accuracy": 0.696917808219178, "step": 6095 }, { "epoch": 0.564579573325929, "grad_norm": 4.125, "learning_rate": 9.491322327496062e-06, "loss": 1.1338, "mean_token_accuracy": 0.6923679060665362, "step": 6100 }, { "epoch": 0.564579573325929, "eval_loss": 1.1899802684783936, "eval_mean_token_accuracy": 0.6803277122064574, "eval_runtime": 39.635, "eval_samples_per_second": 25.836, "eval_steps_per_second": 6.459, "step": 6100 }, { "epoch": 0.5650423434679994, "grad_norm": 3.9375, "learning_rate": 9.475188443473443e-06, "loss": 1.0907, "mean_token_accuracy": 0.7036692759295498, "step": 6105 }, { "epoch": 0.5655051136100698, "grad_norm": 4.09375, "learning_rate": 9.45905592920451e-06, "loss": 1.0896, "mean_token_accuracy": 0.7036692759295498, "step": 6110 }, { "epoch": 0.5659678837521404, "grad_norm": 4.34375, "learning_rate": 9.442924826794989e-06, "loss": 1.2263, "mean_token_accuracy": 0.6677348336594912, "step": 6115 }, { "epoch": 0.5664306538942108, "grad_norm": 4.125, "learning_rate": 9.42679517834692e-06, "loss": 1.1299, "mean_token_accuracy": 0.6913894324853229, "step": 6120 }, { "epoch": 0.5668934240362812, "grad_norm": 3.921875, "learning_rate": 9.410667025958539e-06, "loss": 1.0964, "mean_token_accuracy": 0.7067025440313113, "step": 6125 }, { "epoch": 0.5673561941783516, "grad_norm": 4.09375, "learning_rate": 9.3945404117242e-06, "loss": 1.1465, "mean_token_accuracy": 0.6920988258317025, "step": 6130 }, { "epoch": 0.567818964320422, "grad_norm": 3.9375, "learning_rate": 9.378415377734222e-06, "loss": 1.1607, "mean_token_accuracy": 0.6841242661448141, "step": 6135 }, { "epoch": 0.5682817344624925, "grad_norm": 4.1875, "learning_rate": 9.362291966074806e-06, "loss": 1.1333, "mean_token_accuracy": 0.6951565557729942, "step": 6140 }, { "epoch": 0.5687445046045629, "grad_norm": 4.25, "learning_rate": 9.346170218827914e-06, "loss": 1.0973, "mean_token_accuracy": 0.7047945205479451, "step": 6145 }, { "epoch": 0.5692072747466334, "grad_norm": 4.28125, "learning_rate": 9.33005017807118e-06, "loss": 1.1037, "mean_token_accuracy": 0.7001223091976517, "step": 6150 }, { "epoch": 0.5696700448887038, "grad_norm": 5.0625, "learning_rate": 9.313931885877767e-06, "loss": 1.1162, "mean_token_accuracy": 0.6995352250489237, "step": 6155 }, { "epoch": 0.5701328150307742, "grad_norm": 4.28125, "learning_rate": 9.29781538431628e-06, "loss": 1.1689, "mean_token_accuracy": 0.6863747553816045, "step": 6160 }, { "epoch": 0.5705955851728447, "grad_norm": 5.0, "learning_rate": 9.281700715450651e-06, "loss": 1.2296, "mean_token_accuracy": 0.673752446183953, "step": 6165 }, { "epoch": 0.5710583553149151, "grad_norm": 4.375, "learning_rate": 9.265587921340027e-06, "loss": 1.179, "mean_token_accuracy": 0.6792808219178081, "step": 6170 }, { "epoch": 0.5715211254569855, "grad_norm": 4.46875, "learning_rate": 9.249477044038663e-06, "loss": 1.1452, "mean_token_accuracy": 0.6909735812133071, "step": 6175 }, { "epoch": 0.5719838955990559, "grad_norm": 4.1875, "learning_rate": 9.233368125595816e-06, "loss": 1.2375, "mean_token_accuracy": 0.6683219178082191, "step": 6180 }, { "epoch": 0.5724466657411263, "grad_norm": 4.0625, "learning_rate": 9.217261208055621e-06, "loss": 1.1194, "mean_token_accuracy": 0.6974804305283758, "step": 6185 }, { "epoch": 0.5729094358831969, "grad_norm": 4.5, "learning_rate": 9.201156333456997e-06, "loss": 1.1786, "mean_token_accuracy": 0.6810420743639923, "step": 6190 }, { "epoch": 0.5733722060252673, "grad_norm": 4.34375, "learning_rate": 9.185053543833525e-06, "loss": 1.1877, "mean_token_accuracy": 0.6773972602739726, "step": 6195 }, { "epoch": 0.5738349761673377, "grad_norm": 4.53125, "learning_rate": 9.168952881213358e-06, "loss": 1.103, "mean_token_accuracy": 0.7036203522504894, "step": 6200 }, { "epoch": 0.5738349761673377, "eval_loss": 1.1897413730621338, "eval_mean_token_accuracy": 0.6802359803082184, "eval_runtime": 39.4701, "eval_samples_per_second": 25.944, "eval_steps_per_second": 6.486, "step": 6200 }, { "epoch": 0.5742977463094081, "grad_norm": 4.875, "learning_rate": 9.152854387619081e-06, "loss": 1.0993, "mean_token_accuracy": 0.7034001956947161, "step": 6205 }, { "epoch": 0.5747605164514785, "grad_norm": 4.875, "learning_rate": 9.136758105067627e-06, "loss": 1.1448, "mean_token_accuracy": 0.6903620352250489, "step": 6210 }, { "epoch": 0.5752232865935489, "grad_norm": 4.0625, "learning_rate": 9.120664075570157e-06, "loss": 1.1806, "mean_token_accuracy": 0.6831702544031312, "step": 6215 }, { "epoch": 0.5756860567356195, "grad_norm": 4.3125, "learning_rate": 9.104572341131946e-06, "loss": 1.1786, "mean_token_accuracy": 0.6872309197651665, "step": 6220 }, { "epoch": 0.5761488268776899, "grad_norm": 4.09375, "learning_rate": 9.088482943752292e-06, "loss": 1.1995, "mean_token_accuracy": 0.6775195694716242, "step": 6225 }, { "epoch": 0.5766115970197603, "grad_norm": 3.90625, "learning_rate": 9.072395925424375e-06, "loss": 1.1505, "mean_token_accuracy": 0.6896526418786693, "step": 6230 }, { "epoch": 0.5770743671618307, "grad_norm": 4.4375, "learning_rate": 9.056311328135185e-06, "loss": 1.1735, "mean_token_accuracy": 0.6862769080234833, "step": 6235 }, { "epoch": 0.5775371373039011, "grad_norm": 4.09375, "learning_rate": 9.040229193865377e-06, "loss": 1.2044, "mean_token_accuracy": 0.6768590998043054, "step": 6240 }, { "epoch": 0.5779999074459716, "grad_norm": 4.21875, "learning_rate": 9.024149564589189e-06, "loss": 1.1463, "mean_token_accuracy": 0.6929794520547945, "step": 6245 }, { "epoch": 0.578462677588042, "grad_norm": 4.21875, "learning_rate": 9.008072482274315e-06, "loss": 1.0823, "mean_token_accuracy": 0.7081457925636008, "step": 6250 }, { "epoch": 0.5789254477301125, "grad_norm": 3.953125, "learning_rate": 8.9919979888818e-06, "loss": 1.0714, "mean_token_accuracy": 0.7113013698630135, "step": 6255 }, { "epoch": 0.5793882178721829, "grad_norm": 4.4375, "learning_rate": 8.975926126365938e-06, "loss": 1.1837, "mean_token_accuracy": 0.6806996086105673, "step": 6260 }, { "epoch": 0.5798509880142533, "grad_norm": 4.375, "learning_rate": 8.959856936674148e-06, "loss": 1.1087, "mean_token_accuracy": 0.7009784735812132, "step": 6265 }, { "epoch": 0.5803137581563238, "grad_norm": 4.28125, "learning_rate": 8.943790461746884e-06, "loss": 1.1841, "mean_token_accuracy": 0.6804794520547944, "step": 6270 }, { "epoch": 0.5807765282983942, "grad_norm": 4.3125, "learning_rate": 8.927726743517507e-06, "loss": 1.1585, "mean_token_accuracy": 0.6868639921722114, "step": 6275 }, { "epoch": 0.5812392984404646, "grad_norm": 5.0, "learning_rate": 8.911665823912184e-06, "loss": 1.1311, "mean_token_accuracy": 0.6926614481409002, "step": 6280 }, { "epoch": 0.581702068582535, "grad_norm": 4.96875, "learning_rate": 8.895607744849777e-06, "loss": 1.1228, "mean_token_accuracy": 0.6987279843444225, "step": 6285 }, { "epoch": 0.5821648387246054, "grad_norm": 4.6875, "learning_rate": 8.879552548241734e-06, "loss": 1.1225, "mean_token_accuracy": 0.6933708414872799, "step": 6290 }, { "epoch": 0.582627608866676, "grad_norm": 4.0625, "learning_rate": 8.863500275991987e-06, "loss": 1.1087, "mean_token_accuracy": 0.6970156555772994, "step": 6295 }, { "epoch": 0.5830903790087464, "grad_norm": 4.21875, "learning_rate": 8.847450969996825e-06, "loss": 1.1279, "mean_token_accuracy": 0.6975048923679061, "step": 6300 }, { "epoch": 0.5830903790087464, "eval_loss": 1.189475417137146, "eval_mean_token_accuracy": 0.6803086013943243, "eval_runtime": 39.6866, "eval_samples_per_second": 25.802, "eval_steps_per_second": 6.451, "step": 6300 }, { "epoch": 0.5835531491508168, "grad_norm": 4.28125, "learning_rate": 8.831404672144802e-06, "loss": 1.1723, "mean_token_accuracy": 0.6845890410958905, "step": 6305 }, { "epoch": 0.5840159192928872, "grad_norm": 4.0625, "learning_rate": 8.815361424316617e-06, "loss": 1.1512, "mean_token_accuracy": 0.6918052837573386, "step": 6310 }, { "epoch": 0.5844786894349576, "grad_norm": 4.28125, "learning_rate": 8.799321268385005e-06, "loss": 1.1288, "mean_token_accuracy": 0.695743639921722, "step": 6315 }, { "epoch": 0.5849414595770281, "grad_norm": 4.40625, "learning_rate": 8.783284246214647e-06, "loss": 1.151, "mean_token_accuracy": 0.6852495107632094, "step": 6320 }, { "epoch": 0.5854042297190986, "grad_norm": 4.28125, "learning_rate": 8.767250399662028e-06, "loss": 1.2117, "mean_token_accuracy": 0.6770303326810175, "step": 6325 }, { "epoch": 0.585866999861169, "grad_norm": 5.0625, "learning_rate": 8.751219770575353e-06, "loss": 1.1458, "mean_token_accuracy": 0.6936154598825832, "step": 6330 }, { "epoch": 0.5863297700032394, "grad_norm": 4.1875, "learning_rate": 8.73519240079442e-06, "loss": 1.1351, "mean_token_accuracy": 0.6901908023483365, "step": 6335 }, { "epoch": 0.5867925401453098, "grad_norm": 4.15625, "learning_rate": 8.719168332150537e-06, "loss": 1.1832, "mean_token_accuracy": 0.6795988258317027, "step": 6340 }, { "epoch": 0.5872553102873802, "grad_norm": 3.953125, "learning_rate": 8.703147606466383e-06, "loss": 1.192, "mean_token_accuracy": 0.6804794520547945, "step": 6345 }, { "epoch": 0.5877180804294507, "grad_norm": 4.15625, "learning_rate": 8.687130265555916e-06, "loss": 1.2232, "mean_token_accuracy": 0.6696917808219179, "step": 6350 }, { "epoch": 0.5881808505715211, "grad_norm": 4.21875, "learning_rate": 8.671116351224257e-06, "loss": 1.1066, "mean_token_accuracy": 0.7013698630136987, "step": 6355 }, { "epoch": 0.5886436207135916, "grad_norm": 4.21875, "learning_rate": 8.655105905267584e-06, "loss": 1.1416, "mean_token_accuracy": 0.6936154598825832, "step": 6360 }, { "epoch": 0.589106390855662, "grad_norm": 4.09375, "learning_rate": 8.639098969473031e-06, "loss": 1.1242, "mean_token_accuracy": 0.6977739726027395, "step": 6365 }, { "epoch": 0.5895691609977324, "grad_norm": 4.25, "learning_rate": 8.62309558561856e-06, "loss": 1.1262, "mean_token_accuracy": 0.6987769080234834, "step": 6370 }, { "epoch": 0.5900319311398029, "grad_norm": 4.125, "learning_rate": 8.607095795472867e-06, "loss": 1.1734, "mean_token_accuracy": 0.6885029354207437, "step": 6375 }, { "epoch": 0.5904947012818733, "grad_norm": 3.984375, "learning_rate": 8.591099640795267e-06, "loss": 1.1092, "mean_token_accuracy": 0.7015410958904109, "step": 6380 }, { "epoch": 0.5909574714239437, "grad_norm": 4.34375, "learning_rate": 8.57510716333558e-06, "loss": 1.1572, "mean_token_accuracy": 0.6905088062622308, "step": 6385 }, { "epoch": 0.5914202415660141, "grad_norm": 4.25, "learning_rate": 8.559118404834052e-06, "loss": 1.1861, "mean_token_accuracy": 0.678008806262231, "step": 6390 }, { "epoch": 0.5918830117080845, "grad_norm": 4.09375, "learning_rate": 8.543133407021196e-06, "loss": 1.1639, "mean_token_accuracy": 0.6869618395303327, "step": 6395 }, { "epoch": 0.5923457818501551, "grad_norm": 4.71875, "learning_rate": 8.527152211617721e-06, "loss": 1.191, "mean_token_accuracy": 0.6783512720156555, "step": 6400 }, { "epoch": 0.5923457818501551, "eval_loss": 1.1894431114196777, "eval_mean_token_accuracy": 0.6804060665362026, "eval_runtime": 39.6215, "eval_samples_per_second": 25.845, "eval_steps_per_second": 6.461, "step": 6400 }, { "epoch": 0.5928085519922255, "grad_norm": 4.3125, "learning_rate": 8.511174860334412e-06, "loss": 1.1897, "mean_token_accuracy": 0.6828033268101762, "step": 6405 }, { "epoch": 0.5932713221342959, "grad_norm": 4.09375, "learning_rate": 8.495201394872019e-06, "loss": 1.0822, "mean_token_accuracy": 0.7042563600782779, "step": 6410 }, { "epoch": 0.5937340922763663, "grad_norm": 4.25, "learning_rate": 8.479231856921149e-06, "loss": 1.1862, "mean_token_accuracy": 0.6842465753424658, "step": 6415 }, { "epoch": 0.5941968624184367, "grad_norm": 4.40625, "learning_rate": 8.463266288162163e-06, "loss": 1.1724, "mean_token_accuracy": 0.6870352250489237, "step": 6420 }, { "epoch": 0.5946596325605072, "grad_norm": 4.1875, "learning_rate": 8.447304730265058e-06, "loss": 1.0549, "mean_token_accuracy": 0.7168297455968691, "step": 6425 }, { "epoch": 0.5951224027025777, "grad_norm": 4.375, "learning_rate": 8.431347224889362e-06, "loss": 1.1436, "mean_token_accuracy": 0.6918542074363991, "step": 6430 }, { "epoch": 0.5955851728446481, "grad_norm": 3.9375, "learning_rate": 8.41539381368403e-06, "loss": 1.1782, "mean_token_accuracy": 0.6795009784735813, "step": 6435 }, { "epoch": 0.5960479429867185, "grad_norm": 4.6875, "learning_rate": 8.39944453828733e-06, "loss": 1.1668, "mean_token_accuracy": 0.686325831702544, "step": 6440 }, { "epoch": 0.5965107131287889, "grad_norm": 4.125, "learning_rate": 8.383499440326734e-06, "loss": 1.1271, "mean_token_accuracy": 0.6946183953033268, "step": 6445 }, { "epoch": 0.5969734832708594, "grad_norm": 4.25, "learning_rate": 8.367558561418812e-06, "loss": 1.131, "mean_token_accuracy": 0.6928082191780821, "step": 6450 }, { "epoch": 0.5974362534129298, "grad_norm": 4.125, "learning_rate": 8.351621943169116e-06, "loss": 1.0839, "mean_token_accuracy": 0.7045009784735813, "step": 6455 }, { "epoch": 0.5978990235550002, "grad_norm": 4.0625, "learning_rate": 8.33568962717209e-06, "loss": 1.1638, "mean_token_accuracy": 0.6873532289628181, "step": 6460 }, { "epoch": 0.5983617936970707, "grad_norm": 4.28125, "learning_rate": 8.319761655010945e-06, "loss": 1.1174, "mean_token_accuracy": 0.6990215264187867, "step": 6465 }, { "epoch": 0.5988245638391411, "grad_norm": 3.96875, "learning_rate": 8.30383806825755e-06, "loss": 1.1506, "mean_token_accuracy": 0.6875, "step": 6470 }, { "epoch": 0.5992873339812115, "grad_norm": 4.375, "learning_rate": 8.287918908472328e-06, "loss": 1.1977, "mean_token_accuracy": 0.6808219178082191, "step": 6475 }, { "epoch": 0.599750104123282, "grad_norm": 6.53125, "learning_rate": 8.272004217204151e-06, "loss": 1.1806, "mean_token_accuracy": 0.6839041095890411, "step": 6480 }, { "epoch": 0.6002128742653524, "grad_norm": 4.09375, "learning_rate": 8.256094035990229e-06, "loss": 1.2293, "mean_token_accuracy": 0.6684931506849316, "step": 6485 }, { "epoch": 0.6006756444074228, "grad_norm": 5.15625, "learning_rate": 8.240188406356001e-06, "loss": 1.1635, "mean_token_accuracy": 0.6883806262230919, "step": 6490 }, { "epoch": 0.6011384145494932, "grad_norm": 4.21875, "learning_rate": 8.224287369815021e-06, "loss": 1.2181, "mean_token_accuracy": 0.6721379647749511, "step": 6495 }, { "epoch": 0.6016011846915637, "grad_norm": 4.46875, "learning_rate": 8.208390967868857e-06, "loss": 1.1055, "mean_token_accuracy": 0.702568493150685, "step": 6500 }, { "epoch": 0.6016011846915637, "eval_loss": 1.1892268657684326, "eval_mean_token_accuracy": 0.6806258408757332, "eval_runtime": 39.4859, "eval_samples_per_second": 25.933, "eval_steps_per_second": 6.483, "step": 6500 }, { "epoch": 0.6020639548336342, "grad_norm": 4.28125, "learning_rate": 8.19249924200699e-06, "loss": 1.1442, "mean_token_accuracy": 0.6907289628180038, "step": 6505 }, { "epoch": 0.6025267249757046, "grad_norm": 4.46875, "learning_rate": 8.176612233706683e-06, "loss": 1.1462, "mean_token_accuracy": 0.6876956947162427, "step": 6510 }, { "epoch": 0.602989495117775, "grad_norm": 4.28125, "learning_rate": 8.160729984432897e-06, "loss": 1.2836, "mean_token_accuracy": 0.6606164383561645, "step": 6515 }, { "epoch": 0.6034522652598454, "grad_norm": 4.5625, "learning_rate": 8.144852535638161e-06, "loss": 1.19, "mean_token_accuracy": 0.6829745596868885, "step": 6520 }, { "epoch": 0.6039150354019158, "grad_norm": 4.21875, "learning_rate": 8.128979928762481e-06, "loss": 1.0886, "mean_token_accuracy": 0.7063845401174169, "step": 6525 }, { "epoch": 0.6043778055439863, "grad_norm": 4.03125, "learning_rate": 8.113112205233232e-06, "loss": 1.2078, "mean_token_accuracy": 0.6740949119373777, "step": 6530 }, { "epoch": 0.6048405756860568, "grad_norm": 4.0625, "learning_rate": 8.097249406465035e-06, "loss": 1.1818, "mean_token_accuracy": 0.680234833659491, "step": 6535 }, { "epoch": 0.6053033458281272, "grad_norm": 4.15625, "learning_rate": 8.081391573859657e-06, "loss": 1.168, "mean_token_accuracy": 0.6815313111545989, "step": 6540 }, { "epoch": 0.6057661159701976, "grad_norm": 4.3125, "learning_rate": 8.065538748805907e-06, "loss": 1.1795, "mean_token_accuracy": 0.6847358121330724, "step": 6545 }, { "epoch": 0.606228886112268, "grad_norm": 3.96875, "learning_rate": 8.049690972679522e-06, "loss": 1.1477, "mean_token_accuracy": 0.6887230919765166, "step": 6550 }, { "epoch": 0.6066916562543385, "grad_norm": 4.3125, "learning_rate": 8.033848286843065e-06, "loss": 1.19, "mean_token_accuracy": 0.6766878669275929, "step": 6555 }, { "epoch": 0.6071544263964089, "grad_norm": 4.65625, "learning_rate": 8.018010732645809e-06, "loss": 1.1914, "mean_token_accuracy": 0.6809686888454014, "step": 6560 }, { "epoch": 0.6076171965384793, "grad_norm": 4.25, "learning_rate": 8.002178351423637e-06, "loss": 1.1699, "mean_token_accuracy": 0.6869863013698632, "step": 6565 }, { "epoch": 0.6080799666805498, "grad_norm": 4.40625, "learning_rate": 7.986351184498926e-06, "loss": 1.1026, "mean_token_accuracy": 0.7008072407045011, "step": 6570 }, { "epoch": 0.6085427368226202, "grad_norm": 4.53125, "learning_rate": 7.970529273180445e-06, "loss": 1.1441, "mean_token_accuracy": 0.6897260273972602, "step": 6575 }, { "epoch": 0.6090055069646907, "grad_norm": 4.0, "learning_rate": 7.954712658763256e-06, "loss": 1.1751, "mean_token_accuracy": 0.6827788649706458, "step": 6580 }, { "epoch": 0.6094682771067611, "grad_norm": 4.34375, "learning_rate": 7.938901382528584e-06, "loss": 1.1606, "mean_token_accuracy": 0.6880626223091976, "step": 6585 }, { "epoch": 0.6099310472488315, "grad_norm": 4.1875, "learning_rate": 7.923095485743722e-06, "loss": 1.1868, "mean_token_accuracy": 0.6776663405088063, "step": 6590 }, { "epoch": 0.6103938173909019, "grad_norm": 4.28125, "learning_rate": 7.907295009661926e-06, "loss": 1.2089, "mean_token_accuracy": 0.6716976516634051, "step": 6595 }, { "epoch": 0.6108565875329723, "grad_norm": 4.96875, "learning_rate": 7.891499995522307e-06, "loss": 1.1691, "mean_token_accuracy": 0.687426614481409, "step": 6600 }, { "epoch": 0.6108565875329723, "eval_loss": 1.1889545917510986, "eval_mean_token_accuracy": 0.6806621514187864, "eval_runtime": 39.6661, "eval_samples_per_second": 25.815, "eval_steps_per_second": 6.454, "step": 6600 }, { "epoch": 0.6113193576750428, "grad_norm": 4.0625, "learning_rate": 7.875710484549714e-06, "loss": 1.1695, "mean_token_accuracy": 0.6844178082191783, "step": 6605 }, { "epoch": 0.6117821278171133, "grad_norm": 4.0, "learning_rate": 7.859926517954642e-06, "loss": 1.1123, "mean_token_accuracy": 0.6996086105675148, "step": 6610 }, { "epoch": 0.6122448979591837, "grad_norm": 4.4375, "learning_rate": 7.844148136933103e-06, "loss": 1.1287, "mean_token_accuracy": 0.6937377690802348, "step": 6615 }, { "epoch": 0.6127076681012541, "grad_norm": 4.125, "learning_rate": 7.828375382666537e-06, "loss": 1.1515, "mean_token_accuracy": 0.6886252446183952, "step": 6620 }, { "epoch": 0.6131704382433245, "grad_norm": 4.53125, "learning_rate": 7.8126082963217e-06, "loss": 1.2698, "mean_token_accuracy": 0.6644569471624268, "step": 6625 }, { "epoch": 0.6136332083853949, "grad_norm": 3.9375, "learning_rate": 7.796846919050555e-06, "loss": 1.1555, "mean_token_accuracy": 0.68676614481409, "step": 6630 }, { "epoch": 0.6140959785274654, "grad_norm": 4.1875, "learning_rate": 7.781091291990161e-06, "loss": 1.1901, "mean_token_accuracy": 0.6825097847358121, "step": 6635 }, { "epoch": 0.6145587486695359, "grad_norm": 4.09375, "learning_rate": 7.765341456262568e-06, "loss": 1.1937, "mean_token_accuracy": 0.6812133072407046, "step": 6640 }, { "epoch": 0.6150215188116063, "grad_norm": 4.46875, "learning_rate": 7.749597452974709e-06, "loss": 1.1824, "mean_token_accuracy": 0.6847602739726026, "step": 6645 }, { "epoch": 0.6154842889536767, "grad_norm": 3.984375, "learning_rate": 7.73385932321831e-06, "loss": 1.1182, "mean_token_accuracy": 0.6981898238747553, "step": 6650 }, { "epoch": 0.6159470590957471, "grad_norm": 4.15625, "learning_rate": 7.718127108069746e-06, "loss": 1.1435, "mean_token_accuracy": 0.6928571428571428, "step": 6655 }, { "epoch": 0.6164098292378176, "grad_norm": 4.4375, "learning_rate": 7.702400848589968e-06, "loss": 1.2124, "mean_token_accuracy": 0.673458904109589, "step": 6660 }, { "epoch": 0.616872599379888, "grad_norm": 3.96875, "learning_rate": 7.68668058582438e-06, "loss": 1.0689, "mean_token_accuracy": 0.7095890410958904, "step": 6665 }, { "epoch": 0.6173353695219584, "grad_norm": 4.0, "learning_rate": 7.670966360802729e-06, "loss": 1.1738, "mean_token_accuracy": 0.6861056751467711, "step": 6670 }, { "epoch": 0.6177981396640289, "grad_norm": 5.1875, "learning_rate": 7.655258214539013e-06, "loss": 1.1268, "mean_token_accuracy": 0.6953033268101761, "step": 6675 }, { "epoch": 0.6182609098060993, "grad_norm": 4.25, "learning_rate": 7.639556188031358e-06, "loss": 1.1108, "mean_token_accuracy": 0.698238747553816, "step": 6680 }, { "epoch": 0.6187236799481698, "grad_norm": 4.65625, "learning_rate": 7.62386032226192e-06, "loss": 1.1802, "mean_token_accuracy": 0.6870841487279844, "step": 6685 }, { "epoch": 0.6191864500902402, "grad_norm": 4.3125, "learning_rate": 7.6081706581967695e-06, "loss": 1.205, "mean_token_accuracy": 0.6754158512720158, "step": 6690 }, { "epoch": 0.6196492202323106, "grad_norm": 4.28125, "learning_rate": 7.592487236785806e-06, "loss": 1.1847, "mean_token_accuracy": 0.6830479452054794, "step": 6695 }, { "epoch": 0.620111990374381, "grad_norm": 4.53125, "learning_rate": 7.576810098962619e-06, "loss": 1.1072, "mean_token_accuracy": 0.7002446183953033, "step": 6700 }, { "epoch": 0.620111990374381, "eval_loss": 1.1888518333435059, "eval_mean_token_accuracy": 0.6807882827788639, "eval_runtime": 39.6606, "eval_samples_per_second": 25.819, "eval_steps_per_second": 6.455, "step": 6700 }, { "epoch": 0.6205747605164514, "grad_norm": 4.3125, "learning_rate": 7.561139285644406e-06, "loss": 1.2276, "mean_token_accuracy": 0.6678082191780822, "step": 6705 }, { "epoch": 0.621037530658522, "grad_norm": 4.34375, "learning_rate": 7.545474837731857e-06, "loss": 1.0728, "mean_token_accuracy": 0.7076320939334637, "step": 6710 }, { "epoch": 0.6215003008005924, "grad_norm": 4.4375, "learning_rate": 7.529816796109039e-06, "loss": 1.1958, "mean_token_accuracy": 0.6790362035225048, "step": 6715 }, { "epoch": 0.6219630709426628, "grad_norm": 4.625, "learning_rate": 7.514165201643322e-06, "loss": 1.1813, "mean_token_accuracy": 0.6818493150684932, "step": 6720 }, { "epoch": 0.6224258410847332, "grad_norm": 4.28125, "learning_rate": 7.498520095185224e-06, "loss": 1.1962, "mean_token_accuracy": 0.6776663405088061, "step": 6725 }, { "epoch": 0.6228886112268036, "grad_norm": 3.953125, "learning_rate": 7.482881517568344e-06, "loss": 1.1133, "mean_token_accuracy": 0.6980919765166341, "step": 6730 }, { "epoch": 0.6233513813688741, "grad_norm": 4.03125, "learning_rate": 7.467249509609231e-06, "loss": 1.2037, "mean_token_accuracy": 0.6783757338551861, "step": 6735 }, { "epoch": 0.6238141515109445, "grad_norm": 3.9375, "learning_rate": 7.4516241121072955e-06, "loss": 1.0651, "mean_token_accuracy": 0.7104696673189824, "step": 6740 }, { "epoch": 0.624276921653015, "grad_norm": 4.28125, "learning_rate": 7.436005365844693e-06, "loss": 1.203, "mean_token_accuracy": 0.6767123287671233, "step": 6745 }, { "epoch": 0.6247396917950854, "grad_norm": 3.890625, "learning_rate": 7.4203933115862155e-06, "loss": 1.1358, "mean_token_accuracy": 0.6900440313111545, "step": 6750 }, { "epoch": 0.6252024619371558, "grad_norm": 3.96875, "learning_rate": 7.404787990079194e-06, "loss": 1.1399, "mean_token_accuracy": 0.6909735812133071, "step": 6755 }, { "epoch": 0.6256652320792262, "grad_norm": 4.21875, "learning_rate": 7.389189442053384e-06, "loss": 1.1287, "mean_token_accuracy": 0.6959393346379648, "step": 6760 }, { "epoch": 0.6261280022212967, "grad_norm": 4.28125, "learning_rate": 7.373597708220857e-06, "loss": 1.1359, "mean_token_accuracy": 0.6932240704500978, "step": 6765 }, { "epoch": 0.6265907723633671, "grad_norm": 4.65625, "learning_rate": 7.358012829275914e-06, "loss": 1.0717, "mean_token_accuracy": 0.7100538160469667, "step": 6770 }, { "epoch": 0.6270535425054375, "grad_norm": 4.65625, "learning_rate": 7.3424348458949524e-06, "loss": 1.1666, "mean_token_accuracy": 0.6887230919765166, "step": 6775 }, { "epoch": 0.627516312647508, "grad_norm": 4.0, "learning_rate": 7.326863798736377e-06, "loss": 1.1827, "mean_token_accuracy": 0.6803082191780819, "step": 6780 }, { "epoch": 0.6279790827895784, "grad_norm": 4.1875, "learning_rate": 7.311299728440484e-06, "loss": 1.1371, "mean_token_accuracy": 0.6912671232876713, "step": 6785 }, { "epoch": 0.6284418529316489, "grad_norm": 4.34375, "learning_rate": 7.295742675629369e-06, "loss": 1.1843, "mean_token_accuracy": 0.6826076320939334, "step": 6790 }, { "epoch": 0.6289046230737193, "grad_norm": 4.65625, "learning_rate": 7.280192680906804e-06, "loss": 1.1577, "mean_token_accuracy": 0.6875489236790606, "step": 6795 }, { "epoch": 0.6293673932157897, "grad_norm": 3.96875, "learning_rate": 7.264649784858144e-06, "loss": 1.1457, "mean_token_accuracy": 0.6923189823874754, "step": 6800 }, { "epoch": 0.6293673932157897, "eval_loss": 1.1887832880020142, "eval_mean_token_accuracy": 0.6805685084393338, "eval_runtime": 39.2922, "eval_samples_per_second": 26.061, "eval_steps_per_second": 6.515, "step": 6800 }, { "epoch": 0.6298301633578601, "grad_norm": 4.40625, "learning_rate": 7.249114028050217e-06, "loss": 1.1748, "mean_token_accuracy": 0.6861545988258319, "step": 6805 }, { "epoch": 0.6302929334999305, "grad_norm": 4.21875, "learning_rate": 7.233585451031211e-06, "loss": 1.1872, "mean_token_accuracy": 0.6811154598825832, "step": 6810 }, { "epoch": 0.6307557036420011, "grad_norm": 4.0625, "learning_rate": 7.218064094330586e-06, "loss": 1.1138, "mean_token_accuracy": 0.6968688845401174, "step": 6815 }, { "epoch": 0.6312184737840715, "grad_norm": 4.09375, "learning_rate": 7.202549998458946e-06, "loss": 1.122, "mean_token_accuracy": 0.6938845401174168, "step": 6820 }, { "epoch": 0.6316812439261419, "grad_norm": 4.09375, "learning_rate": 7.187043203907948e-06, "loss": 1.1835, "mean_token_accuracy": 0.6793542074363992, "step": 6825 }, { "epoch": 0.6321440140682123, "grad_norm": 4.375, "learning_rate": 7.171543751150197e-06, "loss": 1.1839, "mean_token_accuracy": 0.6803326810176126, "step": 6830 }, { "epoch": 0.6326067842102827, "grad_norm": 4.28125, "learning_rate": 7.156051680639127e-06, "loss": 1.2008, "mean_token_accuracy": 0.679696673189824, "step": 6835 }, { "epoch": 0.6330695543523532, "grad_norm": 4.1875, "learning_rate": 7.140567032808916e-06, "loss": 1.1427, "mean_token_accuracy": 0.6933463796477495, "step": 6840 }, { "epoch": 0.6335323244944236, "grad_norm": 4.15625, "learning_rate": 7.125089848074362e-06, "loss": 1.1369, "mean_token_accuracy": 0.6927103718199608, "step": 6845 }, { "epoch": 0.6339950946364941, "grad_norm": 4.375, "learning_rate": 7.109620166830784e-06, "loss": 1.1106, "mean_token_accuracy": 0.6989970645792564, "step": 6850 }, { "epoch": 0.6344578647785645, "grad_norm": 5.0, "learning_rate": 7.094158029453917e-06, "loss": 1.1186, "mean_token_accuracy": 0.6968688845401174, "step": 6855 }, { "epoch": 0.6349206349206349, "grad_norm": 4.21875, "learning_rate": 7.0787034762998065e-06, "loss": 1.1405, "mean_token_accuracy": 0.6923679060665362, "step": 6860 }, { "epoch": 0.6353834050627054, "grad_norm": 4.5625, "learning_rate": 7.063256547704709e-06, "loss": 1.1694, "mean_token_accuracy": 0.6823630136986301, "step": 6865 }, { "epoch": 0.6358461752047758, "grad_norm": 4.0, "learning_rate": 7.047817283984972e-06, "loss": 1.1849, "mean_token_accuracy": 0.6813845401174168, "step": 6870 }, { "epoch": 0.6363089453468462, "grad_norm": 4.3125, "learning_rate": 7.032385725436946e-06, "loss": 1.1712, "mean_token_accuracy": 0.6839285714285714, "step": 6875 }, { "epoch": 0.6367717154889166, "grad_norm": 4.15625, "learning_rate": 7.01696191233686e-06, "loss": 1.1938, "mean_token_accuracy": 0.6820694716242662, "step": 6880 }, { "epoch": 0.6372344856309871, "grad_norm": 4.375, "learning_rate": 7.001545884940743e-06, "loss": 1.141, "mean_token_accuracy": 0.6919275929549902, "step": 6885 }, { "epoch": 0.6376972557730575, "grad_norm": 4.0625, "learning_rate": 6.986137683484293e-06, "loss": 1.096, "mean_token_accuracy": 0.700587084148728, "step": 6890 }, { "epoch": 0.638160025915128, "grad_norm": 4.21875, "learning_rate": 6.970737348182783e-06, "loss": 1.2045, "mean_token_accuracy": 0.6794765166340508, "step": 6895 }, { "epoch": 0.6386227960571984, "grad_norm": 4.28125, "learning_rate": 6.955344919230957e-06, "loss": 1.1874, "mean_token_accuracy": 0.6781066536203523, "step": 6900 }, { "epoch": 0.6386227960571984, "eval_loss": 1.1886165142059326, "eval_mean_token_accuracy": 0.6806143743884533, "eval_runtime": 40.0154, "eval_samples_per_second": 25.59, "eval_steps_per_second": 6.398, "step": 6900 }, { "epoch": 0.6390855661992688, "grad_norm": 4.875, "learning_rate": 6.939960436802918e-06, "loss": 1.0537, "mean_token_accuracy": 0.7139187866927593, "step": 6905 }, { "epoch": 0.6395483363413392, "grad_norm": 4.125, "learning_rate": 6.924583941052043e-06, "loss": 1.1027, "mean_token_accuracy": 0.7021037181996086, "step": 6910 }, { "epoch": 0.6400111064834096, "grad_norm": 4.25, "learning_rate": 6.909215472110853e-06, "loss": 1.1665, "mean_token_accuracy": 0.6828277886497064, "step": 6915 }, { "epoch": 0.6404738766254802, "grad_norm": 4.25, "learning_rate": 6.893855070090917e-06, "loss": 1.1326, "mean_token_accuracy": 0.6935176125244618, "step": 6920 }, { "epoch": 0.6409366467675506, "grad_norm": 4.0625, "learning_rate": 6.878502775082755e-06, "loss": 1.2306, "mean_token_accuracy": 0.6686154598825831, "step": 6925 }, { "epoch": 0.641399416909621, "grad_norm": 4.90625, "learning_rate": 6.863158627155724e-06, "loss": 1.1341, "mean_token_accuracy": 0.6932729941291584, "step": 6930 }, { "epoch": 0.6418621870516914, "grad_norm": 4.0, "learning_rate": 6.847822666357924e-06, "loss": 1.1143, "mean_token_accuracy": 0.694055772994129, "step": 6935 }, { "epoch": 0.6423249571937618, "grad_norm": 3.984375, "learning_rate": 6.832494932716078e-06, "loss": 1.1801, "mean_token_accuracy": 0.6830968688845402, "step": 6940 }, { "epoch": 0.6427877273358323, "grad_norm": 4.4375, "learning_rate": 6.8171754662354416e-06, "loss": 1.1018, "mean_token_accuracy": 0.704770058708415, "step": 6945 }, { "epoch": 0.6432504974779027, "grad_norm": 4.59375, "learning_rate": 6.80186430689969e-06, "loss": 1.1351, "mean_token_accuracy": 0.6930283757338553, "step": 6950 }, { "epoch": 0.6437132676199732, "grad_norm": 4.1875, "learning_rate": 6.786561494670818e-06, "loss": 1.21, "mean_token_accuracy": 0.6758561643835617, "step": 6955 }, { "epoch": 0.6441760377620436, "grad_norm": 3.875, "learning_rate": 6.7712670694890384e-06, "loss": 1.2393, "mean_token_accuracy": 0.6663649706457926, "step": 6960 }, { "epoch": 0.644638807904114, "grad_norm": 4.125, "learning_rate": 6.755981071272671e-06, "loss": 1.1758, "mean_token_accuracy": 0.687720156555773, "step": 6965 }, { "epoch": 0.6451015780461845, "grad_norm": 4.375, "learning_rate": 6.740703539918036e-06, "loss": 1.2206, "mean_token_accuracy": 0.6724559686888453, "step": 6970 }, { "epoch": 0.6455643481882549, "grad_norm": 4.3125, "learning_rate": 6.725434515299362e-06, "loss": 1.1852, "mean_token_accuracy": 0.6813356164383563, "step": 6975 }, { "epoch": 0.6460271183303253, "grad_norm": 4.03125, "learning_rate": 6.710174037268673e-06, "loss": 1.1191, "mean_token_accuracy": 0.7, "step": 6980 }, { "epoch": 0.6464898884723957, "grad_norm": 4.375, "learning_rate": 6.694922145655689e-06, "loss": 1.1754, "mean_token_accuracy": 0.6818982387475538, "step": 6985 }, { "epoch": 0.6469526586144662, "grad_norm": 4.03125, "learning_rate": 6.679678880267715e-06, "loss": 1.1786, "mean_token_accuracy": 0.6838551859099804, "step": 6990 }, { "epoch": 0.6474154287565367, "grad_norm": 4.40625, "learning_rate": 6.664444280889544e-06, "loss": 1.1005, "mean_token_accuracy": 0.703008806262231, "step": 6995 }, { "epoch": 0.6478781988986071, "grad_norm": 4.75, "learning_rate": 6.6492183872833475e-06, "loss": 1.161, "mean_token_accuracy": 0.6857387475538161, "step": 7000 }, { "epoch": 0.6478781988986071, "eval_loss": 1.1886982917785645, "eval_mean_token_accuracy": 0.6805742416829732, "eval_runtime": 41.8252, "eval_samples_per_second": 24.483, "eval_steps_per_second": 6.121, "step": 7000 }, { "epoch": 0.6483409690406775, "grad_norm": 4.125, "learning_rate": 6.634001239188582e-06, "loss": 1.1344, "mean_token_accuracy": 0.6984099804305284, "step": 7005 }, { "epoch": 0.6488037391827479, "grad_norm": 4.1875, "learning_rate": 6.618792876321873e-06, "loss": 1.1156, "mean_token_accuracy": 0.6970401174168297, "step": 7010 }, { "epoch": 0.6492665093248183, "grad_norm": 4.5, "learning_rate": 6.6035933383769194e-06, "loss": 1.1763, "mean_token_accuracy": 0.6848336594911938, "step": 7015 }, { "epoch": 0.6497292794668887, "grad_norm": 4.21875, "learning_rate": 6.588402665024381e-06, "loss": 1.1731, "mean_token_accuracy": 0.6831947162426615, "step": 7020 }, { "epoch": 0.6501920496089593, "grad_norm": 4.28125, "learning_rate": 6.573220895911786e-06, "loss": 1.1483, "mean_token_accuracy": 0.6909246575342466, "step": 7025 }, { "epoch": 0.6506548197510297, "grad_norm": 4.46875, "learning_rate": 6.558048070663428e-06, "loss": 1.089, "mean_token_accuracy": 0.7010273972602741, "step": 7030 }, { "epoch": 0.6511175898931001, "grad_norm": 4.5, "learning_rate": 6.54288422888025e-06, "loss": 1.1157, "mean_token_accuracy": 0.6970645792563601, "step": 7035 }, { "epoch": 0.6515803600351705, "grad_norm": 4.1875, "learning_rate": 6.527729410139747e-06, "loss": 1.0771, "mean_token_accuracy": 0.7063600782778864, "step": 7040 }, { "epoch": 0.6520431301772409, "grad_norm": 4.3125, "learning_rate": 6.512583653995867e-06, "loss": 1.1993, "mean_token_accuracy": 0.678082191780822, "step": 7045 }, { "epoch": 0.6525059003193114, "grad_norm": 4.90625, "learning_rate": 6.497446999978902e-06, "loss": 1.1261, "mean_token_accuracy": 0.6979941291585127, "step": 7050 }, { "epoch": 0.6529686704613819, "grad_norm": 4.15625, "learning_rate": 6.482319487595396e-06, "loss": 1.171, "mean_token_accuracy": 0.6862279843444228, "step": 7055 }, { "epoch": 0.6534314406034523, "grad_norm": 4.0625, "learning_rate": 6.4672011563280254e-06, "loss": 1.1723, "mean_token_accuracy": 0.6806017612524461, "step": 7060 }, { "epoch": 0.6538942107455227, "grad_norm": 4.15625, "learning_rate": 6.452092045635504e-06, "loss": 1.161, "mean_token_accuracy": 0.6839530332681016, "step": 7065 }, { "epoch": 0.6543569808875931, "grad_norm": 4.65625, "learning_rate": 6.436992194952478e-06, "loss": 1.1963, "mean_token_accuracy": 0.6807240704500978, "step": 7070 }, { "epoch": 0.6548197510296636, "grad_norm": 4.25, "learning_rate": 6.421901643689438e-06, "loss": 1.0916, "mean_token_accuracy": 0.7058463796477494, "step": 7075 }, { "epoch": 0.655282521171734, "grad_norm": 4.125, "learning_rate": 6.406820431232588e-06, "loss": 1.1653, "mean_token_accuracy": 0.688307240704501, "step": 7080 }, { "epoch": 0.6557452913138044, "grad_norm": 4.09375, "learning_rate": 6.3917485969437665e-06, "loss": 1.1841, "mean_token_accuracy": 0.6822651663405088, "step": 7085 }, { "epoch": 0.6562080614558748, "grad_norm": 4.25, "learning_rate": 6.376686180160331e-06, "loss": 1.1828, "mean_token_accuracy": 0.684417808219178, "step": 7090 }, { "epoch": 0.6566708315979453, "grad_norm": 4.59375, "learning_rate": 6.361633220195057e-06, "loss": 1.1693, "mean_token_accuracy": 0.6845156555772994, "step": 7095 }, { "epoch": 0.6571336017400158, "grad_norm": 4.625, "learning_rate": 6.34658975633605e-06, "loss": 1.1763, "mean_token_accuracy": 0.6824608610567514, "step": 7100 }, { "epoch": 0.6571336017400158, "eval_loss": 1.1885125637054443, "eval_mean_token_accuracy": 0.6806506849315062, "eval_runtime": 41.8956, "eval_samples_per_second": 24.442, "eval_steps_per_second": 6.11, "step": 7100 }, { "epoch": 0.6575963718820862, "grad_norm": 5.40625, "learning_rate": 6.331555827846618e-06, "loss": 1.1866, "mean_token_accuracy": 0.6777641878669277, "step": 7105 }, { "epoch": 0.6580591420241566, "grad_norm": 4.46875, "learning_rate": 6.316531473965186e-06, "loss": 1.1768, "mean_token_accuracy": 0.6832191780821918, "step": 7110 }, { "epoch": 0.658521912166227, "grad_norm": 4.34375, "learning_rate": 6.301516733905189e-06, "loss": 1.1916, "mean_token_accuracy": 0.6799657534246576, "step": 7115 }, { "epoch": 0.6589846823082974, "grad_norm": 4.34375, "learning_rate": 6.2865116468549705e-06, "loss": 1.0768, "mean_token_accuracy": 0.7055283757338553, "step": 7120 }, { "epoch": 0.659447452450368, "grad_norm": 4.0, "learning_rate": 6.271516251977681e-06, "loss": 1.1751, "mean_token_accuracy": 0.6834393346379647, "step": 7125 }, { "epoch": 0.6599102225924384, "grad_norm": 4.21875, "learning_rate": 6.256530588411174e-06, "loss": 1.1854, "mean_token_accuracy": 0.6820205479452055, "step": 7130 }, { "epoch": 0.6603729927345088, "grad_norm": 4.0625, "learning_rate": 6.2415546952679015e-06, "loss": 1.2015, "mean_token_accuracy": 0.6801614481409002, "step": 7135 }, { "epoch": 0.6608357628765792, "grad_norm": 3.953125, "learning_rate": 6.226588611634817e-06, "loss": 1.2043, "mean_token_accuracy": 0.6771037181996087, "step": 7140 }, { "epoch": 0.6612985330186496, "grad_norm": 4.78125, "learning_rate": 6.211632376573269e-06, "loss": 1.1308, "mean_token_accuracy": 0.696037181996086, "step": 7145 }, { "epoch": 0.66176130316072, "grad_norm": 4.3125, "learning_rate": 6.196686029118909e-06, "loss": 1.14, "mean_token_accuracy": 0.6946428571428572, "step": 7150 }, { "epoch": 0.6622240733027905, "grad_norm": 4.8125, "learning_rate": 6.181749608281574e-06, "loss": 1.1476, "mean_token_accuracy": 0.6873532289628179, "step": 7155 }, { "epoch": 0.662686843444861, "grad_norm": 4.34375, "learning_rate": 6.166823153045195e-06, "loss": 1.1977, "mean_token_accuracy": 0.6818248532289629, "step": 7160 }, { "epoch": 0.6631496135869314, "grad_norm": 3.984375, "learning_rate": 6.15190670236769e-06, "loss": 1.1522, "mean_token_accuracy": 0.6888698630136987, "step": 7165 }, { "epoch": 0.6636123837290018, "grad_norm": 4.3125, "learning_rate": 6.13700029518087e-06, "loss": 1.0722, "mean_token_accuracy": 0.7076320939334636, "step": 7170 }, { "epoch": 0.6640751538710722, "grad_norm": 4.28125, "learning_rate": 6.122103970390334e-06, "loss": 1.2362, "mean_token_accuracy": 0.6734344422700587, "step": 7175 }, { "epoch": 0.6645379240131427, "grad_norm": 4.3125, "learning_rate": 6.1072177668753594e-06, "loss": 1.197, "mean_token_accuracy": 0.6822896281800392, "step": 7180 }, { "epoch": 0.6650006941552131, "grad_norm": 4.0, "learning_rate": 6.092341723488811e-06, "loss": 1.1691, "mean_token_accuracy": 0.6860812133072407, "step": 7185 }, { "epoch": 0.6654634642972835, "grad_norm": 4.0625, "learning_rate": 6.077475879057031e-06, "loss": 1.1443, "mean_token_accuracy": 0.6923434442270059, "step": 7190 }, { "epoch": 0.665926234439354, "grad_norm": 3.921875, "learning_rate": 6.062620272379754e-06, "loss": 1.1572, "mean_token_accuracy": 0.6851272015655577, "step": 7195 }, { "epoch": 0.6663890045814244, "grad_norm": 4.53125, "learning_rate": 6.047774942229983e-06, "loss": 1.0978, "mean_token_accuracy": 0.7027397260273973, "step": 7200 }, { "epoch": 0.6663890045814244, "eval_loss": 1.1883617639541626, "eval_mean_token_accuracy": 0.6805341089774946, "eval_runtime": 39.6431, "eval_samples_per_second": 25.83, "eval_steps_per_second": 6.458, "step": 7200 }, { "epoch": 0.6668517747234949, "grad_norm": 4.40625, "learning_rate": 6.032939927353902e-06, "loss": 1.1472, "mean_token_accuracy": 0.6906311154598825, "step": 7205 }, { "epoch": 0.6673145448655653, "grad_norm": 4.09375, "learning_rate": 6.018115266470777e-06, "loss": 1.1456, "mean_token_accuracy": 0.6912915851272017, "step": 7210 }, { "epoch": 0.6677773150076357, "grad_norm": 4.25, "learning_rate": 6.003300998272839e-06, "loss": 1.2087, "mean_token_accuracy": 0.6738502935420744, "step": 7215 }, { "epoch": 0.6682400851497061, "grad_norm": 4.75, "learning_rate": 5.9884971614252085e-06, "loss": 1.1269, "mean_token_accuracy": 0.693468688845401, "step": 7220 }, { "epoch": 0.6687028552917765, "grad_norm": 4.15625, "learning_rate": 5.973703794565774e-06, "loss": 1.1631, "mean_token_accuracy": 0.6886497064579257, "step": 7225 }, { "epoch": 0.6691656254338471, "grad_norm": 4.28125, "learning_rate": 5.958920936305092e-06, "loss": 1.1497, "mean_token_accuracy": 0.6909246575342467, "step": 7230 }, { "epoch": 0.6696283955759175, "grad_norm": 3.890625, "learning_rate": 5.9441486252263e-06, "loss": 1.1526, "mean_token_accuracy": 0.6865949119373778, "step": 7235 }, { "epoch": 0.6700911657179879, "grad_norm": 4.1875, "learning_rate": 5.929386899885003e-06, "loss": 1.1628, "mean_token_accuracy": 0.689041095890411, "step": 7240 }, { "epoch": 0.6705539358600583, "grad_norm": 4.3125, "learning_rate": 5.914635798809179e-06, "loss": 1.1631, "mean_token_accuracy": 0.6867416829745598, "step": 7245 }, { "epoch": 0.6710167060021287, "grad_norm": 4.3125, "learning_rate": 5.899895360499073e-06, "loss": 1.1515, "mean_token_accuracy": 0.689774951076321, "step": 7250 }, { "epoch": 0.6714794761441992, "grad_norm": 4.40625, "learning_rate": 5.885165623427106e-06, "loss": 1.1416, "mean_token_accuracy": 0.6900929549902152, "step": 7255 }, { "epoch": 0.6719422462862696, "grad_norm": 4.0625, "learning_rate": 5.870446626037762e-06, "loss": 1.1362, "mean_token_accuracy": 0.6940068493150686, "step": 7260 }, { "epoch": 0.67240501642834, "grad_norm": 4.6875, "learning_rate": 5.855738406747505e-06, "loss": 1.152, "mean_token_accuracy": 0.6930528375733853, "step": 7265 }, { "epoch": 0.6728677865704105, "grad_norm": 4.40625, "learning_rate": 5.84104100394466e-06, "loss": 1.1511, "mean_token_accuracy": 0.6922211350293541, "step": 7270 }, { "epoch": 0.6733305567124809, "grad_norm": 4.25, "learning_rate": 5.826354455989318e-06, "loss": 1.1869, "mean_token_accuracy": 0.6792318982387474, "step": 7275 }, { "epoch": 0.6737933268545514, "grad_norm": 3.921875, "learning_rate": 5.811678801213252e-06, "loss": 1.1272, "mean_token_accuracy": 0.6953522504892369, "step": 7280 }, { "epoch": 0.6742560969966218, "grad_norm": 5.15625, "learning_rate": 5.797014077919791e-06, "loss": 1.2225, "mean_token_accuracy": 0.6732387475538163, "step": 7285 }, { "epoch": 0.6747188671386922, "grad_norm": 4.4375, "learning_rate": 5.782360324383738e-06, "loss": 1.107, "mean_token_accuracy": 0.6991438356164383, "step": 7290 }, { "epoch": 0.6751816372807626, "grad_norm": 4.65625, "learning_rate": 5.767717578851264e-06, "loss": 1.1613, "mean_token_accuracy": 0.6857876712328769, "step": 7295 }, { "epoch": 0.675644407422833, "grad_norm": 4.75, "learning_rate": 5.7530858795398126e-06, "loss": 1.263, "mean_token_accuracy": 0.6609833659491192, "step": 7300 }, { "epoch": 0.675644407422833, "eval_loss": 1.1882083415985107, "eval_mean_token_accuracy": 0.6804614878913892, "eval_runtime": 39.6683, "eval_samples_per_second": 25.814, "eval_steps_per_second": 6.454, "step": 7300 }, { "epoch": 0.6761071775649035, "grad_norm": 4.25, "learning_rate": 5.7384652646379915e-06, "loss": 1.1696, "mean_token_accuracy": 0.6869618395303327, "step": 7305 }, { "epoch": 0.676569947706974, "grad_norm": 4.15625, "learning_rate": 5.723855772305478e-06, "loss": 1.1818, "mean_token_accuracy": 0.6798434442270057, "step": 7310 }, { "epoch": 0.6770327178490444, "grad_norm": 4.09375, "learning_rate": 5.709257440672931e-06, "loss": 1.1006, "mean_token_accuracy": 0.704109589041096, "step": 7315 }, { "epoch": 0.6774954879911148, "grad_norm": 4.34375, "learning_rate": 5.694670307841867e-06, "loss": 1.1756, "mean_token_accuracy": 0.6865459882583171, "step": 7320 }, { "epoch": 0.6779582581331852, "grad_norm": 4.3125, "learning_rate": 5.680094411884578e-06, "loss": 1.2159, "mean_token_accuracy": 0.6715998043052838, "step": 7325 }, { "epoch": 0.6784210282752556, "grad_norm": 4.0625, "learning_rate": 5.665529790844024e-06, "loss": 1.1074, "mean_token_accuracy": 0.7019324853228963, "step": 7330 }, { "epoch": 0.6788837984173262, "grad_norm": 4.125, "learning_rate": 5.65097648273375e-06, "loss": 1.1899, "mean_token_accuracy": 0.6790117416829744, "step": 7335 }, { "epoch": 0.6793465685593966, "grad_norm": 5.28125, "learning_rate": 5.6364345255377614e-06, "loss": 1.115, "mean_token_accuracy": 0.6954500978473581, "step": 7340 }, { "epoch": 0.679809338701467, "grad_norm": 4.15625, "learning_rate": 5.621903957210446e-06, "loss": 1.1213, "mean_token_accuracy": 0.6957191780821919, "step": 7345 }, { "epoch": 0.6802721088435374, "grad_norm": 4.5, "learning_rate": 5.607384815676458e-06, "loss": 1.1146, "mean_token_accuracy": 0.7009295499021524, "step": 7350 }, { "epoch": 0.6807348789856078, "grad_norm": 4.25, "learning_rate": 5.592877138830633e-06, "loss": 1.1829, "mean_token_accuracy": 0.6806751467710372, "step": 7355 }, { "epoch": 0.6811976491276783, "grad_norm": 4.28125, "learning_rate": 5.578380964537889e-06, "loss": 1.1662, "mean_token_accuracy": 0.6869373776908023, "step": 7360 }, { "epoch": 0.6816604192697487, "grad_norm": 4.25, "learning_rate": 5.5638963306331125e-06, "loss": 1.2662, "mean_token_accuracy": 0.6600782778864971, "step": 7365 }, { "epoch": 0.6821231894118192, "grad_norm": 4.34375, "learning_rate": 5.549423274921079e-06, "loss": 1.1509, "mean_token_accuracy": 0.6877446183953033, "step": 7370 }, { "epoch": 0.6825859595538896, "grad_norm": 4.1875, "learning_rate": 5.534961835176337e-06, "loss": 1.1813, "mean_token_accuracy": 0.6798434442270058, "step": 7375 }, { "epoch": 0.68304872969596, "grad_norm": 4.3125, "learning_rate": 5.520512049143117e-06, "loss": 1.167, "mean_token_accuracy": 0.6847847358121332, "step": 7380 }, { "epoch": 0.6835114998380305, "grad_norm": 4.34375, "learning_rate": 5.506073954535247e-06, "loss": 1.1883, "mean_token_accuracy": 0.6799657534246575, "step": 7385 }, { "epoch": 0.6839742699801009, "grad_norm": 4.3125, "learning_rate": 5.491647589036027e-06, "loss": 1.0739, "mean_token_accuracy": 0.7080479452054795, "step": 7390 }, { "epoch": 0.6844370401221713, "grad_norm": 4.21875, "learning_rate": 5.477232990298145e-06, "loss": 1.1907, "mean_token_accuracy": 0.6827054794520548, "step": 7395 }, { "epoch": 0.6848998102642417, "grad_norm": 4.125, "learning_rate": 5.462830195943583e-06, "loss": 1.1626, "mean_token_accuracy": 0.6885273972602739, "step": 7400 }, { "epoch": 0.6848998102642417, "eval_loss": 1.188185453414917, "eval_mean_token_accuracy": 0.6806181965508801, "eval_runtime": 39.5022, "eval_samples_per_second": 25.923, "eval_steps_per_second": 6.481, "step": 7400 }, { "epoch": 0.6853625804063121, "grad_norm": 4.6875, "learning_rate": 5.44843924356351e-06, "loss": 1.1546, "mean_token_accuracy": 0.6903131115459883, "step": 7405 }, { "epoch": 0.6858253505483827, "grad_norm": 4.0, "learning_rate": 5.434060170718189e-06, "loss": 1.2181, "mean_token_accuracy": 0.6732387475538161, "step": 7410 }, { "epoch": 0.6862881206904531, "grad_norm": 4.34375, "learning_rate": 5.4196930149368805e-06, "loss": 1.1494, "mean_token_accuracy": 0.6942025440313113, "step": 7415 }, { "epoch": 0.6867508908325235, "grad_norm": 4.8125, "learning_rate": 5.405337813717736e-06, "loss": 1.175, "mean_token_accuracy": 0.6832191780821917, "step": 7420 }, { "epoch": 0.6872136609745939, "grad_norm": 4.34375, "learning_rate": 5.390994604527704e-06, "loss": 1.1465, "mean_token_accuracy": 0.689628180039139, "step": 7425 }, { "epoch": 0.6876764311166643, "grad_norm": 5.375, "learning_rate": 5.376663424802448e-06, "loss": 1.1164, "mean_token_accuracy": 0.6977005870841487, "step": 7430 }, { "epoch": 0.6881392012587347, "grad_norm": 4.0, "learning_rate": 5.362344311946219e-06, "loss": 1.1513, "mean_token_accuracy": 0.687353228962818, "step": 7435 }, { "epoch": 0.6886019714008053, "grad_norm": 4.65625, "learning_rate": 5.348037303331779e-06, "loss": 1.2076, "mean_token_accuracy": 0.67426614481409, "step": 7440 }, { "epoch": 0.6890647415428757, "grad_norm": 4.15625, "learning_rate": 5.3337424363003e-06, "loss": 1.1078, "mean_token_accuracy": 0.6962818003913894, "step": 7445 }, { "epoch": 0.6895275116849461, "grad_norm": 5.1875, "learning_rate": 5.319459748161259e-06, "loss": 1.1551, "mean_token_accuracy": 0.6839530332681018, "step": 7450 }, { "epoch": 0.6899902818270165, "grad_norm": 3.765625, "learning_rate": 5.305189276192357e-06, "loss": 1.0416, "mean_token_accuracy": 0.7184931506849315, "step": 7455 }, { "epoch": 0.6904530519690869, "grad_norm": 4.3125, "learning_rate": 5.290931057639401e-06, "loss": 1.2063, "mean_token_accuracy": 0.6734833659491193, "step": 7460 }, { "epoch": 0.6909158221111574, "grad_norm": 3.96875, "learning_rate": 5.27668512971622e-06, "loss": 1.1035, "mean_token_accuracy": 0.7017123287671233, "step": 7465 }, { "epoch": 0.6913785922532278, "grad_norm": 3.84375, "learning_rate": 5.2624515296045645e-06, "loss": 1.1362, "mean_token_accuracy": 0.6962084148727985, "step": 7470 }, { "epoch": 0.6918413623952983, "grad_norm": 4.78125, "learning_rate": 5.2482302944540046e-06, "loss": 1.198, "mean_token_accuracy": 0.6787426614481411, "step": 7475 }, { "epoch": 0.6923041325373687, "grad_norm": 4.3125, "learning_rate": 5.234021461381851e-06, "loss": 1.1762, "mean_token_accuracy": 0.686619373776908, "step": 7480 }, { "epoch": 0.6927669026794391, "grad_norm": 4.21875, "learning_rate": 5.219825067473032e-06, "loss": 1.1487, "mean_token_accuracy": 0.6891144814090018, "step": 7485 }, { "epoch": 0.6932296728215096, "grad_norm": 4.65625, "learning_rate": 5.205641149780016e-06, "loss": 1.1487, "mean_token_accuracy": 0.6856164383561644, "step": 7490 }, { "epoch": 0.69369244296358, "grad_norm": 4.6875, "learning_rate": 5.191469745322708e-06, "loss": 1.1908, "mean_token_accuracy": 0.6776663405088064, "step": 7495 }, { "epoch": 0.6941552131056504, "grad_norm": 4.21875, "learning_rate": 5.177310891088347e-06, "loss": 1.1894, "mean_token_accuracy": 0.6842954990215263, "step": 7500 }, { "epoch": 0.6941552131056504, "eval_loss": 1.1881524324417114, "eval_mean_token_accuracy": 0.6804366438356158, "eval_runtime": 39.6125, "eval_samples_per_second": 25.85, "eval_steps_per_second": 6.463, "step": 7500 }, { "epoch": 0.6946179832477208, "grad_norm": 4.6875, "learning_rate": 5.1631646240314295e-06, "loss": 1.1372, "mean_token_accuracy": 0.69043542074364, "step": 7505 }, { "epoch": 0.6950807533897913, "grad_norm": 4.53125, "learning_rate": 5.149030981073588e-06, "loss": 1.1731, "mean_token_accuracy": 0.6813845401174168, "step": 7510 }, { "epoch": 0.6955435235318618, "grad_norm": 4.15625, "learning_rate": 5.134909999103513e-06, "loss": 1.1658, "mean_token_accuracy": 0.6848825831702546, "step": 7515 }, { "epoch": 0.6960062936739322, "grad_norm": 8.5, "learning_rate": 5.120801714976844e-06, "loss": 1.1365, "mean_token_accuracy": 0.6990949119373777, "step": 7520 }, { "epoch": 0.6964690638160026, "grad_norm": 4.375, "learning_rate": 5.106706165516082e-06, "loss": 1.1396, "mean_token_accuracy": 0.6949853228962819, "step": 7525 }, { "epoch": 0.696931833958073, "grad_norm": 4.5625, "learning_rate": 5.092623387510491e-06, "loss": 1.1639, "mean_token_accuracy": 0.6891878669275929, "step": 7530 }, { "epoch": 0.6973946041001434, "grad_norm": 4.125, "learning_rate": 5.0785534177160044e-06, "loss": 1.1164, "mean_token_accuracy": 0.6987035225048924, "step": 7535 }, { "epoch": 0.697857374242214, "grad_norm": 4.4375, "learning_rate": 5.06449629285512e-06, "loss": 1.1698, "mean_token_accuracy": 0.6856898238747552, "step": 7540 }, { "epoch": 0.6983201443842844, "grad_norm": 4.25, "learning_rate": 5.050452049616812e-06, "loss": 1.1486, "mean_token_accuracy": 0.6942514677103719, "step": 7545 }, { "epoch": 0.6987829145263548, "grad_norm": 4.46875, "learning_rate": 5.036420724656441e-06, "loss": 1.1452, "mean_token_accuracy": 0.6910225048923679, "step": 7550 }, { "epoch": 0.6992456846684252, "grad_norm": 4.53125, "learning_rate": 5.022402354595645e-06, "loss": 1.1993, "mean_token_accuracy": 0.6773238747553816, "step": 7555 }, { "epoch": 0.6997084548104956, "grad_norm": 4.3125, "learning_rate": 5.0083969760222476e-06, "loss": 1.1314, "mean_token_accuracy": 0.689554794520548, "step": 7560 }, { "epoch": 0.700171224952566, "grad_norm": 6.09375, "learning_rate": 4.994404625490167e-06, "loss": 1.1259, "mean_token_accuracy": 0.696110567514677, "step": 7565 }, { "epoch": 0.7006339950946365, "grad_norm": 4.03125, "learning_rate": 4.980425339519316e-06, "loss": 1.2125, "mean_token_accuracy": 0.6751956947162427, "step": 7570 }, { "epoch": 0.7010967652367069, "grad_norm": 4.03125, "learning_rate": 4.96645915459552e-06, "loss": 1.13, "mean_token_accuracy": 0.6927837573385518, "step": 7575 }, { "epoch": 0.7015595353787774, "grad_norm": 4.25, "learning_rate": 4.952506107170398e-06, "loss": 1.1508, "mean_token_accuracy": 0.6912181996086105, "step": 7580 }, { "epoch": 0.7020223055208478, "grad_norm": 4.34375, "learning_rate": 4.938566233661286e-06, "loss": 1.1814, "mean_token_accuracy": 0.6830968688845401, "step": 7585 }, { "epoch": 0.7024850756629182, "grad_norm": 4.1875, "learning_rate": 4.924639570451133e-06, "loss": 1.125, "mean_token_accuracy": 0.6965019569471625, "step": 7590 }, { "epoch": 0.7029478458049887, "grad_norm": 4.15625, "learning_rate": 4.910726153888408e-06, "loss": 1.1235, "mean_token_accuracy": 0.6970401174168298, "step": 7595 }, { "epoch": 0.7034106159470591, "grad_norm": 4.96875, "learning_rate": 4.8968260202870165e-06, "loss": 1.1751, "mean_token_accuracy": 0.687451076320939, "step": 7600 }, { "epoch": 0.7034106159470591, "eval_loss": 1.1881890296936035, "eval_mean_token_accuracy": 0.6806239297945198, "eval_runtime": 39.4516, "eval_samples_per_second": 25.956, "eval_steps_per_second": 6.489, "step": 7600 }, { "epoch": 0.7038733860891295, "grad_norm": 4.34375, "learning_rate": 4.8829392059261874e-06, "loss": 1.1949, "mean_token_accuracy": 0.6779843444227006, "step": 7605 }, { "epoch": 0.7043361562311999, "grad_norm": 4.34375, "learning_rate": 4.869065747050386e-06, "loss": 1.1768, "mean_token_accuracy": 0.6816536203522505, "step": 7610 }, { "epoch": 0.7047989263732704, "grad_norm": 4.21875, "learning_rate": 4.855205679869217e-06, "loss": 1.1523, "mean_token_accuracy": 0.6931996086105676, "step": 7615 }, { "epoch": 0.7052616965153409, "grad_norm": 4.5625, "learning_rate": 4.841359040557346e-06, "loss": 1.189, "mean_token_accuracy": 0.6791585127201567, "step": 7620 }, { "epoch": 0.7057244666574113, "grad_norm": 4.15625, "learning_rate": 4.82752586525438e-06, "loss": 1.1216, "mean_token_accuracy": 0.6970890410958905, "step": 7625 }, { "epoch": 0.7061872367994817, "grad_norm": 4.6875, "learning_rate": 4.813706190064789e-06, "loss": 1.1285, "mean_token_accuracy": 0.6933953033268102, "step": 7630 }, { "epoch": 0.7066500069415521, "grad_norm": 4.03125, "learning_rate": 4.799900051057806e-06, "loss": 1.1278, "mean_token_accuracy": 0.6944716242661447, "step": 7635 }, { "epoch": 0.7071127770836225, "grad_norm": 4.46875, "learning_rate": 4.786107484267332e-06, "loss": 1.188, "mean_token_accuracy": 0.6820939334637964, "step": 7640 }, { "epoch": 0.707575547225693, "grad_norm": 5.28125, "learning_rate": 4.772328525691859e-06, "loss": 1.1577, "mean_token_accuracy": 0.6846624266144813, "step": 7645 }, { "epoch": 0.7080383173677635, "grad_norm": 5.4375, "learning_rate": 4.758563211294345e-06, "loss": 1.2053, "mean_token_accuracy": 0.6772749510763211, "step": 7650 }, { "epoch": 0.7085010875098339, "grad_norm": 4.25, "learning_rate": 4.7448115770021445e-06, "loss": 1.0983, "mean_token_accuracy": 0.7052592954990213, "step": 7655 }, { "epoch": 0.7089638576519043, "grad_norm": 4.6875, "learning_rate": 4.731073658706904e-06, "loss": 1.1305, "mean_token_accuracy": 0.6977984344422701, "step": 7660 }, { "epoch": 0.7094266277939747, "grad_norm": 4.59375, "learning_rate": 4.717349492264474e-06, "loss": 1.1282, "mean_token_accuracy": 0.6959393346379648, "step": 7665 }, { "epoch": 0.7098893979360452, "grad_norm": 4.1875, "learning_rate": 4.703639113494813e-06, "loss": 1.2103, "mean_token_accuracy": 0.6732876712328765, "step": 7670 }, { "epoch": 0.7103521680781156, "grad_norm": 4.0625, "learning_rate": 4.689942558181893e-06, "loss": 1.1614, "mean_token_accuracy": 0.6868639921722114, "step": 7675 }, { "epoch": 0.710814938220186, "grad_norm": 4.25, "learning_rate": 4.676259862073604e-06, "loss": 1.1411, "mean_token_accuracy": 0.6912915851272016, "step": 7680 }, { "epoch": 0.7112777083622565, "grad_norm": 4.125, "learning_rate": 4.662591060881666e-06, "loss": 1.1542, "mean_token_accuracy": 0.6908268101761253, "step": 7685 }, { "epoch": 0.7117404785043269, "grad_norm": 4.5625, "learning_rate": 4.648936190281533e-06, "loss": 1.1614, "mean_token_accuracy": 0.6892123287671234, "step": 7690 }, { "epoch": 0.7122032486463973, "grad_norm": 4.65625, "learning_rate": 4.635295285912305e-06, "loss": 1.2252, "mean_token_accuracy": 0.670817025440313, "step": 7695 }, { "epoch": 0.7126660187884678, "grad_norm": 4.15625, "learning_rate": 4.621668383376625e-06, "loss": 1.1751, "mean_token_accuracy": 0.6814823874755384, "step": 7700 }, { "epoch": 0.7126660187884678, "eval_loss": 1.1881129741668701, "eval_mean_token_accuracy": 0.6806048189823866, "eval_runtime": 39.7, "eval_samples_per_second": 25.793, "eval_steps_per_second": 6.448, "step": 7700 }, { "epoch": 0.7131287889305382, "grad_norm": 4.15625, "learning_rate": 4.608055518240592e-06, "loss": 1.2014, "mean_token_accuracy": 0.6767367906066536, "step": 7705 }, { "epoch": 0.7135915590726086, "grad_norm": 5.375, "learning_rate": 4.594456726033662e-06, "loss": 1.1246, "mean_token_accuracy": 0.6961350293542073, "step": 7710 }, { "epoch": 0.714054329214679, "grad_norm": 4.25, "learning_rate": 4.580872042248575e-06, "loss": 1.1437, "mean_token_accuracy": 0.6932485322896282, "step": 7715 }, { "epoch": 0.7145170993567495, "grad_norm": 4.21875, "learning_rate": 4.567301502341238e-06, "loss": 1.1829, "mean_token_accuracy": 0.6811399217221135, "step": 7720 }, { "epoch": 0.71497986949882, "grad_norm": 4.125, "learning_rate": 4.5537451417306424e-06, "loss": 1.1115, "mean_token_accuracy": 0.7003180039138943, "step": 7725 }, { "epoch": 0.7154426396408904, "grad_norm": 4.59375, "learning_rate": 4.540202995798773e-06, "loss": 1.1397, "mean_token_accuracy": 0.6946428571428571, "step": 7730 }, { "epoch": 0.7159054097829608, "grad_norm": 5.65625, "learning_rate": 4.526675099890515e-06, "loss": 1.1009, "mean_token_accuracy": 0.6989481409001955, "step": 7735 }, { "epoch": 0.7163681799250312, "grad_norm": 4.40625, "learning_rate": 4.5131614893135655e-06, "loss": 1.1672, "mean_token_accuracy": 0.6859344422700586, "step": 7740 }, { "epoch": 0.7168309500671016, "grad_norm": 4.25, "learning_rate": 4.4996621993383305e-06, "loss": 1.1737, "mean_token_accuracy": 0.681115459882583, "step": 7745 }, { "epoch": 0.7172937202091721, "grad_norm": 4.34375, "learning_rate": 4.486177265197841e-06, "loss": 1.1623, "mean_token_accuracy": 0.6858855185909979, "step": 7750 }, { "epoch": 0.7177564903512426, "grad_norm": 4.4375, "learning_rate": 4.472706722087661e-06, "loss": 1.1994, "mean_token_accuracy": 0.6761252446183954, "step": 7755 }, { "epoch": 0.718219260493313, "grad_norm": 4.3125, "learning_rate": 4.459250605165788e-06, "loss": 1.2244, "mean_token_accuracy": 0.6711839530332682, "step": 7760 }, { "epoch": 0.7186820306353834, "grad_norm": 4.5, "learning_rate": 4.4458089495525814e-06, "loss": 1.1807, "mean_token_accuracy": 0.6795743639921721, "step": 7765 }, { "epoch": 0.7191448007774538, "grad_norm": 4.21875, "learning_rate": 4.432381790330643e-06, "loss": 1.214, "mean_token_accuracy": 0.677421722113503, "step": 7770 }, { "epoch": 0.7196075709195243, "grad_norm": 4.1875, "learning_rate": 4.418969162544745e-06, "loss": 1.2026, "mean_token_accuracy": 0.6771037181996086, "step": 7775 }, { "epoch": 0.7200703410615947, "grad_norm": 4.3125, "learning_rate": 4.40557110120173e-06, "loss": 1.1778, "mean_token_accuracy": 0.6813356164383562, "step": 7780 }, { "epoch": 0.7205331112036651, "grad_norm": 4.3125, "learning_rate": 4.392187641270424e-06, "loss": 1.1585, "mean_token_accuracy": 0.6892857142857144, "step": 7785 }, { "epoch": 0.7209958813457356, "grad_norm": 4.28125, "learning_rate": 4.378818817681545e-06, "loss": 1.173, "mean_token_accuracy": 0.6842221135029354, "step": 7790 }, { "epoch": 0.721458651487806, "grad_norm": 4.75, "learning_rate": 4.365464665327607e-06, "loss": 1.1164, "mean_token_accuracy": 0.6975782778864972, "step": 7795 }, { "epoch": 0.7219214216298765, "grad_norm": 3.859375, "learning_rate": 4.352125219062834e-06, "loss": 1.1757, "mean_token_accuracy": 0.6840753424657534, "step": 7800 }, { "epoch": 0.7219214216298765, "eval_loss": 1.1879887580871582, "eval_mean_token_accuracy": 0.6806334852005863, "eval_runtime": 39.6737, "eval_samples_per_second": 25.811, "eval_steps_per_second": 6.453, "step": 7800 }, { "epoch": 0.7223841917719469, "grad_norm": 4.21875, "learning_rate": 4.338800513703063e-06, "loss": 1.207, "mean_token_accuracy": 0.6731898238747555, "step": 7805 }, { "epoch": 0.7228469619140173, "grad_norm": 5.21875, "learning_rate": 4.3254905840256686e-06, "loss": 1.1416, "mean_token_accuracy": 0.6955724070450099, "step": 7810 }, { "epoch": 0.7233097320560877, "grad_norm": 4.21875, "learning_rate": 4.312195464769452e-06, "loss": 1.2211, "mean_token_accuracy": 0.673238747553816, "step": 7815 }, { "epoch": 0.7237725021981581, "grad_norm": 4.125, "learning_rate": 4.298915190634558e-06, "loss": 1.2121, "mean_token_accuracy": 0.674853228962818, "step": 7820 }, { "epoch": 0.7242352723402286, "grad_norm": 4.28125, "learning_rate": 4.285649796282391e-06, "loss": 1.2394, "mean_token_accuracy": 0.6663649706457926, "step": 7825 }, { "epoch": 0.7246980424822991, "grad_norm": 4.34375, "learning_rate": 4.272399316335514e-06, "loss": 1.1662, "mean_token_accuracy": 0.6841242661448141, "step": 7830 }, { "epoch": 0.7251608126243695, "grad_norm": 4.3125, "learning_rate": 4.259163785377572e-06, "loss": 1.1473, "mean_token_accuracy": 0.6909491193737768, "step": 7835 }, { "epoch": 0.7256235827664399, "grad_norm": 4.1875, "learning_rate": 4.245943237953184e-06, "loss": 1.1048, "mean_token_accuracy": 0.6977495107632095, "step": 7840 }, { "epoch": 0.7260863529085103, "grad_norm": 4.125, "learning_rate": 4.232737708567868e-06, "loss": 1.1129, "mean_token_accuracy": 0.697504892367906, "step": 7845 }, { "epoch": 0.7265491230505807, "grad_norm": 4.8125, "learning_rate": 4.2195472316879414e-06, "loss": 1.1467, "mean_token_accuracy": 0.6903620352250489, "step": 7850 }, { "epoch": 0.7270118931926512, "grad_norm": 4.5625, "learning_rate": 4.206371841740435e-06, "loss": 1.1789, "mean_token_accuracy": 0.6790606653620351, "step": 7855 }, { "epoch": 0.7274746633347217, "grad_norm": 4.28125, "learning_rate": 4.193211573113009e-06, "loss": 1.1807, "mean_token_accuracy": 0.6826320939334638, "step": 7860 }, { "epoch": 0.7279374334767921, "grad_norm": 4.15625, "learning_rate": 4.180066460153849e-06, "loss": 1.1959, "mean_token_accuracy": 0.6784246575342465, "step": 7865 }, { "epoch": 0.7284002036188625, "grad_norm": 4.4375, "learning_rate": 4.166936537171588e-06, "loss": 1.1707, "mean_token_accuracy": 0.6836839530332681, "step": 7870 }, { "epoch": 0.7288629737609329, "grad_norm": 4.65625, "learning_rate": 4.153821838435215e-06, "loss": 1.2001, "mean_token_accuracy": 0.6808708414872797, "step": 7875 }, { "epoch": 0.7293257439030034, "grad_norm": 3.90625, "learning_rate": 4.140722398173976e-06, "loss": 1.1448, "mean_token_accuracy": 0.6931262230919766, "step": 7880 }, { "epoch": 0.7297885140450738, "grad_norm": 4.25, "learning_rate": 4.127638250577305e-06, "loss": 1.1492, "mean_token_accuracy": 0.6881849315068493, "step": 7885 }, { "epoch": 0.7302512841871442, "grad_norm": 4.40625, "learning_rate": 4.114569429794714e-06, "loss": 1.1681, "mean_token_accuracy": 0.6916829745596869, "step": 7890 }, { "epoch": 0.7307140543292147, "grad_norm": 4.5, "learning_rate": 4.1015159699357125e-06, "loss": 1.1347, "mean_token_accuracy": 0.6977739726027398, "step": 7895 }, { "epoch": 0.7311768244712851, "grad_norm": 4.28125, "learning_rate": 4.08847790506972e-06, "loss": 1.1717, "mean_token_accuracy": 0.6876223091976517, "step": 7900 }, { "epoch": 0.7311768244712851, "eval_loss": 1.187973976135254, "eval_mean_token_accuracy": 0.6805474865459876, "eval_runtime": 39.473, "eval_samples_per_second": 25.942, "eval_steps_per_second": 6.485, "step": 7900 }, { "epoch": 0.7316395946133556, "grad_norm": 4.375, "learning_rate": 4.0754552692259745e-06, "loss": 1.2091, "mean_token_accuracy": 0.6734589041095891, "step": 7905 }, { "epoch": 0.732102364755426, "grad_norm": 4.3125, "learning_rate": 4.062448096393443e-06, "loss": 1.1334, "mean_token_accuracy": 0.6982632093933463, "step": 7910 }, { "epoch": 0.7325651348974964, "grad_norm": 4.28125, "learning_rate": 4.049456420520738e-06, "loss": 1.1601, "mean_token_accuracy": 0.6860812133072407, "step": 7915 }, { "epoch": 0.7330279050395668, "grad_norm": 4.28125, "learning_rate": 4.036480275516022e-06, "loss": 1.1417, "mean_token_accuracy": 0.6902397260273974, "step": 7920 }, { "epoch": 0.7334906751816372, "grad_norm": 4.15625, "learning_rate": 4.023519695246918e-06, "loss": 1.2313, "mean_token_accuracy": 0.6700831702544032, "step": 7925 }, { "epoch": 0.7339534453237078, "grad_norm": 4.4375, "learning_rate": 4.0105747135404395e-06, "loss": 1.1746, "mean_token_accuracy": 0.6833414872798433, "step": 7930 }, { "epoch": 0.7344162154657782, "grad_norm": 4.96875, "learning_rate": 3.997645364182871e-06, "loss": 1.2222, "mean_token_accuracy": 0.6752201565557729, "step": 7935 }, { "epoch": 0.7348789856078486, "grad_norm": 3.9375, "learning_rate": 3.984731680919708e-06, "loss": 1.2177, "mean_token_accuracy": 0.6726516634050881, "step": 7940 }, { "epoch": 0.735341755749919, "grad_norm": 4.125, "learning_rate": 3.971833697455552e-06, "loss": 1.1398, "mean_token_accuracy": 0.6923434442270058, "step": 7945 }, { "epoch": 0.7358045258919894, "grad_norm": 4.0, "learning_rate": 3.958951447454026e-06, "loss": 1.1382, "mean_token_accuracy": 0.692441291585127, "step": 7950 }, { "epoch": 0.7362672960340599, "grad_norm": 4.5625, "learning_rate": 3.946084964537699e-06, "loss": 1.1083, "mean_token_accuracy": 0.6991438356164383, "step": 7955 }, { "epoch": 0.7367300661761303, "grad_norm": 4.5, "learning_rate": 3.933234282287981e-06, "loss": 1.1264, "mean_token_accuracy": 0.6938111545988257, "step": 7960 }, { "epoch": 0.7371928363182008, "grad_norm": 4.6875, "learning_rate": 3.92039943424504e-06, "loss": 1.1695, "mean_token_accuracy": 0.6794031311154598, "step": 7965 }, { "epoch": 0.7376556064602712, "grad_norm": 5.0625, "learning_rate": 3.907580453907721e-06, "loss": 1.1295, "mean_token_accuracy": 0.695303326810176, "step": 7970 }, { "epoch": 0.7381183766023416, "grad_norm": 4.09375, "learning_rate": 3.894777374733451e-06, "loss": 1.1449, "mean_token_accuracy": 0.6863013698630137, "step": 7975 }, { "epoch": 0.738581146744412, "grad_norm": 4.375, "learning_rate": 3.881990230138163e-06, "loss": 1.1194, "mean_token_accuracy": 0.6985812133072409, "step": 7980 }, { "epoch": 0.7390439168864825, "grad_norm": 4.21875, "learning_rate": 3.869219053496192e-06, "loss": 1.1788, "mean_token_accuracy": 0.6800880626223091, "step": 7985 }, { "epoch": 0.7395066870285529, "grad_norm": 4.375, "learning_rate": 3.8564638781402e-06, "loss": 1.1352, "mean_token_accuracy": 0.6960861056751468, "step": 7990 }, { "epoch": 0.7399694571706233, "grad_norm": 5.1875, "learning_rate": 3.843724737361083e-06, "loss": 1.2496, "mean_token_accuracy": 0.6651174168297455, "step": 7995 }, { "epoch": 0.7404322273126938, "grad_norm": 4.15625, "learning_rate": 3.831001664407896e-06, "loss": 1.1404, "mean_token_accuracy": 0.6950587084148728, "step": 8000 }, { "epoch": 0.7404322273126938, "eval_loss": 1.188039779663086, "eval_mean_token_accuracy": 0.6805665973581212, "eval_runtime": 39.3933, "eval_samples_per_second": 25.994, "eval_steps_per_second": 6.499, "step": 8000 }, { "epoch": 0.7408949974547642, "grad_norm": 4.71875, "learning_rate": 3.818294692487749e-06, "loss": 1.2211, "mean_token_accuracy": 0.672211350293542, "step": 8005 }, { "epoch": 0.7413577675968347, "grad_norm": 4.0625, "learning_rate": 3.805603854765728e-06, "loss": 1.1691, "mean_token_accuracy": 0.6854941291585127, "step": 8010 }, { "epoch": 0.7418205377389051, "grad_norm": 4.15625, "learning_rate": 3.792929184364813e-06, "loss": 1.1143, "mean_token_accuracy": 0.7004647749510763, "step": 8015 }, { "epoch": 0.7422833078809755, "grad_norm": 3.921875, "learning_rate": 3.7802707143657813e-06, "loss": 1.1268, "mean_token_accuracy": 0.6961594911937379, "step": 8020 }, { "epoch": 0.7427460780230459, "grad_norm": 4.6875, "learning_rate": 3.7676284778071394e-06, "loss": 1.1162, "mean_token_accuracy": 0.6994618395303327, "step": 8025 }, { "epoch": 0.7432088481651163, "grad_norm": 4.0625, "learning_rate": 3.755002507685013e-06, "loss": 1.1312, "mean_token_accuracy": 0.6982632093933463, "step": 8030 }, { "epoch": 0.7436716183071869, "grad_norm": 4.09375, "learning_rate": 3.742392836953076e-06, "loss": 1.1479, "mean_token_accuracy": 0.6907778864970645, "step": 8035 }, { "epoch": 0.7441343884492573, "grad_norm": 4.65625, "learning_rate": 3.7297994985224617e-06, "loss": 1.2283, "mean_token_accuracy": 0.6691291585127203, "step": 8040 }, { "epoch": 0.7445971585913277, "grad_norm": 4.28125, "learning_rate": 3.717222525261677e-06, "loss": 1.2108, "mean_token_accuracy": 0.673238747553816, "step": 8045 }, { "epoch": 0.7450599287333981, "grad_norm": 4.25, "learning_rate": 3.704661949996512e-06, "loss": 1.1752, "mean_token_accuracy": 0.6837328767123287, "step": 8050 }, { "epoch": 0.7455226988754685, "grad_norm": 4.03125, "learning_rate": 3.6921178055099637e-06, "loss": 1.1117, "mean_token_accuracy": 0.7022015655577298, "step": 8055 }, { "epoch": 0.745985469017539, "grad_norm": 4.1875, "learning_rate": 3.6795901245421427e-06, "loss": 1.2013, "mean_token_accuracy": 0.6796477495107632, "step": 8060 }, { "epoch": 0.7464482391596095, "grad_norm": 4.28125, "learning_rate": 3.667078939790183e-06, "loss": 1.1972, "mean_token_accuracy": 0.6771037181996087, "step": 8065 }, { "epoch": 0.7469110093016799, "grad_norm": 4.1875, "learning_rate": 3.6545842839081802e-06, "loss": 1.1562, "mean_token_accuracy": 0.6882338551859101, "step": 8070 }, { "epoch": 0.7473737794437503, "grad_norm": 4.03125, "learning_rate": 3.6421061895070743e-06, "loss": 1.1289, "mean_token_accuracy": 0.6956457925636008, "step": 8075 }, { "epoch": 0.7478365495858207, "grad_norm": 4.09375, "learning_rate": 3.6296446891545845e-06, "loss": 1.1303, "mean_token_accuracy": 0.6931262230919766, "step": 8080 }, { "epoch": 0.7482993197278912, "grad_norm": 4.78125, "learning_rate": 3.617199815375123e-06, "loss": 1.1646, "mean_token_accuracy": 0.6872553816046966, "step": 8085 }, { "epoch": 0.7487620898699616, "grad_norm": 4.75, "learning_rate": 3.604771600649698e-06, "loss": 1.1762, "mean_token_accuracy": 0.681409001956947, "step": 8090 }, { "epoch": 0.749224860012032, "grad_norm": 4.09375, "learning_rate": 3.592360077415853e-06, "loss": 1.0908, "mean_token_accuracy": 0.7056751467710373, "step": 8095 }, { "epoch": 0.7496876301541024, "grad_norm": 4.25, "learning_rate": 3.5799652780675543e-06, "loss": 1.2264, "mean_token_accuracy": 0.6669031311154598, "step": 8100 }, { "epoch": 0.7496876301541024, "eval_loss": 1.1878974437713623, "eval_mean_token_accuracy": 0.6806678846624261, "eval_runtime": 39.4118, "eval_samples_per_second": 25.982, "eval_steps_per_second": 6.496, "step": 8100 }, { "epoch": 0.7501504002961729, "grad_norm": 4.125, "learning_rate": 3.56758723495512e-06, "loss": 1.2119, "mean_token_accuracy": 0.6728228962818005, "step": 8105 }, { "epoch": 0.7506131704382433, "grad_norm": 4.1875, "learning_rate": 3.5552259803851396e-06, "loss": 1.2255, "mean_token_accuracy": 0.6716731898238748, "step": 8110 }, { "epoch": 0.7510759405803138, "grad_norm": 4.1875, "learning_rate": 3.542881546620378e-06, "loss": 1.135, "mean_token_accuracy": 0.6959637964774952, "step": 8115 }, { "epoch": 0.7515387107223842, "grad_norm": 4.96875, "learning_rate": 3.5305539658797084e-06, "loss": 1.1591, "mean_token_accuracy": 0.6913405088062623, "step": 8120 }, { "epoch": 0.7520014808644546, "grad_norm": 3.921875, "learning_rate": 3.5182432703380086e-06, "loss": 1.1157, "mean_token_accuracy": 0.6976272015655576, "step": 8125 }, { "epoch": 0.752464251006525, "grad_norm": 4.125, "learning_rate": 3.505949492126088e-06, "loss": 1.131, "mean_token_accuracy": 0.6967954990215264, "step": 8130 }, { "epoch": 0.7529270211485954, "grad_norm": 4.28125, "learning_rate": 3.493672663330604e-06, "loss": 1.1722, "mean_token_accuracy": 0.6824853228962819, "step": 8135 }, { "epoch": 0.753389791290666, "grad_norm": 4.125, "learning_rate": 3.481412815993971e-06, "loss": 1.1833, "mean_token_accuracy": 0.6805528375733856, "step": 8140 }, { "epoch": 0.7538525614327364, "grad_norm": 4.40625, "learning_rate": 3.469169982114293e-06, "loss": 1.1806, "mean_token_accuracy": 0.6818003913894325, "step": 8145 }, { "epoch": 0.7543153315748068, "grad_norm": 4.15625, "learning_rate": 3.4569441936452587e-06, "loss": 1.1197, "mean_token_accuracy": 0.7011986301369866, "step": 8150 }, { "epoch": 0.7547781017168772, "grad_norm": 4.25, "learning_rate": 3.444735482496071e-06, "loss": 1.1369, "mean_token_accuracy": 0.6925146771037182, "step": 8155 }, { "epoch": 0.7552408718589476, "grad_norm": 4.59375, "learning_rate": 3.4325438805313625e-06, "loss": 1.191, "mean_token_accuracy": 0.6794031311154598, "step": 8160 }, { "epoch": 0.7557036420010181, "grad_norm": 4.03125, "learning_rate": 3.4203694195711103e-06, "loss": 1.1746, "mean_token_accuracy": 0.684344422700587, "step": 8165 }, { "epoch": 0.7561664121430886, "grad_norm": 4.0625, "learning_rate": 3.4082121313905537e-06, "loss": 1.1426, "mean_token_accuracy": 0.6896037181996086, "step": 8170 }, { "epoch": 0.756629182285159, "grad_norm": 4.1875, "learning_rate": 3.3960720477201105e-06, "loss": 1.1512, "mean_token_accuracy": 0.6899951076320939, "step": 8175 }, { "epoch": 0.7570919524272294, "grad_norm": 4.375, "learning_rate": 3.3839492002452966e-06, "loss": 1.1538, "mean_token_accuracy": 0.6876467710371821, "step": 8180 }, { "epoch": 0.7575547225692998, "grad_norm": 4.15625, "learning_rate": 3.3718436206066373e-06, "loss": 1.1409, "mean_token_accuracy": 0.6907289628180041, "step": 8185 }, { "epoch": 0.7580174927113703, "grad_norm": 3.921875, "learning_rate": 3.3597553403995973e-06, "loss": 1.1308, "mean_token_accuracy": 0.6924168297455967, "step": 8190 }, { "epoch": 0.7584802628534407, "grad_norm": 4.96875, "learning_rate": 3.3476843911744837e-06, "loss": 1.1754, "mean_token_accuracy": 0.6861056751467709, "step": 8195 }, { "epoch": 0.7589430329955111, "grad_norm": 4.09375, "learning_rate": 3.3356308044363683e-06, "loss": 1.0563, "mean_token_accuracy": 0.7124755381604698, "step": 8200 }, { "epoch": 0.7589430329955111, "eval_loss": 1.1878548860549927, "eval_mean_token_accuracy": 0.6807156616927589, "eval_runtime": 39.4192, "eval_samples_per_second": 25.977, "eval_steps_per_second": 6.494, "step": 8200 }, { "epoch": 0.7594058031375815, "grad_norm": 4.3125, "learning_rate": 3.3235946116450125e-06, "loss": 1.1652, "mean_token_accuracy": 0.684417808219178, "step": 8205 }, { "epoch": 0.759868573279652, "grad_norm": 4.59375, "learning_rate": 3.311575844214772e-06, "loss": 1.1013, "mean_token_accuracy": 0.7001712328767123, "step": 8210 }, { "epoch": 0.7603313434217225, "grad_norm": 4.125, "learning_rate": 3.2995745335145323e-06, "loss": 1.2958, "mean_token_accuracy": 0.6568493150684931, "step": 8215 }, { "epoch": 0.7607941135637929, "grad_norm": 4.0, "learning_rate": 3.287590710867612e-06, "loss": 1.0905, "mean_token_accuracy": 0.7035225048923678, "step": 8220 }, { "epoch": 0.7612568837058633, "grad_norm": 4.1875, "learning_rate": 3.275624407551684e-06, "loss": 1.1358, "mean_token_accuracy": 0.6922945205479452, "step": 8225 }, { "epoch": 0.7617196538479337, "grad_norm": 4.0, "learning_rate": 3.2636756547986993e-06, "loss": 1.1393, "mean_token_accuracy": 0.6942514677103717, "step": 8230 }, { "epoch": 0.7621824239900041, "grad_norm": 4.125, "learning_rate": 3.2517444837947954e-06, "loss": 1.2205, "mean_token_accuracy": 0.6739726027397259, "step": 8235 }, { "epoch": 0.7626451941320745, "grad_norm": 4.21875, "learning_rate": 3.2398309256802354e-06, "loss": 1.1586, "mean_token_accuracy": 0.6879892367906066, "step": 8240 }, { "epoch": 0.7631079642741451, "grad_norm": 4.40625, "learning_rate": 3.2279350115492993e-06, "loss": 1.0616, "mean_token_accuracy": 0.7091731898238747, "step": 8245 }, { "epoch": 0.7635707344162155, "grad_norm": 4.09375, "learning_rate": 3.2160567724502225e-06, "loss": 1.1386, "mean_token_accuracy": 0.692930528375734, "step": 8250 }, { "epoch": 0.7640335045582859, "grad_norm": 4.09375, "learning_rate": 3.2041962393851033e-06, "loss": 1.127, "mean_token_accuracy": 0.6918786692759296, "step": 8255 }, { "epoch": 0.7644962747003563, "grad_norm": 4.09375, "learning_rate": 3.1923534433098403e-06, "loss": 1.1611, "mean_token_accuracy": 0.6869618395303327, "step": 8260 }, { "epoch": 0.7649590448424267, "grad_norm": 5.0, "learning_rate": 3.1805284151340266e-06, "loss": 1.1418, "mean_token_accuracy": 0.6935665362035224, "step": 8265 }, { "epoch": 0.7654218149844972, "grad_norm": 4.03125, "learning_rate": 3.1687211857208834e-06, "loss": 1.0564, "mean_token_accuracy": 0.711325831702544, "step": 8270 }, { "epoch": 0.7658845851265677, "grad_norm": 4.40625, "learning_rate": 3.156931785887181e-06, "loss": 1.1726, "mean_token_accuracy": 0.6840753424657533, "step": 8275 }, { "epoch": 0.7663473552686381, "grad_norm": 3.984375, "learning_rate": 3.1451602464031495e-06, "loss": 1.1095, "mean_token_accuracy": 0.6986790606653621, "step": 8280 }, { "epoch": 0.7668101254107085, "grad_norm": 4.125, "learning_rate": 3.133406597992409e-06, "loss": 1.1543, "mean_token_accuracy": 0.6937622309197652, "step": 8285 }, { "epoch": 0.7672728955527789, "grad_norm": 5.75, "learning_rate": 3.121670871331881e-06, "loss": 1.1806, "mean_token_accuracy": 0.6797211350293542, "step": 8290 }, { "epoch": 0.7677356656948494, "grad_norm": 5.46875, "learning_rate": 3.1099530970517112e-06, "loss": 1.1233, "mean_token_accuracy": 0.6959393346379648, "step": 8295 }, { "epoch": 0.7681984358369198, "grad_norm": 4.25, "learning_rate": 3.0982533057351903e-06, "loss": 1.1795, "mean_token_accuracy": 0.6878180039138944, "step": 8300 }, { "epoch": 0.7681984358369198, "eval_loss": 1.1879552602767944, "eval_mean_token_accuracy": 0.6807194838551852, "eval_runtime": 39.5506, "eval_samples_per_second": 25.891, "eval_steps_per_second": 6.473, "step": 8300 }, { "epoch": 0.7686612059789902, "grad_norm": 4.21875, "learning_rate": 3.0865715279186724e-06, "loss": 1.2167, "mean_token_accuracy": 0.6755626223091976, "step": 8305 }, { "epoch": 0.7691239761210606, "grad_norm": 4.125, "learning_rate": 3.074907794091504e-06, "loss": 1.1288, "mean_token_accuracy": 0.6965264187866927, "step": 8310 }, { "epoch": 0.7695867462631311, "grad_norm": 3.984375, "learning_rate": 3.063262134695927e-06, "loss": 1.169, "mean_token_accuracy": 0.6832925636007828, "step": 8315 }, { "epoch": 0.7700495164052016, "grad_norm": 4.53125, "learning_rate": 3.051634580127012e-06, "loss": 1.1708, "mean_token_accuracy": 0.6839530332681018, "step": 8320 }, { "epoch": 0.770512286547272, "grad_norm": 4.0625, "learning_rate": 3.0400251607325792e-06, "loss": 1.1484, "mean_token_accuracy": 0.6920988258317025, "step": 8325 }, { "epoch": 0.7709750566893424, "grad_norm": 4.28125, "learning_rate": 3.0284339068131098e-06, "loss": 1.2118, "mean_token_accuracy": 0.674412915851272, "step": 8330 }, { "epoch": 0.7714378268314128, "grad_norm": 4.71875, "learning_rate": 3.0168608486216855e-06, "loss": 1.1132, "mean_token_accuracy": 0.6981653620352251, "step": 8335 }, { "epoch": 0.7719005969734832, "grad_norm": 5.1875, "learning_rate": 3.0053060163638846e-06, "loss": 1.1646, "mean_token_accuracy": 0.6853962818003914, "step": 8340 }, { "epoch": 0.7723633671155538, "grad_norm": 4.375, "learning_rate": 2.9937694401977213e-06, "loss": 1.1457, "mean_token_accuracy": 0.6907045009784735, "step": 8345 }, { "epoch": 0.7728261372576242, "grad_norm": 4.5, "learning_rate": 2.982251150233558e-06, "loss": 1.1967, "mean_token_accuracy": 0.6789383561643835, "step": 8350 }, { "epoch": 0.7732889073996946, "grad_norm": 4.0625, "learning_rate": 2.970751176534039e-06, "loss": 1.1633, "mean_token_accuracy": 0.6871575342465753, "step": 8355 }, { "epoch": 0.773751677541765, "grad_norm": 4.28125, "learning_rate": 2.9592695491139956e-06, "loss": 1.1655, "mean_token_accuracy": 0.684344422700587, "step": 8360 }, { "epoch": 0.7742144476838354, "grad_norm": 4.71875, "learning_rate": 2.9478062979403767e-06, "loss": 1.1708, "mean_token_accuracy": 0.686692759295499, "step": 8365 }, { "epoch": 0.7746772178259058, "grad_norm": 4.0625, "learning_rate": 2.9363614529321707e-06, "loss": 1.2008, "mean_token_accuracy": 0.6746575342465754, "step": 8370 }, { "epoch": 0.7751399879679763, "grad_norm": 3.953125, "learning_rate": 2.924935043960321e-06, "loss": 1.1366, "mean_token_accuracy": 0.6926369863013698, "step": 8375 }, { "epoch": 0.7756027581100468, "grad_norm": 4.09375, "learning_rate": 2.913527100847665e-06, "loss": 1.0979, "mean_token_accuracy": 0.7016634050880627, "step": 8380 }, { "epoch": 0.7760655282521172, "grad_norm": 4.0625, "learning_rate": 2.9021376533688327e-06, "loss": 1.2565, "mean_token_accuracy": 0.665582191780822, "step": 8385 }, { "epoch": 0.7765282983941876, "grad_norm": 4.6875, "learning_rate": 2.8907667312501865e-06, "loss": 1.1812, "mean_token_accuracy": 0.6789383561643835, "step": 8390 }, { "epoch": 0.776991068536258, "grad_norm": 4.53125, "learning_rate": 2.8794143641697347e-06, "loss": 1.1726, "mean_token_accuracy": 0.6826810176125245, "step": 8395 }, { "epoch": 0.7774538386783285, "grad_norm": 3.890625, "learning_rate": 2.8680805817570555e-06, "loss": 1.0973, "mean_token_accuracy": 0.7045254403131115, "step": 8400 }, { "epoch": 0.7774538386783285, "eval_loss": 1.1878743171691895, "eval_mean_token_accuracy": 0.6805608641144812, "eval_runtime": 39.7008, "eval_samples_per_second": 25.793, "eval_steps_per_second": 6.448, "step": 8400 }, { "epoch": 0.7779166088203989, "grad_norm": 4.21875, "learning_rate": 2.856765413593232e-06, "loss": 1.1553, "mean_token_accuracy": 0.6868884540117416, "step": 8405 }, { "epoch": 0.7783793789624693, "grad_norm": 4.3125, "learning_rate": 2.8454688892107518e-06, "loss": 1.1433, "mean_token_accuracy": 0.6921477495107633, "step": 8410 }, { "epoch": 0.7788421491045397, "grad_norm": 4.65625, "learning_rate": 2.834191038093449e-06, "loss": 1.1333, "mean_token_accuracy": 0.6973825831702543, "step": 8415 }, { "epoch": 0.7793049192466102, "grad_norm": 4.09375, "learning_rate": 2.822931889676417e-06, "loss": 1.12, "mean_token_accuracy": 0.6948385518590998, "step": 8420 }, { "epoch": 0.7797676893886807, "grad_norm": 4.03125, "learning_rate": 2.8116914733459376e-06, "loss": 1.1763, "mean_token_accuracy": 0.6851516634050879, "step": 8425 }, { "epoch": 0.7802304595307511, "grad_norm": 4.34375, "learning_rate": 2.800469818439402e-06, "loss": 1.1879, "mean_token_accuracy": 0.6855430528375733, "step": 8430 }, { "epoch": 0.7806932296728215, "grad_norm": 4.4375, "learning_rate": 2.7892669542452333e-06, "loss": 1.2162, "mean_token_accuracy": 0.6745107632093932, "step": 8435 }, { "epoch": 0.7811559998148919, "grad_norm": 4.25, "learning_rate": 2.778082910002812e-06, "loss": 1.1498, "mean_token_accuracy": 0.6898238747553816, "step": 8440 }, { "epoch": 0.7816187699569623, "grad_norm": 4.03125, "learning_rate": 2.7669177149023953e-06, "loss": 1.1376, "mean_token_accuracy": 0.6942270058708415, "step": 8445 }, { "epoch": 0.7820815400990329, "grad_norm": 4.25, "learning_rate": 2.755771398085052e-06, "loss": 1.2019, "mean_token_accuracy": 0.6753424657534246, "step": 8450 }, { "epoch": 0.7825443102411033, "grad_norm": 4.0625, "learning_rate": 2.7446439886425726e-06, "loss": 1.1102, "mean_token_accuracy": 0.6971135029354208, "step": 8455 }, { "epoch": 0.7830070803831737, "grad_norm": 5.0625, "learning_rate": 2.7335355156173993e-06, "loss": 1.2374, "mean_token_accuracy": 0.6689823874755383, "step": 8460 }, { "epoch": 0.7834698505252441, "grad_norm": 4.09375, "learning_rate": 2.722446008002553e-06, "loss": 1.1319, "mean_token_accuracy": 0.6928082191780822, "step": 8465 }, { "epoch": 0.7839326206673145, "grad_norm": 4.15625, "learning_rate": 2.7113754947415505e-06, "loss": 1.1416, "mean_token_accuracy": 0.6950587084148727, "step": 8470 }, { "epoch": 0.784395390809385, "grad_norm": 4.28125, "learning_rate": 2.700324004728344e-06, "loss": 1.161, "mean_token_accuracy": 0.6894324853228962, "step": 8475 }, { "epoch": 0.7848581609514554, "grad_norm": 4.09375, "learning_rate": 2.689291566807224e-06, "loss": 1.1215, "mean_token_accuracy": 0.6947407045009786, "step": 8480 }, { "epoch": 0.7853209310935259, "grad_norm": 4.25, "learning_rate": 2.678278209772759e-06, "loss": 1.1926, "mean_token_accuracy": 0.6821428571428572, "step": 8485 }, { "epoch": 0.7857837012355963, "grad_norm": 4.0, "learning_rate": 2.667283962369718e-06, "loss": 1.1527, "mean_token_accuracy": 0.6869373776908023, "step": 8490 }, { "epoch": 0.7862464713776667, "grad_norm": 4.125, "learning_rate": 2.6563088532929892e-06, "loss": 1.1845, "mean_token_accuracy": 0.680675146771037, "step": 8495 }, { "epoch": 0.7867092415197372, "grad_norm": 4.53125, "learning_rate": 2.6453529111875196e-06, "loss": 1.1326, "mean_token_accuracy": 0.6951076320939334, "step": 8500 }, { "epoch": 0.7867092415197372, "eval_loss": 1.1878032684326172, "eval_mean_token_accuracy": 0.6806697957436398, "eval_runtime": 39.6193, "eval_samples_per_second": 25.846, "eval_steps_per_second": 6.461, "step": 8500 }, { "epoch": 0.7871720116618076, "grad_norm": 4.5, "learning_rate": 2.6344161646482236e-06, "loss": 1.1581, "mean_token_accuracy": 0.6885273972602739, "step": 8505 }, { "epoch": 0.787634781803878, "grad_norm": 3.9375, "learning_rate": 2.623498642219914e-06, "loss": 1.1236, "mean_token_accuracy": 0.6926369863013699, "step": 8510 }, { "epoch": 0.7880975519459484, "grad_norm": 4.09375, "learning_rate": 2.6126003723972325e-06, "loss": 1.1352, "mean_token_accuracy": 0.6927592954990215, "step": 8515 }, { "epoch": 0.7885603220880189, "grad_norm": 4.53125, "learning_rate": 2.601721383624566e-06, "loss": 1.1542, "mean_token_accuracy": 0.6882583170254405, "step": 8520 }, { "epoch": 0.7890230922300893, "grad_norm": 4.0, "learning_rate": 2.5908617042959906e-06, "loss": 1.1642, "mean_token_accuracy": 0.6901908023483366, "step": 8525 }, { "epoch": 0.7894858623721598, "grad_norm": 4.34375, "learning_rate": 2.580021362755172e-06, "loss": 1.1611, "mean_token_accuracy": 0.6867661448140899, "step": 8530 }, { "epoch": 0.7899486325142302, "grad_norm": 6.9375, "learning_rate": 2.5692003872953097e-06, "loss": 1.2106, "mean_token_accuracy": 0.6786937377690802, "step": 8535 }, { "epoch": 0.7904114026563006, "grad_norm": 4.0, "learning_rate": 2.558398806159057e-06, "loss": 1.151, "mean_token_accuracy": 0.6906066536203521, "step": 8540 }, { "epoch": 0.790874172798371, "grad_norm": 4.25, "learning_rate": 2.547616647538449e-06, "loss": 1.1778, "mean_token_accuracy": 0.6830479452054794, "step": 8545 }, { "epoch": 0.7913369429404414, "grad_norm": 4.34375, "learning_rate": 2.536853939574827e-06, "loss": 1.1987, "mean_token_accuracy": 0.6774461839530332, "step": 8550 }, { "epoch": 0.791799713082512, "grad_norm": 4.09375, "learning_rate": 2.5261107103587688e-06, "loss": 1.1754, "mean_token_accuracy": 0.6790851272015657, "step": 8555 }, { "epoch": 0.7922624832245824, "grad_norm": 4.21875, "learning_rate": 2.5153869879300087e-06, "loss": 1.2083, "mean_token_accuracy": 0.6758561643835617, "step": 8560 }, { "epoch": 0.7927252533666528, "grad_norm": 3.734375, "learning_rate": 2.504682800277369e-06, "loss": 1.1419, "mean_token_accuracy": 0.6911692759295498, "step": 8565 }, { "epoch": 0.7931880235087232, "grad_norm": 4.15625, "learning_rate": 2.4939981753386955e-06, "loss": 1.177, "mean_token_accuracy": 0.6848336594911937, "step": 8570 }, { "epoch": 0.7936507936507936, "grad_norm": 4.25, "learning_rate": 2.483333141000762e-06, "loss": 1.1187, "mean_token_accuracy": 0.6990704500978474, "step": 8575 }, { "epoch": 0.7941135637928641, "grad_norm": 4.25, "learning_rate": 2.4726877250992196e-06, "loss": 1.1885, "mean_token_accuracy": 0.6819960861056751, "step": 8580 }, { "epoch": 0.7945763339349345, "grad_norm": 4.21875, "learning_rate": 2.462061955418512e-06, "loss": 1.1909, "mean_token_accuracy": 0.6784246575342467, "step": 8585 }, { "epoch": 0.795039104077005, "grad_norm": 4.5, "learning_rate": 2.451455859691806e-06, "loss": 1.1441, "mean_token_accuracy": 0.6917074363992172, "step": 8590 }, { "epoch": 0.7955018742190754, "grad_norm": 4.125, "learning_rate": 2.4408694656009236e-06, "loss": 1.0977, "mean_token_accuracy": 0.69926614481409, "step": 8595 }, { "epoch": 0.7959646443611458, "grad_norm": 4.03125, "learning_rate": 2.430302800776263e-06, "loss": 1.1911, "mean_token_accuracy": 0.6809442270058708, "step": 8600 }, { "epoch": 0.7959646443611458, "eval_loss": 1.187868595123291, "eval_mean_token_accuracy": 0.6807061062866921, "eval_runtime": 40.1688, "eval_samples_per_second": 25.492, "eval_steps_per_second": 6.373, "step": 8600 }, { "epoch": 0.7964274145032163, "grad_norm": 4.34375, "learning_rate": 2.419755892796729e-06, "loss": 1.1536, "mean_token_accuracy": 0.6870352250489237, "step": 8605 }, { "epoch": 0.7968901846452867, "grad_norm": 4.1875, "learning_rate": 2.409228769189661e-06, "loss": 1.1456, "mean_token_accuracy": 0.6911203522504892, "step": 8610 }, { "epoch": 0.7973529547873571, "grad_norm": 4.15625, "learning_rate": 2.3987214574307584e-06, "loss": 1.1691, "mean_token_accuracy": 0.6831213307240704, "step": 8615 }, { "epoch": 0.7978157249294275, "grad_norm": 4.1875, "learning_rate": 2.3882339849440206e-06, "loss": 1.1511, "mean_token_accuracy": 0.6848825831702545, "step": 8620 }, { "epoch": 0.798278495071498, "grad_norm": 4.1875, "learning_rate": 2.3777663791016605e-06, "loss": 1.1942, "mean_token_accuracy": 0.6741193737769081, "step": 8625 }, { "epoch": 0.7987412652135685, "grad_norm": 4.65625, "learning_rate": 2.3673186672240388e-06, "loss": 1.1782, "mean_token_accuracy": 0.6809442270058708, "step": 8630 }, { "epoch": 0.7992040353556389, "grad_norm": 4.25, "learning_rate": 2.356890876579592e-06, "loss": 1.1727, "mean_token_accuracy": 0.6844422700587083, "step": 8635 }, { "epoch": 0.7996668054977093, "grad_norm": 3.9375, "learning_rate": 2.346483034384769e-06, "loss": 1.2035, "mean_token_accuracy": 0.6779354207436399, "step": 8640 }, { "epoch": 0.8001295756397797, "grad_norm": 4.125, "learning_rate": 2.3360951678039477e-06, "loss": 1.2253, "mean_token_accuracy": 0.6704011741682974, "step": 8645 }, { "epoch": 0.8005923457818501, "grad_norm": 4.125, "learning_rate": 2.325727303949371e-06, "loss": 1.1625, "mean_token_accuracy": 0.6876467710371819, "step": 8650 }, { "epoch": 0.8010551159239205, "grad_norm": 3.78125, "learning_rate": 2.315379469881076e-06, "loss": 1.1559, "mean_token_accuracy": 0.690215264187867, "step": 8655 }, { "epoch": 0.8015178860659911, "grad_norm": 4.0625, "learning_rate": 2.305051692606819e-06, "loss": 1.1031, "mean_token_accuracy": 0.6990215264187868, "step": 8660 }, { "epoch": 0.8019806562080615, "grad_norm": 4.25, "learning_rate": 2.294743999082013e-06, "loss": 1.2265, "mean_token_accuracy": 0.6721868884540119, "step": 8665 }, { "epoch": 0.8024434263501319, "grad_norm": 3.953125, "learning_rate": 2.2844564162096493e-06, "loss": 1.178, "mean_token_accuracy": 0.6846379647749512, "step": 8670 }, { "epoch": 0.8029061964922023, "grad_norm": 5.21875, "learning_rate": 2.274188970840231e-06, "loss": 1.0728, "mean_token_accuracy": 0.7089530332681018, "step": 8675 }, { "epoch": 0.8033689666342727, "grad_norm": 4.0625, "learning_rate": 2.2639416897717037e-06, "loss": 1.1872, "mean_token_accuracy": 0.6800636007827789, "step": 8680 }, { "epoch": 0.8038317367763432, "grad_norm": 4.4375, "learning_rate": 2.2537145997493805e-06, "loss": 1.1209, "mean_token_accuracy": 0.696917808219178, "step": 8685 }, { "epoch": 0.8042945069184136, "grad_norm": 4.0625, "learning_rate": 2.2435077274658846e-06, "loss": 1.1524, "mean_token_accuracy": 0.6901663405088062, "step": 8690 }, { "epoch": 0.804757277060484, "grad_norm": 4.15625, "learning_rate": 2.233321099561062e-06, "loss": 1.23, "mean_token_accuracy": 0.6704011741682973, "step": 8695 }, { "epoch": 0.8052200472025545, "grad_norm": 4.125, "learning_rate": 2.223154742621927e-06, "loss": 1.0739, "mean_token_accuracy": 0.7110812133072406, "step": 8700 }, { "epoch": 0.8052200472025545, "eval_loss": 1.1877801418304443, "eval_mean_token_accuracy": 0.6806411295254398, "eval_runtime": 39.9992, "eval_samples_per_second": 25.601, "eval_steps_per_second": 6.4, "step": 8700 }, { "epoch": 0.8056828173446249, "grad_norm": 4.5, "learning_rate": 2.2130086831825814e-06, "loss": 1.2069, "mean_token_accuracy": 0.6748776908023483, "step": 8705 }, { "epoch": 0.8061455874866954, "grad_norm": 4.1875, "learning_rate": 2.2028829477241532e-06, "loss": 1.0463, "mean_token_accuracy": 0.7145792563600784, "step": 8710 }, { "epoch": 0.8066083576287658, "grad_norm": 4.03125, "learning_rate": 2.1927775626747305e-06, "loss": 1.1099, "mean_token_accuracy": 0.6976516634050881, "step": 8715 }, { "epoch": 0.8070711277708362, "grad_norm": 4.1875, "learning_rate": 2.18269255440928e-06, "loss": 1.1482, "mean_token_accuracy": 0.6871575342465753, "step": 8720 }, { "epoch": 0.8075338979129066, "grad_norm": 4.65625, "learning_rate": 2.172627949249586e-06, "loss": 1.2007, "mean_token_accuracy": 0.6743884540117416, "step": 8725 }, { "epoch": 0.807996668054977, "grad_norm": 4.34375, "learning_rate": 2.162583773464181e-06, "loss": 1.1732, "mean_token_accuracy": 0.6792318982387474, "step": 8730 }, { "epoch": 0.8084594381970476, "grad_norm": 4.34375, "learning_rate": 2.152560053268281e-06, "loss": 1.2286, "mean_token_accuracy": 0.6673189823874756, "step": 8735 }, { "epoch": 0.808922208339118, "grad_norm": 4.25, "learning_rate": 2.1425568148237106e-06, "loss": 1.1741, "mean_token_accuracy": 0.6827544031311155, "step": 8740 }, { "epoch": 0.8093849784811884, "grad_norm": 4.25, "learning_rate": 2.132574084238834e-06, "loss": 1.2349, "mean_token_accuracy": 0.6671966731898238, "step": 8745 }, { "epoch": 0.8098477486232588, "grad_norm": 4.21875, "learning_rate": 2.1226118875684953e-06, "loss": 1.1731, "mean_token_accuracy": 0.6841487279843443, "step": 8750 }, { "epoch": 0.8103105187653292, "grad_norm": 4.03125, "learning_rate": 2.1126702508139385e-06, "loss": 1.1715, "mean_token_accuracy": 0.685518590998043, "step": 8755 }, { "epoch": 0.8107732889073997, "grad_norm": 4.125, "learning_rate": 2.102749199922757e-06, "loss": 1.1301, "mean_token_accuracy": 0.6949608610567515, "step": 8760 }, { "epoch": 0.8112360590494702, "grad_norm": 4.46875, "learning_rate": 2.0928487607888057e-06, "loss": 1.2027, "mean_token_accuracy": 0.6767857142857144, "step": 8765 }, { "epoch": 0.8116988291915406, "grad_norm": 4.0, "learning_rate": 2.0829689592521464e-06, "loss": 1.1031, "mean_token_accuracy": 0.7026418786692759, "step": 8770 }, { "epoch": 0.812161599333611, "grad_norm": 4.375, "learning_rate": 2.0731098210989775e-06, "loss": 1.1729, "mean_token_accuracy": 0.6851272015655577, "step": 8775 }, { "epoch": 0.8126243694756814, "grad_norm": 4.5625, "learning_rate": 2.0632713720615616e-06, "loss": 1.1993, "mean_token_accuracy": 0.6760029354207436, "step": 8780 }, { "epoch": 0.8130871396177518, "grad_norm": 4.40625, "learning_rate": 2.0534536378181734e-06, "loss": 1.1279, "mean_token_accuracy": 0.6930528375733856, "step": 8785 }, { "epoch": 0.8135499097598223, "grad_norm": 3.984375, "learning_rate": 2.0436566439930127e-06, "loss": 1.151, "mean_token_accuracy": 0.6881360078277886, "step": 8790 }, { "epoch": 0.8140126799018927, "grad_norm": 4.21875, "learning_rate": 2.0338804161561486e-06, "loss": 1.1514, "mean_token_accuracy": 0.690655577299413, "step": 8795 }, { "epoch": 0.8144754500439632, "grad_norm": 4.03125, "learning_rate": 2.024124979823453e-06, "loss": 1.171, "mean_token_accuracy": 0.6855185909980429, "step": 8800 }, { "epoch": 0.8144754500439632, "eval_loss": 1.1878849267959595, "eval_mean_token_accuracy": 0.6805054427592946, "eval_runtime": 40.1238, "eval_samples_per_second": 25.521, "eval_steps_per_second": 6.38, "step": 8800 }, { "epoch": 0.8149382201860336, "grad_norm": 4.25, "learning_rate": 2.014390360456533e-06, "loss": 1.1118, "mean_token_accuracy": 0.6965753424657535, "step": 8805 }, { "epoch": 0.815400990328104, "grad_norm": 4.0, "learning_rate": 2.0046765834626603e-06, "loss": 1.1359, "mean_token_accuracy": 0.6948140900195695, "step": 8810 }, { "epoch": 0.8158637604701745, "grad_norm": 4.1875, "learning_rate": 1.9949836741947102e-06, "loss": 1.1236, "mean_token_accuracy": 0.6961839530332681, "step": 8815 }, { "epoch": 0.8163265306122449, "grad_norm": 4.28125, "learning_rate": 1.985311657951093e-06, "loss": 1.201, "mean_token_accuracy": 0.6823140900195694, "step": 8820 }, { "epoch": 0.8167893007543153, "grad_norm": 4.4375, "learning_rate": 1.9756605599756873e-06, "loss": 1.1903, "mean_token_accuracy": 0.6789872798434442, "step": 8825 }, { "epoch": 0.8172520708963857, "grad_norm": 4.0625, "learning_rate": 1.9660304054577815e-06, "loss": 1.1347, "mean_token_accuracy": 0.693835616438356, "step": 8830 }, { "epoch": 0.8177148410384562, "grad_norm": 4.21875, "learning_rate": 1.956421219531993e-06, "loss": 1.201, "mean_token_accuracy": 0.6827788649706459, "step": 8835 }, { "epoch": 0.8181776111805267, "grad_norm": 4.15625, "learning_rate": 1.9468330272782154e-06, "loss": 1.1224, "mean_token_accuracy": 0.6959393346379648, "step": 8840 }, { "epoch": 0.8186403813225971, "grad_norm": 4.34375, "learning_rate": 1.9372658537215507e-06, "loss": 1.1107, "mean_token_accuracy": 0.6987035225048925, "step": 8845 }, { "epoch": 0.8191031514646675, "grad_norm": 4.65625, "learning_rate": 1.9277197238322343e-06, "loss": 1.1925, "mean_token_accuracy": 0.6807485322896283, "step": 8850 }, { "epoch": 0.8195659216067379, "grad_norm": 4.0625, "learning_rate": 1.9181946625255907e-06, "loss": 1.0894, "mean_token_accuracy": 0.7035469667318982, "step": 8855 }, { "epoch": 0.8200286917488083, "grad_norm": 4.0625, "learning_rate": 1.9086906946619465e-06, "loss": 1.1315, "mean_token_accuracy": 0.6949608610567515, "step": 8860 }, { "epoch": 0.8204914618908788, "grad_norm": 4.28125, "learning_rate": 1.8992078450465768e-06, "loss": 1.1157, "mean_token_accuracy": 0.7003669275929548, "step": 8865 }, { "epoch": 0.8209542320329493, "grad_norm": 4.1875, "learning_rate": 1.8897461384296367e-06, "loss": 1.2171, "mean_token_accuracy": 0.6743884540117416, "step": 8870 }, { "epoch": 0.8214170021750197, "grad_norm": 6.375, "learning_rate": 1.8803055995060993e-06, "loss": 1.1724, "mean_token_accuracy": 0.6824119373776907, "step": 8875 }, { "epoch": 0.8218797723170901, "grad_norm": 4.1875, "learning_rate": 1.8708862529156946e-06, "loss": 1.1214, "mean_token_accuracy": 0.6976027397260272, "step": 8880 }, { "epoch": 0.8223425424591605, "grad_norm": 4.125, "learning_rate": 1.8614881232428339e-06, "loss": 1.0985, "mean_token_accuracy": 0.7054060665362035, "step": 8885 }, { "epoch": 0.822805312601231, "grad_norm": 5.15625, "learning_rate": 1.852111235016556e-06, "loss": 1.0992, "mean_token_accuracy": 0.7010763209393345, "step": 8890 }, { "epoch": 0.8232680827433014, "grad_norm": 4.40625, "learning_rate": 1.8427556127104572e-06, "loss": 1.1296, "mean_token_accuracy": 0.6937377690802348, "step": 8895 }, { "epoch": 0.8237308528853718, "grad_norm": 4.21875, "learning_rate": 1.8334212807426366e-06, "loss": 1.2327, "mean_token_accuracy": 0.673091976516634, "step": 8900 }, { "epoch": 0.8237308528853718, "eval_loss": 1.1876757144927979, "eval_mean_token_accuracy": 0.6805914414138942, "eval_runtime": 40.0567, "eval_samples_per_second": 25.564, "eval_steps_per_second": 6.391, "step": 8900 }, { "epoch": 0.8241936230274423, "grad_norm": 4.375, "learning_rate": 1.824108263475618e-06, "loss": 1.1543, "mean_token_accuracy": 0.6864236790606653, "step": 8905 }, { "epoch": 0.8246563931695127, "grad_norm": 4.1875, "learning_rate": 1.814816585216297e-06, "loss": 1.1627, "mean_token_accuracy": 0.6852250489236791, "step": 8910 }, { "epoch": 0.8251191633115831, "grad_norm": 4.40625, "learning_rate": 1.8055462702158755e-06, "loss": 1.1692, "mean_token_accuracy": 0.6853228962818004, "step": 8915 }, { "epoch": 0.8255819334536536, "grad_norm": 4.21875, "learning_rate": 1.7962973426697961e-06, "loss": 1.2043, "mean_token_accuracy": 0.6776174168297455, "step": 8920 }, { "epoch": 0.826044703595724, "grad_norm": 4.1875, "learning_rate": 1.787069826717681e-06, "loss": 1.1652, "mean_token_accuracy": 0.6880381604696673, "step": 8925 }, { "epoch": 0.8265074737377944, "grad_norm": 4.53125, "learning_rate": 1.777863746443267e-06, "loss": 1.1069, "mean_token_accuracy": 0.6989236790606654, "step": 8930 }, { "epoch": 0.8269702438798648, "grad_norm": 4.34375, "learning_rate": 1.7686791258743475e-06, "loss": 1.1325, "mean_token_accuracy": 0.6941046966731899, "step": 8935 }, { "epoch": 0.8274330140219353, "grad_norm": 5.09375, "learning_rate": 1.7595159889827008e-06, "loss": 1.0958, "mean_token_accuracy": 0.7020058708414871, "step": 8940 }, { "epoch": 0.8278957841640058, "grad_norm": 4.5625, "learning_rate": 1.7503743596840361e-06, "loss": 1.1696, "mean_token_accuracy": 0.6848091976516635, "step": 8945 }, { "epoch": 0.8283585543060762, "grad_norm": 4.25, "learning_rate": 1.7412542618379313e-06, "loss": 1.1942, "mean_token_accuracy": 0.6805283757338552, "step": 8950 }, { "epoch": 0.8288213244481466, "grad_norm": 4.21875, "learning_rate": 1.7321557192477623e-06, "loss": 1.1275, "mean_token_accuracy": 0.6965753424657535, "step": 8955 }, { "epoch": 0.829284094590217, "grad_norm": 4.3125, "learning_rate": 1.7230787556606476e-06, "loss": 1.124, "mean_token_accuracy": 0.700587084148728, "step": 8960 }, { "epoch": 0.8297468647322874, "grad_norm": 3.984375, "learning_rate": 1.7140233947673845e-06, "loss": 1.1156, "mean_token_accuracy": 0.6978962818003913, "step": 8965 }, { "epoch": 0.830209634874358, "grad_norm": 4.4375, "learning_rate": 1.7049896602023864e-06, "loss": 1.1863, "mean_token_accuracy": 0.6856898238747553, "step": 8970 }, { "epoch": 0.8306724050164284, "grad_norm": 4.125, "learning_rate": 1.6959775755436258e-06, "loss": 1.1973, "mean_token_accuracy": 0.6797700587084149, "step": 8975 }, { "epoch": 0.8311351751584988, "grad_norm": 4.5625, "learning_rate": 1.6869871643125667e-06, "loss": 1.1753, "mean_token_accuracy": 0.6798189823874755, "step": 8980 }, { "epoch": 0.8315979453005692, "grad_norm": 4.3125, "learning_rate": 1.6780184499741048e-06, "loss": 1.1611, "mean_token_accuracy": 0.6882827788649706, "step": 8985 }, { "epoch": 0.8320607154426396, "grad_norm": 5.09375, "learning_rate": 1.6690714559365052e-06, "loss": 1.0972, "mean_token_accuracy": 0.7033512720156555, "step": 8990 }, { "epoch": 0.8325234855847101, "grad_norm": 4.40625, "learning_rate": 1.6601462055513517e-06, "loss": 1.168, "mean_token_accuracy": 0.6827544031311155, "step": 8995 }, { "epoch": 0.8329862557267805, "grad_norm": 4.15625, "learning_rate": 1.6512427221134687e-06, "loss": 1.2072, "mean_token_accuracy": 0.6758072407045009, "step": 9000 }, { "epoch": 0.8329862557267805, "eval_loss": 1.187744379043579, "eval_mean_token_accuracy": 0.680765349804304, "eval_runtime": 40.1481, "eval_samples_per_second": 25.506, "eval_steps_per_second": 6.376, "step": 9000 }, { "epoch": 0.833449025868851, "grad_norm": 4.25, "learning_rate": 1.6423610288608716e-06, "loss": 1.0633, "mean_token_accuracy": 0.7093688845401173, "step": 9005 }, { "epoch": 0.8339117960109214, "grad_norm": 5.46875, "learning_rate": 1.6335011489747054e-06, "loss": 1.1596, "mean_token_accuracy": 0.6861301369863012, "step": 9010 }, { "epoch": 0.8343745661529918, "grad_norm": 4.46875, "learning_rate": 1.6246631055791784e-06, "loss": 1.1581, "mean_token_accuracy": 0.6888698630136986, "step": 9015 }, { "epoch": 0.8348373362950623, "grad_norm": 4.28125, "learning_rate": 1.6158469217415118e-06, "loss": 1.196, "mean_token_accuracy": 0.6783757338551858, "step": 9020 }, { "epoch": 0.8353001064371327, "grad_norm": 4.03125, "learning_rate": 1.6070526204718695e-06, "loss": 1.1491, "mean_token_accuracy": 0.6894569471624266, "step": 9025 }, { "epoch": 0.8357628765792031, "grad_norm": 3.96875, "learning_rate": 1.598280224723302e-06, "loss": 1.1702, "mean_token_accuracy": 0.6860322896281801, "step": 9030 }, { "epoch": 0.8362256467212735, "grad_norm": 4.03125, "learning_rate": 1.589529757391688e-06, "loss": 1.1206, "mean_token_accuracy": 0.6945939334637965, "step": 9035 }, { "epoch": 0.8366884168633439, "grad_norm": 5.5625, "learning_rate": 1.5808012413156715e-06, "loss": 1.1321, "mean_token_accuracy": 0.6926369863013698, "step": 9040 }, { "epoch": 0.8371511870054145, "grad_norm": 4.4375, "learning_rate": 1.572094699276605e-06, "loss": 1.1329, "mean_token_accuracy": 0.6939334637964775, "step": 9045 }, { "epoch": 0.8376139571474849, "grad_norm": 3.8125, "learning_rate": 1.5634101539984903e-06, "loss": 1.1287, "mean_token_accuracy": 0.6940802348336594, "step": 9050 }, { "epoch": 0.8380767272895553, "grad_norm": 4.21875, "learning_rate": 1.5547476281479158e-06, "loss": 1.1395, "mean_token_accuracy": 0.6924412915851272, "step": 9055 }, { "epoch": 0.8385394974316257, "grad_norm": 4.21875, "learning_rate": 1.5461071443340003e-06, "loss": 1.1434, "mean_token_accuracy": 0.6881360078277885, "step": 9060 }, { "epoch": 0.8390022675736961, "grad_norm": 4.25, "learning_rate": 1.5374887251083293e-06, "loss": 1.1519, "mean_token_accuracy": 0.6909735812133073, "step": 9065 }, { "epoch": 0.8394650377157665, "grad_norm": 4.0, "learning_rate": 1.5288923929649103e-06, "loss": 1.1114, "mean_token_accuracy": 0.698899217221135, "step": 9070 }, { "epoch": 0.839927807857837, "grad_norm": 4.28125, "learning_rate": 1.5203181703400928e-06, "loss": 1.1314, "mean_token_accuracy": 0.6937133072407045, "step": 9075 }, { "epoch": 0.8403905779999075, "grad_norm": 4.4375, "learning_rate": 1.5117660796125244e-06, "loss": 1.207, "mean_token_accuracy": 0.6724070450097848, "step": 9080 }, { "epoch": 0.8408533481419779, "grad_norm": 4.25, "learning_rate": 1.5032361431030883e-06, "loss": 1.166, "mean_token_accuracy": 0.6855919765166341, "step": 9085 }, { "epoch": 0.8413161182840483, "grad_norm": 4.125, "learning_rate": 1.4947283830748504e-06, "loss": 1.2077, "mean_token_accuracy": 0.6779598825831703, "step": 9090 }, { "epoch": 0.8417788884261187, "grad_norm": 4.125, "learning_rate": 1.4862428217329883e-06, "loss": 1.104, "mean_token_accuracy": 0.7005870841487281, "step": 9095 }, { "epoch": 0.8422416585681892, "grad_norm": 4.375, "learning_rate": 1.4777794812247459e-06, "loss": 1.1935, "mean_token_accuracy": 0.6769814090019568, "step": 9100 }, { "epoch": 0.8422416585681892, "eval_loss": 1.1878008842468262, "eval_mean_token_accuracy": 0.6806564181751463, "eval_runtime": 39.7475, "eval_samples_per_second": 25.763, "eval_steps_per_second": 6.441, "step": 9100 }, { "epoch": 0.8427044287102596, "grad_norm": 5.21875, "learning_rate": 1.4693383836393693e-06, "loss": 1.1468, "mean_token_accuracy": 0.6905332681017613, "step": 9105 }, { "epoch": 0.84316719885233, "grad_norm": 4.4375, "learning_rate": 1.4609195510080487e-06, "loss": 1.177, "mean_token_accuracy": 0.6802837573385518, "step": 9110 }, { "epoch": 0.8436299689944005, "grad_norm": 4.53125, "learning_rate": 1.4525230053038697e-06, "loss": 1.1254, "mean_token_accuracy": 0.6981164383561645, "step": 9115 }, { "epoch": 0.8440927391364709, "grad_norm": 3.984375, "learning_rate": 1.4441487684417433e-06, "loss": 1.1005, "mean_token_accuracy": 0.7016878669275929, "step": 9120 }, { "epoch": 0.8445555092785414, "grad_norm": 4.0625, "learning_rate": 1.435796862278357e-06, "loss": 1.1734, "mean_token_accuracy": 0.6912671232876713, "step": 9125 }, { "epoch": 0.8450182794206118, "grad_norm": 4.125, "learning_rate": 1.4274673086121127e-06, "loss": 1.1114, "mean_token_accuracy": 0.7009540117416829, "step": 9130 }, { "epoch": 0.8454810495626822, "grad_norm": 4.375, "learning_rate": 1.4191601291830737e-06, "loss": 1.1376, "mean_token_accuracy": 0.6924412915851273, "step": 9135 }, { "epoch": 0.8459438197047526, "grad_norm": 4.375, "learning_rate": 1.410875345672912e-06, "loss": 1.1847, "mean_token_accuracy": 0.6799412915851272, "step": 9140 }, { "epoch": 0.846406589846823, "grad_norm": 4.25, "learning_rate": 1.4026129797048393e-06, "loss": 1.1347, "mean_token_accuracy": 0.6950587084148727, "step": 9145 }, { "epoch": 0.8468693599888936, "grad_norm": 3.875, "learning_rate": 1.3943730528435595e-06, "loss": 1.1201, "mean_token_accuracy": 0.6963062622309197, "step": 9150 }, { "epoch": 0.847332130130964, "grad_norm": 4.1875, "learning_rate": 1.3861555865952126e-06, "loss": 1.135, "mean_token_accuracy": 0.6935420743639922, "step": 9155 }, { "epoch": 0.8477949002730344, "grad_norm": 4.1875, "learning_rate": 1.3779606024073123e-06, "loss": 1.1697, "mean_token_accuracy": 0.6871819960861056, "step": 9160 }, { "epoch": 0.8482576704151048, "grad_norm": 4.3125, "learning_rate": 1.3697881216687027e-06, "loss": 1.0979, "mean_token_accuracy": 0.7034001956947162, "step": 9165 }, { "epoch": 0.8487204405571752, "grad_norm": 4.09375, "learning_rate": 1.3616381657094857e-06, "loss": 1.1477, "mean_token_accuracy": 0.6889432485322897, "step": 9170 }, { "epoch": 0.8491832106992457, "grad_norm": 4.1875, "learning_rate": 1.3535107558009774e-06, "loss": 1.117, "mean_token_accuracy": 0.6954011741682973, "step": 9175 }, { "epoch": 0.8496459808413162, "grad_norm": 4.25, "learning_rate": 1.3454059131556485e-06, "loss": 1.1847, "mean_token_accuracy": 0.6772015655577301, "step": 9180 }, { "epoch": 0.8501087509833866, "grad_norm": 4.21875, "learning_rate": 1.3373236589270688e-06, "loss": 1.0388, "mean_token_accuracy": 0.7199363992172212, "step": 9185 }, { "epoch": 0.850571521125457, "grad_norm": 4.53125, "learning_rate": 1.3292640142098535e-06, "loss": 1.1778, "mean_token_accuracy": 0.6822407045009785, "step": 9190 }, { "epoch": 0.8510342912675274, "grad_norm": 4.3125, "learning_rate": 1.3212270000396067e-06, "loss": 1.1541, "mean_token_accuracy": 0.6842710371819961, "step": 9195 }, { "epoch": 0.8514970614095978, "grad_norm": 3.9375, "learning_rate": 1.3132126373928644e-06, "loss": 1.1282, "mean_token_accuracy": 0.6952299412915852, "step": 9200 }, { "epoch": 0.8514970614095978, "eval_loss": 1.1878225803375244, "eval_mean_token_accuracy": 0.6806850843933452, "eval_runtime": 39.6002, "eval_samples_per_second": 25.858, "eval_steps_per_second": 6.465, "step": 9200 }, { "epoch": 0.8519598315516683, "grad_norm": 4.625, "learning_rate": 1.305220947187046e-06, "loss": 1.1987, "mean_token_accuracy": 0.676761252446184, "step": 9205 }, { "epoch": 0.8524226016937387, "grad_norm": 4.21875, "learning_rate": 1.2972519502803972e-06, "loss": 1.1465, "mean_token_accuracy": 0.6882827788649706, "step": 9210 }, { "epoch": 0.8528853718358091, "grad_norm": 4.21875, "learning_rate": 1.2893056674719295e-06, "loss": 1.177, "mean_token_accuracy": 0.6833414872798433, "step": 9215 }, { "epoch": 0.8533481419778796, "grad_norm": 3.953125, "learning_rate": 1.2813821195013742e-06, "loss": 1.1508, "mean_token_accuracy": 0.6885518590998043, "step": 9220 }, { "epoch": 0.85381091211995, "grad_norm": 4.375, "learning_rate": 1.2734813270491242e-06, "loss": 1.1873, "mean_token_accuracy": 0.6835861056751468, "step": 9225 }, { "epoch": 0.8542736822620205, "grad_norm": 4.59375, "learning_rate": 1.2656033107361777e-06, "loss": 1.1878, "mean_token_accuracy": 0.6815068493150684, "step": 9230 }, { "epoch": 0.8547364524040909, "grad_norm": 4.40625, "learning_rate": 1.2577480911240936e-06, "loss": 1.1663, "mean_token_accuracy": 0.6877446183953034, "step": 9235 }, { "epoch": 0.8551992225461613, "grad_norm": 4.4375, "learning_rate": 1.2499156887149278e-06, "loss": 1.129, "mean_token_accuracy": 0.6964774951076321, "step": 9240 }, { "epoch": 0.8556619926882317, "grad_norm": 3.953125, "learning_rate": 1.2421061239511812e-06, "loss": 1.1123, "mean_token_accuracy": 0.6969667318982388, "step": 9245 }, { "epoch": 0.8561247628303021, "grad_norm": 4.9375, "learning_rate": 1.2343194172157535e-06, "loss": 1.1418, "mean_token_accuracy": 0.6924412915851272, "step": 9250 }, { "epoch": 0.8565875329723727, "grad_norm": 4.15625, "learning_rate": 1.2265555888318802e-06, "loss": 1.2001, "mean_token_accuracy": 0.6770547945205478, "step": 9255 }, { "epoch": 0.8570503031144431, "grad_norm": 4.28125, "learning_rate": 1.2188146590630913e-06, "loss": 1.132, "mean_token_accuracy": 0.6930772994129158, "step": 9260 }, { "epoch": 0.8575130732565135, "grad_norm": 4.15625, "learning_rate": 1.2110966481131448e-06, "loss": 1.135, "mean_token_accuracy": 0.6933708414872797, "step": 9265 }, { "epoch": 0.8579758433985839, "grad_norm": 4.03125, "learning_rate": 1.2034015761259832e-06, "loss": 1.1886, "mean_token_accuracy": 0.683463796477495, "step": 9270 }, { "epoch": 0.8584386135406543, "grad_norm": 4.625, "learning_rate": 1.1957294631856786e-06, "loss": 1.108, "mean_token_accuracy": 0.7020547945205479, "step": 9275 }, { "epoch": 0.8589013836827248, "grad_norm": 4.1875, "learning_rate": 1.1880803293163823e-06, "loss": 1.0755, "mean_token_accuracy": 0.7056262230919765, "step": 9280 }, { "epoch": 0.8593641538247953, "grad_norm": 4.09375, "learning_rate": 1.1804541944822678e-06, "loss": 1.1543, "mean_token_accuracy": 0.6896037181996086, "step": 9285 }, { "epoch": 0.8598269239668657, "grad_norm": 3.953125, "learning_rate": 1.1728510785874813e-06, "loss": 1.121, "mean_token_accuracy": 0.6951565557729941, "step": 9290 }, { "epoch": 0.8602896941089361, "grad_norm": 4.25, "learning_rate": 1.1652710014760904e-06, "loss": 1.1752, "mean_token_accuracy": 0.6819471624266145, "step": 9295 }, { "epoch": 0.8607524642510065, "grad_norm": 4.8125, "learning_rate": 1.1577139829320317e-06, "loss": 1.1689, "mean_token_accuracy": 0.6898238747553816, "step": 9300 }, { "epoch": 0.8607524642510065, "eval_loss": 1.1877785921096802, "eval_mean_token_accuracy": 0.6806468627690796, "eval_runtime": 39.6724, "eval_samples_per_second": 25.811, "eval_steps_per_second": 6.453, "step": 9300 }, { "epoch": 0.861215234393077, "grad_norm": 4.375, "learning_rate": 1.15018004267906e-06, "loss": 1.1558, "mean_token_accuracy": 0.688674168297456, "step": 9305 }, { "epoch": 0.8616780045351474, "grad_norm": 4.34375, "learning_rate": 1.1426692003806949e-06, "loss": 1.2344, "mean_token_accuracy": 0.6670743639921721, "step": 9310 }, { "epoch": 0.8621407746772178, "grad_norm": 4.4375, "learning_rate": 1.135181475640169e-06, "loss": 1.204, "mean_token_accuracy": 0.6751956947162425, "step": 9315 }, { "epoch": 0.8626035448192882, "grad_norm": 4.03125, "learning_rate": 1.1277168880003819e-06, "loss": 1.1187, "mean_token_accuracy": 0.697211350293542, "step": 9320 }, { "epoch": 0.8630663149613587, "grad_norm": 4.03125, "learning_rate": 1.1202754569438412e-06, "loss": 1.1368, "mean_token_accuracy": 0.6913894324853229, "step": 9325 }, { "epoch": 0.8635290851034291, "grad_norm": 3.953125, "learning_rate": 1.112857201892623e-06, "loss": 1.1658, "mean_token_accuracy": 0.685445205479452, "step": 9330 }, { "epoch": 0.8639918552454996, "grad_norm": 6.5625, "learning_rate": 1.1054621422083057e-06, "loss": 1.1729, "mean_token_accuracy": 0.686912915851272, "step": 9335 }, { "epoch": 0.86445462538757, "grad_norm": 4.09375, "learning_rate": 1.0980902971919349e-06, "loss": 1.1559, "mean_token_accuracy": 0.6886007827788652, "step": 9340 }, { "epoch": 0.8649173955296404, "grad_norm": 4.25, "learning_rate": 1.0907416860839605e-06, "loss": 1.1802, "mean_token_accuracy": 0.6802103718199609, "step": 9345 }, { "epoch": 0.8653801656717108, "grad_norm": 3.953125, "learning_rate": 1.083416328064194e-06, "loss": 1.1153, "mean_token_accuracy": 0.700366927592955, "step": 9350 }, { "epoch": 0.8658429358137812, "grad_norm": 4.0625, "learning_rate": 1.0761142422517623e-06, "loss": 1.1539, "mean_token_accuracy": 0.6868639921722114, "step": 9355 }, { "epoch": 0.8663057059558518, "grad_norm": 4.46875, "learning_rate": 1.0688354477050433e-06, "loss": 1.1951, "mean_token_accuracy": 0.6767857142857142, "step": 9360 }, { "epoch": 0.8667684760979222, "grad_norm": 4.15625, "learning_rate": 1.0615799634216305e-06, "loss": 1.2352, "mean_token_accuracy": 0.6707681017612523, "step": 9365 }, { "epoch": 0.8672312462399926, "grad_norm": 3.890625, "learning_rate": 1.054347808338272e-06, "loss": 1.0554, "mean_token_accuracy": 0.7120107632093934, "step": 9370 }, { "epoch": 0.867694016382063, "grad_norm": 4.34375, "learning_rate": 1.047139001330837e-06, "loss": 1.1326, "mean_token_accuracy": 0.6945205479452055, "step": 9375 }, { "epoch": 0.8681567865241334, "grad_norm": 4.375, "learning_rate": 1.0399535612142464e-06, "loss": 1.1395, "mean_token_accuracy": 0.689408023483366, "step": 9380 }, { "epoch": 0.8686195566662039, "grad_norm": 4.34375, "learning_rate": 1.0327915067424388e-06, "loss": 1.1535, "mean_token_accuracy": 0.6911203522504892, "step": 9385 }, { "epoch": 0.8690823268082744, "grad_norm": 4.28125, "learning_rate": 1.0256528566083146e-06, "loss": 1.1888, "mean_token_accuracy": 0.6787426614481409, "step": 9390 }, { "epoch": 0.8695450969503448, "grad_norm": 5.1875, "learning_rate": 1.0185376294436889e-06, "loss": 1.1783, "mean_token_accuracy": 0.6808463796477494, "step": 9395 }, { "epoch": 0.8700078670924152, "grad_norm": 4.1875, "learning_rate": 1.0114458438192466e-06, "loss": 1.222, "mean_token_accuracy": 0.6744618395303326, "step": 9400 }, { "epoch": 0.8700078670924152, "eval_loss": 1.1877846717834473, "eval_mean_token_accuracy": 0.6807768162915848, "eval_runtime": 39.7552, "eval_samples_per_second": 25.758, "eval_steps_per_second": 6.439, "step": 9400 }, { "epoch": 0.8704706372344856, "grad_norm": 4.21875, "learning_rate": 1.004377518244486e-06, "loss": 1.1579, "mean_token_accuracy": 0.6855430528375733, "step": 9405 }, { "epoch": 0.8709334073765561, "grad_norm": 4.4375, "learning_rate": 9.973326711676767e-07, "loss": 1.2073, "mean_token_accuracy": 0.6738747553816048, "step": 9410 }, { "epoch": 0.8713961775186265, "grad_norm": 4.90625, "learning_rate": 9.903113209758098e-07, "loss": 1.1364, "mean_token_accuracy": 0.6939823874755381, "step": 9415 }, { "epoch": 0.8718589476606969, "grad_norm": 5.09375, "learning_rate": 9.83313485994548e-07, "loss": 1.1634, "mean_token_accuracy": 0.687720156555773, "step": 9420 }, { "epoch": 0.8723217178027673, "grad_norm": 4.125, "learning_rate": 9.763391844881831e-07, "loss": 1.1231, "mean_token_accuracy": 0.6943982387475539, "step": 9425 }, { "epoch": 0.8727844879448378, "grad_norm": 4.21875, "learning_rate": 9.693884346595818e-07, "loss": 1.2461, "mean_token_accuracy": 0.6632583170254404, "step": 9430 }, { "epoch": 0.8732472580869083, "grad_norm": 4.125, "learning_rate": 9.624612546501411e-07, "loss": 1.1584, "mean_token_accuracy": 0.6835861056751467, "step": 9435 }, { "epoch": 0.8737100282289787, "grad_norm": 4.25, "learning_rate": 9.555576625397423e-07, "loss": 1.1612, "mean_token_accuracy": 0.6888943248532289, "step": 9440 }, { "epoch": 0.8741727983710491, "grad_norm": 4.03125, "learning_rate": 9.486776763467009e-07, "loss": 1.1512, "mean_token_accuracy": 0.6914383561643835, "step": 9445 }, { "epoch": 0.8746355685131195, "grad_norm": 4.21875, "learning_rate": 9.418213140277232e-07, "loss": 1.1409, "mean_token_accuracy": 0.6932240704500978, "step": 9450 }, { "epoch": 0.8750983386551899, "grad_norm": 4.375, "learning_rate": 9.349885934778579e-07, "loss": 1.1879, "mean_token_accuracy": 0.6818248532289629, "step": 9455 }, { "epoch": 0.8755611087972603, "grad_norm": 4.125, "learning_rate": 9.281795325304454e-07, "loss": 1.1223, "mean_token_accuracy": 0.7003913894324854, "step": 9460 }, { "epoch": 0.8760238789393309, "grad_norm": 4.375, "learning_rate": 9.213941489570743e-07, "loss": 1.0895, "mean_token_accuracy": 0.702862035225049, "step": 9465 }, { "epoch": 0.8764866490814013, "grad_norm": 4.34375, "learning_rate": 9.146324604675439e-07, "loss": 1.1695, "mean_token_accuracy": 0.6867172211350294, "step": 9470 }, { "epoch": 0.8769494192234717, "grad_norm": 4.125, "learning_rate": 9.078944847097992e-07, "loss": 1.1274, "mean_token_accuracy": 0.6949608610567515, "step": 9475 }, { "epoch": 0.8774121893655421, "grad_norm": 4.28125, "learning_rate": 9.01180239269901e-07, "loss": 1.1538, "mean_token_accuracy": 0.6854207436399218, "step": 9480 }, { "epoch": 0.8778749595076125, "grad_norm": 5.03125, "learning_rate": 8.944897416719711e-07, "loss": 1.1322, "mean_token_accuracy": 0.6943003913894324, "step": 9485 }, { "epoch": 0.878337729649683, "grad_norm": 4.1875, "learning_rate": 8.878230093781514e-07, "loss": 1.1514, "mean_token_accuracy": 0.6889187866927593, "step": 9490 }, { "epoch": 0.8788004997917535, "grad_norm": 4.65625, "learning_rate": 8.811800597885578e-07, "loss": 1.2419, "mean_token_accuracy": 0.6706213307240703, "step": 9495 }, { "epoch": 0.8792632699338239, "grad_norm": 4.28125, "learning_rate": 8.745609102412323e-07, "loss": 1.166, "mean_token_accuracy": 0.686325831702544, "step": 9500 }, { "epoch": 0.8792632699338239, "eval_loss": 1.1877548694610596, "eval_mean_token_accuracy": 0.6805990857387473, "eval_runtime": 39.7019, "eval_samples_per_second": 25.792, "eval_steps_per_second": 6.448, "step": 9500 }, { "epoch": 0.8797260400758943, "grad_norm": 4.3125, "learning_rate": 8.679655780120977e-07, "loss": 1.1994, "mean_token_accuracy": 0.67646771037182, "step": 9505 }, { "epoch": 0.8801888102179647, "grad_norm": 4.6875, "learning_rate": 8.613940803149146e-07, "loss": 1.1192, "mean_token_accuracy": 0.6959148727984344, "step": 9510 }, { "epoch": 0.8806515803600352, "grad_norm": 4.5, "learning_rate": 8.548464343012342e-07, "loss": 1.1627, "mean_token_accuracy": 0.6900440313111547, "step": 9515 }, { "epoch": 0.8811143505021056, "grad_norm": 4.0625, "learning_rate": 8.48322657060362e-07, "loss": 1.1798, "mean_token_accuracy": 0.682558708414873, "step": 9520 }, { "epoch": 0.881577120644176, "grad_norm": 4.1875, "learning_rate": 8.418227656192967e-07, "loss": 1.1848, "mean_token_accuracy": 0.6850048923679061, "step": 9525 }, { "epoch": 0.8820398907862464, "grad_norm": 4.03125, "learning_rate": 8.353467769427004e-07, "loss": 1.09, "mean_token_accuracy": 0.7031555772994129, "step": 9530 }, { "epoch": 0.8825026609283169, "grad_norm": 6.59375, "learning_rate": 8.288947079328491e-07, "loss": 1.1935, "mean_token_accuracy": 0.6780088062622308, "step": 9535 }, { "epoch": 0.8829654310703874, "grad_norm": 4.1875, "learning_rate": 8.224665754295857e-07, "loss": 1.1541, "mean_token_accuracy": 0.6902641878669274, "step": 9540 }, { "epoch": 0.8834282012124578, "grad_norm": 3.921875, "learning_rate": 8.160623962102843e-07, "loss": 1.1971, "mean_token_accuracy": 0.6772994129158512, "step": 9545 }, { "epoch": 0.8838909713545282, "grad_norm": 4.125, "learning_rate": 8.096821869897964e-07, "loss": 1.0314, "mean_token_accuracy": 0.7185420743639922, "step": 9550 }, { "epoch": 0.8843537414965986, "grad_norm": 4.875, "learning_rate": 8.03325964420415e-07, "loss": 1.1698, "mean_token_accuracy": 0.6805528375733856, "step": 9555 }, { "epoch": 0.884816511638669, "grad_norm": 4.1875, "learning_rate": 7.969937450918241e-07, "loss": 1.2, "mean_token_accuracy": 0.6751712328767123, "step": 9560 }, { "epoch": 0.8852792817807396, "grad_norm": 4.25, "learning_rate": 7.906855455310647e-07, "loss": 1.1185, "mean_token_accuracy": 0.6986545988258317, "step": 9565 }, { "epoch": 0.88574205192281, "grad_norm": 4.5625, "learning_rate": 7.844013822024821e-07, "loss": 1.1486, "mean_token_accuracy": 0.6900929549902152, "step": 9570 }, { "epoch": 0.8862048220648804, "grad_norm": 4.15625, "learning_rate": 7.781412715076897e-07, "loss": 1.1948, "mean_token_accuracy": 0.6767612524461839, "step": 9575 }, { "epoch": 0.8866675922069508, "grad_norm": 4.34375, "learning_rate": 7.719052297855234e-07, "loss": 1.1136, "mean_token_accuracy": 0.699339530332681, "step": 9580 }, { "epoch": 0.8871303623490212, "grad_norm": 4.40625, "learning_rate": 7.65693273311997e-07, "loss": 1.1341, "mean_token_accuracy": 0.6927592954990215, "step": 9585 }, { "epoch": 0.8875931324910917, "grad_norm": 3.953125, "learning_rate": 7.59505418300267e-07, "loss": 1.1208, "mean_token_accuracy": 0.699559686888454, "step": 9590 }, { "epoch": 0.8880559026331621, "grad_norm": 5.28125, "learning_rate": 7.5334168090058e-07, "loss": 1.1446, "mean_token_accuracy": 0.6904109589041096, "step": 9595 }, { "epoch": 0.8885186727752326, "grad_norm": 4.375, "learning_rate": 7.472020772002398e-07, "loss": 1.1749, "mean_token_accuracy": 0.6864970645792564, "step": 9600 }, { "epoch": 0.8885186727752326, "eval_loss": 1.187734842300415, "eval_mean_token_accuracy": 0.6806315741193738, "eval_runtime": 39.3739, "eval_samples_per_second": 26.007, "eval_steps_per_second": 6.502, "step": 9600 }, { "epoch": 0.888981442917303, "grad_norm": 4.25, "learning_rate": 7.410866232235569e-07, "loss": 1.0961, "mean_token_accuracy": 0.7022994129158512, "step": 9605 }, { "epoch": 0.8894442130593734, "grad_norm": 5.65625, "learning_rate": 7.349953349318151e-07, "loss": 1.1933, "mean_token_accuracy": 0.677641878669276, "step": 9610 }, { "epoch": 0.8899069832014438, "grad_norm": 4.40625, "learning_rate": 7.289282282232279e-07, "loss": 1.2223, "mean_token_accuracy": 0.671771037181996, "step": 9615 }, { "epoch": 0.8903697533435143, "grad_norm": 4.125, "learning_rate": 7.228853189328888e-07, "loss": 1.0133, "mean_token_accuracy": 0.7238013698630137, "step": 9620 }, { "epoch": 0.8908325234855847, "grad_norm": 4.28125, "learning_rate": 7.168666228327403e-07, "loss": 1.1549, "mean_token_accuracy": 0.6900440313111547, "step": 9625 }, { "epoch": 0.8912952936276551, "grad_norm": 4.1875, "learning_rate": 7.108721556315279e-07, "loss": 1.143, "mean_token_accuracy": 0.689041095890411, "step": 9630 }, { "epoch": 0.8917580637697256, "grad_norm": 3.90625, "learning_rate": 7.049019329747575e-07, "loss": 1.1141, "mean_token_accuracy": 0.6975293542074364, "step": 9635 }, { "epoch": 0.892220833911796, "grad_norm": 4.40625, "learning_rate": 6.989559704446647e-07, "loss": 1.1555, "mean_token_accuracy": 0.691242661448141, "step": 9640 }, { "epoch": 0.8926836040538665, "grad_norm": 4.34375, "learning_rate": 6.930342835601577e-07, "loss": 1.1564, "mean_token_accuracy": 0.6903864970645792, "step": 9645 }, { "epoch": 0.8931463741959369, "grad_norm": 4.21875, "learning_rate": 6.871368877767881e-07, "loss": 1.1458, "mean_token_accuracy": 0.6911937377690801, "step": 9650 }, { "epoch": 0.8936091443380073, "grad_norm": 4.375, "learning_rate": 6.812637984867087e-07, "loss": 1.1804, "mean_token_accuracy": 0.6790362035225048, "step": 9655 }, { "epoch": 0.8940719144800777, "grad_norm": 4.1875, "learning_rate": 6.754150310186347e-07, "loss": 1.194, "mean_token_accuracy": 0.6819227005870843, "step": 9660 }, { "epoch": 0.8945346846221481, "grad_norm": 4.15625, "learning_rate": 6.695906006377983e-07, "loss": 1.1755, "mean_token_accuracy": 0.6812622309197651, "step": 9665 }, { "epoch": 0.8949974547642187, "grad_norm": 4.09375, "learning_rate": 6.637905225459129e-07, "loss": 1.1605, "mean_token_accuracy": 0.6882827788649708, "step": 9670 }, { "epoch": 0.8954602249062891, "grad_norm": 4.34375, "learning_rate": 6.580148118811314e-07, "loss": 1.153, "mean_token_accuracy": 0.6881115459882583, "step": 9675 }, { "epoch": 0.8959229950483595, "grad_norm": 4.09375, "learning_rate": 6.522634837180119e-07, "loss": 1.1877, "mean_token_accuracy": 0.6820694716242661, "step": 9680 }, { "epoch": 0.8963857651904299, "grad_norm": 5.25, "learning_rate": 6.465365530674705e-07, "loss": 1.1503, "mean_token_accuracy": 0.688454011741683, "step": 9685 }, { "epoch": 0.8968485353325003, "grad_norm": 4.21875, "learning_rate": 6.408340348767473e-07, "loss": 1.1628, "mean_token_accuracy": 0.6868395303326811, "step": 9690 }, { "epoch": 0.8973113054745708, "grad_norm": 4.3125, "learning_rate": 6.351559440293653e-07, "loss": 1.1631, "mean_token_accuracy": 0.684197651663405, "step": 9695 }, { "epoch": 0.8977740756166412, "grad_norm": 4.25, "learning_rate": 6.295022953450958e-07, "loss": 1.1691, "mean_token_accuracy": 0.6861301369863013, "step": 9700 }, { "epoch": 0.8977740756166412, "eval_loss": 1.187730312347412, "eval_mean_token_accuracy": 0.6806583292563597, "eval_runtime": 40.5099, "eval_samples_per_second": 25.278, "eval_steps_per_second": 6.319, "step": 9700 }, { "epoch": 0.8982368457587117, "grad_norm": 4.09375, "learning_rate": 6.238731035799095e-07, "loss": 1.1718, "mean_token_accuracy": 0.6873043052837573, "step": 9705 }, { "epoch": 0.8986996159007821, "grad_norm": 5.6875, "learning_rate": 6.182683834259528e-07, "loss": 1.1568, "mean_token_accuracy": 0.6897260273972601, "step": 9710 }, { "epoch": 0.8991623860428525, "grad_norm": 4.53125, "learning_rate": 6.126881495114967e-07, "loss": 1.1738, "mean_token_accuracy": 0.6837328767123287, "step": 9715 }, { "epoch": 0.899625156184923, "grad_norm": 4.28125, "learning_rate": 6.07132416400903e-07, "loss": 1.2168, "mean_token_accuracy": 0.6727739726027399, "step": 9720 }, { "epoch": 0.9000879263269934, "grad_norm": 4.3125, "learning_rate": 6.016011985945868e-07, "loss": 1.1934, "mean_token_accuracy": 0.6768835616438357, "step": 9725 }, { "epoch": 0.9005506964690638, "grad_norm": 4.375, "learning_rate": 5.960945105289829e-07, "loss": 1.145, "mean_token_accuracy": 0.6892367906066537, "step": 9730 }, { "epoch": 0.9010134666111342, "grad_norm": 4.25, "learning_rate": 5.906123665764974e-07, "loss": 1.0843, "mean_token_accuracy": 0.7081702544031312, "step": 9735 }, { "epoch": 0.9014762367532047, "grad_norm": 4.09375, "learning_rate": 5.851547810454805e-07, "loss": 1.139, "mean_token_accuracy": 0.6935909980430529, "step": 9740 }, { "epoch": 0.9019390068952751, "grad_norm": 4.375, "learning_rate": 5.797217681801836e-07, "loss": 1.1373, "mean_token_accuracy": 0.6945450097847359, "step": 9745 }, { "epoch": 0.9024017770373456, "grad_norm": 4.125, "learning_rate": 5.743133421607216e-07, "loss": 1.1627, "mean_token_accuracy": 0.6868884540117419, "step": 9750 }, { "epoch": 0.902864547179416, "grad_norm": 4.28125, "learning_rate": 5.68929517103044e-07, "loss": 1.1596, "mean_token_accuracy": 0.6884295499021527, "step": 9755 }, { "epoch": 0.9033273173214864, "grad_norm": 4.25, "learning_rate": 5.635703070588872e-07, "loss": 1.1563, "mean_token_accuracy": 0.6888454011741685, "step": 9760 }, { "epoch": 0.9037900874635568, "grad_norm": 4.09375, "learning_rate": 5.582357260157434e-07, "loss": 1.1706, "mean_token_accuracy": 0.6855430528375733, "step": 9765 }, { "epoch": 0.9042528576056272, "grad_norm": 4.5, "learning_rate": 5.529257878968219e-07, "loss": 1.1577, "mean_token_accuracy": 0.6868639921722114, "step": 9770 }, { "epoch": 0.9047156277476978, "grad_norm": 4.34375, "learning_rate": 5.47640506561018e-07, "loss": 1.155, "mean_token_accuracy": 0.6915851272015656, "step": 9775 }, { "epoch": 0.9051783978897682, "grad_norm": 4.21875, "learning_rate": 5.423798958028703e-07, "loss": 1.1528, "mean_token_accuracy": 0.69043542074364, "step": 9780 }, { "epoch": 0.9056411680318386, "grad_norm": 4.28125, "learning_rate": 5.371439693525282e-07, "loss": 1.2499, "mean_token_accuracy": 0.6659980430528376, "step": 9785 }, { "epoch": 0.906103938173909, "grad_norm": 5.03125, "learning_rate": 5.319327408757158e-07, "loss": 1.1402, "mean_token_accuracy": 0.6922945205479452, "step": 9790 }, { "epoch": 0.9065667083159794, "grad_norm": 4.0625, "learning_rate": 5.267462239736931e-07, "loss": 1.1677, "mean_token_accuracy": 0.6872064579256361, "step": 9795 }, { "epoch": 0.9070294784580499, "grad_norm": 4.65625, "learning_rate": 5.215844321832264e-07, "loss": 1.1791, "mean_token_accuracy": 0.6826076320939336, "step": 9800 }, { "epoch": 0.9070294784580499, "eval_loss": 1.18779456615448, "eval_mean_token_accuracy": 0.6806468627690793, "eval_runtime": 39.5758, "eval_samples_per_second": 25.874, "eval_steps_per_second": 6.469, "step": 9800 }, { "epoch": 0.9074922486001203, "grad_norm": 4.03125, "learning_rate": 5.164473789765478e-07, "loss": 1.0833, "mean_token_accuracy": 0.7081947162426616, "step": 9805 }, { "epoch": 0.9079550187421908, "grad_norm": 4.09375, "learning_rate": 5.113350777613224e-07, "loss": 1.1452, "mean_token_accuracy": 0.6921722113502934, "step": 9810 }, { "epoch": 0.9084177888842612, "grad_norm": 4.0625, "learning_rate": 5.062475418806123e-07, "loss": 1.1114, "mean_token_accuracy": 0.6944227005870841, "step": 9815 }, { "epoch": 0.9088805590263316, "grad_norm": 4.53125, "learning_rate": 5.011847846128415e-07, "loss": 1.1129, "mean_token_accuracy": 0.7009784735812133, "step": 9820 }, { "epoch": 0.9093433291684021, "grad_norm": 4.59375, "learning_rate": 4.961468191717678e-07, "loss": 1.1808, "mean_token_accuracy": 0.6826810176125243, "step": 9825 }, { "epoch": 0.9098060993104725, "grad_norm": 4.53125, "learning_rate": 4.911336587064341e-07, "loss": 1.1575, "mean_token_accuracy": 0.6870841487279844, "step": 9830 }, { "epoch": 0.9102688694525429, "grad_norm": 4.0, "learning_rate": 4.861453163011476e-07, "loss": 1.0815, "mean_token_accuracy": 0.7073385518590999, "step": 9835 }, { "epoch": 0.9107316395946133, "grad_norm": 4.96875, "learning_rate": 4.811818049754402e-07, "loss": 1.1429, "mean_token_accuracy": 0.6902152641878667, "step": 9840 }, { "epoch": 0.9111944097366838, "grad_norm": 4.28125, "learning_rate": 4.7624313768403044e-07, "loss": 1.1243, "mean_token_accuracy": 0.69486301369863, "step": 9845 }, { "epoch": 0.9116571798787543, "grad_norm": 4.3125, "learning_rate": 4.7132932731680294e-07, "loss": 1.1495, "mean_token_accuracy": 0.6867661448140899, "step": 9850 }, { "epoch": 0.9121199500208247, "grad_norm": 4.59375, "learning_rate": 4.664403866987588e-07, "loss": 1.2406, "mean_token_accuracy": 0.6676859099804303, "step": 9855 }, { "epoch": 0.9125827201628951, "grad_norm": 3.90625, "learning_rate": 4.61576328589991e-07, "loss": 1.2097, "mean_token_accuracy": 0.6795499021526419, "step": 9860 }, { "epoch": 0.9130454903049655, "grad_norm": 4.125, "learning_rate": 4.567371656856501e-07, "loss": 1.1652, "mean_token_accuracy": 0.6849315068493149, "step": 9865 }, { "epoch": 0.9135082604470359, "grad_norm": 4.34375, "learning_rate": 4.519229106159084e-07, "loss": 1.184, "mean_token_accuracy": 0.6792074363992172, "step": 9870 }, { "epoch": 0.9139710305891063, "grad_norm": 4.4375, "learning_rate": 4.471335759459328e-07, "loss": 1.1601, "mean_token_accuracy": 0.6867172211350294, "step": 9875 }, { "epoch": 0.9144338007311769, "grad_norm": 4.96875, "learning_rate": 4.4236917417584513e-07, "loss": 1.1571, "mean_token_accuracy": 0.6884784735812133, "step": 9880 }, { "epoch": 0.9148965708732473, "grad_norm": 4.1875, "learning_rate": 4.3762971774069406e-07, "loss": 1.1202, "mean_token_accuracy": 0.6959637964774952, "step": 9885 }, { "epoch": 0.9153593410153177, "grad_norm": 4.1875, "learning_rate": 4.329152190104191e-07, "loss": 1.1715, "mean_token_accuracy": 0.6890166340508805, "step": 9890 }, { "epoch": 0.9158221111573881, "grad_norm": 4.46875, "learning_rate": 4.2822569028982074e-07, "loss": 1.1197, "mean_token_accuracy": 0.6992416829745596, "step": 9895 }, { "epoch": 0.9162848812994585, "grad_norm": 4.40625, "learning_rate": 4.2356114381853276e-07, "loss": 1.1866, "mean_token_accuracy": 0.676320939334638, "step": 9900 }, { "epoch": 0.9162848812994585, "eval_loss": 1.1877682209014893, "eval_mean_token_accuracy": 0.6808551706213305, "eval_runtime": 39.8586, "eval_samples_per_second": 25.691, "eval_steps_per_second": 6.423, "step": 9900 }, { "epoch": 0.916747651441529, "grad_norm": 4.25, "learning_rate": 4.189215917709788e-07, "loss": 1.204, "mean_token_accuracy": 0.6773972602739726, "step": 9905 }, { "epoch": 0.9172104215835994, "grad_norm": 4.9375, "learning_rate": 4.1430704625635144e-07, "loss": 1.1868, "mean_token_accuracy": 0.6777397260273971, "step": 9910 }, { "epoch": 0.9176731917256699, "grad_norm": 4.53125, "learning_rate": 4.0971751931857205e-07, "loss": 1.1633, "mean_token_accuracy": 0.6852005870841487, "step": 9915 }, { "epoch": 0.9181359618677403, "grad_norm": 4.3125, "learning_rate": 4.05153022936271e-07, "loss": 1.1845, "mean_token_accuracy": 0.6818493150684933, "step": 9920 }, { "epoch": 0.9185987320098107, "grad_norm": 3.9375, "learning_rate": 4.0061356902274306e-07, "loss": 1.1986, "mean_token_accuracy": 0.6804794520547943, "step": 9925 }, { "epoch": 0.9190615021518812, "grad_norm": 4.1875, "learning_rate": 3.960991694259242e-07, "loss": 1.1848, "mean_token_accuracy": 0.6851027397260274, "step": 9930 }, { "epoch": 0.9195242722939516, "grad_norm": 4.40625, "learning_rate": 3.916098359283593e-07, "loss": 1.1688, "mean_token_accuracy": 0.6861301369863014, "step": 9935 }, { "epoch": 0.919987042436022, "grad_norm": 4.34375, "learning_rate": 3.8714558024717017e-07, "loss": 1.1662, "mean_token_accuracy": 0.6844422700587084, "step": 9940 }, { "epoch": 0.9204498125780924, "grad_norm": 4.28125, "learning_rate": 3.827064140340264e-07, "loss": 1.1071, "mean_token_accuracy": 0.7009784735812132, "step": 9945 }, { "epoch": 0.9209125827201629, "grad_norm": 4.6875, "learning_rate": 3.7829234887511334e-07, "loss": 1.1922, "mean_token_accuracy": 0.6789872798434443, "step": 9950 }, { "epoch": 0.9213753528622334, "grad_norm": 4.125, "learning_rate": 3.7390339629110316e-07, "loss": 1.1538, "mean_token_accuracy": 0.6904843444227006, "step": 9955 }, { "epoch": 0.9218381230043038, "grad_norm": 4.375, "learning_rate": 3.6953956773712495e-07, "loss": 1.1641, "mean_token_accuracy": 0.6851516634050882, "step": 9960 }, { "epoch": 0.9223008931463742, "grad_norm": 4.40625, "learning_rate": 3.6520087460273357e-07, "loss": 1.2017, "mean_token_accuracy": 0.6782045009784735, "step": 9965 }, { "epoch": 0.9227636632884446, "grad_norm": 4.15625, "learning_rate": 3.60887328211883e-07, "loss": 1.1516, "mean_token_accuracy": 0.6912181996086104, "step": 9970 }, { "epoch": 0.923226433430515, "grad_norm": 4.375, "learning_rate": 3.5659893982289083e-07, "loss": 1.1538, "mean_token_accuracy": 0.6883317025440314, "step": 9975 }, { "epoch": 0.9236892035725855, "grad_norm": 4.375, "learning_rate": 3.5233572062841504e-07, "loss": 1.1085, "mean_token_accuracy": 0.6976761252446184, "step": 9980 }, { "epoch": 0.924151973714656, "grad_norm": 4.46875, "learning_rate": 3.4809768175542046e-07, "loss": 1.1458, "mean_token_accuracy": 0.688894324853229, "step": 9985 }, { "epoch": 0.9246147438567264, "grad_norm": 4.65625, "learning_rate": 3.438848342651524e-07, "loss": 1.1542, "mean_token_accuracy": 0.6884784735812133, "step": 9990 }, { "epoch": 0.9250775139987968, "grad_norm": 4.375, "learning_rate": 3.396971891531076e-07, "loss": 1.1236, "mean_token_accuracy": 0.6934442270058709, "step": 9995 }, { "epoch": 0.9255402841408672, "grad_norm": 4.21875, "learning_rate": 3.355347573490053e-07, "loss": 1.1863, "mean_token_accuracy": 0.681335616438356, "step": 10000 }, { "epoch": 0.9255402841408672, "eval_loss": 1.1877409219741821, "eval_mean_token_accuracy": 0.6806506849315066, "eval_runtime": 40.2391, "eval_samples_per_second": 25.448, "eval_steps_per_second": 6.362, "step": 10000 }, { "epoch": 0.9260030542829376, "grad_norm": 4.125, "learning_rate": 3.313975497167543e-07, "loss": 1.2104, "mean_token_accuracy": 0.673385518590998, "step": 10005 }, { "epoch": 0.9264658244250081, "grad_norm": 3.84375, "learning_rate": 3.272855770544314e-07, "loss": 1.1399, "mean_token_accuracy": 0.6934197651663406, "step": 10010 }, { "epoch": 0.9269285945670785, "grad_norm": 4.375, "learning_rate": 3.231988500942529e-07, "loss": 1.1642, "mean_token_accuracy": 0.6847602739726028, "step": 10015 }, { "epoch": 0.927391364709149, "grad_norm": 4.09375, "learning_rate": 3.191373795025399e-07, "loss": 1.183, "mean_token_accuracy": 0.6841487279843443, "step": 10020 }, { "epoch": 0.9278541348512194, "grad_norm": 4.1875, "learning_rate": 3.1510117587969516e-07, "loss": 1.0332, "mean_token_accuracy": 0.7193493150684931, "step": 10025 }, { "epoch": 0.9283169049932898, "grad_norm": 4.5625, "learning_rate": 3.110902497601764e-07, "loss": 1.1419, "mean_token_accuracy": 0.6951810176125243, "step": 10030 }, { "epoch": 0.9287796751353603, "grad_norm": 4.53125, "learning_rate": 3.071046116124654e-07, "loss": 1.1688, "mean_token_accuracy": 0.6825342465753426, "step": 10035 }, { "epoch": 0.9292424452774307, "grad_norm": 4.25, "learning_rate": 3.0314427183904647e-07, "loss": 1.0958, "mean_token_accuracy": 0.7039628180039139, "step": 10040 }, { "epoch": 0.9297052154195011, "grad_norm": 4.4375, "learning_rate": 2.9920924077637027e-07, "loss": 1.1443, "mean_token_accuracy": 0.6931262230919765, "step": 10045 }, { "epoch": 0.9301679855615715, "grad_norm": 4.28125, "learning_rate": 2.9529952869483567e-07, "loss": 1.0841, "mean_token_accuracy": 0.7060665362035226, "step": 10050 }, { "epoch": 0.930630755703642, "grad_norm": 4.4375, "learning_rate": 2.9141514579875685e-07, "loss": 1.1712, "mean_token_accuracy": 0.6867172211350294, "step": 10055 }, { "epoch": 0.9310935258457125, "grad_norm": 4.78125, "learning_rate": 2.8755610222634066e-07, "loss": 1.1396, "mean_token_accuracy": 0.6916585127201567, "step": 10060 }, { "epoch": 0.9315562959877829, "grad_norm": 5.03125, "learning_rate": 2.837224080496581e-07, "loss": 1.2261, "mean_token_accuracy": 0.6710127201565557, "step": 10065 }, { "epoch": 0.9320190661298533, "grad_norm": 4.1875, "learning_rate": 2.799140732746175e-07, "loss": 1.1722, "mean_token_accuracy": 0.6868395303326811, "step": 10070 }, { "epoch": 0.9324818362719237, "grad_norm": 4.53125, "learning_rate": 2.761311078409401e-07, "loss": 1.1295, "mean_token_accuracy": 0.6943003913894324, "step": 10075 }, { "epoch": 0.9329446064139941, "grad_norm": 4.5, "learning_rate": 2.723735216221335e-07, "loss": 1.1217, "mean_token_accuracy": 0.6956947162426613, "step": 10080 }, { "epoch": 0.9334073765560646, "grad_norm": 4.03125, "learning_rate": 2.6864132442546596e-07, "loss": 1.0812, "mean_token_accuracy": 0.7051125244618395, "step": 10085 }, { "epoch": 0.9338701466981351, "grad_norm": 4.5, "learning_rate": 2.6493452599194115e-07, "loss": 1.1227, "mean_token_accuracy": 0.6945694716242661, "step": 10090 }, { "epoch": 0.9343329168402055, "grad_norm": 3.984375, "learning_rate": 2.6125313599627e-07, "loss": 1.1698, "mean_token_accuracy": 0.686179060665362, "step": 10095 }, { "epoch": 0.9347956869822759, "grad_norm": 4.03125, "learning_rate": 2.5759716404685e-07, "loss": 1.0886, "mean_token_accuracy": 0.7069960861056752, "step": 10100 }, { "epoch": 0.9347956869822759, "eval_loss": 1.1877822875976562, "eval_mean_token_accuracy": 0.6806506849315063, "eval_runtime": 40.5065, "eval_samples_per_second": 25.28, "eval_steps_per_second": 6.32, "step": 10100 }, { "epoch": 0.9352584571243463, "grad_norm": 4.03125, "learning_rate": 2.539666196857349e-07, "loss": 1.0328, "mean_token_accuracy": 0.7223091976516632, "step": 10105 }, { "epoch": 0.9357212272664168, "grad_norm": 4.25, "learning_rate": 2.503615123886172e-07, "loss": 1.1497, "mean_token_accuracy": 0.6897504892367906, "step": 10110 }, { "epoch": 0.9361839974084872, "grad_norm": 4.40625, "learning_rate": 2.4678185156479573e-07, "loss": 1.1817, "mean_token_accuracy": 0.6842710371819961, "step": 10115 }, { "epoch": 0.9366467675505576, "grad_norm": 4.09375, "learning_rate": 2.4322764655715257e-07, "loss": 1.0742, "mean_token_accuracy": 0.7100293542074364, "step": 10120 }, { "epoch": 0.9371095376926281, "grad_norm": 4.28125, "learning_rate": 2.396989066421351e-07, "loss": 1.1281, "mean_token_accuracy": 0.6943493150684931, "step": 10125 }, { "epoch": 0.9375723078346985, "grad_norm": 4.3125, "learning_rate": 2.361956410297217e-07, "loss": 1.1688, "mean_token_accuracy": 0.6819716242661448, "step": 10130 }, { "epoch": 0.938035077976769, "grad_norm": 4.25, "learning_rate": 2.3271785886340715e-07, "loss": 1.1331, "mean_token_accuracy": 0.6944471624266144, "step": 10135 }, { "epoch": 0.9384978481188394, "grad_norm": 3.921875, "learning_rate": 2.2926556922017397e-07, "loss": 1.0839, "mean_token_accuracy": 0.7043542074363991, "step": 10140 }, { "epoch": 0.9389606182609098, "grad_norm": 4.46875, "learning_rate": 2.2583878111046785e-07, "loss": 1.1966, "mean_token_accuracy": 0.6783757338551859, "step": 10145 }, { "epoch": 0.9394233884029802, "grad_norm": 4.28125, "learning_rate": 2.2243750347817783e-07, "loss": 1.1812, "mean_token_accuracy": 0.6829745596868885, "step": 10150 }, { "epoch": 0.9398861585450506, "grad_norm": 4.53125, "learning_rate": 2.1906174520060608e-07, "loss": 1.1645, "mean_token_accuracy": 0.685004892367906, "step": 10155 }, { "epoch": 0.940348928687121, "grad_norm": 4.78125, "learning_rate": 2.157115150884559e-07, "loss": 1.1723, "mean_token_accuracy": 0.6850048923679062, "step": 10160 }, { "epoch": 0.9408116988291916, "grad_norm": 4.8125, "learning_rate": 2.123868218857994e-07, "loss": 1.1284, "mean_token_accuracy": 0.6938845401174168, "step": 10165 }, { "epoch": 0.941274468971262, "grad_norm": 4.40625, "learning_rate": 2.090876742700565e-07, "loss": 1.18, "mean_token_accuracy": 0.6811399217221135, "step": 10170 }, { "epoch": 0.9417372391133324, "grad_norm": 4.4375, "learning_rate": 2.0581408085197485e-07, "loss": 1.1501, "mean_token_accuracy": 0.6876467710371821, "step": 10175 }, { "epoch": 0.9422000092554028, "grad_norm": 4.375, "learning_rate": 2.0256605017560437e-07, "loss": 1.15, "mean_token_accuracy": 0.6883317025440313, "step": 10180 }, { "epoch": 0.9426627793974732, "grad_norm": 4.5625, "learning_rate": 1.9934359071827836e-07, "loss": 1.2005, "mean_token_accuracy": 0.6767857142857142, "step": 10185 }, { "epoch": 0.9431255495395438, "grad_norm": 4.1875, "learning_rate": 1.961467108905879e-07, "loss": 1.1166, "mean_token_accuracy": 0.698605675146771, "step": 10190 }, { "epoch": 0.9435883196816142, "grad_norm": 4.875, "learning_rate": 1.9297541903636196e-07, "loss": 1.1899, "mean_token_accuracy": 0.6806262230919766, "step": 10195 }, { "epoch": 0.9440510898236846, "grad_norm": 4.0625, "learning_rate": 1.8982972343264516e-07, "loss": 1.2691, "mean_token_accuracy": 0.6629892367906065, "step": 10200 }, { "epoch": 0.9440510898236846, "eval_loss": 1.1876749992370605, "eval_mean_token_accuracy": 0.6807271281800384, "eval_runtime": 39.2215, "eval_samples_per_second": 26.108, "eval_steps_per_second": 6.527, "step": 10200 }, { "epoch": 0.944513859965755, "grad_norm": 4.0625, "learning_rate": 1.8670963228967775e-07, "loss": 1.1653, "mean_token_accuracy": 0.6876223091976517, "step": 10205 }, { "epoch": 0.9449766301078254, "grad_norm": 4.90625, "learning_rate": 1.8361515375087014e-07, "loss": 1.1376, "mean_token_accuracy": 0.6979941291585127, "step": 10210 }, { "epoch": 0.9454394002498959, "grad_norm": 4.40625, "learning_rate": 1.8054629589278394e-07, "loss": 1.2089, "mean_token_accuracy": 0.6760763209393347, "step": 10215 }, { "epoch": 0.9459021703919663, "grad_norm": 4.1875, "learning_rate": 1.775030667251143e-07, "loss": 1.18, "mean_token_accuracy": 0.6836839530332682, "step": 10220 }, { "epoch": 0.9463649405340367, "grad_norm": 4.9375, "learning_rate": 1.7448547419065985e-07, "loss": 1.2247, "mean_token_accuracy": 0.6714774951076321, "step": 10225 }, { "epoch": 0.9468277106761072, "grad_norm": 4.25, "learning_rate": 1.714935261653139e-07, "loss": 1.1677, "mean_token_accuracy": 0.6860322896281801, "step": 10230 }, { "epoch": 0.9472904808181776, "grad_norm": 4.40625, "learning_rate": 1.6852723045803542e-07, "loss": 1.1733, "mean_token_accuracy": 0.685958904109589, "step": 10235 }, { "epoch": 0.9477532509602481, "grad_norm": 4.375, "learning_rate": 1.6558659481083038e-07, "loss": 1.1497, "mean_token_accuracy": 0.688820939334638, "step": 10240 }, { "epoch": 0.9482160211023185, "grad_norm": 4.3125, "learning_rate": 1.626716268987316e-07, "loss": 1.1842, "mean_token_accuracy": 0.6828277886497063, "step": 10245 }, { "epoch": 0.9486787912443889, "grad_norm": 4.0625, "learning_rate": 1.5978233432978107e-07, "loss": 1.1234, "mean_token_accuracy": 0.6960861056751468, "step": 10250 }, { "epoch": 0.9491415613864593, "grad_norm": 4.46875, "learning_rate": 1.569187246450099e-07, "loss": 1.1963, "mean_token_accuracy": 0.6792074363992172, "step": 10255 }, { "epoch": 0.9496043315285297, "grad_norm": 4.15625, "learning_rate": 1.5408080531841175e-07, "loss": 1.1633, "mean_token_accuracy": 0.6890166340508806, "step": 10260 }, { "epoch": 0.9500671016706003, "grad_norm": 5.65625, "learning_rate": 1.5126858375693387e-07, "loss": 1.1446, "mean_token_accuracy": 0.6940313111545989, "step": 10265 }, { "epoch": 0.9505298718126707, "grad_norm": 4.46875, "learning_rate": 1.4848206730045055e-07, "loss": 1.1095, "mean_token_accuracy": 0.6963307240704502, "step": 10270 }, { "epoch": 0.9509926419547411, "grad_norm": 4.125, "learning_rate": 1.4572126322174419e-07, "loss": 1.1321, "mean_token_accuracy": 0.6962818003913894, "step": 10275 }, { "epoch": 0.9514554120968115, "grad_norm": 4.09375, "learning_rate": 1.429861787264919e-07, "loss": 1.1483, "mean_token_accuracy": 0.693884540117417, "step": 10280 }, { "epoch": 0.9519181822388819, "grad_norm": 4.40625, "learning_rate": 1.4027682095324015e-07, "loss": 1.1569, "mean_token_accuracy": 0.6874021526418786, "step": 10285 }, { "epoch": 0.9523809523809523, "grad_norm": 4.625, "learning_rate": 1.37593196973389e-07, "loss": 1.1562, "mean_token_accuracy": 0.6874755381604697, "step": 10290 }, { "epoch": 0.9528437225230229, "grad_norm": 4.28125, "learning_rate": 1.3493531379117463e-07, "loss": 1.1321, "mean_token_accuracy": 0.6936643835616438, "step": 10295 }, { "epoch": 0.9533064926650933, "grad_norm": 4.15625, "learning_rate": 1.3230317834365013e-07, "loss": 1.2274, "mean_token_accuracy": 0.6722113502935421, "step": 10300 }, { "epoch": 0.9533064926650933, "eval_loss": 1.1876964569091797, "eval_mean_token_accuracy": 0.680729039261252, "eval_runtime": 39.5968, "eval_samples_per_second": 25.861, "eval_steps_per_second": 6.465, "step": 10300 }, { "epoch": 0.9537692628071637, "grad_norm": 4.25, "learning_rate": 1.2969679750066577e-07, "loss": 1.1471, "mean_token_accuracy": 0.6887720156555774, "step": 10305 }, { "epoch": 0.9542320329492341, "grad_norm": 4.125, "learning_rate": 1.271161780648533e-07, "loss": 1.1401, "mean_token_accuracy": 0.6942025440313111, "step": 10310 }, { "epoch": 0.9546948030913045, "grad_norm": 4.375, "learning_rate": 1.2456132677160836e-07, "loss": 1.2106, "mean_token_accuracy": 0.6769324853228962, "step": 10315 }, { "epoch": 0.955157573233375, "grad_norm": 4.21875, "learning_rate": 1.2203225028906918e-07, "loss": 1.1699, "mean_token_accuracy": 0.6847358121330724, "step": 10320 }, { "epoch": 0.9556203433754454, "grad_norm": 4.125, "learning_rate": 1.195289552181056e-07, "loss": 1.1679, "mean_token_accuracy": 0.6871819960861056, "step": 10325 }, { "epoch": 0.9560831135175158, "grad_norm": 4.78125, "learning_rate": 1.1705144809229684e-07, "loss": 1.129, "mean_token_accuracy": 0.6935909980430528, "step": 10330 }, { "epoch": 0.9565458836595863, "grad_norm": 4.0625, "learning_rate": 1.145997353779127e-07, "loss": 1.1612, "mean_token_accuracy": 0.6858855185909981, "step": 10335 }, { "epoch": 0.9570086538016567, "grad_norm": 4.375, "learning_rate": 1.1217382347390448e-07, "loss": 1.1787, "mean_token_accuracy": 0.6818982387475538, "step": 10340 }, { "epoch": 0.9574714239437272, "grad_norm": 4.4375, "learning_rate": 1.097737187118808e-07, "loss": 1.2586, "mean_token_accuracy": 0.6625244618395303, "step": 10345 }, { "epoch": 0.9579341940857976, "grad_norm": 4.28125, "learning_rate": 1.0739942735609521e-07, "loss": 1.168, "mean_token_accuracy": 0.6868884540117416, "step": 10350 }, { "epoch": 0.958396964227868, "grad_norm": 4.34375, "learning_rate": 1.0505095560342848e-07, "loss": 1.1019, "mean_token_accuracy": 0.699706457925636, "step": 10355 }, { "epoch": 0.9588597343699384, "grad_norm": 4.84375, "learning_rate": 1.0272830958336976e-07, "loss": 1.1646, "mean_token_accuracy": 0.6867172211350294, "step": 10360 }, { "epoch": 0.9593225045120088, "grad_norm": 3.90625, "learning_rate": 1.0043149535800766e-07, "loss": 1.0481, "mean_token_accuracy": 0.7152641878669275, "step": 10365 }, { "epoch": 0.9597852746540794, "grad_norm": 4.15625, "learning_rate": 9.816051892200584e-08, "loss": 1.2075, "mean_token_accuracy": 0.6712084148727985, "step": 10370 }, { "epoch": 0.9602480447961498, "grad_norm": 4.21875, "learning_rate": 9.59153862025941e-08, "loss": 1.1596, "mean_token_accuracy": 0.6842221135029353, "step": 10375 }, { "epoch": 0.9607108149382202, "grad_norm": 4.84375, "learning_rate": 9.369610305955067e-08, "loss": 1.1738, "mean_token_accuracy": 0.6861301369863014, "step": 10380 }, { "epoch": 0.9611735850802906, "grad_norm": 4.09375, "learning_rate": 9.150267528518442e-08, "loss": 1.1393, "mean_token_accuracy": 0.6887720156555772, "step": 10385 }, { "epoch": 0.961636355222361, "grad_norm": 4.375, "learning_rate": 8.933510860432371e-08, "loss": 1.2047, "mean_token_accuracy": 0.6742661448140901, "step": 10390 }, { "epoch": 0.9620991253644315, "grad_norm": 4.34375, "learning_rate": 8.719340867429871e-08, "loss": 1.2185, "mean_token_accuracy": 0.6721624266144813, "step": 10395 }, { "epoch": 0.962561895506502, "grad_norm": 4.34375, "learning_rate": 8.507758108492803e-08, "loss": 1.1062, "mean_token_accuracy": 0.6992172211350295, "step": 10400 }, { "epoch": 0.962561895506502, "eval_loss": 1.1877918243408203, "eval_mean_token_accuracy": 0.6806774400684926, "eval_runtime": 39.4992, "eval_samples_per_second": 25.925, "eval_steps_per_second": 6.481, "step": 10400 }, { "epoch": 0.9630246656485724, "grad_norm": 5.21875, "learning_rate": 8.29876313585043e-08, "loss": 1.1375, "mean_token_accuracy": 0.6950831702544031, "step": 10405 }, { "epoch": 0.9634874357906428, "grad_norm": 4.3125, "learning_rate": 8.092356494977749e-08, "loss": 1.1386, "mean_token_accuracy": 0.692270058708415, "step": 10410 }, { "epoch": 0.9639502059327132, "grad_norm": 3.96875, "learning_rate": 7.888538724594275e-08, "loss": 1.1985, "mean_token_accuracy": 0.6770303326810176, "step": 10415 }, { "epoch": 0.9644129760747836, "grad_norm": 4.21875, "learning_rate": 7.687310356662814e-08, "loss": 1.21, "mean_token_accuracy": 0.6748043052837573, "step": 10420 }, { "epoch": 0.9648757462168541, "grad_norm": 4.125, "learning_rate": 7.488671916387691e-08, "loss": 1.2082, "mean_token_accuracy": 0.6741682974559686, "step": 10425 }, { "epoch": 0.9653385163589245, "grad_norm": 4.15625, "learning_rate": 7.292623922213416e-08, "loss": 1.1155, "mean_token_accuracy": 0.6995596868884538, "step": 10430 }, { "epoch": 0.965801286500995, "grad_norm": 4.15625, "learning_rate": 7.099166885823683e-08, "loss": 1.1499, "mean_token_accuracy": 0.6899461839530334, "step": 10435 }, { "epoch": 0.9662640566430654, "grad_norm": 4.28125, "learning_rate": 6.90830131213971e-08, "loss": 1.0625, "mean_token_accuracy": 0.7115459882583172, "step": 10440 }, { "epoch": 0.9667268267851358, "grad_norm": 4.75, "learning_rate": 6.720027699318899e-08, "loss": 1.1506, "mean_token_accuracy": 0.6939579256360079, "step": 10445 }, { "epoch": 0.9671895969272063, "grad_norm": 4.125, "learning_rate": 6.534346538753955e-08, "loss": 1.1932, "mean_token_accuracy": 0.6802103718199609, "step": 10450 }, { "epoch": 0.9676523670692767, "grad_norm": 4.15625, "learning_rate": 6.351258315070886e-08, "loss": 1.1791, "mean_token_accuracy": 0.6813111545988259, "step": 10455 }, { "epoch": 0.9681151372113471, "grad_norm": 4.0625, "learning_rate": 6.170763506128552e-08, "loss": 1.0973, "mean_token_accuracy": 0.7025440313111544, "step": 10460 }, { "epoch": 0.9685779073534175, "grad_norm": 4.1875, "learning_rate": 5.99286258301679e-08, "loss": 1.188, "mean_token_accuracy": 0.6789138943248533, "step": 10465 }, { "epoch": 0.969040677495488, "grad_norm": 4.0625, "learning_rate": 5.817556010055625e-08, "loss": 1.039, "mean_token_accuracy": 0.7186399217221136, "step": 10470 }, { "epoch": 0.9695034476375585, "grad_norm": 4.15625, "learning_rate": 5.644844244793613e-08, "loss": 1.1319, "mean_token_accuracy": 0.6909491193737769, "step": 10475 }, { "epoch": 0.9699662177796289, "grad_norm": 4.46875, "learning_rate": 5.474727738007057e-08, "loss": 1.1468, "mean_token_accuracy": 0.6903131115459883, "step": 10480 }, { "epoch": 0.9704289879216993, "grad_norm": 4.25, "learning_rate": 5.30720693369835e-08, "loss": 1.1702, "mean_token_accuracy": 0.6847847358121332, "step": 10485 }, { "epoch": 0.9708917580637697, "grad_norm": 4.09375, "learning_rate": 5.142282269095744e-08, "loss": 1.1472, "mean_token_accuracy": 0.6916095890410958, "step": 10490 }, { "epoch": 0.9713545282058401, "grad_norm": 4.125, "learning_rate": 4.979954174650914e-08, "loss": 1.168, "mean_token_accuracy": 0.6864970645792562, "step": 10495 }, { "epoch": 0.9718172983479106, "grad_norm": 3.984375, "learning_rate": 4.8202230740389545e-08, "loss": 1.1215, "mean_token_accuracy": 0.6950831702544031, "step": 10500 }, { "epoch": 0.9718172983479106, "eval_loss": 1.1877927780151367, "eval_mean_token_accuracy": 0.6807443279109585, "eval_runtime": 39.5774, "eval_samples_per_second": 25.873, "eval_steps_per_second": 6.468, "step": 10500 }, { "epoch": 0.972280068489981, "grad_norm": 4.28125, "learning_rate": 4.663089384156605e-08, "loss": 1.2703, "mean_token_accuracy": 0.6601272015655578, "step": 10505 }, { "epoch": 0.9727428386320515, "grad_norm": 4.3125, "learning_rate": 4.508553515121472e-08, "loss": 1.1528, "mean_token_accuracy": 0.6897749510763209, "step": 10510 }, { "epoch": 0.9732056087741219, "grad_norm": 4.03125, "learning_rate": 4.356615870270808e-08, "loss": 1.1865, "mean_token_accuracy": 0.678082191780822, "step": 10515 }, { "epoch": 0.9736683789161923, "grad_norm": 4.28125, "learning_rate": 4.207276846160735e-08, "loss": 1.1158, "mean_token_accuracy": 0.6954500978473581, "step": 10520 }, { "epoch": 0.9741311490582628, "grad_norm": 4.21875, "learning_rate": 4.0605368325645764e-08, "loss": 1.158, "mean_token_accuracy": 0.6846379647749509, "step": 10525 }, { "epoch": 0.9745939192003332, "grad_norm": 4.40625, "learning_rate": 3.916396212472751e-08, "loss": 1.1859, "mean_token_accuracy": 0.6805283757338553, "step": 10530 }, { "epoch": 0.9750566893424036, "grad_norm": 4.34375, "learning_rate": 3.774855362090879e-08, "loss": 1.1421, "mean_token_accuracy": 0.6920988258317026, "step": 10535 }, { "epoch": 0.975519459484474, "grad_norm": 4.25, "learning_rate": 3.635914650839567e-08, "loss": 1.0928, "mean_token_accuracy": 0.705381604696673, "step": 10540 }, { "epoch": 0.9759822296265445, "grad_norm": 4.0625, "learning_rate": 3.4995744413526265e-08, "loss": 1.1682, "mean_token_accuracy": 0.685958904109589, "step": 10545 }, { "epoch": 0.9764449997686149, "grad_norm": 4.53125, "learning_rate": 3.3658350894770766e-08, "loss": 1.1995, "mean_token_accuracy": 0.680283757338552, "step": 10550 }, { "epoch": 0.9769077699106854, "grad_norm": 4.21875, "learning_rate": 3.2346969442713647e-08, "loss": 1.1751, "mean_token_accuracy": 0.6853962818003915, "step": 10555 }, { "epoch": 0.9773705400527558, "grad_norm": 4.125, "learning_rate": 3.106160348004927e-08, "loss": 1.1094, "mean_token_accuracy": 0.6980430528375736, "step": 10560 }, { "epoch": 0.9778333101948262, "grad_norm": 4.53125, "learning_rate": 2.9802256361572967e-08, "loss": 1.2064, "mean_token_accuracy": 0.6755381604696674, "step": 10565 }, { "epoch": 0.9782960803368966, "grad_norm": 5.25, "learning_rate": 2.856893137416772e-08, "loss": 1.1579, "mean_token_accuracy": 0.6878424657534247, "step": 10570 }, { "epoch": 0.978758850478967, "grad_norm": 4.34375, "learning_rate": 2.7361631736804174e-08, "loss": 1.1906, "mean_token_accuracy": 0.6811888454011741, "step": 10575 }, { "epoch": 0.9792216206210376, "grad_norm": 4.375, "learning_rate": 2.618036060052176e-08, "loss": 1.1604, "mean_token_accuracy": 0.6860812133072407, "step": 10580 }, { "epoch": 0.979684390763108, "grad_norm": 5.25, "learning_rate": 2.502512104842869e-08, "loss": 1.1577, "mean_token_accuracy": 0.6899951076320939, "step": 10585 }, { "epoch": 0.9801471609051784, "grad_norm": 4.25, "learning_rate": 2.3895916095690865e-08, "loss": 1.1429, "mean_token_accuracy": 0.6944471624266145, "step": 10590 }, { "epoch": 0.9806099310472488, "grad_norm": 5.28125, "learning_rate": 2.279274868952297e-08, "loss": 1.1823, "mean_token_accuracy": 0.6846868884540118, "step": 10595 }, { "epoch": 0.9810727011893192, "grad_norm": 4.21875, "learning_rate": 2.1715621709182953e-08, "loss": 1.2088, "mean_token_accuracy": 0.6732632093933463, "step": 10600 }, { "epoch": 0.9810727011893192, "eval_loss": 1.187716007232666, "eval_mean_token_accuracy": 0.6805379311399209, "eval_runtime": 39.0779, "eval_samples_per_second": 26.204, "eval_steps_per_second": 6.551, "step": 10600 }, { "epoch": 0.9815354713313897, "grad_norm": 3.890625, "learning_rate": 2.0664537965962016e-08, "loss": 1.0902, "mean_token_accuracy": 0.7021037181996086, "step": 10605 }, { "epoch": 0.9819982414734602, "grad_norm": 4.46875, "learning_rate": 1.9639500203181282e-08, "loss": 1.1778, "mean_token_accuracy": 0.6839530332681016, "step": 10610 }, { "epoch": 0.9824610116155306, "grad_norm": 4.1875, "learning_rate": 1.86405110961807e-08, "loss": 1.1391, "mean_token_accuracy": 0.6929305283757338, "step": 10615 }, { "epoch": 0.982923781757601, "grad_norm": 4.125, "learning_rate": 1.7667573252311275e-08, "loss": 1.1885, "mean_token_accuracy": 0.6800880626223089, "step": 10620 }, { "epoch": 0.9833865518996714, "grad_norm": 4.03125, "learning_rate": 1.672068921093395e-08, "loss": 1.1466, "mean_token_accuracy": 0.6873776908023483, "step": 10625 }, { "epoch": 0.9838493220417419, "grad_norm": 4.0, "learning_rate": 1.5799861443408504e-08, "loss": 1.1598, "mean_token_accuracy": 0.6906311154598825, "step": 10630 }, { "epoch": 0.9843120921838123, "grad_norm": 4.46875, "learning_rate": 1.49050923530869e-08, "loss": 1.1402, "mean_token_accuracy": 0.6934931506849314, "step": 10635 }, { "epoch": 0.9847748623258827, "grad_norm": 3.96875, "learning_rate": 1.4036384275307735e-08, "loss": 1.1498, "mean_token_accuracy": 0.6945939334637964, "step": 10640 }, { "epoch": 0.9852376324679532, "grad_norm": 3.859375, "learning_rate": 1.3193739477392886e-08, "loss": 1.0886, "mean_token_accuracy": 0.7014921722113503, "step": 10645 }, { "epoch": 0.9857004026100236, "grad_norm": 4.125, "learning_rate": 1.2377160158638657e-08, "loss": 1.1918, "mean_token_accuracy": 0.6816536203522505, "step": 10650 }, { "epoch": 0.9861631727520941, "grad_norm": 4.21875, "learning_rate": 1.1586648450309102e-08, "loss": 1.1212, "mean_token_accuracy": 0.6948140900195694, "step": 10655 }, { "epoch": 0.9866259428941645, "grad_norm": 4.21875, "learning_rate": 1.0822206415632697e-08, "loss": 1.13, "mean_token_accuracy": 0.6993395303326808, "step": 10660 }, { "epoch": 0.9870887130362349, "grad_norm": 4.40625, "learning_rate": 1.0083836049797901e-08, "loss": 1.1592, "mean_token_accuracy": 0.6827544031311155, "step": 10665 }, { "epoch": 0.9875514831783053, "grad_norm": 4.125, "learning_rate": 9.371539279944275e-09, "loss": 1.2052, "mean_token_accuracy": 0.6745107632093932, "step": 10670 }, { "epoch": 0.9880142533203757, "grad_norm": 4.875, "learning_rate": 8.685317965160256e-09, "loss": 1.1573, "mean_token_accuracy": 0.6925880626223091, "step": 10675 }, { "epoch": 0.9884770234624463, "grad_norm": 4.21875, "learning_rate": 8.025173896477612e-09, "loss": 1.1923, "mean_token_accuracy": 0.6766144814090019, "step": 10680 }, { "epoch": 0.9889397936045167, "grad_norm": 4.5, "learning_rate": 7.391108796866997e-09, "loss": 1.2021, "mean_token_accuracy": 0.6781311154598826, "step": 10685 }, { "epoch": 0.9894025637465871, "grad_norm": 4.25, "learning_rate": 6.783124321233514e-09, "loss": 1.2066, "mean_token_accuracy": 0.6738502935420744, "step": 10690 }, { "epoch": 0.9898653338886575, "grad_norm": 4.40625, "learning_rate": 6.20122205641005e-09, "loss": 1.1199, "mean_token_accuracy": 0.6967954990215264, "step": 10695 }, { "epoch": 0.9903281040307279, "grad_norm": 4.25, "learning_rate": 5.645403521158388e-09, "loss": 1.1836, "mean_token_accuracy": 0.6815802348336597, "step": 10700 }, { "epoch": 0.9903281040307279, "eval_loss": 1.187723994255066, "eval_mean_token_accuracy": 0.680637307363013, "eval_runtime": 40.0546, "eval_samples_per_second": 25.565, "eval_steps_per_second": 6.391, "step": 10700 }, { "epoch": 0.9907908741727983, "grad_norm": 4.15625, "learning_rate": 5.115670166158104e-09, "loss": 1.2051, "mean_token_accuracy": 0.6784246575342465, "step": 10705 }, { "epoch": 0.9912536443148688, "grad_norm": 4.1875, "learning_rate": 4.612023374009899e-09, "loss": 1.1999, "mean_token_accuracy": 0.6768590998043054, "step": 10710 }, { "epoch": 0.9917164144569393, "grad_norm": 5.09375, "learning_rate": 4.134464459226717e-09, "loss": 1.2282, "mean_token_accuracy": 0.6668297455968688, "step": 10715 }, { "epoch": 0.9921791845990097, "grad_norm": 4.3125, "learning_rate": 3.682994668234852e-09, "loss": 1.1484, "mean_token_accuracy": 0.6906066536203521, "step": 10720 }, { "epoch": 0.9926419547410801, "grad_norm": 4.4375, "learning_rate": 3.25761517936507e-09, "loss": 1.1856, "mean_token_accuracy": 0.6816780821917809, "step": 10725 }, { "epoch": 0.9931047248831505, "grad_norm": 4.625, "learning_rate": 2.8583271028548297e-09, "loss": 1.1307, "mean_token_accuracy": 0.6962084148727985, "step": 10730 }, { "epoch": 0.993567495025221, "grad_norm": 4.375, "learning_rate": 2.485131480843839e-09, "loss": 1.2104, "mean_token_accuracy": 0.676541095890411, "step": 10735 }, { "epoch": 0.9940302651672914, "grad_norm": 5.46875, "learning_rate": 2.1380292873673937e-09, "loss": 1.2013, "mean_token_accuracy": 0.6781311154598825, "step": 10740 }, { "epoch": 0.9944930353093618, "grad_norm": 4.5, "learning_rate": 1.817021428359711e-09, "loss": 1.193, "mean_token_accuracy": 0.67573385518591, "step": 10745 }, { "epoch": 0.9949558054514323, "grad_norm": 4.1875, "learning_rate": 1.5221087416494862e-09, "loss": 1.2011, "mean_token_accuracy": 0.6754647749510763, "step": 10750 }, { "epoch": 0.9954185755935027, "grad_norm": 4.15625, "learning_rate": 1.2532919969554525e-09, "loss": 1.1314, "mean_token_accuracy": 0.6922211350293542, "step": 10755 }, { "epoch": 0.9958813457355732, "grad_norm": 4.84375, "learning_rate": 1.010571895887491e-09, "loss": 1.1715, "mean_token_accuracy": 0.6834882583170254, "step": 10760 }, { "epoch": 0.9963441158776436, "grad_norm": 4.125, "learning_rate": 7.939490719433007e-10, "loss": 1.1496, "mean_token_accuracy": 0.6945694716242662, "step": 10765 }, { "epoch": 0.996806886019714, "grad_norm": 4.28125, "learning_rate": 6.03424090505067e-10, "loss": 1.2012, "mean_token_accuracy": 0.6748043052837573, "step": 10770 }, { "epoch": 0.9972696561617844, "grad_norm": 4.4375, "learning_rate": 4.3899744884390306e-10, "loss": 1.0955, "mean_token_accuracy": 0.705724070450098, "step": 10775 }, { "epoch": 0.9977324263038548, "grad_norm": 4.1875, "learning_rate": 3.006695761098577e-10, "loss": 1.1532, "mean_token_accuracy": 0.690582191780822, "step": 10780 }, { "epoch": 0.9981951964459254, "grad_norm": 4.1875, "learning_rate": 1.8844083333746654e-10, "loss": 1.1861, "mean_token_accuracy": 0.6774461839530332, "step": 10785 }, { "epoch": 0.9986579665879958, "grad_norm": 4.0625, "learning_rate": 1.0231151344464174e-10, "loss": 1.1616, "mean_token_accuracy": 0.6847847358121332, "step": 10790 }, { "epoch": 0.9991207367300662, "grad_norm": 4.0625, "learning_rate": 4.228184122601065e-11, "loss": 1.1729, "mean_token_accuracy": 0.6848091976516634, "step": 10795 }, { "epoch": 0.9995835068721366, "grad_norm": 4.0625, "learning_rate": 8.351973360687383e-12, "loss": 1.1822, "mean_token_accuracy": 0.6768835616438356, "step": 10800 }, { "epoch": 0.9995835068721366, "eval_loss": 1.1877331733703613, "eval_mean_token_accuracy": 0.6808494373776907, "eval_runtime": 40.4077, "eval_samples_per_second": 25.342, "eval_steps_per_second": 6.335, "step": 10800 }, { "epoch": 0.999953722985793, "mean_token_accuracy": 0.700556506849315, "step": 10804, "total_flos": 1.900575720430633e+17, "train_loss": 1.19111062203986, "train_runtime": 26492.3013, "train_samples_per_second": 6.525, "train_steps_per_second": 0.408 } ], "logging_steps": 5, "max_steps": 10804, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.900575720430633e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }