diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,2684 +10,2684 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.007106057914372002, - "grad_norm": 19.49246232730451, - "learning_rate": 1.86046511627907e-06, - "loss": 2.4934, - "mean_token_accuracy": 0.5258669804781675, - "num_tokens": 4763584.0, + "epoch": 0.007109215320359016, + "grad_norm": 22.657577985866105, + "learning_rate": 9.302325581395349e-06, + "loss": 2.574, + "mean_token_accuracy": 0.5464246176183224, + "num_tokens": 4589382.0, "step": 5 }, { - "epoch": 0.014212115828744005, - "grad_norm": 12.095302419910517, - "learning_rate": 4.186046511627907e-06, - "loss": 2.3185, - "mean_token_accuracy": 0.5427483215928077, - "num_tokens": 9531720.0, + "epoch": 0.014218430640718031, + "grad_norm": 2.3543289370923013, + "learning_rate": 2.0930232558139536e-05, + "loss": 1.4882, + "mean_token_accuracy": 0.6589333653450012, + "num_tokens": 9171524.0, "step": 10 }, { - "epoch": 0.021318173743116006, - "grad_norm": 6.796780116822856, - "learning_rate": 6.511627906976745e-06, - "loss": 1.8082, - "mean_token_accuracy": 0.5906881660223007, - "num_tokens": 14272673.0, + "epoch": 0.021327645961077047, + "grad_norm": 0.8063547574982903, + "learning_rate": 3.2558139534883724e-05, + "loss": 1.0174, + "mean_token_accuracy": 0.7330243036150932, + "num_tokens": 13765157.0, "step": 15 }, { - "epoch": 0.02842423165748801, - "grad_norm": 2.028130586531004, - "learning_rate": 8.837209302325582e-06, - "loss": 1.4468, - "mean_token_accuracy": 0.639113237708807, - "num_tokens": 19045659.0, + "epoch": 0.028436861281436063, + "grad_norm": 0.572573905518242, + "learning_rate": 4.418604651162791e-05, + "loss": 0.8773, + "mean_token_accuracy": 0.7569610200822353, + "num_tokens": 18369874.0, "step": 20 }, { - "epoch": 0.03553028957186001, - "grad_norm": 1.0588899013746833, - "learning_rate": 1.116279069767442e-05, - "loss": 1.2276, - "mean_token_accuracy": 0.6780799143016338, - "num_tokens": 23811446.0, + "epoch": 0.035546076601795075, + "grad_norm": 0.5738482260117446, + "learning_rate": 5.5813953488372095e-05, + "loss": 0.7975, + "mean_token_accuracy": 0.7729738861322403, + "num_tokens": 22960290.0, "step": 25 }, { - "epoch": 0.04263634748623201, - "grad_norm": 0.6488648360367262, - "learning_rate": 1.3488372093023257e-05, - "loss": 1.088, - "mean_token_accuracy": 0.7027405865490437, - "num_tokens": 28572862.0, + "epoch": 0.042655291922154094, + "grad_norm": 0.5016568944917689, + "learning_rate": 6.744186046511628e-05, + "loss": 0.7632, + "mean_token_accuracy": 0.778630904853344, + "num_tokens": 27556623.0, "step": 30 }, { - "epoch": 0.04974240540060401, - "grad_norm": 0.5415057756218988, - "learning_rate": 1.5813953488372095e-05, - "loss": 1.0242, - "mean_token_accuracy": 0.7144973143935204, - "num_tokens": 33349882.0, + "epoch": 0.049764507242513106, + "grad_norm": 0.4845613474361907, + "learning_rate": 7.906976744186047e-05, + "loss": 0.7326, + "mean_token_accuracy": 0.7872321248054505, + "num_tokens": 32158408.0, "step": 35 }, { - "epoch": 0.05684846331497602, - "grad_norm": 0.48382195510047, - "learning_rate": 1.813953488372093e-05, - "loss": 0.9712, - "mean_token_accuracy": 0.7250196196138858, - "num_tokens": 38137624.0, + "epoch": 0.056873722562872125, + "grad_norm": 0.4270154516127363, + "learning_rate": 9.069767441860465e-05, + "loss": 0.7095, + "mean_token_accuracy": 0.7919960044324398, + "num_tokens": 36742233.0, "step": 40 }, { - "epoch": 0.06395452122934801, - "grad_norm": 0.49727864530874677, - "learning_rate": 1.99999761632652e-05, - "loss": 0.9247, - "mean_token_accuracy": 0.7344170436263084, - "num_tokens": 42901997.0, + "epoch": 0.06398293788323114, + "grad_norm": 0.499498695141066, + "learning_rate": 9.9999880816326e-05, + "loss": 0.6973, + "mean_token_accuracy": 0.7952379912137986, + "num_tokens": 41335670.0, "step": 45 }, { - "epoch": 0.07106057914372002, - "grad_norm": 0.4396836808821335, - "learning_rate": 1.999914189080485e-05, - "loss": 0.8892, - "mean_token_accuracy": 0.7411722339689731, - "num_tokens": 47660479.0, + "epoch": 0.07109215320359015, + "grad_norm": 0.4645180201543763, + "learning_rate": 9.999570945402425e-05, + "loss": 0.6853, + "mean_token_accuracy": 0.7981184311211109, + "num_tokens": 45940079.0, "step": 50 }, { - "epoch": 0.07816663705809203, - "grad_norm": 0.4277947889677102, - "learning_rate": 1.9997115907865857e-05, - "loss": 0.8745, - "mean_token_accuracy": 0.7443610817193985, - "num_tokens": 52437558.0, + "epoch": 0.07820136852394917, + "grad_norm": 0.434255531179794, + "learning_rate": 9.998557953932929e-05, + "loss": 0.6688, + "mean_token_accuracy": 0.8012012615799904, + "num_tokens": 50533771.0, "step": 55 }, { - "epoch": 0.08527269497246402, - "grad_norm": 0.4233524880605398, - "learning_rate": 1.999389848273882e-05, - "loss": 0.8603, - "mean_token_accuracy": 0.747463022172451, - "num_tokens": 57198957.0, + "epoch": 0.08531058384430819, + "grad_norm": 0.393754634337621, + "learning_rate": 9.99694924136941e-05, + "loss": 0.6725, + "mean_token_accuracy": 0.800255061686039, + "num_tokens": 55133444.0, "step": 60 }, { - "epoch": 0.09237875288683603, - "grad_norm": 0.409057413495153, - "learning_rate": 1.998949004149094e-05, - "loss": 0.8537, - "mean_token_accuracy": 0.7483336836099624, - "num_tokens": 61961974.0, + "epoch": 0.0924197991646672, + "grad_norm": 0.49718727212066355, + "learning_rate": 9.99474502074547e-05, + "loss": 0.6664, + "mean_token_accuracy": 0.801218880712986, + "num_tokens": 59726447.0, "step": 65 }, { - "epoch": 0.09948481080120802, - "grad_norm": 0.46927095285141973, - "learning_rate": 1.9983891167909617e-05, - "loss": 0.8375, - "mean_token_accuracy": 0.7526809796690941, - "num_tokens": 66725536.0, + "epoch": 0.09952901448502621, + "grad_norm": 0.4005142024066312, + "learning_rate": 9.991945583954808e-05, + "loss": 0.6549, + "mean_token_accuracy": 0.8056452445685863, + "num_tokens": 64319917.0, "step": 70 }, { - "epoch": 0.10659086871558003, - "grad_norm": 0.4106777573300543, - "learning_rate": 1.9977102603425134e-05, - "loss": 0.8309, - "mean_token_accuracy": 0.7542230375111103, - "num_tokens": 71469095.0, + "epoch": 0.10663822980538523, + "grad_norm": 0.3774090383980249, + "learning_rate": 9.988551301712567e-05, + "loss": 0.6454, + "mean_token_accuracy": 0.806719920784235, + "num_tokens": 68898868.0, "step": 75 }, { - "epoch": 0.11369692662995204, - "grad_norm": 0.40584561150068, - "learning_rate": 1.996912524701247e-05, - "loss": 0.8258, - "mean_token_accuracy": 0.7558744698762894, - "num_tokens": 76221430.0, + "epoch": 0.11374744512574425, + "grad_norm": 0.3995895890256704, + "learning_rate": 9.984562623506235e-05, + "loss": 0.6464, + "mean_token_accuracy": 0.8064703330397606, + "num_tokens": 73481972.0, "step": 80 }, { - "epoch": 0.12080298454432403, - "grad_norm": 0.4436779763048366, - "learning_rate": 1.995996015507227e-05, - "loss": 0.8152, - "mean_token_accuracy": 0.7582607261836529, - "num_tokens": 80991235.0, + "epoch": 0.12085666044610326, + "grad_norm": 0.3801619159341505, + "learning_rate": 9.979980077536136e-05, + "loss": 0.6462, + "mean_token_accuracy": 0.8080633491277694, + "num_tokens": 78079419.0, "step": 85 }, { - "epoch": 0.12790904245869603, - "grad_norm": 0.4011663933394793, - "learning_rate": 1.9949608541290924e-05, - "loss": 0.8128, - "mean_token_accuracy": 0.7592827767133713, - "num_tokens": 85760262.0, + "epoch": 0.1279658757664623, + "grad_norm": 0.37074794226689833, + "learning_rate": 9.974804270645462e-05, + "loss": 0.6362, + "mean_token_accuracy": 0.8091117829084397, + "num_tokens": 82670195.0, "step": 90 }, { - "epoch": 0.13501510037306805, - "grad_norm": 0.4484975750242541, - "learning_rate": 1.9938071776479875e-05, - "loss": 0.8015, - "mean_token_accuracy": 0.7621250681579113, - "num_tokens": 90505979.0, + "epoch": 0.13507509108682128, + "grad_norm": 0.37193721608812236, + "learning_rate": 9.969035888239937e-05, + "loss": 0.635, + "mean_token_accuracy": 0.8079991653561592, + "num_tokens": 87257953.0, "step": 95 }, { - "epoch": 0.14212115828744004, - "grad_norm": 0.3973118091711915, - "learning_rate": 1.992535138839406e-05, - "loss": 0.7956, - "mean_token_accuracy": 0.7619979940354824, - "num_tokens": 95259341.0, + "epoch": 0.1421843064071803, + "grad_norm": 0.36251703620037773, + "learning_rate": 9.96267569419703e-05, + "loss": 0.6315, + "mean_token_accuracy": 0.8096475720405578, + "num_tokens": 91838382.0, "step": 100 }, { - "epoch": 0.14212115828744004, - "eval_loss": 0.7710759043693542, - "eval_mean_token_accuracy": 0.7617696215186203, - "eval_num_tokens": 95259341.0, - "eval_runtime": 149.4719, - "eval_samples_per_second": 24.346, - "eval_steps_per_second": 0.763, + "epoch": 0.1421843064071803, + "eval_loss": 0.5971412062644958, + "eval_mean_token_accuracy": 0.8093206621052926, + "eval_num_tokens": 91838382.0, + "eval_runtime": 141.8153, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 0.804, "step": 100 }, { - "epoch": 0.14922721620181204, - "grad_norm": 0.4055637275946486, - "learning_rate": 1.991144906152962e-05, - "loss": 0.804, - "mean_token_accuracy": 0.7599063582718373, - "num_tokens": 100023224.0, + "epoch": 0.14929352172753932, + "grad_norm": 0.41583625971563776, + "learning_rate": 9.955724530764809e-05, + "loss": 0.6381, + "mean_token_accuracy": 0.8077230393886566, + "num_tokens": 96431755.0, "step": 105 }, { - "epoch": 0.15633327411618406, - "grad_norm": 0.5425556091455714, - "learning_rate": 1.9896366636900826e-05, - "loss": 0.7951, - "mean_token_accuracy": 0.7621362045407295, - "num_tokens": 104771415.0, + "epoch": 0.15640273704789834, + "grad_norm": 0.3705693803444073, + "learning_rate": 9.948183318450413e-05, + "loss": 0.6197, + "mean_token_accuracy": 0.8116156131029129, + "num_tokens": 101027690.0, "step": 110 }, { - "epoch": 0.16343933203055605, - "grad_norm": 0.4581521538010428, - "learning_rate": 1.9880106111796266e-05, - "loss": 0.7903, - "mean_token_accuracy": 0.7619842484593391, - "num_tokens": 109537100.0, + "epoch": 0.16351195236825736, + "grad_norm": 0.3214510651452395, + "learning_rate": 9.940053055898133e-05, + "loss": 0.6313, + "mean_token_accuracy": 0.8089181430637836, + "num_tokens": 105628547.0, "step": 115 }, { - "epoch": 0.17054538994492804, - "grad_norm": 0.42123272880797796, - "learning_rate": 1.9862669639514382e-05, - "loss": 0.7886, - "mean_token_accuracy": 0.7638748176395893, - "num_tokens": 114304125.0, + "epoch": 0.17062116768861638, + "grad_norm": 0.34220731720085373, + "learning_rate": 9.93133481975719e-05, + "loss": 0.6077, + "mean_token_accuracy": 0.814984206855297, + "num_tokens": 110243592.0, "step": 120 }, { - "epoch": 0.17765144785930007, - "grad_norm": 0.3957407600605023, - "learning_rate": 1.9844059529078297e-05, - "loss": 0.7763, - "mean_token_accuracy": 0.7664125673472881, - "num_tokens": 119067772.0, + "epoch": 0.1777303830089754, + "grad_norm": 0.35675802487560043, + "learning_rate": 9.922029764539148e-05, + "loss": 0.6263, + "mean_token_accuracy": 0.8096928559243679, + "num_tokens": 114832845.0, "step": 125 }, { - "epoch": 0.18475750577367206, - "grad_norm": 1.1428359800811156, - "learning_rate": 1.9824278244930052e-05, - "loss": 0.7736, - "mean_token_accuracy": 0.7676285386085511, - "num_tokens": 123805148.0, + "epoch": 0.1848395983293344, + "grad_norm": 0.3422296936678833, + "learning_rate": 9.912139122465027e-05, + "loss": 0.6116, + "mean_token_accuracy": 0.8140982151031494, + "num_tokens": 119435421.0, "step": 130 }, { - "epoch": 0.19186356368804405, - "grad_norm": 0.4240204964738557, - "learning_rate": 1.9803328406604252e-05, - "loss": 0.7841, - "mean_token_accuracy": 0.763801097869873, - "num_tokens": 128568367.0, + "epoch": 0.1919488136496934, + "grad_norm": 0.3599918244922273, + "learning_rate": 9.901664203302126e-05, + "loss": 0.6052, + "mean_token_accuracy": 0.8154805108904839, + "num_tokens": 124028647.0, "step": 135 }, { - "epoch": 0.19896962160241605, - "grad_norm": 0.41090851408734014, - "learning_rate": 1.9781212788381177e-05, - "loss": 0.7819, - "mean_token_accuracy": 0.7644710555672646, - "num_tokens": 133343344.0, + "epoch": 0.19905802897005243, + "grad_norm": 0.3595154303423279, + "learning_rate": 9.890606394190588e-05, + "loss": 0.6126, + "mean_token_accuracy": 0.8132404424250126, + "num_tokens": 128628413.0, "step": 140 }, { - "epoch": 0.20607567951678807, - "grad_norm": 0.38627786124116875, - "learning_rate": 1.9757934318919386e-05, - "loss": 0.7586, - "mean_token_accuracy": 0.7707050330936909, - "num_tokens": 138078681.0, + "epoch": 0.20616724429041144, + "grad_norm": 0.3711012466200944, + "learning_rate": 9.878967159459693e-05, + "loss": 0.6068, + "mean_token_accuracy": 0.8164977565407753, + "num_tokens": 133219422.0, "step": 145 }, { - "epoch": 0.21318173743116006, - "grad_norm": 0.35413336695126674, - "learning_rate": 1.973349608086791e-05, - "loss": 0.7579, - "mean_token_accuracy": 0.7715410716831684, - "num_tokens": 142812366.0, + "epoch": 0.21327645961077046, + "grad_norm": 0.35910926284617867, + "learning_rate": 9.866748040433956e-05, + "loss": 0.6099, + "mean_token_accuracy": 0.8152773261070252, + "num_tokens": 137825952.0, "step": 150 }, { - "epoch": 0.22028779534553206, - "grad_norm": 0.44320259589774924, - "learning_rate": 1.9707901310458017e-05, - "loss": 0.7649, - "mean_token_accuracy": 0.7688324272632598, - "num_tokens": 147567508.0, + "epoch": 0.22038567493112948, + "grad_norm": 0.4205439208166243, + "learning_rate": 9.853950655229009e-05, + "loss": 0.6064, + "mean_token_accuracy": 0.815191026777029, + "num_tokens": 142422368.0, "step": 155 }, { - "epoch": 0.22739385325990408, - "grad_norm": 0.4272220313623565, - "learning_rate": 1.9681153397074658e-05, - "loss": 0.779, - "mean_token_accuracy": 0.7649676457047463, - "num_tokens": 152348975.0, + "epoch": 0.2274948902514885, + "grad_norm": 0.32091150374802263, + "learning_rate": 9.840576698537329e-05, + "loss": 0.6093, + "mean_token_accuracy": 0.8135301224887371, + "num_tokens": 147015990.0, "step": 160 }, { - "epoch": 0.23449991117427607, - "grad_norm": 0.36879913627107413, - "learning_rate": 1.9653255882807625e-05, - "loss": 0.7547, - "mean_token_accuracy": 0.7709379114210606, - "num_tokens": 157094616.0, + "epoch": 0.23460410557184752, + "grad_norm": 0.32627028158119226, + "learning_rate": 9.826627941403811e-05, + "loss": 0.5969, + "mean_token_accuracy": 0.8182829037308693, + "num_tokens": 151627096.0, "step": 165 }, { - "epoch": 0.24160596908864806, - "grad_norm": 0.39000799041219736, - "learning_rate": 1.9624212461982497e-05, - "loss": 0.7594, - "mean_token_accuracy": 0.7707360699772835, - "num_tokens": 161849805.0, + "epoch": 0.2417133208922065, + "grad_norm": 0.32405674248273, + "learning_rate": 9.812106230991248e-05, + "loss": 0.6068, + "mean_token_accuracy": 0.8159149341285229, + "num_tokens": 156218968.0, "step": 170 }, { - "epoch": 0.2487120270030201, - "grad_norm": 0.37089862448174604, - "learning_rate": 1.9594026980671423e-05, - "loss": 0.7555, - "mean_token_accuracy": 0.7713063634932041, - "num_tokens": 166609253.0, + "epoch": 0.24882253621256553, + "grad_norm": 0.3206982540127891, + "learning_rate": 9.79701349033571e-05, + "loss": 0.6039, + "mean_token_accuracy": 0.8161494679749012, + "num_tokens": 160797401.0, "step": 175 }, { - "epoch": 0.25581808491739205, - "grad_norm": 0.3895218613587052, - "learning_rate": 1.9562703436183783e-05, - "loss": 0.7641, - "mean_token_accuracy": 0.7704252451658249, - "num_tokens": 171374935.0, + "epoch": 0.2559317515329246, + "grad_norm": 0.3360732448004463, + "learning_rate": 9.78135171809189e-05, + "loss": 0.6068, + "mean_token_accuracy": 0.8159954428672791, + "num_tokens": 165402684.0, "step": 180 }, { - "epoch": 0.2629241428317641, - "grad_norm": 0.41062410007633293, - "learning_rate": 1.953024597653688e-05, - "loss": 0.7549, - "mean_token_accuracy": 0.771364139765501, - "num_tokens": 176152999.0, + "epoch": 0.26304096685328354, + "grad_norm": 0.33789233259366286, + "learning_rate": 9.76512298826844e-05, + "loss": 0.6026, + "mean_token_accuracy": 0.8167447924613953, + "num_tokens": 169997282.0, "step": 185 }, { - "epoch": 0.2700302007461361, - "grad_norm": 0.41614535212567993, - "learning_rate": 1.9496658899906605e-05, - "loss": 0.7479, - "mean_token_accuracy": 0.7711076475679874, - "num_tokens": 180913997.0, + "epoch": 0.27015018217364256, + "grad_norm": 0.3089560668153988, + "learning_rate": 9.748329449953302e-05, + "loss": 0.5904, + "mean_token_accuracy": 0.8193566597998142, + "num_tokens": 174589836.0, "step": 190 }, { - "epoch": 0.27713625866050806, - "grad_norm": 0.3927226802114006, - "learning_rate": 1.946194665405828e-05, - "loss": 0.7563, - "mean_token_accuracy": 0.7716620303690434, - "num_tokens": 185674484.0, + "epoch": 0.2772593974940016, + "grad_norm": 0.32060053414915524, + "learning_rate": 9.73097332702914e-05, + "loss": 0.6044, + "mean_token_accuracy": 0.8175870932638645, + "num_tokens": 179181747.0, "step": 195 }, { - "epoch": 0.2842423165748801, - "grad_norm": 0.3794392681437391, - "learning_rate": 1.9426113835757637e-05, - "loss": 0.7537, - "mean_token_accuracy": 0.7706545531749726, - "num_tokens": 190438181.0, + "epoch": 0.2843686128143606, + "grad_norm": 0.32004664048912745, + "learning_rate": 9.713056917878818e-05, + "loss": 0.5888, + "mean_token_accuracy": 0.8192018747329712, + "num_tokens": 183760367.0, "step": 200 }, { - "epoch": 0.2842423165748801, - "eval_loss": 0.7250556349754333, - "eval_mean_token_accuracy": 0.7720574731366676, - "eval_num_tokens": 190438181.0, - "eval_runtime": 149.9052, - "eval_samples_per_second": 24.275, - "eval_steps_per_second": 0.76, + "epoch": 0.2843686128143606, + "eval_loss": 0.5599971413612366, + "eval_mean_token_accuracy": 0.8188303473748659, + "eval_num_tokens": 183760367.0, + "eval_runtime": 145.8536, + "eval_samples_per_second": 24.943, + "eval_steps_per_second": 0.782, "step": 200 }, { - "epoch": 0.2913483744892521, - "grad_norm": 0.40984266753765214, - "learning_rate": 1.9389165190162114e-05, - "loss": 0.753, - "mean_token_accuracy": 0.7713685400784016, - "num_tokens": 195189498.0, + "epoch": 0.2914778281347196, + "grad_norm": 0.3094551492752116, + "learning_rate": 9.694582595081057e-05, + "loss": 0.5872, + "mean_token_accuracy": 0.819921114295721, + "num_tokens": 188360903.0, "step": 205 }, { - "epoch": 0.29845443240362407, - "grad_norm": 0.39761042206791, - "learning_rate": 1.935110561019246e-05, - "loss": 0.7424, - "mean_token_accuracy": 0.7745413303375244, - "num_tokens": 199953059.0, + "epoch": 0.29858704345507864, + "grad_norm": 0.36254147904822126, + "learning_rate": 9.67555280509623e-05, + "loss": 0.5942, + "mean_token_accuracy": 0.817745155096054, + "num_tokens": 192932381.0, "step": 210 }, { - "epoch": 0.3055604903179961, - "grad_norm": 0.37462688474606465, - "learning_rate": 1.931194013588481e-05, - "loss": 0.7504, - "mean_token_accuracy": 0.7734048135578633, - "num_tokens": 204732059.0, + "epoch": 0.30569625877543766, + "grad_norm": 0.3377909564779145, + "learning_rate": 9.655970067942405e-05, + "loss": 0.5994, + "mean_token_accuracy": 0.8163805276155471, + "num_tokens": 197505985.0, "step": 215 }, { - "epoch": 0.3126665482323681, - "grad_norm": 0.39514621265905453, - "learning_rate": 1.927167395372324e-05, - "loss": 0.746, - "mean_token_accuracy": 0.7722579926252365, - "num_tokens": 209507798.0, + "epoch": 0.3128054740957967, + "grad_norm": 0.30751780494672465, + "learning_rate": 9.63583697686162e-05, + "loss": 0.5902, + "mean_token_accuracy": 0.8196643941104412, + "num_tokens": 202105424.0, "step": 220 }, { - "epoch": 0.3197726061467401, - "grad_norm": 0.3734047512754032, - "learning_rate": 1.9230312395952955e-05, - "loss": 0.7444, - "mean_token_accuracy": 0.7728776805102825, - "num_tokens": 214261827.0, + "epoch": 0.3199146894161557, + "grad_norm": 0.34345028355301316, + "learning_rate": 9.615156197976477e-05, + "loss": 0.582, + "mean_token_accuracy": 0.8217154465615749, + "num_tokens": 206686951.0, "step": 225 }, { - "epoch": 0.3268786640611121, - "grad_norm": 0.39160079857564095, - "learning_rate": 1.9187860939874176e-05, - "loss": 0.7509, - "mean_token_accuracy": 0.771727342903614, - "num_tokens": 219027585.0, + "epoch": 0.3270239047365147, + "grad_norm": 0.3216135018716631, + "learning_rate": 9.593930469937087e-05, + "loss": 0.5708, + "mean_token_accuracy": 0.8250658005475998, + "num_tokens": 211278788.0, "step": 230 }, { - "epoch": 0.3339847219754841, - "grad_norm": 0.38214353202928264, - "learning_rate": 1.9144325207116785e-05, - "loss": 0.7388, - "mean_token_accuracy": 0.7766963444650173, - "num_tokens": 223775141.0, + "epoch": 0.33413312005687373, + "grad_norm": 0.32564659909940696, + "learning_rate": 9.572162603558393e-05, + "loss": 0.5928, + "mean_token_accuracy": 0.819525595754385, + "num_tokens": 215877205.0, "step": 235 }, { - "epoch": 0.3410907798898561, - "grad_norm": 0.40133338005632113, - "learning_rate": 1.909971096289591e-05, - "loss": 0.7454, - "mean_token_accuracy": 0.7735047489404678, - "num_tokens": 228541180.0, + "epoch": 0.34124233537723275, + "grad_norm": 0.4839583335140069, + "learning_rate": 9.549855481447954e-05, + "loss": 0.5882, + "mean_token_accuracy": 0.8204580388963223, + "num_tokens": 220486454.0, "step": 240 }, { - "epoch": 0.3481968378042281, - "grad_norm": 0.4094942578953397, - "learning_rate": 1.9054024115248448e-05, - "loss": 0.7401, - "mean_token_accuracy": 0.7752777233719825, - "num_tokens": 233303417.0, + "epoch": 0.34835155069759177, + "grad_norm": 0.3268671171699921, + "learning_rate": 9.527012057624224e-05, + "loss": 0.5836, + "mean_token_accuracy": 0.8208626843988895, + "num_tokens": 225080225.0, "step": 245 }, { - "epoch": 0.35530289571860013, - "grad_norm": 0.37342705131252163, - "learning_rate": 1.90072707142507e-05, - "loss": 0.746, - "mean_token_accuracy": 0.7732348993420601, - "num_tokens": 238086815.0, + "epoch": 0.3554607660179508, + "grad_norm": 0.3244498327733708, + "learning_rate": 9.50363535712535e-05, + "loss": 0.586, + "mean_token_accuracy": 0.8207595020532608, + "num_tokens": 229657012.0, "step": 250 }, { - "epoch": 0.3624089536329721, - "grad_norm": 0.37950791393566236, - "learning_rate": 1.8959456951217187e-05, - "loss": 0.7324, - "mean_token_accuracy": 0.7766066655516625, - "num_tokens": 242856686.0, + "epoch": 0.3625699813383098, + "grad_norm": 0.29889265357291406, + "learning_rate": 9.479728475608593e-05, + "loss": 0.5919, + "mean_token_accuracy": 0.8190862230956555, + "num_tokens": 234248976.0, "step": 255 }, { - "epoch": 0.3695150115473441, - "grad_norm": 0.36431061688841127, - "learning_rate": 1.8910589157880766e-05, - "loss": 0.7389, - "mean_token_accuracy": 0.7757058747112751, - "num_tokens": 247606311.0, + "epoch": 0.3696791966586688, + "grad_norm": 0.34636883393423384, + "learning_rate": 9.455294578940384e-05, + "loss": 0.5765, + "mean_token_accuracy": 0.8226364821195602, + "num_tokens": 238829734.0, "step": 260 }, { - "epoch": 0.37662106946171614, - "grad_norm": 0.4264408595860345, - "learning_rate": 1.8860673805554167e-05, - "loss": 0.74, - "mean_token_accuracy": 0.7750592313706874, - "num_tokens": 252376279.0, + "epoch": 0.3767884119790278, + "grad_norm": 0.3092592234408446, + "learning_rate": 9.430336902777083e-05, + "loss": 0.576, + "mean_token_accuracy": 0.821333235502243, + "num_tokens": 243418989.0, "step": 265 }, { - "epoch": 0.3837271273760881, - "grad_norm": 0.3649269356510504, - "learning_rate": 1.8809717504273e-05, - "loss": 0.7294, - "mean_token_accuracy": 0.7773133426904678, - "num_tokens": 257157622.0, + "epoch": 0.3838976272993868, + "grad_norm": 0.30454136223380207, + "learning_rate": 9.404858752136499e-05, + "loss": 0.5771, + "mean_token_accuracy": 0.8237294301390647, + "num_tokens": 248015701.0, "step": 270 }, { - "epoch": 0.39083318529046013, - "grad_norm": 0.4595703905257225, - "learning_rate": 1.8757727001920446e-05, - "loss": 0.7376, - "mean_token_accuracy": 0.7763620682060719, - "num_tokens": 261918809.0, + "epoch": 0.39100684261974583, + "grad_norm": 0.30289215095264577, + "learning_rate": 9.378863500960222e-05, + "loss": 0.5709, + "mean_token_accuracy": 0.8236084163188935, + "num_tokens": 252613191.0, "step": 275 }, { - "epoch": 0.3979392432048321, - "grad_norm": 0.5280527224785482, - "learning_rate": 1.8704709183333653e-05, - "loss": 0.7329, - "mean_token_accuracy": 0.7755794525146484, - "num_tokens": 266684284.0, + "epoch": 0.39811605794010485, + "grad_norm": 0.3010273864601919, + "learning_rate": 9.352354591666827e-05, + "loss": 0.5861, + "mean_token_accuracy": 0.820894256979227, + "num_tokens": 257210808.0, "step": 280 }, { - "epoch": 0.4050453011192041, - "grad_norm": 0.37695307187112215, - "learning_rate": 1.8650671069392034e-05, - "loss": 0.7331, - "mean_token_accuracy": 0.776630100607872, - "num_tokens": 271450936.0, + "epoch": 0.40522527326046387, + "grad_norm": 0.30175911100812025, + "learning_rate": 9.325335534696017e-05, + "loss": 0.5753, + "mean_token_accuracy": 0.8225005254149437, + "num_tokens": 261790131.0, "step": 285 }, { - "epoch": 0.41215135903357614, - "grad_norm": 0.4776029246227445, - "learning_rate": 1.85956198160875e-05, - "loss": 0.7262, - "mean_token_accuracy": 0.7792015597224236, - "num_tokens": 276196347.0, + "epoch": 0.4123344885808229, + "grad_norm": 0.28871941798325856, + "learning_rate": 9.29780990804375e-05, + "loss": 0.5799, + "mean_token_accuracy": 0.821347926557064, + "num_tokens": 266377324.0, "step": 290 }, { - "epoch": 0.4192574169479481, - "grad_norm": 0.40416881485925626, - "learning_rate": 1.853956271357685e-05, - "loss": 0.7207, - "mean_token_accuracy": 0.7793174132704734, - "num_tokens": 280956754.0, + "epoch": 0.4194437039011819, + "grad_norm": 0.28095014086273895, + "learning_rate": 9.269781356788424e-05, + "loss": 0.581, + "mean_token_accuracy": 0.8209108576178551, + "num_tokens": 270967910.0, "step": 295 }, { - "epoch": 0.4263634748623201, - "grad_norm": 0.36640042813026624, - "learning_rate": 1.8482507185216365e-05, - "loss": 0.7417, - "mean_token_accuracy": 0.7740650460124016, - "num_tokens": 285730218.0, + "epoch": 0.4265529192215409, + "grad_norm": 0.2893211807696515, + "learning_rate": 9.241253592608183e-05, + "loss": 0.5755, + "mean_token_accuracy": 0.8242007777094841, + "num_tokens": 275570273.0, "step": 300 }, { - "epoch": 0.4263634748623201, - "eval_loss": 0.7050633430480957, - "eval_mean_token_accuracy": 0.7762745759989086, - "eval_num_tokens": 285730218.0, - "eval_runtime": 149.2344, - "eval_samples_per_second": 24.384, - "eval_steps_per_second": 0.764, + "epoch": 0.4265529192215409, + "eval_loss": 0.5416839122772217, + "eval_mean_token_accuracy": 0.8231211885025627, + "eval_num_tokens": 275570273.0, + "eval_runtime": 145.5254, + "eval_samples_per_second": 24.999, + "eval_steps_per_second": 0.783, "step": 300 }, { - "epoch": 0.43346953277669215, - "grad_norm": 0.39913292298186365, - "learning_rate": 1.842446078657877e-05, - "loss": 0.7328, - "mean_token_accuracy": 0.7760866671800614, - "num_tokens": 290497109.0, + "epoch": 0.43366213454189995, + "grad_norm": 0.30733885282429685, + "learning_rate": 9.212230393289385e-05, + "loss": 0.5781, + "mean_token_accuracy": 0.8230207331478596, + "num_tokens": 280172533.0, "step": 305 }, { - "epoch": 0.4405755906910641, - "grad_norm": 1.1471762547137223, - "learning_rate": 1.8365431204452683e-05, - "loss": 0.7364, - "mean_token_accuracy": 0.7759052954614163, - "num_tokens": 295276239.0, + "epoch": 0.44077134986225897, + "grad_norm": 0.2682470819307261, + "learning_rate": 9.182715602226341e-05, + "loss": 0.5625, + "mean_token_accuracy": 0.8270745746791363, + "num_tokens": 284763929.0, "step": 310 }, { - "epoch": 0.44768164860543613, - "grad_norm": 0.37744675876656947, - "learning_rate": 1.8305426255824713e-05, - "loss": 0.7317, - "mean_token_accuracy": 0.7751947946846485, - "num_tokens": 300042876.0, + "epoch": 0.447880565182618, + "grad_norm": 0.2962012849994535, + "learning_rate": 9.152713127912355e-05, + "loss": 0.5848, + "mean_token_accuracy": 0.8201167277991772, + "num_tokens": 289376903.0, "step": 315 }, { - "epoch": 0.45478770651980815, - "grad_norm": 0.47691506104424974, - "learning_rate": 1.824445388684426e-05, - "loss": 0.7277, - "mean_token_accuracy": 0.7777360931038857, - "num_tokens": 304798068.0, + "epoch": 0.454989780502977, + "grad_norm": 0.28564514411407316, + "learning_rate": 9.12222694342213e-05, + "loss": 0.5732, + "mean_token_accuracy": 0.8246621482074261, + "num_tokens": 293966796.0, "step": 320 }, { - "epoch": 0.4618937644341801, - "grad_norm": 0.3588696331537949, - "learning_rate": 1.8182522171771293e-05, - "loss": 0.726, - "mean_token_accuracy": 0.7783321216702461, - "num_tokens": 309546102.0, + "epoch": 0.462098995823336, + "grad_norm": 0.30020425973519915, + "learning_rate": 9.091261085885646e-05, + "loss": 0.5606, + "mean_token_accuracy": 0.826822079718113, + "num_tokens": 298540346.0, "step": 325 }, { - "epoch": 0.46899982234855214, - "grad_norm": 0.4952808278830928, - "learning_rate": 1.8119639311907074e-05, - "loss": 0.738, - "mean_token_accuracy": 0.7744948998093605, - "num_tokens": 314307721.0, + "epoch": 0.46920821114369504, + "grad_norm": 0.2887047887642146, + "learning_rate": 9.059819655953536e-05, + "loss": 0.5738, + "mean_token_accuracy": 0.823461939394474, + "num_tokens": 303112604.0, "step": 330 }, { - "epoch": 0.47610588026292416, - "grad_norm": 0.37908339080858117, - "learning_rate": 1.805581363450813e-05, - "loss": 0.7309, - "mean_token_accuracy": 0.7773234643042087, - "num_tokens": 319073411.0, + "epoch": 0.476317426464054, + "grad_norm": 0.3180269352697689, + "learning_rate": 9.027906817254063e-05, + "loss": 0.5654, + "mean_token_accuracy": 0.8256018176674843, + "num_tokens": 307694241.0, "step": 335 }, { - "epoch": 0.48321193817729613, - "grad_norm": 0.3852822861687584, - "learning_rate": 1.7991053591683508e-05, - "loss": 0.731, - "mean_token_accuracy": 0.7765265628695488, - "num_tokens": 323831947.0, + "epoch": 0.483426641784413, + "grad_norm": 0.29567931374872014, + "learning_rate": 8.995526795841753e-05, + "loss": 0.558, + "mean_token_accuracy": 0.8256605207920075, + "num_tokens": 312289299.0, "step": 340 }, { - "epoch": 0.49031799609166815, - "grad_norm": 0.36165483826370953, - "learning_rate": 1.7925367759275495e-05, - "loss": 0.7232, - "mean_token_accuracy": 0.7792682178318501, - "num_tokens": 328590613.0, + "epoch": 0.49053585710477204, + "grad_norm": 0.3336504103662035, + "learning_rate": 8.962683879637747e-05, + "loss": 0.5617, + "mean_token_accuracy": 0.8257805988192558, + "num_tokens": 316884766.0, "step": 345 }, { - "epoch": 0.4974240540060402, - "grad_norm": 0.4313963783745435, - "learning_rate": 1.7858764835723984e-05, - "loss": 0.7247, - "mean_token_accuracy": 0.7771173417568207, - "num_tokens": 333348383.0, + "epoch": 0.49764507242513106, + "grad_norm": 0.3705167375534613, + "learning_rate": 8.929382417861991e-05, + "loss": 0.561, + "mean_token_accuracy": 0.8267210200428963, + "num_tokens": 321461198.0, "step": 350 }, { - "epoch": 0.5045301119204122, - "grad_norm": 0.4013920878913357, - "learning_rate": 1.7791253640914566e-05, - "loss": 0.7236, - "mean_token_accuracy": 0.778332532197237, - "num_tokens": 338109943.0, + "epoch": 0.5047542877454901, + "grad_norm": 0.2946584460529412, + "learning_rate": 8.895626820457283e-05, + "loss": 0.557, + "mean_token_accuracy": 0.828194110840559, + "num_tokens": 326064722.0, "step": 355 }, { - "epoch": 0.5116361698347841, - "grad_norm": 0.4057100730404507, - "learning_rate": 1.7722843115010564e-05, - "loss": 0.7221, - "mean_token_accuracy": 0.7787548579275608, - "num_tokens": 342887490.0, + "epoch": 0.5118635030658492, + "grad_norm": 0.31227448766803945, + "learning_rate": 8.861421557505282e-05, + "loss": 0.5522, + "mean_token_accuracy": 0.8295037761330605, + "num_tokens": 330652094.0, "step": 360 }, { - "epoch": 0.5187422277491561, - "grad_norm": 0.3675393081924701, - "learning_rate": 1.7653542317269134e-05, - "loss": 0.7171, - "mean_token_accuracy": 0.7794929854571819, - "num_tokens": 347628813.0, + "epoch": 0.5189727183862082, + "grad_norm": 1.0759474066945163, + "learning_rate": 8.826771158634567e-05, + "loss": 0.5629, + "mean_token_accuracy": 0.8260238766670227, + "num_tokens": 335255835.0, "step": 365 }, { - "epoch": 0.5258482856635281, - "grad_norm": 0.3635771034527727, - "learning_rate": 1.7583360424841595e-05, - "loss": 0.7272, - "mean_token_accuracy": 0.7774313412606716, - "num_tokens": 352403713.0, + "epoch": 0.5260819337065671, + "grad_norm": 0.2758992633553522, + "learning_rate": 8.791680212420797e-05, + "loss": 0.5502, + "mean_token_accuracy": 0.828965923935175, + "num_tokens": 339843476.0, "step": 370 }, { - "epoch": 0.5329543435779002, - "grad_norm": 0.38584219141761933, - "learning_rate": 1.7512306731558133e-05, - "loss": 0.7194, - "mean_token_accuracy": 0.7801453106105327, - "num_tokens": 357150848.0, + "epoch": 0.5331911490269261, + "grad_norm": 0.29696149610793166, + "learning_rate": 8.756153365779066e-05, + "loss": 0.5542, + "mean_token_accuracy": 0.8278730027377605, + "num_tokens": 344420533.0, "step": 375 }, { - "epoch": 0.5400604014922722, - "grad_norm": 0.36346011493475233, - "learning_rate": 1.744039064669709e-05, - "loss": 0.7253, - "mean_token_accuracy": 0.7780666872859001, - "num_tokens": 361924581.0, + "epoch": 0.5403003643472851, + "grad_norm": 0.284706804181623, + "learning_rate": 8.720195323348545e-05, + "loss": 0.559, + "mean_token_accuracy": 0.8278782211244107, + "num_tokens": 349010370.0, "step": 380 }, { - "epoch": 0.5471664594066442, - "grad_norm": 0.404515192018998, - "learning_rate": 1.7367621693738917e-05, - "loss": 0.715, - "mean_token_accuracy": 0.7817073427140713, - "num_tokens": 366676773.0, + "epoch": 0.5474095796676441, + "grad_norm": 0.3046957362601185, + "learning_rate": 8.68381084686946e-05, + "loss": 0.5576, + "mean_token_accuracy": 0.8258513130247593, + "num_tokens": 353598451.0, "step": 385 }, { - "epoch": 0.5542725173210161, - "grad_norm": 0.36988377045483584, - "learning_rate": 1.7294009509105052e-05, - "loss": 0.7131, - "mean_token_accuracy": 0.7806239545345306, - "num_tokens": 371452085.0, + "epoch": 0.5545187949880032, + "grad_norm": 0.3134773718519533, + "learning_rate": 8.647004754552526e-05, + "loss": 0.5612, + "mean_token_accuracy": 0.8255665130913258, + "num_tokens": 358195615.0, "step": 390 }, { - "epoch": 0.5613785752353881, - "grad_norm": 0.36659851484970235, - "learning_rate": 1.7219563840881783e-05, - "loss": 0.7116, - "mean_token_accuracy": 0.782407358288765, - "num_tokens": 376207953.0, + "epoch": 0.5616280103083622, + "grad_norm": 0.33349640254961, + "learning_rate": 8.609781920440891e-05, + "loss": 0.552, + "mean_token_accuracy": 0.8278413727879524, + "num_tokens": 362764034.0, "step": 395 }, { - "epoch": 0.5684846331497602, - "grad_norm": 0.3624183655015318, - "learning_rate": 1.71442945475294e-05, - "loss": 0.7169, - "mean_token_accuracy": 0.7801115453243256, - "num_tokens": 380979250.0, + "epoch": 0.5687372256287212, + "grad_norm": 0.32034152048464726, + "learning_rate": 8.5721472737647e-05, + "loss": 0.5534, + "mean_token_accuracy": 0.8273369200527668, + "num_tokens": 367350265.0, "step": 400 }, { - "epoch": 0.5684846331497602, - "eval_loss": 0.6918764114379883, - "eval_mean_token_accuracy": 0.7797647902840062, - "eval_num_tokens": 380979250.0, - "eval_runtime": 150.2483, - "eval_samples_per_second": 24.22, - "eval_steps_per_second": 0.759, + "epoch": 0.5687372256287212, + "eval_loss": 0.5274047255516052, + "eval_mean_token_accuracy": 0.8264030280866121, + "eval_num_tokens": 367350265.0, + "eval_runtime": 146.0134, + "eval_samples_per_second": 24.916, + "eval_steps_per_second": 0.781, "step": 400 }, { - "epoch": 0.5755906910641322, - "grad_norm": 0.3779480713941837, - "learning_rate": 1.7068211596576662e-05, - "loss": 0.716, - "mean_token_accuracy": 0.7807160533964634, - "num_tokens": 385752024.0, + "epoch": 0.5758464409490802, + "grad_norm": 0.29085093151843905, + "learning_rate": 8.534105798288331e-05, + "loss": 0.5506, + "mean_token_accuracy": 0.830031219124794, + "num_tokens": 371939618.0, "step": 405 }, { - "epoch": 0.5826967489785042, - "grad_norm": 0.3956915407192127, - "learning_rate": 1.699132506330086e-05, - "loss": 0.7168, - "mean_token_accuracy": 0.780977015197277, - "num_tokens": 390510208.0, + "epoch": 0.5829556562694392, + "grad_norm": 0.27710417408529203, + "learning_rate": 8.49566253165043e-05, + "loss": 0.5439, + "mean_token_accuracy": 0.8304261237382888, + "num_tokens": 376519800.0, "step": 410 }, { - "epoch": 0.5898028068928762, - "grad_norm": 0.37756059351173565, - "learning_rate": 1.691364512939358e-05, - "loss": 0.7138, - "mean_token_accuracy": 0.7802788965404034, - "num_tokens": 395264854.0, + "epoch": 0.5900648715897983, + "grad_norm": 0.2611394917691902, + "learning_rate": 8.456822564696789e-05, + "loss": 0.5409, + "mean_token_accuracy": 0.832954341173172, + "num_tokens": 381102299.0, "step": 415 }, { - "epoch": 0.5969088648072481, - "grad_norm": 0.4087318897216221, - "learning_rate": 1.6835182081612426e-05, - "loss": 0.7136, - "mean_token_accuracy": 0.782038314640522, - "num_tokens": 400017717.0, + "epoch": 0.5971740869101573, + "grad_norm": 0.42771473321829473, + "learning_rate": 8.417591040806213e-05, + "loss": 0.5504, + "mean_token_accuracy": 0.8300940133631229, + "num_tokens": 385700779.0, "step": 420 }, { - "epoch": 0.6040149227216202, - "grad_norm": 0.40966025914497906, - "learning_rate": 1.6755946310418777e-05, - "loss": 0.7162, - "mean_token_accuracy": 0.7809364423155785, - "num_tokens": 404785855.0, + "epoch": 0.6042833022305163, + "grad_norm": 0.28194050483515865, + "learning_rate": 8.377973155209387e-05, + "loss": 0.5553, + "mean_token_accuracy": 0.8270630918443203, + "num_tokens": 390294365.0, "step": 425 }, { - "epoch": 0.6111209806359922, - "grad_norm": 0.34380470044932104, - "learning_rate": 1.6675948308601826e-05, - "loss": 0.7088, - "mean_token_accuracy": 0.7824217259883881, - "num_tokens": 409545265.0, + "epoch": 0.6113925175508753, + "grad_norm": 0.27563889901609234, + "learning_rate": 8.337974154300913e-05, + "loss": 0.5427, + "mean_token_accuracy": 0.8309814311563969, + "num_tokens": 394889149.0, "step": 430 }, { - "epoch": 0.6182270385503642, - "grad_norm": 0.3999223329715891, - "learning_rate": 1.6595198669889086e-05, - "loss": 0.7178, - "mean_token_accuracy": 0.7794642865657806, - "num_tokens": 414313757.0, + "epoch": 0.6185017328712343, + "grad_norm": 0.27875362292884753, + "learning_rate": 8.297599334944542e-05, + "loss": 0.5561, + "mean_token_accuracy": 0.8275676898658275, + "num_tokens": 399459807.0, "step": 435 }, { - "epoch": 0.6253330964647362, - "grad_norm": 0.4093202361120024, - "learning_rate": 1.6513708087543507e-05, - "loss": 0.7112, - "mean_token_accuracy": 0.7812661081552505, - "num_tokens": 419067741.0, + "epoch": 0.6256109481915934, + "grad_norm": 0.7336148967265075, + "learning_rate": 8.256854043771754e-05, + "loss": 0.5507, + "mean_token_accuracy": 0.8285100273787975, + "num_tokens": 404034333.0, "step": 440 }, { - "epoch": 0.6324391543791081, - "grad_norm": 0.3927913861855057, - "learning_rate": 1.643148735294744e-05, - "loss": 0.7085, - "mean_token_accuracy": 0.7821477875113487, - "num_tokens": 423849699.0, + "epoch": 0.6327201635119524, + "grad_norm": 0.3259646654441019, + "learning_rate": 8.215743676473719e-05, + "loss": 0.5503, + "mean_token_accuracy": 0.8290993146598339, + "num_tokens": 408627270.0, "step": 445 }, { - "epoch": 0.6395452122934802, - "grad_norm": 0.38988729195797683, - "learning_rate": 1.634854735417356e-05, - "loss": 0.7184, - "mean_token_accuracy": 0.7806262195110321, - "num_tokens": 428613216.0, + "epoch": 0.6398293788323114, + "grad_norm": 0.3012299941832976, + "learning_rate": 8.174273677086779e-05, + "loss": 0.552, + "mean_token_accuracy": 0.8279682919383049, + "num_tokens": 413222911.0, "step": 450 }, { - "epoch": 0.6466512702078522, - "grad_norm": 0.35125767095510474, - "learning_rate": 1.6264899074543038e-05, - "loss": 0.7244, - "mean_token_accuracy": 0.7782423093914985, - "num_tokens": 433373732.0, + "epoch": 0.6469385941526704, + "grad_norm": 0.30771992691522176, + "learning_rate": 8.132449537271519e-05, + "loss": 0.552, + "mean_token_accuracy": 0.8296807646751404, + "num_tokens": 417806274.0, "step": 455 }, { - "epoch": 0.6537573281222242, - "grad_norm": 0.3717296312246723, - "learning_rate": 1.6180553591171064e-05, - "loss": 0.7134, - "mean_token_accuracy": 0.7801944658160209, - "num_tokens": 438144634.0, + "epoch": 0.6540478094730294, + "grad_norm": 0.2810763807856677, + "learning_rate": 8.090276795585531e-05, + "loss": 0.5414, + "mean_token_accuracy": 0.8314659893512726, + "num_tokens": 422401434.0, "step": 460 }, { - "epoch": 0.6608633860365962, - "grad_norm": 0.3514665073580472, - "learning_rate": 1.6095522073499968e-05, - "loss": 0.7094, - "mean_token_accuracy": 0.782074099034071, - "num_tokens": 442899589.0, + "epoch": 0.6611570247933884, + "grad_norm": 0.2672336811508722, + "learning_rate": 8.047761036749985e-05, + "loss": 0.5564, + "mean_token_accuracy": 0.8265900291502476, + "num_tokens": 426986385.0, "step": 465 }, { - "epoch": 0.6679694439509682, - "grad_norm": 0.3635349909050601, - "learning_rate": 1.600981578182011e-05, - "loss": 0.7125, - "mean_token_accuracy": 0.7808018557727336, - "num_tokens": 447672633.0, + "epoch": 0.6682662401137475, + "grad_norm": 0.25924906311163326, + "learning_rate": 8.004907890910055e-05, + "loss": 0.5452, + "mean_token_accuracy": 0.8297064855694771, + "num_tokens": 431585703.0, "step": 470 }, { - "epoch": 0.6750755018653402, - "grad_norm": 0.3548670898985585, - "learning_rate": 1.5923446065778715e-05, - "loss": 0.7162, - "mean_token_accuracy": 0.7795430406928062, - "num_tokens": 452431439.0, + "epoch": 0.6753754554341065, + "grad_norm": 0.2772688573388134, + "learning_rate": 7.961723032889358e-05, + "loss": 0.5292, + "mean_token_accuracy": 0.8346129797399044, + "num_tokens": 436150194.0, "step": 475 }, { - "epoch": 0.6821815597797122, - "grad_norm": 0.34932388566686257, - "learning_rate": 1.5836424362876933e-05, - "loss": 0.6984, - "mean_token_accuracy": 0.7855889156460762, - "num_tokens": 457177703.0, + "epoch": 0.6824846707544655, + "grad_norm": 0.25573353155086187, + "learning_rate": 7.918212181438467e-05, + "loss": 0.5397, + "mean_token_accuracy": 0.8314497999846935, + "num_tokens": 440736901.0, "step": 480 }, { - "epoch": 0.6892876176940842, - "grad_norm": 0.3981248379617205, - "learning_rate": 1.5748762196955198e-05, - "loss": 0.7036, - "mean_token_accuracy": 0.7827964283525943, - "num_tokens": 461930774.0, + "epoch": 0.6895938860748245, + "grad_norm": 0.2640386419783165, + "learning_rate": 7.874381098477599e-05, + "loss": 0.5359, + "mean_token_accuracy": 0.8328767582774163, + "num_tokens": 445334774.0, "step": 485 }, { - "epoch": 0.6963936756084562, - "grad_norm": 0.3393463040601756, - "learning_rate": 1.5660471176667194e-05, - "loss": 0.7092, - "mean_token_accuracy": 0.7816402152180671, - "num_tokens": 466702045.0, + "epoch": 0.6967031013951835, + "grad_norm": 0.2662269663206075, + "learning_rate": 7.830235588333597e-05, + "loss": 0.5578, + "mean_token_accuracy": 0.8268053226172924, + "num_tokens": 449908855.0, "step": 490 }, { - "epoch": 0.7034997335228282, - "grad_norm": 0.3747245118209944, - "learning_rate": 1.5571562993942594e-05, - "loss": 0.7063, - "mean_token_accuracy": 0.7829745762050152, - "num_tokens": 471461872.0, + "epoch": 0.7038123167155426, + "grad_norm": 0.2756351015892551, + "learning_rate": 7.785781496971297e-05, + "loss": 0.5503, + "mean_token_accuracy": 0.8284729138016701, + "num_tokens": 454513487.0, "step": 495 }, { - "epoch": 0.7106057914372003, - "grad_norm": 0.356673539185162, - "learning_rate": 1.5482049422438732e-05, - "loss": 0.7052, - "mean_token_accuracy": 0.7823217682540416, - "num_tokens": 476233238.0, + "epoch": 0.7109215320359016, + "grad_norm": 0.4547105928976161, + "learning_rate": 7.741024711219366e-05, + "loss": 0.5431, + "mean_token_accuracy": 0.8298681430518627, + "num_tokens": 459106365.0, "step": 500 }, { - "epoch": 0.7106057914372003, - "eval_loss": 0.6809196472167969, - "eval_mean_token_accuracy": 0.7825243828589457, - "eval_num_tokens": 476233238.0, - "eval_runtime": 150.1867, - "eval_samples_per_second": 24.23, - "eval_steps_per_second": 0.759, + "epoch": 0.7109215320359016, + "eval_loss": 0.5168540477752686, + "eval_mean_token_accuracy": 0.8290872861418808, + "eval_num_tokens": 459106365.0, + "eval_runtime": 146.2066, + "eval_samples_per_second": 24.883, + "eval_steps_per_second": 0.78, "step": 500 }, { - "epoch": 0.7177118493515722, - "grad_norm": 0.342268344236882, - "learning_rate": 1.5391942315981506e-05, - "loss": 0.7124, - "mean_token_accuracy": 0.7804363466799259, - "num_tokens": 481010410.0, + "epoch": 0.7180307473562606, + "grad_norm": 1.6021704699780053, + "learning_rate": 7.695971157990754e-05, + "loss": 0.5646, + "mean_token_accuracy": 0.8263038910925389, + "num_tokens": 463703240.0, "step": 505 }, { - "epoch": 0.7248179072659442, - "grad_norm": 0.41188433212292186, - "learning_rate": 1.530125360699561e-05, - "loss": 0.7089, - "mean_token_accuracy": 0.7815835013985634, - "num_tokens": 485757825.0, + "epoch": 0.7251399626766196, + "grad_norm": 4.625968090811763, + "learning_rate": 7.650626803497806e-05, + "loss": 0.5581, + "mean_token_accuracy": 0.8270722553133965, + "num_tokens": 468295660.0, "step": 510 }, { - "epoch": 0.7319239651803162, - "grad_norm": 0.456891730601901, - "learning_rate": 1.520999530492441e-05, - "loss": 0.7022, - "mean_token_accuracy": 0.7851340644061565, - "num_tokens": 490512360.0, + "epoch": 0.7322491779969785, + "grad_norm": 0.27503115183353516, + "learning_rate": 7.604997652462205e-05, + "loss": 0.5492, + "mean_token_accuracy": 0.8294327199459076, + "num_tokens": 472896751.0, "step": 515 }, { - "epoch": 0.7390300230946882, - "grad_norm": 0.347445377971525, - "learning_rate": 1.511817949463956e-05, - "loss": 0.7066, - "mean_token_accuracy": 0.7829876273870469, - "num_tokens": 495265025.0, + "epoch": 0.7393583933173375, + "grad_norm": 0.267416722991217, + "learning_rate": 7.55908974731978e-05, + "loss": 0.5418, + "mean_token_accuracy": 0.8326966613531113, + "num_tokens": 477480918.0, "step": 520 }, { - "epoch": 0.7461360810090603, - "grad_norm": 0.3649440044406707, - "learning_rate": 1.5025818334840695e-05, - "loss": 0.7057, - "mean_token_accuracy": 0.7825053557753563, - "num_tokens": 500030371.0, + "epoch": 0.7464676086376966, + "grad_norm": 0.25628361203172423, + "learning_rate": 7.512909167420347e-05, + "loss": 0.5404, + "mean_token_accuracy": 0.8324044570326805, + "num_tokens": 482064392.0, "step": 525 }, { - "epoch": 0.7532421389234323, - "grad_norm": 0.3848276798074321, - "learning_rate": 1.493292405644531e-05, - "loss": 0.6916, - "mean_token_accuracy": 0.7862150557339191, - "num_tokens": 504787581.0, + "epoch": 0.7535768239580556, + "grad_norm": 0.24597696845219366, + "learning_rate": 7.466462028222654e-05, + "loss": 0.5353, + "mean_token_accuracy": 0.8331540204584599, + "num_tokens": 486649806.0, "step": 530 }, { - "epoch": 0.7603481968378042, - "grad_norm": 0.3545112606618106, - "learning_rate": 1.4839508960969071e-05, - "loss": 0.7041, - "mean_token_accuracy": 0.7828620508313179, - "num_tokens": 509570758.0, + "epoch": 0.7606860392784146, + "grad_norm": 0.2497969231256322, + "learning_rate": 7.419754480484536e-05, + "loss": 0.5378, + "mean_token_accuracy": 0.8323175966739654, + "num_tokens": 491217398.0, "step": 535 }, { - "epoch": 0.7674542547521762, - "grad_norm": 0.37573645816136086, - "learning_rate": 1.4745585418896799e-05, - "loss": 0.7022, - "mean_token_accuracy": 0.7837928868830204, - "num_tokens": 514321600.0, + "epoch": 0.7677952545987736, + "grad_norm": 0.27136426093422567, + "learning_rate": 7.3727927094484e-05, + "loss": 0.5303, + "mean_token_accuracy": 0.8346898458898068, + "num_tokens": 495798334.0, "step": 540 }, { - "epoch": 0.7745603126665482, - "grad_norm": 0.367683939418967, - "learning_rate": 1.4651165868044301e-05, - "loss": 0.6995, - "mean_token_accuracy": 0.7847208097577095, - "num_tokens": 519082348.0, + "epoch": 0.7749044699191326, + "grad_norm": 0.263928683082665, + "learning_rate": 7.32558293402215e-05, + "loss": 0.5193, + "mean_token_accuracy": 0.8367893837392331, + "num_tokens": 500382331.0, "step": 545 }, { - "epoch": 0.7816663705809203, - "grad_norm": 0.37333173963401106, - "learning_rate": 1.45562628119113e-05, - "loss": 0.7008, - "mean_token_accuracy": 0.7843301363289357, - "num_tokens": 523847671.0, + "epoch": 0.7820136852394917, + "grad_norm": 0.2697485453052082, + "learning_rate": 7.27813140595565e-05, + "loss": 0.5249, + "mean_token_accuracy": 0.836308328807354, + "num_tokens": 504972961.0, "step": 550 }, { - "epoch": 0.7887724284952923, - "grad_norm": 0.35304534000101323, - "learning_rate": 1.446088881802566e-05, - "loss": 0.7113, - "mean_token_accuracy": 0.780696228891611, - "num_tokens": 528620044.0, + "epoch": 0.7891229005598507, + "grad_norm": 0.47577994241811294, + "learning_rate": 7.23044440901283e-05, + "loss": 0.5386, + "mean_token_accuracy": 0.832004614919424, + "num_tokens": 509556175.0, "step": 555 }, { - "epoch": 0.7958784864096642, - "grad_norm": 0.37158140908427656, - "learning_rate": 1.4365056516279126e-05, - "loss": 0.7016, - "mean_token_accuracy": 0.7839049801230431, - "num_tokens": 533367563.0, + "epoch": 0.7962321158802097, + "grad_norm": 0.26812210950339255, + "learning_rate": 7.182528258139563e-05, + "loss": 0.5327, + "mean_token_accuracy": 0.8331871695816517, + "num_tokens": 514159170.0, "step": 560 }, { - "epoch": 0.8029845443240362, - "grad_norm": 0.3666636970376427, - "learning_rate": 1.426877859725482e-05, - "loss": 0.7013, - "mean_token_accuracy": 0.7832373000681401, - "num_tokens": 538117561.0, + "epoch": 0.8033413312005687, + "grad_norm": 0.2590503131411491, + "learning_rate": 7.13438929862741e-05, + "loss": 0.5447, + "mean_token_accuracy": 0.8303000062704087, + "num_tokens": 518758083.0, "step": 565 }, { - "epoch": 0.8100906022384082, - "grad_norm": 0.3480002426515768, - "learning_rate": 1.4172067810546689e-05, - "loss": 0.7024, - "mean_token_accuracy": 0.7843490958213806, - "num_tokens": 542889289.0, + "epoch": 0.8104505465209277, + "grad_norm": 0.2700164600845211, + "learning_rate": 7.086033905273344e-05, + "loss": 0.5367, + "mean_token_accuracy": 0.8323484763503075, + "num_tokens": 523345629.0, "step": 570 }, { - "epoch": 0.8171966601527803, - "grad_norm": 0.34159899298495605, - "learning_rate": 1.4074936963071135e-05, - "loss": 0.7034, - "mean_token_accuracy": 0.7836663112044334, - "num_tokens": 547637481.0, + "epoch": 0.8175597618412868, + "grad_norm": 0.26967028018820877, + "learning_rate": 7.037468481535567e-05, + "loss": 0.5212, + "mean_token_accuracy": 0.8371426187455654, + "num_tokens": 527940592.0, "step": 575 }, { - "epoch": 0.8243027180671523, - "grad_norm": 0.346312037596486, - "learning_rate": 1.3977398917371074e-05, - "loss": 0.6952, - "mean_token_accuracy": 0.7860016152262688, - "num_tokens": 552402659.0, + "epoch": 0.8246689771616458, + "grad_norm": 0.3154368910279167, + "learning_rate": 6.988699458685537e-05, + "loss": 0.5275, + "mean_token_accuracy": 0.8351783238351345, + "num_tokens": 532516910.0, "step": 580 }, { - "epoch": 0.8314087759815243, - "grad_norm": 0.4021455809002062, - "learning_rate": 1.3879466589912598e-05, - "loss": 0.6938, - "mean_token_accuracy": 0.785366540402174, - "num_tokens": 557156063.0, + "epoch": 0.8317781924820048, + "grad_norm": 0.26226153440650185, + "learning_rate": 6.9397332949563e-05, + "loss": 0.5335, + "mean_token_accuracy": 0.8329351760447026, + "num_tokens": 537121758.0, "step": 585 }, { - "epoch": 0.8385148338958962, - "grad_norm": 0.35612007953198216, - "learning_rate": 1.3781152949374527e-05, - "loss": 0.7012, - "mean_token_accuracy": 0.7838830970227718, - "num_tokens": 561916767.0, + "epoch": 0.8388874078023638, + "grad_norm": 0.31223173870328286, + "learning_rate": 6.890576474687263e-05, + "loss": 0.5458, + "mean_token_accuracy": 0.829648780822754, + "num_tokens": 541734519.0, "step": 590 }, { - "epoch": 0.8456208918102682, - "grad_norm": 0.3655479416355732, - "learning_rate": 1.3682471014931031e-05, - "loss": 0.7019, - "mean_token_accuracy": 0.7831911854445934, - "num_tokens": 566684863.0, + "epoch": 0.8459966231227228, + "grad_norm": 0.2565970150956528, + "learning_rate": 6.841235507465515e-05, + "loss": 0.5415, + "mean_token_accuracy": 0.8324811846017838, + "num_tokens": 546326546.0, "step": 595 }, { - "epoch": 0.8527269497246402, - "grad_norm": 0.35487507828239484, - "learning_rate": 1.3583433854527557e-05, - "loss": 0.6967, - "mean_token_accuracy": 0.7847252510488033, - "num_tokens": 571452634.0, + "epoch": 0.8531058384430819, + "grad_norm": 0.29462309278409743, + "learning_rate": 6.791716927263778e-05, + "loss": 0.5354, + "mean_token_accuracy": 0.8325764186680317, + "num_tokens": 550923667.0, "step": 600 }, { - "epoch": 0.8527269497246402, - "eval_loss": 0.672866702079773, - "eval_mean_token_accuracy": 0.784141309951481, - "eval_num_tokens": 571452634.0, - "eval_runtime": 149.8383, - "eval_samples_per_second": 24.286, - "eval_steps_per_second": 0.761, + "epoch": 0.8531058384430819, + "eval_loss": 0.5030205249786377, + "eval_mean_token_accuracy": 0.8328834866222582, + "eval_num_tokens": 550923667.0, + "eval_runtime": 145.5099, + "eval_samples_per_second": 25.002, + "eval_steps_per_second": 0.783, "step": 600 }, { - "epoch": 0.8598330076390123, - "grad_norm": 0.3456230748389429, - "learning_rate": 1.3484054583150315e-05, - "loss": 0.6906, - "mean_token_accuracy": 0.7867132879793644, - "num_tokens": 576198167.0, + "epoch": 0.8602150537634409, + "grad_norm": 0.2995740161508053, + "learning_rate": 6.742027291575156e-05, + "loss": 0.5351, + "mean_token_accuracy": 0.8337548352777958, + "num_tokens": 555521300.0, "step": 605 }, { - "epoch": 0.8669390655533843, - "grad_norm": 0.4168654830920555, - "learning_rate": 1.3384346361089535e-05, - "loss": 0.6885, - "mean_token_accuracy": 0.7866604030132294, - "num_tokens": 580952899.0, + "epoch": 0.8673242690837999, + "grad_norm": 0.256895454866442, + "learning_rate": 6.692173180544768e-05, + "loss": 0.527, + "mean_token_accuracy": 0.8346491247415543, + "num_tokens": 560114622.0, "step": 610 }, { - "epoch": 0.8740451234677563, - "grad_norm": 0.3723864301027309, - "learning_rate": 1.3284322392196703e-05, - "loss": 0.6943, - "mean_token_accuracy": 0.7859079904854298, - "num_tokens": 585731060.0, + "epoch": 0.8744334844041589, + "grad_norm": 0.26124663621839667, + "learning_rate": 6.642161196098351e-05, + "loss": 0.5299, + "mean_token_accuracy": 0.835064522176981, + "num_tokens": 564707120.0, "step": 615 }, { - "epoch": 0.8811511813821282, - "grad_norm": 0.3470961871922244, - "learning_rate": 1.3183995922136048e-05, - "loss": 0.712, - "mean_token_accuracy": 0.7812197484076023, - "num_tokens": 590504105.0, + "epoch": 0.8815426997245179, + "grad_norm": 0.30629789668279445, + "learning_rate": 6.591997961068024e-05, + "loss": 0.5391, + "mean_token_accuracy": 0.8325687229633332, + "num_tokens": 569285949.0, "step": 620 }, { - "epoch": 0.8882572392965002, - "grad_norm": 0.36809607687600115, - "learning_rate": 1.308338023663049e-05, - "loss": 0.7012, - "mean_token_accuracy": 0.7837964847683907, - "num_tokens": 595263948.0, + "epoch": 0.888651915044877, + "grad_norm": 0.2517010032545197, + "learning_rate": 6.541690118315245e-05, + "loss": 0.528, + "mean_token_accuracy": 0.834906804561615, + "num_tokens": 573871769.0, "step": 625 }, { - "epoch": 0.8953632972108723, - "grad_norm": 0.40488523074148874, - "learning_rate": 1.2982488659702269e-05, - "loss": 0.696, - "mean_token_accuracy": 0.7849378556013107, - "num_tokens": 600009699.0, + "epoch": 0.895761130365236, + "grad_norm": 0.3714356282666368, + "learning_rate": 6.491244329851133e-05, + "loss": 0.521, + "mean_token_accuracy": 0.8374850310385227, + "num_tokens": 578461250.0, "step": 630 }, { - "epoch": 0.9024693551252443, - "grad_norm": 0.36084599960401653, - "learning_rate": 1.2881334551908524e-05, - "loss": 0.6932, - "mean_token_accuracy": 0.785707937926054, - "num_tokens": 604750827.0, + "epoch": 0.902870345685595, + "grad_norm": 0.2513550517622928, + "learning_rate": 6.440667275954262e-05, + "loss": 0.5151, + "mean_token_accuracy": 0.8384780243039132, + "num_tokens": 583046607.0, "step": 635 }, { - "epoch": 0.9095754130396163, - "grad_norm": 0.47712658403892855, - "learning_rate": 1.2779931308572022e-05, - "loss": 0.6932, - "mean_token_accuracy": 0.7863863408565521, - "num_tokens": 609500130.0, + "epoch": 0.909979561005954, + "grad_norm": 0.2790784344252937, + "learning_rate": 6.389965654286011e-05, + "loss": 0.5287, + "mean_token_accuracy": 0.8349935576319695, + "num_tokens": 587648232.0, "step": 640 }, { - "epoch": 0.9166814709539882, - "grad_norm": 0.3494828834988836, - "learning_rate": 1.2678292358007274e-05, - "loss": 0.6859, - "mean_token_accuracy": 0.7878520257771016, - "num_tokens": 614261653.0, + "epoch": 0.917088776326313, + "grad_norm": 0.27767689120972117, + "learning_rate": 6.339146179003636e-05, + "loss": 0.5207, + "mean_token_accuracy": 0.837136809527874, + "num_tokens": 592239729.0, "step": 645 }, { - "epoch": 0.9237875288683602, - "grad_norm": 0.38391809182329456, - "learning_rate": 1.2576431159742298e-05, - "loss": 0.7083, - "mean_token_accuracy": 0.7823263764381408, - "num_tokens": 619054673.0, + "epoch": 0.924197991646672, + "grad_norm": 0.2805149976836277, + "learning_rate": 6.288215579871148e-05, + "loss": 0.5229, + "mean_token_accuracy": 0.8374404884874821, + "num_tokens": 596831306.0, "step": 650 }, { - "epoch": 0.9308935867827323, - "grad_norm": 0.341786835416634, - "learning_rate": 1.247436120273624e-05, - "loss": 0.7049, - "mean_token_accuracy": 0.7822027482092381, - "num_tokens": 623817276.0, + "epoch": 0.9313072069670311, + "grad_norm": 0.24703194529226574, + "learning_rate": 6.23718060136812e-05, + "loss": 0.5152, + "mean_token_accuracy": 0.8385937295854091, + "num_tokens": 601427733.0, "step": 655 }, { - "epoch": 0.9379996446971043, - "grad_norm": 0.43974221008421865, - "learning_rate": 1.237209600359311e-05, - "loss": 0.6935, - "mean_token_accuracy": 0.7854240909218788, - "num_tokens": 628574712.0, + "epoch": 0.9384164222873901, + "grad_norm": 0.33949011504626453, + "learning_rate": 6.186048001796556e-05, + "loss": 0.5204, + "mean_token_accuracy": 0.8384438544511795, + "num_tokens": 606006466.0, "step": 660 }, { - "epoch": 0.9451057026114763, - "grad_norm": 0.4040988616310412, - "learning_rate": 1.226964910477183e-05, - "loss": 0.6898, - "mean_token_accuracy": 0.7870114140212536, - "num_tokens": 633335145.0, + "epoch": 0.945525637607749, + "grad_norm": 0.24749318396547174, + "learning_rate": 6.134824552385915e-05, + "loss": 0.5256, + "mean_token_accuracy": 0.8357278972864151, + "num_tokens": 610597552.0, "step": 665 }, { - "epoch": 0.9522117605258483, - "grad_norm": 0.38559148131636184, - "learning_rate": 1.2167034072792887e-05, - "loss": 0.6937, - "mean_token_accuracy": 0.7853186056017876, - "num_tokens": 638086757.0, + "epoch": 0.952634852928108, + "grad_norm": 0.26267746218214755, + "learning_rate": 6.0835170363964434e-05, + "loss": 0.528, + "mean_token_accuracy": 0.8351906433701515, + "num_tokens": 615193994.0, "step": 670 }, { - "epoch": 0.9593178184402202, - "grad_norm": 0.35182778835643425, - "learning_rate": 1.2064264496441786e-05, - "loss": 0.6893, - "mean_token_accuracy": 0.7859195664525032, - "num_tokens": 642864800.0, + "epoch": 0.959744068248467, + "grad_norm": 0.25519090759528035, + "learning_rate": 6.032132248220893e-05, + "loss": 0.518, + "mean_token_accuracy": 0.8378535941243171, + "num_tokens": 619786315.0, "step": 675 }, { - "epoch": 0.9664238763545923, - "grad_norm": 0.37801611596609275, - "learning_rate": 1.1961353984969557e-05, - "loss": 0.689, - "mean_token_accuracy": 0.7867573000490665, - "num_tokens": 647632233.0, + "epoch": 0.966853283568826, + "grad_norm": 0.25149430173186577, + "learning_rate": 5.9806769924847784e-05, + "loss": 0.5175, + "mean_token_accuracy": 0.8372136250138282, + "num_tokens": 624383919.0, "step": 680 }, { - "epoch": 0.9735299342689643, - "grad_norm": 0.3676417856657156, - "learning_rate": 1.1858316166290542e-05, - "loss": 0.6933, - "mean_token_accuracy": 0.7860686622560025, - "num_tokens": 652408527.0, + "epoch": 0.9739624988891851, + "grad_norm": 0.2669872598294479, + "learning_rate": 5.929158083145271e-05, + "loss": 0.5166, + "mean_token_accuracy": 0.8380297608673573, + "num_tokens": 628976906.0, "step": 685 }, { - "epoch": 0.9806359921833363, - "grad_norm": 0.32727128259002763, - "learning_rate": 1.1755164685177733e-05, - "loss": 0.6909, - "mean_token_accuracy": 0.7855750493705272, - "num_tokens": 657175363.0, + "epoch": 0.9810717142095441, + "grad_norm": 0.3079990980800955, + "learning_rate": 5.8775823425888664e-05, + "loss": 0.5171, + "mean_token_accuracy": 0.8365243822336197, + "num_tokens": 633557562.0, "step": 690 }, { - "epoch": 0.9877420500977083, - "grad_norm": 0.346480246739393, - "learning_rate": 1.1651913201455865e-05, - "loss": 0.6901, - "mean_token_accuracy": 0.78620011433959, - "num_tokens": 661940460.0, + "epoch": 0.9881809295299031, + "grad_norm": 0.26934237379344833, + "learning_rate": 5.825956600727932e-05, + "loss": 0.5176, + "mean_token_accuracy": 0.8371751248836518, + "num_tokens": 638143938.0, "step": 695 }, { - "epoch": 0.9948481080120803, - "grad_norm": 0.3382141590702366, - "learning_rate": 1.154857538819249e-05, - "loss": 0.6935, - "mean_token_accuracy": 0.7859153963625432, - "num_tokens": 666706092.0, + "epoch": 0.9952901448502621, + "grad_norm": 0.24892879578477203, + "learning_rate": 5.774287694096246e-05, + "loss": 0.5203, + "mean_token_accuracy": 0.8368992209434509, + "num_tokens": 642760408.0, "step": 700 }, { - "epoch": 0.9948481080120803, - "eval_loss": 0.6661998629570007, - "eval_mean_token_accuracy": 0.7860168757145864, - "eval_num_tokens": 666706092.0, - "eval_runtime": 150.5473, - "eval_samples_per_second": 24.172, - "eval_steps_per_second": 0.757, + "epoch": 0.9952901448502621, + "eval_loss": 0.49169814586639404, + "eval_mean_token_accuracy": 0.8366760449451313, + "eval_num_tokens": 642760408.0, + "eval_runtime": 148.141, + "eval_samples_per_second": 24.558, + "eval_steps_per_second": 0.77, "step": 700 }, { - "epoch": 1.0014212115828744, - "grad_norm": 1.0385755552019693, - "learning_rate": 1.144516492988736e-05, - "loss": 0.6823, - "mean_token_accuracy": 0.7878129852784647, - "num_tokens": 671095094.0, + "epoch": 1.0014218430640718, + "grad_norm": 0.5358904769553885, + "learning_rate": 5.72258246494368e-05, + "loss": 0.4893, + "mean_token_accuracy": 0.8436046752376832, + "num_tokens": 646718128.0, "step": 705 }, { - "epoch": 1.0085272694972465, - "grad_norm": 0.34901946667846845, - "learning_rate": 1.134169552066023e-05, - "loss": 0.6613, - "mean_token_accuracy": 0.7918078258633614, - "num_tokens": 675850987.0, + "epoch": 1.008531058384431, + "grad_norm": 0.25743890956382126, + "learning_rate": 5.6708477603301146e-05, + "loss": 0.461, + "mean_token_accuracy": 0.8506338618695736, + "num_tokens": 651304404.0, "step": 710 }, { - "epoch": 1.0156333274116185, - "grad_norm": 0.35371147066198844, - "learning_rate": 1.1238180862437431e-05, - "loss": 0.6616, - "mean_token_accuracy": 0.7930883727967739, - "num_tokens": 680610147.0, + "epoch": 1.0156402737047898, + "grad_norm": 0.2648866270558085, + "learning_rate": 5.6190904312187154e-05, + "loss": 0.4544, + "mean_token_accuracy": 0.8519260853528976, + "num_tokens": 655879909.0, "step": 715 }, { - "epoch": 1.0227393853259905, - "grad_norm": 0.371067048471263, - "learning_rate": 1.1134634663137373e-05, - "loss": 0.6552, - "mean_token_accuracy": 0.7940364375710487, - "num_tokens": 685353908.0, + "epoch": 1.022749489025149, + "grad_norm": 0.27694330822934976, + "learning_rate": 5.567317331568687e-05, + "loss": 0.4474, + "mean_token_accuracy": 0.8545098066329956, + "num_tokens": 660449626.0, "step": 720 }, { - "epoch": 1.0298454432403623, - "grad_norm": 0.3481611518266803, - "learning_rate": 1.1031070634855314e-05, - "loss": 0.6593, - "mean_token_accuracy": 0.7930267058312893, - "num_tokens": 690111045.0, + "epoch": 1.0298587043455079, + "grad_norm": 0.24825528169946715, + "learning_rate": 5.515535317427657e-05, + "loss": 0.4517, + "mean_token_accuracy": 0.8533940657973289, + "num_tokens": 665058163.0, "step": 725 }, { - "epoch": 1.0369515011547343, - "grad_norm": 0.35013978798256634, - "learning_rate": 1.0927502492047492e-05, - "loss": 0.6673, - "mean_token_accuracy": 0.7919997818768024, - "num_tokens": 694881554.0, + "epoch": 1.0369679196658668, + "grad_norm": 0.24464581183689546, + "learning_rate": 5.463751246023746e-05, + "loss": 0.4559, + "mean_token_accuracy": 0.8523735709488391, + "num_tokens": 669654595.0, "step": 730 }, { - "epoch": 1.0440575590691064, - "grad_norm": 0.33993734586176455, - "learning_rate": 1.0823943949715022e-05, - "loss": 0.67, - "mean_token_accuracy": 0.7910104177892208, - "num_tokens": 699670214.0, + "epoch": 1.044077134986226, + "grad_norm": 0.24930171479148333, + "learning_rate": 5.4119719748575106e-05, + "loss": 0.4487, + "mean_token_accuracy": 0.8542089037597179, + "num_tokens": 674232882.0, "step": 735 }, { - "epoch": 1.0511636169834784, - "grad_norm": 0.3412860380013659, - "learning_rate": 1.0720408721587671e-05, - "loss": 0.6715, - "mean_token_accuracy": 0.7910432547330857, - "num_tokens": 704426344.0, + "epoch": 1.0511863503065848, + "grad_norm": 0.23303088594874635, + "learning_rate": 5.360204360793836e-05, + "loss": 0.4436, + "mean_token_accuracy": 0.8547257304191589, + "num_tokens": 678813498.0, "step": 740 }, { - "epoch": 1.0582696748978504, - "grad_norm": 0.3627320420498548, - "learning_rate": 1.061691051830783e-05, - "loss": 0.668, - "mean_token_accuracy": 0.7916376106441021, - "num_tokens": 709184272.0, + "epoch": 1.058295565626944, + "grad_norm": 0.317097982341769, + "learning_rate": 5.308455259153915e-05, + "loss": 0.458, + "mean_token_accuracy": 0.8515614397823811, + "num_tokens": 683401148.0, "step": 745 }, { - "epoch": 1.0653757328122224, - "grad_norm": 0.3551359244101548, - "learning_rate": 1.0513463045614873e-05, - "loss": 0.6732, - "mean_token_accuracy": 0.7899613387882709, - "num_tokens": 713964117.0, + "epoch": 1.0654047809473028, + "grad_norm": 0.24160258781744343, + "learning_rate": 5.256731522807436e-05, + "loss": 0.4506, + "mean_token_accuracy": 0.8526393964886665, + "num_tokens": 687982154.0, "step": 750 }, { - "epoch": 1.0724817907265944, - "grad_norm": 0.35291410759778385, - "learning_rate": 1.0410080002530188e-05, - "loss": 0.6653, - "mean_token_accuracy": 0.7933160819113254, - "num_tokens": 718714498.0, + "epoch": 1.072513996267662, + "grad_norm": 0.23602108922437653, + "learning_rate": 5.205040001265094e-05, + "loss": 0.4515, + "mean_token_accuracy": 0.8521531477570534, + "num_tokens": 692583016.0, "step": 755 }, { - "epoch": 1.0795878486409665, - "grad_norm": 0.5183974245642026, - "learning_rate": 1.030677507954307e-05, - "loss": 0.669, - "mean_token_accuracy": 0.7922067753970623, - "num_tokens": 723480315.0, + "epoch": 1.0796232115880209, + "grad_norm": 0.2431546567595459, + "learning_rate": 5.1533875397715345e-05, + "loss": 0.455, + "mean_token_accuracy": 0.8529531605541706, + "num_tokens": 697183950.0, "step": 760 }, { - "epoch": 1.0866939065553385, - "grad_norm": 0.36178536189786653, - "learning_rate": 1.0203561956797777e-05, - "loss": 0.6592, - "mean_token_accuracy": 0.7924857877194882, - "num_tokens": 728257943.0, + "epoch": 1.08673242690838, + "grad_norm": 0.27597324346348756, + "learning_rate": 5.101780978398888e-05, + "loss": 0.4518, + "mean_token_accuracy": 0.8528432317078114, + "num_tokens": 701785548.0, "step": 765 }, { - "epoch": 1.0937999644697105, - "grad_norm": 0.350161084492629, - "learning_rate": 1.0100454302281917e-05, - "loss": 0.6708, - "mean_token_accuracy": 0.7903590828180314, - "num_tokens": 733027792.0, + "epoch": 1.093841642228739, + "grad_norm": 0.26932926236063864, + "learning_rate": 5.050227151140958e-05, + "loss": 0.4536, + "mean_token_accuracy": 0.852679468691349, + "num_tokens": 706364188.0, "step": 770 }, { - "epoch": 1.1009060223840825, - "grad_norm": 0.3924504778281434, - "learning_rate": 9.997465770016488e-06, - "loss": 0.665, - "mean_token_accuracy": 0.7942788422107696, - "num_tokens": 737777094.0, + "epoch": 1.100950857549098, + "grad_norm": 0.2587220894683173, + "learning_rate": 4.998732885008244e-05, + "loss": 0.4503, + "mean_token_accuracy": 0.8526183031499386, + "num_tokens": 710949271.0, "step": 775 }, { - "epoch": 1.1080120802984545, - "grad_norm": 0.3784959263919085, - "learning_rate": 9.894609998247735e-06, - "loss": 0.667, - "mean_token_accuracy": 0.7912828728556633, - "num_tokens": 742543159.0, + "epoch": 1.108060072869457, + "grad_norm": 0.24430696998738718, + "learning_rate": 4.947304999123867e-05, + "loss": 0.4357, + "mean_token_accuracy": 0.8572968378663063, + "num_tokens": 715539336.0, "step": 780 }, { - "epoch": 1.1151181382128263, - "grad_norm": 0.3498944950913003, - "learning_rate": 9.791900607641104e-06, - "loss": 0.6635, - "mean_token_accuracy": 0.7930950812995434, - "num_tokens": 747308343.0, + "epoch": 1.115169288189816, + "grad_norm": 0.24614402366250857, + "learning_rate": 4.895950303820552e-05, + "loss": 0.4525, + "mean_token_accuracy": 0.8526603005826473, + "num_tokens": 720147357.0, "step": 785 }, { - "epoch": 1.1222241961271984, - "grad_norm": 0.3820207111204085, - "learning_rate": 9.68935119947753e-06, - "loss": 0.668, - "mean_token_accuracy": 0.7914499528706074, - "num_tokens": 752079771.0, + "epoch": 1.122278503510175, + "grad_norm": 0.23262198319374294, + "learning_rate": 4.844675599738765e-05, + "loss": 0.4523, + "mean_token_accuracy": 0.852922348678112, + "num_tokens": 724741149.0, "step": 790 }, { - "epoch": 1.1293302540415704, - "grad_norm": 0.37200401458209914, - "learning_rate": 9.586975353852284e-06, - "loss": 0.6639, - "mean_token_accuracy": 0.7913541235029697, - "num_tokens": 756847538.0, + "epoch": 1.1293877188305341, + "grad_norm": 0.2551816873924689, + "learning_rate": 4.793487676926142e-05, + "loss": 0.4562, + "mean_token_accuracy": 0.8518377915024757, + "num_tokens": 729327424.0, "step": 795 }, { - "epoch": 1.1364363119559424, - "grad_norm": 0.3953405586571805, - "learning_rate": 9.484786627876655e-06, - "loss": 0.6697, - "mean_token_accuracy": 0.790704844892025, - "num_tokens": 761615758.0, + "epoch": 1.136496934150893, + "grad_norm": 0.23754167080648592, + "learning_rate": 4.742393313938327e-05, + "loss": 0.445, + "mean_token_accuracy": 0.8547273397445678, + "num_tokens": 733921218.0, "step": 800 }, { - "epoch": 1.1364363119559424, - "eval_loss": 0.6620959043502808, - "eval_mean_token_accuracy": 0.7867940282612517, - "eval_num_tokens": 761615758.0, - "eval_runtime": 149.9826, - "eval_samples_per_second": 24.263, - "eval_steps_per_second": 0.76, + "epoch": 1.136496934150893, + "eval_loss": 0.4879998564720154, + "eval_mean_token_accuracy": 0.8380277005203983, + "eval_num_tokens": 733921218.0, + "eval_runtime": 146.7948, + "eval_samples_per_second": 24.783, + "eval_steps_per_second": 0.777, "step": 800 }, { - "epoch": 1.1435423698703144, - "grad_norm": 0.3501237218712183, - "learning_rate": 9.382798553882605e-06, - "loss": 0.6685, - "mean_token_accuracy": 0.789706601947546, - "num_tokens": 766396725.0, + "epoch": 1.1436061494712522, + "grad_norm": 0.25050469601877845, + "learning_rate": 4.6913992769413026e-05, + "loss": 0.4552, + "mean_token_accuracy": 0.8521495588123799, + "num_tokens": 738503816.0, "step": 805 }, { - "epoch": 1.1506484277846865, - "grad_norm": 0.35257144104430527, - "learning_rate": 9.281024637630794e-06, - "loss": 0.656, - "mean_token_accuracy": 0.7935691051185131, - "num_tokens": 771153408.0, + "epoch": 1.150715364791611, + "grad_norm": 0.24476661787598053, + "learning_rate": 4.6405123188153966e-05, + "loss": 0.4506, + "mean_token_accuracy": 0.8532384999096394, + "num_tokens": 743095770.0, "step": 810 }, { - "epoch": 1.1577544856990585, - "grad_norm": 0.3686235797828206, - "learning_rate": 9.179478356522055e-06, - "loss": 0.6617, - "mean_token_accuracy": 0.7928701542317868, - "num_tokens": 775910085.0, + "epoch": 1.1578245801119702, + "grad_norm": 0.24115136773182058, + "learning_rate": 4.589739178261028e-05, + "loss": 0.4471, + "mean_token_accuracy": 0.8549422182142734, + "num_tokens": 747676184.0, "step": 815 }, { - "epoch": 1.1648605436134305, - "grad_norm": 0.35944613382478946, - "learning_rate": 9.078173157812669e-06, - "loss": 0.6673, - "mean_token_accuracy": 0.7925907090306282, - "num_tokens": 780683650.0, + "epoch": 1.1649337954323291, + "grad_norm": 0.24283949811905522, + "learning_rate": 4.5390865789063344e-05, + "loss": 0.448, + "mean_token_accuracy": 0.8543575026094914, + "num_tokens": 752274534.0, "step": 820 }, { - "epoch": 1.1719666015278025, - "grad_norm": 0.33694316783063644, - "learning_rate": 8.97712245683359e-06, - "loss": 0.6686, - "mean_token_accuracy": 0.7904776819050312, - "num_tokens": 785473480.0, + "epoch": 1.1720430107526882, + "grad_norm": 0.2701107129425895, + "learning_rate": 4.4885612284167955e-05, + "loss": 0.4411, + "mean_token_accuracy": 0.8565104402601719, + "num_tokens": 756863683.0, "step": 825 }, { - "epoch": 1.1790726594421745, - "grad_norm": 0.37743982477075316, - "learning_rate": 8.876339635213951e-06, - "loss": 0.6672, - "mean_token_accuracy": 0.7913396395742893, - "num_tokens": 790244466.0, + "epoch": 1.1791522260730471, + "grad_norm": 0.2886054721404824, + "learning_rate": 4.4381698176069754e-05, + "loss": 0.4379, + "mean_token_accuracy": 0.8567862503230572, + "num_tokens": 761453110.0, "step": 830 }, { - "epoch": 1.1861787173565466, - "grad_norm": 0.3852825297524554, - "learning_rate": 8.775838039108975e-06, - "loss": 0.6577, - "mean_token_accuracy": 0.7940163776278496, - "num_tokens": 794986608.0, + "epoch": 1.1862614413934063, + "grad_norm": 0.2561982737144238, + "learning_rate": 4.387919019554487e-05, + "loss": 0.4532, + "mean_token_accuracy": 0.8531202852725983, + "num_tokens": 766041248.0, "step": 835 }, { - "epoch": 1.1932847752709184, - "grad_norm": 0.3584263516575822, - "learning_rate": 8.67563097743263e-06, - "loss": 0.6589, - "mean_token_accuracy": 0.7941929534077644, - "num_tokens": 799743654.0, + "epoch": 1.1933706567137652, + "grad_norm": 0.26412588441218454, + "learning_rate": 4.3378154887163144e-05, + "loss": 0.4453, + "mean_token_accuracy": 0.853339533507824, + "num_tokens": 770624920.0, "step": 840 }, { - "epoch": 1.2003908331852904, - "grad_norm": 0.36222679284602094, - "learning_rate": 8.575731720095194e-06, - "loss": 0.6558, - "mean_token_accuracy": 0.7949050404131413, - "num_tokens": 804510301.0, + "epoch": 1.2004798720341243, + "grad_norm": 0.25032821222177587, + "learning_rate": 4.287865860047596e-05, + "loss": 0.4558, + "mean_token_accuracy": 0.8522251404821872, + "num_tokens": 775225729.0, "step": 845 }, { - "epoch": 1.2074968910996624, - "grad_norm": 0.36601316470081835, - "learning_rate": 8.476153496245978e-06, - "loss": 0.6765, - "mean_token_accuracy": 0.7888801738619804, - "num_tokens": 809295294.0, + "epoch": 1.2075890873544832, + "grad_norm": 0.23998083533004458, + "learning_rate": 4.2380767481229886e-05, + "loss": 0.4418, + "mean_token_accuracy": 0.8569207176566124, + "num_tokens": 779811918.0, "step": 850 }, { - "epoch": 1.2146029490140344, - "grad_norm": 0.35840290876863273, - "learning_rate": 8.376909492521465e-06, - "loss": 0.6651, - "mean_token_accuracy": 0.7920581080019474, - "num_tokens": 814063402.0, + "epoch": 1.2146983026748424, + "grad_norm": 0.2456015755421057, + "learning_rate": 4.1884547462607326e-05, + "loss": 0.4454, + "mean_token_accuracy": 0.8553664483129978, + "num_tokens": 784391305.0, "step": 855 }, { - "epoch": 1.2217090069284064, - "grad_norm": 0.35918850454361245, - "learning_rate": 8.278012851299082e-06, - "loss": 0.6604, - "mean_token_accuracy": 0.793212516605854, - "num_tokens": 818822580.0, + "epoch": 1.2218075179952013, + "grad_norm": 0.25612737416807746, + "learning_rate": 4.139006425649541e-05, + "loss": 0.4504, + "mean_token_accuracy": 0.8527485050261021, + "num_tokens": 788981682.0, "step": 860 }, { - "epoch": 1.2288150648427785, - "grad_norm": 0.36931586095521574, - "learning_rate": 8.179476668956799e-06, - "loss": 0.6697, - "mean_token_accuracy": 0.789932218939066, - "num_tokens": 823577622.0, + "epoch": 1.2289167333155602, + "grad_norm": 0.24215144672428524, + "learning_rate": 4.089738334478399e-05, + "loss": 0.4466, + "mean_token_accuracy": 0.8540120802819728, + "num_tokens": 793548878.0, "step": 865 }, { - "epoch": 1.2359211227571505, - "grad_norm": 0.3618218146346168, - "learning_rate": 8.081313994138857e-06, - "loss": 0.6573, - "mean_token_accuracy": 0.7943486146628856, - "num_tokens": 828319732.0, + "epoch": 1.2360259486359193, + "grad_norm": 0.251956160570565, + "learning_rate": 4.0406569970694285e-05, + "loss": 0.4514, + "mean_token_accuracy": 0.8536942526698112, + "num_tokens": 798145090.0, "step": 870 }, { - "epoch": 1.2430271806715225, - "grad_norm": 0.3947688956702076, - "learning_rate": 7.983537826027808e-06, - "loss": 0.6677, - "mean_token_accuracy": 0.7911505416035652, - "num_tokens": 833074724.0, + "epoch": 1.2431351639562784, + "grad_norm": 0.24137828427946414, + "learning_rate": 3.991768913013904e-05, + "loss": 0.4408, + "mean_token_accuracy": 0.8566184468567372, + "num_tokens": 802721141.0, "step": 875 }, { - "epoch": 1.2501332385858945, - "grad_norm": 0.49837585254908734, - "learning_rate": 7.886161112623072e-06, - "loss": 0.6549, - "mean_token_accuracy": 0.7948375590145588, - "num_tokens": 837847437.0, + "epoch": 1.2502443792766373, + "grad_norm": 0.3769699788745637, + "learning_rate": 3.943080556311536e-05, + "loss": 0.438, + "mean_token_accuracy": 0.8581221453845501, + "num_tokens": 807303824.0, "step": 880 }, { - "epoch": 1.2572392965002666, - "grad_norm": 0.3875626366429437, - "learning_rate": 7.789196749026349e-06, - "loss": 0.6519, - "mean_token_accuracy": 0.7962292313575745, - "num_tokens": 842595165.0, + "epoch": 1.2573535945969962, + "grad_norm": 0.251278759950789, + "learning_rate": 3.894598374513174e-05, + "loss": 0.4485, + "mean_token_accuracy": 0.8541063219308853, + "num_tokens": 811911762.0, "step": 885 }, { - "epoch": 1.2643453544146386, - "grad_norm": 0.3461153977326627, - "learning_rate": 7.692657575733928e-06, - "loss": 0.6591, - "mean_token_accuracy": 0.7930607885122299, - "num_tokens": 847377566.0, + "epoch": 1.2644628099173554, + "grad_norm": 0.24068163801342848, + "learning_rate": 3.846328787866964e-05, + "loss": 0.4339, + "mean_token_accuracy": 0.859130322188139, + "num_tokens": 816508640.0, "step": 890 }, { - "epoch": 1.2714514123290104, - "grad_norm": 0.3539458234583982, - "learning_rate": 7.596556376936328e-06, - "loss": 0.6585, - "mean_token_accuracy": 0.7939456604421139, - "num_tokens": 852138878.0, + "epoch": 1.2715720252377145, + "grad_norm": 0.23232711368022352, + "learning_rate": 3.798278188468164e-05, + "loss": 0.4445, + "mean_token_accuracy": 0.8543654963374138, + "num_tokens": 821100737.0, "step": 895 }, { - "epoch": 1.2785574702433826, - "grad_norm": 0.3776944250010228, - "learning_rate": 7.500905878825335e-06, - "loss": 0.6552, - "mean_token_accuracy": 0.7954832412302494, - "num_tokens": 856908856.0, + "epoch": 1.2786812405580734, + "grad_norm": 0.2368572559014999, + "learning_rate": 3.750452939412667e-05, + "loss": 0.4434, + "mean_token_accuracy": 0.8547687388956546, + "num_tokens": 825694727.0, "step": 900 }, { - "epoch": 1.2785574702433826, - "eval_loss": 0.6587108373641968, - "eval_mean_token_accuracy": 0.7879830112582759, - "eval_num_tokens": 856908856.0, - "eval_runtime": 150.1109, - "eval_samples_per_second": 24.242, - "eval_steps_per_second": 0.759, + "epoch": 1.2786812405580734, + "eval_loss": 0.4800785183906555, + "eval_mean_token_accuracy": 0.8407511988229919, + "eval_num_tokens": 825694727.0, + "eval_runtime": 146.4602, + "eval_samples_per_second": 24.84, + "eval_steps_per_second": 0.778, "step": 900 }, { - "epoch": 1.2856635281577544, - "grad_norm": 0.35417475896948203, - "learning_rate": 7.405718747908743e-06, - "loss": 0.6554, - "mean_token_accuracy": 0.7936457127332688, - "num_tokens": 861668793.0, + "epoch": 1.2857904558784323, + "grad_norm": 0.26166517034573067, + "learning_rate": 3.7028593739543715e-05, + "loss": 0.4475, + "mean_token_accuracy": 0.854764747619629, + "num_tokens": 830291180.0, "step": 905 }, { - "epoch": 1.2927695860721264, - "grad_norm": 0.3717646188257627, - "learning_rate": 7.311007589332986e-06, - "loss": 0.6587, - "mean_token_accuracy": 0.7932697109878063, - "num_tokens": 866418403.0, + "epoch": 1.2928996711987915, + "grad_norm": 0.24015937616460478, + "learning_rate": 3.6555037946664926e-05, + "loss": 0.4455, + "mean_token_accuracy": 0.8552566647529602, + "num_tokens": 834892125.0, "step": 910 }, { - "epoch": 1.2998756439864985, - "grad_norm": 0.38878740217599195, - "learning_rate": 7.216784945213913e-06, - "loss": 0.6625, - "mean_token_accuracy": 0.7936202257871627, - "num_tokens": 871159945.0, + "epoch": 1.3000088865191506, + "grad_norm": 0.252313420976958, + "learning_rate": 3.608392472606956e-05, + "loss": 0.4441, + "mean_token_accuracy": 0.8559129044413567, + "num_tokens": 839486375.0, "step": 915 }, { - "epoch": 1.3069817019008705, - "grad_norm": 0.3422730868265931, - "learning_rate": 7.123063292975889e-06, - "loss": 0.6525, - "mean_token_accuracy": 0.794795686006546, - "num_tokens": 875924929.0, + "epoch": 1.3071181018395095, + "grad_norm": 0.256487918121681, + "learning_rate": 3.5615316464879445e-05, + "loss": 0.4401, + "mean_token_accuracy": 0.8565216913819313, + "num_tokens": 844107444.0, "step": 920 }, { - "epoch": 1.3140877598152425, - "grad_norm": 0.3694651695886196, - "learning_rate": 7.02985504369949e-06, - "loss": 0.6547, - "mean_token_accuracy": 0.7951326429843902, - "num_tokens": 880666672.0, + "epoch": 1.3142273171598684, + "grad_norm": 0.23448215102314007, + "learning_rate": 3.5149275218497445e-05, + "loss": 0.4383, + "mean_token_accuracy": 0.8571599997580052, + "num_tokens": 848704492.0, "step": 925 }, { - "epoch": 1.3211938177296145, - "grad_norm": 0.3621570732014425, - "learning_rate": 6.937172540477944e-06, - "loss": 0.6654, - "mean_token_accuracy": 0.7919820554554462, - "num_tokens": 885436601.0, + "epoch": 1.3213365324802275, + "grad_norm": 0.24419792529251788, + "learning_rate": 3.4685862702389714e-05, + "loss": 0.4429, + "mean_token_accuracy": 0.855844734609127, + "num_tokens": 853292585.0, "step": 930 }, { - "epoch": 1.3282998756439865, - "grad_norm": 0.7187487529688721, - "learning_rate": 6.8450280567826074e-06, - "loss": 0.6636, - "mean_token_accuracy": 0.792605972290039, - "num_tokens": 890209266.0, + "epoch": 1.3284457478005864, + "grad_norm": 0.23566825561303636, + "learning_rate": 3.422514028391304e-05, + "loss": 0.4354, + "mean_token_accuracy": 0.8570930063724518, + "num_tokens": 857867604.0, "step": 935 }, { - "epoch": 1.3354059335583586, - "grad_norm": 0.3607746297969546, - "learning_rate": 6.753433794837663e-06, - "loss": 0.655, - "mean_token_accuracy": 0.7943654432892799, - "num_tokens": 894978447.0, + "epoch": 1.3355549631209456, + "grad_norm": 0.2454162982602229, + "learning_rate": 3.376716897418831e-05, + "loss": 0.4447, + "mean_token_accuracy": 0.8552064374089241, + "num_tokens": 862460961.0, "step": 940 }, { - "epoch": 1.3425119914727306, - "grad_norm": 0.34656870317990024, - "learning_rate": 6.662401884004226e-06, - "loss": 0.6594, - "mean_token_accuracy": 0.7929094567894935, - "num_tokens": 899731953.0, + "epoch": 1.3426641784413045, + "grad_norm": 0.2524163496767361, + "learning_rate": 3.331200942002113e-05, + "loss": 0.4525, + "mean_token_accuracy": 0.8537895001471043, + "num_tokens": 867058298.0, "step": 945 }, { - "epoch": 1.3496180493871024, - "grad_norm": 0.3556403292186795, - "learning_rate": 6.571944379174128e-06, - "loss": 0.6557, - "mean_token_accuracy": 0.7939096741378308, - "num_tokens": 904484204.0, + "epoch": 1.3497733937616636, + "grad_norm": 0.23190520165291026, + "learning_rate": 3.2859721895870635e-05, + "loss": 0.44, + "mean_token_accuracy": 0.8565752863883972, + "num_tokens": 871661806.0, "step": 950 }, { - "epoch": 1.3567241073014746, - "grad_norm": 0.3481029203946856, - "learning_rate": 6.482073259173533e-06, - "loss": 0.6558, - "mean_token_accuracy": 0.795223805308342, - "num_tokens": 909254980.0, + "epoch": 1.3568826090820225, + "grad_norm": 0.24782970977401894, + "learning_rate": 3.2410366295867664e-05, + "loss": 0.4352, + "mean_token_accuracy": 0.8579383887350559, + "num_tokens": 876250262.0, "step": 955 }, { - "epoch": 1.3638301652158464, - "grad_norm": 0.36246749377385634, - "learning_rate": 6.39280042517666e-06, - "loss": 0.6576, - "mean_token_accuracy": 0.794485367834568, - "num_tokens": 914013636.0, + "epoch": 1.3639918244023816, + "grad_norm": 0.22786025696468146, + "learning_rate": 3.19640021258833e-05, + "loss": 0.444, + "mean_token_accuracy": 0.8550498209893703, + "num_tokens": 880839029.0, "step": 960 }, { - "epoch": 1.3709362231302185, - "grad_norm": 0.35744165552118257, - "learning_rate": 6.304137699129758e-06, - "loss": 0.6521, - "mean_token_accuracy": 0.7954901576042175, - "num_tokens": 918774652.0, + "epoch": 1.3711010397227406, + "grad_norm": 0.2265711418699179, + "learning_rate": 3.152068849564879e-05, + "loss": 0.4435, + "mean_token_accuracy": 0.8563594095408916, + "num_tokens": 885417939.0, "step": 965 }, { - "epoch": 1.3780422810445905, - "grad_norm": 0.34190890162690535, - "learning_rate": 6.216096822185591e-06, - "loss": 0.6596, - "mean_token_accuracy": 0.7934505857527256, - "num_tokens": 923523836.0, + "epoch": 1.3782102550430997, + "grad_norm": 0.23977507514952898, + "learning_rate": 3.1080484110927954e-05, + "loss": 0.4325, + "mean_token_accuracy": 0.8590381443500519, + "num_tokens": 890005207.0, "step": 970 }, { - "epoch": 1.3851483389589625, - "grad_norm": 0.3417362920171981, - "learning_rate": 6.12868945314862e-06, - "loss": 0.6647, - "mean_token_accuracy": 0.7919908218085766, - "num_tokens": 928304038.0, + "epoch": 1.3853194703634586, + "grad_norm": 0.24689756755824815, + "learning_rate": 3.0643447265743096e-05, + "loss": 0.44, + "mean_token_accuracy": 0.85642144754529, + "num_tokens": 894591297.0, "step": 975 }, { - "epoch": 1.3922543968733345, - "grad_norm": 0.3435048840223176, - "learning_rate": 6.041927166931078e-06, - "loss": 0.6577, - "mean_token_accuracy": 0.7943571574985981, - "num_tokens": 933073919.0, + "epoch": 1.3924286856838177, + "grad_norm": 0.24051873631020942, + "learning_rate": 3.0209635834655392e-05, + "loss": 0.435, + "mean_token_accuracy": 0.8576522074639797, + "num_tokens": 899178832.0, "step": 980 }, { - "epoch": 1.3993604547877065, - "grad_norm": 0.40933619958354717, - "learning_rate": 5.9558214530201784e-06, - "loss": 0.6575, - "mean_token_accuracy": 0.7943412482738494, - "num_tokens": 937846004.0, + "epoch": 1.3995379010041766, + "grad_norm": 0.2413492029135495, + "learning_rate": 2.9779107265100892e-05, + "loss": 0.4369, + "mean_token_accuracy": 0.857710150629282, + "num_tokens": 903773147.0, "step": 985 }, { - "epoch": 1.4064665127020786, - "grad_norm": 0.3515117903938119, - "learning_rate": 5.870383713956601e-06, - "loss": 0.6599, - "mean_token_accuracy": 0.7938548773527145, - "num_tokens": 942601267.0, + "epoch": 1.4066471163245358, + "grad_norm": 0.23506138046697497, + "learning_rate": 2.9351918569783006e-05, + "loss": 0.4364, + "mean_token_accuracy": 0.8576699584722519, + "num_tokens": 908371284.0, "step": 990 }, { - "epoch": 1.4135725706164506, - "grad_norm": 0.38419237136120965, - "learning_rate": 5.785625263824531e-06, - "loss": 0.6552, - "mean_token_accuracy": 0.7948469713330268, - "num_tokens": 947375335.0, + "epoch": 1.4137563316448947, + "grad_norm": 0.25438867805085685, + "learning_rate": 2.892812631912265e-05, + "loss": 0.4349, + "mean_token_accuracy": 0.8586409255862236, + "num_tokens": 912978481.0, "step": 995 }, { - "epoch": 1.4206786285308226, - "grad_norm": 0.38108767014595607, - "learning_rate": 5.701557326753375e-06, - "loss": 0.6504, - "mean_token_accuracy": 0.7960710853338242, - "num_tokens": 952105402.0, + "epoch": 1.4208655469652536, + "grad_norm": 0.24429497699288996, + "learning_rate": 2.8507786633766877e-05, + "loss": 0.4354, + "mean_token_accuracy": 0.8573046490550041, + "num_tokens": 917574029.0, "step": 1000 }, { - "epoch": 1.4206786285308226, - "eval_loss": 0.6552348136901855, - "eval_mean_token_accuracy": 0.7893575147578591, - "eval_num_tokens": 952105402.0, - "eval_runtime": 149.5045, - "eval_samples_per_second": 24.34, - "eval_steps_per_second": 0.763, + "epoch": 1.4208655469652536, + "eval_loss": 0.47304314374923706, + "eval_mean_token_accuracy": 0.842672534156264, + "eval_num_tokens": 917574029.0, + "eval_runtime": 145.3562, + "eval_samples_per_second": 25.028, + "eval_steps_per_second": 0.784, "step": 1000 }, { - "epoch": 1.4277846864451944, - "grad_norm": 0.3429174906843955, - "learning_rate": 5.6181910354314265e-06, - "loss": 0.6596, - "mean_token_accuracy": 0.7940759062767029, - "num_tokens": 956874826.0, + "epoch": 1.4279747622856127, + "grad_norm": 0.24463063083449332, + "learning_rate": 2.809095517715713e-05, + "loss": 0.4303, + "mean_token_accuracy": 0.858917984366417, + "num_tokens": 922160147.0, "step": 1005 }, { - "epoch": 1.4348907443595666, - "grad_norm": 0.37307240441832, - "learning_rate": 5.5355374296316e-06, - "loss": 0.6589, - "mean_token_accuracy": 0.7940549589693546, - "num_tokens": 961632193.0, + "epoch": 1.4350839776059718, + "grad_norm": 0.24348846567727375, + "learning_rate": 2.7677687148157998e-05, + "loss": 0.4367, + "mean_token_accuracy": 0.8577364660799504, + "num_tokens": 926746028.0, "step": 1010 }, { - "epoch": 1.4419968022739384, - "grad_norm": 0.3463607871324184, - "learning_rate": 5.4536074547495055e-06, - "loss": 0.6576, - "mean_token_accuracy": 0.7948333404958248, - "num_tokens": 966392410.0, + "epoch": 1.4421931929263307, + "grad_norm": 0.24745049020205356, + "learning_rate": 2.7268037273747525e-05, + "loss": 0.4368, + "mean_token_accuracy": 0.857840034365654, + "num_tokens": 931337261.0, "step": 1015 }, { - "epoch": 1.4491028601883105, - "grad_norm": 0.34652042003246536, - "learning_rate": 5.372411960353996e-06, - "loss": 0.6636, - "mean_token_accuracy": 0.7924063883721828, - "num_tokens": 971170949.0, + "epoch": 1.4493024082466897, + "grad_norm": 0.2439587698234042, + "learning_rate": 2.686205980176998e-05, + "loss": 0.4447, + "mean_token_accuracy": 0.8548872321844101, + "num_tokens": 935941769.0, "step": 1020 }, { - "epoch": 1.4562089181026825, - "grad_norm": 0.33480848996072166, - "learning_rate": 5.2919616987504205e-06, - "loss": 0.6436, - "mean_token_accuracy": 0.7979453206062317, - "num_tokens": 975920452.0, + "epoch": 1.4564116235670488, + "grad_norm": 0.25142114078442956, + "learning_rate": 2.6459808493752102e-05, + "loss": 0.4284, + "mean_token_accuracy": 0.8603815868496895, + "num_tokens": 940535643.0, "step": 1025 }, { - "epoch": 1.4633149760170545, - "grad_norm": 0.35444982881545845, - "learning_rate": 5.212267323556754e-06, - "loss": 0.6488, - "mean_token_accuracy": 0.7975021339952946, - "num_tokens": 980657772.0, + "epoch": 1.463520838887408, + "grad_norm": 0.2444154895688051, + "learning_rate": 2.606133661778377e-05, + "loss": 0.4368, + "mean_token_accuracy": 0.8575351513922215, + "num_tokens": 945124519.0, "step": 1030 }, { - "epoch": 1.4704210339314265, - "grad_norm": 0.3236146641950486, - "learning_rate": 5.1333393882927776e-06, - "loss": 0.6656, - "mean_token_accuracy": 0.7911154888570309, - "num_tokens": 985424225.0, + "epoch": 1.4706300542077668, + "grad_norm": 0.2397327728518288, + "learning_rate": 2.5666696941463885e-05, + "loss": 0.4307, + "mean_token_accuracy": 0.8594269149005413, + "num_tokens": 949709974.0, "step": 1035 }, { - "epoch": 1.4775270918457986, - "grad_norm": 0.36258575664825715, - "learning_rate": 5.055188344982549e-06, - "loss": 0.653, - "mean_token_accuracy": 0.7950214244425297, - "num_tokens": 990170268.0, + "epoch": 1.4777392695281257, + "grad_norm": 0.3077470484547689, + "learning_rate": 2.5275941724912743e-05, + "loss": 0.4288, + "mean_token_accuracy": 0.8588724002242089, + "num_tokens": 954294899.0, "step": 1040 }, { - "epoch": 1.4846331497601706, - "grad_norm": 0.3626547592700128, - "learning_rate": 4.977824542770279e-06, - "loss": 0.6645, - "mean_token_accuracy": 0.7932340361177921, - "num_tokens": 994933612.0, + "epoch": 1.4848484848484849, + "grad_norm": 0.24584716924955974, + "learning_rate": 2.4889122713851394e-05, + "loss": 0.4304, + "mean_token_accuracy": 0.8590269833803177, + "num_tokens": 958889833.0, "step": 1045 }, { - "epoch": 1.4917392076745426, - "grad_norm": 0.3424014804959087, - "learning_rate": 4.901258226549855e-06, - "loss": 0.6499, - "mean_token_accuracy": 0.7964041963219642, - "num_tokens": 999695033.0, + "epoch": 1.491957700168844, + "grad_norm": 0.24260820183680837, + "learning_rate": 2.4506291132749272e-05, + "loss": 0.4322, + "mean_token_accuracy": 0.8588926158845425, + "num_tokens": 963479630.0, "step": 1050 }, { - "epoch": 1.4988452655889146, - "grad_norm": 0.3457300931458133, - "learning_rate": 4.825499535608169e-06, - "loss": 0.659, - "mean_token_accuracy": 0.7942204736173153, - "num_tokens": 1004453306.0, + "epoch": 1.499066915489203, + "grad_norm": 0.2512439219193439, + "learning_rate": 2.4127497678040846e-05, + "loss": 0.4338, + "mean_token_accuracy": 0.8590321697294712, + "num_tokens": 968086693.0, "step": 1055 }, { - "epoch": 1.5059513235032864, - "grad_norm": 0.407404649365606, - "learning_rate": 4.750558502282403e-06, - "loss": 0.6466, - "mean_token_accuracy": 0.7969782948493958, - "num_tokens": 1009222958.0, + "epoch": 1.5061761308095618, + "grad_norm": 0.25788120133019554, + "learning_rate": 2.375279251141201e-05, + "loss": 0.4302, + "mean_token_accuracy": 0.8599278099834919, + "num_tokens": 972668807.0, "step": 1060 }, { - "epoch": 1.5130573814176587, - "grad_norm": 0.36455466451492874, - "learning_rate": 4.676445050631517e-06, - "loss": 0.6669, - "mean_token_accuracy": 0.7919491566717625, - "num_tokens": 1013988411.0, + "epoch": 1.513285346129921, + "grad_norm": 0.24857387974370135, + "learning_rate": 2.338222525315758e-05, + "loss": 0.4371, + "mean_token_accuracy": 0.8579599760472775, + "num_tokens": 977267842.0, "step": 1065 }, { - "epoch": 1.5201634393320305, - "grad_norm": 0.3352428252596833, - "learning_rate": 4.603168995122048e-06, - "loss": 0.653, - "mean_token_accuracy": 0.7959543123841286, - "num_tokens": 1018736541.0, + "epoch": 1.52039456145028, + "grad_norm": 0.24022880991860499, + "learning_rate": 2.301584497561024e-05, + "loss": 0.4234, + "mean_token_accuracy": 0.862085721641779, + "num_tokens": 981857003.0, "step": 1070 }, { - "epoch": 1.5272694972464027, - "grad_norm": 0.336146168164258, - "learning_rate": 4.530740039328427e-06, - "loss": 0.6527, - "mean_token_accuracy": 0.795566051453352, - "num_tokens": 1023492540.0, + "epoch": 1.527503776770639, + "grad_norm": 0.27120541109477303, + "learning_rate": 2.2653700196642134e-05, + "loss": 0.4396, + "mean_token_accuracy": 0.857264555990696, + "num_tokens": 986456929.0, "step": 1075 }, { - "epoch": 1.5343755551607745, - "grad_norm": 0.35998475713849004, - "learning_rate": 4.4591677746479935e-06, - "loss": 0.6542, - "mean_token_accuracy": 0.7954114884138107, - "num_tokens": 1028251642.0, + "epoch": 1.5346129920909979, + "grad_norm": 0.24114703590240177, + "learning_rate": 2.2295838873239965e-05, + "loss": 0.4296, + "mean_token_accuracy": 0.8604548752307892, + "num_tokens": 991061372.0, "step": 1080 }, { - "epoch": 1.5414816130751465, - "grad_norm": 0.4210390284353885, - "learning_rate": 4.38846167903085e-06, - "loss": 0.6501, - "mean_token_accuracy": 0.7963161066174507, - "num_tokens": 1033004523.0, + "epoch": 1.541722207411357, + "grad_norm": 0.23963844839444817, + "learning_rate": 2.194230839515425e-05, + "loss": 0.4336, + "mean_token_accuracy": 0.8584208697080612, + "num_tokens": 995660319.0, "step": 1085 }, { - "epoch": 1.5485876709895185, - "grad_norm": 0.4960482537413715, - "learning_rate": 4.318631115724741e-06, - "loss": 0.6553, - "mean_token_accuracy": 0.7946652464568615, - "num_tokens": 1037760729.0, + "epoch": 1.5488314227317161, + "grad_norm": 0.24314988814533856, + "learning_rate": 2.1593155578623702e-05, + "loss": 0.4306, + "mean_token_accuracy": 0.8601135425269604, + "num_tokens": 1000236933.0, "step": 1090 }, { - "epoch": 1.5556937289038906, - "grad_norm": 0.3799332566110809, - "learning_rate": 4.2496853320351424e-06, - "loss": 0.6607, - "mean_token_accuracy": 0.7947109803557396, - "num_tokens": 1042523723.0, + "epoch": 1.555940638052075, + "grad_norm": 0.2566886574453899, + "learning_rate": 2.1248426660175713e-05, + "loss": 0.4384, + "mean_token_accuracy": 0.8573588460683823, + "num_tokens": 1004820862.0, "step": 1095 }, { - "epoch": 1.5627997868182626, - "grad_norm": 0.33946558046244235, - "learning_rate": 4.1816334581006656e-06, - "loss": 0.6651, - "mean_token_accuracy": 0.792590418457985, - "num_tokens": 1047291640.0, + "epoch": 1.563049853372434, + "grad_norm": 0.2621075128506793, + "learning_rate": 2.0908167290503326e-05, + "loss": 0.4298, + "mean_token_accuracy": 0.8607131637632847, + "num_tokens": 1009411521.0, "step": 1100 }, { - "epoch": 1.5627997868182626, - "eval_loss": 0.6525910496711731, - "eval_mean_token_accuracy": 0.7899543700510996, - "eval_num_tokens": 1047291640.0, - "eval_runtime": 150.3086, - "eval_samples_per_second": 24.21, - "eval_steps_per_second": 0.758, + "epoch": 1.563049853372434, + "eval_loss": 0.4672245681285858, + "eval_mean_token_accuracy": 0.844007690747579, + "eval_num_tokens": 1009411521.0, + "eval_runtime": 146.3617, + "eval_samples_per_second": 24.856, + "eval_steps_per_second": 0.779, "step": 1100 }, { - "epoch": 1.5699058447326346, - "grad_norm": 0.32287060668756534, - "learning_rate": 4.114484505684019e-06, - "loss": 0.6541, - "mean_token_accuracy": 0.7952132284641266, - "num_tokens": 1052042031.0, + "epoch": 1.570159068692793, + "grad_norm": 0.23570827346042514, + "learning_rate": 2.0572422528420095e-05, + "loss": 0.4206, + "mean_token_accuracy": 0.8622309692203999, + "num_tokens": 1013995376.0, "step": 1105 }, { - "epoch": 1.5770119026470066, - "grad_norm": 0.3370631693518313, - "learning_rate": 4.048247366978606e-06, - "loss": 0.658, - "mean_token_accuracy": 0.7935857936739922, - "num_tokens": 1056804804.0, + "epoch": 1.577268284013152, + "grad_norm": 0.28786088105829327, + "learning_rate": 2.024123683489303e-05, + "loss": 0.4195, + "mean_token_accuracy": 0.8634026922285557, + "num_tokens": 1018562407.0, "step": 1110 }, { - "epoch": 1.5841179605613784, - "grad_norm": 0.414769500614358, - "learning_rate": 3.9829308134309995e-06, - "loss": 0.6475, - "mean_token_accuracy": 0.7969807527959347, - "num_tokens": 1061577783.0, + "epoch": 1.584377499333511, + "grad_norm": 0.22477409346403396, + "learning_rate": 1.9914654067154996e-05, + "loss": 0.4345, + "mean_token_accuracy": 0.8584335811436177, + "num_tokens": 1023168118.0, "step": 1115 }, { - "epoch": 1.5912240184757507, - "grad_norm": 0.35600756916022547, - "learning_rate": 3.9185434945793725e-06, - "loss": 0.6559, - "mean_token_accuracy": 0.7951311826705932, - "num_tokens": 1066355020.0, + "epoch": 1.59148671465387, + "grad_norm": 0.24599345473106599, + "learning_rate": 1.959271747289686e-05, + "loss": 0.4278, + "mean_token_accuracy": 0.8616135574877262, + "num_tokens": 1027754848.0, "step": 1120 }, { - "epoch": 1.5983300763901225, - "grad_norm": 0.36188029855593157, - "learning_rate": 3.855093936908081e-06, - "loss": 0.6664, - "mean_token_accuracy": 0.7921065390110016, - "num_tokens": 1071139121.0, + "epoch": 1.5985959299742292, + "grad_norm": 0.24491593894054278, + "learning_rate": 1.9275469684540404e-05, + "loss": 0.4294, + "mean_token_accuracy": 0.8590353332459927, + "num_tokens": 1032347251.0, "step": 1125 }, { - "epoch": 1.6054361343044947, - "grad_norm": 0.3632538405728989, - "learning_rate": 3.7925905427185504e-06, - "loss": 0.6569, - "mean_token_accuracy": 0.7936886362731457, - "num_tokens": 1075914044.0, + "epoch": 1.605705145294588, + "grad_norm": 0.2540751338276317, + "learning_rate": 1.8962952713592752e-05, + "loss": 0.4242, + "mean_token_accuracy": 0.8608104437589645, + "num_tokens": 1036931829.0, "step": 1130 }, { - "epoch": 1.6125421922188665, - "grad_norm": 0.3669437696101656, - "learning_rate": 3.7310415890166e-06, - "loss": 0.6512, - "mean_token_accuracy": 0.7960372731089592, - "num_tokens": 1080682478.0, + "epoch": 1.612814360614947, + "grad_norm": 0.2510287685288083, + "learning_rate": 1.8655207945083e-05, + "loss": 0.4239, + "mean_token_accuracy": 0.8617179103195667, + "num_tokens": 1041532224.0, "step": 1135 }, { - "epoch": 1.6196482501332385, - "grad_norm": 0.3331543509156551, - "learning_rate": 3.6704552264163695e-06, - "loss": 0.6561, - "mean_token_accuracy": 0.7935027062892914, - "num_tokens": 1085456231.0, + "epoch": 1.6199235759353061, + "grad_norm": 0.2693350827409704, + "learning_rate": 1.8352276132081847e-05, + "loss": 0.4357, + "mean_token_accuracy": 0.8589904353022575, + "num_tokens": 1046120676.0, "step": 1140 }, { - "epoch": 1.6267543080476106, - "grad_norm": 0.3299992523767914, - "learning_rate": 3.6108394780609513e-06, - "loss": 0.6506, - "mean_token_accuracy": 0.7957184061408042, - "num_tokens": 1090215557.0, + "epoch": 1.6270327912556652, + "grad_norm": 0.24443054034299724, + "learning_rate": 1.8054197390304755e-05, + "loss": 0.4275, + "mean_token_accuracy": 0.8615889854729175, + "num_tokens": 1050708153.0, "step": 1145 }, { - "epoch": 1.6338603659619826, - "grad_norm": 0.3450779481067245, - "learning_rate": 3.552202238559953e-06, - "loss": 0.6429, - "mean_token_accuracy": 0.798128329962492, - "num_tokens": 1094959524.0, + "epoch": 1.6341420065760242, + "grad_norm": 0.24588007040764026, + "learning_rate": 1.7761011192799764e-05, + "loss": 0.4238, + "mean_token_accuracy": 0.8622479006648064, + "num_tokens": 1055294826.0, "step": 1150 }, { - "epoch": 1.6409664238763546, - "grad_norm": 0.3531321257611881, - "learning_rate": 3.4945512729440413e-06, - "loss": 0.6503, - "mean_token_accuracy": 0.7954187601804733, - "num_tokens": 1099731395.0, + "epoch": 1.641251221896383, + "grad_norm": 0.24561473837992528, + "learning_rate": 1.7472756364720206e-05, + "loss": 0.4243, + "mean_token_accuracy": 0.8616314500570297, + "num_tokens": 1059896792.0, "step": 1155 }, { - "epoch": 1.6480724817907266, - "grad_norm": 0.3498741705682094, - "learning_rate": 3.437894215636661e-06, - "loss": 0.6578, - "mean_token_accuracy": 0.7941137261688709, - "num_tokens": 1104494157.0, + "epoch": 1.6483604372167422, + "grad_norm": 0.23202476301237993, + "learning_rate": 1.7189471078183302e-05, + "loss": 0.4313, + "mean_token_accuracy": 0.860023857653141, + "num_tokens": 1064504870.0, "step": 1160 }, { - "epoch": 1.6551785397050987, - "grad_norm": 0.3777101584751149, - "learning_rate": 3.382238569443045e-06, - "loss": 0.6529, - "mean_token_accuracy": 0.7957448020577431, - "num_tokens": 1109252674.0, + "epoch": 1.6554696525371013, + "grad_norm": 0.2403111932989795, + "learning_rate": 1.6911192847215225e-05, + "loss": 0.4315, + "mean_token_accuracy": 0.85991101115942, + "num_tokens": 1069092813.0, "step": 1165 }, { - "epoch": 1.6622845976194704, - "grad_norm": 0.3915263029819705, - "learning_rate": 3.3275917045566596e-06, - "loss": 0.6517, - "mean_token_accuracy": 0.7957381546497345, - "num_tokens": 1114004017.0, + "epoch": 1.6625788678574602, + "grad_norm": 0.23285052418281263, + "learning_rate": 1.6637958522783298e-05, + "loss": 0.4286, + "mean_token_accuracy": 0.8603983536362648, + "num_tokens": 1073673087.0, "step": 1170 }, { - "epoch": 1.6693906555338427, - "grad_norm": 0.33568853526043657, - "learning_rate": 3.2739608575832056e-06, - "loss": 0.6412, - "mean_token_accuracy": 0.7980836987495422, - "num_tokens": 1118768157.0, + "epoch": 1.6696880831778191, + "grad_norm": 0.23644436345090544, + "learning_rate": 1.6369804287916028e-05, + "loss": 0.4237, + "mean_token_accuracy": 0.8625174552202225, + "num_tokens": 1078263989.0, "step": 1175 }, { - "epoch": 1.6764967134482145, - "grad_norm": 0.34752341789739466, - "learning_rate": 3.2213531305823125e-06, - "loss": 0.6613, - "mean_token_accuracy": 0.7935202896595002, - "num_tokens": 1123535145.0, + "epoch": 1.6767972984981783, + "grad_norm": 0.2283809036559784, + "learning_rate": 1.6106765652911563e-05, + "loss": 0.4196, + "mean_token_accuracy": 0.8629219397902489, + "num_tokens": 1082858600.0, "step": 1180 }, { - "epoch": 1.6836027713625867, - "grad_norm": 0.34083716495360583, - "learning_rate": 3.1697754901270477e-06, - "loss": 0.6507, - "mean_token_accuracy": 0.7964440450072289, - "num_tokens": 1128307445.0, + "epoch": 1.6839065138185374, + "grad_norm": 0.2437421457507895, + "learning_rate": 1.5848877450635237e-05, + "loss": 0.431, + "mean_token_accuracy": 0.8596989519894123, + "num_tokens": 1087463215.0, "step": 1185 }, { - "epoch": 1.6907088292769585, - "grad_norm": 0.3774093707155365, - "learning_rate": 3.1192347663813684e-06, - "loss": 0.6547, - "mean_token_accuracy": 0.7946882367134094, - "num_tokens": 1133071242.0, + "epoch": 1.6910157291388963, + "grad_norm": 0.24997191755310427, + "learning_rate": 1.559617383190684e-05, + "loss": 0.4258, + "mean_token_accuracy": 0.8600839108228684, + "num_tokens": 1092046691.0, "step": 1190 }, { - "epoch": 1.6978148871913306, - "grad_norm": 0.36333647453863616, - "learning_rate": 3.0697376521956377e-06, - "loss": 0.6526, - "mean_token_accuracy": 0.7956908911466598, - "num_tokens": 1137831284.0, + "epoch": 1.6981249444592552, + "grad_norm": 0.24275510902589129, + "learning_rate": 1.5348688260978188e-05, + "loss": 0.4198, + "mean_token_accuracy": 0.8634254619479179, + "num_tokens": 1096635412.0, "step": 1195 }, { - "epoch": 1.7049209451057026, - "grad_norm": 0.33780601240351715, - "learning_rate": 3.021290702220331e-06, - "loss": 0.6561, - "mean_token_accuracy": 0.7948304824531078, - "num_tokens": 1142587626.0, + "epoch": 1.7052341597796143, + "grad_norm": 0.25771028141912433, + "learning_rate": 1.5106453511101657e-05, + "loss": 0.4198, + "mean_token_accuracy": 0.8630197443068027, + "num_tokens": 1101239957.0, "step": 1200 }, { - "epoch": 1.7049209451057026, - "eval_loss": 0.6507056355476379, - "eval_mean_token_accuracy": 0.790492679466281, - "eval_num_tokens": 1142587626.0, - "eval_runtime": 149.5446, - "eval_samples_per_second": 24.334, - "eval_steps_per_second": 0.762, + "epoch": 1.7052341597796143, + "eval_loss": 0.4617161452770233, + "eval_mean_token_accuracy": 0.8460459296117749, + "eval_num_tokens": 1101239957.0, + "eval_runtime": 143.0225, + "eval_samples_per_second": 25.437, + "eval_steps_per_second": 0.797, "step": 1200 }, { - "epoch": 1.7120270030200746, - "grad_norm": 0.3537743025090112, - "learning_rate": 2.9739003320380237e-06, - "loss": 0.6624, - "mean_token_accuracy": 0.793489520996809, - "num_tokens": 1147357460.0, + "epoch": 1.7123433750999735, + "grad_norm": 0.2465846462175401, + "learning_rate": 1.4869501660190118e-05, + "loss": 0.4269, + "mean_token_accuracy": 0.8613091327250004, + "num_tokens": 1105835727.0, "step": 1205 }, { - "epoch": 1.7191330609344466, - "grad_norm": 0.4133446002595886, - "learning_rate": 2.927572817313823e-06, - "loss": 0.6585, - "mean_token_accuracy": 0.7936319254338742, - "num_tokens": 1152138440.0, + "epoch": 1.7194525904203324, + "grad_norm": 0.24343231445496366, + "learning_rate": 1.4637864086569114e-05, + "loss": 0.4189, + "mean_token_accuracy": 0.8625466778874398, + "num_tokens": 1110431832.0, "step": 1210 }, { - "epoch": 1.7262391188488186, - "grad_norm": 0.3563260686728207, - "learning_rate": 2.8823142929643043e-06, - "loss": 0.6426, - "mean_token_accuracy": 0.797927625477314, - "num_tokens": 1156890428.0, + "epoch": 1.7265618057406913, + "grad_norm": 0.24500024608031826, + "learning_rate": 1.4411571464821522e-05, + "loss": 0.4178, + "mean_token_accuracy": 0.8632443450391293, + "num_tokens": 1115003545.0, "step": 1215 }, { - "epoch": 1.7333451767631907, - "grad_norm": 0.3989494711298427, - "learning_rate": 2.838130752345092e-06, - "loss": 0.6582, - "mean_token_accuracy": 0.7947382763028145, - "num_tokens": 1161657895.0, + "epoch": 1.7336710210610504, + "grad_norm": 0.24384954499049283, + "learning_rate": 1.4190653761725458e-05, + "loss": 0.4331, + "mean_token_accuracy": 0.8595723591744899, + "num_tokens": 1119594038.0, "step": 1220 }, { - "epoch": 1.7404512346775625, - "grad_norm": 0.3432409644849126, - "learning_rate": 2.7950280464572066e-06, - "loss": 0.6541, - "mean_token_accuracy": 0.7953485876321793, - "num_tokens": 1166423043.0, + "epoch": 1.7407802363814096, + "grad_norm": 0.24988962843301607, + "learning_rate": 1.3975140232286033e-05, + "loss": 0.4292, + "mean_token_accuracy": 0.8610283821821213, + "num_tokens": 1124191272.0, "step": 1225 }, { - "epoch": 1.7475572925919347, - "grad_norm": 0.3368844710605464, - "learning_rate": 2.7530118831722286e-06, - "loss": 0.6481, - "mean_token_accuracy": 0.796825060248375, - "num_tokens": 1171166643.0, + "epoch": 1.7478894517017685, + "grad_norm": 0.23666630913921613, + "learning_rate": 1.3765059415861142e-05, + "loss": 0.4256, + "mean_token_accuracy": 0.8612963631749153, + "num_tokens": 1128787024.0, "step": 1230 }, { - "epoch": 1.7546633505063065, - "grad_norm": 0.45251140872957446, - "learning_rate": 2.7120878264764437e-06, - "loss": 0.6473, - "mean_token_accuracy": 0.7977175071835518, - "num_tokens": 1175924107.0, + "epoch": 1.7549986670221274, + "grad_norm": 0.24377997978707636, + "learning_rate": 1.3560439132382218e-05, + "loss": 0.4249, + "mean_token_accuracy": 0.8616208277642727, + "num_tokens": 1133369468.0, "step": 1235 }, { - "epoch": 1.7617694084206788, - "grad_norm": 0.3545121531531998, - "learning_rate": 2.67226129573403e-06, - "loss": 0.6512, - "mean_token_accuracy": 0.7960038974881172, - "num_tokens": 1180681893.0, + "epoch": 1.7621078823424865, + "grad_norm": 0.24473326280197544, + "learning_rate": 1.336130647867015e-05, + "loss": 0.4233, + "mean_token_accuracy": 0.8611096739768982, + "num_tokens": 1137960753.0, "step": 1240 }, { - "epoch": 1.7688754663350506, - "grad_norm": 0.3323525365320921, - "learning_rate": 2.633537564969398e-06, - "loss": 0.6557, - "mean_token_accuracy": 0.7952632494270802, - "num_tokens": 1185447027.0, + "epoch": 1.7692170976628456, + "grad_norm": 0.2814923829698822, + "learning_rate": 1.3167687824846988e-05, + "loss": 0.4345, + "mean_token_accuracy": 0.8590093135833741, + "num_tokens": 1142557989.0, "step": 1245 }, { - "epoch": 1.7759815242494228, - "grad_norm": 0.34600724842490793, - "learning_rate": 2.5959217621687823e-06, - "loss": 0.6608, - "mean_token_accuracy": 0.7938597463071346, - "num_tokens": 1190231791.0, + "epoch": 1.7763263129832043, + "grad_norm": 0.24671237642090413, + "learning_rate": 1.297960881084391e-05, + "loss": 0.4136, + "mean_token_accuracy": 0.8641826197504997, + "num_tokens": 1147139033.0, "step": 1250 }, { - "epoch": 1.7830875821637946, - "grad_norm": 0.3678379104234008, - "learning_rate": 2.5594188686011616e-06, - "loss": 0.6541, - "mean_token_accuracy": 0.7947786100208759, - "num_tokens": 1194998688.0, + "epoch": 1.7834355283035634, + "grad_norm": 0.23802525665842986, + "learning_rate": 1.2797094343005807e-05, + "loss": 0.4212, + "mean_token_accuracy": 0.8627298250794411, + "num_tokens": 1151728912.0, "step": 1255 }, { - "epoch": 1.7901936400781666, - "grad_norm": 0.36986419243022695, - "learning_rate": 2.524033718158621e-06, - "loss": 0.6492, - "mean_token_accuracy": 0.7966626077890396, - "num_tokens": 1199764688.0, + "epoch": 1.7905447436239226, + "grad_norm": 0.24514167574215462, + "learning_rate": 1.2620168590793105e-05, + "loss": 0.4243, + "mean_token_accuracy": 0.8623115479946136, + "num_tokens": 1156315343.0, "step": 1260 }, { - "epoch": 1.7972996979925386, - "grad_norm": 0.36634691155162535, - "learning_rate": 2.489770996716227e-06, - "loss": 0.6549, - "mean_token_accuracy": 0.7945116639137269, - "num_tokens": 1204526423.0, + "epoch": 1.7976539589442815, + "grad_norm": 0.24177052216503225, + "learning_rate": 1.2448854983581134e-05, + "loss": 0.4205, + "mean_token_accuracy": 0.8636125177145004, + "num_tokens": 1160905222.0, "step": 1265 }, { - "epoch": 1.8044057559069107, - "grad_norm": 0.3592637763107603, - "learning_rate": 2.456635241511491e-06, - "loss": 0.6436, - "mean_token_accuracy": 0.7984024800360203, - "num_tokens": 1209280668.0, + "epoch": 1.8047631742646404, + "grad_norm": 0.25623340057701793, + "learning_rate": 1.2283176207557455e-05, + "loss": 0.4204, + "mean_token_accuracy": 0.863289151340723, + "num_tokens": 1165469584.0, "step": 1270 }, { - "epoch": 1.8115118138212827, - "grad_norm": 0.3637888435360703, - "learning_rate": 2.4246308405435314e-06, - "loss": 0.6503, - "mean_token_accuracy": 0.7954847238957882, - "num_tokens": 1214048139.0, + "epoch": 1.8118723895849995, + "grad_norm": 0.2366529819101992, + "learning_rate": 1.2123154202717656e-05, + "loss": 0.4205, + "mean_token_accuracy": 0.8623673833906651, + "num_tokens": 1170087058.0, "step": 1275 }, { - "epoch": 1.8186178717356547, - "grad_norm": 0.3515690726134511, - "learning_rate": 2.3937620319919966e-06, - "loss": 0.6471, - "mean_token_accuracy": 0.7975172877311707, - "num_tokens": 1218805359.0, + "epoch": 1.8189816049053587, + "grad_norm": 0.23815408906221286, + "learning_rate": 1.1968810159959982e-05, + "loss": 0.4167, + "mean_token_accuracy": 0.8636409521102906, + "num_tokens": 1174675450.0, "step": 1280 }, { - "epoch": 1.8257239296500267, - "grad_norm": 0.37901691417441497, - "learning_rate": 2.3640329036558167e-06, - "loss": 0.6458, - "mean_token_accuracy": 0.7973252393305301, - "num_tokens": 1223580683.0, + "epoch": 1.8260908202257176, + "grad_norm": 0.25161717096488057, + "learning_rate": 1.1820164518279083e-05, + "loss": 0.4308, + "mean_token_accuracy": 0.8603747352957726, + "num_tokens": 1179252086.0, "step": 1285 }, { - "epoch": 1.8328299875643985, - "grad_norm": 0.48078408366926273, - "learning_rate": 2.3354473924118843e-06, - "loss": 0.6517, - "mean_token_accuracy": 0.7954902827739716, - "num_tokens": 1228344380.0, + "epoch": 1.8332000355460765, + "grad_norm": 0.23828924023109987, + "learning_rate": 1.1677236962059421e-05, + "loss": 0.4161, + "mean_token_accuracy": 0.8636845953762531, + "num_tokens": 1183846581.0, "step": 1290 }, { - "epoch": 1.8399360454787708, - "grad_norm": 0.3487084891659478, - "learning_rate": 2.3080092836937124e-06, - "loss": 0.649, - "mean_token_accuracy": 0.7968501009047031, - "num_tokens": 1233124681.0, + "epoch": 1.8403092508664356, + "grad_norm": 0.2389439298878492, + "learning_rate": 1.1540046418468561e-05, + "loss": 0.4093, + "mean_token_accuracy": 0.8666847251355648, + "num_tokens": 1188439447.0, "step": 1295 }, { - "epoch": 1.8470421033931426, - "grad_norm": 0.34640131501737065, - "learning_rate": 2.2817222109901442e-06, - "loss": 0.6448, - "mean_token_accuracy": 0.7978550389409065, - "num_tokens": 1237873166.0, + "epoch": 1.8474184661867947, + "grad_norm": 0.26036762406039, + "learning_rate": 1.1408611054950722e-05, + "loss": 0.4187, + "mean_token_accuracy": 0.8630855195224285, + "num_tokens": 1193031482.0, "step": 1300 }, { - "epoch": 1.8470421033931426, - "eval_loss": 0.6490960121154785, - "eval_mean_token_accuracy": 0.7908800155447241, - "eval_num_tokens": 1237873166.0, - "eval_runtime": 149.9569, - "eval_samples_per_second": 24.267, - "eval_steps_per_second": 0.76, + "epoch": 1.8474184661867947, + "eval_loss": 0.45738622546195984, + "eval_mean_token_accuracy": 0.847679163803134, + "eval_num_tokens": 1193031482.0, + "eval_runtime": 143.6355, + "eval_samples_per_second": 25.328, + "eval_steps_per_second": 0.794, "step": 1300 }, { - "epoch": 1.8541481613075148, - "grad_norm": 0.3243906306128693, - "learning_rate": 2.256589655364193e-06, - "loss": 0.6593, - "mean_token_accuracy": 0.7929202131927013, - "num_tokens": 1242627340.0, + "epoch": 1.8545276815071536, + "grad_norm": 0.2419491832206913, + "learning_rate": 1.1282948276820963e-05, + "loss": 0.4223, + "mean_token_accuracy": 0.8626484178006649, + "num_tokens": 1197621510.0, "step": 1305 }, { - "epoch": 1.8612542192218866, - "grad_norm": 0.37597198041798413, - "learning_rate": 2.2326149449920653e-06, - "loss": 0.6446, - "mean_token_accuracy": 0.797098808735609, - "num_tokens": 1247387461.0, + "epoch": 1.8616368968275125, + "grad_norm": 0.2366717377397619, + "learning_rate": 1.1163074724960326e-05, + "loss": 0.4202, + "mean_token_accuracy": 0.8629304811358451, + "num_tokens": 1202214988.0, "step": 1310 }, { - "epoch": 1.8683602771362586, - "grad_norm": 0.35265594906604686, - "learning_rate": 2.2098012547224197e-06, - "loss": 0.6513, - "mean_token_accuracy": 0.7950267992913723, - "num_tokens": 1252135688.0, + "epoch": 1.8687461121478717, + "grad_norm": 0.24750576690261594, + "learning_rate": 1.10490062736121e-05, + "loss": 0.4159, + "mean_token_accuracy": 0.8640658937394619, + "num_tokens": 1206801749.0, "step": 1315 }, { - "epoch": 1.8754663350506307, - "grad_norm": 0.3583812845832173, - "learning_rate": 2.188151605655942e-06, - "loss": 0.6521, - "mean_token_accuracy": 0.7945805780589581, - "num_tokens": 1256903702.0, + "epoch": 1.8758553274682308, + "grad_norm": 0.2754980560042937, + "learning_rate": 1.094075802827971e-05, + "loss": 0.4224, + "mean_token_accuracy": 0.8619605071842671, + "num_tokens": 1211394066.0, "step": 1320 }, { - "epoch": 1.8825723929650027, - "grad_norm": 0.3577801661959976, - "learning_rate": 2.1676688647452795e-06, - "loss": 0.6437, - "mean_token_accuracy": 0.7986263297498226, - "num_tokens": 1261633144.0, + "epoch": 1.8829645427885897, + "grad_norm": 0.2441756409539309, + "learning_rate": 1.0838344323726395e-05, + "loss": 0.4159, + "mean_token_accuracy": 0.8641899891197682, + "num_tokens": 1215982389.0, "step": 1325 }, { - "epoch": 1.8896784508793747, - "grad_norm": 0.35744367217582623, - "learning_rate": 2.1483557444153795e-06, - "loss": 0.649, - "mean_token_accuracy": 0.7966003373265267, - "num_tokens": 1266390903.0, + "epoch": 1.8900737581089486, + "grad_norm": 0.25017331261640485, + "learning_rate": 1.0741778722076896e-05, + "loss": 0.4141, + "mean_token_accuracy": 0.864534319192171, + "num_tokens": 1220561480.0, "step": 1330 }, { - "epoch": 1.8967845087937467, - "grad_norm": 0.39747974453689555, - "learning_rate": 2.1302148022042993e-06, - "loss": 0.6491, - "mean_token_accuracy": 0.7970162339508533, - "num_tokens": 1271162270.0, + "epoch": 1.8971829734293078, + "grad_norm": 0.24928323459761015, + "learning_rate": 1.0651074011021495e-05, + "loss": 0.4148, + "mean_token_accuracy": 0.8647311642765999, + "num_tokens": 1225151015.0, "step": 1335 }, { - "epoch": 1.9038905667081187, - "grad_norm": 0.3547995480225708, - "learning_rate": 2.113248440424526e-06, - "loss": 0.643, - "mean_token_accuracy": 0.7987522542476654, - "num_tokens": 1275906083.0, + "epoch": 1.9042921887496669, + "grad_norm": 0.26117744577378244, + "learning_rate": 1.056624220212263e-05, + "loss": 0.4227, + "mean_token_accuracy": 0.8627439729869366, + "num_tokens": 1229753553.0, "step": 1340 }, { - "epoch": 1.9109966246224905, - "grad_norm": 0.3924659274346196, - "learning_rate": 2.0974589058448456e-06, - "loss": 0.6499, - "mean_token_accuracy": 0.7970600210130214, - "num_tokens": 1280649985.0, + "epoch": 1.9114014040700258, + "grad_norm": 0.250926981430339, + "learning_rate": 1.048729452922423e-05, + "loss": 0.4118, + "mean_token_accuracy": 0.8654024370014668, + "num_tokens": 1234324722.0, "step": 1345 }, { - "epoch": 1.9181026825368628, - "grad_norm": 0.3450930632168253, - "learning_rate": 2.0828482893928208e-06, - "loss": 0.6525, - "mean_token_accuracy": 0.795515525341034, - "num_tokens": 1285434113.0, + "epoch": 1.9185106193903847, + "grad_norm": 0.26445464932369295, + "learning_rate": 1.0414241446964102e-05, + "loss": 0.4176, + "mean_token_accuracy": 0.8638374984264374, + "num_tokens": 1238945254.0, "step": 1350 }, { - "epoch": 1.9252087404512346, - "grad_norm": 0.33567083461731984, - "learning_rate": 2.069418525877897e-06, - "loss": 0.644, - "mean_token_accuracy": 0.798255106061697, - "num_tokens": 1290191830.0, + "epoch": 1.9256198347107438, + "grad_norm": 0.24942959940503223, + "learning_rate": 1.0347092629389484e-05, + "loss": 0.4098, + "mean_token_accuracy": 0.8681537143886089, + "num_tokens": 1243530120.0, "step": 1355 }, { - "epoch": 1.9323147983656068, - "grad_norm": 0.36694627449422723, - "learning_rate": 2.0571713937351834e-06, - "loss": 0.6397, - "mean_token_accuracy": 0.7977312818169594, - "num_tokens": 1294948980.0, + "epoch": 1.932729050031103, + "grad_norm": 0.25517475920539473, + "learning_rate": 1.0285856968675917e-05, + "loss": 0.4104, + "mean_token_accuracy": 0.8657238759100437, + "num_tokens": 1248126495.0, "step": 1360 }, { - "epoch": 1.9394208562799786, - "grad_norm": 0.362232508705221, - "learning_rate": 2.0461085147899497e-06, - "loss": 0.6457, - "mean_token_accuracy": 0.7973731994628906, - "num_tokens": 1299719386.0, + "epoch": 1.9398382653514619, + "grad_norm": 0.24624704699692396, + "learning_rate": 1.0230542573949747e-05, + "loss": 0.4053, + "mean_token_accuracy": 0.8677756235003471, + "num_tokens": 1252728208.0, "step": 1365 }, { - "epoch": 1.9465269141943506, - "grad_norm": 0.3669523670641092, - "learning_rate": 2.0362313540428485e-06, - "loss": 0.6472, - "mean_token_accuracy": 0.797086289525032, - "num_tokens": 1304487261.0, + "epoch": 1.9469474806718208, + "grad_norm": 0.24811417447193737, + "learning_rate": 1.0181156770214243e-05, + "loss": 0.4193, + "mean_token_accuracy": 0.8637429274618625, + "num_tokens": 1257314007.0, "step": 1370 }, { - "epoch": 1.9536329721087227, - "grad_norm": 0.37200879998568015, - "learning_rate": 2.027541219475922e-06, - "loss": 0.6475, - "mean_token_accuracy": 0.7960396580398083, - "num_tokens": 1309241194.0, + "epoch": 1.95405669599218, + "grad_norm": 0.2553291480205661, + "learning_rate": 1.013770609737961e-05, + "loss": 0.4153, + "mean_token_accuracy": 0.8649327427148819, + "num_tokens": 1261908378.0, "step": 1375 }, { - "epoch": 1.9607390300230947, - "grad_norm": 0.3688539723341265, - "learning_rate": 2.020039261879382e-06, - "loss": 0.6573, - "mean_token_accuracy": 0.7950874969363213, - "num_tokens": 1314011836.0, + "epoch": 1.961165911312539, + "grad_norm": 0.24846642652489853, + "learning_rate": 1.010019630939691e-05, + "loss": 0.4204, + "mean_token_accuracy": 0.8626691080629826, + "num_tokens": 1266492690.0, "step": 1380 }, { - "epoch": 1.9678450879374667, - "grad_norm": 0.43966459213132447, - "learning_rate": 2.013726474699225e-06, - "loss": 0.6505, - "mean_token_accuracy": 0.7958736583590508, - "num_tokens": 1318761485.0, + "epoch": 1.968275126632898, + "grad_norm": 0.24853442428779762, + "learning_rate": 1.0068632373496125e-05, + "loss": 0.4213, + "mean_token_accuracy": 0.862095658481121, + "num_tokens": 1271089050.0, "step": 1385 }, { - "epoch": 1.9749511458518387, - "grad_norm": 0.34824004564185856, - "learning_rate": 2.008603693905673e-06, - "loss": 0.6476, - "mean_token_accuracy": 0.7972124963998795, - "num_tokens": 1323527340.0, + "epoch": 1.9753843419532569, + "grad_norm": 0.25447008393745496, + "learning_rate": 1.0043018469528365e-05, + "loss": 0.4186, + "mean_token_accuracy": 0.8638553529977798, + "num_tokens": 1275693685.0, "step": 1390 }, { - "epoch": 1.9820572037662108, - "grad_norm": 0.37468360563296904, - "learning_rate": 2.0046715978824663e-06, - "loss": 0.6496, - "mean_token_accuracy": 0.7958178780972958, - "num_tokens": 1328302362.0, + "epoch": 1.982493557273616, + "grad_norm": 0.25146974784680387, + "learning_rate": 1.0023357989412332e-05, + "loss": 0.4132, + "mean_token_accuracy": 0.8654829584062099, + "num_tokens": 1280282291.0, "step": 1395 }, { - "epoch": 1.9891632616805826, - "grad_norm": 0.3561744251531493, - "learning_rate": 2.001930707337034e-06, - "loss": 0.6501, - "mean_token_accuracy": 0.7963785864412785, - "num_tokens": 1333062144.0, + "epoch": 1.9896027725939749, + "grad_norm": 0.25186861166219776, + "learning_rate": 1.000965353668517e-05, + "loss": 0.4097, + "mean_token_accuracy": 0.8660168826580048, + "num_tokens": 1284878893.0, "step": 1400 }, { - "epoch": 1.9891632616805826, - "eval_loss": 0.6482434868812561, - "eval_mean_token_accuracy": 0.7910316936802446, - "eval_num_tokens": 1333062144.0, - "eval_runtime": 149.6853, - "eval_samples_per_second": 24.311, - "eval_steps_per_second": 0.762, + "epoch": 1.9896027725939749, + "eval_loss": 0.45450538396835327, + "eval_mean_token_accuracy": 0.8486974662856052, + "eval_num_tokens": 1284878893.0, + "eval_runtime": 143.4865, + "eval_samples_per_second": 25.354, + "eval_steps_per_second": 0.794, "step": 1400 }, { - "epoch": 1.9962693195949548, - "grad_norm": 0.3552540336023921, - "learning_rate": 2.000381385231536e-06, - "loss": 0.656, - "mean_token_accuracy": 0.7951462939381599, - "num_tokens": 1337810166.0, + "epoch": 1.9967119879143338, + "grad_norm": 0.2548741967506241, + "learning_rate": 1.0001906926157681e-05, + "loss": 0.4088, + "mean_token_accuracy": 0.8670746453106404, + "num_tokens": 1289465244.0, "step": 1405 }, { "epoch": 2.0, - "mean_token_accuracy": 0.7962638111341567, - "num_tokens": 1340314150.0, + "mean_token_accuracy": 0.8681698522052249, + "num_tokens": 1291584473.0, "step": 1408, - "total_flos": 1.0314062938243072e+16, - "train_loss": 0.7184352108531378, - "train_runtime": 49977.4428, - "train_samples_per_second": 14.417, - "train_steps_per_second": 0.028 + "total_flos": 9795365997903872.0, + "train_loss": 0.5166227378120477, + "train_runtime": 48333.5779, + "train_samples_per_second": 14.899, + "train_steps_per_second": 0.029 } ], "logging_steps": 5, @@ -2707,7 +2707,7 @@ "attributes": {} } }, - "total_flos": 1.0314062938243072e+16, + "total_flos": 9795365997903872.0, "train_batch_size": 4, "trial_name": null, "trial_params": null