diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,29089 @@ +{ + "best_metric": 0.04092838987708092, + "best_model_checkpoint": "saves/CADICA_qwenvl_detect_classify_augmented/lora/sft/checkpoint-9050", + "epoch": 1.9999081192000245, + "eval_steps": 50, + "global_step": 16324, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009188079997549845, + "grad_norm": 7.056973234954522, + "learning_rate": 2.9411764705882355e-06, + "loss": 1.9191, + "num_input_tokens_seen": 50528, + "step": 5 + }, + { + "epoch": 0.001837615999509969, + "grad_norm": 5.599372737598058, + "learning_rate": 5.882352941176471e-06, + "loss": 2.0838, + "num_input_tokens_seen": 100880, + "step": 10 + }, + { + "epoch": 0.002756423999264954, + "grad_norm": 3.923505564876021, + "learning_rate": 8.823529411764707e-06, + "loss": 1.9921, + "num_input_tokens_seen": 151328, + "step": 15 + }, + { + "epoch": 0.003675231999019938, + "grad_norm": 6.613609608781249, + "learning_rate": 1.1764705882352942e-05, + "loss": 2.0412, + "num_input_tokens_seen": 200656, + "step": 20 + }, + { + "epoch": 0.004594039998774922, + "grad_norm": 4.965186485045144, + "learning_rate": 1.4705882352941177e-05, + "loss": 1.9913, + "num_input_tokens_seen": 250656, + "step": 25 + }, + { + "epoch": 0.005512847998529908, + "grad_norm": 6.947041204609111, + "learning_rate": 1.7647058823529414e-05, + "loss": 1.8291, + "num_input_tokens_seen": 300688, + "step": 30 + }, + { + "epoch": 0.006431655998284892, + "grad_norm": 4.428122870076037, + "learning_rate": 2.058823529411765e-05, + "loss": 1.6077, + "num_input_tokens_seen": 350984, + "step": 35 + }, + { + "epoch": 0.007350463998039876, + "grad_norm": 3.963503322009353, + "learning_rate": 2.3529411764705884e-05, + "loss": 1.4619, + "num_input_tokens_seen": 401200, + "step": 40 + }, + { + "epoch": 0.00826927199779486, + "grad_norm": 2.654125563007153, + "learning_rate": 2.647058823529412e-05, + "loss": 1.1743, + "num_input_tokens_seen": 452256, + "step": 45 + }, + { + "epoch": 0.009188079997549845, + "grad_norm": 2.2463614826933607, + "learning_rate": 2.9411764705882354e-05, + "loss": 1.0351, + "num_input_tokens_seen": 502368, + "step": 50 + }, + { + "epoch": 0.009188079997549845, + "eval_loss": 0.9696781039237976, + "eval_runtime": 46.7147, + "eval_samples_per_second": 1.284, + "eval_steps_per_second": 0.321, + "num_input_tokens_seen": 502368, + "step": 50 + }, + { + "epoch": 0.006737925331536553, + "grad_norm": 2.9313064922005103, + "learning_rate": 3.235294117647059e-05, + "loss": 0.9428, + "num_input_tokens_seen": 535968, + "step": 55 + }, + { + "epoch": 0.007350463998039876, + "grad_norm": 3.260215069553855, + "learning_rate": 3.529411764705883e-05, + "loss": 0.9121, + "num_input_tokens_seen": 569344, + "step": 60 + }, + { + "epoch": 0.007963002664543199, + "grad_norm": 2.4616389085641757, + "learning_rate": 3.8235294117647055e-05, + "loss": 0.915, + "num_input_tokens_seen": 603816, + "step": 65 + }, + { + "epoch": 0.008575541331046523, + "grad_norm": 1.3763238196623047, + "learning_rate": 4.11764705882353e-05, + "loss": 0.7588, + "num_input_tokens_seen": 637176, + "step": 70 + }, + { + "epoch": 0.009188079997549845, + "grad_norm": 36.34227648025132, + "learning_rate": 4.411764705882353e-05, + "loss": 0.8561, + "num_input_tokens_seen": 670512, + "step": 75 + }, + { + "epoch": 0.009800618664053169, + "grad_norm": 1.071482216851512, + "learning_rate": 4.705882352941177e-05, + "loss": 0.7766, + "num_input_tokens_seen": 704160, + "step": 80 + }, + { + "epoch": 0.010413157330556491, + "grad_norm": 3.4859797194036437, + "learning_rate": 5e-05, + "loss": 0.8268, + "num_input_tokens_seen": 738120, + "step": 85 + }, + { + "epoch": 0.011025695997059815, + "grad_norm": 1.3363836365243216, + "learning_rate": 5.294117647058824e-05, + "loss": 0.7961, + "num_input_tokens_seen": 771632, + "step": 90 + }, + { + "epoch": 0.011638234663563138, + "grad_norm": 0.852060257896713, + "learning_rate": 5.588235294117647e-05, + "loss": 0.7732, + "num_input_tokens_seen": 805328, + "step": 95 + }, + { + "epoch": 0.01225077333006646, + "grad_norm": 0.9476545112029625, + "learning_rate": 5.882352941176471e-05, + "loss": 0.8279, + "num_input_tokens_seen": 838824, + "step": 100 + }, + { + "epoch": 0.01225077333006646, + "eval_loss": 0.9583126902580261, + "eval_runtime": 47.2572, + "eval_samples_per_second": 1.27, + "eval_steps_per_second": 0.317, + "num_input_tokens_seen": 838824, + "step": 100 + }, + { + "epoch": 0.012863311996569784, + "grad_norm": 1.2292075797606308, + "learning_rate": 6.176470588235295e-05, + "loss": 0.7322, + "num_input_tokens_seen": 871656, + "step": 105 + }, + { + "epoch": 0.013475850663073106, + "grad_norm": 2.0869834555807256, + "learning_rate": 6.470588235294118e-05, + "loss": 0.7735, + "num_input_tokens_seen": 905144, + "step": 110 + }, + { + "epoch": 0.01408838932957643, + "grad_norm": 0.7712807603880864, + "learning_rate": 6.764705882352942e-05, + "loss": 0.8563, + "num_input_tokens_seen": 938656, + "step": 115 + }, + { + "epoch": 0.014700927996079752, + "grad_norm": 0.7987667122887838, + "learning_rate": 7.058823529411765e-05, + "loss": 0.8066, + "num_input_tokens_seen": 972304, + "step": 120 + }, + { + "epoch": 0.015313466662583075, + "grad_norm": 0.6808868402371856, + "learning_rate": 7.352941176470589e-05, + "loss": 0.7967, + "num_input_tokens_seen": 1005576, + "step": 125 + }, + { + "epoch": 0.015926005329086397, + "grad_norm": 1.4528547921697086, + "learning_rate": 7.647058823529411e-05, + "loss": 0.7882, + "num_input_tokens_seen": 1039400, + "step": 130 + }, + { + "epoch": 0.01653854399558972, + "grad_norm": 0.7471906312974941, + "learning_rate": 7.941176470588235e-05, + "loss": 0.7331, + "num_input_tokens_seen": 1072528, + "step": 135 + }, + { + "epoch": 0.017151082662093045, + "grad_norm": 0.6557439630503187, + "learning_rate": 8.23529411764706e-05, + "loss": 0.751, + "num_input_tokens_seen": 1106184, + "step": 140 + }, + { + "epoch": 0.01776362132859637, + "grad_norm": 0.6475691746502584, + "learning_rate": 8.529411764705883e-05, + "loss": 0.7722, + "num_input_tokens_seen": 1139520, + "step": 145 + }, + { + "epoch": 0.01837615999509969, + "grad_norm": 0.8309881130436262, + "learning_rate": 8.823529411764706e-05, + "loss": 0.7354, + "num_input_tokens_seen": 1173256, + "step": 150 + }, + { + "epoch": 0.01837615999509969, + "eval_loss": 0.9015713930130005, + "eval_runtime": 19.9788, + "eval_samples_per_second": 3.003, + "eval_steps_per_second": 0.751, + "num_input_tokens_seen": 1173256, + "step": 150 + }, + { + "epoch": 0.018988698661603014, + "grad_norm": 0.9248361025512521, + "learning_rate": 9.11764705882353e-05, + "loss": 0.7864, + "num_input_tokens_seen": 1206816, + "step": 155 + }, + { + "epoch": 0.019601237328106338, + "grad_norm": 0.7322062897289474, + "learning_rate": 9.411764705882353e-05, + "loss": 0.7358, + "num_input_tokens_seen": 1240040, + "step": 160 + }, + { + "epoch": 0.02021377599460966, + "grad_norm": 1.2646715731502596, + "learning_rate": 9.705882352941177e-05, + "loss": 0.7654, + "num_input_tokens_seen": 1273760, + "step": 165 + }, + { + "epoch": 0.020826314661112982, + "grad_norm": 0.7281319010829045, + "learning_rate": 0.0001, + "loss": 0.737, + "num_input_tokens_seen": 1307496, + "step": 170 + }, + { + "epoch": 0.021438853327616306, + "grad_norm": 0.8029690002762221, + "learning_rate": 9.999940874631277e-05, + "loss": 0.7722, + "num_input_tokens_seen": 1341208, + "step": 175 + }, + { + "epoch": 0.02205139199411963, + "grad_norm": 0.658189368691658, + "learning_rate": 9.999763499923432e-05, + "loss": 0.7565, + "num_input_tokens_seen": 1374880, + "step": 180 + }, + { + "epoch": 0.02266393066062295, + "grad_norm": 4.105546054228758, + "learning_rate": 9.999467880071402e-05, + "loss": 0.7794, + "num_input_tokens_seen": 1408456, + "step": 185 + }, + { + "epoch": 0.023276469327126275, + "grad_norm": 0.8309303291642915, + "learning_rate": 9.999054022066641e-05, + "loss": 0.767, + "num_input_tokens_seen": 1442192, + "step": 190 + }, + { + "epoch": 0.0238890079936296, + "grad_norm": 1.0471942353964192, + "learning_rate": 9.998521935696953e-05, + "loss": 0.7617, + "num_input_tokens_seen": 1476184, + "step": 195 + }, + { + "epoch": 0.02450154666013292, + "grad_norm": 1.4698372844060559, + "learning_rate": 9.997871633546257e-05, + "loss": 0.7706, + "num_input_tokens_seen": 1509888, + "step": 200 + }, + { + "epoch": 0.02450154666013292, + "eval_loss": 0.9173043966293335, + "eval_runtime": 19.6856, + "eval_samples_per_second": 3.048, + "eval_steps_per_second": 0.762, + "num_input_tokens_seen": 1509888, + "step": 200 + }, + { + "epoch": 0.025114085326636244, + "grad_norm": 1.696878306454003, + "learning_rate": 9.997103130994296e-05, + "loss": 0.7753, + "num_input_tokens_seen": 1543512, + "step": 205 + }, + { + "epoch": 0.025726623993139568, + "grad_norm": 0.7845989478696659, + "learning_rate": 9.996216446216267e-05, + "loss": 0.7671, + "num_input_tokens_seen": 1578600, + "step": 210 + }, + { + "epoch": 0.02633916265964289, + "grad_norm": 0.8246288917375009, + "learning_rate": 9.995211600182397e-05, + "loss": 0.7609, + "num_input_tokens_seen": 1611752, + "step": 215 + }, + { + "epoch": 0.026951701326146212, + "grad_norm": 1.5335049569017705, + "learning_rate": 9.994088616657444e-05, + "loss": 0.7415, + "num_input_tokens_seen": 1645048, + "step": 220 + }, + { + "epoch": 0.027564239992649536, + "grad_norm": 1.9254745521056273, + "learning_rate": 9.992847522200133e-05, + "loss": 0.7586, + "num_input_tokens_seen": 1678384, + "step": 225 + }, + { + "epoch": 0.02817677865915286, + "grad_norm": 1.0040336244156336, + "learning_rate": 9.99148834616253e-05, + "loss": 0.7651, + "num_input_tokens_seen": 1711528, + "step": 230 + }, + { + "epoch": 0.02878931732565618, + "grad_norm": 0.6669364335673071, + "learning_rate": 9.990011120689351e-05, + "loss": 0.7579, + "num_input_tokens_seen": 1744672, + "step": 235 + }, + { + "epoch": 0.029401855992159505, + "grad_norm": 1.2476787204095123, + "learning_rate": 9.988415880717194e-05, + "loss": 0.6783, + "num_input_tokens_seen": 1778296, + "step": 240 + }, + { + "epoch": 0.03001439465866283, + "grad_norm": 0.5577553705493616, + "learning_rate": 9.986702663973722e-05, + "loss": 0.7298, + "num_input_tokens_seen": 1812184, + "step": 245 + }, + { + "epoch": 0.03062693332516615, + "grad_norm": 1.7348851601605075, + "learning_rate": 9.98487151097676e-05, + "loss": 0.7321, + "num_input_tokens_seen": 1845992, + "step": 250 + }, + { + "epoch": 0.03062693332516615, + "eval_loss": 0.9278064966201782, + "eval_runtime": 19.9678, + "eval_samples_per_second": 3.005, + "eval_steps_per_second": 0.751, + "num_input_tokens_seen": 1845992, + "step": 250 + }, + { + "epoch": 0.031239471991669474, + "grad_norm": 0.6985354803676266, + "learning_rate": 9.98292246503335e-05, + "loss": 0.6957, + "num_input_tokens_seen": 1879552, + "step": 255 + }, + { + "epoch": 0.031852010658172794, + "grad_norm": 1.1376773493288004, + "learning_rate": 9.980855572238714e-05, + "loss": 0.7431, + "num_input_tokens_seen": 1912984, + "step": 260 + }, + { + "epoch": 0.03246454932467612, + "grad_norm": 1.9521637190219152, + "learning_rate": 9.978670881475172e-05, + "loss": 0.7521, + "num_input_tokens_seen": 1946696, + "step": 265 + }, + { + "epoch": 0.03307708799117944, + "grad_norm": 1.9048411987779241, + "learning_rate": 9.976368444410985e-05, + "loss": 0.6869, + "num_input_tokens_seen": 1980440, + "step": 270 + }, + { + "epoch": 0.03368962665768277, + "grad_norm": 2.8118449701492305, + "learning_rate": 9.973948315499126e-05, + "loss": 0.7235, + "num_input_tokens_seen": 2014144, + "step": 275 + }, + { + "epoch": 0.03430216532418609, + "grad_norm": 2.838883201290406, + "learning_rate": 9.971410551976002e-05, + "loss": 0.7413, + "num_input_tokens_seen": 2047712, + "step": 280 + }, + { + "epoch": 0.03491470399068941, + "grad_norm": 1.1330057448986988, + "learning_rate": 9.968755213860094e-05, + "loss": 0.7457, + "num_input_tokens_seen": 2080856, + "step": 285 + }, + { + "epoch": 0.03552724265719274, + "grad_norm": 1.2084484897232208, + "learning_rate": 9.96598236395054e-05, + "loss": 0.7467, + "num_input_tokens_seen": 2114608, + "step": 290 + }, + { + "epoch": 0.03613978132369606, + "grad_norm": 1.2541308774320654, + "learning_rate": 9.96309206782565e-05, + "loss": 0.7178, + "num_input_tokens_seen": 2148240, + "step": 295 + }, + { + "epoch": 0.03675231999019938, + "grad_norm": 1.0507733718936678, + "learning_rate": 9.960084393841355e-05, + "loss": 0.7602, + "num_input_tokens_seen": 2181624, + "step": 300 + }, + { + "epoch": 0.03675231999019938, + "eval_loss": 0.9220121502876282, + "eval_runtime": 47.747, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "num_input_tokens_seen": 2181624, + "step": 300 + }, + { + "epoch": 0.03736485865670271, + "grad_norm": 1.2287366659537722, + "learning_rate": 9.956959413129585e-05, + "loss": 0.7192, + "num_input_tokens_seen": 2215832, + "step": 305 + }, + { + "epoch": 0.03797739732320603, + "grad_norm": 1.2088252441222305, + "learning_rate": 9.953717199596598e-05, + "loss": 0.7425, + "num_input_tokens_seen": 2249328, + "step": 310 + }, + { + "epoch": 0.03858993598970935, + "grad_norm": 0.9779645916962421, + "learning_rate": 9.95035782992122e-05, + "loss": 0.7005, + "num_input_tokens_seen": 2282584, + "step": 315 + }, + { + "epoch": 0.039202474656212676, + "grad_norm": 0.7582575661373627, + "learning_rate": 9.94688138355304e-05, + "loss": 0.6911, + "num_input_tokens_seen": 2315728, + "step": 320 + }, + { + "epoch": 0.039815013322715996, + "grad_norm": 1.7020610733412422, + "learning_rate": 9.943287942710527e-05, + "loss": 0.7158, + "num_input_tokens_seen": 2349176, + "step": 325 + }, + { + "epoch": 0.04042755198921932, + "grad_norm": 1.5507794680678755, + "learning_rate": 9.939577592379088e-05, + "loss": 0.6953, + "num_input_tokens_seen": 2382408, + "step": 330 + }, + { + "epoch": 0.041040090655722644, + "grad_norm": 1.0071689454740862, + "learning_rate": 9.935750420309055e-05, + "loss": 0.7205, + "num_input_tokens_seen": 2415856, + "step": 335 + }, + { + "epoch": 0.041652629322225965, + "grad_norm": 1.3027954993825355, + "learning_rate": 9.931806517013612e-05, + "loss": 0.7072, + "num_input_tokens_seen": 2449744, + "step": 340 + }, + { + "epoch": 0.042265167988729285, + "grad_norm": 1.471783301983191, + "learning_rate": 9.927745975766654e-05, + "loss": 0.665, + "num_input_tokens_seen": 2482488, + "step": 345 + }, + { + "epoch": 0.04287770665523261, + "grad_norm": 1.2540361740283843, + "learning_rate": 9.923568892600578e-05, + "loss": 0.7052, + "num_input_tokens_seen": 2516008, + "step": 350 + }, + { + "epoch": 0.04287770665523261, + "eval_loss": 0.8991873264312744, + "eval_runtime": 19.1048, + "eval_samples_per_second": 3.141, + "eval_steps_per_second": 0.785, + "num_input_tokens_seen": 2516008, + "step": 350 + }, + { + "epoch": 0.04349024532173593, + "grad_norm": 3.1375393275115413, + "learning_rate": 9.91927536630402e-05, + "loss": 0.7688, + "num_input_tokens_seen": 2550696, + "step": 355 + }, + { + "epoch": 0.04410278398823926, + "grad_norm": 1.2877539139154415, + "learning_rate": 9.91486549841951e-05, + "loss": 0.6736, + "num_input_tokens_seen": 2584440, + "step": 360 + }, + { + "epoch": 0.04471532265474258, + "grad_norm": 1.0836641883645721, + "learning_rate": 9.91033939324107e-05, + "loss": 0.6963, + "num_input_tokens_seen": 2617984, + "step": 365 + }, + { + "epoch": 0.0453278613212459, + "grad_norm": 1.1044660271442335, + "learning_rate": 9.905697157811761e-05, + "loss": 0.7171, + "num_input_tokens_seen": 2651328, + "step": 370 + }, + { + "epoch": 0.04594039998774923, + "grad_norm": 1.0312511016551493, + "learning_rate": 9.900938901921131e-05, + "loss": 0.6854, + "num_input_tokens_seen": 2684728, + "step": 375 + }, + { + "epoch": 0.04655293865425255, + "grad_norm": 3.1943334036457554, + "learning_rate": 9.896064738102635e-05, + "loss": 0.7017, + "num_input_tokens_seen": 2718408, + "step": 380 + }, + { + "epoch": 0.04716547732075587, + "grad_norm": 4.19932666811745, + "learning_rate": 9.891074781630966e-05, + "loss": 0.7372, + "num_input_tokens_seen": 2752744, + "step": 385 + }, + { + "epoch": 0.0477780159872592, + "grad_norm": 0.9870457310845876, + "learning_rate": 9.885969150519331e-05, + "loss": 0.6381, + "num_input_tokens_seen": 2785664, + "step": 390 + }, + { + "epoch": 0.04839055465376252, + "grad_norm": 1.4809373390810405, + "learning_rate": 9.88074796551666e-05, + "loss": 0.674, + "num_input_tokens_seen": 2818336, + "step": 395 + }, + { + "epoch": 0.04900309332026584, + "grad_norm": 1.6715863739758474, + "learning_rate": 9.875411350104744e-05, + "loss": 0.709, + "num_input_tokens_seen": 2852080, + "step": 400 + }, + { + "epoch": 0.04900309332026584, + "eval_loss": 0.8526390790939331, + "eval_runtime": 19.4089, + "eval_samples_per_second": 3.091, + "eval_steps_per_second": 0.773, + "num_input_tokens_seen": 2852080, + "step": 400 + }, + { + "epoch": 0.04961563198676917, + "grad_norm": 1.1853719369247329, + "learning_rate": 9.86995943049533e-05, + "loss": 0.6803, + "num_input_tokens_seen": 2885472, + "step": 405 + }, + { + "epoch": 0.05022817065327249, + "grad_norm": 2.224311331481107, + "learning_rate": 9.864392335627117e-05, + "loss": 0.7016, + "num_input_tokens_seen": 2918920, + "step": 410 + }, + { + "epoch": 0.05084070931977581, + "grad_norm": 4.455615321891611, + "learning_rate": 9.858710197162721e-05, + "loss": 0.6904, + "num_input_tokens_seen": 2952360, + "step": 415 + }, + { + "epoch": 0.051453247986279135, + "grad_norm": 0.9367222326990758, + "learning_rate": 9.852913149485556e-05, + "loss": 0.6498, + "num_input_tokens_seen": 2985496, + "step": 420 + }, + { + "epoch": 0.052065786652782456, + "grad_norm": 1.835945023296102, + "learning_rate": 9.847001329696653e-05, + "loss": 0.7565, + "num_input_tokens_seen": 3019296, + "step": 425 + }, + { + "epoch": 0.05267832531928578, + "grad_norm": 2.8988988120755503, + "learning_rate": 9.840974877611422e-05, + "loss": 0.6492, + "num_input_tokens_seen": 3052440, + "step": 430 + }, + { + "epoch": 0.053290863985789104, + "grad_norm": 2.1152426541889153, + "learning_rate": 9.834833935756344e-05, + "loss": 0.6917, + "num_input_tokens_seen": 3085888, + "step": 435 + }, + { + "epoch": 0.053903402652292425, + "grad_norm": 1.0731984284011387, + "learning_rate": 9.828578649365601e-05, + "loss": 0.6842, + "num_input_tokens_seen": 3120584, + "step": 440 + }, + { + "epoch": 0.05451594131879575, + "grad_norm": 2.3976033190607913, + "learning_rate": 9.822209166377635e-05, + "loss": 0.6519, + "num_input_tokens_seen": 3154432, + "step": 445 + }, + { + "epoch": 0.05512847998529907, + "grad_norm": 2.7229361154311977, + "learning_rate": 9.815725637431662e-05, + "loss": 0.7008, + "num_input_tokens_seen": 3187536, + "step": 450 + }, + { + "epoch": 0.05512847998529907, + "eval_loss": 0.8276960253715515, + "eval_runtime": 19.2443, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 0.779, + "num_input_tokens_seen": 3187536, + "step": 450 + }, + { + "epoch": 0.05574101865180239, + "grad_norm": 3.026174340924071, + "learning_rate": 9.809128215864097e-05, + "loss": 0.6618, + "num_input_tokens_seen": 3220896, + "step": 455 + }, + { + "epoch": 0.05635355731830572, + "grad_norm": 2.862505331104642, + "learning_rate": 9.802417057704931e-05, + "loss": 0.6723, + "num_input_tokens_seen": 3254424, + "step": 460 + }, + { + "epoch": 0.05696609598480904, + "grad_norm": 1.5075301899372289, + "learning_rate": 9.795592321674045e-05, + "loss": 0.6156, + "num_input_tokens_seen": 3287624, + "step": 465 + }, + { + "epoch": 0.05757863465131236, + "grad_norm": 1.7311667362041405, + "learning_rate": 9.788654169177453e-05, + "loss": 0.664, + "num_input_tokens_seen": 3321048, + "step": 470 + }, + { + "epoch": 0.05819117331781569, + "grad_norm": 3.683274822872176, + "learning_rate": 9.781602764303487e-05, + "loss": 0.6419, + "num_input_tokens_seen": 3354200, + "step": 475 + }, + { + "epoch": 0.05880371198431901, + "grad_norm": 3.802610619270737, + "learning_rate": 9.774438273818911e-05, + "loss": 0.7027, + "num_input_tokens_seen": 3386816, + "step": 480 + }, + { + "epoch": 0.05941625065082233, + "grad_norm": 1.4777926613323287, + "learning_rate": 9.767160867164979e-05, + "loss": 0.6849, + "num_input_tokens_seen": 3420400, + "step": 485 + }, + { + "epoch": 0.06002878931732566, + "grad_norm": 4.040206689986627, + "learning_rate": 9.759770716453436e-05, + "loss": 0.6643, + "num_input_tokens_seen": 3454056, + "step": 490 + }, + { + "epoch": 0.06064132798382898, + "grad_norm": 4.20335141703578, + "learning_rate": 9.752267996462434e-05, + "loss": 0.6659, + "num_input_tokens_seen": 3488072, + "step": 495 + }, + { + "epoch": 0.0612538666503323, + "grad_norm": 2.5111170738706403, + "learning_rate": 9.744652884632406e-05, + "loss": 0.6225, + "num_input_tokens_seen": 3521680, + "step": 500 + }, + { + "epoch": 0.0612538666503323, + "eval_loss": 0.8026401996612549, + "eval_runtime": 19.2584, + "eval_samples_per_second": 3.116, + "eval_steps_per_second": 0.779, + "num_input_tokens_seen": 3521680, + "step": 500 + }, + { + "epoch": 0.061866405316835626, + "grad_norm": 2.7255925680561095, + "learning_rate": 9.736925561061871e-05, + "loss": 0.6372, + "num_input_tokens_seen": 3555288, + "step": 505 + }, + { + "epoch": 0.06247894398333895, + "grad_norm": 1.9706474145345436, + "learning_rate": 9.729086208503174e-05, + "loss": 0.68, + "num_input_tokens_seen": 3589264, + "step": 510 + }, + { + "epoch": 0.06309148264984227, + "grad_norm": 1.5465006446556857, + "learning_rate": 9.721135012358156e-05, + "loss": 0.6723, + "num_input_tokens_seen": 3622936, + "step": 515 + }, + { + "epoch": 0.06370402131634559, + "grad_norm": 3.2409562883575673, + "learning_rate": 9.713072160673777e-05, + "loss": 0.6608, + "num_input_tokens_seen": 3656688, + "step": 520 + }, + { + "epoch": 0.06431655998284892, + "grad_norm": 5.802704416868047, + "learning_rate": 9.704897844137673e-05, + "loss": 0.6821, + "num_input_tokens_seen": 3691056, + "step": 525 + }, + { + "epoch": 0.06492909864935224, + "grad_norm": 1.2817746678164137, + "learning_rate": 9.696612256073633e-05, + "loss": 0.611, + "num_input_tokens_seen": 3724952, + "step": 530 + }, + { + "epoch": 0.06554163731585556, + "grad_norm": 4.676080860554197, + "learning_rate": 9.688215592437039e-05, + "loss": 0.6804, + "num_input_tokens_seen": 3758240, + "step": 535 + }, + { + "epoch": 0.06615417598235888, + "grad_norm": 1.2146487253591667, + "learning_rate": 9.679708051810221e-05, + "loss": 0.634, + "num_input_tokens_seen": 3791608, + "step": 540 + }, + { + "epoch": 0.0667667146488622, + "grad_norm": 2.5582887682564173, + "learning_rate": 9.67108983539777e-05, + "loss": 0.6331, + "num_input_tokens_seen": 3825168, + "step": 545 + }, + { + "epoch": 0.06737925331536554, + "grad_norm": 1.8101531263387969, + "learning_rate": 9.662361147021779e-05, + "loss": 0.6115, + "num_input_tokens_seen": 3858752, + "step": 550 + }, + { + "epoch": 0.06737925331536554, + "eval_loss": 0.8088525533676147, + "eval_runtime": 19.3413, + "eval_samples_per_second": 3.102, + "eval_steps_per_second": 0.776, + "num_input_tokens_seen": 3858752, + "step": 550 + }, + { + "epoch": 0.06799179198186886, + "grad_norm": 2.1676145521729935, + "learning_rate": 9.653522193117013e-05, + "loss": 0.6279, + "num_input_tokens_seen": 3891816, + "step": 555 + }, + { + "epoch": 0.06860433064837218, + "grad_norm": 6.729051118714118, + "learning_rate": 9.644573182726035e-05, + "loss": 0.668, + "num_input_tokens_seen": 3925384, + "step": 560 + }, + { + "epoch": 0.0692168693148755, + "grad_norm": 3.419756330433667, + "learning_rate": 9.63551432749426e-05, + "loss": 0.6761, + "num_input_tokens_seen": 3959232, + "step": 565 + }, + { + "epoch": 0.06982940798137882, + "grad_norm": 3.658496159802149, + "learning_rate": 9.626345841664953e-05, + "loss": 0.5765, + "num_input_tokens_seen": 3992768, + "step": 570 + }, + { + "epoch": 0.07044194664788214, + "grad_norm": 2.761812948875525, + "learning_rate": 9.617067942074153e-05, + "loss": 0.649, + "num_input_tokens_seen": 4026720, + "step": 575 + }, + { + "epoch": 0.07105448531438548, + "grad_norm": 2.6310111120682698, + "learning_rate": 9.607680848145558e-05, + "loss": 0.634, + "num_input_tokens_seen": 4060536, + "step": 580 + }, + { + "epoch": 0.0716670239808888, + "grad_norm": 1.978483160458763, + "learning_rate": 9.598184781885318e-05, + "loss": 0.6074, + "num_input_tokens_seen": 4094408, + "step": 585 + }, + { + "epoch": 0.07227956264739212, + "grad_norm": 8.85667420431102, + "learning_rate": 9.588579967876806e-05, + "loss": 0.6965, + "num_input_tokens_seen": 4128600, + "step": 590 + }, + { + "epoch": 0.07289210131389544, + "grad_norm": 14.421553099119425, + "learning_rate": 9.578866633275288e-05, + "loss": 0.6751, + "num_input_tokens_seen": 4162136, + "step": 595 + }, + { + "epoch": 0.07350463998039876, + "grad_norm": 1.0481567365611477, + "learning_rate": 9.569045007802559e-05, + "loss": 0.6367, + "num_input_tokens_seen": 4195632, + "step": 600 + }, + { + "epoch": 0.07350463998039876, + "eval_loss": 0.6892650723457336, + "eval_runtime": 19.5409, + "eval_samples_per_second": 3.07, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 4195632, + "step": 600 + }, + { + "epoch": 0.07411717864690208, + "grad_norm": 2.810662394247293, + "learning_rate": 9.55911532374151e-05, + "loss": 0.5934, + "num_input_tokens_seen": 4228296, + "step": 605 + }, + { + "epoch": 0.07472971731340541, + "grad_norm": 3.0046897604450264, + "learning_rate": 9.549077815930636e-05, + "loss": 0.5862, + "num_input_tokens_seen": 4261592, + "step": 610 + }, + { + "epoch": 0.07534225597990873, + "grad_norm": 3.21731760394247, + "learning_rate": 9.538932721758474e-05, + "loss": 0.6037, + "num_input_tokens_seen": 4295064, + "step": 615 + }, + { + "epoch": 0.07595479464641205, + "grad_norm": 2.983397763523546, + "learning_rate": 9.528680281157999e-05, + "loss": 0.6898, + "num_input_tokens_seen": 4328512, + "step": 620 + }, + { + "epoch": 0.07656733331291538, + "grad_norm": 3.376304983448965, + "learning_rate": 9.518320736600943e-05, + "loss": 0.6429, + "num_input_tokens_seen": 4362280, + "step": 625 + }, + { + "epoch": 0.0771798719794187, + "grad_norm": 7.7314355325290425, + "learning_rate": 9.507854333092063e-05, + "loss": 0.6793, + "num_input_tokens_seen": 4396624, + "step": 630 + }, + { + "epoch": 0.07779241064592203, + "grad_norm": 5.255756760659528, + "learning_rate": 9.497281318163346e-05, + "loss": 0.6721, + "num_input_tokens_seen": 4430344, + "step": 635 + }, + { + "epoch": 0.07840494931242535, + "grad_norm": 1.6538156724511512, + "learning_rate": 9.486601941868154e-05, + "loss": 0.607, + "num_input_tokens_seen": 4463560, + "step": 640 + }, + { + "epoch": 0.07901748797892867, + "grad_norm": 2.983872060054735, + "learning_rate": 9.475816456775313e-05, + "loss": 0.6382, + "num_input_tokens_seen": 4496896, + "step": 645 + }, + { + "epoch": 0.07963002664543199, + "grad_norm": 3.734697580822006, + "learning_rate": 9.464925117963133e-05, + "loss": 0.6238, + "num_input_tokens_seen": 4530824, + "step": 650 + }, + { + "epoch": 0.07963002664543199, + "eval_loss": 0.7062045335769653, + "eval_runtime": 19.5289, + "eval_samples_per_second": 3.072, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 4530824, + "step": 650 + }, + { + "epoch": 0.08024256531193531, + "grad_norm": 2.782685449476194, + "learning_rate": 9.453928183013385e-05, + "loss": 0.6077, + "num_input_tokens_seen": 4564536, + "step": 655 + }, + { + "epoch": 0.08085510397843863, + "grad_norm": 3.5688581672829827, + "learning_rate": 9.442825912005202e-05, + "loss": 0.6365, + "num_input_tokens_seen": 4598088, + "step": 660 + }, + { + "epoch": 0.08146764264494197, + "grad_norm": 3.578157895358488, + "learning_rate": 9.431618567508933e-05, + "loss": 0.6289, + "num_input_tokens_seen": 4631952, + "step": 665 + }, + { + "epoch": 0.08208018131144529, + "grad_norm": 3.2738129491987764, + "learning_rate": 9.420306414579925e-05, + "loss": 0.5946, + "num_input_tokens_seen": 4665632, + "step": 670 + }, + { + "epoch": 0.08269271997794861, + "grad_norm": 6.216684602936897, + "learning_rate": 9.408889720752266e-05, + "loss": 0.596, + "num_input_tokens_seen": 4699240, + "step": 675 + }, + { + "epoch": 0.08330525864445193, + "grad_norm": 8.409935297753568, + "learning_rate": 9.397368756032445e-05, + "loss": 0.6328, + "num_input_tokens_seen": 4732400, + "step": 680 + }, + { + "epoch": 0.08391779731095525, + "grad_norm": 5.7011099370727605, + "learning_rate": 9.385743792892982e-05, + "loss": 0.6424, + "num_input_tokens_seen": 4766448, + "step": 685 + }, + { + "epoch": 0.08453033597745857, + "grad_norm": 6.833122814675099, + "learning_rate": 9.374015106265968e-05, + "loss": 0.6137, + "num_input_tokens_seen": 4800344, + "step": 690 + }, + { + "epoch": 0.0851428746439619, + "grad_norm": 1.7856723258929246, + "learning_rate": 9.362182973536569e-05, + "loss": 0.6233, + "num_input_tokens_seen": 4833096, + "step": 695 + }, + { + "epoch": 0.08575541331046523, + "grad_norm": 6.54158195181985, + "learning_rate": 9.35024767453647e-05, + "loss": 0.6897, + "num_input_tokens_seen": 4867048, + "step": 700 + }, + { + "epoch": 0.08575541331046523, + "eval_loss": 0.6905214190483093, + "eval_runtime": 19.3593, + "eval_samples_per_second": 3.099, + "eval_steps_per_second": 0.775, + "num_input_tokens_seen": 4867048, + "step": 700 + }, + { + "epoch": 0.08636795197696855, + "grad_norm": 2.1373109886458557, + "learning_rate": 9.338209491537257e-05, + "loss": 0.6004, + "num_input_tokens_seen": 4900696, + "step": 705 + }, + { + "epoch": 0.08698049064347187, + "grad_norm": 3.2210127167239047, + "learning_rate": 9.326068709243727e-05, + "loss": 0.6088, + "num_input_tokens_seen": 4934312, + "step": 710 + }, + { + "epoch": 0.08759302930997519, + "grad_norm": 7.4730604583121, + "learning_rate": 9.313825614787177e-05, + "loss": 0.6589, + "num_input_tokens_seen": 4967176, + "step": 715 + }, + { + "epoch": 0.08820556797647852, + "grad_norm": 2.851816402176268, + "learning_rate": 9.301480497718593e-05, + "loss": 0.6056, + "num_input_tokens_seen": 5001336, + "step": 720 + }, + { + "epoch": 0.08881810664298184, + "grad_norm": 6.877501983217889, + "learning_rate": 9.289033650001817e-05, + "loss": 0.6026, + "num_input_tokens_seen": 5035240, + "step": 725 + }, + { + "epoch": 0.08943064530948516, + "grad_norm": 15.219223697646738, + "learning_rate": 9.276485366006634e-05, + "loss": 0.5947, + "num_input_tokens_seen": 5068640, + "step": 730 + }, + { + "epoch": 0.09004318397598848, + "grad_norm": 3.9825986287261648, + "learning_rate": 9.263835942501807e-05, + "loss": 0.6654, + "num_input_tokens_seen": 5102912, + "step": 735 + }, + { + "epoch": 0.0906557226424918, + "grad_norm": 6.7531395992498835, + "learning_rate": 9.251085678648072e-05, + "loss": 0.6167, + "num_input_tokens_seen": 5136064, + "step": 740 + }, + { + "epoch": 0.09126826130899512, + "grad_norm": 6.425970645615935, + "learning_rate": 9.238234875991046e-05, + "loss": 0.5852, + "num_input_tokens_seen": 5169640, + "step": 745 + }, + { + "epoch": 0.09188079997549846, + "grad_norm": 4.290834615065844, + "learning_rate": 9.225283838454111e-05, + "loss": 0.6515, + "num_input_tokens_seen": 5203408, + "step": 750 + }, + { + "epoch": 0.09188079997549846, + "eval_loss": 0.7262604832649231, + "eval_runtime": 47.977, + "eval_samples_per_second": 1.251, + "eval_steps_per_second": 0.313, + "num_input_tokens_seen": 5203408, + "step": 750 + }, + { + "epoch": 0.09249333864200178, + "grad_norm": 4.995287857292328, + "learning_rate": 9.21223287233121e-05, + "loss": 0.6032, + "num_input_tokens_seen": 5237296, + "step": 755 + }, + { + "epoch": 0.0931058773085051, + "grad_norm": 10.807725663755052, + "learning_rate": 9.199082286279622e-05, + "loss": 0.5835, + "num_input_tokens_seen": 5270792, + "step": 760 + }, + { + "epoch": 0.09371841597500842, + "grad_norm": 4.957266246509638, + "learning_rate": 9.185832391312644e-05, + "loss": 0.5838, + "num_input_tokens_seen": 5304352, + "step": 765 + }, + { + "epoch": 0.09433095464151174, + "grad_norm": 11.771322684480195, + "learning_rate": 9.172483500792244e-05, + "loss": 0.5733, + "num_input_tokens_seen": 5338544, + "step": 770 + }, + { + "epoch": 0.09494349330801506, + "grad_norm": 4.7502624769852755, + "learning_rate": 9.159035930421658e-05, + "loss": 0.6251, + "num_input_tokens_seen": 5371568, + "step": 775 + }, + { + "epoch": 0.0955560319745184, + "grad_norm": 4.227298978568238, + "learning_rate": 9.145489998237902e-05, + "loss": 0.6165, + "num_input_tokens_seen": 5405472, + "step": 780 + }, + { + "epoch": 0.09616857064102172, + "grad_norm": 2.6881739844950663, + "learning_rate": 9.131846024604274e-05, + "loss": 0.5865, + "num_input_tokens_seen": 5438600, + "step": 785 + }, + { + "epoch": 0.09678110930752504, + "grad_norm": 5.390975180874373, + "learning_rate": 9.11810433220276e-05, + "loss": 0.5387, + "num_input_tokens_seen": 5472624, + "step": 790 + }, + { + "epoch": 0.09739364797402836, + "grad_norm": 3.230136003067753, + "learning_rate": 9.104265246026415e-05, + "loss": 0.576, + "num_input_tokens_seen": 5506720, + "step": 795 + }, + { + "epoch": 0.09800618664053168, + "grad_norm": 9.293399494838159, + "learning_rate": 9.090329093371666e-05, + "loss": 0.6221, + "num_input_tokens_seen": 5540920, + "step": 800 + }, + { + "epoch": 0.09800618664053168, + "eval_loss": 0.6957933306694031, + "eval_runtime": 19.4259, + "eval_samples_per_second": 3.089, + "eval_steps_per_second": 0.772, + "num_input_tokens_seen": 5540920, + "step": 800 + }, + { + "epoch": 0.09861872530703501, + "grad_norm": 5.533396678909399, + "learning_rate": 9.076296203830579e-05, + "loss": 0.5566, + "num_input_tokens_seen": 5574216, + "step": 805 + }, + { + "epoch": 0.09923126397353833, + "grad_norm": 4.608598361444419, + "learning_rate": 9.062166909283062e-05, + "loss": 0.5927, + "num_input_tokens_seen": 5607720, + "step": 810 + }, + { + "epoch": 0.09984380264004165, + "grad_norm": 7.087390080784584, + "learning_rate": 9.047941543889014e-05, + "loss": 0.6153, + "num_input_tokens_seen": 5641192, + "step": 815 + }, + { + "epoch": 0.10045634130654497, + "grad_norm": 4.225766285330844, + "learning_rate": 9.033620444080428e-05, + "loss": 0.5996, + "num_input_tokens_seen": 5675392, + "step": 820 + }, + { + "epoch": 0.1010688799730483, + "grad_norm": 7.662207429625662, + "learning_rate": 9.019203948553422e-05, + "loss": 0.691, + "num_input_tokens_seen": 5709464, + "step": 825 + }, + { + "epoch": 0.10168141863955162, + "grad_norm": 3.5848582811571155, + "learning_rate": 9.004692398260244e-05, + "loss": 0.5388, + "num_input_tokens_seen": 5742464, + "step": 830 + }, + { + "epoch": 0.10229395730605495, + "grad_norm": 5.786676484932829, + "learning_rate": 8.9900861364012e-05, + "loss": 0.5515, + "num_input_tokens_seen": 5775832, + "step": 835 + }, + { + "epoch": 0.10290649597255827, + "grad_norm": 2.2498889398492734, + "learning_rate": 8.975385508416532e-05, + "loss": 0.5835, + "num_input_tokens_seen": 5809264, + "step": 840 + }, + { + "epoch": 0.10351903463906159, + "grad_norm": 4.25720944646852, + "learning_rate": 8.960590861978265e-05, + "loss": 0.6318, + "num_input_tokens_seen": 5842968, + "step": 845 + }, + { + "epoch": 0.10413157330556491, + "grad_norm": 17.444193132577993, + "learning_rate": 8.945702546981969e-05, + "loss": 0.6148, + "num_input_tokens_seen": 5876600, + "step": 850 + }, + { + "epoch": 0.10413157330556491, + "eval_loss": 0.6981882452964783, + "eval_runtime": 47.331, + "eval_samples_per_second": 1.268, + "eval_steps_per_second": 0.317, + "num_input_tokens_seen": 5876600, + "step": 850 + }, + { + "epoch": 0.10474411197206823, + "grad_norm": 9.256921705370909, + "learning_rate": 8.930720915538487e-05, + "loss": 0.6409, + "num_input_tokens_seen": 5910016, + "step": 855 + }, + { + "epoch": 0.10535665063857157, + "grad_norm": 4.457076857370394, + "learning_rate": 8.915646321965614e-05, + "loss": 0.5827, + "num_input_tokens_seen": 5943680, + "step": 860 + }, + { + "epoch": 0.10596918930507489, + "grad_norm": 4.80476554148149, + "learning_rate": 8.900479122779712e-05, + "loss": 0.5722, + "num_input_tokens_seen": 5977216, + "step": 865 + }, + { + "epoch": 0.10658172797157821, + "grad_norm": 3.8589418150693464, + "learning_rate": 8.885219676687277e-05, + "loss": 0.5395, + "num_input_tokens_seen": 6010352, + "step": 870 + }, + { + "epoch": 0.10719426663808153, + "grad_norm": 6.802560595029875, + "learning_rate": 8.869868344576459e-05, + "loss": 0.5898, + "num_input_tokens_seen": 6044008, + "step": 875 + }, + { + "epoch": 0.10780680530458485, + "grad_norm": 4.951406983333656, + "learning_rate": 8.854425489508532e-05, + "loss": 0.5175, + "num_input_tokens_seen": 6077272, + "step": 880 + }, + { + "epoch": 0.10841934397108817, + "grad_norm": 13.504943915352268, + "learning_rate": 8.838891476709288e-05, + "loss": 0.613, + "num_input_tokens_seen": 6110864, + "step": 885 + }, + { + "epoch": 0.1090318826375915, + "grad_norm": 3.597935041871932, + "learning_rate": 8.823266673560426e-05, + "loss": 0.61, + "num_input_tokens_seen": 6143976, + "step": 890 + }, + { + "epoch": 0.10964442130409482, + "grad_norm": 3.0470019400307744, + "learning_rate": 8.807551449590846e-05, + "loss": 0.6117, + "num_input_tokens_seen": 6176768, + "step": 895 + }, + { + "epoch": 0.11025695997059815, + "grad_norm": 1.9989851697424825, + "learning_rate": 8.791746176467907e-05, + "loss": 0.5434, + "num_input_tokens_seen": 6209928, + "step": 900 + }, + { + "epoch": 0.11025695997059815, + "eval_loss": 0.6621683239936829, + "eval_runtime": 19.7217, + "eval_samples_per_second": 3.042, + "eval_steps_per_second": 0.761, + "num_input_tokens_seen": 6209928, + "step": 900 + }, + { + "epoch": 0.11086949863710147, + "grad_norm": 3.5694191856984805, + "learning_rate": 8.775851227988656e-05, + "loss": 0.5905, + "num_input_tokens_seen": 6242384, + "step": 905 + }, + { + "epoch": 0.11148203730360479, + "grad_norm": 10.279162637561306, + "learning_rate": 8.759866980070963e-05, + "loss": 0.5894, + "num_input_tokens_seen": 6275768, + "step": 910 + }, + { + "epoch": 0.1120945759701081, + "grad_norm": 14.273252227167301, + "learning_rate": 8.743793810744654e-05, + "loss": 0.5514, + "num_input_tokens_seen": 6309936, + "step": 915 + }, + { + "epoch": 0.11270711463661144, + "grad_norm": 10.54839085750101, + "learning_rate": 8.727632100142551e-05, + "loss": 0.5772, + "num_input_tokens_seen": 6343768, + "step": 920 + }, + { + "epoch": 0.11331965330311476, + "grad_norm": 43.758128952480924, + "learning_rate": 8.711382230491493e-05, + "loss": 0.6127, + "num_input_tokens_seen": 6377040, + "step": 925 + }, + { + "epoch": 0.11393219196961808, + "grad_norm": 3.046545501571352, + "learning_rate": 8.695044586103296e-05, + "loss": 0.5769, + "num_input_tokens_seen": 6410232, + "step": 930 + }, + { + "epoch": 0.1145447306361214, + "grad_norm": 6.732707558642389, + "learning_rate": 8.678619553365659e-05, + "loss": 0.5851, + "num_input_tokens_seen": 6443920, + "step": 935 + }, + { + "epoch": 0.11515726930262472, + "grad_norm": 2.844834729036257, + "learning_rate": 8.662107520733027e-05, + "loss": 0.4821, + "num_input_tokens_seen": 6476464, + "step": 940 + }, + { + "epoch": 0.11576980796912806, + "grad_norm": 6.5044856478369395, + "learning_rate": 8.64550887871741e-05, + "loss": 0.5848, + "num_input_tokens_seen": 6510856, + "step": 945 + }, + { + "epoch": 0.11638234663563138, + "grad_norm": 3.3855715000356343, + "learning_rate": 8.628824019879137e-05, + "loss": 0.5729, + "num_input_tokens_seen": 6544656, + "step": 950 + }, + { + "epoch": 0.11638234663563138, + "eval_loss": 0.6741500496864319, + "eval_runtime": 19.2032, + "eval_samples_per_second": 3.124, + "eval_steps_per_second": 0.781, + "num_input_tokens_seen": 6544656, + "step": 950 + }, + { + "epoch": 0.1169948853021347, + "grad_norm": 7.122796737825454, + "learning_rate": 8.612053338817581e-05, + "loss": 0.5924, + "num_input_tokens_seen": 6578488, + "step": 955 + }, + { + "epoch": 0.11760742396863802, + "grad_norm": 4.508377480122389, + "learning_rate": 8.595197232161824e-05, + "loss": 0.5946, + "num_input_tokens_seen": 6611616, + "step": 960 + }, + { + "epoch": 0.11821996263514134, + "grad_norm": 5.733858610554117, + "learning_rate": 8.578256098561275e-05, + "loss": 0.5692, + "num_input_tokens_seen": 6645456, + "step": 965 + }, + { + "epoch": 0.11883250130164466, + "grad_norm": 1.904868426487018, + "learning_rate": 8.561230338676239e-05, + "loss": 0.5347, + "num_input_tokens_seen": 6678648, + "step": 970 + }, + { + "epoch": 0.119445039968148, + "grad_norm": 11.713013089175845, + "learning_rate": 8.544120355168451e-05, + "loss": 0.5792, + "num_input_tokens_seen": 6712928, + "step": 975 + }, + { + "epoch": 0.12005757863465132, + "grad_norm": 6.617168404497372, + "learning_rate": 8.526926552691544e-05, + "loss": 0.5768, + "num_input_tokens_seen": 6746616, + "step": 980 + }, + { + "epoch": 0.12067011730115464, + "grad_norm": 3.1162749994283203, + "learning_rate": 8.509649337881483e-05, + "loss": 0.53, + "num_input_tokens_seen": 6779552, + "step": 985 + }, + { + "epoch": 0.12128265596765796, + "grad_norm": 4.612530110988974, + "learning_rate": 8.492289119346943e-05, + "loss": 0.5095, + "num_input_tokens_seen": 6812968, + "step": 990 + }, + { + "epoch": 0.12189519463416128, + "grad_norm": 8.47574836104151, + "learning_rate": 8.474846307659658e-05, + "loss": 0.5278, + "num_input_tokens_seen": 6846816, + "step": 995 + }, + { + "epoch": 0.1225077333006646, + "grad_norm": 21.010943165658347, + "learning_rate": 8.457321315344694e-05, + "loss": 0.55, + "num_input_tokens_seen": 6880016, + "step": 1000 + }, + { + "epoch": 0.1225077333006646, + "eval_loss": 0.7234537601470947, + "eval_runtime": 19.3824, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 0.774, + "num_input_tokens_seen": 6880016, + "step": 1000 + }, + { + "epoch": 0.12312027196716793, + "grad_norm": 1.1897377835811873, + "learning_rate": 8.439714556870704e-05, + "loss": 0.5095, + "num_input_tokens_seen": 6913088, + "step": 1005 + }, + { + "epoch": 0.12373281063367125, + "grad_norm": 9.789832304928483, + "learning_rate": 8.422026448640124e-05, + "loss": 0.5987, + "num_input_tokens_seen": 6946072, + "step": 1010 + }, + { + "epoch": 0.12434534930017457, + "grad_norm": 12.969885182035801, + "learning_rate": 8.40425740897932e-05, + "loss": 0.5267, + "num_input_tokens_seen": 6979400, + "step": 1015 + }, + { + "epoch": 0.1249578879666779, + "grad_norm": 10.459849610808234, + "learning_rate": 8.386407858128706e-05, + "loss": 0.5209, + "num_input_tokens_seen": 7013384, + "step": 1020 + }, + { + "epoch": 0.12557042663318121, + "grad_norm": 12.787610852611158, + "learning_rate": 8.368478218232787e-05, + "loss": 0.5702, + "num_input_tokens_seen": 7047256, + "step": 1025 + }, + { + "epoch": 0.12618296529968454, + "grad_norm": 13.113416468962454, + "learning_rate": 8.350468913330192e-05, + "loss": 0.5883, + "num_input_tokens_seen": 7080464, + "step": 1030 + }, + { + "epoch": 0.12679550396618786, + "grad_norm": 3.4834910221085544, + "learning_rate": 8.33238036934364e-05, + "loss": 0.6062, + "num_input_tokens_seen": 7114864, + "step": 1035 + }, + { + "epoch": 0.12740804263269118, + "grad_norm": 4.056295700994003, + "learning_rate": 8.31421301406986e-05, + "loss": 0.6236, + "num_input_tokens_seen": 7148968, + "step": 1040 + }, + { + "epoch": 0.12802058129919452, + "grad_norm": 1.488958951906305, + "learning_rate": 8.29596727716949e-05, + "loss": 0.6023, + "num_input_tokens_seen": 7182000, + "step": 1045 + }, + { + "epoch": 0.12863311996569785, + "grad_norm": 2.39679495526721, + "learning_rate": 8.277643590156894e-05, + "loss": 0.5312, + "num_input_tokens_seen": 7215776, + "step": 1050 + }, + { + "epoch": 0.12863311996569785, + "eval_loss": 0.6906282901763916, + "eval_runtime": 19.3177, + "eval_samples_per_second": 3.106, + "eval_steps_per_second": 0.776, + "num_input_tokens_seen": 7215776, + "step": 1050 + }, + { + "epoch": 0.12924565863220117, + "grad_norm": 3.8028164296566516, + "learning_rate": 8.259242386389973e-05, + "loss": 0.5912, + "num_input_tokens_seen": 7249440, + "step": 1055 + }, + { + "epoch": 0.1298581972987045, + "grad_norm": 6.137278679473767, + "learning_rate": 8.240764101059912e-05, + "loss": 0.521, + "num_input_tokens_seen": 7283432, + "step": 1060 + }, + { + "epoch": 0.1304707359652078, + "grad_norm": 3.7500573615659376, + "learning_rate": 8.222209171180883e-05, + "loss": 0.5359, + "num_input_tokens_seen": 7317360, + "step": 1065 + }, + { + "epoch": 0.13108327463171113, + "grad_norm": 6.360476685600312, + "learning_rate": 8.203578035579715e-05, + "loss": 0.5271, + "num_input_tokens_seen": 7352112, + "step": 1070 + }, + { + "epoch": 0.13169581329821445, + "grad_norm": 1.7458123705897874, + "learning_rate": 8.184871134885513e-05, + "loss": 0.5044, + "num_input_tokens_seen": 7384880, + "step": 1075 + }, + { + "epoch": 0.13230835196471777, + "grad_norm": 7.172013405019242, + "learning_rate": 8.166088911519235e-05, + "loss": 0.5953, + "num_input_tokens_seen": 7418368, + "step": 1080 + }, + { + "epoch": 0.1329208906312211, + "grad_norm": 2.476642615613007, + "learning_rate": 8.147231809683236e-05, + "loss": 0.5123, + "num_input_tokens_seen": 7451872, + "step": 1085 + }, + { + "epoch": 0.1335334292977244, + "grad_norm": 13.817194120585148, + "learning_rate": 8.128300275350756e-05, + "loss": 0.5632, + "num_input_tokens_seen": 7485712, + "step": 1090 + }, + { + "epoch": 0.13414596796422773, + "grad_norm": 2.20726405661794, + "learning_rate": 8.109294756255375e-05, + "loss": 0.5144, + "num_input_tokens_seen": 7519168, + "step": 1095 + }, + { + "epoch": 0.13475850663073108, + "grad_norm": 5.79546351031941, + "learning_rate": 8.090215701880419e-05, + "loss": 0.5307, + "num_input_tokens_seen": 7552920, + "step": 1100 + }, + { + "epoch": 0.13475850663073108, + "eval_loss": 0.6174182295799255, + "eval_runtime": 19.257, + "eval_samples_per_second": 3.116, + "eval_steps_per_second": 0.779, + "num_input_tokens_seen": 7552920, + "step": 1100 + }, + { + "epoch": 0.1353710452972344, + "grad_norm": 1.6568269633234864, + "learning_rate": 8.07106356344834e-05, + "loss": 0.5067, + "num_input_tokens_seen": 7586336, + "step": 1105 + }, + { + "epoch": 0.13598358396373772, + "grad_norm": 9.881127130998916, + "learning_rate": 8.051838793910038e-05, + "loss": 0.5564, + "num_input_tokens_seen": 7620456, + "step": 1110 + }, + { + "epoch": 0.13659612263024104, + "grad_norm": 5.434757636785574, + "learning_rate": 8.032541847934146e-05, + "loss": 0.5854, + "num_input_tokens_seen": 7654064, + "step": 1115 + }, + { + "epoch": 0.13720866129674436, + "grad_norm": 8.431770854556952, + "learning_rate": 8.013173181896283e-05, + "loss": 0.5729, + "num_input_tokens_seen": 7687768, + "step": 1120 + }, + { + "epoch": 0.13782119996324768, + "grad_norm": 2.7261911659276343, + "learning_rate": 7.993733253868256e-05, + "loss": 0.6122, + "num_input_tokens_seen": 7721544, + "step": 1125 + }, + { + "epoch": 0.138433738629751, + "grad_norm": 2.456505697190831, + "learning_rate": 7.974222523607236e-05, + "loss": 0.5618, + "num_input_tokens_seen": 7755464, + "step": 1130 + }, + { + "epoch": 0.13904627729625432, + "grad_norm": 3.8454579857105626, + "learning_rate": 7.954641452544865e-05, + "loss": 0.5487, + "num_input_tokens_seen": 7789152, + "step": 1135 + }, + { + "epoch": 0.13965881596275764, + "grad_norm": 5.774785303618935, + "learning_rate": 7.934990503776363e-05, + "loss": 0.5132, + "num_input_tokens_seen": 7822128, + "step": 1140 + }, + { + "epoch": 0.14027135462926096, + "grad_norm": 6.086190598879861, + "learning_rate": 7.915270142049566e-05, + "loss": 0.5576, + "num_input_tokens_seen": 7855584, + "step": 1145 + }, + { + "epoch": 0.14088389329576428, + "grad_norm": 5.537023500299833, + "learning_rate": 7.89548083375394e-05, + "loss": 0.5403, + "num_input_tokens_seen": 7889016, + "step": 1150 + }, + { + "epoch": 0.14088389329576428, + "eval_loss": 0.6082175970077515, + "eval_runtime": 19.2261, + "eval_samples_per_second": 3.121, + "eval_steps_per_second": 0.78, + "num_input_tokens_seen": 7889016, + "step": 1150 + }, + { + "epoch": 0.1414964319622676, + "grad_norm": 5.2350445648477315, + "learning_rate": 7.875623046909544e-05, + "loss": 0.531, + "num_input_tokens_seen": 7922176, + "step": 1155 + }, + { + "epoch": 0.14210897062877095, + "grad_norm": 5.184737532915708, + "learning_rate": 7.855697251155967e-05, + "loss": 0.5251, + "num_input_tokens_seen": 7955904, + "step": 1160 + }, + { + "epoch": 0.14272150929527427, + "grad_norm": 10.3925643217796, + "learning_rate": 7.835703917741212e-05, + "loss": 0.613, + "num_input_tokens_seen": 7990096, + "step": 1165 + }, + { + "epoch": 0.1433340479617776, + "grad_norm": 2.0510833692065225, + "learning_rate": 7.81564351951057e-05, + "loss": 0.5264, + "num_input_tokens_seen": 8024344, + "step": 1170 + }, + { + "epoch": 0.14394658662828091, + "grad_norm": 4.531421525747291, + "learning_rate": 7.795516530895414e-05, + "loss": 0.4993, + "num_input_tokens_seen": 8059128, + "step": 1175 + }, + { + "epoch": 0.14455912529478424, + "grad_norm": 2.3009325888627155, + "learning_rate": 7.775323427901993e-05, + "loss": 0.5187, + "num_input_tokens_seen": 8092072, + "step": 1180 + }, + { + "epoch": 0.14517166396128756, + "grad_norm": 6.253270602192802, + "learning_rate": 7.755064688100171e-05, + "loss": 0.5339, + "num_input_tokens_seen": 8125568, + "step": 1185 + }, + { + "epoch": 0.14578420262779088, + "grad_norm": 3.5757925641211483, + "learning_rate": 7.734740790612136e-05, + "loss": 0.5192, + "num_input_tokens_seen": 8159296, + "step": 1190 + }, + { + "epoch": 0.1463967412942942, + "grad_norm": 4.569421141701825, + "learning_rate": 7.714352216101055e-05, + "loss": 0.4876, + "num_input_tokens_seen": 8192992, + "step": 1195 + }, + { + "epoch": 0.14700927996079752, + "grad_norm": 5.452661792444726, + "learning_rate": 7.693899446759727e-05, + "loss": 0.4855, + "num_input_tokens_seen": 8226648, + "step": 1200 + }, + { + "epoch": 0.14700927996079752, + "eval_loss": 0.7277763485908508, + "eval_runtime": 19.1608, + "eval_samples_per_second": 3.131, + "eval_steps_per_second": 0.783, + "num_input_tokens_seen": 8226648, + "step": 1200 + }, + { + "epoch": 0.14762181862730084, + "grad_norm": 5.752036808070382, + "learning_rate": 7.673382966299163e-05, + "loss": 0.5579, + "num_input_tokens_seen": 8259304, + "step": 1205 + }, + { + "epoch": 0.14823435729380416, + "grad_norm": 3.992568230735242, + "learning_rate": 7.65280325993715e-05, + "loss": 0.5636, + "num_input_tokens_seen": 8292128, + "step": 1210 + }, + { + "epoch": 0.1488468959603075, + "grad_norm": 7.734398194807091, + "learning_rate": 7.63216081438678e-05, + "loss": 0.4929, + "num_input_tokens_seen": 8325272, + "step": 1215 + }, + { + "epoch": 0.14945943462681083, + "grad_norm": 16.702104083055847, + "learning_rate": 7.611456117844934e-05, + "loss": 0.5441, + "num_input_tokens_seen": 8359496, + "step": 1220 + }, + { + "epoch": 0.15007197329331415, + "grad_norm": 9.002016037657278, + "learning_rate": 7.59068965998074e-05, + "loss": 0.4872, + "num_input_tokens_seen": 8392792, + "step": 1225 + }, + { + "epoch": 0.15068451195981747, + "grad_norm": 2.809443933801699, + "learning_rate": 7.569861931923989e-05, + "loss": 0.572, + "num_input_tokens_seen": 8426176, + "step": 1230 + }, + { + "epoch": 0.1512970506263208, + "grad_norm": 9.859439460848197, + "learning_rate": 7.548973426253521e-05, + "loss": 0.5814, + "num_input_tokens_seen": 8459848, + "step": 1235 + }, + { + "epoch": 0.1519095892928241, + "grad_norm": 3.0162025660062524, + "learning_rate": 7.528024636985575e-05, + "loss": 0.5583, + "num_input_tokens_seen": 8492616, + "step": 1240 + }, + { + "epoch": 0.15252212795932743, + "grad_norm": 5.508205384983547, + "learning_rate": 7.507016059562107e-05, + "loss": 0.529, + "num_input_tokens_seen": 8526152, + "step": 1245 + }, + { + "epoch": 0.15313466662583075, + "grad_norm": 6.8002879433239585, + "learning_rate": 7.485948190839077e-05, + "loss": 0.604, + "num_input_tokens_seen": 8560104, + "step": 1250 + }, + { + "epoch": 0.15313466662583075, + "eval_loss": 0.6421969532966614, + "eval_runtime": 19.4047, + "eval_samples_per_second": 3.092, + "eval_steps_per_second": 0.773, + "num_input_tokens_seen": 8560104, + "step": 1250 + }, + { + "epoch": 0.15374720529233407, + "grad_norm": 6.211672151037699, + "learning_rate": 7.464821529074679e-05, + "loss": 0.4929, + "num_input_tokens_seen": 8594200, + "step": 1255 + }, + { + "epoch": 0.1543597439588374, + "grad_norm": 3.26015997954923, + "learning_rate": 7.443636573917585e-05, + "loss": 0.5225, + "num_input_tokens_seen": 8627784, + "step": 1260 + }, + { + "epoch": 0.1549722826253407, + "grad_norm": 1.7598154746479808, + "learning_rate": 7.422393826395108e-05, + "loss": 0.4588, + "num_input_tokens_seen": 8661280, + "step": 1265 + }, + { + "epoch": 0.15558482129184406, + "grad_norm": 2.1163402779996523, + "learning_rate": 7.40109378890136e-05, + "loss": 0.4687, + "num_input_tokens_seen": 8694496, + "step": 1270 + }, + { + "epoch": 0.15619735995834738, + "grad_norm": 4.523003839971559, + "learning_rate": 7.379736965185368e-05, + "loss": 0.5354, + "num_input_tokens_seen": 8727848, + "step": 1275 + }, + { + "epoch": 0.1568098986248507, + "grad_norm": 9.255700063105946, + "learning_rate": 7.358323860339165e-05, + "loss": 0.4773, + "num_input_tokens_seen": 8761416, + "step": 1280 + }, + { + "epoch": 0.15742243729135402, + "grad_norm": 10.756762069149207, + "learning_rate": 7.336854980785839e-05, + "loss": 0.4848, + "num_input_tokens_seen": 8794672, + "step": 1285 + }, + { + "epoch": 0.15803497595785734, + "grad_norm": 8.850740494844128, + "learning_rate": 7.315330834267553e-05, + "loss": 0.5411, + "num_input_tokens_seen": 8828392, + "step": 1290 + }, + { + "epoch": 0.15864751462436066, + "grad_norm": 25.225017139812458, + "learning_rate": 7.293751929833553e-05, + "loss": 0.5434, + "num_input_tokens_seen": 8861664, + "step": 1295 + }, + { + "epoch": 0.15926005329086398, + "grad_norm": 4.379564811463093, + "learning_rate": 7.272118777828108e-05, + "loss": 0.5032, + "num_input_tokens_seen": 8895088, + "step": 1300 + }, + { + "epoch": 0.15926005329086398, + "eval_loss": 0.643197774887085, + "eval_runtime": 19.5194, + "eval_samples_per_second": 3.074, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 8895088, + "step": 1300 + }, + { + "epoch": 0.1598725919573673, + "grad_norm": 3.8660747498397523, + "learning_rate": 7.250431889878455e-05, + "loss": 0.4661, + "num_input_tokens_seen": 8928264, + "step": 1305 + }, + { + "epoch": 0.16048513062387063, + "grad_norm": 22.310168392120698, + "learning_rate": 7.228691778882693e-05, + "loss": 0.5823, + "num_input_tokens_seen": 8961680, + "step": 1310 + }, + { + "epoch": 0.16109766929037395, + "grad_norm": 5.59389987365445, + "learning_rate": 7.20689895899765e-05, + "loss": 0.5125, + "num_input_tokens_seen": 8995272, + "step": 1315 + }, + { + "epoch": 0.16171020795687727, + "grad_norm": 5.171611814028534, + "learning_rate": 7.185053945626733e-05, + "loss": 0.5206, + "num_input_tokens_seen": 9029064, + "step": 1320 + }, + { + "epoch": 0.1623227466233806, + "grad_norm": 8.895819633569912, + "learning_rate": 7.163157255407732e-05, + "loss": 0.5189, + "num_input_tokens_seen": 9063336, + "step": 1325 + }, + { + "epoch": 0.16293528528988394, + "grad_norm": 9.66605025155887, + "learning_rate": 7.141209406200599e-05, + "loss": 0.5273, + "num_input_tokens_seen": 9097336, + "step": 1330 + }, + { + "epoch": 0.16354782395638726, + "grad_norm": 3.6663094454446163, + "learning_rate": 7.1192109170752e-05, + "loss": 0.4906, + "num_input_tokens_seen": 9130160, + "step": 1335 + }, + { + "epoch": 0.16416036262289058, + "grad_norm": 6.96401026699699, + "learning_rate": 7.097162308299054e-05, + "loss": 0.4826, + "num_input_tokens_seen": 9163616, + "step": 1340 + }, + { + "epoch": 0.1647729012893939, + "grad_norm": 45.646671151913786, + "learning_rate": 7.07506410132501e-05, + "loss": 0.5058, + "num_input_tokens_seen": 9197368, + "step": 1345 + }, + { + "epoch": 0.16538543995589722, + "grad_norm": 3.3363312775285863, + "learning_rate": 7.052916818778918e-05, + "loss": 0.5053, + "num_input_tokens_seen": 9231680, + "step": 1350 + }, + { + "epoch": 0.16538543995589722, + "eval_loss": 0.5727524161338806, + "eval_runtime": 19.6174, + "eval_samples_per_second": 3.059, + "eval_steps_per_second": 0.765, + "num_input_tokens_seen": 9231680, + "step": 1350 + }, + { + "epoch": 0.16599797862240054, + "grad_norm": 4.684347920621317, + "learning_rate": 7.030720984447279e-05, + "loss": 0.5549, + "num_input_tokens_seen": 9265288, + "step": 1355 + }, + { + "epoch": 0.16661051728890386, + "grad_norm": 3.633006740253046, + "learning_rate": 7.008477123264848e-05, + "loss": 0.4997, + "num_input_tokens_seen": 9299312, + "step": 1360 + }, + { + "epoch": 0.16722305595540718, + "grad_norm": 3.994491612036205, + "learning_rate": 6.986185761302224e-05, + "loss": 0.5059, + "num_input_tokens_seen": 9332808, + "step": 1365 + }, + { + "epoch": 0.1678355946219105, + "grad_norm": 3.9612574101216893, + "learning_rate": 6.963847425753403e-05, + "loss": 0.4701, + "num_input_tokens_seen": 9366504, + "step": 1370 + }, + { + "epoch": 0.16844813328841382, + "grad_norm": 9.059412856243592, + "learning_rate": 6.941462644923318e-05, + "loss": 0.4739, + "num_input_tokens_seen": 9399752, + "step": 1375 + }, + { + "epoch": 0.16906067195491714, + "grad_norm": 2.6467153867677915, + "learning_rate": 6.919031948215335e-05, + "loss": 0.4967, + "num_input_tokens_seen": 9433272, + "step": 1380 + }, + { + "epoch": 0.1696732106214205, + "grad_norm": 2.1665659612531094, + "learning_rate": 6.896555866118741e-05, + "loss": 0.4201, + "num_input_tokens_seen": 9466912, + "step": 1385 + }, + { + "epoch": 0.1702857492879238, + "grad_norm": 14.154453773252786, + "learning_rate": 6.87403493019619e-05, + "loss": 0.495, + "num_input_tokens_seen": 9499568, + "step": 1390 + }, + { + "epoch": 0.17089828795442713, + "grad_norm": 2.695555898133394, + "learning_rate": 6.851469673071143e-05, + "loss": 0.5148, + "num_input_tokens_seen": 9532896, + "step": 1395 + }, + { + "epoch": 0.17151082662093045, + "grad_norm": 4.919840688491465, + "learning_rate": 6.828860628415253e-05, + "loss": 0.4987, + "num_input_tokens_seen": 9566000, + "step": 1400 + }, + { + "epoch": 0.17151082662093045, + "eval_loss": 0.4715874493122101, + "eval_runtime": 19.4349, + "eval_samples_per_second": 3.087, + "eval_steps_per_second": 0.772, + "num_input_tokens_seen": 9566000, + "step": 1400 + }, + { + "epoch": 0.17212336528743377, + "grad_norm": 7.741057532313517, + "learning_rate": 6.806208330935766e-05, + "loss": 0.499, + "num_input_tokens_seen": 9599640, + "step": 1405 + }, + { + "epoch": 0.1727359039539371, + "grad_norm": 1.6504471731236128, + "learning_rate": 6.783513316362855e-05, + "loss": 0.4612, + "num_input_tokens_seen": 9632472, + "step": 1410 + }, + { + "epoch": 0.1733484426204404, + "grad_norm": 8.591564888030183, + "learning_rate": 6.760776121436962e-05, + "loss": 0.5042, + "num_input_tokens_seen": 9665592, + "step": 1415 + }, + { + "epoch": 0.17396098128694373, + "grad_norm": 4.478873945731118, + "learning_rate": 6.737997283896103e-05, + "loss": 0.4785, + "num_input_tokens_seen": 9699384, + "step": 1420 + }, + { + "epoch": 0.17457351995344705, + "grad_norm": 1.2906170502840724, + "learning_rate": 6.715177342463145e-05, + "loss": 0.402, + "num_input_tokens_seen": 9732128, + "step": 1425 + }, + { + "epoch": 0.17518605861995037, + "grad_norm": 5.114167697299532, + "learning_rate": 6.692316836833065e-05, + "loss": 0.5755, + "num_input_tokens_seen": 9765864, + "step": 1430 + }, + { + "epoch": 0.1757985972864537, + "grad_norm": 7.684603024954763, + "learning_rate": 6.6694163076602e-05, + "loss": 0.4779, + "num_input_tokens_seen": 9799608, + "step": 1435 + }, + { + "epoch": 0.17641113595295704, + "grad_norm": 2.8697832057788206, + "learning_rate": 6.646476296545434e-05, + "loss": 0.5047, + "num_input_tokens_seen": 9832880, + "step": 1440 + }, + { + "epoch": 0.17702367461946036, + "grad_norm": 3.024432041176809, + "learning_rate": 6.623497346023418e-05, + "loss": 0.4869, + "num_input_tokens_seen": 9865744, + "step": 1445 + }, + { + "epoch": 0.17763621328596368, + "grad_norm": 1.7701711760338612, + "learning_rate": 6.60047999954972e-05, + "loss": 0.44, + "num_input_tokens_seen": 9899080, + "step": 1450 + }, + { + "epoch": 0.17763621328596368, + "eval_loss": 0.5081247091293335, + "eval_runtime": 19.5618, + "eval_samples_per_second": 3.067, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 9899080, + "step": 1450 + }, + { + "epoch": 0.178248751952467, + "grad_norm": 6.765067002881245, + "learning_rate": 6.57742480148798e-05, + "loss": 0.4498, + "num_input_tokens_seen": 9932296, + "step": 1455 + }, + { + "epoch": 0.17886129061897033, + "grad_norm": 4.466389883174556, + "learning_rate": 6.554332297097031e-05, + "loss": 0.5155, + "num_input_tokens_seen": 9965544, + "step": 1460 + }, + { + "epoch": 0.17947382928547365, + "grad_norm": 5.0557445306364865, + "learning_rate": 6.53120303251801e-05, + "loss": 0.5036, + "num_input_tokens_seen": 9999688, + "step": 1465 + }, + { + "epoch": 0.18008636795197697, + "grad_norm": 6.4846295825850095, + "learning_rate": 6.508037554761432e-05, + "loss": 0.4837, + "num_input_tokens_seen": 10033136, + "step": 1470 + }, + { + "epoch": 0.1806989066184803, + "grad_norm": 3.0111405904068085, + "learning_rate": 6.484836411694267e-05, + "loss": 0.4418, + "num_input_tokens_seen": 10066832, + "step": 1475 + }, + { + "epoch": 0.1813114452849836, + "grad_norm": 3.696017880148453, + "learning_rate": 6.461600152026965e-05, + "loss": 0.4343, + "num_input_tokens_seen": 10100952, + "step": 1480 + }, + { + "epoch": 0.18192398395148693, + "grad_norm": 14.563638252173728, + "learning_rate": 6.438329325300499e-05, + "loss": 0.4359, + "num_input_tokens_seen": 10134200, + "step": 1485 + }, + { + "epoch": 0.18253652261799025, + "grad_norm": 2.255326722686918, + "learning_rate": 6.415024481873352e-05, + "loss": 0.4746, + "num_input_tokens_seen": 10168056, + "step": 1490 + }, + { + "epoch": 0.1831490612844936, + "grad_norm": 1.3918379738269266, + "learning_rate": 6.391686172908506e-05, + "loss": 0.4221, + "num_input_tokens_seen": 10201120, + "step": 1495 + }, + { + "epoch": 0.18376159995099692, + "grad_norm": 9.497435208925031, + "learning_rate": 6.368314950360415e-05, + "loss": 0.4207, + "num_input_tokens_seen": 10234856, + "step": 1500 + }, + { + "epoch": 0.18376159995099692, + "eval_loss": 0.5098862648010254, + "eval_runtime": 19.432, + "eval_samples_per_second": 3.088, + "eval_steps_per_second": 0.772, + "num_input_tokens_seen": 10234856, + "step": 1500 + }, + { + "epoch": 0.18437413861750024, + "grad_norm": 1.3864612953069486, + "learning_rate": 6.344911366961934e-05, + "loss": 0.4755, + "num_input_tokens_seen": 10268128, + "step": 1505 + }, + { + "epoch": 0.18498667728400356, + "grad_norm": 4.356350101962207, + "learning_rate": 6.321475976211266e-05, + "loss": 0.4894, + "num_input_tokens_seen": 10301440, + "step": 1510 + }, + { + "epoch": 0.18559921595050688, + "grad_norm": 6.4236347714453075, + "learning_rate": 6.298009332358856e-05, + "loss": 0.4294, + "num_input_tokens_seen": 10334896, + "step": 1515 + }, + { + "epoch": 0.1862117546170102, + "grad_norm": 10.135022020769075, + "learning_rate": 6.274511990394294e-05, + "loss": 0.5088, + "num_input_tokens_seen": 10368584, + "step": 1520 + }, + { + "epoch": 0.18682429328351352, + "grad_norm": 9.418474229161003, + "learning_rate": 6.250984506033183e-05, + "loss": 0.4386, + "num_input_tokens_seen": 10401576, + "step": 1525 + }, + { + "epoch": 0.18743683195001684, + "grad_norm": 10.370745667271652, + "learning_rate": 6.227427435703997e-05, + "loss": 0.4836, + "num_input_tokens_seen": 10435032, + "step": 1530 + }, + { + "epoch": 0.18804937061652016, + "grad_norm": 5.185731977505902, + "learning_rate": 6.203841336534924e-05, + "loss": 0.536, + "num_input_tokens_seen": 10468336, + "step": 1535 + }, + { + "epoch": 0.18866190928302348, + "grad_norm": 1.9824580512505723, + "learning_rate": 6.180226766340688e-05, + "loss": 0.4544, + "num_input_tokens_seen": 10502128, + "step": 1540 + }, + { + "epoch": 0.1892744479495268, + "grad_norm": 8.502162189428063, + "learning_rate": 6.156584283609359e-05, + "loss": 0.4647, + "num_input_tokens_seen": 10536432, + "step": 1545 + }, + { + "epoch": 0.18988698661603012, + "grad_norm": 5.974448872500084, + "learning_rate": 6.132914447489137e-05, + "loss": 0.4664, + "num_input_tokens_seen": 10570832, + "step": 1550 + }, + { + "epoch": 0.18988698661603012, + "eval_loss": 0.4264271855354309, + "eval_runtime": 19.8237, + "eval_samples_per_second": 3.027, + "eval_steps_per_second": 0.757, + "num_input_tokens_seen": 10570832, + "step": 1550 + }, + { + "epoch": 0.19049952528253347, + "grad_norm": 5.121820867293283, + "learning_rate": 6.109217817775139e-05, + "loss": 0.401, + "num_input_tokens_seen": 10604448, + "step": 1555 + }, + { + "epoch": 0.1911120639490368, + "grad_norm": 7.100371005459897, + "learning_rate": 6.085494954896156e-05, + "loss": 0.4273, + "num_input_tokens_seen": 10638216, + "step": 1560 + }, + { + "epoch": 0.1917246026155401, + "grad_norm": 8.936525327373612, + "learning_rate": 6.061746419901388e-05, + "loss": 0.528, + "num_input_tokens_seen": 10672472, + "step": 1565 + }, + { + "epoch": 0.19233714128204343, + "grad_norm": 14.602240917541037, + "learning_rate": 6.0379727744471936e-05, + "loss": 0.4963, + "num_input_tokens_seen": 10705584, + "step": 1570 + }, + { + "epoch": 0.19294967994854675, + "grad_norm": 11.855331751101119, + "learning_rate": 6.014174580783794e-05, + "loss": 0.4823, + "num_input_tokens_seen": 10739776, + "step": 1575 + }, + { + "epoch": 0.19356221861505007, + "grad_norm": 9.719201402626247, + "learning_rate": 5.990352401741981e-05, + "loss": 0.4346, + "num_input_tokens_seen": 10773128, + "step": 1580 + }, + { + "epoch": 0.1941747572815534, + "grad_norm": 9.728724735445457, + "learning_rate": 5.9665068007197976e-05, + "loss": 0.4038, + "num_input_tokens_seen": 10807296, + "step": 1585 + }, + { + "epoch": 0.19478729594805672, + "grad_norm": 1.7241609666647286, + "learning_rate": 5.94263834166923e-05, + "loss": 0.5107, + "num_input_tokens_seen": 10840664, + "step": 1590 + }, + { + "epoch": 0.19539983461456004, + "grad_norm": 1.4074914573563075, + "learning_rate": 5.918747589082853e-05, + "loss": 0.4815, + "num_input_tokens_seen": 10873584, + "step": 1595 + }, + { + "epoch": 0.19601237328106336, + "grad_norm": 12.781467204070063, + "learning_rate": 5.8948351079804875e-05, + "loss": 0.4713, + "num_input_tokens_seen": 10907608, + "step": 1600 + }, + { + "epoch": 0.19601237328106336, + "eval_loss": 0.503852128982544, + "eval_runtime": 19.6279, + "eval_samples_per_second": 3.057, + "eval_steps_per_second": 0.764, + "num_input_tokens_seen": 10907608, + "step": 1600 + }, + { + "epoch": 0.19662491194756668, + "grad_norm": 11.366948883426232, + "learning_rate": 5.8709014638958404e-05, + "loss": 0.4977, + "num_input_tokens_seen": 10941040, + "step": 1605 + }, + { + "epoch": 0.19723745061407003, + "grad_norm": 6.471326401400977, + "learning_rate": 5.846947222863123e-05, + "loss": 0.4665, + "num_input_tokens_seen": 10974800, + "step": 1610 + }, + { + "epoch": 0.19784998928057335, + "grad_norm": 3.599522546558751, + "learning_rate": 5.8229729514036705e-05, + "loss": 0.4976, + "num_input_tokens_seen": 11008560, + "step": 1615 + }, + { + "epoch": 0.19846252794707667, + "grad_norm": 11.22738100315125, + "learning_rate": 5.7989792165125356e-05, + "loss": 0.5012, + "num_input_tokens_seen": 11041792, + "step": 1620 + }, + { + "epoch": 0.19907506661358, + "grad_norm": 10.739653066852702, + "learning_rate": 5.774966585645092e-05, + "loss": 0.5137, + "num_input_tokens_seen": 11075256, + "step": 1625 + }, + { + "epoch": 0.1996876052800833, + "grad_norm": 8.478040113620175, + "learning_rate": 5.7509356267035975e-05, + "loss": 0.4566, + "num_input_tokens_seen": 11109320, + "step": 1630 + }, + { + "epoch": 0.20030014394658663, + "grad_norm": 9.098300294796118, + "learning_rate": 5.726886908023776e-05, + "loss": 0.4311, + "num_input_tokens_seen": 11142816, + "step": 1635 + }, + { + "epoch": 0.20091268261308995, + "grad_norm": 3.502110738643531, + "learning_rate": 5.702820998361373e-05, + "loss": 0.5047, + "num_input_tokens_seen": 11175992, + "step": 1640 + }, + { + "epoch": 0.20152522127959327, + "grad_norm": 5.385944472211836, + "learning_rate": 5.6787384668786994e-05, + "loss": 0.3824, + "num_input_tokens_seen": 11209952, + "step": 1645 + }, + { + "epoch": 0.2021377599460966, + "grad_norm": 8.694905773461128, + "learning_rate": 5.654639883131178e-05, + "loss": 0.4385, + "num_input_tokens_seen": 11243856, + "step": 1650 + }, + { + "epoch": 0.2021377599460966, + "eval_loss": 0.4518810510635376, + "eval_runtime": 19.7884, + "eval_samples_per_second": 3.032, + "eval_steps_per_second": 0.758, + "num_input_tokens_seen": 11243856, + "step": 1650 + }, + { + "epoch": 0.2027502986125999, + "grad_norm": 8.839831223557832, + "learning_rate": 5.6305258170538676e-05, + "loss": 0.4356, + "num_input_tokens_seen": 11277312, + "step": 1655 + }, + { + "epoch": 0.20336283727910323, + "grad_norm": 9.582783583521389, + "learning_rate": 5.606396838947988e-05, + "loss": 0.4458, + "num_input_tokens_seen": 11310576, + "step": 1660 + }, + { + "epoch": 0.20397537594560658, + "grad_norm": 31.581364520370983, + "learning_rate": 5.582253519467432e-05, + "loss": 0.4618, + "num_input_tokens_seen": 11344448, + "step": 1665 + }, + { + "epoch": 0.2045879146121099, + "grad_norm": 4.9445636584070405, + "learning_rate": 5.558096429605263e-05, + "loss": 0.4647, + "num_input_tokens_seen": 11377768, + "step": 1670 + }, + { + "epoch": 0.20520045327861322, + "grad_norm": 5.914575788346067, + "learning_rate": 5.533926140680221e-05, + "loss": 0.4304, + "num_input_tokens_seen": 11411408, + "step": 1675 + }, + { + "epoch": 0.20581299194511654, + "grad_norm": 9.949985795513346, + "learning_rate": 5.509743224323203e-05, + "loss": 0.5269, + "num_input_tokens_seen": 11445104, + "step": 1680 + }, + { + "epoch": 0.20642553061161986, + "grad_norm": 11.77212649220159, + "learning_rate": 5.485548252463749e-05, + "loss": 0.4352, + "num_input_tokens_seen": 11478128, + "step": 1685 + }, + { + "epoch": 0.20703806927812318, + "grad_norm": 5.315659390063466, + "learning_rate": 5.4613417973165106e-05, + "loss": 0.4879, + "num_input_tokens_seen": 11511768, + "step": 1690 + }, + { + "epoch": 0.2076506079446265, + "grad_norm": 4.668792370073011, + "learning_rate": 5.4371244313677225e-05, + "loss": 0.4436, + "num_input_tokens_seen": 11545128, + "step": 1695 + }, + { + "epoch": 0.20826314661112982, + "grad_norm": 9.478015429036938, + "learning_rate": 5.4128967273616625e-05, + "loss": 0.4487, + "num_input_tokens_seen": 11578832, + "step": 1700 + }, + { + "epoch": 0.20826314661112982, + "eval_loss": 0.49612876772880554, + "eval_runtime": 19.3772, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 0.774, + "num_input_tokens_seen": 11578832, + "step": 1700 + }, + { + "epoch": 0.20887568527763314, + "grad_norm": 8.347528695570064, + "learning_rate": 5.388659258287102e-05, + "loss": 0.4203, + "num_input_tokens_seen": 11612808, + "step": 1705 + }, + { + "epoch": 0.20948822394413646, + "grad_norm": 15.597256430225785, + "learning_rate": 5.364412597363759e-05, + "loss": 0.4455, + "num_input_tokens_seen": 11646424, + "step": 1710 + }, + { + "epoch": 0.21010076261063979, + "grad_norm": 3.677381831905675, + "learning_rate": 5.3401573180287426e-05, + "loss": 0.5335, + "num_input_tokens_seen": 11679824, + "step": 1715 + }, + { + "epoch": 0.21071330127714313, + "grad_norm": 4.062997502486425, + "learning_rate": 5.315893993922986e-05, + "loss": 0.3974, + "num_input_tokens_seen": 11713928, + "step": 1720 + }, + { + "epoch": 0.21132583994364645, + "grad_norm": 10.159185964965634, + "learning_rate": 5.29162319887768e-05, + "loss": 0.479, + "num_input_tokens_seen": 11747400, + "step": 1725 + }, + { + "epoch": 0.21193837861014977, + "grad_norm": 5.441259983154067, + "learning_rate": 5.26734550690071e-05, + "loss": 0.388, + "num_input_tokens_seen": 11781016, + "step": 1730 + }, + { + "epoch": 0.2125509172766531, + "grad_norm": 19.73237170097979, + "learning_rate": 5.243061492163073e-05, + "loss": 0.4914, + "num_input_tokens_seen": 11815064, + "step": 1735 + }, + { + "epoch": 0.21316345594315642, + "grad_norm": 3.493641501182873, + "learning_rate": 5.2187717289852955e-05, + "loss": 0.4622, + "num_input_tokens_seen": 11848280, + "step": 1740 + }, + { + "epoch": 0.21377599460965974, + "grad_norm": 11.906732736443383, + "learning_rate": 5.1944767918238624e-05, + "loss": 0.4828, + "num_input_tokens_seen": 11881960, + "step": 1745 + }, + { + "epoch": 0.21438853327616306, + "grad_norm": 7.438336883777484, + "learning_rate": 5.170177255257618e-05, + "loss": 0.5235, + "num_input_tokens_seen": 11916256, + "step": 1750 + }, + { + "epoch": 0.21438853327616306, + "eval_loss": 0.42333686351776123, + "eval_runtime": 19.3903, + "eval_samples_per_second": 3.094, + "eval_steps_per_second": 0.774, + "num_input_tokens_seen": 11916256, + "step": 1750 + }, + { + "epoch": 0.21500107194266638, + "grad_norm": 4.53730252225709, + "learning_rate": 5.145873693974188e-05, + "loss": 0.3928, + "num_input_tokens_seen": 11950568, + "step": 1755 + }, + { + "epoch": 0.2156136106091697, + "grad_norm": 11.02527272420348, + "learning_rate": 5.12156668275638e-05, + "loss": 0.3973, + "num_input_tokens_seen": 11984568, + "step": 1760 + }, + { + "epoch": 0.21622614927567302, + "grad_norm": 5.1988597139524195, + "learning_rate": 5.097256796468598e-05, + "loss": 0.4793, + "num_input_tokens_seen": 12017768, + "step": 1765 + }, + { + "epoch": 0.21683868794217634, + "grad_norm": 4.213782685694989, + "learning_rate": 5.072944610043232e-05, + "loss": 0.4581, + "num_input_tokens_seen": 12051280, + "step": 1770 + }, + { + "epoch": 0.21745122660867966, + "grad_norm": 4.503591673421781, + "learning_rate": 5.048630698467081e-05, + "loss": 0.4589, + "num_input_tokens_seen": 12084648, + "step": 1775 + }, + { + "epoch": 0.218063765275183, + "grad_norm": 15.808394441028096, + "learning_rate": 5.024315636767738e-05, + "loss": 0.4192, + "num_input_tokens_seen": 12118104, + "step": 1780 + }, + { + "epoch": 0.21867630394168633, + "grad_norm": 2.7188846806829714, + "learning_rate": 5e-05, + "loss": 0.4301, + "num_input_tokens_seen": 12150744, + "step": 1785 + }, + { + "epoch": 0.21928884260818965, + "grad_norm": 2.387735076616511, + "learning_rate": 4.9756843632322626e-05, + "loss": 0.4387, + "num_input_tokens_seen": 12184368, + "step": 1790 + }, + { + "epoch": 0.21990138127469297, + "grad_norm": 8.062473009971352, + "learning_rate": 4.9513693015329197e-05, + "loss": 0.3937, + "num_input_tokens_seen": 12218216, + "step": 1795 + }, + { + "epoch": 0.2205139199411963, + "grad_norm": 8.02238943963026, + "learning_rate": 4.9270553899567686e-05, + "loss": 0.4561, + "num_input_tokens_seen": 12251696, + "step": 1800 + }, + { + "epoch": 0.2205139199411963, + "eval_loss": 0.3055322766304016, + "eval_runtime": 19.5686, + "eval_samples_per_second": 3.066, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 12251696, + "step": 1800 + }, + { + "epoch": 0.2211264586076996, + "grad_norm": 1.353620144495087, + "learning_rate": 4.902743203531405e-05, + "loss": 0.8177, + "num_input_tokens_seen": 12259384, + "step": 1805 + }, + { + "epoch": 0.22173899727420293, + "grad_norm": 0.5700418045320662, + "learning_rate": 4.8784333172436206e-05, + "loss": 0.7884, + "num_input_tokens_seen": 12266544, + "step": 1810 + }, + { + "epoch": 0.22235153594070625, + "grad_norm": 0.6797749565907413, + "learning_rate": 4.854126306025812e-05, + "loss": 0.8374, + "num_input_tokens_seen": 12274880, + "step": 1815 + }, + { + "epoch": 0.22296407460720957, + "grad_norm": 0.4177728052122358, + "learning_rate": 4.829822744742383e-05, + "loss": 0.8302, + "num_input_tokens_seen": 12282472, + "step": 1820 + }, + { + "epoch": 0.2235766132737129, + "grad_norm": 0.4119247756190227, + "learning_rate": 4.8055232081761395e-05, + "loss": 0.7611, + "num_input_tokens_seen": 12290488, + "step": 1825 + }, + { + "epoch": 0.2241891519402162, + "grad_norm": 0.41311976216024787, + "learning_rate": 4.781228271014704e-05, + "loss": 0.7718, + "num_input_tokens_seen": 12298600, + "step": 1830 + }, + { + "epoch": 0.22480169060671956, + "grad_norm": 0.37605463315990156, + "learning_rate": 4.756938507836929e-05, + "loss": 0.821, + "num_input_tokens_seen": 12306720, + "step": 1835 + }, + { + "epoch": 0.22541422927322288, + "grad_norm": 0.49067278201580067, + "learning_rate": 4.732654493099291e-05, + "loss": 0.7713, + "num_input_tokens_seen": 12314240, + "step": 1840 + }, + { + "epoch": 0.2260267679397262, + "grad_norm": 0.45665762418068107, + "learning_rate": 4.708376801122321e-05, + "loss": 0.8226, + "num_input_tokens_seen": 12321944, + "step": 1845 + }, + { + "epoch": 0.22663930660622952, + "grad_norm": 0.421164864448204, + "learning_rate": 4.6841060060770154e-05, + "loss": 0.7805, + "num_input_tokens_seen": 12329936, + "step": 1850 + }, + { + "epoch": 0.22663930660622952, + "eval_loss": 0.9060287475585938, + "eval_runtime": 46.026, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.326, + "num_input_tokens_seen": 12329936, + "step": 1850 + }, + { + "epoch": 0.22725184527273284, + "grad_norm": 0.3018912528420499, + "learning_rate": 4.659842681971258e-05, + "loss": 0.7034, + "num_input_tokens_seen": 12337664, + "step": 1855 + }, + { + "epoch": 0.22786438393923616, + "grad_norm": 0.3079166720080624, + "learning_rate": 4.635587402636241e-05, + "loss": 0.7901, + "num_input_tokens_seen": 12345248, + "step": 1860 + }, + { + "epoch": 0.22847692260573949, + "grad_norm": 0.429458182711573, + "learning_rate": 4.611340741712901e-05, + "loss": 0.7636, + "num_input_tokens_seen": 12353168, + "step": 1865 + }, + { + "epoch": 0.2290894612722428, + "grad_norm": 0.3321165402912945, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.7921, + "num_input_tokens_seen": 12361080, + "step": 1870 + }, + { + "epoch": 0.22970199993874613, + "grad_norm": 0.6526788000479108, + "learning_rate": 4.562875568632278e-05, + "loss": 0.7547, + "num_input_tokens_seen": 12369008, + "step": 1875 + }, + { + "epoch": 0.23031453860524945, + "grad_norm": 0.3296520809478606, + "learning_rate": 4.5386582026834906e-05, + "loss": 0.7681, + "num_input_tokens_seen": 12377544, + "step": 1880 + }, + { + "epoch": 0.23092707727175277, + "grad_norm": 0.3431777934452918, + "learning_rate": 4.5144517475362514e-05, + "loss": 0.7373, + "num_input_tokens_seen": 12384824, + "step": 1885 + }, + { + "epoch": 0.23153961593825612, + "grad_norm": 0.49744484948698575, + "learning_rate": 4.490256775676797e-05, + "loss": 0.7915, + "num_input_tokens_seen": 12392984, + "step": 1890 + }, + { + "epoch": 0.23215215460475944, + "grad_norm": 0.3448457492646803, + "learning_rate": 4.466073859319781e-05, + "loss": 0.8194, + "num_input_tokens_seen": 12401976, + "step": 1895 + }, + { + "epoch": 0.23276469327126276, + "grad_norm": 0.5914883849730865, + "learning_rate": 4.441903570394739e-05, + "loss": 0.7445, + "num_input_tokens_seen": 12410504, + "step": 1900 + }, + { + "epoch": 0.23276469327126276, + "eval_loss": 0.9240804314613342, + "eval_runtime": 17.9272, + "eval_samples_per_second": 3.347, + "eval_steps_per_second": 0.837, + "num_input_tokens_seen": 12410504, + "step": 1900 + }, + { + "epoch": 0.23337723193776608, + "grad_norm": 5.561253559620275, + "learning_rate": 4.41774648053257e-05, + "loss": 0.4769, + "num_input_tokens_seen": 12444176, + "step": 1905 + }, + { + "epoch": 0.2339897706042694, + "grad_norm": 2.2715839553470727, + "learning_rate": 4.3936031610520124e-05, + "loss": 0.5032, + "num_input_tokens_seen": 12477960, + "step": 1910 + }, + { + "epoch": 0.23460230927077272, + "grad_norm": 2.022015183766511, + "learning_rate": 4.3694741829461336e-05, + "loss": 0.4889, + "num_input_tokens_seen": 12510944, + "step": 1915 + }, + { + "epoch": 0.23521484793727604, + "grad_norm": 2.96224103966897, + "learning_rate": 4.345360116868823e-05, + "loss": 0.4184, + "num_input_tokens_seen": 12544808, + "step": 1920 + }, + { + "epoch": 0.23582738660377936, + "grad_norm": 7.708605720695005, + "learning_rate": 4.321261533121303e-05, + "loss": 0.4049, + "num_input_tokens_seen": 12577632, + "step": 1925 + }, + { + "epoch": 0.23643992527028268, + "grad_norm": 7.623302574606088, + "learning_rate": 4.2971790016386286e-05, + "loss": 0.4695, + "num_input_tokens_seen": 12610192, + "step": 1930 + }, + { + "epoch": 0.237052463936786, + "grad_norm": 1.8514536063761085, + "learning_rate": 4.273113091976225e-05, + "loss": 0.4962, + "num_input_tokens_seen": 12643688, + "step": 1935 + }, + { + "epoch": 0.23766500260328932, + "grad_norm": 19.66635272029866, + "learning_rate": 4.249064373296403e-05, + "loss": 0.4782, + "num_input_tokens_seen": 12677312, + "step": 1940 + }, + { + "epoch": 0.23827754126979264, + "grad_norm": 20.575936119954726, + "learning_rate": 4.225033414354908e-05, + "loss": 0.5615, + "num_input_tokens_seen": 12709896, + "step": 1945 + }, + { + "epoch": 0.238890079936296, + "grad_norm": 4.594313188449527, + "learning_rate": 4.201020783487464e-05, + "loss": 0.4354, + "num_input_tokens_seen": 12744416, + "step": 1950 + }, + { + "epoch": 0.238890079936296, + "eval_loss": 0.36006462574005127, + "eval_runtime": 47.6166, + "eval_samples_per_second": 1.26, + "eval_steps_per_second": 0.315, + "num_input_tokens_seen": 12744416, + "step": 1950 + }, + { + "epoch": 0.2395026186027993, + "grad_norm": 4.68189212550613, + "learning_rate": 4.17702704859633e-05, + "loss": 0.4335, + "num_input_tokens_seen": 12777816, + "step": 1955 + }, + { + "epoch": 0.24011515726930263, + "grad_norm": 23.402201287251145, + "learning_rate": 4.153052777136879e-05, + "loss": 0.4991, + "num_input_tokens_seen": 12811792, + "step": 1960 + }, + { + "epoch": 0.24072769593580595, + "grad_norm": 4.276835952734358, + "learning_rate": 4.1290985361041614e-05, + "loss": 0.4099, + "num_input_tokens_seen": 12845480, + "step": 1965 + }, + { + "epoch": 0.24134023460230927, + "grad_norm": 7.92599956097377, + "learning_rate": 4.105164892019514e-05, + "loss": 0.4272, + "num_input_tokens_seen": 12879160, + "step": 1970 + }, + { + "epoch": 0.2419527732688126, + "grad_norm": 13.615325924587754, + "learning_rate": 4.0812524109171476e-05, + "loss": 0.4238, + "num_input_tokens_seen": 12912816, + "step": 1975 + }, + { + "epoch": 0.2425653119353159, + "grad_norm": 2.7571074966923397, + "learning_rate": 4.0573616583307705e-05, + "loss": 0.368, + "num_input_tokens_seen": 12946824, + "step": 1980 + }, + { + "epoch": 0.24317785060181923, + "grad_norm": 16.927706810967727, + "learning_rate": 4.033493199280202e-05, + "loss": 0.3969, + "num_input_tokens_seen": 12980496, + "step": 1985 + }, + { + "epoch": 0.24379038926832255, + "grad_norm": 1.4605930584524072, + "learning_rate": 4.009647598258022e-05, + "loss": 0.399, + "num_input_tokens_seen": 13013696, + "step": 1990 + }, + { + "epoch": 0.24440292793482588, + "grad_norm": 8.843701569943493, + "learning_rate": 3.985825419216207e-05, + "loss": 0.4231, + "num_input_tokens_seen": 13048176, + "step": 1995 + }, + { + "epoch": 0.2450154666013292, + "grad_norm": 19.18251767586399, + "learning_rate": 3.962027225552807e-05, + "loss": 0.3997, + "num_input_tokens_seen": 13081512, + "step": 2000 + }, + { + "epoch": 0.2450154666013292, + "eval_loss": 0.3849605619907379, + "eval_runtime": 19.4758, + "eval_samples_per_second": 3.081, + "eval_steps_per_second": 0.77, + "num_input_tokens_seen": 13081512, + "step": 2000 + }, + { + "epoch": 0.24562800526783254, + "grad_norm": 1.9769103366831327, + "learning_rate": 3.938253580098613e-05, + "loss": 0.4169, + "num_input_tokens_seen": 13114744, + "step": 2005 + }, + { + "epoch": 0.24624054393433586, + "grad_norm": 22.487375140704923, + "learning_rate": 3.914505045103845e-05, + "loss": 0.4124, + "num_input_tokens_seen": 13149336, + "step": 2010 + }, + { + "epoch": 0.24685308260083919, + "grad_norm": 4.015313196977713, + "learning_rate": 3.8907821822248605e-05, + "loss": 0.3863, + "num_input_tokens_seen": 13183000, + "step": 2015 + }, + { + "epoch": 0.2474656212673425, + "grad_norm": 28.645157052908086, + "learning_rate": 3.867085552510864e-05, + "loss": 0.4316, + "num_input_tokens_seen": 13216712, + "step": 2020 + }, + { + "epoch": 0.24807815993384583, + "grad_norm": 8.97088595108773, + "learning_rate": 3.843415716390644e-05, + "loss": 0.437, + "num_input_tokens_seen": 13250264, + "step": 2025 + }, + { + "epoch": 0.24869069860034915, + "grad_norm": 2.144718330247284, + "learning_rate": 3.819773233659314e-05, + "loss": 0.429, + "num_input_tokens_seen": 13283480, + "step": 2030 + }, + { + "epoch": 0.24930323726685247, + "grad_norm": 14.650720012943712, + "learning_rate": 3.7961586634650767e-05, + "loss": 0.3738, + "num_input_tokens_seen": 13316784, + "step": 2035 + }, + { + "epoch": 0.2499157759333558, + "grad_norm": 11.359385702545179, + "learning_rate": 3.772572564296005e-05, + "loss": 0.4322, + "num_input_tokens_seen": 13350408, + "step": 2040 + }, + { + "epoch": 0.25052831459985914, + "grad_norm": 20.751442880984992, + "learning_rate": 3.749015493966817e-05, + "loss": 0.488, + "num_input_tokens_seen": 13382992, + "step": 2045 + }, + { + "epoch": 0.25114085326636243, + "grad_norm": 9.202014165433594, + "learning_rate": 3.7254880096057073e-05, + "loss": 0.4267, + "num_input_tokens_seen": 13417048, + "step": 2050 + }, + { + "epoch": 0.25114085326636243, + "eval_loss": 0.494172602891922, + "eval_runtime": 48.3314, + "eval_samples_per_second": 1.241, + "eval_steps_per_second": 0.31, + "num_input_tokens_seen": 13417048, + "step": 2050 + }, + { + "epoch": 0.2517533919328658, + "grad_norm": 9.56597676789059, + "learning_rate": 3.7019906676411446e-05, + "loss": 0.499, + "num_input_tokens_seen": 13450600, + "step": 2055 + }, + { + "epoch": 0.25236593059936907, + "grad_norm": 15.111786919214577, + "learning_rate": 3.678524023788735e-05, + "loss": 0.3934, + "num_input_tokens_seen": 13483712, + "step": 2060 + }, + { + "epoch": 0.2529784692658724, + "grad_norm": 9.636847540533905, + "learning_rate": 3.6550886330380665e-05, + "loss": 0.5314, + "num_input_tokens_seen": 13517160, + "step": 2065 + }, + { + "epoch": 0.2535910079323757, + "grad_norm": 3.4096644426738534, + "learning_rate": 3.631685049639586e-05, + "loss": 0.4361, + "num_input_tokens_seen": 13550536, + "step": 2070 + }, + { + "epoch": 0.25420354659887906, + "grad_norm": 30.17044681697227, + "learning_rate": 3.608313827091493e-05, + "loss": 0.4021, + "num_input_tokens_seen": 13584624, + "step": 2075 + }, + { + "epoch": 0.25481608526538235, + "grad_norm": 25.76593356219763, + "learning_rate": 3.5849755181266474e-05, + "loss": 0.4237, + "num_input_tokens_seen": 13618640, + "step": 2080 + }, + { + "epoch": 0.2554286239318857, + "grad_norm": 13.088571765226323, + "learning_rate": 3.5616706746995026e-05, + "loss": 0.4166, + "num_input_tokens_seen": 13652304, + "step": 2085 + }, + { + "epoch": 0.25604116259838905, + "grad_norm": 23.32466588590324, + "learning_rate": 3.538399847973036e-05, + "loss": 0.3853, + "num_input_tokens_seen": 13685632, + "step": 2090 + }, + { + "epoch": 0.25665370126489234, + "grad_norm": 24.40048049188315, + "learning_rate": 3.515163588305735e-05, + "loss": 0.4844, + "num_input_tokens_seen": 13719416, + "step": 2095 + }, + { + "epoch": 0.2572662399313957, + "grad_norm": 6.703265005863666, + "learning_rate": 3.491962445238569e-05, + "loss": 0.4008, + "num_input_tokens_seen": 13752680, + "step": 2100 + }, + { + "epoch": 0.2572662399313957, + "eval_loss": 0.35964828729629517, + "eval_runtime": 20.1866, + "eval_samples_per_second": 2.972, + "eval_steps_per_second": 0.743, + "num_input_tokens_seen": 13752680, + "step": 2100 + }, + { + "epoch": 0.257878778597899, + "grad_norm": 9.515604275498823, + "learning_rate": 3.4687969674819906e-05, + "loss": 0.4508, + "num_input_tokens_seen": 13785680, + "step": 2105 + }, + { + "epoch": 0.25849131726440233, + "grad_norm": 2.7386560103066175, + "learning_rate": 3.445667702902969e-05, + "loss": 0.3373, + "num_input_tokens_seen": 13820392, + "step": 2110 + }, + { + "epoch": 0.2591038559309056, + "grad_norm": 9.003875068176624, + "learning_rate": 3.4225751985120215e-05, + "loss": 0.4516, + "num_input_tokens_seen": 13853688, + "step": 2115 + }, + { + "epoch": 0.259716394597409, + "grad_norm": 12.008310452093387, + "learning_rate": 3.3995200004502816e-05, + "loss": 0.3864, + "num_input_tokens_seen": 13887904, + "step": 2120 + }, + { + "epoch": 0.26032893326391227, + "grad_norm": 2.00737022432206, + "learning_rate": 3.3765026539765834e-05, + "loss": 0.3819, + "num_input_tokens_seen": 13921040, + "step": 2125 + }, + { + "epoch": 0.2609414719304156, + "grad_norm": 10.440370861237607, + "learning_rate": 3.3535237034545675e-05, + "loss": 0.3935, + "num_input_tokens_seen": 13954752, + "step": 2130 + }, + { + "epoch": 0.2615540105969189, + "grad_norm": 9.230846114493719, + "learning_rate": 3.330583692339802e-05, + "loss": 0.4752, + "num_input_tokens_seen": 13988376, + "step": 2135 + }, + { + "epoch": 0.26216654926342225, + "grad_norm": 4.660334845751664, + "learning_rate": 3.307683163166934e-05, + "loss": 0.3697, + "num_input_tokens_seen": 14023000, + "step": 2140 + }, + { + "epoch": 0.2627790879299256, + "grad_norm": 5.861878648606103, + "learning_rate": 3.284822657536856e-05, + "loss": 0.448, + "num_input_tokens_seen": 14056712, + "step": 2145 + }, + { + "epoch": 0.2633916265964289, + "grad_norm": 0.9940779078202018, + "learning_rate": 3.262002716103897e-05, + "loss": 0.3858, + "num_input_tokens_seen": 14090488, + "step": 2150 + }, + { + "epoch": 0.2633916265964289, + "eval_loss": 0.31454333662986755, + "eval_runtime": 19.5976, + "eval_samples_per_second": 3.062, + "eval_steps_per_second": 0.765, + "num_input_tokens_seen": 14090488, + "step": 2150 + }, + { + "epoch": 0.26400416526293224, + "grad_norm": 1.6267921296338133, + "learning_rate": 3.2392238785630386e-05, + "loss": 0.4647, + "num_input_tokens_seen": 14124000, + "step": 2155 + }, + { + "epoch": 0.26461670392943554, + "grad_norm": 9.537288786639904, + "learning_rate": 3.216486683637146e-05, + "loss": 0.4581, + "num_input_tokens_seen": 14157144, + "step": 2160 + }, + { + "epoch": 0.2652292425959389, + "grad_norm": 13.919588540482863, + "learning_rate": 3.1937916690642356e-05, + "loss": 0.4629, + "num_input_tokens_seen": 14191304, + "step": 2165 + }, + { + "epoch": 0.2658417812624422, + "grad_norm": 5.8245224037145995, + "learning_rate": 3.1711393715847476e-05, + "loss": 0.5118, + "num_input_tokens_seen": 14225392, + "step": 2170 + }, + { + "epoch": 0.2664543199289455, + "grad_norm": 6.310665585817186, + "learning_rate": 3.14853032692886e-05, + "loss": 0.3876, + "num_input_tokens_seen": 14259152, + "step": 2175 + }, + { + "epoch": 0.2670668585954488, + "grad_norm": 6.596666577714051, + "learning_rate": 3.125965069803811e-05, + "loss": 0.3996, + "num_input_tokens_seen": 14293280, + "step": 2180 + }, + { + "epoch": 0.26767939726195217, + "grad_norm": 11.841729722102258, + "learning_rate": 3.103444133881261e-05, + "loss": 0.3859, + "num_input_tokens_seen": 14326560, + "step": 2185 + }, + { + "epoch": 0.26829193592845546, + "grad_norm": 5.13083530523421, + "learning_rate": 3.080968051784666e-05, + "loss": 0.4347, + "num_input_tokens_seen": 14359792, + "step": 2190 + }, + { + "epoch": 0.2689044745949588, + "grad_norm": 12.911132033702161, + "learning_rate": 3.058537355076683e-05, + "loss": 0.5581, + "num_input_tokens_seen": 14392976, + "step": 2195 + }, + { + "epoch": 0.26951701326146216, + "grad_norm": 1.6547413471836558, + "learning_rate": 3.0361525742465973e-05, + "loss": 0.3598, + "num_input_tokens_seen": 14427600, + "step": 2200 + }, + { + "epoch": 0.26951701326146216, + "eval_loss": 0.3024410307407379, + "eval_runtime": 19.5364, + "eval_samples_per_second": 3.071, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 14427600, + "step": 2200 + }, + { + "epoch": 0.27012955192796545, + "grad_norm": 6.635357330024726, + "learning_rate": 3.0138142386977787e-05, + "loss": 0.3613, + "num_input_tokens_seen": 14461704, + "step": 2205 + }, + { + "epoch": 0.2707420905944688, + "grad_norm": 11.886638526886959, + "learning_rate": 2.991522876735154e-05, + "loss": 0.437, + "num_input_tokens_seen": 14494512, + "step": 2210 + }, + { + "epoch": 0.2713546292609721, + "grad_norm": 4.796883312461099, + "learning_rate": 2.9692790155527227e-05, + "loss": 0.442, + "num_input_tokens_seen": 14527576, + "step": 2215 + }, + { + "epoch": 0.27196716792747544, + "grad_norm": 5.891028837774642, + "learning_rate": 2.9470831812210837e-05, + "loss": 0.4013, + "num_input_tokens_seen": 14560696, + "step": 2220 + }, + { + "epoch": 0.27257970659397873, + "grad_norm": 6.713064850021583, + "learning_rate": 2.924935898674992e-05, + "loss": 0.4349, + "num_input_tokens_seen": 14593944, + "step": 2225 + }, + { + "epoch": 0.2731922452604821, + "grad_norm": 2.473226108749198, + "learning_rate": 2.902837691700945e-05, + "loss": 0.4662, + "num_input_tokens_seen": 14626400, + "step": 2230 + }, + { + "epoch": 0.2738047839269854, + "grad_norm": 7.5574669353881125, + "learning_rate": 2.880789082924798e-05, + "loss": 0.4348, + "num_input_tokens_seen": 14660184, + "step": 2235 + }, + { + "epoch": 0.2744173225934887, + "grad_norm": 2.047740113874142, + "learning_rate": 2.858790593799405e-05, + "loss": 0.4075, + "num_input_tokens_seen": 14692760, + "step": 2240 + }, + { + "epoch": 0.275029861259992, + "grad_norm": 20.949481869004938, + "learning_rate": 2.8368427445922696e-05, + "loss": 0.3818, + "num_input_tokens_seen": 14727408, + "step": 2245 + }, + { + "epoch": 0.27564239992649536, + "grad_norm": 22.392481911921553, + "learning_rate": 2.8149460543732664e-05, + "loss": 0.4352, + "num_input_tokens_seen": 14761264, + "step": 2250 + }, + { + "epoch": 0.27564239992649536, + "eval_loss": 0.2756429612636566, + "eval_runtime": 19.5342, + "eval_samples_per_second": 3.072, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 14761264, + "step": 2250 + }, + { + "epoch": 0.27625493859299866, + "grad_norm": 5.128296717638175, + "learning_rate": 2.7931010410023518e-05, + "loss": 0.4691, + "num_input_tokens_seen": 14794600, + "step": 2255 + }, + { + "epoch": 0.276867477259502, + "grad_norm": 16.038759138525798, + "learning_rate": 2.771308221117309e-05, + "loss": 0.3836, + "num_input_tokens_seen": 14828632, + "step": 2260 + }, + { + "epoch": 0.27748001592600535, + "grad_norm": 3.5907735517497867, + "learning_rate": 2.749568110121545e-05, + "loss": 0.5366, + "num_input_tokens_seen": 14862480, + "step": 2265 + }, + { + "epoch": 0.27809255459250865, + "grad_norm": 8.297812191891177, + "learning_rate": 2.7278812221718924e-05, + "loss": 0.4599, + "num_input_tokens_seen": 14896168, + "step": 2270 + }, + { + "epoch": 0.278705093259012, + "grad_norm": 9.040439146901496, + "learning_rate": 2.7062480701664488e-05, + "loss": 0.3894, + "num_input_tokens_seen": 14929688, + "step": 2275 + }, + { + "epoch": 0.2793176319255153, + "grad_norm": 10.72302291995307, + "learning_rate": 2.6846691657324473e-05, + "loss": 0.3963, + "num_input_tokens_seen": 14963456, + "step": 2280 + }, + { + "epoch": 0.27993017059201863, + "grad_norm": 15.076756776442725, + "learning_rate": 2.663145019214163e-05, + "loss": 0.4045, + "num_input_tokens_seen": 14996664, + "step": 2285 + }, + { + "epoch": 0.2805427092585219, + "grad_norm": 4.895485600989452, + "learning_rate": 2.6416761396608362e-05, + "loss": 0.4594, + "num_input_tokens_seen": 15030520, + "step": 2290 + }, + { + "epoch": 0.2811552479250253, + "grad_norm": 8.089827643849977, + "learning_rate": 2.6202630348146324e-05, + "loss": 0.4254, + "num_input_tokens_seen": 15064448, + "step": 2295 + }, + { + "epoch": 0.28176778659152857, + "grad_norm": 5.673306903636319, + "learning_rate": 2.598906211098643e-05, + "loss": 0.3788, + "num_input_tokens_seen": 15098096, + "step": 2300 + }, + { + "epoch": 0.28176778659152857, + "eval_loss": 0.23389142751693726, + "eval_runtime": 19.9772, + "eval_samples_per_second": 3.003, + "eval_steps_per_second": 0.751, + "num_input_tokens_seen": 15098096, + "step": 2300 + }, + { + "epoch": 0.2823803252580319, + "grad_norm": 8.107264390803739, + "learning_rate": 2.577606173604894e-05, + "loss": 0.4748, + "num_input_tokens_seen": 15132152, + "step": 2305 + }, + { + "epoch": 0.2829928639245352, + "grad_norm": 7.978901949837573, + "learning_rate": 2.5563634260824175e-05, + "loss": 0.4236, + "num_input_tokens_seen": 15165944, + "step": 2310 + }, + { + "epoch": 0.28360540259103856, + "grad_norm": 3.6259304392810625, + "learning_rate": 2.535178470925323e-05, + "loss": 0.3802, + "num_input_tokens_seen": 15199864, + "step": 2315 + }, + { + "epoch": 0.2842179412575419, + "grad_norm": 4.077088729204204, + "learning_rate": 2.5140518091609256e-05, + "loss": 0.3804, + "num_input_tokens_seen": 15233216, + "step": 2320 + }, + { + "epoch": 0.2848304799240452, + "grad_norm": 1.6378722483630779, + "learning_rate": 2.4929839404378936e-05, + "loss": 0.4197, + "num_input_tokens_seen": 15266328, + "step": 2325 + }, + { + "epoch": 0.28544301859054855, + "grad_norm": 4.351117674559572, + "learning_rate": 2.471975363014428e-05, + "loss": 0.4293, + "num_input_tokens_seen": 15300120, + "step": 2330 + }, + { + "epoch": 0.28605555725705184, + "grad_norm": 2.989266130311886, + "learning_rate": 2.451026573746482e-05, + "loss": 0.3785, + "num_input_tokens_seen": 15333584, + "step": 2335 + }, + { + "epoch": 0.2866680959235552, + "grad_norm": 4.655888745721513, + "learning_rate": 2.430138068076013e-05, + "loss": 0.3369, + "num_input_tokens_seen": 15366848, + "step": 2340 + }, + { + "epoch": 0.2872806345900585, + "grad_norm": 1.5329024528705901, + "learning_rate": 2.4093103400192625e-05, + "loss": 0.4003, + "num_input_tokens_seen": 15400264, + "step": 2345 + }, + { + "epoch": 0.28789317325656183, + "grad_norm": 2.026233665473392, + "learning_rate": 2.388543882155067e-05, + "loss": 0.4806, + "num_input_tokens_seen": 15434176, + "step": 2350 + }, + { + "epoch": 0.28789317325656183, + "eval_loss": 0.28239962458610535, + "eval_runtime": 19.7123, + "eval_samples_per_second": 3.044, + "eval_steps_per_second": 0.761, + "num_input_tokens_seen": 15434176, + "step": 2350 + }, + { + "epoch": 0.2885057119230651, + "grad_norm": 9.495051530761005, + "learning_rate": 2.3678391856132204e-05, + "loss": 0.477, + "num_input_tokens_seen": 15467792, + "step": 2355 + }, + { + "epoch": 0.28911825058956847, + "grad_norm": 1.9552792809582171, + "learning_rate": 2.3471967400628513e-05, + "loss": 0.4227, + "num_input_tokens_seen": 15501216, + "step": 2360 + }, + { + "epoch": 0.28973078925607176, + "grad_norm": 6.855247917430915, + "learning_rate": 2.3266170337008398e-05, + "loss": 0.4129, + "num_input_tokens_seen": 15534752, + "step": 2365 + }, + { + "epoch": 0.2903433279225751, + "grad_norm": 11.151369787236101, + "learning_rate": 2.306100553240274e-05, + "loss": 0.3405, + "num_input_tokens_seen": 15568376, + "step": 2370 + }, + { + "epoch": 0.29095586658907846, + "grad_norm": 6.315806159263407, + "learning_rate": 2.2856477838989456e-05, + "loss": 0.4279, + "num_input_tokens_seen": 15601568, + "step": 2375 + }, + { + "epoch": 0.29156840525558175, + "grad_norm": 4.862541966856092, + "learning_rate": 2.2652592093878666e-05, + "loss": 0.3586, + "num_input_tokens_seen": 15635584, + "step": 2380 + }, + { + "epoch": 0.2921809439220851, + "grad_norm": 2.2921327645566505, + "learning_rate": 2.244935311899829e-05, + "loss": 0.4119, + "num_input_tokens_seen": 15669544, + "step": 2385 + }, + { + "epoch": 0.2927934825885884, + "grad_norm": 4.23488238196988, + "learning_rate": 2.224676572098007e-05, + "loss": 0.3929, + "num_input_tokens_seen": 15703472, + "step": 2390 + }, + { + "epoch": 0.29340602125509174, + "grad_norm": 2.6117480876414025, + "learning_rate": 2.2044834691045873e-05, + "loss": 0.4627, + "num_input_tokens_seen": 15737312, + "step": 2395 + }, + { + "epoch": 0.29401855992159504, + "grad_norm": 1.3152254182154262, + "learning_rate": 2.184356480489432e-05, + "loss": 0.3893, + "num_input_tokens_seen": 15771224, + "step": 2400 + }, + { + "epoch": 0.29401855992159504, + "eval_loss": 0.22477610409259796, + "eval_runtime": 19.5802, + "eval_samples_per_second": 3.064, + "eval_steps_per_second": 0.766, + "num_input_tokens_seen": 15771224, + "step": 2400 + }, + { + "epoch": 0.2946310985880984, + "grad_norm": 1.785187763406071, + "learning_rate": 2.1642960822587878e-05, + "loss": 0.3744, + "num_input_tokens_seen": 15804752, + "step": 2405 + }, + { + "epoch": 0.2952436372546017, + "grad_norm": 1.824770535044046, + "learning_rate": 2.1443027488440338e-05, + "loss": 0.4134, + "num_input_tokens_seen": 15838624, + "step": 2410 + }, + { + "epoch": 0.295856175921105, + "grad_norm": 11.369093211676532, + "learning_rate": 2.124376953090456e-05, + "loss": 0.4031, + "num_input_tokens_seen": 15872168, + "step": 2415 + }, + { + "epoch": 0.2964687145876083, + "grad_norm": 1.3496462605690733, + "learning_rate": 2.104519166246059e-05, + "loss": 0.3465, + "num_input_tokens_seen": 15905944, + "step": 2420 + }, + { + "epoch": 0.29708125325411167, + "grad_norm": 17.968018715547498, + "learning_rate": 2.0847298579504344e-05, + "loss": 0.4785, + "num_input_tokens_seen": 15939272, + "step": 2425 + }, + { + "epoch": 0.297693791920615, + "grad_norm": 4.734710678254364, + "learning_rate": 2.065009496223638e-05, + "loss": 0.3268, + "num_input_tokens_seen": 15973248, + "step": 2430 + }, + { + "epoch": 0.2983063305871183, + "grad_norm": 3.3918739224643937, + "learning_rate": 2.045358547455138e-05, + "loss": 0.3917, + "num_input_tokens_seen": 16006744, + "step": 2435 + }, + { + "epoch": 0.29891886925362166, + "grad_norm": 1.6637998065312563, + "learning_rate": 2.0257774763927655e-05, + "loss": 0.4549, + "num_input_tokens_seen": 16039672, + "step": 2440 + }, + { + "epoch": 0.29953140792012495, + "grad_norm": 10.38555952076315, + "learning_rate": 2.0062667461317426e-05, + "loss": 0.3082, + "num_input_tokens_seen": 16073712, + "step": 2445 + }, + { + "epoch": 0.3001439465866283, + "grad_norm": 1.3310869941302306, + "learning_rate": 1.9868268181037185e-05, + "loss": 0.4445, + "num_input_tokens_seen": 16106832, + "step": 2450 + }, + { + "epoch": 0.3001439465866283, + "eval_loss": 0.26860639452934265, + "eval_runtime": 19.4427, + "eval_samples_per_second": 3.086, + "eval_steps_per_second": 0.771, + "num_input_tokens_seen": 16106832, + "step": 2450 + }, + { + "epoch": 0.3007564852531316, + "grad_norm": 7.334660391422411, + "learning_rate": 1.967458152065857e-05, + "loss": 0.4825, + "num_input_tokens_seen": 16139896, + "step": 2455 + }, + { + "epoch": 0.30136902391963494, + "grad_norm": 15.163653155833089, + "learning_rate": 1.9481612060899646e-05, + "loss": 0.482, + "num_input_tokens_seen": 16172968, + "step": 2460 + }, + { + "epoch": 0.30198156258613823, + "grad_norm": 1.3584740205937087, + "learning_rate": 1.928936436551661e-05, + "loss": 0.4084, + "num_input_tokens_seen": 16205904, + "step": 2465 + }, + { + "epoch": 0.3025941012526416, + "grad_norm": 7.376785554222767, + "learning_rate": 1.9097842981195834e-05, + "loss": 0.3349, + "num_input_tokens_seen": 16239760, + "step": 2470 + }, + { + "epoch": 0.30320663991914487, + "grad_norm": 1.3495448088590587, + "learning_rate": 1.8907052437446272e-05, + "loss": 0.4111, + "num_input_tokens_seen": 16273064, + "step": 2475 + }, + { + "epoch": 0.3038191785856482, + "grad_norm": 1.4539593192914995, + "learning_rate": 1.871699724649244e-05, + "loss": 0.4171, + "num_input_tokens_seen": 16306760, + "step": 2480 + }, + { + "epoch": 0.30443171725215157, + "grad_norm": 10.383129040039119, + "learning_rate": 1.8527681903167644e-05, + "loss": 0.4116, + "num_input_tokens_seen": 16339720, + "step": 2485 + }, + { + "epoch": 0.30504425591865486, + "grad_norm": 21.53825575558579, + "learning_rate": 1.833911088480767e-05, + "loss": 0.4053, + "num_input_tokens_seen": 16373008, + "step": 2490 + }, + { + "epoch": 0.3056567945851582, + "grad_norm": 7.582361472730492, + "learning_rate": 1.8151288651144893e-05, + "loss": 0.3783, + "num_input_tokens_seen": 16406344, + "step": 2495 + }, + { + "epoch": 0.3062693332516615, + "grad_norm": 1.61973648667638, + "learning_rate": 1.796421964420285e-05, + "loss": 0.4099, + "num_input_tokens_seen": 16439616, + "step": 2500 + }, + { + "epoch": 0.3062693332516615, + "eval_loss": 0.2817166745662689, + "eval_runtime": 19.6613, + "eval_samples_per_second": 3.052, + "eval_steps_per_second": 0.763, + "num_input_tokens_seen": 16439616, + "step": 2500 + }, + { + "epoch": 0.30688187191816485, + "grad_norm": 22.851302452172163, + "learning_rate": 1.7777908288191176e-05, + "loss": 0.4116, + "num_input_tokens_seen": 16473632, + "step": 2505 + }, + { + "epoch": 0.30749441058466814, + "grad_norm": 1.6534147455728052, + "learning_rate": 1.7592358989400883e-05, + "loss": 0.3251, + "num_input_tokens_seen": 16506688, + "step": 2510 + }, + { + "epoch": 0.3081069492511715, + "grad_norm": 3.904001341363943, + "learning_rate": 1.740757613610028e-05, + "loss": 0.3927, + "num_input_tokens_seen": 16539672, + "step": 2515 + }, + { + "epoch": 0.3087194879176748, + "grad_norm": 19.4972361189859, + "learning_rate": 1.7223564098431067e-05, + "loss": 0.4169, + "num_input_tokens_seen": 16573248, + "step": 2520 + }, + { + "epoch": 0.30933202658417813, + "grad_norm": 25.249052274257206, + "learning_rate": 1.704032722830512e-05, + "loss": 0.3538, + "num_input_tokens_seen": 16607280, + "step": 2525 + }, + { + "epoch": 0.3099445652506814, + "grad_norm": 1.2113354476595224, + "learning_rate": 1.68578698593014e-05, + "loss": 0.4374, + "num_input_tokens_seen": 16641208, + "step": 2530 + }, + { + "epoch": 0.3105571039171848, + "grad_norm": 17.40529053372968, + "learning_rate": 1.6676196306563613e-05, + "loss": 0.3771, + "num_input_tokens_seen": 16673784, + "step": 2535 + }, + { + "epoch": 0.3111696425836881, + "grad_norm": 12.222305683656439, + "learning_rate": 1.6495310866698093e-05, + "loss": 0.419, + "num_input_tokens_seen": 16707264, + "step": 2540 + }, + { + "epoch": 0.3117821812501914, + "grad_norm": 7.115919310670762, + "learning_rate": 1.631521781767214e-05, + "loss": 0.2976, + "num_input_tokens_seen": 16741208, + "step": 2545 + }, + { + "epoch": 0.31239471991669476, + "grad_norm": 2.2376837367188283, + "learning_rate": 1.6135921418712956e-05, + "loss": 0.4017, + "num_input_tokens_seen": 16774544, + "step": 2550 + }, + { + "epoch": 0.31239471991669476, + "eval_loss": 0.2488662749528885, + "eval_runtime": 19.5345, + "eval_samples_per_second": 3.071, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 16774544, + "step": 2550 + }, + { + "epoch": 0.31300725858319806, + "grad_norm": 5.745540558325498, + "learning_rate": 1.5957425910206785e-05, + "loss": 0.3832, + "num_input_tokens_seen": 16807880, + "step": 2555 + }, + { + "epoch": 0.3136197972497014, + "grad_norm": 15.900061585891976, + "learning_rate": 1.577973551359877e-05, + "loss": 0.452, + "num_input_tokens_seen": 16840456, + "step": 2560 + }, + { + "epoch": 0.3142323359162047, + "grad_norm": 22.60762663595349, + "learning_rate": 1.560285443129296e-05, + "loss": 0.4162, + "num_input_tokens_seen": 16874408, + "step": 2565 + }, + { + "epoch": 0.31484487458270805, + "grad_norm": 1.7483061326786193, + "learning_rate": 1.542678684655306e-05, + "loss": 0.38, + "num_input_tokens_seen": 16908120, + "step": 2570 + }, + { + "epoch": 0.31545741324921134, + "grad_norm": 1.6412338330835767, + "learning_rate": 1.5251536923403426e-05, + "loss": 0.4196, + "num_input_tokens_seen": 16941248, + "step": 2575 + }, + { + "epoch": 0.3160699519157147, + "grad_norm": 2.0537696007574544, + "learning_rate": 1.5077108806530581e-05, + "loss": 0.3956, + "num_input_tokens_seen": 16975272, + "step": 2580 + }, + { + "epoch": 0.316682490582218, + "grad_norm": 1.6972109557185742, + "learning_rate": 1.4903506621185192e-05, + "loss": 0.4497, + "num_input_tokens_seen": 17008144, + "step": 2585 + }, + { + "epoch": 0.3172950292487213, + "grad_norm": 2.1323530684816463, + "learning_rate": 1.4730734473084568e-05, + "loss": 0.4013, + "num_input_tokens_seen": 17042320, + "step": 2590 + }, + { + "epoch": 0.3179075679152247, + "grad_norm": 8.23818495388703, + "learning_rate": 1.4558796448315504e-05, + "loss": 0.3767, + "num_input_tokens_seen": 17075336, + "step": 2595 + }, + { + "epoch": 0.31852010658172797, + "grad_norm": 13.376399529880672, + "learning_rate": 1.4387696613237612e-05, + "loss": 0.4698, + "num_input_tokens_seen": 17108368, + "step": 2600 + }, + { + "epoch": 0.31852010658172797, + "eval_loss": 0.20822136104106903, + "eval_runtime": 19.2103, + "eval_samples_per_second": 3.123, + "eval_steps_per_second": 0.781, + "num_input_tokens_seen": 17108368, + "step": 2600 + }, + { + "epoch": 0.3191326452482313, + "grad_norm": 2.139085348337642, + "learning_rate": 1.4217439014387251e-05, + "loss": 0.3813, + "num_input_tokens_seen": 17141208, + "step": 2605 + }, + { + "epoch": 0.3197451839147346, + "grad_norm": 10.882544645433732, + "learning_rate": 1.404802767838176e-05, + "loss": 0.4327, + "num_input_tokens_seen": 17174376, + "step": 2610 + }, + { + "epoch": 0.32035772258123796, + "grad_norm": 6.941813532036941, + "learning_rate": 1.3879466611824199e-05, + "loss": 0.4071, + "num_input_tokens_seen": 17207208, + "step": 2615 + }, + { + "epoch": 0.32097026124774125, + "grad_norm": 3.619455574980742, + "learning_rate": 1.371175980120864e-05, + "loss": 0.3462, + "num_input_tokens_seen": 17240704, + "step": 2620 + }, + { + "epoch": 0.3215827999142446, + "grad_norm": 15.275775839554932, + "learning_rate": 1.3544911212825906e-05, + "loss": 0.3604, + "num_input_tokens_seen": 17274096, + "step": 2625 + }, + { + "epoch": 0.3221953385807479, + "grad_norm": 3.5282860892228407, + "learning_rate": 1.337892479266974e-05, + "loss": 0.4219, + "num_input_tokens_seen": 17306760, + "step": 2630 + }, + { + "epoch": 0.32280787724725124, + "grad_norm": 1.8606086847663792, + "learning_rate": 1.3213804466343421e-05, + "loss": 0.3395, + "num_input_tokens_seen": 17340600, + "step": 2635 + }, + { + "epoch": 0.32342041591375453, + "grad_norm": 12.837682752435335, + "learning_rate": 1.3049554138967051e-05, + "loss": 0.4356, + "num_input_tokens_seen": 17373800, + "step": 2640 + }, + { + "epoch": 0.3240329545802579, + "grad_norm": 3.8435561496863575, + "learning_rate": 1.2886177695085078e-05, + "loss": 0.3651, + "num_input_tokens_seen": 17407296, + "step": 2645 + }, + { + "epoch": 0.3246454932467612, + "grad_norm": 9.959670396486281, + "learning_rate": 1.2723678998574512e-05, + "loss": 0.3722, + "num_input_tokens_seen": 17440920, + "step": 2650 + }, + { + "epoch": 0.3246454932467612, + "eval_loss": 0.23050636053085327, + "eval_runtime": 20.1258, + "eval_samples_per_second": 2.981, + "eval_steps_per_second": 0.745, + "num_input_tokens_seen": 17440920, + "step": 2650 + }, + { + "epoch": 0.3252580319132645, + "grad_norm": 2.6545226056586455, + "learning_rate": 1.2562061892553473e-05, + "loss": 0.3658, + "num_input_tokens_seen": 17473920, + "step": 2655 + }, + { + "epoch": 0.32587057057976787, + "grad_norm": 12.019242095651537, + "learning_rate": 1.2401330199290367e-05, + "loss": 0.4345, + "num_input_tokens_seen": 17506776, + "step": 2660 + }, + { + "epoch": 0.32648310924627116, + "grad_norm": 14.966427985843586, + "learning_rate": 1.224148772011346e-05, + "loss": 0.4469, + "num_input_tokens_seen": 17540248, + "step": 2665 + }, + { + "epoch": 0.3270956479127745, + "grad_norm": 9.185002926510967, + "learning_rate": 1.2082538235320929e-05, + "loss": 0.4398, + "num_input_tokens_seen": 17573776, + "step": 2670 + }, + { + "epoch": 0.3277081865792778, + "grad_norm": 12.024883832936526, + "learning_rate": 1.1924485504091565e-05, + "loss": 0.387, + "num_input_tokens_seen": 17607368, + "step": 2675 + }, + { + "epoch": 0.32832072524578115, + "grad_norm": 3.0352486488445054, + "learning_rate": 1.1767333264395736e-05, + "loss": 0.3381, + "num_input_tokens_seen": 17641536, + "step": 2680 + }, + { + "epoch": 0.32893326391228445, + "grad_norm": 1.1498840332996212, + "learning_rate": 1.1611085232907132e-05, + "loss": 0.3718, + "num_input_tokens_seen": 17674656, + "step": 2685 + }, + { + "epoch": 0.3295458025787878, + "grad_norm": 1.648674233229992, + "learning_rate": 1.14557451049147e-05, + "loss": 0.3983, + "num_input_tokens_seen": 17708072, + "step": 2690 + }, + { + "epoch": 0.3301583412452911, + "grad_norm": 14.316103154296064, + "learning_rate": 1.1301316554235397e-05, + "loss": 0.3805, + "num_input_tokens_seen": 17741152, + "step": 2695 + }, + { + "epoch": 0.33077087991179444, + "grad_norm": 4.329219321311998, + "learning_rate": 1.114780323312724e-05, + "loss": 0.4018, + "num_input_tokens_seen": 17774336, + "step": 2700 + }, + { + "epoch": 0.33077087991179444, + "eval_loss": 0.20385108888149261, + "eval_runtime": 19.5414, + "eval_samples_per_second": 3.07, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 17774336, + "step": 2700 + }, + { + "epoch": 0.33138341857829773, + "grad_norm": 7.91356814327871, + "learning_rate": 1.0995208772202897e-05, + "loss": 0.3834, + "num_input_tokens_seen": 17808736, + "step": 2705 + }, + { + "epoch": 0.3319959572448011, + "grad_norm": 12.264907326394177, + "learning_rate": 1.0843536780343865e-05, + "loss": 0.3883, + "num_input_tokens_seen": 17842056, + "step": 2710 + }, + { + "epoch": 0.3326084959113044, + "grad_norm": 5.463705071978054, + "learning_rate": 1.069279084461513e-05, + "loss": 0.3865, + "num_input_tokens_seen": 17876008, + "step": 2715 + }, + { + "epoch": 0.3332210345778077, + "grad_norm": 5.09804272682971, + "learning_rate": 1.0542974530180327e-05, + "loss": 0.4189, + "num_input_tokens_seen": 17909480, + "step": 2720 + }, + { + "epoch": 0.33383357324431107, + "grad_norm": 9.683616928990322, + "learning_rate": 1.0394091380217352e-05, + "loss": 0.3897, + "num_input_tokens_seen": 17942952, + "step": 2725 + }, + { + "epoch": 0.33444611191081436, + "grad_norm": 5.21920604011471, + "learning_rate": 1.0246144915834683e-05, + "loss": 0.3536, + "num_input_tokens_seen": 17976368, + "step": 2730 + }, + { + "epoch": 0.3350586505773177, + "grad_norm": 1.346409306329859, + "learning_rate": 1.0099138635988026e-05, + "loss": 0.3618, + "num_input_tokens_seen": 18009520, + "step": 2735 + }, + { + "epoch": 0.335671189243821, + "grad_norm": 6.6856243979872625, + "learning_rate": 9.953076017397578e-06, + "loss": 0.3658, + "num_input_tokens_seen": 18042664, + "step": 2740 + }, + { + "epoch": 0.33628372791032435, + "grad_norm": 1.3057574673207777, + "learning_rate": 9.807960514465792e-06, + "loss": 0.3484, + "num_input_tokens_seen": 18076976, + "step": 2745 + }, + { + "epoch": 0.33689626657682764, + "grad_norm": 5.841965266821186, + "learning_rate": 9.663795559195733e-06, + "loss": 0.3938, + "num_input_tokens_seen": 18110672, + "step": 2750 + }, + { + "epoch": 0.33689626657682764, + "eval_loss": 0.18945012986660004, + "eval_runtime": 19.4552, + "eval_samples_per_second": 3.084, + "eval_steps_per_second": 0.771, + "num_input_tokens_seen": 18110672, + "step": 2750 + }, + { + "epoch": 0.337508805243331, + "grad_norm": 1.5830795265495232, + "learning_rate": 9.520584561109864e-06, + "loss": 0.3582, + "num_input_tokens_seen": 18143336, + "step": 2755 + }, + { + "epoch": 0.3381213439098343, + "grad_norm": 2.0238056195413074, + "learning_rate": 9.378330907169386e-06, + "loss": 0.3499, + "num_input_tokens_seen": 18177216, + "step": 2760 + }, + { + "epoch": 0.33873388257633763, + "grad_norm": 1.2716851600414891, + "learning_rate": 9.237037961694223e-06, + "loss": 0.3259, + "num_input_tokens_seen": 18210408, + "step": 2765 + }, + { + "epoch": 0.339346421242841, + "grad_norm": 18.696339654229728, + "learning_rate": 9.096709066283354e-06, + "loss": 0.4411, + "num_input_tokens_seen": 18244544, + "step": 2770 + }, + { + "epoch": 0.33995895990934427, + "grad_norm": 7.78221245775907, + "learning_rate": 8.957347539735872e-06, + "loss": 0.3333, + "num_input_tokens_seen": 18278472, + "step": 2775 + }, + { + "epoch": 0.3405714985758476, + "grad_norm": 1.9755846872848903, + "learning_rate": 8.818956677972406e-06, + "loss": 0.3915, + "num_input_tokens_seen": 18312176, + "step": 2780 + }, + { + "epoch": 0.3411840372423509, + "grad_norm": 1.1399732752521283, + "learning_rate": 8.681539753957269e-06, + "loss": 0.3634, + "num_input_tokens_seen": 18346400, + "step": 2785 + }, + { + "epoch": 0.34179657590885426, + "grad_norm": 5.063302416668255, + "learning_rate": 8.545100017620988e-06, + "loss": 0.4182, + "num_input_tokens_seen": 18379800, + "step": 2790 + }, + { + "epoch": 0.34240911457535755, + "grad_norm": 13.331292772748537, + "learning_rate": 8.409640695783443e-06, + "loss": 0.3936, + "num_input_tokens_seen": 18412976, + "step": 2795 + }, + { + "epoch": 0.3430216532418609, + "grad_norm": 20.18962033720586, + "learning_rate": 8.275164992077556e-06, + "loss": 0.4413, + "num_input_tokens_seen": 18446936, + "step": 2800 + }, + { + "epoch": 0.3430216532418609, + "eval_loss": 0.1981697380542755, + "eval_runtime": 46.798, + "eval_samples_per_second": 1.282, + "eval_steps_per_second": 0.321, + "num_input_tokens_seen": 18446936, + "step": 2800 + }, + { + "epoch": 0.3436341919083642, + "grad_norm": 10.172112292978525, + "learning_rate": 8.141676086873572e-06, + "loss": 0.3995, + "num_input_tokens_seen": 18480624, + "step": 2805 + }, + { + "epoch": 0.34424673057486754, + "grad_norm": 2.869897815181081, + "learning_rate": 8.009177137203794e-06, + "loss": 0.3649, + "num_input_tokens_seen": 18514456, + "step": 2810 + }, + { + "epoch": 0.34485926924137084, + "grad_norm": 3.2851508257986057, + "learning_rate": 7.877671276687898e-06, + "loss": 0.3796, + "num_input_tokens_seen": 18548808, + "step": 2815 + }, + { + "epoch": 0.3454718079078742, + "grad_norm": 1.7457127157431902, + "learning_rate": 7.747161615458902e-06, + "loss": 0.481, + "num_input_tokens_seen": 18582240, + "step": 2820 + }, + { + "epoch": 0.34608434657437753, + "grad_norm": 6.464773386325027, + "learning_rate": 7.617651240089546e-06, + "loss": 0.3266, + "num_input_tokens_seen": 18616176, + "step": 2825 + }, + { + "epoch": 0.3466968852408808, + "grad_norm": 5.493808197264997, + "learning_rate": 7.489143213519301e-06, + "loss": 0.3436, + "num_input_tokens_seen": 18650088, + "step": 2830 + }, + { + "epoch": 0.3473094239073842, + "grad_norm": 4.871081382382885, + "learning_rate": 7.361640574981937e-06, + "loss": 0.3959, + "num_input_tokens_seen": 18683096, + "step": 2835 + }, + { + "epoch": 0.34792196257388747, + "grad_norm": 6.935001517346113, + "learning_rate": 7.2351463399336735e-06, + "loss": 0.348, + "num_input_tokens_seen": 18716984, + "step": 2840 + }, + { + "epoch": 0.3485345012403908, + "grad_norm": 4.148606344456004, + "learning_rate": 7.109663499981834e-06, + "loss": 0.3712, + "num_input_tokens_seen": 18750952, + "step": 2845 + }, + { + "epoch": 0.3491470399068941, + "grad_norm": 10.039460846182614, + "learning_rate": 6.985195022814067e-06, + "loss": 0.4121, + "num_input_tokens_seen": 18784560, + "step": 2850 + }, + { + "epoch": 0.3491470399068941, + "eval_loss": 0.19169984757900238, + "eval_runtime": 19.8086, + "eval_samples_per_second": 3.029, + "eval_steps_per_second": 0.757, + "num_input_tokens_seen": 18784560, + "step": 2850 + }, + { + "epoch": 0.34975957857339746, + "grad_norm": 7.181747769908282, + "learning_rate": 6.861743852128233e-06, + "loss": 0.3724, + "num_input_tokens_seen": 18817608, + "step": 2855 + }, + { + "epoch": 0.35037211723990075, + "grad_norm": 9.422491881760019, + "learning_rate": 6.7393129075627335e-06, + "loss": 0.3968, + "num_input_tokens_seen": 18851504, + "step": 2860 + }, + { + "epoch": 0.3509846559064041, + "grad_norm": 1.2744392950984473, + "learning_rate": 6.6179050846274515e-06, + "loss": 0.4142, + "num_input_tokens_seen": 18884968, + "step": 2865 + }, + { + "epoch": 0.3515971945729074, + "grad_norm": 1.2254663375756965, + "learning_rate": 6.497523254635296e-06, + "loss": 0.3784, + "num_input_tokens_seen": 18917928, + "step": 2870 + }, + { + "epoch": 0.35220973323941074, + "grad_norm": 2.516722735154844, + "learning_rate": 6.37817026463432e-06, + "loss": 0.339, + "num_input_tokens_seen": 18952048, + "step": 2875 + }, + { + "epoch": 0.3528222719059141, + "grad_norm": 4.426292624162038, + "learning_rate": 6.25984893734034e-06, + "loss": 0.4563, + "num_input_tokens_seen": 18984864, + "step": 2880 + }, + { + "epoch": 0.3534348105724174, + "grad_norm": 1.1903137489917255, + "learning_rate": 6.142562071070179e-06, + "loss": 0.3283, + "num_input_tokens_seen": 19018664, + "step": 2885 + }, + { + "epoch": 0.3540473492389207, + "grad_norm": 20.151668474221264, + "learning_rate": 6.026312439675552e-06, + "loss": 0.422, + "num_input_tokens_seen": 19052696, + "step": 2890 + }, + { + "epoch": 0.354659887905424, + "grad_norm": 16.730620312733837, + "learning_rate": 5.911102792477357e-06, + "loss": 0.3772, + "num_input_tokens_seen": 19086368, + "step": 2895 + }, + { + "epoch": 0.35527242657192737, + "grad_norm": 2.393495604544419, + "learning_rate": 5.796935854200763e-06, + "loss": 0.4081, + "num_input_tokens_seen": 19120400, + "step": 2900 + }, + { + "epoch": 0.35527242657192737, + "eval_loss": 0.16432562470436096, + "eval_runtime": 19.4007, + "eval_samples_per_second": 3.093, + "eval_steps_per_second": 0.773, + "num_input_tokens_seen": 19120400, + "step": 2900 + }, + { + "epoch": 0.35588496523843066, + "grad_norm": 18.589785379497847, + "learning_rate": 5.683814324910685e-06, + "loss": 0.3497, + "num_input_tokens_seen": 19154080, + "step": 2905 + }, + { + "epoch": 0.356497503904934, + "grad_norm": 19.652202448245504, + "learning_rate": 5.571740879947979e-06, + "loss": 0.3912, + "num_input_tokens_seen": 19187704, + "step": 2910 + }, + { + "epoch": 0.3571100425714373, + "grad_norm": 4.891668233678917, + "learning_rate": 5.4607181698661634e-06, + "loss": 0.3573, + "num_input_tokens_seen": 19221704, + "step": 2915 + }, + { + "epoch": 0.35772258123794065, + "grad_norm": 9.350739687040472, + "learning_rate": 5.35074882036869e-06, + "loss": 0.3441, + "num_input_tokens_seen": 19255184, + "step": 2920 + }, + { + "epoch": 0.35833511990444394, + "grad_norm": 1.2640718715406216, + "learning_rate": 5.241835432246889e-06, + "loss": 0.3662, + "num_input_tokens_seen": 19287920, + "step": 2925 + }, + { + "epoch": 0.3589476585709473, + "grad_norm": 8.347531502476556, + "learning_rate": 5.133980581318459e-06, + "loss": 0.3524, + "num_input_tokens_seen": 19321384, + "step": 2930 + }, + { + "epoch": 0.35956019723745064, + "grad_norm": 2.7857821993654794, + "learning_rate": 5.027186818366542e-06, + "loss": 0.3397, + "num_input_tokens_seen": 19355128, + "step": 2935 + }, + { + "epoch": 0.36017273590395393, + "grad_norm": 2.1581179715565892, + "learning_rate": 4.921456669079366e-06, + "loss": 0.3737, + "num_input_tokens_seen": 19388880, + "step": 2940 + }, + { + "epoch": 0.3607852745704573, + "grad_norm": 9.11640207168517, + "learning_rate": 4.816792633990569e-06, + "loss": 0.3888, + "num_input_tokens_seen": 19422408, + "step": 2945 + }, + { + "epoch": 0.3613978132369606, + "grad_norm": 2.2395229797691685, + "learning_rate": 4.713197188420026e-06, + "loss": 0.3425, + "num_input_tokens_seen": 19455392, + "step": 2950 + }, + { + "epoch": 0.3613978132369606, + "eval_loss": 0.14024704694747925, + "eval_runtime": 19.8653, + "eval_samples_per_second": 3.02, + "eval_steps_per_second": 0.755, + "num_input_tokens_seen": 19455392, + "step": 2950 + }, + { + "epoch": 0.3620103519034639, + "grad_norm": 4.126092392205192, + "learning_rate": 4.610672782415276e-06, + "loss": 0.367, + "num_input_tokens_seen": 19489208, + "step": 2955 + }, + { + "epoch": 0.3626228905699672, + "grad_norm": 1.3095519284145, + "learning_rate": 4.509221840693656e-06, + "loss": 0.4011, + "num_input_tokens_seen": 19522184, + "step": 2960 + }, + { + "epoch": 0.36323542923647056, + "grad_norm": 19.368370113743765, + "learning_rate": 4.408846762584901e-06, + "loss": 0.3989, + "num_input_tokens_seen": 19556560, + "step": 2965 + }, + { + "epoch": 0.36384796790297386, + "grad_norm": 5.257190192056534, + "learning_rate": 4.309549921974421e-06, + "loss": 0.328, + "num_input_tokens_seen": 19590856, + "step": 2970 + }, + { + "epoch": 0.3644605065694772, + "grad_norm": 2.7746844236624373, + "learning_rate": 4.2113336672471245e-06, + "loss": 0.4069, + "num_input_tokens_seen": 19623992, + "step": 2975 + }, + { + "epoch": 0.3650730452359805, + "grad_norm": 1.545366329168081, + "learning_rate": 4.114200321231937e-06, + "loss": 0.4154, + "num_input_tokens_seen": 19657384, + "step": 2980 + }, + { + "epoch": 0.36568558390248385, + "grad_norm": 2.9580169063691972, + "learning_rate": 4.018152181146823e-06, + "loss": 0.3801, + "num_input_tokens_seen": 19690344, + "step": 2985 + }, + { + "epoch": 0.3662981225689872, + "grad_norm": 3.704505694925942, + "learning_rate": 3.923191518544434e-06, + "loss": 0.4359, + "num_input_tokens_seen": 19723432, + "step": 2990 + }, + { + "epoch": 0.3669106612354905, + "grad_norm": 8.800525826867208, + "learning_rate": 3.829320579258466e-06, + "loss": 0.366, + "num_input_tokens_seen": 19756768, + "step": 2995 + }, + { + "epoch": 0.36752319990199384, + "grad_norm": 16.396927662635772, + "learning_rate": 3.7365415833504725e-06, + "loss": 0.3874, + "num_input_tokens_seen": 19790096, + "step": 3000 + }, + { + "epoch": 0.36752319990199384, + "eval_loss": 0.16951163113117218, + "eval_runtime": 19.2942, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.777, + "num_input_tokens_seen": 19790096, + "step": 3000 + }, + { + "epoch": 0.36813573856849713, + "grad_norm": 22.635970730424834, + "learning_rate": 9.516767703953432e-05, + "loss": 0.374, + "num_input_tokens_seen": 19823560, + "step": 3005 + }, + { + "epoch": 0.3687482772350005, + "grad_norm": 1.2641968812862068, + "learning_rate": 9.514593112044912e-05, + "loss": 0.4155, + "num_input_tokens_seen": 19857216, + "step": 3010 + }, + { + "epoch": 0.36936081590150377, + "grad_norm": 3.9981411120005235, + "learning_rate": 9.512413887771539e-05, + "loss": 0.4984, + "num_input_tokens_seen": 19891488, + "step": 3015 + }, + { + "epoch": 0.3699733545680071, + "grad_norm": 1.6878244654566803, + "learning_rate": 9.510230033369388e-05, + "loss": 0.4179, + "num_input_tokens_seen": 19924968, + "step": 3020 + }, + { + "epoch": 0.3705858932345104, + "grad_norm": 6.006858493244344, + "learning_rate": 9.508041551079284e-05, + "loss": 0.4198, + "num_input_tokens_seen": 19958360, + "step": 3025 + }, + { + "epoch": 0.37119843190101376, + "grad_norm": 8.437979073547194, + "learning_rate": 9.505848443146796e-05, + "loss": 0.3959, + "num_input_tokens_seen": 19991912, + "step": 3030 + }, + { + "epoch": 0.37181097056751705, + "grad_norm": 2.9245462689976742, + "learning_rate": 9.503650711822247e-05, + "loss": 0.4709, + "num_input_tokens_seen": 20025264, + "step": 3035 + }, + { + "epoch": 0.3724235092340204, + "grad_norm": 13.303552872965604, + "learning_rate": 9.501448359360698e-05, + "loss": 0.3901, + "num_input_tokens_seen": 20059392, + "step": 3040 + }, + { + "epoch": 0.3730360479005237, + "grad_norm": 10.196708229214408, + "learning_rate": 9.499241388021955e-05, + "loss": 0.4342, + "num_input_tokens_seen": 20092560, + "step": 3045 + }, + { + "epoch": 0.37364858656702704, + "grad_norm": 1.7568462524096162, + "learning_rate": 9.497029800070565e-05, + "loss": 0.3706, + "num_input_tokens_seen": 20126152, + "step": 3050 + }, + { + "epoch": 0.37364858656702704, + "eval_loss": 0.5081549882888794, + "eval_runtime": 48.8137, + "eval_samples_per_second": 1.229, + "eval_steps_per_second": 0.307, + "num_input_tokens_seen": 20126152, + "step": 3050 + }, + { + "epoch": 0.3742611252335304, + "grad_norm": 15.238996254821476, + "learning_rate": 9.494813597775805e-05, + "loss": 0.5132, + "num_input_tokens_seen": 20159744, + "step": 3055 + }, + { + "epoch": 0.3748736639000337, + "grad_norm": 2.1550656180028813, + "learning_rate": 9.492592783411694e-05, + "loss": 0.4203, + "num_input_tokens_seen": 20192704, + "step": 3060 + }, + { + "epoch": 0.37548620256653703, + "grad_norm": 10.290517947566332, + "learning_rate": 9.490367359256979e-05, + "loss": 0.5163, + "num_input_tokens_seen": 20226224, + "step": 3065 + }, + { + "epoch": 0.3760987412330403, + "grad_norm": 14.172621776304688, + "learning_rate": 9.48813732759514e-05, + "loss": 0.5524, + "num_input_tokens_seen": 20260560, + "step": 3070 + }, + { + "epoch": 0.37671127989954367, + "grad_norm": 2.3405791636530706, + "learning_rate": 9.485902690714381e-05, + "loss": 0.4073, + "num_input_tokens_seen": 20293408, + "step": 3075 + }, + { + "epoch": 0.37732381856604696, + "grad_norm": 13.973907864742015, + "learning_rate": 9.483663450907635e-05, + "loss": 0.4307, + "num_input_tokens_seen": 20327600, + "step": 3080 + }, + { + "epoch": 0.3779363572325503, + "grad_norm": 7.92605972901117, + "learning_rate": 9.481419610472559e-05, + "loss": 0.5101, + "num_input_tokens_seen": 20361000, + "step": 3085 + }, + { + "epoch": 0.3785488958990536, + "grad_norm": 6.418869434925558, + "learning_rate": 9.479171171711525e-05, + "loss": 0.4317, + "num_input_tokens_seen": 20393928, + "step": 3090 + }, + { + "epoch": 0.37916143456555695, + "grad_norm": 10.416185495325212, + "learning_rate": 9.476918136931627e-05, + "loss": 0.4347, + "num_input_tokens_seen": 20427032, + "step": 3095 + }, + { + "epoch": 0.37977397323206025, + "grad_norm": 6.037711206270561, + "learning_rate": 9.474660508444674e-05, + "loss": 0.372, + "num_input_tokens_seen": 20460696, + "step": 3100 + }, + { + "epoch": 0.37977397323206025, + "eval_loss": 0.27452927827835083, + "eval_runtime": 19.6754, + "eval_samples_per_second": 3.049, + "eval_steps_per_second": 0.762, + "num_input_tokens_seen": 20460696, + "step": 3100 + }, + { + "epoch": 0.3803865118985636, + "grad_norm": 16.43876595975965, + "learning_rate": 9.472398288567192e-05, + "loss": 0.4703, + "num_input_tokens_seen": 20494616, + "step": 3105 + }, + { + "epoch": 0.38099905056506694, + "grad_norm": 5.768244459700721, + "learning_rate": 9.470131479620412e-05, + "loss": 0.3934, + "num_input_tokens_seen": 20528808, + "step": 3110 + }, + { + "epoch": 0.38161158923157024, + "grad_norm": 20.375234134058275, + "learning_rate": 9.46786008393028e-05, + "loss": 0.4117, + "num_input_tokens_seen": 20562384, + "step": 3115 + }, + { + "epoch": 0.3822241278980736, + "grad_norm": 1.5676040034403327, + "learning_rate": 9.465584103827442e-05, + "loss": 0.3918, + "num_input_tokens_seen": 20595448, + "step": 3120 + }, + { + "epoch": 0.3828366665645769, + "grad_norm": 8.354647435799665, + "learning_rate": 9.463303541647253e-05, + "loss": 0.4222, + "num_input_tokens_seen": 20628992, + "step": 3125 + }, + { + "epoch": 0.3834492052310802, + "grad_norm": 9.005862052877694, + "learning_rate": 9.461018399729768e-05, + "loss": 0.4089, + "num_input_tokens_seen": 20662448, + "step": 3130 + }, + { + "epoch": 0.3840617438975835, + "grad_norm": 18.840015911199007, + "learning_rate": 9.458728680419741e-05, + "loss": 0.4812, + "num_input_tokens_seen": 20696496, + "step": 3135 + }, + { + "epoch": 0.38467428256408687, + "grad_norm": 26.0882963608467, + "learning_rate": 9.456434386066624e-05, + "loss": 0.4318, + "num_input_tokens_seen": 20729976, + "step": 3140 + }, + { + "epoch": 0.38528682123059016, + "grad_norm": 9.038757836051163, + "learning_rate": 9.454135519024561e-05, + "loss": 0.3784, + "num_input_tokens_seen": 20764304, + "step": 3145 + }, + { + "epoch": 0.3858993598970935, + "grad_norm": 10.383655553647545, + "learning_rate": 9.451832081652389e-05, + "loss": 0.3967, + "num_input_tokens_seen": 20798040, + "step": 3150 + }, + { + "epoch": 0.3858993598970935, + "eval_loss": 0.45261481404304504, + "eval_runtime": 19.6184, + "eval_samples_per_second": 3.058, + "eval_steps_per_second": 0.765, + "num_input_tokens_seen": 20798040, + "step": 3150 + }, + { + "epoch": 0.3865118985635968, + "grad_norm": 8.661599648250453, + "learning_rate": 9.449524076313637e-05, + "loss": 0.4064, + "num_input_tokens_seen": 20832088, + "step": 3155 + }, + { + "epoch": 0.38712443723010015, + "grad_norm": 8.674526733428278, + "learning_rate": 9.447211505376518e-05, + "loss": 0.4767, + "num_input_tokens_seen": 20865448, + "step": 3160 + }, + { + "epoch": 0.3877369758966035, + "grad_norm": 5.128496725029501, + "learning_rate": 9.444894371213929e-05, + "loss": 0.4363, + "num_input_tokens_seen": 20898416, + "step": 3165 + }, + { + "epoch": 0.3883495145631068, + "grad_norm": 6.067003129904418, + "learning_rate": 9.442572676203454e-05, + "loss": 0.3689, + "num_input_tokens_seen": 20931984, + "step": 3170 + }, + { + "epoch": 0.38896205322961014, + "grad_norm": 6.3723854169456065, + "learning_rate": 9.44024642272735e-05, + "loss": 0.4678, + "num_input_tokens_seen": 20965456, + "step": 3175 + }, + { + "epoch": 0.38957459189611343, + "grad_norm": 7.068052895527911, + "learning_rate": 9.437915613172562e-05, + "loss": 0.4174, + "num_input_tokens_seen": 20998864, + "step": 3180 + }, + { + "epoch": 0.3901871305626168, + "grad_norm": 5.158388795892552, + "learning_rate": 9.435580249930696e-05, + "loss": 0.4286, + "num_input_tokens_seen": 21032200, + "step": 3185 + }, + { + "epoch": 0.3907996692291201, + "grad_norm": 6.143800097591995, + "learning_rate": 9.433240335398039e-05, + "loss": 0.4015, + "num_input_tokens_seen": 21066128, + "step": 3190 + }, + { + "epoch": 0.3914122078956234, + "grad_norm": 16.169141316390395, + "learning_rate": 9.430895871975549e-05, + "loss": 0.4048, + "num_input_tokens_seen": 21100016, + "step": 3195 + }, + { + "epoch": 0.3920247465621267, + "grad_norm": 5.1929225441793765, + "learning_rate": 9.42854686206885e-05, + "loss": 0.4331, + "num_input_tokens_seen": 21133184, + "step": 3200 + }, + { + "epoch": 0.3920247465621267, + "eval_loss": 0.2645382583141327, + "eval_runtime": 19.5942, + "eval_samples_per_second": 3.062, + "eval_steps_per_second": 0.766, + "num_input_tokens_seen": 21133184, + "step": 3200 + }, + { + "epoch": 0.39263728522863006, + "grad_norm": 11.586568777462906, + "learning_rate": 9.426193308088226e-05, + "loss": 0.3808, + "num_input_tokens_seen": 21165984, + "step": 3205 + }, + { + "epoch": 0.39324982389513335, + "grad_norm": 1.3214549567226983, + "learning_rate": 9.423835212448632e-05, + "loss": 0.3748, + "num_input_tokens_seen": 21200392, + "step": 3210 + }, + { + "epoch": 0.3938623625616367, + "grad_norm": 5.984432352053421, + "learning_rate": 9.421472577569677e-05, + "loss": 0.4623, + "num_input_tokens_seen": 21234232, + "step": 3215 + }, + { + "epoch": 0.39447490122814005, + "grad_norm": 9.98295556471336, + "learning_rate": 9.419105405875631e-05, + "loss": 0.4058, + "num_input_tokens_seen": 21267600, + "step": 3220 + }, + { + "epoch": 0.39508743989464334, + "grad_norm": 2.1818057375892876, + "learning_rate": 9.416733699795417e-05, + "loss": 0.4227, + "num_input_tokens_seen": 21300688, + "step": 3225 + }, + { + "epoch": 0.3956999785611467, + "grad_norm": 7.823074246381935, + "learning_rate": 9.41435746176261e-05, + "loss": 0.4075, + "num_input_tokens_seen": 21334832, + "step": 3230 + }, + { + "epoch": 0.39631251722765, + "grad_norm": 7.842635393805346, + "learning_rate": 9.41197669421544e-05, + "loss": 0.4461, + "num_input_tokens_seen": 21368160, + "step": 3235 + }, + { + "epoch": 0.39692505589415333, + "grad_norm": 7.060345238014142, + "learning_rate": 9.409591399596779e-05, + "loss": 0.3888, + "num_input_tokens_seen": 21401088, + "step": 3240 + }, + { + "epoch": 0.3975375945606566, + "grad_norm": 14.136598354517995, + "learning_rate": 9.407201580354148e-05, + "loss": 0.4325, + "num_input_tokens_seen": 21435856, + "step": 3245 + }, + { + "epoch": 0.39815013322716, + "grad_norm": 2.7211108051324606, + "learning_rate": 9.404807238939707e-05, + "loss": 0.3708, + "num_input_tokens_seen": 21468896, + "step": 3250 + }, + { + "epoch": 0.39815013322716, + "eval_loss": 0.3505937457084656, + "eval_runtime": 19.5163, + "eval_samples_per_second": 3.074, + "eval_steps_per_second": 0.769, + "num_input_tokens_seen": 21468896, + "step": 3250 + }, + { + "epoch": 0.39876267189366327, + "grad_norm": 6.562243182983243, + "learning_rate": 9.402408377810261e-05, + "loss": 0.4381, + "num_input_tokens_seen": 21502040, + "step": 3255 + }, + { + "epoch": 0.3993752105601666, + "grad_norm": 5.823048338889232, + "learning_rate": 9.400004999427248e-05, + "loss": 0.3854, + "num_input_tokens_seen": 21536112, + "step": 3260 + }, + { + "epoch": 0.3999877492266699, + "grad_norm": 136.06473927611398, + "learning_rate": 9.397597106256744e-05, + "loss": 0.4841, + "num_input_tokens_seen": 21570152, + "step": 3265 + }, + { + "epoch": 0.40060028789317326, + "grad_norm": 3.8260418625641543, + "learning_rate": 9.395184700769457e-05, + "loss": 0.3739, + "num_input_tokens_seen": 21603832, + "step": 3270 + }, + { + "epoch": 0.4012128265596766, + "grad_norm": 7.7405376583041825, + "learning_rate": 9.392767785440723e-05, + "loss": 0.4677, + "num_input_tokens_seen": 21637048, + "step": 3275 + }, + { + "epoch": 0.4018253652261799, + "grad_norm": 3.1647828710845696, + "learning_rate": 9.39034636275051e-05, + "loss": 0.4087, + "num_input_tokens_seen": 21670896, + "step": 3280 + }, + { + "epoch": 0.40243790389268325, + "grad_norm": 13.762480712907257, + "learning_rate": 9.387920435183407e-05, + "loss": 0.4207, + "num_input_tokens_seen": 21704680, + "step": 3285 + }, + { + "epoch": 0.40305044255918654, + "grad_norm": 8.193924384571604, + "learning_rate": 9.385490005228624e-05, + "loss": 0.4417, + "num_input_tokens_seen": 21738520, + "step": 3290 + }, + { + "epoch": 0.4036629812256899, + "grad_norm": 7.923032463660513, + "learning_rate": 9.383055075379996e-05, + "loss": 0.4105, + "num_input_tokens_seen": 21772376, + "step": 3295 + }, + { + "epoch": 0.4042755198921932, + "grad_norm": 5.990441460693076, + "learning_rate": 9.380615648135973e-05, + "loss": 0.3826, + "num_input_tokens_seen": 21805544, + "step": 3300 + }, + { + "epoch": 0.4042755198921932, + "eval_loss": 0.42393144965171814, + "eval_runtime": 19.4526, + "eval_samples_per_second": 3.084, + "eval_steps_per_second": 0.771, + "num_input_tokens_seen": 21805544, + "step": 3300 + }, + { + "epoch": 0.40488805855869653, + "grad_norm": 5.690928370267841, + "learning_rate": 9.378171725999618e-05, + "loss": 0.3376, + "num_input_tokens_seen": 21839512, + "step": 3305 + }, + { + "epoch": 0.4055005972251998, + "grad_norm": 28.56912013422237, + "learning_rate": 9.375723311478606e-05, + "loss": 0.5174, + "num_input_tokens_seen": 21873384, + "step": 3310 + }, + { + "epoch": 0.40611313589170317, + "grad_norm": 4.16987907495165, + "learning_rate": 9.373270407085227e-05, + "loss": 0.4144, + "num_input_tokens_seen": 21906552, + "step": 3315 + }, + { + "epoch": 0.40672567455820646, + "grad_norm": 1.7743294355584325, + "learning_rate": 9.37081301533637e-05, + "loss": 0.4661, + "num_input_tokens_seen": 21939496, + "step": 3320 + }, + { + "epoch": 0.4073382132247098, + "grad_norm": 1.261327318134752, + "learning_rate": 9.368351138753535e-05, + "loss": 0.3998, + "num_input_tokens_seen": 21972984, + "step": 3325 + }, + { + "epoch": 0.40795075189121316, + "grad_norm": 1.0524880219591264, + "learning_rate": 9.36588477986282e-05, + "loss": 0.4144, + "num_input_tokens_seen": 22006464, + "step": 3330 + }, + { + "epoch": 0.40856329055771645, + "grad_norm": 1.5402146436463033, + "learning_rate": 9.363413941194926e-05, + "loss": 0.3585, + "num_input_tokens_seen": 22040312, + "step": 3335 + }, + { + "epoch": 0.4091758292242198, + "grad_norm": 12.735729703502708, + "learning_rate": 9.360938625285144e-05, + "loss": 0.5134, + "num_input_tokens_seen": 22073584, + "step": 3340 + }, + { + "epoch": 0.4097883678907231, + "grad_norm": 7.529473626172697, + "learning_rate": 9.35845883467337e-05, + "loss": 0.4455, + "num_input_tokens_seen": 22106912, + "step": 3345 + }, + { + "epoch": 0.41040090655722644, + "grad_norm": 2.3482318343023834, + "learning_rate": 9.355974571904076e-05, + "loss": 0.4719, + "num_input_tokens_seen": 22140064, + "step": 3350 + }, + { + "epoch": 0.41040090655722644, + "eval_loss": 0.3364885747432709, + "eval_runtime": 19.5595, + "eval_samples_per_second": 3.068, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 22140064, + "step": 3350 + }, + { + "epoch": 0.41101344522372973, + "grad_norm": 6.824538152961066, + "learning_rate": 9.353485839526337e-05, + "loss": 0.4072, + "num_input_tokens_seen": 22173000, + "step": 3355 + }, + { + "epoch": 0.4116259838902331, + "grad_norm": 14.810173274272177, + "learning_rate": 9.350992640093807e-05, + "loss": 0.3736, + "num_input_tokens_seen": 22206640, + "step": 3360 + }, + { + "epoch": 0.4122385225567364, + "grad_norm": 3.2709751503442472, + "learning_rate": 9.348494976164727e-05, + "loss": 0.4371, + "num_input_tokens_seen": 22239976, + "step": 3365 + }, + { + "epoch": 0.4128510612232397, + "grad_norm": 1.6816763046218217, + "learning_rate": 9.345992850301917e-05, + "loss": 0.3755, + "num_input_tokens_seen": 22273640, + "step": 3370 + }, + { + "epoch": 0.413463599889743, + "grad_norm": 5.407767563440517, + "learning_rate": 9.343486265072772e-05, + "loss": 0.4523, + "num_input_tokens_seen": 22306584, + "step": 3375 + }, + { + "epoch": 0.41407613855624636, + "grad_norm": 8.68560018786178, + "learning_rate": 9.340975223049271e-05, + "loss": 0.4513, + "num_input_tokens_seen": 22340328, + "step": 3380 + }, + { + "epoch": 0.4146886772227497, + "grad_norm": 11.452464978794895, + "learning_rate": 9.338459726807957e-05, + "loss": 0.4645, + "num_input_tokens_seen": 22375056, + "step": 3385 + }, + { + "epoch": 0.415301215889253, + "grad_norm": 8.308762819246287, + "learning_rate": 9.335939778929952e-05, + "loss": 0.4131, + "num_input_tokens_seen": 22409792, + "step": 3390 + }, + { + "epoch": 0.41591375455575635, + "grad_norm": 11.529130270993797, + "learning_rate": 9.333415382000938e-05, + "loss": 0.4678, + "num_input_tokens_seen": 22443432, + "step": 3395 + }, + { + "epoch": 0.41652629322225965, + "grad_norm": 5.014481150791811, + "learning_rate": 9.330886538611168e-05, + "loss": 0.452, + "num_input_tokens_seen": 22477472, + "step": 3400 + }, + { + "epoch": 0.41652629322225965, + "eval_loss": 0.3813478350639343, + "eval_runtime": 19.5063, + "eval_samples_per_second": 3.076, + "eval_steps_per_second": 0.769, + "num_input_tokens_seen": 22477472, + "step": 3400 + }, + { + "epoch": 0.417138831888763, + "grad_norm": 4.344440488869656, + "learning_rate": 9.328353251355453e-05, + "loss": 0.473, + "num_input_tokens_seen": 22511088, + "step": 3405 + }, + { + "epoch": 0.4177513705552663, + "grad_norm": 10.876942615083081, + "learning_rate": 9.325815522833167e-05, + "loss": 0.4002, + "num_input_tokens_seen": 22545728, + "step": 3410 + }, + { + "epoch": 0.41836390922176964, + "grad_norm": 4.606431238273991, + "learning_rate": 9.323273355648241e-05, + "loss": 0.4302, + "num_input_tokens_seen": 22578928, + "step": 3415 + }, + { + "epoch": 0.41897644788827293, + "grad_norm": 2.744198133366261, + "learning_rate": 9.320726752409157e-05, + "loss": 0.3674, + "num_input_tokens_seen": 22612896, + "step": 3420 + }, + { + "epoch": 0.4195889865547763, + "grad_norm": 1.3972371974035387, + "learning_rate": 9.318175715728954e-05, + "loss": 0.3903, + "num_input_tokens_seen": 22645784, + "step": 3425 + }, + { + "epoch": 0.42020152522127957, + "grad_norm": 8.173117810995006, + "learning_rate": 9.315620248225217e-05, + "loss": 0.4181, + "num_input_tokens_seen": 22678528, + "step": 3430 + }, + { + "epoch": 0.4208140638877829, + "grad_norm": 10.575896578498723, + "learning_rate": 9.313060352520073e-05, + "loss": 0.4324, + "num_input_tokens_seen": 22712640, + "step": 3435 + }, + { + "epoch": 0.42142660255428627, + "grad_norm": 10.174914211044738, + "learning_rate": 9.310496031240205e-05, + "loss": 0.3523, + "num_input_tokens_seen": 22746800, + "step": 3440 + }, + { + "epoch": 0.42203914122078956, + "grad_norm": 11.780143581971112, + "learning_rate": 9.307927287016823e-05, + "loss": 0.3652, + "num_input_tokens_seen": 22780448, + "step": 3445 + }, + { + "epoch": 0.4226516798872929, + "grad_norm": 6.497564787005414, + "learning_rate": 9.305354122485686e-05, + "loss": 0.4566, + "num_input_tokens_seen": 22813528, + "step": 3450 + }, + { + "epoch": 0.4226516798872929, + "eval_loss": 0.4307055175304413, + "eval_runtime": 19.0694, + "eval_samples_per_second": 3.146, + "eval_steps_per_second": 0.787, + "num_input_tokens_seen": 22813528, + "step": 3450 + }, + { + "epoch": 0.4232642185537962, + "grad_norm": 7.241543525453529, + "learning_rate": 9.302776540287082e-05, + "loss": 0.414, + "num_input_tokens_seen": 22846864, + "step": 3455 + }, + { + "epoch": 0.42387675722029955, + "grad_norm": 11.627847168483536, + "learning_rate": 9.300194543065835e-05, + "loss": 0.5134, + "num_input_tokens_seen": 22879984, + "step": 3460 + }, + { + "epoch": 0.42448929588680284, + "grad_norm": 31.395391147031035, + "learning_rate": 9.2976081334713e-05, + "loss": 0.4776, + "num_input_tokens_seen": 22913752, + "step": 3465 + }, + { + "epoch": 0.4251018345533062, + "grad_norm": 1.0545088361406252, + "learning_rate": 9.295017314157354e-05, + "loss": 0.4034, + "num_input_tokens_seen": 22947360, + "step": 3470 + }, + { + "epoch": 0.4257143732198095, + "grad_norm": 9.26693252720066, + "learning_rate": 9.292422087782408e-05, + "loss": 0.4691, + "num_input_tokens_seen": 22980784, + "step": 3475 + }, + { + "epoch": 0.42632691188631283, + "grad_norm": 9.182773573991268, + "learning_rate": 9.289822457009388e-05, + "loss": 0.3737, + "num_input_tokens_seen": 23015304, + "step": 3480 + }, + { + "epoch": 0.4269394505528161, + "grad_norm": 2.880393123809363, + "learning_rate": 9.287218424505736e-05, + "loss": 0.431, + "num_input_tokens_seen": 23048248, + "step": 3485 + }, + { + "epoch": 0.4275519892193195, + "grad_norm": 10.88263353160836, + "learning_rate": 9.284609992943423e-05, + "loss": 0.3861, + "num_input_tokens_seen": 23082144, + "step": 3490 + }, + { + "epoch": 0.42816452788582277, + "grad_norm": 5.98687910253018, + "learning_rate": 9.281997164998923e-05, + "loss": 0.4227, + "num_input_tokens_seen": 23115584, + "step": 3495 + }, + { + "epoch": 0.4287770665523261, + "grad_norm": 11.303720302613218, + "learning_rate": 9.279379943353226e-05, + "loss": 0.4345, + "num_input_tokens_seen": 23149384, + "step": 3500 + }, + { + "epoch": 0.4287770665523261, + "eval_loss": 0.2862895727157593, + "eval_runtime": 19.6292, + "eval_samples_per_second": 3.057, + "eval_steps_per_second": 0.764, + "num_input_tokens_seen": 23149384, + "step": 3500 + }, + { + "epoch": 0.42938960521882946, + "grad_norm": 2.0886889718680224, + "learning_rate": 9.276758330691826e-05, + "loss": 0.4072, + "num_input_tokens_seen": 23183112, + "step": 3505 + }, + { + "epoch": 0.43000214388533275, + "grad_norm": 4.241585224492075, + "learning_rate": 9.27413232970473e-05, + "loss": 0.5979, + "num_input_tokens_seen": 23216232, + "step": 3510 + }, + { + "epoch": 0.4306146825518361, + "grad_norm": 4.405866753873196, + "learning_rate": 9.271501943086437e-05, + "loss": 0.5025, + "num_input_tokens_seen": 23249256, + "step": 3515 + }, + { + "epoch": 0.4312272212183394, + "grad_norm": 5.915991656898724, + "learning_rate": 9.268867173535957e-05, + "loss": 0.4179, + "num_input_tokens_seen": 23283064, + "step": 3520 + }, + { + "epoch": 0.43183975988484274, + "grad_norm": 4.390913492794889, + "learning_rate": 9.266228023756791e-05, + "loss": 0.3838, + "num_input_tokens_seen": 23316448, + "step": 3525 + }, + { + "epoch": 0.43245229855134604, + "grad_norm": 3.506780045332695, + "learning_rate": 9.263584496456937e-05, + "loss": 0.4015, + "num_input_tokens_seen": 23349592, + "step": 3530 + }, + { + "epoch": 0.4330648372178494, + "grad_norm": 9.198391624029652, + "learning_rate": 9.260936594348882e-05, + "loss": 0.4496, + "num_input_tokens_seen": 23383184, + "step": 3535 + }, + { + "epoch": 0.4336773758843527, + "grad_norm": 7.3384425431889735, + "learning_rate": 9.258284320149605e-05, + "loss": 0.4206, + "num_input_tokens_seen": 23416576, + "step": 3540 + }, + { + "epoch": 0.434289914550856, + "grad_norm": 3.01250049316644, + "learning_rate": 9.25562767658057e-05, + "loss": 0.4029, + "num_input_tokens_seen": 23450336, + "step": 3545 + }, + { + "epoch": 0.4349024532173593, + "grad_norm": 3.187881691721142, + "learning_rate": 9.252966666367722e-05, + "loss": 0.4101, + "num_input_tokens_seen": 23484072, + "step": 3550 + }, + { + "epoch": 0.4349024532173593, + "eval_loss": 0.33307310938835144, + "eval_runtime": 19.4367, + "eval_samples_per_second": 3.087, + "eval_steps_per_second": 0.772, + "num_input_tokens_seen": 23484072, + "step": 3550 + }, + { + "epoch": 0.43551499188386267, + "grad_norm": 4.733953949863986, + "learning_rate": 9.250301292241492e-05, + "loss": 0.4278, + "num_input_tokens_seen": 23516352, + "step": 3555 + }, + { + "epoch": 0.436127530550366, + "grad_norm": 18.438502925017318, + "learning_rate": 9.247631556936782e-05, + "loss": 0.4285, + "num_input_tokens_seen": 23550240, + "step": 3560 + }, + { + "epoch": 0.4367400692168693, + "grad_norm": 2.3175219122163595, + "learning_rate": 9.244957463192976e-05, + "loss": 0.4004, + "num_input_tokens_seen": 23584792, + "step": 3565 + }, + { + "epoch": 0.43735260788337266, + "grad_norm": 1.2620927350871958, + "learning_rate": 9.242279013753922e-05, + "loss": 0.3487, + "num_input_tokens_seen": 23617416, + "step": 3570 + }, + { + "epoch": 0.43796514654987595, + "grad_norm": 1.4269842152904832, + "learning_rate": 9.239596211367948e-05, + "loss": 0.4213, + "num_input_tokens_seen": 23650840, + "step": 3575 + }, + { + "epoch": 0.4385776852163793, + "grad_norm": 10.601241552669514, + "learning_rate": 9.236909058787837e-05, + "loss": 0.4415, + "num_input_tokens_seen": 23684112, + "step": 3580 + }, + { + "epoch": 0.4391902238828826, + "grad_norm": 14.06963445216711, + "learning_rate": 9.234217558770844e-05, + "loss": 0.5179, + "num_input_tokens_seen": 23718136, + "step": 3585 + }, + { + "epoch": 0.43980276254938594, + "grad_norm": 8.561086551328941, + "learning_rate": 9.231521714078681e-05, + "loss": 0.4178, + "num_input_tokens_seen": 23751840, + "step": 3590 + }, + { + "epoch": 0.44041530121588923, + "grad_norm": 1.5198455306968865, + "learning_rate": 9.228821527477519e-05, + "loss": 0.3969, + "num_input_tokens_seen": 23785544, + "step": 3595 + }, + { + "epoch": 0.4410278398823926, + "grad_norm": 3.300282492311308, + "learning_rate": 9.226117001737983e-05, + "loss": 0.3588, + "num_input_tokens_seen": 23819160, + "step": 3600 + }, + { + "epoch": 0.4410278398823926, + "eval_loss": 0.2863926589488983, + "eval_runtime": 19.4033, + "eval_samples_per_second": 3.092, + "eval_steps_per_second": 0.773, + "num_input_tokens_seen": 23819160, + "step": 3600 + }, + { + "epoch": 0.4416403785488959, + "grad_norm": 7.685979894080164, + "learning_rate": 9.223408139635153e-05, + "loss": 0.4315, + "num_input_tokens_seen": 23852120, + "step": 3605 + }, + { + "epoch": 0.4422529172153992, + "grad_norm": 1.0143631236810027, + "learning_rate": 9.220694943948556e-05, + "loss": 0.3867, + "num_input_tokens_seen": 23885720, + "step": 3610 + }, + { + "epoch": 0.44286545588190257, + "grad_norm": 9.879435189716988, + "learning_rate": 9.217977417462167e-05, + "loss": 0.405, + "num_input_tokens_seen": 23918856, + "step": 3615 + }, + { + "epoch": 0.44347799454840586, + "grad_norm": 1.4310302703185884, + "learning_rate": 9.215255562964404e-05, + "loss": 0.402, + "num_input_tokens_seen": 23951984, + "step": 3620 + }, + { + "epoch": 0.4440905332149092, + "grad_norm": 23.507505451325468, + "learning_rate": 9.212529383248127e-05, + "loss": 0.3315, + "num_input_tokens_seen": 23984912, + "step": 3625 + }, + { + "epoch": 0.4447030718814125, + "grad_norm": 17.574464296534668, + "learning_rate": 9.209798881110634e-05, + "loss": 0.4262, + "num_input_tokens_seen": 24018568, + "step": 3630 + }, + { + "epoch": 0.44531561054791585, + "grad_norm": 3.247543755432227, + "learning_rate": 9.207064059353654e-05, + "loss": 0.3885, + "num_input_tokens_seen": 24052536, + "step": 3635 + }, + { + "epoch": 0.44592814921441914, + "grad_norm": 2.354232285690562, + "learning_rate": 9.204324920783355e-05, + "loss": 0.3797, + "num_input_tokens_seen": 24086168, + "step": 3640 + }, + { + "epoch": 0.4465406878809225, + "grad_norm": 9.33175773179384, + "learning_rate": 9.20158146821033e-05, + "loss": 0.3571, + "num_input_tokens_seen": 24120160, + "step": 3645 + }, + { + "epoch": 0.4471532265474258, + "grad_norm": 8.591121001861783, + "learning_rate": 9.1988337044496e-05, + "loss": 0.4439, + "num_input_tokens_seen": 24153672, + "step": 3650 + }, + { + "epoch": 0.4471532265474258, + "eval_loss": 0.3060784935951233, + "eval_runtime": 19.2693, + "eval_samples_per_second": 3.114, + "eval_steps_per_second": 0.778, + "num_input_tokens_seen": 24153672, + "step": 3650 + }, + { + "epoch": 0.44776576521392913, + "grad_norm": 1.8630184229259468, + "learning_rate": 9.19608163232061e-05, + "loss": 0.3653, + "num_input_tokens_seen": 24187312, + "step": 3655 + }, + { + "epoch": 0.4483783038804324, + "grad_norm": 1.2981072903690989, + "learning_rate": 9.193325254647225e-05, + "loss": 0.4087, + "num_input_tokens_seen": 24220960, + "step": 3660 + }, + { + "epoch": 0.4489908425469358, + "grad_norm": 1.1706706485903808, + "learning_rate": 9.190564574257727e-05, + "loss": 0.4211, + "num_input_tokens_seen": 24254224, + "step": 3665 + }, + { + "epoch": 0.4496033812134391, + "grad_norm": 1.9560446040142832, + "learning_rate": 9.187799593984814e-05, + "loss": 0.2987, + "num_input_tokens_seen": 24288128, + "step": 3670 + }, + { + "epoch": 0.4502159198799424, + "grad_norm": 4.477722791597271, + "learning_rate": 9.185030316665597e-05, + "loss": 0.4294, + "num_input_tokens_seen": 24321200, + "step": 3675 + }, + { + "epoch": 0.45082845854644576, + "grad_norm": 8.475751770373268, + "learning_rate": 9.182256745141595e-05, + "loss": 0.3842, + "num_input_tokens_seen": 24354864, + "step": 3680 + }, + { + "epoch": 0.45144099721294906, + "grad_norm": 2.6990935869003203, + "learning_rate": 9.179478882258732e-05, + "loss": 0.463, + "num_input_tokens_seen": 24388472, + "step": 3685 + }, + { + "epoch": 0.4520535358794524, + "grad_norm": 0.9770726173732045, + "learning_rate": 9.176696730867339e-05, + "loss": 0.4246, + "num_input_tokens_seen": 24421688, + "step": 3690 + }, + { + "epoch": 0.4526660745459557, + "grad_norm": 4.992261748327737, + "learning_rate": 9.173910293822145e-05, + "loss": 0.541, + "num_input_tokens_seen": 24455024, + "step": 3695 + }, + { + "epoch": 0.45327861321245905, + "grad_norm": 11.27597582677581, + "learning_rate": 9.171119573982274e-05, + "loss": 0.3865, + "num_input_tokens_seen": 24489272, + "step": 3700 + }, + { + "epoch": 0.45327861321245905, + "eval_loss": 0.3225303292274475, + "eval_runtime": 19.2906, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "num_input_tokens_seen": 24489272, + "step": 3700 + }, + { + "epoch": 0.45389115187896234, + "grad_norm": 6.405046354282304, + "learning_rate": 9.168324574211249e-05, + "loss": 0.3733, + "num_input_tokens_seen": 24523128, + "step": 3705 + }, + { + "epoch": 0.4545036905454657, + "grad_norm": 3.969413094633488, + "learning_rate": 9.165525297376982e-05, + "loss": 0.3909, + "num_input_tokens_seen": 24557744, + "step": 3710 + }, + { + "epoch": 0.455116229211969, + "grad_norm": 14.10278008460725, + "learning_rate": 9.162721746351774e-05, + "loss": 0.4227, + "num_input_tokens_seen": 24590912, + "step": 3715 + }, + { + "epoch": 0.45572876787847233, + "grad_norm": 4.235489205097211, + "learning_rate": 9.159913924012314e-05, + "loss": 0.4473, + "num_input_tokens_seen": 24624144, + "step": 3720 + }, + { + "epoch": 0.4563413065449757, + "grad_norm": 2.010762750873219, + "learning_rate": 9.15710183323967e-05, + "loss": 0.3752, + "num_input_tokens_seen": 24658424, + "step": 3725 + }, + { + "epoch": 0.45695384521147897, + "grad_norm": 1.3182588719273844, + "learning_rate": 9.154285476919291e-05, + "loss": 0.384, + "num_input_tokens_seen": 24691680, + "step": 3730 + }, + { + "epoch": 0.4575663838779823, + "grad_norm": 1.290290287588427, + "learning_rate": 9.151464857941003e-05, + "loss": 0.3846, + "num_input_tokens_seen": 24724784, + "step": 3735 + }, + { + "epoch": 0.4581789225444856, + "grad_norm": 2.173827525826482, + "learning_rate": 9.148639979199009e-05, + "loss": 0.3126, + "num_input_tokens_seen": 24758064, + "step": 3740 + }, + { + "epoch": 0.45879146121098896, + "grad_norm": 13.376107690880517, + "learning_rate": 9.145810843591879e-05, + "loss": 0.4587, + "num_input_tokens_seen": 24791832, + "step": 3745 + }, + { + "epoch": 0.45940399987749225, + "grad_norm": 12.233137994740318, + "learning_rate": 9.14297745402255e-05, + "loss": 0.4547, + "num_input_tokens_seen": 24825512, + "step": 3750 + }, + { + "epoch": 0.45940399987749225, + "eval_loss": 0.329493910074234, + "eval_runtime": 19.7135, + "eval_samples_per_second": 3.044, + "eval_steps_per_second": 0.761, + "num_input_tokens_seen": 24825512, + "step": 3750 + }, + { + "epoch": 0.4600165385439956, + "grad_norm": 3.826360363966104, + "learning_rate": 9.140139813398328e-05, + "loss": 0.3968, + "num_input_tokens_seen": 24859008, + "step": 3755 + }, + { + "epoch": 0.4606290772104989, + "grad_norm": 2.30336156556904, + "learning_rate": 9.137297924630876e-05, + "loss": 0.4446, + "num_input_tokens_seen": 24892584, + "step": 3760 + }, + { + "epoch": 0.46124161587700224, + "grad_norm": 4.180049847592076, + "learning_rate": 9.134451790636223e-05, + "loss": 0.3624, + "num_input_tokens_seen": 24926904, + "step": 3765 + }, + { + "epoch": 0.46185415454350554, + "grad_norm": 1.157060577791603, + "learning_rate": 9.131601414334748e-05, + "loss": 0.3832, + "num_input_tokens_seen": 24959880, + "step": 3770 + }, + { + "epoch": 0.4624666932100089, + "grad_norm": 3.6894240220991255, + "learning_rate": 9.128746798651184e-05, + "loss": 0.4412, + "num_input_tokens_seen": 24993064, + "step": 3775 + }, + { + "epoch": 0.46307923187651223, + "grad_norm": 1.03164950572145, + "learning_rate": 9.125887946514615e-05, + "loss": 0.3528, + "num_input_tokens_seen": 25027216, + "step": 3780 + }, + { + "epoch": 0.4636917705430155, + "grad_norm": 4.214066033291009, + "learning_rate": 9.123024860858473e-05, + "loss": 0.4049, + "num_input_tokens_seen": 25059744, + "step": 3785 + }, + { + "epoch": 0.4643043092095189, + "grad_norm": 2.474465777206733, + "learning_rate": 9.120157544620532e-05, + "loss": 0.4077, + "num_input_tokens_seen": 25093136, + "step": 3790 + }, + { + "epoch": 0.46491684787602217, + "grad_norm": 1.5446522598891546, + "learning_rate": 9.117286000742907e-05, + "loss": 0.3809, + "num_input_tokens_seen": 25126312, + "step": 3795 + }, + { + "epoch": 0.4655293865425255, + "grad_norm": 8.505303942369093, + "learning_rate": 9.114410232172051e-05, + "loss": 0.3747, + "num_input_tokens_seen": 25159152, + "step": 3800 + }, + { + "epoch": 0.4655293865425255, + "eval_loss": 0.3077337443828583, + "eval_runtime": 19.2621, + "eval_samples_per_second": 3.115, + "eval_steps_per_second": 0.779, + "num_input_tokens_seen": 25159152, + "step": 3800 + }, + { + "epoch": 0.4661419252090288, + "grad_norm": 2.845735344014034, + "learning_rate": 9.111530241858752e-05, + "loss": 0.4342, + "num_input_tokens_seen": 25192400, + "step": 3805 + }, + { + "epoch": 0.46675446387553216, + "grad_norm": 2.2133441301209147, + "learning_rate": 9.108646032758134e-05, + "loss": 0.3581, + "num_input_tokens_seen": 25225960, + "step": 3810 + }, + { + "epoch": 0.46736700254203545, + "grad_norm": 1.8277145752019064, + "learning_rate": 9.105757607829644e-05, + "loss": 0.3421, + "num_input_tokens_seen": 25259928, + "step": 3815 + }, + { + "epoch": 0.4679795412085388, + "grad_norm": 20.098923001464286, + "learning_rate": 9.102864970037055e-05, + "loss": 0.5049, + "num_input_tokens_seen": 25293656, + "step": 3820 + }, + { + "epoch": 0.4685920798750421, + "grad_norm": 3.5784671779241046, + "learning_rate": 9.099968122348467e-05, + "loss": 0.5269, + "num_input_tokens_seen": 25326880, + "step": 3825 + }, + { + "epoch": 0.46920461854154544, + "grad_norm": 10.843761894162492, + "learning_rate": 9.097067067736298e-05, + "loss": 0.4591, + "num_input_tokens_seen": 25360360, + "step": 3830 + }, + { + "epoch": 0.4698171572080488, + "grad_norm": 5.29918425846746, + "learning_rate": 9.094161809177281e-05, + "loss": 0.4192, + "num_input_tokens_seen": 25394592, + "step": 3835 + }, + { + "epoch": 0.4704296958745521, + "grad_norm": 4.221276735198347, + "learning_rate": 9.091252349652465e-05, + "loss": 0.3305, + "num_input_tokens_seen": 25427840, + "step": 3840 + }, + { + "epoch": 0.4710422345410554, + "grad_norm": 11.408473071091255, + "learning_rate": 9.088338692147205e-05, + "loss": 0.4276, + "num_input_tokens_seen": 25461840, + "step": 3845 + }, + { + "epoch": 0.4716547732075587, + "grad_norm": 4.6528846798763634, + "learning_rate": 9.085420839651171e-05, + "loss": 0.3792, + "num_input_tokens_seen": 25495704, + "step": 3850 + }, + { + "epoch": 0.4716547732075587, + "eval_loss": 0.22186821699142456, + "eval_runtime": 19.3722, + "eval_samples_per_second": 3.097, + "eval_steps_per_second": 0.774, + "num_input_tokens_seen": 25495704, + "step": 3850 + }, + { + "epoch": 0.47226731187406207, + "grad_norm": 6.437911816092746, + "learning_rate": 9.08249879515833e-05, + "loss": 0.3757, + "num_input_tokens_seen": 25529528, + "step": 3855 + }, + { + "epoch": 0.47287985054056536, + "grad_norm": 1.3263623143912566, + "learning_rate": 9.079572561666956e-05, + "loss": 0.3942, + "num_input_tokens_seen": 25562760, + "step": 3860 + }, + { + "epoch": 0.4734923892070687, + "grad_norm": 1.7696546161137154, + "learning_rate": 9.076642142179616e-05, + "loss": 0.3056, + "num_input_tokens_seen": 25596896, + "step": 3865 + }, + { + "epoch": 0.474104927873572, + "grad_norm": 5.092703750150224, + "learning_rate": 9.07370753970318e-05, + "loss": 0.4779, + "num_input_tokens_seen": 25630144, + "step": 3870 + }, + { + "epoch": 0.47471746654007535, + "grad_norm": 21.863587409192906, + "learning_rate": 9.0707687572488e-05, + "loss": 0.3364, + "num_input_tokens_seen": 25664128, + "step": 3875 + }, + { + "epoch": 0.47533000520657864, + "grad_norm": 8.73514589503755, + "learning_rate": 9.067825797831924e-05, + "loss": 0.3854, + "num_input_tokens_seen": 25698152, + "step": 3880 + }, + { + "epoch": 0.475942543873082, + "grad_norm": 7.292038666249636, + "learning_rate": 9.064878664472282e-05, + "loss": 0.3675, + "num_input_tokens_seen": 25732096, + "step": 3885 + }, + { + "epoch": 0.4765550825395853, + "grad_norm": 6.546742018383025, + "learning_rate": 9.061927360193894e-05, + "loss": 0.3702, + "num_input_tokens_seen": 25765792, + "step": 3890 + }, + { + "epoch": 0.47716762120608863, + "grad_norm": 1.450020256267094, + "learning_rate": 9.058971888025052e-05, + "loss": 0.407, + "num_input_tokens_seen": 25798856, + "step": 3895 + }, + { + "epoch": 0.477780159872592, + "grad_norm": 1.2381041336305705, + "learning_rate": 9.056012250998325e-05, + "loss": 0.4111, + "num_input_tokens_seen": 25831784, + "step": 3900 + }, + { + "epoch": 0.477780159872592, + "eval_loss": 0.4086068868637085, + "eval_runtime": 19.3608, + "eval_samples_per_second": 3.099, + "eval_steps_per_second": 0.775, + "num_input_tokens_seen": 25831784, + "step": 3900 + }, + { + "epoch": 0.4783926985390953, + "grad_norm": 16.320316988551394, + "learning_rate": 9.053048452150562e-05, + "loss": 0.3328, + "num_input_tokens_seen": 25865600, + "step": 3905 + }, + { + "epoch": 0.4790052372055986, + "grad_norm": 1.1346625720919554, + "learning_rate": 9.050080494522875e-05, + "loss": 0.4061, + "num_input_tokens_seen": 25899352, + "step": 3910 + }, + { + "epoch": 0.4796177758721019, + "grad_norm": 2.4563741452274366, + "learning_rate": 9.047108381160647e-05, + "loss": 0.4168, + "num_input_tokens_seen": 25932640, + "step": 3915 + }, + { + "epoch": 0.48023031453860526, + "grad_norm": 13.026115052113138, + "learning_rate": 9.044132115113525e-05, + "loss": 0.395, + "num_input_tokens_seen": 25966192, + "step": 3920 + }, + { + "epoch": 0.48084285320510856, + "grad_norm": 17.248960425810434, + "learning_rate": 9.041151699435417e-05, + "loss": 0.3331, + "num_input_tokens_seen": 25999896, + "step": 3925 + }, + { + "epoch": 0.4814553918716119, + "grad_norm": 7.787399667914957, + "learning_rate": 9.038167137184488e-05, + "loss": 0.3548, + "num_input_tokens_seen": 26033344, + "step": 3930 + }, + { + "epoch": 0.4820679305381152, + "grad_norm": 1.0181146122815241, + "learning_rate": 9.035178431423159e-05, + "loss": 0.3051, + "num_input_tokens_seen": 26066968, + "step": 3935 + }, + { + "epoch": 0.48268046920461855, + "grad_norm": 9.382877070525275, + "learning_rate": 9.0321855852181e-05, + "loss": 0.3944, + "num_input_tokens_seen": 26100048, + "step": 3940 + }, + { + "epoch": 0.48329300787112184, + "grad_norm": 0.7932512696330944, + "learning_rate": 9.029188601640234e-05, + "loss": 0.3257, + "num_input_tokens_seen": 26134240, + "step": 3945 + }, + { + "epoch": 0.4839055465376252, + "grad_norm": 0.8124448108792727, + "learning_rate": 9.026187483764725e-05, + "loss": 0.3546, + "num_input_tokens_seen": 26167704, + "step": 3950 + }, + { + "epoch": 0.4839055465376252, + "eval_loss": 0.31338992714881897, + "eval_runtime": 19.3621, + "eval_samples_per_second": 3.099, + "eval_steps_per_second": 0.775, + "num_input_tokens_seen": 26167704, + "step": 3950 + }, + { + "epoch": 0.48451808520412853, + "grad_norm": 1.677918528425117, + "learning_rate": 9.023182234670981e-05, + "loss": 0.4172, + "num_input_tokens_seen": 26200904, + "step": 3955 + }, + { + "epoch": 0.4851306238706318, + "grad_norm": 3.3211698740416336, + "learning_rate": 9.020172857442647e-05, + "loss": 0.3223, + "num_input_tokens_seen": 26234944, + "step": 3960 + }, + { + "epoch": 0.4857431625371352, + "grad_norm": 3.542321247547096, + "learning_rate": 9.017159355167609e-05, + "loss": 0.3836, + "num_input_tokens_seen": 26268272, + "step": 3965 + }, + { + "epoch": 0.48635570120363847, + "grad_norm": 1.0953410746604542, + "learning_rate": 9.014141730937978e-05, + "loss": 0.3538, + "num_input_tokens_seen": 26301736, + "step": 3970 + }, + { + "epoch": 0.4869682398701418, + "grad_norm": 11.648372722187634, + "learning_rate": 9.011119987850103e-05, + "loss": 0.4015, + "num_input_tokens_seen": 26334800, + "step": 3975 + }, + { + "epoch": 0.4875807785366451, + "grad_norm": 7.419598174624838, + "learning_rate": 9.008094129004552e-05, + "loss": 0.4309, + "num_input_tokens_seen": 26368176, + "step": 3980 + }, + { + "epoch": 0.48819331720314846, + "grad_norm": 1.0414133411521413, + "learning_rate": 9.00506415750612e-05, + "loss": 0.3684, + "num_input_tokens_seen": 26401184, + "step": 3985 + }, + { + "epoch": 0.48880585586965175, + "grad_norm": 1.3161505451783133, + "learning_rate": 9.00203007646382e-05, + "loss": 0.3302, + "num_input_tokens_seen": 26435504, + "step": 3990 + }, + { + "epoch": 0.4894183945361551, + "grad_norm": 10.440727963377881, + "learning_rate": 8.998991888990886e-05, + "loss": 0.4337, + "num_input_tokens_seen": 26468848, + "step": 3995 + }, + { + "epoch": 0.4900309332026584, + "grad_norm": 0.7324585924589593, + "learning_rate": 8.99594959820476e-05, + "loss": 0.3654, + "num_input_tokens_seen": 26502544, + "step": 4000 + }, + { + "epoch": 0.4900309332026584, + "eval_loss": 0.21694067120552063, + "eval_runtime": 19.1963, + "eval_samples_per_second": 3.126, + "eval_steps_per_second": 0.781, + "num_input_tokens_seen": 26502544, + "step": 4000 + }, + { + "epoch": 0.49064347186916174, + "grad_norm": 3.713900581808831, + "learning_rate": 8.992903207227097e-05, + "loss": 0.3121, + "num_input_tokens_seen": 26535712, + "step": 4005 + }, + { + "epoch": 0.4912560105356651, + "grad_norm": 2.6973485236194654, + "learning_rate": 8.98985271918376e-05, + "loss": 0.4167, + "num_input_tokens_seen": 26568952, + "step": 4010 + }, + { + "epoch": 0.4918685492021684, + "grad_norm": 14.49212632898443, + "learning_rate": 8.986798137204813e-05, + "loss": 0.3339, + "num_input_tokens_seen": 26602464, + "step": 4015 + }, + { + "epoch": 0.49248108786867173, + "grad_norm": 19.831877691402223, + "learning_rate": 8.983739464424522e-05, + "loss": 0.3833, + "num_input_tokens_seen": 26635984, + "step": 4020 + }, + { + "epoch": 0.493093626535175, + "grad_norm": 8.258999293506754, + "learning_rate": 8.980676703981354e-05, + "loss": 0.3197, + "num_input_tokens_seen": 26670048, + "step": 4025 + }, + { + "epoch": 0.49370616520167837, + "grad_norm": 1.021833327621968, + "learning_rate": 8.977609859017964e-05, + "loss": 0.367, + "num_input_tokens_seen": 26703248, + "step": 4030 + }, + { + "epoch": 0.49431870386818166, + "grad_norm": 28.97059310351216, + "learning_rate": 8.974538932681204e-05, + "loss": 0.474, + "num_input_tokens_seen": 26736920, + "step": 4035 + }, + { + "epoch": 0.494931242534685, + "grad_norm": 1.5739695383793706, + "learning_rate": 8.971463928122113e-05, + "loss": 0.392, + "num_input_tokens_seen": 26770000, + "step": 4040 + }, + { + "epoch": 0.4955437812011883, + "grad_norm": 1.3653995373187096, + "learning_rate": 8.968384848495908e-05, + "loss": 0.3828, + "num_input_tokens_seen": 26803344, + "step": 4045 + }, + { + "epoch": 0.49615631986769165, + "grad_norm": 20.177791200048823, + "learning_rate": 8.965301696961994e-05, + "loss": 0.4304, + "num_input_tokens_seen": 26837152, + "step": 4050 + }, + { + "epoch": 0.49615631986769165, + "eval_loss": 0.5118626356124878, + "eval_runtime": 19.3755, + "eval_samples_per_second": 3.097, + "eval_steps_per_second": 0.774, + "num_input_tokens_seen": 26837152, + "step": 4050 + }, + { + "epoch": 0.49676885853419495, + "grad_norm": 3.9037891994168143, + "learning_rate": 8.962214476683954e-05, + "loss": 0.4811, + "num_input_tokens_seen": 26870528, + "step": 4055 + }, + { + "epoch": 0.4973813972006983, + "grad_norm": 9.410022185385456, + "learning_rate": 8.959123190829544e-05, + "loss": 0.4701, + "num_input_tokens_seen": 26903176, + "step": 4060 + }, + { + "epoch": 0.49799393586720164, + "grad_norm": 1.7206836377069623, + "learning_rate": 8.956027842570692e-05, + "loss": 0.4449, + "num_input_tokens_seen": 26936832, + "step": 4065 + }, + { + "epoch": 0.49860647453370494, + "grad_norm": 10.467397020039593, + "learning_rate": 8.952928435083491e-05, + "loss": 0.3572, + "num_input_tokens_seen": 26970504, + "step": 4070 + }, + { + "epoch": 0.4992190132002083, + "grad_norm": 1.1363968166174134, + "learning_rate": 8.949824971548206e-05, + "loss": 0.3907, + "num_input_tokens_seen": 27003736, + "step": 4075 + }, + { + "epoch": 0.4998315518667116, + "grad_norm": 1.157435983723505, + "learning_rate": 8.946717455149259e-05, + "loss": 0.361, + "num_input_tokens_seen": 27037648, + "step": 4080 + }, + { + "epoch": 0.5004440905332149, + "grad_norm": 2.079444407529442, + "learning_rate": 8.943605889075233e-05, + "loss": 0.3149, + "num_input_tokens_seen": 27071560, + "step": 4085 + }, + { + "epoch": 0.5010566291997183, + "grad_norm": 1.1772377164666639, + "learning_rate": 8.940490276518864e-05, + "loss": 0.3322, + "num_input_tokens_seen": 27106032, + "step": 4090 + }, + { + "epoch": 0.5016691678662215, + "grad_norm": 14.80980548134689, + "learning_rate": 8.937370620677042e-05, + "loss": 0.3581, + "num_input_tokens_seen": 27139656, + "step": 4095 + }, + { + "epoch": 0.5022817065327249, + "grad_norm": 1.2175184036128284, + "learning_rate": 8.934246924750804e-05, + "loss": 0.3471, + "num_input_tokens_seen": 27173784, + "step": 4100 + }, + { + "epoch": 0.5022817065327249, + "eval_loss": 0.20771612226963043, + "eval_runtime": 19.2115, + "eval_samples_per_second": 3.123, + "eval_steps_per_second": 0.781, + "num_input_tokens_seen": 27173784, + "step": 4100 + }, + { + "epoch": 0.5028942451992282, + "grad_norm": 5.5711147093951165, + "learning_rate": 8.931119191945334e-05, + "loss": 0.3476, + "num_input_tokens_seen": 27208344, + "step": 4105 + }, + { + "epoch": 0.5035067838657316, + "grad_norm": 2.540443627150266, + "learning_rate": 8.927987425469957e-05, + "loss": 0.3914, + "num_input_tokens_seen": 27242560, + "step": 4110 + }, + { + "epoch": 0.5041193225322348, + "grad_norm": 0.750807285452677, + "learning_rate": 8.92485162853814e-05, + "loss": 0.3901, + "num_input_tokens_seen": 27276256, + "step": 4115 + }, + { + "epoch": 0.5047318611987381, + "grad_norm": 2.723873815398748, + "learning_rate": 8.921711804367481e-05, + "loss": 0.3934, + "num_input_tokens_seen": 27310360, + "step": 4120 + }, + { + "epoch": 0.5053443998652415, + "grad_norm": 0.8895378173656604, + "learning_rate": 8.918567956179711e-05, + "loss": 0.3704, + "num_input_tokens_seen": 27343672, + "step": 4125 + }, + { + "epoch": 0.5059569385317448, + "grad_norm": 15.523107374641429, + "learning_rate": 8.915420087200694e-05, + "loss": 0.3748, + "num_input_tokens_seen": 27377504, + "step": 4130 + }, + { + "epoch": 0.5065694771982482, + "grad_norm": 9.378474978338842, + "learning_rate": 8.912268200660418e-05, + "loss": 0.3259, + "num_input_tokens_seen": 27411072, + "step": 4135 + }, + { + "epoch": 0.5071820158647514, + "grad_norm": 13.059573031029371, + "learning_rate": 8.90911229979299e-05, + "loss": 0.4131, + "num_input_tokens_seen": 27444096, + "step": 4140 + }, + { + "epoch": 0.5077945545312548, + "grad_norm": 7.445255765143421, + "learning_rate": 8.905952387836639e-05, + "loss": 0.3484, + "num_input_tokens_seen": 27477864, + "step": 4145 + }, + { + "epoch": 0.5084070931977581, + "grad_norm": 2.923457649686931, + "learning_rate": 8.90278846803371e-05, + "loss": 0.3778, + "num_input_tokens_seen": 27512048, + "step": 4150 + }, + { + "epoch": 0.5084070931977581, + "eval_loss": 0.1999550759792328, + "eval_runtime": 19.4161, + "eval_samples_per_second": 3.09, + "eval_steps_per_second": 0.773, + "num_input_tokens_seen": 27512048, + "step": 4150 + }, + { + "epoch": 0.5090196318642615, + "grad_norm": 9.827750929250229, + "learning_rate": 8.89962054363066e-05, + "loss": 0.3974, + "num_input_tokens_seen": 27545032, + "step": 4155 + }, + { + "epoch": 0.5096321705307647, + "grad_norm": 0.804208451188561, + "learning_rate": 8.896448617878053e-05, + "loss": 0.3533, + "num_input_tokens_seen": 27578320, + "step": 4160 + }, + { + "epoch": 0.510244709197268, + "grad_norm": 1.2739862179401913, + "learning_rate": 8.893272694030562e-05, + "loss": 0.2897, + "num_input_tokens_seen": 27612376, + "step": 4165 + }, + { + "epoch": 0.5108572478637714, + "grad_norm": 2.0092531409099994, + "learning_rate": 8.890092775346961e-05, + "loss": 0.3093, + "num_input_tokens_seen": 27646232, + "step": 4170 + }, + { + "epoch": 0.5114697865302748, + "grad_norm": 1.0996520560204182, + "learning_rate": 8.886908865090121e-05, + "loss": 0.3161, + "num_input_tokens_seen": 27679704, + "step": 4175 + }, + { + "epoch": 0.5120823251967781, + "grad_norm": 2.1810054856309833, + "learning_rate": 8.883720966527012e-05, + "loss": 0.3491, + "num_input_tokens_seen": 27713616, + "step": 4180 + }, + { + "epoch": 0.5126948638632813, + "grad_norm": 6.9343000104252415, + "learning_rate": 8.880529082928696e-05, + "loss": 0.348, + "num_input_tokens_seen": 27747272, + "step": 4185 + }, + { + "epoch": 0.5133074025297847, + "grad_norm": 13.039674707378921, + "learning_rate": 8.877333217570323e-05, + "loss": 0.417, + "num_input_tokens_seen": 27780888, + "step": 4190 + }, + { + "epoch": 0.513919941196288, + "grad_norm": 10.44505647471351, + "learning_rate": 8.874133373731123e-05, + "loss": 0.3867, + "num_input_tokens_seen": 27814576, + "step": 4195 + }, + { + "epoch": 0.5145324798627914, + "grad_norm": 1.5944120715896433, + "learning_rate": 8.870929554694423e-05, + "loss": 0.3802, + "num_input_tokens_seen": 27847792, + "step": 4200 + }, + { + "epoch": 0.5145324798627914, + "eval_loss": 0.354165256023407, + "eval_runtime": 20.0222, + "eval_samples_per_second": 2.997, + "eval_steps_per_second": 0.749, + "num_input_tokens_seen": 27847792, + "step": 4200 + }, + { + "epoch": 0.5151450185292946, + "grad_norm": 5.8145756417708565, + "learning_rate": 8.867721763747613e-05, + "loss": 0.428, + "num_input_tokens_seen": 27880936, + "step": 4205 + }, + { + "epoch": 0.515757557195798, + "grad_norm": 0.9932058377351455, + "learning_rate": 8.864510004182167e-05, + "loss": 0.3322, + "num_input_tokens_seen": 27914960, + "step": 4210 + }, + { + "epoch": 0.5163700958623013, + "grad_norm": 0.9569748299366264, + "learning_rate": 8.86129427929363e-05, + "loss": 0.3358, + "num_input_tokens_seen": 27948640, + "step": 4215 + }, + { + "epoch": 0.5169826345288047, + "grad_norm": 21.515285924955847, + "learning_rate": 8.858074592381616e-05, + "loss": 0.3476, + "num_input_tokens_seen": 27982736, + "step": 4220 + }, + { + "epoch": 0.5175951731953079, + "grad_norm": 1.4236366756496261, + "learning_rate": 8.854850946749801e-05, + "loss": 0.3419, + "num_input_tokens_seen": 28016472, + "step": 4225 + }, + { + "epoch": 0.5182077118618112, + "grad_norm": 1.8163427947390796, + "learning_rate": 8.851623345705927e-05, + "loss": 0.2932, + "num_input_tokens_seen": 28050552, + "step": 4230 + }, + { + "epoch": 0.5188202505283146, + "grad_norm": 0.9780907349165444, + "learning_rate": 8.848391792561793e-05, + "loss": 0.3589, + "num_input_tokens_seen": 28084200, + "step": 4235 + }, + { + "epoch": 0.519432789194818, + "grad_norm": 5.162340933446907, + "learning_rate": 8.845156290633255e-05, + "loss": 0.3527, + "num_input_tokens_seen": 28117840, + "step": 4240 + }, + { + "epoch": 0.5200453278613213, + "grad_norm": 11.180513025169638, + "learning_rate": 8.841916843240216e-05, + "loss": 0.4401, + "num_input_tokens_seen": 28151088, + "step": 4245 + }, + { + "epoch": 0.5206578665278245, + "grad_norm": 1.7731704572854683, + "learning_rate": 8.838673453706632e-05, + "loss": 0.3934, + "num_input_tokens_seen": 28184288, + "step": 4250 + }, + { + "epoch": 0.5206578665278245, + "eval_loss": 0.27066436409950256, + "eval_runtime": 19.1773, + "eval_samples_per_second": 3.129, + "eval_steps_per_second": 0.782, + "num_input_tokens_seen": 28184288, + "step": 4250 + }, + { + "epoch": 0.5212704051943279, + "grad_norm": 1.0287979038795037, + "learning_rate": 8.835426125360504e-05, + "loss": 0.3608, + "num_input_tokens_seen": 28218704, + "step": 4255 + }, + { + "epoch": 0.5218829438608312, + "grad_norm": 1.0098104404242263, + "learning_rate": 8.83217486153387e-05, + "loss": 0.3643, + "num_input_tokens_seen": 28251552, + "step": 4260 + }, + { + "epoch": 0.5224954825273346, + "grad_norm": 1.236212775970636, + "learning_rate": 8.828919665562814e-05, + "loss": 0.4165, + "num_input_tokens_seen": 28284760, + "step": 4265 + }, + { + "epoch": 0.5231080211938378, + "grad_norm": 0.8801115368190273, + "learning_rate": 8.825660540787444e-05, + "loss": 0.3577, + "num_input_tokens_seen": 28318008, + "step": 4270 + }, + { + "epoch": 0.5237205598603412, + "grad_norm": 1.0889711673397269, + "learning_rate": 8.82239749055191e-05, + "loss": 0.3533, + "num_input_tokens_seen": 28351584, + "step": 4275 + }, + { + "epoch": 0.5243330985268445, + "grad_norm": 1.1666793566977307, + "learning_rate": 8.819130518204383e-05, + "loss": 0.3971, + "num_input_tokens_seen": 28385808, + "step": 4280 + }, + { + "epoch": 0.5249456371933479, + "grad_norm": 12.125503428663208, + "learning_rate": 8.815859627097059e-05, + "loss": 0.3658, + "num_input_tokens_seen": 28419816, + "step": 4285 + }, + { + "epoch": 0.5255581758598512, + "grad_norm": 2.472831920913406, + "learning_rate": 8.81258482058616e-05, + "loss": 0.3209, + "num_input_tokens_seen": 28453888, + "step": 4290 + }, + { + "epoch": 0.5261707145263544, + "grad_norm": 1.018240149582722, + "learning_rate": 8.80930610203192e-05, + "loss": 0.3307, + "num_input_tokens_seen": 28487400, + "step": 4295 + }, + { + "epoch": 0.5267832531928578, + "grad_norm": 0.7745456763917933, + "learning_rate": 8.806023474798591e-05, + "loss": 0.3213, + "num_input_tokens_seen": 28520768, + "step": 4300 + }, + { + "epoch": 0.5267832531928578, + "eval_loss": 0.1924230009317398, + "eval_runtime": 19.3981, + "eval_samples_per_second": 3.093, + "eval_steps_per_second": 0.773, + "num_input_tokens_seen": 28520768, + "step": 4300 + }, + { + "epoch": 0.5273957918593611, + "grad_norm": 8.76896189529179, + "learning_rate": 8.80273694225443e-05, + "loss": 0.3658, + "num_input_tokens_seen": 28554056, + "step": 4305 + }, + { + "epoch": 0.5280083305258645, + "grad_norm": 1.1013643037942171, + "learning_rate": 8.799446507771709e-05, + "loss": 0.3763, + "num_input_tokens_seen": 28587184, + "step": 4310 + }, + { + "epoch": 0.5286208691923677, + "grad_norm": 1.2466372730640443, + "learning_rate": 8.796152174726697e-05, + "loss": 0.3491, + "num_input_tokens_seen": 28621016, + "step": 4315 + }, + { + "epoch": 0.5292334078588711, + "grad_norm": 5.930690770045515, + "learning_rate": 8.792853946499666e-05, + "loss": 0.2954, + "num_input_tokens_seen": 28655368, + "step": 4320 + }, + { + "epoch": 0.5298459465253744, + "grad_norm": 2.2115034879808815, + "learning_rate": 8.789551826474886e-05, + "loss": 0.3704, + "num_input_tokens_seen": 28688848, + "step": 4325 + }, + { + "epoch": 0.5304584851918778, + "grad_norm": 8.550679892046267, + "learning_rate": 8.786245818040622e-05, + "loss": 0.2967, + "num_input_tokens_seen": 28723488, + "step": 4330 + }, + { + "epoch": 0.531071023858381, + "grad_norm": 0.824220718044988, + "learning_rate": 8.78293592458912e-05, + "loss": 0.3078, + "num_input_tokens_seen": 28756968, + "step": 4335 + }, + { + "epoch": 0.5316835625248844, + "grad_norm": 13.051460326257242, + "learning_rate": 8.779622149516621e-05, + "loss": 0.3285, + "num_input_tokens_seen": 28790376, + "step": 4340 + }, + { + "epoch": 0.5322961011913877, + "grad_norm": 1.2287332901223997, + "learning_rate": 8.776304496223349e-05, + "loss": 0.4012, + "num_input_tokens_seen": 28823400, + "step": 4345 + }, + { + "epoch": 0.532908639857891, + "grad_norm": 1.7231749947486708, + "learning_rate": 8.772982968113499e-05, + "loss": 0.33, + "num_input_tokens_seen": 28857320, + "step": 4350 + }, + { + "epoch": 0.532908639857891, + "eval_loss": 0.34251755475997925, + "eval_runtime": 19.2805, + "eval_samples_per_second": 3.112, + "eval_steps_per_second": 0.778, + "num_input_tokens_seen": 28857320, + "step": 4350 + }, + { + "epoch": 0.5335211785243944, + "grad_norm": 10.990860133262277, + "learning_rate": 8.769657568595252e-05, + "loss": 0.3825, + "num_input_tokens_seen": 28891544, + "step": 4355 + }, + { + "epoch": 0.5341337171908976, + "grad_norm": 5.8977601464289835, + "learning_rate": 8.766328301080756e-05, + "loss": 0.3988, + "num_input_tokens_seen": 28925232, + "step": 4360 + }, + { + "epoch": 0.534746255857401, + "grad_norm": 12.269774799599503, + "learning_rate": 8.762995168986126e-05, + "loss": 0.328, + "num_input_tokens_seen": 28958664, + "step": 4365 + }, + { + "epoch": 0.5353587945239043, + "grad_norm": 1.5662776237180442, + "learning_rate": 8.759658175731449e-05, + "loss": 0.3096, + "num_input_tokens_seen": 28992488, + "step": 4370 + }, + { + "epoch": 0.5359713331904077, + "grad_norm": 11.52416825675092, + "learning_rate": 8.756317324740766e-05, + "loss": 0.2844, + "num_input_tokens_seen": 29026336, + "step": 4375 + }, + { + "epoch": 0.5365838718569109, + "grad_norm": 1.3708133212985363, + "learning_rate": 8.752972619442086e-05, + "loss": 0.3573, + "num_input_tokens_seen": 29059736, + "step": 4380 + }, + { + "epoch": 0.5371964105234143, + "grad_norm": 23.804423007657647, + "learning_rate": 8.749624063267362e-05, + "loss": 0.3899, + "num_input_tokens_seen": 29092848, + "step": 4385 + }, + { + "epoch": 0.5378089491899176, + "grad_norm": 0.9335913480297487, + "learning_rate": 8.746271659652506e-05, + "loss": 0.3047, + "num_input_tokens_seen": 29126336, + "step": 4390 + }, + { + "epoch": 0.538421487856421, + "grad_norm": 5.273515405061106, + "learning_rate": 8.742915412037376e-05, + "loss": 0.3134, + "num_input_tokens_seen": 29160048, + "step": 4395 + }, + { + "epoch": 0.5390340265229243, + "grad_norm": 0.8787024338942186, + "learning_rate": 8.739555323865771e-05, + "loss": 0.4625, + "num_input_tokens_seen": 29193456, + "step": 4400 + }, + { + "epoch": 0.5390340265229243, + "eval_loss": 0.2848778963088989, + "eval_runtime": 19.5656, + "eval_samples_per_second": 3.067, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 29193456, + "step": 4400 + }, + { + "epoch": 0.5396465651894276, + "grad_norm": 3.1074893244379784, + "learning_rate": 8.736191398585436e-05, + "loss": 0.3406, + "num_input_tokens_seen": 29226768, + "step": 4405 + }, + { + "epoch": 0.5402591038559309, + "grad_norm": 17.52662716396404, + "learning_rate": 8.732823639648052e-05, + "loss": 0.4326, + "num_input_tokens_seen": 29260664, + "step": 4410 + }, + { + "epoch": 0.5408716425224342, + "grad_norm": 1.3269173279721735, + "learning_rate": 8.729452050509228e-05, + "loss": 0.3559, + "num_input_tokens_seen": 29293776, + "step": 4415 + }, + { + "epoch": 0.5414841811889376, + "grad_norm": 10.88675892643795, + "learning_rate": 8.726076634628511e-05, + "loss": 0.3238, + "num_input_tokens_seen": 29327128, + "step": 4420 + }, + { + "epoch": 0.5420967198554408, + "grad_norm": 3.8861553390053722, + "learning_rate": 8.722697395469368e-05, + "loss": 0.4011, + "num_input_tokens_seen": 29360832, + "step": 4425 + }, + { + "epoch": 0.5427092585219442, + "grad_norm": 21.940278252753345, + "learning_rate": 8.719314336499196e-05, + "loss": 0.3679, + "num_input_tokens_seen": 29394440, + "step": 4430 + }, + { + "epoch": 0.5433217971884475, + "grad_norm": 11.43144015641003, + "learning_rate": 8.715927461189304e-05, + "loss": 0.3443, + "num_input_tokens_seen": 29428240, + "step": 4435 + }, + { + "epoch": 0.5439343358549509, + "grad_norm": 1.2961863306180368, + "learning_rate": 8.71253677301492e-05, + "loss": 0.3434, + "num_input_tokens_seen": 29461536, + "step": 4440 + }, + { + "epoch": 0.5445468745214541, + "grad_norm": 45.4954800892913, + "learning_rate": 8.709142275455189e-05, + "loss": 0.4073, + "num_input_tokens_seen": 29495024, + "step": 4445 + }, + { + "epoch": 0.5451594131879575, + "grad_norm": 1.187518904269452, + "learning_rate": 8.705743971993157e-05, + "loss": 0.2876, + "num_input_tokens_seen": 29528840, + "step": 4450 + }, + { + "epoch": 0.5451594131879575, + "eval_loss": 0.2150648534297943, + "eval_runtime": 19.1162, + "eval_samples_per_second": 3.139, + "eval_steps_per_second": 0.785, + "num_input_tokens_seen": 29528840, + "step": 4450 + }, + { + "epoch": 0.5457719518544608, + "grad_norm": 1.2931549289460524, + "learning_rate": 8.70234186611578e-05, + "loss": 0.3817, + "num_input_tokens_seen": 29561992, + "step": 4455 + }, + { + "epoch": 0.5463844905209642, + "grad_norm": 7.155944344695173, + "learning_rate": 8.698935961313913e-05, + "loss": 0.3339, + "num_input_tokens_seen": 29595288, + "step": 4460 + }, + { + "epoch": 0.5469970291874675, + "grad_norm": 0.8824877621327393, + "learning_rate": 8.695526261082311e-05, + "loss": 0.3188, + "num_input_tokens_seen": 29629048, + "step": 4465 + }, + { + "epoch": 0.5476095678539707, + "grad_norm": 0.927496315915325, + "learning_rate": 8.692112768919624e-05, + "loss": 0.3273, + "num_input_tokens_seen": 29662200, + "step": 4470 + }, + { + "epoch": 0.5482221065204741, + "grad_norm": 18.07631965758471, + "learning_rate": 8.68869548832839e-05, + "loss": 0.3223, + "num_input_tokens_seen": 29696128, + "step": 4475 + }, + { + "epoch": 0.5488346451869774, + "grad_norm": 16.481412276061462, + "learning_rate": 8.685274422815035e-05, + "loss": 0.4173, + "num_input_tokens_seen": 29729768, + "step": 4480 + }, + { + "epoch": 0.5494471838534808, + "grad_norm": 1.17206695686124, + "learning_rate": 8.681849575889873e-05, + "loss": 0.3312, + "num_input_tokens_seen": 29763736, + "step": 4485 + }, + { + "epoch": 0.550059722519984, + "grad_norm": 9.99660372612491, + "learning_rate": 8.678420951067091e-05, + "loss": 0.3364, + "num_input_tokens_seen": 29797848, + "step": 4490 + }, + { + "epoch": 0.5506722611864874, + "grad_norm": 21.284953048987383, + "learning_rate": 8.674988551864758e-05, + "loss": 0.4096, + "num_input_tokens_seen": 29832160, + "step": 4495 + }, + { + "epoch": 0.5512847998529907, + "grad_norm": 8.254091983198762, + "learning_rate": 8.671552381804813e-05, + "loss": 0.4329, + "num_input_tokens_seen": 29865992, + "step": 4500 + }, + { + "epoch": 0.5512847998529907, + "eval_loss": 0.19009599089622498, + "eval_runtime": 19.2109, + "eval_samples_per_second": 3.123, + "eval_steps_per_second": 0.781, + "num_input_tokens_seen": 29865992, + "step": 4500 + }, + { + "epoch": 0.5518973385194941, + "grad_norm": 2.5207694837256884, + "learning_rate": 8.668112444413065e-05, + "loss": 0.2654, + "num_input_tokens_seen": 29900248, + "step": 4505 + }, + { + "epoch": 0.5525098771859973, + "grad_norm": 1.3056516563891742, + "learning_rate": 8.664668743219188e-05, + "loss": 0.2877, + "num_input_tokens_seen": 29934856, + "step": 4510 + }, + { + "epoch": 0.5531224158525007, + "grad_norm": 1.4049206381873205, + "learning_rate": 8.661221281756722e-05, + "loss": 0.3991, + "num_input_tokens_seen": 29967480, + "step": 4515 + }, + { + "epoch": 0.553734954519004, + "grad_norm": 1.2421348350705734, + "learning_rate": 8.657770063563058e-05, + "loss": 0.3031, + "num_input_tokens_seen": 30001000, + "step": 4520 + }, + { + "epoch": 0.5543474931855074, + "grad_norm": 1.243137042901242, + "learning_rate": 8.654315092179449e-05, + "loss": 0.348, + "num_input_tokens_seen": 30035464, + "step": 4525 + }, + { + "epoch": 0.5549600318520107, + "grad_norm": 1.3570345542156022, + "learning_rate": 8.650856371150993e-05, + "loss": 0.3051, + "num_input_tokens_seen": 30069096, + "step": 4530 + }, + { + "epoch": 0.5555725705185139, + "grad_norm": 7.054682889633165, + "learning_rate": 8.64739390402664e-05, + "loss": 0.4479, + "num_input_tokens_seen": 30102376, + "step": 4535 + }, + { + "epoch": 0.5561851091850173, + "grad_norm": 1.6916914357286421, + "learning_rate": 8.643927694359182e-05, + "loss": 0.3024, + "num_input_tokens_seen": 30135120, + "step": 4540 + }, + { + "epoch": 0.5567976478515206, + "grad_norm": 1.5467686717403928, + "learning_rate": 8.640457745705252e-05, + "loss": 0.3314, + "num_input_tokens_seen": 30169416, + "step": 4545 + }, + { + "epoch": 0.557410186518024, + "grad_norm": 0.8287330828843039, + "learning_rate": 8.636984061625317e-05, + "loss": 0.2804, + "num_input_tokens_seen": 30203424, + "step": 4550 + }, + { + "epoch": 0.557410186518024, + "eval_loss": 0.2320593297481537, + "eval_runtime": 19.3981, + "eval_samples_per_second": 3.093, + "eval_steps_per_second": 0.773, + "num_input_tokens_seen": 30203424, + "step": 4550 + }, + { + "epoch": 0.5580227251845272, + "grad_norm": 2.434912692153064, + "learning_rate": 8.63350664568368e-05, + "loss": 0.2574, + "num_input_tokens_seen": 30238032, + "step": 4555 + }, + { + "epoch": 0.5586352638510306, + "grad_norm": 1.8611033279067186, + "learning_rate": 8.630025501448473e-05, + "loss": 0.4057, + "num_input_tokens_seen": 30272112, + "step": 4560 + }, + { + "epoch": 0.5592478025175339, + "grad_norm": 9.603086797094486, + "learning_rate": 8.62654063249165e-05, + "loss": 0.3657, + "num_input_tokens_seen": 30305896, + "step": 4565 + }, + { + "epoch": 0.5598603411840373, + "grad_norm": 1.3290952088146073, + "learning_rate": 8.62305204238899e-05, + "loss": 0.3327, + "num_input_tokens_seen": 30339184, + "step": 4570 + }, + { + "epoch": 0.5604728798505406, + "grad_norm": 25.038809043335647, + "learning_rate": 8.619559734720092e-05, + "loss": 0.3472, + "num_input_tokens_seen": 30372520, + "step": 4575 + }, + { + "epoch": 0.5610854185170439, + "grad_norm": 1.3541787366037061, + "learning_rate": 8.616063713068365e-05, + "loss": 0.3242, + "num_input_tokens_seen": 30406224, + "step": 4580 + }, + { + "epoch": 0.5616979571835472, + "grad_norm": 5.04588140922768, + "learning_rate": 8.612563981021031e-05, + "loss": 0.3828, + "num_input_tokens_seen": 30439864, + "step": 4585 + }, + { + "epoch": 0.5623104958500506, + "grad_norm": 19.082891793288294, + "learning_rate": 8.60906054216912e-05, + "loss": 0.3547, + "num_input_tokens_seen": 30473688, + "step": 4590 + }, + { + "epoch": 0.5629230345165539, + "grad_norm": 2.6532070738337987, + "learning_rate": 8.605553400107463e-05, + "loss": 0.3963, + "num_input_tokens_seen": 30506632, + "step": 4595 + }, + { + "epoch": 0.5635355731830571, + "grad_norm": 1.1133116385802913, + "learning_rate": 8.602042558434696e-05, + "loss": 0.3363, + "num_input_tokens_seen": 30539672, + "step": 4600 + }, + { + "epoch": 0.5635355731830571, + "eval_loss": 0.3674183487892151, + "eval_runtime": 19.0936, + "eval_samples_per_second": 3.142, + "eval_steps_per_second": 0.786, + "num_input_tokens_seen": 30539672, + "step": 4600 + }, + { + "epoch": 0.5641481118495605, + "grad_norm": 1.6465801221898555, + "learning_rate": 8.598528020753244e-05, + "loss": 0.3515, + "num_input_tokens_seen": 30573928, + "step": 4605 + }, + { + "epoch": 0.5647606505160638, + "grad_norm": 1.251219294511213, + "learning_rate": 8.595009790669331e-05, + "loss": 0.4061, + "num_input_tokens_seen": 30607080, + "step": 4610 + }, + { + "epoch": 0.5653731891825672, + "grad_norm": 13.925843649119223, + "learning_rate": 8.591487871792963e-05, + "loss": 0.267, + "num_input_tokens_seen": 30640832, + "step": 4615 + }, + { + "epoch": 0.5659857278490704, + "grad_norm": 6.97082107304633, + "learning_rate": 8.58796226773794e-05, + "loss": 0.3081, + "num_input_tokens_seen": 30675144, + "step": 4620 + }, + { + "epoch": 0.5665982665155738, + "grad_norm": 9.076440760840617, + "learning_rate": 8.584432982121832e-05, + "loss": 0.3699, + "num_input_tokens_seen": 30708688, + "step": 4625 + }, + { + "epoch": 0.5672108051820771, + "grad_norm": 11.99706025929299, + "learning_rate": 8.580900018565998e-05, + "loss": 0.3889, + "num_input_tokens_seen": 30741960, + "step": 4630 + }, + { + "epoch": 0.5678233438485805, + "grad_norm": 1.1134038323251132, + "learning_rate": 8.577363380695564e-05, + "loss": 0.3255, + "num_input_tokens_seen": 30775624, + "step": 4635 + }, + { + "epoch": 0.5684358825150838, + "grad_norm": 1.1035947211149206, + "learning_rate": 8.573823072139429e-05, + "loss": 0.3598, + "num_input_tokens_seen": 30808448, + "step": 4640 + }, + { + "epoch": 0.569048421181587, + "grad_norm": 3.973160084960025, + "learning_rate": 8.570279096530253e-05, + "loss": 0.3538, + "num_input_tokens_seen": 30842072, + "step": 4645 + }, + { + "epoch": 0.5696609598480904, + "grad_norm": 1.34947548321894, + "learning_rate": 8.566731457504466e-05, + "loss": 0.2994, + "num_input_tokens_seen": 30875840, + "step": 4650 + }, + { + "epoch": 0.5696609598480904, + "eval_loss": 0.27580884099006653, + "eval_runtime": 19.5117, + "eval_samples_per_second": 3.075, + "eval_steps_per_second": 0.769, + "num_input_tokens_seen": 30875840, + "step": 4650 + }, + { + "epoch": 0.5702734985145937, + "grad_norm": 16.584750537247377, + "learning_rate": 8.563180158702256e-05, + "loss": 0.3418, + "num_input_tokens_seen": 30908664, + "step": 4655 + }, + { + "epoch": 0.5708860371810971, + "grad_norm": 8.034429369034413, + "learning_rate": 8.559625203767559e-05, + "loss": 0.3987, + "num_input_tokens_seen": 30942328, + "step": 4660 + }, + { + "epoch": 0.5714985758476003, + "grad_norm": 18.161915122117193, + "learning_rate": 8.556066596348072e-05, + "loss": 0.3358, + "num_input_tokens_seen": 30976456, + "step": 4665 + }, + { + "epoch": 0.5721111145141037, + "grad_norm": 1.520985716520499, + "learning_rate": 8.552504340095233e-05, + "loss": 0.3643, + "num_input_tokens_seen": 31009328, + "step": 4670 + }, + { + "epoch": 0.572723653180607, + "grad_norm": 2.0017818487111394, + "learning_rate": 8.548938438664229e-05, + "loss": 0.3039, + "num_input_tokens_seen": 31043200, + "step": 4675 + }, + { + "epoch": 0.5733361918471104, + "grad_norm": 1.6028797690264296, + "learning_rate": 8.545368895713982e-05, + "loss": 0.3185, + "num_input_tokens_seen": 31077480, + "step": 4680 + }, + { + "epoch": 0.5739487305136137, + "grad_norm": 1.6648548553437348, + "learning_rate": 8.541795714907155e-05, + "loss": 0.4018, + "num_input_tokens_seen": 31110000, + "step": 4685 + }, + { + "epoch": 0.574561269180117, + "grad_norm": 8.783617089988839, + "learning_rate": 8.53821889991014e-05, + "loss": 0.3636, + "num_input_tokens_seen": 31143728, + "step": 4690 + }, + { + "epoch": 0.5751738078466203, + "grad_norm": 1.1095354158214776, + "learning_rate": 8.534638454393063e-05, + "loss": 0.351, + "num_input_tokens_seen": 31177720, + "step": 4695 + }, + { + "epoch": 0.5757863465131237, + "grad_norm": 1.2884524012986178, + "learning_rate": 8.53105438202977e-05, + "loss": 0.3163, + "num_input_tokens_seen": 31211360, + "step": 4700 + }, + { + "epoch": 0.5757863465131237, + "eval_loss": 0.18082624673843384, + "eval_runtime": 19.4059, + "eval_samples_per_second": 3.092, + "eval_steps_per_second": 0.773, + "num_input_tokens_seen": 31211360, + "step": 4700 + }, + { + "epoch": 0.576398885179627, + "grad_norm": 1.057717973964587, + "learning_rate": 8.527466686497831e-05, + "loss": 0.3304, + "num_input_tokens_seen": 31244648, + "step": 4705 + }, + { + "epoch": 0.5770114238461302, + "grad_norm": 6.277083294511332, + "learning_rate": 8.523875371478534e-05, + "loss": 0.3744, + "num_input_tokens_seen": 31278528, + "step": 4710 + }, + { + "epoch": 0.5776239625126336, + "grad_norm": 1.442050274302173, + "learning_rate": 8.520280440656882e-05, + "loss": 0.4074, + "num_input_tokens_seen": 31312144, + "step": 4715 + }, + { + "epoch": 0.5782365011791369, + "grad_norm": 13.07671682984774, + "learning_rate": 8.516681897721583e-05, + "loss": 0.4165, + "num_input_tokens_seen": 31346248, + "step": 4720 + }, + { + "epoch": 0.5788490398456403, + "grad_norm": 2.3401441709765347, + "learning_rate": 8.513079746365055e-05, + "loss": 0.3424, + "num_input_tokens_seen": 31379752, + "step": 4725 + }, + { + "epoch": 0.5794615785121435, + "grad_norm": 1.2186686327418907, + "learning_rate": 8.509473990283421e-05, + "loss": 0.4001, + "num_input_tokens_seen": 31412984, + "step": 4730 + }, + { + "epoch": 0.5800741171786469, + "grad_norm": 1.056303019169678, + "learning_rate": 8.505864633176499e-05, + "loss": 0.31, + "num_input_tokens_seen": 31446768, + "step": 4735 + }, + { + "epoch": 0.5806866558451502, + "grad_norm": 1.0880107568821609, + "learning_rate": 8.502251678747802e-05, + "loss": 0.3824, + "num_input_tokens_seen": 31480608, + "step": 4740 + }, + { + "epoch": 0.5812991945116536, + "grad_norm": 3.7734442760755567, + "learning_rate": 8.498635130704536e-05, + "loss": 0.3319, + "num_input_tokens_seen": 31513536, + "step": 4745 + }, + { + "epoch": 0.5819117331781569, + "grad_norm": 1.2968829151174839, + "learning_rate": 8.495014992757595e-05, + "loss": 0.307, + "num_input_tokens_seen": 31547504, + "step": 4750 + }, + { + "epoch": 0.5819117331781569, + "eval_loss": 0.22180898487567902, + "eval_runtime": 19.5552, + "eval_samples_per_second": 3.068, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 31547504, + "step": 4750 + }, + { + "epoch": 0.5825242718446602, + "grad_norm": 1.1122562834270953, + "learning_rate": 8.491391268621552e-05, + "loss": 0.3643, + "num_input_tokens_seen": 31580736, + "step": 4755 + }, + { + "epoch": 0.5831368105111635, + "grad_norm": 1.2103622472433337, + "learning_rate": 8.487763962014665e-05, + "loss": 0.4018, + "num_input_tokens_seen": 31613464, + "step": 4760 + }, + { + "epoch": 0.5837493491776669, + "grad_norm": 1.3712347445042234, + "learning_rate": 8.484133076658865e-05, + "loss": 0.3828, + "num_input_tokens_seen": 31646976, + "step": 4765 + }, + { + "epoch": 0.5843618878441702, + "grad_norm": 8.596287880483802, + "learning_rate": 8.480498616279756e-05, + "loss": 0.3694, + "num_input_tokens_seen": 31679736, + "step": 4770 + }, + { + "epoch": 0.5849744265106734, + "grad_norm": 3.3147588251641005, + "learning_rate": 8.47686058460661e-05, + "loss": 0.3272, + "num_input_tokens_seen": 31713832, + "step": 4775 + }, + { + "epoch": 0.5855869651771768, + "grad_norm": 17.81670222702863, + "learning_rate": 8.473218985372366e-05, + "loss": 0.4287, + "num_input_tokens_seen": 31748352, + "step": 4780 + }, + { + "epoch": 0.5861995038436801, + "grad_norm": 3.742982690183957, + "learning_rate": 8.469573822313617e-05, + "loss": 0.3326, + "num_input_tokens_seen": 31782192, + "step": 4785 + }, + { + "epoch": 0.5868120425101835, + "grad_norm": 1.0547598817951536, + "learning_rate": 8.465925099170621e-05, + "loss": 0.3272, + "num_input_tokens_seen": 31815816, + "step": 4790 + }, + { + "epoch": 0.5874245811766868, + "grad_norm": 2.552675849170281, + "learning_rate": 8.462272819687283e-05, + "loss": 0.3232, + "num_input_tokens_seen": 31849808, + "step": 4795 + }, + { + "epoch": 0.5880371198431901, + "grad_norm": 1.4289935628990047, + "learning_rate": 8.458616987611158e-05, + "loss": 0.2896, + "num_input_tokens_seen": 31883816, + "step": 4800 + }, + { + "epoch": 0.5880371198431901, + "eval_loss": 0.19881321489810944, + "eval_runtime": 19.5454, + "eval_samples_per_second": 3.07, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 31883816, + "step": 4800 + }, + { + "epoch": 0.5886496585096934, + "grad_norm": 2.788839553914431, + "learning_rate": 8.45495760669345e-05, + "loss": 0.254, + "num_input_tokens_seen": 31917488, + "step": 4805 + }, + { + "epoch": 0.5892621971761968, + "grad_norm": 2.243951321644685, + "learning_rate": 8.451294680689002e-05, + "loss": 0.3292, + "num_input_tokens_seen": 31950880, + "step": 4810 + }, + { + "epoch": 0.5898747358427001, + "grad_norm": 9.924933124581358, + "learning_rate": 8.447628213356291e-05, + "loss": 0.3146, + "num_input_tokens_seen": 31984952, + "step": 4815 + }, + { + "epoch": 0.5904872745092034, + "grad_norm": 1.3491029750163082, + "learning_rate": 8.443958208457434e-05, + "loss": 0.2873, + "num_input_tokens_seen": 32018408, + "step": 4820 + }, + { + "epoch": 0.5910998131757067, + "grad_norm": 1.1622888761241994, + "learning_rate": 8.440284669758177e-05, + "loss": 0.3149, + "num_input_tokens_seen": 32052368, + "step": 4825 + }, + { + "epoch": 0.59171235184221, + "grad_norm": 2.8620080397386203, + "learning_rate": 8.436607601027883e-05, + "loss": 0.3625, + "num_input_tokens_seen": 32085792, + "step": 4830 + }, + { + "epoch": 0.5923248905087134, + "grad_norm": 5.079390158129337, + "learning_rate": 8.43292700603955e-05, + "loss": 0.2981, + "num_input_tokens_seen": 32119160, + "step": 4835 + }, + { + "epoch": 0.5929374291752166, + "grad_norm": 1.3687585076703497, + "learning_rate": 8.429242888569786e-05, + "loss": 0.358, + "num_input_tokens_seen": 32152400, + "step": 4840 + }, + { + "epoch": 0.59354996784172, + "grad_norm": 15.522348093042638, + "learning_rate": 8.425555252398816e-05, + "loss": 0.3486, + "num_input_tokens_seen": 32185912, + "step": 4845 + }, + { + "epoch": 0.5941625065082233, + "grad_norm": 2.514070148910201, + "learning_rate": 8.421864101310475e-05, + "loss": 0.3107, + "num_input_tokens_seen": 32220000, + "step": 4850 + }, + { + "epoch": 0.5941625065082233, + "eval_loss": 0.3321171998977661, + "eval_runtime": 19.4442, + "eval_samples_per_second": 3.086, + "eval_steps_per_second": 0.771, + "num_input_tokens_seen": 32220000, + "step": 4850 + }, + { + "epoch": 0.5947750451747267, + "grad_norm": 1.1596539437605815, + "learning_rate": 8.418169439092207e-05, + "loss": 0.3067, + "num_input_tokens_seen": 32254304, + "step": 4855 + }, + { + "epoch": 0.59538758384123, + "grad_norm": 1.12500333981509, + "learning_rate": 8.414471269535053e-05, + "loss": 0.3232, + "num_input_tokens_seen": 32288320, + "step": 4860 + }, + { + "epoch": 0.5960001225077333, + "grad_norm": 7.005525396106571, + "learning_rate": 8.410769596433659e-05, + "loss": 0.335, + "num_input_tokens_seen": 32321632, + "step": 4865 + }, + { + "epoch": 0.5966126611742366, + "grad_norm": 16.4855948660077, + "learning_rate": 8.40706442358626e-05, + "loss": 0.3497, + "num_input_tokens_seen": 32355120, + "step": 4870 + }, + { + "epoch": 0.59722519984074, + "grad_norm": 1.2973393747327064, + "learning_rate": 8.403355754794688e-05, + "loss": 0.3302, + "num_input_tokens_seen": 32388696, + "step": 4875 + }, + { + "epoch": 0.5978377385072433, + "grad_norm": 1.0117936082198296, + "learning_rate": 8.399643593864361e-05, + "loss": 0.3205, + "num_input_tokens_seen": 32422728, + "step": 4880 + }, + { + "epoch": 0.5984502771737465, + "grad_norm": 1.5105289618812754, + "learning_rate": 8.395927944604276e-05, + "loss": 0.3074, + "num_input_tokens_seen": 32456296, + "step": 4885 + }, + { + "epoch": 0.5990628158402499, + "grad_norm": 0.8004742615871359, + "learning_rate": 8.392208810827011e-05, + "loss": 0.3231, + "num_input_tokens_seen": 32489280, + "step": 4890 + }, + { + "epoch": 0.5996753545067532, + "grad_norm": 0.9237390061199923, + "learning_rate": 8.388486196348724e-05, + "loss": 0.2702, + "num_input_tokens_seen": 32523128, + "step": 4895 + }, + { + "epoch": 0.6002878931732566, + "grad_norm": 0.9453431951503825, + "learning_rate": 8.384760104989139e-05, + "loss": 0.3079, + "num_input_tokens_seen": 32556072, + "step": 4900 + }, + { + "epoch": 0.6002878931732566, + "eval_loss": 0.2282978892326355, + "eval_runtime": 19.7104, + "eval_samples_per_second": 3.044, + "eval_steps_per_second": 0.761, + "num_input_tokens_seen": 32556072, + "step": 4900 + }, + { + "epoch": 0.6009004318397598, + "grad_norm": 0.795317714103761, + "learning_rate": 8.38103054057155e-05, + "loss": 0.3373, + "num_input_tokens_seen": 32590584, + "step": 4905 + }, + { + "epoch": 0.6015129705062632, + "grad_norm": 6.892545585130292, + "learning_rate": 8.377297506922812e-05, + "loss": 0.3009, + "num_input_tokens_seen": 32624472, + "step": 4910 + }, + { + "epoch": 0.6021255091727665, + "grad_norm": 1.260940184407605, + "learning_rate": 8.373561007873345e-05, + "loss": 0.3028, + "num_input_tokens_seen": 32658640, + "step": 4915 + }, + { + "epoch": 0.6027380478392699, + "grad_norm": 1.128267246330843, + "learning_rate": 8.369821047257121e-05, + "loss": 0.3556, + "num_input_tokens_seen": 32692552, + "step": 4920 + }, + { + "epoch": 0.6033505865057732, + "grad_norm": 12.439081707285615, + "learning_rate": 8.366077628911664e-05, + "loss": 0.324, + "num_input_tokens_seen": 32725792, + "step": 4925 + }, + { + "epoch": 0.6039631251722765, + "grad_norm": 0.9109938284028427, + "learning_rate": 8.362330756678046e-05, + "loss": 0.336, + "num_input_tokens_seen": 32759344, + "step": 4930 + }, + { + "epoch": 0.6045756638387798, + "grad_norm": 1.073777936406254, + "learning_rate": 8.358580434400884e-05, + "loss": 0.2681, + "num_input_tokens_seen": 32792880, + "step": 4935 + }, + { + "epoch": 0.6051882025052832, + "grad_norm": 18.123513497906366, + "learning_rate": 8.354826665928335e-05, + "loss": 0.3509, + "num_input_tokens_seen": 32826952, + "step": 4940 + }, + { + "epoch": 0.6058007411717865, + "grad_norm": 1.1242646169499206, + "learning_rate": 8.351069455112091e-05, + "loss": 0.3395, + "num_input_tokens_seen": 32860128, + "step": 4945 + }, + { + "epoch": 0.6064132798382897, + "grad_norm": 1.0980915649808303, + "learning_rate": 8.347308805807378e-05, + "loss": 0.3462, + "num_input_tokens_seen": 32893512, + "step": 4950 + }, + { + "epoch": 0.6064132798382897, + "eval_loss": 0.1940447837114334, + "eval_runtime": 19.437, + "eval_samples_per_second": 3.087, + "eval_steps_per_second": 0.772, + "num_input_tokens_seen": 32893512, + "step": 4950 + }, + { + "epoch": 0.6070258185047931, + "grad_norm": 9.985943293472609, + "learning_rate": 8.343544721872947e-05, + "loss": 0.3274, + "num_input_tokens_seen": 32927352, + "step": 4955 + }, + { + "epoch": 0.6076383571712964, + "grad_norm": 9.370637484582613, + "learning_rate": 8.339777207171078e-05, + "loss": 0.2968, + "num_input_tokens_seen": 32961520, + "step": 4960 + }, + { + "epoch": 0.6082508958377998, + "grad_norm": 1.8288696730680583, + "learning_rate": 8.336006265567567e-05, + "loss": 0.3236, + "num_input_tokens_seen": 32994296, + "step": 4965 + }, + { + "epoch": 0.6088634345043031, + "grad_norm": 1.0546446267954288, + "learning_rate": 8.332231900931727e-05, + "loss": 0.3188, + "num_input_tokens_seen": 33028048, + "step": 4970 + }, + { + "epoch": 0.6094759731708064, + "grad_norm": 20.783522799765596, + "learning_rate": 8.328454117136382e-05, + "loss": 0.3119, + "num_input_tokens_seen": 33061592, + "step": 4975 + }, + { + "epoch": 0.6100885118373097, + "grad_norm": 1.0351627225816726, + "learning_rate": 8.324672918057872e-05, + "loss": 0.3797, + "num_input_tokens_seen": 33094608, + "step": 4980 + }, + { + "epoch": 0.6107010505038131, + "grad_norm": 8.019993279226364, + "learning_rate": 8.320888307576034e-05, + "loss": 0.3213, + "num_input_tokens_seen": 33127896, + "step": 4985 + }, + { + "epoch": 0.6113135891703164, + "grad_norm": 3.952559376298466, + "learning_rate": 8.317100289574204e-05, + "loss": 0.3225, + "num_input_tokens_seen": 33161720, + "step": 4990 + }, + { + "epoch": 0.6119261278368197, + "grad_norm": 1.7407253768835742, + "learning_rate": 8.31330886793922e-05, + "loss": 0.3007, + "num_input_tokens_seen": 33195408, + "step": 4995 + }, + { + "epoch": 0.612538666503323, + "grad_norm": 3.726318668103449, + "learning_rate": 8.309514046561412e-05, + "loss": 0.3169, + "num_input_tokens_seen": 33229224, + "step": 5000 + }, + { + "epoch": 0.612538666503323, + "eval_loss": 0.16666926443576813, + "eval_runtime": 20.1245, + "eval_samples_per_second": 2.981, + "eval_steps_per_second": 0.745, + "num_input_tokens_seen": 33229224, + "step": 5000 + }, + { + "epoch": 0.6131512051698264, + "grad_norm": 1.2742718694385125, + "learning_rate": 8.305715829334592e-05, + "loss": 0.2759, + "num_input_tokens_seen": 33262888, + "step": 5005 + }, + { + "epoch": 0.6137637438363297, + "grad_norm": 0.7978248327648949, + "learning_rate": 8.301914220156067e-05, + "loss": 0.2963, + "num_input_tokens_seen": 33296648, + "step": 5010 + }, + { + "epoch": 0.6143762825028329, + "grad_norm": 9.174631033172222, + "learning_rate": 8.29810922292661e-05, + "loss": 0.301, + "num_input_tokens_seen": 33330264, + "step": 5015 + }, + { + "epoch": 0.6149888211693363, + "grad_norm": 19.818673335325112, + "learning_rate": 8.294300841550486e-05, + "loss": 0.3329, + "num_input_tokens_seen": 33363344, + "step": 5020 + }, + { + "epoch": 0.6156013598358396, + "grad_norm": 0.8108093026159426, + "learning_rate": 8.290489079935423e-05, + "loss": 0.2672, + "num_input_tokens_seen": 33398208, + "step": 5025 + }, + { + "epoch": 0.616213898502343, + "grad_norm": 0.8261053891279281, + "learning_rate": 8.286673941992621e-05, + "loss": 0.3415, + "num_input_tokens_seen": 33432144, + "step": 5030 + }, + { + "epoch": 0.6168264371688463, + "grad_norm": 8.060948356342344, + "learning_rate": 8.28285543163674e-05, + "loss": 0.312, + "num_input_tokens_seen": 33465232, + "step": 5035 + }, + { + "epoch": 0.6174389758353496, + "grad_norm": 1.1179653704695816, + "learning_rate": 8.279033552785906e-05, + "loss": 0.2891, + "num_input_tokens_seen": 33499568, + "step": 5040 + }, + { + "epoch": 0.6180515145018529, + "grad_norm": 2.315737761050764, + "learning_rate": 8.275208309361697e-05, + "loss": 0.3449, + "num_input_tokens_seen": 33532984, + "step": 5045 + }, + { + "epoch": 0.6186640531683563, + "grad_norm": 1.6239785662629203, + "learning_rate": 8.271379705289147e-05, + "loss": 0.3488, + "num_input_tokens_seen": 33566056, + "step": 5050 + }, + { + "epoch": 0.6186640531683563, + "eval_loss": 0.29742637276649475, + "eval_runtime": 19.6902, + "eval_samples_per_second": 3.047, + "eval_steps_per_second": 0.762, + "num_input_tokens_seen": 33566056, + "step": 5050 + }, + { + "epoch": 0.6192765918348596, + "grad_norm": 0.8711792621781863, + "learning_rate": 8.267547744496734e-05, + "loss": 0.29, + "num_input_tokens_seen": 33599808, + "step": 5055 + }, + { + "epoch": 0.6198891305013629, + "grad_norm": 1.0472074909123574, + "learning_rate": 8.263712430916385e-05, + "loss": 0.3881, + "num_input_tokens_seen": 33633440, + "step": 5060 + }, + { + "epoch": 0.6205016691678662, + "grad_norm": 0.8962556749518172, + "learning_rate": 8.259873768483464e-05, + "loss": 0.3024, + "num_input_tokens_seen": 33667064, + "step": 5065 + }, + { + "epoch": 0.6211142078343695, + "grad_norm": 1.9013725333717562, + "learning_rate": 8.256031761136773e-05, + "loss": 0.3578, + "num_input_tokens_seen": 33700856, + "step": 5070 + }, + { + "epoch": 0.6217267465008729, + "grad_norm": 14.286976613605376, + "learning_rate": 8.252186412818546e-05, + "loss": 0.3412, + "num_input_tokens_seen": 33733776, + "step": 5075 + }, + { + "epoch": 0.6223392851673762, + "grad_norm": 1.0467305308601131, + "learning_rate": 8.248337727474442e-05, + "loss": 0.3171, + "num_input_tokens_seen": 33766872, + "step": 5080 + }, + { + "epoch": 0.6229518238338795, + "grad_norm": 1.0973995011060007, + "learning_rate": 8.244485709053549e-05, + "loss": 0.3452, + "num_input_tokens_seen": 33800032, + "step": 5085 + }, + { + "epoch": 0.6235643625003828, + "grad_norm": 1.3307955881611988, + "learning_rate": 8.240630361508373e-05, + "loss": 0.2639, + "num_input_tokens_seen": 33833960, + "step": 5090 + }, + { + "epoch": 0.6241769011668862, + "grad_norm": 35.6458391618994, + "learning_rate": 8.236771688794832e-05, + "loss": 0.3078, + "num_input_tokens_seen": 33867984, + "step": 5095 + }, + { + "epoch": 0.6247894398333895, + "grad_norm": 4.291436608772151, + "learning_rate": 8.232909694872261e-05, + "loss": 0.3996, + "num_input_tokens_seen": 33901184, + "step": 5100 + }, + { + "epoch": 0.6247894398333895, + "eval_loss": 0.31558582186698914, + "eval_runtime": 19.3735, + "eval_samples_per_second": 3.097, + "eval_steps_per_second": 0.774, + "num_input_tokens_seen": 33901184, + "step": 5100 + }, + { + "epoch": 0.6254019784998928, + "grad_norm": 19.348476844235755, + "learning_rate": 8.229044383703406e-05, + "loss": 0.3072, + "num_input_tokens_seen": 33934000, + "step": 5105 + }, + { + "epoch": 0.6260145171663961, + "grad_norm": 0.8181322235691803, + "learning_rate": 8.225175759254406e-05, + "loss": 0.3554, + "num_input_tokens_seen": 33967616, + "step": 5110 + }, + { + "epoch": 0.6266270558328995, + "grad_norm": 1.1961440987589618, + "learning_rate": 8.221303825494809e-05, + "loss": 0.3061, + "num_input_tokens_seen": 34001784, + "step": 5115 + }, + { + "epoch": 0.6272395944994028, + "grad_norm": 1.0253727560342802, + "learning_rate": 8.217428586397555e-05, + "loss": 0.3074, + "num_input_tokens_seen": 34035480, + "step": 5120 + }, + { + "epoch": 0.627852133165906, + "grad_norm": 0.9966962845756676, + "learning_rate": 8.213550045938976e-05, + "loss": 0.3414, + "num_input_tokens_seen": 34069440, + "step": 5125 + }, + { + "epoch": 0.6284646718324094, + "grad_norm": 16.44987185073983, + "learning_rate": 8.20966820809879e-05, + "loss": 0.2914, + "num_input_tokens_seen": 34103032, + "step": 5130 + }, + { + "epoch": 0.6290772104989127, + "grad_norm": 1.2286350818961804, + "learning_rate": 8.205783076860102e-05, + "loss": 0.2738, + "num_input_tokens_seen": 34137288, + "step": 5135 + }, + { + "epoch": 0.6296897491654161, + "grad_norm": 9.287360521530747, + "learning_rate": 8.201894656209395e-05, + "loss": 0.3311, + "num_input_tokens_seen": 34171000, + "step": 5140 + }, + { + "epoch": 0.6303022878319194, + "grad_norm": 1.1247521108667227, + "learning_rate": 8.198002950136524e-05, + "loss": 0.3003, + "num_input_tokens_seen": 34204896, + "step": 5145 + }, + { + "epoch": 0.6309148264984227, + "grad_norm": 0.9092969606603548, + "learning_rate": 8.194107962634719e-05, + "loss": 0.3088, + "num_input_tokens_seen": 34238496, + "step": 5150 + }, + { + "epoch": 0.6309148264984227, + "eval_loss": 0.2916165292263031, + "eval_runtime": 19.0109, + "eval_samples_per_second": 3.156, + "eval_steps_per_second": 0.789, + "num_input_tokens_seen": 34238496, + "step": 5150 + }, + { + "epoch": 0.631527365164926, + "grad_norm": 13.618293449891818, + "learning_rate": 8.190209697700575e-05, + "loss": 0.2772, + "num_input_tokens_seen": 34271792, + "step": 5155 + }, + { + "epoch": 0.6321399038314294, + "grad_norm": 1.0269642414355773, + "learning_rate": 8.186308159334051e-05, + "loss": 0.3122, + "num_input_tokens_seen": 34305288, + "step": 5160 + }, + { + "epoch": 0.6327524424979327, + "grad_norm": 1.2036635746182562, + "learning_rate": 8.182403351538466e-05, + "loss": 0.2928, + "num_input_tokens_seen": 34338808, + "step": 5165 + }, + { + "epoch": 0.633364981164436, + "grad_norm": 0.6412628512647625, + "learning_rate": 8.178495278320489e-05, + "loss": 0.3602, + "num_input_tokens_seen": 34372672, + "step": 5170 + }, + { + "epoch": 0.6339775198309393, + "grad_norm": 1.830369597560759, + "learning_rate": 8.174583943690146e-05, + "loss": 0.4253, + "num_input_tokens_seen": 34405760, + "step": 5175 + }, + { + "epoch": 0.6345900584974427, + "grad_norm": 0.9218995195522908, + "learning_rate": 8.170669351660803e-05, + "loss": 0.27, + "num_input_tokens_seen": 34439920, + "step": 5180 + }, + { + "epoch": 0.635202597163946, + "grad_norm": 1.3033015622933757, + "learning_rate": 8.166751506249172e-05, + "loss": 0.3228, + "num_input_tokens_seen": 34473336, + "step": 5185 + }, + { + "epoch": 0.6358151358304494, + "grad_norm": 16.65913709543503, + "learning_rate": 8.162830411475306e-05, + "loss": 0.3353, + "num_input_tokens_seen": 34507024, + "step": 5190 + }, + { + "epoch": 0.6364276744969526, + "grad_norm": 10.101584861674526, + "learning_rate": 8.158906071362587e-05, + "loss": 0.3101, + "num_input_tokens_seen": 34540456, + "step": 5195 + }, + { + "epoch": 0.6370402131634559, + "grad_norm": 1.1745806049333212, + "learning_rate": 8.154978489937728e-05, + "loss": 0.2997, + "num_input_tokens_seen": 34574624, + "step": 5200 + }, + { + "epoch": 0.6370402131634559, + "eval_loss": 0.18659941852092743, + "eval_runtime": 19.2342, + "eval_samples_per_second": 3.119, + "eval_steps_per_second": 0.78, + "num_input_tokens_seen": 34574624, + "step": 5200 + }, + { + "epoch": 0.6376527518299593, + "grad_norm": 1.2213447587288657, + "learning_rate": 8.15104767123077e-05, + "loss": 0.2585, + "num_input_tokens_seen": 34608464, + "step": 5205 + }, + { + "epoch": 0.6382652904964626, + "grad_norm": 1.199558071455866, + "learning_rate": 8.147113619275075e-05, + "loss": 0.3456, + "num_input_tokens_seen": 34641632, + "step": 5210 + }, + { + "epoch": 0.6388778291629659, + "grad_norm": 4.089887493210488, + "learning_rate": 8.143176338107321e-05, + "loss": 0.3748, + "num_input_tokens_seen": 34675528, + "step": 5215 + }, + { + "epoch": 0.6394903678294692, + "grad_norm": 14.965295655020169, + "learning_rate": 8.1392358317675e-05, + "loss": 0.3116, + "num_input_tokens_seen": 34708832, + "step": 5220 + }, + { + "epoch": 0.6401029064959726, + "grad_norm": 1.2147597746241763, + "learning_rate": 8.135292104298917e-05, + "loss": 0.4067, + "num_input_tokens_seen": 34742912, + "step": 5225 + }, + { + "epoch": 0.6407154451624759, + "grad_norm": 10.242587678031159, + "learning_rate": 8.131345159748177e-05, + "loss": 0.2883, + "num_input_tokens_seen": 34777040, + "step": 5230 + }, + { + "epoch": 0.6413279838289792, + "grad_norm": 1.0538531241315783, + "learning_rate": 8.127395002165187e-05, + "loss": 0.3315, + "num_input_tokens_seen": 34810352, + "step": 5235 + }, + { + "epoch": 0.6419405224954825, + "grad_norm": 1.4394471416345185, + "learning_rate": 8.123441635603155e-05, + "loss": 0.4601, + "num_input_tokens_seen": 34843504, + "step": 5240 + }, + { + "epoch": 0.6425530611619859, + "grad_norm": 1.2907299127038077, + "learning_rate": 8.119485064118577e-05, + "loss": 0.3493, + "num_input_tokens_seen": 34877344, + "step": 5245 + }, + { + "epoch": 0.6431655998284892, + "grad_norm": 0.9552223224304538, + "learning_rate": 8.115525291771237e-05, + "loss": 0.2817, + "num_input_tokens_seen": 34912360, + "step": 5250 + }, + { + "epoch": 0.6431655998284892, + "eval_loss": 0.15959948301315308, + "eval_runtime": 19.5834, + "eval_samples_per_second": 3.064, + "eval_steps_per_second": 0.766, + "num_input_tokens_seen": 34912360, + "step": 5250 + }, + { + "epoch": 0.6437781384949925, + "grad_norm": 1.9867829916670714, + "learning_rate": 8.111562322624208e-05, + "loss": 0.3115, + "num_input_tokens_seen": 34945928, + "step": 5255 + }, + { + "epoch": 0.6443906771614958, + "grad_norm": 11.162632984090655, + "learning_rate": 8.107596160743842e-05, + "loss": 0.2517, + "num_input_tokens_seen": 34980184, + "step": 5260 + }, + { + "epoch": 0.6450032158279991, + "grad_norm": 0.7647666248201724, + "learning_rate": 8.103626810199762e-05, + "loss": 0.2597, + "num_input_tokens_seen": 35014128, + "step": 5265 + }, + { + "epoch": 0.6456157544945025, + "grad_norm": 1.114830530036619, + "learning_rate": 8.099654275064871e-05, + "loss": 0.3011, + "num_input_tokens_seen": 35047936, + "step": 5270 + }, + { + "epoch": 0.6462282931610058, + "grad_norm": 13.37350432998382, + "learning_rate": 8.095678559415332e-05, + "loss": 0.3512, + "num_input_tokens_seen": 35081808, + "step": 5275 + }, + { + "epoch": 0.6468408318275091, + "grad_norm": 0.9545967280282956, + "learning_rate": 8.091699667330574e-05, + "loss": 0.2854, + "num_input_tokens_seen": 35115352, + "step": 5280 + }, + { + "epoch": 0.6474533704940124, + "grad_norm": 0.986710584934151, + "learning_rate": 8.08771760289329e-05, + "loss": 0.3199, + "num_input_tokens_seen": 35148976, + "step": 5285 + }, + { + "epoch": 0.6480659091605158, + "grad_norm": 1.102662602227991, + "learning_rate": 8.083732370189421e-05, + "loss": 0.246, + "num_input_tokens_seen": 35183240, + "step": 5290 + }, + { + "epoch": 0.6486784478270191, + "grad_norm": 6.950819235331565, + "learning_rate": 8.079743973308163e-05, + "loss": 0.2896, + "num_input_tokens_seen": 35216928, + "step": 5295 + }, + { + "epoch": 0.6492909864935223, + "grad_norm": 4.227414089910658, + "learning_rate": 8.075752416341959e-05, + "loss": 0.2997, + "num_input_tokens_seen": 35251096, + "step": 5300 + }, + { + "epoch": 0.6492909864935223, + "eval_loss": 0.11087673157453537, + "eval_runtime": 19.3094, + "eval_samples_per_second": 3.107, + "eval_steps_per_second": 0.777, + "num_input_tokens_seen": 35251096, + "step": 5300 + }, + { + "epoch": 0.6499035251600257, + "grad_norm": 2.634182668127156, + "learning_rate": 8.07175770338649e-05, + "loss": 0.2363, + "num_input_tokens_seen": 35285272, + "step": 5305 + }, + { + "epoch": 0.650516063826529, + "grad_norm": 0.720303275407589, + "learning_rate": 8.067759838540682e-05, + "loss": 0.2582, + "num_input_tokens_seen": 35319216, + "step": 5310 + }, + { + "epoch": 0.6511286024930324, + "grad_norm": 2.9458852358134466, + "learning_rate": 8.06375882590669e-05, + "loss": 0.3376, + "num_input_tokens_seen": 35353264, + "step": 5315 + }, + { + "epoch": 0.6517411411595357, + "grad_norm": 25.035619848937376, + "learning_rate": 8.0597546695899e-05, + "loss": 0.2647, + "num_input_tokens_seen": 35386600, + "step": 5320 + }, + { + "epoch": 0.652353679826039, + "grad_norm": 1.0946221325614773, + "learning_rate": 8.055747373698927e-05, + "loss": 0.3179, + "num_input_tokens_seen": 35419584, + "step": 5325 + }, + { + "epoch": 0.6529662184925423, + "grad_norm": 17.547876780426762, + "learning_rate": 8.051736942345603e-05, + "loss": 0.3219, + "num_input_tokens_seen": 35452936, + "step": 5330 + }, + { + "epoch": 0.6535787571590457, + "grad_norm": 1.2486336999116627, + "learning_rate": 8.04772337964498e-05, + "loss": 0.3395, + "num_input_tokens_seen": 35486136, + "step": 5335 + }, + { + "epoch": 0.654191295825549, + "grad_norm": 1.1098979265256579, + "learning_rate": 8.04370668971532e-05, + "loss": 0.2821, + "num_input_tokens_seen": 35519144, + "step": 5340 + }, + { + "epoch": 0.6548038344920523, + "grad_norm": 1.0159100508666539, + "learning_rate": 8.039686876678099e-05, + "loss": 0.2706, + "num_input_tokens_seen": 35552264, + "step": 5345 + }, + { + "epoch": 0.6554163731585556, + "grad_norm": 1.1662704798014212, + "learning_rate": 8.035663944657992e-05, + "loss": 0.3163, + "num_input_tokens_seen": 35586128, + "step": 5350 + }, + { + "epoch": 0.6554163731585556, + "eval_loss": 0.18712441623210907, + "eval_runtime": 19.5723, + "eval_samples_per_second": 3.066, + "eval_steps_per_second": 0.766, + "num_input_tokens_seen": 35586128, + "step": 5350 + }, + { + "epoch": 0.656028911825059, + "grad_norm": 0.8094408792690156, + "learning_rate": 8.031637897782877e-05, + "loss": 0.2562, + "num_input_tokens_seen": 35620280, + "step": 5355 + }, + { + "epoch": 0.6566414504915623, + "grad_norm": 1.4920519985445275, + "learning_rate": 8.027608740183828e-05, + "loss": 0.3284, + "num_input_tokens_seen": 35653840, + "step": 5360 + }, + { + "epoch": 0.6572539891580657, + "grad_norm": 4.341375979053494, + "learning_rate": 8.023576475995113e-05, + "loss": 0.3455, + "num_input_tokens_seen": 35687320, + "step": 5365 + }, + { + "epoch": 0.6578665278245689, + "grad_norm": 0.892470676432067, + "learning_rate": 8.019541109354184e-05, + "loss": 0.3068, + "num_input_tokens_seen": 35721288, + "step": 5370 + }, + { + "epoch": 0.6584790664910722, + "grad_norm": 0.7948815493687846, + "learning_rate": 8.015502644401678e-05, + "loss": 0.2947, + "num_input_tokens_seen": 35754632, + "step": 5375 + }, + { + "epoch": 0.6590916051575756, + "grad_norm": 1.3596628587755044, + "learning_rate": 8.01146108528141e-05, + "loss": 0.3521, + "num_input_tokens_seen": 35787736, + "step": 5380 + }, + { + "epoch": 0.6597041438240789, + "grad_norm": 0.909247481224003, + "learning_rate": 8.007416436140373e-05, + "loss": 0.2652, + "num_input_tokens_seen": 35821344, + "step": 5385 + }, + { + "epoch": 0.6603166824905822, + "grad_norm": 1.1524477613257795, + "learning_rate": 8.003368701128727e-05, + "loss": 0.2908, + "num_input_tokens_seen": 35855384, + "step": 5390 + }, + { + "epoch": 0.6609292211570855, + "grad_norm": 0.9650449221914386, + "learning_rate": 7.999317884399798e-05, + "loss": 0.3399, + "num_input_tokens_seen": 35888456, + "step": 5395 + }, + { + "epoch": 0.6615417598235889, + "grad_norm": 27.101974075041202, + "learning_rate": 7.995263990110079e-05, + "loss": 0.3341, + "num_input_tokens_seen": 35922304, + "step": 5400 + }, + { + "epoch": 0.6615417598235889, + "eval_loss": 0.20246051251888275, + "eval_runtime": 19.3833, + "eval_samples_per_second": 3.095, + "eval_steps_per_second": 0.774, + "num_input_tokens_seen": 35922304, + "step": 5400 + }, + { + "epoch": 0.6621542984900922, + "grad_norm": 0.9794572125283704, + "learning_rate": 7.991207022419219e-05, + "loss": 0.3332, + "num_input_tokens_seen": 35956112, + "step": 5405 + }, + { + "epoch": 0.6627668371565955, + "grad_norm": 0.9680471405442795, + "learning_rate": 7.987146985490014e-05, + "loss": 0.329, + "num_input_tokens_seen": 35989040, + "step": 5410 + }, + { + "epoch": 0.6633793758230988, + "grad_norm": 0.8285532165071167, + "learning_rate": 7.983083883488419e-05, + "loss": 0.3065, + "num_input_tokens_seen": 36023016, + "step": 5415 + }, + { + "epoch": 0.6639919144896022, + "grad_norm": 1.2820324501136056, + "learning_rate": 7.97901772058353e-05, + "loss": 0.3266, + "num_input_tokens_seen": 36056648, + "step": 5420 + }, + { + "epoch": 0.6646044531561055, + "grad_norm": 1.12434902241413, + "learning_rate": 7.974948500947582e-05, + "loss": 0.3161, + "num_input_tokens_seen": 36089784, + "step": 5425 + }, + { + "epoch": 0.6652169918226088, + "grad_norm": 1.0684974607516224, + "learning_rate": 7.970876228755945e-05, + "loss": 0.3045, + "num_input_tokens_seen": 36123648, + "step": 5430 + }, + { + "epoch": 0.6658295304891121, + "grad_norm": 2.126187523097948, + "learning_rate": 7.96680090818713e-05, + "loss": 0.2872, + "num_input_tokens_seen": 36157176, + "step": 5435 + }, + { + "epoch": 0.6664420691556154, + "grad_norm": 0.8203054677829115, + "learning_rate": 7.962722543422767e-05, + "loss": 0.28, + "num_input_tokens_seen": 36190944, + "step": 5440 + }, + { + "epoch": 0.6670546078221188, + "grad_norm": 9.144617368276464, + "learning_rate": 7.958641138647615e-05, + "loss": 0.3365, + "num_input_tokens_seen": 36224464, + "step": 5445 + }, + { + "epoch": 0.6676671464886221, + "grad_norm": 0.9677450047901596, + "learning_rate": 7.954556698049544e-05, + "loss": 0.3126, + "num_input_tokens_seen": 36256800, + "step": 5450 + }, + { + "epoch": 0.6676671464886221, + "eval_loss": 0.20314465463161469, + "eval_runtime": 19.4714, + "eval_samples_per_second": 3.081, + "eval_steps_per_second": 0.77, + "num_input_tokens_seen": 36256800, + "step": 5450 + }, + { + "epoch": 0.6682796851551254, + "grad_norm": 8.536944858496785, + "learning_rate": 7.950469225819553e-05, + "loss": 0.3205, + "num_input_tokens_seen": 36289856, + "step": 5455 + }, + { + "epoch": 0.6688922238216287, + "grad_norm": 1.9373591563447092, + "learning_rate": 7.946378726151736e-05, + "loss": 0.3886, + "num_input_tokens_seen": 36322792, + "step": 5460 + }, + { + "epoch": 0.6695047624881321, + "grad_norm": 8.782484436204891, + "learning_rate": 7.942285203243306e-05, + "loss": 0.3497, + "num_input_tokens_seen": 36356144, + "step": 5465 + }, + { + "epoch": 0.6701173011546354, + "grad_norm": 1.4342270815535454, + "learning_rate": 7.938188661294574e-05, + "loss": 0.3647, + "num_input_tokens_seen": 36389240, + "step": 5470 + }, + { + "epoch": 0.6707298398211388, + "grad_norm": 6.911819349216433, + "learning_rate": 7.934089104508943e-05, + "loss": 0.405, + "num_input_tokens_seen": 36422536, + "step": 5475 + }, + { + "epoch": 0.671342378487642, + "grad_norm": 0.8915620473525774, + "learning_rate": 7.92998653709292e-05, + "loss": 0.3243, + "num_input_tokens_seen": 36455680, + "step": 5480 + }, + { + "epoch": 0.6719549171541453, + "grad_norm": 1.4556253022703332, + "learning_rate": 7.925880963256094e-05, + "loss": 0.2469, + "num_input_tokens_seen": 36489320, + "step": 5485 + }, + { + "epoch": 0.6725674558206487, + "grad_norm": 1.0332936870633787, + "learning_rate": 7.921772387211138e-05, + "loss": 0.309, + "num_input_tokens_seen": 36522792, + "step": 5490 + }, + { + "epoch": 0.673179994487152, + "grad_norm": 1.0666723244614056, + "learning_rate": 7.917660813173811e-05, + "loss": 0.3236, + "num_input_tokens_seen": 36556056, + "step": 5495 + }, + { + "epoch": 0.6737925331536553, + "grad_norm": 1.1599568028820035, + "learning_rate": 7.913546245362942e-05, + "loss": 0.3145, + "num_input_tokens_seen": 36589616, + "step": 5500 + }, + { + "epoch": 0.6737925331536553, + "eval_loss": 0.1359092891216278, + "eval_runtime": 19.6151, + "eval_samples_per_second": 3.059, + "eval_steps_per_second": 0.765, + "num_input_tokens_seen": 36589616, + "step": 5500 + }, + { + "epoch": 0.6744050718201586, + "grad_norm": 1.387314309249753, + "learning_rate": 7.909428688000438e-05, + "loss": 0.3184, + "num_input_tokens_seen": 36623040, + "step": 5505 + }, + { + "epoch": 0.675017610486662, + "grad_norm": 1.19853021775431, + "learning_rate": 7.905308145311272e-05, + "loss": 0.2705, + "num_input_tokens_seen": 36656560, + "step": 5510 + }, + { + "epoch": 0.6756301491531653, + "grad_norm": 0.9413353611556725, + "learning_rate": 7.901184621523475e-05, + "loss": 0.3864, + "num_input_tokens_seen": 36689320, + "step": 5515 + }, + { + "epoch": 0.6762426878196686, + "grad_norm": 1.059500061047453, + "learning_rate": 7.897058120868143e-05, + "loss": 0.3058, + "num_input_tokens_seen": 36722560, + "step": 5520 + }, + { + "epoch": 0.6768552264861719, + "grad_norm": 1.1648691093219095, + "learning_rate": 7.892928647579423e-05, + "loss": 0.3803, + "num_input_tokens_seen": 36755400, + "step": 5525 + }, + { + "epoch": 0.6774677651526753, + "grad_norm": 1.1513712858076808, + "learning_rate": 7.888796205894516e-05, + "loss": 0.3232, + "num_input_tokens_seen": 36789136, + "step": 5530 + }, + { + "epoch": 0.6780803038191786, + "grad_norm": 9.891418770726798, + "learning_rate": 7.884660800053662e-05, + "loss": 0.3131, + "num_input_tokens_seen": 36822600, + "step": 5535 + }, + { + "epoch": 0.678692842485682, + "grad_norm": 1.0571421613441934, + "learning_rate": 7.880522434300152e-05, + "loss": 0.3068, + "num_input_tokens_seen": 36855528, + "step": 5540 + }, + { + "epoch": 0.6793053811521852, + "grad_norm": 1.1038105184283817, + "learning_rate": 7.876381112880305e-05, + "loss": 0.3238, + "num_input_tokens_seen": 36888496, + "step": 5545 + }, + { + "epoch": 0.6799179198186885, + "grad_norm": 1.0883701427599628, + "learning_rate": 7.872236840043477e-05, + "loss": 0.337, + "num_input_tokens_seen": 36922056, + "step": 5550 + }, + { + "epoch": 0.6799179198186885, + "eval_loss": 0.09095880389213562, + "eval_runtime": 19.8561, + "eval_samples_per_second": 3.022, + "eval_steps_per_second": 0.755, + "num_input_tokens_seen": 36922056, + "step": 5550 + }, + { + "epoch": 0.6805304584851919, + "grad_norm": 1.0334393954721572, + "learning_rate": 7.868089620042054e-05, + "loss": 0.2545, + "num_input_tokens_seen": 36955456, + "step": 5555 + }, + { + "epoch": 0.6811429971516952, + "grad_norm": 2.489835403419518, + "learning_rate": 7.863939457131443e-05, + "loss": 0.3144, + "num_input_tokens_seen": 36988504, + "step": 5560 + }, + { + "epoch": 0.6817555358181985, + "grad_norm": 0.9596999763510171, + "learning_rate": 7.859786355570071e-05, + "loss": 0.3051, + "num_input_tokens_seen": 37021504, + "step": 5565 + }, + { + "epoch": 0.6823680744847018, + "grad_norm": 0.8565038861713138, + "learning_rate": 7.855630319619383e-05, + "loss": 0.2666, + "num_input_tokens_seen": 37055704, + "step": 5570 + }, + { + "epoch": 0.6829806131512052, + "grad_norm": 1.1817004067609151, + "learning_rate": 7.851471353543831e-05, + "loss": 0.2894, + "num_input_tokens_seen": 37089240, + "step": 5575 + }, + { + "epoch": 0.6835931518177085, + "grad_norm": 19.98043549238189, + "learning_rate": 7.847309461610878e-05, + "loss": 0.3601, + "num_input_tokens_seen": 37122600, + "step": 5580 + }, + { + "epoch": 0.6842056904842119, + "grad_norm": 1.0508797882537255, + "learning_rate": 7.843144648090984e-05, + "loss": 0.3257, + "num_input_tokens_seen": 37155976, + "step": 5585 + }, + { + "epoch": 0.6848182291507151, + "grad_norm": 3.174815916344745, + "learning_rate": 7.838976917257612e-05, + "loss": 0.277, + "num_input_tokens_seen": 37190544, + "step": 5590 + }, + { + "epoch": 0.6854307678172185, + "grad_norm": 21.592890745389333, + "learning_rate": 7.834806273387214e-05, + "loss": 0.287, + "num_input_tokens_seen": 37224600, + "step": 5595 + }, + { + "epoch": 0.6860433064837218, + "grad_norm": 1.6348418831755478, + "learning_rate": 7.830632720759234e-05, + "loss": 0.2861, + "num_input_tokens_seen": 37258136, + "step": 5600 + }, + { + "epoch": 0.6860433064837218, + "eval_loss": 0.16602179408073425, + "eval_runtime": 19.6982, + "eval_samples_per_second": 3.046, + "eval_steps_per_second": 0.761, + "num_input_tokens_seen": 37258136, + "step": 5600 + }, + { + "epoch": 0.6866558451502252, + "grad_norm": 1.30364170665399, + "learning_rate": 7.8264562636561e-05, + "loss": 0.2793, + "num_input_tokens_seen": 37291664, + "step": 5605 + }, + { + "epoch": 0.6872683838167284, + "grad_norm": 10.6838701811589, + "learning_rate": 7.822276906363222e-05, + "loss": 0.3119, + "num_input_tokens_seen": 37324896, + "step": 5610 + }, + { + "epoch": 0.6878809224832317, + "grad_norm": 1.1186916588611795, + "learning_rate": 7.818094653168978e-05, + "loss": 0.3234, + "num_input_tokens_seen": 37358664, + "step": 5615 + }, + { + "epoch": 0.6884934611497351, + "grad_norm": 8.998376302902917, + "learning_rate": 7.813909508364727e-05, + "loss": 0.3019, + "num_input_tokens_seen": 37392600, + "step": 5620 + }, + { + "epoch": 0.6891059998162384, + "grad_norm": 3.684670678087589, + "learning_rate": 7.809721476244792e-05, + "loss": 0.3575, + "num_input_tokens_seen": 37426144, + "step": 5625 + }, + { + "epoch": 0.6897185384827417, + "grad_norm": 8.267517614750494, + "learning_rate": 7.805530561106458e-05, + "loss": 0.4029, + "num_input_tokens_seen": 37460184, + "step": 5630 + }, + { + "epoch": 0.690331077149245, + "grad_norm": 1.1804936001760904, + "learning_rate": 7.801336767249965e-05, + "loss": 0.3128, + "num_input_tokens_seen": 37493384, + "step": 5635 + }, + { + "epoch": 0.6909436158157484, + "grad_norm": 6.748718227772172, + "learning_rate": 7.797140098978512e-05, + "loss": 0.257, + "num_input_tokens_seen": 37527904, + "step": 5640 + }, + { + "epoch": 0.6915561544822517, + "grad_norm": 0.761313291675114, + "learning_rate": 7.792940560598245e-05, + "loss": 0.2803, + "num_input_tokens_seen": 37561984, + "step": 5645 + }, + { + "epoch": 0.6921686931487551, + "grad_norm": 8.495471993286849, + "learning_rate": 7.788738156418256e-05, + "loss": 0.383, + "num_input_tokens_seen": 37594912, + "step": 5650 + }, + { + "epoch": 0.6921686931487551, + "eval_loss": 0.15634669363498688, + "eval_runtime": 19.1233, + "eval_samples_per_second": 3.138, + "eval_steps_per_second": 0.784, + "num_input_tokens_seen": 37594912, + "step": 5650 + }, + { + "epoch": 0.6927812318152583, + "grad_norm": 7.592852343924127, + "learning_rate": 7.784532890750576e-05, + "loss": 0.3315, + "num_input_tokens_seen": 37627672, + "step": 5655 + }, + { + "epoch": 0.6933937704817617, + "grad_norm": 0.9018958256749179, + "learning_rate": 7.780324767910174e-05, + "loss": 0.284, + "num_input_tokens_seen": 37661200, + "step": 5660 + }, + { + "epoch": 0.694006309148265, + "grad_norm": 0.9388776579434677, + "learning_rate": 7.77611379221495e-05, + "loss": 0.3313, + "num_input_tokens_seen": 37694472, + "step": 5665 + }, + { + "epoch": 0.6946188478147683, + "grad_norm": 1.2537622027658646, + "learning_rate": 7.771899967985728e-05, + "loss": 0.3156, + "num_input_tokens_seen": 37727712, + "step": 5670 + }, + { + "epoch": 0.6952313864812716, + "grad_norm": 1.4389263243323127, + "learning_rate": 7.76768329954626e-05, + "loss": 0.3487, + "num_input_tokens_seen": 37760528, + "step": 5675 + }, + { + "epoch": 0.6958439251477749, + "grad_norm": 1.338387802860716, + "learning_rate": 7.763463791223216e-05, + "loss": 0.2953, + "num_input_tokens_seen": 37793968, + "step": 5680 + }, + { + "epoch": 0.6964564638142783, + "grad_norm": 1.0394956213960027, + "learning_rate": 7.759241447346175e-05, + "loss": 0.3258, + "num_input_tokens_seen": 37827352, + "step": 5685 + }, + { + "epoch": 0.6970690024807816, + "grad_norm": 1.207500610665226, + "learning_rate": 7.755016272247631e-05, + "loss": 0.3724, + "num_input_tokens_seen": 37860544, + "step": 5690 + }, + { + "epoch": 0.6976815411472849, + "grad_norm": 1.2412785284836358, + "learning_rate": 7.75078827026298e-05, + "loss": 0.2496, + "num_input_tokens_seen": 37894560, + "step": 5695 + }, + { + "epoch": 0.6982940798137882, + "grad_norm": 0.6303951073185801, + "learning_rate": 7.746557445730517e-05, + "loss": 0.3001, + "num_input_tokens_seen": 37928296, + "step": 5700 + }, + { + "epoch": 0.6982940798137882, + "eval_loss": 0.1474122703075409, + "eval_runtime": 19.8082, + "eval_samples_per_second": 3.029, + "eval_steps_per_second": 0.757, + "num_input_tokens_seen": 37928296, + "step": 5700 + }, + { + "epoch": 0.6989066184802916, + "grad_norm": 4.360190945379924, + "learning_rate": 7.742323802991442e-05, + "loss": 0.262, + "num_input_tokens_seen": 37962624, + "step": 5705 + }, + { + "epoch": 0.6995191571467949, + "grad_norm": 1.1574253332311994, + "learning_rate": 7.738087346389835e-05, + "loss": 0.2939, + "num_input_tokens_seen": 37996520, + "step": 5710 + }, + { + "epoch": 0.7001316958132983, + "grad_norm": 1.2773556836385291, + "learning_rate": 7.733848080272668e-05, + "loss": 0.2748, + "num_input_tokens_seen": 38029384, + "step": 5715 + }, + { + "epoch": 0.7007442344798015, + "grad_norm": 1.2142511945147847, + "learning_rate": 7.729606008989801e-05, + "loss": 0.3109, + "num_input_tokens_seen": 38062808, + "step": 5720 + }, + { + "epoch": 0.7013567731463048, + "grad_norm": 6.511964085443724, + "learning_rate": 7.725361136893963e-05, + "loss": 0.2942, + "num_input_tokens_seen": 38096552, + "step": 5725 + }, + { + "epoch": 0.7019693118128082, + "grad_norm": 0.8430485048080425, + "learning_rate": 7.721113468340766e-05, + "loss": 0.2759, + "num_input_tokens_seen": 38130696, + "step": 5730 + }, + { + "epoch": 0.7025818504793115, + "grad_norm": 1.6753229139971713, + "learning_rate": 7.716863007688684e-05, + "loss": 0.2912, + "num_input_tokens_seen": 38164136, + "step": 5735 + }, + { + "epoch": 0.7031943891458148, + "grad_norm": 0.8942900628122847, + "learning_rate": 7.71260975929906e-05, + "loss": 0.2909, + "num_input_tokens_seen": 38197608, + "step": 5740 + }, + { + "epoch": 0.7038069278123181, + "grad_norm": 9.839176331355313, + "learning_rate": 7.708353727536097e-05, + "loss": 0.334, + "num_input_tokens_seen": 38230808, + "step": 5745 + }, + { + "epoch": 0.7044194664788215, + "grad_norm": 1.3039617164515944, + "learning_rate": 7.70409491676685e-05, + "loss": 0.3218, + "num_input_tokens_seen": 38264144, + "step": 5750 + }, + { + "epoch": 0.7044194664788215, + "eval_loss": 0.07439830899238586, + "eval_runtime": 19.5606, + "eval_samples_per_second": 3.067, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 38264144, + "step": 5750 + }, + { + "epoch": 0.7050320051453248, + "grad_norm": 0.990932259478708, + "learning_rate": 7.699833331361234e-05, + "loss": 0.3302, + "num_input_tokens_seen": 38297728, + "step": 5755 + }, + { + "epoch": 0.7056445438118282, + "grad_norm": 0.5864682006017778, + "learning_rate": 7.695568975692005e-05, + "loss": 0.3029, + "num_input_tokens_seen": 38332192, + "step": 5760 + }, + { + "epoch": 0.7062570824783314, + "grad_norm": 3.603240931833326, + "learning_rate": 7.69130185413476e-05, + "loss": 0.2635, + "num_input_tokens_seen": 38365608, + "step": 5765 + }, + { + "epoch": 0.7068696211448348, + "grad_norm": 36.00214777647525, + "learning_rate": 7.687031971067937e-05, + "loss": 0.2867, + "num_input_tokens_seen": 38398936, + "step": 5770 + }, + { + "epoch": 0.7074821598113381, + "grad_norm": 1.0591012638919646, + "learning_rate": 7.682759330872809e-05, + "loss": 0.3174, + "num_input_tokens_seen": 38432144, + "step": 5775 + }, + { + "epoch": 0.7080946984778415, + "grad_norm": 0.9928079969462144, + "learning_rate": 7.678483937933474e-05, + "loss": 0.3089, + "num_input_tokens_seen": 38466128, + "step": 5780 + }, + { + "epoch": 0.7087072371443447, + "grad_norm": 0.8565070325312787, + "learning_rate": 7.674205796636858e-05, + "loss": 0.3194, + "num_input_tokens_seen": 38499408, + "step": 5785 + }, + { + "epoch": 0.709319775810848, + "grad_norm": 1.07600341265933, + "learning_rate": 7.669924911372706e-05, + "loss": 0.248, + "num_input_tokens_seen": 38533184, + "step": 5790 + }, + { + "epoch": 0.7099323144773514, + "grad_norm": 32.80055306694013, + "learning_rate": 7.665641286533578e-05, + "loss": 0.2899, + "num_input_tokens_seen": 38567016, + "step": 5795 + }, + { + "epoch": 0.7105448531438547, + "grad_norm": 0.9668912978602835, + "learning_rate": 7.661354926514843e-05, + "loss": 0.2762, + "num_input_tokens_seen": 38601128, + "step": 5800 + }, + { + "epoch": 0.7105448531438547, + "eval_loss": 0.2444901168346405, + "eval_runtime": 19.4279, + "eval_samples_per_second": 3.088, + "eval_steps_per_second": 0.772, + "num_input_tokens_seen": 38601128, + "step": 5800 + }, + { + "epoch": 0.711157391810358, + "grad_norm": 0.8230917744276421, + "learning_rate": 7.657065835714683e-05, + "loss": 0.2591, + "num_input_tokens_seen": 38635176, + "step": 5805 + }, + { + "epoch": 0.7117699304768613, + "grad_norm": 1.2258640700970294, + "learning_rate": 7.652774018534076e-05, + "loss": 0.2323, + "num_input_tokens_seen": 38669304, + "step": 5810 + }, + { + "epoch": 0.7123824691433647, + "grad_norm": 1.0472070530785598, + "learning_rate": 7.648479479376797e-05, + "loss": 0.3004, + "num_input_tokens_seen": 38703192, + "step": 5815 + }, + { + "epoch": 0.712995007809868, + "grad_norm": 7.764627799848026, + "learning_rate": 7.64418222264942e-05, + "loss": 0.2882, + "num_input_tokens_seen": 38736688, + "step": 5820 + }, + { + "epoch": 0.7136075464763714, + "grad_norm": 3.361804056712567, + "learning_rate": 7.639882252761302e-05, + "loss": 0.2939, + "num_input_tokens_seen": 38770184, + "step": 5825 + }, + { + "epoch": 0.7142200851428746, + "grad_norm": 0.628441072715731, + "learning_rate": 7.635579574124586e-05, + "loss": 0.3033, + "num_input_tokens_seen": 38803864, + "step": 5830 + }, + { + "epoch": 0.714832623809378, + "grad_norm": 0.9527606091010404, + "learning_rate": 7.631274191154194e-05, + "loss": 0.312, + "num_input_tokens_seen": 38837440, + "step": 5835 + }, + { + "epoch": 0.7154451624758813, + "grad_norm": 4.337339875088874, + "learning_rate": 7.626966108267823e-05, + "loss": 0.3425, + "num_input_tokens_seen": 38871192, + "step": 5840 + }, + { + "epoch": 0.7160577011423847, + "grad_norm": 0.9379794924403808, + "learning_rate": 7.622655329885943e-05, + "loss": 0.338, + "num_input_tokens_seen": 38904744, + "step": 5845 + }, + { + "epoch": 0.7166702398088879, + "grad_norm": 9.819702621074244, + "learning_rate": 7.618341860431783e-05, + "loss": 0.2624, + "num_input_tokens_seen": 38939296, + "step": 5850 + }, + { + "epoch": 0.7166702398088879, + "eval_loss": 0.1622983068227768, + "eval_runtime": 19.6592, + "eval_samples_per_second": 3.052, + "eval_steps_per_second": 0.763, + "num_input_tokens_seen": 38939296, + "step": 5850 + }, + { + "epoch": 0.7172827784753912, + "grad_norm": 1.1125729628897534, + "learning_rate": 7.614025704331342e-05, + "loss": 0.3184, + "num_input_tokens_seen": 38972184, + "step": 5855 + }, + { + "epoch": 0.7178953171418946, + "grad_norm": 1.565063540617456, + "learning_rate": 7.609706866013368e-05, + "loss": 0.3129, + "num_input_tokens_seen": 39005072, + "step": 5860 + }, + { + "epoch": 0.7185078558083979, + "grad_norm": 1.7399470333553213, + "learning_rate": 7.605385349909362e-05, + "loss": 0.329, + "num_input_tokens_seen": 39039112, + "step": 5865 + }, + { + "epoch": 0.7191203944749013, + "grad_norm": 2.0861143924532666, + "learning_rate": 7.601061160453581e-05, + "loss": 0.3301, + "num_input_tokens_seen": 39072768, + "step": 5870 + }, + { + "epoch": 0.7197329331414045, + "grad_norm": 1.0328934281950841, + "learning_rate": 7.596734302083016e-05, + "loss": 0.3299, + "num_input_tokens_seen": 39106416, + "step": 5875 + }, + { + "epoch": 0.7203454718079079, + "grad_norm": 0.7170888603212452, + "learning_rate": 7.592404779237401e-05, + "loss": 0.2706, + "num_input_tokens_seen": 39140168, + "step": 5880 + }, + { + "epoch": 0.7209580104744112, + "grad_norm": 1.0145582003921796, + "learning_rate": 7.588072596359201e-05, + "loss": 0.2823, + "num_input_tokens_seen": 39174160, + "step": 5885 + }, + { + "epoch": 0.7215705491409146, + "grad_norm": 3.63703158870152, + "learning_rate": 7.583737757893612e-05, + "loss": 0.2834, + "num_input_tokens_seen": 39207944, + "step": 5890 + }, + { + "epoch": 0.7221830878074178, + "grad_norm": 1.0781313013105536, + "learning_rate": 7.579400268288557e-05, + "loss": 0.2995, + "num_input_tokens_seen": 39241768, + "step": 5895 + }, + { + "epoch": 0.7227956264739211, + "grad_norm": 1.587506087503931, + "learning_rate": 7.575060131994676e-05, + "loss": 0.3096, + "num_input_tokens_seen": 39275432, + "step": 5900 + }, + { + "epoch": 0.7227956264739211, + "eval_loss": 0.20010535418987274, + "eval_runtime": 19.7179, + "eval_samples_per_second": 3.043, + "eval_steps_per_second": 0.761, + "num_input_tokens_seen": 39275432, + "step": 5900 + }, + { + "epoch": 0.7234081651404245, + "grad_norm": 0.7846820436711315, + "learning_rate": 7.570717353465327e-05, + "loss": 0.2899, + "num_input_tokens_seen": 39308840, + "step": 5905 + }, + { + "epoch": 0.7240207038069278, + "grad_norm": 11.079965506667692, + "learning_rate": 7.566371937156574e-05, + "loss": 0.3417, + "num_input_tokens_seen": 39342200, + "step": 5910 + }, + { + "epoch": 0.7246332424734311, + "grad_norm": 1.3867296768262012, + "learning_rate": 7.562023887527198e-05, + "loss": 0.3779, + "num_input_tokens_seen": 39375392, + "step": 5915 + }, + { + "epoch": 0.7252457811399344, + "grad_norm": 0.9636723101875656, + "learning_rate": 7.557673209038672e-05, + "loss": 0.2938, + "num_input_tokens_seen": 39409016, + "step": 5920 + }, + { + "epoch": 0.7258583198064378, + "grad_norm": 32.4729795615117, + "learning_rate": 7.553319906155169e-05, + "loss": 0.2468, + "num_input_tokens_seen": 39443520, + "step": 5925 + }, + { + "epoch": 0.7264708584729411, + "grad_norm": 0.4861017676590545, + "learning_rate": 7.548963983343562e-05, + "loss": 0.2442, + "num_input_tokens_seen": 39477320, + "step": 5930 + }, + { + "epoch": 0.7270833971394445, + "grad_norm": 1.5230695687845068, + "learning_rate": 7.544605445073401e-05, + "loss": 0.2802, + "num_input_tokens_seen": 39511200, + "step": 5935 + }, + { + "epoch": 0.7276959358059477, + "grad_norm": 1.0907424112889428, + "learning_rate": 7.540244295816926e-05, + "loss": 0.3173, + "num_input_tokens_seen": 39544072, + "step": 5940 + }, + { + "epoch": 0.7283084744724511, + "grad_norm": 1.0472773974920466, + "learning_rate": 7.535880540049054e-05, + "loss": 0.2781, + "num_input_tokens_seen": 39577952, + "step": 5945 + }, + { + "epoch": 0.7289210131389544, + "grad_norm": 1.2142568208841384, + "learning_rate": 7.531514182247382e-05, + "loss": 0.2848, + "num_input_tokens_seen": 39611784, + "step": 5950 + }, + { + "epoch": 0.7289210131389544, + "eval_loss": 0.30868977308273315, + "eval_runtime": 19.6688, + "eval_samples_per_second": 3.051, + "eval_steps_per_second": 0.763, + "num_input_tokens_seen": 39611784, + "step": 5950 + }, + { + "epoch": 0.7295335518054578, + "grad_norm": 25.598924485813317, + "learning_rate": 7.527145226892167e-05, + "loss": 0.2944, + "num_input_tokens_seen": 39645280, + "step": 5955 + }, + { + "epoch": 0.730146090471961, + "grad_norm": 1.4152834731542323, + "learning_rate": 7.522773678466341e-05, + "loss": 0.2649, + "num_input_tokens_seen": 39678616, + "step": 5960 + }, + { + "epoch": 0.7307586291384643, + "grad_norm": 8.785566149162682, + "learning_rate": 7.51839954145549e-05, + "loss": 0.2404, + "num_input_tokens_seen": 39713016, + "step": 5965 + }, + { + "epoch": 0.7313711678049677, + "grad_norm": 1.0343743973159378, + "learning_rate": 7.514022820347862e-05, + "loss": 0.3474, + "num_input_tokens_seen": 39746720, + "step": 5970 + }, + { + "epoch": 0.731983706471471, + "grad_norm": 6.415858289650573, + "learning_rate": 7.509643519634348e-05, + "loss": 0.3241, + "num_input_tokens_seen": 39780176, + "step": 5975 + }, + { + "epoch": 0.7325962451379744, + "grad_norm": 1.0884937358074747, + "learning_rate": 7.505261643808497e-05, + "loss": 0.3367, + "num_input_tokens_seen": 39813608, + "step": 5980 + }, + { + "epoch": 0.7332087838044776, + "grad_norm": 1.1251506407618364, + "learning_rate": 7.500877197366492e-05, + "loss": 0.2756, + "num_input_tokens_seen": 39847256, + "step": 5985 + }, + { + "epoch": 0.733821322470981, + "grad_norm": 0.9342590482714533, + "learning_rate": 7.496490184807152e-05, + "loss": 0.2507, + "num_input_tokens_seen": 39880848, + "step": 5990 + }, + { + "epoch": 0.7344338611374843, + "grad_norm": 1.1974532503686337, + "learning_rate": 7.492100610631939e-05, + "loss": 0.3189, + "num_input_tokens_seen": 39913792, + "step": 5995 + }, + { + "epoch": 0.7350463998039877, + "grad_norm": 1.033745217492127, + "learning_rate": 7.487708479344934e-05, + "loss": 0.3025, + "num_input_tokens_seen": 39947544, + "step": 6000 + }, + { + "epoch": 0.7350463998039877, + "eval_loss": 0.14401155710220337, + "eval_runtime": 19.3888, + "eval_samples_per_second": 3.095, + "eval_steps_per_second": 0.774, + "num_input_tokens_seen": 39947544, + "step": 6000 + }, + { + "epoch": 0.7356589384704909, + "grad_norm": 0.9433894484784726, + "learning_rate": 7.483313795452848e-05, + "loss": 0.2161, + "num_input_tokens_seen": 39982296, + "step": 6005 + }, + { + "epoch": 0.7362714771369943, + "grad_norm": 1.2389757652487523, + "learning_rate": 7.478916563465007e-05, + "loss": 0.2991, + "num_input_tokens_seen": 40015240, + "step": 6010 + }, + { + "epoch": 0.7368840158034976, + "grad_norm": 3.1916468885424987, + "learning_rate": 7.474516787893354e-05, + "loss": 0.2634, + "num_input_tokens_seen": 40048800, + "step": 6015 + }, + { + "epoch": 0.737496554470001, + "grad_norm": 1.5365903336724966, + "learning_rate": 7.470114473252439e-05, + "loss": 0.3008, + "num_input_tokens_seen": 40081872, + "step": 6020 + }, + { + "epoch": 0.7381090931365042, + "grad_norm": 0.7174184201641504, + "learning_rate": 7.46570962405942e-05, + "loss": 0.2665, + "num_input_tokens_seen": 40115376, + "step": 6025 + }, + { + "epoch": 0.7387216318030075, + "grad_norm": 0.8569187076537471, + "learning_rate": 7.461302244834058e-05, + "loss": 0.2847, + "num_input_tokens_seen": 40148936, + "step": 6030 + }, + { + "epoch": 0.7393341704695109, + "grad_norm": 1.0425446948141777, + "learning_rate": 7.456892340098703e-05, + "loss": 0.2553, + "num_input_tokens_seen": 40182880, + "step": 6035 + }, + { + "epoch": 0.7399467091360142, + "grad_norm": 0.8061343175239635, + "learning_rate": 7.452479914378304e-05, + "loss": 0.3273, + "num_input_tokens_seen": 40216432, + "step": 6040 + }, + { + "epoch": 0.7405592478025176, + "grad_norm": 17.38906535762418, + "learning_rate": 7.448064972200392e-05, + "loss": 0.3363, + "num_input_tokens_seen": 40250064, + "step": 6045 + }, + { + "epoch": 0.7411717864690208, + "grad_norm": 0.6523459184879088, + "learning_rate": 7.443647518095078e-05, + "loss": 0.2845, + "num_input_tokens_seen": 40284288, + "step": 6050 + }, + { + "epoch": 0.7411717864690208, + "eval_loss": 0.26300135254859924, + "eval_runtime": 19.7175, + "eval_samples_per_second": 3.043, + "eval_steps_per_second": 0.761, + "num_input_tokens_seen": 40284288, + "step": 6050 + }, + { + "epoch": 0.7417843251355242, + "grad_norm": 18.9029358674341, + "learning_rate": 7.43922755659506e-05, + "loss": 0.3481, + "num_input_tokens_seen": 40317688, + "step": 6055 + }, + { + "epoch": 0.7423968638020275, + "grad_norm": 0.9977039835355207, + "learning_rate": 7.434805092235599e-05, + "loss": 0.2952, + "num_input_tokens_seen": 40350808, + "step": 6060 + }, + { + "epoch": 0.7430094024685309, + "grad_norm": 0.8366471947276023, + "learning_rate": 7.430380129554527e-05, + "loss": 0.2918, + "num_input_tokens_seen": 40384432, + "step": 6065 + }, + { + "epoch": 0.7436219411350341, + "grad_norm": 1.0479827016490308, + "learning_rate": 7.42595267309224e-05, + "loss": 0.3835, + "num_input_tokens_seen": 40418328, + "step": 6070 + }, + { + "epoch": 0.7442344798015375, + "grad_norm": 1.561720698274668, + "learning_rate": 7.421522727391696e-05, + "loss": 0.3326, + "num_input_tokens_seen": 40451488, + "step": 6075 + }, + { + "epoch": 0.7448470184680408, + "grad_norm": 0.8486383048662409, + "learning_rate": 7.417090296998404e-05, + "loss": 0.2993, + "num_input_tokens_seen": 40485496, + "step": 6080 + }, + { + "epoch": 0.7454595571345441, + "grad_norm": 0.7364745497547495, + "learning_rate": 7.412655386460419e-05, + "loss": 0.3238, + "num_input_tokens_seen": 40518408, + "step": 6085 + }, + { + "epoch": 0.7460720958010474, + "grad_norm": 5.242867994487514, + "learning_rate": 7.408218000328349e-05, + "loss": 0.2752, + "num_input_tokens_seen": 40553264, + "step": 6090 + }, + { + "epoch": 0.7466846344675507, + "grad_norm": 2.3644034360060124, + "learning_rate": 7.403778143155336e-05, + "loss": 0.3469, + "num_input_tokens_seen": 40587264, + "step": 6095 + }, + { + "epoch": 0.7472971731340541, + "grad_norm": 1.1162791048604286, + "learning_rate": 7.399335819497057e-05, + "loss": 0.3172, + "num_input_tokens_seen": 40619872, + "step": 6100 + }, + { + "epoch": 0.7472971731340541, + "eval_loss": 0.16394594311714172, + "eval_runtime": 19.7173, + "eval_samples_per_second": 3.043, + "eval_steps_per_second": 0.761, + "num_input_tokens_seen": 40619872, + "step": 6100 + }, + { + "epoch": 0.7479097118005574, + "grad_norm": 1.0044051002548748, + "learning_rate": 7.394891033911724e-05, + "loss": 0.2465, + "num_input_tokens_seen": 40653952, + "step": 6105 + }, + { + "epoch": 0.7485222504670608, + "grad_norm": 0.9466992292869305, + "learning_rate": 7.390443790960077e-05, + "loss": 0.3163, + "num_input_tokens_seen": 40687256, + "step": 6110 + }, + { + "epoch": 0.749134789133564, + "grad_norm": 3.490996414913539, + "learning_rate": 7.385994095205366e-05, + "loss": 0.286, + "num_input_tokens_seen": 40721512, + "step": 6115 + }, + { + "epoch": 0.7497473278000674, + "grad_norm": 22.12490754633054, + "learning_rate": 7.381541951213373e-05, + "loss": 0.2814, + "num_input_tokens_seen": 40754824, + "step": 6120 + }, + { + "epoch": 0.7503598664665707, + "grad_norm": 0.8001713759131907, + "learning_rate": 7.37708736355238e-05, + "loss": 0.2902, + "num_input_tokens_seen": 40788144, + "step": 6125 + }, + { + "epoch": 0.7509724051330741, + "grad_norm": 0.8790305316978093, + "learning_rate": 7.372630336793183e-05, + "loss": 0.2628, + "num_input_tokens_seen": 40821984, + "step": 6130 + }, + { + "epoch": 0.7515849437995773, + "grad_norm": 2.4942838855650447, + "learning_rate": 7.368170875509078e-05, + "loss": 0.2101, + "num_input_tokens_seen": 40856448, + "step": 6135 + }, + { + "epoch": 0.7521974824660806, + "grad_norm": 10.42227715020944, + "learning_rate": 7.36370898427586e-05, + "loss": 0.2821, + "num_input_tokens_seen": 40889960, + "step": 6140 + }, + { + "epoch": 0.752810021132584, + "grad_norm": 1.2895794225054122, + "learning_rate": 7.359244667671818e-05, + "loss": 0.2582, + "num_input_tokens_seen": 40924176, + "step": 6145 + }, + { + "epoch": 0.7534225597990873, + "grad_norm": 1.1556137698796218, + "learning_rate": 7.354777930277725e-05, + "loss": 0.2963, + "num_input_tokens_seen": 40956544, + "step": 6150 + }, + { + "epoch": 0.7534225597990873, + "eval_loss": 0.15747645497322083, + "eval_runtime": 19.7047, + "eval_samples_per_second": 3.045, + "eval_steps_per_second": 0.761, + "num_input_tokens_seen": 40956544, + "step": 6150 + }, + { + "epoch": 0.7540350984655907, + "grad_norm": 0.9419122260750067, + "learning_rate": 7.350308776676848e-05, + "loss": 0.2898, + "num_input_tokens_seen": 40991016, + "step": 6155 + }, + { + "epoch": 0.7546476371320939, + "grad_norm": 1.2325919833106898, + "learning_rate": 7.345837211454922e-05, + "loss": 0.3118, + "num_input_tokens_seen": 41024800, + "step": 6160 + }, + { + "epoch": 0.7552601757985973, + "grad_norm": 0.9417473094977749, + "learning_rate": 7.341363239200164e-05, + "loss": 0.2695, + "num_input_tokens_seen": 41058208, + "step": 6165 + }, + { + "epoch": 0.7558727144651006, + "grad_norm": 16.473947656675687, + "learning_rate": 7.336886864503256e-05, + "loss": 0.3063, + "num_input_tokens_seen": 41092048, + "step": 6170 + }, + { + "epoch": 0.756485253131604, + "grad_norm": 0.6465478473488592, + "learning_rate": 7.33240809195735e-05, + "loss": 0.2352, + "num_input_tokens_seen": 41125720, + "step": 6175 + }, + { + "epoch": 0.7570977917981072, + "grad_norm": 1.3356886904696732, + "learning_rate": 7.327926926158054e-05, + "loss": 0.2885, + "num_input_tokens_seen": 41159528, + "step": 6180 + }, + { + "epoch": 0.7577103304646106, + "grad_norm": 0.9812189049736046, + "learning_rate": 7.323443371703434e-05, + "loss": 0.3619, + "num_input_tokens_seen": 41192496, + "step": 6185 + }, + { + "epoch": 0.7583228691311139, + "grad_norm": 1.0858193934831015, + "learning_rate": 7.318957433194009e-05, + "loss": 0.2902, + "num_input_tokens_seen": 41226168, + "step": 6190 + }, + { + "epoch": 0.7589354077976173, + "grad_norm": 1.2708887644354112, + "learning_rate": 7.314469115232736e-05, + "loss": 0.3258, + "num_input_tokens_seen": 41258896, + "step": 6195 + }, + { + "epoch": 0.7595479464641205, + "grad_norm": 0.9930719017582388, + "learning_rate": 7.309978422425026e-05, + "loss": 0.307, + "num_input_tokens_seen": 41291792, + "step": 6200 + }, + { + "epoch": 0.7595479464641205, + "eval_loss": 0.1483473926782608, + "eval_runtime": 19.9297, + "eval_samples_per_second": 3.011, + "eval_steps_per_second": 0.753, + "num_input_tokens_seen": 41291792, + "step": 6200 + }, + { + "epoch": 0.7601604851306238, + "grad_norm": 1.8076565598626977, + "learning_rate": 7.305485359378715e-05, + "loss": 0.3005, + "num_input_tokens_seen": 41325280, + "step": 6205 + }, + { + "epoch": 0.7607730237971272, + "grad_norm": 14.154640922484367, + "learning_rate": 7.300989930704077e-05, + "loss": 0.261, + "num_input_tokens_seen": 41358808, + "step": 6210 + }, + { + "epoch": 0.7613855624636305, + "grad_norm": 1.2057406415409626, + "learning_rate": 7.296492141013816e-05, + "loss": 0.3242, + "num_input_tokens_seen": 41392496, + "step": 6215 + }, + { + "epoch": 0.7619981011301339, + "grad_norm": 3.9029022305571277, + "learning_rate": 7.29199199492305e-05, + "loss": 0.3319, + "num_input_tokens_seen": 41426512, + "step": 6220 + }, + { + "epoch": 0.7626106397966371, + "grad_norm": 1.0841226239800639, + "learning_rate": 7.287489497049324e-05, + "loss": 0.2954, + "num_input_tokens_seen": 41460136, + "step": 6225 + }, + { + "epoch": 0.7632231784631405, + "grad_norm": 15.093260522692887, + "learning_rate": 7.28298465201259e-05, + "loss": 0.3331, + "num_input_tokens_seen": 41493096, + "step": 6230 + }, + { + "epoch": 0.7638357171296438, + "grad_norm": 1.1719263889374696, + "learning_rate": 7.278477464435212e-05, + "loss": 0.3114, + "num_input_tokens_seen": 41526376, + "step": 6235 + }, + { + "epoch": 0.7644482557961472, + "grad_norm": 1.1648111353870754, + "learning_rate": 7.273967938941954e-05, + "loss": 0.2628, + "num_input_tokens_seen": 41560264, + "step": 6240 + }, + { + "epoch": 0.7650607944626504, + "grad_norm": 0.9461824698695033, + "learning_rate": 7.269456080159983e-05, + "loss": 0.2767, + "num_input_tokens_seen": 41593080, + "step": 6245 + }, + { + "epoch": 0.7656733331291538, + "grad_norm": 9.107112636022196, + "learning_rate": 7.264941892718857e-05, + "loss": 0.2854, + "num_input_tokens_seen": 41626144, + "step": 6250 + }, + { + "epoch": 0.7656733331291538, + "eval_loss": 0.08869815617799759, + "eval_runtime": 19.6358, + "eval_samples_per_second": 3.056, + "eval_steps_per_second": 0.764, + "num_input_tokens_seen": 41626144, + "step": 6250 + }, + { + "epoch": 0.7662858717956571, + "grad_norm": 0.9965854851662691, + "learning_rate": 7.260425381250526e-05, + "loss": 0.2942, + "num_input_tokens_seen": 41659576, + "step": 6255 + }, + { + "epoch": 0.7668984104621605, + "grad_norm": 1.1575991909785084, + "learning_rate": 7.255906550389321e-05, + "loss": 0.2992, + "num_input_tokens_seen": 41693032, + "step": 6260 + }, + { + "epoch": 0.7675109491286638, + "grad_norm": 2.036528384907489, + "learning_rate": 7.251385404771956e-05, + "loss": 0.265, + "num_input_tokens_seen": 41726952, + "step": 6265 + }, + { + "epoch": 0.768123487795167, + "grad_norm": 0.8648010072654181, + "learning_rate": 7.24686194903752e-05, + "loss": 0.33, + "num_input_tokens_seen": 41760288, + "step": 6270 + }, + { + "epoch": 0.7687360264616704, + "grad_norm": 0.8758703810737747, + "learning_rate": 7.242336187827472e-05, + "loss": 0.2968, + "num_input_tokens_seen": 41793904, + "step": 6275 + }, + { + "epoch": 0.7693485651281737, + "grad_norm": 0.8805544706408599, + "learning_rate": 7.237808125785635e-05, + "loss": 0.2806, + "num_input_tokens_seen": 41827464, + "step": 6280 + }, + { + "epoch": 0.7699611037946771, + "grad_norm": 0.9198794462568145, + "learning_rate": 7.233277767558194e-05, + "loss": 0.3241, + "num_input_tokens_seen": 41861128, + "step": 6285 + }, + { + "epoch": 0.7705736424611803, + "grad_norm": 1.0680481742358945, + "learning_rate": 7.228745117793692e-05, + "loss": 0.2861, + "num_input_tokens_seen": 41894512, + "step": 6290 + }, + { + "epoch": 0.7711861811276837, + "grad_norm": 1.3131839922459358, + "learning_rate": 7.224210181143022e-05, + "loss": 0.3877, + "num_input_tokens_seen": 41927648, + "step": 6295 + }, + { + "epoch": 0.771798719794187, + "grad_norm": 15.956745023528141, + "learning_rate": 7.21967296225942e-05, + "loss": 0.2569, + "num_input_tokens_seen": 41961432, + "step": 6300 + }, + { + "epoch": 0.771798719794187, + "eval_loss": 0.08859598636627197, + "eval_runtime": 19.6561, + "eval_samples_per_second": 3.052, + "eval_steps_per_second": 0.763, + "num_input_tokens_seen": 41961432, + "step": 6300 + }, + { + "epoch": 0.7724112584606904, + "grad_norm": 1.1694747757405668, + "learning_rate": 7.21513346579847e-05, + "loss": 0.3425, + "num_input_tokens_seen": 41994280, + "step": 6305 + }, + { + "epoch": 0.7730237971271936, + "grad_norm": 1.551556148569726, + "learning_rate": 7.210591696418086e-05, + "loss": 0.3086, + "num_input_tokens_seen": 42027712, + "step": 6310 + }, + { + "epoch": 0.773636335793697, + "grad_norm": 1.0001111377683787, + "learning_rate": 7.206047658778521e-05, + "loss": 0.2838, + "num_input_tokens_seen": 42060632, + "step": 6315 + }, + { + "epoch": 0.7742488744602003, + "grad_norm": 5.089688034667302, + "learning_rate": 7.201501357542352e-05, + "loss": 0.2823, + "num_input_tokens_seen": 42093888, + "step": 6320 + }, + { + "epoch": 0.7748614131267036, + "grad_norm": 1.038511133168517, + "learning_rate": 7.19695279737448e-05, + "loss": 0.3035, + "num_input_tokens_seen": 42127968, + "step": 6325 + }, + { + "epoch": 0.775473951793207, + "grad_norm": 11.91166617218219, + "learning_rate": 7.192401982942122e-05, + "loss": 0.3127, + "num_input_tokens_seen": 42161584, + "step": 6330 + }, + { + "epoch": 0.7760864904597102, + "grad_norm": 0.8798640525607245, + "learning_rate": 7.187848918914808e-05, + "loss": 0.2813, + "num_input_tokens_seen": 42195320, + "step": 6335 + }, + { + "epoch": 0.7766990291262136, + "grad_norm": 1.040745555010597, + "learning_rate": 7.18329360996438e-05, + "loss": 0.3069, + "num_input_tokens_seen": 42228600, + "step": 6340 + }, + { + "epoch": 0.7773115677927169, + "grad_norm": 18.364247317361095, + "learning_rate": 7.178736060764977e-05, + "loss": 0.3452, + "num_input_tokens_seen": 42262592, + "step": 6345 + }, + { + "epoch": 0.7779241064592203, + "grad_norm": 0.8334184792045247, + "learning_rate": 7.174176275993046e-05, + "loss": 0.2468, + "num_input_tokens_seen": 42296360, + "step": 6350 + }, + { + "epoch": 0.7779241064592203, + "eval_loss": 0.1457025557756424, + "eval_runtime": 19.1537, + "eval_samples_per_second": 3.133, + "eval_steps_per_second": 0.783, + "num_input_tokens_seen": 42296360, + "step": 6350 + }, + { + "epoch": 0.7785366451257235, + "grad_norm": 0.9277187068556979, + "learning_rate": 7.169614260327317e-05, + "loss": 0.2664, + "num_input_tokens_seen": 42330296, + "step": 6355 + }, + { + "epoch": 0.7791491837922269, + "grad_norm": 1.7077172478241494, + "learning_rate": 7.165050018448818e-05, + "loss": 0.2861, + "num_input_tokens_seen": 42364048, + "step": 6360 + }, + { + "epoch": 0.7797617224587302, + "grad_norm": 3.2919482146591736, + "learning_rate": 7.160483555040855e-05, + "loss": 0.3407, + "num_input_tokens_seen": 42397600, + "step": 6365 + }, + { + "epoch": 0.7803742611252336, + "grad_norm": 0.9392708438947994, + "learning_rate": 7.15591487478902e-05, + "loss": 0.2991, + "num_input_tokens_seen": 42430264, + "step": 6370 + }, + { + "epoch": 0.7809867997917369, + "grad_norm": 6.608995563391263, + "learning_rate": 7.151343982381172e-05, + "loss": 0.3304, + "num_input_tokens_seen": 42463064, + "step": 6375 + }, + { + "epoch": 0.7815993384582401, + "grad_norm": 1.4313155515282774, + "learning_rate": 7.146770882507448e-05, + "loss": 0.294, + "num_input_tokens_seen": 42497032, + "step": 6380 + }, + { + "epoch": 0.7822118771247435, + "grad_norm": 0.9856274405271017, + "learning_rate": 7.142195579860242e-05, + "loss": 0.3075, + "num_input_tokens_seen": 42530448, + "step": 6385 + }, + { + "epoch": 0.7828244157912468, + "grad_norm": 0.9080497365646462, + "learning_rate": 7.137618079134213e-05, + "loss": 0.2973, + "num_input_tokens_seen": 42563872, + "step": 6390 + }, + { + "epoch": 0.7834369544577502, + "grad_norm": 7.386632687812333, + "learning_rate": 7.133038385026276e-05, + "loss": 0.2532, + "num_input_tokens_seen": 42597152, + "step": 6395 + }, + { + "epoch": 0.7840494931242534, + "grad_norm": 1.0701650502350721, + "learning_rate": 7.128456502235595e-05, + "loss": 0.2472, + "num_input_tokens_seen": 42631184, + "step": 6400 + }, + { + "epoch": 0.7840494931242534, + "eval_loss": 0.12907207012176514, + "eval_runtime": 19.5863, + "eval_samples_per_second": 3.063, + "eval_steps_per_second": 0.766, + "num_input_tokens_seen": 42631184, + "step": 6400 + }, + { + "epoch": 0.7846620317907568, + "grad_norm": 0.9434100854211623, + "learning_rate": 7.123872435463577e-05, + "loss": 0.2744, + "num_input_tokens_seen": 42664864, + "step": 6405 + }, + { + "epoch": 0.7852745704572601, + "grad_norm": 1.0628555666896178, + "learning_rate": 7.119286189413877e-05, + "loss": 0.3276, + "num_input_tokens_seen": 42697720, + "step": 6410 + }, + { + "epoch": 0.7858871091237635, + "grad_norm": 1.1125602273629662, + "learning_rate": 7.114697768792378e-05, + "loss": 0.3092, + "num_input_tokens_seen": 42731136, + "step": 6415 + }, + { + "epoch": 0.7864996477902667, + "grad_norm": 0.9737340454529199, + "learning_rate": 7.110107178307205e-05, + "loss": 0.2763, + "num_input_tokens_seen": 42764792, + "step": 6420 + }, + { + "epoch": 0.7871121864567701, + "grad_norm": 0.5718475820960789, + "learning_rate": 7.105514422668694e-05, + "loss": 0.2399, + "num_input_tokens_seen": 42799336, + "step": 6425 + }, + { + "epoch": 0.7877247251232734, + "grad_norm": 0.9755145076717289, + "learning_rate": 7.100919506589418e-05, + "loss": 0.2724, + "num_input_tokens_seen": 42833272, + "step": 6430 + }, + { + "epoch": 0.7883372637897768, + "grad_norm": 1.0718247144738957, + "learning_rate": 7.096322434784155e-05, + "loss": 0.2987, + "num_input_tokens_seen": 42866792, + "step": 6435 + }, + { + "epoch": 0.7889498024562801, + "grad_norm": 0.88192416782013, + "learning_rate": 7.091723211969906e-05, + "loss": 0.2642, + "num_input_tokens_seen": 42899928, + "step": 6440 + }, + { + "epoch": 0.7895623411227833, + "grad_norm": 1.1835426442696, + "learning_rate": 7.087121842865866e-05, + "loss": 0.24, + "num_input_tokens_seen": 42934056, + "step": 6445 + }, + { + "epoch": 0.7901748797892867, + "grad_norm": 1.0671294028226728, + "learning_rate": 7.082518332193447e-05, + "loss": 0.2823, + "num_input_tokens_seen": 42967536, + "step": 6450 + }, + { + "epoch": 0.7901748797892867, + "eval_loss": 0.08559667319059372, + "eval_runtime": 18.8982, + "eval_samples_per_second": 3.175, + "eval_steps_per_second": 0.794, + "num_input_tokens_seen": 42967536, + "step": 6450 + }, + { + "epoch": 0.79078741845579, + "grad_norm": 1.1648545322468236, + "learning_rate": 7.077912684676247e-05, + "loss": 0.2871, + "num_input_tokens_seen": 43000680, + "step": 6455 + }, + { + "epoch": 0.7913999571222934, + "grad_norm": 0.9706416535796455, + "learning_rate": 7.073304905040061e-05, + "loss": 0.3005, + "num_input_tokens_seen": 43033896, + "step": 6460 + }, + { + "epoch": 0.7920124957887966, + "grad_norm": 2.405242200130845, + "learning_rate": 7.068694998012872e-05, + "loss": 0.2582, + "num_input_tokens_seen": 43067696, + "step": 6465 + }, + { + "epoch": 0.7926250344553, + "grad_norm": 0.6571189632232431, + "learning_rate": 7.064082968324846e-05, + "loss": 0.2367, + "num_input_tokens_seen": 43101232, + "step": 6470 + }, + { + "epoch": 0.7932375731218033, + "grad_norm": 0.8549133539447846, + "learning_rate": 7.059468820708324e-05, + "loss": 0.2661, + "num_input_tokens_seen": 43135256, + "step": 6475 + }, + { + "epoch": 0.7938501117883067, + "grad_norm": 1.0756402725870293, + "learning_rate": 7.054852559897822e-05, + "loss": 0.2757, + "num_input_tokens_seen": 43168880, + "step": 6480 + }, + { + "epoch": 0.7944626504548099, + "grad_norm": 0.6147290365439143, + "learning_rate": 7.050234190630028e-05, + "loss": 0.2786, + "num_input_tokens_seen": 43202464, + "step": 6485 + }, + { + "epoch": 0.7950751891213133, + "grad_norm": 1.241771943375093, + "learning_rate": 7.045613717643788e-05, + "loss": 0.2872, + "num_input_tokens_seen": 43235952, + "step": 6490 + }, + { + "epoch": 0.7956877277878166, + "grad_norm": 0.9545529262613525, + "learning_rate": 7.040991145680109e-05, + "loss": 0.3142, + "num_input_tokens_seen": 43268736, + "step": 6495 + }, + { + "epoch": 0.79630026645432, + "grad_norm": 1.0471773065978716, + "learning_rate": 7.036366479482153e-05, + "loss": 0.2839, + "num_input_tokens_seen": 43302272, + "step": 6500 + }, + { + "epoch": 0.79630026645432, + "eval_loss": 0.2148544043302536, + "eval_runtime": 19.1633, + "eval_samples_per_second": 3.131, + "eval_steps_per_second": 0.783, + "num_input_tokens_seen": 43302272, + "step": 6500 + }, + { + "epoch": 0.7969128051208233, + "grad_norm": 0.6067188566918624, + "learning_rate": 7.031739723795227e-05, + "loss": 0.2997, + "num_input_tokens_seen": 43335592, + "step": 6505 + }, + { + "epoch": 0.7975253437873265, + "grad_norm": 1.0791908641767403, + "learning_rate": 7.027110883366787e-05, + "loss": 0.2643, + "num_input_tokens_seen": 43369112, + "step": 6510 + }, + { + "epoch": 0.7981378824538299, + "grad_norm": 1.1247087962551001, + "learning_rate": 7.022479962946422e-05, + "loss": 0.2879, + "num_input_tokens_seen": 43402568, + "step": 6515 + }, + { + "epoch": 0.7987504211203332, + "grad_norm": 13.995694184540953, + "learning_rate": 7.017846967285862e-05, + "loss": 0.3314, + "num_input_tokens_seen": 43436112, + "step": 6520 + }, + { + "epoch": 0.7993629597868366, + "grad_norm": 1.062475026587463, + "learning_rate": 7.013211901138964e-05, + "loss": 0.2422, + "num_input_tokens_seen": 43470496, + "step": 6525 + }, + { + "epoch": 0.7999754984533398, + "grad_norm": 13.261509778545655, + "learning_rate": 7.008574769261704e-05, + "loss": 0.4454, + "num_input_tokens_seen": 43503760, + "step": 6530 + }, + { + "epoch": 0.8005880371198432, + "grad_norm": 1.2739242708131109, + "learning_rate": 7.003935576412185e-05, + "loss": 0.318, + "num_input_tokens_seen": 43536552, + "step": 6535 + }, + { + "epoch": 0.8012005757863465, + "grad_norm": 5.718983649466009, + "learning_rate": 6.999294327350625e-05, + "loss": 0.2918, + "num_input_tokens_seen": 43569608, + "step": 6540 + }, + { + "epoch": 0.8018131144528499, + "grad_norm": 1.573654376560078, + "learning_rate": 6.994651026839344e-05, + "loss": 0.2728, + "num_input_tokens_seen": 43603048, + "step": 6545 + }, + { + "epoch": 0.8024256531193532, + "grad_norm": 0.7823639077936713, + "learning_rate": 6.990005679642773e-05, + "loss": 0.2764, + "num_input_tokens_seen": 43636808, + "step": 6550 + }, + { + "epoch": 0.8024256531193532, + "eval_loss": 0.15134289860725403, + "eval_runtime": 19.7706, + "eval_samples_per_second": 3.035, + "eval_steps_per_second": 0.759, + "num_input_tokens_seen": 43636808, + "step": 6550 + }, + { + "epoch": 0.8030381917858564, + "grad_norm": 0.9895411538112874, + "learning_rate": 6.985358290527443e-05, + "loss": 0.2778, + "num_input_tokens_seen": 43670288, + "step": 6555 + }, + { + "epoch": 0.8036507304523598, + "grad_norm": 1.222509294501007, + "learning_rate": 6.980708864261979e-05, + "loss": 0.2343, + "num_input_tokens_seen": 43703216, + "step": 6560 + }, + { + "epoch": 0.8042632691188631, + "grad_norm": 1.0260836668799267, + "learning_rate": 6.976057405617096e-05, + "loss": 0.2788, + "num_input_tokens_seen": 43737192, + "step": 6565 + }, + { + "epoch": 0.8048758077853665, + "grad_norm": 1.3269160971238514, + "learning_rate": 6.971403919365596e-05, + "loss": 0.2827, + "num_input_tokens_seen": 43770280, + "step": 6570 + }, + { + "epoch": 0.8054883464518697, + "grad_norm": 0.8679066665476355, + "learning_rate": 6.966748410282358e-05, + "loss": 0.2085, + "num_input_tokens_seen": 43804504, + "step": 6575 + }, + { + "epoch": 0.8061008851183731, + "grad_norm": 1.09285704869787, + "learning_rate": 6.96209088314434e-05, + "loss": 0.3103, + "num_input_tokens_seen": 43837480, + "step": 6580 + }, + { + "epoch": 0.8067134237848764, + "grad_norm": 0.9643402971549988, + "learning_rate": 6.95743134273057e-05, + "loss": 0.258, + "num_input_tokens_seen": 43871144, + "step": 6585 + }, + { + "epoch": 0.8073259624513798, + "grad_norm": 1.0744914188716888, + "learning_rate": 6.95276979382214e-05, + "loss": 0.2682, + "num_input_tokens_seen": 43904576, + "step": 6590 + }, + { + "epoch": 0.807938501117883, + "grad_norm": 7.448974952923534, + "learning_rate": 6.948106241202206e-05, + "loss": 0.3714, + "num_input_tokens_seen": 43938152, + "step": 6595 + }, + { + "epoch": 0.8085510397843864, + "grad_norm": 1.4621805935974503, + "learning_rate": 6.943440689655978e-05, + "loss": 0.308, + "num_input_tokens_seen": 43971808, + "step": 6600 + }, + { + "epoch": 0.8085510397843864, + "eval_loss": 0.19454015791416168, + "eval_runtime": 19.3883, + "eval_samples_per_second": 3.095, + "eval_steps_per_second": 0.774, + "num_input_tokens_seen": 43971808, + "step": 6600 + }, + { + "epoch": 0.8091635784508897, + "grad_norm": 0.7250406554689472, + "learning_rate": 6.938773143970716e-05, + "loss": 0.322, + "num_input_tokens_seen": 44005224, + "step": 6605 + }, + { + "epoch": 0.8097761171173931, + "grad_norm": 0.9121049461393465, + "learning_rate": 6.934103608935729e-05, + "loss": 0.296, + "num_input_tokens_seen": 44037992, + "step": 6610 + }, + { + "epoch": 0.8103886557838964, + "grad_norm": 6.839539888748308, + "learning_rate": 6.929432089342365e-05, + "loss": 0.2729, + "num_input_tokens_seen": 44071872, + "step": 6615 + }, + { + "epoch": 0.8110011944503996, + "grad_norm": 2.1061123095071483, + "learning_rate": 6.92475858998401e-05, + "loss": 0.2848, + "num_input_tokens_seen": 44104968, + "step": 6620 + }, + { + "epoch": 0.811613733116903, + "grad_norm": 26.488721159817096, + "learning_rate": 6.92008311565608e-05, + "loss": 0.2987, + "num_input_tokens_seen": 44139184, + "step": 6625 + }, + { + "epoch": 0.8122262717834063, + "grad_norm": 1.126912694286578, + "learning_rate": 6.915405671156014e-05, + "loss": 0.3034, + "num_input_tokens_seen": 44172288, + "step": 6630 + }, + { + "epoch": 0.8128388104499097, + "grad_norm": 11.949507319366525, + "learning_rate": 6.910726261283283e-05, + "loss": 0.3226, + "num_input_tokens_seen": 44205384, + "step": 6635 + }, + { + "epoch": 0.8134513491164129, + "grad_norm": 0.8966370553578922, + "learning_rate": 6.906044890839365e-05, + "loss": 0.2152, + "num_input_tokens_seen": 44239384, + "step": 6640 + }, + { + "epoch": 0.8140638877829163, + "grad_norm": 0.9320105029125886, + "learning_rate": 6.901361564627753e-05, + "loss": 0.2887, + "num_input_tokens_seen": 44272696, + "step": 6645 + }, + { + "epoch": 0.8146764264494196, + "grad_norm": 2.1948798425971963, + "learning_rate": 6.896676287453948e-05, + "loss": 0.2873, + "num_input_tokens_seen": 44306360, + "step": 6650 + }, + { + "epoch": 0.8146764264494196, + "eval_loss": 0.13924284279346466, + "eval_runtime": 19.6717, + "eval_samples_per_second": 3.05, + "eval_steps_per_second": 0.763, + "num_input_tokens_seen": 44306360, + "step": 6650 + }, + { + "epoch": 0.815288965115923, + "grad_norm": 0.8735546065928086, + "learning_rate": 6.891989064125447e-05, + "loss": 0.251, + "num_input_tokens_seen": 44340600, + "step": 6655 + }, + { + "epoch": 0.8159015037824263, + "grad_norm": 0.9586282481066526, + "learning_rate": 6.887299899451754e-05, + "loss": 0.2395, + "num_input_tokens_seen": 44374144, + "step": 6660 + }, + { + "epoch": 0.8165140424489296, + "grad_norm": 0.8723790002418388, + "learning_rate": 6.882608798244358e-05, + "loss": 0.2767, + "num_input_tokens_seen": 44407408, + "step": 6665 + }, + { + "epoch": 0.8171265811154329, + "grad_norm": 1.1140198354673876, + "learning_rate": 6.877915765316735e-05, + "loss": 0.287, + "num_input_tokens_seen": 44441208, + "step": 6670 + }, + { + "epoch": 0.8177391197819363, + "grad_norm": 1.1967355904817658, + "learning_rate": 6.873220805484343e-05, + "loss": 0.2508, + "num_input_tokens_seen": 44474368, + "step": 6675 + }, + { + "epoch": 0.8183516584484396, + "grad_norm": 1.1540018292704373, + "learning_rate": 6.868523923564625e-05, + "loss": 0.2729, + "num_input_tokens_seen": 44507824, + "step": 6680 + }, + { + "epoch": 0.8189641971149428, + "grad_norm": 1.1971221927018119, + "learning_rate": 6.863825124376984e-05, + "loss": 0.2981, + "num_input_tokens_seen": 44541288, + "step": 6685 + }, + { + "epoch": 0.8195767357814462, + "grad_norm": 0.7762590312265599, + "learning_rate": 6.859124412742798e-05, + "loss": 0.228, + "num_input_tokens_seen": 44575072, + "step": 6690 + }, + { + "epoch": 0.8201892744479495, + "grad_norm": 0.9229572181274658, + "learning_rate": 6.854421793485408e-05, + "loss": 0.2697, + "num_input_tokens_seen": 44608856, + "step": 6695 + }, + { + "epoch": 0.8208018131144529, + "grad_norm": 1.031556715200953, + "learning_rate": 6.849717271430108e-05, + "loss": 0.2768, + "num_input_tokens_seen": 44641896, + "step": 6700 + }, + { + "epoch": 0.8208018131144529, + "eval_loss": 0.09934918582439423, + "eval_runtime": 19.7024, + "eval_samples_per_second": 3.045, + "eval_steps_per_second": 0.761, + "num_input_tokens_seen": 44641896, + "step": 6700 + }, + { + "epoch": 0.8214143517809561, + "grad_norm": 1.0126508904822624, + "learning_rate": 6.845010851404146e-05, + "loss": 0.2711, + "num_input_tokens_seen": 44675200, + "step": 6705 + }, + { + "epoch": 0.8220268904474595, + "grad_norm": 1.0355722804841154, + "learning_rate": 6.840302538236718e-05, + "loss": 0.2986, + "num_input_tokens_seen": 44708840, + "step": 6710 + }, + { + "epoch": 0.8226394291139628, + "grad_norm": 1.526726185162278, + "learning_rate": 6.835592336758963e-05, + "loss": 0.3317, + "num_input_tokens_seen": 44742240, + "step": 6715 + }, + { + "epoch": 0.8232519677804662, + "grad_norm": 1.1303282171630396, + "learning_rate": 6.830880251803955e-05, + "loss": 0.2862, + "num_input_tokens_seen": 44775232, + "step": 6720 + }, + { + "epoch": 0.8238645064469695, + "grad_norm": 0.6735134261563207, + "learning_rate": 6.826166288206708e-05, + "loss": 0.288, + "num_input_tokens_seen": 44809216, + "step": 6725 + }, + { + "epoch": 0.8244770451134728, + "grad_norm": 1.098202555796108, + "learning_rate": 6.821450450804154e-05, + "loss": 0.2734, + "num_input_tokens_seen": 44842656, + "step": 6730 + }, + { + "epoch": 0.8250895837799761, + "grad_norm": 1.0843331473537496, + "learning_rate": 6.816732744435153e-05, + "loss": 0.2868, + "num_input_tokens_seen": 44876144, + "step": 6735 + }, + { + "epoch": 0.8257021224464794, + "grad_norm": 0.9173826787812185, + "learning_rate": 6.812013173940482e-05, + "loss": 0.2909, + "num_input_tokens_seen": 44908984, + "step": 6740 + }, + { + "epoch": 0.8263146611129828, + "grad_norm": 0.6329427895003542, + "learning_rate": 6.807291744162829e-05, + "loss": 0.3024, + "num_input_tokens_seen": 44942704, + "step": 6745 + }, + { + "epoch": 0.826927199779486, + "grad_norm": 1.1560924794533454, + "learning_rate": 6.802568459946792e-05, + "loss": 0.2599, + "num_input_tokens_seen": 44976072, + "step": 6750 + }, + { + "epoch": 0.826927199779486, + "eval_loss": 0.1032017320394516, + "eval_runtime": 19.6562, + "eval_samples_per_second": 3.052, + "eval_steps_per_second": 0.763, + "num_input_tokens_seen": 44976072, + "step": 6750 + }, + { + "epoch": 0.8275397384459894, + "grad_norm": 1.0121114260230626, + "learning_rate": 6.797843326138871e-05, + "loss": 0.2842, + "num_input_tokens_seen": 45009792, + "step": 6755 + }, + { + "epoch": 0.8281522771124927, + "grad_norm": 1.4153009688799005, + "learning_rate": 6.793116347587467e-05, + "loss": 0.2743, + "num_input_tokens_seen": 45043328, + "step": 6760 + }, + { + "epoch": 0.8287648157789961, + "grad_norm": 0.8904035968406322, + "learning_rate": 6.788387529142865e-05, + "loss": 0.3115, + "num_input_tokens_seen": 45076920, + "step": 6765 + }, + { + "epoch": 0.8293773544454994, + "grad_norm": 1.1063456667777443, + "learning_rate": 6.783656875657248e-05, + "loss": 0.2929, + "num_input_tokens_seen": 45110256, + "step": 6770 + }, + { + "epoch": 0.8299898931120027, + "grad_norm": 13.298705276643853, + "learning_rate": 6.778924391984676e-05, + "loss": 0.3656, + "num_input_tokens_seen": 45143792, + "step": 6775 + }, + { + "epoch": 0.830602431778506, + "grad_norm": 14.198843623538776, + "learning_rate": 6.774190082981089e-05, + "loss": 0.261, + "num_input_tokens_seen": 45178448, + "step": 6780 + }, + { + "epoch": 0.8312149704450094, + "grad_norm": 7.7134142202800575, + "learning_rate": 6.7694539535043e-05, + "loss": 0.3413, + "num_input_tokens_seen": 45211664, + "step": 6785 + }, + { + "epoch": 0.8318275091115127, + "grad_norm": 1.3042127972636788, + "learning_rate": 6.764716008413988e-05, + "loss": 0.2459, + "num_input_tokens_seen": 45245336, + "step": 6790 + }, + { + "epoch": 0.832440047778016, + "grad_norm": 1.1466807650290998, + "learning_rate": 6.759976252571698e-05, + "loss": 0.2311, + "num_input_tokens_seen": 45279552, + "step": 6795 + }, + { + "epoch": 0.8330525864445193, + "grad_norm": 0.9239389319474149, + "learning_rate": 6.755234690840828e-05, + "loss": 0.2864, + "num_input_tokens_seen": 45313600, + "step": 6800 + }, + { + "epoch": 0.8330525864445193, + "eval_loss": 0.11257417500019073, + "eval_runtime": 19.419, + "eval_samples_per_second": 3.09, + "eval_steps_per_second": 0.772, + "num_input_tokens_seen": 45313600, + "step": 6800 + }, + { + "epoch": 0.8336651251110226, + "grad_norm": 0.871577915238052, + "learning_rate": 6.750491328086636e-05, + "loss": 0.2353, + "num_input_tokens_seen": 45346792, + "step": 6805 + }, + { + "epoch": 0.834277663777526, + "grad_norm": 1.331290142361264, + "learning_rate": 6.745746169176224e-05, + "loss": 0.2854, + "num_input_tokens_seen": 45380272, + "step": 6810 + }, + { + "epoch": 0.8348902024440292, + "grad_norm": 29.43497394469696, + "learning_rate": 6.740999218978536e-05, + "loss": 0.3025, + "num_input_tokens_seen": 45413232, + "step": 6815 + }, + { + "epoch": 0.8355027411105326, + "grad_norm": 1.4408517029655037, + "learning_rate": 6.736250482364356e-05, + "loss": 0.3557, + "num_input_tokens_seen": 45446816, + "step": 6820 + }, + { + "epoch": 0.8361152797770359, + "grad_norm": 3.6555431582373252, + "learning_rate": 6.731499964206301e-05, + "loss": 0.287, + "num_input_tokens_seen": 45480432, + "step": 6825 + }, + { + "epoch": 0.8367278184435393, + "grad_norm": 1.0625615126937737, + "learning_rate": 6.726747669378815e-05, + "loss": 0.2686, + "num_input_tokens_seen": 45514408, + "step": 6830 + }, + { + "epoch": 0.8373403571100426, + "grad_norm": 1.3754936812181364, + "learning_rate": 6.721993602758164e-05, + "loss": 0.3115, + "num_input_tokens_seen": 45547120, + "step": 6835 + }, + { + "epoch": 0.8379528957765459, + "grad_norm": 1.1206316282101074, + "learning_rate": 6.717237769222437e-05, + "loss": 0.349, + "num_input_tokens_seen": 45580944, + "step": 6840 + }, + { + "epoch": 0.8385654344430492, + "grad_norm": 1.4402206324042128, + "learning_rate": 6.712480173651527e-05, + "loss": 0.3055, + "num_input_tokens_seen": 45614296, + "step": 6845 + }, + { + "epoch": 0.8391779731095526, + "grad_norm": 0.7875532336645297, + "learning_rate": 6.707720820927146e-05, + "loss": 0.3468, + "num_input_tokens_seen": 45647976, + "step": 6850 + }, + { + "epoch": 0.8391779731095526, + "eval_loss": 0.11305823177099228, + "eval_runtime": 19.3243, + "eval_samples_per_second": 3.105, + "eval_steps_per_second": 0.776, + "num_input_tokens_seen": 45647976, + "step": 6850 + }, + { + "epoch": 0.8397905117760559, + "grad_norm": 1.1998747442251176, + "learning_rate": 6.702959715932802e-05, + "loss": 0.3153, + "num_input_tokens_seen": 45680992, + "step": 6855 + }, + { + "epoch": 0.8404030504425591, + "grad_norm": 1.0220005457718246, + "learning_rate": 6.698196863553799e-05, + "loss": 0.3125, + "num_input_tokens_seen": 45714544, + "step": 6860 + }, + { + "epoch": 0.8410155891090625, + "grad_norm": 7.209802926591873, + "learning_rate": 6.693432268677244e-05, + "loss": 0.2774, + "num_input_tokens_seen": 45748488, + "step": 6865 + }, + { + "epoch": 0.8416281277755658, + "grad_norm": 0.8039380507752786, + "learning_rate": 6.688665936192019e-05, + "loss": 0.2185, + "num_input_tokens_seen": 45782288, + "step": 6870 + }, + { + "epoch": 0.8422406664420692, + "grad_norm": 0.8009868668620282, + "learning_rate": 6.6838978709888e-05, + "loss": 0.2745, + "num_input_tokens_seen": 45815728, + "step": 6875 + }, + { + "epoch": 0.8428532051085725, + "grad_norm": 6.813761185157568, + "learning_rate": 6.679128077960031e-05, + "loss": 0.2921, + "num_input_tokens_seen": 45849616, + "step": 6880 + }, + { + "epoch": 0.8434657437750758, + "grad_norm": 1.1348431419455682, + "learning_rate": 6.67435656199994e-05, + "loss": 0.2403, + "num_input_tokens_seen": 45883224, + "step": 6885 + }, + { + "epoch": 0.8440782824415791, + "grad_norm": 2.2444705113669268, + "learning_rate": 6.669583328004511e-05, + "loss": 0.3004, + "num_input_tokens_seen": 45916512, + "step": 6890 + }, + { + "epoch": 0.8446908211080825, + "grad_norm": 1.2486811576659174, + "learning_rate": 6.664808380871503e-05, + "loss": 0.2926, + "num_input_tokens_seen": 45950112, + "step": 6895 + }, + { + "epoch": 0.8453033597745858, + "grad_norm": 1.934707674277018, + "learning_rate": 6.660031725500424e-05, + "loss": 0.2933, + "num_input_tokens_seen": 45983624, + "step": 6900 + }, + { + "epoch": 0.8453033597745858, + "eval_loss": 0.048528868705034256, + "eval_runtime": 19.9826, + "eval_samples_per_second": 3.003, + "eval_steps_per_second": 0.751, + "num_input_tokens_seen": 45983624, + "step": 6900 + }, + { + "epoch": 0.845915898441089, + "grad_norm": 8.507628157896342, + "learning_rate": 6.655253366792537e-05, + "loss": 0.3146, + "num_input_tokens_seen": 46017008, + "step": 6905 + }, + { + "epoch": 0.8465284371075924, + "grad_norm": 0.6943269887753836, + "learning_rate": 6.650473309650855e-05, + "loss": 0.2708, + "num_input_tokens_seen": 46050160, + "step": 6910 + }, + { + "epoch": 0.8471409757740958, + "grad_norm": 1.1453763425921433, + "learning_rate": 6.645691558980127e-05, + "loss": 0.2725, + "num_input_tokens_seen": 46083872, + "step": 6915 + }, + { + "epoch": 0.8477535144405991, + "grad_norm": 1.0824602087286639, + "learning_rate": 6.640908119686852e-05, + "loss": 0.3157, + "num_input_tokens_seen": 46117064, + "step": 6920 + }, + { + "epoch": 0.8483660531071023, + "grad_norm": 1.4439561692735352, + "learning_rate": 6.636122996679249e-05, + "loss": 0.3113, + "num_input_tokens_seen": 46150520, + "step": 6925 + }, + { + "epoch": 0.8489785917736057, + "grad_norm": 0.9368877994870469, + "learning_rate": 6.631336194867271e-05, + "loss": 0.2832, + "num_input_tokens_seen": 46183832, + "step": 6930 + }, + { + "epoch": 0.849591130440109, + "grad_norm": 1.425566395186628, + "learning_rate": 6.626547719162595e-05, + "loss": 0.29, + "num_input_tokens_seen": 46217064, + "step": 6935 + }, + { + "epoch": 0.8502036691066124, + "grad_norm": 1.3263910381467745, + "learning_rate": 6.621757574478611e-05, + "loss": 0.2726, + "num_input_tokens_seen": 46250464, + "step": 6940 + }, + { + "epoch": 0.8508162077731157, + "grad_norm": 0.7778656583021529, + "learning_rate": 6.616965765730426e-05, + "loss": 0.2943, + "num_input_tokens_seen": 46284400, + "step": 6945 + }, + { + "epoch": 0.851428746439619, + "grad_norm": 1.0029998664252922, + "learning_rate": 6.612172297834849e-05, + "loss": 0.3528, + "num_input_tokens_seen": 46317728, + "step": 6950 + }, + { + "epoch": 0.851428746439619, + "eval_loss": 0.05446135997772217, + "eval_runtime": 19.8454, + "eval_samples_per_second": 3.023, + "eval_steps_per_second": 0.756, + "num_input_tokens_seen": 46317728, + "step": 6950 + }, + { + "epoch": 0.8520412851061223, + "grad_norm": 12.061194184304284, + "learning_rate": 6.607377175710396e-05, + "loss": 0.2848, + "num_input_tokens_seen": 46351912, + "step": 6955 + }, + { + "epoch": 0.8526538237726257, + "grad_norm": 1.0734269402474226, + "learning_rate": 6.60258040427728e-05, + "loss": 0.2825, + "num_input_tokens_seen": 46385576, + "step": 6960 + }, + { + "epoch": 0.853266362439129, + "grad_norm": 0.9729983697743235, + "learning_rate": 6.597781988457405e-05, + "loss": 0.2624, + "num_input_tokens_seen": 46418776, + "step": 6965 + }, + { + "epoch": 0.8538789011056322, + "grad_norm": 2.049104440649758, + "learning_rate": 6.59298193317436e-05, + "loss": 0.3017, + "num_input_tokens_seen": 46452688, + "step": 6970 + }, + { + "epoch": 0.8544914397721356, + "grad_norm": 0.9142758573822131, + "learning_rate": 6.588180243353424e-05, + "loss": 0.2531, + "num_input_tokens_seen": 46485920, + "step": 6975 + }, + { + "epoch": 0.855103978438639, + "grad_norm": 0.6790750371611499, + "learning_rate": 6.583376923921542e-05, + "loss": 0.2594, + "num_input_tokens_seen": 46520152, + "step": 6980 + }, + { + "epoch": 0.8557165171051423, + "grad_norm": 0.5065767485475414, + "learning_rate": 6.578571979807342e-05, + "loss": 0.2598, + "num_input_tokens_seen": 46553776, + "step": 6985 + }, + { + "epoch": 0.8563290557716455, + "grad_norm": 1.0371814329275, + "learning_rate": 6.573765415941113e-05, + "loss": 0.2809, + "num_input_tokens_seen": 46586840, + "step": 6990 + }, + { + "epoch": 0.8569415944381489, + "grad_norm": 9.224118632867627, + "learning_rate": 6.568957237254805e-05, + "loss": 0.3272, + "num_input_tokens_seen": 46620216, + "step": 6995 + }, + { + "epoch": 0.8575541331046522, + "grad_norm": 9.11467110778075, + "learning_rate": 6.564147448682028e-05, + "loss": 0.2581, + "num_input_tokens_seen": 46653728, + "step": 7000 + }, + { + "epoch": 0.8575541331046522, + "eval_loss": 0.1109340488910675, + "eval_runtime": 19.8108, + "eval_samples_per_second": 3.029, + "eval_steps_per_second": 0.757, + "num_input_tokens_seen": 46653728, + "step": 7000 + }, + { + "epoch": 0.8581666717711556, + "grad_norm": 1.0291954093881615, + "learning_rate": 6.559336055158045e-05, + "loss": 0.2989, + "num_input_tokens_seen": 46687312, + "step": 7005 + }, + { + "epoch": 0.8587792104376589, + "grad_norm": 0.8937611574933138, + "learning_rate": 6.554523061619764e-05, + "loss": 0.2734, + "num_input_tokens_seen": 46721712, + "step": 7010 + }, + { + "epoch": 0.8593917491041622, + "grad_norm": 1.0674448886802093, + "learning_rate": 6.549708473005731e-05, + "loss": 0.2504, + "num_input_tokens_seen": 46754840, + "step": 7015 + }, + { + "epoch": 0.8600042877706655, + "grad_norm": 6.758944156939381, + "learning_rate": 6.544892294256135e-05, + "loss": 0.2449, + "num_input_tokens_seen": 46788624, + "step": 7020 + }, + { + "epoch": 0.8606168264371689, + "grad_norm": 1.059898103262488, + "learning_rate": 6.540074530312795e-05, + "loss": 0.288, + "num_input_tokens_seen": 46821304, + "step": 7025 + }, + { + "epoch": 0.8612293651036722, + "grad_norm": 0.7587869297866502, + "learning_rate": 6.535255186119153e-05, + "loss": 0.2344, + "num_input_tokens_seen": 46855192, + "step": 7030 + }, + { + "epoch": 0.8618419037701754, + "grad_norm": 1.1700617755977196, + "learning_rate": 6.530434266620277e-05, + "loss": 0.2126, + "num_input_tokens_seen": 46889632, + "step": 7035 + }, + { + "epoch": 0.8624544424366788, + "grad_norm": 1.1392429727316147, + "learning_rate": 6.525611776762845e-05, + "loss": 0.3162, + "num_input_tokens_seen": 46923080, + "step": 7040 + }, + { + "epoch": 0.8630669811031821, + "grad_norm": 1.2506010418601048, + "learning_rate": 6.520787721495157e-05, + "loss": 0.3463, + "num_input_tokens_seen": 46956576, + "step": 7045 + }, + { + "epoch": 0.8636795197696855, + "grad_norm": 0.8924865330695361, + "learning_rate": 6.515962105767109e-05, + "loss": 0.2281, + "num_input_tokens_seen": 46990264, + "step": 7050 + }, + { + "epoch": 0.8636795197696855, + "eval_loss": 0.09453262388706207, + "eval_runtime": 19.5403, + "eval_samples_per_second": 3.071, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 46990264, + "step": 7050 + }, + { + "epoch": 0.8642920584361888, + "grad_norm": 3.269768453499626, + "learning_rate": 6.511134934530203e-05, + "loss": 0.2473, + "num_input_tokens_seen": 47024144, + "step": 7055 + }, + { + "epoch": 0.8649045971026921, + "grad_norm": 1.7942527290492862, + "learning_rate": 6.506306212737536e-05, + "loss": 0.25, + "num_input_tokens_seen": 47058016, + "step": 7060 + }, + { + "epoch": 0.8655171357691954, + "grad_norm": 1.3700575138206175, + "learning_rate": 6.501475945343796e-05, + "loss": 0.296, + "num_input_tokens_seen": 47091352, + "step": 7065 + }, + { + "epoch": 0.8661296744356988, + "grad_norm": 1.2570124796830562, + "learning_rate": 6.496644137305258e-05, + "loss": 0.3165, + "num_input_tokens_seen": 47124248, + "step": 7070 + }, + { + "epoch": 0.8667422131022021, + "grad_norm": 1.0382181635731613, + "learning_rate": 6.491810793579777e-05, + "loss": 0.2932, + "num_input_tokens_seen": 47157576, + "step": 7075 + }, + { + "epoch": 0.8673547517687054, + "grad_norm": 0.9228982101815008, + "learning_rate": 6.486975919126783e-05, + "loss": 0.2516, + "num_input_tokens_seen": 47191296, + "step": 7080 + }, + { + "epoch": 0.8679672904352087, + "grad_norm": 1.2184899316274584, + "learning_rate": 6.482139518907276e-05, + "loss": 0.2734, + "num_input_tokens_seen": 47225208, + "step": 7085 + }, + { + "epoch": 0.868579829101712, + "grad_norm": 0.7821015487275793, + "learning_rate": 6.477301597883823e-05, + "loss": 0.2692, + "num_input_tokens_seen": 47258616, + "step": 7090 + }, + { + "epoch": 0.8691923677682154, + "grad_norm": 0.9498024443280475, + "learning_rate": 6.472462161020556e-05, + "loss": 0.2194, + "num_input_tokens_seen": 47292712, + "step": 7095 + }, + { + "epoch": 0.8698049064347186, + "grad_norm": 0.7928230456703819, + "learning_rate": 6.467621213283151e-05, + "loss": 0.3067, + "num_input_tokens_seen": 47326520, + "step": 7100 + }, + { + "epoch": 0.8698049064347186, + "eval_loss": 0.1614220291376114, + "eval_runtime": 19.3557, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.775, + "num_input_tokens_seen": 47326520, + "step": 7100 + }, + { + "epoch": 0.870417445101222, + "grad_norm": 13.72365041812005, + "learning_rate": 6.462778759638846e-05, + "loss": 0.3365, + "num_input_tokens_seen": 47359416, + "step": 7105 + }, + { + "epoch": 0.8710299837677253, + "grad_norm": 1.22384255853906, + "learning_rate": 6.457934805056419e-05, + "loss": 0.2462, + "num_input_tokens_seen": 47393568, + "step": 7110 + }, + { + "epoch": 0.8716425224342287, + "grad_norm": 1.0725373713008104, + "learning_rate": 6.453089354506187e-05, + "loss": 0.2549, + "num_input_tokens_seen": 47426776, + "step": 7115 + }, + { + "epoch": 0.872255061100732, + "grad_norm": 0.9150624121184844, + "learning_rate": 6.448242412960002e-05, + "loss": 0.2853, + "num_input_tokens_seen": 47460136, + "step": 7120 + }, + { + "epoch": 0.8728675997672353, + "grad_norm": 1.123940384407468, + "learning_rate": 6.443393985391248e-05, + "loss": 0.2791, + "num_input_tokens_seen": 47493496, + "step": 7125 + }, + { + "epoch": 0.8734801384337386, + "grad_norm": 43.50933100117759, + "learning_rate": 6.438544076774835e-05, + "loss": 0.2899, + "num_input_tokens_seen": 47526760, + "step": 7130 + }, + { + "epoch": 0.874092677100242, + "grad_norm": 1.6806174564596705, + "learning_rate": 6.433692692087189e-05, + "loss": 0.2632, + "num_input_tokens_seen": 47560312, + "step": 7135 + }, + { + "epoch": 0.8747052157667453, + "grad_norm": 0.5726995902061214, + "learning_rate": 6.428839836306253e-05, + "loss": 0.3208, + "num_input_tokens_seen": 47593960, + "step": 7140 + }, + { + "epoch": 0.8753177544332486, + "grad_norm": 1.0893755340416726, + "learning_rate": 6.423985514411477e-05, + "loss": 0.2891, + "num_input_tokens_seen": 47627816, + "step": 7145 + }, + { + "epoch": 0.8759302930997519, + "grad_norm": 2.96678469353664, + "learning_rate": 6.419129731383822e-05, + "loss": 0.3548, + "num_input_tokens_seen": 47661416, + "step": 7150 + }, + { + "epoch": 0.8759302930997519, + "eval_loss": 0.1315048336982727, + "eval_runtime": 19.1418, + "eval_samples_per_second": 3.134, + "eval_steps_per_second": 0.784, + "num_input_tokens_seen": 47661416, + "step": 7150 + }, + { + "epoch": 0.8765428317662552, + "grad_norm": 17.05741246442364, + "learning_rate": 6.414272492205736e-05, + "loss": 0.3215, + "num_input_tokens_seen": 47694568, + "step": 7155 + }, + { + "epoch": 0.8771553704327586, + "grad_norm": 0.9913807754497348, + "learning_rate": 6.409413801861174e-05, + "loss": 0.2936, + "num_input_tokens_seen": 47727576, + "step": 7160 + }, + { + "epoch": 0.877767909099262, + "grad_norm": 1.1287539237724211, + "learning_rate": 6.40455366533557e-05, + "loss": 0.2344, + "num_input_tokens_seen": 47760752, + "step": 7165 + }, + { + "epoch": 0.8783804477657652, + "grad_norm": 1.328072713697259, + "learning_rate": 6.399692087615853e-05, + "loss": 0.2927, + "num_input_tokens_seen": 47794544, + "step": 7170 + }, + { + "epoch": 0.8789929864322685, + "grad_norm": 5.943824475492084, + "learning_rate": 6.394829073690419e-05, + "loss": 0.2445, + "num_input_tokens_seen": 47828872, + "step": 7175 + }, + { + "epoch": 0.8796055250987719, + "grad_norm": 1.433256670713281, + "learning_rate": 6.389964628549144e-05, + "loss": 0.2414, + "num_input_tokens_seen": 47862896, + "step": 7180 + }, + { + "epoch": 0.8802180637652752, + "grad_norm": 33.12289193610119, + "learning_rate": 6.385098757183373e-05, + "loss": 0.3247, + "num_input_tokens_seen": 47895992, + "step": 7185 + }, + { + "epoch": 0.8808306024317785, + "grad_norm": 0.8807393027951026, + "learning_rate": 6.38023146458591e-05, + "loss": 0.2542, + "num_input_tokens_seen": 47929568, + "step": 7190 + }, + { + "epoch": 0.8814431410982818, + "grad_norm": 1.017627140240776, + "learning_rate": 6.375362755751024e-05, + "loss": 0.2198, + "num_input_tokens_seen": 47964048, + "step": 7195 + }, + { + "epoch": 0.8820556797647852, + "grad_norm": 0.7007502605922444, + "learning_rate": 6.37049263567443e-05, + "loss": 0.2941, + "num_input_tokens_seen": 47997384, + "step": 7200 + }, + { + "epoch": 0.8820556797647852, + "eval_loss": 0.16184844076633453, + "eval_runtime": 19.4054, + "eval_samples_per_second": 3.092, + "eval_steps_per_second": 0.773, + "num_input_tokens_seen": 47997384, + "step": 7200 + }, + { + "epoch": 0.8826682184312885, + "grad_norm": 0.9912517289806405, + "learning_rate": 6.365621109353294e-05, + "loss": 0.3223, + "num_input_tokens_seen": 48030784, + "step": 7205 + }, + { + "epoch": 0.8832807570977917, + "grad_norm": 13.301726093296034, + "learning_rate": 6.360748181786229e-05, + "loss": 0.3319, + "num_input_tokens_seen": 48064664, + "step": 7210 + }, + { + "epoch": 0.8838932957642951, + "grad_norm": 1.5434725253235835, + "learning_rate": 6.35587385797328e-05, + "loss": 0.3325, + "num_input_tokens_seen": 48097784, + "step": 7215 + }, + { + "epoch": 0.8845058344307984, + "grad_norm": 1.3053645825267322, + "learning_rate": 6.350998142915927e-05, + "loss": 0.2594, + "num_input_tokens_seen": 48131872, + "step": 7220 + }, + { + "epoch": 0.8851183730973018, + "grad_norm": 1.066962520110482, + "learning_rate": 6.346121041617076e-05, + "loss": 0.2343, + "num_input_tokens_seen": 48165824, + "step": 7225 + }, + { + "epoch": 0.8857309117638051, + "grad_norm": 1.1480021049864335, + "learning_rate": 6.34124255908106e-05, + "loss": 0.2769, + "num_input_tokens_seen": 48198496, + "step": 7230 + }, + { + "epoch": 0.8863434504303084, + "grad_norm": 1.0882671053694362, + "learning_rate": 6.336362700313622e-05, + "loss": 0.2866, + "num_input_tokens_seen": 48232376, + "step": 7235 + }, + { + "epoch": 0.8869559890968117, + "grad_norm": 1.155928741097429, + "learning_rate": 6.331481470321923e-05, + "loss": 0.2083, + "num_input_tokens_seen": 48266576, + "step": 7240 + }, + { + "epoch": 0.8875685277633151, + "grad_norm": 1.4896289998620236, + "learning_rate": 6.32659887411453e-05, + "loss": 0.2569, + "num_input_tokens_seen": 48300576, + "step": 7245 + }, + { + "epoch": 0.8881810664298184, + "grad_norm": 1.0297582317504255, + "learning_rate": 6.321714916701411e-05, + "loss": 0.3044, + "num_input_tokens_seen": 48333536, + "step": 7250 + }, + { + "epoch": 0.8881810664298184, + "eval_loss": 0.10158420354127884, + "eval_runtime": 19.595, + "eval_samples_per_second": 3.062, + "eval_steps_per_second": 0.765, + "num_input_tokens_seen": 48333536, + "step": 7250 + }, + { + "epoch": 0.8887936050963217, + "grad_norm": 1.0191090656349635, + "learning_rate": 6.316829603093931e-05, + "loss": 0.3441, + "num_input_tokens_seen": 48366624, + "step": 7255 + }, + { + "epoch": 0.889406143762825, + "grad_norm": 1.0220610500785992, + "learning_rate": 6.311942938304842e-05, + "loss": 0.2336, + "num_input_tokens_seen": 48400456, + "step": 7260 + }, + { + "epoch": 0.8900186824293284, + "grad_norm": 1.0919002823075137, + "learning_rate": 6.307054927348293e-05, + "loss": 0.2426, + "num_input_tokens_seen": 48434648, + "step": 7265 + }, + { + "epoch": 0.8906312210958317, + "grad_norm": 4.275493554789937, + "learning_rate": 6.302165575239804e-05, + "loss": 0.3274, + "num_input_tokens_seen": 48468824, + "step": 7270 + }, + { + "epoch": 0.891243759762335, + "grad_norm": 0.9075552375409077, + "learning_rate": 6.297274886996278e-05, + "loss": 0.3144, + "num_input_tokens_seen": 48502576, + "step": 7275 + }, + { + "epoch": 0.8918562984288383, + "grad_norm": 1.555328116639126, + "learning_rate": 6.292382867635981e-05, + "loss": 0.2943, + "num_input_tokens_seen": 48536224, + "step": 7280 + }, + { + "epoch": 0.8924688370953416, + "grad_norm": 0.7714558684058379, + "learning_rate": 6.287489522178556e-05, + "loss": 0.2551, + "num_input_tokens_seen": 48570024, + "step": 7285 + }, + { + "epoch": 0.893081375761845, + "grad_norm": 0.840562204049008, + "learning_rate": 6.282594855644995e-05, + "loss": 0.2862, + "num_input_tokens_seen": 48603472, + "step": 7290 + }, + { + "epoch": 0.8936939144283483, + "grad_norm": 1.1974350291348024, + "learning_rate": 6.277698873057658e-05, + "loss": 0.2366, + "num_input_tokens_seen": 48637176, + "step": 7295 + }, + { + "epoch": 0.8943064530948516, + "grad_norm": 11.786198188509974, + "learning_rate": 6.272801579440243e-05, + "loss": 0.3151, + "num_input_tokens_seen": 48670648, + "step": 7300 + }, + { + "epoch": 0.8943064530948516, + "eval_loss": 0.07981760799884796, + "eval_runtime": 19.3588, + "eval_samples_per_second": 3.099, + "eval_steps_per_second": 0.775, + "num_input_tokens_seen": 48670648, + "step": 7300 + }, + { + "epoch": 0.8949189917613549, + "grad_norm": 4.479282392747499, + "learning_rate": 6.2679029798178e-05, + "loss": 0.2829, + "num_input_tokens_seen": 48704528, + "step": 7305 + }, + { + "epoch": 0.8955315304278583, + "grad_norm": 1.1638127432184913, + "learning_rate": 6.263003079216721e-05, + "loss": 0.2833, + "num_input_tokens_seen": 48738256, + "step": 7310 + }, + { + "epoch": 0.8961440690943616, + "grad_norm": 23.884813839553683, + "learning_rate": 6.258101882664725e-05, + "loss": 0.317, + "num_input_tokens_seen": 48771552, + "step": 7315 + }, + { + "epoch": 0.8967566077608649, + "grad_norm": 0.9299894322909548, + "learning_rate": 6.25319939519087e-05, + "loss": 0.2923, + "num_input_tokens_seen": 48805200, + "step": 7320 + }, + { + "epoch": 0.8973691464273682, + "grad_norm": 1.1742477393412885, + "learning_rate": 6.24829562182553e-05, + "loss": 0.3079, + "num_input_tokens_seen": 48839192, + "step": 7325 + }, + { + "epoch": 0.8979816850938716, + "grad_norm": 1.1463581026052865, + "learning_rate": 6.243390567600406e-05, + "loss": 0.3042, + "num_input_tokens_seen": 48872152, + "step": 7330 + }, + { + "epoch": 0.8985942237603749, + "grad_norm": 0.8496322911573401, + "learning_rate": 6.238484237548509e-05, + "loss": 0.2434, + "num_input_tokens_seen": 48905568, + "step": 7335 + }, + { + "epoch": 0.8992067624268782, + "grad_norm": 0.7747787445541563, + "learning_rate": 6.233576636704158e-05, + "loss": 0.238, + "num_input_tokens_seen": 48939632, + "step": 7340 + }, + { + "epoch": 0.8998193010933815, + "grad_norm": 7.237612728871367, + "learning_rate": 6.228667770102982e-05, + "loss": 0.2288, + "num_input_tokens_seen": 48973952, + "step": 7345 + }, + { + "epoch": 0.9004318397598848, + "grad_norm": 1.2018472942948732, + "learning_rate": 6.2237576427819e-05, + "loss": 0.2531, + "num_input_tokens_seen": 49007448, + "step": 7350 + }, + { + "epoch": 0.9004318397598848, + "eval_loss": 0.19394417107105255, + "eval_runtime": 19.5715, + "eval_samples_per_second": 3.066, + "eval_steps_per_second": 0.766, + "num_input_tokens_seen": 49007448, + "step": 7350 + }, + { + "epoch": 0.9010443784263882, + "grad_norm": 21.168554993831524, + "learning_rate": 6.218846259779133e-05, + "loss": 0.2663, + "num_input_tokens_seen": 49041288, + "step": 7355 + }, + { + "epoch": 0.9016569170928915, + "grad_norm": 1.016178587270193, + "learning_rate": 6.213933626134185e-05, + "loss": 0.2916, + "num_input_tokens_seen": 49074616, + "step": 7360 + }, + { + "epoch": 0.9022694557593948, + "grad_norm": 0.9038250095796421, + "learning_rate": 6.209019746887846e-05, + "loss": 0.2928, + "num_input_tokens_seen": 49107784, + "step": 7365 + }, + { + "epoch": 0.9028819944258981, + "grad_norm": 0.6413064984151202, + "learning_rate": 6.204104627082181e-05, + "loss": 0.2488, + "num_input_tokens_seen": 49141360, + "step": 7370 + }, + { + "epoch": 0.9034945330924015, + "grad_norm": 0.9398093061435702, + "learning_rate": 6.199188271760532e-05, + "loss": 0.2659, + "num_input_tokens_seen": 49174824, + "step": 7375 + }, + { + "epoch": 0.9041070717589048, + "grad_norm": 1.4809642153047842, + "learning_rate": 6.194270685967507e-05, + "loss": 0.3333, + "num_input_tokens_seen": 49208608, + "step": 7380 + }, + { + "epoch": 0.904719610425408, + "grad_norm": 1.2302046531642614, + "learning_rate": 6.189351874748976e-05, + "loss": 0.2677, + "num_input_tokens_seen": 49242616, + "step": 7385 + }, + { + "epoch": 0.9053321490919114, + "grad_norm": 40.24874376771697, + "learning_rate": 6.184431843152065e-05, + "loss": 0.2366, + "num_input_tokens_seen": 49277400, + "step": 7390 + }, + { + "epoch": 0.9059446877584147, + "grad_norm": 0.929476212331941, + "learning_rate": 6.179510596225157e-05, + "loss": 0.2633, + "num_input_tokens_seen": 49310528, + "step": 7395 + }, + { + "epoch": 0.9065572264249181, + "grad_norm": 0.8865829608188713, + "learning_rate": 6.174588139017878e-05, + "loss": 0.2813, + "num_input_tokens_seen": 49343416, + "step": 7400 + }, + { + "epoch": 0.9065572264249181, + "eval_loss": 0.12092752754688263, + "eval_runtime": 19.4054, + "eval_samples_per_second": 3.092, + "eval_steps_per_second": 0.773, + "num_input_tokens_seen": 49343416, + "step": 7400 + }, + { + "epoch": 0.9071697650914214, + "grad_norm": 2.0964567690544333, + "learning_rate": 6.169664476581095e-05, + "loss": 0.2468, + "num_input_tokens_seen": 49377224, + "step": 7405 + }, + { + "epoch": 0.9077823037579247, + "grad_norm": 1.1620649535004972, + "learning_rate": 6.164739613966915e-05, + "loss": 0.2945, + "num_input_tokens_seen": 49410368, + "step": 7410 + }, + { + "epoch": 0.908394842424428, + "grad_norm": 2.6588374773940644, + "learning_rate": 6.159813556228674e-05, + "loss": 0.261, + "num_input_tokens_seen": 49444360, + "step": 7415 + }, + { + "epoch": 0.9090073810909314, + "grad_norm": 1.2503582016545145, + "learning_rate": 6.154886308420937e-05, + "loss": 0.2423, + "num_input_tokens_seen": 49478040, + "step": 7420 + }, + { + "epoch": 0.9096199197574347, + "grad_norm": 1.369005811222348, + "learning_rate": 6.149957875599489e-05, + "loss": 0.3007, + "num_input_tokens_seen": 49512288, + "step": 7425 + }, + { + "epoch": 0.910232458423938, + "grad_norm": 1.3030450300984429, + "learning_rate": 6.145028262821328e-05, + "loss": 0.3359, + "num_input_tokens_seen": 49545200, + "step": 7430 + }, + { + "epoch": 0.9108449970904413, + "grad_norm": 6.542523553390861, + "learning_rate": 6.140097475144666e-05, + "loss": 0.2323, + "num_input_tokens_seen": 49579520, + "step": 7435 + }, + { + "epoch": 0.9114575357569447, + "grad_norm": 0.8866084833902607, + "learning_rate": 6.135165517628919e-05, + "loss": 0.2624, + "num_input_tokens_seen": 49613224, + "step": 7440 + }, + { + "epoch": 0.912070074423448, + "grad_norm": 0.875287415565878, + "learning_rate": 6.130232395334705e-05, + "loss": 0.2297, + "num_input_tokens_seen": 49647256, + "step": 7445 + }, + { + "epoch": 0.9126826130899514, + "grad_norm": 1.3931205338743322, + "learning_rate": 6.125298113323836e-05, + "loss": 0.2411, + "num_input_tokens_seen": 49681144, + "step": 7450 + }, + { + "epoch": 0.9126826130899514, + "eval_loss": 0.13680709898471832, + "eval_runtime": 19.6951, + "eval_samples_per_second": 3.046, + "eval_steps_per_second": 0.762, + "num_input_tokens_seen": 49681144, + "step": 7450 + }, + { + "epoch": 0.9132951517564546, + "grad_norm": 0.7933355467933993, + "learning_rate": 6.120362676659315e-05, + "loss": 0.2216, + "num_input_tokens_seen": 49714752, + "step": 7455 + }, + { + "epoch": 0.9139076904229579, + "grad_norm": 0.7687963865674856, + "learning_rate": 6.115426090405328e-05, + "loss": 0.256, + "num_input_tokens_seen": 49748256, + "step": 7460 + }, + { + "epoch": 0.9145202290894613, + "grad_norm": 0.7747477739961705, + "learning_rate": 6.110488359627239e-05, + "loss": 0.2347, + "num_input_tokens_seen": 49781400, + "step": 7465 + }, + { + "epoch": 0.9151327677559646, + "grad_norm": 1.2524015279171472, + "learning_rate": 6.105549489391593e-05, + "loss": 0.3048, + "num_input_tokens_seen": 49814944, + "step": 7470 + }, + { + "epoch": 0.9157453064224679, + "grad_norm": 1.1116862807828085, + "learning_rate": 6.100609484766098e-05, + "loss": 0.2603, + "num_input_tokens_seen": 49848608, + "step": 7475 + }, + { + "epoch": 0.9163578450889712, + "grad_norm": 1.15968413656217, + "learning_rate": 6.095668350819629e-05, + "loss": 0.2747, + "num_input_tokens_seen": 49882168, + "step": 7480 + }, + { + "epoch": 0.9169703837554746, + "grad_norm": 1.2053461310099642, + "learning_rate": 6.090726092622216e-05, + "loss": 0.2553, + "num_input_tokens_seen": 49915840, + "step": 7485 + }, + { + "epoch": 0.9175829224219779, + "grad_norm": 1.3138189587905238, + "learning_rate": 6.08578271524505e-05, + "loss": 0.2541, + "num_input_tokens_seen": 49949480, + "step": 7490 + }, + { + "epoch": 0.9181954610884812, + "grad_norm": 1.120663136199709, + "learning_rate": 6.080838223760462e-05, + "loss": 0.2757, + "num_input_tokens_seen": 49983888, + "step": 7495 + }, + { + "epoch": 0.9188079997549845, + "grad_norm": 1.1641850109489835, + "learning_rate": 6.0758926232419325e-05, + "loss": 0.2807, + "num_input_tokens_seen": 50016856, + "step": 7500 + }, + { + "epoch": 0.9188079997549845, + "eval_loss": 0.1768772155046463, + "eval_runtime": 19.7931, + "eval_samples_per_second": 3.031, + "eval_steps_per_second": 0.758, + "num_input_tokens_seen": 50016856, + "step": 7500 + }, + { + "epoch": 0.9194205384214879, + "grad_norm": 1.3141715859889223, + "learning_rate": 6.070945918764075e-05, + "loss": 0.2769, + "num_input_tokens_seen": 50050272, + "step": 7505 + }, + { + "epoch": 0.9200330770879912, + "grad_norm": 10.352170655755083, + "learning_rate": 6.065998115402639e-05, + "loss": 0.2663, + "num_input_tokens_seen": 50083664, + "step": 7510 + }, + { + "epoch": 0.9206456157544946, + "grad_norm": 0.7645750026425057, + "learning_rate": 6.0610492182345035e-05, + "loss": 0.2737, + "num_input_tokens_seen": 50117640, + "step": 7515 + }, + { + "epoch": 0.9212581544209978, + "grad_norm": 1.294394957889442, + "learning_rate": 6.0560992323376617e-05, + "loss": 0.2674, + "num_input_tokens_seen": 50151144, + "step": 7520 + }, + { + "epoch": 0.9218706930875011, + "grad_norm": 0.9879231664430183, + "learning_rate": 6.0511481627912334e-05, + "loss": 0.2974, + "num_input_tokens_seen": 50184120, + "step": 7525 + }, + { + "epoch": 0.9224832317540045, + "grad_norm": 0.9438430365327111, + "learning_rate": 6.046196014675445e-05, + "loss": 0.2534, + "num_input_tokens_seen": 50217472, + "step": 7530 + }, + { + "epoch": 0.9230957704205078, + "grad_norm": 0.8606681619538181, + "learning_rate": 6.041242793071631e-05, + "loss": 0.2732, + "num_input_tokens_seen": 50251224, + "step": 7535 + }, + { + "epoch": 0.9237083090870111, + "grad_norm": 0.8604093128188222, + "learning_rate": 6.036288503062227e-05, + "loss": 0.269, + "num_input_tokens_seen": 50284144, + "step": 7540 + }, + { + "epoch": 0.9243208477535144, + "grad_norm": 1.1101173344986042, + "learning_rate": 6.031333149730765e-05, + "loss": 0.2523, + "num_input_tokens_seen": 50317784, + "step": 7545 + }, + { + "epoch": 0.9249333864200178, + "grad_norm": 0.8936338111277344, + "learning_rate": 6.0263767381618674e-05, + "loss": 0.2508, + "num_input_tokens_seen": 50351504, + "step": 7550 + }, + { + "epoch": 0.9249333864200178, + "eval_loss": 0.32131484150886536, + "eval_runtime": 19.6888, + "eval_samples_per_second": 3.047, + "eval_steps_per_second": 0.762, + "num_input_tokens_seen": 50351504, + "step": 7550 + }, + { + "epoch": 0.9255459250865211, + "grad_norm": 10.5088496304406, + "learning_rate": 6.021419273441246e-05, + "loss": 0.2933, + "num_input_tokens_seen": 50385360, + "step": 7555 + }, + { + "epoch": 0.9261584637530245, + "grad_norm": 0.9670560441746768, + "learning_rate": 6.016460760655687e-05, + "loss": 0.2925, + "num_input_tokens_seen": 50418744, + "step": 7560 + }, + { + "epoch": 0.9267710024195277, + "grad_norm": 1.0908895752662584, + "learning_rate": 6.011501204893056e-05, + "loss": 0.2047, + "num_input_tokens_seen": 50453160, + "step": 7565 + }, + { + "epoch": 0.927383541086031, + "grad_norm": 1.0635350994061221, + "learning_rate": 6.006540611242291e-05, + "loss": 0.2542, + "num_input_tokens_seen": 50486256, + "step": 7570 + }, + { + "epoch": 0.9279960797525344, + "grad_norm": 1.1672430013935515, + "learning_rate": 6.001578984793388e-05, + "loss": 0.2613, + "num_input_tokens_seen": 50519696, + "step": 7575 + }, + { + "epoch": 0.9286086184190377, + "grad_norm": 1.7125032266243287, + "learning_rate": 5.9966163306374115e-05, + "loss": 0.2945, + "num_input_tokens_seen": 50552912, + "step": 7580 + }, + { + "epoch": 0.929221157085541, + "grad_norm": 0.9532938254202957, + "learning_rate": 5.991652653866472e-05, + "loss": 0.2545, + "num_input_tokens_seen": 50587184, + "step": 7585 + }, + { + "epoch": 0.9298336957520443, + "grad_norm": 1.1443510182368488, + "learning_rate": 5.9866879595737344e-05, + "loss": 0.2947, + "num_input_tokens_seen": 50620448, + "step": 7590 + }, + { + "epoch": 0.9304462344185477, + "grad_norm": 1.1186627596701106, + "learning_rate": 5.981722252853409e-05, + "loss": 0.2863, + "num_input_tokens_seen": 50653968, + "step": 7595 + }, + { + "epoch": 0.931058773085051, + "grad_norm": 12.568111325920182, + "learning_rate": 5.976755538800738e-05, + "loss": 0.3394, + "num_input_tokens_seen": 50687088, + "step": 7600 + }, + { + "epoch": 0.931058773085051, + "eval_loss": 0.2350298911333084, + "eval_runtime": 19.8614, + "eval_samples_per_second": 3.021, + "eval_steps_per_second": 0.755, + "num_input_tokens_seen": 50687088, + "step": 7600 + }, + { + "epoch": 0.9316713117515543, + "grad_norm": 0.9363634557991117, + "learning_rate": 5.971787822512005e-05, + "loss": 0.264, + "num_input_tokens_seen": 50720400, + "step": 7605 + }, + { + "epoch": 0.9322838504180576, + "grad_norm": 0.8952130026874213, + "learning_rate": 5.966819109084516e-05, + "loss": 0.2592, + "num_input_tokens_seen": 50753984, + "step": 7610 + }, + { + "epoch": 0.932896389084561, + "grad_norm": 0.9165771602583295, + "learning_rate": 5.961849403616606e-05, + "loss": 0.2627, + "num_input_tokens_seen": 50787920, + "step": 7615 + }, + { + "epoch": 0.9335089277510643, + "grad_norm": 1.3344975685514422, + "learning_rate": 5.956878711207623e-05, + "loss": 0.2497, + "num_input_tokens_seen": 50821736, + "step": 7620 + }, + { + "epoch": 0.9341214664175677, + "grad_norm": 0.8419557723167906, + "learning_rate": 5.951907036957927e-05, + "loss": 0.287, + "num_input_tokens_seen": 50855040, + "step": 7625 + }, + { + "epoch": 0.9347340050840709, + "grad_norm": 1.0808026315792447, + "learning_rate": 5.946934385968892e-05, + "loss": 0.2704, + "num_input_tokens_seen": 50888688, + "step": 7630 + }, + { + "epoch": 0.9353465437505742, + "grad_norm": 0.7862560945449588, + "learning_rate": 5.9419607633428885e-05, + "loss": 0.2227, + "num_input_tokens_seen": 50922464, + "step": 7635 + }, + { + "epoch": 0.9359590824170776, + "grad_norm": 0.818005480604586, + "learning_rate": 5.9369861741832847e-05, + "loss": 0.2602, + "num_input_tokens_seen": 50955944, + "step": 7640 + }, + { + "epoch": 0.9365716210835809, + "grad_norm": 0.925307253478405, + "learning_rate": 5.932010623594441e-05, + "loss": 0.2673, + "num_input_tokens_seen": 50989432, + "step": 7645 + }, + { + "epoch": 0.9371841597500842, + "grad_norm": 0.9321281123385093, + "learning_rate": 5.927034116681705e-05, + "loss": 0.2424, + "num_input_tokens_seen": 51023288, + "step": 7650 + }, + { + "epoch": 0.9371841597500842, + "eval_loss": 0.1906452476978302, + "eval_runtime": 20.0124, + "eval_samples_per_second": 2.998, + "eval_steps_per_second": 0.75, + "num_input_tokens_seen": 51023288, + "step": 7650 + }, + { + "epoch": 0.9377966984165875, + "grad_norm": 0.9391154460080438, + "learning_rate": 5.922056658551407e-05, + "loss": 0.26, + "num_input_tokens_seen": 51056376, + "step": 7655 + }, + { + "epoch": 0.9384092370830909, + "grad_norm": 1.3808688563489353, + "learning_rate": 5.91707825431085e-05, + "loss": 0.2123, + "num_input_tokens_seen": 51090288, + "step": 7660 + }, + { + "epoch": 0.9390217757495942, + "grad_norm": 0.8715963987265422, + "learning_rate": 5.91209890906831e-05, + "loss": 0.2648, + "num_input_tokens_seen": 51124128, + "step": 7665 + }, + { + "epoch": 0.9396343144160976, + "grad_norm": 1.1342494777890755, + "learning_rate": 5.907118627933027e-05, + "loss": 0.264, + "num_input_tokens_seen": 51158152, + "step": 7670 + }, + { + "epoch": 0.9402468530826008, + "grad_norm": 0.9577567739728793, + "learning_rate": 5.902137416015204e-05, + "loss": 0.2314, + "num_input_tokens_seen": 51191840, + "step": 7675 + }, + { + "epoch": 0.9408593917491042, + "grad_norm": 0.9721040990098824, + "learning_rate": 5.897155278425995e-05, + "loss": 0.2395, + "num_input_tokens_seen": 51225560, + "step": 7680 + }, + { + "epoch": 0.9414719304156075, + "grad_norm": 1.6690595670888888, + "learning_rate": 5.892172220277509e-05, + "loss": 0.239, + "num_input_tokens_seen": 51258880, + "step": 7685 + }, + { + "epoch": 0.9420844690821109, + "grad_norm": 1.2304077942613303, + "learning_rate": 5.887188246682792e-05, + "loss": 0.2422, + "num_input_tokens_seen": 51292504, + "step": 7690 + }, + { + "epoch": 0.9426970077486141, + "grad_norm": 0.7464309499294809, + "learning_rate": 5.882203362755839e-05, + "loss": 0.2685, + "num_input_tokens_seen": 51326264, + "step": 7695 + }, + { + "epoch": 0.9433095464151174, + "grad_norm": 3.9660094349677966, + "learning_rate": 5.877217573611572e-05, + "loss": 0.2568, + "num_input_tokens_seen": 51359824, + "step": 7700 + }, + { + "epoch": 0.9433095464151174, + "eval_loss": 0.2279118001461029, + "eval_runtime": 19.5171, + "eval_samples_per_second": 3.074, + "eval_steps_per_second": 0.769, + "num_input_tokens_seen": 51359824, + "step": 7700 + }, + { + "epoch": 0.9439220850816208, + "grad_norm": 1.3716999512530115, + "learning_rate": 5.8722308843658436e-05, + "loss": 0.2776, + "num_input_tokens_seen": 51393152, + "step": 7705 + }, + { + "epoch": 0.9445346237481241, + "grad_norm": 11.586082796259, + "learning_rate": 5.867243300135431e-05, + "loss": 0.3103, + "num_input_tokens_seen": 51426424, + "step": 7710 + }, + { + "epoch": 0.9451471624146274, + "grad_norm": 1.1046402552425767, + "learning_rate": 5.8622548260380294e-05, + "loss": 0.2722, + "num_input_tokens_seen": 51460504, + "step": 7715 + }, + { + "epoch": 0.9457597010811307, + "grad_norm": 2.4835112980332665, + "learning_rate": 5.8572654671922455e-05, + "loss": 0.3227, + "num_input_tokens_seen": 51493200, + "step": 7720 + }, + { + "epoch": 0.9463722397476341, + "grad_norm": 1.049684542434729, + "learning_rate": 5.852275228717595e-05, + "loss": 0.2477, + "num_input_tokens_seen": 51526208, + "step": 7725 + }, + { + "epoch": 0.9469847784141374, + "grad_norm": 1.275347082923904, + "learning_rate": 5.847284115734497e-05, + "loss": 0.2477, + "num_input_tokens_seen": 51559472, + "step": 7730 + }, + { + "epoch": 0.9475973170806408, + "grad_norm": 1.508095888631952, + "learning_rate": 5.8422921333642676e-05, + "loss": 0.2826, + "num_input_tokens_seen": 51592584, + "step": 7735 + }, + { + "epoch": 0.948209855747144, + "grad_norm": 1.055479779826538, + "learning_rate": 5.8372992867291146e-05, + "loss": 0.2885, + "num_input_tokens_seen": 51626208, + "step": 7740 + }, + { + "epoch": 0.9488223944136474, + "grad_norm": 11.504210130479999, + "learning_rate": 5.832305580952131e-05, + "loss": 0.2608, + "num_input_tokens_seen": 51659776, + "step": 7745 + }, + { + "epoch": 0.9494349330801507, + "grad_norm": 13.54934190529509, + "learning_rate": 5.827311021157292e-05, + "loss": 0.2863, + "num_input_tokens_seen": 51692760, + "step": 7750 + }, + { + "epoch": 0.9494349330801507, + "eval_loss": 0.20178191363811493, + "eval_runtime": 19.4772, + "eval_samples_per_second": 3.081, + "eval_steps_per_second": 0.77, + "num_input_tokens_seen": 51692760, + "step": 7750 + }, + { + "epoch": 0.950047471746654, + "grad_norm": 0.8238080255979396, + "learning_rate": 5.822315612469455e-05, + "loss": 0.2742, + "num_input_tokens_seen": 51726896, + "step": 7755 + }, + { + "epoch": 0.9506600104131573, + "grad_norm": 1.3027770918182022, + "learning_rate": 5.8173193600143383e-05, + "loss": 0.2665, + "num_input_tokens_seen": 51760104, + "step": 7760 + }, + { + "epoch": 0.9512725490796606, + "grad_norm": 0.9895227603643579, + "learning_rate": 5.812322268918534e-05, + "loss": 0.301, + "num_input_tokens_seen": 51793832, + "step": 7765 + }, + { + "epoch": 0.951885087746164, + "grad_norm": 25.613770859111906, + "learning_rate": 5.807324344309489e-05, + "loss": 0.267, + "num_input_tokens_seen": 51827408, + "step": 7770 + }, + { + "epoch": 0.9524976264126673, + "grad_norm": 1.2287696717265044, + "learning_rate": 5.802325591315512e-05, + "loss": 0.2462, + "num_input_tokens_seen": 51860984, + "step": 7775 + }, + { + "epoch": 0.9531101650791706, + "grad_norm": 1.0785855087471947, + "learning_rate": 5.797326015065755e-05, + "loss": 0.2373, + "num_input_tokens_seen": 51894936, + "step": 7780 + }, + { + "epoch": 0.9537227037456739, + "grad_norm": 1.3750030848118344, + "learning_rate": 5.792325620690218e-05, + "loss": 0.2417, + "num_input_tokens_seen": 51929072, + "step": 7785 + }, + { + "epoch": 0.9543352424121773, + "grad_norm": 1.0268899636517712, + "learning_rate": 5.787324413319742e-05, + "loss": 0.2802, + "num_input_tokens_seen": 51963016, + "step": 7790 + }, + { + "epoch": 0.9549477810786806, + "grad_norm": 1.239267656510513, + "learning_rate": 5.782322398085999e-05, + "loss": 0.2768, + "num_input_tokens_seen": 51996552, + "step": 7795 + }, + { + "epoch": 0.955560319745184, + "grad_norm": 1.3202709221425866, + "learning_rate": 5.777319580121492e-05, + "loss": 0.3015, + "num_input_tokens_seen": 52029952, + "step": 7800 + }, + { + "epoch": 0.955560319745184, + "eval_loss": 0.17758481204509735, + "eval_runtime": 19.6452, + "eval_samples_per_second": 3.054, + "eval_steps_per_second": 0.764, + "num_input_tokens_seen": 52029952, + "step": 7800 + }, + { + "epoch": 0.9561728584116872, + "grad_norm": 1.063364954164096, + "learning_rate": 5.7723159645595445e-05, + "loss": 0.2423, + "num_input_tokens_seen": 52063720, + "step": 7805 + }, + { + "epoch": 0.9567853970781905, + "grad_norm": 0.8600460306015203, + "learning_rate": 5.767311556534305e-05, + "loss": 0.3008, + "num_input_tokens_seen": 52097360, + "step": 7810 + }, + { + "epoch": 0.9573979357446939, + "grad_norm": 1.1512308854799498, + "learning_rate": 5.762306361180727e-05, + "loss": 0.2982, + "num_input_tokens_seen": 52131184, + "step": 7815 + }, + { + "epoch": 0.9580104744111972, + "grad_norm": 1.382256538725797, + "learning_rate": 5.75730038363458e-05, + "loss": 0.2885, + "num_input_tokens_seen": 52164512, + "step": 7820 + }, + { + "epoch": 0.9586230130777005, + "grad_norm": 1.0829816521717899, + "learning_rate": 5.752293629032429e-05, + "loss": 0.3219, + "num_input_tokens_seen": 52197296, + "step": 7825 + }, + { + "epoch": 0.9592355517442038, + "grad_norm": 50.717987065019386, + "learning_rate": 5.747286102511639e-05, + "loss": 0.3073, + "num_input_tokens_seen": 52231016, + "step": 7830 + }, + { + "epoch": 0.9598480904107072, + "grad_norm": 1.1799335370899378, + "learning_rate": 5.74227780921037e-05, + "loss": 0.281, + "num_input_tokens_seen": 52263688, + "step": 7835 + }, + { + "epoch": 0.9604606290772105, + "grad_norm": 19.80378155396282, + "learning_rate": 5.7372687542675644e-05, + "loss": 0.2759, + "num_input_tokens_seen": 52296880, + "step": 7840 + }, + { + "epoch": 0.9610731677437139, + "grad_norm": 1.11784197486195, + "learning_rate": 5.7322589428229476e-05, + "loss": 0.2422, + "num_input_tokens_seen": 52330584, + "step": 7845 + }, + { + "epoch": 0.9616857064102171, + "grad_norm": 0.9898571071970598, + "learning_rate": 5.727248380017021e-05, + "loss": 0.2331, + "num_input_tokens_seen": 52365160, + "step": 7850 + }, + { + "epoch": 0.9616857064102171, + "eval_loss": 0.1693449765443802, + "eval_runtime": 20.0742, + "eval_samples_per_second": 2.989, + "eval_steps_per_second": 0.747, + "num_input_tokens_seen": 52365160, + "step": 7850 + }, + { + "epoch": 0.9622982450767205, + "grad_norm": 0.7964775560699058, + "learning_rate": 5.7222370709910586e-05, + "loss": 0.2671, + "num_input_tokens_seen": 52398664, + "step": 7855 + }, + { + "epoch": 0.9629107837432238, + "grad_norm": 1.1604952416566663, + "learning_rate": 5.7172250208871004e-05, + "loss": 0.2717, + "num_input_tokens_seen": 52432176, + "step": 7860 + }, + { + "epoch": 0.9635233224097272, + "grad_norm": 0.963550314024476, + "learning_rate": 5.7122122348479424e-05, + "loss": 0.2688, + "num_input_tokens_seen": 52465688, + "step": 7865 + }, + { + "epoch": 0.9641358610762304, + "grad_norm": 6.999833265663576, + "learning_rate": 5.7071987180171405e-05, + "loss": 0.2322, + "num_input_tokens_seen": 52498856, + "step": 7870 + }, + { + "epoch": 0.9647483997427337, + "grad_norm": 1.1676178823605188, + "learning_rate": 5.7021844755389976e-05, + "loss": 0.2777, + "num_input_tokens_seen": 52532640, + "step": 7875 + }, + { + "epoch": 0.9653609384092371, + "grad_norm": 0.9909378489768668, + "learning_rate": 5.6971695125585647e-05, + "loss": 0.2561, + "num_input_tokens_seen": 52566704, + "step": 7880 + }, + { + "epoch": 0.9659734770757404, + "grad_norm": 0.7285742482238151, + "learning_rate": 5.692153834221625e-05, + "loss": 0.2266, + "num_input_tokens_seen": 52600248, + "step": 7885 + }, + { + "epoch": 0.9665860157422437, + "grad_norm": 0.8205507532624203, + "learning_rate": 5.687137445674704e-05, + "loss": 0.2087, + "num_input_tokens_seen": 52634744, + "step": 7890 + }, + { + "epoch": 0.967198554408747, + "grad_norm": 1.322673867080366, + "learning_rate": 5.682120352065051e-05, + "loss": 0.2681, + "num_input_tokens_seen": 52667912, + "step": 7895 + }, + { + "epoch": 0.9678110930752504, + "grad_norm": 8.656108240351292, + "learning_rate": 5.677102558540641e-05, + "loss": 0.2669, + "num_input_tokens_seen": 52702264, + "step": 7900 + }, + { + "epoch": 0.9678110930752504, + "eval_loss": 0.1096976026892662, + "eval_runtime": 20.033, + "eval_samples_per_second": 2.995, + "eval_steps_per_second": 0.749, + "num_input_tokens_seen": 52702264, + "step": 7900 + }, + { + "epoch": 0.9684236317417537, + "grad_norm": 1.3929211748828418, + "learning_rate": 5.672084070250165e-05, + "loss": 0.2886, + "num_input_tokens_seen": 52735976, + "step": 7905 + }, + { + "epoch": 0.9690361704082571, + "grad_norm": 0.834146958080007, + "learning_rate": 5.6670648923430295e-05, + "loss": 0.2443, + "num_input_tokens_seen": 52769864, + "step": 7910 + }, + { + "epoch": 0.9696487090747603, + "grad_norm": 1.1180385303177762, + "learning_rate": 5.662045029969345e-05, + "loss": 0.2523, + "num_input_tokens_seen": 52803392, + "step": 7915 + }, + { + "epoch": 0.9702612477412637, + "grad_norm": 0.9517754184837989, + "learning_rate": 5.6570244882799296e-05, + "loss": 0.2446, + "num_input_tokens_seen": 52837136, + "step": 7920 + }, + { + "epoch": 0.970873786407767, + "grad_norm": 0.6826032202364467, + "learning_rate": 5.652003272426293e-05, + "loss": 0.2661, + "num_input_tokens_seen": 52870832, + "step": 7925 + }, + { + "epoch": 0.9714863250742704, + "grad_norm": 0.8994961488327184, + "learning_rate": 5.646981387560638e-05, + "loss": 0.1969, + "num_input_tokens_seen": 52905064, + "step": 7930 + }, + { + "epoch": 0.9720988637407736, + "grad_norm": 1.5519415341375493, + "learning_rate": 5.641958838835858e-05, + "loss": 0.2645, + "num_input_tokens_seen": 52938840, + "step": 7935 + }, + { + "epoch": 0.9727114024072769, + "grad_norm": 1.2410239003722612, + "learning_rate": 5.6369356314055235e-05, + "loss": 0.286, + "num_input_tokens_seen": 52973040, + "step": 7940 + }, + { + "epoch": 0.9733239410737803, + "grad_norm": 0.701502624223167, + "learning_rate": 5.63191177042388e-05, + "loss": 0.2583, + "num_input_tokens_seen": 53006936, + "step": 7945 + }, + { + "epoch": 0.9739364797402836, + "grad_norm": 0.7726188138276853, + "learning_rate": 5.626887261045847e-05, + "loss": 0.2606, + "num_input_tokens_seen": 53040192, + "step": 7950 + }, + { + "epoch": 0.9739364797402836, + "eval_loss": 0.19213582575321198, + "eval_runtime": 19.3946, + "eval_samples_per_second": 3.094, + "eval_steps_per_second": 0.773, + "num_input_tokens_seen": 53040192, + "step": 7950 + }, + { + "epoch": 0.974549018406787, + "grad_norm": 1.6538040026464838, + "learning_rate": 5.6218621084270076e-05, + "loss": 0.2846, + "num_input_tokens_seen": 53073472, + "step": 7955 + }, + { + "epoch": 0.9751615570732902, + "grad_norm": 1.3232838256829276, + "learning_rate": 5.616836317723606e-05, + "loss": 0.3231, + "num_input_tokens_seen": 53106392, + "step": 7960 + }, + { + "epoch": 0.9757740957397936, + "grad_norm": 1.2467227230401035, + "learning_rate": 5.6118098940925365e-05, + "loss": 0.2599, + "num_input_tokens_seen": 53140200, + "step": 7965 + }, + { + "epoch": 0.9763866344062969, + "grad_norm": 0.9849570974591809, + "learning_rate": 5.606782842691352e-05, + "loss": 0.2592, + "num_input_tokens_seen": 53173384, + "step": 7970 + }, + { + "epoch": 0.9769991730728003, + "grad_norm": 1.2772460874526546, + "learning_rate": 5.60175516867824e-05, + "loss": 0.2737, + "num_input_tokens_seen": 53206768, + "step": 7975 + }, + { + "epoch": 0.9776117117393035, + "grad_norm": 1.128965982171075, + "learning_rate": 5.596726877212032e-05, + "loss": 0.2233, + "num_input_tokens_seen": 53240024, + "step": 7980 + }, + { + "epoch": 0.9782242504058068, + "grad_norm": 1.1252092873234842, + "learning_rate": 5.5916979734521936e-05, + "loss": 0.2648, + "num_input_tokens_seen": 53273088, + "step": 7985 + }, + { + "epoch": 0.9788367890723102, + "grad_norm": 1.2789572450535445, + "learning_rate": 5.586668462558814e-05, + "loss": 0.2603, + "num_input_tokens_seen": 53306896, + "step": 7990 + }, + { + "epoch": 0.9794493277388135, + "grad_norm": 1.1892165202732694, + "learning_rate": 5.5816383496926105e-05, + "loss": 0.2012, + "num_input_tokens_seen": 53341104, + "step": 7995 + }, + { + "epoch": 0.9800618664053168, + "grad_norm": 0.7249911925871049, + "learning_rate": 5.576607640014917e-05, + "loss": 0.2426, + "num_input_tokens_seen": 53375240, + "step": 8000 + }, + { + "epoch": 0.9800618664053168, + "eval_loss": 0.2315770536661148, + "eval_runtime": 19.5572, + "eval_samples_per_second": 3.068, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 53375240, + "step": 8000 + }, + { + "epoch": 0.9806744050718201, + "grad_norm": 1.3298129567016648, + "learning_rate": 5.5715763386876774e-05, + "loss": 0.3245, + "num_input_tokens_seen": 53407776, + "step": 8005 + }, + { + "epoch": 0.9812869437383235, + "grad_norm": 0.9746845825510395, + "learning_rate": 5.566544450873442e-05, + "loss": 0.2672, + "num_input_tokens_seen": 53441320, + "step": 8010 + }, + { + "epoch": 0.9818994824048268, + "grad_norm": 1.1463599159566267, + "learning_rate": 5.5615119817353686e-05, + "loss": 0.2563, + "num_input_tokens_seen": 53475008, + "step": 8015 + }, + { + "epoch": 0.9825120210713302, + "grad_norm": 1.189287157042674, + "learning_rate": 5.556478936437207e-05, + "loss": 0.2955, + "num_input_tokens_seen": 53508784, + "step": 8020 + }, + { + "epoch": 0.9831245597378334, + "grad_norm": 1.2056098290590576, + "learning_rate": 5.551445320143297e-05, + "loss": 0.3579, + "num_input_tokens_seen": 53542048, + "step": 8025 + }, + { + "epoch": 0.9837370984043368, + "grad_norm": 1.0486881534307972, + "learning_rate": 5.546411138018569e-05, + "loss": 0.2346, + "num_input_tokens_seen": 53575856, + "step": 8030 + }, + { + "epoch": 0.9843496370708401, + "grad_norm": 0.9842274292431644, + "learning_rate": 5.541376395228529e-05, + "loss": 0.2352, + "num_input_tokens_seen": 53609232, + "step": 8035 + }, + { + "epoch": 0.9849621757373435, + "grad_norm": 0.8112945521310746, + "learning_rate": 5.5363410969392637e-05, + "loss": 0.2512, + "num_input_tokens_seen": 53642416, + "step": 8040 + }, + { + "epoch": 0.9855747144038467, + "grad_norm": 1.629983178150261, + "learning_rate": 5.531305248317422e-05, + "loss": 0.2727, + "num_input_tokens_seen": 53675592, + "step": 8045 + }, + { + "epoch": 0.98618725307035, + "grad_norm": 1.046184830751963, + "learning_rate": 5.526268854530228e-05, + "loss": 0.2674, + "num_input_tokens_seen": 53709056, + "step": 8050 + }, + { + "epoch": 0.98618725307035, + "eval_loss": 0.1892443746328354, + "eval_runtime": 19.5139, + "eval_samples_per_second": 3.075, + "eval_steps_per_second": 0.769, + "num_input_tokens_seen": 53709056, + "step": 8050 + }, + { + "epoch": 0.9867997917368534, + "grad_norm": 1.5276547047832392, + "learning_rate": 5.5212319207454535e-05, + "loss": 0.2445, + "num_input_tokens_seen": 53742352, + "step": 8055 + }, + { + "epoch": 0.9874123304033567, + "grad_norm": 0.9332922339127524, + "learning_rate": 5.516194452131435e-05, + "loss": 0.2281, + "num_input_tokens_seen": 53775736, + "step": 8060 + }, + { + "epoch": 0.9880248690698601, + "grad_norm": 1.194118959407054, + "learning_rate": 5.51115645385705e-05, + "loss": 0.251, + "num_input_tokens_seen": 53809504, + "step": 8065 + }, + { + "epoch": 0.9886374077363633, + "grad_norm": 1.0050641943552643, + "learning_rate": 5.506117931091723e-05, + "loss": 0.2299, + "num_input_tokens_seen": 53843688, + "step": 8070 + }, + { + "epoch": 0.9892499464028667, + "grad_norm": 0.954151397104093, + "learning_rate": 5.501078889005419e-05, + "loss": 0.2477, + "num_input_tokens_seen": 53877968, + "step": 8075 + }, + { + "epoch": 0.98986248506937, + "grad_norm": 0.9144241619145849, + "learning_rate": 5.49603933276863e-05, + "loss": 0.2732, + "num_input_tokens_seen": 53911840, + "step": 8080 + }, + { + "epoch": 0.9904750237358734, + "grad_norm": 0.7944047263041256, + "learning_rate": 5.4909992675523804e-05, + "loss": 0.2639, + "num_input_tokens_seen": 53945424, + "step": 8085 + }, + { + "epoch": 0.9910875624023766, + "grad_norm": 1.137906752058307, + "learning_rate": 5.485958698528213e-05, + "loss": 0.323, + "num_input_tokens_seen": 53978136, + "step": 8090 + }, + { + "epoch": 0.99170010106888, + "grad_norm": 1.650198947849656, + "learning_rate": 5.480917630868192e-05, + "loss": 0.2943, + "num_input_tokens_seen": 54010768, + "step": 8095 + }, + { + "epoch": 0.9923126397353833, + "grad_norm": 1.1853194951082626, + "learning_rate": 5.4758760697448886e-05, + "loss": 0.2282, + "num_input_tokens_seen": 54044368, + "step": 8100 + }, + { + "epoch": 0.9923126397353833, + "eval_loss": 0.14283877611160278, + "eval_runtime": 19.4313, + "eval_samples_per_second": 3.088, + "eval_steps_per_second": 0.772, + "num_input_tokens_seen": 54044368, + "step": 8100 + }, + { + "epoch": 0.9929251784018867, + "grad_norm": 1.6800876684763826, + "learning_rate": 5.470834020331385e-05, + "loss": 0.2701, + "num_input_tokens_seen": 54077728, + "step": 8105 + }, + { + "epoch": 0.9935377170683899, + "grad_norm": 1.1796124789727611, + "learning_rate": 5.4657914878012604e-05, + "loss": 0.3021, + "num_input_tokens_seen": 54111328, + "step": 8110 + }, + { + "epoch": 0.9941502557348932, + "grad_norm": 1.2059300990149888, + "learning_rate": 5.460748477328592e-05, + "loss": 0.262, + "num_input_tokens_seen": 54144840, + "step": 8115 + }, + { + "epoch": 0.9947627944013966, + "grad_norm": 0.9681948459083338, + "learning_rate": 5.455704994087947e-05, + "loss": 0.2603, + "num_input_tokens_seen": 54178336, + "step": 8120 + }, + { + "epoch": 0.9953753330678999, + "grad_norm": 0.6364021106394956, + "learning_rate": 5.450661043254377e-05, + "loss": 0.2087, + "num_input_tokens_seen": 54212800, + "step": 8125 + }, + { + "epoch": 0.9959878717344033, + "grad_norm": 0.7669734731100462, + "learning_rate": 5.445616630003415e-05, + "loss": 0.2833, + "num_input_tokens_seen": 54246408, + "step": 8130 + }, + { + "epoch": 0.9966004104009065, + "grad_norm": 1.2712164303662965, + "learning_rate": 5.440571759511064e-05, + "loss": 0.2866, + "num_input_tokens_seen": 54279864, + "step": 8135 + }, + { + "epoch": 0.9972129490674099, + "grad_norm": 1.0279634663858783, + "learning_rate": 5.435526436953804e-05, + "loss": 0.2228, + "num_input_tokens_seen": 54313656, + "step": 8140 + }, + { + "epoch": 0.9978254877339132, + "grad_norm": 0.8285918430595809, + "learning_rate": 5.430480667508572e-05, + "loss": 0.2519, + "num_input_tokens_seen": 54347224, + "step": 8145 + }, + { + "epoch": 0.9984380264004166, + "grad_norm": 0.8166189799627692, + "learning_rate": 5.4254344563527646e-05, + "loss": 0.2573, + "num_input_tokens_seen": 54381008, + "step": 8150 + }, + { + "epoch": 0.9984380264004166, + "eval_loss": 0.1260269433259964, + "eval_runtime": 19.8046, + "eval_samples_per_second": 3.03, + "eval_steps_per_second": 0.757, + "num_input_tokens_seen": 54381008, + "step": 8150 + }, + { + "epoch": 0.9990505650669198, + "grad_norm": 1.344290969838957, + "learning_rate": 5.420387808664237e-05, + "loss": 0.2837, + "num_input_tokens_seen": 54414464, + "step": 8155 + }, + { + "epoch": 0.9996631037334232, + "grad_norm": 0.9540333301134992, + "learning_rate": 5.415340729621283e-05, + "loss": 0.3385, + "num_input_tokens_seen": 54447944, + "step": 8160 + }, + { + "epoch": 1.000367523199902, + "grad_norm": 0.8422227281527098, + "learning_rate": 5.4102932244026513e-05, + "loss": 0.2689, + "num_input_tokens_seen": 54487096, + "step": 8165 + }, + { + "epoch": 1.0009800618664053, + "grad_norm": 1.1769582679640533, + "learning_rate": 5.405245298187514e-05, + "loss": 0.2541, + "num_input_tokens_seen": 54520352, + "step": 8170 + }, + { + "epoch": 1.0015926005329086, + "grad_norm": 0.8523516200981469, + "learning_rate": 5.4001969561554856e-05, + "loss": 0.1983, + "num_input_tokens_seen": 54554976, + "step": 8175 + }, + { + "epoch": 1.002205139199412, + "grad_norm": 1.0557508886121485, + "learning_rate": 5.395148203486602e-05, + "loss": 0.2281, + "num_input_tokens_seen": 54588840, + "step": 8180 + }, + { + "epoch": 1.0028176778659152, + "grad_norm": 1.219453667927466, + "learning_rate": 5.390099045361323e-05, + "loss": 0.2676, + "num_input_tokens_seen": 54622384, + "step": 8185 + }, + { + "epoch": 1.0034302165324187, + "grad_norm": 18.826008930853202, + "learning_rate": 5.385049486960525e-05, + "loss": 0.2328, + "num_input_tokens_seen": 54656104, + "step": 8190 + }, + { + "epoch": 1.004042755198922, + "grad_norm": 1.4105833090999877, + "learning_rate": 5.379999533465492e-05, + "loss": 0.2279, + "num_input_tokens_seen": 54690048, + "step": 8195 + }, + { + "epoch": 1.0046552938654252, + "grad_norm": 0.869381099604022, + "learning_rate": 5.374949190057915e-05, + "loss": 0.2249, + "num_input_tokens_seen": 54724208, + "step": 8200 + }, + { + "epoch": 1.0046552938654252, + "eval_loss": 0.2748353183269501, + "eval_runtime": 46.6105, + "eval_samples_per_second": 1.287, + "eval_steps_per_second": 0.322, + "num_input_tokens_seen": 54724208, + "step": 8200 + }, + { + "epoch": 1.0052678325319286, + "grad_norm": 0.9601538546088345, + "learning_rate": 5.369898461919888e-05, + "loss": 0.2527, + "num_input_tokens_seen": 54757616, + "step": 8205 + }, + { + "epoch": 1.0058803711984319, + "grad_norm": 1.2517792198329583, + "learning_rate": 5.3648473542338926e-05, + "loss": 0.2637, + "num_input_tokens_seen": 54791024, + "step": 8210 + }, + { + "epoch": 1.0064929098649351, + "grad_norm": 1.2199714246362692, + "learning_rate": 5.359795872182807e-05, + "loss": 0.2377, + "num_input_tokens_seen": 54824208, + "step": 8215 + }, + { + "epoch": 1.0071054485314386, + "grad_norm": 0.935245197075868, + "learning_rate": 5.35474402094989e-05, + "loss": 0.2291, + "num_input_tokens_seen": 54858152, + "step": 8220 + }, + { + "epoch": 1.0077179871979418, + "grad_norm": 1.1742938028980292, + "learning_rate": 5.3496918057187804e-05, + "loss": 0.2697, + "num_input_tokens_seen": 54891448, + "step": 8225 + }, + { + "epoch": 1.0083305258644453, + "grad_norm": 1.489363212261162, + "learning_rate": 5.3446392316734894e-05, + "loss": 0.2626, + "num_input_tokens_seen": 54925096, + "step": 8230 + }, + { + "epoch": 1.0089430645309485, + "grad_norm": 1.211174179993466, + "learning_rate": 5.339586303998399e-05, + "loss": 0.2661, + "num_input_tokens_seen": 54958216, + "step": 8235 + }, + { + "epoch": 1.0095556031974517, + "grad_norm": 0.865853482374697, + "learning_rate": 5.334533027878248e-05, + "loss": 0.2984, + "num_input_tokens_seen": 54991184, + "step": 8240 + }, + { + "epoch": 1.0101681418639552, + "grad_norm": 1.3506620552826027, + "learning_rate": 5.329479408498141e-05, + "loss": 0.2597, + "num_input_tokens_seen": 55025552, + "step": 8245 + }, + { + "epoch": 1.0107806805304584, + "grad_norm": 1.1847329480773103, + "learning_rate": 5.324425451043529e-05, + "loss": 0.266, + "num_input_tokens_seen": 55059128, + "step": 8250 + }, + { + "epoch": 1.0107806805304584, + "eval_loss": 0.15544439852237701, + "eval_runtime": 19.9171, + "eval_samples_per_second": 3.012, + "eval_steps_per_second": 0.753, + "num_input_tokens_seen": 55059128, + "step": 8250 + }, + { + "epoch": 1.011393219196962, + "grad_norm": 0.9987172921300755, + "learning_rate": 5.3193711607002115e-05, + "loss": 0.2317, + "num_input_tokens_seen": 55092984, + "step": 8255 + }, + { + "epoch": 1.0120057578634651, + "grad_norm": 9.755864143857915, + "learning_rate": 5.314316542654328e-05, + "loss": 0.2701, + "num_input_tokens_seen": 55126376, + "step": 8260 + }, + { + "epoch": 1.0126182965299684, + "grad_norm": 1.122634947604353, + "learning_rate": 5.30926160209236e-05, + "loss": 0.2832, + "num_input_tokens_seen": 55159832, + "step": 8265 + }, + { + "epoch": 1.0132308351964718, + "grad_norm": 1.1240209411528697, + "learning_rate": 5.304206344201112e-05, + "loss": 0.2376, + "num_input_tokens_seen": 55193328, + "step": 8270 + }, + { + "epoch": 1.013843373862975, + "grad_norm": 0.8274669668726677, + "learning_rate": 5.2991507741677195e-05, + "loss": 0.2542, + "num_input_tokens_seen": 55225936, + "step": 8275 + }, + { + "epoch": 1.0144559125294785, + "grad_norm": 0.9871250586862962, + "learning_rate": 5.2940948971796365e-05, + "loss": 0.2407, + "num_input_tokens_seen": 55258992, + "step": 8280 + }, + { + "epoch": 1.0150684511959818, + "grad_norm": 1.050605559972476, + "learning_rate": 5.2890387184246324e-05, + "loss": 0.2304, + "num_input_tokens_seen": 55292976, + "step": 8285 + }, + { + "epoch": 1.015680989862485, + "grad_norm": 1.8222858468071692, + "learning_rate": 5.283982243090786e-05, + "loss": 0.1923, + "num_input_tokens_seen": 55327720, + "step": 8290 + }, + { + "epoch": 1.0162935285289885, + "grad_norm": 1.2922656209221335, + "learning_rate": 5.278925476366479e-05, + "loss": 0.2865, + "num_input_tokens_seen": 55361392, + "step": 8295 + }, + { + "epoch": 1.0169060671954917, + "grad_norm": 1.1598456492588503, + "learning_rate": 5.273868423440395e-05, + "loss": 0.2599, + "num_input_tokens_seen": 55394968, + "step": 8300 + }, + { + "epoch": 1.0169060671954917, + "eval_loss": 0.2275010645389557, + "eval_runtime": 19.3444, + "eval_samples_per_second": 3.102, + "eval_steps_per_second": 0.775, + "num_input_tokens_seen": 55394968, + "step": 8300 + }, + { + "epoch": 1.017518605861995, + "grad_norm": 1.1391812920449496, + "learning_rate": 5.2688110895015096e-05, + "loss": 0.2679, + "num_input_tokens_seen": 55427424, + "step": 8305 + }, + { + "epoch": 1.0181311445284984, + "grad_norm": 5.368696592682067, + "learning_rate": 5.263753479739085e-05, + "loss": 0.2744, + "num_input_tokens_seen": 55461776, + "step": 8310 + }, + { + "epoch": 1.0187436831950016, + "grad_norm": 0.8005094815689019, + "learning_rate": 5.258695599342671e-05, + "loss": 0.2556, + "num_input_tokens_seen": 55494896, + "step": 8315 + }, + { + "epoch": 1.019356221861505, + "grad_norm": 1.0780469576053615, + "learning_rate": 5.253637453502092e-05, + "loss": 0.2656, + "num_input_tokens_seen": 55528760, + "step": 8320 + }, + { + "epoch": 1.0199687605280083, + "grad_norm": 1.6297890469471712, + "learning_rate": 5.2485790474074445e-05, + "loss": 0.2604, + "num_input_tokens_seen": 55562648, + "step": 8325 + }, + { + "epoch": 1.0205812991945116, + "grad_norm": 0.9682104958310769, + "learning_rate": 5.243520386249092e-05, + "loss": 0.2444, + "num_input_tokens_seen": 55595648, + "step": 8330 + }, + { + "epoch": 1.021193837861015, + "grad_norm": 1.4291691354785836, + "learning_rate": 5.238461475217662e-05, + "loss": 0.255, + "num_input_tokens_seen": 55629152, + "step": 8335 + }, + { + "epoch": 1.0218063765275183, + "grad_norm": 1.2206114391669503, + "learning_rate": 5.233402319504035e-05, + "loss": 0.3054, + "num_input_tokens_seen": 55661832, + "step": 8340 + }, + { + "epoch": 1.0224189151940217, + "grad_norm": 1.3519838350245388, + "learning_rate": 5.2283429242993465e-05, + "loss": 0.2577, + "num_input_tokens_seen": 55695464, + "step": 8345 + }, + { + "epoch": 1.023031453860525, + "grad_norm": 1.5236398767952584, + "learning_rate": 5.223283294794974e-05, + "loss": 0.2485, + "num_input_tokens_seen": 55728792, + "step": 8350 + }, + { + "epoch": 1.023031453860525, + "eval_loss": 0.22839440405368805, + "eval_runtime": 19.6628, + "eval_samples_per_second": 3.051, + "eval_steps_per_second": 0.763, + "num_input_tokens_seen": 55728792, + "step": 8350 + }, + { + "epoch": 1.0236439925270282, + "grad_norm": 1.115443576191816, + "learning_rate": 5.2182234361825366e-05, + "loss": 0.2523, + "num_input_tokens_seen": 55762040, + "step": 8355 + }, + { + "epoch": 1.0242565311935317, + "grad_norm": 0.9504386082506932, + "learning_rate": 5.213163353653892e-05, + "loss": 0.2023, + "num_input_tokens_seen": 55795784, + "step": 8360 + }, + { + "epoch": 1.024869069860035, + "grad_norm": 0.8803220117056142, + "learning_rate": 5.208103052401122e-05, + "loss": 0.2392, + "num_input_tokens_seen": 55829712, + "step": 8365 + }, + { + "epoch": 1.0254816085265381, + "grad_norm": 1.1392782121143972, + "learning_rate": 5.203042537616536e-05, + "loss": 0.2684, + "num_input_tokens_seen": 55863320, + "step": 8370 + }, + { + "epoch": 1.0260941471930416, + "grad_norm": 0.7756726172148019, + "learning_rate": 5.1979818144926615e-05, + "loss": 0.3133, + "num_input_tokens_seen": 55896184, + "step": 8375 + }, + { + "epoch": 1.0267066858595448, + "grad_norm": 0.5843893819711142, + "learning_rate": 5.1929208882222415e-05, + "loss": 0.2341, + "num_input_tokens_seen": 55930272, + "step": 8380 + }, + { + "epoch": 1.0273192245260483, + "grad_norm": 0.8434907940152073, + "learning_rate": 5.187859763998225e-05, + "loss": 0.2398, + "num_input_tokens_seen": 55964048, + "step": 8385 + }, + { + "epoch": 1.0279317631925515, + "grad_norm": 14.045807785545248, + "learning_rate": 5.182798447013765e-05, + "loss": 0.1992, + "num_input_tokens_seen": 55998064, + "step": 8390 + }, + { + "epoch": 1.0285443018590548, + "grad_norm": 1.0782146880351968, + "learning_rate": 5.1777369424622135e-05, + "loss": 0.286, + "num_input_tokens_seen": 56030752, + "step": 8395 + }, + { + "epoch": 1.0291568405255582, + "grad_norm": 1.008837627042037, + "learning_rate": 5.172675255537113e-05, + "loss": 0.2513, + "num_input_tokens_seen": 56064312, + "step": 8400 + }, + { + "epoch": 1.0291568405255582, + "eval_loss": 0.16583839058876038, + "eval_runtime": 19.4879, + "eval_samples_per_second": 3.079, + "eval_steps_per_second": 0.77, + "num_input_tokens_seen": 56064312, + "step": 8400 + }, + { + "epoch": 1.0297693791920615, + "grad_norm": 0.8984041849755142, + "learning_rate": 5.1676133914321965e-05, + "loss": 0.2424, + "num_input_tokens_seen": 56098088, + "step": 8405 + }, + { + "epoch": 1.030381917858565, + "grad_norm": 0.9483592933518322, + "learning_rate": 5.162551355341373e-05, + "loss": 0.2252, + "num_input_tokens_seen": 56132272, + "step": 8410 + }, + { + "epoch": 1.0309944565250682, + "grad_norm": 1.3194904964017427, + "learning_rate": 5.1574891524587333e-05, + "loss": 0.2803, + "num_input_tokens_seen": 56165256, + "step": 8415 + }, + { + "epoch": 1.0316069951915714, + "grad_norm": 1.1569577950085184, + "learning_rate": 5.1524267879785374e-05, + "loss": 0.229, + "num_input_tokens_seen": 56198512, + "step": 8420 + }, + { + "epoch": 1.0322195338580749, + "grad_norm": 1.4896189206445254, + "learning_rate": 5.1473642670952114e-05, + "loss": 0.2367, + "num_input_tokens_seen": 56231456, + "step": 8425 + }, + { + "epoch": 1.032832072524578, + "grad_norm": 1.5474765867511795, + "learning_rate": 5.142301595003343e-05, + "loss": 0.2478, + "num_input_tokens_seen": 56265120, + "step": 8430 + }, + { + "epoch": 1.0334446111910813, + "grad_norm": 5.190409486988193, + "learning_rate": 5.1372387768976694e-05, + "loss": 0.2885, + "num_input_tokens_seen": 56298848, + "step": 8435 + }, + { + "epoch": 1.0340571498575848, + "grad_norm": 1.123555703515511, + "learning_rate": 5.132175817973086e-05, + "loss": 0.2662, + "num_input_tokens_seen": 56331816, + "step": 8440 + }, + { + "epoch": 1.034669688524088, + "grad_norm": 0.8765976399447823, + "learning_rate": 5.1271127234246275e-05, + "loss": 0.2736, + "num_input_tokens_seen": 56366040, + "step": 8445 + }, + { + "epoch": 1.0352822271905915, + "grad_norm": 1.2236007070631774, + "learning_rate": 5.1220494984474674e-05, + "loss": 0.2644, + "num_input_tokens_seen": 56398904, + "step": 8450 + }, + { + "epoch": 1.0352822271905915, + "eval_loss": 0.17673595249652863, + "eval_runtime": 19.2694, + "eval_samples_per_second": 3.114, + "eval_steps_per_second": 0.778, + "num_input_tokens_seen": 56398904, + "step": 8450 + }, + { + "epoch": 1.0358947658570947, + "grad_norm": 1.4128549131244026, + "learning_rate": 5.116986148236916e-05, + "loss": 0.2463, + "num_input_tokens_seen": 56432392, + "step": 8455 + }, + { + "epoch": 1.036507304523598, + "grad_norm": 0.8135856933177207, + "learning_rate": 5.1119226779884113e-05, + "loss": 0.2756, + "num_input_tokens_seen": 56465048, + "step": 8460 + }, + { + "epoch": 1.0371198431901014, + "grad_norm": 10.18165872500146, + "learning_rate": 5.106859092897514e-05, + "loss": 0.3336, + "num_input_tokens_seen": 56498760, + "step": 8465 + }, + { + "epoch": 1.0377323818566047, + "grad_norm": 0.9519318491595695, + "learning_rate": 5.1017953981598974e-05, + "loss": 0.2393, + "num_input_tokens_seen": 56532000, + "step": 8470 + }, + { + "epoch": 1.0383449205231081, + "grad_norm": 1.251974853727647, + "learning_rate": 5.096731598971359e-05, + "loss": 0.3168, + "num_input_tokens_seen": 56565360, + "step": 8475 + }, + { + "epoch": 1.0389574591896114, + "grad_norm": 0.5113814364030088, + "learning_rate": 5.0916677005277934e-05, + "loss": 0.203, + "num_input_tokens_seen": 56599504, + "step": 8480 + }, + { + "epoch": 1.0395699978561146, + "grad_norm": 1.2458717961713626, + "learning_rate": 5.086603708025201e-05, + "loss": 0.3049, + "num_input_tokens_seen": 56631704, + "step": 8485 + }, + { + "epoch": 1.040182536522618, + "grad_norm": 18.342600522237536, + "learning_rate": 5.0815396266596794e-05, + "loss": 0.2823, + "num_input_tokens_seen": 56665032, + "step": 8490 + }, + { + "epoch": 1.0407950751891213, + "grad_norm": 1.1962655759843381, + "learning_rate": 5.076475461627415e-05, + "loss": 0.2552, + "num_input_tokens_seen": 56698112, + "step": 8495 + }, + { + "epoch": 1.0414076138556245, + "grad_norm": 0.8925761760184269, + "learning_rate": 5.07141121812468e-05, + "loss": 0.206, + "num_input_tokens_seen": 56732240, + "step": 8500 + }, + { + "epoch": 1.0414076138556245, + "eval_loss": 0.16079044342041016, + "eval_runtime": 19.0722, + "eval_samples_per_second": 3.146, + "eval_steps_per_second": 0.786, + "num_input_tokens_seen": 56732240, + "step": 8500 + }, + { + "epoch": 1.042020152522128, + "grad_norm": 3.4944036674025107, + "learning_rate": 5.06634690134783e-05, + "loss": 0.2222, + "num_input_tokens_seen": 56766312, + "step": 8505 + }, + { + "epoch": 1.0426326911886312, + "grad_norm": 1.4296338273629565, + "learning_rate": 5.061282516493294e-05, + "loss": 0.2588, + "num_input_tokens_seen": 56799880, + "step": 8510 + }, + { + "epoch": 1.0432452298551347, + "grad_norm": 1.2471172033667304, + "learning_rate": 5.05621806875757e-05, + "loss": 0.2503, + "num_input_tokens_seen": 56833000, + "step": 8515 + }, + { + "epoch": 1.043857768521638, + "grad_norm": 1.1579772099717602, + "learning_rate": 5.051153563337222e-05, + "loss": 0.2596, + "num_input_tokens_seen": 56867064, + "step": 8520 + }, + { + "epoch": 1.0444703071881412, + "grad_norm": 0.9007549648476204, + "learning_rate": 5.046089005428872e-05, + "loss": 0.1845, + "num_input_tokens_seen": 56900952, + "step": 8525 + }, + { + "epoch": 1.0450828458546446, + "grad_norm": 3.0321883913760703, + "learning_rate": 5.0410244002292004e-05, + "loss": 0.2324, + "num_input_tokens_seen": 56934104, + "step": 8530 + }, + { + "epoch": 1.0456953845211479, + "grad_norm": 0.9342587435355872, + "learning_rate": 5.0359597529349266e-05, + "loss": 0.2302, + "num_input_tokens_seen": 56967848, + "step": 8535 + }, + { + "epoch": 1.0463079231876513, + "grad_norm": 1.0678922924399599, + "learning_rate": 5.030895068742822e-05, + "loss": 0.2654, + "num_input_tokens_seen": 57001056, + "step": 8540 + }, + { + "epoch": 1.0469204618541545, + "grad_norm": 1.1667668048151056, + "learning_rate": 5.025830352849693e-05, + "loss": 0.2233, + "num_input_tokens_seen": 57035056, + "step": 8545 + }, + { + "epoch": 1.0475330005206578, + "grad_norm": 1.2795522675307125, + "learning_rate": 5.020765610452375e-05, + "loss": 0.2414, + "num_input_tokens_seen": 57068112, + "step": 8550 + }, + { + "epoch": 1.0475330005206578, + "eval_loss": 0.21543771028518677, + "eval_runtime": 19.4827, + "eval_samples_per_second": 3.08, + "eval_steps_per_second": 0.77, + "num_input_tokens_seen": 57068112, + "step": 8550 + }, + { + "epoch": 1.0481455391871612, + "grad_norm": 1.797223924306754, + "learning_rate": 5.0157008467477385e-05, + "loss": 0.2635, + "num_input_tokens_seen": 57101496, + "step": 8555 + }, + { + "epoch": 1.0487580778536645, + "grad_norm": 0.882197810353444, + "learning_rate": 5.010636066932667e-05, + "loss": 0.233, + "num_input_tokens_seen": 57135456, + "step": 8560 + }, + { + "epoch": 1.049370616520168, + "grad_norm": 1.070082354108884, + "learning_rate": 5.0055712762040674e-05, + "loss": 0.2612, + "num_input_tokens_seen": 57168912, + "step": 8565 + }, + { + "epoch": 1.0499831551866712, + "grad_norm": 1.0460007423381288, + "learning_rate": 5.000506479758854e-05, + "loss": 0.2731, + "num_input_tokens_seen": 57202432, + "step": 8570 + }, + { + "epoch": 1.0505956938531744, + "grad_norm": 0.8733608240894085, + "learning_rate": 4.995441682793949e-05, + "loss": 0.2406, + "num_input_tokens_seen": 57235744, + "step": 8575 + }, + { + "epoch": 1.0512082325196779, + "grad_norm": 1.1153330239306591, + "learning_rate": 4.990376890506273e-05, + "loss": 0.2107, + "num_input_tokens_seen": 57269840, + "step": 8580 + }, + { + "epoch": 1.0518207711861811, + "grad_norm": 1.0804612773204887, + "learning_rate": 4.9853121080927435e-05, + "loss": 0.2, + "num_input_tokens_seen": 57304016, + "step": 8585 + }, + { + "epoch": 1.0524333098526844, + "grad_norm": 0.9253174244743283, + "learning_rate": 4.980247340750268e-05, + "loss": 0.2134, + "num_input_tokens_seen": 57337792, + "step": 8590 + }, + { + "epoch": 1.0530458485191878, + "grad_norm": 0.8895114896802918, + "learning_rate": 4.975182593675736e-05, + "loss": 0.2785, + "num_input_tokens_seen": 57371248, + "step": 8595 + }, + { + "epoch": 1.053658387185691, + "grad_norm": 1.1411282766974657, + "learning_rate": 4.970117872066021e-05, + "loss": 0.2406, + "num_input_tokens_seen": 57404752, + "step": 8600 + }, + { + "epoch": 1.053658387185691, + "eval_loss": 0.15721498429775238, + "eval_runtime": 20.2664, + "eval_samples_per_second": 2.961, + "eval_steps_per_second": 0.74, + "num_input_tokens_seen": 57404752, + "step": 8600 + }, + { + "epoch": 1.0542709258521945, + "grad_norm": 0.8807547314893227, + "learning_rate": 4.965053181117965e-05, + "loss": 0.2482, + "num_input_tokens_seen": 57438376, + "step": 8605 + }, + { + "epoch": 1.0548834645186977, + "grad_norm": 1.2941312525485775, + "learning_rate": 4.959988526028384e-05, + "loss": 0.2265, + "num_input_tokens_seen": 57471696, + "step": 8610 + }, + { + "epoch": 1.055496003185201, + "grad_norm": 1.246723562825523, + "learning_rate": 4.954923911994051e-05, + "loss": 0.2478, + "num_input_tokens_seen": 57505344, + "step": 8615 + }, + { + "epoch": 1.0561085418517044, + "grad_norm": 0.8696159822122087, + "learning_rate": 4.9498593442117e-05, + "loss": 0.2196, + "num_input_tokens_seen": 57538944, + "step": 8620 + }, + { + "epoch": 1.0567210805182077, + "grad_norm": 1.100498968635161, + "learning_rate": 4.9447948278780216e-05, + "loss": 0.2548, + "num_input_tokens_seen": 57572168, + "step": 8625 + }, + { + "epoch": 1.0573336191847111, + "grad_norm": 1.3027773516434187, + "learning_rate": 4.939730368189647e-05, + "loss": 0.2524, + "num_input_tokens_seen": 57605056, + "step": 8630 + }, + { + "epoch": 1.0579461578512144, + "grad_norm": 7.984563924980079, + "learning_rate": 4.934665970343151e-05, + "loss": 0.3168, + "num_input_tokens_seen": 57638896, + "step": 8635 + }, + { + "epoch": 1.0585586965177176, + "grad_norm": 1.0785984955978396, + "learning_rate": 4.929601639535047e-05, + "loss": 0.2335, + "num_input_tokens_seen": 57672136, + "step": 8640 + }, + { + "epoch": 1.059171235184221, + "grad_norm": 0.736383979913198, + "learning_rate": 4.924537380961779e-05, + "loss": 0.2285, + "num_input_tokens_seen": 57705352, + "step": 8645 + }, + { + "epoch": 1.0597837738507243, + "grad_norm": 0.9654493425227267, + "learning_rate": 4.919473199819715e-05, + "loss": 0.2487, + "num_input_tokens_seen": 57738424, + "step": 8650 + }, + { + "epoch": 1.0597837738507243, + "eval_loss": 0.1030297502875328, + "eval_runtime": 19.5428, + "eval_samples_per_second": 3.07, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 57738424, + "step": 8650 + }, + { + "epoch": 1.0603963125172275, + "grad_norm": 0.9791592066735513, + "learning_rate": 4.914409101305146e-05, + "loss": 0.2628, + "num_input_tokens_seen": 57771856, + "step": 8655 + }, + { + "epoch": 1.061008851183731, + "grad_norm": 1.289009690394446, + "learning_rate": 4.9093450906142765e-05, + "loss": 0.2704, + "num_input_tokens_seen": 57804920, + "step": 8660 + }, + { + "epoch": 1.0616213898502342, + "grad_norm": 1.0746448752974245, + "learning_rate": 4.904281172943223e-05, + "loss": 0.2324, + "num_input_tokens_seen": 57838728, + "step": 8665 + }, + { + "epoch": 1.0622339285167377, + "grad_norm": 1.2369519049580886, + "learning_rate": 4.899217353488004e-05, + "loss": 0.246, + "num_input_tokens_seen": 57871664, + "step": 8670 + }, + { + "epoch": 1.062846467183241, + "grad_norm": 1.1205249818128704, + "learning_rate": 4.894153637444537e-05, + "loss": 0.2308, + "num_input_tokens_seen": 57905416, + "step": 8675 + }, + { + "epoch": 1.0634590058497442, + "grad_norm": 1.045481161787216, + "learning_rate": 4.889090030008638e-05, + "loss": 0.2462, + "num_input_tokens_seen": 57938832, + "step": 8680 + }, + { + "epoch": 1.0640715445162476, + "grad_norm": 1.1002634338505846, + "learning_rate": 4.8840265363760046e-05, + "loss": 0.2348, + "num_input_tokens_seen": 57972576, + "step": 8685 + }, + { + "epoch": 1.0646840831827509, + "grad_norm": 0.8460858520829989, + "learning_rate": 4.878963161742224e-05, + "loss": 0.2228, + "num_input_tokens_seen": 58006624, + "step": 8690 + }, + { + "epoch": 1.0652966218492543, + "grad_norm": 11.076510522769789, + "learning_rate": 4.873899911302758e-05, + "loss": 0.2831, + "num_input_tokens_seen": 58040088, + "step": 8695 + }, + { + "epoch": 1.0659091605157576, + "grad_norm": 0.9484892667539314, + "learning_rate": 4.86883679025294e-05, + "loss": 0.2314, + "num_input_tokens_seen": 58073704, + "step": 8700 + }, + { + "epoch": 1.0659091605157576, + "eval_loss": 0.1421510875225067, + "eval_runtime": 19.6485, + "eval_samples_per_second": 3.054, + "eval_steps_per_second": 0.763, + "num_input_tokens_seen": 58073704, + "step": 8700 + }, + { + "epoch": 1.0665216991822608, + "grad_norm": 0.8399907018643262, + "learning_rate": 4.8637738037879734e-05, + "loss": 0.2336, + "num_input_tokens_seen": 58106888, + "step": 8705 + }, + { + "epoch": 1.0671342378487643, + "grad_norm": 0.8622745441572411, + "learning_rate": 4.8587109571029246e-05, + "loss": 0.2186, + "num_input_tokens_seen": 58140584, + "step": 8710 + }, + { + "epoch": 1.0677467765152675, + "grad_norm": 0.8886508590205018, + "learning_rate": 4.853648255392709e-05, + "loss": 0.2173, + "num_input_tokens_seen": 58174072, + "step": 8715 + }, + { + "epoch": 1.068359315181771, + "grad_norm": 1.4672098804127123, + "learning_rate": 4.848585703852103e-05, + "loss": 0.2116, + "num_input_tokens_seen": 58207808, + "step": 8720 + }, + { + "epoch": 1.0689718538482742, + "grad_norm": 0.757294434869373, + "learning_rate": 4.843523307675723e-05, + "loss": 0.2611, + "num_input_tokens_seen": 58241136, + "step": 8725 + }, + { + "epoch": 1.0695843925147774, + "grad_norm": 1.0465383698252306, + "learning_rate": 4.838461072058025e-05, + "loss": 0.2031, + "num_input_tokens_seen": 58275600, + "step": 8730 + }, + { + "epoch": 1.070196931181281, + "grad_norm": 19.777433792158014, + "learning_rate": 4.8333990021933065e-05, + "loss": 0.2438, + "num_input_tokens_seen": 58309360, + "step": 8735 + }, + { + "epoch": 1.0708094698477841, + "grad_norm": 0.7508437196505346, + "learning_rate": 4.8283371032756885e-05, + "loss": 0.2834, + "num_input_tokens_seen": 58342896, + "step": 8740 + }, + { + "epoch": 1.0714220085142874, + "grad_norm": 0.9934668217893162, + "learning_rate": 4.823275380499121e-05, + "loss": 0.3, + "num_input_tokens_seen": 58375272, + "step": 8745 + }, + { + "epoch": 1.0720345471807908, + "grad_norm": 1.3561170521016508, + "learning_rate": 4.818213839057371e-05, + "loss": 0.2452, + "num_input_tokens_seen": 58409256, + "step": 8750 + }, + { + "epoch": 1.0720345471807908, + "eval_loss": 0.21566660702228546, + "eval_runtime": 19.2432, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 0.779, + "num_input_tokens_seen": 58409256, + "step": 8750 + }, + { + "epoch": 1.072647085847294, + "grad_norm": 1.3208806294295181, + "learning_rate": 4.8131524841440176e-05, + "loss": 0.2479, + "num_input_tokens_seen": 58442384, + "step": 8755 + }, + { + "epoch": 1.0732596245137975, + "grad_norm": 0.844897161702894, + "learning_rate": 4.8080913209524534e-05, + "loss": 0.2227, + "num_input_tokens_seen": 58475920, + "step": 8760 + }, + { + "epoch": 1.0738721631803008, + "grad_norm": 0.9451341485318233, + "learning_rate": 4.8030303546758705e-05, + "loss": 0.2618, + "num_input_tokens_seen": 58509144, + "step": 8765 + }, + { + "epoch": 1.074484701846804, + "grad_norm": 1.3886849708188145, + "learning_rate": 4.797969590507261e-05, + "loss": 0.2381, + "num_input_tokens_seen": 58542680, + "step": 8770 + }, + { + "epoch": 1.0750972405133075, + "grad_norm": 1.8792061425987017, + "learning_rate": 4.792909033639409e-05, + "loss": 0.2139, + "num_input_tokens_seen": 58576552, + "step": 8775 + }, + { + "epoch": 1.0757097791798107, + "grad_norm": 1.3479801204399742, + "learning_rate": 4.787848689264883e-05, + "loss": 0.2487, + "num_input_tokens_seen": 58609576, + "step": 8780 + }, + { + "epoch": 1.076322317846314, + "grad_norm": 0.8981606321723532, + "learning_rate": 4.7827885625760385e-05, + "loss": 0.21, + "num_input_tokens_seen": 58643352, + "step": 8785 + }, + { + "epoch": 1.0769348565128174, + "grad_norm": 1.1644651897820515, + "learning_rate": 4.777728658765007e-05, + "loss": 0.2483, + "num_input_tokens_seen": 58677304, + "step": 8790 + }, + { + "epoch": 1.0775473951793206, + "grad_norm": 1.0991086635789076, + "learning_rate": 4.7726689830236834e-05, + "loss": 0.2495, + "num_input_tokens_seen": 58710320, + "step": 8795 + }, + { + "epoch": 1.078159933845824, + "grad_norm": 1.1964444636886824, + "learning_rate": 4.76760954054374e-05, + "loss": 0.2578, + "num_input_tokens_seen": 58743680, + "step": 8800 + }, + { + "epoch": 1.078159933845824, + "eval_loss": 0.13818946480751038, + "eval_runtime": 19.6156, + "eval_samples_per_second": 3.059, + "eval_steps_per_second": 0.765, + "num_input_tokens_seen": 58743680, + "step": 8800 + }, + { + "epoch": 1.0787724725123273, + "grad_norm": 0.877604661964029, + "learning_rate": 4.762550336516603e-05, + "loss": 0.2521, + "num_input_tokens_seen": 58777032, + "step": 8805 + }, + { + "epoch": 1.0793850111788306, + "grad_norm": 0.8686744583453913, + "learning_rate": 4.757491376133454e-05, + "loss": 0.2287, + "num_input_tokens_seen": 58811392, + "step": 8810 + }, + { + "epoch": 1.079997549845334, + "grad_norm": 1.004155173388141, + "learning_rate": 4.752432664585228e-05, + "loss": 0.2349, + "num_input_tokens_seen": 58844584, + "step": 8815 + }, + { + "epoch": 1.0806100885118373, + "grad_norm": 4.260820074518357, + "learning_rate": 4.747374207062601e-05, + "loss": 0.2907, + "num_input_tokens_seen": 58877152, + "step": 8820 + }, + { + "epoch": 1.0812226271783407, + "grad_norm": 2.0609737658838667, + "learning_rate": 4.742316008755992e-05, + "loss": 0.2687, + "num_input_tokens_seen": 58910592, + "step": 8825 + }, + { + "epoch": 1.081835165844844, + "grad_norm": 1.251686350040789, + "learning_rate": 4.73725807485555e-05, + "loss": 0.2837, + "num_input_tokens_seen": 58943496, + "step": 8830 + }, + { + "epoch": 1.0824477045113472, + "grad_norm": 1.1541046385099845, + "learning_rate": 4.732200410551155e-05, + "loss": 0.2229, + "num_input_tokens_seen": 58976320, + "step": 8835 + }, + { + "epoch": 1.0830602431778507, + "grad_norm": 5.742903467706001, + "learning_rate": 4.727143021032412e-05, + "loss": 0.2708, + "num_input_tokens_seen": 59010264, + "step": 8840 + }, + { + "epoch": 1.083672781844354, + "grad_norm": 1.00858186367192, + "learning_rate": 4.7220859114886394e-05, + "loss": 0.2271, + "num_input_tokens_seen": 59043984, + "step": 8845 + }, + { + "epoch": 1.0842853205108574, + "grad_norm": 1.145153381406403, + "learning_rate": 4.7170290871088734e-05, + "loss": 0.2266, + "num_input_tokens_seen": 59077880, + "step": 8850 + }, + { + "epoch": 1.0842853205108574, + "eval_loss": 0.1759234070777893, + "eval_runtime": 18.9738, + "eval_samples_per_second": 3.162, + "eval_steps_per_second": 0.791, + "num_input_tokens_seen": 59077880, + "step": 8850 + }, + { + "epoch": 1.0848978591773606, + "grad_norm": 0.8947224151078845, + "learning_rate": 4.711972553081855e-05, + "loss": 0.2391, + "num_input_tokens_seen": 59111400, + "step": 8855 + }, + { + "epoch": 1.0855103978438638, + "grad_norm": 0.9502024958074183, + "learning_rate": 4.706916314596025e-05, + "loss": 0.2489, + "num_input_tokens_seen": 59145032, + "step": 8860 + }, + { + "epoch": 1.0861229365103673, + "grad_norm": 1.3193842139688745, + "learning_rate": 4.701860376839526e-05, + "loss": 0.27, + "num_input_tokens_seen": 59178760, + "step": 8865 + }, + { + "epoch": 1.0867354751768705, + "grad_norm": 0.9619785555412352, + "learning_rate": 4.696804745000188e-05, + "loss": 0.2261, + "num_input_tokens_seen": 59212424, + "step": 8870 + }, + { + "epoch": 1.0873480138433738, + "grad_norm": 1.0377122269196115, + "learning_rate": 4.6917494242655325e-05, + "loss": 0.2284, + "num_input_tokens_seen": 59245624, + "step": 8875 + }, + { + "epoch": 1.0879605525098772, + "grad_norm": 1.9818945298107382, + "learning_rate": 4.6866944198227526e-05, + "loss": 0.2783, + "num_input_tokens_seen": 59278912, + "step": 8880 + }, + { + "epoch": 1.0885730911763805, + "grad_norm": 1.076257410784753, + "learning_rate": 4.681639736858725e-05, + "loss": 0.2173, + "num_input_tokens_seen": 59312872, + "step": 8885 + }, + { + "epoch": 1.089185629842884, + "grad_norm": 1.1411014546812641, + "learning_rate": 4.6765853805599914e-05, + "loss": 0.249, + "num_input_tokens_seen": 59345944, + "step": 8890 + }, + { + "epoch": 1.0897981685093872, + "grad_norm": 0.947248742884469, + "learning_rate": 4.6715313561127634e-05, + "loss": 0.2639, + "num_input_tokens_seen": 59379368, + "step": 8895 + }, + { + "epoch": 1.0904107071758904, + "grad_norm": 0.9955367790246993, + "learning_rate": 4.666477668702907e-05, + "loss": 0.2321, + "num_input_tokens_seen": 59412496, + "step": 8900 + }, + { + "epoch": 1.0904107071758904, + "eval_loss": 0.15350688993930817, + "eval_runtime": 19.3109, + "eval_samples_per_second": 3.107, + "eval_steps_per_second": 0.777, + "num_input_tokens_seen": 59412496, + "step": 8900 + }, + { + "epoch": 1.0910232458423939, + "grad_norm": 1.1230338923667038, + "learning_rate": 4.661424323515946e-05, + "loss": 0.236, + "num_input_tokens_seen": 59445248, + "step": 8905 + }, + { + "epoch": 1.091635784508897, + "grad_norm": 16.74453899121403, + "learning_rate": 4.6563713257370526e-05, + "loss": 0.3239, + "num_input_tokens_seen": 59478080, + "step": 8910 + }, + { + "epoch": 1.0922483231754005, + "grad_norm": 1.4150122275225778, + "learning_rate": 4.651318680551039e-05, + "loss": 0.2349, + "num_input_tokens_seen": 59511552, + "step": 8915 + }, + { + "epoch": 1.0928608618419038, + "grad_norm": 3.6169382878701204, + "learning_rate": 4.646266393142361e-05, + "loss": 0.2413, + "num_input_tokens_seen": 59545072, + "step": 8920 + }, + { + "epoch": 1.093473400508407, + "grad_norm": 0.8669784159808702, + "learning_rate": 4.6412144686951014e-05, + "loss": 0.2575, + "num_input_tokens_seen": 59578112, + "step": 8925 + }, + { + "epoch": 1.0940859391749105, + "grad_norm": 0.7563684774216004, + "learning_rate": 4.6361629123929776e-05, + "loss": 0.2172, + "num_input_tokens_seen": 59611888, + "step": 8930 + }, + { + "epoch": 1.0946984778414137, + "grad_norm": 1.6396948804168443, + "learning_rate": 4.6311117294193216e-05, + "loss": 0.2573, + "num_input_tokens_seen": 59645488, + "step": 8935 + }, + { + "epoch": 1.095311016507917, + "grad_norm": 1.4952727819463327, + "learning_rate": 4.626060924957088e-05, + "loss": 0.2463, + "num_input_tokens_seen": 59679080, + "step": 8940 + }, + { + "epoch": 1.0959235551744204, + "grad_norm": 0.9969309533794476, + "learning_rate": 4.6210105041888415e-05, + "loss": 0.2196, + "num_input_tokens_seen": 59713224, + "step": 8945 + }, + { + "epoch": 1.0965360938409237, + "grad_norm": 0.8275611068680018, + "learning_rate": 4.615960472296751e-05, + "loss": 0.2312, + "num_input_tokens_seen": 59746528, + "step": 8950 + }, + { + "epoch": 1.0965360938409237, + "eval_loss": 0.14111146330833435, + "eval_runtime": 19.5397, + "eval_samples_per_second": 3.071, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 59746528, + "step": 8950 + }, + { + "epoch": 1.0971486325074271, + "grad_norm": 1.0111161857092152, + "learning_rate": 4.610910834462591e-05, + "loss": 0.2199, + "num_input_tokens_seen": 59780704, + "step": 8955 + }, + { + "epoch": 1.0977611711739304, + "grad_norm": 0.8846126659482926, + "learning_rate": 4.605861595867726e-05, + "loss": 0.2347, + "num_input_tokens_seen": 59814448, + "step": 8960 + }, + { + "epoch": 1.0983737098404336, + "grad_norm": 0.819175063486554, + "learning_rate": 4.600812761693114e-05, + "loss": 0.2494, + "num_input_tokens_seen": 59847496, + "step": 8965 + }, + { + "epoch": 1.098986248506937, + "grad_norm": 0.9297962341408569, + "learning_rate": 4.595764337119297e-05, + "loss": 0.2494, + "num_input_tokens_seen": 59880232, + "step": 8970 + }, + { + "epoch": 1.0995987871734403, + "grad_norm": 4.414361415538194, + "learning_rate": 4.5907163273263995e-05, + "loss": 0.2589, + "num_input_tokens_seen": 59913928, + "step": 8975 + }, + { + "epoch": 1.1002113258399437, + "grad_norm": 1.0330019295685864, + "learning_rate": 4.585668737494116e-05, + "loss": 0.2584, + "num_input_tokens_seen": 59947160, + "step": 8980 + }, + { + "epoch": 1.100823864506447, + "grad_norm": 0.9744331693353389, + "learning_rate": 4.5806215728017144e-05, + "loss": 0.2586, + "num_input_tokens_seen": 59980048, + "step": 8985 + }, + { + "epoch": 1.1014364031729502, + "grad_norm": 0.8213994780893024, + "learning_rate": 4.575574838428023e-05, + "loss": 0.2098, + "num_input_tokens_seen": 60014696, + "step": 8990 + }, + { + "epoch": 1.1020489418394537, + "grad_norm": 2.258099952042849, + "learning_rate": 4.57052853955143e-05, + "loss": 0.2502, + "num_input_tokens_seen": 60048400, + "step": 8995 + }, + { + "epoch": 1.102661480505957, + "grad_norm": 0.872128884102733, + "learning_rate": 4.5654826813498764e-05, + "loss": 0.1656, + "num_input_tokens_seen": 60083320, + "step": 9000 + }, + { + "epoch": 1.102661480505957, + "eval_loss": 0.2304692566394806, + "eval_runtime": 19.0726, + "eval_samples_per_second": 3.146, + "eval_steps_per_second": 0.786, + "num_input_tokens_seen": 60083320, + "step": 9000 + }, + { + "epoch": 1.1032740191724604, + "grad_norm": 0.9745522813224388, + "learning_rate": 4.5604372690008506e-05, + "loss": 0.208, + "num_input_tokens_seen": 60117224, + "step": 9005 + }, + { + "epoch": 1.1038865578389636, + "grad_norm": 26.715022487825497, + "learning_rate": 4.555392307681387e-05, + "loss": 0.2704, + "num_input_tokens_seen": 60151072, + "step": 9010 + }, + { + "epoch": 1.1044990965054668, + "grad_norm": 1.1900222470251158, + "learning_rate": 4.5503478025680496e-05, + "loss": 0.2445, + "num_input_tokens_seen": 60184968, + "step": 9015 + }, + { + "epoch": 1.1051116351719703, + "grad_norm": 0.8236473661830899, + "learning_rate": 4.5453037588369436e-05, + "loss": 0.2017, + "num_input_tokens_seen": 60219128, + "step": 9020 + }, + { + "epoch": 1.1057241738384735, + "grad_norm": 1.3207534871887328, + "learning_rate": 4.540260181663694e-05, + "loss": 0.2748, + "num_input_tokens_seen": 60252200, + "step": 9025 + }, + { + "epoch": 1.1063367125049768, + "grad_norm": 1.1335531755278507, + "learning_rate": 4.535217076223449e-05, + "loss": 0.2797, + "num_input_tokens_seen": 60285264, + "step": 9030 + }, + { + "epoch": 1.1069492511714802, + "grad_norm": 1.1808545178037566, + "learning_rate": 4.530174447690875e-05, + "loss": 0.2408, + "num_input_tokens_seen": 60318840, + "step": 9035 + }, + { + "epoch": 1.1075617898379835, + "grad_norm": 1.0213517695042853, + "learning_rate": 4.5251323012401444e-05, + "loss": 0.212, + "num_input_tokens_seen": 60352632, + "step": 9040 + }, + { + "epoch": 1.108174328504487, + "grad_norm": 0.7079457769054818, + "learning_rate": 4.5200906420449394e-05, + "loss": 0.2171, + "num_input_tokens_seen": 60386072, + "step": 9045 + }, + { + "epoch": 1.1087868671709902, + "grad_norm": 1.0367325470895745, + "learning_rate": 4.5150494752784395e-05, + "loss": 0.2287, + "num_input_tokens_seen": 60419128, + "step": 9050 + }, + { + "epoch": 1.1087868671709902, + "eval_loss": 0.04092838987708092, + "eval_runtime": 19.4239, + "eval_samples_per_second": 3.089, + "eval_steps_per_second": 0.772, + "num_input_tokens_seen": 60419128, + "step": 9050 + }, + { + "epoch": 1.1093994058374934, + "grad_norm": 0.7538773670635792, + "learning_rate": 4.5100088061133206e-05, + "loss": 0.2298, + "num_input_tokens_seen": 60453344, + "step": 9055 + }, + { + "epoch": 1.1100119445039969, + "grad_norm": 17.56635445096275, + "learning_rate": 4.504968639721746e-05, + "loss": 0.2425, + "num_input_tokens_seen": 60486888, + "step": 9060 + }, + { + "epoch": 1.1106244831705, + "grad_norm": 0.5760390318469033, + "learning_rate": 4.4999289812753664e-05, + "loss": 0.2285, + "num_input_tokens_seen": 60521120, + "step": 9065 + }, + { + "epoch": 1.1112370218370033, + "grad_norm": 1.1031801037784152, + "learning_rate": 4.494889835945308e-05, + "loss": 0.2351, + "num_input_tokens_seen": 60554328, + "step": 9070 + }, + { + "epoch": 1.1118495605035068, + "grad_norm": 0.7105645982299309, + "learning_rate": 4.489851208902172e-05, + "loss": 0.2191, + "num_input_tokens_seen": 60588328, + "step": 9075 + }, + { + "epoch": 1.11246209917001, + "grad_norm": 1.1154929528289679, + "learning_rate": 4.484813105316028e-05, + "loss": 0.2143, + "num_input_tokens_seen": 60621936, + "step": 9080 + }, + { + "epoch": 1.1130746378365135, + "grad_norm": 1.0758922239348947, + "learning_rate": 4.4797755303564076e-05, + "loss": 0.2731, + "num_input_tokens_seen": 60654744, + "step": 9085 + }, + { + "epoch": 1.1136871765030167, + "grad_norm": 0.8901806288698858, + "learning_rate": 4.474738489192301e-05, + "loss": 0.2238, + "num_input_tokens_seen": 60688768, + "step": 9090 + }, + { + "epoch": 1.11429971516952, + "grad_norm": 0.9021491103829905, + "learning_rate": 4.469701986992149e-05, + "loss": 0.2107, + "num_input_tokens_seen": 60722144, + "step": 9095 + }, + { + "epoch": 1.1149122538360234, + "grad_norm": 1.1700336232268902, + "learning_rate": 4.464666028923843e-05, + "loss": 0.2446, + "num_input_tokens_seen": 60755592, + "step": 9100 + }, + { + "epoch": 1.1149122538360234, + "eval_loss": 0.0737871304154396, + "eval_runtime": 19.2134, + "eval_samples_per_second": 3.123, + "eval_steps_per_second": 0.781, + "num_input_tokens_seen": 60755592, + "step": 9100 + }, + { + "epoch": 1.1155247925025267, + "grad_norm": 1.0539221324518135, + "learning_rate": 4.4596306201547114e-05, + "loss": 0.1852, + "num_input_tokens_seen": 60789632, + "step": 9105 + }, + { + "epoch": 1.1161373311690301, + "grad_norm": 1.0518691454899836, + "learning_rate": 4.45459576585152e-05, + "loss": 0.2752, + "num_input_tokens_seen": 60822720, + "step": 9110 + }, + { + "epoch": 1.1167498698355334, + "grad_norm": 1.0733170897771513, + "learning_rate": 4.449561471180472e-05, + "loss": 0.2468, + "num_input_tokens_seen": 60856216, + "step": 9115 + }, + { + "epoch": 1.1173624085020366, + "grad_norm": 1.1691785748422785, + "learning_rate": 4.444527741307187e-05, + "loss": 0.2363, + "num_input_tokens_seen": 60889440, + "step": 9120 + }, + { + "epoch": 1.11797494716854, + "grad_norm": 1.3441157301317685, + "learning_rate": 4.439494581396709e-05, + "loss": 0.2657, + "num_input_tokens_seen": 60922448, + "step": 9125 + }, + { + "epoch": 1.1185874858350433, + "grad_norm": 1.5128050305534737, + "learning_rate": 4.434461996613498e-05, + "loss": 0.2729, + "num_input_tokens_seen": 60955232, + "step": 9130 + }, + { + "epoch": 1.1192000245015468, + "grad_norm": 1.0743918408952606, + "learning_rate": 4.429429992121425e-05, + "loss": 0.1933, + "num_input_tokens_seen": 60989688, + "step": 9135 + }, + { + "epoch": 1.11981256316805, + "grad_norm": 1.398074866165587, + "learning_rate": 4.4243985730837616e-05, + "loss": 0.2801, + "num_input_tokens_seen": 61022928, + "step": 9140 + }, + { + "epoch": 1.1204251018345532, + "grad_norm": 1.4280522219842753, + "learning_rate": 4.4193677446631834e-05, + "loss": 0.2302, + "num_input_tokens_seen": 61056264, + "step": 9145 + }, + { + "epoch": 1.1210376405010567, + "grad_norm": 1.517006906242962, + "learning_rate": 4.4143375120217556e-05, + "loss": 0.2335, + "num_input_tokens_seen": 61089840, + "step": 9150 + }, + { + "epoch": 1.1210376405010567, + "eval_loss": 0.16165664792060852, + "eval_runtime": 19.2284, + "eval_samples_per_second": 3.12, + "eval_steps_per_second": 0.78, + "num_input_tokens_seen": 61089840, + "step": 9150 + }, + { + "epoch": 1.12165017916756, + "grad_norm": 0.7756944110389568, + "learning_rate": 4.409307880320934e-05, + "loss": 0.2204, + "num_input_tokens_seen": 61123296, + "step": 9155 + }, + { + "epoch": 1.1222627178340632, + "grad_norm": 1.1578834375601217, + "learning_rate": 4.4042788547215605e-05, + "loss": 0.1992, + "num_input_tokens_seen": 61157128, + "step": 9160 + }, + { + "epoch": 1.1228752565005666, + "grad_norm": 0.7541074244566018, + "learning_rate": 4.399250440383848e-05, + "loss": 0.2861, + "num_input_tokens_seen": 61189824, + "step": 9165 + }, + { + "epoch": 1.1234877951670699, + "grad_norm": 0.9440037336859107, + "learning_rate": 4.394222642467391e-05, + "loss": 0.2244, + "num_input_tokens_seen": 61223240, + "step": 9170 + }, + { + "epoch": 1.1241003338335733, + "grad_norm": 1.011960936893094, + "learning_rate": 4.389195466131143e-05, + "loss": 0.2098, + "num_input_tokens_seen": 61257016, + "step": 9175 + }, + { + "epoch": 1.1247128725000766, + "grad_norm": 14.550358781965706, + "learning_rate": 4.384168916533428e-05, + "loss": 0.2895, + "num_input_tokens_seen": 61290784, + "step": 9180 + }, + { + "epoch": 1.1253254111665798, + "grad_norm": 1.1761792982953008, + "learning_rate": 4.3791429988319196e-05, + "loss": 0.2716, + "num_input_tokens_seen": 61324656, + "step": 9185 + }, + { + "epoch": 1.1259379498330833, + "grad_norm": 27.22942333671915, + "learning_rate": 4.374117718183647e-05, + "loss": 0.2216, + "num_input_tokens_seen": 61358896, + "step": 9190 + }, + { + "epoch": 1.1265504884995865, + "grad_norm": 1.2521529743283935, + "learning_rate": 4.369093079744986e-05, + "loss": 0.246, + "num_input_tokens_seen": 61392976, + "step": 9195 + }, + { + "epoch": 1.1271630271660897, + "grad_norm": 1.1570410518690883, + "learning_rate": 4.3640690886716505e-05, + "loss": 0.2946, + "num_input_tokens_seen": 61425936, + "step": 9200 + }, + { + "epoch": 1.1271630271660897, + "eval_loss": 0.11595644801855087, + "eval_runtime": 19.5934, + "eval_samples_per_second": 3.062, + "eval_steps_per_second": 0.766, + "num_input_tokens_seen": 61425936, + "step": 9200 + }, + { + "epoch": 1.1277755658325932, + "grad_norm": 1.0017314278991765, + "learning_rate": 4.359045750118693e-05, + "loss": 0.2455, + "num_input_tokens_seen": 61458944, + "step": 9205 + }, + { + "epoch": 1.1283881044990964, + "grad_norm": 1.0170891897042273, + "learning_rate": 4.354023069240494e-05, + "loss": 0.2201, + "num_input_tokens_seen": 61492624, + "step": 9210 + }, + { + "epoch": 1.1290006431656, + "grad_norm": 1.2872216051806298, + "learning_rate": 4.349001051190762e-05, + "loss": 0.2575, + "num_input_tokens_seen": 61526360, + "step": 9215 + }, + { + "epoch": 1.1296131818321031, + "grad_norm": 1.444588695555388, + "learning_rate": 4.343979701122523e-05, + "loss": 0.2324, + "num_input_tokens_seen": 61559968, + "step": 9220 + }, + { + "epoch": 1.1302257204986064, + "grad_norm": 1.1788073881461427, + "learning_rate": 4.338959024188119e-05, + "loss": 0.2494, + "num_input_tokens_seen": 61593088, + "step": 9225 + }, + { + "epoch": 1.1308382591651098, + "grad_norm": 1.1861895045364774, + "learning_rate": 4.3339390255392014e-05, + "loss": 0.202, + "num_input_tokens_seen": 61626536, + "step": 9230 + }, + { + "epoch": 1.131450797831613, + "grad_norm": 1.1162375554809394, + "learning_rate": 4.328919710326723e-05, + "loss": 0.2349, + "num_input_tokens_seen": 61659896, + "step": 9235 + }, + { + "epoch": 1.1320633364981165, + "grad_norm": 1.3493286901870916, + "learning_rate": 4.323901083700941e-05, + "loss": 0.2678, + "num_input_tokens_seen": 61693536, + "step": 9240 + }, + { + "epoch": 1.1326758751646198, + "grad_norm": 2.3483498645483585, + "learning_rate": 4.3188831508113984e-05, + "loss": 0.2432, + "num_input_tokens_seen": 61726448, + "step": 9245 + }, + { + "epoch": 1.133288413831123, + "grad_norm": 1.2116962597011374, + "learning_rate": 4.3138659168069337e-05, + "loss": 0.2601, + "num_input_tokens_seen": 61760040, + "step": 9250 + }, + { + "epoch": 1.133288413831123, + "eval_loss": 0.19667980074882507, + "eval_runtime": 19.0391, + "eval_samples_per_second": 3.151, + "eval_steps_per_second": 0.788, + "num_input_tokens_seen": 61760040, + "step": 9250 + }, + { + "epoch": 1.1339009524976265, + "grad_norm": 1.389428542417325, + "learning_rate": 4.308849386835663e-05, + "loss": 0.2285, + "num_input_tokens_seen": 61793816, + "step": 9255 + }, + { + "epoch": 1.1345134911641297, + "grad_norm": 1.336169031154297, + "learning_rate": 4.3038335660449824e-05, + "loss": 0.2498, + "num_input_tokens_seen": 61826856, + "step": 9260 + }, + { + "epoch": 1.1351260298306332, + "grad_norm": 1.4638898553009012, + "learning_rate": 4.298818459581561e-05, + "loss": 0.2659, + "num_input_tokens_seen": 61859600, + "step": 9265 + }, + { + "epoch": 1.1357385684971364, + "grad_norm": 1.2479277102181283, + "learning_rate": 4.293804072591332e-05, + "loss": 0.2753, + "num_input_tokens_seen": 61892344, + "step": 9270 + }, + { + "epoch": 1.1363511071636396, + "grad_norm": 4.072567241537122, + "learning_rate": 4.2887904102194936e-05, + "loss": 0.2441, + "num_input_tokens_seen": 61926304, + "step": 9275 + }, + { + "epoch": 1.136963645830143, + "grad_norm": 1.8551563543993175, + "learning_rate": 4.2837774776105004e-05, + "loss": 0.2048, + "num_input_tokens_seen": 61959928, + "step": 9280 + }, + { + "epoch": 1.1375761844966463, + "grad_norm": 1.14966116993182, + "learning_rate": 4.2787652799080534e-05, + "loss": 0.2422, + "num_input_tokens_seen": 61994152, + "step": 9285 + }, + { + "epoch": 1.1381887231631498, + "grad_norm": 1.2016552414651813, + "learning_rate": 4.273753822255104e-05, + "loss": 0.2168, + "num_input_tokens_seen": 62028112, + "step": 9290 + }, + { + "epoch": 1.138801261829653, + "grad_norm": 0.9533372798699556, + "learning_rate": 4.2687431097938454e-05, + "loss": 0.2453, + "num_input_tokens_seen": 62061552, + "step": 9295 + }, + { + "epoch": 1.1394138004961563, + "grad_norm": 0.9425881506630056, + "learning_rate": 4.263733147665701e-05, + "loss": 0.244, + "num_input_tokens_seen": 62094712, + "step": 9300 + }, + { + "epoch": 1.1394138004961563, + "eval_loss": 0.20997019112110138, + "eval_runtime": 19.298, + "eval_samples_per_second": 3.109, + "eval_steps_per_second": 0.777, + "num_input_tokens_seen": 62094712, + "step": 9300 + }, + { + "epoch": 1.1400263391626597, + "grad_norm": 1.1543417769684954, + "learning_rate": 4.25872394101133e-05, + "loss": 0.2508, + "num_input_tokens_seen": 62128328, + "step": 9305 + }, + { + "epoch": 1.140638877829163, + "grad_norm": 1.4932208787618224, + "learning_rate": 4.2537154949706125e-05, + "loss": 0.2413, + "num_input_tokens_seen": 62162296, + "step": 9310 + }, + { + "epoch": 1.1412514164956662, + "grad_norm": 2.6850024996745945, + "learning_rate": 4.248707814682649e-05, + "loss": 0.2265, + "num_input_tokens_seen": 62195464, + "step": 9315 + }, + { + "epoch": 1.1418639551621697, + "grad_norm": 0.841879184262439, + "learning_rate": 4.243700905285756e-05, + "loss": 0.2592, + "num_input_tokens_seen": 62229200, + "step": 9320 + }, + { + "epoch": 1.142476493828673, + "grad_norm": 1.1918569314594833, + "learning_rate": 4.238694771917455e-05, + "loss": 0.2213, + "num_input_tokens_seen": 62262880, + "step": 9325 + }, + { + "epoch": 1.1430890324951763, + "grad_norm": 0.9520243516521819, + "learning_rate": 4.233689419714477e-05, + "loss": 0.2364, + "num_input_tokens_seen": 62296248, + "step": 9330 + }, + { + "epoch": 1.1437015711616796, + "grad_norm": 1.4640565726273544, + "learning_rate": 4.2286848538127466e-05, + "loss": 0.246, + "num_input_tokens_seen": 62329936, + "step": 9335 + }, + { + "epoch": 1.1443141098281828, + "grad_norm": 1.3770191770849247, + "learning_rate": 4.2236810793473844e-05, + "loss": 0.2451, + "num_input_tokens_seen": 62363832, + "step": 9340 + }, + { + "epoch": 1.1449266484946863, + "grad_norm": 1.0012062256819856, + "learning_rate": 4.218678101452696e-05, + "loss": 0.2524, + "num_input_tokens_seen": 62396696, + "step": 9345 + }, + { + "epoch": 1.1455391871611895, + "grad_norm": 0.8842302083344398, + "learning_rate": 4.213675925262173e-05, + "loss": 0.2379, + "num_input_tokens_seen": 62430368, + "step": 9350 + }, + { + "epoch": 1.1455391871611895, + "eval_loss": 0.18152250349521637, + "eval_runtime": 19.0734, + "eval_samples_per_second": 3.146, + "eval_steps_per_second": 0.786, + "num_input_tokens_seen": 62430368, + "step": 9350 + }, + { + "epoch": 1.1461517258276928, + "grad_norm": 1.0496369092537587, + "learning_rate": 4.208674555908484e-05, + "loss": 0.2233, + "num_input_tokens_seen": 62463696, + "step": 9355 + }, + { + "epoch": 1.1467642644941962, + "grad_norm": 1.3334902583515178, + "learning_rate": 4.203673998523467e-05, + "loss": 0.231, + "num_input_tokens_seen": 62497344, + "step": 9360 + }, + { + "epoch": 1.1473768031606995, + "grad_norm": 1.0459342974424035, + "learning_rate": 4.1986742582381274e-05, + "loss": 0.2353, + "num_input_tokens_seen": 62530120, + "step": 9365 + }, + { + "epoch": 1.147989341827203, + "grad_norm": 1.6281225447305558, + "learning_rate": 4.193675340182633e-05, + "loss": 0.2508, + "num_input_tokens_seen": 62563856, + "step": 9370 + }, + { + "epoch": 1.1486018804937062, + "grad_norm": 1.2354683445694614, + "learning_rate": 4.188677249486311e-05, + "loss": 0.2152, + "num_input_tokens_seen": 62597400, + "step": 9375 + }, + { + "epoch": 1.1492144191602094, + "grad_norm": 1.4502334890189763, + "learning_rate": 4.183679991277635e-05, + "loss": 0.2277, + "num_input_tokens_seen": 62630920, + "step": 9380 + }, + { + "epoch": 1.1498269578267128, + "grad_norm": 1.2718272094895222, + "learning_rate": 4.178683570684226e-05, + "loss": 0.2302, + "num_input_tokens_seen": 62664904, + "step": 9385 + }, + { + "epoch": 1.150439496493216, + "grad_norm": 1.1501181658268176, + "learning_rate": 4.1736879928328465e-05, + "loss": 0.2507, + "num_input_tokens_seen": 62697600, + "step": 9390 + }, + { + "epoch": 1.1510520351597195, + "grad_norm": 1.202220783813427, + "learning_rate": 4.1686932628493924e-05, + "loss": 0.2246, + "num_input_tokens_seen": 62730904, + "step": 9395 + }, + { + "epoch": 1.1516645738262228, + "grad_norm": 1.2167438845613054, + "learning_rate": 4.1636993858588925e-05, + "loss": 0.2501, + "num_input_tokens_seen": 62764152, + "step": 9400 + }, + { + "epoch": 1.1516645738262228, + "eval_loss": 0.16211895644664764, + "eval_runtime": 19.1064, + "eval_samples_per_second": 3.14, + "eval_steps_per_second": 0.785, + "num_input_tokens_seen": 62764152, + "step": 9400 + }, + { + "epoch": 1.152277112492726, + "grad_norm": 0.8023265604005112, + "learning_rate": 4.158706366985496e-05, + "loss": 0.2507, + "num_input_tokens_seen": 62797240, + "step": 9405 + }, + { + "epoch": 1.1528896511592295, + "grad_norm": 15.131859056933918, + "learning_rate": 4.153714211352477e-05, + "loss": 0.253, + "num_input_tokens_seen": 62830784, + "step": 9410 + }, + { + "epoch": 1.1535021898257327, + "grad_norm": 0.8892625342022871, + "learning_rate": 4.1487229240822184e-05, + "loss": 0.2106, + "num_input_tokens_seen": 62864352, + "step": 9415 + }, + { + "epoch": 1.1541147284922362, + "grad_norm": 0.9303920555735167, + "learning_rate": 4.1437325102962166e-05, + "loss": 0.2186, + "num_input_tokens_seen": 62898080, + "step": 9420 + }, + { + "epoch": 1.1547272671587394, + "grad_norm": 1.1199946996353842, + "learning_rate": 4.13874297511507e-05, + "loss": 0.2311, + "num_input_tokens_seen": 62931672, + "step": 9425 + }, + { + "epoch": 1.1553398058252426, + "grad_norm": 1.3148444100110084, + "learning_rate": 4.1337543236584725e-05, + "loss": 0.2374, + "num_input_tokens_seen": 62965192, + "step": 9430 + }, + { + "epoch": 1.155952344491746, + "grad_norm": 1.0881847464008114, + "learning_rate": 4.128766561045218e-05, + "loss": 0.2119, + "num_input_tokens_seen": 62999184, + "step": 9435 + }, + { + "epoch": 1.1565648831582493, + "grad_norm": 1.0160094976599487, + "learning_rate": 4.123779692393182e-05, + "loss": 0.212, + "num_input_tokens_seen": 63033216, + "step": 9440 + }, + { + "epoch": 1.1571774218247528, + "grad_norm": 0.9197506131055149, + "learning_rate": 4.118793722819325e-05, + "loss": 0.2037, + "num_input_tokens_seen": 63067312, + "step": 9445 + }, + { + "epoch": 1.157789960491256, + "grad_norm": 1.0165454174427835, + "learning_rate": 4.1138086574396853e-05, + "loss": 0.2499, + "num_input_tokens_seen": 63100624, + "step": 9450 + }, + { + "epoch": 1.157789960491256, + "eval_loss": 0.26281774044036865, + "eval_runtime": 19.4806, + "eval_samples_per_second": 3.08, + "eval_steps_per_second": 0.77, + "num_input_tokens_seen": 63100624, + "step": 9450 + }, + { + "epoch": 1.1584024991577593, + "grad_norm": 0.8888516141390272, + "learning_rate": 4.108824501369373e-05, + "loss": 0.3113, + "num_input_tokens_seen": 63133768, + "step": 9455 + }, + { + "epoch": 1.1590150378242627, + "grad_norm": 1.1225782833833533, + "learning_rate": 4.103841259722565e-05, + "loss": 0.237, + "num_input_tokens_seen": 63167024, + "step": 9460 + }, + { + "epoch": 1.159627576490766, + "grad_norm": 0.6009319504293407, + "learning_rate": 4.098858937612503e-05, + "loss": 0.2199, + "num_input_tokens_seen": 63200352, + "step": 9465 + }, + { + "epoch": 1.1602401151572692, + "grad_norm": 1.2524356407097776, + "learning_rate": 4.0938775401514786e-05, + "loss": 0.2444, + "num_input_tokens_seen": 63234024, + "step": 9470 + }, + { + "epoch": 1.1608526538237727, + "grad_norm": 1.2631607750741565, + "learning_rate": 4.088897072450841e-05, + "loss": 0.2142, + "num_input_tokens_seen": 63267560, + "step": 9475 + }, + { + "epoch": 1.161465192490276, + "grad_norm": 1.1860211721289, + "learning_rate": 4.083917539620983e-05, + "loss": 0.2666, + "num_input_tokens_seen": 63300920, + "step": 9480 + }, + { + "epoch": 1.1620777311567791, + "grad_norm": 0.8223434783929465, + "learning_rate": 4.0789389467713355e-05, + "loss": 0.2489, + "num_input_tokens_seen": 63334752, + "step": 9485 + }, + { + "epoch": 1.1626902698232826, + "grad_norm": 1.168742931775643, + "learning_rate": 4.0739612990103695e-05, + "loss": 0.3074, + "num_input_tokens_seen": 63368400, + "step": 9490 + }, + { + "epoch": 1.1633028084897858, + "grad_norm": 1.2966182888652038, + "learning_rate": 4.068984601445584e-05, + "loss": 0.2356, + "num_input_tokens_seen": 63401416, + "step": 9495 + }, + { + "epoch": 1.1639153471562893, + "grad_norm": 1.3637147288832816, + "learning_rate": 4.064008859183504e-05, + "loss": 0.2128, + "num_input_tokens_seen": 63435392, + "step": 9500 + }, + { + "epoch": 1.1639153471562893, + "eval_loss": 0.18865041434764862, + "eval_runtime": 19.2977, + "eval_samples_per_second": 3.109, + "eval_steps_per_second": 0.777, + "num_input_tokens_seen": 63435392, + "step": 9500 + }, + { + "epoch": 1.1645278858227925, + "grad_norm": 1.2661686058759454, + "learning_rate": 4.059034077329671e-05, + "loss": 0.251, + "num_input_tokens_seen": 63468256, + "step": 9505 + }, + { + "epoch": 1.1651404244892958, + "grad_norm": 0.8663219101154317, + "learning_rate": 4.0540602609886444e-05, + "loss": 0.2268, + "num_input_tokens_seen": 63502664, + "step": 9510 + }, + { + "epoch": 1.1657529631557992, + "grad_norm": 0.9332216667050439, + "learning_rate": 4.049087415263994e-05, + "loss": 0.2266, + "num_input_tokens_seen": 63536472, + "step": 9515 + }, + { + "epoch": 1.1663655018223025, + "grad_norm": 0.9212176283835489, + "learning_rate": 4.0441155452582916e-05, + "loss": 0.2444, + "num_input_tokens_seen": 63569768, + "step": 9520 + }, + { + "epoch": 1.166978040488806, + "grad_norm": 1.2927850989309784, + "learning_rate": 4.039144656073105e-05, + "loss": 0.2204, + "num_input_tokens_seen": 63603296, + "step": 9525 + }, + { + "epoch": 1.1675905791553092, + "grad_norm": 0.8471472327425378, + "learning_rate": 4.034174752808998e-05, + "loss": 0.1856, + "num_input_tokens_seen": 63637232, + "step": 9530 + }, + { + "epoch": 1.1682031178218124, + "grad_norm": 1.3403704852262786, + "learning_rate": 4.0292058405655264e-05, + "loss": 0.2501, + "num_input_tokens_seen": 63670416, + "step": 9535 + }, + { + "epoch": 1.1688156564883159, + "grad_norm": 1.3185889201311154, + "learning_rate": 4.024237924441224e-05, + "loss": 0.2348, + "num_input_tokens_seen": 63704184, + "step": 9540 + }, + { + "epoch": 1.169428195154819, + "grad_norm": 0.9612182593599476, + "learning_rate": 4.019271009533606e-05, + "loss": 0.2098, + "num_input_tokens_seen": 63737640, + "step": 9545 + }, + { + "epoch": 1.1700407338213226, + "grad_norm": 1.4551240750022474, + "learning_rate": 4.014305100939157e-05, + "loss": 0.2689, + "num_input_tokens_seen": 63771112, + "step": 9550 + }, + { + "epoch": 1.1700407338213226, + "eval_loss": 0.09276539832353592, + "eval_runtime": 19.5641, + "eval_samples_per_second": 3.067, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 63771112, + "step": 9550 + }, + { + "epoch": 1.1706532724878258, + "grad_norm": 1.2900592612406696, + "learning_rate": 4.009340203753332e-05, + "loss": 0.2242, + "num_input_tokens_seen": 63805152, + "step": 9555 + }, + { + "epoch": 1.171265811154329, + "grad_norm": 1.0633736541347167, + "learning_rate": 4.004376323070547e-05, + "loss": 0.2551, + "num_input_tokens_seen": 63838496, + "step": 9560 + }, + { + "epoch": 1.1718783498208325, + "grad_norm": 1.072867622281364, + "learning_rate": 3.999413463984174e-05, + "loss": 0.2105, + "num_input_tokens_seen": 63872464, + "step": 9565 + }, + { + "epoch": 1.1724908884873357, + "grad_norm": 0.9630305640602266, + "learning_rate": 3.994451631586539e-05, + "loss": 0.2316, + "num_input_tokens_seen": 63906352, + "step": 9570 + }, + { + "epoch": 1.1731034271538392, + "grad_norm": 0.8637567462984742, + "learning_rate": 3.989490830968911e-05, + "loss": 0.1975, + "num_input_tokens_seen": 63940936, + "step": 9575 + }, + { + "epoch": 1.1737159658203424, + "grad_norm": 1.185127488328355, + "learning_rate": 3.9845310672215054e-05, + "loss": 0.2207, + "num_input_tokens_seen": 63974536, + "step": 9580 + }, + { + "epoch": 1.1743285044868457, + "grad_norm": 1.3139126462967579, + "learning_rate": 3.979572345433469e-05, + "loss": 0.2777, + "num_input_tokens_seen": 64007880, + "step": 9585 + }, + { + "epoch": 1.1749410431533491, + "grad_norm": 1.0597917746931924, + "learning_rate": 3.974614670692881e-05, + "loss": 0.2191, + "num_input_tokens_seen": 64041728, + "step": 9590 + }, + { + "epoch": 1.1755535818198524, + "grad_norm": 1.187150219519634, + "learning_rate": 3.9696580480867476e-05, + "loss": 0.2531, + "num_input_tokens_seen": 64074776, + "step": 9595 + }, + { + "epoch": 1.1761661204863558, + "grad_norm": 1.4861667501904818, + "learning_rate": 3.964702482700993e-05, + "loss": 0.2556, + "num_input_tokens_seen": 64107872, + "step": 9600 + }, + { + "epoch": 1.1761661204863558, + "eval_loss": 0.11478094756603241, + "eval_runtime": 19.2438, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 0.779, + "num_input_tokens_seen": 64107872, + "step": 9600 + }, + { + "epoch": 1.176778659152859, + "grad_norm": 1.2863408396767344, + "learning_rate": 3.9597479796204625e-05, + "loss": 0.2505, + "num_input_tokens_seen": 64141168, + "step": 9605 + }, + { + "epoch": 1.1773911978193623, + "grad_norm": 1.075878787422488, + "learning_rate": 3.954794543928899e-05, + "loss": 0.2437, + "num_input_tokens_seen": 64174704, + "step": 9610 + }, + { + "epoch": 1.1780037364858658, + "grad_norm": 1.279246697076882, + "learning_rate": 3.9498421807089644e-05, + "loss": 0.2566, + "num_input_tokens_seen": 64208160, + "step": 9615 + }, + { + "epoch": 1.178616275152369, + "grad_norm": 1.1571208056034121, + "learning_rate": 3.944890895042209e-05, + "loss": 0.1975, + "num_input_tokens_seen": 64242328, + "step": 9620 + }, + { + "epoch": 1.1792288138188722, + "grad_norm": 1.3311669205446603, + "learning_rate": 3.9399406920090864e-05, + "loss": 0.1948, + "num_input_tokens_seen": 64276064, + "step": 9625 + }, + { + "epoch": 1.1798413524853757, + "grad_norm": 1.2686593328995437, + "learning_rate": 3.934991576688932e-05, + "loss": 0.2698, + "num_input_tokens_seen": 64309672, + "step": 9630 + }, + { + "epoch": 1.180453891151879, + "grad_norm": 0.9265320879352785, + "learning_rate": 3.930043554159971e-05, + "loss": 0.2303, + "num_input_tokens_seen": 64342944, + "step": 9635 + }, + { + "epoch": 1.1810664298183822, + "grad_norm": 1.0631764108883002, + "learning_rate": 3.925096629499302e-05, + "loss": 0.228, + "num_input_tokens_seen": 64377048, + "step": 9640 + }, + { + "epoch": 1.1816789684848856, + "grad_norm": 1.2922991467981881, + "learning_rate": 3.9201508077829e-05, + "loss": 0.2396, + "num_input_tokens_seen": 64410448, + "step": 9645 + }, + { + "epoch": 1.1822915071513889, + "grad_norm": 1.1681543983412161, + "learning_rate": 3.91520609408561e-05, + "loss": 0.2527, + "num_input_tokens_seen": 64443680, + "step": 9650 + }, + { + "epoch": 1.1822915071513889, + "eval_loss": 0.12790514528751373, + "eval_runtime": 19.434, + "eval_samples_per_second": 3.087, + "eval_steps_per_second": 0.772, + "num_input_tokens_seen": 64443680, + "step": 9650 + }, + { + "epoch": 1.1829040458178923, + "grad_norm": 0.8401822600592633, + "learning_rate": 3.910262493481135e-05, + "loss": 0.2157, + "num_input_tokens_seen": 64476992, + "step": 9655 + }, + { + "epoch": 1.1835165844843956, + "grad_norm": 1.159716095857263, + "learning_rate": 3.905320011042041e-05, + "loss": 0.2082, + "num_input_tokens_seen": 64510904, + "step": 9660 + }, + { + "epoch": 1.1841291231508988, + "grad_norm": 0.9784086028383305, + "learning_rate": 3.9003786518397434e-05, + "loss": 0.2468, + "num_input_tokens_seen": 64544304, + "step": 9665 + }, + { + "epoch": 1.1847416618174023, + "grad_norm": 4.732514674751141, + "learning_rate": 3.895438420944506e-05, + "loss": 0.1972, + "num_input_tokens_seen": 64578968, + "step": 9670 + }, + { + "epoch": 1.1853542004839055, + "grad_norm": 0.9874171333516809, + "learning_rate": 3.890499323425436e-05, + "loss": 0.1995, + "num_input_tokens_seen": 64612840, + "step": 9675 + }, + { + "epoch": 1.185966739150409, + "grad_norm": 1.3621585004849766, + "learning_rate": 3.885561364350475e-05, + "loss": 0.2309, + "num_input_tokens_seen": 64646224, + "step": 9680 + }, + { + "epoch": 1.1865792778169122, + "grad_norm": 0.5848807969259513, + "learning_rate": 3.8806245487864026e-05, + "loss": 0.2277, + "num_input_tokens_seen": 64680384, + "step": 9685 + }, + { + "epoch": 1.1871918164834154, + "grad_norm": 1.2515666082848003, + "learning_rate": 3.875688881798814e-05, + "loss": 0.2545, + "num_input_tokens_seen": 64714000, + "step": 9690 + }, + { + "epoch": 1.187804355149919, + "grad_norm": 1.2347003294011665, + "learning_rate": 3.870754368452137e-05, + "loss": 0.2513, + "num_input_tokens_seen": 64747416, + "step": 9695 + }, + { + "epoch": 1.1884168938164221, + "grad_norm": 46.19697115089226, + "learning_rate": 3.865821013809609e-05, + "loss": 0.2621, + "num_input_tokens_seen": 64781416, + "step": 9700 + }, + { + "epoch": 1.1884168938164221, + "eval_loss": 0.13076919317245483, + "eval_runtime": 19.8191, + "eval_samples_per_second": 3.027, + "eval_steps_per_second": 0.757, + "num_input_tokens_seen": 64781416, + "step": 9700 + }, + { + "epoch": 1.1890294324829256, + "grad_norm": 1.0708335737541204, + "learning_rate": 3.8608888229332804e-05, + "loss": 0.1975, + "num_input_tokens_seen": 64815384, + "step": 9705 + }, + { + "epoch": 1.1896419711494288, + "grad_norm": 1.1651831894820255, + "learning_rate": 3.855957800884009e-05, + "loss": 0.2391, + "num_input_tokens_seen": 64849224, + "step": 9710 + }, + { + "epoch": 1.190254509815932, + "grad_norm": 1.1735504622319348, + "learning_rate": 3.851027952721453e-05, + "loss": 0.2046, + "num_input_tokens_seen": 64883384, + "step": 9715 + }, + { + "epoch": 1.1908670484824355, + "grad_norm": 0.8442590192791835, + "learning_rate": 3.846099283504063e-05, + "loss": 0.1976, + "num_input_tokens_seen": 64916936, + "step": 9720 + }, + { + "epoch": 1.1914795871489388, + "grad_norm": 3.6127485956011465, + "learning_rate": 3.841171798289081e-05, + "loss": 0.208, + "num_input_tokens_seen": 64950880, + "step": 9725 + }, + { + "epoch": 1.1920921258154422, + "grad_norm": 1.097406454051025, + "learning_rate": 3.836245502132538e-05, + "loss": 0.2598, + "num_input_tokens_seen": 64984160, + "step": 9730 + }, + { + "epoch": 1.1927046644819455, + "grad_norm": 1.2121900583784575, + "learning_rate": 3.831320400089239e-05, + "loss": 0.217, + "num_input_tokens_seen": 65017744, + "step": 9735 + }, + { + "epoch": 1.1933172031484487, + "grad_norm": 1.1492501405875808, + "learning_rate": 3.826396497212769e-05, + "loss": 0.2476, + "num_input_tokens_seen": 65050920, + "step": 9740 + }, + { + "epoch": 1.1939297418149522, + "grad_norm": 1.497265888081188, + "learning_rate": 3.82147379855548e-05, + "loss": 0.2501, + "num_input_tokens_seen": 65084512, + "step": 9745 + }, + { + "epoch": 1.1945422804814554, + "grad_norm": 1.0981259674367374, + "learning_rate": 3.816552309168486e-05, + "loss": 0.2257, + "num_input_tokens_seen": 65118432, + "step": 9750 + }, + { + "epoch": 1.1945422804814554, + "eval_loss": 0.12043237686157227, + "eval_runtime": 19.3738, + "eval_samples_per_second": 3.097, + "eval_steps_per_second": 0.774, + "num_input_tokens_seen": 65118432, + "step": 9750 + }, + { + "epoch": 1.1951548191479586, + "grad_norm": 1.216752093127151, + "learning_rate": 3.8116320341016665e-05, + "loss": 0.2107, + "num_input_tokens_seen": 65152912, + "step": 9755 + }, + { + "epoch": 1.195767357814462, + "grad_norm": 1.313060365265826, + "learning_rate": 3.8067129784036484e-05, + "loss": 0.2184, + "num_input_tokens_seen": 65186848, + "step": 9760 + }, + { + "epoch": 1.1963798964809653, + "grad_norm": 1.236526413209765, + "learning_rate": 3.801795147121815e-05, + "loss": 0.2131, + "num_input_tokens_seen": 65220376, + "step": 9765 + }, + { + "epoch": 1.1969924351474688, + "grad_norm": 1.3403798610271145, + "learning_rate": 3.796878545302281e-05, + "loss": 0.2544, + "num_input_tokens_seen": 65253824, + "step": 9770 + }, + { + "epoch": 1.197604973813972, + "grad_norm": 1.5079634741702237, + "learning_rate": 3.791963177989912e-05, + "loss": 0.2142, + "num_input_tokens_seen": 65288104, + "step": 9775 + }, + { + "epoch": 1.1982175124804753, + "grad_norm": 0.8714356433770355, + "learning_rate": 3.7870490502283005e-05, + "loss": 0.1976, + "num_input_tokens_seen": 65321912, + "step": 9780 + }, + { + "epoch": 1.1988300511469787, + "grad_norm": 1.1178307410326833, + "learning_rate": 3.782136167059771e-05, + "loss": 0.2692, + "num_input_tokens_seen": 65355040, + "step": 9785 + }, + { + "epoch": 1.199442589813482, + "grad_norm": 1.4169463450713322, + "learning_rate": 3.777224533525365e-05, + "loss": 0.2428, + "num_input_tokens_seen": 65388392, + "step": 9790 + }, + { + "epoch": 1.2000551284799852, + "grad_norm": 1.626289658444757, + "learning_rate": 3.7723141546648484e-05, + "loss": 0.1856, + "num_input_tokens_seen": 65422864, + "step": 9795 + }, + { + "epoch": 1.2006676671464886, + "grad_norm": 0.9881180594492855, + "learning_rate": 3.767405035516695e-05, + "loss": 0.1952, + "num_input_tokens_seen": 65456984, + "step": 9800 + }, + { + "epoch": 1.2006676671464886, + "eval_loss": 0.15540121495723724, + "eval_runtime": 19.3704, + "eval_samples_per_second": 3.098, + "eval_steps_per_second": 0.774, + "num_input_tokens_seen": 65456984, + "step": 9800 + }, + { + "epoch": 1.2012802058129919, + "grad_norm": 0.8278162945070349, + "learning_rate": 3.7624971811180876e-05, + "loss": 0.1921, + "num_input_tokens_seen": 65491160, + "step": 9805 + }, + { + "epoch": 1.2018927444794953, + "grad_norm": 1.0429689841712164, + "learning_rate": 3.757590596504913e-05, + "loss": 0.2193, + "num_input_tokens_seen": 65525176, + "step": 9810 + }, + { + "epoch": 1.2025052831459986, + "grad_norm": 1.1236363469928343, + "learning_rate": 3.75268528671175e-05, + "loss": 0.2093, + "num_input_tokens_seen": 65558616, + "step": 9815 + }, + { + "epoch": 1.2031178218125018, + "grad_norm": 1.5838338156038412, + "learning_rate": 3.747781256771876e-05, + "loss": 0.25, + "num_input_tokens_seen": 65592224, + "step": 9820 + }, + { + "epoch": 1.2037303604790053, + "grad_norm": 0.9567571343854455, + "learning_rate": 3.7428785117172516e-05, + "loss": 0.2461, + "num_input_tokens_seen": 65625296, + "step": 9825 + }, + { + "epoch": 1.2043428991455085, + "grad_norm": 1.3081679722279336, + "learning_rate": 3.7379770565785166e-05, + "loss": 0.2281, + "num_input_tokens_seen": 65658896, + "step": 9830 + }, + { + "epoch": 1.204955437812012, + "grad_norm": 1.4709789652143452, + "learning_rate": 3.733076896384993e-05, + "loss": 0.2641, + "num_input_tokens_seen": 65692240, + "step": 9835 + }, + { + "epoch": 1.2055679764785152, + "grad_norm": 1.320370909141904, + "learning_rate": 3.7281780361646685e-05, + "loss": 0.2389, + "num_input_tokens_seen": 65725568, + "step": 9840 + }, + { + "epoch": 1.2061805151450185, + "grad_norm": 0.9653612397970687, + "learning_rate": 3.723280480944202e-05, + "loss": 0.1951, + "num_input_tokens_seen": 65759352, + "step": 9845 + }, + { + "epoch": 1.206793053811522, + "grad_norm": 0.95721723031885, + "learning_rate": 3.718384235748909e-05, + "loss": 0.2196, + "num_input_tokens_seen": 65793256, + "step": 9850 + }, + { + "epoch": 1.206793053811522, + "eval_loss": 0.08569300174713135, + "eval_runtime": 19.2071, + "eval_samples_per_second": 3.124, + "eval_steps_per_second": 0.781, + "num_input_tokens_seen": 65793256, + "step": 9850 + }, + { + "epoch": 1.2074055924780251, + "grad_norm": 1.4311912452368811, + "learning_rate": 3.713489305602764e-05, + "loss": 0.241, + "num_input_tokens_seen": 65826720, + "step": 9855 + }, + { + "epoch": 1.2080181311445286, + "grad_norm": 1.2101749334622063, + "learning_rate": 3.7085956955283876e-05, + "loss": 0.2296, + "num_input_tokens_seen": 65860616, + "step": 9860 + }, + { + "epoch": 1.2086306698110318, + "grad_norm": 1.298502327185039, + "learning_rate": 3.703703410547052e-05, + "loss": 0.2382, + "num_input_tokens_seen": 65894088, + "step": 9865 + }, + { + "epoch": 1.209243208477535, + "grad_norm": 1.087225038963991, + "learning_rate": 3.6988124556786643e-05, + "loss": 0.2488, + "num_input_tokens_seen": 65927088, + "step": 9870 + }, + { + "epoch": 1.2098557471440385, + "grad_norm": 1.31289035420501, + "learning_rate": 3.693922835941772e-05, + "loss": 0.2449, + "num_input_tokens_seen": 65960688, + "step": 9875 + }, + { + "epoch": 1.2104682858105418, + "grad_norm": 0.9116870944299362, + "learning_rate": 3.689034556353549e-05, + "loss": 0.2294, + "num_input_tokens_seen": 65994224, + "step": 9880 + }, + { + "epoch": 1.2110808244770452, + "grad_norm": 1.0503265318856103, + "learning_rate": 3.6841476219297925e-05, + "loss": 0.205, + "num_input_tokens_seen": 66028432, + "step": 9885 + }, + { + "epoch": 1.2116933631435485, + "grad_norm": 0.9624064963041362, + "learning_rate": 3.679262037684926e-05, + "loss": 0.2232, + "num_input_tokens_seen": 66061560, + "step": 9890 + }, + { + "epoch": 1.2123059018100517, + "grad_norm": 1.3781388944131474, + "learning_rate": 3.674377808631979e-05, + "loss": 0.2588, + "num_input_tokens_seen": 66094600, + "step": 9895 + }, + { + "epoch": 1.2129184404765552, + "grad_norm": 0.9826823070435134, + "learning_rate": 3.669494939782599e-05, + "loss": 0.2593, + "num_input_tokens_seen": 66127864, + "step": 9900 + }, + { + "epoch": 1.2129184404765552, + "eval_loss": 0.12373431026935577, + "eval_runtime": 19.4779, + "eval_samples_per_second": 3.08, + "eval_steps_per_second": 0.77, + "num_input_tokens_seen": 66127864, + "step": 9900 + }, + { + "epoch": 1.2135309791430584, + "grad_norm": 0.8495897749424548, + "learning_rate": 3.664613436147032e-05, + "loss": 0.2379, + "num_input_tokens_seen": 66160792, + "step": 9905 + }, + { + "epoch": 1.2141435178095616, + "grad_norm": 1.4129826211211765, + "learning_rate": 3.659733302734124e-05, + "loss": 0.2287, + "num_input_tokens_seen": 66194432, + "step": 9910 + }, + { + "epoch": 1.214756056476065, + "grad_norm": 2.124749098626852, + "learning_rate": 3.654854544551318e-05, + "loss": 0.2561, + "num_input_tokens_seen": 66226944, + "step": 9915 + }, + { + "epoch": 1.2153685951425683, + "grad_norm": 1.256683442522409, + "learning_rate": 3.649977166604642e-05, + "loss": 0.2355, + "num_input_tokens_seen": 66260416, + "step": 9920 + }, + { + "epoch": 1.2159811338090716, + "grad_norm": 0.8804204926535866, + "learning_rate": 3.645101173898712e-05, + "loss": 0.252, + "num_input_tokens_seen": 66294488, + "step": 9925 + }, + { + "epoch": 1.216593672475575, + "grad_norm": 0.9184258473031355, + "learning_rate": 3.640226571436718e-05, + "loss": 0.2395, + "num_input_tokens_seen": 66328552, + "step": 9930 + }, + { + "epoch": 1.2172062111420783, + "grad_norm": 1.2789738563740194, + "learning_rate": 3.635353364220426e-05, + "loss": 0.2348, + "num_input_tokens_seen": 66362368, + "step": 9935 + }, + { + "epoch": 1.2178187498085817, + "grad_norm": 1.2042122568344942, + "learning_rate": 3.630481557250171e-05, + "loss": 0.2409, + "num_input_tokens_seen": 66395760, + "step": 9940 + }, + { + "epoch": 1.218431288475085, + "grad_norm": 1.6075189504792033, + "learning_rate": 3.625611155524849e-05, + "loss": 0.226, + "num_input_tokens_seen": 66429176, + "step": 9945 + }, + { + "epoch": 1.2190438271415882, + "grad_norm": 1.59717152489572, + "learning_rate": 3.620742164041916e-05, + "loss": 0.2589, + "num_input_tokens_seen": 66462256, + "step": 9950 + }, + { + "epoch": 1.2190438271415882, + "eval_loss": 0.16011233627796173, + "eval_runtime": 20.4036, + "eval_samples_per_second": 2.941, + "eval_steps_per_second": 0.735, + "num_input_tokens_seen": 66462256, + "step": 9950 + }, + { + "epoch": 1.2196563658080917, + "grad_norm": 1.4544809627964557, + "learning_rate": 3.615874587797381e-05, + "loss": 0.2162, + "num_input_tokens_seen": 66495928, + "step": 9955 + }, + { + "epoch": 1.220268904474595, + "grad_norm": 1.314166192503101, + "learning_rate": 3.6110084317858005e-05, + "loss": 0.2371, + "num_input_tokens_seen": 66529384, + "step": 9960 + }, + { + "epoch": 1.2208814431410984, + "grad_norm": 1.8272913355200002, + "learning_rate": 3.606143701000271e-05, + "loss": 0.2425, + "num_input_tokens_seen": 66563312, + "step": 9965 + }, + { + "epoch": 1.2214939818076016, + "grad_norm": 1.1190635076196744, + "learning_rate": 3.6012804004324326e-05, + "loss": 0.228, + "num_input_tokens_seen": 66596840, + "step": 9970 + }, + { + "epoch": 1.2221065204741048, + "grad_norm": 1.417082438708923, + "learning_rate": 3.596418535072451e-05, + "loss": 0.2407, + "num_input_tokens_seen": 66630168, + "step": 9975 + }, + { + "epoch": 1.2227190591406083, + "grad_norm": 1.5615601472935723, + "learning_rate": 3.591558109909025e-05, + "loss": 0.2049, + "num_input_tokens_seen": 66663848, + "step": 9980 + }, + { + "epoch": 1.2233315978071115, + "grad_norm": 1.061850813247257, + "learning_rate": 3.586699129929373e-05, + "loss": 0.1661, + "num_input_tokens_seen": 66698912, + "step": 9985 + }, + { + "epoch": 1.223944136473615, + "grad_norm": 1.0446245994987036, + "learning_rate": 3.5818416001192276e-05, + "loss": 0.2329, + "num_input_tokens_seen": 66732112, + "step": 9990 + }, + { + "epoch": 1.2245566751401182, + "grad_norm": 1.0453311293261525, + "learning_rate": 3.576985525462841e-05, + "loss": 0.2364, + "num_input_tokens_seen": 66765832, + "step": 9995 + }, + { + "epoch": 1.2251692138066215, + "grad_norm": 1.022044445150293, + "learning_rate": 3.572130910942964e-05, + "loss": 0.2736, + "num_input_tokens_seen": 66798760, + "step": 10000 + }, + { + "epoch": 1.2251692138066215, + "eval_loss": 0.15618817508220673, + "eval_runtime": 20.4016, + "eval_samples_per_second": 2.941, + "eval_steps_per_second": 0.735, + "num_input_tokens_seen": 66798760, + "step": 10000 + }, + { + "epoch": 1.225781752473125, + "grad_norm": 1.2319192900049627, + "learning_rate": 3.567277761540856e-05, + "loss": 0.2139, + "num_input_tokens_seen": 66832616, + "step": 10005 + }, + { + "epoch": 1.2263942911396282, + "grad_norm": 1.1192074348789183, + "learning_rate": 3.56242608223627e-05, + "loss": 0.2183, + "num_input_tokens_seen": 66866584, + "step": 10010 + }, + { + "epoch": 1.2270068298061316, + "grad_norm": 0.9826408269969161, + "learning_rate": 3.5575758780074475e-05, + "loss": 0.2262, + "num_input_tokens_seen": 66900296, + "step": 10015 + }, + { + "epoch": 1.2276193684726349, + "grad_norm": 1.270723370422665, + "learning_rate": 3.5527271538311205e-05, + "loss": 0.2384, + "num_input_tokens_seen": 66933888, + "step": 10020 + }, + { + "epoch": 1.228231907139138, + "grad_norm": 1.5856866789234034, + "learning_rate": 3.5478799146825024e-05, + "loss": 0.257, + "num_input_tokens_seen": 66966912, + "step": 10025 + }, + { + "epoch": 1.2288444458056416, + "grad_norm": 1.0062064527623327, + "learning_rate": 3.543034165535282e-05, + "loss": 0.2248, + "num_input_tokens_seen": 67000680, + "step": 10030 + }, + { + "epoch": 1.2294569844721448, + "grad_norm": 1.056356299335675, + "learning_rate": 3.538189911361618e-05, + "loss": 0.2604, + "num_input_tokens_seen": 67033808, + "step": 10035 + }, + { + "epoch": 1.230069523138648, + "grad_norm": 1.1192489100705239, + "learning_rate": 3.5333471571321375e-05, + "loss": 0.1984, + "num_input_tokens_seen": 67067608, + "step": 10040 + }, + { + "epoch": 1.2306820618051515, + "grad_norm": 1.4785817376378587, + "learning_rate": 3.528505907815925e-05, + "loss": 0.1937, + "num_input_tokens_seen": 67101040, + "step": 10045 + }, + { + "epoch": 1.2312946004716547, + "grad_norm": 1.0996901038465798, + "learning_rate": 3.523666168380525e-05, + "loss": 0.2419, + "num_input_tokens_seen": 67134040, + "step": 10050 + }, + { + "epoch": 1.2312946004716547, + "eval_loss": 0.12978222966194153, + "eval_runtime": 19.9168, + "eval_samples_per_second": 3.013, + "eval_steps_per_second": 0.753, + "num_input_tokens_seen": 67134040, + "step": 10050 + }, + { + "epoch": 1.2319071391381582, + "grad_norm": 1.2139171380445146, + "learning_rate": 3.51882794379193e-05, + "loss": 0.2511, + "num_input_tokens_seen": 67167360, + "step": 10055 + }, + { + "epoch": 1.2325196778046614, + "grad_norm": 1.289528857461294, + "learning_rate": 3.513991239014579e-05, + "loss": 0.2128, + "num_input_tokens_seen": 67201216, + "step": 10060 + }, + { + "epoch": 1.2331322164711647, + "grad_norm": 1.4892651263348418, + "learning_rate": 3.509156059011352e-05, + "loss": 0.2481, + "num_input_tokens_seen": 67234824, + "step": 10065 + }, + { + "epoch": 1.2337447551376681, + "grad_norm": 1.354493452944373, + "learning_rate": 3.504322408743562e-05, + "loss": 0.2275, + "num_input_tokens_seen": 67268160, + "step": 10070 + }, + { + "epoch": 1.2343572938041714, + "grad_norm": 1.1377962133780994, + "learning_rate": 3.499490293170956e-05, + "loss": 0.2403, + "num_input_tokens_seen": 67301776, + "step": 10075 + }, + { + "epoch": 1.2349698324706746, + "grad_norm": 1.3213479082546846, + "learning_rate": 3.494659717251704e-05, + "loss": 0.2145, + "num_input_tokens_seen": 67336184, + "step": 10080 + }, + { + "epoch": 1.235582371137178, + "grad_norm": 1.0822056854819904, + "learning_rate": 3.489830685942397e-05, + "loss": 0.2269, + "num_input_tokens_seen": 67370240, + "step": 10085 + }, + { + "epoch": 1.2361949098036813, + "grad_norm": 1.095493880410798, + "learning_rate": 3.485003204198041e-05, + "loss": 0.2157, + "num_input_tokens_seen": 67404432, + "step": 10090 + }, + { + "epoch": 1.2368074484701848, + "grad_norm": 1.4155671248879054, + "learning_rate": 3.480177276972051e-05, + "loss": 0.2186, + "num_input_tokens_seen": 67437912, + "step": 10095 + }, + { + "epoch": 1.237419987136688, + "grad_norm": 1.1597648700568828, + "learning_rate": 3.475352909216246e-05, + "loss": 0.2574, + "num_input_tokens_seen": 67471000, + "step": 10100 + }, + { + "epoch": 1.237419987136688, + "eval_loss": 0.21491877734661102, + "eval_runtime": 19.4089, + "eval_samples_per_second": 3.091, + "eval_steps_per_second": 0.773, + "num_input_tokens_seen": 67471000, + "step": 10100 + }, + { + "epoch": 1.2380325258031912, + "grad_norm": 61.68793183843857, + "learning_rate": 3.47053010588085e-05, + "loss": 0.2594, + "num_input_tokens_seen": 67504136, + "step": 10105 + }, + { + "epoch": 1.2386450644696947, + "grad_norm": 0.9460767854553899, + "learning_rate": 3.465708871914475e-05, + "loss": 0.2491, + "num_input_tokens_seen": 67537536, + "step": 10110 + }, + { + "epoch": 1.239257603136198, + "grad_norm": 1.2005295422829294, + "learning_rate": 3.4608892122641295e-05, + "loss": 0.2029, + "num_input_tokens_seen": 67571048, + "step": 10115 + }, + { + "epoch": 1.2398701418027014, + "grad_norm": 0.9752561237968562, + "learning_rate": 3.4560711318752016e-05, + "loss": 0.2015, + "num_input_tokens_seen": 67604768, + "step": 10120 + }, + { + "epoch": 1.2404826804692046, + "grad_norm": 1.1644311130139964, + "learning_rate": 3.451254635691458e-05, + "loss": 0.2179, + "num_input_tokens_seen": 67638424, + "step": 10125 + }, + { + "epoch": 1.2410952191357079, + "grad_norm": 1.4435187736881145, + "learning_rate": 3.446439728655047e-05, + "loss": 0.2452, + "num_input_tokens_seen": 67671792, + "step": 10130 + }, + { + "epoch": 1.2417077578022113, + "grad_norm": 1.0148079567428998, + "learning_rate": 3.441626415706477e-05, + "loss": 0.2498, + "num_input_tokens_seen": 67705816, + "step": 10135 + }, + { + "epoch": 1.2423202964687146, + "grad_norm": 1.0027085224671495, + "learning_rate": 3.4368147017846295e-05, + "loss": 0.1976, + "num_input_tokens_seen": 67740120, + "step": 10140 + }, + { + "epoch": 1.242932835135218, + "grad_norm": 1.4330460305461843, + "learning_rate": 3.432004591826739e-05, + "loss": 0.2408, + "num_input_tokens_seen": 67773216, + "step": 10145 + }, + { + "epoch": 1.2435453738017213, + "grad_norm": 1.307307643275231, + "learning_rate": 3.427196090768395e-05, + "loss": 0.2124, + "num_input_tokens_seen": 67807536, + "step": 10150 + }, + { + "epoch": 1.2435453738017213, + "eval_loss": 0.1516389101743698, + "eval_runtime": 19.1372, + "eval_samples_per_second": 3.135, + "eval_steps_per_second": 0.784, + "num_input_tokens_seen": 67807536, + "step": 10150 + }, + { + "epoch": 1.2441579124682245, + "grad_norm": 1.380027226146845, + "learning_rate": 3.4223892035435414e-05, + "loss": 0.2761, + "num_input_tokens_seen": 67840480, + "step": 10155 + }, + { + "epoch": 1.244770451134728, + "grad_norm": 0.9654065219994176, + "learning_rate": 3.4175839350844576e-05, + "loss": 0.2171, + "num_input_tokens_seen": 67873680, + "step": 10160 + }, + { + "epoch": 1.2453829898012312, + "grad_norm": 0.9787005093120792, + "learning_rate": 3.41278029032177e-05, + "loss": 0.2261, + "num_input_tokens_seen": 67907360, + "step": 10165 + }, + { + "epoch": 1.2459955284677346, + "grad_norm": 1.3834394549082887, + "learning_rate": 3.407978274184435e-05, + "loss": 0.2116, + "num_input_tokens_seen": 67941136, + "step": 10170 + }, + { + "epoch": 1.2466080671342379, + "grad_norm": 0.8379514180119303, + "learning_rate": 3.4031778915997375e-05, + "loss": 0.1925, + "num_input_tokens_seen": 67975016, + "step": 10175 + }, + { + "epoch": 1.2472206058007411, + "grad_norm": 1.1872491287630014, + "learning_rate": 3.3983791474932864e-05, + "loss": 0.194, + "num_input_tokens_seen": 68009336, + "step": 10180 + }, + { + "epoch": 1.2478331444672446, + "grad_norm": 1.384942926349532, + "learning_rate": 3.393582046789012e-05, + "loss": 0.243, + "num_input_tokens_seen": 68042592, + "step": 10185 + }, + { + "epoch": 1.2484456831337478, + "grad_norm": 0.9933902622789962, + "learning_rate": 3.388786594409155e-05, + "loss": 0.2092, + "num_input_tokens_seen": 68076744, + "step": 10190 + }, + { + "epoch": 1.249058221800251, + "grad_norm": 1.303521595759064, + "learning_rate": 3.383992795274268e-05, + "loss": 0.2521, + "num_input_tokens_seen": 68110272, + "step": 10195 + }, + { + "epoch": 1.2496707604667545, + "grad_norm": 1.2843223741616177, + "learning_rate": 3.379200654303204e-05, + "loss": 0.2047, + "num_input_tokens_seen": 68143912, + "step": 10200 + }, + { + "epoch": 1.2496707604667545, + "eval_loss": 0.16254977881908417, + "eval_runtime": 19.3699, + "eval_samples_per_second": 3.098, + "eval_steps_per_second": 0.774, + "num_input_tokens_seen": 68143912, + "step": 10200 + }, + { + "epoch": 1.2502832991332578, + "grad_norm": 1.001666242768362, + "learning_rate": 3.374410176413117e-05, + "loss": 0.1799, + "num_input_tokens_seen": 68177808, + "step": 10205 + }, + { + "epoch": 1.250895837799761, + "grad_norm": 1.1619611459217376, + "learning_rate": 3.369621366519453e-05, + "loss": 0.2216, + "num_input_tokens_seen": 68211568, + "step": 10210 + }, + { + "epoch": 1.2515083764662644, + "grad_norm": 1.512127989535974, + "learning_rate": 3.364834229535946e-05, + "loss": 0.2399, + "num_input_tokens_seen": 68244728, + "step": 10215 + }, + { + "epoch": 1.2521209151327677, + "grad_norm": 1.0875182289832053, + "learning_rate": 3.360048770374618e-05, + "loss": 0.2358, + "num_input_tokens_seen": 68278040, + "step": 10220 + }, + { + "epoch": 1.2527334537992711, + "grad_norm": 0.8613903903159598, + "learning_rate": 3.3552649939457634e-05, + "loss": 0.2189, + "num_input_tokens_seen": 68312056, + "step": 10225 + }, + { + "epoch": 1.2533459924657744, + "grad_norm": 1.453305242065048, + "learning_rate": 3.350482905157952e-05, + "loss": 0.2465, + "num_input_tokens_seen": 68345568, + "step": 10230 + }, + { + "epoch": 1.2539585311322776, + "grad_norm": 1.2514525034214972, + "learning_rate": 3.345702508918024e-05, + "loss": 0.2393, + "num_input_tokens_seen": 68379152, + "step": 10235 + }, + { + "epoch": 1.254571069798781, + "grad_norm": 1.2592102919373869, + "learning_rate": 3.34092381013108e-05, + "loss": 0.27, + "num_input_tokens_seen": 68412584, + "step": 10240 + }, + { + "epoch": 1.2551836084652843, + "grad_norm": 1.0958982494471285, + "learning_rate": 3.336146813700482e-05, + "loss": 0.2215, + "num_input_tokens_seen": 68446232, + "step": 10245 + }, + { + "epoch": 1.2557961471317878, + "grad_norm": 0.8092653699300046, + "learning_rate": 3.3313715245278434e-05, + "loss": 0.2368, + "num_input_tokens_seen": 68479264, + "step": 10250 + }, + { + "epoch": 1.2557961471317878, + "eval_loss": 0.1966872662305832, + "eval_runtime": 19.1612, + "eval_samples_per_second": 3.131, + "eval_steps_per_second": 0.783, + "num_input_tokens_seen": 68479264, + "step": 10250 + }, + { + "epoch": 1.256408685798291, + "grad_norm": 1.0352761658466105, + "learning_rate": 3.326597947513025e-05, + "loss": 0.1938, + "num_input_tokens_seen": 68513176, + "step": 10255 + }, + { + "epoch": 1.2570212244647943, + "grad_norm": 1.0777283789352847, + "learning_rate": 3.321826087554129e-05, + "loss": 0.2257, + "num_input_tokens_seen": 68547256, + "step": 10260 + }, + { + "epoch": 1.2576337631312977, + "grad_norm": 1.3423245290250527, + "learning_rate": 3.317055949547503e-05, + "loss": 0.2599, + "num_input_tokens_seen": 68580448, + "step": 10265 + }, + { + "epoch": 1.258246301797801, + "grad_norm": 1.4455392387346562, + "learning_rate": 3.3122875383877194e-05, + "loss": 0.2444, + "num_input_tokens_seen": 68614144, + "step": 10270 + }, + { + "epoch": 1.2588588404643044, + "grad_norm": 1.1766802710720077, + "learning_rate": 3.307520858967586e-05, + "loss": 0.2425, + "num_input_tokens_seen": 68647664, + "step": 10275 + }, + { + "epoch": 1.2594713791308076, + "grad_norm": 0.8034532858902004, + "learning_rate": 3.302755916178128e-05, + "loss": 0.2108, + "num_input_tokens_seen": 68681192, + "step": 10280 + }, + { + "epoch": 1.2600839177973109, + "grad_norm": 1.187551767143693, + "learning_rate": 3.297992714908589e-05, + "loss": 0.2493, + "num_input_tokens_seen": 68714600, + "step": 10285 + }, + { + "epoch": 1.2606964564638143, + "grad_norm": 1.2626992038766227, + "learning_rate": 3.293231260046431e-05, + "loss": 0.2286, + "num_input_tokens_seen": 68748432, + "step": 10290 + }, + { + "epoch": 1.2613089951303176, + "grad_norm": 0.9648518095639778, + "learning_rate": 3.288471556477317e-05, + "loss": 0.2285, + "num_input_tokens_seen": 68781728, + "step": 10295 + }, + { + "epoch": 1.261921533796821, + "grad_norm": 1.4967036865983903, + "learning_rate": 3.2837136090851205e-05, + "loss": 0.2253, + "num_input_tokens_seen": 68815336, + "step": 10300 + }, + { + "epoch": 1.261921533796821, + "eval_loss": 0.2143474966287613, + "eval_runtime": 19.5855, + "eval_samples_per_second": 3.063, + "eval_steps_per_second": 0.766, + "num_input_tokens_seen": 68815336, + "step": 10300 + }, + { + "epoch": 1.2625340724633243, + "grad_norm": 1.605431193393777, + "learning_rate": 3.278957422751907e-05, + "loss": 0.2482, + "num_input_tokens_seen": 68848592, + "step": 10305 + }, + { + "epoch": 1.2631466111298275, + "grad_norm": 1.1062566130229194, + "learning_rate": 3.274203002357935e-05, + "loss": 0.228, + "num_input_tokens_seen": 68882344, + "step": 10310 + }, + { + "epoch": 1.263759149796331, + "grad_norm": 1.4953017049047435, + "learning_rate": 3.2694503527816565e-05, + "loss": 0.2314, + "num_input_tokens_seen": 68916040, + "step": 10315 + }, + { + "epoch": 1.2643716884628342, + "grad_norm": 1.463458313824275, + "learning_rate": 3.2646994788997006e-05, + "loss": 0.2507, + "num_input_tokens_seen": 68949568, + "step": 10320 + }, + { + "epoch": 1.2649842271293377, + "grad_norm": 1.1531009879875487, + "learning_rate": 3.2599503855868775e-05, + "loss": 0.2489, + "num_input_tokens_seen": 68982152, + "step": 10325 + }, + { + "epoch": 1.265596765795841, + "grad_norm": 0.9924477448307528, + "learning_rate": 3.255203077716169e-05, + "loss": 0.2389, + "num_input_tokens_seen": 69015864, + "step": 10330 + }, + { + "epoch": 1.2662093044623441, + "grad_norm": 0.9805361301665197, + "learning_rate": 3.2504575601587294e-05, + "loss": 0.2492, + "num_input_tokens_seen": 69049896, + "step": 10335 + }, + { + "epoch": 1.2668218431288474, + "grad_norm": 1.6198340632093757, + "learning_rate": 3.245713837783866e-05, + "loss": 0.2217, + "num_input_tokens_seen": 69083632, + "step": 10340 + }, + { + "epoch": 1.2674343817953508, + "grad_norm": 1.279733871620096, + "learning_rate": 3.2409719154590534e-05, + "loss": 0.2422, + "num_input_tokens_seen": 69116488, + "step": 10345 + }, + { + "epoch": 1.268046920461854, + "grad_norm": 1.4374191444840163, + "learning_rate": 3.236231798049915e-05, + "loss": 0.2333, + "num_input_tokens_seen": 69150112, + "step": 10350 + }, + { + "epoch": 1.268046920461854, + "eval_loss": 0.1691417545080185, + "eval_runtime": 19.3563, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.775, + "num_input_tokens_seen": 69150112, + "step": 10350 + }, + { + "epoch": 1.2686594591283575, + "grad_norm": 1.014433988518987, + "learning_rate": 3.231493490420224e-05, + "loss": 0.217, + "num_input_tokens_seen": 69184024, + "step": 10355 + }, + { + "epoch": 1.2692719977948608, + "grad_norm": 0.9145976775310077, + "learning_rate": 3.2267569974318964e-05, + "loss": 0.2475, + "num_input_tokens_seen": 69218568, + "step": 10360 + }, + { + "epoch": 1.269884536461364, + "grad_norm": 1.1074249288744955, + "learning_rate": 3.2220223239449834e-05, + "loss": 0.1892, + "num_input_tokens_seen": 69252568, + "step": 10365 + }, + { + "epoch": 1.2704970751278675, + "grad_norm": 1.0194084035099427, + "learning_rate": 3.2172894748176743e-05, + "loss": 0.2745, + "num_input_tokens_seen": 69286336, + "step": 10370 + }, + { + "epoch": 1.2711096137943707, + "grad_norm": 0.9939548541974894, + "learning_rate": 3.21255845490628e-05, + "loss": 0.2482, + "num_input_tokens_seen": 69319784, + "step": 10375 + }, + { + "epoch": 1.2717221524608742, + "grad_norm": 1.5359006064772431, + "learning_rate": 3.207829269065242e-05, + "loss": 0.2778, + "num_input_tokens_seen": 69352992, + "step": 10380 + }, + { + "epoch": 1.2723346911273774, + "grad_norm": 1.1879233374670197, + "learning_rate": 3.203101922147113e-05, + "loss": 0.2333, + "num_input_tokens_seen": 69386192, + "step": 10385 + }, + { + "epoch": 1.2729472297938806, + "grad_norm": 1.420009795378401, + "learning_rate": 3.198376419002563e-05, + "loss": 0.1913, + "num_input_tokens_seen": 69420224, + "step": 10390 + }, + { + "epoch": 1.273559768460384, + "grad_norm": 1.028210972553943, + "learning_rate": 3.19365276448037e-05, + "loss": 0.2391, + "num_input_tokens_seen": 69453232, + "step": 10395 + }, + { + "epoch": 1.2741723071268873, + "grad_norm": 0.9784245377155898, + "learning_rate": 3.188930963427411e-05, + "loss": 0.2009, + "num_input_tokens_seen": 69486944, + "step": 10400 + }, + { + "epoch": 1.2741723071268873, + "eval_loss": 0.3153703510761261, + "eval_runtime": 19.572, + "eval_samples_per_second": 3.066, + "eval_steps_per_second": 0.766, + "num_input_tokens_seen": 69486944, + "step": 10400 + }, + { + "epoch": 1.2747848457933908, + "grad_norm": 0.9713327822934117, + "learning_rate": 3.184211020688667e-05, + "loss": 0.2063, + "num_input_tokens_seen": 69521024, + "step": 10405 + }, + { + "epoch": 1.275397384459894, + "grad_norm": 1.089429970441031, + "learning_rate": 3.179492941107207e-05, + "loss": 0.1923, + "num_input_tokens_seen": 69555328, + "step": 10410 + }, + { + "epoch": 1.2760099231263973, + "grad_norm": 1.8111858959264027, + "learning_rate": 3.174776729524196e-05, + "loss": 0.2565, + "num_input_tokens_seen": 69589040, + "step": 10415 + }, + { + "epoch": 1.2766224617929007, + "grad_norm": 1.2755084193576243, + "learning_rate": 3.17006239077887e-05, + "loss": 0.2575, + "num_input_tokens_seen": 69622376, + "step": 10420 + }, + { + "epoch": 1.277235000459404, + "grad_norm": 1.2029984019706983, + "learning_rate": 3.165349929708553e-05, + "loss": 0.2472, + "num_input_tokens_seen": 69654960, + "step": 10425 + }, + { + "epoch": 1.2778475391259074, + "grad_norm": 1.2226650815852944, + "learning_rate": 3.160639351148639e-05, + "loss": 0.2499, + "num_input_tokens_seen": 69688264, + "step": 10430 + }, + { + "epoch": 1.2784600777924107, + "grad_norm": 1.1852482827746924, + "learning_rate": 3.155930659932593e-05, + "loss": 0.1853, + "num_input_tokens_seen": 69722104, + "step": 10435 + }, + { + "epoch": 1.279072616458914, + "grad_norm": 0.7157981569032704, + "learning_rate": 3.15122386089194e-05, + "loss": 0.192, + "num_input_tokens_seen": 69755784, + "step": 10440 + }, + { + "epoch": 1.2796851551254174, + "grad_norm": 1.453118087596932, + "learning_rate": 3.146518958856264e-05, + "loss": 0.2412, + "num_input_tokens_seen": 69789632, + "step": 10445 + }, + { + "epoch": 1.2802976937919206, + "grad_norm": 1.4403312114135585, + "learning_rate": 3.1418159586532055e-05, + "loss": 0.2546, + "num_input_tokens_seen": 69822376, + "step": 10450 + }, + { + "epoch": 1.2802976937919206, + "eval_loss": 0.18921419978141785, + "eval_runtime": 19.3651, + "eval_samples_per_second": 3.098, + "eval_steps_per_second": 0.775, + "num_input_tokens_seen": 69822376, + "step": 10450 + }, + { + "epoch": 1.280910232458424, + "grad_norm": 1.1661386976448556, + "learning_rate": 3.137114865108448e-05, + "loss": 0.2298, + "num_input_tokens_seen": 69856272, + "step": 10455 + }, + { + "epoch": 1.2815227711249273, + "grad_norm": 1.4222630401322578, + "learning_rate": 3.1324156830457265e-05, + "loss": 0.2138, + "num_input_tokens_seen": 69890232, + "step": 10460 + }, + { + "epoch": 1.2821353097914305, + "grad_norm": 0.8283643883513497, + "learning_rate": 3.127718417286806e-05, + "loss": 0.1803, + "num_input_tokens_seen": 69924584, + "step": 10465 + }, + { + "epoch": 1.2827478484579338, + "grad_norm": 0.8441760490687942, + "learning_rate": 3.1230230726514924e-05, + "loss": 0.2116, + "num_input_tokens_seen": 69957760, + "step": 10470 + }, + { + "epoch": 1.2833603871244372, + "grad_norm": 1.4391335558179372, + "learning_rate": 3.118329653957617e-05, + "loss": 0.23, + "num_input_tokens_seen": 69991520, + "step": 10475 + }, + { + "epoch": 1.2839729257909407, + "grad_norm": 1.5114333017997594, + "learning_rate": 3.113638166021032e-05, + "loss": 0.2256, + "num_input_tokens_seen": 70025056, + "step": 10480 + }, + { + "epoch": 1.284585464457444, + "grad_norm": 1.4891257632314518, + "learning_rate": 3.1089486136556154e-05, + "loss": 0.233, + "num_input_tokens_seen": 70058384, + "step": 10485 + }, + { + "epoch": 1.2851980031239472, + "grad_norm": 1.1182418086711492, + "learning_rate": 3.104261001673253e-05, + "loss": 0.2252, + "num_input_tokens_seen": 70091384, + "step": 10490 + }, + { + "epoch": 1.2858105417904504, + "grad_norm": 1.2968262300781201, + "learning_rate": 3.099575334883846e-05, + "loss": 0.2096, + "num_input_tokens_seen": 70125232, + "step": 10495 + }, + { + "epoch": 1.2864230804569539, + "grad_norm": 1.3468668615560726, + "learning_rate": 3.0948916180952904e-05, + "loss": 0.2345, + "num_input_tokens_seen": 70158672, + "step": 10500 + }, + { + "epoch": 1.2864230804569539, + "eval_loss": 0.16857217252254486, + "eval_runtime": 19.7222, + "eval_samples_per_second": 3.042, + "eval_steps_per_second": 0.761, + "num_input_tokens_seen": 70158672, + "step": 10500 + }, + { + "epoch": 1.287035619123457, + "grad_norm": 0.9012879497685329, + "learning_rate": 3.0902098561134904e-05, + "loss": 0.2116, + "num_input_tokens_seen": 70192984, + "step": 10505 + }, + { + "epoch": 1.2876481577899606, + "grad_norm": 1.3789273480771664, + "learning_rate": 3.085530053742337e-05, + "loss": 0.2022, + "num_input_tokens_seen": 70226672, + "step": 10510 + }, + { + "epoch": 1.2882606964564638, + "grad_norm": 1.387479873023181, + "learning_rate": 3.080852215783719e-05, + "loss": 0.2485, + "num_input_tokens_seen": 70260368, + "step": 10515 + }, + { + "epoch": 1.288873235122967, + "grad_norm": 1.4250811467000308, + "learning_rate": 3.0761763470375013e-05, + "loss": 0.2304, + "num_input_tokens_seen": 70293664, + "step": 10520 + }, + { + "epoch": 1.2894857737894705, + "grad_norm": 1.3229833246923082, + "learning_rate": 3.071502452301531e-05, + "loss": 0.2487, + "num_input_tokens_seen": 70327240, + "step": 10525 + }, + { + "epoch": 1.2900983124559737, + "grad_norm": 1.1435247892196045, + "learning_rate": 3.066830536371633e-05, + "loss": 0.2326, + "num_input_tokens_seen": 70360248, + "step": 10530 + }, + { + "epoch": 1.2907108511224772, + "grad_norm": 0.813612109320127, + "learning_rate": 3.0621606040415975e-05, + "loss": 0.2181, + "num_input_tokens_seen": 70393960, + "step": 10535 + }, + { + "epoch": 1.2913233897889804, + "grad_norm": 1.5787677135995377, + "learning_rate": 3.0574926601031806e-05, + "loss": 0.2285, + "num_input_tokens_seen": 70427088, + "step": 10540 + }, + { + "epoch": 1.2919359284554837, + "grad_norm": 1.071253680990285, + "learning_rate": 3.0528267093460985e-05, + "loss": 0.2164, + "num_input_tokens_seen": 70460952, + "step": 10545 + }, + { + "epoch": 1.2925484671219871, + "grad_norm": 1.3304300925469836, + "learning_rate": 3.048162756558024e-05, + "loss": 0.2319, + "num_input_tokens_seen": 70494392, + "step": 10550 + }, + { + "epoch": 1.2925484671219871, + "eval_loss": 0.196364626288414, + "eval_runtime": 19.586, + "eval_samples_per_second": 3.063, + "eval_steps_per_second": 0.766, + "num_input_tokens_seen": 70494392, + "step": 10550 + }, + { + "epoch": 1.2931610057884904, + "grad_norm": 0.8977265208278654, + "learning_rate": 3.0435008065245756e-05, + "loss": 0.2058, + "num_input_tokens_seen": 70528424, + "step": 10555 + }, + { + "epoch": 1.2937735444549938, + "grad_norm": 1.206986830065902, + "learning_rate": 3.0388408640293198e-05, + "loss": 0.2432, + "num_input_tokens_seen": 70562120, + "step": 10560 + }, + { + "epoch": 1.294386083121497, + "grad_norm": 1.0732663047490307, + "learning_rate": 3.034182933853763e-05, + "loss": 0.2223, + "num_input_tokens_seen": 70596224, + "step": 10565 + }, + { + "epoch": 1.2949986217880003, + "grad_norm": 1.5159638008795495, + "learning_rate": 3.0295270207773464e-05, + "loss": 0.2498, + "num_input_tokens_seen": 70629656, + "step": 10570 + }, + { + "epoch": 1.2956111604545038, + "grad_norm": 1.5970725770683893, + "learning_rate": 3.024873129577443e-05, + "loss": 0.2135, + "num_input_tokens_seen": 70663696, + "step": 10575 + }, + { + "epoch": 1.296223699121007, + "grad_norm": 1.0523057107228013, + "learning_rate": 3.0202212650293448e-05, + "loss": 0.2202, + "num_input_tokens_seen": 70697176, + "step": 10580 + }, + { + "epoch": 1.2968362377875104, + "grad_norm": 1.2274819816943598, + "learning_rate": 3.0155714319062734e-05, + "loss": 0.2005, + "num_input_tokens_seen": 70731408, + "step": 10585 + }, + { + "epoch": 1.2974487764540137, + "grad_norm": 1.0426912906089882, + "learning_rate": 3.010923634979359e-05, + "loss": 0.1899, + "num_input_tokens_seen": 70765400, + "step": 10590 + }, + { + "epoch": 1.298061315120517, + "grad_norm": 1.3634097681799786, + "learning_rate": 3.0062778790176483e-05, + "loss": 0.2298, + "num_input_tokens_seen": 70798848, + "step": 10595 + }, + { + "epoch": 1.2986738537870204, + "grad_norm": 1.455375166373671, + "learning_rate": 3.001634168788088e-05, + "loss": 0.2834, + "num_input_tokens_seen": 70832136, + "step": 10600 + }, + { + "epoch": 1.2986738537870204, + "eval_loss": 0.18608230352401733, + "eval_runtime": 19.7321, + "eval_samples_per_second": 3.041, + "eval_steps_per_second": 0.76, + "num_input_tokens_seen": 70832136, + "step": 10600 + }, + { + "epoch": 1.2992863924535236, + "grad_norm": 1.035231554256891, + "learning_rate": 2.996992509055528e-05, + "loss": 0.218, + "num_input_tokens_seen": 70866120, + "step": 10605 + }, + { + "epoch": 1.299898931120027, + "grad_norm": 1.501133744865775, + "learning_rate": 2.992352904582717e-05, + "loss": 0.215, + "num_input_tokens_seen": 70899936, + "step": 10610 + }, + { + "epoch": 1.3005114697865303, + "grad_norm": 1.50291918580223, + "learning_rate": 2.9877153601302893e-05, + "loss": 0.222, + "num_input_tokens_seen": 70933288, + "step": 10615 + }, + { + "epoch": 1.3011240084530336, + "grad_norm": 1.2133468261955178, + "learning_rate": 2.9830798804567716e-05, + "loss": 0.2343, + "num_input_tokens_seen": 70967296, + "step": 10620 + }, + { + "epoch": 1.3017365471195368, + "grad_norm": 1.1725568338460373, + "learning_rate": 2.9784464703185666e-05, + "loss": 0.2402, + "num_input_tokens_seen": 71000784, + "step": 10625 + }, + { + "epoch": 1.3023490857860403, + "grad_norm": 1.0014305965405448, + "learning_rate": 2.973815134469958e-05, + "loss": 0.2326, + "num_input_tokens_seen": 71034616, + "step": 10630 + }, + { + "epoch": 1.3029616244525435, + "grad_norm": 1.3780511441318963, + "learning_rate": 2.9691858776630965e-05, + "loss": 0.2387, + "num_input_tokens_seen": 71067488, + "step": 10635 + }, + { + "epoch": 1.303574163119047, + "grad_norm": 1.544176148681233, + "learning_rate": 2.964558704648003e-05, + "loss": 0.2606, + "num_input_tokens_seen": 71100848, + "step": 10640 + }, + { + "epoch": 1.3041867017855502, + "grad_norm": 1.5606371132464487, + "learning_rate": 2.959933620172559e-05, + "loss": 0.2349, + "num_input_tokens_seen": 71134040, + "step": 10645 + }, + { + "epoch": 1.3047992404520534, + "grad_norm": 1.3061543164710514, + "learning_rate": 2.9553106289825028e-05, + "loss": 0.2106, + "num_input_tokens_seen": 71167464, + "step": 10650 + }, + { + "epoch": 1.3047992404520534, + "eval_loss": 0.14994314312934875, + "eval_runtime": 19.5296, + "eval_samples_per_second": 3.072, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 71167464, + "step": 10650 + }, + { + "epoch": 1.3054117791185569, + "grad_norm": 1.206825055856801, + "learning_rate": 2.950689735821428e-05, + "loss": 0.2511, + "num_input_tokens_seen": 71200776, + "step": 10655 + }, + { + "epoch": 1.3060243177850601, + "grad_norm": 0.8738688843235899, + "learning_rate": 2.9460709454307668e-05, + "loss": 0.2303, + "num_input_tokens_seen": 71234168, + "step": 10660 + }, + { + "epoch": 1.3066368564515636, + "grad_norm": 1.2670653369653868, + "learning_rate": 2.9414542625498032e-05, + "loss": 0.2423, + "num_input_tokens_seen": 71268008, + "step": 10665 + }, + { + "epoch": 1.3072493951180668, + "grad_norm": 1.0765877081535313, + "learning_rate": 2.9368396919156543e-05, + "loss": 0.2461, + "num_input_tokens_seen": 71301256, + "step": 10670 + }, + { + "epoch": 1.30786193378457, + "grad_norm": 1.0930641133259775, + "learning_rate": 2.932227238263271e-05, + "loss": 0.2585, + "num_input_tokens_seen": 71334304, + "step": 10675 + }, + { + "epoch": 1.3084744724510735, + "grad_norm": 1.5727229135478251, + "learning_rate": 2.927616906325432e-05, + "loss": 0.2558, + "num_input_tokens_seen": 71367784, + "step": 10680 + }, + { + "epoch": 1.3090870111175767, + "grad_norm": 1.1094592860069892, + "learning_rate": 2.9230087008327368e-05, + "loss": 0.2396, + "num_input_tokens_seen": 71400960, + "step": 10685 + }, + { + "epoch": 1.3096995497840802, + "grad_norm": 1.248724168955527, + "learning_rate": 2.9184026265136056e-05, + "loss": 0.2212, + "num_input_tokens_seen": 71433712, + "step": 10690 + }, + { + "epoch": 1.3103120884505834, + "grad_norm": 1.7911517740511143, + "learning_rate": 2.9137986880942715e-05, + "loss": 0.2074, + "num_input_tokens_seen": 71467256, + "step": 10695 + }, + { + "epoch": 1.3109246271170867, + "grad_norm": 1.2413107560000207, + "learning_rate": 2.909196890298773e-05, + "loss": 0.206, + "num_input_tokens_seen": 71501128, + "step": 10700 + }, + { + "epoch": 1.3109246271170867, + "eval_loss": 0.19016502797603607, + "eval_runtime": 19.7377, + "eval_samples_per_second": 3.04, + "eval_steps_per_second": 0.76, + "num_input_tokens_seen": 71501128, + "step": 10700 + }, + { + "epoch": 1.3115371657835901, + "grad_norm": 1.440592474356754, + "learning_rate": 2.9045972378489604e-05, + "loss": 0.2671, + "num_input_tokens_seen": 71534760, + "step": 10705 + }, + { + "epoch": 1.3121497044500934, + "grad_norm": 1.6043559122436624, + "learning_rate": 2.8999997354644698e-05, + "loss": 0.2431, + "num_input_tokens_seen": 71568000, + "step": 10710 + }, + { + "epoch": 1.3127622431165968, + "grad_norm": 1.6562341112618186, + "learning_rate": 2.8954043878627414e-05, + "loss": 0.2767, + "num_input_tokens_seen": 71601664, + "step": 10715 + }, + { + "epoch": 1.3133747817831, + "grad_norm": 1.3116654592131844, + "learning_rate": 2.890811199759003e-05, + "loss": 0.2077, + "num_input_tokens_seen": 71635832, + "step": 10720 + }, + { + "epoch": 1.3139873204496033, + "grad_norm": 13.474160151030459, + "learning_rate": 2.886220175866261e-05, + "loss": 0.2503, + "num_input_tokens_seen": 71669720, + "step": 10725 + }, + { + "epoch": 1.3145998591161068, + "grad_norm": 1.3031307301091186, + "learning_rate": 2.881631320895306e-05, + "loss": 0.2579, + "num_input_tokens_seen": 71703304, + "step": 10730 + }, + { + "epoch": 1.31521239778261, + "grad_norm": 1.193828203752651, + "learning_rate": 2.8770446395547036e-05, + "loss": 0.2089, + "num_input_tokens_seen": 71737192, + "step": 10735 + }, + { + "epoch": 1.3158249364491135, + "grad_norm": 1.0929056779676314, + "learning_rate": 2.87246013655079e-05, + "loss": 0.2317, + "num_input_tokens_seen": 71770632, + "step": 10740 + }, + { + "epoch": 1.3164374751156167, + "grad_norm": 1.0813012821730756, + "learning_rate": 2.8678778165876556e-05, + "loss": 0.2344, + "num_input_tokens_seen": 71804112, + "step": 10745 + }, + { + "epoch": 1.31705001378212, + "grad_norm": 1.228614756425054, + "learning_rate": 2.863297684367164e-05, + "loss": 0.184, + "num_input_tokens_seen": 71837520, + "step": 10750 + }, + { + "epoch": 1.31705001378212, + "eval_loss": 0.2411387860774994, + "eval_runtime": 19.9995, + "eval_samples_per_second": 3.0, + "eval_steps_per_second": 0.75, + "num_input_tokens_seen": 71837520, + "step": 10750 + }, + { + "epoch": 1.3176625524486232, + "grad_norm": 1.1943825653656646, + "learning_rate": 2.858719744588928e-05, + "loss": 0.1921, + "num_input_tokens_seen": 71870856, + "step": 10755 + }, + { + "epoch": 1.3182750911151266, + "grad_norm": 1.2513095958234757, + "learning_rate": 2.854144001950314e-05, + "loss": 0.1655, + "num_input_tokens_seen": 71904984, + "step": 10760 + }, + { + "epoch": 1.31888762978163, + "grad_norm": 0.9949634903926544, + "learning_rate": 2.8495704611464275e-05, + "loss": 0.1703, + "num_input_tokens_seen": 71939480, + "step": 10765 + }, + { + "epoch": 1.3195001684481333, + "grad_norm": 1.1828002010795184, + "learning_rate": 2.8449991268701208e-05, + "loss": 0.2253, + "num_input_tokens_seen": 71973360, + "step": 10770 + }, + { + "epoch": 1.3201127071146366, + "grad_norm": 1.375476605140409, + "learning_rate": 2.8404300038119825e-05, + "loss": 0.2047, + "num_input_tokens_seen": 72007440, + "step": 10775 + }, + { + "epoch": 1.3207252457811398, + "grad_norm": 1.1086035003719277, + "learning_rate": 2.835863096660325e-05, + "loss": 0.1839, + "num_input_tokens_seen": 72041328, + "step": 10780 + }, + { + "epoch": 1.3213377844476433, + "grad_norm": 1.202221399382527, + "learning_rate": 2.8312984101011946e-05, + "loss": 0.2406, + "num_input_tokens_seen": 72074904, + "step": 10785 + }, + { + "epoch": 1.3219503231141465, + "grad_norm": 1.3913683968268147, + "learning_rate": 2.826735948818358e-05, + "loss": 0.2341, + "num_input_tokens_seen": 72108648, + "step": 10790 + }, + { + "epoch": 1.32256286178065, + "grad_norm": 1.8386972604975618, + "learning_rate": 2.8221757174932938e-05, + "loss": 0.2294, + "num_input_tokens_seen": 72141880, + "step": 10795 + }, + { + "epoch": 1.3231754004471532, + "grad_norm": 1.2668737312898322, + "learning_rate": 2.817617720805197e-05, + "loss": 0.2369, + "num_input_tokens_seen": 72175680, + "step": 10800 + }, + { + "epoch": 1.3231754004471532, + "eval_loss": 0.17310434579849243, + "eval_runtime": 19.7821, + "eval_samples_per_second": 3.033, + "eval_steps_per_second": 0.758, + "num_input_tokens_seen": 72175680, + "step": 10800 + }, + { + "epoch": 1.3237879391136564, + "grad_norm": 1.193851678386765, + "learning_rate": 2.8130619634309676e-05, + "loss": 0.2378, + "num_input_tokens_seen": 72209280, + "step": 10805 + }, + { + "epoch": 1.32440047778016, + "grad_norm": 1.252979555050513, + "learning_rate": 2.8085084500452113e-05, + "loss": 0.1945, + "num_input_tokens_seen": 72243368, + "step": 10810 + }, + { + "epoch": 1.3250130164466631, + "grad_norm": 8.218269948560957, + "learning_rate": 2.803957185320225e-05, + "loss": 0.2723, + "num_input_tokens_seen": 72276136, + "step": 10815 + }, + { + "epoch": 1.3256255551131666, + "grad_norm": 1.5871474047445882, + "learning_rate": 2.7994081739260048e-05, + "loss": 0.2144, + "num_input_tokens_seen": 72310256, + "step": 10820 + }, + { + "epoch": 1.3262380937796698, + "grad_norm": 1.1009743124735736, + "learning_rate": 2.7948614205302288e-05, + "loss": 0.27, + "num_input_tokens_seen": 72343800, + "step": 10825 + }, + { + "epoch": 1.326850632446173, + "grad_norm": 1.0339601684569133, + "learning_rate": 2.790316929798264e-05, + "loss": 0.2218, + "num_input_tokens_seen": 72377088, + "step": 10830 + }, + { + "epoch": 1.3274631711126765, + "grad_norm": 1.3195849978293093, + "learning_rate": 2.7857747063931493e-05, + "loss": 0.2287, + "num_input_tokens_seen": 72410240, + "step": 10835 + }, + { + "epoch": 1.3280757097791798, + "grad_norm": 0.8907295131666464, + "learning_rate": 2.7812347549756035e-05, + "loss": 0.2552, + "num_input_tokens_seen": 72443824, + "step": 10840 + }, + { + "epoch": 1.3286882484456832, + "grad_norm": 1.481374985087961, + "learning_rate": 2.7766970802040114e-05, + "loss": 0.2357, + "num_input_tokens_seen": 72477288, + "step": 10845 + }, + { + "epoch": 1.3293007871121865, + "grad_norm": 1.0052206504673544, + "learning_rate": 2.772161686734419e-05, + "loss": 0.2175, + "num_input_tokens_seen": 72511424, + "step": 10850 + }, + { + "epoch": 1.3293007871121865, + "eval_loss": 0.1683052033185959, + "eval_runtime": 19.9926, + "eval_samples_per_second": 3.001, + "eval_steps_per_second": 0.75, + "num_input_tokens_seen": 72511424, + "step": 10850 + }, + { + "epoch": 1.3299133257786897, + "grad_norm": 0.9400960640670121, + "learning_rate": 2.767628579220536e-05, + "loss": 0.2181, + "num_input_tokens_seen": 72545080, + "step": 10855 + }, + { + "epoch": 1.3305258644451932, + "grad_norm": 0.9574102034561467, + "learning_rate": 2.7630977623137244e-05, + "loss": 0.2648, + "num_input_tokens_seen": 72578272, + "step": 10860 + }, + { + "epoch": 1.3311384031116964, + "grad_norm": 1.105259832070157, + "learning_rate": 2.758569240662998e-05, + "loss": 0.1966, + "num_input_tokens_seen": 72612536, + "step": 10865 + }, + { + "epoch": 1.3317509417781999, + "grad_norm": 1.2001173134882535, + "learning_rate": 2.7540430189150098e-05, + "loss": 0.247, + "num_input_tokens_seen": 72645920, + "step": 10870 + }, + { + "epoch": 1.332363480444703, + "grad_norm": 1.3396654675470272, + "learning_rate": 2.7495191017140582e-05, + "loss": 0.1914, + "num_input_tokens_seen": 72680416, + "step": 10875 + }, + { + "epoch": 1.3329760191112063, + "grad_norm": 1.373286103807185, + "learning_rate": 2.7449974937020778e-05, + "loss": 0.2471, + "num_input_tokens_seen": 72714560, + "step": 10880 + }, + { + "epoch": 1.3335885577777098, + "grad_norm": 11.397219776191838, + "learning_rate": 2.7404781995186267e-05, + "loss": 0.231, + "num_input_tokens_seen": 72748336, + "step": 10885 + }, + { + "epoch": 1.334201096444213, + "grad_norm": 1.6874289712995845, + "learning_rate": 2.7359612238008968e-05, + "loss": 0.2507, + "num_input_tokens_seen": 72782040, + "step": 10890 + }, + { + "epoch": 1.3348136351107165, + "grad_norm": 1.0849549796943527, + "learning_rate": 2.7314465711836967e-05, + "loss": 0.2298, + "num_input_tokens_seen": 72815992, + "step": 10895 + }, + { + "epoch": 1.3354261737772197, + "grad_norm": 0.9600258760416324, + "learning_rate": 2.7269342462994563e-05, + "loss": 0.197, + "num_input_tokens_seen": 72850512, + "step": 10900 + }, + { + "epoch": 1.3354261737772197, + "eval_loss": 0.22737418115139008, + "eval_runtime": 19.6079, + "eval_samples_per_second": 3.06, + "eval_steps_per_second": 0.765, + "num_input_tokens_seen": 72850512, + "step": 10900 + }, + { + "epoch": 1.336038712443723, + "grad_norm": 1.3265511756999933, + "learning_rate": 2.7224242537782056e-05, + "loss": 0.1994, + "num_input_tokens_seen": 72884432, + "step": 10905 + }, + { + "epoch": 1.3366512511102262, + "grad_norm": 1.2264412034005454, + "learning_rate": 2.7179165982475924e-05, + "loss": 0.2839, + "num_input_tokens_seen": 72917608, + "step": 10910 + }, + { + "epoch": 1.3372637897767297, + "grad_norm": 1.5094223269822984, + "learning_rate": 2.713411284332863e-05, + "loss": 0.186, + "num_input_tokens_seen": 72952000, + "step": 10915 + }, + { + "epoch": 1.337876328443233, + "grad_norm": 0.8664736323227381, + "learning_rate": 2.708908316656863e-05, + "loss": 0.2018, + "num_input_tokens_seen": 72985840, + "step": 10920 + }, + { + "epoch": 1.3384888671097364, + "grad_norm": 1.317958833725117, + "learning_rate": 2.7044076998400247e-05, + "loss": 0.2494, + "num_input_tokens_seen": 73019136, + "step": 10925 + }, + { + "epoch": 1.3391014057762396, + "grad_norm": 1.6578104286015414, + "learning_rate": 2.6999094385003743e-05, + "loss": 0.199, + "num_input_tokens_seen": 73053088, + "step": 10930 + }, + { + "epoch": 1.3397139444427428, + "grad_norm": 0.8606284496425388, + "learning_rate": 2.69541353725352e-05, + "loss": 0.288, + "num_input_tokens_seen": 73086296, + "step": 10935 + }, + { + "epoch": 1.3403264831092463, + "grad_norm": 0.9567441511611826, + "learning_rate": 2.690920000712644e-05, + "loss": 0.1766, + "num_input_tokens_seen": 73120432, + "step": 10940 + }, + { + "epoch": 1.3409390217757495, + "grad_norm": 1.3031676970273844, + "learning_rate": 2.6864288334885067e-05, + "loss": 0.2227, + "num_input_tokens_seen": 73153912, + "step": 10945 + }, + { + "epoch": 1.341551560442253, + "grad_norm": 1.1674568949705564, + "learning_rate": 2.6819400401894385e-05, + "loss": 0.2192, + "num_input_tokens_seen": 73187184, + "step": 10950 + }, + { + "epoch": 1.341551560442253, + "eval_loss": 0.21316958963871002, + "eval_runtime": 19.8522, + "eval_samples_per_second": 3.022, + "eval_steps_per_second": 0.756, + "num_input_tokens_seen": 73187184, + "step": 10950 + }, + { + "epoch": 1.3421640991087562, + "grad_norm": 1.3818732126484, + "learning_rate": 2.6774536254213274e-05, + "loss": 0.2332, + "num_input_tokens_seen": 73220296, + "step": 10955 + }, + { + "epoch": 1.3427766377752595, + "grad_norm": 1.1330695909958177, + "learning_rate": 2.672969593787626e-05, + "loss": 0.1974, + "num_input_tokens_seen": 73254424, + "step": 10960 + }, + { + "epoch": 1.343389176441763, + "grad_norm": 1.8794846556627935, + "learning_rate": 2.6684879498893412e-05, + "loss": 0.2102, + "num_input_tokens_seen": 73287672, + "step": 10965 + }, + { + "epoch": 1.3440017151082662, + "grad_norm": 1.688861594294699, + "learning_rate": 2.6640086983250313e-05, + "loss": 0.2269, + "num_input_tokens_seen": 73321264, + "step": 10970 + }, + { + "epoch": 1.3446142537747696, + "grad_norm": 1.2840178088378835, + "learning_rate": 2.6595318436907924e-05, + "loss": 0.2103, + "num_input_tokens_seen": 73355680, + "step": 10975 + }, + { + "epoch": 1.3452267924412729, + "grad_norm": 1.247167956883848, + "learning_rate": 2.6550573905802718e-05, + "loss": 0.2326, + "num_input_tokens_seen": 73389408, + "step": 10980 + }, + { + "epoch": 1.345839331107776, + "grad_norm": 1.2822024259878135, + "learning_rate": 2.6505853435846422e-05, + "loss": 0.2272, + "num_input_tokens_seen": 73422048, + "step": 10985 + }, + { + "epoch": 1.3464518697742796, + "grad_norm": 1.1622973545823398, + "learning_rate": 2.6461157072926175e-05, + "loss": 0.2807, + "num_input_tokens_seen": 73455432, + "step": 10990 + }, + { + "epoch": 1.3470644084407828, + "grad_norm": 1.004943755449525, + "learning_rate": 2.6416484862904278e-05, + "loss": 0.2443, + "num_input_tokens_seen": 73488176, + "step": 10995 + }, + { + "epoch": 1.3476769471072862, + "grad_norm": 1.3365941433926147, + "learning_rate": 2.6371836851618325e-05, + "loss": 0.2141, + "num_input_tokens_seen": 73522280, + "step": 11000 + }, + { + "epoch": 1.3476769471072862, + "eval_loss": 0.18228411674499512, + "eval_runtime": 19.8096, + "eval_samples_per_second": 3.029, + "eval_steps_per_second": 0.757, + "num_input_tokens_seen": 73522280, + "step": 11000 + }, + { + "epoch": 1.3482894857737895, + "grad_norm": 1.0584812304110847, + "learning_rate": 2.6327213084881087e-05, + "loss": 0.2179, + "num_input_tokens_seen": 73555864, + "step": 11005 + }, + { + "epoch": 1.3489020244402927, + "grad_norm": 1.6181583622187452, + "learning_rate": 2.6282613608480365e-05, + "loss": 0.219, + "num_input_tokens_seen": 73589336, + "step": 11010 + }, + { + "epoch": 1.3495145631067962, + "grad_norm": 1.5049147554790288, + "learning_rate": 2.623803846817913e-05, + "loss": 0.2664, + "num_input_tokens_seen": 73622416, + "step": 11015 + }, + { + "epoch": 1.3501271017732994, + "grad_norm": 1.8612030571869094, + "learning_rate": 2.6193487709715347e-05, + "loss": 0.1887, + "num_input_tokens_seen": 73656424, + "step": 11020 + }, + { + "epoch": 1.3507396404398029, + "grad_norm": 0.9601130348845878, + "learning_rate": 2.6148961378801994e-05, + "loss": 0.1939, + "num_input_tokens_seen": 73690536, + "step": 11025 + }, + { + "epoch": 1.3513521791063061, + "grad_norm": 1.3593144908372155, + "learning_rate": 2.6104459521126902e-05, + "loss": 0.2455, + "num_input_tokens_seen": 73724000, + "step": 11030 + }, + { + "epoch": 1.3519647177728094, + "grad_norm": 1.122734822407193, + "learning_rate": 2.605998218235287e-05, + "loss": 0.2239, + "num_input_tokens_seen": 73757704, + "step": 11035 + }, + { + "epoch": 1.3525772564393128, + "grad_norm": 1.0137585771221411, + "learning_rate": 2.6015529408117534e-05, + "loss": 0.1928, + "num_input_tokens_seen": 73790936, + "step": 11040 + }, + { + "epoch": 1.353189795105816, + "grad_norm": 1.2431928830058827, + "learning_rate": 2.597110124403327e-05, + "loss": 0.2083, + "num_input_tokens_seen": 73824584, + "step": 11045 + }, + { + "epoch": 1.3538023337723195, + "grad_norm": 0.909697320792005, + "learning_rate": 2.592669773568724e-05, + "loss": 0.2257, + "num_input_tokens_seen": 73858104, + "step": 11050 + }, + { + "epoch": 1.3538023337723195, + "eval_loss": 0.16352061927318573, + "eval_runtime": 19.6545, + "eval_samples_per_second": 3.053, + "eval_steps_per_second": 0.763, + "num_input_tokens_seen": 73858104, + "step": 11050 + }, + { + "epoch": 1.3544148724388227, + "grad_norm": 1.3364950700655314, + "learning_rate": 2.5882318928641313e-05, + "loss": 0.2284, + "num_input_tokens_seen": 73891168, + "step": 11055 + }, + { + "epoch": 1.355027411105326, + "grad_norm": 1.3873141449877706, + "learning_rate": 2.583796486843202e-05, + "loss": 0.2511, + "num_input_tokens_seen": 73923784, + "step": 11060 + }, + { + "epoch": 1.3556399497718292, + "grad_norm": 1.3413776965717361, + "learning_rate": 2.5793635600570455e-05, + "loss": 0.2252, + "num_input_tokens_seen": 73956896, + "step": 11065 + }, + { + "epoch": 1.3562524884383327, + "grad_norm": 1.3642728146254033, + "learning_rate": 2.574933117054228e-05, + "loss": 0.2297, + "num_input_tokens_seen": 73990912, + "step": 11070 + }, + { + "epoch": 1.356865027104836, + "grad_norm": 1.4134480486036514, + "learning_rate": 2.5705051623807726e-05, + "loss": 0.2242, + "num_input_tokens_seen": 74024656, + "step": 11075 + }, + { + "epoch": 1.3574775657713394, + "grad_norm": 1.2888645288114207, + "learning_rate": 2.5660797005801452e-05, + "loss": 0.21, + "num_input_tokens_seen": 74058600, + "step": 11080 + }, + { + "epoch": 1.3580901044378426, + "grad_norm": 1.242149620681887, + "learning_rate": 2.561656736193252e-05, + "loss": 0.2284, + "num_input_tokens_seen": 74091880, + "step": 11085 + }, + { + "epoch": 1.3587026431043459, + "grad_norm": 1.48502993736038, + "learning_rate": 2.5572362737584404e-05, + "loss": 0.2295, + "num_input_tokens_seen": 74125064, + "step": 11090 + }, + { + "epoch": 1.3593151817708493, + "grad_norm": 1.0474848586542544, + "learning_rate": 2.5528183178114906e-05, + "loss": 0.1969, + "num_input_tokens_seen": 74158416, + "step": 11095 + }, + { + "epoch": 1.3599277204373525, + "grad_norm": 1.7242993602408807, + "learning_rate": 2.5484028728856058e-05, + "loss": 0.2696, + "num_input_tokens_seen": 74191944, + "step": 11100 + }, + { + "epoch": 1.3599277204373525, + "eval_loss": 0.13369590044021606, + "eval_runtime": 19.5902, + "eval_samples_per_second": 3.063, + "eval_steps_per_second": 0.766, + "num_input_tokens_seen": 74191944, + "step": 11100 + }, + { + "epoch": 1.360540259103856, + "grad_norm": 1.435897138829068, + "learning_rate": 2.5439899435114168e-05, + "loss": 0.2247, + "num_input_tokens_seen": 74225480, + "step": 11105 + }, + { + "epoch": 1.3611527977703592, + "grad_norm": 1.3919962829607588, + "learning_rate": 2.5395795342169748e-05, + "loss": 0.2181, + "num_input_tokens_seen": 74259160, + "step": 11110 + }, + { + "epoch": 1.3617653364368625, + "grad_norm": 1.4308887280818654, + "learning_rate": 2.5351716495277434e-05, + "loss": 0.2141, + "num_input_tokens_seen": 74292656, + "step": 11115 + }, + { + "epoch": 1.362377875103366, + "grad_norm": 1.2156729035104692, + "learning_rate": 2.530766293966592e-05, + "loss": 0.2328, + "num_input_tokens_seen": 74325968, + "step": 11120 + }, + { + "epoch": 1.3629904137698692, + "grad_norm": 1.4042960548082626, + "learning_rate": 2.5263634720538005e-05, + "loss": 0.1933, + "num_input_tokens_seen": 74360072, + "step": 11125 + }, + { + "epoch": 1.3636029524363726, + "grad_norm": 1.392194430211413, + "learning_rate": 2.5219631883070482e-05, + "loss": 0.2065, + "num_input_tokens_seen": 74393752, + "step": 11130 + }, + { + "epoch": 1.3642154911028759, + "grad_norm": 1.259698652000111, + "learning_rate": 2.5175654472414047e-05, + "loss": 0.2419, + "num_input_tokens_seen": 74427864, + "step": 11135 + }, + { + "epoch": 1.3648280297693791, + "grad_norm": 1.3897695666702072, + "learning_rate": 2.513170253369337e-05, + "loss": 0.2281, + "num_input_tokens_seen": 74461312, + "step": 11140 + }, + { + "epoch": 1.3654405684358826, + "grad_norm": 1.4438546309546925, + "learning_rate": 2.5087776112006966e-05, + "loss": 0.2062, + "num_input_tokens_seen": 74494648, + "step": 11145 + }, + { + "epoch": 1.3660531071023858, + "grad_norm": 1.1883464550173541, + "learning_rate": 2.5043875252427156e-05, + "loss": 0.236, + "num_input_tokens_seen": 74528296, + "step": 11150 + }, + { + "epoch": 1.3660531071023858, + "eval_loss": 0.14952169358730316, + "eval_runtime": 19.3786, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 0.774, + "num_input_tokens_seen": 74528296, + "step": 11150 + }, + { + "epoch": 1.3666656457688893, + "grad_norm": 1.1954091110116585, + "learning_rate": 2.500000000000001e-05, + "loss": 0.1971, + "num_input_tokens_seen": 74562368, + "step": 11155 + }, + { + "epoch": 1.3672781844353925, + "grad_norm": 1.448181363417585, + "learning_rate": 2.4956150399745375e-05, + "loss": 0.2395, + "num_input_tokens_seen": 74596328, + "step": 11160 + }, + { + "epoch": 1.3678907231018957, + "grad_norm": 2.1272140492982055, + "learning_rate": 2.4912326496656762e-05, + "loss": 0.2587, + "num_input_tokens_seen": 74629408, + "step": 11165 + }, + { + "epoch": 1.3685032617683992, + "grad_norm": 1.2118543871719263, + "learning_rate": 2.4868528335701275e-05, + "loss": 0.2276, + "num_input_tokens_seen": 74662920, + "step": 11170 + }, + { + "epoch": 1.3691158004349024, + "grad_norm": 1.5201689858137362, + "learning_rate": 2.4824755961819647e-05, + "loss": 0.2238, + "num_input_tokens_seen": 74696728, + "step": 11175 + }, + { + "epoch": 1.369728339101406, + "grad_norm": 1.6964360928117685, + "learning_rate": 2.478100941992613e-05, + "loss": 0.2288, + "num_input_tokens_seen": 74730448, + "step": 11180 + }, + { + "epoch": 1.3703408777679091, + "grad_norm": 1.0971640842186783, + "learning_rate": 2.4737288754908517e-05, + "loss": 0.2066, + "num_input_tokens_seen": 74764088, + "step": 11185 + }, + { + "epoch": 1.3709534164344124, + "grad_norm": 0.9738244569482346, + "learning_rate": 2.469359401162795e-05, + "loss": 0.2158, + "num_input_tokens_seen": 74797488, + "step": 11190 + }, + { + "epoch": 1.3715659551009156, + "grad_norm": 1.1636541723418108, + "learning_rate": 2.4649925234919063e-05, + "loss": 0.1934, + "num_input_tokens_seen": 74830920, + "step": 11195 + }, + { + "epoch": 1.372178493767419, + "grad_norm": 1.107536927721727, + "learning_rate": 2.4606282469589837e-05, + "loss": 0.1875, + "num_input_tokens_seen": 74864760, + "step": 11200 + }, + { + "epoch": 1.372178493767419, + "eval_loss": 0.15537530183792114, + "eval_runtime": 19.755, + "eval_samples_per_second": 3.037, + "eval_steps_per_second": 0.759, + "num_input_tokens_seen": 74864760, + "step": 11200 + }, + { + "epoch": 1.3727910324339223, + "grad_norm": 1.51229292450707, + "learning_rate": 2.4562665760421493e-05, + "loss": 0.2228, + "num_input_tokens_seen": 74898192, + "step": 11205 + }, + { + "epoch": 1.3734035711004258, + "grad_norm": 1.3818600096103932, + "learning_rate": 2.4519075152168597e-05, + "loss": 0.2745, + "num_input_tokens_seen": 74930960, + "step": 11210 + }, + { + "epoch": 1.374016109766929, + "grad_norm": 1.3756106461873228, + "learning_rate": 2.447551068955889e-05, + "loss": 0.204, + "num_input_tokens_seen": 74964680, + "step": 11215 + }, + { + "epoch": 1.3746286484334322, + "grad_norm": 1.0064209232056647, + "learning_rate": 2.4431972417293325e-05, + "loss": 0.2171, + "num_input_tokens_seen": 74998424, + "step": 11220 + }, + { + "epoch": 1.3752411870999357, + "grad_norm": 1.200693688480308, + "learning_rate": 2.4388460380045925e-05, + "loss": 0.2694, + "num_input_tokens_seen": 75031816, + "step": 11225 + }, + { + "epoch": 1.375853725766439, + "grad_norm": 1.5919153040868224, + "learning_rate": 2.4344974622463805e-05, + "loss": 0.2386, + "num_input_tokens_seen": 75065480, + "step": 11230 + }, + { + "epoch": 1.3764662644329424, + "grad_norm": 1.054999563757491, + "learning_rate": 2.4301515189167152e-05, + "loss": 0.215, + "num_input_tokens_seen": 75098816, + "step": 11235 + }, + { + "epoch": 1.3770788030994456, + "grad_norm": 1.044486761861506, + "learning_rate": 2.4258082124749137e-05, + "loss": 0.1899, + "num_input_tokens_seen": 75131904, + "step": 11240 + }, + { + "epoch": 1.3776913417659489, + "grad_norm": 1.4468459674495724, + "learning_rate": 2.4214675473775826e-05, + "loss": 0.2442, + "num_input_tokens_seen": 75164672, + "step": 11245 + }, + { + "epoch": 1.3783038804324523, + "grad_norm": 16.578633531559458, + "learning_rate": 2.417129528078623e-05, + "loss": 0.2163, + "num_input_tokens_seen": 75199264, + "step": 11250 + }, + { + "epoch": 1.3783038804324523, + "eval_loss": 0.14667704701423645, + "eval_runtime": 20.3189, + "eval_samples_per_second": 2.953, + "eval_steps_per_second": 0.738, + "num_input_tokens_seen": 75199264, + "step": 11250 + }, + { + "epoch": 1.3789164190989556, + "grad_norm": 2.338623943755047, + "learning_rate": 2.4127941590292225e-05, + "loss": 0.2058, + "num_input_tokens_seen": 75232800, + "step": 11255 + }, + { + "epoch": 1.379528957765459, + "grad_norm": 1.2367938341116378, + "learning_rate": 2.408461444677841e-05, + "loss": 0.2428, + "num_input_tokens_seen": 75266240, + "step": 11260 + }, + { + "epoch": 1.3801414964319623, + "grad_norm": 1.2910846803839258, + "learning_rate": 2.404131389470223e-05, + "loss": 0.1717, + "num_input_tokens_seen": 75300144, + "step": 11265 + }, + { + "epoch": 1.3807540350984655, + "grad_norm": 30.973267810423224, + "learning_rate": 2.399803997849381e-05, + "loss": 0.2426, + "num_input_tokens_seen": 75333520, + "step": 11270 + }, + { + "epoch": 1.381366573764969, + "grad_norm": 1.8951469707567008, + "learning_rate": 2.3954792742555964e-05, + "loss": 0.2372, + "num_input_tokens_seen": 75367128, + "step": 11275 + }, + { + "epoch": 1.3819791124314722, + "grad_norm": 0.9192799991547025, + "learning_rate": 2.391157223126409e-05, + "loss": 0.1993, + "num_input_tokens_seen": 75400848, + "step": 11280 + }, + { + "epoch": 1.3825916510979757, + "grad_norm": 1.2457276675807318, + "learning_rate": 2.386837848896619e-05, + "loss": 0.2131, + "num_input_tokens_seen": 75433880, + "step": 11285 + }, + { + "epoch": 1.383204189764479, + "grad_norm": 1.3290748045560505, + "learning_rate": 2.382521155998283e-05, + "loss": 0.1944, + "num_input_tokens_seen": 75467816, + "step": 11290 + }, + { + "epoch": 1.3838167284309821, + "grad_norm": 1.4277851260847876, + "learning_rate": 2.378207148860699e-05, + "loss": 0.2091, + "num_input_tokens_seen": 75501792, + "step": 11295 + }, + { + "epoch": 1.3844292670974856, + "grad_norm": 1.0730262567819877, + "learning_rate": 2.3738958319104145e-05, + "loss": 0.1914, + "num_input_tokens_seen": 75535760, + "step": 11300 + }, + { + "epoch": 1.3844292670974856, + "eval_loss": 0.12553976476192474, + "eval_runtime": 19.7973, + "eval_samples_per_second": 3.031, + "eval_steps_per_second": 0.758, + "num_input_tokens_seen": 75535760, + "step": 11300 + }, + { + "epoch": 1.3850418057639888, + "grad_norm": 1.6232949150301446, + "learning_rate": 2.369587209571217e-05, + "loss": 0.236, + "num_input_tokens_seen": 75568840, + "step": 11305 + }, + { + "epoch": 1.3856543444304923, + "grad_norm": 1.5155958228674475, + "learning_rate": 2.3652812862641265e-05, + "loss": 0.261, + "num_input_tokens_seen": 75601944, + "step": 11310 + }, + { + "epoch": 1.3862668830969955, + "grad_norm": 0.870646720736669, + "learning_rate": 2.3609780664073923e-05, + "loss": 0.2013, + "num_input_tokens_seen": 75635768, + "step": 11315 + }, + { + "epoch": 1.3868794217634988, + "grad_norm": 1.1925450250695349, + "learning_rate": 2.3566775544164937e-05, + "loss": 0.218, + "num_input_tokens_seen": 75669384, + "step": 11320 + }, + { + "epoch": 1.3874919604300022, + "grad_norm": 1.8228325292060459, + "learning_rate": 2.3523797547041316e-05, + "loss": 0.1893, + "num_input_tokens_seen": 75703016, + "step": 11325 + }, + { + "epoch": 1.3881044990965055, + "grad_norm": 1.4306894391386675, + "learning_rate": 2.3480846716802184e-05, + "loss": 0.2366, + "num_input_tokens_seen": 75736336, + "step": 11330 + }, + { + "epoch": 1.388717037763009, + "grad_norm": 1.19916009774191, + "learning_rate": 2.3437923097518843e-05, + "loss": 0.1948, + "num_input_tokens_seen": 75770120, + "step": 11335 + }, + { + "epoch": 1.3893295764295122, + "grad_norm": 1.2947975130523453, + "learning_rate": 2.3395026733234653e-05, + "loss": 0.2454, + "num_input_tokens_seen": 75803520, + "step": 11340 + }, + { + "epoch": 1.3899421150960154, + "grad_norm": 1.4486076958155687, + "learning_rate": 2.3352157667965034e-05, + "loss": 0.1971, + "num_input_tokens_seen": 75837704, + "step": 11345 + }, + { + "epoch": 1.3905546537625186, + "grad_norm": 1.490900454933061, + "learning_rate": 2.3309315945697335e-05, + "loss": 0.2302, + "num_input_tokens_seen": 75870800, + "step": 11350 + }, + { + "epoch": 1.3905546537625186, + "eval_loss": 0.1493845283985138, + "eval_runtime": 19.6828, + "eval_samples_per_second": 3.048, + "eval_steps_per_second": 0.762, + "num_input_tokens_seen": 75870800, + "step": 11350 + }, + { + "epoch": 1.391167192429022, + "grad_norm": 1.0752398721462886, + "learning_rate": 2.3266501610390906e-05, + "loss": 0.1958, + "num_input_tokens_seen": 75905072, + "step": 11355 + }, + { + "epoch": 1.3917797310955253, + "grad_norm": 1.4944338956696954, + "learning_rate": 2.322371470597699e-05, + "loss": 0.2484, + "num_input_tokens_seen": 75938048, + "step": 11360 + }, + { + "epoch": 1.3923922697620288, + "grad_norm": 1.4611829079192886, + "learning_rate": 2.3180955276358628e-05, + "loss": 0.2457, + "num_input_tokens_seen": 75971152, + "step": 11365 + }, + { + "epoch": 1.393004808428532, + "grad_norm": 1.3441537014783147, + "learning_rate": 2.3138223365410743e-05, + "loss": 0.219, + "num_input_tokens_seen": 76004648, + "step": 11370 + }, + { + "epoch": 1.3936173470950353, + "grad_norm": 1.001647162051465, + "learning_rate": 2.3095519016979976e-05, + "loss": 0.2099, + "num_input_tokens_seen": 76037936, + "step": 11375 + }, + { + "epoch": 1.3942298857615387, + "grad_norm": 1.218728080470328, + "learning_rate": 2.3052842274884728e-05, + "loss": 0.202, + "num_input_tokens_seen": 76071568, + "step": 11380 + }, + { + "epoch": 1.394842424428042, + "grad_norm": 1.482661327395792, + "learning_rate": 2.301019318291502e-05, + "loss": 0.2149, + "num_input_tokens_seen": 76105328, + "step": 11385 + }, + { + "epoch": 1.3954549630945454, + "grad_norm": 1.3147384066460048, + "learning_rate": 2.296757178483251e-05, + "loss": 0.2472, + "num_input_tokens_seen": 76138608, + "step": 11390 + }, + { + "epoch": 1.3960675017610487, + "grad_norm": 1.1511171118587837, + "learning_rate": 2.2924978124370472e-05, + "loss": 0.1948, + "num_input_tokens_seen": 76171936, + "step": 11395 + }, + { + "epoch": 1.396680040427552, + "grad_norm": 1.4155361943897409, + "learning_rate": 2.2882412245233727e-05, + "loss": 0.2228, + "num_input_tokens_seen": 76204736, + "step": 11400 + }, + { + "epoch": 1.396680040427552, + "eval_loss": 0.13607601821422577, + "eval_runtime": 19.5448, + "eval_samples_per_second": 3.07, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 76204736, + "step": 11400 + }, + { + "epoch": 1.3972925790940554, + "grad_norm": 1.5623136082019815, + "learning_rate": 2.2839874191098527e-05, + "loss": 0.246, + "num_input_tokens_seen": 76238512, + "step": 11405 + }, + { + "epoch": 1.3979051177605586, + "grad_norm": 1.0900294085981546, + "learning_rate": 2.2797364005612624e-05, + "loss": 0.207, + "num_input_tokens_seen": 76271944, + "step": 11410 + }, + { + "epoch": 1.398517656427062, + "grad_norm": 1.4912846379601163, + "learning_rate": 2.275488173239519e-05, + "loss": 0.2436, + "num_input_tokens_seen": 76305032, + "step": 11415 + }, + { + "epoch": 1.3991301950935653, + "grad_norm": 2.109179171015616, + "learning_rate": 2.271242741503669e-05, + "loss": 0.2002, + "num_input_tokens_seen": 76338824, + "step": 11420 + }, + { + "epoch": 1.3997427337600685, + "grad_norm": 1.2906027232155004, + "learning_rate": 2.267000109709896e-05, + "loss": 0.2215, + "num_input_tokens_seen": 76372360, + "step": 11425 + }, + { + "epoch": 1.400355272426572, + "grad_norm": 1.3383037733897587, + "learning_rate": 2.2627602822115083e-05, + "loss": 0.2445, + "num_input_tokens_seen": 76405752, + "step": 11430 + }, + { + "epoch": 1.4009678110930752, + "grad_norm": 1.678282337381254, + "learning_rate": 2.2585232633589402e-05, + "loss": 0.2329, + "num_input_tokens_seen": 76439832, + "step": 11435 + }, + { + "epoch": 1.4015803497595787, + "grad_norm": 32.488609000866205, + "learning_rate": 2.2542890574997372e-05, + "loss": 0.1915, + "num_input_tokens_seen": 76473504, + "step": 11440 + }, + { + "epoch": 1.402192888426082, + "grad_norm": 1.6132573142338857, + "learning_rate": 2.2500576689785645e-05, + "loss": 0.1851, + "num_input_tokens_seen": 76507384, + "step": 11445 + }, + { + "epoch": 1.4028054270925852, + "grad_norm": 1.353999319125173, + "learning_rate": 2.245829102137196e-05, + "loss": 0.2203, + "num_input_tokens_seen": 76541376, + "step": 11450 + }, + { + "epoch": 1.4028054270925852, + "eval_loss": 0.13985607028007507, + "eval_runtime": 19.3404, + "eval_samples_per_second": 3.102, + "eval_steps_per_second": 0.776, + "num_input_tokens_seen": 76541376, + "step": 11450 + }, + { + "epoch": 1.4034179657590886, + "grad_norm": 0.939110259463709, + "learning_rate": 2.241603361314505e-05, + "loss": 0.2098, + "num_input_tokens_seen": 76575016, + "step": 11455 + }, + { + "epoch": 1.4040305044255919, + "grad_norm": 0.9265842466420994, + "learning_rate": 2.2373804508464707e-05, + "loss": 0.1934, + "num_input_tokens_seen": 76608464, + "step": 11460 + }, + { + "epoch": 1.4046430430920953, + "grad_norm": 1.5517003693518743, + "learning_rate": 2.233160375066168e-05, + "loss": 0.2108, + "num_input_tokens_seen": 76641856, + "step": 11465 + }, + { + "epoch": 1.4052555817585985, + "grad_norm": 1.4439917361643635, + "learning_rate": 2.2289431383037594e-05, + "loss": 0.218, + "num_input_tokens_seen": 76675760, + "step": 11470 + }, + { + "epoch": 1.4058681204251018, + "grad_norm": 1.507292547909062, + "learning_rate": 2.2247287448864952e-05, + "loss": 0.2457, + "num_input_tokens_seen": 76709472, + "step": 11475 + }, + { + "epoch": 1.406480659091605, + "grad_norm": 1.3901602936201654, + "learning_rate": 2.2205171991387104e-05, + "loss": 0.198, + "num_input_tokens_seen": 76743120, + "step": 11480 + }, + { + "epoch": 1.4070931977581085, + "grad_norm": 1.1735039185052853, + "learning_rate": 2.2163085053818173e-05, + "loss": 0.2278, + "num_input_tokens_seen": 76776432, + "step": 11485 + }, + { + "epoch": 1.407705736424612, + "grad_norm": 1.4687257536230054, + "learning_rate": 2.2121026679343028e-05, + "loss": 0.2212, + "num_input_tokens_seen": 76809752, + "step": 11490 + }, + { + "epoch": 1.4083182750911152, + "grad_norm": 1.2054513198472594, + "learning_rate": 2.2078996911117182e-05, + "loss": 0.2273, + "num_input_tokens_seen": 76843064, + "step": 11495 + }, + { + "epoch": 1.4089308137576184, + "grad_norm": 1.2331838011316778, + "learning_rate": 2.2036995792266852e-05, + "loss": 0.2274, + "num_input_tokens_seen": 76876640, + "step": 11500 + }, + { + "epoch": 1.4089308137576184, + "eval_loss": 0.14584578573703766, + "eval_runtime": 19.654, + "eval_samples_per_second": 3.053, + "eval_steps_per_second": 0.763, + "num_input_tokens_seen": 76876640, + "step": 11500 + }, + { + "epoch": 1.4095433524241217, + "grad_norm": 1.9377460735178373, + "learning_rate": 2.199502336588885e-05, + "loss": 0.2269, + "num_input_tokens_seen": 76910112, + "step": 11505 + }, + { + "epoch": 1.4101558910906251, + "grad_norm": 1.2074376841787913, + "learning_rate": 2.1953079675050498e-05, + "loss": 0.2547, + "num_input_tokens_seen": 76943248, + "step": 11510 + }, + { + "epoch": 1.4107684297571284, + "grad_norm": 1.6673139338465959, + "learning_rate": 2.1911164762789682e-05, + "loss": 0.2192, + "num_input_tokens_seen": 76976416, + "step": 11515 + }, + { + "epoch": 1.4113809684236318, + "grad_norm": 1.0347317776135743, + "learning_rate": 2.1869278672114762e-05, + "loss": 0.1972, + "num_input_tokens_seen": 77010144, + "step": 11520 + }, + { + "epoch": 1.411993507090135, + "grad_norm": 1.0706084397278428, + "learning_rate": 2.182742144600448e-05, + "loss": 0.1968, + "num_input_tokens_seen": 77044016, + "step": 11525 + }, + { + "epoch": 1.4126060457566383, + "grad_norm": 1.3935001170007235, + "learning_rate": 2.1785593127407993e-05, + "loss": 0.1695, + "num_input_tokens_seen": 77077688, + "step": 11530 + }, + { + "epoch": 1.4132185844231417, + "grad_norm": 1.0019449876645863, + "learning_rate": 2.1743793759244807e-05, + "loss": 0.2303, + "num_input_tokens_seen": 77111176, + "step": 11535 + }, + { + "epoch": 1.413831123089645, + "grad_norm": 1.522429135029562, + "learning_rate": 2.1702023384404706e-05, + "loss": 0.2219, + "num_input_tokens_seen": 77144408, + "step": 11540 + }, + { + "epoch": 1.4144436617561484, + "grad_norm": 1.1253675763175544, + "learning_rate": 2.1660282045747698e-05, + "loss": 0.2047, + "num_input_tokens_seen": 77177960, + "step": 11545 + }, + { + "epoch": 1.4150562004226517, + "grad_norm": 1.163967526127202, + "learning_rate": 2.1618569786104063e-05, + "loss": 0.2299, + "num_input_tokens_seen": 77211216, + "step": 11550 + }, + { + "epoch": 1.4150562004226517, + "eval_loss": 0.18886250257492065, + "eval_runtime": 19.3477, + "eval_samples_per_second": 3.101, + "eval_steps_per_second": 0.775, + "num_input_tokens_seen": 77211216, + "step": 11550 + }, + { + "epoch": 1.415668739089155, + "grad_norm": 1.1727566936780487, + "learning_rate": 2.1576886648274157e-05, + "loss": 0.2107, + "num_input_tokens_seen": 77245192, + "step": 11555 + }, + { + "epoch": 1.4162812777556584, + "grad_norm": 1.3281318166229035, + "learning_rate": 2.1535232675028556e-05, + "loss": 0.2082, + "num_input_tokens_seen": 77278776, + "step": 11560 + }, + { + "epoch": 1.4168938164221616, + "grad_norm": 1.6111420711219464, + "learning_rate": 2.14936079091078e-05, + "loss": 0.1563, + "num_input_tokens_seen": 77313896, + "step": 11565 + }, + { + "epoch": 1.417506355088665, + "grad_norm": 1.2839467464259497, + "learning_rate": 2.1452012393222544e-05, + "loss": 0.2277, + "num_input_tokens_seen": 77348024, + "step": 11570 + }, + { + "epoch": 1.4181188937551683, + "grad_norm": 2.335808482389904, + "learning_rate": 2.1410446170053418e-05, + "loss": 0.2206, + "num_input_tokens_seen": 77381360, + "step": 11575 + }, + { + "epoch": 1.4187314324216715, + "grad_norm": 1.1226802018221158, + "learning_rate": 2.1368909282250944e-05, + "loss": 0.2265, + "num_input_tokens_seen": 77414776, + "step": 11580 + }, + { + "epoch": 1.419343971088175, + "grad_norm": 27.95652082068156, + "learning_rate": 2.13274017724356e-05, + "loss": 0.2596, + "num_input_tokens_seen": 77448456, + "step": 11585 + }, + { + "epoch": 1.4199565097546782, + "grad_norm": 1.6689293397734806, + "learning_rate": 2.128592368319769e-05, + "loss": 0.1953, + "num_input_tokens_seen": 77482664, + "step": 11590 + }, + { + "epoch": 1.4205690484211817, + "grad_norm": 24.349435953184393, + "learning_rate": 2.1244475057097364e-05, + "loss": 0.2097, + "num_input_tokens_seen": 77516336, + "step": 11595 + }, + { + "epoch": 1.421181587087685, + "grad_norm": 1.3401957530661741, + "learning_rate": 2.1203055936664473e-05, + "loss": 0.2541, + "num_input_tokens_seen": 77549560, + "step": 11600 + }, + { + "epoch": 1.421181587087685, + "eval_loss": 0.15744981169700623, + "eval_runtime": 19.4999, + "eval_samples_per_second": 3.077, + "eval_steps_per_second": 0.769, + "num_input_tokens_seen": 77549560, + "step": 11600 + }, + { + "epoch": 1.4217941257541882, + "grad_norm": 1.482456260147129, + "learning_rate": 2.116166636439865e-05, + "loss": 0.2383, + "num_input_tokens_seen": 77582480, + "step": 11605 + }, + { + "epoch": 1.4224066644206916, + "grad_norm": 0.7904057956562802, + "learning_rate": 2.1120306382769223e-05, + "loss": 0.2478, + "num_input_tokens_seen": 77616096, + "step": 11610 + }, + { + "epoch": 1.4230192030871949, + "grad_norm": 1.2212879423399834, + "learning_rate": 2.1078976034215076e-05, + "loss": 0.2231, + "num_input_tokens_seen": 77649232, + "step": 11615 + }, + { + "epoch": 1.4236317417536983, + "grad_norm": 1.17414440380428, + "learning_rate": 2.1037675361144764e-05, + "loss": 0.2292, + "num_input_tokens_seen": 77683000, + "step": 11620 + }, + { + "epoch": 1.4242442804202016, + "grad_norm": 1.633390357694746, + "learning_rate": 2.0996404405936388e-05, + "loss": 0.2083, + "num_input_tokens_seen": 77716136, + "step": 11625 + }, + { + "epoch": 1.4248568190867048, + "grad_norm": 0.9236835374178289, + "learning_rate": 2.0955163210937507e-05, + "loss": 0.204, + "num_input_tokens_seen": 77750064, + "step": 11630 + }, + { + "epoch": 1.425469357753208, + "grad_norm": 1.358055893320477, + "learning_rate": 2.091395181846516e-05, + "loss": 0.2225, + "num_input_tokens_seen": 77783792, + "step": 11635 + }, + { + "epoch": 1.4260818964197115, + "grad_norm": 1.2925316714831874, + "learning_rate": 2.0872770270805834e-05, + "loss": 0.207, + "num_input_tokens_seen": 77817472, + "step": 11640 + }, + { + "epoch": 1.4266944350862147, + "grad_norm": 1.0474518928956342, + "learning_rate": 2.0831618610215374e-05, + "loss": 0.2444, + "num_input_tokens_seen": 77850496, + "step": 11645 + }, + { + "epoch": 1.4273069737527182, + "grad_norm": 1.4801887733220611, + "learning_rate": 2.079049687891898e-05, + "loss": 0.23, + "num_input_tokens_seen": 77884056, + "step": 11650 + }, + { + "epoch": 1.4273069737527182, + "eval_loss": 0.17718727886676788, + "eval_runtime": 19.8826, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.754, + "num_input_tokens_seen": 77884056, + "step": 11650 + }, + { + "epoch": 1.4279195124192214, + "grad_norm": 1.385767865109891, + "learning_rate": 2.0749405119111086e-05, + "loss": 0.2166, + "num_input_tokens_seen": 77917736, + "step": 11655 + }, + { + "epoch": 1.4285320510857247, + "grad_norm": 1.181715061093167, + "learning_rate": 2.0708343372955417e-05, + "loss": 0.29, + "num_input_tokens_seen": 77951440, + "step": 11660 + }, + { + "epoch": 1.4291445897522281, + "grad_norm": 1.2509637127020838, + "learning_rate": 2.0667311682584927e-05, + "loss": 0.2354, + "num_input_tokens_seen": 77984592, + "step": 11665 + }, + { + "epoch": 1.4297571284187314, + "grad_norm": 1.1952630070074588, + "learning_rate": 2.0626310090101646e-05, + "loss": 0.199, + "num_input_tokens_seen": 78018336, + "step": 11670 + }, + { + "epoch": 1.4303696670852348, + "grad_norm": 1.289421137659356, + "learning_rate": 2.0585338637576802e-05, + "loss": 0.2172, + "num_input_tokens_seen": 78051872, + "step": 11675 + }, + { + "epoch": 1.430982205751738, + "grad_norm": 1.0855263584051233, + "learning_rate": 2.0544397367050673e-05, + "loss": 0.2054, + "num_input_tokens_seen": 78085520, + "step": 11680 + }, + { + "epoch": 1.4315947444182413, + "grad_norm": 1.2468690985988058, + "learning_rate": 2.0503486320532523e-05, + "loss": 0.2326, + "num_input_tokens_seen": 78118664, + "step": 11685 + }, + { + "epoch": 1.4322072830847448, + "grad_norm": 1.5679238415062051, + "learning_rate": 2.0462605540000668e-05, + "loss": 0.2424, + "num_input_tokens_seen": 78151408, + "step": 11690 + }, + { + "epoch": 1.432819821751248, + "grad_norm": 1.1426900959643231, + "learning_rate": 2.042175506740233e-05, + "loss": 0.2131, + "num_input_tokens_seen": 78184680, + "step": 11695 + }, + { + "epoch": 1.4334323604177515, + "grad_norm": 1.5320385814658846, + "learning_rate": 2.038093494465368e-05, + "loss": 0.2303, + "num_input_tokens_seen": 78217776, + "step": 11700 + }, + { + "epoch": 1.4334323604177515, + "eval_loss": 0.1379169523715973, + "eval_runtime": 19.8386, + "eval_samples_per_second": 3.024, + "eval_steps_per_second": 0.756, + "num_input_tokens_seen": 78217776, + "step": 11700 + }, + { + "epoch": 1.4340448990842547, + "grad_norm": 1.4326590047864796, + "learning_rate": 2.0340145213639655e-05, + "loss": 0.2127, + "num_input_tokens_seen": 78251376, + "step": 11705 + }, + { + "epoch": 1.434657437750758, + "grad_norm": 1.551965243090594, + "learning_rate": 2.0299385916214116e-05, + "loss": 0.2821, + "num_input_tokens_seen": 78284456, + "step": 11710 + }, + { + "epoch": 1.4352699764172614, + "grad_norm": 1.4266904783817593, + "learning_rate": 2.0258657094199597e-05, + "loss": 0.2111, + "num_input_tokens_seen": 78318256, + "step": 11715 + }, + { + "epoch": 1.4358825150837646, + "grad_norm": 1.4622832329237696, + "learning_rate": 2.0217958789387446e-05, + "loss": 0.1973, + "num_input_tokens_seen": 78351968, + "step": 11720 + }, + { + "epoch": 1.436495053750268, + "grad_norm": 1.2767914245182206, + "learning_rate": 2.017729104353763e-05, + "loss": 0.2047, + "num_input_tokens_seen": 78385944, + "step": 11725 + }, + { + "epoch": 1.4371075924167713, + "grad_norm": 1.0109017464140821, + "learning_rate": 2.0136653898378805e-05, + "loss": 0.2477, + "num_input_tokens_seen": 78418960, + "step": 11730 + }, + { + "epoch": 1.4377201310832746, + "grad_norm": 1.4595388451625722, + "learning_rate": 2.009604739560823e-05, + "loss": 0.215, + "num_input_tokens_seen": 78452624, + "step": 11735 + }, + { + "epoch": 1.438332669749778, + "grad_norm": 1.6265969189605476, + "learning_rate": 2.0055471576891672e-05, + "loss": 0.2267, + "num_input_tokens_seen": 78485592, + "step": 11740 + }, + { + "epoch": 1.4389452084162813, + "grad_norm": 1.6223135458613367, + "learning_rate": 2.0014926483863466e-05, + "loss": 0.2245, + "num_input_tokens_seen": 78519176, + "step": 11745 + }, + { + "epoch": 1.4395577470827847, + "grad_norm": 1.4066738070341533, + "learning_rate": 1.99744121581264e-05, + "loss": 0.2207, + "num_input_tokens_seen": 78552600, + "step": 11750 + }, + { + "epoch": 1.4395577470827847, + "eval_loss": 0.07964655011892319, + "eval_runtime": 19.3924, + "eval_samples_per_second": 3.094, + "eval_steps_per_second": 0.773, + "num_input_tokens_seen": 78552600, + "step": 11750 + }, + { + "epoch": 1.440170285749288, + "grad_norm": 1.3268858294030856, + "learning_rate": 1.9933928641251702e-05, + "loss": 0.2158, + "num_input_tokens_seen": 78586312, + "step": 11755 + }, + { + "epoch": 1.4407828244157912, + "grad_norm": 1.2359824350049413, + "learning_rate": 1.9893475974778957e-05, + "loss": 0.1939, + "num_input_tokens_seen": 78620064, + "step": 11760 + }, + { + "epoch": 1.4413953630822944, + "grad_norm": 1.1683024300087932, + "learning_rate": 1.9853054200216124e-05, + "loss": 0.212, + "num_input_tokens_seen": 78653936, + "step": 11765 + }, + { + "epoch": 1.442007901748798, + "grad_norm": 1.1805090414134891, + "learning_rate": 1.9812663359039485e-05, + "loss": 0.2009, + "num_input_tokens_seen": 78687776, + "step": 11770 + }, + { + "epoch": 1.4426204404153014, + "grad_norm": 1.7542936987483482, + "learning_rate": 1.97723034926935e-05, + "loss": 0.2234, + "num_input_tokens_seen": 78721528, + "step": 11775 + }, + { + "epoch": 1.4432329790818046, + "grad_norm": 1.1636937431093823, + "learning_rate": 1.9731974642590933e-05, + "loss": 0.2094, + "num_input_tokens_seen": 78755520, + "step": 11780 + }, + { + "epoch": 1.4438455177483078, + "grad_norm": 1.4721779800528187, + "learning_rate": 1.96916768501127e-05, + "loss": 0.2597, + "num_input_tokens_seen": 78788352, + "step": 11785 + }, + { + "epoch": 1.444458056414811, + "grad_norm": 1.3899142619840883, + "learning_rate": 1.9651410156607803e-05, + "loss": 0.1969, + "num_input_tokens_seen": 78821984, + "step": 11790 + }, + { + "epoch": 1.4450705950813145, + "grad_norm": 0.9965825392569478, + "learning_rate": 1.9611174603393402e-05, + "loss": 0.2171, + "num_input_tokens_seen": 78855176, + "step": 11795 + }, + { + "epoch": 1.4456831337478178, + "grad_norm": 1.3906640428951949, + "learning_rate": 1.9570970231754636e-05, + "loss": 0.2276, + "num_input_tokens_seen": 78888720, + "step": 11800 + }, + { + "epoch": 1.4456831337478178, + "eval_loss": 0.07458024471998215, + "eval_runtime": 19.6385, + "eval_samples_per_second": 3.055, + "eval_steps_per_second": 0.764, + "num_input_tokens_seen": 78888720, + "step": 11800 + }, + { + "epoch": 1.4462956724143212, + "grad_norm": 1.0041596901827774, + "learning_rate": 1.9530797082944713e-05, + "loss": 0.2215, + "num_input_tokens_seen": 78921912, + "step": 11805 + }, + { + "epoch": 1.4469082110808245, + "grad_norm": 1.8621846507613122, + "learning_rate": 1.9490655198184778e-05, + "loss": 0.2606, + "num_input_tokens_seen": 78954768, + "step": 11810 + }, + { + "epoch": 1.4475207497473277, + "grad_norm": 1.2377926828054628, + "learning_rate": 1.945054461866386e-05, + "loss": 0.2183, + "num_input_tokens_seen": 78988256, + "step": 11815 + }, + { + "epoch": 1.4481332884138312, + "grad_norm": 1.268900986115073, + "learning_rate": 1.941046538553892e-05, + "loss": 0.1923, + "num_input_tokens_seen": 79022296, + "step": 11820 + }, + { + "epoch": 1.4487458270803344, + "grad_norm": 1.7930800956287853, + "learning_rate": 1.937041753993475e-05, + "loss": 0.2035, + "num_input_tokens_seen": 79055696, + "step": 11825 + }, + { + "epoch": 1.4493583657468379, + "grad_norm": 1.7631735182594084, + "learning_rate": 1.933040112294389e-05, + "loss": 0.2214, + "num_input_tokens_seen": 79088904, + "step": 11830 + }, + { + "epoch": 1.449970904413341, + "grad_norm": 1.083536616074597, + "learning_rate": 1.9290416175626676e-05, + "loss": 0.2164, + "num_input_tokens_seen": 79122224, + "step": 11835 + }, + { + "epoch": 1.4505834430798443, + "grad_norm": 1.2826614418199644, + "learning_rate": 1.9250462739011148e-05, + "loss": 0.2127, + "num_input_tokens_seen": 79155792, + "step": 11840 + }, + { + "epoch": 1.4511959817463478, + "grad_norm": 1.336873875559789, + "learning_rate": 1.9210540854092978e-05, + "loss": 0.2558, + "num_input_tokens_seen": 79189896, + "step": 11845 + }, + { + "epoch": 1.451808520412851, + "grad_norm": 1.6801430367114356, + "learning_rate": 1.9170650561835506e-05, + "loss": 0.2157, + "num_input_tokens_seen": 79223936, + "step": 11850 + }, + { + "epoch": 1.451808520412851, + "eval_loss": 0.09504926949739456, + "eval_runtime": 19.797, + "eval_samples_per_second": 3.031, + "eval_steps_per_second": 0.758, + "num_input_tokens_seen": 79223936, + "step": 11850 + }, + { + "epoch": 1.4524210590793545, + "grad_norm": 1.4497023084201615, + "learning_rate": 1.9130791903169638e-05, + "loss": 0.2199, + "num_input_tokens_seen": 79257208, + "step": 11855 + }, + { + "epoch": 1.4530335977458577, + "grad_norm": 1.2276758432316133, + "learning_rate": 1.9090964918993836e-05, + "loss": 0.1793, + "num_input_tokens_seen": 79291584, + "step": 11860 + }, + { + "epoch": 1.453646136412361, + "grad_norm": 1.8110036927968387, + "learning_rate": 1.9051169650174017e-05, + "loss": 0.2024, + "num_input_tokens_seen": 79325208, + "step": 11865 + }, + { + "epoch": 1.4542586750788644, + "grad_norm": 1.3831000428719964, + "learning_rate": 1.9011406137543597e-05, + "loss": 0.235, + "num_input_tokens_seen": 79358368, + "step": 11870 + }, + { + "epoch": 1.4548712137453677, + "grad_norm": 1.6187903987724, + "learning_rate": 1.8971674421903413e-05, + "loss": 0.2488, + "num_input_tokens_seen": 79391560, + "step": 11875 + }, + { + "epoch": 1.4554837524118711, + "grad_norm": 1.6814096836601526, + "learning_rate": 1.8931974544021645e-05, + "loss": 0.2149, + "num_input_tokens_seen": 79425088, + "step": 11880 + }, + { + "epoch": 1.4560962910783743, + "grad_norm": 1.1745434421928362, + "learning_rate": 1.8892306544633798e-05, + "loss": 0.184, + "num_input_tokens_seen": 79459464, + "step": 11885 + }, + { + "epoch": 1.4567088297448776, + "grad_norm": 1.421592182572558, + "learning_rate": 1.88526704644427e-05, + "loss": 0.2283, + "num_input_tokens_seen": 79493000, + "step": 11890 + }, + { + "epoch": 1.457321368411381, + "grad_norm": 1.170123075316916, + "learning_rate": 1.881306634411844e-05, + "loss": 0.2184, + "num_input_tokens_seen": 79526712, + "step": 11895 + }, + { + "epoch": 1.4579339070778843, + "grad_norm": 1.0336918571635103, + "learning_rate": 1.8773494224298244e-05, + "loss": 0.2073, + "num_input_tokens_seen": 79560856, + "step": 11900 + }, + { + "epoch": 1.4579339070778843, + "eval_loss": 0.09442640841007233, + "eval_runtime": 19.2915, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "num_input_tokens_seen": 79560856, + "step": 11900 + }, + { + "epoch": 1.4585464457443877, + "grad_norm": 1.3679759784014216, + "learning_rate": 1.8733954145586564e-05, + "loss": 0.1984, + "num_input_tokens_seen": 79594856, + "step": 11905 + }, + { + "epoch": 1.459158984410891, + "grad_norm": 1.5475246361886241, + "learning_rate": 1.869444614855496e-05, + "loss": 0.2249, + "num_input_tokens_seen": 79628416, + "step": 11910 + }, + { + "epoch": 1.4597715230773942, + "grad_norm": 1.2083117859520573, + "learning_rate": 1.865497027374209e-05, + "loss": 0.2391, + "num_input_tokens_seen": 79661656, + "step": 11915 + }, + { + "epoch": 1.4603840617438975, + "grad_norm": 0.8092871912401626, + "learning_rate": 1.861552656165359e-05, + "loss": 0.1946, + "num_input_tokens_seen": 79695424, + "step": 11920 + }, + { + "epoch": 1.460996600410401, + "grad_norm": 1.8712024643764127, + "learning_rate": 1.857611505276216e-05, + "loss": 0.2245, + "num_input_tokens_seen": 79728696, + "step": 11925 + }, + { + "epoch": 1.4616091390769042, + "grad_norm": 1.708438260077005, + "learning_rate": 1.8536735787507447e-05, + "loss": 0.2071, + "num_input_tokens_seen": 79762584, + "step": 11930 + }, + { + "epoch": 1.4622216777434076, + "grad_norm": 1.3106131028698333, + "learning_rate": 1.849738880629597e-05, + "loss": 0.1768, + "num_input_tokens_seen": 79796928, + "step": 11935 + }, + { + "epoch": 1.4628342164099108, + "grad_norm": 1.4141721431066059, + "learning_rate": 1.8458074149501153e-05, + "loss": 0.223, + "num_input_tokens_seen": 79830296, + "step": 11940 + }, + { + "epoch": 1.463446755076414, + "grad_norm": 1.2084116693938611, + "learning_rate": 1.8418791857463287e-05, + "loss": 0.1694, + "num_input_tokens_seen": 79864528, + "step": 11945 + }, + { + "epoch": 1.4640592937429175, + "grad_norm": 1.2782206880022664, + "learning_rate": 1.8379541970489366e-05, + "loss": 0.1818, + "num_input_tokens_seen": 79898064, + "step": 11950 + }, + { + "epoch": 1.4640592937429175, + "eval_loss": 0.1088213175535202, + "eval_runtime": 19.4183, + "eval_samples_per_second": 3.09, + "eval_steps_per_second": 0.772, + "num_input_tokens_seen": 79898064, + "step": 11950 + }, + { + "epoch": 1.4646718324094208, + "grad_norm": 1.427037338582067, + "learning_rate": 1.8340324528853237e-05, + "loss": 0.2375, + "num_input_tokens_seen": 79931680, + "step": 11955 + }, + { + "epoch": 1.4652843710759242, + "grad_norm": 1.2338777567554464, + "learning_rate": 1.830113957279535e-05, + "loss": 0.2129, + "num_input_tokens_seen": 79966176, + "step": 11960 + }, + { + "epoch": 1.4658969097424275, + "grad_norm": 1.4924198149687233, + "learning_rate": 1.8261987142522906e-05, + "loss": 0.2137, + "num_input_tokens_seen": 80000440, + "step": 11965 + }, + { + "epoch": 1.4665094484089307, + "grad_norm": 1.113112713151578, + "learning_rate": 1.8222867278209714e-05, + "loss": 0.1713, + "num_input_tokens_seen": 80034544, + "step": 11970 + }, + { + "epoch": 1.4671219870754342, + "grad_norm": 1.2727009402893619, + "learning_rate": 1.8183780019996123e-05, + "loss": 0.1994, + "num_input_tokens_seen": 80068616, + "step": 11975 + }, + { + "epoch": 1.4677345257419374, + "grad_norm": 1.3941438496106238, + "learning_rate": 1.814472540798906e-05, + "loss": 0.2176, + "num_input_tokens_seen": 80102032, + "step": 11980 + }, + { + "epoch": 1.4683470644084409, + "grad_norm": 1.682655836854595, + "learning_rate": 1.8105703482261983e-05, + "loss": 0.2363, + "num_input_tokens_seen": 80136208, + "step": 11985 + }, + { + "epoch": 1.468959603074944, + "grad_norm": 1.5733409771340396, + "learning_rate": 1.806671428285474e-05, + "loss": 0.2082, + "num_input_tokens_seen": 80170208, + "step": 11990 + }, + { + "epoch": 1.4695721417414473, + "grad_norm": 0.9718487815537233, + "learning_rate": 1.8027757849773657e-05, + "loss": 0.1776, + "num_input_tokens_seen": 80205136, + "step": 11995 + }, + { + "epoch": 1.4701846804079508, + "grad_norm": 41.69567000042487, + "learning_rate": 1.798883422299143e-05, + "loss": 0.2189, + "num_input_tokens_seen": 80238176, + "step": 12000 + }, + { + "epoch": 1.4701846804079508, + "eval_loss": 0.07542052865028381, + "eval_runtime": 19.2561, + "eval_samples_per_second": 3.116, + "eval_steps_per_second": 0.779, + "num_input_tokens_seen": 80238176, + "step": 12000 + }, + { + "epoch": 1.470797219074454, + "grad_norm": 1.3650626475832663, + "learning_rate": 1.7949943442447054e-05, + "loss": 0.2321, + "num_input_tokens_seen": 80271304, + "step": 12005 + }, + { + "epoch": 1.4714097577409575, + "grad_norm": 1.4498555054760127, + "learning_rate": 1.7911085548045863e-05, + "loss": 0.2077, + "num_input_tokens_seen": 80305440, + "step": 12010 + }, + { + "epoch": 1.4720222964074607, + "grad_norm": 1.9320148006473867, + "learning_rate": 1.7872260579659437e-05, + "loss": 0.2186, + "num_input_tokens_seen": 80339392, + "step": 12015 + }, + { + "epoch": 1.472634835073964, + "grad_norm": 1.7689702342289593, + "learning_rate": 1.7833468577125584e-05, + "loss": 0.2285, + "num_input_tokens_seen": 80372552, + "step": 12020 + }, + { + "epoch": 1.4732473737404674, + "grad_norm": 1.2111102544083923, + "learning_rate": 1.7794709580248237e-05, + "loss": 0.2289, + "num_input_tokens_seen": 80406488, + "step": 12025 + }, + { + "epoch": 1.4738599124069707, + "grad_norm": 0.9294510047497031, + "learning_rate": 1.7755983628797508e-05, + "loss": 0.2, + "num_input_tokens_seen": 80441080, + "step": 12030 + }, + { + "epoch": 1.4744724510734741, + "grad_norm": 1.171807497448012, + "learning_rate": 1.7717290762509603e-05, + "loss": 0.1899, + "num_input_tokens_seen": 80474952, + "step": 12035 + }, + { + "epoch": 1.4750849897399774, + "grad_norm": 1.4068816507329411, + "learning_rate": 1.7678631021086755e-05, + "loss": 0.2064, + "num_input_tokens_seen": 80508448, + "step": 12040 + }, + { + "epoch": 1.4756975284064806, + "grad_norm": 1.3514871036305516, + "learning_rate": 1.76400044441972e-05, + "loss": 0.2092, + "num_input_tokens_seen": 80542456, + "step": 12045 + }, + { + "epoch": 1.4763100670729838, + "grad_norm": 1.6884641291870757, + "learning_rate": 1.7601411071475175e-05, + "loss": 0.2262, + "num_input_tokens_seen": 80575888, + "step": 12050 + }, + { + "epoch": 1.4763100670729838, + "eval_loss": 0.10674486309289932, + "eval_runtime": 19.5696, + "eval_samples_per_second": 3.066, + "eval_steps_per_second": 0.766, + "num_input_tokens_seen": 80575888, + "step": 12050 + }, + { + "epoch": 1.4769226057394873, + "grad_norm": 1.3222322371143302, + "learning_rate": 1.756285094252086e-05, + "loss": 0.2615, + "num_input_tokens_seen": 80609304, + "step": 12055 + }, + { + "epoch": 1.4775351444059908, + "grad_norm": 1.3038442703536144, + "learning_rate": 1.752432409690025e-05, + "loss": 0.2337, + "num_input_tokens_seen": 80642480, + "step": 12060 + }, + { + "epoch": 1.478147683072494, + "grad_norm": 1.2596288785188314, + "learning_rate": 1.7485830574145268e-05, + "loss": 0.2164, + "num_input_tokens_seen": 80676336, + "step": 12065 + }, + { + "epoch": 1.4787602217389972, + "grad_norm": 1.6648960482150614, + "learning_rate": 1.7447370413753612e-05, + "loss": 0.2001, + "num_input_tokens_seen": 80710200, + "step": 12070 + }, + { + "epoch": 1.4793727604055005, + "grad_norm": 1.4558664784219633, + "learning_rate": 1.7408943655188753e-05, + "loss": 0.1961, + "num_input_tokens_seen": 80743864, + "step": 12075 + }, + { + "epoch": 1.479985299072004, + "grad_norm": 1.3975205679385114, + "learning_rate": 1.737055033787986e-05, + "loss": 0.2163, + "num_input_tokens_seen": 80777296, + "step": 12080 + }, + { + "epoch": 1.4805978377385072, + "grad_norm": 1.0342232590621074, + "learning_rate": 1.733219050122183e-05, + "loss": 0.2491, + "num_input_tokens_seen": 80810832, + "step": 12085 + }, + { + "epoch": 1.4812103764050106, + "grad_norm": 1.461052424260357, + "learning_rate": 1.7293864184575203e-05, + "loss": 0.1939, + "num_input_tokens_seen": 80845120, + "step": 12090 + }, + { + "epoch": 1.4818229150715139, + "grad_norm": 1.0668585853398103, + "learning_rate": 1.725557142726608e-05, + "loss": 0.1617, + "num_input_tokens_seen": 80879472, + "step": 12095 + }, + { + "epoch": 1.482435453738017, + "grad_norm": 1.3906134264807248, + "learning_rate": 1.7217312268586177e-05, + "loss": 0.2088, + "num_input_tokens_seen": 80912808, + "step": 12100 + }, + { + "epoch": 1.482435453738017, + "eval_loss": 0.12759126722812653, + "eval_runtime": 19.458, + "eval_samples_per_second": 3.084, + "eval_steps_per_second": 0.771, + "num_input_tokens_seen": 80912808, + "step": 12100 + }, + { + "epoch": 1.4830479924045206, + "grad_norm": 1.350213028954588, + "learning_rate": 1.7179086747792705e-05, + "loss": 0.2205, + "num_input_tokens_seen": 80945688, + "step": 12105 + }, + { + "epoch": 1.4836605310710238, + "grad_norm": 1.7138932429222575, + "learning_rate": 1.7140894904108396e-05, + "loss": 0.2271, + "num_input_tokens_seen": 80979064, + "step": 12110 + }, + { + "epoch": 1.4842730697375273, + "grad_norm": 1.8330956643913292, + "learning_rate": 1.7102736776721383e-05, + "loss": 0.219, + "num_input_tokens_seen": 81012720, + "step": 12115 + }, + { + "epoch": 1.4848856084040305, + "grad_norm": 1.14443147495106, + "learning_rate": 1.7064612404785196e-05, + "loss": 0.1997, + "num_input_tokens_seen": 81046296, + "step": 12120 + }, + { + "epoch": 1.4854981470705337, + "grad_norm": 1.3913689156054243, + "learning_rate": 1.7026521827418786e-05, + "loss": 0.2113, + "num_input_tokens_seen": 81079272, + "step": 12125 + }, + { + "epoch": 1.4861106857370372, + "grad_norm": 1.023101620622787, + "learning_rate": 1.698846508370639e-05, + "loss": 0.1786, + "num_input_tokens_seen": 81113592, + "step": 12130 + }, + { + "epoch": 1.4867232244035404, + "grad_norm": 1.429552357254653, + "learning_rate": 1.695044221269752e-05, + "loss": 0.2348, + "num_input_tokens_seen": 81146704, + "step": 12135 + }, + { + "epoch": 1.487335763070044, + "grad_norm": 1.5634928291725132, + "learning_rate": 1.6912453253406958e-05, + "loss": 0.224, + "num_input_tokens_seen": 81179856, + "step": 12140 + }, + { + "epoch": 1.4879483017365471, + "grad_norm": 1.2283588996234667, + "learning_rate": 1.687449824481469e-05, + "loss": 0.1995, + "num_input_tokens_seen": 81213312, + "step": 12145 + }, + { + "epoch": 1.4885608404030504, + "grad_norm": 1.3793598486147676, + "learning_rate": 1.683657722586582e-05, + "loss": 0.2078, + "num_input_tokens_seen": 81247096, + "step": 12150 + }, + { + "epoch": 1.4885608404030504, + "eval_loss": 0.1698664128780365, + "eval_runtime": 19.6604, + "eval_samples_per_second": 3.052, + "eval_steps_per_second": 0.763, + "num_input_tokens_seen": 81247096, + "step": 12150 + }, + { + "epoch": 1.4891733790695538, + "grad_norm": 1.1337705238546327, + "learning_rate": 1.6798690235470628e-05, + "loss": 0.2268, + "num_input_tokens_seen": 81280840, + "step": 12155 + }, + { + "epoch": 1.489785917736057, + "grad_norm": 1.40443873678761, + "learning_rate": 1.6760837312504473e-05, + "loss": 0.2036, + "num_input_tokens_seen": 81314448, + "step": 12160 + }, + { + "epoch": 1.4903984564025605, + "grad_norm": 1.3409823376020191, + "learning_rate": 1.672301849580771e-05, + "loss": 0.182, + "num_input_tokens_seen": 81348608, + "step": 12165 + }, + { + "epoch": 1.4910109950690638, + "grad_norm": 1.3485704643104983, + "learning_rate": 1.6685233824185752e-05, + "loss": 0.2224, + "num_input_tokens_seen": 81382256, + "step": 12170 + }, + { + "epoch": 1.491623533735567, + "grad_norm": 1.2887176217874046, + "learning_rate": 1.664748333640896e-05, + "loss": 0.2027, + "num_input_tokens_seen": 81416208, + "step": 12175 + }, + { + "epoch": 1.4922360724020705, + "grad_norm": 1.8219342659778606, + "learning_rate": 1.660976707121262e-05, + "loss": 0.2203, + "num_input_tokens_seen": 81449672, + "step": 12180 + }, + { + "epoch": 1.4928486110685737, + "grad_norm": 1.063759680118064, + "learning_rate": 1.657208506729686e-05, + "loss": 0.2567, + "num_input_tokens_seen": 81483056, + "step": 12185 + }, + { + "epoch": 1.4934611497350772, + "grad_norm": 1.1620006289702443, + "learning_rate": 1.6534437363326726e-05, + "loss": 0.2132, + "num_input_tokens_seen": 81516992, + "step": 12190 + }, + { + "epoch": 1.4940736884015804, + "grad_norm": 1.0806475801802131, + "learning_rate": 1.6496823997932037e-05, + "loss": 0.1981, + "num_input_tokens_seen": 81551128, + "step": 12195 + }, + { + "epoch": 1.4946862270680836, + "grad_norm": 1.4839737532994606, + "learning_rate": 1.6459245009707363e-05, + "loss": 0.2199, + "num_input_tokens_seen": 81584816, + "step": 12200 + }, + { + "epoch": 1.4946862270680836, + "eval_loss": 0.16632874310016632, + "eval_runtime": 19.7165, + "eval_samples_per_second": 3.043, + "eval_steps_per_second": 0.761, + "num_input_tokens_seen": 81584816, + "step": 12200 + }, + { + "epoch": 1.4952987657345869, + "grad_norm": 1.6116822754455908, + "learning_rate": 1.6421700437211996e-05, + "loss": 0.1831, + "num_input_tokens_seen": 81619184, + "step": 12205 + }, + { + "epoch": 1.4959113044010903, + "grad_norm": 1.7639793636586045, + "learning_rate": 1.638419031896994e-05, + "loss": 0.2632, + "num_input_tokens_seen": 81652656, + "step": 12210 + }, + { + "epoch": 1.4965238430675936, + "grad_norm": 1.4984568069517499, + "learning_rate": 1.634671469346986e-05, + "loss": 0.2505, + "num_input_tokens_seen": 81686304, + "step": 12215 + }, + { + "epoch": 1.497136381734097, + "grad_norm": 1.3942314943750511, + "learning_rate": 1.6309273599164963e-05, + "loss": 0.1972, + "num_input_tokens_seen": 81719752, + "step": 12220 + }, + { + "epoch": 1.4977489204006003, + "grad_norm": 1.6883058428962718, + "learning_rate": 1.627186707447309e-05, + "loss": 0.2257, + "num_input_tokens_seen": 81753160, + "step": 12225 + }, + { + "epoch": 1.4983614590671035, + "grad_norm": 1.6547016363509963, + "learning_rate": 1.6234495157776585e-05, + "loss": 0.2207, + "num_input_tokens_seen": 81786432, + "step": 12230 + }, + { + "epoch": 1.498973997733607, + "grad_norm": 1.452834904359266, + "learning_rate": 1.6197157887422298e-05, + "loss": 0.2238, + "num_input_tokens_seen": 81820360, + "step": 12235 + }, + { + "epoch": 1.4995865364001102, + "grad_norm": 1.4749165247984546, + "learning_rate": 1.615985530172148e-05, + "loss": 0.2336, + "num_input_tokens_seen": 81853752, + "step": 12240 + }, + { + "epoch": 1.5001990750666137, + "grad_norm": 1.4041694935649065, + "learning_rate": 1.6122587438949832e-05, + "loss": 0.2765, + "num_input_tokens_seen": 81887064, + "step": 12245 + }, + { + "epoch": 1.500811613733117, + "grad_norm": 1.0944462011933531, + "learning_rate": 1.6085354337347447e-05, + "loss": 0.2553, + "num_input_tokens_seen": 81919896, + "step": 12250 + }, + { + "epoch": 1.500811613733117, + "eval_loss": 0.15666794776916504, + "eval_runtime": 19.6886, + "eval_samples_per_second": 3.047, + "eval_steps_per_second": 0.762, + "num_input_tokens_seen": 81919896, + "step": 12250 + }, + { + "epoch": 1.5014241523996201, + "grad_norm": 0.9675447270994949, + "learning_rate": 1.604815603511869e-05, + "loss": 0.2293, + "num_input_tokens_seen": 81953488, + "step": 12255 + }, + { + "epoch": 1.5020366910661236, + "grad_norm": 0.9673017364248944, + "learning_rate": 1.6010992570432266e-05, + "loss": 0.1958, + "num_input_tokens_seen": 81987512, + "step": 12260 + }, + { + "epoch": 1.5026492297326268, + "grad_norm": 1.2886904484634905, + "learning_rate": 1.5973863981421123e-05, + "loss": 0.2044, + "num_input_tokens_seen": 82021296, + "step": 12265 + }, + { + "epoch": 1.5032617683991303, + "grad_norm": 1.3929826290103366, + "learning_rate": 1.5936770306182425e-05, + "loss": 0.2212, + "num_input_tokens_seen": 82054576, + "step": 12270 + }, + { + "epoch": 1.5038743070656335, + "grad_norm": 1.340158991741404, + "learning_rate": 1.589971158277749e-05, + "loss": 0.2066, + "num_input_tokens_seen": 82088824, + "step": 12275 + }, + { + "epoch": 1.5044868457321368, + "grad_norm": 1.5145083871229805, + "learning_rate": 1.5862687849231805e-05, + "loss": 0.1794, + "num_input_tokens_seen": 82123328, + "step": 12280 + }, + { + "epoch": 1.5050993843986402, + "grad_norm": 1.1128986678865027, + "learning_rate": 1.582569914353491e-05, + "loss": 0.1734, + "num_input_tokens_seen": 82156640, + "step": 12285 + }, + { + "epoch": 1.5057119230651435, + "grad_norm": 1.782519034673867, + "learning_rate": 1.5788745503640474e-05, + "loss": 0.2372, + "num_input_tokens_seen": 82189512, + "step": 12290 + }, + { + "epoch": 1.506324461731647, + "grad_norm": 1.4772750268878316, + "learning_rate": 1.57518269674661e-05, + "loss": 0.2114, + "num_input_tokens_seen": 82222648, + "step": 12295 + }, + { + "epoch": 1.5069370003981502, + "grad_norm": 30.309541549224324, + "learning_rate": 1.5714943572893433e-05, + "loss": 0.19, + "num_input_tokens_seen": 82256360, + "step": 12300 + }, + { + "epoch": 1.5069370003981502, + "eval_loss": 0.13799481093883514, + "eval_runtime": 19.678, + "eval_samples_per_second": 3.049, + "eval_steps_per_second": 0.762, + "num_input_tokens_seen": 82256360, + "step": 12300 + }, + { + "epoch": 1.5075495390646534, + "grad_norm": 1.630069871359781, + "learning_rate": 1.5678095357768056e-05, + "loss": 0.2132, + "num_input_tokens_seen": 82289496, + "step": 12305 + }, + { + "epoch": 1.5081620777311566, + "grad_norm": 1.9353232104678317, + "learning_rate": 1.5641282359899413e-05, + "loss": 0.2384, + "num_input_tokens_seen": 82322464, + "step": 12310 + }, + { + "epoch": 1.50877461639766, + "grad_norm": 1.318367227256339, + "learning_rate": 1.5604504617060843e-05, + "loss": 0.2558, + "num_input_tokens_seen": 82355032, + "step": 12315 + }, + { + "epoch": 1.5093871550641635, + "grad_norm": 1.8751388283383925, + "learning_rate": 1.556776216698954e-05, + "loss": 0.237, + "num_input_tokens_seen": 82388648, + "step": 12320 + }, + { + "epoch": 1.5099996937306668, + "grad_norm": 0.9066373971773678, + "learning_rate": 1.5531055047386406e-05, + "loss": 0.1899, + "num_input_tokens_seen": 82422312, + "step": 12325 + }, + { + "epoch": 1.51061223239717, + "grad_norm": 1.0271809084599768, + "learning_rate": 1.5494383295916153e-05, + "loss": 0.1856, + "num_input_tokens_seen": 82456152, + "step": 12330 + }, + { + "epoch": 1.5112247710636733, + "grad_norm": 1.3233434681739569, + "learning_rate": 1.5457746950207186e-05, + "loss": 0.1833, + "num_input_tokens_seen": 82489776, + "step": 12335 + }, + { + "epoch": 1.5118373097301767, + "grad_norm": 1.4571664000754558, + "learning_rate": 1.5421146047851598e-05, + "loss": 0.2428, + "num_input_tokens_seen": 82522720, + "step": 12340 + }, + { + "epoch": 1.5124498483966802, + "grad_norm": 1.4313082597104383, + "learning_rate": 1.538458062640506e-05, + "loss": 0.2209, + "num_input_tokens_seen": 82555976, + "step": 12345 + }, + { + "epoch": 1.5130623870631834, + "grad_norm": 1.076371714279734, + "learning_rate": 1.5348050723386885e-05, + "loss": 0.2405, + "num_input_tokens_seen": 82589056, + "step": 12350 + }, + { + "epoch": 1.5130623870631834, + "eval_loss": 0.12636159360408783, + "eval_runtime": 19.5204, + "eval_samples_per_second": 3.074, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 82589056, + "step": 12350 + }, + { + "epoch": 1.5136749257296866, + "grad_norm": 1.4932106699776109, + "learning_rate": 1.5311556376279957e-05, + "loss": 0.2444, + "num_input_tokens_seen": 82621792, + "step": 12355 + }, + { + "epoch": 1.5142874643961899, + "grad_norm": 1.9562590179318926, + "learning_rate": 1.5275097622530616e-05, + "loss": 0.2305, + "num_input_tokens_seen": 82655856, + "step": 12360 + }, + { + "epoch": 1.5149000030626933, + "grad_norm": 1.2530396592783468, + "learning_rate": 1.52386744995487e-05, + "loss": 0.21, + "num_input_tokens_seen": 82689712, + "step": 12365 + }, + { + "epoch": 1.5155125417291968, + "grad_norm": 1.0250190095939993, + "learning_rate": 1.5202287044707525e-05, + "loss": 0.2243, + "num_input_tokens_seen": 82723480, + "step": 12370 + }, + { + "epoch": 1.5161250803957, + "grad_norm": 1.3151904162015358, + "learning_rate": 1.516593529534378e-05, + "loss": 0.2285, + "num_input_tokens_seen": 82756648, + "step": 12375 + }, + { + "epoch": 1.5167376190622033, + "grad_norm": 1.2495256061621576, + "learning_rate": 1.5129619288757501e-05, + "loss": 0.2065, + "num_input_tokens_seen": 82790296, + "step": 12380 + }, + { + "epoch": 1.5173501577287065, + "grad_norm": 1.5039364560074149, + "learning_rate": 1.5093339062212081e-05, + "loss": 0.2169, + "num_input_tokens_seen": 82823520, + "step": 12385 + }, + { + "epoch": 1.51796269639521, + "grad_norm": 1.5392365328565953, + "learning_rate": 1.5057094652934179e-05, + "loss": 0.2081, + "num_input_tokens_seen": 82857288, + "step": 12390 + }, + { + "epoch": 1.5185752350617134, + "grad_norm": 0.9526246188518604, + "learning_rate": 1.5020886098113724e-05, + "loss": 0.22, + "num_input_tokens_seen": 82890528, + "step": 12395 + }, + { + "epoch": 1.5191877737282167, + "grad_norm": 1.161119731292653, + "learning_rate": 1.4984713434903813e-05, + "loss": 0.2239, + "num_input_tokens_seen": 82923840, + "step": 12400 + }, + { + "epoch": 1.5191877737282167, + "eval_loss": 0.12071842700242996, + "eval_runtime": 19.6936, + "eval_samples_per_second": 3.047, + "eval_steps_per_second": 0.762, + "num_input_tokens_seen": 82923840, + "step": 12400 + }, + { + "epoch": 1.51980031239472, + "grad_norm": 0.8745558671538526, + "learning_rate": 1.4948576700420757e-05, + "loss": 0.1957, + "num_input_tokens_seen": 82957936, + "step": 12405 + }, + { + "epoch": 1.5204128510612231, + "grad_norm": 1.265669955930451, + "learning_rate": 1.4912475931744002e-05, + "loss": 0.2119, + "num_input_tokens_seen": 82991576, + "step": 12410 + }, + { + "epoch": 1.5210253897277266, + "grad_norm": 1.7307171016387652, + "learning_rate": 1.4876411165916043e-05, + "loss": 0.2164, + "num_input_tokens_seen": 83024640, + "step": 12415 + }, + { + "epoch": 1.5216379283942298, + "grad_norm": 1.2471730030945112, + "learning_rate": 1.4840382439942474e-05, + "loss": 0.1959, + "num_input_tokens_seen": 83058376, + "step": 12420 + }, + { + "epoch": 1.5222504670607333, + "grad_norm": 1.3084583173732034, + "learning_rate": 1.4804389790791906e-05, + "loss": 0.207, + "num_input_tokens_seen": 83091944, + "step": 12425 + }, + { + "epoch": 1.5228630057272365, + "grad_norm": 1.064471622914845, + "learning_rate": 1.4768433255395935e-05, + "loss": 0.2022, + "num_input_tokens_seen": 83126048, + "step": 12430 + }, + { + "epoch": 1.5234755443937398, + "grad_norm": 1.336583934175333, + "learning_rate": 1.4732512870649057e-05, + "loss": 0.2235, + "num_input_tokens_seen": 83159512, + "step": 12435 + }, + { + "epoch": 1.524088083060243, + "grad_norm": 1.1663178844002937, + "learning_rate": 1.4696628673408753e-05, + "loss": 0.1937, + "num_input_tokens_seen": 83193544, + "step": 12440 + }, + { + "epoch": 1.5247006217267465, + "grad_norm": 1.5899573520042296, + "learning_rate": 1.466078070049528e-05, + "loss": 0.1757, + "num_input_tokens_seen": 83227368, + "step": 12445 + }, + { + "epoch": 1.52531316039325, + "grad_norm": 1.9230725307418226, + "learning_rate": 1.4624968988691817e-05, + "loss": 0.2518, + "num_input_tokens_seen": 83260088, + "step": 12450 + }, + { + "epoch": 1.52531316039325, + "eval_loss": 0.09943088889122009, + "eval_runtime": 19.3223, + "eval_samples_per_second": 3.105, + "eval_steps_per_second": 0.776, + "num_input_tokens_seen": 83260088, + "step": 12450 + }, + { + "epoch": 1.5259256990597532, + "grad_norm": 1.0089721100911944, + "learning_rate": 1.4589193574744254e-05, + "loss": 0.1817, + "num_input_tokens_seen": 83293904, + "step": 12455 + }, + { + "epoch": 1.5265382377262564, + "grad_norm": 1.111138167025906, + "learning_rate": 1.4553454495361291e-05, + "loss": 0.214, + "num_input_tokens_seen": 83327216, + "step": 12460 + }, + { + "epoch": 1.5271507763927596, + "grad_norm": 1.209118968401165, + "learning_rate": 1.4517751787214345e-05, + "loss": 0.1842, + "num_input_tokens_seen": 83361192, + "step": 12465 + }, + { + "epoch": 1.527763315059263, + "grad_norm": 1.6021228116494042, + "learning_rate": 1.4482085486937475e-05, + "loss": 0.2159, + "num_input_tokens_seen": 83394488, + "step": 12470 + }, + { + "epoch": 1.5283758537257666, + "grad_norm": 1.4038292881431254, + "learning_rate": 1.4446455631127403e-05, + "loss": 0.2541, + "num_input_tokens_seen": 83427656, + "step": 12475 + }, + { + "epoch": 1.5289883923922698, + "grad_norm": 1.4260963812278225, + "learning_rate": 1.4410862256343472e-05, + "loss": 0.2185, + "num_input_tokens_seen": 83461144, + "step": 12480 + }, + { + "epoch": 1.529600931058773, + "grad_norm": 1.705958941222269, + "learning_rate": 1.4375305399107586e-05, + "loss": 0.268, + "num_input_tokens_seen": 83494432, + "step": 12485 + }, + { + "epoch": 1.5302134697252763, + "grad_norm": 1.6960860930278738, + "learning_rate": 1.433978509590414e-05, + "loss": 0.1842, + "num_input_tokens_seen": 83528280, + "step": 12490 + }, + { + "epoch": 1.5308260083917797, + "grad_norm": 1.9517523316024166, + "learning_rate": 1.4304301383180074e-05, + "loss": 0.2401, + "num_input_tokens_seen": 83561440, + "step": 12495 + }, + { + "epoch": 1.5314385470582832, + "grad_norm": 1.484688558412507, + "learning_rate": 1.4268854297344764e-05, + "loss": 0.2383, + "num_input_tokens_seen": 83595024, + "step": 12500 + }, + { + "epoch": 1.5314385470582832, + "eval_loss": 0.07955755293369293, + "eval_runtime": 19.6563, + "eval_samples_per_second": 3.052, + "eval_steps_per_second": 0.763, + "num_input_tokens_seen": 83595024, + "step": 12500 + }, + { + "epoch": 1.5320510857247864, + "grad_norm": 34.20443898871264, + "learning_rate": 1.423344387476998e-05, + "loss": 0.2208, + "num_input_tokens_seen": 83628592, + "step": 12505 + }, + { + "epoch": 1.5326636243912897, + "grad_norm": 1.633360066757723, + "learning_rate": 1.4198070151789905e-05, + "loss": 0.2076, + "num_input_tokens_seen": 83662424, + "step": 12510 + }, + { + "epoch": 1.533276163057793, + "grad_norm": 1.7796966879900669, + "learning_rate": 1.4162733164701063e-05, + "loss": 0.1989, + "num_input_tokens_seen": 83696256, + "step": 12515 + }, + { + "epoch": 1.5338887017242964, + "grad_norm": 1.6572682483268784, + "learning_rate": 1.4127432949762254e-05, + "loss": 0.2344, + "num_input_tokens_seen": 83730416, + "step": 12520 + }, + { + "epoch": 1.5345012403907998, + "grad_norm": 1.3782476541188102, + "learning_rate": 1.409216954319459e-05, + "loss": 0.2036, + "num_input_tokens_seen": 83764872, + "step": 12525 + }, + { + "epoch": 1.535113779057303, + "grad_norm": 1.0639973267083134, + "learning_rate": 1.4056942981181354e-05, + "loss": 0.1949, + "num_input_tokens_seen": 83798928, + "step": 12530 + }, + { + "epoch": 1.5357263177238063, + "grad_norm": 1.4171004905293187, + "learning_rate": 1.4021753299868101e-05, + "loss": 0.2276, + "num_input_tokens_seen": 83831792, + "step": 12535 + }, + { + "epoch": 1.5363388563903095, + "grad_norm": 2.1674779434079974, + "learning_rate": 1.3986600535362466e-05, + "loss": 0.2339, + "num_input_tokens_seen": 83864704, + "step": 12540 + }, + { + "epoch": 1.536951395056813, + "grad_norm": 1.2220767050376016, + "learning_rate": 1.3951484723734254e-05, + "loss": 0.1957, + "num_input_tokens_seen": 83898496, + "step": 12545 + }, + { + "epoch": 1.5375639337233162, + "grad_norm": 1.1084346931524138, + "learning_rate": 1.3916405901015344e-05, + "loss": 0.2197, + "num_input_tokens_seen": 83932352, + "step": 12550 + }, + { + "epoch": 1.5375639337233162, + "eval_loss": 0.13997627794742584, + "eval_runtime": 19.5209, + "eval_samples_per_second": 3.074, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 83932352, + "step": 12550 + }, + { + "epoch": 1.5381764723898197, + "grad_norm": 2.0464725986257046, + "learning_rate": 1.3881364103199667e-05, + "loss": 0.2191, + "num_input_tokens_seen": 83965416, + "step": 12555 + }, + { + "epoch": 1.538789011056323, + "grad_norm": 1.3117018294075682, + "learning_rate": 1.3846359366243128e-05, + "loss": 0.2528, + "num_input_tokens_seen": 83998008, + "step": 12560 + }, + { + "epoch": 1.5394015497228262, + "grad_norm": 1.3384719813470132, + "learning_rate": 1.3811391726063639e-05, + "loss": 0.179, + "num_input_tokens_seen": 84031432, + "step": 12565 + }, + { + "epoch": 1.5400140883893296, + "grad_norm": 2.3161240291306178, + "learning_rate": 1.3776461218541054e-05, + "loss": 0.2365, + "num_input_tokens_seen": 84064192, + "step": 12570 + }, + { + "epoch": 1.5406266270558329, + "grad_norm": 1.2336556535570713, + "learning_rate": 1.3741567879517081e-05, + "loss": 0.2428, + "num_input_tokens_seen": 84097448, + "step": 12575 + }, + { + "epoch": 1.5412391657223363, + "grad_norm": 1.610141317500612, + "learning_rate": 1.370671174479533e-05, + "loss": 0.2054, + "num_input_tokens_seen": 84131400, + "step": 12580 + }, + { + "epoch": 1.5418517043888396, + "grad_norm": 1.1962718541335504, + "learning_rate": 1.3671892850141226e-05, + "loss": 0.2132, + "num_input_tokens_seen": 84164672, + "step": 12585 + }, + { + "epoch": 1.5424642430553428, + "grad_norm": 1.1617894223933494, + "learning_rate": 1.3637111231282001e-05, + "loss": 0.2051, + "num_input_tokens_seen": 84198624, + "step": 12590 + }, + { + "epoch": 1.543076781721846, + "grad_norm": 1.8177124941153024, + "learning_rate": 1.3602366923906578e-05, + "loss": 0.2517, + "num_input_tokens_seen": 84232112, + "step": 12595 + }, + { + "epoch": 1.5436893203883495, + "grad_norm": 1.2596850804140096, + "learning_rate": 1.3567659963665657e-05, + "loss": 0.2038, + "num_input_tokens_seen": 84265664, + "step": 12600 + }, + { + "epoch": 1.5436893203883495, + "eval_loss": 0.0905894786119461, + "eval_runtime": 19.8862, + "eval_samples_per_second": 3.017, + "eval_steps_per_second": 0.754, + "num_input_tokens_seen": 84265664, + "step": 12600 + }, + { + "epoch": 1.544301859054853, + "grad_norm": 0.9653988225955885, + "learning_rate": 1.3532990386171608e-05, + "loss": 0.2199, + "num_input_tokens_seen": 84298840, + "step": 12605 + }, + { + "epoch": 1.5449143977213562, + "grad_norm": 1.6724693359383465, + "learning_rate": 1.3498358226998414e-05, + "loss": 0.2262, + "num_input_tokens_seen": 84332296, + "step": 12610 + }, + { + "epoch": 1.5455269363878594, + "grad_norm": 1.4610831695573991, + "learning_rate": 1.3463763521681672e-05, + "loss": 0.2246, + "num_input_tokens_seen": 84366136, + "step": 12615 + }, + { + "epoch": 1.5461394750543627, + "grad_norm": 1.3414491734336234, + "learning_rate": 1.3429206305718566e-05, + "loss": 0.252, + "num_input_tokens_seen": 84399600, + "step": 12620 + }, + { + "epoch": 1.5467520137208661, + "grad_norm": 1.491876135100187, + "learning_rate": 1.3394686614567813e-05, + "loss": 0.2128, + "num_input_tokens_seen": 84432544, + "step": 12625 + }, + { + "epoch": 1.5473645523873696, + "grad_norm": 1.4874050366538178, + "learning_rate": 1.3360204483649596e-05, + "loss": 0.2009, + "num_input_tokens_seen": 84465648, + "step": 12630 + }, + { + "epoch": 1.5479770910538728, + "grad_norm": 1.422353399395494, + "learning_rate": 1.3325759948345584e-05, + "loss": 0.2205, + "num_input_tokens_seen": 84499192, + "step": 12635 + }, + { + "epoch": 1.548589629720376, + "grad_norm": 1.445951445459654, + "learning_rate": 1.3291353043998872e-05, + "loss": 0.209, + "num_input_tokens_seen": 84533112, + "step": 12640 + }, + { + "epoch": 1.5492021683868793, + "grad_norm": 1.3231600102564738, + "learning_rate": 1.3256983805913937e-05, + "loss": 0.2236, + "num_input_tokens_seen": 84566368, + "step": 12645 + }, + { + "epoch": 1.5498147070533828, + "grad_norm": 1.5044882180271906, + "learning_rate": 1.3222652269356584e-05, + "loss": 0.218, + "num_input_tokens_seen": 84599960, + "step": 12650 + }, + { + "epoch": 1.5498147070533828, + "eval_loss": 0.08059627562761307, + "eval_runtime": 20.2858, + "eval_samples_per_second": 2.958, + "eval_steps_per_second": 0.739, + "num_input_tokens_seen": 84599960, + "step": 12650 + }, + { + "epoch": 1.5504272457198862, + "grad_norm": 1.6612529544071886, + "learning_rate": 1.3188358469553964e-05, + "loss": 0.2095, + "num_input_tokens_seen": 84633040, + "step": 12655 + }, + { + "epoch": 1.5510397843863895, + "grad_norm": 1.881161346586953, + "learning_rate": 1.3154102441694511e-05, + "loss": 0.2304, + "num_input_tokens_seen": 84666880, + "step": 12660 + }, + { + "epoch": 1.5516523230528927, + "grad_norm": 1.3609121687261052, + "learning_rate": 1.3119884220927859e-05, + "loss": 0.2087, + "num_input_tokens_seen": 84700520, + "step": 12665 + }, + { + "epoch": 1.552264861719396, + "grad_norm": 1.1175838898595358, + "learning_rate": 1.3085703842364888e-05, + "loss": 0.2182, + "num_input_tokens_seen": 84733512, + "step": 12670 + }, + { + "epoch": 1.5528774003858994, + "grad_norm": 1.5431519569067016, + "learning_rate": 1.3051561341077651e-05, + "loss": 0.2096, + "num_input_tokens_seen": 84767024, + "step": 12675 + }, + { + "epoch": 1.5534899390524028, + "grad_norm": 1.1053821235027512, + "learning_rate": 1.3017456752099294e-05, + "loss": 0.1873, + "num_input_tokens_seen": 84800840, + "step": 12680 + }, + { + "epoch": 1.554102477718906, + "grad_norm": 2.05865374386267, + "learning_rate": 1.298339011042412e-05, + "loss": 0.2297, + "num_input_tokens_seen": 84833904, + "step": 12685 + }, + { + "epoch": 1.5547150163854093, + "grad_norm": 1.5394834766265646, + "learning_rate": 1.294936145100743e-05, + "loss": 0.1763, + "num_input_tokens_seen": 84868168, + "step": 12690 + }, + { + "epoch": 1.5553275550519126, + "grad_norm": 1.8126239709235685, + "learning_rate": 1.2915370808765614e-05, + "loss": 0.2232, + "num_input_tokens_seen": 84901960, + "step": 12695 + }, + { + "epoch": 1.555940093718416, + "grad_norm": 1.2976128626506218, + "learning_rate": 1.2881418218576003e-05, + "loss": 0.1887, + "num_input_tokens_seen": 84935752, + "step": 12700 + }, + { + "epoch": 1.555940093718416, + "eval_loss": 0.0622766874730587, + "eval_runtime": 19.7331, + "eval_samples_per_second": 3.041, + "eval_steps_per_second": 0.76, + "num_input_tokens_seen": 84935752, + "step": 12700 + }, + { + "epoch": 1.5565526323849193, + "grad_norm": 1.1026609438641064, + "learning_rate": 1.284750371527691e-05, + "loss": 0.1813, + "num_input_tokens_seen": 84969792, + "step": 12705 + }, + { + "epoch": 1.5571651710514227, + "grad_norm": 1.2533302416845313, + "learning_rate": 1.2813627333667572e-05, + "loss": 0.2106, + "num_input_tokens_seen": 85003840, + "step": 12710 + }, + { + "epoch": 1.557777709717926, + "grad_norm": 2.0551949833095224, + "learning_rate": 1.2779789108508111e-05, + "loss": 0.2457, + "num_input_tokens_seen": 85036840, + "step": 12715 + }, + { + "epoch": 1.5583902483844292, + "grad_norm": 2.7947120800704783, + "learning_rate": 1.2745989074519454e-05, + "loss": 0.1992, + "num_input_tokens_seen": 85070776, + "step": 12720 + }, + { + "epoch": 1.5590027870509324, + "grad_norm": 1.037429941041952, + "learning_rate": 1.2712227266383392e-05, + "loss": 0.1745, + "num_input_tokens_seen": 85105048, + "step": 12725 + }, + { + "epoch": 1.5596153257174359, + "grad_norm": 1.7135116784032658, + "learning_rate": 1.2678503718742491e-05, + "loss": 0.2094, + "num_input_tokens_seen": 85137864, + "step": 12730 + }, + { + "epoch": 1.5602278643839393, + "grad_norm": 1.0250907389593586, + "learning_rate": 1.2644818466200004e-05, + "loss": 0.2083, + "num_input_tokens_seen": 85171312, + "step": 12735 + }, + { + "epoch": 1.5608404030504426, + "grad_norm": 1.1683643341429248, + "learning_rate": 1.2611171543319944e-05, + "loss": 0.1899, + "num_input_tokens_seen": 85205072, + "step": 12740 + }, + { + "epoch": 1.5614529417169458, + "grad_norm": 1.819220884131095, + "learning_rate": 1.2577562984626984e-05, + "loss": 0.234, + "num_input_tokens_seen": 85238256, + "step": 12745 + }, + { + "epoch": 1.562065480383449, + "grad_norm": 1.6694909610730313, + "learning_rate": 1.2543992824606437e-05, + "loss": 0.2372, + "num_input_tokens_seen": 85271648, + "step": 12750 + }, + { + "epoch": 1.562065480383449, + "eval_loss": 0.11330395191907883, + "eval_runtime": 19.9637, + "eval_samples_per_second": 3.005, + "eval_steps_per_second": 0.751, + "num_input_tokens_seen": 85271648, + "step": 12750 + }, + { + "epoch": 1.5626780190499525, + "grad_norm": 1.2298509913386713, + "learning_rate": 1.251046109770418e-05, + "loss": 0.2537, + "num_input_tokens_seen": 85305136, + "step": 12755 + }, + { + "epoch": 1.563290557716456, + "grad_norm": 1.7598609439750283, + "learning_rate": 1.2476967838326681e-05, + "loss": 0.2563, + "num_input_tokens_seen": 85338344, + "step": 12760 + }, + { + "epoch": 1.5639030963829592, + "grad_norm": 1.5879026728054957, + "learning_rate": 1.2443513080840958e-05, + "loss": 0.2139, + "num_input_tokens_seen": 85372272, + "step": 12765 + }, + { + "epoch": 1.5645156350494624, + "grad_norm": 1.5079130304130068, + "learning_rate": 1.2410096859574489e-05, + "loss": 0.224, + "num_input_tokens_seen": 85405312, + "step": 12770 + }, + { + "epoch": 1.5651281737159657, + "grad_norm": 1.1725934691772146, + "learning_rate": 1.237671920881519e-05, + "loss": 0.2125, + "num_input_tokens_seen": 85439168, + "step": 12775 + }, + { + "epoch": 1.5657407123824691, + "grad_norm": 2.2252071102399373, + "learning_rate": 1.2343380162811469e-05, + "loss": 0.2469, + "num_input_tokens_seen": 85472104, + "step": 12780 + }, + { + "epoch": 1.5663532510489726, + "grad_norm": 38.46657705939354, + "learning_rate": 1.2310079755772081e-05, + "loss": 0.2053, + "num_input_tokens_seen": 85506432, + "step": 12785 + }, + { + "epoch": 1.5669657897154758, + "grad_norm": 2.2249518887583233, + "learning_rate": 1.227681802186611e-05, + "loss": 0.2374, + "num_input_tokens_seen": 85539792, + "step": 12790 + }, + { + "epoch": 1.567578328381979, + "grad_norm": 1.4122504626540888, + "learning_rate": 1.2243594995223007e-05, + "loss": 0.234, + "num_input_tokens_seen": 85573104, + "step": 12795 + }, + { + "epoch": 1.5681908670484823, + "grad_norm": 0.9224978812469172, + "learning_rate": 1.2210410709932479e-05, + "loss": 0.1879, + "num_input_tokens_seen": 85607024, + "step": 12800 + }, + { + "epoch": 1.5681908670484823, + "eval_loss": 0.09048807621002197, + "eval_runtime": 20.1629, + "eval_samples_per_second": 2.976, + "eval_steps_per_second": 0.744, + "num_input_tokens_seen": 85607024, + "step": 12800 + }, + { + "epoch": 1.5688034057149858, + "grad_norm": 1.546364894763706, + "learning_rate": 1.2177265200044507e-05, + "loss": 0.2375, + "num_input_tokens_seen": 85640488, + "step": 12805 + }, + { + "epoch": 1.5694159443814892, + "grad_norm": 1.3975403804255442, + "learning_rate": 1.214415849956923e-05, + "loss": 0.2367, + "num_input_tokens_seen": 85673952, + "step": 12810 + }, + { + "epoch": 1.5700284830479925, + "grad_norm": 1.381218553087244, + "learning_rate": 1.2111090642477019e-05, + "loss": 0.2035, + "num_input_tokens_seen": 85707752, + "step": 12815 + }, + { + "epoch": 1.5706410217144957, + "grad_norm": 1.7538592438892815, + "learning_rate": 1.2078061662698375e-05, + "loss": 0.2531, + "num_input_tokens_seen": 85740768, + "step": 12820 + }, + { + "epoch": 1.571253560380999, + "grad_norm": 1.266720873166428, + "learning_rate": 1.204507159412388e-05, + "loss": 0.1939, + "num_input_tokens_seen": 85774488, + "step": 12825 + }, + { + "epoch": 1.5718660990475024, + "grad_norm": 1.2033315686590011, + "learning_rate": 1.2012120470604227e-05, + "loss": 0.2063, + "num_input_tokens_seen": 85808448, + "step": 12830 + }, + { + "epoch": 1.5724786377140056, + "grad_norm": 1.2454192339071026, + "learning_rate": 1.1979208325950142e-05, + "loss": 0.1732, + "num_input_tokens_seen": 85842184, + "step": 12835 + }, + { + "epoch": 1.573091176380509, + "grad_norm": 1.1412311052990047, + "learning_rate": 1.1946335193932312e-05, + "loss": 0.1844, + "num_input_tokens_seen": 85875920, + "step": 12840 + }, + { + "epoch": 1.5737037150470123, + "grad_norm": 1.1183045881848968, + "learning_rate": 1.1913501108281466e-05, + "loss": 0.2313, + "num_input_tokens_seen": 85909184, + "step": 12845 + }, + { + "epoch": 1.5743162537135156, + "grad_norm": 1.6970852915774366, + "learning_rate": 1.1880706102688199e-05, + "loss": 0.2275, + "num_input_tokens_seen": 85942072, + "step": 12850 + }, + { + "epoch": 1.5743162537135156, + "eval_loss": 0.15477542579174042, + "eval_runtime": 19.9502, + "eval_samples_per_second": 3.007, + "eval_steps_per_second": 0.752, + "num_input_tokens_seen": 85942072, + "step": 12850 + }, + { + "epoch": 1.574928792380019, + "grad_norm": 0.8407831418948041, + "learning_rate": 1.1847950210803043e-05, + "loss": 0.1994, + "num_input_tokens_seen": 85975824, + "step": 12855 + }, + { + "epoch": 1.5755413310465223, + "grad_norm": 1.8981370785134706, + "learning_rate": 1.1815233466236415e-05, + "loss": 0.1829, + "num_input_tokens_seen": 86009872, + "step": 12860 + }, + { + "epoch": 1.5761538697130257, + "grad_norm": 1.9825656515801524, + "learning_rate": 1.1782555902558495e-05, + "loss": 0.2332, + "num_input_tokens_seen": 86043408, + "step": 12865 + }, + { + "epoch": 1.576766408379529, + "grad_norm": 1.6947474783401981, + "learning_rate": 1.1749917553299333e-05, + "loss": 0.1934, + "num_input_tokens_seen": 86076872, + "step": 12870 + }, + { + "epoch": 1.5773789470460322, + "grad_norm": 1.4751702304513257, + "learning_rate": 1.1717318451948716e-05, + "loss": 0.2071, + "num_input_tokens_seen": 86110216, + "step": 12875 + }, + { + "epoch": 1.5779914857125354, + "grad_norm": 1.2177730414152597, + "learning_rate": 1.1684758631956127e-05, + "loss": 0.2189, + "num_input_tokens_seen": 86144000, + "step": 12880 + }, + { + "epoch": 1.578604024379039, + "grad_norm": 1.667491528626558, + "learning_rate": 1.1652238126730792e-05, + "loss": 0.1942, + "num_input_tokens_seen": 86177784, + "step": 12885 + }, + { + "epoch": 1.5792165630455424, + "grad_norm": 1.8415073079648085, + "learning_rate": 1.1619756969641583e-05, + "loss": 0.1607, + "num_input_tokens_seen": 86212072, + "step": 12890 + }, + { + "epoch": 1.5798291017120456, + "grad_norm": 1.2531532681421784, + "learning_rate": 1.1587315194016957e-05, + "loss": 0.1805, + "num_input_tokens_seen": 86246240, + "step": 12895 + }, + { + "epoch": 1.5804416403785488, + "grad_norm": 1.8959678516660914, + "learning_rate": 1.155491283314502e-05, + "loss": 0.2114, + "num_input_tokens_seen": 86280424, + "step": 12900 + }, + { + "epoch": 1.5804416403785488, + "eval_loss": 0.1280047595500946, + "eval_runtime": 19.5528, + "eval_samples_per_second": 3.069, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 86280424, + "step": 12900 + }, + { + "epoch": 1.581054179045052, + "grad_norm": 1.2499947281658765, + "learning_rate": 1.1522549920273407e-05, + "loss": 0.185, + "num_input_tokens_seen": 86314360, + "step": 12905 + }, + { + "epoch": 1.5816667177115555, + "grad_norm": 1.9021763090055035, + "learning_rate": 1.149022648860929e-05, + "loss": 0.2093, + "num_input_tokens_seen": 86347904, + "step": 12910 + }, + { + "epoch": 1.582279256378059, + "grad_norm": 1.3573320864493703, + "learning_rate": 1.1457942571319292e-05, + "loss": 0.2327, + "num_input_tokens_seen": 86381240, + "step": 12915 + }, + { + "epoch": 1.5828917950445622, + "grad_norm": 1.5848267837425798, + "learning_rate": 1.1425698201529527e-05, + "loss": 0.1798, + "num_input_tokens_seen": 86415248, + "step": 12920 + }, + { + "epoch": 1.5835043337110655, + "grad_norm": 1.498992133435535, + "learning_rate": 1.1393493412325545e-05, + "loss": 0.207, + "num_input_tokens_seen": 86449200, + "step": 12925 + }, + { + "epoch": 1.5841168723775687, + "grad_norm": 1.5421872814807298, + "learning_rate": 1.1361328236752238e-05, + "loss": 0.1878, + "num_input_tokens_seen": 86483376, + "step": 12930 + }, + { + "epoch": 1.5847294110440722, + "grad_norm": 1.1609276887049182, + "learning_rate": 1.1329202707813857e-05, + "loss": 0.1721, + "num_input_tokens_seen": 86517304, + "step": 12935 + }, + { + "epoch": 1.5853419497105756, + "grad_norm": 2.249034589228579, + "learning_rate": 1.1297116858474e-05, + "loss": 0.214, + "num_input_tokens_seen": 86550072, + "step": 12940 + }, + { + "epoch": 1.5859544883770789, + "grad_norm": 1.1482436424409752, + "learning_rate": 1.1265070721655552e-05, + "loss": 0.2054, + "num_input_tokens_seen": 86583616, + "step": 12945 + }, + { + "epoch": 1.586567027043582, + "grad_norm": 1.8370054128785447, + "learning_rate": 1.1233064330240607e-05, + "loss": 0.2206, + "num_input_tokens_seen": 86616808, + "step": 12950 + }, + { + "epoch": 1.586567027043582, + "eval_loss": 0.1442350596189499, + "eval_runtime": 19.686, + "eval_samples_per_second": 3.048, + "eval_steps_per_second": 0.762, + "num_input_tokens_seen": 86616808, + "step": 12950 + }, + { + "epoch": 1.5871795657100853, + "grad_norm": 1.3048653472227902, + "learning_rate": 1.1201097717070514e-05, + "loss": 0.1998, + "num_input_tokens_seen": 86650088, + "step": 12955 + }, + { + "epoch": 1.5877921043765888, + "grad_norm": 1.2252091813239074, + "learning_rate": 1.1169170914945799e-05, + "loss": 0.2073, + "num_input_tokens_seen": 86683688, + "step": 12960 + }, + { + "epoch": 1.5884046430430923, + "grad_norm": 1.770847307168642, + "learning_rate": 1.1137283956626154e-05, + "loss": 0.2252, + "num_input_tokens_seen": 86717248, + "step": 12965 + }, + { + "epoch": 1.5890171817095955, + "grad_norm": 1.4700902894788075, + "learning_rate": 1.1105436874830333e-05, + "loss": 0.1592, + "num_input_tokens_seen": 86751584, + "step": 12970 + }, + { + "epoch": 1.5896297203760987, + "grad_norm": 1.9428374792385592, + "learning_rate": 1.1073629702236227e-05, + "loss": 0.2505, + "num_input_tokens_seen": 86784104, + "step": 12975 + }, + { + "epoch": 1.590242259042602, + "grad_norm": 1.2330743086634386, + "learning_rate": 1.1041862471480774e-05, + "loss": 0.2184, + "num_input_tokens_seen": 86817544, + "step": 12980 + }, + { + "epoch": 1.5908547977091054, + "grad_norm": 1.1156150749377718, + "learning_rate": 1.1010135215159883e-05, + "loss": 0.2064, + "num_input_tokens_seen": 86851848, + "step": 12985 + }, + { + "epoch": 1.5914673363756087, + "grad_norm": 1.7007601753189834, + "learning_rate": 1.0978447965828498e-05, + "loss": 0.2336, + "num_input_tokens_seen": 86885200, + "step": 12990 + }, + { + "epoch": 1.5920798750421121, + "grad_norm": 1.4548187897202665, + "learning_rate": 1.0946800756000492e-05, + "loss": 0.2132, + "num_input_tokens_seen": 86918344, + "step": 12995 + }, + { + "epoch": 1.5926924137086154, + "grad_norm": 1.3529213066153727, + "learning_rate": 1.0915193618148628e-05, + "loss": 0.1829, + "num_input_tokens_seen": 86952144, + "step": 13000 + }, + { + "epoch": 1.5926924137086154, + "eval_loss": 0.17378541827201843, + "eval_runtime": 19.8274, + "eval_samples_per_second": 3.026, + "eval_steps_per_second": 0.757, + "num_input_tokens_seen": 86952144, + "step": 13000 + }, + { + "epoch": 1.5933049523751186, + "grad_norm": 1.4281582439487819, + "learning_rate": 1.0883626584704599e-05, + "loss": 0.2113, + "num_input_tokens_seen": 86986432, + "step": 13005 + }, + { + "epoch": 1.5939174910416218, + "grad_norm": 1.6053691334606612, + "learning_rate": 1.085209968805893e-05, + "loss": 0.2341, + "num_input_tokens_seen": 87019888, + "step": 13010 + }, + { + "epoch": 1.5945300297081253, + "grad_norm": 1.383184652186416, + "learning_rate": 1.0820612960560928e-05, + "loss": 0.1944, + "num_input_tokens_seen": 87054224, + "step": 13015 + }, + { + "epoch": 1.5951425683746288, + "grad_norm": 1.2264164199558636, + "learning_rate": 1.078916643451875e-05, + "loss": 0.1609, + "num_input_tokens_seen": 87088584, + "step": 13020 + }, + { + "epoch": 1.595755107041132, + "grad_norm": 1.2185058731660758, + "learning_rate": 1.075776014219922e-05, + "loss": 0.2363, + "num_input_tokens_seen": 87122448, + "step": 13025 + }, + { + "epoch": 1.5963676457076352, + "grad_norm": 1.1823878191032278, + "learning_rate": 1.0726394115827949e-05, + "loss": 0.1856, + "num_input_tokens_seen": 87156416, + "step": 13030 + }, + { + "epoch": 1.5969801843741385, + "grad_norm": 1.467535081072364, + "learning_rate": 1.0695068387589218e-05, + "loss": 0.2147, + "num_input_tokens_seen": 87190064, + "step": 13035 + }, + { + "epoch": 1.597592723040642, + "grad_norm": 1.513262813267922, + "learning_rate": 1.0663782989625914e-05, + "loss": 0.2193, + "num_input_tokens_seen": 87223272, + "step": 13040 + }, + { + "epoch": 1.5982052617071454, + "grad_norm": 1.1927355611117263, + "learning_rate": 1.0632537954039595e-05, + "loss": 0.1845, + "num_input_tokens_seen": 87257696, + "step": 13045 + }, + { + "epoch": 1.5988178003736486, + "grad_norm": 1.410196726981814, + "learning_rate": 1.0601333312890393e-05, + "loss": 0.1889, + "num_input_tokens_seen": 87291504, + "step": 13050 + }, + { + "epoch": 1.5988178003736486, + "eval_loss": 0.12491484731435776, + "eval_runtime": 19.5616, + "eval_samples_per_second": 3.067, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 87291504, + "step": 13050 + }, + { + "epoch": 1.5994303390401519, + "grad_norm": 1.9794441841934887, + "learning_rate": 1.0570169098196952e-05, + "loss": 0.1994, + "num_input_tokens_seen": 87324920, + "step": 13055 + }, + { + "epoch": 1.600042877706655, + "grad_norm": 1.6668772665250668, + "learning_rate": 1.0539045341936477e-05, + "loss": 0.2388, + "num_input_tokens_seen": 87358072, + "step": 13060 + }, + { + "epoch": 1.6006554163731586, + "grad_norm": 2.1050230635907305, + "learning_rate": 1.0507962076044652e-05, + "loss": 0.1867, + "num_input_tokens_seen": 87391824, + "step": 13065 + }, + { + "epoch": 1.601267955039662, + "grad_norm": 1.2915655582725991, + "learning_rate": 1.0476919332415619e-05, + "loss": 0.2139, + "num_input_tokens_seen": 87425544, + "step": 13070 + }, + { + "epoch": 1.6018804937061653, + "grad_norm": 1.4566721471956783, + "learning_rate": 1.0445917142901906e-05, + "loss": 0.1886, + "num_input_tokens_seen": 87459408, + "step": 13075 + }, + { + "epoch": 1.6024930323726685, + "grad_norm": 1.1212521193101317, + "learning_rate": 1.0414955539314464e-05, + "loss": 0.2091, + "num_input_tokens_seen": 87493464, + "step": 13080 + }, + { + "epoch": 1.6031055710391717, + "grad_norm": 1.4105801391403192, + "learning_rate": 1.0384034553422605e-05, + "loss": 0.18, + "num_input_tokens_seen": 87527488, + "step": 13085 + }, + { + "epoch": 1.6037181097056752, + "grad_norm": 1.7538267199298418, + "learning_rate": 1.0353154216953937e-05, + "loss": 0.1927, + "num_input_tokens_seen": 87561072, + "step": 13090 + }, + { + "epoch": 1.6043306483721786, + "grad_norm": 1.3348130861787946, + "learning_rate": 1.0322314561594343e-05, + "loss": 0.2034, + "num_input_tokens_seen": 87594680, + "step": 13095 + }, + { + "epoch": 1.6049431870386819, + "grad_norm": 1.360077696383161, + "learning_rate": 1.0291515618988012e-05, + "loss": 0.2406, + "num_input_tokens_seen": 87627616, + "step": 13100 + }, + { + "epoch": 1.6049431870386819, + "eval_loss": 0.08859918266534805, + "eval_runtime": 19.2468, + "eval_samples_per_second": 3.117, + "eval_steps_per_second": 0.779, + "num_input_tokens_seen": 87627616, + "step": 13100 + }, + { + "epoch": 1.6055557257051851, + "grad_norm": 1.5790190321924888, + "learning_rate": 1.0260757420737354e-05, + "loss": 0.1987, + "num_input_tokens_seen": 87660704, + "step": 13105 + }, + { + "epoch": 1.6061682643716884, + "grad_norm": 2.037321047547533, + "learning_rate": 1.0230039998402919e-05, + "loss": 0.2083, + "num_input_tokens_seen": 87694136, + "step": 13110 + }, + { + "epoch": 1.6067808030381918, + "grad_norm": 1.0599387109634815, + "learning_rate": 1.0199363383503464e-05, + "loss": 0.2125, + "num_input_tokens_seen": 87727504, + "step": 13115 + }, + { + "epoch": 1.607393341704695, + "grad_norm": 1.9868319317147352, + "learning_rate": 1.016872760751586e-05, + "loss": 0.2167, + "num_input_tokens_seen": 87760824, + "step": 13120 + }, + { + "epoch": 1.6080058803711985, + "grad_norm": 1.5831732058808419, + "learning_rate": 1.0138132701875103e-05, + "loss": 0.2319, + "num_input_tokens_seen": 87794008, + "step": 13125 + }, + { + "epoch": 1.6086184190377018, + "grad_norm": 1.399301641701743, + "learning_rate": 1.0107578697974179e-05, + "loss": 0.2051, + "num_input_tokens_seen": 87827672, + "step": 13130 + }, + { + "epoch": 1.609230957704205, + "grad_norm": 1.092180846073481, + "learning_rate": 1.0077065627164178e-05, + "loss": 0.2217, + "num_input_tokens_seen": 87861320, + "step": 13135 + }, + { + "epoch": 1.6098434963707084, + "grad_norm": 1.5771713577924222, + "learning_rate": 1.0046593520754171e-05, + "loss": 0.2425, + "num_input_tokens_seen": 87894304, + "step": 13140 + }, + { + "epoch": 1.6104560350372117, + "grad_norm": 1.1494909997480354, + "learning_rate": 1.001616241001116e-05, + "loss": 0.189, + "num_input_tokens_seen": 87928496, + "step": 13145 + }, + { + "epoch": 1.6110685737037151, + "grad_norm": 1.4295769779629215, + "learning_rate": 9.985772326160125e-06, + "loss": 0.2467, + "num_input_tokens_seen": 87961872, + "step": 13150 + }, + { + "epoch": 1.6110685737037151, + "eval_loss": 0.0699838325381279, + "eval_runtime": 19.3223, + "eval_samples_per_second": 3.105, + "eval_steps_per_second": 0.776, + "num_input_tokens_seen": 87961872, + "step": 13150 + }, + { + "epoch": 1.6116811123702184, + "grad_norm": 1.6247258089590593, + "learning_rate": 9.955423300383942e-06, + "loss": 0.2211, + "num_input_tokens_seen": 87994856, + "step": 13155 + }, + { + "epoch": 1.6122936510367216, + "grad_norm": 1.3669682885367231, + "learning_rate": 9.925115363823328e-06, + "loss": 0.176, + "num_input_tokens_seen": 88028832, + "step": 13160 + }, + { + "epoch": 1.6129061897032249, + "grad_norm": 1.620528157823805, + "learning_rate": 9.894848547576868e-06, + "loss": 0.2368, + "num_input_tokens_seen": 88062144, + "step": 13165 + }, + { + "epoch": 1.6135187283697283, + "grad_norm": 0.7991432301934884, + "learning_rate": 9.864622882700958e-06, + "loss": 0.155, + "num_input_tokens_seen": 88096560, + "step": 13170 + }, + { + "epoch": 1.6141312670362318, + "grad_norm": 1.1708008985867853, + "learning_rate": 9.834438400209733e-06, + "loss": 0.2433, + "num_input_tokens_seen": 88129968, + "step": 13175 + }, + { + "epoch": 1.614743805702735, + "grad_norm": 1.0693871817346872, + "learning_rate": 9.804295131075125e-06, + "loss": 0.2006, + "num_input_tokens_seen": 88163752, + "step": 13180 + }, + { + "epoch": 1.6153563443692383, + "grad_norm": 1.2548549963491762, + "learning_rate": 9.774193106226715e-06, + "loss": 0.2332, + "num_input_tokens_seen": 88197712, + "step": 13185 + }, + { + "epoch": 1.6159688830357415, + "grad_norm": 1.2892336575662289, + "learning_rate": 9.744132356551815e-06, + "loss": 0.1907, + "num_input_tokens_seen": 88231264, + "step": 13190 + }, + { + "epoch": 1.616581421702245, + "grad_norm": 1.505398882077996, + "learning_rate": 9.714112912895373e-06, + "loss": 0.2256, + "num_input_tokens_seen": 88264088, + "step": 13195 + }, + { + "epoch": 1.6171939603687484, + "grad_norm": 1.6847924452329517, + "learning_rate": 9.684134806059925e-06, + "loss": 0.2259, + "num_input_tokens_seen": 88297128, + "step": 13200 + }, + { + "epoch": 1.6171939603687484, + "eval_loss": 0.08684820681810379, + "eval_runtime": 19.5273, + "eval_samples_per_second": 3.073, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 88297128, + "step": 13200 + }, + { + "epoch": 1.6178064990352516, + "grad_norm": 1.0353341518932697, + "learning_rate": 9.65419806680563e-06, + "loss": 0.2348, + "num_input_tokens_seen": 88330432, + "step": 13205 + }, + { + "epoch": 1.6184190377017549, + "grad_norm": 1.0451823665075328, + "learning_rate": 9.624302725850187e-06, + "loss": 0.1765, + "num_input_tokens_seen": 88364576, + "step": 13210 + }, + { + "epoch": 1.6190315763682581, + "grad_norm": 1.3143861096552512, + "learning_rate": 9.5944488138688e-06, + "loss": 0.2151, + "num_input_tokens_seen": 88397304, + "step": 13215 + }, + { + "epoch": 1.6196441150347616, + "grad_norm": 1.4430030498577782, + "learning_rate": 9.564636361494178e-06, + "loss": 0.1647, + "num_input_tokens_seen": 88431768, + "step": 13220 + }, + { + "epoch": 1.620256653701265, + "grad_norm": 1.4355594750352865, + "learning_rate": 9.5348653993165e-06, + "loss": 0.1876, + "num_input_tokens_seen": 88465312, + "step": 13225 + }, + { + "epoch": 1.6208691923677683, + "grad_norm": 2.180666983454776, + "learning_rate": 9.505135957883365e-06, + "loss": 0.2052, + "num_input_tokens_seen": 88498456, + "step": 13230 + }, + { + "epoch": 1.6214817310342715, + "grad_norm": 1.0731217847810095, + "learning_rate": 9.475448067699739e-06, + "loss": 0.1931, + "num_input_tokens_seen": 88532112, + "step": 13235 + }, + { + "epoch": 1.6220942697007747, + "grad_norm": 1.2192311174498993, + "learning_rate": 9.44580175922799e-06, + "loss": 0.2242, + "num_input_tokens_seen": 88565560, + "step": 13240 + }, + { + "epoch": 1.6227068083672782, + "grad_norm": 2.0384763586548464, + "learning_rate": 9.416197062887815e-06, + "loss": 0.1775, + "num_input_tokens_seen": 88599824, + "step": 13245 + }, + { + "epoch": 1.6233193470337817, + "grad_norm": 1.517455500672446, + "learning_rate": 9.386634009056194e-06, + "loss": 0.2092, + "num_input_tokens_seen": 88633080, + "step": 13250 + }, + { + "epoch": 1.6233193470337817, + "eval_loss": 0.08336101472377777, + "eval_runtime": 19.1122, + "eval_samples_per_second": 3.139, + "eval_steps_per_second": 0.785, + "num_input_tokens_seen": 88633080, + "step": 13250 + }, + { + "epoch": 1.623931885700285, + "grad_norm": 1.3940308690750487, + "learning_rate": 9.357112628067399e-06, + "loss": 0.202, + "num_input_tokens_seen": 88666784, + "step": 13255 + }, + { + "epoch": 1.6245444243667881, + "grad_norm": 1.7018272322214598, + "learning_rate": 9.327632950212917e-06, + "loss": 0.1991, + "num_input_tokens_seen": 88700064, + "step": 13260 + }, + { + "epoch": 1.6251569630332914, + "grad_norm": 1.2372853465846674, + "learning_rate": 9.298195005741484e-06, + "loss": 0.1668, + "num_input_tokens_seen": 88734648, + "step": 13265 + }, + { + "epoch": 1.6257695016997948, + "grad_norm": 1.626083960406174, + "learning_rate": 9.268798824858954e-06, + "loss": 0.2213, + "num_input_tokens_seen": 88767416, + "step": 13270 + }, + { + "epoch": 1.626382040366298, + "grad_norm": 1.5110245001762672, + "learning_rate": 9.239444437728384e-06, + "loss": 0.2311, + "num_input_tokens_seen": 88800680, + "step": 13275 + }, + { + "epoch": 1.6269945790328015, + "grad_norm": 1.6503742741343286, + "learning_rate": 9.21013187446993e-06, + "loss": 0.2014, + "num_input_tokens_seen": 88834152, + "step": 13280 + }, + { + "epoch": 1.6276071176993048, + "grad_norm": 1.6249925303328252, + "learning_rate": 9.18086116516083e-06, + "loss": 0.1985, + "num_input_tokens_seen": 88868144, + "step": 13285 + }, + { + "epoch": 1.628219656365808, + "grad_norm": 1.5619282065219142, + "learning_rate": 9.15163233983536e-06, + "loss": 0.2323, + "num_input_tokens_seen": 88901064, + "step": 13290 + }, + { + "epoch": 1.6288321950323115, + "grad_norm": 1.3319523647378875, + "learning_rate": 9.122445428484844e-06, + "loss": 0.1814, + "num_input_tokens_seen": 88934848, + "step": 13295 + }, + { + "epoch": 1.6294447336988147, + "grad_norm": 1.5398428234420745, + "learning_rate": 9.093300461057602e-06, + "loss": 0.2285, + "num_input_tokens_seen": 88968232, + "step": 13300 + }, + { + "epoch": 1.6294447336988147, + "eval_loss": 0.13456711173057556, + "eval_runtime": 19.6104, + "eval_samples_per_second": 3.06, + "eval_steps_per_second": 0.765, + "num_input_tokens_seen": 88968232, + "step": 13300 + }, + { + "epoch": 1.6300572723653182, + "grad_norm": 1.8265511678597688, + "learning_rate": 9.06419746745888e-06, + "loss": 0.2156, + "num_input_tokens_seen": 89002208, + "step": 13305 + }, + { + "epoch": 1.6306698110318214, + "grad_norm": 0.9660303400907118, + "learning_rate": 9.035136477550887e-06, + "loss": 0.1972, + "num_input_tokens_seen": 89036088, + "step": 13310 + }, + { + "epoch": 1.6312823496983246, + "grad_norm": 1.0307799544712286, + "learning_rate": 9.00611752115273e-06, + "loss": 0.2018, + "num_input_tokens_seen": 89070144, + "step": 13315 + }, + { + "epoch": 1.6318948883648279, + "grad_norm": 2.368762506115885, + "learning_rate": 8.977140628040382e-06, + "loss": 0.2123, + "num_input_tokens_seen": 89103352, + "step": 13320 + }, + { + "epoch": 1.6325074270313313, + "grad_norm": 1.13738519594461, + "learning_rate": 8.948205827946637e-06, + "loss": 0.2224, + "num_input_tokens_seen": 89137296, + "step": 13325 + }, + { + "epoch": 1.6331199656978348, + "grad_norm": 1.3452944267041653, + "learning_rate": 8.919313150561131e-06, + "loss": 0.2518, + "num_input_tokens_seen": 89169968, + "step": 13330 + }, + { + "epoch": 1.633732504364338, + "grad_norm": 1.5519469370110663, + "learning_rate": 8.890462625530266e-06, + "loss": 0.2301, + "num_input_tokens_seen": 89202856, + "step": 13335 + }, + { + "epoch": 1.6343450430308413, + "grad_norm": 1.8652820453627195, + "learning_rate": 8.861654282457176e-06, + "loss": 0.1833, + "num_input_tokens_seen": 89236560, + "step": 13340 + }, + { + "epoch": 1.6349575816973445, + "grad_norm": 1.179596431963033, + "learning_rate": 8.832888150901713e-06, + "loss": 0.1836, + "num_input_tokens_seen": 89270712, + "step": 13345 + }, + { + "epoch": 1.635570120363848, + "grad_norm": 1.0858982897345437, + "learning_rate": 8.804164260380443e-06, + "loss": 0.1994, + "num_input_tokens_seen": 89304224, + "step": 13350 + }, + { + "epoch": 1.635570120363848, + "eval_loss": 0.14407266676425934, + "eval_runtime": 19.2536, + "eval_samples_per_second": 3.116, + "eval_steps_per_second": 0.779, + "num_input_tokens_seen": 89304224, + "step": 13350 + }, + { + "epoch": 1.6361826590303514, + "grad_norm": 1.0200628164384562, + "learning_rate": 8.775482640366583e-06, + "loss": 0.1972, + "num_input_tokens_seen": 89338352, + "step": 13355 + }, + { + "epoch": 1.6367951976968547, + "grad_norm": 1.3890711416538282, + "learning_rate": 8.746843320289944e-06, + "loss": 0.2029, + "num_input_tokens_seen": 89372176, + "step": 13360 + }, + { + "epoch": 1.637407736363358, + "grad_norm": 1.2949654120907006, + "learning_rate": 8.718246329536967e-06, + "loss": 0.2527, + "num_input_tokens_seen": 89405592, + "step": 13365 + }, + { + "epoch": 1.6380202750298611, + "grad_norm": 1.5527843452326633, + "learning_rate": 8.689691697450669e-06, + "loss": 0.2507, + "num_input_tokens_seen": 89439272, + "step": 13370 + }, + { + "epoch": 1.6386328136963646, + "grad_norm": 1.2728371605858781, + "learning_rate": 8.661179453330553e-06, + "loss": 0.2086, + "num_input_tokens_seen": 89473016, + "step": 13375 + }, + { + "epoch": 1.639245352362868, + "grad_norm": 1.216740852849905, + "learning_rate": 8.632709626432672e-06, + "loss": 0.2259, + "num_input_tokens_seen": 89506200, + "step": 13380 + }, + { + "epoch": 1.6398578910293713, + "grad_norm": 1.4638941146078162, + "learning_rate": 8.604282245969548e-06, + "loss": 0.1939, + "num_input_tokens_seen": 89539440, + "step": 13385 + }, + { + "epoch": 1.6404704296958745, + "grad_norm": 1.400639444734436, + "learning_rate": 8.575897341110145e-06, + "loss": 0.2199, + "num_input_tokens_seen": 89572368, + "step": 13390 + }, + { + "epoch": 1.6410829683623778, + "grad_norm": 1.7394615137668104, + "learning_rate": 8.54755494097983e-06, + "loss": 0.1917, + "num_input_tokens_seen": 89606416, + "step": 13395 + }, + { + "epoch": 1.6416955070288812, + "grad_norm": 1.5132311423136349, + "learning_rate": 8.51925507466037e-06, + "loss": 0.2202, + "num_input_tokens_seen": 89640800, + "step": 13400 + }, + { + "epoch": 1.6416955070288812, + "eval_loss": 0.12518566846847534, + "eval_runtime": 20.2312, + "eval_samples_per_second": 2.966, + "eval_steps_per_second": 0.741, + "num_input_tokens_seen": 89640800, + "step": 13400 + }, + { + "epoch": 1.6423080456953845, + "grad_norm": 1.7959801559594981, + "learning_rate": 8.490997771189907e-06, + "loss": 0.2142, + "num_input_tokens_seen": 89674616, + "step": 13405 + }, + { + "epoch": 1.642920584361888, + "grad_norm": 0.8548932776072425, + "learning_rate": 8.462783059562862e-06, + "loss": 0.2115, + "num_input_tokens_seen": 89708056, + "step": 13410 + }, + { + "epoch": 1.6435331230283912, + "grad_norm": 1.3033802997710071, + "learning_rate": 8.434610968730006e-06, + "loss": 0.1682, + "num_input_tokens_seen": 89742072, + "step": 13415 + }, + { + "epoch": 1.6441456616948944, + "grad_norm": 1.249324437290454, + "learning_rate": 8.406481527598325e-06, + "loss": 0.2267, + "num_input_tokens_seen": 89774920, + "step": 13420 + }, + { + "epoch": 1.6447582003613979, + "grad_norm": 1.1249534676606747, + "learning_rate": 8.378394765031106e-06, + "loss": 0.1916, + "num_input_tokens_seen": 89808840, + "step": 13425 + }, + { + "epoch": 1.645370739027901, + "grad_norm": 0.7999883033647595, + "learning_rate": 8.350350709847764e-06, + "loss": 0.2105, + "num_input_tokens_seen": 89842896, + "step": 13430 + }, + { + "epoch": 1.6459832776944046, + "grad_norm": 1.5382355883135947, + "learning_rate": 8.322349390823969e-06, + "loss": 0.1993, + "num_input_tokens_seen": 89876832, + "step": 13435 + }, + { + "epoch": 1.6465958163609078, + "grad_norm": 1.794103947552637, + "learning_rate": 8.294390836691496e-06, + "loss": 0.2305, + "num_input_tokens_seen": 89910688, + "step": 13440 + }, + { + "epoch": 1.647208355027411, + "grad_norm": 1.5315252971957132, + "learning_rate": 8.266475076138263e-06, + "loss": 0.1883, + "num_input_tokens_seen": 89944992, + "step": 13445 + }, + { + "epoch": 1.6478208936939143, + "grad_norm": 0.8288377274336945, + "learning_rate": 8.238602137808249e-06, + "loss": 0.1682, + "num_input_tokens_seen": 89978672, + "step": 13450 + }, + { + "epoch": 1.6478208936939143, + "eval_loss": 0.12995800375938416, + "eval_runtime": 19.4974, + "eval_samples_per_second": 3.077, + "eval_steps_per_second": 0.769, + "num_input_tokens_seen": 89978672, + "step": 13450 + }, + { + "epoch": 1.6484334323604177, + "grad_norm": 1.5479238112012856, + "learning_rate": 8.210772050301519e-06, + "loss": 0.1763, + "num_input_tokens_seen": 90012432, + "step": 13455 + }, + { + "epoch": 1.6490459710269212, + "grad_norm": 1.7155933241999692, + "learning_rate": 8.182984842174174e-06, + "loss": 0.1827, + "num_input_tokens_seen": 90046184, + "step": 13460 + }, + { + "epoch": 1.6496585096934244, + "grad_norm": 1.4420152295603887, + "learning_rate": 8.155240541938275e-06, + "loss": 0.2052, + "num_input_tokens_seen": 90080280, + "step": 13465 + }, + { + "epoch": 1.6502710483599277, + "grad_norm": 1.574643091034377, + "learning_rate": 8.127539178061906e-06, + "loss": 0.229, + "num_input_tokens_seen": 90113024, + "step": 13470 + }, + { + "epoch": 1.650883587026431, + "grad_norm": 1.547455823429606, + "learning_rate": 8.099880778969066e-06, + "loss": 0.2731, + "num_input_tokens_seen": 90146384, + "step": 13475 + }, + { + "epoch": 1.6514961256929344, + "grad_norm": 1.635289302132703, + "learning_rate": 8.07226537303969e-06, + "loss": 0.2132, + "num_input_tokens_seen": 90179920, + "step": 13480 + }, + { + "epoch": 1.6521086643594378, + "grad_norm": 1.4332324749683067, + "learning_rate": 8.044692988609565e-06, + "loss": 0.2227, + "num_input_tokens_seen": 90213008, + "step": 13485 + }, + { + "epoch": 1.652721203025941, + "grad_norm": 1.5735636367474202, + "learning_rate": 8.017163653970361e-06, + "loss": 0.2084, + "num_input_tokens_seen": 90246872, + "step": 13490 + }, + { + "epoch": 1.6533337416924443, + "grad_norm": 1.1087817623370912, + "learning_rate": 7.989677397369577e-06, + "loss": 0.2222, + "num_input_tokens_seen": 90280160, + "step": 13495 + }, + { + "epoch": 1.6539462803589475, + "grad_norm": 1.4236043681039197, + "learning_rate": 7.962234247010497e-06, + "loss": 0.1801, + "num_input_tokens_seen": 90313896, + "step": 13500 + }, + { + "epoch": 1.6539462803589475, + "eval_loss": 0.11713992059230804, + "eval_runtime": 19.5904, + "eval_samples_per_second": 3.063, + "eval_steps_per_second": 0.766, + "num_input_tokens_seen": 90313896, + "step": 13500 + }, + { + "epoch": 1.654558819025451, + "grad_norm": 1.924507013876458, + "learning_rate": 7.934834231052157e-06, + "loss": 0.1951, + "num_input_tokens_seen": 90347776, + "step": 13505 + }, + { + "epoch": 1.6551713576919544, + "grad_norm": 1.3948662364954707, + "learning_rate": 7.907477377609379e-06, + "loss": 0.2118, + "num_input_tokens_seen": 90381544, + "step": 13510 + }, + { + "epoch": 1.6557838963584577, + "grad_norm": 1.5270509460128088, + "learning_rate": 7.880163714752669e-06, + "loss": 0.1658, + "num_input_tokens_seen": 90415952, + "step": 13515 + }, + { + "epoch": 1.656396435024961, + "grad_norm": 22.100239439888032, + "learning_rate": 7.8528932705082e-06, + "loss": 0.21, + "num_input_tokens_seen": 90449864, + "step": 13520 + }, + { + "epoch": 1.6570089736914642, + "grad_norm": 1.204832821186101, + "learning_rate": 7.825666072857834e-06, + "loss": 0.2054, + "num_input_tokens_seen": 90483936, + "step": 13525 + }, + { + "epoch": 1.6576215123579676, + "grad_norm": 1.6528319634526027, + "learning_rate": 7.798482149739051e-06, + "loss": 0.2316, + "num_input_tokens_seen": 90516976, + "step": 13530 + }, + { + "epoch": 1.658234051024471, + "grad_norm": 1.4876135190407918, + "learning_rate": 7.771341529044895e-06, + "loss": 0.2072, + "num_input_tokens_seen": 90550136, + "step": 13535 + }, + { + "epoch": 1.6588465896909743, + "grad_norm": 1.5960337062294825, + "learning_rate": 7.74424423862401e-06, + "loss": 0.2168, + "num_input_tokens_seen": 90583032, + "step": 13540 + }, + { + "epoch": 1.6594591283574776, + "grad_norm": 1.8751793226456541, + "learning_rate": 7.717190306280575e-06, + "loss": 0.2199, + "num_input_tokens_seen": 90617000, + "step": 13545 + }, + { + "epoch": 1.6600716670239808, + "grad_norm": 2.36864423832101, + "learning_rate": 7.690179759774285e-06, + "loss": 0.2621, + "num_input_tokens_seen": 90650056, + "step": 13550 + }, + { + "epoch": 1.6600716670239808, + "eval_loss": 0.09069520980119705, + "eval_runtime": 19.3516, + "eval_samples_per_second": 3.101, + "eval_steps_per_second": 0.775, + "num_input_tokens_seen": 90650056, + "step": 13550 + }, + { + "epoch": 1.6606842056904842, + "grad_norm": 1.2931412238625248, + "learning_rate": 7.663212626820287e-06, + "loss": 0.1932, + "num_input_tokens_seen": 90684496, + "step": 13555 + }, + { + "epoch": 1.6612967443569875, + "grad_norm": 1.4795308900091226, + "learning_rate": 7.636288935089214e-06, + "loss": 0.218, + "num_input_tokens_seen": 90717064, + "step": 13560 + }, + { + "epoch": 1.661909283023491, + "grad_norm": 1.490374996545998, + "learning_rate": 7.609408712207122e-06, + "loss": 0.2356, + "num_input_tokens_seen": 90750248, + "step": 13565 + }, + { + "epoch": 1.6625218216899942, + "grad_norm": 1.700946022567145, + "learning_rate": 7.582571985755433e-06, + "loss": 0.2164, + "num_input_tokens_seen": 90783952, + "step": 13570 + }, + { + "epoch": 1.6631343603564974, + "grad_norm": 1.2409271303584792, + "learning_rate": 7.555778783270984e-06, + "loss": 0.2227, + "num_input_tokens_seen": 90817168, + "step": 13575 + }, + { + "epoch": 1.6637468990230009, + "grad_norm": 1.1501761982446215, + "learning_rate": 7.529029132245907e-06, + "loss": 0.2205, + "num_input_tokens_seen": 90851120, + "step": 13580 + }, + { + "epoch": 1.6643594376895041, + "grad_norm": 1.6656153097147295, + "learning_rate": 7.502323060127697e-06, + "loss": 0.2159, + "num_input_tokens_seen": 90884568, + "step": 13585 + }, + { + "epoch": 1.6649719763560076, + "grad_norm": 1.7659825265557563, + "learning_rate": 7.4756605943190725e-06, + "loss": 0.2444, + "num_input_tokens_seen": 90917968, + "step": 13590 + }, + { + "epoch": 1.6655845150225108, + "grad_norm": 1.2983267314489475, + "learning_rate": 7.4490417621780495e-06, + "loss": 0.2165, + "num_input_tokens_seen": 90951304, + "step": 13595 + }, + { + "epoch": 1.666197053689014, + "grad_norm": 1.9555822132189395, + "learning_rate": 7.422466591017879e-06, + "loss": 0.2048, + "num_input_tokens_seen": 90985192, + "step": 13600 + }, + { + "epoch": 1.666197053689014, + "eval_loss": 0.09095155447721481, + "eval_runtime": 19.4773, + "eval_samples_per_second": 3.081, + "eval_steps_per_second": 0.77, + "num_input_tokens_seen": 90985192, + "step": 13600 + }, + { + "epoch": 1.6668095923555173, + "grad_norm": 1.065495611495425, + "learning_rate": 7.395935108106988e-06, + "loss": 0.1976, + "num_input_tokens_seen": 91018688, + "step": 13605 + }, + { + "epoch": 1.6674221310220207, + "grad_norm": 1.6470467371700097, + "learning_rate": 7.369447340668978e-06, + "loss": 0.2054, + "num_input_tokens_seen": 91052632, + "step": 13610 + }, + { + "epoch": 1.6680346696885242, + "grad_norm": 1.5693489409130392, + "learning_rate": 7.3430033158825966e-06, + "loss": 0.2193, + "num_input_tokens_seen": 91085408, + "step": 13615 + }, + { + "epoch": 1.6686472083550274, + "grad_norm": 1.9894481149106535, + "learning_rate": 7.3166030608817385e-06, + "loss": 0.2208, + "num_input_tokens_seen": 91118408, + "step": 13620 + }, + { + "epoch": 1.6692597470215307, + "grad_norm": 1.506658069629576, + "learning_rate": 7.290246602755318e-06, + "loss": 0.2142, + "num_input_tokens_seen": 91151832, + "step": 13625 + }, + { + "epoch": 1.669872285688034, + "grad_norm": 2.1428518132028276, + "learning_rate": 7.263933968547382e-06, + "loss": 0.2356, + "num_input_tokens_seen": 91185240, + "step": 13630 + }, + { + "epoch": 1.6704848243545374, + "grad_norm": 1.3350095998363023, + "learning_rate": 7.237665185256964e-06, + "loss": 0.2047, + "num_input_tokens_seen": 91218944, + "step": 13635 + }, + { + "epoch": 1.6710973630210408, + "grad_norm": 1.1178358233865395, + "learning_rate": 7.2114402798381385e-06, + "loss": 0.1918, + "num_input_tokens_seen": 91252776, + "step": 13640 + }, + { + "epoch": 1.671709901687544, + "grad_norm": 1.5427567066029229, + "learning_rate": 7.185259279199913e-06, + "loss": 0.2431, + "num_input_tokens_seen": 91286232, + "step": 13645 + }, + { + "epoch": 1.6723224403540473, + "grad_norm": 1.7496475090655237, + "learning_rate": 7.159122210206276e-06, + "loss": 0.205, + "num_input_tokens_seen": 91319000, + "step": 13650 + }, + { + "epoch": 1.6723224403540473, + "eval_loss": 0.07880138605833054, + "eval_runtime": 20.1182, + "eval_samples_per_second": 2.982, + "eval_steps_per_second": 0.746, + "num_input_tokens_seen": 91319000, + "step": 13650 + }, + { + "epoch": 1.6729349790205505, + "grad_norm": 1.7684111383401413, + "learning_rate": 7.1330290996761524e-06, + "loss": 0.2391, + "num_input_tokens_seen": 91352368, + "step": 13655 + }, + { + "epoch": 1.673547517687054, + "grad_norm": 1.4656082361237002, + "learning_rate": 7.106979974383327e-06, + "loss": 0.2211, + "num_input_tokens_seen": 91385488, + "step": 13660 + }, + { + "epoch": 1.6741600563535575, + "grad_norm": 1.4045801828241775, + "learning_rate": 7.080974861056444e-06, + "loss": 0.2213, + "num_input_tokens_seen": 91418536, + "step": 13665 + }, + { + "epoch": 1.6747725950200607, + "grad_norm": 1.18680243837689, + "learning_rate": 7.055013786379034e-06, + "loss": 0.1736, + "num_input_tokens_seen": 91452184, + "step": 13670 + }, + { + "epoch": 1.675385133686564, + "grad_norm": 1.5214384621124237, + "learning_rate": 7.029096776989425e-06, + "loss": 0.2208, + "num_input_tokens_seen": 91485488, + "step": 13675 + }, + { + "epoch": 1.6759976723530672, + "grad_norm": 1.4293614913184545, + "learning_rate": 7.00322385948069e-06, + "loss": 0.1727, + "num_input_tokens_seen": 91519048, + "step": 13680 + }, + { + "epoch": 1.6766102110195706, + "grad_norm": 1.8009393964077665, + "learning_rate": 6.97739506040071e-06, + "loss": 0.2349, + "num_input_tokens_seen": 91552104, + "step": 13685 + }, + { + "epoch": 1.677222749686074, + "grad_norm": 1.6399267172130925, + "learning_rate": 6.951610406252085e-06, + "loss": 0.195, + "num_input_tokens_seen": 91585608, + "step": 13690 + }, + { + "epoch": 1.6778352883525773, + "grad_norm": 2.11883752298644, + "learning_rate": 6.925869923492084e-06, + "loss": 0.2421, + "num_input_tokens_seen": 91618816, + "step": 13695 + }, + { + "epoch": 1.6784478270190806, + "grad_norm": 52.003194710517754, + "learning_rate": 6.900173638532703e-06, + "loss": 0.2343, + "num_input_tokens_seen": 91652896, + "step": 13700 + }, + { + "epoch": 1.6784478270190806, + "eval_loss": 0.10617480427026749, + "eval_runtime": 19.7167, + "eval_samples_per_second": 3.043, + "eval_steps_per_second": 0.761, + "num_input_tokens_seen": 91652896, + "step": 13700 + }, + { + "epoch": 1.6790603656855838, + "grad_norm": 1.4738931958462471, + "learning_rate": 6.874521577740556e-06, + "loss": 0.2294, + "num_input_tokens_seen": 91686192, + "step": 13705 + }, + { + "epoch": 1.6796729043520873, + "grad_norm": 1.376047339875712, + "learning_rate": 6.848913767436893e-06, + "loss": 0.2198, + "num_input_tokens_seen": 91719400, + "step": 13710 + }, + { + "epoch": 1.6802854430185905, + "grad_norm": 1.763755369950917, + "learning_rate": 6.823350233897541e-06, + "loss": 0.1973, + "num_input_tokens_seen": 91753776, + "step": 13715 + }, + { + "epoch": 1.680897981685094, + "grad_norm": 1.1362970980220057, + "learning_rate": 6.7978310033529056e-06, + "loss": 0.2014, + "num_input_tokens_seen": 91787632, + "step": 13720 + }, + { + "epoch": 1.6815105203515972, + "grad_norm": 1.6164363424024477, + "learning_rate": 6.7723561019879585e-06, + "loss": 0.1972, + "num_input_tokens_seen": 91821520, + "step": 13725 + }, + { + "epoch": 1.6821230590181004, + "grad_norm": 1.7431243500224902, + "learning_rate": 6.7469255559421295e-06, + "loss": 0.2423, + "num_input_tokens_seen": 91854952, + "step": 13730 + }, + { + "epoch": 1.6827355976846037, + "grad_norm": 1.3812634952744447, + "learning_rate": 6.721539391309389e-06, + "loss": 0.1949, + "num_input_tokens_seen": 91888520, + "step": 13735 + }, + { + "epoch": 1.6833481363511071, + "grad_norm": 1.6315526711206583, + "learning_rate": 6.696197634138152e-06, + "loss": 0.2018, + "num_input_tokens_seen": 91921920, + "step": 13740 + }, + { + "epoch": 1.6839606750176106, + "grad_norm": 2.046266771001651, + "learning_rate": 6.670900310431255e-06, + "loss": 0.177, + "num_input_tokens_seen": 91955424, + "step": 13745 + }, + { + "epoch": 1.6845732136841138, + "grad_norm": 1.7377370236341245, + "learning_rate": 6.645647446145942e-06, + "loss": 0.2717, + "num_input_tokens_seen": 91988016, + "step": 13750 + }, + { + "epoch": 1.6845732136841138, + "eval_loss": 0.05082106962800026, + "eval_runtime": 20.1205, + "eval_samples_per_second": 2.982, + "eval_steps_per_second": 0.746, + "num_input_tokens_seen": 91988016, + "step": 13750 + }, + { + "epoch": 1.685185752350617, + "grad_norm": 1.851060577442212, + "learning_rate": 6.620439067193857e-06, + "loss": 0.2036, + "num_input_tokens_seen": 92021256, + "step": 13755 + }, + { + "epoch": 1.6857982910171203, + "grad_norm": 1.2408502778981032, + "learning_rate": 6.595275199440981e-06, + "loss": 0.2085, + "num_input_tokens_seen": 92054640, + "step": 13760 + }, + { + "epoch": 1.6864108296836238, + "grad_norm": 1.695886377191295, + "learning_rate": 6.570155868707645e-06, + "loss": 0.1851, + "num_input_tokens_seen": 92088368, + "step": 13765 + }, + { + "epoch": 1.6870233683501272, + "grad_norm": 1.9198010682262974, + "learning_rate": 6.545081100768441e-06, + "loss": 0.2536, + "num_input_tokens_seen": 92121792, + "step": 13770 + }, + { + "epoch": 1.6876359070166305, + "grad_norm": 1.4641961735401525, + "learning_rate": 6.520050921352272e-06, + "loss": 0.1916, + "num_input_tokens_seen": 92154800, + "step": 13775 + }, + { + "epoch": 1.6882484456831337, + "grad_norm": 1.2319366167757715, + "learning_rate": 6.495065356142294e-06, + "loss": 0.1581, + "num_input_tokens_seen": 92189080, + "step": 13780 + }, + { + "epoch": 1.688860984349637, + "grad_norm": 1.6255029534974728, + "learning_rate": 6.4701244307758364e-06, + "loss": 0.1999, + "num_input_tokens_seen": 92221832, + "step": 13785 + }, + { + "epoch": 1.6894735230161404, + "grad_norm": 2.2012397147419898, + "learning_rate": 6.44522817084447e-06, + "loss": 0.1753, + "num_input_tokens_seen": 92256128, + "step": 13790 + }, + { + "epoch": 1.6900860616826439, + "grad_norm": 0.8727960244408192, + "learning_rate": 6.4203766018939235e-06, + "loss": 0.2252, + "num_input_tokens_seen": 92290008, + "step": 13795 + }, + { + "epoch": 1.690698600349147, + "grad_norm": 1.5136212401769784, + "learning_rate": 6.395569749424079e-06, + "loss": 0.2281, + "num_input_tokens_seen": 92323496, + "step": 13800 + }, + { + "epoch": 1.690698600349147, + "eval_loss": 0.05405490845441818, + "eval_runtime": 19.5676, + "eval_samples_per_second": 3.066, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 92323496, + "step": 13800 + }, + { + "epoch": 1.6913111390156503, + "grad_norm": 1.7555135689269017, + "learning_rate": 6.370807638888898e-06, + "loss": 0.2022, + "num_input_tokens_seen": 92356752, + "step": 13805 + }, + { + "epoch": 1.6919236776821536, + "grad_norm": 1.5620758988199406, + "learning_rate": 6.346090295696466e-06, + "loss": 0.2124, + "num_input_tokens_seen": 92390136, + "step": 13810 + }, + { + "epoch": 1.692536216348657, + "grad_norm": 1.9986075156406748, + "learning_rate": 6.321417745208941e-06, + "loss": 0.2198, + "num_input_tokens_seen": 92423496, + "step": 13815 + }, + { + "epoch": 1.6931487550151605, + "grad_norm": 1.101608144891109, + "learning_rate": 6.296790012742493e-06, + "loss": 0.1615, + "num_input_tokens_seen": 92457784, + "step": 13820 + }, + { + "epoch": 1.6937612936816637, + "grad_norm": 1.3224239320596378, + "learning_rate": 6.272207123567297e-06, + "loss": 0.2166, + "num_input_tokens_seen": 92490568, + "step": 13825 + }, + { + "epoch": 1.694373832348167, + "grad_norm": 1.3905244529145964, + "learning_rate": 6.2476691029075495e-06, + "loss": 0.2214, + "num_input_tokens_seen": 92523880, + "step": 13830 + }, + { + "epoch": 1.6949863710146702, + "grad_norm": 2.5800594788140128, + "learning_rate": 6.223175975941409e-06, + "loss": 0.237, + "num_input_tokens_seen": 92556928, + "step": 13835 + }, + { + "epoch": 1.6955989096811737, + "grad_norm": 1.0934317997892002, + "learning_rate": 6.198727767800921e-06, + "loss": 0.2038, + "num_input_tokens_seen": 92590288, + "step": 13840 + }, + { + "epoch": 1.696211448347677, + "grad_norm": 1.2664202030272123, + "learning_rate": 6.174324503572088e-06, + "loss": 0.2175, + "num_input_tokens_seen": 92624344, + "step": 13845 + }, + { + "epoch": 1.6968239870141804, + "grad_norm": 1.18741836278935, + "learning_rate": 6.1499662082947774e-06, + "loss": 0.228, + "num_input_tokens_seen": 92657456, + "step": 13850 + }, + { + "epoch": 1.6968239870141804, + "eval_loss": 0.09495604783296585, + "eval_runtime": 19.5443, + "eval_samples_per_second": 3.07, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 92657456, + "step": 13850 + }, + { + "epoch": 1.6974365256806836, + "grad_norm": 1.5710538932487879, + "learning_rate": 6.1256529069627255e-06, + "loss": 0.2283, + "num_input_tokens_seen": 92690832, + "step": 13855 + }, + { + "epoch": 1.6980490643471868, + "grad_norm": 1.9139213058337994, + "learning_rate": 6.101384624523476e-06, + "loss": 0.1816, + "num_input_tokens_seen": 92725184, + "step": 13860 + }, + { + "epoch": 1.6986616030136903, + "grad_norm": 1.579648169723815, + "learning_rate": 6.0771613858784045e-06, + "loss": 0.1779, + "num_input_tokens_seen": 92759376, + "step": 13865 + }, + { + "epoch": 1.6992741416801935, + "grad_norm": 1.2224566448379628, + "learning_rate": 6.0529832158826686e-06, + "loss": 0.1943, + "num_input_tokens_seen": 92793144, + "step": 13870 + }, + { + "epoch": 1.699886680346697, + "grad_norm": 1.3688403589131206, + "learning_rate": 6.0288501393451546e-06, + "loss": 0.203, + "num_input_tokens_seen": 92826632, + "step": 13875 + }, + { + "epoch": 1.7004992190132002, + "grad_norm": 1.153917804965399, + "learning_rate": 6.004762181028495e-06, + "loss": 0.2273, + "num_input_tokens_seen": 92860552, + "step": 13880 + }, + { + "epoch": 1.7011117576797035, + "grad_norm": 1.8611330192173607, + "learning_rate": 5.980719365649051e-06, + "loss": 0.2133, + "num_input_tokens_seen": 92894088, + "step": 13885 + }, + { + "epoch": 1.7017242963462067, + "grad_norm": 1.7056270223853014, + "learning_rate": 5.956721717876817e-06, + "loss": 0.1898, + "num_input_tokens_seen": 92927200, + "step": 13890 + }, + { + "epoch": 1.7023368350127102, + "grad_norm": 1.4971923892220476, + "learning_rate": 5.932769262335475e-06, + "loss": 0.227, + "num_input_tokens_seen": 92960976, + "step": 13895 + }, + { + "epoch": 1.7029493736792136, + "grad_norm": 1.330260193370982, + "learning_rate": 5.90886202360233e-06, + "loss": 0.1996, + "num_input_tokens_seen": 92994920, + "step": 13900 + }, + { + "epoch": 1.7029493736792136, + "eval_loss": 0.08521660417318344, + "eval_runtime": 20.0268, + "eval_samples_per_second": 2.996, + "eval_steps_per_second": 0.749, + "num_input_tokens_seen": 92994920, + "step": 13900 + }, + { + "epoch": 1.7035619123457169, + "grad_norm": 1.547522134747003, + "learning_rate": 5.885000026208287e-06, + "loss": 0.2174, + "num_input_tokens_seen": 93027872, + "step": 13905 + }, + { + "epoch": 1.70417445101222, + "grad_norm": 1.6097447935287728, + "learning_rate": 5.861183294637806e-06, + "loss": 0.2163, + "num_input_tokens_seen": 93061384, + "step": 13910 + }, + { + "epoch": 1.7047869896787233, + "grad_norm": 1.9143139416054267, + "learning_rate": 5.837411853328944e-06, + "loss": 0.234, + "num_input_tokens_seen": 93094288, + "step": 13915 + }, + { + "epoch": 1.7053995283452268, + "grad_norm": 1.9116226966689738, + "learning_rate": 5.8136857266732625e-06, + "loss": 0.2196, + "num_input_tokens_seen": 93127288, + "step": 13920 + }, + { + "epoch": 1.7060120670117302, + "grad_norm": 1.1925970900971574, + "learning_rate": 5.790004939015842e-06, + "loss": 0.1984, + "num_input_tokens_seen": 93161032, + "step": 13925 + }, + { + "epoch": 1.7066246056782335, + "grad_norm": 1.3972226852737553, + "learning_rate": 5.766369514655201e-06, + "loss": 0.2261, + "num_input_tokens_seen": 93194120, + "step": 13930 + }, + { + "epoch": 1.7072371443447367, + "grad_norm": 1.2122295345065377, + "learning_rate": 5.742779477843352e-06, + "loss": 0.1541, + "num_input_tokens_seen": 93228512, + "step": 13935 + }, + { + "epoch": 1.70784968301124, + "grad_norm": 1.4298196898561482, + "learning_rate": 5.7192348527857374e-06, + "loss": 0.2392, + "num_input_tokens_seen": 93261680, + "step": 13940 + }, + { + "epoch": 1.7084622216777434, + "grad_norm": 1.4153754964136578, + "learning_rate": 5.6957356636411606e-06, + "loss": 0.1866, + "num_input_tokens_seen": 93295648, + "step": 13945 + }, + { + "epoch": 1.7090747603442469, + "grad_norm": 1.3572669037731744, + "learning_rate": 5.67228193452185e-06, + "loss": 0.2211, + "num_input_tokens_seen": 93328616, + "step": 13950 + }, + { + "epoch": 1.7090747603442469, + "eval_loss": 0.10547046363353729, + "eval_runtime": 19.7382, + "eval_samples_per_second": 3.04, + "eval_steps_per_second": 0.76, + "num_input_tokens_seen": 93328616, + "step": 13950 + }, + { + "epoch": 1.7096872990107501, + "grad_norm": 1.7382636330602763, + "learning_rate": 5.648873689493367e-06, + "loss": 0.1962, + "num_input_tokens_seen": 93362448, + "step": 13955 + }, + { + "epoch": 1.7102998376772534, + "grad_norm": 1.5576561393383597, + "learning_rate": 5.625510952574614e-06, + "loss": 0.1886, + "num_input_tokens_seen": 93396232, + "step": 13960 + }, + { + "epoch": 1.7109123763437566, + "grad_norm": 1.8230458247058678, + "learning_rate": 5.602193747737766e-06, + "loss": 0.2069, + "num_input_tokens_seen": 93429472, + "step": 13965 + }, + { + "epoch": 1.71152491501026, + "grad_norm": 1.20946124274513, + "learning_rate": 5.578922098908313e-06, + "loss": 0.2115, + "num_input_tokens_seen": 93463024, + "step": 13970 + }, + { + "epoch": 1.7121374536767635, + "grad_norm": 1.7142307843784435, + "learning_rate": 5.555696029965008e-06, + "loss": 0.2147, + "num_input_tokens_seen": 93496712, + "step": 13975 + }, + { + "epoch": 1.7127499923432667, + "grad_norm": 1.6357872242980998, + "learning_rate": 5.532515564739782e-06, + "loss": 0.1828, + "num_input_tokens_seen": 93530696, + "step": 13980 + }, + { + "epoch": 1.71336253100977, + "grad_norm": 1.3013793174857207, + "learning_rate": 5.509380727017838e-06, + "loss": 0.2053, + "num_input_tokens_seen": 93564600, + "step": 13985 + }, + { + "epoch": 1.7139750696762732, + "grad_norm": 1.0994041317477732, + "learning_rate": 5.486291540537502e-06, + "loss": 0.2141, + "num_input_tokens_seen": 93598104, + "step": 13990 + }, + { + "epoch": 1.7145876083427767, + "grad_norm": 1.4388707582049916, + "learning_rate": 5.463248028990314e-06, + "loss": 0.209, + "num_input_tokens_seen": 93632056, + "step": 13995 + }, + { + "epoch": 1.71520014700928, + "grad_norm": 1.6259591924734165, + "learning_rate": 5.440250216020892e-06, + "loss": 0.1834, + "num_input_tokens_seen": 93666120, + "step": 14000 + }, + { + "epoch": 1.71520014700928, + "eval_loss": 0.11137460172176361, + "eval_runtime": 46.6699, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.321, + "num_input_tokens_seen": 93666120, + "step": 14000 + }, + { + "epoch": 1.7158126856757834, + "grad_norm": 1.5559204519365208, + "learning_rate": 5.417298125227005e-06, + "loss": 0.2119, + "num_input_tokens_seen": 93699104, + "step": 14005 + }, + { + "epoch": 1.7164252243422866, + "grad_norm": 0.9520475258611504, + "learning_rate": 5.394391780159486e-06, + "loss": 0.1734, + "num_input_tokens_seen": 93733056, + "step": 14010 + }, + { + "epoch": 1.7170377630087899, + "grad_norm": 1.5454358757049043, + "learning_rate": 5.371531204322256e-06, + "loss": 0.2144, + "num_input_tokens_seen": 93766344, + "step": 14015 + }, + { + "epoch": 1.717650301675293, + "grad_norm": 1.5235004236260346, + "learning_rate": 5.348716421172228e-06, + "loss": 0.2057, + "num_input_tokens_seen": 93799864, + "step": 14020 + }, + { + "epoch": 1.7182628403417965, + "grad_norm": 1.3229974090119894, + "learning_rate": 5.325947454119357e-06, + "loss": 0.2091, + "num_input_tokens_seen": 93834040, + "step": 14025 + }, + { + "epoch": 1.7188753790083, + "grad_norm": 1.3760006145069847, + "learning_rate": 5.303224326526596e-06, + "loss": 0.2606, + "num_input_tokens_seen": 93866848, + "step": 14030 + }, + { + "epoch": 1.7194879176748032, + "grad_norm": 1.4106357969230527, + "learning_rate": 5.280547061709829e-06, + "loss": 0.229, + "num_input_tokens_seen": 93900664, + "step": 14035 + }, + { + "epoch": 1.7201004563413065, + "grad_norm": 1.3891198003832192, + "learning_rate": 5.257915682937914e-06, + "loss": 0.1793, + "num_input_tokens_seen": 93934912, + "step": 14040 + }, + { + "epoch": 1.7207129950078097, + "grad_norm": 1.2741777779293801, + "learning_rate": 5.235330213432615e-06, + "loss": 0.217, + "num_input_tokens_seen": 93968800, + "step": 14045 + }, + { + "epoch": 1.7213255336743132, + "grad_norm": 1.2346393044822073, + "learning_rate": 5.212790676368568e-06, + "loss": 0.2246, + "num_input_tokens_seen": 94002480, + "step": 14050 + }, + { + "epoch": 1.7213255336743132, + "eval_loss": 0.10589968413114548, + "eval_runtime": 19.165, + "eval_samples_per_second": 3.131, + "eval_steps_per_second": 0.783, + "num_input_tokens_seen": 94002480, + "step": 14050 + }, + { + "epoch": 1.7219380723408166, + "grad_norm": 1.6356622083113634, + "learning_rate": 5.190297094873309e-06, + "loss": 0.2239, + "num_input_tokens_seen": 94036472, + "step": 14055 + }, + { + "epoch": 1.7225506110073199, + "grad_norm": 1.47573399543773, + "learning_rate": 5.16784949202721e-06, + "loss": 0.1969, + "num_input_tokens_seen": 94070568, + "step": 14060 + }, + { + "epoch": 1.7231631496738231, + "grad_norm": 1.1257115197885919, + "learning_rate": 5.145447890863475e-06, + "loss": 0.2337, + "num_input_tokens_seen": 94103832, + "step": 14065 + }, + { + "epoch": 1.7237756883403264, + "grad_norm": 0.921028757982457, + "learning_rate": 5.123092314368061e-06, + "loss": 0.2054, + "num_input_tokens_seen": 94137064, + "step": 14070 + }, + { + "epoch": 1.7243882270068298, + "grad_norm": 1.7661272786880837, + "learning_rate": 5.100782785479746e-06, + "loss": 0.2282, + "num_input_tokens_seen": 94170640, + "step": 14075 + }, + { + "epoch": 1.7250007656733333, + "grad_norm": 1.5880324768761795, + "learning_rate": 5.078519327090048e-06, + "loss": 0.211, + "num_input_tokens_seen": 94204568, + "step": 14080 + }, + { + "epoch": 1.7256133043398365, + "grad_norm": 1.4994768840248383, + "learning_rate": 5.056301962043219e-06, + "loss": 0.2282, + "num_input_tokens_seen": 94237824, + "step": 14085 + }, + { + "epoch": 1.7262258430063397, + "grad_norm": 1.5775623383169957, + "learning_rate": 5.034130713136187e-06, + "loss": 0.2472, + "num_input_tokens_seen": 94271016, + "step": 14090 + }, + { + "epoch": 1.726838381672843, + "grad_norm": 1.2686165314502347, + "learning_rate": 5.012005603118597e-06, + "loss": 0.1795, + "num_input_tokens_seen": 94305600, + "step": 14095 + }, + { + "epoch": 1.7274509203393464, + "grad_norm": 1.316052539444009, + "learning_rate": 4.989926654692728e-06, + "loss": 0.2114, + "num_input_tokens_seen": 94338656, + "step": 14100 + }, + { + "epoch": 1.7274509203393464, + "eval_loss": 0.08303268253803253, + "eval_runtime": 18.8783, + "eval_samples_per_second": 3.178, + "eval_steps_per_second": 0.795, + "num_input_tokens_seen": 94338656, + "step": 14100 + }, + { + "epoch": 1.72806345900585, + "grad_norm": 1.3756923158303223, + "learning_rate": 4.967893890513498e-06, + "loss": 0.2181, + "num_input_tokens_seen": 94371872, + "step": 14105 + }, + { + "epoch": 1.7286759976723531, + "grad_norm": 1.687316108089628, + "learning_rate": 4.945907333188432e-06, + "loss": 0.2158, + "num_input_tokens_seen": 94405304, + "step": 14110 + }, + { + "epoch": 1.7292885363388564, + "grad_norm": 1.32720095157167, + "learning_rate": 4.923967005277658e-06, + "loss": 0.1915, + "num_input_tokens_seen": 94439304, + "step": 14115 + }, + { + "epoch": 1.7299010750053596, + "grad_norm": 1.4779357544334513, + "learning_rate": 4.902072929293866e-06, + "loss": 0.1906, + "num_input_tokens_seen": 94473400, + "step": 14120 + }, + { + "epoch": 1.730513613671863, + "grad_norm": 1.454218480869587, + "learning_rate": 4.8802251277022645e-06, + "loss": 0.2117, + "num_input_tokens_seen": 94506920, + "step": 14125 + }, + { + "epoch": 1.7311261523383663, + "grad_norm": 2.0322266108679807, + "learning_rate": 4.858423622920605e-06, + "loss": 0.2001, + "num_input_tokens_seen": 94540784, + "step": 14130 + }, + { + "epoch": 1.7317386910048698, + "grad_norm": 1.3330015512372282, + "learning_rate": 4.836668437319125e-06, + "loss": 0.1964, + "num_input_tokens_seen": 94574920, + "step": 14135 + }, + { + "epoch": 1.732351229671373, + "grad_norm": 1.3203105624116542, + "learning_rate": 4.814959593220531e-06, + "loss": 0.1883, + "num_input_tokens_seen": 94608760, + "step": 14140 + }, + { + "epoch": 1.7329637683378762, + "grad_norm": 1.3077668111689291, + "learning_rate": 4.793297112899997e-06, + "loss": 0.1621, + "num_input_tokens_seen": 94642800, + "step": 14145 + }, + { + "epoch": 1.7335763070043797, + "grad_norm": 1.2682337264848538, + "learning_rate": 4.771681018585089e-06, + "loss": 0.2018, + "num_input_tokens_seen": 94676200, + "step": 14150 + }, + { + "epoch": 1.7335763070043797, + "eval_loss": 0.06674149632453918, + "eval_runtime": 19.2419, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 0.78, + "num_input_tokens_seen": 94676200, + "step": 14150 + }, + { + "epoch": 1.734188845670883, + "grad_norm": 2.089786904605635, + "learning_rate": 4.750111332455825e-06, + "loss": 0.2392, + "num_input_tokens_seen": 94709600, + "step": 14155 + }, + { + "epoch": 1.7348013843373864, + "grad_norm": 1.2414281555663051, + "learning_rate": 4.728588076644547e-06, + "loss": 0.2236, + "num_input_tokens_seen": 94743392, + "step": 14160 + }, + { + "epoch": 1.7354139230038896, + "grad_norm": 2.6838741399839106, + "learning_rate": 4.707111273236004e-06, + "loss": 0.2583, + "num_input_tokens_seen": 94776848, + "step": 14165 + }, + { + "epoch": 1.7360264616703929, + "grad_norm": 1.3588635852586692, + "learning_rate": 4.685680944267257e-06, + "loss": 0.2111, + "num_input_tokens_seen": 94810360, + "step": 14170 + }, + { + "epoch": 1.736639000336896, + "grad_norm": 1.6681725737031643, + "learning_rate": 4.6642971117277e-06, + "loss": 0.1759, + "num_input_tokens_seen": 94844688, + "step": 14175 + }, + { + "epoch": 1.7372515390033996, + "grad_norm": 1.6698806121315335, + "learning_rate": 4.6429597975589765e-06, + "loss": 0.204, + "num_input_tokens_seen": 94878096, + "step": 14180 + }, + { + "epoch": 1.737864077669903, + "grad_norm": 1.2388806594460386, + "learning_rate": 4.6216690236550454e-06, + "loss": 0.1906, + "num_input_tokens_seen": 94911424, + "step": 14185 + }, + { + "epoch": 1.7384766163364063, + "grad_norm": 1.337511431441936, + "learning_rate": 4.600424811862098e-06, + "loss": 0.1868, + "num_input_tokens_seen": 94945448, + "step": 14190 + }, + { + "epoch": 1.7390891550029095, + "grad_norm": 24.997624583497704, + "learning_rate": 4.579227183978518e-06, + "loss": 0.2381, + "num_input_tokens_seen": 94979056, + "step": 14195 + }, + { + "epoch": 1.7397016936694127, + "grad_norm": 1.7113981148844226, + "learning_rate": 4.5580761617549205e-06, + "loss": 0.2196, + "num_input_tokens_seen": 95012776, + "step": 14200 + }, + { + "epoch": 1.7397016936694127, + "eval_loss": 0.09127970039844513, + "eval_runtime": 19.0744, + "eval_samples_per_second": 3.146, + "eval_steps_per_second": 0.786, + "num_input_tokens_seen": 95012776, + "step": 14200 + }, + { + "epoch": 1.7403142323359162, + "grad_norm": 1.0717185578058233, + "learning_rate": 4.5369717668941155e-06, + "loss": 0.1893, + "num_input_tokens_seen": 95045912, + "step": 14205 + }, + { + "epoch": 1.7409267710024197, + "grad_norm": 1.5461348845664533, + "learning_rate": 4.51591402105101e-06, + "loss": 0.2287, + "num_input_tokens_seen": 95078840, + "step": 14210 + }, + { + "epoch": 1.741539309668923, + "grad_norm": 1.9940541300533867, + "learning_rate": 4.494902945832702e-06, + "loss": 0.1898, + "num_input_tokens_seen": 95111880, + "step": 14215 + }, + { + "epoch": 1.7421518483354261, + "grad_norm": 1.5903415407868877, + "learning_rate": 4.47393856279838e-06, + "loss": 0.2223, + "num_input_tokens_seen": 95145128, + "step": 14220 + }, + { + "epoch": 1.7427643870019294, + "grad_norm": 1.4613827693438226, + "learning_rate": 4.4530208934593255e-06, + "loss": 0.2186, + "num_input_tokens_seen": 95178440, + "step": 14225 + }, + { + "epoch": 1.7433769256684328, + "grad_norm": 1.210453575320351, + "learning_rate": 4.432149959278886e-06, + "loss": 0.1852, + "num_input_tokens_seen": 95212352, + "step": 14230 + }, + { + "epoch": 1.7439894643349363, + "grad_norm": 1.1489443304456357, + "learning_rate": 4.411325781672432e-06, + "loss": 0.2282, + "num_input_tokens_seen": 95245744, + "step": 14235 + }, + { + "epoch": 1.7446020030014395, + "grad_norm": 1.6667805110841014, + "learning_rate": 4.390548382007398e-06, + "loss": 0.1665, + "num_input_tokens_seen": 95280344, + "step": 14240 + }, + { + "epoch": 1.7452145416679428, + "grad_norm": 1.7031109720043627, + "learning_rate": 4.369817781603208e-06, + "loss": 0.2084, + "num_input_tokens_seen": 95313328, + "step": 14245 + }, + { + "epoch": 1.745827080334446, + "grad_norm": 1.5067454719172968, + "learning_rate": 4.349134001731236e-06, + "loss": 0.2155, + "num_input_tokens_seen": 95347128, + "step": 14250 + }, + { + "epoch": 1.745827080334446, + "eval_loss": 0.09919190406799316, + "eval_runtime": 19.0072, + "eval_samples_per_second": 3.157, + "eval_steps_per_second": 0.789, + "num_input_tokens_seen": 95347128, + "step": 14250 + }, + { + "epoch": 1.7464396190009495, + "grad_norm": 1.7244325386342882, + "learning_rate": 4.3284970636148555e-06, + "loss": 0.1812, + "num_input_tokens_seen": 95380888, + "step": 14255 + }, + { + "epoch": 1.747052157667453, + "grad_norm": 1.6174231593524293, + "learning_rate": 4.307906988429366e-06, + "loss": 0.2123, + "num_input_tokens_seen": 95414592, + "step": 14260 + }, + { + "epoch": 1.7476646963339562, + "grad_norm": 2.5063081013992017, + "learning_rate": 4.287363797301947e-06, + "loss": 0.2305, + "num_input_tokens_seen": 95448016, + "step": 14265 + }, + { + "epoch": 1.7482772350004594, + "grad_norm": 5.159806584302767, + "learning_rate": 4.266867511311718e-06, + "loss": 0.1886, + "num_input_tokens_seen": 95481752, + "step": 14270 + }, + { + "epoch": 1.7488897736669626, + "grad_norm": 2.06451071598963, + "learning_rate": 4.246418151489639e-06, + "loss": 0.1991, + "num_input_tokens_seen": 95515320, + "step": 14275 + }, + { + "epoch": 1.749502312333466, + "grad_norm": 1.8202816135345918, + "learning_rate": 4.226015738818551e-06, + "loss": 0.2442, + "num_input_tokens_seen": 95548384, + "step": 14280 + }, + { + "epoch": 1.7501148509999693, + "grad_norm": 1.2582951849915052, + "learning_rate": 4.205660294233077e-06, + "loss": 0.2071, + "num_input_tokens_seen": 95582680, + "step": 14285 + }, + { + "epoch": 1.7507273896664728, + "grad_norm": 1.08957033460719, + "learning_rate": 4.185351838619683e-06, + "loss": 0.2228, + "num_input_tokens_seen": 95616160, + "step": 14290 + }, + { + "epoch": 1.751339928332976, + "grad_norm": 1.8539153129578652, + "learning_rate": 4.165090392816612e-06, + "loss": 0.2195, + "num_input_tokens_seen": 95649280, + "step": 14295 + }, + { + "epoch": 1.7519524669994793, + "grad_norm": 1.390536215365459, + "learning_rate": 4.14487597761386e-06, + "loss": 0.2333, + "num_input_tokens_seen": 95683048, + "step": 14300 + }, + { + "epoch": 1.7519524669994793, + "eval_loss": 0.12027577310800552, + "eval_runtime": 19.3267, + "eval_samples_per_second": 3.105, + "eval_steps_per_second": 0.776, + "num_input_tokens_seen": 95683048, + "step": 14300 + }, + { + "epoch": 1.7525650056659825, + "grad_norm": 1.3338081734160896, + "learning_rate": 4.124708613753186e-06, + "loss": 0.2071, + "num_input_tokens_seen": 95716536, + "step": 14305 + }, + { + "epoch": 1.753177544332486, + "grad_norm": 1.5246567642279154, + "learning_rate": 4.104588321928043e-06, + "loss": 0.2135, + "num_input_tokens_seen": 95750128, + "step": 14310 + }, + { + "epoch": 1.7537900829989894, + "grad_norm": 1.598316398057618, + "learning_rate": 4.0845151227836165e-06, + "loss": 0.2196, + "num_input_tokens_seen": 95783912, + "step": 14315 + }, + { + "epoch": 1.7544026216654927, + "grad_norm": 1.3051543657187126, + "learning_rate": 4.06448903691673e-06, + "loss": 0.1735, + "num_input_tokens_seen": 95817888, + "step": 14320 + }, + { + "epoch": 1.755015160331996, + "grad_norm": 1.188048471591455, + "learning_rate": 4.044510084875902e-06, + "loss": 0.2017, + "num_input_tokens_seen": 95851648, + "step": 14325 + }, + { + "epoch": 1.7556276989984991, + "grad_norm": 1.791757692894848, + "learning_rate": 4.0245782871612735e-06, + "loss": 0.2393, + "num_input_tokens_seen": 95885200, + "step": 14330 + }, + { + "epoch": 1.7562402376650026, + "grad_norm": 1.6556893800497485, + "learning_rate": 4.004693664224607e-06, + "loss": 0.2201, + "num_input_tokens_seen": 95918408, + "step": 14335 + }, + { + "epoch": 1.756852776331506, + "grad_norm": 2.0497691195493153, + "learning_rate": 3.984856236469237e-06, + "loss": 0.2115, + "num_input_tokens_seen": 95951624, + "step": 14340 + }, + { + "epoch": 1.7574653149980093, + "grad_norm": 1.8265555553853234, + "learning_rate": 3.965066024250097e-06, + "loss": 0.2459, + "num_input_tokens_seen": 95984584, + "step": 14345 + }, + { + "epoch": 1.7580778536645125, + "grad_norm": 1.4650518165447777, + "learning_rate": 3.945323047873678e-06, + "loss": 0.201, + "num_input_tokens_seen": 96018648, + "step": 14350 + }, + { + "epoch": 1.7580778536645125, + "eval_loss": 0.11044047772884369, + "eval_runtime": 19.0607, + "eval_samples_per_second": 3.148, + "eval_steps_per_second": 0.787, + "num_input_tokens_seen": 96018648, + "step": 14350 + }, + { + "epoch": 1.7586903923310158, + "grad_norm": 1.1861979171162091, + "learning_rate": 3.925627327597964e-06, + "loss": 0.2159, + "num_input_tokens_seen": 96051648, + "step": 14355 + }, + { + "epoch": 1.7593029309975192, + "grad_norm": 1.0266590882044788, + "learning_rate": 3.905978883632488e-06, + "loss": 0.1974, + "num_input_tokens_seen": 96084984, + "step": 14360 + }, + { + "epoch": 1.7599154696640227, + "grad_norm": 1.2477119910394108, + "learning_rate": 3.886377736138269e-06, + "loss": 0.2076, + "num_input_tokens_seen": 96118432, + "step": 14365 + }, + { + "epoch": 1.760528008330526, + "grad_norm": 1.3322865996619695, + "learning_rate": 3.866823905227773e-06, + "loss": 0.1962, + "num_input_tokens_seen": 96151920, + "step": 14370 + }, + { + "epoch": 1.7611405469970292, + "grad_norm": 1.5913731343298603, + "learning_rate": 3.847317410964935e-06, + "loss": 0.1959, + "num_input_tokens_seen": 96185448, + "step": 14375 + }, + { + "epoch": 1.7617530856635324, + "grad_norm": 1.4340894271284181, + "learning_rate": 3.827858273365114e-06, + "loss": 0.2026, + "num_input_tokens_seen": 96219072, + "step": 14380 + }, + { + "epoch": 1.7623656243300359, + "grad_norm": 1.6638695001146644, + "learning_rate": 3.8084465123950862e-06, + "loss": 0.2392, + "num_input_tokens_seen": 96252472, + "step": 14385 + }, + { + "epoch": 1.7629781629965393, + "grad_norm": 1.5410733644548236, + "learning_rate": 3.789082147973e-06, + "loss": 0.2442, + "num_input_tokens_seen": 96286104, + "step": 14390 + }, + { + "epoch": 1.7635907016630425, + "grad_norm": 0.9853284688619448, + "learning_rate": 3.7697651999683615e-06, + "loss": 0.1803, + "num_input_tokens_seen": 96319824, + "step": 14395 + }, + { + "epoch": 1.7642032403295458, + "grad_norm": 1.8534650658833802, + "learning_rate": 3.7504956882020436e-06, + "loss": 0.1886, + "num_input_tokens_seen": 96353568, + "step": 14400 + }, + { + "epoch": 1.7642032403295458, + "eval_loss": 0.12810835242271423, + "eval_runtime": 19.1577, + "eval_samples_per_second": 3.132, + "eval_steps_per_second": 0.783, + "num_input_tokens_seen": 96353568, + "step": 14400 + }, + { + "epoch": 1.764815778996049, + "grad_norm": 1.9354177384545033, + "learning_rate": 3.7312736324462583e-06, + "loss": 0.1917, + "num_input_tokens_seen": 96387800, + "step": 14405 + }, + { + "epoch": 1.7654283176625525, + "grad_norm": 1.4728621318622495, + "learning_rate": 3.7120990524244782e-06, + "loss": 0.1684, + "num_input_tokens_seen": 96422160, + "step": 14410 + }, + { + "epoch": 1.7660408563290557, + "grad_norm": 1.8016701716935992, + "learning_rate": 3.6929719678115093e-06, + "loss": 0.195, + "num_input_tokens_seen": 96456552, + "step": 14415 + }, + { + "epoch": 1.7666533949955592, + "grad_norm": 1.5382874865628706, + "learning_rate": 3.6738923982333972e-06, + "loss": 0.1954, + "num_input_tokens_seen": 96489688, + "step": 14420 + }, + { + "epoch": 1.7672659336620624, + "grad_norm": 1.2178891908203997, + "learning_rate": 3.654860363267432e-06, + "loss": 0.2134, + "num_input_tokens_seen": 96522744, + "step": 14425 + }, + { + "epoch": 1.7678784723285657, + "grad_norm": 1.3460619610364044, + "learning_rate": 3.6358758824421357e-06, + "loss": 0.2143, + "num_input_tokens_seen": 96556248, + "step": 14430 + }, + { + "epoch": 1.7684910109950691, + "grad_norm": 1.7918792307152447, + "learning_rate": 3.616938975237244e-06, + "loss": 0.1869, + "num_input_tokens_seen": 96589768, + "step": 14435 + }, + { + "epoch": 1.7691035496615723, + "grad_norm": 1.495072839208311, + "learning_rate": 3.598049661083669e-06, + "loss": 0.2192, + "num_input_tokens_seen": 96622984, + "step": 14440 + }, + { + "epoch": 1.7697160883280758, + "grad_norm": 1.6745980197535875, + "learning_rate": 3.5792079593634785e-06, + "loss": 0.2049, + "num_input_tokens_seen": 96656136, + "step": 14445 + }, + { + "epoch": 1.770328626994579, + "grad_norm": 1.4727214825309298, + "learning_rate": 3.5604138894099027e-06, + "loss": 0.1834, + "num_input_tokens_seen": 96689680, + "step": 14450 + }, + { + "epoch": 1.770328626994579, + "eval_loss": 0.12905412912368774, + "eval_runtime": 18.9557, + "eval_samples_per_second": 3.165, + "eval_steps_per_second": 0.791, + "num_input_tokens_seen": 96689680, + "step": 14450 + }, + { + "epoch": 1.7709411656610823, + "grad_norm": 1.7551199242362399, + "learning_rate": 3.5416674705072985e-06, + "loss": 0.2021, + "num_input_tokens_seen": 96723296, + "step": 14455 + }, + { + "epoch": 1.7715537043275855, + "grad_norm": 1.7653342848795275, + "learning_rate": 3.522968721891101e-06, + "loss": 0.1756, + "num_input_tokens_seen": 96757880, + "step": 14460 + }, + { + "epoch": 1.772166242994089, + "grad_norm": 1.2697773650764543, + "learning_rate": 3.5043176627478636e-06, + "loss": 0.2119, + "num_input_tokens_seen": 96791424, + "step": 14465 + }, + { + "epoch": 1.7727787816605924, + "grad_norm": 1.3715208069460982, + "learning_rate": 3.4857143122152046e-06, + "loss": 0.1666, + "num_input_tokens_seen": 96825400, + "step": 14470 + }, + { + "epoch": 1.7733913203270957, + "grad_norm": 1.5694353632037488, + "learning_rate": 3.467158689381761e-06, + "loss": 0.2343, + "num_input_tokens_seen": 96858304, + "step": 14475 + }, + { + "epoch": 1.774003858993599, + "grad_norm": 0.9054930797246807, + "learning_rate": 3.4486508132872076e-06, + "loss": 0.1987, + "num_input_tokens_seen": 96892408, + "step": 14480 + }, + { + "epoch": 1.7746163976601022, + "grad_norm": 1.5018940679448072, + "learning_rate": 3.4301907029222534e-06, + "loss": 0.2021, + "num_input_tokens_seen": 96926456, + "step": 14485 + }, + { + "epoch": 1.7752289363266056, + "grad_norm": 1.393319737977409, + "learning_rate": 3.4117783772285637e-06, + "loss": 0.2406, + "num_input_tokens_seen": 96959368, + "step": 14490 + }, + { + "epoch": 1.775841474993109, + "grad_norm": 1.7598705114414872, + "learning_rate": 3.3934138550987983e-06, + "loss": 0.23, + "num_input_tokens_seen": 96993128, + "step": 14495 + }, + { + "epoch": 1.7764540136596123, + "grad_norm": 2.1238982015167345, + "learning_rate": 3.3750971553765333e-06, + "loss": 0.2138, + "num_input_tokens_seen": 97026536, + "step": 14500 + }, + { + "epoch": 1.7764540136596123, + "eval_loss": 0.12095509469509125, + "eval_runtime": 18.8021, + "eval_samples_per_second": 3.191, + "eval_steps_per_second": 0.798, + "num_input_tokens_seen": 97026536, + "step": 14500 + }, + { + "epoch": 1.7770665523261155, + "grad_norm": 1.4324719969868325, + "learning_rate": 3.3568282968563015e-06, + "loss": 0.2085, + "num_input_tokens_seen": 97060344, + "step": 14505 + }, + { + "epoch": 1.7776790909926188, + "grad_norm": 1.7479041003948803, + "learning_rate": 3.3386072982835524e-06, + "loss": 0.2155, + "num_input_tokens_seen": 97094160, + "step": 14510 + }, + { + "epoch": 1.7782916296591222, + "grad_norm": 1.46468565255129, + "learning_rate": 3.3204341783545977e-06, + "loss": 0.1951, + "num_input_tokens_seen": 97127784, + "step": 14515 + }, + { + "epoch": 1.7789041683256257, + "grad_norm": 1.324672273288412, + "learning_rate": 3.302308955716643e-06, + "loss": 0.2075, + "num_input_tokens_seen": 97161064, + "step": 14520 + }, + { + "epoch": 1.779516706992129, + "grad_norm": 10.633035007369239, + "learning_rate": 3.2842316489677507e-06, + "loss": 0.1805, + "num_input_tokens_seen": 97194760, + "step": 14525 + }, + { + "epoch": 1.7801292456586322, + "grad_norm": 1.40944618777743, + "learning_rate": 3.2662022766567935e-06, + "loss": 0.2141, + "num_input_tokens_seen": 97228744, + "step": 14530 + }, + { + "epoch": 1.7807417843251354, + "grad_norm": 1.0068871662707584, + "learning_rate": 3.248220857283474e-06, + "loss": 0.1904, + "num_input_tokens_seen": 97262720, + "step": 14535 + }, + { + "epoch": 1.7813543229916389, + "grad_norm": 1.5595868760167488, + "learning_rate": 3.230287409298305e-06, + "loss": 0.2037, + "num_input_tokens_seen": 97296152, + "step": 14540 + }, + { + "epoch": 1.7819668616581423, + "grad_norm": 1.5368065174926178, + "learning_rate": 3.2124019511025616e-06, + "loss": 0.2289, + "num_input_tokens_seen": 97329360, + "step": 14545 + }, + { + "epoch": 1.7825794003246456, + "grad_norm": 1.3034966568310469, + "learning_rate": 3.1945645010482696e-06, + "loss": 0.1703, + "num_input_tokens_seen": 97363272, + "step": 14550 + }, + { + "epoch": 1.7825794003246456, + "eval_loss": 0.152376189827919, + "eval_runtime": 19.6577, + "eval_samples_per_second": 3.052, + "eval_steps_per_second": 0.763, + "num_input_tokens_seen": 97363272, + "step": 14550 + }, + { + "epoch": 1.7831919389911488, + "grad_norm": 1.2300735644666518, + "learning_rate": 3.176775077438199e-06, + "loss": 0.1921, + "num_input_tokens_seen": 97396880, + "step": 14555 + }, + { + "epoch": 1.783804477657652, + "grad_norm": 2.1318461924940997, + "learning_rate": 3.159033698525854e-06, + "loss": 0.2139, + "num_input_tokens_seen": 97430608, + "step": 14560 + }, + { + "epoch": 1.7844170163241555, + "grad_norm": 1.5947555934366116, + "learning_rate": 3.1413403825154285e-06, + "loss": 0.2144, + "num_input_tokens_seen": 97463904, + "step": 14565 + }, + { + "epoch": 1.7850295549906587, + "grad_norm": 1.598623593040159, + "learning_rate": 3.1236951475618002e-06, + "loss": 0.2079, + "num_input_tokens_seen": 97497480, + "step": 14570 + }, + { + "epoch": 1.7856420936571622, + "grad_norm": 1.3443497816347845, + "learning_rate": 3.1060980117705085e-06, + "loss": 0.2057, + "num_input_tokens_seen": 97531168, + "step": 14575 + }, + { + "epoch": 1.7862546323236654, + "grad_norm": 1.870681191409748, + "learning_rate": 3.0885489931977596e-06, + "loss": 0.2223, + "num_input_tokens_seen": 97564792, + "step": 14580 + }, + { + "epoch": 1.7868671709901687, + "grad_norm": 1.452433784540113, + "learning_rate": 3.0710481098503608e-06, + "loss": 0.1863, + "num_input_tokens_seen": 97598536, + "step": 14585 + }, + { + "epoch": 1.787479709656672, + "grad_norm": 1.324173574206757, + "learning_rate": 3.0535953796857364e-06, + "loss": 0.2025, + "num_input_tokens_seen": 97632352, + "step": 14590 + }, + { + "epoch": 1.7880922483231754, + "grad_norm": 1.5517115205765062, + "learning_rate": 3.036190820611906e-06, + "loss": 0.2024, + "num_input_tokens_seen": 97665488, + "step": 14595 + }, + { + "epoch": 1.7887047869896788, + "grad_norm": 1.2719971988263878, + "learning_rate": 3.0188344504874734e-06, + "loss": 0.2117, + "num_input_tokens_seen": 97698864, + "step": 14600 + }, + { + "epoch": 1.7887047869896788, + "eval_loss": 0.13145245611667633, + "eval_runtime": 19.1982, + "eval_samples_per_second": 3.125, + "eval_steps_per_second": 0.781, + "num_input_tokens_seen": 97698864, + "step": 14600 + }, + { + "epoch": 1.789317325656182, + "grad_norm": 1.518549277476648, + "learning_rate": 3.0015262871215587e-06, + "loss": 0.1954, + "num_input_tokens_seen": 97732008, + "step": 14605 + }, + { + "epoch": 1.7899298643226853, + "grad_norm": 1.722598701596843, + "learning_rate": 2.9842663482738566e-06, + "loss": 0.2309, + "num_input_tokens_seen": 97764984, + "step": 14610 + }, + { + "epoch": 1.7905424029891885, + "grad_norm": 1.503615622601951, + "learning_rate": 2.9670546516545717e-06, + "loss": 0.2307, + "num_input_tokens_seen": 97797792, + "step": 14615 + }, + { + "epoch": 1.791154941655692, + "grad_norm": 1.5508143789355855, + "learning_rate": 2.9498912149243827e-06, + "loss": 0.2041, + "num_input_tokens_seen": 97831144, + "step": 14620 + }, + { + "epoch": 1.7917674803221955, + "grad_norm": 2.1990084715127645, + "learning_rate": 2.9327760556944694e-06, + "loss": 0.2428, + "num_input_tokens_seen": 97864296, + "step": 14625 + }, + { + "epoch": 1.7923800189886987, + "grad_norm": 1.4238488320426643, + "learning_rate": 2.9157091915264944e-06, + "loss": 0.2153, + "num_input_tokens_seen": 97897536, + "step": 14630 + }, + { + "epoch": 1.792992557655202, + "grad_norm": 1.705573470468416, + "learning_rate": 2.898690639932522e-06, + "loss": 0.1993, + "num_input_tokens_seen": 97931168, + "step": 14635 + }, + { + "epoch": 1.7936050963217052, + "grad_norm": 19.10859042529305, + "learning_rate": 2.881720418375061e-06, + "loss": 0.2056, + "num_input_tokens_seen": 97965328, + "step": 14640 + }, + { + "epoch": 1.7942176349882086, + "grad_norm": 1.4409687005766585, + "learning_rate": 2.8647985442670444e-06, + "loss": 0.1727, + "num_input_tokens_seen": 97999096, + "step": 14645 + }, + { + "epoch": 1.794830173654712, + "grad_norm": 1.162590718161583, + "learning_rate": 2.847925034971788e-06, + "loss": 0.1927, + "num_input_tokens_seen": 98032608, + "step": 14650 + }, + { + "epoch": 1.794830173654712, + "eval_loss": 0.1303679496049881, + "eval_runtime": 18.9545, + "eval_samples_per_second": 3.165, + "eval_steps_per_second": 0.791, + "num_input_tokens_seen": 98032608, + "step": 14650 + }, + { + "epoch": 1.7954427123212153, + "grad_norm": 1.9215448111186493, + "learning_rate": 2.8310999078029754e-06, + "loss": 0.1957, + "num_input_tokens_seen": 98066712, + "step": 14655 + }, + { + "epoch": 1.7960552509877186, + "grad_norm": 1.8755856890604479, + "learning_rate": 2.814323180024647e-06, + "loss": 0.1854, + "num_input_tokens_seen": 98100856, + "step": 14660 + }, + { + "epoch": 1.7966677896542218, + "grad_norm": 0.8850830841975179, + "learning_rate": 2.797594868851183e-06, + "loss": 0.1604, + "num_input_tokens_seen": 98135624, + "step": 14665 + }, + { + "epoch": 1.7972803283207253, + "grad_norm": 1.2398655336301696, + "learning_rate": 2.780914991447292e-06, + "loss": 0.2322, + "num_input_tokens_seen": 98169432, + "step": 14670 + }, + { + "epoch": 1.7978928669872287, + "grad_norm": 1.3989547009808716, + "learning_rate": 2.7642835649279606e-06, + "loss": 0.1974, + "num_input_tokens_seen": 98203464, + "step": 14675 + }, + { + "epoch": 1.798505405653732, + "grad_norm": 1.1486712089717013, + "learning_rate": 2.7477006063584942e-06, + "loss": 0.1307, + "num_input_tokens_seen": 98238568, + "step": 14680 + }, + { + "epoch": 1.7991179443202352, + "grad_norm": 1.6497542212192742, + "learning_rate": 2.7311661327544423e-06, + "loss": 0.1698, + "num_input_tokens_seen": 98272600, + "step": 14685 + }, + { + "epoch": 1.7997304829867384, + "grad_norm": 1.640593380415123, + "learning_rate": 2.714680161081623e-06, + "loss": 0.2237, + "num_input_tokens_seen": 98306000, + "step": 14690 + }, + { + "epoch": 1.800343021653242, + "grad_norm": 1.8465980381324552, + "learning_rate": 2.6982427082560544e-06, + "loss": 0.2262, + "num_input_tokens_seen": 98339152, + "step": 14695 + }, + { + "epoch": 1.8009555603197451, + "grad_norm": 1.0568028629113835, + "learning_rate": 2.6818537911440065e-06, + "loss": 0.184, + "num_input_tokens_seen": 98372656, + "step": 14700 + }, + { + "epoch": 1.8009555603197451, + "eval_loss": 0.13143287599086761, + "eval_runtime": 18.8295, + "eval_samples_per_second": 3.186, + "eval_steps_per_second": 0.797, + "num_input_tokens_seen": 98372656, + "step": 14700 + }, + { + "epoch": 1.8015680989862486, + "grad_norm": 1.3368881663744416, + "learning_rate": 2.6655134265619385e-06, + "loss": 0.1738, + "num_input_tokens_seen": 98406728, + "step": 14705 + }, + { + "epoch": 1.8021806376527518, + "grad_norm": 1.3326936485078436, + "learning_rate": 2.6492216312764662e-06, + "loss": 0.1778, + "num_input_tokens_seen": 98441344, + "step": 14710 + }, + { + "epoch": 1.802793176319255, + "grad_norm": 1.4135712490590584, + "learning_rate": 2.6329784220044007e-06, + "loss": 0.21, + "num_input_tokens_seen": 98474960, + "step": 14715 + }, + { + "epoch": 1.8034057149857585, + "grad_norm": 1.1797035198866808, + "learning_rate": 2.6167838154126765e-06, + "loss": 0.1878, + "num_input_tokens_seen": 98509216, + "step": 14720 + }, + { + "epoch": 1.8040182536522618, + "grad_norm": 1.7265594004205143, + "learning_rate": 2.6006378281183798e-06, + "loss": 0.19, + "num_input_tokens_seen": 98543568, + "step": 14725 + }, + { + "epoch": 1.8046307923187652, + "grad_norm": 1.3545132654518455, + "learning_rate": 2.584540476688685e-06, + "loss": 0.2289, + "num_input_tokens_seen": 98577256, + "step": 14730 + }, + { + "epoch": 1.8052433309852685, + "grad_norm": 1.7979426245037262, + "learning_rate": 2.5684917776408746e-06, + "loss": 0.2327, + "num_input_tokens_seen": 98610824, + "step": 14735 + }, + { + "epoch": 1.8058558696517717, + "grad_norm": 1.3233440440145439, + "learning_rate": 2.5524917474423205e-06, + "loss": 0.2092, + "num_input_tokens_seen": 98644120, + "step": 14740 + }, + { + "epoch": 1.806468408318275, + "grad_norm": 1.629691047057333, + "learning_rate": 2.536540402510429e-06, + "loss": 0.2086, + "num_input_tokens_seen": 98677256, + "step": 14745 + }, + { + "epoch": 1.8070809469847784, + "grad_norm": 1.2498535208723844, + "learning_rate": 2.5206377592126683e-06, + "loss": 0.2065, + "num_input_tokens_seen": 98710624, + "step": 14750 + }, + { + "epoch": 1.8070809469847784, + "eval_loss": 0.11075767874717712, + "eval_runtime": 19.206, + "eval_samples_per_second": 3.124, + "eval_steps_per_second": 0.781, + "num_input_tokens_seen": 98710624, + "step": 14750 + }, + { + "epoch": 1.8076934856512819, + "grad_norm": 1.611116494479444, + "learning_rate": 2.504783833866542e-06, + "loss": 0.2144, + "num_input_tokens_seen": 98743648, + "step": 14755 + }, + { + "epoch": 1.808306024317785, + "grad_norm": 1.6985701213985445, + "learning_rate": 2.4889786427395534e-06, + "loss": 0.2127, + "num_input_tokens_seen": 98776688, + "step": 14760 + }, + { + "epoch": 1.8089185629842883, + "grad_norm": 1.5779136114150378, + "learning_rate": 2.4732222020491967e-06, + "loss": 0.1938, + "num_input_tokens_seen": 98810616, + "step": 14765 + }, + { + "epoch": 1.8095311016507916, + "grad_norm": 1.5002960367290024, + "learning_rate": 2.4575145279629452e-06, + "loss": 0.2509, + "num_input_tokens_seen": 98843768, + "step": 14770 + }, + { + "epoch": 1.810143640317295, + "grad_norm": 1.3963999470466515, + "learning_rate": 2.441855636598256e-06, + "loss": 0.1735, + "num_input_tokens_seen": 98877968, + "step": 14775 + }, + { + "epoch": 1.8107561789837985, + "grad_norm": 1.3040902170671054, + "learning_rate": 2.4262455440224872e-06, + "loss": 0.1706, + "num_input_tokens_seen": 98912168, + "step": 14780 + }, + { + "epoch": 1.8113687176503017, + "grad_norm": 1.6619232852172408, + "learning_rate": 2.410684266252966e-06, + "loss": 0.233, + "num_input_tokens_seen": 98946184, + "step": 14785 + }, + { + "epoch": 1.811981256316805, + "grad_norm": 1.3695645465425237, + "learning_rate": 2.395171819256914e-06, + "loss": 0.2107, + "num_input_tokens_seen": 98979480, + "step": 14790 + }, + { + "epoch": 1.8125937949833082, + "grad_norm": 1.5346253252052018, + "learning_rate": 2.3797082189514596e-06, + "loss": 0.2219, + "num_input_tokens_seen": 99012760, + "step": 14795 + }, + { + "epoch": 1.8132063336498117, + "grad_norm": 1.6613153449890468, + "learning_rate": 2.364293481203578e-06, + "loss": 0.2039, + "num_input_tokens_seen": 99045480, + "step": 14800 + }, + { + "epoch": 1.8132063336498117, + "eval_loss": 0.08662518113851547, + "eval_runtime": 19.5582, + "eval_samples_per_second": 3.068, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 99045480, + "step": 14800 + }, + { + "epoch": 1.8138188723163151, + "grad_norm": 1.4441094087375161, + "learning_rate": 2.3489276218301437e-06, + "loss": 0.2164, + "num_input_tokens_seen": 99079336, + "step": 14805 + }, + { + "epoch": 1.8144314109828183, + "grad_norm": 1.6325374317346673, + "learning_rate": 2.3336106565978567e-06, + "loss": 0.1915, + "num_input_tokens_seen": 99113184, + "step": 14810 + }, + { + "epoch": 1.8150439496493216, + "grad_norm": 1.8302243124573483, + "learning_rate": 2.3183426012232666e-06, + "loss": 0.2385, + "num_input_tokens_seen": 99145968, + "step": 14815 + }, + { + "epoch": 1.8156564883158248, + "grad_norm": 1.2955865745852857, + "learning_rate": 2.3031234713727145e-06, + "loss": 0.1638, + "num_input_tokens_seen": 99180864, + "step": 14820 + }, + { + "epoch": 1.8162690269823283, + "grad_norm": 1.4004637805403606, + "learning_rate": 2.2879532826623473e-06, + "loss": 0.2222, + "num_input_tokens_seen": 99214104, + "step": 14825 + }, + { + "epoch": 1.8168815656488317, + "grad_norm": 0.9266816901525021, + "learning_rate": 2.2728320506581138e-06, + "loss": 0.1742, + "num_input_tokens_seen": 99248104, + "step": 14830 + }, + { + "epoch": 1.817494104315335, + "grad_norm": 1.10935981739766, + "learning_rate": 2.257759790875691e-06, + "loss": 0.2108, + "num_input_tokens_seen": 99282424, + "step": 14835 + }, + { + "epoch": 1.8181066429818382, + "grad_norm": 1.5768102953064966, + "learning_rate": 2.242736518780536e-06, + "loss": 0.2084, + "num_input_tokens_seen": 99315864, + "step": 14840 + }, + { + "epoch": 1.8187191816483415, + "grad_norm": 1.8356392333070086, + "learning_rate": 2.227762249787835e-06, + "loss": 0.1833, + "num_input_tokens_seen": 99349368, + "step": 14845 + }, + { + "epoch": 1.819331720314845, + "grad_norm": 1.6639522979285615, + "learning_rate": 2.2128369992624962e-06, + "loss": 0.1985, + "num_input_tokens_seen": 99382520, + "step": 14850 + }, + { + "epoch": 1.819331720314845, + "eval_loss": 0.09910637885332108, + "eval_runtime": 19.0492, + "eval_samples_per_second": 3.15, + "eval_steps_per_second": 0.787, + "num_input_tokens_seen": 99382520, + "step": 14850 + }, + { + "epoch": 1.8199442589813482, + "grad_norm": 1.6433469725828247, + "learning_rate": 2.197960782519104e-06, + "loss": 0.1815, + "num_input_tokens_seen": 99415992, + "step": 14855 + }, + { + "epoch": 1.8205567976478516, + "grad_norm": 1.691117133995514, + "learning_rate": 2.1831336148219583e-06, + "loss": 0.2273, + "num_input_tokens_seen": 99449616, + "step": 14860 + }, + { + "epoch": 1.8211693363143548, + "grad_norm": 1.9439126936233546, + "learning_rate": 2.168355511385034e-06, + "loss": 0.1987, + "num_input_tokens_seen": 99483296, + "step": 14865 + }, + { + "epoch": 1.821781874980858, + "grad_norm": 1.5645724682831283, + "learning_rate": 2.1536264873719293e-06, + "loss": 0.2172, + "num_input_tokens_seen": 99516888, + "step": 14870 + }, + { + "epoch": 1.8223944136473615, + "grad_norm": 1.6859913597484448, + "learning_rate": 2.1389465578959154e-06, + "loss": 0.1972, + "num_input_tokens_seen": 99550560, + "step": 14875 + }, + { + "epoch": 1.8230069523138648, + "grad_norm": 1.404151732204081, + "learning_rate": 2.124315738019872e-06, + "loss": 0.2309, + "num_input_tokens_seen": 99584216, + "step": 14880 + }, + { + "epoch": 1.8236194909803682, + "grad_norm": 1.804210319238941, + "learning_rate": 2.109734042756295e-06, + "loss": 0.214, + "num_input_tokens_seen": 99617792, + "step": 14885 + }, + { + "epoch": 1.8242320296468715, + "grad_norm": 1.1577631078834674, + "learning_rate": 2.095201487067261e-06, + "loss": 0.1892, + "num_input_tokens_seen": 99651752, + "step": 14890 + }, + { + "epoch": 1.8248445683133747, + "grad_norm": 1.255052989509529, + "learning_rate": 2.0807180858644417e-06, + "loss": 0.2182, + "num_input_tokens_seen": 99685120, + "step": 14895 + }, + { + "epoch": 1.825457106979878, + "grad_norm": 1.4379731627735197, + "learning_rate": 2.066283854009077e-06, + "loss": 0.1867, + "num_input_tokens_seen": 99719032, + "step": 14900 + }, + { + "epoch": 1.825457106979878, + "eval_loss": 0.10013201087713242, + "eval_runtime": 19.0757, + "eval_samples_per_second": 3.145, + "eval_steps_per_second": 0.786, + "num_input_tokens_seen": 99719032, + "step": 14900 + }, + { + "epoch": 1.8260696456463814, + "grad_norm": 1.5732964435735932, + "learning_rate": 2.0518988063119194e-06, + "loss": 0.2255, + "num_input_tokens_seen": 99752200, + "step": 14905 + }, + { + "epoch": 1.8266821843128849, + "grad_norm": 1.5267558078425605, + "learning_rate": 2.0375629575332957e-06, + "loss": 0.199, + "num_input_tokens_seen": 99785928, + "step": 14910 + }, + { + "epoch": 1.827294722979388, + "grad_norm": 1.4169281485987877, + "learning_rate": 2.023276322383022e-06, + "loss": 0.2024, + "num_input_tokens_seen": 99819448, + "step": 14915 + }, + { + "epoch": 1.8279072616458913, + "grad_norm": 6.840917074436536, + "learning_rate": 2.009038915520439e-06, + "loss": 0.2321, + "num_input_tokens_seen": 99852736, + "step": 14920 + }, + { + "epoch": 1.8285198003123946, + "grad_norm": 2.1609363467297613, + "learning_rate": 1.994850751554356e-06, + "loss": 0.2397, + "num_input_tokens_seen": 99886184, + "step": 14925 + }, + { + "epoch": 1.829132338978898, + "grad_norm": 1.0901155473238469, + "learning_rate": 1.9807118450430594e-06, + "loss": 0.1966, + "num_input_tokens_seen": 99920056, + "step": 14930 + }, + { + "epoch": 1.8297448776454015, + "grad_norm": 2.0923718704700853, + "learning_rate": 1.966622210494301e-06, + "loss": 0.2183, + "num_input_tokens_seen": 99953608, + "step": 14935 + }, + { + "epoch": 1.8303574163119047, + "grad_norm": 1.3230530848936273, + "learning_rate": 1.9525818623652713e-06, + "loss": 0.2188, + "num_input_tokens_seen": 99987368, + "step": 14940 + }, + { + "epoch": 1.830969954978408, + "grad_norm": 1.70357241904226, + "learning_rate": 1.93859081506258e-06, + "loss": 0.214, + "num_input_tokens_seen": 100020472, + "step": 14945 + }, + { + "epoch": 1.8315824936449112, + "grad_norm": 1.6754106942363047, + "learning_rate": 1.9246490829422646e-06, + "loss": 0.1983, + "num_input_tokens_seen": 100053904, + "step": 14950 + }, + { + "epoch": 1.8315824936449112, + "eval_loss": 0.07142162322998047, + "eval_runtime": 18.963, + "eval_samples_per_second": 3.164, + "eval_steps_per_second": 0.791, + "num_input_tokens_seen": 100053904, + "step": 14950 + }, + { + "epoch": 1.8321950323114147, + "grad_norm": 1.0293963459378703, + "learning_rate": 1.9107566803097608e-06, + "loss": 0.209, + "num_input_tokens_seen": 100087184, + "step": 14955 + }, + { + "epoch": 1.8328075709779181, + "grad_norm": 1.6893055769629666, + "learning_rate": 1.8969136214198657e-06, + "loss": 0.1727, + "num_input_tokens_seen": 100120816, + "step": 14960 + }, + { + "epoch": 1.8334201096444214, + "grad_norm": 1.3501133759839317, + "learning_rate": 1.8831199204767612e-06, + "loss": 0.1774, + "num_input_tokens_seen": 100154856, + "step": 14965 + }, + { + "epoch": 1.8340326483109246, + "grad_norm": 1.1059735070147187, + "learning_rate": 1.8693755916339929e-06, + "loss": 0.1541, + "num_input_tokens_seen": 100189688, + "step": 14970 + }, + { + "epoch": 1.8346451869774278, + "grad_norm": 1.5077902140615587, + "learning_rate": 1.8556806489944344e-06, + "loss": 0.2346, + "num_input_tokens_seen": 100222896, + "step": 14975 + }, + { + "epoch": 1.8352577256439313, + "grad_norm": 1.3994058726320298, + "learning_rate": 1.8420351066102847e-06, + "loss": 0.1726, + "num_input_tokens_seen": 100256920, + "step": 14980 + }, + { + "epoch": 1.8358702643104345, + "grad_norm": 2.1425991625937746, + "learning_rate": 1.8284389784830547e-06, + "loss": 0.1766, + "num_input_tokens_seen": 100291000, + "step": 14985 + }, + { + "epoch": 1.836482802976938, + "grad_norm": 1.498113093082703, + "learning_rate": 1.814892278563557e-06, + "loss": 0.1408, + "num_input_tokens_seen": 100325272, + "step": 14990 + }, + { + "epoch": 1.8370953416434412, + "grad_norm": 1.6303979141114462, + "learning_rate": 1.801395020751878e-06, + "loss": 0.231, + "num_input_tokens_seen": 100359016, + "step": 14995 + }, + { + "epoch": 1.8377078803099445, + "grad_norm": 1.18665291010936, + "learning_rate": 1.7879472188973844e-06, + "loss": 0.1842, + "num_input_tokens_seen": 100392776, + "step": 15000 + }, + { + "epoch": 1.8377078803099445, + "eval_loss": 0.08897951990365982, + "eval_runtime": 18.7299, + "eval_samples_per_second": 3.203, + "eval_steps_per_second": 0.801, + "num_input_tokens_seen": 100392776, + "step": 15000 + }, + { + "epoch": 1.838320418976448, + "grad_norm": 1.9892846789284018, + "learning_rate": 1.7745488867986771e-06, + "loss": 0.2254, + "num_input_tokens_seen": 100426336, + "step": 15005 + }, + { + "epoch": 1.8389329576429512, + "grad_norm": 2.009444813333155, + "learning_rate": 1.7612000382036308e-06, + "loss": 0.2226, + "num_input_tokens_seen": 100459744, + "step": 15010 + }, + { + "epoch": 1.8395454963094546, + "grad_norm": 1.376360027014675, + "learning_rate": 1.7479006868093006e-06, + "loss": 0.1888, + "num_input_tokens_seen": 100493288, + "step": 15015 + }, + { + "epoch": 1.8401580349759579, + "grad_norm": 1.4186533109031847, + "learning_rate": 1.734650846261987e-06, + "loss": 0.1688, + "num_input_tokens_seen": 100527512, + "step": 15020 + }, + { + "epoch": 1.840770573642461, + "grad_norm": 1.5222938908702746, + "learning_rate": 1.7214505301571871e-06, + "loss": 0.2857, + "num_input_tokens_seen": 100560864, + "step": 15025 + }, + { + "epoch": 1.8413831123089643, + "grad_norm": 1.7753654660917102, + "learning_rate": 1.708299752039555e-06, + "loss": 0.2345, + "num_input_tokens_seen": 100593464, + "step": 15030 + }, + { + "epoch": 1.8419956509754678, + "grad_norm": 1.7178584304495188, + "learning_rate": 1.6951985254029411e-06, + "loss": 0.1801, + "num_input_tokens_seen": 100627640, + "step": 15035 + }, + { + "epoch": 1.8426081896419713, + "grad_norm": 1.4191937035375526, + "learning_rate": 1.6821468636903427e-06, + "loss": 0.1927, + "num_input_tokens_seen": 100661184, + "step": 15040 + }, + { + "epoch": 1.8432207283084745, + "grad_norm": 1.6909977648687289, + "learning_rate": 1.6691447802938963e-06, + "loss": 0.2323, + "num_input_tokens_seen": 100694104, + "step": 15045 + }, + { + "epoch": 1.8438332669749777, + "grad_norm": 1.3393425622695767, + "learning_rate": 1.6561922885548698e-06, + "loss": 0.1583, + "num_input_tokens_seen": 100729000, + "step": 15050 + }, + { + "epoch": 1.8438332669749777, + "eval_loss": 0.08737240731716156, + "eval_runtime": 19.7776, + "eval_samples_per_second": 3.034, + "eval_steps_per_second": 0.758, + "num_input_tokens_seen": 100729000, + "step": 15050 + }, + { + "epoch": 1.844445805641481, + "grad_norm": 4.716839962497533, + "learning_rate": 1.643289401763648e-06, + "loss": 0.2174, + "num_input_tokens_seen": 100762816, + "step": 15055 + }, + { + "epoch": 1.8450583443079844, + "grad_norm": 1.7135442574802042, + "learning_rate": 1.6304361331597074e-06, + "loss": 0.2164, + "num_input_tokens_seen": 100796032, + "step": 15060 + }, + { + "epoch": 1.845670882974488, + "grad_norm": 1.3209369304872627, + "learning_rate": 1.6176324959316314e-06, + "loss": 0.2185, + "num_input_tokens_seen": 100829824, + "step": 15065 + }, + { + "epoch": 1.8462834216409911, + "grad_norm": 1.1311058815202513, + "learning_rate": 1.6048785032170443e-06, + "loss": 0.1937, + "num_input_tokens_seen": 100863496, + "step": 15070 + }, + { + "epoch": 1.8468959603074944, + "grad_norm": 2.868557767191352, + "learning_rate": 1.592174168102667e-06, + "loss": 0.2065, + "num_input_tokens_seen": 100897264, + "step": 15075 + }, + { + "epoch": 1.8475084989739976, + "grad_norm": 1.3377272735774426, + "learning_rate": 1.579519503624255e-06, + "loss": 0.2037, + "num_input_tokens_seen": 100931256, + "step": 15080 + }, + { + "epoch": 1.848121037640501, + "grad_norm": 1.6254624933554642, + "learning_rate": 1.566914522766577e-06, + "loss": 0.2094, + "num_input_tokens_seen": 100965144, + "step": 15085 + }, + { + "epoch": 1.8487335763070045, + "grad_norm": 1.905983597163492, + "learning_rate": 1.5543592384634486e-06, + "loss": 0.1951, + "num_input_tokens_seen": 100998808, + "step": 15090 + }, + { + "epoch": 1.8493461149735078, + "grad_norm": 1.5179036111771658, + "learning_rate": 1.541853663597692e-06, + "loss": 0.2197, + "num_input_tokens_seen": 101032848, + "step": 15095 + }, + { + "epoch": 1.849958653640011, + "grad_norm": 1.8628438015214015, + "learning_rate": 1.5293978110010932e-06, + "loss": 0.2087, + "num_input_tokens_seen": 101066288, + "step": 15100 + }, + { + "epoch": 1.849958653640011, + "eval_loss": 0.07200353592634201, + "eval_runtime": 19.1253, + "eval_samples_per_second": 3.137, + "eval_steps_per_second": 0.784, + "num_input_tokens_seen": 101066288, + "step": 15100 + }, + { + "epoch": 1.8505711923065142, + "grad_norm": 0.8238166443209088, + "learning_rate": 1.516991693454445e-06, + "loss": 0.1724, + "num_input_tokens_seen": 101100024, + "step": 15105 + }, + { + "epoch": 1.8511837309730177, + "grad_norm": 1.6572488110346437, + "learning_rate": 1.5046353236875143e-06, + "loss": 0.211, + "num_input_tokens_seen": 101133448, + "step": 15110 + }, + { + "epoch": 1.8517962696395212, + "grad_norm": 1.65853409818662, + "learning_rate": 1.4923287143790043e-06, + "loss": 0.2188, + "num_input_tokens_seen": 101167424, + "step": 15115 + }, + { + "epoch": 1.8524088083060244, + "grad_norm": 1.446905632369376, + "learning_rate": 1.4800718781565637e-06, + "loss": 0.2137, + "num_input_tokens_seen": 101201784, + "step": 15120 + }, + { + "epoch": 1.8530213469725276, + "grad_norm": 1.572874919775131, + "learning_rate": 1.4678648275967654e-06, + "loss": 0.2083, + "num_input_tokens_seen": 101234824, + "step": 15125 + }, + { + "epoch": 1.8536338856390309, + "grad_norm": 1.717174109651591, + "learning_rate": 1.4557075752251125e-06, + "loss": 0.1964, + "num_input_tokens_seen": 101268296, + "step": 15130 + }, + { + "epoch": 1.8542464243055343, + "grad_norm": 1.4133732630311646, + "learning_rate": 1.4436001335160044e-06, + "loss": 0.2402, + "num_input_tokens_seen": 101300984, + "step": 15135 + }, + { + "epoch": 1.8548589629720376, + "grad_norm": 1.832950295678816, + "learning_rate": 1.4315425148927198e-06, + "loss": 0.2117, + "num_input_tokens_seen": 101334096, + "step": 15140 + }, + { + "epoch": 1.855471501638541, + "grad_norm": 27.980477729781537, + "learning_rate": 1.4195347317274287e-06, + "loss": 0.2101, + "num_input_tokens_seen": 101367920, + "step": 15145 + }, + { + "epoch": 1.8560840403050443, + "grad_norm": 1.353133114815059, + "learning_rate": 1.4075767963411645e-06, + "loss": 0.1811, + "num_input_tokens_seen": 101401720, + "step": 15150 + }, + { + "epoch": 1.8560840403050443, + "eval_loss": 0.09131244570016861, + "eval_runtime": 19.1074, + "eval_samples_per_second": 3.14, + "eval_steps_per_second": 0.785, + "num_input_tokens_seen": 101401720, + "step": 15150 + }, + { + "epoch": 1.8566965789715475, + "grad_norm": 2.036161556060932, + "learning_rate": 1.3956687210037955e-06, + "loss": 0.188, + "num_input_tokens_seen": 101435536, + "step": 15155 + }, + { + "epoch": 1.857309117638051, + "grad_norm": 1.7853405149422736, + "learning_rate": 1.3838105179340477e-06, + "loss": 0.2047, + "num_input_tokens_seen": 101469400, + "step": 15160 + }, + { + "epoch": 1.8579216563045542, + "grad_norm": 1.2350420479922637, + "learning_rate": 1.3720021992994714e-06, + "loss": 0.2097, + "num_input_tokens_seen": 101502592, + "step": 15165 + }, + { + "epoch": 1.8585341949710577, + "grad_norm": 1.5478589838590677, + "learning_rate": 1.36024377721643e-06, + "loss": 0.2079, + "num_input_tokens_seen": 101535872, + "step": 15170 + }, + { + "epoch": 1.8591467336375609, + "grad_norm": 1.3316791783191904, + "learning_rate": 1.3485352637500782e-06, + "loss": 0.2412, + "num_input_tokens_seen": 101569360, + "step": 15175 + }, + { + "epoch": 1.8597592723040641, + "grad_norm": 1.7526105087544281, + "learning_rate": 1.336876670914372e-06, + "loss": 0.19, + "num_input_tokens_seen": 101603224, + "step": 15180 + }, + { + "epoch": 1.8603718109705674, + "grad_norm": 1.1626798451872897, + "learning_rate": 1.3252680106720428e-06, + "loss": 0.1842, + "num_input_tokens_seen": 101636856, + "step": 15185 + }, + { + "epoch": 1.8609843496370708, + "grad_norm": 1.3210613301146434, + "learning_rate": 1.3137092949345785e-06, + "loss": 0.2077, + "num_input_tokens_seen": 101670752, + "step": 15190 + }, + { + "epoch": 1.8615968883035743, + "grad_norm": 1.6792087020116686, + "learning_rate": 1.3022005355622314e-06, + "loss": 0.2215, + "num_input_tokens_seen": 101703784, + "step": 15195 + }, + { + "epoch": 1.8622094269700775, + "grad_norm": 1.2483309284828557, + "learning_rate": 1.2907417443639936e-06, + "loss": 0.1989, + "num_input_tokens_seen": 101737120, + "step": 15200 + }, + { + "epoch": 1.8622094269700775, + "eval_loss": 0.0902518779039383, + "eval_runtime": 19.0075, + "eval_samples_per_second": 3.157, + "eval_steps_per_second": 0.789, + "num_input_tokens_seen": 101737120, + "step": 15200 + }, + { + "epoch": 1.8628219656365808, + "grad_norm": 1.2830340885373985, + "learning_rate": 1.2793329330975711e-06, + "loss": 0.2134, + "num_input_tokens_seen": 101770368, + "step": 15205 + }, + { + "epoch": 1.863434504303084, + "grad_norm": 1.6241138737313565, + "learning_rate": 1.2679741134694e-06, + "loss": 0.2126, + "num_input_tokens_seen": 101803752, + "step": 15210 + }, + { + "epoch": 1.8640470429695875, + "grad_norm": 1.456943430167647, + "learning_rate": 1.2566652971346126e-06, + "loss": 0.2103, + "num_input_tokens_seen": 101837112, + "step": 15215 + }, + { + "epoch": 1.864659581636091, + "grad_norm": 1.1279341236041318, + "learning_rate": 1.245406495697038e-06, + "loss": 0.2127, + "num_input_tokens_seen": 101870472, + "step": 15220 + }, + { + "epoch": 1.8652721203025941, + "grad_norm": 2.537504525334616, + "learning_rate": 1.2341977207091904e-06, + "loss": 0.1788, + "num_input_tokens_seen": 101904464, + "step": 15225 + }, + { + "epoch": 1.8658846589690974, + "grad_norm": 1.3020968429074677, + "learning_rate": 1.2230389836722422e-06, + "loss": 0.1633, + "num_input_tokens_seen": 101938384, + "step": 15230 + }, + { + "epoch": 1.8664971976356006, + "grad_norm": 1.7161684549269949, + "learning_rate": 1.211930296036029e-06, + "loss": 0.2051, + "num_input_tokens_seen": 101972312, + "step": 15235 + }, + { + "epoch": 1.867109736302104, + "grad_norm": 2.2086390927799364, + "learning_rate": 1.2008716691990329e-06, + "loss": 0.2214, + "num_input_tokens_seen": 102006056, + "step": 15240 + }, + { + "epoch": 1.8677222749686075, + "grad_norm": 1.8187610734570117, + "learning_rate": 1.189863114508355e-06, + "loss": 0.1999, + "num_input_tokens_seen": 102038824, + "step": 15245 + }, + { + "epoch": 1.8683348136351108, + "grad_norm": 1.9808790104704663, + "learning_rate": 1.1789046432597427e-06, + "loss": 0.217, + "num_input_tokens_seen": 102072288, + "step": 15250 + }, + { + "epoch": 1.8683348136351108, + "eval_loss": 0.08231386542320251, + "eval_runtime": 18.9618, + "eval_samples_per_second": 3.164, + "eval_steps_per_second": 0.791, + "num_input_tokens_seen": 102072288, + "step": 15250 + }, + { + "epoch": 1.868947352301614, + "grad_norm": 1.4983453868148284, + "learning_rate": 1.1679962666975296e-06, + "loss": 0.1987, + "num_input_tokens_seen": 102106056, + "step": 15255 + }, + { + "epoch": 1.8695598909681173, + "grad_norm": 1.4483262745666323, + "learning_rate": 1.1571379960146622e-06, + "loss": 0.2353, + "num_input_tokens_seen": 102139288, + "step": 15260 + }, + { + "epoch": 1.8701724296346207, + "grad_norm": 1.707247795047457, + "learning_rate": 1.1463298423526725e-06, + "loss": 0.2193, + "num_input_tokens_seen": 102172456, + "step": 15265 + }, + { + "epoch": 1.8707849683011242, + "grad_norm": 2.1289611068486867, + "learning_rate": 1.135571816801656e-06, + "loss": 0.1866, + "num_input_tokens_seen": 102205864, + "step": 15270 + }, + { + "epoch": 1.8713975069676274, + "grad_norm": 1.7628935319687016, + "learning_rate": 1.1248639304002996e-06, + "loss": 0.2008, + "num_input_tokens_seen": 102239224, + "step": 15275 + }, + { + "epoch": 1.8720100456341306, + "grad_norm": 1.4132378778504495, + "learning_rate": 1.1142061941358085e-06, + "loss": 0.199, + "num_input_tokens_seen": 102272672, + "step": 15280 + }, + { + "epoch": 1.8726225843006339, + "grad_norm": 1.5613613626572878, + "learning_rate": 1.1035986189439517e-06, + "loss": 0.2167, + "num_input_tokens_seen": 102305384, + "step": 15285 + }, + { + "epoch": 1.8732351229671373, + "grad_norm": 1.4663688902975978, + "learning_rate": 1.0930412157090175e-06, + "loss": 0.2107, + "num_input_tokens_seen": 102339072, + "step": 15290 + }, + { + "epoch": 1.8738476616336406, + "grad_norm": 1.537654969241933, + "learning_rate": 1.0825339952638346e-06, + "loss": 0.2095, + "num_input_tokens_seen": 102372232, + "step": 15295 + }, + { + "epoch": 1.874460200300144, + "grad_norm": 1.3406298459761123, + "learning_rate": 1.072076968389707e-06, + "loss": 0.1836, + "num_input_tokens_seen": 102406408, + "step": 15300 + }, + { + "epoch": 1.874460200300144, + "eval_loss": 0.09559149295091629, + "eval_runtime": 18.6588, + "eval_samples_per_second": 3.216, + "eval_steps_per_second": 0.804, + "num_input_tokens_seen": 102406408, + "step": 15300 + }, + { + "epoch": 1.8750727389666473, + "grad_norm": 1.1714181549175136, + "learning_rate": 1.061670145816457e-06, + "loss": 0.1698, + "num_input_tokens_seen": 102440880, + "step": 15305 + }, + { + "epoch": 1.8756852776331505, + "grad_norm": 1.5071449238053214, + "learning_rate": 1.0513135382224037e-06, + "loss": 0.1908, + "num_input_tokens_seen": 102475112, + "step": 15310 + }, + { + "epoch": 1.8762978162996538, + "grad_norm": 1.2568891768786765, + "learning_rate": 1.0410071562343026e-06, + "loss": 0.1934, + "num_input_tokens_seen": 102508888, + "step": 15315 + }, + { + "epoch": 1.8769103549661572, + "grad_norm": 1.3720517539983024, + "learning_rate": 1.0307510104274165e-06, + "loss": 0.2157, + "num_input_tokens_seen": 102542336, + "step": 15320 + }, + { + "epoch": 1.8775228936326607, + "grad_norm": 1.3562335351869013, + "learning_rate": 1.0205451113254327e-06, + "loss": 0.2349, + "num_input_tokens_seen": 102575480, + "step": 15325 + }, + { + "epoch": 1.878135432299164, + "grad_norm": 1.1084746910805012, + "learning_rate": 1.0103894694004968e-06, + "loss": 0.1854, + "num_input_tokens_seen": 102609328, + "step": 15330 + }, + { + "epoch": 1.8787479709656671, + "grad_norm": 1.6477858943174999, + "learning_rate": 1.0002840950731728e-06, + "loss": 0.167, + "num_input_tokens_seen": 102643024, + "step": 15335 + }, + { + "epoch": 1.8793605096321704, + "grad_norm": 1.5316379923824295, + "learning_rate": 9.902289987124612e-07, + "loss": 0.1919, + "num_input_tokens_seen": 102677064, + "step": 15340 + }, + { + "epoch": 1.8799730482986738, + "grad_norm": 1.4967003596300184, + "learning_rate": 9.80224190635759e-07, + "loss": 0.1832, + "num_input_tokens_seen": 102710648, + "step": 15345 + }, + { + "epoch": 1.8805855869651773, + "grad_norm": 1.5460778790919527, + "learning_rate": 9.702696811088652e-07, + "loss": 0.1769, + "num_input_tokens_seen": 102744360, + "step": 15350 + }, + { + "epoch": 1.8805855869651773, + "eval_loss": 0.08247771114110947, + "eval_runtime": 19.0935, + "eval_samples_per_second": 3.142, + "eval_steps_per_second": 0.786, + "num_input_tokens_seen": 102744360, + "step": 15350 + }, + { + "epoch": 1.8811981256316805, + "grad_norm": 1.875225135748629, + "learning_rate": 9.60365480345976e-07, + "loss": 0.1963, + "num_input_tokens_seen": 102778024, + "step": 15355 + }, + { + "epoch": 1.8818106642981838, + "grad_norm": 1.4613617210342222, + "learning_rate": 9.50511598509668e-07, + "loss": 0.1971, + "num_input_tokens_seen": 102812400, + "step": 15360 + }, + { + "epoch": 1.882423202964687, + "grad_norm": 1.3593908349885244, + "learning_rate": 9.407080457108753e-07, + "loss": 0.223, + "num_input_tokens_seen": 102845576, + "step": 15365 + }, + { + "epoch": 1.8830357416311905, + "grad_norm": 10.110559925948047, + "learning_rate": 9.309548320088902e-07, + "loss": 0.2699, + "num_input_tokens_seen": 102878696, + "step": 15370 + }, + { + "epoch": 1.883648280297694, + "grad_norm": 1.6617732243528827, + "learning_rate": 9.212519674113573e-07, + "loss": 0.2078, + "num_input_tokens_seen": 102911816, + "step": 15375 + }, + { + "epoch": 1.8842608189641972, + "grad_norm": 2.2296355789019047, + "learning_rate": 9.115994618742685e-07, + "loss": 0.2417, + "num_input_tokens_seen": 102944840, + "step": 15380 + }, + { + "epoch": 1.8848733576307004, + "grad_norm": 1.5251808056384764, + "learning_rate": 9.019973253019287e-07, + "loss": 0.2054, + "num_input_tokens_seen": 102978360, + "step": 15385 + }, + { + "epoch": 1.8854858962972036, + "grad_norm": 1.7472516778067144, + "learning_rate": 8.924455675469622e-07, + "loss": 0.1522, + "num_input_tokens_seen": 103013184, + "step": 15390 + }, + { + "epoch": 1.886098434963707, + "grad_norm": 2.0608959355574794, + "learning_rate": 8.829441984103016e-07, + "loss": 0.2204, + "num_input_tokens_seen": 103046528, + "step": 15395 + }, + { + "epoch": 1.8867109736302106, + "grad_norm": 1.4291652977898142, + "learning_rate": 8.734932276411812e-07, + "loss": 0.1906, + "num_input_tokens_seen": 103079768, + "step": 15400 + }, + { + "epoch": 1.8867109736302106, + "eval_loss": 0.10278834402561188, + "eval_runtime": 19.1999, + "eval_samples_per_second": 3.125, + "eval_steps_per_second": 0.781, + "num_input_tokens_seen": 103079768, + "step": 15400 + }, + { + "epoch": 1.8873235122967138, + "grad_norm": 3.0170449614595043, + "learning_rate": 8.640926649371162e-07, + "loss": 0.2313, + "num_input_tokens_seen": 103112608, + "step": 15405 + }, + { + "epoch": 1.887936050963217, + "grad_norm": 1.4665284352308667, + "learning_rate": 8.547425199439019e-07, + "loss": 0.2245, + "num_input_tokens_seen": 103146728, + "step": 15410 + }, + { + "epoch": 1.8885485896297203, + "grad_norm": 1.9155721063846798, + "learning_rate": 8.454428022556027e-07, + "loss": 0.2345, + "num_input_tokens_seen": 103179448, + "step": 15415 + }, + { + "epoch": 1.8891611282962237, + "grad_norm": 1.6373751493392241, + "learning_rate": 8.3619352141453e-07, + "loss": 0.2319, + "num_input_tokens_seen": 103212696, + "step": 15420 + }, + { + "epoch": 1.889773666962727, + "grad_norm": 1.117147347942445, + "learning_rate": 8.269946869112478e-07, + "loss": 0.1937, + "num_input_tokens_seen": 103246288, + "step": 15425 + }, + { + "epoch": 1.8903862056292304, + "grad_norm": 1.4877438320875245, + "learning_rate": 8.178463081845722e-07, + "loss": 0.2249, + "num_input_tokens_seen": 103279592, + "step": 15430 + }, + { + "epoch": 1.8909987442957337, + "grad_norm": 1.393839535509293, + "learning_rate": 8.08748394621528e-07, + "loss": 0.2035, + "num_input_tokens_seen": 103312920, + "step": 15435 + }, + { + "epoch": 1.891611282962237, + "grad_norm": 1.7159835857687258, + "learning_rate": 7.997009555573642e-07, + "loss": 0.1801, + "num_input_tokens_seen": 103346728, + "step": 15440 + }, + { + "epoch": 1.8922238216287404, + "grad_norm": 1.9605173447606772, + "learning_rate": 7.907040002755383e-07, + "loss": 0.2356, + "num_input_tokens_seen": 103379768, + "step": 15445 + }, + { + "epoch": 1.8928363602952436, + "grad_norm": 1.2952267298535554, + "learning_rate": 7.817575380077047e-07, + "loss": 0.2168, + "num_input_tokens_seen": 103412640, + "step": 15450 + }, + { + "epoch": 1.8928363602952436, + "eval_loss": 0.07886143028736115, + "eval_runtime": 18.8952, + "eval_samples_per_second": 3.175, + "eval_steps_per_second": 0.794, + "num_input_tokens_seen": 103412640, + "step": 15450 + }, + { + "epoch": 1.893448898961747, + "grad_norm": 1.2338016673745584, + "learning_rate": 7.728615779337256e-07, + "loss": 0.2313, + "num_input_tokens_seen": 103445512, + "step": 15455 + }, + { + "epoch": 1.8940614376282503, + "grad_norm": 0.9081313685580731, + "learning_rate": 7.640161291816161e-07, + "loss": 0.1936, + "num_input_tokens_seen": 103479464, + "step": 15460 + }, + { + "epoch": 1.8946739762947535, + "grad_norm": 1.4753399437693233, + "learning_rate": 7.552212008275772e-07, + "loss": 0.2039, + "num_input_tokens_seen": 103513600, + "step": 15465 + }, + { + "epoch": 1.8952865149612568, + "grad_norm": 1.903759448501201, + "learning_rate": 7.464768018959733e-07, + "loss": 0.2087, + "num_input_tokens_seen": 103547448, + "step": 15470 + }, + { + "epoch": 1.8958990536277602, + "grad_norm": 2.018440377263653, + "learning_rate": 7.377829413593163e-07, + "loss": 0.2327, + "num_input_tokens_seen": 103580760, + "step": 15475 + }, + { + "epoch": 1.8965115922942637, + "grad_norm": 1.2459459864067945, + "learning_rate": 7.29139628138259e-07, + "loss": 0.156, + "num_input_tokens_seen": 103614760, + "step": 15480 + }, + { + "epoch": 1.897124130960767, + "grad_norm": 2.431372730177805, + "learning_rate": 7.205468711015961e-07, + "loss": 0.1996, + "num_input_tokens_seen": 103648480, + "step": 15485 + }, + { + "epoch": 1.8977366696272702, + "grad_norm": 1.4536749988640498, + "learning_rate": 7.120046790662471e-07, + "loss": 0.178, + "num_input_tokens_seen": 103682216, + "step": 15490 + }, + { + "epoch": 1.8983492082937734, + "grad_norm": 1.335020427368712, + "learning_rate": 7.03513060797234e-07, + "loss": 0.2103, + "num_input_tokens_seen": 103715888, + "step": 15495 + }, + { + "epoch": 1.8989617469602769, + "grad_norm": 2.024792547316244, + "learning_rate": 6.950720250076981e-07, + "loss": 0.2276, + "num_input_tokens_seen": 103749376, + "step": 15500 + }, + { + "epoch": 1.8989617469602769, + "eval_loss": 0.11778271198272705, + "eval_runtime": 19.2999, + "eval_samples_per_second": 3.109, + "eval_steps_per_second": 0.777, + "num_input_tokens_seen": 103749376, + "step": 15500 + }, + { + "epoch": 1.8995742856267803, + "grad_norm": 1.80803476563642, + "learning_rate": 6.866815803588833e-07, + "loss": 0.156, + "num_input_tokens_seen": 103783288, + "step": 15505 + }, + { + "epoch": 1.9001868242932836, + "grad_norm": 1.4657515362280793, + "learning_rate": 6.783417354601029e-07, + "loss": 0.1895, + "num_input_tokens_seen": 103816976, + "step": 15510 + }, + { + "epoch": 1.9007993629597868, + "grad_norm": 1.7781311527797654, + "learning_rate": 6.700524988687673e-07, + "loss": 0.217, + "num_input_tokens_seen": 103850120, + "step": 15515 + }, + { + "epoch": 1.90141190162629, + "grad_norm": 1.8418917337046703, + "learning_rate": 6.618138790903561e-07, + "loss": 0.2099, + "num_input_tokens_seen": 103883680, + "step": 15520 + }, + { + "epoch": 1.9020244402927935, + "grad_norm": 1.9671975142454599, + "learning_rate": 6.536258845784127e-07, + "loss": 0.1916, + "num_input_tokens_seen": 103917296, + "step": 15525 + }, + { + "epoch": 1.902636978959297, + "grad_norm": 1.58890342193607, + "learning_rate": 6.454885237345166e-07, + "loss": 0.1995, + "num_input_tokens_seen": 103950536, + "step": 15530 + }, + { + "epoch": 1.9032495176258002, + "grad_norm": 1.862138710505922, + "learning_rate": 6.374018049083219e-07, + "loss": 0.2049, + "num_input_tokens_seen": 103983136, + "step": 15535 + }, + { + "epoch": 1.9038620562923034, + "grad_norm": 1.4740677422954502, + "learning_rate": 6.29365736397497e-07, + "loss": 0.2146, + "num_input_tokens_seen": 104016528, + "step": 15540 + }, + { + "epoch": 1.9044745949588067, + "grad_norm": 1.5427036413142712, + "learning_rate": 6.213803264477513e-07, + "loss": 0.1904, + "num_input_tokens_seen": 104050032, + "step": 15545 + }, + { + "epoch": 1.9050871336253101, + "grad_norm": 1.5470884508357212, + "learning_rate": 6.13445583252803e-07, + "loss": 0.1944, + "num_input_tokens_seen": 104083336, + "step": 15550 + }, + { + "epoch": 1.9050871336253101, + "eval_loss": 0.11403975635766983, + "eval_runtime": 18.6486, + "eval_samples_per_second": 3.217, + "eval_steps_per_second": 0.804, + "num_input_tokens_seen": 104083336, + "step": 15550 + }, + { + "epoch": 1.9056996722918136, + "grad_norm": 1.7424617038137795, + "learning_rate": 6.055615149543892e-07, + "loss": 0.2264, + "num_input_tokens_seen": 104116376, + "step": 15555 + }, + { + "epoch": 1.9063122109583168, + "grad_norm": 1.6220703041308882, + "learning_rate": 5.97728129642261e-07, + "loss": 0.1904, + "num_input_tokens_seen": 104150184, + "step": 15560 + }, + { + "epoch": 1.90692474962482, + "grad_norm": 1.319809432719218, + "learning_rate": 5.899454353541389e-07, + "loss": 0.2119, + "num_input_tokens_seen": 104183632, + "step": 15565 + }, + { + "epoch": 1.9075372882913233, + "grad_norm": 1.8298245368313333, + "learning_rate": 5.822134400757462e-07, + "loss": 0.1957, + "num_input_tokens_seen": 104217184, + "step": 15570 + }, + { + "epoch": 1.9081498269578268, + "grad_norm": 1.4744445636064405, + "learning_rate": 5.745321517407864e-07, + "loss": 0.2181, + "num_input_tokens_seen": 104250656, + "step": 15575 + }, + { + "epoch": 1.90876236562433, + "grad_norm": 1.3946099658212772, + "learning_rate": 5.669015782309273e-07, + "loss": 0.2237, + "num_input_tokens_seen": 104283904, + "step": 15580 + }, + { + "epoch": 1.9093749042908335, + "grad_norm": 2.2015585228387455, + "learning_rate": 5.593217273757945e-07, + "loss": 0.2018, + "num_input_tokens_seen": 104317416, + "step": 15585 + }, + { + "epoch": 1.9099874429573367, + "grad_norm": 1.4886852162981707, + "learning_rate": 5.517926069529777e-07, + "loss": 0.2093, + "num_input_tokens_seen": 104350752, + "step": 15590 + }, + { + "epoch": 1.91059998162384, + "grad_norm": 1.8060821499923954, + "learning_rate": 5.44314224688014e-07, + "loss": 0.2168, + "num_input_tokens_seen": 104384184, + "step": 15595 + }, + { + "epoch": 1.9112125202903432, + "grad_norm": 1.4495833615170242, + "learning_rate": 5.368865882543705e-07, + "loss": 0.1814, + "num_input_tokens_seen": 104417928, + "step": 15600 + }, + { + "epoch": 1.9112125202903432, + "eval_loss": 0.11014119535684586, + "eval_runtime": 19.1701, + "eval_samples_per_second": 3.13, + "eval_steps_per_second": 0.782, + "num_input_tokens_seen": 104417928, + "step": 15600 + }, + { + "epoch": 1.9118250589568466, + "grad_norm": 1.6064155372746778, + "learning_rate": 5.295097052734454e-07, + "loss": 0.2189, + "num_input_tokens_seen": 104451328, + "step": 15605 + }, + { + "epoch": 1.91243759762335, + "grad_norm": 1.59901361876775, + "learning_rate": 5.221835833145561e-07, + "loss": 0.1891, + "num_input_tokens_seen": 104484896, + "step": 15610 + }, + { + "epoch": 1.9130501362898533, + "grad_norm": 1.2103740355333097, + "learning_rate": 5.149082298949559e-07, + "loss": 0.1933, + "num_input_tokens_seen": 104518728, + "step": 15615 + }, + { + "epoch": 1.9136626749563566, + "grad_norm": 1.5349366551363557, + "learning_rate": 5.076836524797735e-07, + "loss": 0.2248, + "num_input_tokens_seen": 104552224, + "step": 15620 + }, + { + "epoch": 1.9142752136228598, + "grad_norm": 1.1731833955830824, + "learning_rate": 5.005098584820622e-07, + "loss": 0.1867, + "num_input_tokens_seen": 104585848, + "step": 15625 + }, + { + "epoch": 1.9148877522893633, + "grad_norm": 1.5939217688269798, + "learning_rate": 4.933868552627563e-07, + "loss": 0.2197, + "num_input_tokens_seen": 104619440, + "step": 15630 + }, + { + "epoch": 1.9155002909558667, + "grad_norm": 1.5737317710390848, + "learning_rate": 4.863146501306759e-07, + "loss": 0.2361, + "num_input_tokens_seen": 104652296, + "step": 15635 + }, + { + "epoch": 1.91611282962237, + "grad_norm": 1.9453617260949578, + "learning_rate": 4.792932503425163e-07, + "loss": 0.2343, + "num_input_tokens_seen": 104685712, + "step": 15640 + }, + { + "epoch": 1.9167253682888732, + "grad_norm": 1.4868560800347654, + "learning_rate": 4.7232266310284214e-07, + "loss": 0.2473, + "num_input_tokens_seen": 104719336, + "step": 15645 + }, + { + "epoch": 1.9173379069553764, + "grad_norm": 1.7969286635187933, + "learning_rate": 4.6540289556409323e-07, + "loss": 0.1527, + "num_input_tokens_seen": 104753392, + "step": 15650 + }, + { + "epoch": 1.9173379069553764, + "eval_loss": 0.09997235238552094, + "eval_runtime": 18.9585, + "eval_samples_per_second": 3.165, + "eval_steps_per_second": 0.791, + "num_input_tokens_seen": 104753392, + "step": 15650 + }, + { + "epoch": 1.9179504456218799, + "grad_norm": 1.3189538230538573, + "learning_rate": 4.5853395482653995e-07, + "loss": 0.1524, + "num_input_tokens_seen": 104787432, + "step": 15655 + }, + { + "epoch": 1.9185629842883833, + "grad_norm": 1.4761773853226985, + "learning_rate": 4.5171584793831657e-07, + "loss": 0.2254, + "num_input_tokens_seen": 104820256, + "step": 15660 + }, + { + "epoch": 1.9191755229548866, + "grad_norm": 1.5267071340464822, + "learning_rate": 4.4494858189539356e-07, + "loss": 0.2048, + "num_input_tokens_seen": 104853472, + "step": 15665 + }, + { + "epoch": 1.9197880616213898, + "grad_norm": 0.8726638547718252, + "learning_rate": 4.3823216364157207e-07, + "loss": 0.1861, + "num_input_tokens_seen": 104887064, + "step": 15670 + }, + { + "epoch": 1.920400600287893, + "grad_norm": 1.7453112715854129, + "learning_rate": 4.315666000684837e-07, + "loss": 0.2233, + "num_input_tokens_seen": 104920344, + "step": 15675 + }, + { + "epoch": 1.9210131389543965, + "grad_norm": 1.827706184336836, + "learning_rate": 4.2495189801557424e-07, + "loss": 0.1858, + "num_input_tokens_seen": 104954688, + "step": 15680 + }, + { + "epoch": 1.9216256776209, + "grad_norm": 1.4039970449450245, + "learning_rate": 4.183880642701088e-07, + "loss": 0.1971, + "num_input_tokens_seen": 104987976, + "step": 15685 + }, + { + "epoch": 1.9222382162874032, + "grad_norm": 1.5975375118811985, + "learning_rate": 4.1187510556713884e-07, + "loss": 0.1947, + "num_input_tokens_seen": 105021296, + "step": 15690 + }, + { + "epoch": 1.9228507549539064, + "grad_norm": 1.5750224011203242, + "learning_rate": 4.0541302858953525e-07, + "loss": 0.2465, + "num_input_tokens_seen": 105054192, + "step": 15695 + }, + { + "epoch": 1.9234632936204097, + "grad_norm": 1.5624986183883312, + "learning_rate": 3.9900183996794405e-07, + "loss": 0.2245, + "num_input_tokens_seen": 105087584, + "step": 15700 + }, + { + "epoch": 1.9234632936204097, + "eval_loss": 0.11582653969526291, + "eval_runtime": 19.1235, + "eval_samples_per_second": 3.137, + "eval_steps_per_second": 0.784, + "num_input_tokens_seen": 105087584, + "step": 15700 + }, + { + "epoch": 1.9240758322869131, + "grad_norm": 1.3284045392475317, + "learning_rate": 3.926415462808086e-07, + "loss": 0.1691, + "num_input_tokens_seen": 105122072, + "step": 15705 + }, + { + "epoch": 1.9246883709534164, + "grad_norm": 1.7101198105918805, + "learning_rate": 3.8633215405434185e-07, + "loss": 0.2275, + "num_input_tokens_seen": 105155088, + "step": 15710 + }, + { + "epoch": 1.9253009096199198, + "grad_norm": 1.7183457758500347, + "learning_rate": 3.8007366976252623e-07, + "loss": 0.2168, + "num_input_tokens_seen": 105188144, + "step": 15715 + }, + { + "epoch": 1.925913448286423, + "grad_norm": 0.9065333327967781, + "learning_rate": 3.7386609982711373e-07, + "loss": 0.1978, + "num_input_tokens_seen": 105221752, + "step": 15720 + }, + { + "epoch": 1.9265259869529263, + "grad_norm": 1.0761616450791043, + "learning_rate": 3.6770945061760377e-07, + "loss": 0.1928, + "num_input_tokens_seen": 105255528, + "step": 15725 + }, + { + "epoch": 1.9271385256194298, + "grad_norm": 1.1696159420224124, + "learning_rate": 3.6160372845125414e-07, + "loss": 0.2134, + "num_input_tokens_seen": 105288824, + "step": 15730 + }, + { + "epoch": 1.927751064285933, + "grad_norm": 1.4354456628614338, + "learning_rate": 3.5554893959308113e-07, + "loss": 0.2446, + "num_input_tokens_seen": 105322376, + "step": 15735 + }, + { + "epoch": 1.9283636029524365, + "grad_norm": 1.289458368547965, + "learning_rate": 3.4954509025580393e-07, + "loss": 0.2499, + "num_input_tokens_seen": 105355488, + "step": 15740 + }, + { + "epoch": 1.9289761416189397, + "grad_norm": 1.394873521974337, + "learning_rate": 3.435921865999059e-07, + "loss": 0.1947, + "num_input_tokens_seen": 105389224, + "step": 15745 + }, + { + "epoch": 1.929588680285443, + "grad_norm": 0.9468992699564943, + "learning_rate": 3.3769023473357866e-07, + "loss": 0.209, + "num_input_tokens_seen": 105422368, + "step": 15750 + }, + { + "epoch": 1.929588680285443, + "eval_loss": 0.11289756000041962, + "eval_runtime": 19.1686, + "eval_samples_per_second": 3.13, + "eval_steps_per_second": 0.783, + "num_input_tokens_seen": 105422368, + "step": 15750 + }, + { + "epoch": 1.9302012189519462, + "grad_norm": 1.4160613090569896, + "learning_rate": 3.3183924071274464e-07, + "loss": 0.2341, + "num_input_tokens_seen": 105455608, + "step": 15755 + }, + { + "epoch": 1.9308137576184496, + "grad_norm": 1.8716854404919372, + "learning_rate": 3.260392105410182e-07, + "loss": 0.2335, + "num_input_tokens_seen": 105488952, + "step": 15760 + }, + { + "epoch": 1.931426296284953, + "grad_norm": 1.1124352652030178, + "learning_rate": 3.202901501697497e-07, + "loss": 0.186, + "num_input_tokens_seen": 105521848, + "step": 15765 + }, + { + "epoch": 1.9320388349514563, + "grad_norm": 1.125754601526746, + "learning_rate": 3.145920654979706e-07, + "loss": 0.1425, + "num_input_tokens_seen": 105556056, + "step": 15770 + }, + { + "epoch": 1.9326513736179596, + "grad_norm": 0.9652158738945942, + "learning_rate": 3.0894496237240387e-07, + "loss": 0.1663, + "num_input_tokens_seen": 105589936, + "step": 15775 + }, + { + "epoch": 1.9332639122844628, + "grad_norm": 1.5329939068345508, + "learning_rate": 3.033488465874701e-07, + "loss": 0.1754, + "num_input_tokens_seen": 105623640, + "step": 15780 + }, + { + "epoch": 1.9338764509509663, + "grad_norm": 0.9803736788894238, + "learning_rate": 2.97803723885276e-07, + "loss": 0.199, + "num_input_tokens_seen": 105657280, + "step": 15785 + }, + { + "epoch": 1.9344889896174697, + "grad_norm": 1.5672833949359486, + "learning_rate": 2.9230959995559804e-07, + "loss": 0.2091, + "num_input_tokens_seen": 105690576, + "step": 15790 + }, + { + "epoch": 1.935101528283973, + "grad_norm": 1.226435851820831, + "learning_rate": 2.8686648043588227e-07, + "loss": 0.2042, + "num_input_tokens_seen": 105724272, + "step": 15795 + }, + { + "epoch": 1.9357140669504762, + "grad_norm": 1.9384393349227378, + "learning_rate": 2.8147437091123885e-07, + "loss": 0.2102, + "num_input_tokens_seen": 105757576, + "step": 15800 + }, + { + "epoch": 1.9357140669504762, + "eval_loss": 0.13141392171382904, + "eval_runtime": 18.776, + "eval_samples_per_second": 3.196, + "eval_steps_per_second": 0.799, + "num_input_tokens_seen": 105757576, + "step": 15800 + }, + { + "epoch": 1.9363266056169794, + "grad_norm": 1.4512763278886014, + "learning_rate": 2.7613327691444756e-07, + "loss": 0.1666, + "num_input_tokens_seen": 105791544, + "step": 15805 + }, + { + "epoch": 1.936939144283483, + "grad_norm": 1.1943436201344462, + "learning_rate": 2.7084320392593564e-07, + "loss": 0.1876, + "num_input_tokens_seen": 105825424, + "step": 15810 + }, + { + "epoch": 1.9375516829499864, + "grad_norm": 1.6090927691387107, + "learning_rate": 2.656041573737722e-07, + "loss": 0.1917, + "num_input_tokens_seen": 105858992, + "step": 15815 + }, + { + "epoch": 1.9381642216164896, + "grad_norm": 1.623643428878767, + "learning_rate": 2.6041614263367375e-07, + "loss": 0.2277, + "num_input_tokens_seen": 105892432, + "step": 15820 + }, + { + "epoch": 1.9387767602829928, + "grad_norm": 1.5668954358974883, + "learning_rate": 2.5527916502899874e-07, + "loss": 0.2092, + "num_input_tokens_seen": 105925512, + "step": 15825 + }, + { + "epoch": 1.939389298949496, + "grad_norm": 1.7771738412197338, + "learning_rate": 2.501932298307308e-07, + "loss": 0.2118, + "num_input_tokens_seen": 105959104, + "step": 15830 + }, + { + "epoch": 1.9400018376159995, + "grad_norm": 1.1756547182779848, + "learning_rate": 2.4515834225748433e-07, + "loss": 0.1792, + "num_input_tokens_seen": 105993424, + "step": 15835 + }, + { + "epoch": 1.940614376282503, + "grad_norm": 1.9022668530653384, + "learning_rate": 2.401745074754824e-07, + "loss": 0.1992, + "num_input_tokens_seen": 106027048, + "step": 15840 + }, + { + "epoch": 1.9412269149490062, + "grad_norm": 1.5024640344787452, + "learning_rate": 2.3524173059858435e-07, + "loss": 0.1595, + "num_input_tokens_seen": 106061376, + "step": 15845 + }, + { + "epoch": 1.9418394536155095, + "grad_norm": 45.21173145435445, + "learning_rate": 2.3036001668824136e-07, + "loss": 0.2344, + "num_input_tokens_seen": 106094464, + "step": 15850 + }, + { + "epoch": 1.9418394536155095, + "eval_loss": 0.1082676351070404, + "eval_runtime": 19.315, + "eval_samples_per_second": 3.106, + "eval_steps_per_second": 0.777, + "num_input_tokens_seen": 106094464, + "step": 15850 + }, + { + "epoch": 1.9424519922820127, + "grad_norm": 1.397149853847731, + "learning_rate": 2.2552937075351887e-07, + "loss": 0.2072, + "num_input_tokens_seen": 106127944, + "step": 15855 + }, + { + "epoch": 1.9430645309485162, + "grad_norm": 1.5500535209600388, + "learning_rate": 2.207497977510742e-07, + "loss": 0.2336, + "num_input_tokens_seen": 106161040, + "step": 15860 + }, + { + "epoch": 1.9436770696150194, + "grad_norm": 1.8980200342587794, + "learning_rate": 2.160213025851676e-07, + "loss": 0.2168, + "num_input_tokens_seen": 106193968, + "step": 15865 + }, + { + "epoch": 1.9442896082815229, + "grad_norm": 1.2442924486290683, + "learning_rate": 2.1134389010765144e-07, + "loss": 0.2315, + "num_input_tokens_seen": 106226568, + "step": 15870 + }, + { + "epoch": 1.944902146948026, + "grad_norm": 1.1509901033967513, + "learning_rate": 2.0671756511794759e-07, + "loss": 0.1941, + "num_input_tokens_seen": 106260320, + "step": 15875 + }, + { + "epoch": 1.9455146856145293, + "grad_norm": 1.6244973095094355, + "learning_rate": 2.0214233236307556e-07, + "loss": 0.2008, + "num_input_tokens_seen": 106293088, + "step": 15880 + }, + { + "epoch": 1.9461272242810326, + "grad_norm": 1.3667312127185582, + "learning_rate": 1.9761819653761339e-07, + "loss": 0.19, + "num_input_tokens_seen": 106326440, + "step": 15885 + }, + { + "epoch": 1.946739762947536, + "grad_norm": 1.4746263992786222, + "learning_rate": 1.9314516228372547e-07, + "loss": 0.1893, + "num_input_tokens_seen": 106360320, + "step": 15890 + }, + { + "epoch": 1.9473523016140395, + "grad_norm": 1.88718863155967, + "learning_rate": 1.887232341911349e-07, + "loss": 0.2165, + "num_input_tokens_seen": 106393496, + "step": 15895 + }, + { + "epoch": 1.9479648402805427, + "grad_norm": 1.4848166745359421, + "learning_rate": 1.8435241679711778e-07, + "loss": 0.1837, + "num_input_tokens_seen": 106427400, + "step": 15900 + }, + { + "epoch": 1.9479648402805427, + "eval_loss": 0.1029721349477768, + "eval_runtime": 18.7995, + "eval_samples_per_second": 3.192, + "eval_steps_per_second": 0.798, + "num_input_tokens_seen": 106427400, + "step": 15900 + }, + { + "epoch": 1.948577378947046, + "grad_norm": 1.5860446736168388, + "learning_rate": 1.800327145865144e-07, + "loss": 0.2007, + "num_input_tokens_seen": 106460416, + "step": 15905 + }, + { + "epoch": 1.9491899176135492, + "grad_norm": 1.5118532033629684, + "learning_rate": 1.7576413199171248e-07, + "loss": 0.2198, + "num_input_tokens_seen": 106493920, + "step": 15910 + }, + { + "epoch": 1.9498024562800527, + "grad_norm": 1.5311392906439887, + "learning_rate": 1.7154667339265295e-07, + "loss": 0.197, + "num_input_tokens_seen": 106527576, + "step": 15915 + }, + { + "epoch": 1.9504149949465561, + "grad_norm": 1.4973483177634102, + "learning_rate": 1.673803431168186e-07, + "loss": 0.1965, + "num_input_tokens_seen": 106561096, + "step": 15920 + }, + { + "epoch": 1.9510275336130594, + "grad_norm": 1.7727513172168234, + "learning_rate": 1.6326514543921202e-07, + "loss": 0.2261, + "num_input_tokens_seen": 106594784, + "step": 15925 + }, + { + "epoch": 1.9516400722795626, + "grad_norm": 1.250591025573601, + "learning_rate": 1.592010845823999e-07, + "loss": 0.1829, + "num_input_tokens_seen": 106629144, + "step": 15930 + }, + { + "epoch": 1.9522526109460658, + "grad_norm": 1.5845350643238112, + "learning_rate": 1.5518816471645214e-07, + "loss": 0.2445, + "num_input_tokens_seen": 106662152, + "step": 15935 + }, + { + "epoch": 1.9528651496125693, + "grad_norm": 1.681531842684363, + "learning_rate": 1.5122638995898054e-07, + "loss": 0.21, + "num_input_tokens_seen": 106695312, + "step": 15940 + }, + { + "epoch": 1.9534776882790728, + "grad_norm": 1.3919236753800528, + "learning_rate": 1.4731576437509998e-07, + "loss": 0.1825, + "num_input_tokens_seen": 106729024, + "step": 15945 + }, + { + "epoch": 1.954090226945576, + "grad_norm": 1.1952675545266116, + "learning_rate": 1.4345629197745626e-07, + "loss": 0.2315, + "num_input_tokens_seen": 106761720, + "step": 15950 + }, + { + "epoch": 1.954090226945576, + "eval_loss": 0.1217665895819664, + "eval_runtime": 19.5444, + "eval_samples_per_second": 3.07, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 106761720, + "step": 15950 + }, + { + "epoch": 1.9547027656120792, + "grad_norm": 1.5815436361984192, + "learning_rate": 1.3964797672620934e-07, + "loss": 0.1974, + "num_input_tokens_seen": 106795624, + "step": 15955 + }, + { + "epoch": 1.9553153042785825, + "grad_norm": 1.5980212974861097, + "learning_rate": 1.358908225290112e-07, + "loss": 0.1681, + "num_input_tokens_seen": 106829912, + "step": 15960 + }, + { + "epoch": 1.955927842945086, + "grad_norm": 1.3640481803838571, + "learning_rate": 1.3218483324103915e-07, + "loss": 0.21, + "num_input_tokens_seen": 106863336, + "step": 15965 + }, + { + "epoch": 1.9565403816115894, + "grad_norm": 1.5180109169515605, + "learning_rate": 1.285300126649569e-07, + "loss": 0.2095, + "num_input_tokens_seen": 106896896, + "step": 15970 + }, + { + "epoch": 1.9571529202780926, + "grad_norm": 1.7786598937041942, + "learning_rate": 1.2492636455092578e-07, + "loss": 0.1936, + "num_input_tokens_seen": 106930200, + "step": 15975 + }, + { + "epoch": 1.9577654589445959, + "grad_norm": 1.467251634693161, + "learning_rate": 1.2137389259659904e-07, + "loss": 0.1897, + "num_input_tokens_seen": 106963352, + "step": 15980 + }, + { + "epoch": 1.958377997611099, + "grad_norm": 1.7526207360903832, + "learning_rate": 1.1787260044712756e-07, + "loss": 0.2566, + "num_input_tokens_seen": 106996336, + "step": 15985 + }, + { + "epoch": 1.9589905362776026, + "grad_norm": 1.388418330750777, + "learning_rate": 1.1442249169513752e-07, + "loss": 0.2063, + "num_input_tokens_seen": 107030304, + "step": 15990 + }, + { + "epoch": 1.9596030749441058, + "grad_norm": 1.860561485772361, + "learning_rate": 1.1102356988074714e-07, + "loss": 0.2248, + "num_input_tokens_seen": 107063136, + "step": 15995 + }, + { + "epoch": 1.9602156136106093, + "grad_norm": 1.58634642740917, + "learning_rate": 1.0767583849153328e-07, + "loss": 0.2309, + "num_input_tokens_seen": 107096336, + "step": 16000 + }, + { + "epoch": 1.9602156136106093, + "eval_loss": 0.12916378676891327, + "eval_runtime": 19.1084, + "eval_samples_per_second": 3.14, + "eval_steps_per_second": 0.785, + "num_input_tokens_seen": 107096336, + "step": 16000 + }, + { + "epoch": 1.9608281522771125, + "grad_norm": 1.5676143734985377, + "learning_rate": 1.0437930096257043e-07, + "loss": 0.2443, + "num_input_tokens_seen": 107129472, + "step": 16005 + }, + { + "epoch": 1.9614406909436157, + "grad_norm": 2.4122821182515164, + "learning_rate": 1.0113396067639169e-07, + "loss": 0.2061, + "num_input_tokens_seen": 107163088, + "step": 16010 + }, + { + "epoch": 1.9620532296101192, + "grad_norm": 1.7138775399140103, + "learning_rate": 9.793982096298892e-08, + "loss": 0.2235, + "num_input_tokens_seen": 107196360, + "step": 16015 + }, + { + "epoch": 1.9626657682766224, + "grad_norm": 1.7446351523342973, + "learning_rate": 9.479688509983487e-08, + "loss": 0.204, + "num_input_tokens_seen": 107230128, + "step": 16020 + }, + { + "epoch": 1.9632783069431259, + "grad_norm": 1.7481836333487537, + "learning_rate": 9.170515631185539e-08, + "loss": 0.2279, + "num_input_tokens_seen": 107264040, + "step": 16025 + }, + { + "epoch": 1.9638908456096291, + "grad_norm": 1.5963240085594248, + "learning_rate": 8.866463777142953e-08, + "loss": 0.2044, + "num_input_tokens_seen": 107298064, + "step": 16030 + }, + { + "epoch": 1.9645033842761324, + "grad_norm": 1.078579330137057, + "learning_rate": 8.567533259838945e-08, + "loss": 0.1939, + "num_input_tokens_seen": 107331904, + "step": 16035 + }, + { + "epoch": 1.9651159229426356, + "grad_norm": 1.8188021650986983, + "learning_rate": 8.273724386003712e-08, + "loss": 0.2031, + "num_input_tokens_seen": 107366240, + "step": 16040 + }, + { + "epoch": 1.965728461609139, + "grad_norm": 1.6285967619909565, + "learning_rate": 7.985037457108879e-08, + "loss": 0.1778, + "num_input_tokens_seen": 107399448, + "step": 16045 + }, + { + "epoch": 1.9663410002756425, + "grad_norm": 1.2815779025270306, + "learning_rate": 7.701472769373607e-08, + "loss": 0.2081, + "num_input_tokens_seen": 107433008, + "step": 16050 + }, + { + "epoch": 1.9663410002756425, + "eval_loss": 0.10997606813907623, + "eval_runtime": 18.9855, + "eval_samples_per_second": 3.16, + "eval_steps_per_second": 0.79, + "num_input_tokens_seen": 107433008, + "step": 16050 + }, + { + "epoch": 1.9669535389421458, + "grad_norm": 1.2574545138028856, + "learning_rate": 7.423030613759041e-08, + "loss": 0.1744, + "num_input_tokens_seen": 107467192, + "step": 16055 + }, + { + "epoch": 1.967566077608649, + "grad_norm": 1.446339200735081, + "learning_rate": 7.149711275971637e-08, + "loss": 0.221, + "num_input_tokens_seen": 107500864, + "step": 16060 + }, + { + "epoch": 1.9681786162751522, + "grad_norm": 1.6041360077975761, + "learning_rate": 6.881515036460396e-08, + "loss": 0.2284, + "num_input_tokens_seen": 107534680, + "step": 16065 + }, + { + "epoch": 1.9687911549416557, + "grad_norm": 1.5211206768206678, + "learning_rate": 6.618442170418515e-08, + "loss": 0.1996, + "num_input_tokens_seen": 107568272, + "step": 16070 + }, + { + "epoch": 1.9694036936081591, + "grad_norm": 1.338150212780746, + "learning_rate": 6.360492947781183e-08, + "loss": 0.2096, + "num_input_tokens_seen": 107601304, + "step": 16075 + }, + { + "epoch": 1.9700162322746624, + "grad_norm": 0.8095140639530569, + "learning_rate": 6.107667633226677e-08, + "loss": 0.1764, + "num_input_tokens_seen": 107635416, + "step": 16080 + }, + { + "epoch": 1.9706287709411656, + "grad_norm": 1.2535304239365532, + "learning_rate": 5.8599664861758166e-08, + "loss": 0.202, + "num_input_tokens_seen": 107669080, + "step": 16085 + }, + { + "epoch": 1.9712413096076689, + "grad_norm": 1.6762289055431168, + "learning_rate": 5.6173897607914025e-08, + "loss": 0.1872, + "num_input_tokens_seen": 107702840, + "step": 16090 + }, + { + "epoch": 1.9718538482741723, + "grad_norm": 1.3336540224605404, + "learning_rate": 5.379937705978777e-08, + "loss": 0.2251, + "num_input_tokens_seen": 107736744, + "step": 16095 + }, + { + "epoch": 1.9724663869406758, + "grad_norm": 1.3542705021127341, + "learning_rate": 5.147610565383598e-08, + "loss": 0.1759, + "num_input_tokens_seen": 107770392, + "step": 16100 + }, + { + "epoch": 1.9724663869406758, + "eval_loss": 0.12338043004274368, + "eval_runtime": 19.2425, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 0.78, + "num_input_tokens_seen": 107770392, + "step": 16100 + }, + { + "epoch": 1.973078925607179, + "grad_norm": 1.4875534905401508, + "learning_rate": 4.920408577394065e-08, + "loss": 0.1794, + "num_input_tokens_seen": 107804232, + "step": 16105 + }, + { + "epoch": 1.9736914642736823, + "grad_norm": 1.1870652918601297, + "learning_rate": 4.6983319751392476e-08, + "loss": 0.2013, + "num_input_tokens_seen": 107837344, + "step": 16110 + }, + { + "epoch": 1.9743040029401855, + "grad_norm": 1.2154307282049646, + "learning_rate": 4.481380986489092e-08, + "loss": 0.2222, + "num_input_tokens_seen": 107870296, + "step": 16115 + }, + { + "epoch": 1.974916541606689, + "grad_norm": 1.13013725688011, + "learning_rate": 4.2695558340538623e-08, + "loss": 0.1532, + "num_input_tokens_seen": 107904576, + "step": 16120 + }, + { + "epoch": 1.9755290802731924, + "grad_norm": 1.6734143110116928, + "learning_rate": 4.062856735184695e-08, + "loss": 0.2123, + "num_input_tokens_seen": 107938600, + "step": 16125 + }, + { + "epoch": 1.9761416189396956, + "grad_norm": 1.6239458788357592, + "learning_rate": 3.861283901973045e-08, + "loss": 0.2383, + "num_input_tokens_seen": 107972008, + "step": 16130 + }, + { + "epoch": 1.9767541576061989, + "grad_norm": 1.7172039827032852, + "learning_rate": 3.664837541250132e-08, + "loss": 0.2278, + "num_input_tokens_seen": 108005224, + "step": 16135 + }, + { + "epoch": 1.9773666962727021, + "grad_norm": 1.6078484229423624, + "learning_rate": 3.473517854586939e-08, + "loss": 0.2028, + "num_input_tokens_seen": 108038648, + "step": 16140 + }, + { + "epoch": 1.9779792349392056, + "grad_norm": 1.3431893070869196, + "learning_rate": 3.287325038293654e-08, + "loss": 0.1958, + "num_input_tokens_seen": 108072496, + "step": 16145 + }, + { + "epoch": 1.9785917736057088, + "grad_norm": 1.6085439739489529, + "learning_rate": 3.106259283420787e-08, + "loss": 0.1927, + "num_input_tokens_seen": 108105848, + "step": 16150 + }, + { + "epoch": 1.9785917736057088, + "eval_loss": 0.12711873650550842, + "eval_runtime": 18.6684, + "eval_samples_per_second": 3.214, + "eval_steps_per_second": 0.803, + "num_input_tokens_seen": 108105848, + "step": 16150 + }, + { + "epoch": 1.9792043122722123, + "grad_norm": 1.8546311199145882, + "learning_rate": 2.930320775756945e-08, + "loss": 0.1994, + "num_input_tokens_seen": 108139704, + "step": 16155 + }, + { + "epoch": 1.9798168509387155, + "grad_norm": 24.308913780458777, + "learning_rate": 2.759509695831608e-08, + "loss": 0.2619, + "num_input_tokens_seen": 108172816, + "step": 16160 + }, + { + "epoch": 1.9804293896052187, + "grad_norm": 1.1181298004423077, + "learning_rate": 2.5938262189101338e-08, + "loss": 0.2145, + "num_input_tokens_seen": 108206544, + "step": 16165 + }, + { + "epoch": 1.9810419282717222, + "grad_norm": 1.5089160866189284, + "learning_rate": 2.4332705149993086e-08, + "loss": 0.2193, + "num_input_tokens_seen": 108239608, + "step": 16170 + }, + { + "epoch": 1.9816544669382254, + "grad_norm": 2.0371563186433166, + "learning_rate": 2.2778427488434618e-08, + "loss": 0.2354, + "num_input_tokens_seen": 108273024, + "step": 16175 + }, + { + "epoch": 1.982267005604729, + "grad_norm": 1.3025056406928182, + "learning_rate": 2.1275430799239105e-08, + "loss": 0.1882, + "num_input_tokens_seen": 108307408, + "step": 16180 + }, + { + "epoch": 1.9828795442712321, + "grad_norm": 1.4439753229670431, + "learning_rate": 1.982371662461735e-08, + "loss": 0.2061, + "num_input_tokens_seen": 108340504, + "step": 16185 + }, + { + "epoch": 1.9834920829377354, + "grad_norm": 2.0534564138390277, + "learning_rate": 1.8423286454150036e-08, + "loss": 0.2093, + "num_input_tokens_seen": 108374192, + "step": 16190 + }, + { + "epoch": 1.9841046216042386, + "grad_norm": 1.6933310426650376, + "learning_rate": 1.707414172480992e-08, + "loss": 0.1632, + "num_input_tokens_seen": 108408208, + "step": 16195 + }, + { + "epoch": 1.984717160270742, + "grad_norm": 1.493546902649449, + "learning_rate": 1.577628382092855e-08, + "loss": 0.1823, + "num_input_tokens_seen": 108441584, + "step": 16200 + }, + { + "epoch": 1.984717160270742, + "eval_loss": 0.10478620231151581, + "eval_runtime": 18.9218, + "eval_samples_per_second": 3.171, + "eval_steps_per_second": 0.793, + "num_input_tokens_seen": 108441584, + "step": 16200 + }, + { + "epoch": 1.9853296989372455, + "grad_norm": 0.8148307721924001, + "learning_rate": 1.4529714074223988e-08, + "loss": 0.1985, + "num_input_tokens_seen": 108474648, + "step": 16205 + }, + { + "epoch": 1.9859422376037488, + "grad_norm": 1.5338867198187527, + "learning_rate": 1.3334433763784181e-08, + "loss": 0.189, + "num_input_tokens_seen": 108508000, + "step": 16210 + }, + { + "epoch": 1.986554776270252, + "grad_norm": 1.2269157177171637, + "learning_rate": 1.2190444116066957e-08, + "loss": 0.164, + "num_input_tokens_seen": 108542400, + "step": 16215 + }, + { + "epoch": 1.9871673149367552, + "grad_norm": 1.960939906404557, + "learning_rate": 1.1097746304916667e-08, + "loss": 0.183, + "num_input_tokens_seen": 108576552, + "step": 16220 + }, + { + "epoch": 1.9877798536032587, + "grad_norm": 1.112764475783892, + "learning_rate": 1.0056341451525342e-08, + "loss": 0.1981, + "num_input_tokens_seen": 108610120, + "step": 16225 + }, + { + "epoch": 1.9883923922697622, + "grad_norm": 1.910375816477557, + "learning_rate": 9.066230624465988e-09, + "loss": 0.2291, + "num_input_tokens_seen": 108643208, + "step": 16230 + }, + { + "epoch": 1.9890049309362654, + "grad_norm": 2.753353302561995, + "learning_rate": 8.12741483968149e-09, + "loss": 0.2264, + "num_input_tokens_seen": 108676224, + "step": 16235 + }, + { + "epoch": 1.9896174696027686, + "grad_norm": 1.9629823427621909, + "learning_rate": 7.239895060479063e-09, + "loss": 0.2312, + "num_input_tokens_seen": 108708432, + "step": 16240 + }, + { + "epoch": 1.9902300082692719, + "grad_norm": 1.381199176115903, + "learning_rate": 6.4036721975246905e-09, + "loss": 0.1797, + "num_input_tokens_seen": 108742768, + "step": 16245 + }, + { + "epoch": 1.9908425469357753, + "grad_norm": 1.6212829564304228, + "learning_rate": 5.618747108865341e-09, + "loss": 0.2257, + "num_input_tokens_seen": 108775784, + "step": 16250 + }, + { + "epoch": 1.9908425469357753, + "eval_loss": 0.13182586431503296, + "eval_runtime": 18.6777, + "eval_samples_per_second": 3.212, + "eval_steps_per_second": 0.803, + "num_input_tokens_seen": 108775784, + "step": 16250 + }, + { + "epoch": 1.9914550856022788, + "grad_norm": 1.3808527766571117, + "learning_rate": 4.885120599895654e-09, + "loss": 0.2066, + "num_input_tokens_seen": 108809944, + "step": 16255 + }, + { + "epoch": 1.992067624268782, + "grad_norm": 12.292963233280995, + "learning_rate": 4.202793423380147e-09, + "loss": 0.2017, + "num_input_tokens_seen": 108842592, + "step": 16260 + }, + { + "epoch": 1.9926801629352853, + "grad_norm": 1.4660025513597785, + "learning_rate": 3.571766279453215e-09, + "loss": 0.1999, + "num_input_tokens_seen": 108876200, + "step": 16265 + }, + { + "epoch": 1.9932927016017885, + "grad_norm": 1.7865369550394035, + "learning_rate": 2.9920398155913742e-09, + "loss": 0.1854, + "num_input_tokens_seen": 108910256, + "step": 16270 + }, + { + "epoch": 1.993905240268292, + "grad_norm": 1.8432963525832002, + "learning_rate": 2.4636146266521222e-09, + "loss": 0.2155, + "num_input_tokens_seen": 108944032, + "step": 16275 + }, + { + "epoch": 1.9945177789347952, + "grad_norm": 1.9614056448556825, + "learning_rate": 1.986491254846179e-09, + "loss": 0.2018, + "num_input_tokens_seen": 108977664, + "step": 16280 + }, + { + "epoch": 1.9951303176012987, + "grad_norm": 2.703958999559818, + "learning_rate": 1.5606701897430409e-09, + "loss": 0.2066, + "num_input_tokens_seen": 109010600, + "step": 16285 + }, + { + "epoch": 1.995742856267802, + "grad_norm": 1.107417355225427, + "learning_rate": 1.1861518682709793e-09, + "loss": 0.1816, + "num_input_tokens_seen": 109044464, + "step": 16290 + }, + { + "epoch": 1.9963553949343051, + "grad_norm": 1.2078739160432364, + "learning_rate": 8.629366747170408e-10, + "loss": 0.1816, + "num_input_tokens_seen": 109078888, + "step": 16295 + }, + { + "epoch": 1.9969679336008086, + "grad_norm": 1.4222031437871272, + "learning_rate": 5.910249407270474e-10, + "loss": 0.2158, + "num_input_tokens_seen": 109112120, + "step": 16300 + }, + { + "epoch": 1.9969679336008086, + "eval_loss": 0.11464700102806091, + "eval_runtime": 19.0651, + "eval_samples_per_second": 3.147, + "eval_steps_per_second": 0.787, + "num_input_tokens_seen": 109112120, + "step": 16300 + }, + { + "epoch": 1.9975804722673118, + "grad_norm": 2.005094271805205, + "learning_rate": 3.7041694531114723e-10, + "loss": 0.2376, + "num_input_tokens_seen": 109145312, + "step": 16305 + }, + { + "epoch": 1.9981930109338153, + "grad_norm": 1.4867493885349998, + "learning_rate": 2.0111291483271288e-10, + "loss": 0.2031, + "num_input_tokens_seen": 109179312, + "step": 16310 + }, + { + "epoch": 1.9988055496003185, + "grad_norm": 1.584898833540703, + "learning_rate": 8.311302300278989e-11, + "loss": 0.2135, + "num_input_tokens_seen": 109212488, + "step": 16315 + }, + { + "epoch": 1.9994180882668218, + "grad_norm": 1.9175958869363128, + "learning_rate": 1.6417390907852438e-11, + "loss": 0.2092, + "num_input_tokens_seen": 109246064, + "step": 16320 + }, + { + "epoch": 1.9999081192000245, + "num_input_tokens_seen": 109273408, + "step": 16324, + "total_flos": 6853640571322368.0, + "train_loss": 0.029832553232572032, + "train_runtime": 29007.2299, + "train_samples_per_second": 9.005, + "train_steps_per_second": 0.563 + } + ], + "logging_steps": 5, + "max_steps": 16324, + "num_input_tokens_seen": 109273408, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6853640571322368.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}