{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012107568176013773, "grad_norm": 1.6823863983154297, "learning_rate": 0.0, "loss": 1.3109, "num_input_tokens_seen": 438344, "step": 1, "train_runtime": 66.1532, "train_tokens_per_second": 6626.197 }, { "epoch": 0.0024215136352027547, "grad_norm": 1.6656533479690552, "learning_rate": 2.5000000000000004e-07, "loss": 1.2706, "num_input_tokens_seen": 897912, "step": 2, "train_runtime": 134.7812, "train_tokens_per_second": 6661.997 }, { "epoch": 0.003632270452804132, "grad_norm": 1.694225788116455, "learning_rate": 5.000000000000001e-07, "loss": 1.3045, "num_input_tokens_seen": 1322736, "step": 3, "train_runtime": 199.1648, "train_tokens_per_second": 6641.415 }, { "epoch": 0.004843027270405509, "grad_norm": 1.6565794944763184, "learning_rate": 7.5e-07, "loss": 1.2432, "num_input_tokens_seen": 1776136, "step": 4, "train_runtime": 267.5205, "train_tokens_per_second": 6639.252 }, { "epoch": 0.006053784088006886, "grad_norm": 1.7125741243362427, "learning_rate": 1.0000000000000002e-06, "loss": 1.2465, "num_input_tokens_seen": 2216360, "step": 5, "train_runtime": 333.3919, "train_tokens_per_second": 6647.912 }, { "epoch": 0.007264540905608264, "grad_norm": 1.627602219581604, "learning_rate": 1.25e-06, "loss": 1.2298, "num_input_tokens_seen": 2672560, "step": 6, "train_runtime": 401.2707, "train_tokens_per_second": 6660.242 }, { "epoch": 0.008475297723209641, "grad_norm": 1.677027702331543, "learning_rate": 1.5e-06, "loss": 1.2147, "num_input_tokens_seen": 3146496, "step": 7, "train_runtime": 471.6412, "train_tokens_per_second": 6671.376 }, { "epoch": 0.009686054540811019, "grad_norm": 1.6377135515213013, "learning_rate": 1.75e-06, "loss": 1.2696, "num_input_tokens_seen": 3600320, "step": 8, "train_runtime": 539.1503, "train_tokens_per_second": 6677.767 }, { "epoch": 0.010896811358412395, "grad_norm": 1.651689052581787, "learning_rate": 2.0000000000000003e-06, "loss": 1.3134, "num_input_tokens_seen": 4040512, "step": 9, "train_runtime": 604.8873, "train_tokens_per_second": 6679.776 }, { "epoch": 0.012107568176013772, "grad_norm": 1.588644027709961, "learning_rate": 2.25e-06, "loss": 1.2148, "num_input_tokens_seen": 4476856, "step": 10, "train_runtime": 669.9676, "train_tokens_per_second": 6682.198 }, { "epoch": 0.01331832499361515, "grad_norm": 1.54507315158844, "learning_rate": 2.5e-06, "loss": 1.2446, "num_input_tokens_seen": 4922928, "step": 11, "train_runtime": 738.3143, "train_tokens_per_second": 6667.795 }, { "epoch": 0.014529081811216527, "grad_norm": 1.5578962564468384, "learning_rate": 2.7500000000000004e-06, "loss": 1.2765, "num_input_tokens_seen": 5366568, "step": 12, "train_runtime": 805.1876, "train_tokens_per_second": 6664.991 }, { "epoch": 0.015739838628817903, "grad_norm": 1.2954202890396118, "learning_rate": 3e-06, "loss": 1.2142, "num_input_tokens_seen": 5805136, "step": 13, "train_runtime": 875.9211, "train_tokens_per_second": 6627.465 }, { "epoch": 0.016950595446419282, "grad_norm": 1.2545137405395508, "learning_rate": 3.2500000000000002e-06, "loss": 1.1928, "num_input_tokens_seen": 6269184, "step": 14, "train_runtime": 950.6377, "train_tokens_per_second": 6594.714 }, { "epoch": 0.018161352264020658, "grad_norm": 1.2325160503387451, "learning_rate": 3.5e-06, "loss": 1.2354, "num_input_tokens_seen": 6723088, "step": 15, "train_runtime": 1023.1734, "train_tokens_per_second": 6570.82 }, { "epoch": 0.019372109081622037, "grad_norm": 1.1806299686431885, "learning_rate": 3.7500000000000005e-06, "loss": 1.2273, "num_input_tokens_seen": 7181184, "step": 16, "train_runtime": 1097.2469, "train_tokens_per_second": 6544.729 }, { "epoch": 0.020582865899223413, "grad_norm": 1.1713570356369019, "learning_rate": 4.000000000000001e-06, "loss": 1.1806, "num_input_tokens_seen": 7621296, "step": 17, "train_runtime": 1167.7968, "train_tokens_per_second": 6526.218 }, { "epoch": 0.02179362271682479, "grad_norm": 0.7615021467208862, "learning_rate": 4.25e-06, "loss": 1.2043, "num_input_tokens_seen": 8091120, "step": 18, "train_runtime": 1242.7454, "train_tokens_per_second": 6510.682 }, { "epoch": 0.02300437953442617, "grad_norm": 0.7624850273132324, "learning_rate": 4.5e-06, "loss": 1.152, "num_input_tokens_seen": 8549696, "step": 19, "train_runtime": 1316.11, "train_tokens_per_second": 6496.187 }, { "epoch": 0.024215136352027544, "grad_norm": 0.753814160823822, "learning_rate": 4.75e-06, "loss": 1.1759, "num_input_tokens_seen": 9015664, "step": 20, "train_runtime": 1390.9508, "train_tokens_per_second": 6481.656 }, { "epoch": 0.025425893169628924, "grad_norm": 0.7244720458984375, "learning_rate": 5e-06, "loss": 1.1802, "num_input_tokens_seen": 9456440, "step": 21, "train_runtime": 1461.1875, "train_tokens_per_second": 6471.75 }, { "epoch": 0.0266366499872303, "grad_norm": 0.7659462094306946, "learning_rate": 5e-06, "loss": 1.1811, "num_input_tokens_seen": 9899784, "step": 22, "train_runtime": 1530.8483, "train_tokens_per_second": 6466.861 }, { "epoch": 0.027847406804831675, "grad_norm": 0.795974612236023, "learning_rate": 5e-06, "loss": 1.1331, "num_input_tokens_seen": 10351040, "step": 23, "train_runtime": 1602.2727, "train_tokens_per_second": 6460.224 }, { "epoch": 0.029058163622433054, "grad_norm": 0.7971475720405579, "learning_rate": 5e-06, "loss": 1.1689, "num_input_tokens_seen": 10818480, "step": 24, "train_runtime": 1675.7391, "train_tokens_per_second": 6455.945 }, { "epoch": 0.03026892044003443, "grad_norm": 0.9564425945281982, "learning_rate": 5e-06, "loss": 1.1556, "num_input_tokens_seen": 11266424, "step": 25, "train_runtime": 1746.8035, "train_tokens_per_second": 6449.737 }, { "epoch": 0.031479677257635806, "grad_norm": 1.0319074392318726, "learning_rate": 5e-06, "loss": 1.1656, "num_input_tokens_seen": 11711824, "step": 26, "train_runtime": 1817.4337, "train_tokens_per_second": 6444.155 }, { "epoch": 0.03269043407523718, "grad_norm": 0.9839694499969482, "learning_rate": 5e-06, "loss": 1.1934, "num_input_tokens_seen": 12167904, "step": 27, "train_runtime": 1890.2289, "train_tokens_per_second": 6437.265 }, { "epoch": 0.033901190892838565, "grad_norm": 1.0027241706848145, "learning_rate": 5e-06, "loss": 1.2015, "num_input_tokens_seen": 12603712, "step": 28, "train_runtime": 1958.8389, "train_tokens_per_second": 6434.277 }, { "epoch": 0.03511194771043994, "grad_norm": 1.0292820930480957, "learning_rate": 5e-06, "loss": 1.114, "num_input_tokens_seen": 13045496, "step": 29, "train_runtime": 2028.3291, "train_tokens_per_second": 6431.647 }, { "epoch": 0.036322704528041316, "grad_norm": 0.8715880513191223, "learning_rate": 5e-06, "loss": 1.1285, "num_input_tokens_seen": 13489440, "step": 30, "train_runtime": 2099.3816, "train_tokens_per_second": 6425.435 }, { "epoch": 0.03753346134564269, "grad_norm": 0.782746434211731, "learning_rate": 5e-06, "loss": 1.1129, "num_input_tokens_seen": 13935984, "step": 31, "train_runtime": 2170.7426, "train_tokens_per_second": 6419.916 }, { "epoch": 0.038744218163244075, "grad_norm": 0.6815439462661743, "learning_rate": 5e-06, "loss": 1.0844, "num_input_tokens_seen": 14376552, "step": 32, "train_runtime": 2240.9154, "train_tokens_per_second": 6415.482 }, { "epoch": 0.03995497498084545, "grad_norm": 0.5916255116462708, "learning_rate": 5e-06, "loss": 1.1448, "num_input_tokens_seen": 14801448, "step": 33, "train_runtime": 2309.3718, "train_tokens_per_second": 6409.296 }, { "epoch": 0.04116573179844683, "grad_norm": 0.5178527235984802, "learning_rate": 5e-06, "loss": 1.1492, "num_input_tokens_seen": 15254568, "step": 34, "train_runtime": 2381.7952, "train_tokens_per_second": 6404.651 }, { "epoch": 0.0423764886160482, "grad_norm": 0.4729219675064087, "learning_rate": 5e-06, "loss": 1.1012, "num_input_tokens_seen": 15721704, "step": 35, "train_runtime": 2456.0312, "train_tokens_per_second": 6401.264 }, { "epoch": 0.04358724543364958, "grad_norm": 0.4695061147212982, "learning_rate": 5e-06, "loss": 1.1549, "num_input_tokens_seen": 16169016, "step": 36, "train_runtime": 2527.4009, "train_tokens_per_second": 6397.488 }, { "epoch": 0.04479800225125096, "grad_norm": 0.4953579306602478, "learning_rate": 5e-06, "loss": 1.0791, "num_input_tokens_seen": 16632416, "step": 37, "train_runtime": 2602.077, "train_tokens_per_second": 6391.977 }, { "epoch": 0.04600875906885234, "grad_norm": 0.5437090396881104, "learning_rate": 5e-06, "loss": 1.166, "num_input_tokens_seen": 17072064, "step": 38, "train_runtime": 2672.2934, "train_tokens_per_second": 6388.544 }, { "epoch": 0.04721951588645371, "grad_norm": 0.49670565128326416, "learning_rate": 5e-06, "loss": 1.1278, "num_input_tokens_seen": 17521824, "step": 39, "train_runtime": 2744.1029, "train_tokens_per_second": 6385.265 }, { "epoch": 0.04843027270405509, "grad_norm": 0.5088937878608704, "learning_rate": 5e-06, "loss": 1.1204, "num_input_tokens_seen": 17970488, "step": 40, "train_runtime": 2816.1612, "train_tokens_per_second": 6381.2 }, { "epoch": 0.049641029521656464, "grad_norm": 0.45026877522468567, "learning_rate": 5e-06, "loss": 1.1271, "num_input_tokens_seen": 18415968, "step": 41, "train_runtime": 2887.7895, "train_tokens_per_second": 6377.185 }, { "epoch": 0.05085178633925785, "grad_norm": 0.450920969247818, "learning_rate": 5e-06, "loss": 1.1152, "num_input_tokens_seen": 18872720, "step": 42, "train_runtime": 2961.0904, "train_tokens_per_second": 6373.571 }, { "epoch": 0.05206254315685922, "grad_norm": 0.3853777050971985, "learning_rate": 5e-06, "loss": 1.0648, "num_input_tokens_seen": 19341632, "step": 43, "train_runtime": 3036.043, "train_tokens_per_second": 6370.671 }, { "epoch": 0.0532732999744606, "grad_norm": 0.37567827105522156, "learning_rate": 5e-06, "loss": 1.1503, "num_input_tokens_seen": 19774192, "step": 44, "train_runtime": 3105.6604, "train_tokens_per_second": 6367.146 }, { "epoch": 0.054484056792061974, "grad_norm": 0.3511997163295746, "learning_rate": 5e-06, "loss": 1.1209, "num_input_tokens_seen": 20246224, "step": 45, "train_runtime": 3181.7883, "train_tokens_per_second": 6363.159 }, { "epoch": 0.05569481360966335, "grad_norm": 0.3575429618358612, "learning_rate": 5e-06, "loss": 1.073, "num_input_tokens_seen": 20721848, "step": 46, "train_runtime": 3258.4956, "train_tokens_per_second": 6359.33 }, { "epoch": 0.05690557042726473, "grad_norm": 0.32883220911026, "learning_rate": 5e-06, "loss": 1.0287, "num_input_tokens_seen": 21204880, "step": 47, "train_runtime": 3336.1547, "train_tokens_per_second": 6356.084 }, { "epoch": 0.05811632724486611, "grad_norm": 0.3266335129737854, "learning_rate": 5e-06, "loss": 1.1667, "num_input_tokens_seen": 21643792, "step": 48, "train_runtime": 3406.2845, "train_tokens_per_second": 6354.076 }, { "epoch": 0.059327084062467485, "grad_norm": 0.32436686754226685, "learning_rate": 5e-06, "loss": 1.0979, "num_input_tokens_seen": 22102080, "step": 49, "train_runtime": 3478.6822, "train_tokens_per_second": 6353.578 }, { "epoch": 0.06053784088006886, "grad_norm": 0.3160610795021057, "learning_rate": 5e-06, "loss": 1.0757, "num_input_tokens_seen": 22560080, "step": 50, "train_runtime": 3550.9743, "train_tokens_per_second": 6353.209 }, { "epoch": 0.061748597697670236, "grad_norm": 0.3259732127189636, "learning_rate": 5e-06, "loss": 1.0585, "num_input_tokens_seen": 23012792, "step": 51, "train_runtime": 3623.5659, "train_tokens_per_second": 6350.869 }, { "epoch": 0.06295935451527161, "grad_norm": 0.3129977881908417, "learning_rate": 5e-06, "loss": 1.0892, "num_input_tokens_seen": 23481928, "step": 52, "train_runtime": 3698.7937, "train_tokens_per_second": 6348.537 }, { "epoch": 0.064170111332873, "grad_norm": 0.31302887201309204, "learning_rate": 5e-06, "loss": 1.107, "num_input_tokens_seen": 23929328, "step": 53, "train_runtime": 3770.8893, "train_tokens_per_second": 6345.805 }, { "epoch": 0.06538086815047436, "grad_norm": 0.30268368124961853, "learning_rate": 5e-06, "loss": 1.0769, "num_input_tokens_seen": 24389344, "step": 54, "train_runtime": 3845.4071, "train_tokens_per_second": 6342.461 }, { "epoch": 0.06659162496807575, "grad_norm": 0.3023386299610138, "learning_rate": 5e-06, "loss": 1.0904, "num_input_tokens_seen": 24835992, "step": 55, "train_runtime": 3917.1748, "train_tokens_per_second": 6340.282 }, { "epoch": 0.06780238178567713, "grad_norm": 0.3157775104045868, "learning_rate": 5e-06, "loss": 1.0838, "num_input_tokens_seen": 25287800, "step": 56, "train_runtime": 3989.6387, "train_tokens_per_second": 6338.368 }, { "epoch": 0.0690131386032785, "grad_norm": 0.3070801794528961, "learning_rate": 5e-06, "loss": 1.1042, "num_input_tokens_seen": 25726600, "step": 57, "train_runtime": 4059.4983, "train_tokens_per_second": 6337.384 }, { "epoch": 0.07022389542087988, "grad_norm": 0.2750767469406128, "learning_rate": 5e-06, "loss": 1.0938, "num_input_tokens_seen": 26197136, "step": 58, "train_runtime": 4135.3446, "train_tokens_per_second": 6334.934 }, { "epoch": 0.07143465223848126, "grad_norm": 0.32206404209136963, "learning_rate": 5e-06, "loss": 1.1449, "num_input_tokens_seen": 26648344, "step": 59, "train_runtime": 4206.337, "train_tokens_per_second": 6335.285 }, { "epoch": 0.07264540905608263, "grad_norm": 0.27299636602401733, "learning_rate": 5e-06, "loss": 1.063, "num_input_tokens_seen": 27117640, "step": 60, "train_runtime": 4279.87, "train_tokens_per_second": 6336.09 }, { "epoch": 0.07385616587368402, "grad_norm": 0.3023524582386017, "learning_rate": 5e-06, "loss": 1.0814, "num_input_tokens_seen": 27558016, "step": 61, "train_runtime": 4349.9557, "train_tokens_per_second": 6335.241 }, { "epoch": 0.07506692269128538, "grad_norm": 0.3390548527240753, "learning_rate": 5e-06, "loss": 1.0838, "num_input_tokens_seen": 28010840, "step": 62, "train_runtime": 4423.2036, "train_tokens_per_second": 6332.704 }, { "epoch": 0.07627767950888677, "grad_norm": 0.3006073832511902, "learning_rate": 5e-06, "loss": 1.0334, "num_input_tokens_seen": 28461840, "step": 63, "train_runtime": 4496.9762, "train_tokens_per_second": 6329.106 }, { "epoch": 0.07748843632648815, "grad_norm": 0.30531835556030273, "learning_rate": 5e-06, "loss": 1.1794, "num_input_tokens_seen": 28903208, "step": 64, "train_runtime": 4568.0669, "train_tokens_per_second": 6327.23 }, { "epoch": 0.07869919314408952, "grad_norm": 0.2855227589607239, "learning_rate": 5e-06, "loss": 1.0802, "num_input_tokens_seen": 29372160, "step": 65, "train_runtime": 4642.0795, "train_tokens_per_second": 6327.371 }, { "epoch": 0.0799099499616909, "grad_norm": 0.2859865725040436, "learning_rate": 5e-06, "loss": 1.1396, "num_input_tokens_seen": 29835496, "step": 66, "train_runtime": 4715.6083, "train_tokens_per_second": 6326.967 }, { "epoch": 0.08112070677929227, "grad_norm": 0.28807154297828674, "learning_rate": 5e-06, "loss": 1.0579, "num_input_tokens_seen": 30301072, "step": 67, "train_runtime": 4789.8228, "train_tokens_per_second": 6326.136 }, { "epoch": 0.08233146359689365, "grad_norm": 0.27400127053260803, "learning_rate": 5e-06, "loss": 1.0897, "num_input_tokens_seen": 30761224, "step": 68, "train_runtime": 4863.205, "train_tokens_per_second": 6325.299 }, { "epoch": 0.08354222041449504, "grad_norm": 0.27055230736732483, "learning_rate": 5e-06, "loss": 1.0776, "num_input_tokens_seen": 31235312, "step": 69, "train_runtime": 4939.24, "train_tokens_per_second": 6323.911 }, { "epoch": 0.0847529772320964, "grad_norm": 0.29049232602119446, "learning_rate": 5e-06, "loss": 1.0942, "num_input_tokens_seen": 31715944, "step": 70, "train_runtime": 5016.1629, "train_tokens_per_second": 6322.75 }, { "epoch": 0.08596373404969779, "grad_norm": 0.28521451354026794, "learning_rate": 5e-06, "loss": 1.1107, "num_input_tokens_seen": 32153488, "step": 71, "train_runtime": 5087.4742, "train_tokens_per_second": 6320.128 }, { "epoch": 0.08717449086729916, "grad_norm": 0.27909162640571594, "learning_rate": 5e-06, "loss": 1.1105, "num_input_tokens_seen": 32614936, "step": 72, "train_runtime": 5163.3824, "train_tokens_per_second": 6316.584 }, { "epoch": 0.08838524768490054, "grad_norm": 0.2773616909980774, "learning_rate": 5e-06, "loss": 1.0926, "num_input_tokens_seen": 33066032, "step": 73, "train_runtime": 5236.3763, "train_tokens_per_second": 6314.678 }, { "epoch": 0.08959600450250192, "grad_norm": 0.2607426047325134, "learning_rate": 5e-06, "loss": 1.0681, "num_input_tokens_seen": 33519952, "step": 74, "train_runtime": 5309.8276, "train_tokens_per_second": 6312.814 }, { "epoch": 0.09080676132010329, "grad_norm": 0.3017564117908478, "learning_rate": 5e-06, "loss": 1.1197, "num_input_tokens_seen": 33979056, "step": 75, "train_runtime": 5383.9039, "train_tokens_per_second": 6311.23 }, { "epoch": 0.09201751813770467, "grad_norm": 0.25366899371147156, "learning_rate": 5e-06, "loss": 1.0534, "num_input_tokens_seen": 34445408, "step": 76, "train_runtime": 5459.5423, "train_tokens_per_second": 6309.212 }, { "epoch": 0.09322827495530604, "grad_norm": 0.30008700489997864, "learning_rate": 5e-06, "loss": 1.0647, "num_input_tokens_seen": 34883760, "step": 77, "train_runtime": 5529.7451, "train_tokens_per_second": 6308.385 }, { "epoch": 0.09443903177290743, "grad_norm": 0.288265198469162, "learning_rate": 5e-06, "loss": 1.1079, "num_input_tokens_seen": 35340528, "step": 78, "train_runtime": 5604.8498, "train_tokens_per_second": 6305.348 }, { "epoch": 0.09564978859050881, "grad_norm": 0.27486133575439453, "learning_rate": 5e-06, "loss": 1.0545, "num_input_tokens_seen": 35797704, "step": 79, "train_runtime": 5678.1881, "train_tokens_per_second": 6304.424 }, { "epoch": 0.09686054540811018, "grad_norm": 0.2748127281665802, "learning_rate": 5e-06, "loss": 1.0813, "num_input_tokens_seen": 36242296, "step": 80, "train_runtime": 5749.0438, "train_tokens_per_second": 6304.056 }, { "epoch": 0.09807130222571156, "grad_norm": 0.25881466269493103, "learning_rate": 5e-06, "loss": 1.0469, "num_input_tokens_seen": 36731608, "step": 81, "train_runtime": 5827.2786, "train_tokens_per_second": 6303.39 }, { "epoch": 0.09928205904331293, "grad_norm": 0.25870904326438904, "learning_rate": 5e-06, "loss": 1.0211, "num_input_tokens_seen": 37192232, "step": 82, "train_runtime": 5900.4297, "train_tokens_per_second": 6303.309 }, { "epoch": 0.10049281586091431, "grad_norm": 0.2989208996295929, "learning_rate": 5e-06, "loss": 1.094, "num_input_tokens_seen": 37644536, "step": 83, "train_runtime": 5972.5325, "train_tokens_per_second": 6302.944 }, { "epoch": 0.1017035726785157, "grad_norm": 0.2510150671005249, "learning_rate": 5e-06, "loss": 1.0657, "num_input_tokens_seen": 38088800, "step": 84, "train_runtime": 6043.5299, "train_tokens_per_second": 6302.41 }, { "epoch": 0.10291432949611706, "grad_norm": 0.25874075293540955, "learning_rate": 5e-06, "loss": 1.083, "num_input_tokens_seen": 38544872, "step": 85, "train_runtime": 6115.5221, "train_tokens_per_second": 6302.793 }, { "epoch": 0.10412508631371845, "grad_norm": 0.2325299233198166, "learning_rate": 5e-06, "loss": 1.0115, "num_input_tokens_seen": 39021888, "step": 86, "train_runtime": 6193.1133, "train_tokens_per_second": 6300.852 }, { "epoch": 0.10533584313131981, "grad_norm": 0.24345119297504425, "learning_rate": 5e-06, "loss": 1.0333, "num_input_tokens_seen": 39493872, "step": 87, "train_runtime": 6268.3853, "train_tokens_per_second": 6300.486 }, { "epoch": 0.1065465999489212, "grad_norm": 0.26478031277656555, "learning_rate": 5e-06, "loss": 1.0525, "num_input_tokens_seen": 39939016, "step": 88, "train_runtime": 6339.297, "train_tokens_per_second": 6300.228 }, { "epoch": 0.10775735676652258, "grad_norm": 0.24371357262134552, "learning_rate": 5e-06, "loss": 1.0548, "num_input_tokens_seen": 40423776, "step": 89, "train_runtime": 6415.9537, "train_tokens_per_second": 6300.509 }, { "epoch": 0.10896811358412395, "grad_norm": 0.25180429220199585, "learning_rate": 5e-06, "loss": 1.0336, "num_input_tokens_seen": 40869936, "step": 90, "train_runtime": 6486.3382, "train_tokens_per_second": 6300.926 }, { "epoch": 0.11017887040172533, "grad_norm": 0.2390969842672348, "learning_rate": 5e-06, "loss": 1.0552, "num_input_tokens_seen": 41317936, "step": 91, "train_runtime": 6557.8413, "train_tokens_per_second": 6300.539 }, { "epoch": 0.1113896272193267, "grad_norm": 0.2268403321504593, "learning_rate": 5e-06, "loss": 1.0251, "num_input_tokens_seen": 41807248, "step": 92, "train_runtime": 6635.8744, "train_tokens_per_second": 6300.187 }, { "epoch": 0.11260038403692808, "grad_norm": 0.23852020502090454, "learning_rate": 5e-06, "loss": 1.0913, "num_input_tokens_seen": 42253040, "step": 93, "train_runtime": 6706.5327, "train_tokens_per_second": 6300.281 }, { "epoch": 0.11381114085452947, "grad_norm": 0.22914916276931763, "learning_rate": 5e-06, "loss": 1.0244, "num_input_tokens_seen": 42729480, "step": 94, "train_runtime": 6783.568, "train_tokens_per_second": 6298.968 }, { "epoch": 0.11502189767213084, "grad_norm": 0.24560000002384186, "learning_rate": 5e-06, "loss": 1.0941, "num_input_tokens_seen": 43185408, "step": 95, "train_runtime": 6856.313, "train_tokens_per_second": 6298.634 }, { "epoch": 0.11623265448973222, "grad_norm": 0.25429603457450867, "learning_rate": 5e-06, "loss": 1.0389, "num_input_tokens_seen": 43633928, "step": 96, "train_runtime": 6927.8513, "train_tokens_per_second": 6298.335 }, { "epoch": 0.11744341130733359, "grad_norm": 0.23469692468643188, "learning_rate": 5e-06, "loss": 1.0, "num_input_tokens_seen": 44105840, "step": 97, "train_runtime": 7003.1186, "train_tokens_per_second": 6298.028 }, { "epoch": 0.11865416812493497, "grad_norm": 0.2390899658203125, "learning_rate": 5e-06, "loss": 1.0403, "num_input_tokens_seen": 44581704, "step": 98, "train_runtime": 7079.5384, "train_tokens_per_second": 6297.261 }, { "epoch": 0.11986492494253635, "grad_norm": 0.2298881709575653, "learning_rate": 5e-06, "loss": 1.0538, "num_input_tokens_seen": 45029648, "step": 99, "train_runtime": 7150.6445, "train_tokens_per_second": 6297.285 }, { "epoch": 0.12107568176013772, "grad_norm": 0.23455004394054413, "learning_rate": 5e-06, "loss": 1.0106, "num_input_tokens_seen": 45491992, "step": 100, "train_runtime": 7224.9828, "train_tokens_per_second": 6296.484 }, { "epoch": 0.1222864385777391, "grad_norm": 0.27862685918807983, "learning_rate": 5e-06, "loss": 1.0551, "num_input_tokens_seen": 45960216, "step": 101, "train_runtime": 7300.9867, "train_tokens_per_second": 6295.069 }, { "epoch": 0.12349719539534047, "grad_norm": 0.2320939600467682, "learning_rate": 5e-06, "loss": 1.0258, "num_input_tokens_seen": 46435864, "step": 102, "train_runtime": 7377.4853, "train_tokens_per_second": 6294.267 }, { "epoch": 0.12470795221294186, "grad_norm": 0.2700980305671692, "learning_rate": 5e-06, "loss": 1.0397, "num_input_tokens_seen": 46897016, "step": 103, "train_runtime": 7451.8901, "train_tokens_per_second": 6293.305 }, { "epoch": 0.12591870903054322, "grad_norm": 0.2502821683883667, "learning_rate": 5e-06, "loss": 1.0432, "num_input_tokens_seen": 47324336, "step": 104, "train_runtime": 7519.7181, "train_tokens_per_second": 6293.366 }, { "epoch": 0.1271294658481446, "grad_norm": 0.23824240267276764, "learning_rate": 5e-06, "loss": 1.0777, "num_input_tokens_seen": 47770912, "step": 105, "train_runtime": 7590.7707, "train_tokens_per_second": 6293.289 }, { "epoch": 0.128340222665746, "grad_norm": 0.24816913902759552, "learning_rate": 5e-06, "loss": 1.0662, "num_input_tokens_seen": 48215152, "step": 106, "train_runtime": 7662.9416, "train_tokens_per_second": 6291.99 }, { "epoch": 0.12955097948334737, "grad_norm": 0.2386653572320938, "learning_rate": 5e-06, "loss": 1.0423, "num_input_tokens_seen": 48688112, "step": 107, "train_runtime": 7739.3312, "train_tokens_per_second": 6290.997 }, { "epoch": 0.13076173630094873, "grad_norm": 0.25550806522369385, "learning_rate": 5e-06, "loss": 1.1077, "num_input_tokens_seen": 49130072, "step": 108, "train_runtime": 7810.4937, "train_tokens_per_second": 6290.265 }, { "epoch": 0.1319724931185501, "grad_norm": 0.2418377846479416, "learning_rate": 5e-06, "loss": 1.0495, "num_input_tokens_seen": 49589584, "step": 109, "train_runtime": 7883.9434, "train_tokens_per_second": 6289.947 }, { "epoch": 0.1331832499361515, "grad_norm": 0.24783344566822052, "learning_rate": 5e-06, "loss": 1.0896, "num_input_tokens_seen": 50020456, "step": 110, "train_runtime": 7953.0419, "train_tokens_per_second": 6289.475 }, { "epoch": 0.13439400675375288, "grad_norm": 0.2944345474243164, "learning_rate": 5e-06, "loss": 1.0701, "num_input_tokens_seen": 50460280, "step": 111, "train_runtime": 8024.2387, "train_tokens_per_second": 6288.482 }, { "epoch": 0.13560476357135426, "grad_norm": 0.23773066699504852, "learning_rate": 5e-06, "loss": 1.0576, "num_input_tokens_seen": 50923488, "step": 112, "train_runtime": 8098.5357, "train_tokens_per_second": 6287.987 }, { "epoch": 0.13681552038895564, "grad_norm": 0.24989427626132965, "learning_rate": 5e-06, "loss": 1.0656, "num_input_tokens_seen": 51357016, "step": 113, "train_runtime": 8169.1458, "train_tokens_per_second": 6286.706 }, { "epoch": 0.138026277206557, "grad_norm": 0.2635020911693573, "learning_rate": 5e-06, "loss": 1.103, "num_input_tokens_seen": 51832576, "step": 114, "train_runtime": 8245.3688, "train_tokens_per_second": 6286.265 }, { "epoch": 0.13923703402415838, "grad_norm": 0.2522059977054596, "learning_rate": 5e-06, "loss": 1.0559, "num_input_tokens_seen": 52261160, "step": 115, "train_runtime": 8314.0019, "train_tokens_per_second": 6285.921 }, { "epoch": 0.14044779084175976, "grad_norm": 0.275611937046051, "learning_rate": 5e-06, "loss": 1.0655, "num_input_tokens_seen": 52721512, "step": 116, "train_runtime": 8386.8183, "train_tokens_per_second": 6286.235 }, { "epoch": 0.14165854765936114, "grad_norm": 0.2655342221260071, "learning_rate": 5e-06, "loss": 1.0463, "num_input_tokens_seen": 53178752, "step": 117, "train_runtime": 8459.8957, "train_tokens_per_second": 6285.982 }, { "epoch": 0.14286930447696253, "grad_norm": 0.24424339830875397, "learning_rate": 5e-06, "loss": 1.0743, "num_input_tokens_seen": 53643504, "step": 118, "train_runtime": 8533.66, "train_tokens_per_second": 6286.108 }, { "epoch": 0.14408006129456388, "grad_norm": 0.24213866889476776, "learning_rate": 5e-06, "loss": 1.0082, "num_input_tokens_seen": 54096264, "step": 119, "train_runtime": 8606.2721, "train_tokens_per_second": 6285.679 }, { "epoch": 0.14529081811216527, "grad_norm": 0.24612732231616974, "learning_rate": 5e-06, "loss": 1.0415, "num_input_tokens_seen": 54542376, "step": 120, "train_runtime": 8677.4706, "train_tokens_per_second": 6285.516 }, { "epoch": 0.14650157492976665, "grad_norm": 0.24935385584831238, "learning_rate": 5e-06, "loss": 1.0735, "num_input_tokens_seen": 54986712, "step": 121, "train_runtime": 8748.1099, "train_tokens_per_second": 6285.553 }, { "epoch": 0.14771233174736803, "grad_norm": 0.2938326597213745, "learning_rate": 5e-06, "loss": 1.0351, "num_input_tokens_seen": 55448736, "step": 122, "train_runtime": 8822.6975, "train_tokens_per_second": 6284.783 }, { "epoch": 0.1489230885649694, "grad_norm": 0.24213974177837372, "learning_rate": 5e-06, "loss": 1.0653, "num_input_tokens_seen": 55891176, "step": 123, "train_runtime": 8894.389, "train_tokens_per_second": 6283.869 }, { "epoch": 0.15013384538257077, "grad_norm": 0.27501124143600464, "learning_rate": 5e-06, "loss": 1.0679, "num_input_tokens_seen": 56307976, "step": 124, "train_runtime": 8961.2502, "train_tokens_per_second": 6283.496 }, { "epoch": 0.15134460220017215, "grad_norm": 0.2943986654281616, "learning_rate": 5e-06, "loss": 1.0693, "num_input_tokens_seen": 56742320, "step": 125, "train_runtime": 9031.5273, "train_tokens_per_second": 6282.694 }, { "epoch": 0.15255535901777353, "grad_norm": 0.2623043656349182, "learning_rate": 5e-06, "loss": 1.1028, "num_input_tokens_seen": 57192864, "step": 126, "train_runtime": 9101.7095, "train_tokens_per_second": 6283.75 }, { "epoch": 0.15376611583537492, "grad_norm": 0.2695028483867645, "learning_rate": 5e-06, "loss": 1.063, "num_input_tokens_seen": 57618472, "step": 127, "train_runtime": 9168.6765, "train_tokens_per_second": 6284.274 }, { "epoch": 0.1549768726529763, "grad_norm": 0.2590481638908386, "learning_rate": 5e-06, "loss": 1.0145, "num_input_tokens_seen": 58066048, "step": 128, "train_runtime": 9239.0759, "train_tokens_per_second": 6284.833 }, { "epoch": 0.15618762947057765, "grad_norm": 0.28023761510849, "learning_rate": 5e-06, "loss": 1.0559, "num_input_tokens_seen": 58500528, "step": 129, "train_runtime": 9313.1627, "train_tokens_per_second": 6281.489 }, { "epoch": 0.15739838628817904, "grad_norm": 0.24649831652641296, "learning_rate": 5e-06, "loss": 1.0134, "num_input_tokens_seen": 58967280, "step": 130, "train_runtime": 9392.9204, "train_tokens_per_second": 6277.843 }, { "epoch": 0.15860914310578042, "grad_norm": 0.2472827285528183, "learning_rate": 5e-06, "loss": 1.0178, "num_input_tokens_seen": 59412456, "step": 131, "train_runtime": 9468.2871, "train_tokens_per_second": 6274.89 }, { "epoch": 0.1598198999233818, "grad_norm": 0.2545448839664459, "learning_rate": 5e-06, "loss": 1.0606, "num_input_tokens_seen": 59878688, "step": 132, "train_runtime": 9545.4506, "train_tokens_per_second": 6273.008 }, { "epoch": 0.16103065674098319, "grad_norm": 0.2501581013202667, "learning_rate": 5e-06, "loss": 1.0483, "num_input_tokens_seen": 60330256, "step": 133, "train_runtime": 9616.5112, "train_tokens_per_second": 6273.612 }, { "epoch": 0.16224141355858454, "grad_norm": 0.29199784994125366, "learning_rate": 5e-06, "loss": 1.0331, "num_input_tokens_seen": 60784672, "step": 134, "train_runtime": 9687.146, "train_tokens_per_second": 6274.776 }, { "epoch": 0.16345217037618592, "grad_norm": 0.23874440789222717, "learning_rate": 5e-06, "loss": 1.0224, "num_input_tokens_seen": 61251264, "step": 135, "train_runtime": 9761.2608, "train_tokens_per_second": 6274.934 }, { "epoch": 0.1646629271937873, "grad_norm": 0.25831273198127747, "learning_rate": 5e-06, "loss": 1.0679, "num_input_tokens_seen": 61709040, "step": 136, "train_runtime": 9835.0216, "train_tokens_per_second": 6274.418 }, { "epoch": 0.1658736840113887, "grad_norm": 0.25276923179626465, "learning_rate": 5e-06, "loss": 1.0455, "num_input_tokens_seen": 62160304, "step": 137, "train_runtime": 9909.0578, "train_tokens_per_second": 6273.079 }, { "epoch": 0.16708444082899007, "grad_norm": 0.29279229044914246, "learning_rate": 5e-06, "loss": 1.0555, "num_input_tokens_seen": 62591968, "step": 138, "train_runtime": 9979.8666, "train_tokens_per_second": 6271.824 }, { "epoch": 0.16829519764659143, "grad_norm": 0.2797205448150635, "learning_rate": 5e-06, "loss": 0.9834, "num_input_tokens_seen": 63045184, "step": 139, "train_runtime": 10056.584, "train_tokens_per_second": 6269.046 }, { "epoch": 0.1695059544641928, "grad_norm": 0.2773694396018982, "learning_rate": 5e-06, "loss": 0.9939, "num_input_tokens_seen": 63504472, "step": 140, "train_runtime": 10134.2006, "train_tokens_per_second": 6266.352 }, { "epoch": 0.1707167112817942, "grad_norm": 0.22478176653385162, "learning_rate": 5e-06, "loss": 1.0418, "num_input_tokens_seen": 63957288, "step": 141, "train_runtime": 10210.8231, "train_tokens_per_second": 6263.676 }, { "epoch": 0.17192746809939558, "grad_norm": 0.24870216846466064, "learning_rate": 5e-06, "loss": 1.054, "num_input_tokens_seen": 64428128, "step": 142, "train_runtime": 10288.8907, "train_tokens_per_second": 6261.912 }, { "epoch": 0.17313822491699696, "grad_norm": 0.22447937726974487, "learning_rate": 5e-06, "loss": 1.0385, "num_input_tokens_seen": 64891424, "step": 143, "train_runtime": 10358.86, "train_tokens_per_second": 6264.34 }, { "epoch": 0.1743489817345983, "grad_norm": 0.25018176436424255, "learning_rate": 5e-06, "loss": 1.0231, "num_input_tokens_seen": 65354392, "step": 144, "train_runtime": 10428.4387, "train_tokens_per_second": 6266.939 }, { "epoch": 0.1755597385521997, "grad_norm": 0.2601490914821625, "learning_rate": 5e-06, "loss": 1.0256, "num_input_tokens_seen": 65810304, "step": 145, "train_runtime": 10501.5228, "train_tokens_per_second": 6266.739 }, { "epoch": 0.17677049536980108, "grad_norm": 0.24077767133712769, "learning_rate": 5e-06, "loss": 1.032, "num_input_tokens_seen": 66289808, "step": 146, "train_runtime": 10580.6955, "train_tokens_per_second": 6265.165 }, { "epoch": 0.17798125218740246, "grad_norm": 0.2406504601240158, "learning_rate": 5e-06, "loss": 1.0469, "num_input_tokens_seen": 66738720, "step": 147, "train_runtime": 10650.666, "train_tokens_per_second": 6266.155 }, { "epoch": 0.17919200900500384, "grad_norm": 0.22819995880126953, "learning_rate": 5e-06, "loss": 1.0674, "num_input_tokens_seen": 67205640, "step": 148, "train_runtime": 10723.6866, "train_tokens_per_second": 6267.028 }, { "epoch": 0.1804027658226052, "grad_norm": 0.2443617284297943, "learning_rate": 5e-06, "loss": 1.0194, "num_input_tokens_seen": 67674720, "step": 149, "train_runtime": 10796.9457, "train_tokens_per_second": 6267.95 }, { "epoch": 0.18161352264020658, "grad_norm": 0.30922770500183105, "learning_rate": 5e-06, "loss": 1.1078, "num_input_tokens_seen": 68119800, "step": 150, "train_runtime": 10867.1223, "train_tokens_per_second": 6268.43 }, { "epoch": 0.18282427945780796, "grad_norm": 0.24705801904201508, "learning_rate": 5e-06, "loss": 1.0226, "num_input_tokens_seen": 68569240, "step": 151, "train_runtime": 10937.4488, "train_tokens_per_second": 6269.217 }, { "epoch": 0.18403503627540935, "grad_norm": 0.2428549975156784, "learning_rate": 5e-06, "loss": 1.0586, "num_input_tokens_seen": 69024352, "step": 152, "train_runtime": 11008.538, "train_tokens_per_second": 6270.074 }, { "epoch": 0.18524579309301073, "grad_norm": 0.23061682283878326, "learning_rate": 5e-06, "loss": 1.0129, "num_input_tokens_seen": 69487680, "step": 153, "train_runtime": 11081.4546, "train_tokens_per_second": 6270.628 }, { "epoch": 0.18645654991061208, "grad_norm": 0.2367316484451294, "learning_rate": 5e-06, "loss": 1.0437, "num_input_tokens_seen": 69923312, "step": 154, "train_runtime": 11149.4652, "train_tokens_per_second": 6271.45 }, { "epoch": 0.18766730672821347, "grad_norm": 0.24783264100551605, "learning_rate": 5e-06, "loss": 1.0682, "num_input_tokens_seen": 70375368, "step": 155, "train_runtime": 11219.9874, "train_tokens_per_second": 6272.321 }, { "epoch": 0.18887806354581485, "grad_norm": 0.22279201447963715, "learning_rate": 5e-06, "loss": 1.0105, "num_input_tokens_seen": 70836136, "step": 156, "train_runtime": 11292.4524, "train_tokens_per_second": 6272.874 }, { "epoch": 0.19008882036341623, "grad_norm": 0.22752974927425385, "learning_rate": 5e-06, "loss": 1.008, "num_input_tokens_seen": 71284208, "step": 157, "train_runtime": 11362.5883, "train_tokens_per_second": 6273.589 }, { "epoch": 0.19129957718101762, "grad_norm": 0.21871839463710785, "learning_rate": 5e-06, "loss": 1.0518, "num_input_tokens_seen": 71773848, "step": 158, "train_runtime": 11438.8409, "train_tokens_per_second": 6274.573 }, { "epoch": 0.19251033399861897, "grad_norm": 0.23992645740509033, "learning_rate": 5e-06, "loss": 0.9932, "num_input_tokens_seen": 72243136, "step": 159, "train_runtime": 11512.8218, "train_tokens_per_second": 6275.016 }, { "epoch": 0.19372109081622035, "grad_norm": 0.25232523679733276, "learning_rate": 5e-06, "loss": 1.0152, "num_input_tokens_seen": 72702040, "step": 160, "train_runtime": 11584.5481, "train_tokens_per_second": 6275.777 }, { "epoch": 0.19493184763382174, "grad_norm": 0.2552812695503235, "learning_rate": 5e-06, "loss": 1.0435, "num_input_tokens_seen": 73152944, "step": 161, "train_runtime": 11655.4728, "train_tokens_per_second": 6276.274 }, { "epoch": 0.19614260445142312, "grad_norm": 0.24950732290744781, "learning_rate": 5e-06, "loss": 1.0205, "num_input_tokens_seen": 73622448, "step": 162, "train_runtime": 11728.7466, "train_tokens_per_second": 6277.094 }, { "epoch": 0.1973533612690245, "grad_norm": 0.23558129370212555, "learning_rate": 5e-06, "loss": 1.0464, "num_input_tokens_seen": 74070512, "step": 163, "train_runtime": 11798.7407, "train_tokens_per_second": 6277.832 }, { "epoch": 0.19856411808662586, "grad_norm": 0.2387412041425705, "learning_rate": 5e-06, "loss": 1.0509, "num_input_tokens_seen": 74523176, "step": 164, "train_runtime": 11869.5779, "train_tokens_per_second": 6278.503 }, { "epoch": 0.19977487490422724, "grad_norm": 0.2554919421672821, "learning_rate": 5e-06, "loss": 1.0716, "num_input_tokens_seen": 74985568, "step": 165, "train_runtime": 11942.0706, "train_tokens_per_second": 6279.109 }, { "epoch": 0.20098563172182862, "grad_norm": 0.24104657769203186, "learning_rate": 5e-06, "loss": 1.0549, "num_input_tokens_seen": 75436832, "step": 166, "train_runtime": 12012.5743, "train_tokens_per_second": 6279.822 }, { "epoch": 0.20219638853943, "grad_norm": 0.2571240961551666, "learning_rate": 5e-06, "loss": 1.0771, "num_input_tokens_seen": 75895608, "step": 167, "train_runtime": 12084.2953, "train_tokens_per_second": 6280.516 }, { "epoch": 0.2034071453570314, "grad_norm": 0.2907203733921051, "learning_rate": 5e-06, "loss": 1.0271, "num_input_tokens_seen": 76343416, "step": 168, "train_runtime": 12154.2802, "train_tokens_per_second": 6281.196 }, { "epoch": 0.20461790217463274, "grad_norm": 0.2559382915496826, "learning_rate": 5e-06, "loss": 1.0148, "num_input_tokens_seen": 76810064, "step": 169, "train_runtime": 12226.7946, "train_tokens_per_second": 6282.11 }, { "epoch": 0.20582865899223413, "grad_norm": 0.26620903611183167, "learning_rate": 5e-06, "loss": 1.0857, "num_input_tokens_seen": 77255168, "step": 170, "train_runtime": 12296.1273, "train_tokens_per_second": 6282.886 }, { "epoch": 0.2070394158098355, "grad_norm": 0.2579341530799866, "learning_rate": 5e-06, "loss": 1.0163, "num_input_tokens_seen": 77712312, "step": 171, "train_runtime": 12367.9858, "train_tokens_per_second": 6283.344 }, { "epoch": 0.2082501726274369, "grad_norm": 0.2516046166419983, "learning_rate": 5e-06, "loss": 1.0318, "num_input_tokens_seen": 78158176, "step": 172, "train_runtime": 12437.6935, "train_tokens_per_second": 6283.977 }, { "epoch": 0.20946092944503827, "grad_norm": 0.26422518491744995, "learning_rate": 5e-06, "loss": 1.0003, "num_input_tokens_seen": 78631984, "step": 173, "train_runtime": 12512.0888, "train_tokens_per_second": 6284.481 }, { "epoch": 0.21067168626263963, "grad_norm": 0.2679826617240906, "learning_rate": 5e-06, "loss": 1.01, "num_input_tokens_seen": 79092368, "step": 174, "train_runtime": 12584.1353, "train_tokens_per_second": 6285.086 }, { "epoch": 0.211882443080241, "grad_norm": 0.23957136273384094, "learning_rate": 5e-06, "loss": 1.0359, "num_input_tokens_seen": 79562144, "step": 175, "train_runtime": 12657.2444, "train_tokens_per_second": 6285.898 }, { "epoch": 0.2130931998978424, "grad_norm": 0.2504132091999054, "learning_rate": 5e-06, "loss": 1.0057, "num_input_tokens_seen": 79997152, "step": 176, "train_runtime": 12725.4596, "train_tokens_per_second": 6286.386 }, { "epoch": 0.21430395671544378, "grad_norm": 0.24493563175201416, "learning_rate": 5e-06, "loss": 1.0224, "num_input_tokens_seen": 80452312, "step": 177, "train_runtime": 12796.7177, "train_tokens_per_second": 6286.949 }, { "epoch": 0.21551471353304516, "grad_norm": 0.24307624995708466, "learning_rate": 5e-06, "loss": 1.0201, "num_input_tokens_seen": 80895192, "step": 178, "train_runtime": 12866.1831, "train_tokens_per_second": 6287.427 }, { "epoch": 0.21672547035064652, "grad_norm": 0.22720018029212952, "learning_rate": 5e-06, "loss": 0.9935, "num_input_tokens_seen": 81373192, "step": 179, "train_runtime": 12941.1713, "train_tokens_per_second": 6287.931 }, { "epoch": 0.2179362271682479, "grad_norm": 0.24937334656715393, "learning_rate": 5e-06, "loss": 0.9786, "num_input_tokens_seen": 81840648, "step": 180, "train_runtime": 13013.8198, "train_tokens_per_second": 6288.749 }, { "epoch": 0.21914698398584928, "grad_norm": 0.2576950490474701, "learning_rate": 5e-06, "loss": 1.0603, "num_input_tokens_seen": 82297504, "step": 181, "train_runtime": 13084.9624, "train_tokens_per_second": 6289.472 }, { "epoch": 0.22035774080345066, "grad_norm": 0.2821928560733795, "learning_rate": 5e-06, "loss": 1.0463, "num_input_tokens_seen": 82729496, "step": 182, "train_runtime": 13152.2778, "train_tokens_per_second": 6290.127 }, { "epoch": 0.22156849762105205, "grad_norm": 0.2612816095352173, "learning_rate": 5e-06, "loss": 0.9959, "num_input_tokens_seen": 83169848, "step": 183, "train_runtime": 13221.1044, "train_tokens_per_second": 6290.688 }, { "epoch": 0.2227792544386534, "grad_norm": 0.24119819700717926, "learning_rate": 5e-06, "loss": 1.0453, "num_input_tokens_seen": 83621816, "step": 184, "train_runtime": 13291.5963, "train_tokens_per_second": 6291.33 }, { "epoch": 0.22399001125625478, "grad_norm": 0.2350812554359436, "learning_rate": 5e-06, "loss": 1.0488, "num_input_tokens_seen": 84093008, "step": 185, "train_runtime": 13365.1515, "train_tokens_per_second": 6291.961 }, { "epoch": 0.22520076807385617, "grad_norm": 0.23204365372657776, "learning_rate": 5e-06, "loss": 1.0438, "num_input_tokens_seen": 84548848, "step": 186, "train_runtime": 13436.6704, "train_tokens_per_second": 6292.396 }, { "epoch": 0.22641152489145755, "grad_norm": 0.21973128616809845, "learning_rate": 5e-06, "loss": 1.0221, "num_input_tokens_seen": 85006432, "step": 187, "train_runtime": 13508.1616, "train_tokens_per_second": 6292.968 }, { "epoch": 0.22762228170905893, "grad_norm": 0.22889819741249084, "learning_rate": 5e-06, "loss": 1.0409, "num_input_tokens_seen": 85473544, "step": 188, "train_runtime": 13580.8488, "train_tokens_per_second": 6293.682 }, { "epoch": 0.2288330385266603, "grad_norm": 0.22178350389003754, "learning_rate": 5e-06, "loss": 0.9972, "num_input_tokens_seen": 85935408, "step": 189, "train_runtime": 13653.0892, "train_tokens_per_second": 6294.21 }, { "epoch": 0.23004379534426167, "grad_norm": 0.22922936081886292, "learning_rate": 5e-06, "loss": 1.1049, "num_input_tokens_seen": 86403776, "step": 190, "train_runtime": 13726.5237, "train_tokens_per_second": 6294.658 }, { "epoch": 0.23125455216186305, "grad_norm": 0.24582232534885406, "learning_rate": 5e-06, "loss": 1.0694, "num_input_tokens_seen": 86866200, "step": 191, "train_runtime": 13799.1133, "train_tokens_per_second": 6295.057 }, { "epoch": 0.23246530897946444, "grad_norm": 0.24143490195274353, "learning_rate": 5e-06, "loss": 1.0036, "num_input_tokens_seen": 87327440, "step": 192, "train_runtime": 13871.561, "train_tokens_per_second": 6295.43 }, { "epoch": 0.23367606579706582, "grad_norm": 0.2200412005186081, "learning_rate": 5e-06, "loss": 1.0321, "num_input_tokens_seen": 87788752, "step": 193, "train_runtime": 13943.4418, "train_tokens_per_second": 6296.06 }, { "epoch": 0.23488682261466717, "grad_norm": 0.24762044847011566, "learning_rate": 5e-06, "loss": 1.0363, "num_input_tokens_seen": 88219736, "step": 194, "train_runtime": 14010.4722, "train_tokens_per_second": 6296.7 }, { "epoch": 0.23609757943226856, "grad_norm": 0.23594461381435394, "learning_rate": 5e-06, "loss": 1.0704, "num_input_tokens_seen": 88680528, "step": 195, "train_runtime": 14082.4487, "train_tokens_per_second": 6297.238 }, { "epoch": 0.23730833624986994, "grad_norm": 0.24670927226543427, "learning_rate": 5e-06, "loss": 1.0181, "num_input_tokens_seen": 89139152, "step": 196, "train_runtime": 14154.1641, "train_tokens_per_second": 6297.733 }, { "epoch": 0.23851909306747132, "grad_norm": 0.2432672679424286, "learning_rate": 5e-06, "loss": 0.9717, "num_input_tokens_seen": 89603368, "step": 197, "train_runtime": 14226.3618, "train_tokens_per_second": 6298.404 }, { "epoch": 0.2397298498850727, "grad_norm": 0.2482805699110031, "learning_rate": 5e-06, "loss": 0.9996, "num_input_tokens_seen": 90053800, "step": 198, "train_runtime": 14296.6962, "train_tokens_per_second": 6298.924 }, { "epoch": 0.24094060670267406, "grad_norm": 0.2421431541442871, "learning_rate": 5e-06, "loss": 0.9607, "num_input_tokens_seen": 90508144, "step": 199, "train_runtime": 14368.0885, "train_tokens_per_second": 6299.247 }, { "epoch": 0.24215136352027544, "grad_norm": 0.21828782558441162, "learning_rate": 5e-06, "loss": 0.9602, "num_input_tokens_seen": 90981216, "step": 200, "train_runtime": 14442.2784, "train_tokens_per_second": 6299.644 }, { "epoch": 0.24336212033787682, "grad_norm": 0.25093552470207214, "learning_rate": 5e-06, "loss": 1.0223, "num_input_tokens_seen": 91405344, "step": 201, "train_runtime": 14508.6653, "train_tokens_per_second": 6300.052 }, { "epoch": 0.2445728771554782, "grad_norm": 0.2346261888742447, "learning_rate": 5e-06, "loss": 1.0457, "num_input_tokens_seen": 91867920, "step": 202, "train_runtime": 14581.2729, "train_tokens_per_second": 6300.405 }, { "epoch": 0.2457836339730796, "grad_norm": 0.2555064260959625, "learning_rate": 5e-06, "loss": 1.0239, "num_input_tokens_seen": 92332376, "step": 203, "train_runtime": 14654.0594, "train_tokens_per_second": 6300.805 }, { "epoch": 0.24699439079068095, "grad_norm": 0.24753707647323608, "learning_rate": 5e-06, "loss": 1.0078, "num_input_tokens_seen": 92798256, "step": 204, "train_runtime": 14727.0741, "train_tokens_per_second": 6301.201 }, { "epoch": 0.24820514760828233, "grad_norm": 0.22091752290725708, "learning_rate": 5e-06, "loss": 0.9881, "num_input_tokens_seen": 93260920, "step": 205, "train_runtime": 14799.4511, "train_tokens_per_second": 6301.647 }, { "epoch": 0.2494159044258837, "grad_norm": 0.23978286981582642, "learning_rate": 5e-06, "loss": 1.0102, "num_input_tokens_seen": 93719680, "step": 206, "train_runtime": 14871.4509, "train_tokens_per_second": 6301.986 }, { "epoch": 0.2506266612434851, "grad_norm": 0.2572280466556549, "learning_rate": 5e-06, "loss": 0.9997, "num_input_tokens_seen": 94167864, "step": 207, "train_runtime": 14941.2387, "train_tokens_per_second": 6302.547 }, { "epoch": 0.25183741806108645, "grad_norm": 0.22775068879127502, "learning_rate": 5e-06, "loss": 0.9997, "num_input_tokens_seen": 94629576, "step": 208, "train_runtime": 15013.5212, "train_tokens_per_second": 6302.957 }, { "epoch": 0.25304817487868786, "grad_norm": 0.24101892113685608, "learning_rate": 5e-06, "loss": 1.0547, "num_input_tokens_seen": 95088576, "step": 209, "train_runtime": 15085.7957, "train_tokens_per_second": 6303.186 }, { "epoch": 0.2542589316962892, "grad_norm": 0.23462055623531342, "learning_rate": 5e-06, "loss": 1.0055, "num_input_tokens_seen": 95528848, "step": 210, "train_runtime": 15154.3744, "train_tokens_per_second": 6303.714 }, { "epoch": 0.25546968851389057, "grad_norm": 0.21969425678253174, "learning_rate": 5e-06, "loss": 1.0179, "num_input_tokens_seen": 96012504, "step": 211, "train_runtime": 15229.6926, "train_tokens_per_second": 6304.297 }, { "epoch": 0.256680445331492, "grad_norm": 0.2324143946170807, "learning_rate": 5e-06, "loss": 1.0263, "num_input_tokens_seen": 96478288, "step": 212, "train_runtime": 15302.7525, "train_tokens_per_second": 6304.636 }, { "epoch": 0.25789120214909333, "grad_norm": 0.2410186231136322, "learning_rate": 5e-06, "loss": 1.0705, "num_input_tokens_seen": 96927768, "step": 213, "train_runtime": 15372.6647, "train_tokens_per_second": 6305.203 }, { "epoch": 0.25910195896669475, "grad_norm": 0.2557809352874756, "learning_rate": 5e-06, "loss": 1.0136, "num_input_tokens_seen": 97369112, "step": 214, "train_runtime": 15441.8864, "train_tokens_per_second": 6305.519 }, { "epoch": 0.2603127157842961, "grad_norm": 0.22955191135406494, "learning_rate": 5e-06, "loss": 0.9885, "num_input_tokens_seen": 97837320, "step": 215, "train_runtime": 15515.0556, "train_tokens_per_second": 6305.96 }, { "epoch": 0.26152347260189746, "grad_norm": 0.23326116800308228, "learning_rate": 5e-06, "loss": 1.0407, "num_input_tokens_seen": 98273464, "step": 216, "train_runtime": 15582.8966, "train_tokens_per_second": 6306.495 }, { "epoch": 0.26273422941949887, "grad_norm": 0.2623524069786072, "learning_rate": 5e-06, "loss": 0.9967, "num_input_tokens_seen": 98734576, "step": 217, "train_runtime": 15654.599, "train_tokens_per_second": 6307.065 }, { "epoch": 0.2639449862371002, "grad_norm": 0.2236497849225998, "learning_rate": 5e-06, "loss": 0.9819, "num_input_tokens_seen": 99183248, "step": 218, "train_runtime": 15724.0156, "train_tokens_per_second": 6307.756 }, { "epoch": 0.26515574305470163, "grad_norm": 0.2309817373752594, "learning_rate": 5e-06, "loss": 1.0123, "num_input_tokens_seen": 99655480, "step": 219, "train_runtime": 15797.6975, "train_tokens_per_second": 6308.228 }, { "epoch": 0.266366499872303, "grad_norm": 0.22036534547805786, "learning_rate": 5e-06, "loss": 1.0621, "num_input_tokens_seen": 100126896, "step": 220, "train_runtime": 15875.4973, "train_tokens_per_second": 6307.008 }, { "epoch": 0.26757725668990434, "grad_norm": 0.24294357001781464, "learning_rate": 5e-06, "loss": 1.0296, "num_input_tokens_seen": 100585544, "step": 221, "train_runtime": 15948.2329, "train_tokens_per_second": 6307.002 }, { "epoch": 0.26878801350750575, "grad_norm": 0.2395816445350647, "learning_rate": 5e-06, "loss": 1.0843, "num_input_tokens_seen": 101027704, "step": 222, "train_runtime": 16017.0188, "train_tokens_per_second": 6307.522 }, { "epoch": 0.2699987703251071, "grad_norm": 0.23171593248844147, "learning_rate": 5e-06, "loss": 1.0259, "num_input_tokens_seen": 101494896, "step": 223, "train_runtime": 16089.9686, "train_tokens_per_second": 6307.961 }, { "epoch": 0.2712095271427085, "grad_norm": 0.23881399631500244, "learning_rate": 5e-06, "loss": 1.0312, "num_input_tokens_seen": 101945248, "step": 224, "train_runtime": 16161.1769, "train_tokens_per_second": 6308.034 }, { "epoch": 0.2724202839603099, "grad_norm": 0.23741568624973297, "learning_rate": 5e-06, "loss": 1.0388, "num_input_tokens_seen": 102381600, "step": 225, "train_runtime": 16233.0137, "train_tokens_per_second": 6306.999 }, { "epoch": 0.2736310407779113, "grad_norm": 0.2587156295776367, "learning_rate": 5e-06, "loss": 1.053, "num_input_tokens_seen": 102853016, "step": 226, "train_runtime": 16310.47, "train_tokens_per_second": 6305.95 }, { "epoch": 0.27484179759551264, "grad_norm": 0.25893622636795044, "learning_rate": 5e-06, "loss": 1.0546, "num_input_tokens_seen": 103292264, "step": 227, "train_runtime": 16382.2499, "train_tokens_per_second": 6305.133 }, { "epoch": 0.276052554413114, "grad_norm": 0.235712468624115, "learning_rate": 5e-06, "loss": 1.0638, "num_input_tokens_seen": 103744464, "step": 228, "train_runtime": 16455.8156, "train_tokens_per_second": 6304.426 }, { "epoch": 0.2772633112307154, "grad_norm": 0.2683420181274414, "learning_rate": 5e-06, "loss": 1.0731, "num_input_tokens_seen": 104191136, "step": 229, "train_runtime": 16528.5498, "train_tokens_per_second": 6303.707 }, { "epoch": 0.27847406804831676, "grad_norm": 0.22673234343528748, "learning_rate": 5e-06, "loss": 0.996, "num_input_tokens_seen": 104663120, "step": 230, "train_runtime": 16605.3871, "train_tokens_per_second": 6302.962 }, { "epoch": 0.27968482486591817, "grad_norm": 0.2398988902568817, "learning_rate": 5e-06, "loss": 1.0543, "num_input_tokens_seen": 105118544, "step": 231, "train_runtime": 16679.8469, "train_tokens_per_second": 6302.129 }, { "epoch": 0.2808955816835195, "grad_norm": 0.2677454948425293, "learning_rate": 5e-06, "loss": 1.0094, "num_input_tokens_seen": 105588584, "step": 232, "train_runtime": 16756.2821, "train_tokens_per_second": 6301.433 }, { "epoch": 0.2821063385011209, "grad_norm": 0.2396971434354782, "learning_rate": 5e-06, "loss": 0.9976, "num_input_tokens_seen": 106052280, "step": 233, "train_runtime": 16831.9341, "train_tokens_per_second": 6300.659 }, { "epoch": 0.2833170953187223, "grad_norm": 0.2204187661409378, "learning_rate": 5e-06, "loss": 0.9871, "num_input_tokens_seen": 106493280, "step": 234, "train_runtime": 16903.9239, "train_tokens_per_second": 6299.915 }, { "epoch": 0.28452785213632364, "grad_norm": 0.2463349997997284, "learning_rate": 5e-06, "loss": 0.9915, "num_input_tokens_seen": 106971464, "step": 235, "train_runtime": 16982.022, "train_tokens_per_second": 6299.101 }, { "epoch": 0.28573860895392506, "grad_norm": 0.22036071121692657, "learning_rate": 5e-06, "loss": 0.9812, "num_input_tokens_seen": 107447560, "step": 236, "train_runtime": 17059.9653, "train_tokens_per_second": 6298.229 }, { "epoch": 0.2869493657715264, "grad_norm": 0.2353561669588089, "learning_rate": 5e-06, "loss": 0.9932, "num_input_tokens_seen": 107889344, "step": 237, "train_runtime": 17132.2103, "train_tokens_per_second": 6297.456 }, { "epoch": 0.28816012258912777, "grad_norm": 0.23488640785217285, "learning_rate": 5e-06, "loss": 1.0296, "num_input_tokens_seen": 108347712, "step": 238, "train_runtime": 17206.8913, "train_tokens_per_second": 6296.763 }, { "epoch": 0.2893708794067292, "grad_norm": 0.23872198164463043, "learning_rate": 5e-06, "loss": 1.0054, "num_input_tokens_seen": 108790344, "step": 239, "train_runtime": 17277.6892, "train_tokens_per_second": 6296.58 }, { "epoch": 0.29058163622433053, "grad_norm": 0.2371063083410263, "learning_rate": 5e-06, "loss": 1.009, "num_input_tokens_seen": 109245808, "step": 240, "train_runtime": 17344.4981, "train_tokens_per_second": 6298.586 }, { "epoch": 0.29179239304193194, "grad_norm": 0.2168145626783371, "learning_rate": 5e-06, "loss": 0.9897, "num_input_tokens_seen": 109708688, "step": 241, "train_runtime": 17412.6292, "train_tokens_per_second": 6300.524 }, { "epoch": 0.2930031498595333, "grad_norm": 0.230647012591362, "learning_rate": 5e-06, "loss": 1.0172, "num_input_tokens_seen": 110166104, "step": 242, "train_runtime": 17480.5962, "train_tokens_per_second": 6302.194 }, { "epoch": 0.29421390667713465, "grad_norm": 0.2462947964668274, "learning_rate": 5e-06, "loss": 1.0429, "num_input_tokens_seen": 110603016, "step": 243, "train_runtime": 17550.9466, "train_tokens_per_second": 6301.826 }, { "epoch": 0.29542466349473606, "grad_norm": 0.2439439445734024, "learning_rate": 5e-06, "loss": 0.9871, "num_input_tokens_seen": 111038080, "step": 244, "train_runtime": 17621.5292, "train_tokens_per_second": 6301.274 }, { "epoch": 0.2966354203123374, "grad_norm": 0.24288234114646912, "learning_rate": 5e-06, "loss": 1.0544, "num_input_tokens_seen": 111474408, "step": 245, "train_runtime": 17694.1217, "train_tokens_per_second": 6300.081 }, { "epoch": 0.2978461771299388, "grad_norm": 0.2557252049446106, "learning_rate": 5e-06, "loss": 1.0276, "num_input_tokens_seen": 111923176, "step": 246, "train_runtime": 17768.7153, "train_tokens_per_second": 6298.89 }, { "epoch": 0.2990569339475402, "grad_norm": 0.25596141815185547, "learning_rate": 5e-06, "loss": 0.9989, "num_input_tokens_seen": 112397080, "step": 247, "train_runtime": 17850.0516, "train_tokens_per_second": 6296.737 }, { "epoch": 0.30026769076514154, "grad_norm": 0.21673010289669037, "learning_rate": 5e-06, "loss": 0.9963, "num_input_tokens_seen": 112862520, "step": 248, "train_runtime": 17937.4215, "train_tokens_per_second": 6292.015 }, { "epoch": 0.30147844758274295, "grad_norm": 0.26896172761917114, "learning_rate": 5e-06, "loss": 1.0335, "num_input_tokens_seen": 113290872, "step": 249, "train_runtime": 18018.1765, "train_tokens_per_second": 6287.588 }, { "epoch": 0.3026892044003443, "grad_norm": 0.2385682761669159, "learning_rate": 5e-06, "loss": 0.9976, "num_input_tokens_seen": 113759016, "step": 250, "train_runtime": 18107.0103, "train_tokens_per_second": 6282.595 }, { "epoch": 0.3038999612179457, "grad_norm": 0.22848090529441833, "learning_rate": 5e-06, "loss": 0.9864, "num_input_tokens_seen": 114208944, "step": 251, "train_runtime": 18191.4232, "train_tokens_per_second": 6278.175 }, { "epoch": 0.30511071803554707, "grad_norm": 0.23898521065711975, "learning_rate": 5e-06, "loss": 0.9731, "num_input_tokens_seen": 114673264, "step": 252, "train_runtime": 18278.8132, "train_tokens_per_second": 6273.562 }, { "epoch": 0.3063214748531484, "grad_norm": 0.23195713758468628, "learning_rate": 5e-06, "loss": 1.0309, "num_input_tokens_seen": 115123744, "step": 253, "train_runtime": 18364.6246, "train_tokens_per_second": 6268.777 }, { "epoch": 0.30753223167074983, "grad_norm": 0.257159948348999, "learning_rate": 5e-06, "loss": 1.0214, "num_input_tokens_seen": 115555688, "step": 254, "train_runtime": 18445.2087, "train_tokens_per_second": 6264.808 }, { "epoch": 0.3087429884883512, "grad_norm": 0.2846441864967346, "learning_rate": 5e-06, "loss": 0.9902, "num_input_tokens_seen": 116006248, "step": 255, "train_runtime": 18529.8186, "train_tokens_per_second": 6260.517 }, { "epoch": 0.3099537453059526, "grad_norm": 0.21537640690803528, "learning_rate": 5e-06, "loss": 0.9767, "num_input_tokens_seen": 116464472, "step": 256, "train_runtime": 18601.195, "train_tokens_per_second": 6261.128 }, { "epoch": 0.31116450212355395, "grad_norm": 0.2560320496559143, "learning_rate": 5e-06, "loss": 1.0474, "num_input_tokens_seen": 116881080, "step": 257, "train_runtime": 18665.8155, "train_tokens_per_second": 6261.772 }, { "epoch": 0.3123752589411553, "grad_norm": 0.23951588571071625, "learning_rate": 5e-06, "loss": 1.0078, "num_input_tokens_seen": 117334528, "step": 258, "train_runtime": 18736.5275, "train_tokens_per_second": 6262.341 }, { "epoch": 0.3135860157587567, "grad_norm": 0.233546182513237, "learning_rate": 5e-06, "loss": 1.0615, "num_input_tokens_seen": 117794752, "step": 259, "train_runtime": 18808.0733, "train_tokens_per_second": 6262.989 }, { "epoch": 0.3147967725763581, "grad_norm": 0.21725581586360931, "learning_rate": 5e-06, "loss": 0.9617, "num_input_tokens_seen": 118273288, "step": 260, "train_runtime": 18883.1278, "train_tokens_per_second": 6263.437 }, { "epoch": 0.3160075293939595, "grad_norm": 0.2287113070487976, "learning_rate": 5e-06, "loss": 0.9977, "num_input_tokens_seen": 118730128, "step": 261, "train_runtime": 18954.4265, "train_tokens_per_second": 6263.979 }, { "epoch": 0.31721828621156084, "grad_norm": 0.2280893474817276, "learning_rate": 5e-06, "loss": 1.0171, "num_input_tokens_seen": 119186808, "step": 262, "train_runtime": 19025.8236, "train_tokens_per_second": 6264.476 }, { "epoch": 0.3184290430291622, "grad_norm": 0.2364167869091034, "learning_rate": 5e-06, "loss": 1.0681, "num_input_tokens_seen": 119627160, "step": 263, "train_runtime": 19094.4088, "train_tokens_per_second": 6265.036 }, { "epoch": 0.3196397998467636, "grad_norm": 0.2416498214006424, "learning_rate": 5e-06, "loss": 1.0912, "num_input_tokens_seen": 120077824, "step": 264, "train_runtime": 19164.669, "train_tokens_per_second": 6265.583 }, { "epoch": 0.32085055666436496, "grad_norm": 0.23011499643325806, "learning_rate": 5e-06, "loss": 1.034, "num_input_tokens_seen": 120521112, "step": 265, "train_runtime": 19233.6771, "train_tokens_per_second": 6266.15 }, { "epoch": 0.32206131348196637, "grad_norm": 0.2434847503900528, "learning_rate": 5e-06, "loss": 1.0225, "num_input_tokens_seen": 120972064, "step": 266, "train_runtime": 19304.0456, "train_tokens_per_second": 6266.669 }, { "epoch": 0.3232720702995677, "grad_norm": 0.2501772344112396, "learning_rate": 5e-06, "loss": 1.0575, "num_input_tokens_seen": 121426920, "step": 267, "train_runtime": 19374.5798, "train_tokens_per_second": 6267.332 }, { "epoch": 0.3244828271171691, "grad_norm": 0.2651502788066864, "learning_rate": 5e-06, "loss": 1.0499, "num_input_tokens_seen": 121873184, "step": 268, "train_runtime": 19443.9026, "train_tokens_per_second": 6267.938 }, { "epoch": 0.3256935839347705, "grad_norm": 0.2908613979816437, "learning_rate": 5e-06, "loss": 1.0486, "num_input_tokens_seen": 122327152, "step": 269, "train_runtime": 19516.72, "train_tokens_per_second": 6267.813 }, { "epoch": 0.32690434075237185, "grad_norm": 0.23566846549510956, "learning_rate": 5e-06, "loss": 1.017, "num_input_tokens_seen": 122784960, "step": 270, "train_runtime": 19592.2753, "train_tokens_per_second": 6267.009 }, { "epoch": 0.32811509756997326, "grad_norm": 0.2798844575881958, "learning_rate": 5e-06, "loss": 1.0546, "num_input_tokens_seen": 123240512, "step": 271, "train_runtime": 19666.8911, "train_tokens_per_second": 6266.395 }, { "epoch": 0.3293258543875746, "grad_norm": 0.21824029088020325, "learning_rate": 5e-06, "loss": 0.9848, "num_input_tokens_seen": 123704960, "step": 272, "train_runtime": 19743.7002, "train_tokens_per_second": 6265.541 }, { "epoch": 0.33053661120517597, "grad_norm": 0.2295370250940323, "learning_rate": 5e-06, "loss": 1.0064, "num_input_tokens_seen": 124147728, "step": 273, "train_runtime": 19815.9537, "train_tokens_per_second": 6265.039 }, { "epoch": 0.3317473680227774, "grad_norm": 0.2582823932170868, "learning_rate": 5e-06, "loss": 1.0589, "num_input_tokens_seen": 124588360, "step": 274, "train_runtime": 19888.5652, "train_tokens_per_second": 6264.321 }, { "epoch": 0.33295812484037873, "grad_norm": 0.2539482116699219, "learning_rate": 5e-06, "loss": 0.9915, "num_input_tokens_seen": 125054408, "step": 275, "train_runtime": 19965.1487, "train_tokens_per_second": 6263.635 }, { "epoch": 0.33416888165798014, "grad_norm": 0.2645561695098877, "learning_rate": 5e-06, "loss": 0.9986, "num_input_tokens_seen": 125507560, "step": 276, "train_runtime": 20039.2339, "train_tokens_per_second": 6263.092 }, { "epoch": 0.3353796384755815, "grad_norm": 0.22155457735061646, "learning_rate": 5e-06, "loss": 0.9988, "num_input_tokens_seen": 125994072, "step": 277, "train_runtime": 20119.5235, "train_tokens_per_second": 6262.279 }, { "epoch": 0.33659039529318285, "grad_norm": 0.2287885844707489, "learning_rate": 5e-06, "loss": 1.0277, "num_input_tokens_seen": 126467528, "step": 278, "train_runtime": 20197.6843, "train_tokens_per_second": 6261.487 }, { "epoch": 0.33780115211078426, "grad_norm": 0.2466982901096344, "learning_rate": 5e-06, "loss": 1.0205, "num_input_tokens_seen": 126892704, "step": 279, "train_runtime": 20269.3446, "train_tokens_per_second": 6260.326 }, { "epoch": 0.3390119089283856, "grad_norm": 0.23624956607818604, "learning_rate": 5e-06, "loss": 0.9984, "num_input_tokens_seen": 127356472, "step": 280, "train_runtime": 20345.7248, "train_tokens_per_second": 6259.618 }, { "epoch": 0.34022266574598703, "grad_norm": 0.23681671917438507, "learning_rate": 5e-06, "loss": 1.0242, "num_input_tokens_seen": 127815984, "step": 281, "train_runtime": 20421.2658, "train_tokens_per_second": 6258.965 }, { "epoch": 0.3414334225635884, "grad_norm": 0.22485695779323578, "learning_rate": 5e-06, "loss": 1.0365, "num_input_tokens_seen": 128272520, "step": 282, "train_runtime": 20496.3233, "train_tokens_per_second": 6258.319 }, { "epoch": 0.34264417938118974, "grad_norm": 0.2541932165622711, "learning_rate": 5e-06, "loss": 0.9823, "num_input_tokens_seen": 128699760, "step": 283, "train_runtime": 20566.2431, "train_tokens_per_second": 6257.816 }, { "epoch": 0.34385493619879115, "grad_norm": 0.22476626932621002, "learning_rate": 5e-06, "loss": 1.0021, "num_input_tokens_seen": 129169888, "step": 284, "train_runtime": 20643.7442, "train_tokens_per_second": 6257.096 }, { "epoch": 0.3450656930163925, "grad_norm": 0.24232985079288483, "learning_rate": 5e-06, "loss": 0.9942, "num_input_tokens_seen": 129627816, "step": 285, "train_runtime": 20724.2188, "train_tokens_per_second": 6254.895 }, { "epoch": 0.3462764498339939, "grad_norm": 0.23191998898983002, "learning_rate": 5e-06, "loss": 1.0246, "num_input_tokens_seen": 130087264, "step": 286, "train_runtime": 20798.7497, "train_tokens_per_second": 6254.571 }, { "epoch": 0.34748720665159527, "grad_norm": 0.2423601895570755, "learning_rate": 5e-06, "loss": 0.9571, "num_input_tokens_seen": 130552064, "step": 287, "train_runtime": 20871.3382, "train_tokens_per_second": 6255.088 }, { "epoch": 0.3486979634691966, "grad_norm": 0.3263372480869293, "learning_rate": 5e-06, "loss": 1.0073, "num_input_tokens_seen": 131001008, "step": 288, "train_runtime": 20941.405, "train_tokens_per_second": 6255.598 }, { "epoch": 0.34990872028679804, "grad_norm": 0.2425222098827362, "learning_rate": 5e-06, "loss": 1.0397, "num_input_tokens_seen": 131456016, "step": 289, "train_runtime": 21012.6636, "train_tokens_per_second": 6256.038 }, { "epoch": 0.3511194771043994, "grad_norm": 0.24094624817371368, "learning_rate": 5e-06, "loss": 0.9737, "num_input_tokens_seen": 131911216, "step": 290, "train_runtime": 21083.6278, "train_tokens_per_second": 6256.571 }, { "epoch": 0.3523302339220008, "grad_norm": 0.2286059558391571, "learning_rate": 5e-06, "loss": 0.9598, "num_input_tokens_seen": 132364240, "step": 291, "train_runtime": 21153.9443, "train_tokens_per_second": 6257.19 }, { "epoch": 0.35354099073960216, "grad_norm": 0.22142821550369263, "learning_rate": 5e-06, "loss": 0.9791, "num_input_tokens_seen": 132820456, "step": 292, "train_runtime": 21225.0777, "train_tokens_per_second": 6257.714 }, { "epoch": 0.3547517475572035, "grad_norm": 0.25561171770095825, "learning_rate": 5e-06, "loss": 1.0434, "num_input_tokens_seen": 133258016, "step": 293, "train_runtime": 21293.1518, "train_tokens_per_second": 6258.257 }, { "epoch": 0.3559625043748049, "grad_norm": 0.23531781136989594, "learning_rate": 5e-06, "loss": 1.0001, "num_input_tokens_seen": 133723576, "step": 294, "train_runtime": 21366.0225, "train_tokens_per_second": 6258.702 }, { "epoch": 0.3571732611924063, "grad_norm": 0.22105760872364044, "learning_rate": 5e-06, "loss": 1.0213, "num_input_tokens_seen": 134179400, "step": 295, "train_runtime": 21437.4884, "train_tokens_per_second": 6259.101 }, { "epoch": 0.3583840180100077, "grad_norm": 0.26079460978507996, "learning_rate": 5e-06, "loss": 1.0322, "num_input_tokens_seen": 134624872, "step": 296, "train_runtime": 21507.5955, "train_tokens_per_second": 6259.411 }, { "epoch": 0.35959477482760904, "grad_norm": 0.2267124503850937, "learning_rate": 5e-06, "loss": 0.9923, "num_input_tokens_seen": 135108584, "step": 297, "train_runtime": 21582.4999, "train_tokens_per_second": 6260.099 }, { "epoch": 0.3608055316452104, "grad_norm": 0.247776061296463, "learning_rate": 5e-06, "loss": 0.9913, "num_input_tokens_seen": 135578000, "step": 298, "train_runtime": 21655.9937, "train_tokens_per_second": 6260.53 }, { "epoch": 0.3620162884628118, "grad_norm": 0.23508575558662415, "learning_rate": 5e-06, "loss": 0.9601, "num_input_tokens_seen": 136004656, "step": 299, "train_runtime": 21722.55, "train_tokens_per_second": 6260.989 }, { "epoch": 0.36322704528041316, "grad_norm": 0.25533682107925415, "learning_rate": 5e-06, "loss": 1.0357, "num_input_tokens_seen": 136462536, "step": 300, "train_runtime": 21793.1578, "train_tokens_per_second": 6261.715 }, { "epoch": 0.3644378020980146, "grad_norm": 0.2101793736219406, "learning_rate": 5e-06, "loss": 0.9743, "num_input_tokens_seen": 136933744, "step": 301, "train_runtime": 21866.9264, "train_tokens_per_second": 6262.14 }, { "epoch": 0.36564855891561593, "grad_norm": 0.2493451088666916, "learning_rate": 5e-06, "loss": 0.973, "num_input_tokens_seen": 137387912, "step": 302, "train_runtime": 21938.1245, "train_tokens_per_second": 6262.519 }, { "epoch": 0.3668593157332173, "grad_norm": 0.23311975598335266, "learning_rate": 5e-06, "loss": 1.0086, "num_input_tokens_seen": 137841208, "step": 303, "train_runtime": 22008.7483, "train_tokens_per_second": 6263.019 }, { "epoch": 0.3680700725508187, "grad_norm": 0.2377161979675293, "learning_rate": 5e-06, "loss": 1.0391, "num_input_tokens_seen": 138299912, "step": 304, "train_runtime": 22080.6387, "train_tokens_per_second": 6263.402 }, { "epoch": 0.36928082936842005, "grad_norm": 0.23572410643100739, "learning_rate": 5e-06, "loss": 1.0402, "num_input_tokens_seen": 138739904, "step": 305, "train_runtime": 22152.1917, "train_tokens_per_second": 6263.033 }, { "epoch": 0.37049158618602146, "grad_norm": 0.24770863354206085, "learning_rate": 5e-06, "loss": 1.0045, "num_input_tokens_seen": 139197120, "step": 306, "train_runtime": 22227.4884, "train_tokens_per_second": 6262.386 }, { "epoch": 0.3717023430036228, "grad_norm": 0.2456834316253662, "learning_rate": 5e-06, "loss": 1.0049, "num_input_tokens_seen": 139637016, "step": 307, "train_runtime": 22300.7598, "train_tokens_per_second": 6261.536 }, { "epoch": 0.37291309982122417, "grad_norm": 0.23433266580104828, "learning_rate": 5e-06, "loss": 0.9637, "num_input_tokens_seen": 140089624, "step": 308, "train_runtime": 22375.468, "train_tokens_per_second": 6260.858 }, { "epoch": 0.3741238566388256, "grad_norm": 0.28043490648269653, "learning_rate": 5e-06, "loss": 1.0334, "num_input_tokens_seen": 140517536, "step": 309, "train_runtime": 22445.9968, "train_tokens_per_second": 6260.249 }, { "epoch": 0.37533461345642694, "grad_norm": 0.26074591279029846, "learning_rate": 5e-06, "loss": 0.9666, "num_input_tokens_seen": 140988184, "step": 310, "train_runtime": 22523.6032, "train_tokens_per_second": 6259.575 }, { "epoch": 0.37654537027402835, "grad_norm": 0.2182447761297226, "learning_rate": 5e-06, "loss": 1.0189, "num_input_tokens_seen": 141453976, "step": 311, "train_runtime": 22600.0411, "train_tokens_per_second": 6259.014 }, { "epoch": 0.3777561270916297, "grad_norm": 0.30261749029159546, "learning_rate": 5e-06, "loss": 0.9974, "num_input_tokens_seen": 141907888, "step": 312, "train_runtime": 22674.4979, "train_tokens_per_second": 6258.48 }, { "epoch": 0.37896688390923106, "grad_norm": 0.2571166753768921, "learning_rate": 5e-06, "loss": 1.0201, "num_input_tokens_seen": 142370032, "step": 313, "train_runtime": 22750.3178, "train_tokens_per_second": 6257.936 }, { "epoch": 0.38017764072683247, "grad_norm": 0.23346489667892456, "learning_rate": 5e-06, "loss": 0.9982, "num_input_tokens_seen": 142807320, "step": 314, "train_runtime": 22822.3169, "train_tokens_per_second": 6257.354 }, { "epoch": 0.3813883975444338, "grad_norm": 0.23612311482429504, "learning_rate": 5e-06, "loss": 1.0125, "num_input_tokens_seen": 143261672, "step": 315, "train_runtime": 22897.0863, "train_tokens_per_second": 6256.764 }, { "epoch": 0.38259915436203523, "grad_norm": 0.26001793146133423, "learning_rate": 5e-06, "loss": 0.9806, "num_input_tokens_seen": 143708208, "step": 316, "train_runtime": 22971.0115, "train_tokens_per_second": 6256.068 }, { "epoch": 0.3838099111796366, "grad_norm": 0.26588013768196106, "learning_rate": 5e-06, "loss": 1.043, "num_input_tokens_seen": 144159888, "step": 317, "train_runtime": 23045.4895, "train_tokens_per_second": 6255.449 }, { "epoch": 0.38502066799723794, "grad_norm": 0.24810902774333954, "learning_rate": 5e-06, "loss": 1.0293, "num_input_tokens_seen": 144607736, "step": 318, "train_runtime": 23121.9728, "train_tokens_per_second": 6254.126 }, { "epoch": 0.38623142481483935, "grad_norm": 0.25210660696029663, "learning_rate": 5e-06, "loss": 1.0545, "num_input_tokens_seen": 145045848, "step": 319, "train_runtime": 23191.7625, "train_tokens_per_second": 6254.197 }, { "epoch": 0.3874421816324407, "grad_norm": 0.2451591044664383, "learning_rate": 5e-06, "loss": 0.9838, "num_input_tokens_seen": 145531920, "step": 320, "train_runtime": 23271.4709, "train_tokens_per_second": 6253.662 }, { "epoch": 0.3886529384500421, "grad_norm": 0.29514279961586, "learning_rate": 5e-06, "loss": 0.9623, "num_input_tokens_seen": 145970160, "step": 321, "train_runtime": 23345.6996, "train_tokens_per_second": 6252.55 }, { "epoch": 0.3898636952676435, "grad_norm": 0.270550936460495, "learning_rate": 5e-06, "loss": 1.0351, "num_input_tokens_seen": 146432040, "step": 322, "train_runtime": 23422.6662, "train_tokens_per_second": 6251.724 }, { "epoch": 0.39107445208524483, "grad_norm": 0.23111458122730255, "learning_rate": 5e-06, "loss": 0.9737, "num_input_tokens_seen": 146886712, "step": 323, "train_runtime": 23498.9622, "train_tokens_per_second": 6250.774 }, { "epoch": 0.39228520890284624, "grad_norm": 0.22839005291461945, "learning_rate": 5e-06, "loss": 0.9722, "num_input_tokens_seen": 147355208, "step": 324, "train_runtime": 23576.9231, "train_tokens_per_second": 6249.976 }, { "epoch": 0.3934959657204476, "grad_norm": 0.24810221791267395, "learning_rate": 5e-06, "loss": 0.9866, "num_input_tokens_seen": 147805720, "step": 325, "train_runtime": 23652.1704, "train_tokens_per_second": 6249.14 }, { "epoch": 0.394706722538049, "grad_norm": 0.23154482245445251, "learning_rate": 5e-06, "loss": 0.9954, "num_input_tokens_seen": 148273872, "step": 326, "train_runtime": 23729.723, "train_tokens_per_second": 6248.445 }, { "epoch": 0.39591747935565036, "grad_norm": 0.3031870126724243, "learning_rate": 5e-06, "loss": 1.0066, "num_input_tokens_seen": 148721464, "step": 327, "train_runtime": 23801.2103, "train_tokens_per_second": 6248.483 }, { "epoch": 0.3971282361732517, "grad_norm": 0.2704046666622162, "learning_rate": 5e-06, "loss": 0.9968, "num_input_tokens_seen": 149168824, "step": 328, "train_runtime": 23872.2639, "train_tokens_per_second": 6248.625 }, { "epoch": 0.3983389929908531, "grad_norm": 0.2855125069618225, "learning_rate": 5e-06, "loss": 1.041, "num_input_tokens_seen": 149608712, "step": 329, "train_runtime": 23941.7145, "train_tokens_per_second": 6248.872 }, { "epoch": 0.3995497498084545, "grad_norm": 0.24565830826759338, "learning_rate": 5e-06, "loss": 0.9534, "num_input_tokens_seen": 150072952, "step": 330, "train_runtime": 24015.5471, "train_tokens_per_second": 6248.992 }, { "epoch": 0.4007605066260559, "grad_norm": 0.22240781784057617, "learning_rate": 5e-06, "loss": 1.0183, "num_input_tokens_seen": 150529952, "step": 331, "train_runtime": 24088.1498, "train_tokens_per_second": 6249.129 }, { "epoch": 0.40197126344365725, "grad_norm": 0.25719258189201355, "learning_rate": 5e-06, "loss": 0.994, "num_input_tokens_seen": 150974144, "step": 332, "train_runtime": 24159.1836, "train_tokens_per_second": 6249.141 }, { "epoch": 0.4031820202612586, "grad_norm": 0.23377108573913574, "learning_rate": 5e-06, "loss": 1.0007, "num_input_tokens_seen": 151437784, "step": 333, "train_runtime": 24232.9718, "train_tokens_per_second": 6249.245 }, { "epoch": 0.40439277707886, "grad_norm": 0.256849467754364, "learning_rate": 5e-06, "loss": 1.0027, "num_input_tokens_seen": 151890232, "step": 334, "train_runtime": 24304.7562, "train_tokens_per_second": 6249.404 }, { "epoch": 0.40560353389646137, "grad_norm": 0.23702043294906616, "learning_rate": 5e-06, "loss": 1.0201, "num_input_tokens_seen": 152334376, "step": 335, "train_runtime": 24375.494, "train_tokens_per_second": 6249.489 }, { "epoch": 0.4068142907140628, "grad_norm": 0.23365221917629242, "learning_rate": 5e-06, "loss": 1.0297, "num_input_tokens_seen": 152790664, "step": 336, "train_runtime": 24447.9699, "train_tokens_per_second": 6249.626 }, { "epoch": 0.40802504753166413, "grad_norm": 0.23382526636123657, "learning_rate": 5e-06, "loss": 1.0004, "num_input_tokens_seen": 153244056, "step": 337, "train_runtime": 24520.5163, "train_tokens_per_second": 6249.626 }, { "epoch": 0.4092358043492655, "grad_norm": 0.22636200487613678, "learning_rate": 5e-06, "loss": 1.0144, "num_input_tokens_seen": 153706824, "step": 338, "train_runtime": 24594.0224, "train_tokens_per_second": 6249.764 }, { "epoch": 0.4104465611668669, "grad_norm": 0.23086538910865784, "learning_rate": 5e-06, "loss": 0.9871, "num_input_tokens_seen": 154147192, "step": 339, "train_runtime": 24663.9908, "train_tokens_per_second": 6249.888 }, { "epoch": 0.41165731798446825, "grad_norm": 0.25210967659950256, "learning_rate": 5e-06, "loss": 1.02, "num_input_tokens_seen": 154601576, "step": 340, "train_runtime": 24736.4881, "train_tokens_per_second": 6249.94 }, { "epoch": 0.41286807480206966, "grad_norm": 0.24582870304584503, "learning_rate": 5e-06, "loss": 1.0406, "num_input_tokens_seen": 155050072, "step": 341, "train_runtime": 24807.9176, "train_tokens_per_second": 6250.024 }, { "epoch": 0.414078831619671, "grad_norm": 0.2524389326572418, "learning_rate": 5e-06, "loss": 1.0108, "num_input_tokens_seen": 155484872, "step": 342, "train_runtime": 24876.7608, "train_tokens_per_second": 6250.206 }, { "epoch": 0.4152895884372724, "grad_norm": 0.24597734212875366, "learning_rate": 5e-06, "loss": 1.0103, "num_input_tokens_seen": 155935768, "step": 343, "train_runtime": 24948.3799, "train_tokens_per_second": 6250.336 }, { "epoch": 0.4165003452548738, "grad_norm": 0.2275368720293045, "learning_rate": 5e-06, "loss": 1.0013, "num_input_tokens_seen": 156401120, "step": 344, "train_runtime": 25022.1853, "train_tokens_per_second": 6250.498 }, { "epoch": 0.41771110207247514, "grad_norm": 0.22949494421482086, "learning_rate": 5e-06, "loss": 1.0111, "num_input_tokens_seen": 156864272, "step": 345, "train_runtime": 25095.8034, "train_tokens_per_second": 6250.618 }, { "epoch": 0.41892185889007655, "grad_norm": 0.23165899515151978, "learning_rate": 5e-06, "loss": 1.0014, "num_input_tokens_seen": 157317000, "step": 346, "train_runtime": 25167.8027, "train_tokens_per_second": 6250.724 }, { "epoch": 0.4201326157076779, "grad_norm": 0.23215775191783905, "learning_rate": 5e-06, "loss": 0.9639, "num_input_tokens_seen": 157785416, "step": 347, "train_runtime": 25242.0286, "train_tokens_per_second": 6250.901 }, { "epoch": 0.42134337252527926, "grad_norm": 0.23086605966091156, "learning_rate": 5e-06, "loss": 1.034, "num_input_tokens_seen": 158244744, "step": 348, "train_runtime": 25315.4472, "train_tokens_per_second": 6250.916 }, { "epoch": 0.42255412934288067, "grad_norm": 0.2317984402179718, "learning_rate": 5e-06, "loss": 1.0313, "num_input_tokens_seen": 158699784, "step": 349, "train_runtime": 25387.642, "train_tokens_per_second": 6251.064 }, { "epoch": 0.423764886160482, "grad_norm": 0.2463163435459137, "learning_rate": 5e-06, "loss": 0.9835, "num_input_tokens_seen": 159153432, "step": 350, "train_runtime": 25459.746, "train_tokens_per_second": 6251.179 }, { "epoch": 0.42497564297808343, "grad_norm": 0.2302168309688568, "learning_rate": 5e-06, "loss": 0.9666, "num_input_tokens_seen": 159597184, "step": 351, "train_runtime": 25530.6729, "train_tokens_per_second": 6251.194 }, { "epoch": 0.4261863997956848, "grad_norm": 0.24311944842338562, "learning_rate": 5e-06, "loss": 0.9938, "num_input_tokens_seen": 160056872, "step": 352, "train_runtime": 25603.679, "train_tokens_per_second": 6251.323 }, { "epoch": 0.42739715661328614, "grad_norm": 0.24332423508167267, "learning_rate": 5e-06, "loss": 1.0026, "num_input_tokens_seen": 160498144, "step": 353, "train_runtime": 25673.5562, "train_tokens_per_second": 6251.496 }, { "epoch": 0.42860791343088755, "grad_norm": 0.2577798664569855, "learning_rate": 5e-06, "loss": 0.946, "num_input_tokens_seen": 160952520, "step": 354, "train_runtime": 25746.1174, "train_tokens_per_second": 6251.526 }, { "epoch": 0.4298186702484889, "grad_norm": 0.24245211482048035, "learning_rate": 5e-06, "loss": 1.0504, "num_input_tokens_seen": 161409016, "step": 355, "train_runtime": 25819.2295, "train_tokens_per_second": 6251.504 }, { "epoch": 0.4310294270660903, "grad_norm": 0.23425163328647614, "learning_rate": 5e-06, "loss": 1.0225, "num_input_tokens_seen": 161870752, "step": 356, "train_runtime": 25892.4963, "train_tokens_per_second": 6251.647 }, { "epoch": 0.4322401838836917, "grad_norm": 0.2525038421154022, "learning_rate": 5e-06, "loss": 1.0433, "num_input_tokens_seen": 162296568, "step": 357, "train_runtime": 25959.8505, "train_tokens_per_second": 6251.83 }, { "epoch": 0.43345094070129303, "grad_norm": 0.2417079657316208, "learning_rate": 5e-06, "loss": 1.0091, "num_input_tokens_seen": 162731720, "step": 358, "train_runtime": 26028.6681, "train_tokens_per_second": 6252.019 }, { "epoch": 0.43466169751889444, "grad_norm": 0.24416188895702362, "learning_rate": 5e-06, "loss": 0.9745, "num_input_tokens_seen": 163191568, "step": 359, "train_runtime": 26101.7213, "train_tokens_per_second": 6252.138 }, { "epoch": 0.4358724543364958, "grad_norm": 0.2705591022968292, "learning_rate": 5e-06, "loss": 1.005, "num_input_tokens_seen": 163643952, "step": 360, "train_runtime": 26173.8821, "train_tokens_per_second": 6252.185 }, { "epoch": 0.4370832111540972, "grad_norm": 0.23336398601531982, "learning_rate": 5e-06, "loss": 0.9792, "num_input_tokens_seen": 164093776, "step": 361, "train_runtime": 26245.424, "train_tokens_per_second": 6252.281 }, { "epoch": 0.43829396797169856, "grad_norm": 0.22414255142211914, "learning_rate": 5e-06, "loss": 0.9672, "num_input_tokens_seen": 164558224, "step": 362, "train_runtime": 26319.3494, "train_tokens_per_second": 6252.367 }, { "epoch": 0.4395047247892999, "grad_norm": 0.22132380306720734, "learning_rate": 5e-06, "loss": 0.9457, "num_input_tokens_seen": 165025024, "step": 363, "train_runtime": 26394.08, "train_tokens_per_second": 6252.35 }, { "epoch": 0.4407154816069013, "grad_norm": 0.2500600814819336, "learning_rate": 5e-06, "loss": 1.0243, "num_input_tokens_seen": 165495112, "step": 364, "train_runtime": 26469.3304, "train_tokens_per_second": 6252.335 }, { "epoch": 0.4419262384245027, "grad_norm": 0.24437642097473145, "learning_rate": 5e-06, "loss": 0.9971, "num_input_tokens_seen": 165959744, "step": 365, "train_runtime": 26543.6441, "train_tokens_per_second": 6252.335 }, { "epoch": 0.4431369952421041, "grad_norm": 0.2317400872707367, "learning_rate": 5e-06, "loss": 0.9962, "num_input_tokens_seen": 166425752, "step": 366, "train_runtime": 26617.8184, "train_tokens_per_second": 6252.419 }, { "epoch": 0.44434775205970545, "grad_norm": 0.22997960448265076, "learning_rate": 5e-06, "loss": 1.0282, "num_input_tokens_seen": 166881504, "step": 367, "train_runtime": 26690.5854, "train_tokens_per_second": 6252.448 }, { "epoch": 0.4455585088773068, "grad_norm": 0.2334347665309906, "learning_rate": 5e-06, "loss": 1.0253, "num_input_tokens_seen": 167353128, "step": 368, "train_runtime": 26766.4126, "train_tokens_per_second": 6252.356 }, { "epoch": 0.4467692656949082, "grad_norm": 0.23148049414157867, "learning_rate": 5e-06, "loss": 0.9588, "num_input_tokens_seen": 167805976, "step": 369, "train_runtime": 26837.3465, "train_tokens_per_second": 6252.704 }, { "epoch": 0.44798002251250957, "grad_norm": 0.2629753649234772, "learning_rate": 5e-06, "loss": 0.9514, "num_input_tokens_seen": 168248368, "step": 370, "train_runtime": 26906.0429, "train_tokens_per_second": 6253.181 }, { "epoch": 0.449190779330111, "grad_norm": 0.2621021568775177, "learning_rate": 5e-06, "loss": 0.991, "num_input_tokens_seen": 168693912, "step": 371, "train_runtime": 26975.4421, "train_tokens_per_second": 6253.611 }, { "epoch": 0.45040153614771233, "grad_norm": 0.2458389848470688, "learning_rate": 5e-06, "loss": 0.9837, "num_input_tokens_seen": 169157520, "step": 372, "train_runtime": 27047.241, "train_tokens_per_second": 6254.151 }, { "epoch": 0.4516122929653137, "grad_norm": 0.22616301476955414, "learning_rate": 5e-06, "loss": 0.9584, "num_input_tokens_seen": 169629008, "step": 373, "train_runtime": 27120.6769, "train_tokens_per_second": 6254.601 }, { "epoch": 0.4528230497829151, "grad_norm": 0.28033509850502014, "learning_rate": 5e-06, "loss": 0.9869, "num_input_tokens_seen": 170096328, "step": 374, "train_runtime": 27193.4201, "train_tokens_per_second": 6255.055 }, { "epoch": 0.45403380660051645, "grad_norm": 0.2582356035709381, "learning_rate": 5e-06, "loss": 0.9658, "num_input_tokens_seen": 170530368, "step": 375, "train_runtime": 27261.0916, "train_tokens_per_second": 6255.449 }, { "epoch": 0.45524456341811786, "grad_norm": 0.26356765627861023, "learning_rate": 5e-06, "loss": 0.954, "num_input_tokens_seen": 170977704, "step": 376, "train_runtime": 27330.4355, "train_tokens_per_second": 6255.945 }, { "epoch": 0.4564553202357192, "grad_norm": 0.2806834280490875, "learning_rate": 5e-06, "loss": 0.9188, "num_input_tokens_seen": 171404840, "step": 377, "train_runtime": 27396.163, "train_tokens_per_second": 6256.527 }, { "epoch": 0.4576660770533206, "grad_norm": 0.24835824966430664, "learning_rate": 5e-06, "loss": 1.0034, "num_input_tokens_seen": 171863528, "step": 378, "train_runtime": 27467.6453, "train_tokens_per_second": 6256.944 }, { "epoch": 0.458876833870922, "grad_norm": 0.24917422235012054, "learning_rate": 5e-06, "loss": 1.0091, "num_input_tokens_seen": 172306160, "step": 379, "train_runtime": 27537.144, "train_tokens_per_second": 6257.227 }, { "epoch": 0.46008759068852334, "grad_norm": 0.24879835546016693, "learning_rate": 5e-06, "loss": 1.0534, "num_input_tokens_seen": 172762776, "step": 380, "train_runtime": 27607.7602, "train_tokens_per_second": 6257.761 }, { "epoch": 0.46129834750612475, "grad_norm": 0.2425055056810379, "learning_rate": 5e-06, "loss": 0.9974, "num_input_tokens_seen": 173211600, "step": 381, "train_runtime": 27677.4709, "train_tokens_per_second": 6258.216 }, { "epoch": 0.4625091043237261, "grad_norm": 0.23279421031475067, "learning_rate": 5e-06, "loss": 1.0132, "num_input_tokens_seen": 173650080, "step": 382, "train_runtime": 27745.2426, "train_tokens_per_second": 6258.734 }, { "epoch": 0.46371986114132746, "grad_norm": 0.23731283843517303, "learning_rate": 5e-06, "loss": 1.0208, "num_input_tokens_seen": 174115976, "step": 383, "train_runtime": 27817.775, "train_tokens_per_second": 6259.163 }, { "epoch": 0.46493061795892887, "grad_norm": 0.2498994767665863, "learning_rate": 5e-06, "loss": 0.9958, "num_input_tokens_seen": 174583112, "step": 384, "train_runtime": 27890.6884, "train_tokens_per_second": 6259.548 }, { "epoch": 0.4661413747765302, "grad_norm": 0.21462289988994598, "learning_rate": 5e-06, "loss": 0.957, "num_input_tokens_seen": 175059472, "step": 385, "train_runtime": 27965.0646, "train_tokens_per_second": 6259.934 }, { "epoch": 0.46735213159413164, "grad_norm": 0.2454395592212677, "learning_rate": 5e-06, "loss": 0.9569, "num_input_tokens_seen": 175520768, "step": 386, "train_runtime": 28036.9974, "train_tokens_per_second": 6260.327 }, { "epoch": 0.468562888411733, "grad_norm": 0.2549636960029602, "learning_rate": 5e-06, "loss": 0.9632, "num_input_tokens_seen": 175947120, "step": 387, "train_runtime": 28103.0751, "train_tokens_per_second": 6260.778 }, { "epoch": 0.46977364522933435, "grad_norm": 0.22117368876934052, "learning_rate": 5e-06, "loss": 1.0324, "num_input_tokens_seen": 176416712, "step": 388, "train_runtime": 28176.0037, "train_tokens_per_second": 6261.24 }, { "epoch": 0.47098440204693576, "grad_norm": 0.24724611639976501, "learning_rate": 5e-06, "loss": 0.9896, "num_input_tokens_seen": 176866424, "step": 389, "train_runtime": 28246.1355, "train_tokens_per_second": 6261.615 }, { "epoch": 0.4721951588645371, "grad_norm": 0.23016729950904846, "learning_rate": 5e-06, "loss": 0.9615, "num_input_tokens_seen": 177343216, "step": 390, "train_runtime": 28320.9312, "train_tokens_per_second": 6261.913 }, { "epoch": 0.4734059156821385, "grad_norm": 0.2248724400997162, "learning_rate": 5e-06, "loss": 0.9356, "num_input_tokens_seen": 177768280, "step": 391, "train_runtime": 28386.5413, "train_tokens_per_second": 6262.414 }, { "epoch": 0.4746166724997399, "grad_norm": 0.26315781474113464, "learning_rate": 5e-06, "loss": 0.9978, "num_input_tokens_seen": 178212008, "step": 392, "train_runtime": 28456.242, "train_tokens_per_second": 6262.668 }, { "epoch": 0.47582742931734123, "grad_norm": 0.24355779588222504, "learning_rate": 5e-06, "loss": 1.0061, "num_input_tokens_seen": 178671232, "step": 393, "train_runtime": 28527.8799, "train_tokens_per_second": 6263.039 }, { "epoch": 0.47703818613494264, "grad_norm": 0.21970634162425995, "learning_rate": 5e-06, "loss": 0.9588, "num_input_tokens_seen": 179130472, "step": 394, "train_runtime": 28599.515, "train_tokens_per_second": 6263.409 }, { "epoch": 0.478248942952544, "grad_norm": 0.25734594464302063, "learning_rate": 5e-06, "loss": 1.1103, "num_input_tokens_seen": 179589296, "step": 395, "train_runtime": 28671.1821, "train_tokens_per_second": 6263.756 }, { "epoch": 0.4794596997701454, "grad_norm": 0.22498640418052673, "learning_rate": 5e-06, "loss": 0.9638, "num_input_tokens_seen": 180039760, "step": 396, "train_runtime": 28741.3132, "train_tokens_per_second": 6264.145 }, { "epoch": 0.48067045658774676, "grad_norm": 0.23484832048416138, "learning_rate": 5e-06, "loss": 0.9852, "num_input_tokens_seen": 180466416, "step": 397, "train_runtime": 28807.6188, "train_tokens_per_second": 6264.538 }, { "epoch": 0.4818812134053481, "grad_norm": 0.23096151649951935, "learning_rate": 5e-06, "loss": 0.9901, "num_input_tokens_seen": 180934584, "step": 398, "train_runtime": 28880.5387, "train_tokens_per_second": 6264.931 }, { "epoch": 0.48309197022294953, "grad_norm": 0.29461684823036194, "learning_rate": 5e-06, "loss": 0.9985, "num_input_tokens_seen": 181383840, "step": 399, "train_runtime": 28950.6138, "train_tokens_per_second": 6265.285 }, { "epoch": 0.4843027270405509, "grad_norm": 0.24854110181331635, "learning_rate": 5e-06, "loss": 1.022, "num_input_tokens_seen": 181833776, "step": 400, "train_runtime": 29020.8816, "train_tokens_per_second": 6265.619 }, { "epoch": 0.4855134838581523, "grad_norm": 0.22923749685287476, "learning_rate": 5e-06, "loss": 0.9995, "num_input_tokens_seen": 182289656, "step": 401, "train_runtime": 29091.6013, "train_tokens_per_second": 6266.058 }, { "epoch": 0.48672424067575365, "grad_norm": 0.23606517910957336, "learning_rate": 5e-06, "loss": 0.9335, "num_input_tokens_seen": 182741656, "step": 402, "train_runtime": 29162.397, "train_tokens_per_second": 6266.346 }, { "epoch": 0.487934997493355, "grad_norm": 0.2514527142047882, "learning_rate": 5e-06, "loss": 0.95, "num_input_tokens_seen": 183208448, "step": 403, "train_runtime": 29235.1663, "train_tokens_per_second": 6266.715 }, { "epoch": 0.4891457543109564, "grad_norm": 0.23453983664512634, "learning_rate": 5e-06, "loss": 0.9837, "num_input_tokens_seen": 183665416, "step": 404, "train_runtime": 29306.3034, "train_tokens_per_second": 6267.096 }, { "epoch": 0.49035651112855777, "grad_norm": 0.23354077339172363, "learning_rate": 5e-06, "loss": 1.0109, "num_input_tokens_seen": 184118688, "step": 405, "train_runtime": 29376.2962, "train_tokens_per_second": 6267.594 }, { "epoch": 0.4915672679461592, "grad_norm": 0.2359265685081482, "learning_rate": 5e-06, "loss": 0.9647, "num_input_tokens_seen": 184576128, "step": 406, "train_runtime": 29447.5204, "train_tokens_per_second": 6267.968 }, { "epoch": 0.49277802476376054, "grad_norm": 0.23804575204849243, "learning_rate": 5e-06, "loss": 1.0444, "num_input_tokens_seen": 185032912, "step": 407, "train_runtime": 29518.842, "train_tokens_per_second": 6268.298 }, { "epoch": 0.4939887815813619, "grad_norm": 0.26842811703681946, "learning_rate": 5e-06, "loss": 1.0753, "num_input_tokens_seen": 185474400, "step": 408, "train_runtime": 29588.0213, "train_tokens_per_second": 6268.564 }, { "epoch": 0.4951995383989633, "grad_norm": 0.2470535784959793, "learning_rate": 5e-06, "loss": 1.0522, "num_input_tokens_seen": 185926928, "step": 409, "train_runtime": 29658.4068, "train_tokens_per_second": 6268.945 }, { "epoch": 0.49641029521656466, "grad_norm": 0.2313876450061798, "learning_rate": 5e-06, "loss": 0.9896, "num_input_tokens_seen": 186395976, "step": 410, "train_runtime": 29731.4655, "train_tokens_per_second": 6269.317 }, { "epoch": 0.49762105203416607, "grad_norm": 0.2276448905467987, "learning_rate": 5e-06, "loss": 1.0273, "num_input_tokens_seen": 186855720, "step": 411, "train_runtime": 29803.2261, "train_tokens_per_second": 6269.647 }, { "epoch": 0.4988318088517674, "grad_norm": 0.24273553490638733, "learning_rate": 5e-06, "loss": 0.9887, "num_input_tokens_seen": 187303704, "step": 412, "train_runtime": 29872.3789, "train_tokens_per_second": 6270.13 }, { "epoch": 0.5000425656693688, "grad_norm": 0.22893160581588745, "learning_rate": 5e-06, "loss": 0.9927, "num_input_tokens_seen": 187767288, "step": 413, "train_runtime": 29944.5337, "train_tokens_per_second": 6270.503 }, { "epoch": 0.5012533224869702, "grad_norm": 0.24135759472846985, "learning_rate": 5e-06, "loss": 1.02, "num_input_tokens_seen": 188215576, "step": 414, "train_runtime": 30014.0007, "train_tokens_per_second": 6270.926 }, { "epoch": 0.5024640793045716, "grad_norm": 0.22361376881599426, "learning_rate": 5e-06, "loss": 0.9666, "num_input_tokens_seen": 188667128, "step": 415, "train_runtime": 30084.01, "train_tokens_per_second": 6271.342 }, { "epoch": 0.5036748361221729, "grad_norm": 0.27765095233917236, "learning_rate": 5e-06, "loss": 0.9637, "num_input_tokens_seen": 189113312, "step": 416, "train_runtime": 30153.5994, "train_tokens_per_second": 6271.666 }, { "epoch": 0.5048855929397743, "grad_norm": 0.2431006133556366, "learning_rate": 5e-06, "loss": 1.023, "num_input_tokens_seen": 189552168, "step": 417, "train_runtime": 30221.737, "train_tokens_per_second": 6272.047 }, { "epoch": 0.5060963497573757, "grad_norm": 0.23247578740119934, "learning_rate": 5e-06, "loss": 0.96, "num_input_tokens_seen": 190031496, "step": 418, "train_runtime": 30296.4767, "train_tokens_per_second": 6272.396 }, { "epoch": 0.507307106574977, "grad_norm": 0.2316485345363617, "learning_rate": 5e-06, "loss": 1.0319, "num_input_tokens_seen": 190468688, "step": 419, "train_runtime": 30366.2489, "train_tokens_per_second": 6272.381 }, { "epoch": 0.5085178633925784, "grad_norm": 0.24219174683094025, "learning_rate": 5e-06, "loss": 0.9843, "num_input_tokens_seen": 190902848, "step": 420, "train_runtime": 30433.8527, "train_tokens_per_second": 6272.714 }, { "epoch": 0.5097286202101798, "grad_norm": 0.22331832349300385, "learning_rate": 5e-06, "loss": 0.9295, "num_input_tokens_seen": 191365624, "step": 421, "train_runtime": 30506.1907, "train_tokens_per_second": 6273.009 }, { "epoch": 0.5109393770277811, "grad_norm": 0.24295338988304138, "learning_rate": 5e-06, "loss": 0.9981, "num_input_tokens_seen": 191812256, "step": 422, "train_runtime": 30575.7222, "train_tokens_per_second": 6273.352 }, { "epoch": 0.5121501338453825, "grad_norm": 0.23116403818130493, "learning_rate": 5e-06, "loss": 0.9845, "num_input_tokens_seen": 192275296, "step": 423, "train_runtime": 30647.9598, "train_tokens_per_second": 6273.674 }, { "epoch": 0.513360890662984, "grad_norm": 0.38395291566848755, "learning_rate": 5e-06, "loss": 0.9968, "num_input_tokens_seen": 192729088, "step": 424, "train_runtime": 30718.5265, "train_tokens_per_second": 6274.034 }, { "epoch": 0.5145716474805854, "grad_norm": 0.21122363209724426, "learning_rate": 5e-06, "loss": 0.9741, "num_input_tokens_seen": 193218400, "step": 425, "train_runtime": 30795.0296, "train_tokens_per_second": 6274.337 }, { "epoch": 0.5157824042981867, "grad_norm": 0.22073934972286224, "learning_rate": 5e-06, "loss": 0.9598, "num_input_tokens_seen": 193675104, "step": 426, "train_runtime": 30865.8936, "train_tokens_per_second": 6274.729 }, { "epoch": 0.5169931611157881, "grad_norm": 0.2508212924003601, "learning_rate": 5e-06, "loss": 0.9413, "num_input_tokens_seen": 194152520, "step": 427, "train_runtime": 30940.0715, "train_tokens_per_second": 6275.115 }, { "epoch": 0.5182039179333895, "grad_norm": 0.24162203073501587, "learning_rate": 5e-06, "loss": 0.9986, "num_input_tokens_seen": 194594776, "step": 428, "train_runtime": 31008.6981, "train_tokens_per_second": 6275.49 }, { "epoch": 0.5194146747509908, "grad_norm": 0.22889398038387299, "learning_rate": 5e-06, "loss": 1.006, "num_input_tokens_seen": 195045984, "step": 429, "train_runtime": 31078.8709, "train_tokens_per_second": 6275.839 }, { "epoch": 0.5206254315685922, "grad_norm": 0.2539101243019104, "learning_rate": 5e-06, "loss": 1.0005, "num_input_tokens_seen": 195479240, "step": 430, "train_runtime": 31146.6426, "train_tokens_per_second": 6276.093 }, { "epoch": 0.5218361883861936, "grad_norm": 0.21705974638462067, "learning_rate": 5e-06, "loss": 0.9376, "num_input_tokens_seen": 195959064, "step": 431, "train_runtime": 31221.7672, "train_tokens_per_second": 6276.36 }, { "epoch": 0.5230469452037949, "grad_norm": 0.22790437936782837, "learning_rate": 5e-06, "loss": 0.9948, "num_input_tokens_seen": 196401264, "step": 432, "train_runtime": 31290.4429, "train_tokens_per_second": 6276.717 }, { "epoch": 0.5242577020213963, "grad_norm": 0.26201656460762024, "learning_rate": 5e-06, "loss": 1.0204, "num_input_tokens_seen": 196831632, "step": 433, "train_runtime": 31357.4731, "train_tokens_per_second": 6277.025 }, { "epoch": 0.5254684588389977, "grad_norm": 0.23872381448745728, "learning_rate": 5e-06, "loss": 0.98, "num_input_tokens_seen": 197280792, "step": 434, "train_runtime": 31427.348, "train_tokens_per_second": 6277.36 }, { "epoch": 0.5266792156565991, "grad_norm": 0.23127026855945587, "learning_rate": 5e-06, "loss": 1.0302, "num_input_tokens_seen": 197738976, "step": 435, "train_runtime": 31498.5952, "train_tokens_per_second": 6277.708 }, { "epoch": 0.5278899724742004, "grad_norm": 0.23606155812740326, "learning_rate": 5e-06, "loss": 1.0139, "num_input_tokens_seen": 198192464, "step": 436, "train_runtime": 31569.309, "train_tokens_per_second": 6278.011 }, { "epoch": 0.5291007292918019, "grad_norm": 0.23491834104061127, "learning_rate": 5e-06, "loss": 0.9967, "num_input_tokens_seen": 198667936, "step": 437, "train_runtime": 31643.9079, "train_tokens_per_second": 6278.236 }, { "epoch": 0.5303114861094033, "grad_norm": 0.21920163929462433, "learning_rate": 5e-06, "loss": 0.9938, "num_input_tokens_seen": 199128912, "step": 438, "train_runtime": 31715.309, "train_tokens_per_second": 6278.637 }, { "epoch": 0.5315222429270046, "grad_norm": 0.24721209704875946, "learning_rate": 5e-06, "loss": 0.9461, "num_input_tokens_seen": 199581136, "step": 439, "train_runtime": 31785.518, "train_tokens_per_second": 6278.996 }, { "epoch": 0.532732999744606, "grad_norm": 0.2280053347349167, "learning_rate": 5e-06, "loss": 0.9607, "num_input_tokens_seen": 200043376, "step": 440, "train_runtime": 31857.7901, "train_tokens_per_second": 6279.261 }, { "epoch": 0.5339437565622074, "grad_norm": 0.23798179626464844, "learning_rate": 5e-06, "loss": 1.0175, "num_input_tokens_seen": 200477576, "step": 441, "train_runtime": 31924.9328, "train_tokens_per_second": 6279.655 }, { "epoch": 0.5351545133798087, "grad_norm": 0.24441802501678467, "learning_rate": 5e-06, "loss": 0.9864, "num_input_tokens_seen": 200902872, "step": 442, "train_runtime": 31991.1171, "train_tokens_per_second": 6279.958 }, { "epoch": 0.5363652701974101, "grad_norm": 0.22049540281295776, "learning_rate": 5e-06, "loss": 0.9682, "num_input_tokens_seen": 201374768, "step": 443, "train_runtime": 32064.8823, "train_tokens_per_second": 6280.228 }, { "epoch": 0.5375760270150115, "grad_norm": 0.26407957077026367, "learning_rate": 5e-06, "loss": 1.0439, "num_input_tokens_seen": 201833576, "step": 444, "train_runtime": 32135.9258, "train_tokens_per_second": 6280.621 }, { "epoch": 0.5387867838326129, "grad_norm": 0.23320509493350983, "learning_rate": 5e-06, "loss": 0.9675, "num_input_tokens_seen": 202288056, "step": 445, "train_runtime": 32207.1376, "train_tokens_per_second": 6280.846 }, { "epoch": 0.5399975406502142, "grad_norm": 0.2530595362186432, "learning_rate": 5e-06, "loss": 0.9806, "num_input_tokens_seen": 202734456, "step": 446, "train_runtime": 32276.5609, "train_tokens_per_second": 6281.167 }, { "epoch": 0.5412082974678156, "grad_norm": 0.24577440321445465, "learning_rate": 5e-06, "loss": 1.032, "num_input_tokens_seen": 203175032, "step": 447, "train_runtime": 32345.0646, "train_tokens_per_second": 6281.485 }, { "epoch": 0.542419054285417, "grad_norm": 0.24135351181030273, "learning_rate": 5e-06, "loss": 0.9941, "num_input_tokens_seen": 203613664, "step": 448, "train_runtime": 32413.2576, "train_tokens_per_second": 6281.802 }, { "epoch": 0.5436298111030183, "grad_norm": 0.2334894835948944, "learning_rate": 5e-06, "loss": 0.9561, "num_input_tokens_seen": 204069960, "step": 449, "train_runtime": 32484.3465, "train_tokens_per_second": 6282.101 }, { "epoch": 0.5448405679206197, "grad_norm": 0.23215444386005402, "learning_rate": 5e-06, "loss": 0.9621, "num_input_tokens_seen": 204514576, "step": 450, "train_runtime": 32553.403, "train_tokens_per_second": 6282.433 }, { "epoch": 0.5460513247382212, "grad_norm": 0.22942085564136505, "learning_rate": 5e-06, "loss": 1.0227, "num_input_tokens_seen": 204978768, "step": 451, "train_runtime": 32626.2536, "train_tokens_per_second": 6282.633 }, { "epoch": 0.5472620815558226, "grad_norm": 0.24713215231895447, "learning_rate": 5e-06, "loss": 0.9849, "num_input_tokens_seen": 205433104, "step": 452, "train_runtime": 32697.6427, "train_tokens_per_second": 6282.811 }, { "epoch": 0.5484728383734239, "grad_norm": 0.23457272350788116, "learning_rate": 5e-06, "loss": 0.9856, "num_input_tokens_seen": 205899472, "step": 453, "train_runtime": 32770.9009, "train_tokens_per_second": 6282.997 }, { "epoch": 0.5496835951910253, "grad_norm": 0.25106683373451233, "learning_rate": 5e-06, "loss": 1.0003, "num_input_tokens_seen": 206350824, "step": 454, "train_runtime": 32840.7984, "train_tokens_per_second": 6283.368 }, { "epoch": 0.5508943520086267, "grad_norm": 0.27677810192108154, "learning_rate": 5e-06, "loss": 0.9914, "num_input_tokens_seen": 206805936, "step": 455, "train_runtime": 32911.5142, "train_tokens_per_second": 6283.696 }, { "epoch": 0.552105108826228, "grad_norm": 0.23585183918476105, "learning_rate": 5e-06, "loss": 0.976, "num_input_tokens_seen": 207258416, "step": 456, "train_runtime": 32981.8418, "train_tokens_per_second": 6284.016 }, { "epoch": 0.5533158656438294, "grad_norm": 0.2358681708574295, "learning_rate": 5e-06, "loss": 1.028, "num_input_tokens_seen": 207695392, "step": 457, "train_runtime": 33049.5443, "train_tokens_per_second": 6284.365 }, { "epoch": 0.5545266224614308, "grad_norm": 0.24082793295383453, "learning_rate": 5e-06, "loss": 1.0006, "num_input_tokens_seen": 208131184, "step": 458, "train_runtime": 33117.4035, "train_tokens_per_second": 6284.647 }, { "epoch": 0.5557373792790321, "grad_norm": 0.22506728768348694, "learning_rate": 5e-06, "loss": 0.9307, "num_input_tokens_seen": 208586944, "step": 459, "train_runtime": 33188.5929, "train_tokens_per_second": 6284.899 }, { "epoch": 0.5569481360966335, "grad_norm": 0.22801756858825684, "learning_rate": 5e-06, "loss": 0.9355, "num_input_tokens_seen": 209044824, "step": 460, "train_runtime": 33260.759, "train_tokens_per_second": 6285.029 }, { "epoch": 0.5581588929142349, "grad_norm": 0.2215615212917328, "learning_rate": 5e-06, "loss": 0.963, "num_input_tokens_seen": 209511008, "step": 461, "train_runtime": 33333.5062, "train_tokens_per_second": 6285.298 }, { "epoch": 0.5593696497318363, "grad_norm": 0.24020282924175262, "learning_rate": 5e-06, "loss": 0.9947, "num_input_tokens_seen": 209962056, "step": 462, "train_runtime": 33403.9215, "train_tokens_per_second": 6285.551 }, { "epoch": 0.5605804065494376, "grad_norm": 0.23402798175811768, "learning_rate": 5e-06, "loss": 0.9612, "num_input_tokens_seen": 210405272, "step": 463, "train_runtime": 33472.9498, "train_tokens_per_second": 6285.83 }, { "epoch": 0.561791163367039, "grad_norm": 0.2381797432899475, "learning_rate": 5e-06, "loss": 1.0089, "num_input_tokens_seen": 210845616, "step": 464, "train_runtime": 33541.6057, "train_tokens_per_second": 6286.092 }, { "epoch": 0.5630019201846405, "grad_norm": 0.2647024989128113, "learning_rate": 5e-06, "loss": 1.0405, "num_input_tokens_seen": 211311336, "step": 465, "train_runtime": 33613.9005, "train_tokens_per_second": 6286.427 }, { "epoch": 0.5642126770022418, "grad_norm": 0.2484758347272873, "learning_rate": 5e-06, "loss": 1.0995, "num_input_tokens_seen": 211755424, "step": 466, "train_runtime": 33682.7067, "train_tokens_per_second": 6286.77 }, { "epoch": 0.5654234338198432, "grad_norm": 0.2419258952140808, "learning_rate": 5e-06, "loss": 0.9984, "num_input_tokens_seen": 212211880, "step": 467, "train_runtime": 33753.5833, "train_tokens_per_second": 6287.092 }, { "epoch": 0.5666341906374446, "grad_norm": 0.24377140402793884, "learning_rate": 5e-06, "loss": 1.0266, "num_input_tokens_seen": 212661224, "step": 468, "train_runtime": 33823.6337, "train_tokens_per_second": 6287.356 }, { "epoch": 0.5678449474550459, "grad_norm": 0.24141238629817963, "learning_rate": 5e-06, "loss": 0.9976, "num_input_tokens_seen": 213120728, "step": 469, "train_runtime": 33895.1784, "train_tokens_per_second": 6287.641 }, { "epoch": 0.5690557042726473, "grad_norm": 0.2776244580745697, "learning_rate": 5e-06, "loss": 1.0136, "num_input_tokens_seen": 213581176, "step": 470, "train_runtime": 33968.5316, "train_tokens_per_second": 6287.619 }, { "epoch": 0.5702664610902487, "grad_norm": 0.2289768010377884, "learning_rate": 5e-06, "loss": 0.9458, "num_input_tokens_seen": 214039216, "step": 471, "train_runtime": 34039.8568, "train_tokens_per_second": 6287.9 }, { "epoch": 0.5714772179078501, "grad_norm": 0.24029488861560822, "learning_rate": 5e-06, "loss": 0.9919, "num_input_tokens_seen": 214486744, "step": 472, "train_runtime": 34109.5176, "train_tokens_per_second": 6288.179 }, { "epoch": 0.5726879747254514, "grad_norm": 0.24775657057762146, "learning_rate": 5e-06, "loss": 1.0085, "num_input_tokens_seen": 214929224, "step": 473, "train_runtime": 34177.8306, "train_tokens_per_second": 6288.557 }, { "epoch": 0.5738987315430528, "grad_norm": 0.257894903421402, "learning_rate": 5e-06, "loss": 1.0131, "num_input_tokens_seen": 215364952, "step": 474, "train_runtime": 34245.3987, "train_tokens_per_second": 6288.873 }, { "epoch": 0.5751094883606542, "grad_norm": 0.22365638613700867, "learning_rate": 5e-06, "loss": 0.9081, "num_input_tokens_seen": 215810336, "step": 475, "train_runtime": 34314.325, "train_tokens_per_second": 6289.22 }, { "epoch": 0.5763202451782555, "grad_norm": 0.222572922706604, "learning_rate": 5e-06, "loss": 0.997, "num_input_tokens_seen": 216285368, "step": 476, "train_runtime": 34388.3445, "train_tokens_per_second": 6289.496 }, { "epoch": 0.5775310019958569, "grad_norm": 0.24267543852329254, "learning_rate": 5e-06, "loss": 1.0052, "num_input_tokens_seen": 216712736, "step": 477, "train_runtime": 34454.3641, "train_tokens_per_second": 6289.849 }, { "epoch": 0.5787417588134584, "grad_norm": 0.2833351790904999, "learning_rate": 5e-06, "loss": 0.9996, "num_input_tokens_seen": 217152592, "step": 478, "train_runtime": 34522.6931, "train_tokens_per_second": 6290.141 }, { "epoch": 0.5799525156310597, "grad_norm": 0.22266528010368347, "learning_rate": 5e-06, "loss": 0.9858, "num_input_tokens_seen": 217617320, "step": 479, "train_runtime": 34595.0498, "train_tokens_per_second": 6290.418 }, { "epoch": 0.5811632724486611, "grad_norm": 0.23907960951328278, "learning_rate": 5e-06, "loss": 0.9656, "num_input_tokens_seen": 218085936, "step": 480, "train_runtime": 34667.9296, "train_tokens_per_second": 6290.711 }, { "epoch": 0.5823740292662625, "grad_norm": 0.2604992985725403, "learning_rate": 5e-06, "loss": 0.9883, "num_input_tokens_seen": 218529216, "step": 481, "train_runtime": 34736.6853, "train_tokens_per_second": 6291.021 }, { "epoch": 0.5835847860838639, "grad_norm": 0.24895359575748444, "learning_rate": 5e-06, "loss": 0.9568, "num_input_tokens_seen": 218971904, "step": 482, "train_runtime": 34805.5529, "train_tokens_per_second": 6291.292 }, { "epoch": 0.5847955429014652, "grad_norm": 0.24118000268936157, "learning_rate": 5e-06, "loss": 0.9923, "num_input_tokens_seen": 219405992, "step": 483, "train_runtime": 34873.0797, "train_tokens_per_second": 6291.558 }, { "epoch": 0.5860062997190666, "grad_norm": 0.245997354388237, "learning_rate": 5e-06, "loss": 0.9438, "num_input_tokens_seen": 219865712, "step": 484, "train_runtime": 34944.6761, "train_tokens_per_second": 6291.823 }, { "epoch": 0.587217056536668, "grad_norm": 0.2530381679534912, "learning_rate": 5e-06, "loss": 0.9391, "num_input_tokens_seen": 220315800, "step": 485, "train_runtime": 35014.4814, "train_tokens_per_second": 6292.134 }, { "epoch": 0.5884278133542693, "grad_norm": 0.2256454974412918, "learning_rate": 5e-06, "loss": 0.9632, "num_input_tokens_seen": 220788832, "step": 486, "train_runtime": 35088.2705, "train_tokens_per_second": 6292.383 }, { "epoch": 0.5896385701718707, "grad_norm": 0.23818935453891754, "learning_rate": 5e-06, "loss": 0.9615, "num_input_tokens_seen": 221257600, "step": 487, "train_runtime": 35161.1927, "train_tokens_per_second": 6292.665 }, { "epoch": 0.5908493269894721, "grad_norm": 0.22735600173473358, "learning_rate": 5e-06, "loss": 0.983, "num_input_tokens_seen": 221697208, "step": 488, "train_runtime": 35229.7059, "train_tokens_per_second": 6292.905 }, { "epoch": 0.5920600838070734, "grad_norm": 0.22348052263259888, "learning_rate": 5e-06, "loss": 1.0074, "num_input_tokens_seen": 222148296, "step": 489, "train_runtime": 35300.089, "train_tokens_per_second": 6293.137 }, { "epoch": 0.5932708406246748, "grad_norm": 0.26825666427612305, "learning_rate": 5e-06, "loss": 0.9512, "num_input_tokens_seen": 222612856, "step": 490, "train_runtime": 35372.8665, "train_tokens_per_second": 6293.322 }, { "epoch": 0.5944815974422762, "grad_norm": 0.23904314637184143, "learning_rate": 5e-06, "loss": 0.9923, "num_input_tokens_seen": 223081152, "step": 491, "train_runtime": 35446.0463, "train_tokens_per_second": 6293.541 }, { "epoch": 0.5956923542598777, "grad_norm": 0.2582261860370636, "learning_rate": 5e-06, "loss": 1.0427, "num_input_tokens_seen": 223537072, "step": 492, "train_runtime": 35517.0699, "train_tokens_per_second": 6293.793 }, { "epoch": 0.596903111077479, "grad_norm": 0.22952939569950104, "learning_rate": 5e-06, "loss": 0.9664, "num_input_tokens_seen": 223980672, "step": 493, "train_runtime": 35585.9954, "train_tokens_per_second": 6294.068 }, { "epoch": 0.5981138678950804, "grad_norm": 0.26730042695999146, "learning_rate": 5e-06, "loss": 1.0279, "num_input_tokens_seen": 224426280, "step": 494, "train_runtime": 35655.3839, "train_tokens_per_second": 6294.317 }, { "epoch": 0.5993246247126818, "grad_norm": 0.25793856382369995, "learning_rate": 5e-06, "loss": 1.0055, "num_input_tokens_seen": 224884152, "step": 495, "train_runtime": 35726.9034, "train_tokens_per_second": 6294.532 }, { "epoch": 0.6005353815302831, "grad_norm": 0.22298921644687653, "learning_rate": 5e-06, "loss": 0.9601, "num_input_tokens_seen": 225350592, "step": 496, "train_runtime": 35799.441, "train_tokens_per_second": 6294.808 }, { "epoch": 0.6017461383478845, "grad_norm": 0.26628899574279785, "learning_rate": 5e-06, "loss": 0.9805, "num_input_tokens_seen": 225807448, "step": 497, "train_runtime": 35870.8365, "train_tokens_per_second": 6295.015 }, { "epoch": 0.6029568951654859, "grad_norm": 0.22120925784111023, "learning_rate": 5e-06, "loss": 0.9208, "num_input_tokens_seen": 226266528, "step": 498, "train_runtime": 35942.5822, "train_tokens_per_second": 6295.222 }, { "epoch": 0.6041676519830872, "grad_norm": 0.24458245933055878, "learning_rate": 5e-06, "loss": 0.9881, "num_input_tokens_seen": 226707672, "step": 499, "train_runtime": 36011.3021, "train_tokens_per_second": 6295.459 }, { "epoch": 0.6053784088006886, "grad_norm": 0.24703119695186615, "learning_rate": 5e-06, "loss": 1.0137, "num_input_tokens_seen": 227145656, "step": 500, "train_runtime": 36079.2522, "train_tokens_per_second": 6295.742 }, { "epoch": 0.60658916561829, "grad_norm": 0.24142247438430786, "learning_rate": 5e-06, "loss": 0.9205, "num_input_tokens_seen": 227611200, "step": 501, "train_runtime": 36151.1703, "train_tokens_per_second": 6296.095 }, { "epoch": 0.6077999224358914, "grad_norm": 0.2489280104637146, "learning_rate": 5e-06, "loss": 1.0177, "num_input_tokens_seen": 228059544, "step": 502, "train_runtime": 36220.9972, "train_tokens_per_second": 6296.335 }, { "epoch": 0.6090106792534927, "grad_norm": 0.23111343383789062, "learning_rate": 5e-06, "loss": 0.962, "num_input_tokens_seen": 228526064, "step": 503, "train_runtime": 36293.9384, "train_tokens_per_second": 6296.535 }, { "epoch": 0.6102214360710941, "grad_norm": 0.24690377712249756, "learning_rate": 5e-06, "loss": 0.9986, "num_input_tokens_seen": 228981232, "step": 504, "train_runtime": 36364.2956, "train_tokens_per_second": 6296.87 }, { "epoch": 0.6114321928886955, "grad_norm": 0.2393392026424408, "learning_rate": 5e-06, "loss": 0.9866, "num_input_tokens_seen": 229439688, "step": 505, "train_runtime": 36435.751, "train_tokens_per_second": 6297.103 }, { "epoch": 0.6126429497062968, "grad_norm": 0.24542857706546783, "learning_rate": 5e-06, "loss": 0.9987, "num_input_tokens_seen": 229910688, "step": 506, "train_runtime": 36509.419, "train_tokens_per_second": 6297.298 }, { "epoch": 0.6138537065238983, "grad_norm": 0.24054135382175446, "learning_rate": 5e-06, "loss": 0.9829, "num_input_tokens_seen": 230361040, "step": 507, "train_runtime": 36579.4262, "train_tokens_per_second": 6297.558 }, { "epoch": 0.6150644633414997, "grad_norm": 0.24931353330612183, "learning_rate": 5e-06, "loss": 1.0582, "num_input_tokens_seen": 230795008, "step": 508, "train_runtime": 36646.8551, "train_tokens_per_second": 6297.812 }, { "epoch": 0.616275220159101, "grad_norm": 0.28090900182724, "learning_rate": 5e-06, "loss": 1.0016, "num_input_tokens_seen": 231239392, "step": 509, "train_runtime": 36716.4303, "train_tokens_per_second": 6297.981 }, { "epoch": 0.6174859769767024, "grad_norm": 0.2591536045074463, "learning_rate": 5e-06, "loss": 0.9496, "num_input_tokens_seen": 231693192, "step": 510, "train_runtime": 36786.8129, "train_tokens_per_second": 6298.268 }, { "epoch": 0.6186967337943038, "grad_norm": 0.24983936548233032, "learning_rate": 5e-06, "loss": 0.9667, "num_input_tokens_seen": 232139528, "step": 511, "train_runtime": 36856.2058, "train_tokens_per_second": 6298.519 }, { "epoch": 0.6199074906119052, "grad_norm": 0.23879870772361755, "learning_rate": 5e-06, "loss": 1.0292, "num_input_tokens_seen": 232572720, "step": 512, "train_runtime": 36923.1252, "train_tokens_per_second": 6298.836 }, { "epoch": 0.6211182474295065, "grad_norm": 0.24429570138454437, "learning_rate": 5e-06, "loss": 1.0124, "num_input_tokens_seen": 233019736, "step": 513, "train_runtime": 36992.5361, "train_tokens_per_second": 6299.101 }, { "epoch": 0.6223290042471079, "grad_norm": 0.24088793992996216, "learning_rate": 5e-06, "loss": 0.9629, "num_input_tokens_seen": 233483920, "step": 514, "train_runtime": 37064.9551, "train_tokens_per_second": 6299.317 }, { "epoch": 0.6235397610647093, "grad_norm": 0.2581544816493988, "learning_rate": 5e-06, "loss": 0.9591, "num_input_tokens_seen": 233939496, "step": 515, "train_runtime": 37136.0969, "train_tokens_per_second": 6299.518 }, { "epoch": 0.6247505178823106, "grad_norm": 0.2298753410577774, "learning_rate": 5e-06, "loss": 0.9898, "num_input_tokens_seen": 234398968, "step": 516, "train_runtime": 37207.7335, "train_tokens_per_second": 6299.738 }, { "epoch": 0.625961274699912, "grad_norm": 0.2409614771604538, "learning_rate": 5e-06, "loss": 0.9524, "num_input_tokens_seen": 234840984, "step": 517, "train_runtime": 37276.4815, "train_tokens_per_second": 6299.977 }, { "epoch": 0.6271720315175134, "grad_norm": 0.24182307720184326, "learning_rate": 5e-06, "loss": 0.9592, "num_input_tokens_seen": 235265976, "step": 518, "train_runtime": 37342.2003, "train_tokens_per_second": 6300.271 }, { "epoch": 0.6283827883351147, "grad_norm": 0.25573626160621643, "learning_rate": 5e-06, "loss": 0.9746, "num_input_tokens_seen": 235751176, "step": 519, "train_runtime": 37417.9534, "train_tokens_per_second": 6300.483 }, { "epoch": 0.6295935451527161, "grad_norm": 0.27016371488571167, "learning_rate": 5e-06, "loss": 0.9674, "num_input_tokens_seen": 236213584, "step": 520, "train_runtime": 37490.1733, "train_tokens_per_second": 6300.68 }, { "epoch": 0.6308043019703176, "grad_norm": 0.24281057715415955, "learning_rate": 5e-06, "loss": 1.0085, "num_input_tokens_seen": 236663272, "step": 521, "train_runtime": 37561.7456, "train_tokens_per_second": 6300.646 }, { "epoch": 0.632015058787919, "grad_norm": 0.2382790446281433, "learning_rate": 5e-06, "loss": 1.0166, "num_input_tokens_seen": 237104624, "step": 522, "train_runtime": 37630.4462, "train_tokens_per_second": 6300.872 }, { "epoch": 0.6332258156055203, "grad_norm": 0.24074813723564148, "learning_rate": 5e-06, "loss": 1.0121, "num_input_tokens_seen": 237572832, "step": 523, "train_runtime": 37703.3574, "train_tokens_per_second": 6301.105 }, { "epoch": 0.6344365724231217, "grad_norm": 0.26703017950057983, "learning_rate": 5e-06, "loss": 0.99, "num_input_tokens_seen": 238034928, "step": 524, "train_runtime": 37775.4137, "train_tokens_per_second": 6301.319 }, { "epoch": 0.6356473292407231, "grad_norm": 0.31544211506843567, "learning_rate": 5e-06, "loss": 0.9136, "num_input_tokens_seen": 238489256, "step": 525, "train_runtime": 37846.4867, "train_tokens_per_second": 6301.49 }, { "epoch": 0.6368580860583244, "grad_norm": 0.2323281615972519, "learning_rate": 5e-06, "loss": 0.9803, "num_input_tokens_seen": 238937608, "step": 526, "train_runtime": 37916.1928, "train_tokens_per_second": 6301.73 }, { "epoch": 0.6380688428759258, "grad_norm": 0.22566953301429749, "learning_rate": 5e-06, "loss": 0.9496, "num_input_tokens_seen": 239408904, "step": 527, "train_runtime": 37989.8164, "train_tokens_per_second": 6301.923 }, { "epoch": 0.6392795996935272, "grad_norm": 0.24885083734989166, "learning_rate": 5e-06, "loss": 0.9671, "num_input_tokens_seen": 239856616, "step": 528, "train_runtime": 38059.5861, "train_tokens_per_second": 6302.134 }, { "epoch": 0.6404903565111285, "grad_norm": 0.2867506742477417, "learning_rate": 5e-06, "loss": 1.02, "num_input_tokens_seen": 240342880, "step": 529, "train_runtime": 38136.0338, "train_tokens_per_second": 6302.252 }, { "epoch": 0.6417011133287299, "grad_norm": 0.23189502954483032, "learning_rate": 5e-06, "loss": 1.0122, "num_input_tokens_seen": 240799360, "step": 530, "train_runtime": 38207.0698, "train_tokens_per_second": 6302.482 }, { "epoch": 0.6429118701463313, "grad_norm": 0.2151128053665161, "learning_rate": 5e-06, "loss": 0.9502, "num_input_tokens_seen": 241285344, "step": 531, "train_runtime": 38282.6957, "train_tokens_per_second": 6302.726 }, { "epoch": 0.6441226269639327, "grad_norm": 0.23497872054576874, "learning_rate": 5e-06, "loss": 1.0372, "num_input_tokens_seen": 241748624, "step": 532, "train_runtime": 38355.235, "train_tokens_per_second": 6302.885 }, { "epoch": 0.645333383781534, "grad_norm": 0.22813764214515686, "learning_rate": 5e-06, "loss": 0.936, "num_input_tokens_seen": 242200600, "step": 533, "train_runtime": 38425.9554, "train_tokens_per_second": 6303.047 }, { "epoch": 0.6465441405991355, "grad_norm": 0.2369297444820404, "learning_rate": 5e-06, "loss": 0.9499, "num_input_tokens_seen": 242661128, "step": 534, "train_runtime": 38497.3268, "train_tokens_per_second": 6303.324 }, { "epoch": 0.6477548974167369, "grad_norm": 0.2485128939151764, "learning_rate": 5e-06, "loss": 0.984, "num_input_tokens_seen": 243128592, "step": 535, "train_runtime": 38570.5127, "train_tokens_per_second": 6303.484 }, { "epoch": 0.6489656542343382, "grad_norm": 0.23329830169677734, "learning_rate": 5e-06, "loss": 0.9638, "num_input_tokens_seen": 243580072, "step": 536, "train_runtime": 38640.2643, "train_tokens_per_second": 6303.789 }, { "epoch": 0.6501764110519396, "grad_norm": 0.2227838784456253, "learning_rate": 5e-06, "loss": 0.9573, "num_input_tokens_seen": 244042008, "step": 537, "train_runtime": 38712.1042, "train_tokens_per_second": 6304.023 }, { "epoch": 0.651387167869541, "grad_norm": 0.22910352051258087, "learning_rate": 5e-06, "loss": 0.9834, "num_input_tokens_seen": 244490152, "step": 538, "train_runtime": 38782.3373, "train_tokens_per_second": 6304.162 }, { "epoch": 0.6525979246871423, "grad_norm": 0.24009035527706146, "learning_rate": 5e-06, "loss": 0.9842, "num_input_tokens_seen": 244958928, "step": 539, "train_runtime": 38855.1605, "train_tokens_per_second": 6304.412 }, { "epoch": 0.6538086815047437, "grad_norm": 0.232088103890419, "learning_rate": 5e-06, "loss": 0.9951, "num_input_tokens_seen": 245409888, "step": 540, "train_runtime": 38925.8637, "train_tokens_per_second": 6304.546 }, { "epoch": 0.6550194383223451, "grad_norm": 0.27717524766921997, "learning_rate": 5e-06, "loss": 1.0204, "num_input_tokens_seen": 245860728, "step": 541, "train_runtime": 38995.7398, "train_tokens_per_second": 6304.81 }, { "epoch": 0.6562301951399465, "grad_norm": 0.22988007962703705, "learning_rate": 5e-06, "loss": 0.9889, "num_input_tokens_seen": 246299864, "step": 542, "train_runtime": 39064.3271, "train_tokens_per_second": 6304.982 }, { "epoch": 0.6574409519575478, "grad_norm": 0.21664994955062866, "learning_rate": 5e-06, "loss": 0.9719, "num_input_tokens_seen": 246777792, "step": 543, "train_runtime": 39139.2212, "train_tokens_per_second": 6305.128 }, { "epoch": 0.6586517087751492, "grad_norm": 0.23201525211334229, "learning_rate": 5e-06, "loss": 0.9516, "num_input_tokens_seen": 247213536, "step": 544, "train_runtime": 39206.717, "train_tokens_per_second": 6305.387 }, { "epoch": 0.6598624655927506, "grad_norm": 0.2412644624710083, "learning_rate": 5e-06, "loss": 1.0171, "num_input_tokens_seen": 247655632, "step": 545, "train_runtime": 39275.488, "train_tokens_per_second": 6305.603 }, { "epoch": 0.6610732224103519, "grad_norm": 0.2807646691799164, "learning_rate": 5e-06, "loss": 0.9558, "num_input_tokens_seen": 248112512, "step": 546, "train_runtime": 39346.817, "train_tokens_per_second": 6305.784 }, { "epoch": 0.6622839792279533, "grad_norm": 0.2552436888217926, "learning_rate": 5e-06, "loss": 0.9419, "num_input_tokens_seen": 248543176, "step": 547, "train_runtime": 39413.1641, "train_tokens_per_second": 6306.095 }, { "epoch": 0.6634947360455548, "grad_norm": 0.2214186191558838, "learning_rate": 5e-06, "loss": 0.9624, "num_input_tokens_seen": 248985160, "step": 548, "train_runtime": 39482.1921, "train_tokens_per_second": 6306.265 }, { "epoch": 0.6647054928631561, "grad_norm": 0.24030745029449463, "learning_rate": 5e-06, "loss": 0.9839, "num_input_tokens_seen": 249429704, "step": 549, "train_runtime": 39551.3491, "train_tokens_per_second": 6306.478 }, { "epoch": 0.6659162496807575, "grad_norm": 0.23489521443843842, "learning_rate": 5e-06, "loss": 0.9455, "num_input_tokens_seen": 249889432, "step": 550, "train_runtime": 39622.7881, "train_tokens_per_second": 6306.71 }, { "epoch": 0.6671270064983589, "grad_norm": 0.24063046276569366, "learning_rate": 5e-06, "loss": 1.0, "num_input_tokens_seen": 250325736, "step": 551, "train_runtime": 39690.0826, "train_tokens_per_second": 6307.01 }, { "epoch": 0.6683377633159603, "grad_norm": 0.22540496289730072, "learning_rate": 5e-06, "loss": 0.9921, "num_input_tokens_seen": 250785944, "step": 552, "train_runtime": 39761.8576, "train_tokens_per_second": 6307.199 }, { "epoch": 0.6695485201335616, "grad_norm": 0.2306659072637558, "learning_rate": 5e-06, "loss": 0.9541, "num_input_tokens_seen": 251249584, "step": 553, "train_runtime": 39834.0319, "train_tokens_per_second": 6307.41 }, { "epoch": 0.670759276951163, "grad_norm": 0.24347856640815735, "learning_rate": 5e-06, "loss": 0.9769, "num_input_tokens_seen": 251697752, "step": 554, "train_runtime": 39904.0654, "train_tokens_per_second": 6307.572 }, { "epoch": 0.6719700337687644, "grad_norm": 0.2558618187904358, "learning_rate": 5e-06, "loss": 0.9452, "num_input_tokens_seen": 252157024, "step": 555, "train_runtime": 39975.4875, "train_tokens_per_second": 6307.791 }, { "epoch": 0.6731807905863657, "grad_norm": 0.2455194890499115, "learning_rate": 5e-06, "loss": 1.0343, "num_input_tokens_seen": 252624768, "step": 556, "train_runtime": 40048.3193, "train_tokens_per_second": 6307.999 }, { "epoch": 0.6743915474039671, "grad_norm": 0.2299470454454422, "learning_rate": 5e-06, "loss": 0.982, "num_input_tokens_seen": 253113208, "step": 557, "train_runtime": 40124.0351, "train_tokens_per_second": 6308.269 }, { "epoch": 0.6756023042215685, "grad_norm": 0.2273668348789215, "learning_rate": 5e-06, "loss": 0.9321, "num_input_tokens_seen": 253548664, "step": 558, "train_runtime": 40192.1439, "train_tokens_per_second": 6308.414 }, { "epoch": 0.6768130610391698, "grad_norm": 0.2353869080543518, "learning_rate": 5e-06, "loss": 0.9414, "num_input_tokens_seen": 254002376, "step": 559, "train_runtime": 40262.9517, "train_tokens_per_second": 6308.588 }, { "epoch": 0.6780238178567712, "grad_norm": 0.22576971352100372, "learning_rate": 5e-06, "loss": 0.9887, "num_input_tokens_seen": 254443952, "step": 560, "train_runtime": 40331.6289, "train_tokens_per_second": 6308.794 }, { "epoch": 0.6792345746743726, "grad_norm": 0.22624272108078003, "learning_rate": 5e-06, "loss": 0.9121, "num_input_tokens_seen": 254911344, "step": 561, "train_runtime": 40404.3092, "train_tokens_per_second": 6309.014 }, { "epoch": 0.6804453314919741, "grad_norm": 0.2185974419116974, "learning_rate": 5e-06, "loss": 0.9243, "num_input_tokens_seen": 255377664, "step": 562, "train_runtime": 40477.1115, "train_tokens_per_second": 6309.187 }, { "epoch": 0.6816560883095754, "grad_norm": 0.22251008450984955, "learning_rate": 5e-06, "loss": 0.9011, "num_input_tokens_seen": 255833336, "step": 563, "train_runtime": 40548.4408, "train_tokens_per_second": 6309.326 }, { "epoch": 0.6828668451271768, "grad_norm": 0.23521266877651215, "learning_rate": 5e-06, "loss": 0.9787, "num_input_tokens_seen": 256301336, "step": 564, "train_runtime": 40621.3067, "train_tokens_per_second": 6309.53 }, { "epoch": 0.6840776019447782, "grad_norm": 0.2637956440448761, "learning_rate": 5e-06, "loss": 0.9593, "num_input_tokens_seen": 256761416, "step": 565, "train_runtime": 40693.2274, "train_tokens_per_second": 6309.684 }, { "epoch": 0.6852883587623795, "grad_norm": 0.23881720006465912, "learning_rate": 5e-06, "loss": 1.0131, "num_input_tokens_seen": 257211376, "step": 566, "train_runtime": 40763.5757, "train_tokens_per_second": 6309.834 }, { "epoch": 0.6864991155799809, "grad_norm": 0.23504596948623657, "learning_rate": 5e-06, "loss": 0.9946, "num_input_tokens_seen": 257669744, "step": 567, "train_runtime": 40834.9767, "train_tokens_per_second": 6310.025 }, { "epoch": 0.6877098723975823, "grad_norm": 0.22577445209026337, "learning_rate": 5e-06, "loss": 0.9593, "num_input_tokens_seen": 258128040, "step": 568, "train_runtime": 40906.4809, "train_tokens_per_second": 6310.199 }, { "epoch": 0.6889206292151836, "grad_norm": 0.24191945791244507, "learning_rate": 5e-06, "loss": 0.9935, "num_input_tokens_seen": 258575712, "step": 569, "train_runtime": 40976.425, "train_tokens_per_second": 6310.353 }, { "epoch": 0.690131386032785, "grad_norm": 0.23592589795589447, "learning_rate": 5e-06, "loss": 0.9547, "num_input_tokens_seen": 259039864, "step": 570, "train_runtime": 41048.2903, "train_tokens_per_second": 6310.613 }, { "epoch": 0.6913421428503864, "grad_norm": 0.23204831779003143, "learning_rate": 5e-06, "loss": 0.9409, "num_input_tokens_seen": 259505856, "step": 571, "train_runtime": 41121.4188, "train_tokens_per_second": 6310.722 }, { "epoch": 0.6925528996679878, "grad_norm": 0.23110359907150269, "learning_rate": 5e-06, "loss": 0.9908, "num_input_tokens_seen": 259970968, "step": 572, "train_runtime": 41194.822, "train_tokens_per_second": 6310.768 }, { "epoch": 0.6937636564855891, "grad_norm": 0.2301538735628128, "learning_rate": 5e-06, "loss": 0.9846, "num_input_tokens_seen": 260449376, "step": 573, "train_runtime": 41269.549, "train_tokens_per_second": 6310.933 }, { "epoch": 0.6949744133031905, "grad_norm": 0.23412424325942993, "learning_rate": 5e-06, "loss": 0.9713, "num_input_tokens_seen": 260910664, "step": 574, "train_runtime": 41341.5032, "train_tokens_per_second": 6311.107 }, { "epoch": 0.696185170120792, "grad_norm": 0.2345420867204666, "learning_rate": 5e-06, "loss": 1.0265, "num_input_tokens_seen": 261349104, "step": 575, "train_runtime": 41409.221, "train_tokens_per_second": 6311.375 }, { "epoch": 0.6973959269383933, "grad_norm": 0.2388794869184494, "learning_rate": 5e-06, "loss": 0.9976, "num_input_tokens_seen": 261807744, "step": 576, "train_runtime": 41480.6661, "train_tokens_per_second": 6311.561 }, { "epoch": 0.6986066837559947, "grad_norm": 0.27313232421875, "learning_rate": 5e-06, "loss": 0.9973, "num_input_tokens_seen": 262231864, "step": 577, "train_runtime": 41546.1729, "train_tokens_per_second": 6311.818 }, { "epoch": 0.6998174405735961, "grad_norm": 0.2250782698392868, "learning_rate": 5e-06, "loss": 0.9869, "num_input_tokens_seen": 262715224, "step": 578, "train_runtime": 41621.5112, "train_tokens_per_second": 6312.006 }, { "epoch": 0.7010281973911974, "grad_norm": 0.2509269714355469, "learning_rate": 5e-06, "loss": 0.9775, "num_input_tokens_seen": 263176488, "step": 579, "train_runtime": 41693.4808, "train_tokens_per_second": 6312.174 }, { "epoch": 0.7022389542087988, "grad_norm": 0.2787635624408722, "learning_rate": 5e-06, "loss": 1.002, "num_input_tokens_seen": 263610608, "step": 580, "train_runtime": 41761.0277, "train_tokens_per_second": 6312.359 }, { "epoch": 0.7034497110264002, "grad_norm": 0.23429201543331146, "learning_rate": 5e-06, "loss": 0.9957, "num_input_tokens_seen": 264088768, "step": 581, "train_runtime": 41835.8378, "train_tokens_per_second": 6312.501 }, { "epoch": 0.7046604678440016, "grad_norm": 0.21760432422161102, "learning_rate": 5e-06, "loss": 0.9657, "num_input_tokens_seen": 264544496, "step": 582, "train_runtime": 41907.0545, "train_tokens_per_second": 6312.648 }, { "epoch": 0.7058712246616029, "grad_norm": 0.248090460896492, "learning_rate": 5e-06, "loss": 1.0259, "num_input_tokens_seen": 264988656, "step": 583, "train_runtime": 41976.0921, "train_tokens_per_second": 6312.847 }, { "epoch": 0.7070819814792043, "grad_norm": 0.23136785626411438, "learning_rate": 5e-06, "loss": 0.9822, "num_input_tokens_seen": 265423208, "step": 584, "train_runtime": 42043.4666, "train_tokens_per_second": 6313.067 }, { "epoch": 0.7082927382968057, "grad_norm": 0.2529706358909607, "learning_rate": 5e-06, "loss": 1.0446, "num_input_tokens_seen": 265854688, "step": 585, "train_runtime": 42110.6206, "train_tokens_per_second": 6313.246 }, { "epoch": 0.709503495114407, "grad_norm": 0.24560308456420898, "learning_rate": 5e-06, "loss": 0.9233, "num_input_tokens_seen": 266311664, "step": 586, "train_runtime": 42182.4609, "train_tokens_per_second": 6313.327 }, { "epoch": 0.7107142519320084, "grad_norm": 0.24339045584201813, "learning_rate": 5e-06, "loss": 0.9799, "num_input_tokens_seen": 266756048, "step": 587, "train_runtime": 42251.4986, "train_tokens_per_second": 6313.529 }, { "epoch": 0.7119250087496098, "grad_norm": 0.22854940593242645, "learning_rate": 5e-06, "loss": 0.9886, "num_input_tokens_seen": 267210376, "step": 588, "train_runtime": 42321.7976, "train_tokens_per_second": 6313.777 }, { "epoch": 0.7131357655672111, "grad_norm": 0.24025574326515198, "learning_rate": 5e-06, "loss": 0.92, "num_input_tokens_seen": 267660896, "step": 589, "train_runtime": 42392.4873, "train_tokens_per_second": 6313.876 }, { "epoch": 0.7143465223848126, "grad_norm": 0.2508932054042816, "learning_rate": 5e-06, "loss": 1.0165, "num_input_tokens_seen": 268113168, "step": 590, "train_runtime": 42462.8573, "train_tokens_per_second": 6314.063 }, { "epoch": 0.715557279202414, "grad_norm": 0.24230146408081055, "learning_rate": 5e-06, "loss": 0.9882, "num_input_tokens_seen": 268574272, "step": 591, "train_runtime": 42534.2219, "train_tokens_per_second": 6314.31 }, { "epoch": 0.7167680360200154, "grad_norm": 0.23562973737716675, "learning_rate": 5e-06, "loss": 1.012, "num_input_tokens_seen": 269043528, "step": 592, "train_runtime": 42606.947, "train_tokens_per_second": 6314.546 }, { "epoch": 0.7179787928376167, "grad_norm": 0.2341059297323227, "learning_rate": 5e-06, "loss": 1.0073, "num_input_tokens_seen": 269523064, "step": 593, "train_runtime": 42681.8077, "train_tokens_per_second": 6314.706 }, { "epoch": 0.7191895496552181, "grad_norm": 0.2380225509405136, "learning_rate": 5e-06, "loss": 0.9657, "num_input_tokens_seen": 269981784, "step": 594, "train_runtime": 42753.4941, "train_tokens_per_second": 6314.847 }, { "epoch": 0.7204003064728195, "grad_norm": 0.2389514148235321, "learning_rate": 5e-06, "loss": 0.9869, "num_input_tokens_seen": 270421760, "step": 595, "train_runtime": 42822.2596, "train_tokens_per_second": 6314.981 }, { "epoch": 0.7216110632904208, "grad_norm": 0.24948102235794067, "learning_rate": 5e-06, "loss": 0.9544, "num_input_tokens_seen": 270861744, "step": 596, "train_runtime": 42891.0541, "train_tokens_per_second": 6315.11 }, { "epoch": 0.7228218201080222, "grad_norm": 0.25714853405952454, "learning_rate": 5e-06, "loss": 0.994, "num_input_tokens_seen": 271296104, "step": 597, "train_runtime": 42958.6929, "train_tokens_per_second": 6315.278 }, { "epoch": 0.7240325769256236, "grad_norm": 0.23045891523361206, "learning_rate": 5e-06, "loss": 0.9594, "num_input_tokens_seen": 271751872, "step": 598, "train_runtime": 43029.9594, "train_tokens_per_second": 6315.411 }, { "epoch": 0.7252433337432249, "grad_norm": 0.23973950743675232, "learning_rate": 5e-06, "loss": 1.0007, "num_input_tokens_seen": 272182760, "step": 599, "train_runtime": 43096.7924, "train_tokens_per_second": 6315.615 }, { "epoch": 0.7264540905608263, "grad_norm": 0.23554377257823944, "learning_rate": 5e-06, "loss": 0.9565, "num_input_tokens_seen": 272631360, "step": 600, "train_runtime": 43166.6123, "train_tokens_per_second": 6315.792 }, { "epoch": 0.7276648473784277, "grad_norm": 0.24061642587184906, "learning_rate": 5e-06, "loss": 0.9714, "num_input_tokens_seen": 273087768, "step": 601, "train_runtime": 43237.8394, "train_tokens_per_second": 6315.944 }, { "epoch": 0.7288756041960291, "grad_norm": 0.23701608180999756, "learning_rate": 5e-06, "loss": 0.9918, "num_input_tokens_seen": 273545024, "step": 602, "train_runtime": 43309.1444, "train_tokens_per_second": 6316.103 }, { "epoch": 0.7300863610136304, "grad_norm": 0.23831920325756073, "learning_rate": 5e-06, "loss": 0.9747, "num_input_tokens_seen": 273993320, "step": 603, "train_runtime": 43379.1503, "train_tokens_per_second": 6316.245 }, { "epoch": 0.7312971178312319, "grad_norm": 0.22237437963485718, "learning_rate": 5e-06, "loss": 0.9567, "num_input_tokens_seen": 274442800, "step": 604, "train_runtime": 43448.8521, "train_tokens_per_second": 6316.457 }, { "epoch": 0.7325078746488333, "grad_norm": 0.30931001901626587, "learning_rate": 5e-06, "loss": 0.9831, "num_input_tokens_seen": 274887544, "step": 605, "train_runtime": 43517.5812, "train_tokens_per_second": 6316.701 }, { "epoch": 0.7337186314664346, "grad_norm": 0.23581911623477936, "learning_rate": 5e-06, "loss": 0.9202, "num_input_tokens_seen": 275352664, "step": 606, "train_runtime": 43590.2263, "train_tokens_per_second": 6316.844 }, { "epoch": 0.734929388284036, "grad_norm": 0.2689816951751709, "learning_rate": 5e-06, "loss": 1.0085, "num_input_tokens_seen": 275804984, "step": 607, "train_runtime": 43660.6992, "train_tokens_per_second": 6317.008 }, { "epoch": 0.7361401451016374, "grad_norm": 0.2378932386636734, "learning_rate": 5e-06, "loss": 0.9458, "num_input_tokens_seen": 276247352, "step": 608, "train_runtime": 43729.3613, "train_tokens_per_second": 6317.205 }, { "epoch": 0.7373509019192387, "grad_norm": 0.22164365649223328, "learning_rate": 5e-06, "loss": 0.9847, "num_input_tokens_seen": 276707568, "step": 609, "train_runtime": 43800.9856, "train_tokens_per_second": 6317.382 }, { "epoch": 0.7385616587368401, "grad_norm": 0.23126821219921112, "learning_rate": 5e-06, "loss": 0.9935, "num_input_tokens_seen": 277170312, "step": 610, "train_runtime": 43873.0149, "train_tokens_per_second": 6317.558 }, { "epoch": 0.7397724155544415, "grad_norm": 0.23724284768104553, "learning_rate": 5e-06, "loss": 0.9627, "num_input_tokens_seen": 277620736, "step": 611, "train_runtime": 43943.152, "train_tokens_per_second": 6317.725 }, { "epoch": 0.7409831723720429, "grad_norm": 0.2428486943244934, "learning_rate": 5e-06, "loss": 0.9938, "num_input_tokens_seen": 278074992, "step": 612, "train_runtime": 44013.5354, "train_tokens_per_second": 6317.943 }, { "epoch": 0.7421939291896442, "grad_norm": 0.24035997688770294, "learning_rate": 5e-06, "loss": 0.9386, "num_input_tokens_seen": 278525848, "step": 613, "train_runtime": 44083.3847, "train_tokens_per_second": 6318.159 }, { "epoch": 0.7434046860072456, "grad_norm": 0.23970334231853485, "learning_rate": 5e-06, "loss": 0.9874, "num_input_tokens_seen": 278966440, "step": 614, "train_runtime": 44151.8915, "train_tokens_per_second": 6318.335 }, { "epoch": 0.744615442824847, "grad_norm": 0.213746577501297, "learning_rate": 5e-06, "loss": 0.9763, "num_input_tokens_seen": 279449936, "step": 615, "train_runtime": 44227.7947, "train_tokens_per_second": 6318.423 }, { "epoch": 0.7458261996424483, "grad_norm": 0.2598293423652649, "learning_rate": 5e-06, "loss": 0.9765, "num_input_tokens_seen": 279890568, "step": 616, "train_runtime": 44296.6434, "train_tokens_per_second": 6318.55 }, { "epoch": 0.7470369564600498, "grad_norm": 0.2453431487083435, "learning_rate": 5e-06, "loss": 0.9433, "num_input_tokens_seen": 280349072, "step": 617, "train_runtime": 44367.5664, "train_tokens_per_second": 6318.784 }, { "epoch": 0.7482477132776512, "grad_norm": 0.23078188300132751, "learning_rate": 5e-06, "loss": 0.9357, "num_input_tokens_seen": 280821128, "step": 618, "train_runtime": 44441.7787, "train_tokens_per_second": 6318.854 }, { "epoch": 0.7494584700952525, "grad_norm": 0.23313450813293457, "learning_rate": 5e-06, "loss": 0.9628, "num_input_tokens_seen": 281279568, "step": 619, "train_runtime": 44513.2227, "train_tokens_per_second": 6319.012 }, { "epoch": 0.7506692269128539, "grad_norm": 0.21814242005348206, "learning_rate": 5e-06, "loss": 0.9312, "num_input_tokens_seen": 281738128, "step": 620, "train_runtime": 44584.8919, "train_tokens_per_second": 6319.139 }, { "epoch": 0.7518799837304553, "grad_norm": 0.2563712000846863, "learning_rate": 5e-06, "loss": 0.9814, "num_input_tokens_seen": 282181152, "step": 621, "train_runtime": 44653.7827, "train_tokens_per_second": 6319.311 }, { "epoch": 0.7530907405480567, "grad_norm": 0.2649373412132263, "learning_rate": 5e-06, "loss": 1.0402, "num_input_tokens_seen": 282631384, "step": 622, "train_runtime": 44724.7127, "train_tokens_per_second": 6319.356 }, { "epoch": 0.754301497365658, "grad_norm": 0.22597451508045197, "learning_rate": 5e-06, "loss": 0.9611, "num_input_tokens_seen": 283073568, "step": 623, "train_runtime": 44794.6422, "train_tokens_per_second": 6319.362 }, { "epoch": 0.7555122541832594, "grad_norm": 0.24213433265686035, "learning_rate": 5e-06, "loss": 0.9404, "num_input_tokens_seen": 283512448, "step": 624, "train_runtime": 44862.7569, "train_tokens_per_second": 6319.55 }, { "epoch": 0.7567230110008608, "grad_norm": 0.2487850785255432, "learning_rate": 5e-06, "loss": 0.9387, "num_input_tokens_seen": 283970752, "step": 625, "train_runtime": 44934.4627, "train_tokens_per_second": 6319.665 }, { "epoch": 0.7579337678184621, "grad_norm": 0.2626650333404541, "learning_rate": 5e-06, "loss": 0.9924, "num_input_tokens_seen": 284447624, "step": 626, "train_runtime": 45008.9717, "train_tokens_per_second": 6319.798 }, { "epoch": 0.7591445246360635, "grad_norm": 0.24381890892982483, "learning_rate": 5e-06, "loss": 0.9668, "num_input_tokens_seen": 284896224, "step": 627, "train_runtime": 45078.7734, "train_tokens_per_second": 6319.964 }, { "epoch": 0.7603552814536649, "grad_norm": 0.22962401807308197, "learning_rate": 5e-06, "loss": 1.0086, "num_input_tokens_seen": 285355952, "step": 628, "train_runtime": 45150.4275, "train_tokens_per_second": 6320.116 }, { "epoch": 0.7615660382712662, "grad_norm": 0.23533271253108978, "learning_rate": 5e-06, "loss": 0.9176, "num_input_tokens_seen": 285807352, "step": 629, "train_runtime": 45220.9764, "train_tokens_per_second": 6320.238 }, { "epoch": 0.7627767950888676, "grad_norm": 0.24772769212722778, "learning_rate": 5e-06, "loss": 0.9566, "num_input_tokens_seen": 286283944, "step": 630, "train_runtime": 45294.8924, "train_tokens_per_second": 6320.447 }, { "epoch": 0.763987551906469, "grad_norm": 0.2620101571083069, "learning_rate": 5e-06, "loss": 0.9959, "num_input_tokens_seen": 286733384, "step": 631, "train_runtime": 45364.8652, "train_tokens_per_second": 6320.605 }, { "epoch": 0.7651983087240705, "grad_norm": 0.23930427432060242, "learning_rate": 5e-06, "loss": 0.9519, "num_input_tokens_seen": 287216280, "step": 632, "train_runtime": 45439.9774, "train_tokens_per_second": 6320.784 }, { "epoch": 0.7664090655416718, "grad_norm": 0.24364081025123596, "learning_rate": 5e-06, "loss": 1.033, "num_input_tokens_seen": 287686200, "step": 633, "train_runtime": 45513.7576, "train_tokens_per_second": 6320.862 }, { "epoch": 0.7676198223592732, "grad_norm": 0.2459454983472824, "learning_rate": 5e-06, "loss": 0.9851, "num_input_tokens_seen": 288132976, "step": 634, "train_runtime": 45583.2033, "train_tokens_per_second": 6321.034 }, { "epoch": 0.7688305791768746, "grad_norm": 0.2267904430627823, "learning_rate": 5e-06, "loss": 0.9701, "num_input_tokens_seen": 288590576, "step": 635, "train_runtime": 45654.2868, "train_tokens_per_second": 6321.215 }, { "epoch": 0.7700413359944759, "grad_norm": 0.2215666025876999, "learning_rate": 5e-06, "loss": 0.9455, "num_input_tokens_seen": 289047888, "step": 636, "train_runtime": 45725.6961, "train_tokens_per_second": 6321.345 }, { "epoch": 0.7712520928120773, "grad_norm": 0.23759250342845917, "learning_rate": 5e-06, "loss": 0.9361, "num_input_tokens_seen": 289499480, "step": 637, "train_runtime": 45796.0887, "train_tokens_per_second": 6321.489 }, { "epoch": 0.7724628496296787, "grad_norm": 0.23697270452976227, "learning_rate": 5e-06, "loss": 0.9343, "num_input_tokens_seen": 289956856, "step": 638, "train_runtime": 45867.4221, "train_tokens_per_second": 6321.63 }, { "epoch": 0.77367360644728, "grad_norm": 0.2574046552181244, "learning_rate": 5e-06, "loss": 1.0023, "num_input_tokens_seen": 290374696, "step": 639, "train_runtime": 45932.2172, "train_tokens_per_second": 6321.809 }, { "epoch": 0.7748843632648814, "grad_norm": 0.2575940489768982, "learning_rate": 5e-06, "loss": 0.9831, "num_input_tokens_seen": 290844728, "step": 640, "train_runtime": 46005.7775, "train_tokens_per_second": 6321.917 }, { "epoch": 0.7760951200824828, "grad_norm": 0.2475946545600891, "learning_rate": 5e-06, "loss": 1.0082, "num_input_tokens_seen": 291292768, "step": 641, "train_runtime": 46075.669, "train_tokens_per_second": 6322.052 }, { "epoch": 0.7773058769000842, "grad_norm": 0.27857834100723267, "learning_rate": 5e-06, "loss": 0.9734, "num_input_tokens_seen": 291733344, "step": 642, "train_runtime": 46144.534, "train_tokens_per_second": 6322.165 }, { "epoch": 0.7785166337176855, "grad_norm": 0.25765910744667053, "learning_rate": 5e-06, "loss": 0.9982, "num_input_tokens_seen": 292171360, "step": 643, "train_runtime": 46212.6022, "train_tokens_per_second": 6322.331 }, { "epoch": 0.779727390535287, "grad_norm": 0.2572195827960968, "learning_rate": 5e-06, "loss": 0.963, "num_input_tokens_seen": 292612640, "step": 644, "train_runtime": 46281.1163, "train_tokens_per_second": 6322.506 }, { "epoch": 0.7809381473528884, "grad_norm": 0.24165485799312592, "learning_rate": 5e-06, "loss": 1.0424, "num_input_tokens_seen": 293053416, "step": 645, "train_runtime": 46349.6476, "train_tokens_per_second": 6322.668 }, { "epoch": 0.7821489041704897, "grad_norm": 0.2371072620153427, "learning_rate": 5e-06, "loss": 0.9891, "num_input_tokens_seen": 293522488, "step": 646, "train_runtime": 46422.9917, "train_tokens_per_second": 6322.783 }, { "epoch": 0.7833596609880911, "grad_norm": 0.26184481382369995, "learning_rate": 5e-06, "loss": 0.9674, "num_input_tokens_seen": 293932816, "step": 647, "train_runtime": 46486.5204, "train_tokens_per_second": 6322.969 }, { "epoch": 0.7845704178056925, "grad_norm": 0.2628776431083679, "learning_rate": 5e-06, "loss": 0.9613, "num_input_tokens_seen": 294392880, "step": 648, "train_runtime": 46558.3764, "train_tokens_per_second": 6323.092 }, { "epoch": 0.7857811746232938, "grad_norm": 0.2746836245059967, "learning_rate": 5e-06, "loss": 1.0326, "num_input_tokens_seen": 294829032, "step": 649, "train_runtime": 46626.1232, "train_tokens_per_second": 6323.259 }, { "epoch": 0.7869919314408952, "grad_norm": 0.23179323971271515, "learning_rate": 5e-06, "loss": 0.959, "num_input_tokens_seen": 295266440, "step": 650, "train_runtime": 46694.3296, "train_tokens_per_second": 6323.39 }, { "epoch": 0.7882026882584966, "grad_norm": 0.29162031412124634, "learning_rate": 5e-06, "loss": 1.0614, "num_input_tokens_seen": 295707072, "step": 651, "train_runtime": 46763.2072, "train_tokens_per_second": 6323.499 }, { "epoch": 0.789413445076098, "grad_norm": 0.24644595384597778, "learning_rate": 5e-06, "loss": 0.9355, "num_input_tokens_seen": 296168304, "step": 652, "train_runtime": 46835.1671, "train_tokens_per_second": 6323.631 }, { "epoch": 0.7906242018936993, "grad_norm": 0.22973157465457916, "learning_rate": 5e-06, "loss": 1.0039, "num_input_tokens_seen": 296626648, "step": 653, "train_runtime": 46906.6535, "train_tokens_per_second": 6323.765 }, { "epoch": 0.7918349587113007, "grad_norm": 0.22654931247234344, "learning_rate": 5e-06, "loss": 0.9769, "num_input_tokens_seen": 297077224, "step": 654, "train_runtime": 46976.9907, "train_tokens_per_second": 6323.888 }, { "epoch": 0.7930457155289021, "grad_norm": 0.25695592164993286, "learning_rate": 5e-06, "loss": 0.9681, "num_input_tokens_seen": 297509888, "step": 655, "train_runtime": 47043.9093, "train_tokens_per_second": 6324.089 }, { "epoch": 0.7942564723465034, "grad_norm": 0.2581423819065094, "learning_rate": 5e-06, "loss": 0.9989, "num_input_tokens_seen": 297939024, "step": 656, "train_runtime": 47110.28, "train_tokens_per_second": 6324.289 }, { "epoch": 0.7954672291641048, "grad_norm": 0.23372498154640198, "learning_rate": 5e-06, "loss": 1.0305, "num_input_tokens_seen": 298407680, "step": 657, "train_runtime": 47183.4049, "train_tokens_per_second": 6324.42 }, { "epoch": 0.7966779859817062, "grad_norm": 0.2330416738986969, "learning_rate": 5e-06, "loss": 0.9725, "num_input_tokens_seen": 298854728, "step": 658, "train_runtime": 47253.2725, "train_tokens_per_second": 6324.53 }, { "epoch": 0.7978887427993075, "grad_norm": 0.23654578626155853, "learning_rate": 5e-06, "loss": 0.9963, "num_input_tokens_seen": 299300440, "step": 659, "train_runtime": 47322.7649, "train_tokens_per_second": 6324.661 }, { "epoch": 0.799099499616909, "grad_norm": 0.2542232275009155, "learning_rate": 5e-06, "loss": 1.0448, "num_input_tokens_seen": 299766136, "step": 660, "train_runtime": 47394.9974, "train_tokens_per_second": 6324.848 }, { "epoch": 0.8003102564345104, "grad_norm": 0.24160121381282806, "learning_rate": 5e-06, "loss": 0.9603, "num_input_tokens_seen": 300221640, "step": 661, "train_runtime": 47466.4669, "train_tokens_per_second": 6324.921 }, { "epoch": 0.8015210132521118, "grad_norm": 0.22822356224060059, "learning_rate": 5e-06, "loss": 0.9958, "num_input_tokens_seen": 300687184, "step": 662, "train_runtime": 47539.0298, "train_tokens_per_second": 6325.059 }, { "epoch": 0.8027317700697131, "grad_norm": 0.2521500587463379, "learning_rate": 5e-06, "loss": 0.972, "num_input_tokens_seen": 301123864, "step": 663, "train_runtime": 47607.2637, "train_tokens_per_second": 6325.166 }, { "epoch": 0.8039425268873145, "grad_norm": 0.23535515367984772, "learning_rate": 5e-06, "loss": 0.9973, "num_input_tokens_seen": 301584744, "step": 664, "train_runtime": 47679.0072, "train_tokens_per_second": 6325.315 }, { "epoch": 0.8051532837049159, "grad_norm": 0.22911347448825836, "learning_rate": 5e-06, "loss": 0.964, "num_input_tokens_seen": 302032008, "step": 665, "train_runtime": 47748.6252, "train_tokens_per_second": 6325.46 }, { "epoch": 0.8063640405225172, "grad_norm": 0.2548276484012604, "learning_rate": 5e-06, "loss": 0.9764, "num_input_tokens_seen": 302498592, "step": 666, "train_runtime": 47821.6069, "train_tokens_per_second": 6325.563 }, { "epoch": 0.8075747973401186, "grad_norm": 0.24845871329307556, "learning_rate": 5e-06, "loss": 0.9747, "num_input_tokens_seen": 302948968, "step": 667, "train_runtime": 47891.6883, "train_tokens_per_second": 6325.711 }, { "epoch": 0.80878555415772, "grad_norm": 0.27543285489082336, "learning_rate": 5e-06, "loss": 1.0221, "num_input_tokens_seen": 303394536, "step": 668, "train_runtime": 47961.0975, "train_tokens_per_second": 6325.846 }, { "epoch": 0.8099963109753213, "grad_norm": 0.22285109758377075, "learning_rate": 5e-06, "loss": 0.9823, "num_input_tokens_seen": 303869992, "step": 669, "train_runtime": 48035.6906, "train_tokens_per_second": 6325.921 }, { "epoch": 0.8112070677929227, "grad_norm": 0.2208424061536789, "learning_rate": 5e-06, "loss": 0.916, "num_input_tokens_seen": 304338112, "step": 670, "train_runtime": 48108.8012, "train_tokens_per_second": 6326.038 }, { "epoch": 0.8124178246105241, "grad_norm": 0.23547379672527313, "learning_rate": 5e-06, "loss": 0.9419, "num_input_tokens_seen": 304770896, "step": 671, "train_runtime": 48175.679, "train_tokens_per_second": 6326.24 }, { "epoch": 0.8136285814281256, "grad_norm": 0.3341003656387329, "learning_rate": 5e-06, "loss": 0.9109, "num_input_tokens_seen": 305218712, "step": 672, "train_runtime": 48245.4679, "train_tokens_per_second": 6326.371 }, { "epoch": 0.8148393382457269, "grad_norm": 0.3057156205177307, "learning_rate": 5e-06, "loss": 0.9952, "num_input_tokens_seen": 305656008, "step": 673, "train_runtime": 48313.4759, "train_tokens_per_second": 6326.517 }, { "epoch": 0.8160500950633283, "grad_norm": 0.2505541741847992, "learning_rate": 5e-06, "loss": 0.9644, "num_input_tokens_seen": 306112992, "step": 674, "train_runtime": 48386.3399, "train_tokens_per_second": 6326.434 }, { "epoch": 0.8172608518809297, "grad_norm": 0.28934425115585327, "learning_rate": 5e-06, "loss": 0.9361, "num_input_tokens_seen": 306569648, "step": 675, "train_runtime": 48457.067, "train_tokens_per_second": 6326.624 }, { "epoch": 0.818471608698531, "grad_norm": 0.24182599782943726, "learning_rate": 5e-06, "loss": 0.9022, "num_input_tokens_seen": 307035984, "step": 676, "train_runtime": 48529.7381, "train_tokens_per_second": 6326.76 }, { "epoch": 0.8196823655161324, "grad_norm": 0.23037275671958923, "learning_rate": 5e-06, "loss": 0.9832, "num_input_tokens_seen": 307477064, "step": 677, "train_runtime": 48598.2501, "train_tokens_per_second": 6326.916 }, { "epoch": 0.8208931223337338, "grad_norm": 0.26939913630485535, "learning_rate": 5e-06, "loss": 0.9765, "num_input_tokens_seen": 307930568, "step": 678, "train_runtime": 48668.4558, "train_tokens_per_second": 6327.108 }, { "epoch": 0.8221038791513351, "grad_norm": 0.2629682719707489, "learning_rate": 5e-06, "loss": 0.9513, "num_input_tokens_seen": 308377608, "step": 679, "train_runtime": 48737.7561, "train_tokens_per_second": 6327.284 }, { "epoch": 0.8233146359689365, "grad_norm": 0.2294158786535263, "learning_rate": 5e-06, "loss": 0.972, "num_input_tokens_seen": 308838600, "step": 680, "train_runtime": 48810.1043, "train_tokens_per_second": 6327.35 }, { "epoch": 0.8245253927865379, "grad_norm": 0.22822599112987518, "learning_rate": 5e-06, "loss": 0.936, "num_input_tokens_seen": 309318664, "step": 681, "train_runtime": 48887.5021, "train_tokens_per_second": 6327.152 }, { "epoch": 0.8257361496041393, "grad_norm": 0.24199745059013367, "learning_rate": 5e-06, "loss": 1.0328, "num_input_tokens_seen": 309745744, "step": 682, "train_runtime": 48957.3644, "train_tokens_per_second": 6326.847 }, { "epoch": 0.8269469064217406, "grad_norm": 0.23330043256282806, "learning_rate": 5e-06, "loss": 0.9579, "num_input_tokens_seen": 310184288, "step": 683, "train_runtime": 49029.1375, "train_tokens_per_second": 6326.53 }, { "epoch": 0.828157663239342, "grad_norm": 0.22911278903484344, "learning_rate": 5e-06, "loss": 1.0027, "num_input_tokens_seen": 310639208, "step": 684, "train_runtime": 49102.2981, "train_tokens_per_second": 6326.368 }, { "epoch": 0.8293684200569434, "grad_norm": 0.23975107073783875, "learning_rate": 5e-06, "loss": 0.9599, "num_input_tokens_seen": 311102208, "step": 685, "train_runtime": 49177.6776, "train_tokens_per_second": 6326.086 }, { "epoch": 0.8305791768745447, "grad_norm": 0.271192729473114, "learning_rate": 5e-06, "loss": 0.9861, "num_input_tokens_seen": 311544336, "step": 686, "train_runtime": 49250.6236, "train_tokens_per_second": 6325.693 }, { "epoch": 0.8317899336921462, "grad_norm": 0.2387111783027649, "learning_rate": 5e-06, "loss": 0.9506, "num_input_tokens_seen": 311991128, "step": 687, "train_runtime": 49323.9975, "train_tokens_per_second": 6325.341 }, { "epoch": 0.8330006905097476, "grad_norm": 0.24908512830734253, "learning_rate": 5e-06, "loss": 0.9557, "num_input_tokens_seen": 312461408, "step": 688, "train_runtime": 49402.3748, "train_tokens_per_second": 6324.826 }, { "epoch": 0.8342114473273489, "grad_norm": 0.2281702756881714, "learning_rate": 5e-06, "loss": 0.9523, "num_input_tokens_seen": 312916448, "step": 689, "train_runtime": 49476.816, "train_tokens_per_second": 6324.507 }, { "epoch": 0.8354222041449503, "grad_norm": 0.23677456378936768, "learning_rate": 5e-06, "loss": 0.9834, "num_input_tokens_seen": 313376968, "step": 690, "train_runtime": 49552.5358, "train_tokens_per_second": 6324.136 }, { "epoch": 0.8366329609625517, "grad_norm": 0.23301224410533905, "learning_rate": 5e-06, "loss": 0.9741, "num_input_tokens_seen": 313834800, "step": 691, "train_runtime": 49627.9277, "train_tokens_per_second": 6323.754 }, { "epoch": 0.8378437177801531, "grad_norm": 0.28598400950431824, "learning_rate": 5e-06, "loss": 0.9854, "num_input_tokens_seen": 314290560, "step": 692, "train_runtime": 49703.1803, "train_tokens_per_second": 6323.349 }, { "epoch": 0.8390544745977544, "grad_norm": 0.24205778539180756, "learning_rate": 5e-06, "loss": 0.9253, "num_input_tokens_seen": 314767360, "step": 693, "train_runtime": 49781.9765, "train_tokens_per_second": 6322.918 }, { "epoch": 0.8402652314153558, "grad_norm": 0.25196146965026855, "learning_rate": 5e-06, "loss": 0.9518, "num_input_tokens_seen": 315210832, "step": 694, "train_runtime": 49854.8652, "train_tokens_per_second": 6322.569 }, { "epoch": 0.8414759882329572, "grad_norm": 0.2362397164106369, "learning_rate": 5e-06, "loss": 0.9407, "num_input_tokens_seen": 315669384, "step": 695, "train_runtime": 49930.4912, "train_tokens_per_second": 6322.177 }, { "epoch": 0.8426867450505585, "grad_norm": 0.23664698004722595, "learning_rate": 5e-06, "loss": 0.9684, "num_input_tokens_seen": 316108928, "step": 696, "train_runtime": 50002.7456, "train_tokens_per_second": 6321.831 }, { "epoch": 0.8438975018681599, "grad_norm": 0.23944173753261566, "learning_rate": 5e-06, "loss": 0.9395, "num_input_tokens_seen": 316566328, "step": 697, "train_runtime": 50077.929, "train_tokens_per_second": 6321.474 }, { "epoch": 0.8451082586857613, "grad_norm": 0.22662094235420227, "learning_rate": 5e-06, "loss": 0.9686, "num_input_tokens_seen": 317031176, "step": 698, "train_runtime": 50154.9704, "train_tokens_per_second": 6321.032 }, { "epoch": 0.8463190155033626, "grad_norm": 0.22922109067440033, "learning_rate": 5e-06, "loss": 0.9671, "num_input_tokens_seen": 317478472, "step": 699, "train_runtime": 50228.3177, "train_tokens_per_second": 6320.707 }, { "epoch": 0.847529772320964, "grad_norm": 0.26036337018013, "learning_rate": 5e-06, "loss": 0.9642, "num_input_tokens_seen": 317940344, "step": 700, "train_runtime": 50304.238, "train_tokens_per_second": 6320.349 }, { "epoch": 0.8487405291385655, "grad_norm": 0.22809621691703796, "learning_rate": 5e-06, "loss": 0.952, "num_input_tokens_seen": 318407112, "step": 701, "train_runtime": 50381.1432, "train_tokens_per_second": 6319.966 }, { "epoch": 0.8499512859561669, "grad_norm": 0.228465273976326, "learning_rate": 5e-06, "loss": 0.9005, "num_input_tokens_seen": 318858464, "step": 702, "train_runtime": 50454.8922, "train_tokens_per_second": 6319.674 }, { "epoch": 0.8511620427737682, "grad_norm": 0.2660825550556183, "learning_rate": 5e-06, "loss": 0.9698, "num_input_tokens_seen": 319300520, "step": 703, "train_runtime": 50527.48, "train_tokens_per_second": 6319.344 }, { "epoch": 0.8523727995913696, "grad_norm": 0.2166413962841034, "learning_rate": 5e-06, "loss": 0.9611, "num_input_tokens_seen": 319756696, "step": 704, "train_runtime": 50602.7456, "train_tokens_per_second": 6318.959 }, { "epoch": 0.853583556408971, "grad_norm": 0.22596792876720428, "learning_rate": 5e-06, "loss": 0.9911, "num_input_tokens_seen": 320225520, "step": 705, "train_runtime": 50680.0879, "train_tokens_per_second": 6318.567 }, { "epoch": 0.8547943132265723, "grad_norm": 0.24560396373271942, "learning_rate": 5e-06, "loss": 0.9301, "num_input_tokens_seen": 320689552, "step": 706, "train_runtime": 50756.6212, "train_tokens_per_second": 6318.182 }, { "epoch": 0.8560050700441737, "grad_norm": 0.2799171805381775, "learning_rate": 5e-06, "loss": 0.9853, "num_input_tokens_seen": 321135128, "step": 707, "train_runtime": 50829.7956, "train_tokens_per_second": 6317.852 }, { "epoch": 0.8572158268617751, "grad_norm": 0.24234268069267273, "learning_rate": 5e-06, "loss": 0.9667, "num_input_tokens_seen": 321592464, "step": 708, "train_runtime": 50905.3416, "train_tokens_per_second": 6317.46 }, { "epoch": 0.8584265836793764, "grad_norm": 0.24366381764411926, "learning_rate": 5e-06, "loss": 0.9408, "num_input_tokens_seen": 322030856, "step": 709, "train_runtime": 50977.5327, "train_tokens_per_second": 6317.113 }, { "epoch": 0.8596373404969778, "grad_norm": 0.23462019860744476, "learning_rate": 5e-06, "loss": 0.9617, "num_input_tokens_seen": 322488840, "step": 710, "train_runtime": 51053.0683, "train_tokens_per_second": 6316.738 }, { "epoch": 0.8608480973145792, "grad_norm": 0.23850728571414948, "learning_rate": 5e-06, "loss": 0.9618, "num_input_tokens_seen": 322956064, "step": 711, "train_runtime": 51130.2729, "train_tokens_per_second": 6316.338 }, { "epoch": 0.8620588541321806, "grad_norm": 0.24310947954654694, "learning_rate": 5e-06, "loss": 0.9352, "num_input_tokens_seen": 323404816, "step": 712, "train_runtime": 51204.0732, "train_tokens_per_second": 6315.998 }, { "epoch": 0.8632696109497819, "grad_norm": 0.2608128786087036, "learning_rate": 5e-06, "loss": 0.9402, "num_input_tokens_seen": 323876440, "step": 713, "train_runtime": 51281.1334, "train_tokens_per_second": 6315.704 }, { "epoch": 0.8644803677673834, "grad_norm": 0.2339504510164261, "learning_rate": 5e-06, "loss": 0.9117, "num_input_tokens_seen": 324335024, "step": 714, "train_runtime": 51356.6378, "train_tokens_per_second": 6315.348 }, { "epoch": 0.8656911245849848, "grad_norm": 0.23948872089385986, "learning_rate": 5e-06, "loss": 0.9073, "num_input_tokens_seen": 324788680, "step": 715, "train_runtime": 51430.9403, "train_tokens_per_second": 6315.045 }, { "epoch": 0.8669018814025861, "grad_norm": 0.23080047965049744, "learning_rate": 5e-06, "loss": 0.9569, "num_input_tokens_seen": 325265904, "step": 716, "train_runtime": 51509.6255, "train_tokens_per_second": 6314.663 }, { "epoch": 0.8681126382201875, "grad_norm": 0.2425810992717743, "learning_rate": 5e-06, "loss": 0.9919, "num_input_tokens_seen": 325749024, "step": 717, "train_runtime": 51589.2266, "train_tokens_per_second": 6314.284 }, { "epoch": 0.8693233950377889, "grad_norm": 0.24254527688026428, "learning_rate": 5e-06, "loss": 0.9758, "num_input_tokens_seen": 326205360, "step": 718, "train_runtime": 51663.9314, "train_tokens_per_second": 6313.986 }, { "epoch": 0.8705341518553902, "grad_norm": 0.2271261066198349, "learning_rate": 5e-06, "loss": 0.9523, "num_input_tokens_seen": 326656568, "step": 719, "train_runtime": 51737.7083, "train_tokens_per_second": 6313.704 }, { "epoch": 0.8717449086729916, "grad_norm": 0.28275179862976074, "learning_rate": 5e-06, "loss": 0.9955, "num_input_tokens_seen": 327100936, "step": 720, "train_runtime": 51810.5603, "train_tokens_per_second": 6313.403 }, { "epoch": 0.872955665490593, "grad_norm": 0.2485657036304474, "learning_rate": 5e-06, "loss": 0.9674, "num_input_tokens_seen": 327555136, "step": 721, "train_runtime": 51884.7425, "train_tokens_per_second": 6313.13 }, { "epoch": 0.8741664223081944, "grad_norm": 0.26533499360084534, "learning_rate": 5e-06, "loss": 0.9911, "num_input_tokens_seen": 328014560, "step": 722, "train_runtime": 51960.0639, "train_tokens_per_second": 6312.821 }, { "epoch": 0.8753771791257957, "grad_norm": 0.21716679632663727, "learning_rate": 5e-06, "loss": 0.9419, "num_input_tokens_seen": 328476336, "step": 723, "train_runtime": 52035.9362, "train_tokens_per_second": 6312.49 }, { "epoch": 0.8765879359433971, "grad_norm": 0.238169863820076, "learning_rate": 5e-06, "loss": 0.9927, "num_input_tokens_seen": 328903336, "step": 724, "train_runtime": 52106.2514, "train_tokens_per_second": 6312.167 }, { "epoch": 0.8777986927609985, "grad_norm": 0.23495762050151825, "learning_rate": 5e-06, "loss": 0.9438, "num_input_tokens_seen": 329351280, "step": 725, "train_runtime": 52179.605, "train_tokens_per_second": 6311.878 }, { "epoch": 0.8790094495785998, "grad_norm": 0.2645426094532013, "learning_rate": 5e-06, "loss": 0.9552, "num_input_tokens_seen": 329810048, "step": 726, "train_runtime": 52255.2056, "train_tokens_per_second": 6311.525 }, { "epoch": 0.8802202063962012, "grad_norm": 0.2440696507692337, "learning_rate": 5e-06, "loss": 0.9443, "num_input_tokens_seen": 330266952, "step": 727, "train_runtime": 52330.1379, "train_tokens_per_second": 6311.219 }, { "epoch": 0.8814309632138027, "grad_norm": 0.28334546089172363, "learning_rate": 5e-06, "loss": 0.9749, "num_input_tokens_seen": 330725072, "step": 728, "train_runtime": 52405.845, "train_tokens_per_second": 6310.843 }, { "epoch": 0.882641720031404, "grad_norm": 0.25327134132385254, "learning_rate": 5e-06, "loss": 0.9531, "num_input_tokens_seen": 331165624, "step": 729, "train_runtime": 52477.7764, "train_tokens_per_second": 6310.588 }, { "epoch": 0.8838524768490054, "grad_norm": 0.23178185522556305, "learning_rate": 5e-06, "loss": 0.9606, "num_input_tokens_seen": 331625256, "step": 730, "train_runtime": 52553.8007, "train_tokens_per_second": 6310.205 }, { "epoch": 0.8850632336666068, "grad_norm": 0.23952724039554596, "learning_rate": 5e-06, "loss": 0.9585, "num_input_tokens_seen": 332060176, "step": 731, "train_runtime": 52625.0909, "train_tokens_per_second": 6309.921 }, { "epoch": 0.8862739904842082, "grad_norm": 0.23698952794075012, "learning_rate": 5e-06, "loss": 0.8797, "num_input_tokens_seen": 332509640, "step": 732, "train_runtime": 52699.071, "train_tokens_per_second": 6309.592 }, { "epoch": 0.8874847473018095, "grad_norm": 0.22318892180919647, "learning_rate": 5e-06, "loss": 0.9486, "num_input_tokens_seen": 332946752, "step": 733, "train_runtime": 52771.1507, "train_tokens_per_second": 6309.257 }, { "epoch": 0.8886955041194109, "grad_norm": 0.22661879658699036, "learning_rate": 5e-06, "loss": 0.9622, "num_input_tokens_seen": 333408976, "step": 734, "train_runtime": 52847.3029, "train_tokens_per_second": 6308.912 }, { "epoch": 0.8899062609370123, "grad_norm": 0.22661250829696655, "learning_rate": 5e-06, "loss": 1.0203, "num_input_tokens_seen": 333864040, "step": 735, "train_runtime": 52921.7765, "train_tokens_per_second": 6308.633 }, { "epoch": 0.8911170177546136, "grad_norm": 0.23365598917007446, "learning_rate": 5e-06, "loss": 1.0197, "num_input_tokens_seen": 334328000, "step": 736, "train_runtime": 52997.7307, "train_tokens_per_second": 6308.346 }, { "epoch": 0.892327774572215, "grad_norm": 0.25835007429122925, "learning_rate": 5e-06, "loss": 1.001, "num_input_tokens_seen": 334761336, "step": 737, "train_runtime": 53068.9411, "train_tokens_per_second": 6308.046 }, { "epoch": 0.8935385313898164, "grad_norm": 0.23492054641246796, "learning_rate": 5e-06, "loss": 0.9808, "num_input_tokens_seen": 335218464, "step": 738, "train_runtime": 53143.7831, "train_tokens_per_second": 6307.764 }, { "epoch": 0.8947492882074177, "grad_norm": 0.23521077632904053, "learning_rate": 5e-06, "loss": 0.9753, "num_input_tokens_seen": 335663288, "step": 739, "train_runtime": 53216.7657, "train_tokens_per_second": 6307.473 }, { "epoch": 0.8959600450250191, "grad_norm": 0.24233105778694153, "learning_rate": 5e-06, "loss": 0.9669, "num_input_tokens_seen": 336143112, "step": 740, "train_runtime": 53295.4261, "train_tokens_per_second": 6307.166 }, { "epoch": 0.8971708018426205, "grad_norm": 0.22769199311733246, "learning_rate": 5e-06, "loss": 0.9462, "num_input_tokens_seen": 336620632, "step": 741, "train_runtime": 53373.6272, "train_tokens_per_second": 6306.872 }, { "epoch": 0.898381558660222, "grad_norm": 0.2259906679391861, "learning_rate": 5e-06, "loss": 0.9976, "num_input_tokens_seen": 337087384, "step": 742, "train_runtime": 53450.3668, "train_tokens_per_second": 6306.55 }, { "epoch": 0.8995923154778233, "grad_norm": 0.24663208425045013, "learning_rate": 5e-06, "loss": 0.9739, "num_input_tokens_seen": 337516840, "step": 743, "train_runtime": 53521.0012, "train_tokens_per_second": 6306.25 }, { "epoch": 0.9008030722954247, "grad_norm": 0.24597318470478058, "learning_rate": 5e-06, "loss": 0.9405, "num_input_tokens_seen": 337978032, "step": 744, "train_runtime": 53596.7412, "train_tokens_per_second": 6305.944 }, { "epoch": 0.9020138291130261, "grad_norm": 0.24945300817489624, "learning_rate": 5e-06, "loss": 0.9487, "num_input_tokens_seen": 338429744, "step": 745, "train_runtime": 53671.4992, "train_tokens_per_second": 6305.576 }, { "epoch": 0.9032245859306274, "grad_norm": 0.26230573654174805, "learning_rate": 5e-06, "loss": 0.9708, "num_input_tokens_seen": 338908032, "step": 746, "train_runtime": 53750.8266, "train_tokens_per_second": 6305.169 }, { "epoch": 0.9044353427482288, "grad_norm": 0.23558348417282104, "learning_rate": 5e-06, "loss": 1.0117, "num_input_tokens_seen": 339376168, "step": 747, "train_runtime": 53828.3567, "train_tokens_per_second": 6304.784 }, { "epoch": 0.9056460995658302, "grad_norm": 0.23316293954849243, "learning_rate": 5e-06, "loss": 0.962, "num_input_tokens_seen": 339825168, "step": 748, "train_runtime": 53902.2806, "train_tokens_per_second": 6304.467 }, { "epoch": 0.9068568563834315, "grad_norm": 0.243992418050766, "learning_rate": 5e-06, "loss": 0.983, "num_input_tokens_seen": 340304336, "step": 749, "train_runtime": 53981.4498, "train_tokens_per_second": 6304.098 }, { "epoch": 0.9080676132010329, "grad_norm": 0.2598229646682739, "learning_rate": 5e-06, "loss": 0.9674, "num_input_tokens_seen": 340743064, "step": 750, "train_runtime": 54053.3316, "train_tokens_per_second": 6303.831 }, { "epoch": 0.9092783700186343, "grad_norm": 0.24613091349601746, "learning_rate": 5e-06, "loss": 0.9527, "num_input_tokens_seen": 341210112, "step": 751, "train_runtime": 54130.9437, "train_tokens_per_second": 6303.421 }, { "epoch": 0.9104891268362357, "grad_norm": 0.24349863827228546, "learning_rate": 5e-06, "loss": 0.9261, "num_input_tokens_seen": 341671160, "step": 752, "train_runtime": 54207.4785, "train_tokens_per_second": 6303.026 }, { "epoch": 0.911699883653837, "grad_norm": 0.22625428438186646, "learning_rate": 5e-06, "loss": 0.9711, "num_input_tokens_seen": 342126008, "step": 753, "train_runtime": 54282.3751, "train_tokens_per_second": 6302.709 }, { "epoch": 0.9129106404714384, "grad_norm": 0.2454047054052353, "learning_rate": 5e-06, "loss": 0.9962, "num_input_tokens_seen": 342589536, "step": 754, "train_runtime": 54359.1523, "train_tokens_per_second": 6302.334 }, { "epoch": 0.9141213972890398, "grad_norm": 0.2505525052547455, "learning_rate": 5e-06, "loss": 0.9888, "num_input_tokens_seen": 343050616, "step": 755, "train_runtime": 54434.9812, "train_tokens_per_second": 6302.025 }, { "epoch": 0.9153321541066411, "grad_norm": 0.24787583947181702, "learning_rate": 5e-06, "loss": 0.9694, "num_input_tokens_seen": 343502480, "step": 756, "train_runtime": 54509.554, "train_tokens_per_second": 6301.693 }, { "epoch": 0.9165429109242426, "grad_norm": 0.2458108365535736, "learning_rate": 5e-06, "loss": 0.9712, "num_input_tokens_seen": 343979832, "step": 757, "train_runtime": 54588.3291, "train_tokens_per_second": 6301.344 }, { "epoch": 0.917753667741844, "grad_norm": 0.2579139173030853, "learning_rate": 5e-06, "loss": 0.9966, "num_input_tokens_seen": 344423888, "step": 758, "train_runtime": 54661.823, "train_tokens_per_second": 6300.995 }, { "epoch": 0.9189644245594453, "grad_norm": 0.24939359724521637, "learning_rate": 5e-06, "loss": 0.9396, "num_input_tokens_seen": 344888696, "step": 759, "train_runtime": 54738.5951, "train_tokens_per_second": 6300.649 }, { "epoch": 0.9201751813770467, "grad_norm": 0.25321266055107117, "learning_rate": 5e-06, "loss": 0.9294, "num_input_tokens_seen": 345349032, "step": 760, "train_runtime": 54814.3689, "train_tokens_per_second": 6300.338 }, { "epoch": 0.9213859381946481, "grad_norm": 0.25097349286079407, "learning_rate": 5e-06, "loss": 0.9652, "num_input_tokens_seen": 345787136, "step": 761, "train_runtime": 54886.4068, "train_tokens_per_second": 6300.051 }, { "epoch": 0.9225966950122495, "grad_norm": 0.2374579906463623, "learning_rate": 5e-06, "loss": 0.9305, "num_input_tokens_seen": 346227736, "step": 762, "train_runtime": 54958.6615, "train_tokens_per_second": 6299.785 }, { "epoch": 0.9238074518298508, "grad_norm": 0.24115844070911407, "learning_rate": 5e-06, "loss": 0.9923, "num_input_tokens_seen": 346675680, "step": 763, "train_runtime": 55032.5683, "train_tokens_per_second": 6299.464 }, { "epoch": 0.9250182086474522, "grad_norm": 0.24648192524909973, "learning_rate": 5e-06, "loss": 0.9706, "num_input_tokens_seen": 347127480, "step": 764, "train_runtime": 55106.8858, "train_tokens_per_second": 6299.167 }, { "epoch": 0.9262289654650536, "grad_norm": 0.22947219014167786, "learning_rate": 5e-06, "loss": 1.0213, "num_input_tokens_seen": 347585736, "step": 765, "train_runtime": 55183.0077, "train_tokens_per_second": 6298.782 }, { "epoch": 0.9274397222826549, "grad_norm": 0.23317500948905945, "learning_rate": 5e-06, "loss": 0.9755, "num_input_tokens_seen": 348040224, "step": 766, "train_runtime": 55258.3609, "train_tokens_per_second": 6298.417 }, { "epoch": 0.9286504791002563, "grad_norm": 0.24681779742240906, "learning_rate": 5e-06, "loss": 0.9838, "num_input_tokens_seen": 348489688, "step": 767, "train_runtime": 55332.4442, "train_tokens_per_second": 6298.108 }, { "epoch": 0.9298612359178577, "grad_norm": 0.2522102892398834, "learning_rate": 5e-06, "loss": 1.0349, "num_input_tokens_seen": 348958600, "step": 768, "train_runtime": 55410.2472, "train_tokens_per_second": 6297.727 }, { "epoch": 0.931071992735459, "grad_norm": 0.2546612024307251, "learning_rate": 5e-06, "loss": 0.9432, "num_input_tokens_seen": 349397976, "step": 769, "train_runtime": 55482.6793, "train_tokens_per_second": 6297.424 }, { "epoch": 0.9322827495530605, "grad_norm": 0.2435491979122162, "learning_rate": 5e-06, "loss": 0.9651, "num_input_tokens_seen": 349854280, "step": 770, "train_runtime": 55558.3888, "train_tokens_per_second": 6297.056 }, { "epoch": 0.9334935063706619, "grad_norm": 0.2178066521883011, "learning_rate": 5e-06, "loss": 0.9513, "num_input_tokens_seen": 350322312, "step": 771, "train_runtime": 55636.2099, "train_tokens_per_second": 6296.66 }, { "epoch": 0.9347042631882633, "grad_norm": 0.23473484814167023, "learning_rate": 5e-06, "loss": 0.9474, "num_input_tokens_seen": 350791976, "step": 772, "train_runtime": 55713.2299, "train_tokens_per_second": 6296.386 }, { "epoch": 0.9359150200058646, "grad_norm": 0.2700430750846863, "learning_rate": 5e-06, "loss": 0.9686, "num_input_tokens_seen": 351249760, "step": 773, "train_runtime": 55789.3346, "train_tokens_per_second": 6296.002 }, { "epoch": 0.937125776823466, "grad_norm": 0.22990594804286957, "learning_rate": 5e-06, "loss": 0.9592, "num_input_tokens_seen": 351709136, "step": 774, "train_runtime": 55865.6502, "train_tokens_per_second": 6295.624 }, { "epoch": 0.9383365336410674, "grad_norm": 0.23456795513629913, "learning_rate": 5e-06, "loss": 0.9531, "num_input_tokens_seen": 352185984, "step": 775, "train_runtime": 55944.6884, "train_tokens_per_second": 6295.253 }, { "epoch": 0.9395472904586687, "grad_norm": 0.2309848964214325, "learning_rate": 5e-06, "loss": 0.9881, "num_input_tokens_seen": 352631888, "step": 776, "train_runtime": 56018.3574, "train_tokens_per_second": 6294.934 }, { "epoch": 0.9407580472762701, "grad_norm": 0.2821614146232605, "learning_rate": 5e-06, "loss": 1.0289, "num_input_tokens_seen": 353056656, "step": 777, "train_runtime": 56088.0171, "train_tokens_per_second": 6294.69 }, { "epoch": 0.9419688040938715, "grad_norm": 0.24919262528419495, "learning_rate": 5e-06, "loss": 0.9536, "num_input_tokens_seen": 353529936, "step": 778, "train_runtime": 56166.3971, "train_tokens_per_second": 6294.332 }, { "epoch": 0.9431795609114728, "grad_norm": 0.23871028423309326, "learning_rate": 5e-06, "loss": 0.9919, "num_input_tokens_seen": 353990320, "step": 779, "train_runtime": 56242.127, "train_tokens_per_second": 6294.042 }, { "epoch": 0.9443903177290742, "grad_norm": 0.23189355432987213, "learning_rate": 5e-06, "loss": 0.9835, "num_input_tokens_seen": 354452392, "step": 780, "train_runtime": 56318.8627, "train_tokens_per_second": 6293.671 }, { "epoch": 0.9456010745466756, "grad_norm": 0.2740236520767212, "learning_rate": 5e-06, "loss": 0.9532, "num_input_tokens_seen": 354899024, "step": 781, "train_runtime": 56393.0697, "train_tokens_per_second": 6293.309 }, { "epoch": 0.946811831364277, "grad_norm": 0.2556408643722534, "learning_rate": 5e-06, "loss": 0.9444, "num_input_tokens_seen": 355343056, "step": 782, "train_runtime": 56466.3117, "train_tokens_per_second": 6293.01 }, { "epoch": 0.9480225881818783, "grad_norm": 0.23555780947208405, "learning_rate": 5e-06, "loss": 0.9397, "num_input_tokens_seen": 355818304, "step": 783, "train_runtime": 56545.0247, "train_tokens_per_second": 6292.654 }, { "epoch": 0.9492333449994798, "grad_norm": 0.25604984164237976, "learning_rate": 5e-06, "loss": 0.9677, "num_input_tokens_seen": 356251568, "step": 784, "train_runtime": 56616.3994, "train_tokens_per_second": 6292.374 }, { "epoch": 0.9504441018170812, "grad_norm": 0.24111999571323395, "learning_rate": 5e-06, "loss": 0.957, "num_input_tokens_seen": 356696296, "step": 785, "train_runtime": 56689.6311, "train_tokens_per_second": 6292.091 }, { "epoch": 0.9516548586346825, "grad_norm": 0.22817663848400116, "learning_rate": 5e-06, "loss": 0.9279, "num_input_tokens_seen": 357149968, "step": 786, "train_runtime": 56764.8748, "train_tokens_per_second": 6291.742 }, { "epoch": 0.9528656154522839, "grad_norm": 0.256910115480423, "learning_rate": 5e-06, "loss": 0.9302, "num_input_tokens_seen": 357599288, "step": 787, "train_runtime": 56839.306, "train_tokens_per_second": 6291.408 }, { "epoch": 0.9540763722698853, "grad_norm": 0.2196292132139206, "learning_rate": 5e-06, "loss": 0.9459, "num_input_tokens_seen": 358069328, "step": 788, "train_runtime": 56917.2118, "train_tokens_per_second": 6291.055 }, { "epoch": 0.9552871290874866, "grad_norm": 0.22421136498451233, "learning_rate": 5e-06, "loss": 0.9084, "num_input_tokens_seen": 358534160, "step": 789, "train_runtime": 56994.4255, "train_tokens_per_second": 6290.688 }, { "epoch": 0.956497885905088, "grad_norm": 0.22506392002105713, "learning_rate": 5e-06, "loss": 0.8446, "num_input_tokens_seen": 358973624, "step": 790, "train_runtime": 57067.011, "train_tokens_per_second": 6290.388 }, { "epoch": 0.9577086427226894, "grad_norm": 0.2432793378829956, "learning_rate": 5e-06, "loss": 0.9813, "num_input_tokens_seen": 359419408, "step": 791, "train_runtime": 57140.9306, "train_tokens_per_second": 6290.052 }, { "epoch": 0.9589193995402908, "grad_norm": 0.2352157086133957, "learning_rate": 5e-06, "loss": 0.9851, "num_input_tokens_seen": 359883088, "step": 792, "train_runtime": 57218.0451, "train_tokens_per_second": 6289.678 }, { "epoch": 0.9601301563578921, "grad_norm": 0.2471296638250351, "learning_rate": 5e-06, "loss": 0.9857, "num_input_tokens_seen": 360354088, "step": 793, "train_runtime": 57295.7766, "train_tokens_per_second": 6289.366 }, { "epoch": 0.9613409131754935, "grad_norm": 0.24908725917339325, "learning_rate": 5e-06, "loss": 0.9422, "num_input_tokens_seen": 360816256, "step": 794, "train_runtime": 57371.6725, "train_tokens_per_second": 6289.101 }, { "epoch": 0.9625516699930949, "grad_norm": 0.24670016765594482, "learning_rate": 5e-06, "loss": 0.9673, "num_input_tokens_seen": 361274880, "step": 795, "train_runtime": 57447.7271, "train_tokens_per_second": 6288.758 }, { "epoch": 0.9637624268106962, "grad_norm": 0.23842549324035645, "learning_rate": 5e-06, "loss": 0.9775, "num_input_tokens_seen": 361733008, "step": 796, "train_runtime": 57523.5615, "train_tokens_per_second": 6288.432 }, { "epoch": 0.9649731836282976, "grad_norm": 0.24963422119617462, "learning_rate": 5e-06, "loss": 0.9221, "num_input_tokens_seen": 362200552, "step": 797, "train_runtime": 57601.2521, "train_tokens_per_second": 6288.067 }, { "epoch": 0.9661839404458991, "grad_norm": 0.2490622103214264, "learning_rate": 5e-06, "loss": 0.9485, "num_input_tokens_seen": 362658336, "step": 798, "train_runtime": 57676.4994, "train_tokens_per_second": 6287.801 }, { "epoch": 0.9673946972635004, "grad_norm": 0.2377602905035019, "learning_rate": 5e-06, "loss": 0.9424, "num_input_tokens_seen": 363103008, "step": 799, "train_runtime": 57749.703, "train_tokens_per_second": 6287.53 }, { "epoch": 0.9686054540811018, "grad_norm": 0.24257516860961914, "learning_rate": 5e-06, "loss": 0.9765, "num_input_tokens_seen": 363561496, "step": 800, "train_runtime": 57825.6172, "train_tokens_per_second": 6287.205 }, { "epoch": 0.9698162108987032, "grad_norm": 0.22745341062545776, "learning_rate": 5e-06, "loss": 0.9451, "num_input_tokens_seen": 364027560, "step": 801, "train_runtime": 57902.6387, "train_tokens_per_second": 6286.891 }, { "epoch": 0.9710269677163046, "grad_norm": 0.24128001928329468, "learning_rate": 5e-06, "loss": 0.9569, "num_input_tokens_seen": 364476736, "step": 802, "train_runtime": 57977.0904, "train_tokens_per_second": 6286.565 }, { "epoch": 0.9722377245339059, "grad_norm": 0.2616693675518036, "learning_rate": 5e-06, "loss": 1.0019, "num_input_tokens_seen": 364907784, "step": 803, "train_runtime": 58048.2061, "train_tokens_per_second": 6286.289 }, { "epoch": 0.9734484813515073, "grad_norm": 0.2624351680278778, "learning_rate": 5e-06, "loss": 0.9582, "num_input_tokens_seen": 365354112, "step": 804, "train_runtime": 58121.7175, "train_tokens_per_second": 6286.017 }, { "epoch": 0.9746592381691087, "grad_norm": 0.24158768355846405, "learning_rate": 5e-06, "loss": 0.9769, "num_input_tokens_seen": 365795992, "step": 805, "train_runtime": 58194.7377, "train_tokens_per_second": 6285.723 }, { "epoch": 0.97586999498671, "grad_norm": 0.23048560321331024, "learning_rate": 5e-06, "loss": 0.955, "num_input_tokens_seen": 366247480, "step": 806, "train_runtime": 58269.4576, "train_tokens_per_second": 6285.411 }, { "epoch": 0.9770807518043114, "grad_norm": 0.23612691462039948, "learning_rate": 5e-06, "loss": 0.969, "num_input_tokens_seen": 366707864, "step": 807, "train_runtime": 58345.6241, "train_tokens_per_second": 6285.096 }, { "epoch": 0.9782915086219128, "grad_norm": 0.23956720530986786, "learning_rate": 5e-06, "loss": 0.9427, "num_input_tokens_seen": 367171912, "step": 808, "train_runtime": 58421.8288, "train_tokens_per_second": 6284.841 }, { "epoch": 0.9795022654395141, "grad_norm": 0.2306690812110901, "learning_rate": 5e-06, "loss": 0.9312, "num_input_tokens_seen": 367626528, "step": 809, "train_runtime": 58497.4965, "train_tokens_per_second": 6284.483 }, { "epoch": 0.9807130222571155, "grad_norm": 0.23108424246311188, "learning_rate": 5e-06, "loss": 0.9354, "num_input_tokens_seen": 368110672, "step": 810, "train_runtime": 58577.9152, "train_tokens_per_second": 6284.12 }, { "epoch": 0.981923779074717, "grad_norm": 0.2248297929763794, "learning_rate": 5e-06, "loss": 0.9036, "num_input_tokens_seen": 368567152, "step": 811, "train_runtime": 58653.4012, "train_tokens_per_second": 6283.816 }, { "epoch": 0.9831345358923184, "grad_norm": 0.24311695992946625, "learning_rate": 5e-06, "loss": 1.0502, "num_input_tokens_seen": 369025104, "step": 812, "train_runtime": 58729.1413, "train_tokens_per_second": 6283.509 }, { "epoch": 0.9843452927099197, "grad_norm": 0.24215175211429596, "learning_rate": 5e-06, "loss": 0.9737, "num_input_tokens_seen": 369475024, "step": 813, "train_runtime": 58803.2136, "train_tokens_per_second": 6283.245 }, { "epoch": 0.9855560495275211, "grad_norm": 0.253462016582489, "learning_rate": 5e-06, "loss": 0.9832, "num_input_tokens_seen": 369906600, "step": 814, "train_runtime": 58874.4874, "train_tokens_per_second": 6282.969 }, { "epoch": 0.9867668063451225, "grad_norm": 0.23864710330963135, "learning_rate": 5e-06, "loss": 0.9919, "num_input_tokens_seen": 370367360, "step": 815, "train_runtime": 58950.3738, "train_tokens_per_second": 6282.697 }, { "epoch": 0.9879775631627238, "grad_norm": 0.26924240589141846, "learning_rate": 5e-06, "loss": 0.9717, "num_input_tokens_seen": 370837544, "step": 816, "train_runtime": 59028.3463, "train_tokens_per_second": 6282.364 }, { "epoch": 0.9891883199803252, "grad_norm": 0.25375184416770935, "learning_rate": 5e-06, "loss": 0.9292, "num_input_tokens_seen": 371321120, "step": 817, "train_runtime": 59108.6748, "train_tokens_per_second": 6282.007 }, { "epoch": 0.9903990767979266, "grad_norm": 0.24142777919769287, "learning_rate": 5e-06, "loss": 0.9723, "num_input_tokens_seen": 371770736, "step": 818, "train_runtime": 59183.2687, "train_tokens_per_second": 6281.686 }, { "epoch": 0.9916098336155279, "grad_norm": 0.2367551475763321, "learning_rate": 5e-06, "loss": 1.0059, "num_input_tokens_seen": 372212144, "step": 819, "train_runtime": 59256.3298, "train_tokens_per_second": 6281.39 }, { "epoch": 0.9928205904331293, "grad_norm": 0.2153656780719757, "learning_rate": 5e-06, "loss": 0.922, "num_input_tokens_seen": 372689824, "step": 820, "train_runtime": 59335.7827, "train_tokens_per_second": 6281.03 }, { "epoch": 0.9940313472507307, "grad_norm": 0.25366196036338806, "learning_rate": 5e-06, "loss": 0.9453, "num_input_tokens_seen": 373132840, "step": 821, "train_runtime": 59408.7438, "train_tokens_per_second": 6280.773 }, { "epoch": 0.9952421040683321, "grad_norm": 0.2794412076473236, "learning_rate": 5e-06, "loss": 0.9247, "num_input_tokens_seen": 373539032, "step": 822, "train_runtime": 59475.4747, "train_tokens_per_second": 6280.556 }, { "epoch": 0.9964528608859334, "grad_norm": 0.24487674236297607, "learning_rate": 5e-06, "loss": 0.9423, "num_input_tokens_seen": 374024816, "step": 823, "train_runtime": 59555.7552, "train_tokens_per_second": 6280.246 }, { "epoch": 0.9976636177035348, "grad_norm": 0.2563667595386505, "learning_rate": 5e-06, "loss": 1.0405, "num_input_tokens_seen": 374474376, "step": 824, "train_runtime": 59629.3553, "train_tokens_per_second": 6280.034 }, { "epoch": 0.9988743745211363, "grad_norm": 0.23731544613838196, "learning_rate": 5e-06, "loss": 0.9858, "num_input_tokens_seen": 374932816, "step": 825, "train_runtime": 59705.0846, "train_tokens_per_second": 6279.747 }, { "epoch": 1.0, "grad_norm": 0.2806185185909271, "learning_rate": 5e-06, "loss": 0.9485, "num_input_tokens_seen": 375383896, "step": 826, "train_runtime": 59779.2015, "train_tokens_per_second": 6279.507 }, { "epoch": 1.0012107568176014, "grad_norm": 0.32343029975891113, "learning_rate": 5e-06, "loss": 0.9351, "num_input_tokens_seen": 375812944, "step": 827, "train_runtime": 59849.0654, "train_tokens_per_second": 6279.345 }, { "epoch": 1.0024215136352028, "grad_norm": 0.26928800344467163, "learning_rate": 5e-06, "loss": 0.9809, "num_input_tokens_seen": 376259432, "step": 828, "train_runtime": 59921.9342, "train_tokens_per_second": 6279.16 }, { "epoch": 1.0036322704528042, "grad_norm": 0.25450897216796875, "learning_rate": 5e-06, "loss": 0.9789, "num_input_tokens_seen": 376714864, "step": 829, "train_runtime": 59996.7049, "train_tokens_per_second": 6278.926 }, { "epoch": 1.0048430272704054, "grad_norm": 0.28886231780052185, "learning_rate": 5e-06, "loss": 0.9311, "num_input_tokens_seen": 377169072, "step": 830, "train_runtime": 60071.6622, "train_tokens_per_second": 6278.652 }, { "epoch": 1.0060537840880068, "grad_norm": 0.24842868745326996, "learning_rate": 5e-06, "loss": 0.962, "num_input_tokens_seen": 377620616, "step": 831, "train_runtime": 60145.504, "train_tokens_per_second": 6278.451 }, { "epoch": 1.0072645409056082, "grad_norm": 0.25559040904045105, "learning_rate": 5e-06, "loss": 0.9614, "num_input_tokens_seen": 378076536, "step": 832, "train_runtime": 60220.5226, "train_tokens_per_second": 6278.201 }, { "epoch": 1.0084752977232097, "grad_norm": 0.25331735610961914, "learning_rate": 5e-06, "loss": 1.0003, "num_input_tokens_seen": 378514920, "step": 833, "train_runtime": 60292.2197, "train_tokens_per_second": 6278.006 }, { "epoch": 1.009686054540811, "grad_norm": 0.24362653493881226, "learning_rate": 5e-06, "loss": 0.9511, "num_input_tokens_seen": 378977264, "step": 834, "train_runtime": 60368.2898, "train_tokens_per_second": 6277.754 }, { "epoch": 1.0108968113584125, "grad_norm": 0.28384852409362793, "learning_rate": 5e-06, "loss": 0.9329, "num_input_tokens_seen": 379436816, "step": 835, "train_runtime": 60444.0882, "train_tokens_per_second": 6277.484 }, { "epoch": 1.0121075681760137, "grad_norm": 0.2487291693687439, "learning_rate": 5e-06, "loss": 0.9456, "num_input_tokens_seen": 379905488, "step": 836, "train_runtime": 60521.0892, "train_tokens_per_second": 6277.241 }, { "epoch": 1.013318324993615, "grad_norm": 0.23668697476387024, "learning_rate": 5e-06, "loss": 0.9173, "num_input_tokens_seen": 380351432, "step": 837, "train_runtime": 60594.3175, "train_tokens_per_second": 6277.015 }, { "epoch": 1.0145290818112165, "grad_norm": 0.325173944234848, "learning_rate": 5e-06, "loss": 0.9809, "num_input_tokens_seen": 380803728, "step": 838, "train_runtime": 60668.8616, "train_tokens_per_second": 6276.757 }, { "epoch": 1.015739838628818, "grad_norm": 0.23116804659366608, "learning_rate": 5e-06, "loss": 0.9244, "num_input_tokens_seen": 381263144, "step": 839, "train_runtime": 60744.5507, "train_tokens_per_second": 6276.5 }, { "epoch": 1.0169505954464193, "grad_norm": 0.23826251924037933, "learning_rate": 5e-06, "loss": 0.9382, "num_input_tokens_seen": 381700768, "step": 840, "train_runtime": 60816.1107, "train_tokens_per_second": 6276.31 }, { "epoch": 1.0181613522640207, "grad_norm": 0.2259143888950348, "learning_rate": 5e-06, "loss": 0.9372, "num_input_tokens_seen": 382148144, "step": 841, "train_runtime": 60889.5245, "train_tokens_per_second": 6276.09 }, { "epoch": 1.0193721090816221, "grad_norm": 0.254041463136673, "learning_rate": 5e-06, "loss": 0.9983, "num_input_tokens_seen": 382596744, "step": 842, "train_runtime": 60963.3714, "train_tokens_per_second": 6275.846 }, { "epoch": 1.0205828658992233, "grad_norm": 0.2320503443479538, "learning_rate": 5e-06, "loss": 0.9858, "num_input_tokens_seen": 383049456, "step": 843, "train_runtime": 61037.609, "train_tokens_per_second": 6275.63 }, { "epoch": 1.0217936227168247, "grad_norm": 0.2377566397190094, "learning_rate": 5e-06, "loss": 0.9202, "num_input_tokens_seen": 383507960, "step": 844, "train_runtime": 61113.2522, "train_tokens_per_second": 6275.365 }, { "epoch": 1.0230043795344261, "grad_norm": 0.23518335819244385, "learning_rate": 5e-06, "loss": 0.9178, "num_input_tokens_seen": 383957320, "step": 845, "train_runtime": 61187.3572, "train_tokens_per_second": 6275.109 }, { "epoch": 1.0242151363520275, "grad_norm": 0.2533697187900543, "learning_rate": 5e-06, "loss": 0.9678, "num_input_tokens_seen": 384423264, "step": 846, "train_runtime": 61264.2471, "train_tokens_per_second": 6274.839 }, { "epoch": 1.025425893169629, "grad_norm": 0.23266910016536713, "learning_rate": 5e-06, "loss": 0.9683, "num_input_tokens_seen": 384880984, "step": 847, "train_runtime": 61339.9009, "train_tokens_per_second": 6274.562 }, { "epoch": 1.0266366499872304, "grad_norm": 0.26946571469306946, "learning_rate": 5e-06, "loss": 0.9402, "num_input_tokens_seen": 385329376, "step": 848, "train_runtime": 61413.5181, "train_tokens_per_second": 6274.341 }, { "epoch": 1.0278474068048318, "grad_norm": 0.24856071174144745, "learning_rate": 5e-06, "loss": 0.9894, "num_input_tokens_seen": 385777592, "step": 849, "train_runtime": 61487.4406, "train_tokens_per_second": 6274.088 }, { "epoch": 1.029058163622433, "grad_norm": 0.2351611852645874, "learning_rate": 5e-06, "loss": 1.0025, "num_input_tokens_seen": 386234720, "step": 850, "train_runtime": 61562.9352, "train_tokens_per_second": 6273.819 }, { "epoch": 1.0302689204400344, "grad_norm": 0.2401961088180542, "learning_rate": 5e-06, "loss": 0.9382, "num_input_tokens_seen": 386694424, "step": 851, "train_runtime": 61638.1399, "train_tokens_per_second": 6273.623 }, { "epoch": 1.0314796772576358, "grad_norm": 0.22459951043128967, "learning_rate": 5e-06, "loss": 0.9601, "num_input_tokens_seen": 387157680, "step": 852, "train_runtime": 61714.3484, "train_tokens_per_second": 6273.382 }, { "epoch": 1.0326904340752372, "grad_norm": 0.234735906124115, "learning_rate": 5e-06, "loss": 0.9919, "num_input_tokens_seen": 387600544, "step": 853, "train_runtime": 61787.81, "train_tokens_per_second": 6273.091 }, { "epoch": 1.0339011908928386, "grad_norm": 0.25244709849357605, "learning_rate": 5e-06, "loss": 0.9456, "num_input_tokens_seen": 388051704, "step": 854, "train_runtime": 61862.1932, "train_tokens_per_second": 6272.841 }, { "epoch": 1.03511194771044, "grad_norm": 0.2344299554824829, "learning_rate": 5e-06, "loss": 0.9304, "num_input_tokens_seen": 388502672, "step": 855, "train_runtime": 61936.0349, "train_tokens_per_second": 6272.644 }, { "epoch": 1.0363227045280414, "grad_norm": 0.23790518939495087, "learning_rate": 5e-06, "loss": 0.9557, "num_input_tokens_seen": 388966360, "step": 856, "train_runtime": 62013.0899, "train_tokens_per_second": 6272.327 }, { "epoch": 1.0375334613456426, "grad_norm": 0.227335587143898, "learning_rate": 5e-06, "loss": 0.9626, "num_input_tokens_seen": 389459840, "step": 857, "train_runtime": 62094.4481, "train_tokens_per_second": 6272.056 }, { "epoch": 1.038744218163244, "grad_norm": 0.24627360701560974, "learning_rate": 5e-06, "loss": 0.9593, "num_input_tokens_seen": 389920728, "step": 858, "train_runtime": 62170.1606, "train_tokens_per_second": 6271.831 }, { "epoch": 1.0399549749808454, "grad_norm": 0.23155222833156586, "learning_rate": 5e-06, "loss": 0.9678, "num_input_tokens_seen": 390406000, "step": 859, "train_runtime": 62250.3499, "train_tokens_per_second": 6271.547 }, { "epoch": 1.0411657317984468, "grad_norm": 0.24751697480678558, "learning_rate": 5e-06, "loss": 0.8877, "num_input_tokens_seen": 390852104, "step": 860, "train_runtime": 62324.5278, "train_tokens_per_second": 6271.241 }, { "epoch": 1.0423764886160483, "grad_norm": 0.24071338772773743, "learning_rate": 5e-06, "loss": 0.9907, "num_input_tokens_seen": 391293032, "step": 861, "train_runtime": 62397.4491, "train_tokens_per_second": 6270.978 }, { "epoch": 1.0435872454336497, "grad_norm": 0.22940731048583984, "learning_rate": 5e-06, "loss": 0.9161, "num_input_tokens_seen": 391770176, "step": 862, "train_runtime": 62476.2215, "train_tokens_per_second": 6270.709 }, { "epoch": 1.0447980022512509, "grad_norm": 0.2349405437707901, "learning_rate": 5e-06, "loss": 0.9215, "num_input_tokens_seen": 392244600, "step": 863, "train_runtime": 62555.2897, "train_tokens_per_second": 6270.367 }, { "epoch": 1.0460087590688523, "grad_norm": 0.24631568789482117, "learning_rate": 5e-06, "loss": 1.0027, "num_input_tokens_seen": 392696832, "step": 864, "train_runtime": 62629.7506, "train_tokens_per_second": 6270.132 }, { "epoch": 1.0472195158864537, "grad_norm": 0.22788004577159882, "learning_rate": 5e-06, "loss": 0.9622, "num_input_tokens_seen": 393167904, "step": 865, "train_runtime": 62707.7818, "train_tokens_per_second": 6269.842 }, { "epoch": 1.048430272704055, "grad_norm": 0.25337284803390503, "learning_rate": 5e-06, "loss": 0.9452, "num_input_tokens_seen": 393614280, "step": 866, "train_runtime": 62781.9156, "train_tokens_per_second": 6269.549 }, { "epoch": 1.0496410295216565, "grad_norm": 0.24765488505363464, "learning_rate": 5e-06, "loss": 0.9575, "num_input_tokens_seen": 394048632, "step": 867, "train_runtime": 62853.869, "train_tokens_per_second": 6269.282 }, { "epoch": 1.050851786339258, "grad_norm": 0.2693709135055542, "learning_rate": 5e-06, "loss": 0.922, "num_input_tokens_seen": 394509160, "step": 868, "train_runtime": 62930.1041, "train_tokens_per_second": 6269.005 }, { "epoch": 1.0520625431568593, "grad_norm": 0.2373555600643158, "learning_rate": 5e-06, "loss": 0.9446, "num_input_tokens_seen": 394987880, "step": 869, "train_runtime": 63009.5701, "train_tokens_per_second": 6268.697 }, { "epoch": 1.0532732999744605, "grad_norm": 0.22769400477409363, "learning_rate": 5e-06, "loss": 0.9221, "num_input_tokens_seen": 395451368, "step": 870, "train_runtime": 63086.1339, "train_tokens_per_second": 6268.436 }, { "epoch": 1.054484056792062, "grad_norm": 0.27482476830482483, "learning_rate": 5e-06, "loss": 1.0033, "num_input_tokens_seen": 395906456, "step": 871, "train_runtime": 63161.4527, "train_tokens_per_second": 6268.166 }, { "epoch": 1.0556948136096633, "grad_norm": 0.3092348873615265, "learning_rate": 5e-06, "loss": 0.925, "num_input_tokens_seen": 396368728, "step": 872, "train_runtime": 63237.3574, "train_tokens_per_second": 6267.952 }, { "epoch": 1.0569055704272647, "grad_norm": 0.24406789243221283, "learning_rate": 5e-06, "loss": 0.9352, "num_input_tokens_seen": 396819384, "step": 873, "train_runtime": 63311.4228, "train_tokens_per_second": 6267.738 }, { "epoch": 1.0581163272448662, "grad_norm": 0.23081360757350922, "learning_rate": 5e-06, "loss": 0.9675, "num_input_tokens_seen": 397266008, "step": 874, "train_runtime": 63385.1132, "train_tokens_per_second": 6267.497 }, { "epoch": 1.0593270840624676, "grad_norm": 0.23777136206626892, "learning_rate": 5e-06, "loss": 0.9109, "num_input_tokens_seen": 397710296, "step": 875, "train_runtime": 63458.1334, "train_tokens_per_second": 6267.286 }, { "epoch": 1.0605378408800687, "grad_norm": 0.27890682220458984, "learning_rate": 5e-06, "loss": 0.9501, "num_input_tokens_seen": 398160280, "step": 876, "train_runtime": 63532.2973, "train_tokens_per_second": 6267.053 }, { "epoch": 1.0617485976976702, "grad_norm": 0.31578439474105835, "learning_rate": 5e-06, "loss": 0.9675, "num_input_tokens_seen": 398617704, "step": 877, "train_runtime": 63607.8411, "train_tokens_per_second": 6266.801 }, { "epoch": 1.0629593545152716, "grad_norm": 0.265449583530426, "learning_rate": 5e-06, "loss": 0.9622, "num_input_tokens_seen": 399065392, "step": 878, "train_runtime": 63681.9925, "train_tokens_per_second": 6266.534 }, { "epoch": 1.064170111332873, "grad_norm": 0.23809348046779633, "learning_rate": 5e-06, "loss": 0.9394, "num_input_tokens_seen": 399511384, "step": 879, "train_runtime": 63755.2338, "train_tokens_per_second": 6266.331 }, { "epoch": 1.0653808681504744, "grad_norm": 0.23853924870491028, "learning_rate": 5e-06, "loss": 0.9333, "num_input_tokens_seen": 399962128, "step": 880, "train_runtime": 63829.5915, "train_tokens_per_second": 6266.093 }, { "epoch": 1.0665916249680758, "grad_norm": 0.2612011432647705, "learning_rate": 5e-06, "loss": 0.9688, "num_input_tokens_seen": 400415080, "step": 881, "train_runtime": 63904.2631, "train_tokens_per_second": 6265.859 }, { "epoch": 1.0678023817856772, "grad_norm": 0.24397185444831848, "learning_rate": 5e-06, "loss": 0.9522, "num_input_tokens_seen": 400891320, "step": 882, "train_runtime": 63982.8326, "train_tokens_per_second": 6265.608 }, { "epoch": 1.0690131386032784, "grad_norm": 0.22875207662582397, "learning_rate": 5e-06, "loss": 0.8692, "num_input_tokens_seen": 401342120, "step": 883, "train_runtime": 64057.1181, "train_tokens_per_second": 6265.379 }, { "epoch": 1.0702238954208798, "grad_norm": 0.2462654709815979, "learning_rate": 5e-06, "loss": 0.9387, "num_input_tokens_seen": 401803896, "step": 884, "train_runtime": 64132.7561, "train_tokens_per_second": 6265.19 }, { "epoch": 1.0714346522384812, "grad_norm": 0.24718287587165833, "learning_rate": 5e-06, "loss": 0.9991, "num_input_tokens_seen": 402272664, "step": 885, "train_runtime": 64209.9502, "train_tokens_per_second": 6264.958 }, { "epoch": 1.0726454090560826, "grad_norm": 0.24072563648223877, "learning_rate": 5e-06, "loss": 0.9287, "num_input_tokens_seen": 402723056, "step": 886, "train_runtime": 64284.2525, "train_tokens_per_second": 6264.723 }, { "epoch": 1.073856165873684, "grad_norm": 0.2594250440597534, "learning_rate": 5e-06, "loss": 0.96, "num_input_tokens_seen": 403187280, "step": 887, "train_runtime": 64360.8895, "train_tokens_per_second": 6264.477 }, { "epoch": 1.0750669226912855, "grad_norm": 0.23461049795150757, "learning_rate": 5e-06, "loss": 0.9394, "num_input_tokens_seen": 403634976, "step": 888, "train_runtime": 64434.6937, "train_tokens_per_second": 6264.249 }, { "epoch": 1.0762776795088869, "grad_norm": 0.26398470997810364, "learning_rate": 5e-06, "loss": 0.9979, "num_input_tokens_seen": 404074752, "step": 889, "train_runtime": 64506.9752, "train_tokens_per_second": 6264.047 }, { "epoch": 1.077488436326488, "grad_norm": 0.22275783121585846, "learning_rate": 5e-06, "loss": 0.9295, "num_input_tokens_seen": 404526048, "step": 890, "train_runtime": 64581.6885, "train_tokens_per_second": 6263.789 }, { "epoch": 1.0786991931440895, "grad_norm": 0.24403129518032074, "learning_rate": 5e-06, "loss": 0.988, "num_input_tokens_seen": 404977352, "step": 891, "train_runtime": 64656.3632, "train_tokens_per_second": 6263.534 }, { "epoch": 1.0799099499616909, "grad_norm": 0.22611185908317566, "learning_rate": 5e-06, "loss": 0.9323, "num_input_tokens_seen": 405472656, "step": 892, "train_runtime": 64738.2809, "train_tokens_per_second": 6263.26 }, { "epoch": 1.0811207067792923, "grad_norm": 0.24935585260391235, "learning_rate": 5e-06, "loss": 0.9754, "num_input_tokens_seen": 405915216, "step": 893, "train_runtime": 64811.4632, "train_tokens_per_second": 6263.016 }, { "epoch": 1.0823314635968937, "grad_norm": 0.23537464439868927, "learning_rate": 5e-06, "loss": 0.9882, "num_input_tokens_seen": 406358328, "step": 894, "train_runtime": 64884.6774, "train_tokens_per_second": 6262.778 }, { "epoch": 1.083542220414495, "grad_norm": 0.25859230756759644, "learning_rate": 5e-06, "loss": 1.0094, "num_input_tokens_seen": 406792080, "step": 895, "train_runtime": 64956.3923, "train_tokens_per_second": 6262.541 }, { "epoch": 1.0847529772320965, "grad_norm": 0.2601807117462158, "learning_rate": 5e-06, "loss": 1.0062, "num_input_tokens_seen": 407236568, "step": 896, "train_runtime": 65029.8834, "train_tokens_per_second": 6262.299 }, { "epoch": 1.0859637340496977, "grad_norm": 0.25152677297592163, "learning_rate": 5e-06, "loss": 0.9604, "num_input_tokens_seen": 407694016, "step": 897, "train_runtime": 65105.3988, "train_tokens_per_second": 6262.062 }, { "epoch": 1.0871744908672991, "grad_norm": 0.2490074634552002, "learning_rate": 5e-06, "loss": 0.9767, "num_input_tokens_seen": 408139040, "step": 898, "train_runtime": 65178.6583, "train_tokens_per_second": 6261.851 }, { "epoch": 1.0883852476849005, "grad_norm": 0.2619398534297943, "learning_rate": 5e-06, "loss": 0.9195, "num_input_tokens_seen": 408609256, "step": 899, "train_runtime": 65255.5022, "train_tokens_per_second": 6261.683 }, { "epoch": 1.089596004502502, "grad_norm": 0.22217896580696106, "learning_rate": 5e-06, "loss": 0.9578, "num_input_tokens_seen": 409081904, "step": 900, "train_runtime": 65333.4837, "train_tokens_per_second": 6261.443 }, { "epoch": 1.0908067613201033, "grad_norm": 0.26266419887542725, "learning_rate": 5e-06, "loss": 0.9423, "num_input_tokens_seen": 409534064, "step": 901, "train_runtime": 65407.6873, "train_tokens_per_second": 6261.253 }, { "epoch": 1.0920175181377048, "grad_norm": 0.23616282641887665, "learning_rate": 5e-06, "loss": 0.9667, "num_input_tokens_seen": 410013440, "step": 902, "train_runtime": 65486.7075, "train_tokens_per_second": 6261.018 }, { "epoch": 1.093228274955306, "grad_norm": 0.2340526580810547, "learning_rate": 5e-06, "loss": 0.9399, "num_input_tokens_seen": 410466096, "step": 903, "train_runtime": 65561.0012, "train_tokens_per_second": 6260.827 }, { "epoch": 1.0944390317729074, "grad_norm": 0.22588470578193665, "learning_rate": 5e-06, "loss": 0.9486, "num_input_tokens_seen": 410910864, "step": 904, "train_runtime": 65634.0777, "train_tokens_per_second": 6260.633 }, { "epoch": 1.0956497885905088, "grad_norm": 0.22636951506137848, "learning_rate": 5e-06, "loss": 0.934, "num_input_tokens_seen": 411365264, "step": 905, "train_runtime": 65708.6219, "train_tokens_per_second": 6260.446 }, { "epoch": 1.0968605454081102, "grad_norm": 0.2439277172088623, "learning_rate": 5e-06, "loss": 0.8909, "num_input_tokens_seen": 411829656, "step": 906, "train_runtime": 65785.1231, "train_tokens_per_second": 6260.225 }, { "epoch": 1.0980713022257116, "grad_norm": 0.24524036049842834, "learning_rate": 5e-06, "loss": 0.9994, "num_input_tokens_seen": 412289888, "step": 907, "train_runtime": 65860.7063, "train_tokens_per_second": 6260.028 }, { "epoch": 1.099282059043313, "grad_norm": 0.23185384273529053, "learning_rate": 5e-06, "loss": 0.9854, "num_input_tokens_seen": 412763840, "step": 908, "train_runtime": 65938.5276, "train_tokens_per_second": 6259.828 }, { "epoch": 1.1004928158609144, "grad_norm": 0.22845549881458282, "learning_rate": 5e-06, "loss": 0.9688, "num_input_tokens_seen": 413225160, "step": 909, "train_runtime": 66014.5341, "train_tokens_per_second": 6259.609 }, { "epoch": 1.1017035726785156, "grad_norm": 0.24248257279396057, "learning_rate": 5e-06, "loss": 0.9407, "num_input_tokens_seen": 413667032, "step": 910, "train_runtime": 66086.6471, "train_tokens_per_second": 6259.465 }, { "epoch": 1.102914329496117, "grad_norm": 0.2400379329919815, "learning_rate": 5e-06, "loss": 0.9766, "num_input_tokens_seen": 414113888, "step": 911, "train_runtime": 66159.6556, "train_tokens_per_second": 6259.311 }, { "epoch": 1.1041250863137184, "grad_norm": 0.2528563439846039, "learning_rate": 5e-06, "loss": 0.9031, "num_input_tokens_seen": 414556384, "step": 912, "train_runtime": 66232.8368, "train_tokens_per_second": 6259.076 }, { "epoch": 1.1053358431313198, "grad_norm": 0.23828411102294922, "learning_rate": 5e-06, "loss": 0.9605, "num_input_tokens_seen": 415033736, "step": 913, "train_runtime": 66311.6819, "train_tokens_per_second": 6258.833 }, { "epoch": 1.1065465999489212, "grad_norm": 0.2361602932214737, "learning_rate": 5e-06, "loss": 0.9757, "num_input_tokens_seen": 415471768, "step": 914, "train_runtime": 66383.8663, "train_tokens_per_second": 6258.626 }, { "epoch": 1.1077573567665226, "grad_norm": 0.25282710790634155, "learning_rate": 5e-06, "loss": 0.9595, "num_input_tokens_seen": 415920848, "step": 915, "train_runtime": 66457.5615, "train_tokens_per_second": 6258.443 }, { "epoch": 1.1089681135841238, "grad_norm": 0.24360793828964233, "learning_rate": 5e-06, "loss": 0.9652, "num_input_tokens_seen": 416359384, "step": 916, "train_runtime": 66529.0151, "train_tokens_per_second": 6258.313 }, { "epoch": 1.1101788704017252, "grad_norm": 0.24343234300613403, "learning_rate": 5e-06, "loss": 0.9539, "num_input_tokens_seen": 416791408, "step": 917, "train_runtime": 66599.6517, "train_tokens_per_second": 6258.162 }, { "epoch": 1.1113896272193267, "grad_norm": 0.22756776213645935, "learning_rate": 5e-06, "loss": 0.9909, "num_input_tokens_seen": 417240208, "step": 918, "train_runtime": 66673.3949, "train_tokens_per_second": 6257.972 }, { "epoch": 1.112600384036928, "grad_norm": 0.24931581318378448, "learning_rate": 5e-06, "loss": 0.9645, "num_input_tokens_seen": 417696072, "step": 919, "train_runtime": 66747.9073, "train_tokens_per_second": 6257.815 }, { "epoch": 1.1138111408545295, "grad_norm": 0.2384309619665146, "learning_rate": 5e-06, "loss": 0.9136, "num_input_tokens_seen": 418138568, "step": 920, "train_runtime": 66820.2813, "train_tokens_per_second": 6257.659 }, { "epoch": 1.115021897672131, "grad_norm": 0.2728740870952606, "learning_rate": 5e-06, "loss": 0.9831, "num_input_tokens_seen": 418582560, "step": 921, "train_runtime": 66893.2348, "train_tokens_per_second": 6257.472 }, { "epoch": 1.1162326544897323, "grad_norm": 0.22459077835083008, "learning_rate": 5e-06, "loss": 0.9284, "num_input_tokens_seen": 419046112, "step": 922, "train_runtime": 66969.9731, "train_tokens_per_second": 6257.224 }, { "epoch": 1.1174434113073335, "grad_norm": 0.22039759159088135, "learning_rate": 5e-06, "loss": 0.9443, "num_input_tokens_seen": 419517280, "step": 923, "train_runtime": 67047.7728, "train_tokens_per_second": 6256.991 }, { "epoch": 1.118654168124935, "grad_norm": 0.251267671585083, "learning_rate": 5e-06, "loss": 0.9654, "num_input_tokens_seen": 419968712, "step": 924, "train_runtime": 67121.4437, "train_tokens_per_second": 6256.849 }, { "epoch": 1.1198649249425363, "grad_norm": 0.24382558465003967, "learning_rate": 5e-06, "loss": 0.9589, "num_input_tokens_seen": 420406984, "step": 925, "train_runtime": 67193.015, "train_tokens_per_second": 6256.707 }, { "epoch": 1.1210756817601377, "grad_norm": 0.22386138141155243, "learning_rate": 5e-06, "loss": 0.9313, "num_input_tokens_seen": 420861848, "step": 926, "train_runtime": 67267.6486, "train_tokens_per_second": 6256.527 }, { "epoch": 1.1222864385777391, "grad_norm": 0.21948383748531342, "learning_rate": 5e-06, "loss": 0.9729, "num_input_tokens_seen": 421331168, "step": 927, "train_runtime": 67345.0933, "train_tokens_per_second": 6256.301 }, { "epoch": 1.1234971953953405, "grad_norm": 0.2778039574623108, "learning_rate": 5e-06, "loss": 1.016, "num_input_tokens_seen": 421758736, "step": 928, "train_runtime": 67414.5044, "train_tokens_per_second": 6256.202 }, { "epoch": 1.124707952212942, "grad_norm": 0.2170412689447403, "learning_rate": 5e-06, "loss": 0.9539, "num_input_tokens_seen": 422211592, "step": 929, "train_runtime": 67488.8275, "train_tokens_per_second": 6256.022 }, { "epoch": 1.1259187090305431, "grad_norm": 0.25213587284088135, "learning_rate": 5e-06, "loss": 0.8931, "num_input_tokens_seen": 422680032, "step": 930, "train_runtime": 67566.688, "train_tokens_per_second": 6255.746 }, { "epoch": 1.1271294658481446, "grad_norm": 0.23005911707878113, "learning_rate": 5e-06, "loss": 0.9915, "num_input_tokens_seen": 423140064, "step": 931, "train_runtime": 67642.6868, "train_tokens_per_second": 6255.518 }, { "epoch": 1.128340222665746, "grad_norm": 0.25569239258766174, "learning_rate": 5e-06, "loss": 0.9817, "num_input_tokens_seen": 423618400, "step": 932, "train_runtime": 67721.4977, "train_tokens_per_second": 6255.302 }, { "epoch": 1.1295509794833474, "grad_norm": 0.2626954913139343, "learning_rate": 5e-06, "loss": 0.9452, "num_input_tokens_seen": 424089544, "step": 933, "train_runtime": 67799.5609, "train_tokens_per_second": 6255.049 }, { "epoch": 1.1307617363009488, "grad_norm": 0.2500688135623932, "learning_rate": 5e-06, "loss": 1.0022, "num_input_tokens_seen": 424567696, "step": 934, "train_runtime": 67878.4197, "train_tokens_per_second": 6254.826 }, { "epoch": 1.1319724931185502, "grad_norm": 0.23637151718139648, "learning_rate": 5e-06, "loss": 1.0038, "num_input_tokens_seen": 425008992, "step": 935, "train_runtime": 67951.2851, "train_tokens_per_second": 6254.613 }, { "epoch": 1.1331832499361516, "grad_norm": 0.22515641152858734, "learning_rate": 5e-06, "loss": 0.9132, "num_input_tokens_seen": 425487232, "step": 936, "train_runtime": 68030.5016, "train_tokens_per_second": 6254.36 }, { "epoch": 1.1343940067537528, "grad_norm": 0.22837060689926147, "learning_rate": 5e-06, "loss": 0.9165, "num_input_tokens_seen": 425957792, "step": 937, "train_runtime": 68107.9582, "train_tokens_per_second": 6254.156 }, { "epoch": 1.1356047635713542, "grad_norm": 0.2596193552017212, "learning_rate": 5e-06, "loss": 0.95, "num_input_tokens_seen": 426428528, "step": 938, "train_runtime": 68185.8307, "train_tokens_per_second": 6253.917 }, { "epoch": 1.1368155203889556, "grad_norm": 0.23047588765621185, "learning_rate": 5e-06, "loss": 0.9153, "num_input_tokens_seen": 426877936, "step": 939, "train_runtime": 68259.9737, "train_tokens_per_second": 6253.708 }, { "epoch": 1.138026277206557, "grad_norm": 0.22194674611091614, "learning_rate": 5e-06, "loss": 0.9353, "num_input_tokens_seen": 427332784, "step": 940, "train_runtime": 68334.9772, "train_tokens_per_second": 6253.5 }, { "epoch": 1.1392370340241584, "grad_norm": 0.2305593639612198, "learning_rate": 5e-06, "loss": 0.9578, "num_input_tokens_seen": 427770848, "step": 941, "train_runtime": 68406.5809, "train_tokens_per_second": 6253.358 }, { "epoch": 1.1404477908417598, "grad_norm": 0.2662777900695801, "learning_rate": 5e-06, "loss": 0.9606, "num_input_tokens_seen": 428218112, "step": 942, "train_runtime": 68479.8162, "train_tokens_per_second": 6253.202 }, { "epoch": 1.141658547659361, "grad_norm": 0.26229748129844666, "learning_rate": 5e-06, "loss": 0.9166, "num_input_tokens_seen": 428684216, "step": 943, "train_runtime": 68556.798, "train_tokens_per_second": 6252.979 }, { "epoch": 1.1428693044769624, "grad_norm": 0.22433774173259735, "learning_rate": 5e-06, "loss": 0.9171, "num_input_tokens_seen": 429177656, "step": 944, "train_runtime": 68638.2099, "train_tokens_per_second": 6252.751 }, { "epoch": 1.1440800612945639, "grad_norm": 0.23602762818336487, "learning_rate": 5e-06, "loss": 0.9273, "num_input_tokens_seen": 429659776, "step": 945, "train_runtime": 68717.5939, "train_tokens_per_second": 6252.544 }, { "epoch": 1.1452908181121653, "grad_norm": 0.246641144156456, "learning_rate": 5e-06, "loss": 0.9523, "num_input_tokens_seen": 430124728, "step": 946, "train_runtime": 68793.6291, "train_tokens_per_second": 6252.392 }, { "epoch": 1.1465015749297667, "grad_norm": 0.2760850787162781, "learning_rate": 5e-06, "loss": 0.9436, "num_input_tokens_seen": 430575992, "step": 947, "train_runtime": 68868.1445, "train_tokens_per_second": 6252.179 }, { "epoch": 1.147712331747368, "grad_norm": 0.24327822029590607, "learning_rate": 5e-06, "loss": 0.9575, "num_input_tokens_seen": 431013928, "step": 948, "train_runtime": 68940.3142, "train_tokens_per_second": 6251.987 }, { "epoch": 1.1489230885649695, "grad_norm": 0.24040260910987854, "learning_rate": 5e-06, "loss": 0.9448, "num_input_tokens_seen": 431462608, "step": 949, "train_runtime": 69014.5475, "train_tokens_per_second": 6251.763 }, { "epoch": 1.1501338453825707, "grad_norm": 0.23738116025924683, "learning_rate": 5e-06, "loss": 0.9312, "num_input_tokens_seen": 431919616, "step": 950, "train_runtime": 69089.8288, "train_tokens_per_second": 6251.566 }, { "epoch": 1.151344602200172, "grad_norm": 0.26888352632522583, "learning_rate": 5e-06, "loss": 1.0274, "num_input_tokens_seen": 432378360, "step": 951, "train_runtime": 69166.0174, "train_tokens_per_second": 6251.312 }, { "epoch": 1.1525553590177735, "grad_norm": 0.3020702600479126, "learning_rate": 5e-06, "loss": 0.966, "num_input_tokens_seen": 432815336, "step": 952, "train_runtime": 69238.0188, "train_tokens_per_second": 6251.122 }, { "epoch": 1.153766115835375, "grad_norm": 0.23694109916687012, "learning_rate": 5e-06, "loss": 0.9676, "num_input_tokens_seen": 433256224, "step": 953, "train_runtime": 69310.4665, "train_tokens_per_second": 6250.949 }, { "epoch": 1.1549768726529763, "grad_norm": 0.26480624079704285, "learning_rate": 5e-06, "loss": 0.9864, "num_input_tokens_seen": 433691568, "step": 954, "train_runtime": 69382.337, "train_tokens_per_second": 6250.749 }, { "epoch": 1.1561876294705777, "grad_norm": 0.2512606382369995, "learning_rate": 5e-06, "loss": 0.9802, "num_input_tokens_seen": 434141344, "step": 955, "train_runtime": 69456.6448, "train_tokens_per_second": 6250.537 }, { "epoch": 1.157398386288179, "grad_norm": 0.2603987455368042, "learning_rate": 5e-06, "loss": 0.9443, "num_input_tokens_seen": 434571288, "step": 956, "train_runtime": 69527.4526, "train_tokens_per_second": 6250.355 }, { "epoch": 1.1586091431057803, "grad_norm": 0.2712121903896332, "learning_rate": 5e-06, "loss": 0.9299, "num_input_tokens_seen": 435010000, "step": 957, "train_runtime": 69599.8559, "train_tokens_per_second": 6250.157 }, { "epoch": 1.1598198999233817, "grad_norm": 0.2328772246837616, "learning_rate": 5e-06, "loss": 0.9054, "num_input_tokens_seen": 435464528, "step": 958, "train_runtime": 69673.8246, "train_tokens_per_second": 6250.045 }, { "epoch": 1.1610306567409832, "grad_norm": 0.23724646866321564, "learning_rate": 5e-06, "loss": 0.9686, "num_input_tokens_seen": 435929064, "step": 959, "train_runtime": 69747.3951, "train_tokens_per_second": 6250.112 }, { "epoch": 1.1622414135585846, "grad_norm": 0.2300594449043274, "learning_rate": 5e-06, "loss": 0.9641, "num_input_tokens_seen": 436379864, "step": 960, "train_runtime": 69821.0669, "train_tokens_per_second": 6249.974 }, { "epoch": 1.163452170376186, "grad_norm": 0.24695640802383423, "learning_rate": 5e-06, "loss": 0.985, "num_input_tokens_seen": 436818672, "step": 961, "train_runtime": 69893.2539, "train_tokens_per_second": 6249.797 }, { "epoch": 1.1646629271937874, "grad_norm": 0.25464367866516113, "learning_rate": 5e-06, "loss": 0.955, "num_input_tokens_seen": 437256528, "step": 962, "train_runtime": 69965.6023, "train_tokens_per_second": 6249.593 }, { "epoch": 1.1658736840113888, "grad_norm": 0.23890846967697144, "learning_rate": 5e-06, "loss": 0.9812, "num_input_tokens_seen": 437704184, "step": 963, "train_runtime": 70039.3875, "train_tokens_per_second": 6249.401 }, { "epoch": 1.16708444082899, "grad_norm": 0.22985456883907318, "learning_rate": 5e-06, "loss": 0.9547, "num_input_tokens_seen": 438157616, "step": 964, "train_runtime": 70114.4049, "train_tokens_per_second": 6249.181 }, { "epoch": 1.1682951976465914, "grad_norm": 0.2481573075056076, "learning_rate": 5e-06, "loss": 0.9783, "num_input_tokens_seen": 438604112, "step": 965, "train_runtime": 70187.4586, "train_tokens_per_second": 6249.038 }, { "epoch": 1.1695059544641928, "grad_norm": 0.23532527685165405, "learning_rate": 5e-06, "loss": 0.9783, "num_input_tokens_seen": 439082896, "step": 966, "train_runtime": 70266.4555, "train_tokens_per_second": 6248.827 }, { "epoch": 1.1707167112817942, "grad_norm": 0.2518933117389679, "learning_rate": 5e-06, "loss": 0.9852, "num_input_tokens_seen": 439533512, "step": 967, "train_runtime": 70340.9795, "train_tokens_per_second": 6248.612 }, { "epoch": 1.1719274680993956, "grad_norm": 0.22327609360218048, "learning_rate": 5e-06, "loss": 0.9746, "num_input_tokens_seen": 440017968, "step": 968, "train_runtime": 70421.2059, "train_tokens_per_second": 6248.373 }, { "epoch": 1.173138224916997, "grad_norm": 0.21766787767410278, "learning_rate": 5e-06, "loss": 0.9427, "num_input_tokens_seen": 440485848, "step": 969, "train_runtime": 70498.8622, "train_tokens_per_second": 6248.127 }, { "epoch": 1.1743489817345982, "grad_norm": 0.24497343599796295, "learning_rate": 5e-06, "loss": 0.9164, "num_input_tokens_seen": 440958848, "step": 970, "train_runtime": 70577.0563, "train_tokens_per_second": 6247.906 }, { "epoch": 1.1755597385521996, "grad_norm": 0.24692267179489136, "learning_rate": 5e-06, "loss": 0.9531, "num_input_tokens_seen": 441430568, "step": 971, "train_runtime": 70654.5666, "train_tokens_per_second": 6247.729 }, { "epoch": 1.176770495369801, "grad_norm": 0.2446671724319458, "learning_rate": 5e-06, "loss": 1.0036, "num_input_tokens_seen": 441870464, "step": 972, "train_runtime": 70726.7808, "train_tokens_per_second": 6247.569 }, { "epoch": 1.1779812521874025, "grad_norm": 0.22598214447498322, "learning_rate": 5e-06, "loss": 0.9359, "num_input_tokens_seen": 442325960, "step": 973, "train_runtime": 70802.4686, "train_tokens_per_second": 6247.324 }, { "epoch": 1.1791920090050039, "grad_norm": 0.23768270015716553, "learning_rate": 5e-06, "loss": 0.9584, "num_input_tokens_seen": 442783712, "step": 974, "train_runtime": 70878.7415, "train_tokens_per_second": 6247.059 }, { "epoch": 1.1804027658226053, "grad_norm": 0.2732614576816559, "learning_rate": 5e-06, "loss": 0.992, "num_input_tokens_seen": 443235984, "step": 975, "train_runtime": 70953.8006, "train_tokens_per_second": 6246.825 }, { "epoch": 1.1816135226402067, "grad_norm": 0.22531206905841827, "learning_rate": 5e-06, "loss": 0.9176, "num_input_tokens_seen": 443689408, "step": 976, "train_runtime": 71028.2155, "train_tokens_per_second": 6246.664 }, { "epoch": 1.1828242794578079, "grad_norm": 0.246334969997406, "learning_rate": 5e-06, "loss": 0.9678, "num_input_tokens_seen": 444127032, "step": 977, "train_runtime": 71099.99, "train_tokens_per_second": 6246.513 }, { "epoch": 1.1840350362754093, "grad_norm": 0.2669452428817749, "learning_rate": 5e-06, "loss": 0.9531, "num_input_tokens_seen": 444581568, "step": 978, "train_runtime": 71175.2703, "train_tokens_per_second": 6246.293 }, { "epoch": 1.1852457930930107, "grad_norm": 0.24605169892311096, "learning_rate": 5e-06, "loss": 0.9629, "num_input_tokens_seen": 445041736, "step": 979, "train_runtime": 71251.1753, "train_tokens_per_second": 6246.097 }, { "epoch": 1.1864565499106121, "grad_norm": 0.2738041877746582, "learning_rate": 5e-06, "loss": 0.9536, "num_input_tokens_seen": 445507072, "step": 980, "train_runtime": 71328.1089, "train_tokens_per_second": 6245.884 }, { "epoch": 1.1876673067282135, "grad_norm": 0.23345020413398743, "learning_rate": 5e-06, "loss": 0.9208, "num_input_tokens_seen": 445989088, "step": 981, "train_runtime": 71407.5879, "train_tokens_per_second": 6245.682 }, { "epoch": 1.188878063545815, "grad_norm": 0.24863320589065552, "learning_rate": 5e-06, "loss": 1.0073, "num_input_tokens_seen": 446459192, "step": 982, "train_runtime": 71485.3609, "train_tokens_per_second": 6245.463 }, { "epoch": 1.1900888203634161, "grad_norm": 0.230724036693573, "learning_rate": 5e-06, "loss": 0.9858, "num_input_tokens_seen": 446904400, "step": 983, "train_runtime": 71558.8839, "train_tokens_per_second": 6245.268 }, { "epoch": 1.1912995771810175, "grad_norm": 0.26054030656814575, "learning_rate": 5e-06, "loss": 0.9527, "num_input_tokens_seen": 447363192, "step": 984, "train_runtime": 71634.9114, "train_tokens_per_second": 6245.044 }, { "epoch": 1.192510333998619, "grad_norm": 0.24276606738567352, "learning_rate": 5e-06, "loss": 0.9529, "num_input_tokens_seen": 447817104, "step": 985, "train_runtime": 71709.3132, "train_tokens_per_second": 6244.895 }, { "epoch": 1.1937210908162204, "grad_norm": 0.24462191760540009, "learning_rate": 5e-06, "loss": 0.9536, "num_input_tokens_seen": 448278224, "step": 986, "train_runtime": 71785.9847, "train_tokens_per_second": 6244.648 }, { "epoch": 1.1949318476338218, "grad_norm": 0.2560247778892517, "learning_rate": 5e-06, "loss": 0.9833, "num_input_tokens_seen": 448739320, "step": 987, "train_runtime": 71861.9391, "train_tokens_per_second": 6244.464 }, { "epoch": 1.1961426044514232, "grad_norm": 0.24045203626155853, "learning_rate": 5e-06, "loss": 0.9363, "num_input_tokens_seen": 449186640, "step": 988, "train_runtime": 71935.9522, "train_tokens_per_second": 6244.258 }, { "epoch": 1.1973533612690246, "grad_norm": 0.23872441053390503, "learning_rate": 5e-06, "loss": 0.8979, "num_input_tokens_seen": 449653848, "step": 989, "train_runtime": 72012.385, "train_tokens_per_second": 6244.118 }, { "epoch": 1.1985641180866258, "grad_norm": 0.28531908988952637, "learning_rate": 5e-06, "loss": 0.914, "num_input_tokens_seen": 450080552, "step": 990, "train_runtime": 72082.6578, "train_tokens_per_second": 6243.951 }, { "epoch": 1.1997748749042272, "grad_norm": 0.2463030368089676, "learning_rate": 5e-06, "loss": 0.9246, "num_input_tokens_seen": 450513000, "step": 991, "train_runtime": 72153.7158, "train_tokens_per_second": 6243.795 }, { "epoch": 1.2009856317218286, "grad_norm": 0.23530061542987823, "learning_rate": 5e-06, "loss": 0.9557, "num_input_tokens_seen": 450955120, "step": 992, "train_runtime": 72226.4633, "train_tokens_per_second": 6243.627 }, { "epoch": 1.20219638853943, "grad_norm": 0.246900275349617, "learning_rate": 5e-06, "loss": 0.8806, "num_input_tokens_seen": 451394736, "step": 993, "train_runtime": 72298.4949, "train_tokens_per_second": 6243.487 }, { "epoch": 1.2034071453570314, "grad_norm": 0.25331759452819824, "learning_rate": 5e-06, "loss": 1.0287, "num_input_tokens_seen": 451851352, "step": 994, "train_runtime": 72373.4673, "train_tokens_per_second": 6243.329 }, { "epoch": 1.2046179021746328, "grad_norm": 0.23674815893173218, "learning_rate": 5e-06, "loss": 0.9687, "num_input_tokens_seen": 452305600, "step": 995, "train_runtime": 72448.5085, "train_tokens_per_second": 6243.132 }, { "epoch": 1.205828658992234, "grad_norm": 0.24373270571231842, "learning_rate": 5e-06, "loss": 0.9765, "num_input_tokens_seen": 452773312, "step": 996, "train_runtime": 72525.8954, "train_tokens_per_second": 6242.919 }, { "epoch": 1.2070394158098354, "grad_norm": 0.2752549350261688, "learning_rate": 5e-06, "loss": 0.9142, "num_input_tokens_seen": 453228432, "step": 997, "train_runtime": 72600.7249, "train_tokens_per_second": 6242.754 }, { "epoch": 1.2082501726274368, "grad_norm": 0.2349204123020172, "learning_rate": 5e-06, "loss": 0.9476, "num_input_tokens_seen": 453688736, "step": 998, "train_runtime": 72676.6277, "train_tokens_per_second": 6242.567 }, { "epoch": 1.2094609294450382, "grad_norm": 0.23312972486019135, "learning_rate": 5e-06, "loss": 0.9553, "num_input_tokens_seen": 454156576, "step": 999, "train_runtime": 72754.6022, "train_tokens_per_second": 6242.307 }, { "epoch": 1.2106716862626397, "grad_norm": 0.24874716997146606, "learning_rate": 5e-06, "loss": 0.9416, "num_input_tokens_seen": 454614576, "step": 1000, "train_runtime": 72830.4183, "train_tokens_per_second": 6242.098 }, { "epoch": 1.211882443080241, "grad_norm": 0.21960654854774475, "learning_rate": 5e-06, "loss": 0.9301, "num_input_tokens_seen": 455060760, "step": 1001, "train_runtime": 72904.2157, "train_tokens_per_second": 6241.899 }, { "epoch": 1.2130931998978425, "grad_norm": 0.23251725733280182, "learning_rate": 5e-06, "loss": 0.9488, "num_input_tokens_seen": 455524632, "step": 1002, "train_runtime": 72981.2956, "train_tokens_per_second": 6241.663 }, { "epoch": 1.2143039567154439, "grad_norm": 0.2484462857246399, "learning_rate": 5e-06, "loss": 0.9695, "num_input_tokens_seen": 455985256, "step": 1003, "train_runtime": 73057.0852, "train_tokens_per_second": 6241.493 }, { "epoch": 1.215514713533045, "grad_norm": 0.23444589972496033, "learning_rate": 5e-06, "loss": 0.9248, "num_input_tokens_seen": 456447872, "step": 1004, "train_runtime": 73133.7066, "train_tokens_per_second": 6241.279 }, { "epoch": 1.2167254703506465, "grad_norm": 0.23066623508930206, "learning_rate": 5e-06, "loss": 0.9341, "num_input_tokens_seen": 456918240, "step": 1005, "train_runtime": 73211.6327, "train_tokens_per_second": 6241.061 }, { "epoch": 1.217936227168248, "grad_norm": 0.26110243797302246, "learning_rate": 5e-06, "loss": 0.9673, "num_input_tokens_seen": 457361960, "step": 1006, "train_runtime": 73284.6571, "train_tokens_per_second": 6240.896 }, { "epoch": 1.2191469839858493, "grad_norm": 0.22857554256916046, "learning_rate": 5e-06, "loss": 0.9429, "num_input_tokens_seen": 457829616, "step": 1007, "train_runtime": 73362.2686, "train_tokens_per_second": 6240.669 }, { "epoch": 1.2203577408034507, "grad_norm": 0.21707653999328613, "learning_rate": 5e-06, "loss": 0.9058, "num_input_tokens_seen": 458299576, "step": 1008, "train_runtime": 73440.2904, "train_tokens_per_second": 6240.438 }, { "epoch": 1.2215684976210521, "grad_norm": 0.21953126788139343, "learning_rate": 5e-06, "loss": 0.9552, "num_input_tokens_seen": 458744008, "step": 1009, "train_runtime": 73513.7035, "train_tokens_per_second": 6240.252 }, { "epoch": 1.2227792544386533, "grad_norm": 0.24714279174804688, "learning_rate": 5e-06, "loss": 0.9093, "num_input_tokens_seen": 459194536, "step": 1010, "train_runtime": 73589.1216, "train_tokens_per_second": 6239.978 }, { "epoch": 1.2239900112562547, "grad_norm": 0.2624055743217468, "learning_rate": 5e-06, "loss": 0.9537, "num_input_tokens_seen": 459658064, "step": 1011, "train_runtime": 73665.7012, "train_tokens_per_second": 6239.784 }, { "epoch": 1.2252007680738561, "grad_norm": 0.24378705024719238, "learning_rate": 5e-06, "loss": 0.9647, "num_input_tokens_seen": 460102312, "step": 1012, "train_runtime": 73739.1174, "train_tokens_per_second": 6239.596 }, { "epoch": 1.2264115248914575, "grad_norm": 0.2524285316467285, "learning_rate": 5e-06, "loss": 1.008, "num_input_tokens_seen": 460530568, "step": 1013, "train_runtime": 73809.5347, "train_tokens_per_second": 6239.445 }, { "epoch": 1.227622281709059, "grad_norm": 0.22694693505764008, "learning_rate": 5e-06, "loss": 0.9336, "num_input_tokens_seen": 460976752, "step": 1014, "train_runtime": 73882.6824, "train_tokens_per_second": 6239.307 }, { "epoch": 1.2288330385266604, "grad_norm": 0.24876870214939117, "learning_rate": 5e-06, "loss": 0.9167, "num_input_tokens_seen": 461412360, "step": 1015, "train_runtime": 73954.7175, "train_tokens_per_second": 6239.12 }, { "epoch": 1.2300437953442618, "grad_norm": 0.23304542899131775, "learning_rate": 5e-06, "loss": 0.937, "num_input_tokens_seen": 461881936, "step": 1016, "train_runtime": 74032.0414, "train_tokens_per_second": 6238.946 }, { "epoch": 1.231254552161863, "grad_norm": 0.2319115698337555, "learning_rate": 5e-06, "loss": 1.0273, "num_input_tokens_seen": 462327848, "step": 1017, "train_runtime": 74105.762, "train_tokens_per_second": 6238.757 }, { "epoch": 1.2324653089794644, "grad_norm": 0.2387470155954361, "learning_rate": 5e-06, "loss": 0.9402, "num_input_tokens_seen": 462801768, "step": 1018, "train_runtime": 74184.7163, "train_tokens_per_second": 6238.506 }, { "epoch": 1.2336760657970658, "grad_norm": 0.23503154516220093, "learning_rate": 5e-06, "loss": 0.9537, "num_input_tokens_seen": 463250128, "step": 1019, "train_runtime": 74258.5041, "train_tokens_per_second": 6238.344 }, { "epoch": 1.2348868226146672, "grad_norm": 0.2387133687734604, "learning_rate": 5e-06, "loss": 0.9841, "num_input_tokens_seen": 463712976, "step": 1020, "train_runtime": 74335.47, "train_tokens_per_second": 6238.112 }, { "epoch": 1.2360975794322686, "grad_norm": 0.2348925918340683, "learning_rate": 5e-06, "loss": 0.9325, "num_input_tokens_seen": 464170752, "step": 1021, "train_runtime": 74411.2711, "train_tokens_per_second": 6237.909 }, { "epoch": 1.23730833624987, "grad_norm": 0.2409505844116211, "learning_rate": 5e-06, "loss": 1.0182, "num_input_tokens_seen": 464620440, "step": 1022, "train_runtime": 74485.4943, "train_tokens_per_second": 6237.731 }, { "epoch": 1.2385190930674712, "grad_norm": 0.2405453324317932, "learning_rate": 5e-06, "loss": 0.9672, "num_input_tokens_seen": 465074072, "step": 1023, "train_runtime": 74560.4666, "train_tokens_per_second": 6237.542 }, { "epoch": 1.2397298498850726, "grad_norm": 0.2541082799434662, "learning_rate": 5e-06, "loss": 0.9502, "num_input_tokens_seen": 465526872, "step": 1024, "train_runtime": 74635.3173, "train_tokens_per_second": 6237.354 }, { "epoch": 1.240940606702674, "grad_norm": 0.233840674161911, "learning_rate": 5e-06, "loss": 0.9982, "num_input_tokens_seen": 465969728, "step": 1025, "train_runtime": 74708.3224, "train_tokens_per_second": 6237.186 }, { "epoch": 1.2421513635202754, "grad_norm": 0.2615164518356323, "learning_rate": 5e-06, "loss": 0.9789, "num_input_tokens_seen": 466432952, "step": 1026, "train_runtime": 74784.9859, "train_tokens_per_second": 6236.987 }, { "epoch": 1.2433621203378769, "grad_norm": 0.25451064109802246, "learning_rate": 5e-06, "loss": 0.9876, "num_input_tokens_seen": 466859880, "step": 1027, "train_runtime": 74855.3509, "train_tokens_per_second": 6236.827 }, { "epoch": 1.2445728771554783, "grad_norm": 0.23738832771778107, "learning_rate": 5e-06, "loss": 0.9718, "num_input_tokens_seen": 467312216, "step": 1028, "train_runtime": 74929.7678, "train_tokens_per_second": 6236.67 }, { "epoch": 1.2457836339730797, "grad_norm": 0.23887260258197784, "learning_rate": 5e-06, "loss": 0.964, "num_input_tokens_seen": 467762744, "step": 1029, "train_runtime": 75003.9938, "train_tokens_per_second": 6236.504 }, { "epoch": 1.2469943907906809, "grad_norm": 0.2599722743034363, "learning_rate": 5e-06, "loss": 0.9842, "num_input_tokens_seen": 468221536, "step": 1030, "train_runtime": 75079.9409, "train_tokens_per_second": 6236.307 }, { "epoch": 1.2482051476082823, "grad_norm": 0.2669295072555542, "learning_rate": 5e-06, "loss": 0.9675, "num_input_tokens_seen": 468654896, "step": 1031, "train_runtime": 75151.0551, "train_tokens_per_second": 6236.172 }, { "epoch": 1.2494159044258837, "grad_norm": 0.23142068088054657, "learning_rate": 5e-06, "loss": 0.9743, "num_input_tokens_seen": 469100424, "step": 1032, "train_runtime": 75224.2634, "train_tokens_per_second": 6236.025 }, { "epoch": 1.250626661243485, "grad_norm": 0.24564848840236664, "learning_rate": 5e-06, "loss": 0.9775, "num_input_tokens_seen": 469557312, "step": 1033, "train_runtime": 75299.8089, "train_tokens_per_second": 6235.837 }, { "epoch": 1.2518374180610865, "grad_norm": 0.2531740069389343, "learning_rate": 5e-06, "loss": 0.9778, "num_input_tokens_seen": 469993728, "step": 1034, "train_runtime": 75371.0171, "train_tokens_per_second": 6235.736 }, { "epoch": 1.253048174878688, "grad_norm": 0.2566632330417633, "learning_rate": 5e-06, "loss": 0.9607, "num_input_tokens_seen": 470435832, "step": 1035, "train_runtime": 75444.0382, "train_tokens_per_second": 6235.56 }, { "epoch": 1.254258931696289, "grad_norm": 0.2733544111251831, "learning_rate": 5e-06, "loss": 0.9229, "num_input_tokens_seen": 470881848, "step": 1036, "train_runtime": 75517.9642, "train_tokens_per_second": 6235.362 }, { "epoch": 1.2554696885138905, "grad_norm": 0.22786258161067963, "learning_rate": 5e-06, "loss": 0.9479, "num_input_tokens_seen": 471347472, "step": 1037, "train_runtime": 75594.9234, "train_tokens_per_second": 6235.174 }, { "epoch": 1.256680445331492, "grad_norm": 0.2446991503238678, "learning_rate": 5e-06, "loss": 0.9554, "num_input_tokens_seen": 471795376, "step": 1038, "train_runtime": 75668.7229, "train_tokens_per_second": 6235.012 }, { "epoch": 1.2578912021490933, "grad_norm": 0.26110076904296875, "learning_rate": 5e-06, "loss": 0.9515, "num_input_tokens_seen": 472220096, "step": 1039, "train_runtime": 75737.7809, "train_tokens_per_second": 6234.934 }, { "epoch": 1.2591019589666947, "grad_norm": 0.24883201718330383, "learning_rate": 5e-06, "loss": 0.9497, "num_input_tokens_seen": 472677208, "step": 1040, "train_runtime": 75812.9693, "train_tokens_per_second": 6234.78 }, { "epoch": 1.2603127157842962, "grad_norm": 0.2285858392715454, "learning_rate": 5e-06, "loss": 0.9709, "num_input_tokens_seen": 473140032, "step": 1041, "train_runtime": 75889.2935, "train_tokens_per_second": 6234.608 }, { "epoch": 1.2615234726018976, "grad_norm": 0.2190844714641571, "learning_rate": 5e-06, "loss": 0.9493, "num_input_tokens_seen": 473600272, "step": 1042, "train_runtime": 75964.9162, "train_tokens_per_second": 6234.461 }, { "epoch": 1.262734229419499, "grad_norm": 0.2370315045118332, "learning_rate": 5e-06, "loss": 0.971, "num_input_tokens_seen": 474054944, "step": 1043, "train_runtime": 76039.0777, "train_tokens_per_second": 6234.359 }, { "epoch": 1.2639449862371002, "grad_norm": 0.22360284626483917, "learning_rate": 5e-06, "loss": 0.9638, "num_input_tokens_seen": 474536696, "step": 1044, "train_runtime": 76118.2238, "train_tokens_per_second": 6234.206 }, { "epoch": 1.2651557430547016, "grad_norm": 0.25233903527259827, "learning_rate": 5e-06, "loss": 0.8986, "num_input_tokens_seen": 474994584, "step": 1045, "train_runtime": 76194.1156, "train_tokens_per_second": 6234.006 }, { "epoch": 1.266366499872303, "grad_norm": 0.2806606888771057, "learning_rate": 5e-06, "loss": 0.9721, "num_input_tokens_seen": 475437456, "step": 1046, "train_runtime": 76266.4104, "train_tokens_per_second": 6233.904 }, { "epoch": 1.2675772566899044, "grad_norm": 0.23013675212860107, "learning_rate": 5e-06, "loss": 0.9835, "num_input_tokens_seen": 475885056, "step": 1047, "train_runtime": 76340.2753, "train_tokens_per_second": 6233.735 }, { "epoch": 1.2687880135075058, "grad_norm": 0.2585345208644867, "learning_rate": 5e-06, "loss": 0.9556, "num_input_tokens_seen": 476330056, "step": 1048, "train_runtime": 76413.1727, "train_tokens_per_second": 6233.612 }, { "epoch": 1.269998770325107, "grad_norm": 0.27313679456710815, "learning_rate": 5e-06, "loss": 0.9655, "num_input_tokens_seen": 476790288, "step": 1049, "train_runtime": 76488.7274, "train_tokens_per_second": 6233.471 }, { "epoch": 1.2712095271427084, "grad_norm": 0.22804471850395203, "learning_rate": 5e-06, "loss": 0.9798, "num_input_tokens_seen": 477237680, "step": 1050, "train_runtime": 76562.66, "train_tokens_per_second": 6233.295 }, { "epoch": 1.2724202839603098, "grad_norm": 0.23282477259635925, "learning_rate": 5e-06, "loss": 0.9485, "num_input_tokens_seen": 477711928, "step": 1051, "train_runtime": 76641.8583, "train_tokens_per_second": 6233.042 }, { "epoch": 1.2736310407779112, "grad_norm": 0.2197505533695221, "learning_rate": 5e-06, "loss": 0.9604, "num_input_tokens_seen": 478171336, "step": 1052, "train_runtime": 76717.5391, "train_tokens_per_second": 6232.882 }, { "epoch": 1.2748417975955126, "grad_norm": 0.2753906846046448, "learning_rate": 5e-06, "loss": 0.9519, "num_input_tokens_seen": 478594376, "step": 1053, "train_runtime": 76786.3869, "train_tokens_per_second": 6232.802 }, { "epoch": 1.276052554413114, "grad_norm": 0.23567403852939606, "learning_rate": 5e-06, "loss": 0.9258, "num_input_tokens_seen": 479057744, "step": 1054, "train_runtime": 76862.8595, "train_tokens_per_second": 6232.63 }, { "epoch": 1.2772633112307155, "grad_norm": 0.2323777824640274, "learning_rate": 5e-06, "loss": 0.9507, "num_input_tokens_seen": 479530368, "step": 1055, "train_runtime": 76940.9952, "train_tokens_per_second": 6232.443 }, { "epoch": 1.2784740680483169, "grad_norm": 0.24186258018016815, "learning_rate": 5e-06, "loss": 0.9227, "num_input_tokens_seen": 479994304, "step": 1056, "train_runtime": 77017.947, "train_tokens_per_second": 6232.24 }, { "epoch": 1.2796848248659183, "grad_norm": 0.2798727750778198, "learning_rate": 5e-06, "loss": 0.9632, "num_input_tokens_seen": 480447768, "step": 1057, "train_runtime": 77093.054, "train_tokens_per_second": 6232.05 }, { "epoch": 1.2808955816835195, "grad_norm": 0.2540852427482605, "learning_rate": 5e-06, "loss": 0.9633, "num_input_tokens_seen": 480890376, "step": 1058, "train_runtime": 77166.9256, "train_tokens_per_second": 6231.82 }, { "epoch": 1.2821063385011209, "grad_norm": 0.23041221499443054, "learning_rate": 5e-06, "loss": 0.9052, "num_input_tokens_seen": 481360496, "step": 1059, "train_runtime": 77244.961, "train_tokens_per_second": 6231.61 }, { "epoch": 1.2833170953187223, "grad_norm": 0.24767398834228516, "learning_rate": 5e-06, "loss": 0.9332, "num_input_tokens_seen": 481821264, "step": 1060, "train_runtime": 77321.082, "train_tokens_per_second": 6231.435 }, { "epoch": 1.2845278521363237, "grad_norm": 0.25022172927856445, "learning_rate": 5e-06, "loss": 0.9481, "num_input_tokens_seen": 482278160, "step": 1061, "train_runtime": 77396.3485, "train_tokens_per_second": 6231.278 }, { "epoch": 1.285738608953925, "grad_norm": 0.25090205669403076, "learning_rate": 5e-06, "loss": 0.9319, "num_input_tokens_seen": 482732096, "step": 1062, "train_runtime": 77471.2094, "train_tokens_per_second": 6231.116 }, { "epoch": 1.2869493657715263, "grad_norm": 0.24102523922920227, "learning_rate": 5e-06, "loss": 0.9033, "num_input_tokens_seen": 483182128, "step": 1063, "train_runtime": 77545.2333, "train_tokens_per_second": 6230.971 }, { "epoch": 1.2881601225891277, "grad_norm": 0.22408998012542725, "learning_rate": 5e-06, "loss": 0.9583, "num_input_tokens_seen": 483634912, "step": 1064, "train_runtime": 77619.6948, "train_tokens_per_second": 6230.827 }, { "epoch": 1.2893708794067291, "grad_norm": 0.22242091596126556, "learning_rate": 5e-06, "loss": 0.8963, "num_input_tokens_seen": 484082184, "step": 1065, "train_runtime": 77693.1389, "train_tokens_per_second": 6230.694 }, { "epoch": 1.2905816362243305, "grad_norm": 0.24296538531780243, "learning_rate": 5e-06, "loss": 0.9512, "num_input_tokens_seen": 484538336, "step": 1066, "train_runtime": 77767.9496, "train_tokens_per_second": 6230.566 }, { "epoch": 1.291792393041932, "grad_norm": 0.2800133526325226, "learning_rate": 5e-06, "loss": 1.0084, "num_input_tokens_seen": 484979760, "step": 1067, "train_runtime": 77840.1247, "train_tokens_per_second": 6230.46 }, { "epoch": 1.2930031498595334, "grad_norm": 0.26364296674728394, "learning_rate": 5e-06, "loss": 0.9158, "num_input_tokens_seen": 485451992, "step": 1068, "train_runtime": 77919.0701, "train_tokens_per_second": 6230.208 }, { "epoch": 1.2942139066771348, "grad_norm": 0.23616540431976318, "learning_rate": 5e-06, "loss": 0.9675, "num_input_tokens_seen": 485907896, "step": 1069, "train_runtime": 77994.4035, "train_tokens_per_second": 6230.035 }, { "epoch": 1.2954246634947362, "grad_norm": 0.2279627025127411, "learning_rate": 5e-06, "loss": 0.9279, "num_input_tokens_seen": 486374992, "step": 1070, "train_runtime": 78071.146, "train_tokens_per_second": 6229.894 }, { "epoch": 1.2966354203123374, "grad_norm": 0.2602773904800415, "learning_rate": 5e-06, "loss": 0.9288, "num_input_tokens_seen": 486846584, "step": 1071, "train_runtime": 78149.9854, "train_tokens_per_second": 6229.644 }, { "epoch": 1.2978461771299388, "grad_norm": 0.2592213451862335, "learning_rate": 5e-06, "loss": 0.9707, "num_input_tokens_seen": 487299176, "step": 1072, "train_runtime": 78224.87, "train_tokens_per_second": 6229.466 }, { "epoch": 1.2990569339475402, "grad_norm": 0.23838956654071808, "learning_rate": 5e-06, "loss": 0.9459, "num_input_tokens_seen": 487738752, "step": 1073, "train_runtime": 78297.4076, "train_tokens_per_second": 6229.309 }, { "epoch": 1.3002676907651416, "grad_norm": 0.2431815266609192, "learning_rate": 5e-06, "loss": 1.0018, "num_input_tokens_seen": 488179592, "step": 1074, "train_runtime": 78369.9512, "train_tokens_per_second": 6229.168 }, { "epoch": 1.301478447582743, "grad_norm": 0.2688054144382477, "learning_rate": 5e-06, "loss": 0.9754, "num_input_tokens_seen": 488624232, "step": 1075, "train_runtime": 78442.9355, "train_tokens_per_second": 6229.041 }, { "epoch": 1.3026892044003442, "grad_norm": 0.2385970801115036, "learning_rate": 5e-06, "loss": 0.9063, "num_input_tokens_seen": 489046568, "step": 1076, "train_runtime": 78511.9197, "train_tokens_per_second": 6228.947 }, { "epoch": 1.3038999612179456, "grad_norm": 0.23294121026992798, "learning_rate": 5e-06, "loss": 0.9876, "num_input_tokens_seen": 489514704, "step": 1077, "train_runtime": 78589.3633, "train_tokens_per_second": 6228.765 }, { "epoch": 1.305110718035547, "grad_norm": 0.2477468103170395, "learning_rate": 5e-06, "loss": 0.9493, "num_input_tokens_seen": 489948088, "step": 1078, "train_runtime": 78660.5285, "train_tokens_per_second": 6228.64 }, { "epoch": 1.3063214748531484, "grad_norm": 0.2480383664369583, "learning_rate": 5e-06, "loss": 0.9577, "num_input_tokens_seen": 490385808, "step": 1079, "train_runtime": 78732.9706, "train_tokens_per_second": 6228.468 }, { "epoch": 1.3075322316707498, "grad_norm": 0.2859964668750763, "learning_rate": 5e-06, "loss": 0.9368, "num_input_tokens_seen": 490856832, "step": 1080, "train_runtime": 78811.3881, "train_tokens_per_second": 6228.248 }, { "epoch": 1.3087429884883512, "grad_norm": 0.2931101620197296, "learning_rate": 5e-06, "loss": 0.9727, "num_input_tokens_seen": 491314392, "step": 1081, "train_runtime": 78887.1437, "train_tokens_per_second": 6228.067 }, { "epoch": 1.3099537453059527, "grad_norm": 0.27014395594596863, "learning_rate": 5e-06, "loss": 0.9503, "num_input_tokens_seen": 491759208, "step": 1082, "train_runtime": 78960.4837, "train_tokens_per_second": 6227.915 }, { "epoch": 1.311164502123554, "grad_norm": 0.2364778369665146, "learning_rate": 5e-06, "loss": 0.9087, "num_input_tokens_seen": 492218016, "step": 1083, "train_runtime": 79036.6786, "train_tokens_per_second": 6227.716 }, { "epoch": 1.3123752589411553, "grad_norm": 0.2594203054904938, "learning_rate": 5e-06, "loss": 0.9674, "num_input_tokens_seen": 492683488, "step": 1084, "train_runtime": 79112.9413, "train_tokens_per_second": 6227.597 }, { "epoch": 1.3135860157587567, "grad_norm": 0.2824831008911133, "learning_rate": 5e-06, "loss": 0.9782, "num_input_tokens_seen": 493126536, "step": 1085, "train_runtime": 79186.2287, "train_tokens_per_second": 6227.428 }, { "epoch": 1.314796772576358, "grad_norm": 0.2868604063987732, "learning_rate": 5e-06, "loss": 0.9473, "num_input_tokens_seen": 493574776, "step": 1086, "train_runtime": 79260.2797, "train_tokens_per_second": 6227.265 }, { "epoch": 1.3160075293939595, "grad_norm": 0.24373245239257812, "learning_rate": 5e-06, "loss": 0.9111, "num_input_tokens_seen": 494018800, "step": 1087, "train_runtime": 79333.7195, "train_tokens_per_second": 6227.097 }, { "epoch": 1.317218286211561, "grad_norm": 0.23148846626281738, "learning_rate": 5e-06, "loss": 0.9671, "num_input_tokens_seen": 494459824, "step": 1088, "train_runtime": 79406.5726, "train_tokens_per_second": 6226.938 }, { "epoch": 1.318429043029162, "grad_norm": 0.2403024286031723, "learning_rate": 5e-06, "loss": 0.9115, "num_input_tokens_seen": 494928432, "step": 1089, "train_runtime": 79484.2329, "train_tokens_per_second": 6226.75 }, { "epoch": 1.3196397998467635, "grad_norm": 0.2649286389350891, "learning_rate": 5e-06, "loss": 0.9377, "num_input_tokens_seen": 495391952, "step": 1090, "train_runtime": 79560.7838, "train_tokens_per_second": 6226.585 }, { "epoch": 1.320850556664365, "grad_norm": 0.24317079782485962, "learning_rate": 5e-06, "loss": 0.9451, "num_input_tokens_seen": 495859560, "step": 1091, "train_runtime": 79637.6715, "train_tokens_per_second": 6226.445 }, { "epoch": 1.3220613134819663, "grad_norm": 0.25734710693359375, "learning_rate": 5e-06, "loss": 0.9564, "num_input_tokens_seen": 496335008, "step": 1092, "train_runtime": 79716.5771, "train_tokens_per_second": 6226.246 }, { "epoch": 1.3232720702995677, "grad_norm": 0.230266273021698, "learning_rate": 5e-06, "loss": 1.0006, "num_input_tokens_seen": 496792520, "step": 1093, "train_runtime": 79791.7859, "train_tokens_per_second": 6226.111 }, { "epoch": 1.3244828271171691, "grad_norm": 0.2398468255996704, "learning_rate": 5e-06, "loss": 0.9919, "num_input_tokens_seen": 497244656, "step": 1094, "train_runtime": 79866.2963, "train_tokens_per_second": 6225.964 }, { "epoch": 1.3256935839347705, "grad_norm": 0.25273364782333374, "learning_rate": 5e-06, "loss": 0.9356, "num_input_tokens_seen": 497737648, "step": 1095, "train_runtime": 79947.6758, "train_tokens_per_second": 6225.793 }, { "epoch": 1.326904340752372, "grad_norm": 0.2629864513874054, "learning_rate": 5e-06, "loss": 0.9285, "num_input_tokens_seen": 498211544, "step": 1096, "train_runtime": 80026.228, "train_tokens_per_second": 6225.603 }, { "epoch": 1.3281150975699734, "grad_norm": 0.24348442256450653, "learning_rate": 5e-06, "loss": 0.9928, "num_input_tokens_seen": 498667784, "step": 1097, "train_runtime": 80101.4968, "train_tokens_per_second": 6225.449 }, { "epoch": 1.3293258543875746, "grad_norm": 0.24186153709888458, "learning_rate": 5e-06, "loss": 0.9611, "num_input_tokens_seen": 499107448, "step": 1098, "train_runtime": 80174.2686, "train_tokens_per_second": 6225.282 }, { "epoch": 1.330536611205176, "grad_norm": 0.28597867488861084, "learning_rate": 5e-06, "loss": 0.9198, "num_input_tokens_seen": 499571536, "step": 1099, "train_runtime": 80251.374, "train_tokens_per_second": 6225.084 }, { "epoch": 1.3317473680227774, "grad_norm": 0.25400543212890625, "learning_rate": 5e-06, "loss": 0.9536, "num_input_tokens_seen": 500007376, "step": 1100, "train_runtime": 80323.5809, "train_tokens_per_second": 6224.914 }, { "epoch": 1.3329581248403788, "grad_norm": 0.26500222086906433, "learning_rate": 5e-06, "loss": 0.998, "num_input_tokens_seen": 500462880, "step": 1101, "train_runtime": 80398.5711, "train_tokens_per_second": 6224.773 }, { "epoch": 1.3341688816579802, "grad_norm": 0.28662461042404175, "learning_rate": 5e-06, "loss": 0.9472, "num_input_tokens_seen": 500914736, "step": 1102, "train_runtime": 80473.0011, "train_tokens_per_second": 6224.631 }, { "epoch": 1.3353796384755814, "grad_norm": 0.2489413022994995, "learning_rate": 5e-06, "loss": 0.9416, "num_input_tokens_seen": 501386272, "step": 1103, "train_runtime": 80550.876, "train_tokens_per_second": 6224.467 }, { "epoch": 1.3365903952931828, "grad_norm": 0.22808928787708282, "learning_rate": 5e-06, "loss": 0.9119, "num_input_tokens_seen": 501848592, "step": 1104, "train_runtime": 80627.7778, "train_tokens_per_second": 6224.264 }, { "epoch": 1.3378011521107842, "grad_norm": 0.23136869072914124, "learning_rate": 5e-06, "loss": 0.958, "num_input_tokens_seen": 502286176, "step": 1105, "train_runtime": 80699.5576, "train_tokens_per_second": 6224.15 }, { "epoch": 1.3390119089283856, "grad_norm": 0.22823567688465118, "learning_rate": 5e-06, "loss": 0.9324, "num_input_tokens_seen": 502742112, "step": 1106, "train_runtime": 80775.4724, "train_tokens_per_second": 6223.945 }, { "epoch": 1.340222665745987, "grad_norm": 0.2484605759382248, "learning_rate": 5e-06, "loss": 0.9735, "num_input_tokens_seen": 503197712, "step": 1107, "train_runtime": 80850.7279, "train_tokens_per_second": 6223.787 }, { "epoch": 1.3414334225635884, "grad_norm": 0.25765275955200195, "learning_rate": 5e-06, "loss": 0.953, "num_input_tokens_seen": 503655864, "step": 1108, "train_runtime": 80926.4497, "train_tokens_per_second": 6223.625 }, { "epoch": 1.3426441793811899, "grad_norm": 0.23261244595050812, "learning_rate": 5e-06, "loss": 0.923, "num_input_tokens_seen": 504117992, "step": 1109, "train_runtime": 81002.8373, "train_tokens_per_second": 6223.461 }, { "epoch": 1.3438549361987913, "grad_norm": 0.23450727760791779, "learning_rate": 5e-06, "loss": 0.9273, "num_input_tokens_seen": 504574512, "step": 1110, "train_runtime": 81077.9846, "train_tokens_per_second": 6223.323 }, { "epoch": 1.3450656930163924, "grad_norm": 0.2521567940711975, "learning_rate": 5e-06, "loss": 0.9632, "num_input_tokens_seen": 505004192, "step": 1111, "train_runtime": 81148.4492, "train_tokens_per_second": 6223.214 }, { "epoch": 1.3462764498339939, "grad_norm": 0.2506852447986603, "learning_rate": 5e-06, "loss": 0.9951, "num_input_tokens_seen": 505460352, "step": 1112, "train_runtime": 81223.6058, "train_tokens_per_second": 6223.072 }, { "epoch": 1.3474872066515953, "grad_norm": 0.2718031704425812, "learning_rate": 5e-06, "loss": 0.9664, "num_input_tokens_seen": 505924544, "step": 1113, "train_runtime": 81300.2242, "train_tokens_per_second": 6222.917 }, { "epoch": 1.3486979634691967, "grad_norm": 0.26461461186408997, "learning_rate": 5e-06, "loss": 0.9479, "num_input_tokens_seen": 506374000, "step": 1114, "train_runtime": 81374.867, "train_tokens_per_second": 6222.732 }, { "epoch": 1.349908720286798, "grad_norm": 0.23874284327030182, "learning_rate": 5e-06, "loss": 0.9868, "num_input_tokens_seen": 506851568, "step": 1115, "train_runtime": 81454.5159, "train_tokens_per_second": 6222.51 }, { "epoch": 1.3511194771043993, "grad_norm": 0.2469114065170288, "learning_rate": 5e-06, "loss": 0.9355, "num_input_tokens_seen": 507321040, "step": 1116, "train_runtime": 81532.2647, "train_tokens_per_second": 6222.335 }, { "epoch": 1.3523302339220007, "grad_norm": 0.2748368978500366, "learning_rate": 5e-06, "loss": 0.8878, "num_input_tokens_seen": 507785192, "step": 1117, "train_runtime": 81608.7161, "train_tokens_per_second": 6222.193 }, { "epoch": 1.353540990739602, "grad_norm": 0.25142693519592285, "learning_rate": 5e-06, "loss": 0.9127, "num_input_tokens_seen": 508241704, "step": 1118, "train_runtime": 81683.6479, "train_tokens_per_second": 6222.074 }, { "epoch": 1.3547517475572035, "grad_norm": 0.23072993755340576, "learning_rate": 5e-06, "loss": 0.9419, "num_input_tokens_seen": 508692400, "step": 1119, "train_runtime": 81758.1595, "train_tokens_per_second": 6221.916 }, { "epoch": 1.355962504374805, "grad_norm": 0.22448928654193878, "learning_rate": 5e-06, "loss": 0.937, "num_input_tokens_seen": 509133480, "step": 1120, "train_runtime": 81830.545, "train_tokens_per_second": 6221.802 }, { "epoch": 1.3571732611924063, "grad_norm": 0.2378361076116562, "learning_rate": 5e-06, "loss": 0.9702, "num_input_tokens_seen": 509569360, "step": 1121, "train_runtime": 81902.3306, "train_tokens_per_second": 6221.671 }, { "epoch": 1.3583840180100077, "grad_norm": 0.23400956392288208, "learning_rate": 5e-06, "loss": 0.909, "num_input_tokens_seen": 510010536, "step": 1122, "train_runtime": 81975.1662, "train_tokens_per_second": 6221.525 }, { "epoch": 1.3595947748276092, "grad_norm": 0.24939168989658356, "learning_rate": 5e-06, "loss": 0.9008, "num_input_tokens_seen": 510470824, "step": 1123, "train_runtime": 82051.5387, "train_tokens_per_second": 6221.344 }, { "epoch": 1.3608055316452103, "grad_norm": 0.23065564036369324, "learning_rate": 5e-06, "loss": 0.9217, "num_input_tokens_seen": 510941664, "step": 1124, "train_runtime": 82129.5737, "train_tokens_per_second": 6221.165 }, { "epoch": 1.3620162884628118, "grad_norm": 0.270669162273407, "learning_rate": 5e-06, "loss": 0.9314, "num_input_tokens_seen": 511387848, "step": 1125, "train_runtime": 82202.6173, "train_tokens_per_second": 6221.065 }, { "epoch": 1.3632270452804132, "grad_norm": 0.2493094503879547, "learning_rate": 5e-06, "loss": 0.9661, "num_input_tokens_seen": 511829632, "step": 1126, "train_runtime": 82275.3767, "train_tokens_per_second": 6220.933 }, { "epoch": 1.3644378020980146, "grad_norm": 0.24099677801132202, "learning_rate": 5e-06, "loss": 0.9565, "num_input_tokens_seen": 512284456, "step": 1127, "train_runtime": 82350.5397, "train_tokens_per_second": 6220.778 }, { "epoch": 1.365648558915616, "grad_norm": 0.28274643421173096, "learning_rate": 5e-06, "loss": 0.9409, "num_input_tokens_seen": 512735624, "step": 1128, "train_runtime": 82424.8965, "train_tokens_per_second": 6220.64 }, { "epoch": 1.3668593157332172, "grad_norm": 0.24693673849105835, "learning_rate": 5e-06, "loss": 0.9281, "num_input_tokens_seen": 513189840, "step": 1129, "train_runtime": 82499.5885, "train_tokens_per_second": 6220.514 }, { "epoch": 1.3680700725508186, "grad_norm": 0.23583988845348358, "learning_rate": 5e-06, "loss": 0.858, "num_input_tokens_seen": 513681000, "step": 1130, "train_runtime": 82581.1309, "train_tokens_per_second": 6220.319 }, { "epoch": 1.36928082936842, "grad_norm": 0.23430530726909637, "learning_rate": 5e-06, "loss": 0.9629, "num_input_tokens_seen": 514139520, "step": 1131, "train_runtime": 82656.5045, "train_tokens_per_second": 6220.194 }, { "epoch": 1.3704915861860214, "grad_norm": 0.2671928405761719, "learning_rate": 5e-06, "loss": 0.9183, "num_input_tokens_seen": 514585024, "step": 1132, "train_runtime": 82730.3681, "train_tokens_per_second": 6220.026 }, { "epoch": 1.3717023430036228, "grad_norm": 0.2957673668861389, "learning_rate": 5e-06, "loss": 0.9404, "num_input_tokens_seen": 515044760, "step": 1133, "train_runtime": 82806.7902, "train_tokens_per_second": 6219.837 }, { "epoch": 1.3729130998212242, "grad_norm": 0.24210570752620697, "learning_rate": 5e-06, "loss": 0.9729, "num_input_tokens_seen": 515503432, "step": 1134, "train_runtime": 82882.7978, "train_tokens_per_second": 6219.667 }, { "epoch": 1.3741238566388256, "grad_norm": 0.25204458832740784, "learning_rate": 5e-06, "loss": 0.9571, "num_input_tokens_seen": 515950480, "step": 1135, "train_runtime": 82956.8397, "train_tokens_per_second": 6219.505 }, { "epoch": 1.375334613456427, "grad_norm": 0.25100481510162354, "learning_rate": 5e-06, "loss": 0.93, "num_input_tokens_seen": 516403560, "step": 1136, "train_runtime": 83031.7598, "train_tokens_per_second": 6219.35 }, { "epoch": 1.3765453702740285, "grad_norm": 0.2839900255203247, "learning_rate": 5e-06, "loss": 0.9969, "num_input_tokens_seen": 516860512, "step": 1137, "train_runtime": 83107.2043, "train_tokens_per_second": 6219.202 }, { "epoch": 1.3777561270916296, "grad_norm": 0.24296337366104126, "learning_rate": 5e-06, "loss": 0.9908, "num_input_tokens_seen": 517309384, "step": 1138, "train_runtime": 83178.397, "train_tokens_per_second": 6219.276 }, { "epoch": 1.378966883909231, "grad_norm": 0.2473958134651184, "learning_rate": 5e-06, "loss": 0.991, "num_input_tokens_seen": 517764120, "step": 1139, "train_runtime": 83249.0866, "train_tokens_per_second": 6219.457 }, { "epoch": 1.3801776407268325, "grad_norm": 0.26322364807128906, "learning_rate": 5e-06, "loss": 0.9685, "num_input_tokens_seen": 518204792, "step": 1140, "train_runtime": 83317.1594, "train_tokens_per_second": 6219.665 }, { "epoch": 1.3813883975444339, "grad_norm": 0.27684542536735535, "learning_rate": 5e-06, "loss": 0.9655, "num_input_tokens_seen": 518647512, "step": 1141, "train_runtime": 83386.2555, "train_tokens_per_second": 6219.82 }, { "epoch": 1.3825991543620353, "grad_norm": 0.24537670612335205, "learning_rate": 5e-06, "loss": 0.9299, "num_input_tokens_seen": 519100408, "step": 1142, "train_runtime": 83457.0116, "train_tokens_per_second": 6219.974 }, { "epoch": 1.3838099111796365, "grad_norm": 0.23837308585643768, "learning_rate": 5e-06, "loss": 0.9082, "num_input_tokens_seen": 519546976, "step": 1143, "train_runtime": 83526.2221, "train_tokens_per_second": 6220.166 }, { "epoch": 1.3850206679972379, "grad_norm": 0.2371511310338974, "learning_rate": 5e-06, "loss": 0.9283, "num_input_tokens_seen": 520011528, "step": 1144, "train_runtime": 83598.9547, "train_tokens_per_second": 6220.311 }, { "epoch": 1.3862314248148393, "grad_norm": 0.22656875848770142, "learning_rate": 5e-06, "loss": 0.9689, "num_input_tokens_seen": 520470056, "step": 1145, "train_runtime": 83670.3971, "train_tokens_per_second": 6220.48 }, { "epoch": 1.3874421816324407, "grad_norm": 0.23803792893886566, "learning_rate": 5e-06, "loss": 0.9774, "num_input_tokens_seen": 520904032, "step": 1146, "train_runtime": 83740.3252, "train_tokens_per_second": 6220.468 }, { "epoch": 1.3886529384500421, "grad_norm": 0.21631726622581482, "learning_rate": 5e-06, "loss": 0.92, "num_input_tokens_seen": 521369744, "step": 1147, "train_runtime": 83816.7878, "train_tokens_per_second": 6220.35 }, { "epoch": 1.3898636952676435, "grad_norm": 0.237714946269989, "learning_rate": 5e-06, "loss": 0.9022, "num_input_tokens_seen": 521831576, "step": 1148, "train_runtime": 83893.3078, "train_tokens_per_second": 6220.181 }, { "epoch": 1.391074452085245, "grad_norm": 0.2461657077074051, "learning_rate": 5e-06, "loss": 0.955, "num_input_tokens_seen": 522271136, "step": 1149, "train_runtime": 83965.4254, "train_tokens_per_second": 6220.074 }, { "epoch": 1.3922852089028463, "grad_norm": 0.23177474737167358, "learning_rate": 5e-06, "loss": 0.9326, "num_input_tokens_seen": 522723640, "step": 1150, "train_runtime": 84039.8413, "train_tokens_per_second": 6219.95 }, { "epoch": 1.3934959657204475, "grad_norm": 0.24760431051254272, "learning_rate": 5e-06, "loss": 0.9544, "num_input_tokens_seen": 523187496, "step": 1151, "train_runtime": 84116.6652, "train_tokens_per_second": 6219.784 }, { "epoch": 1.394706722538049, "grad_norm": 0.24664926528930664, "learning_rate": 5e-06, "loss": 0.9197, "num_input_tokens_seen": 523653368, "step": 1152, "train_runtime": 84193.8554, "train_tokens_per_second": 6219.615 }, { "epoch": 1.3959174793556504, "grad_norm": 0.22697068750858307, "learning_rate": 5e-06, "loss": 0.9622, "num_input_tokens_seen": 524120216, "step": 1153, "train_runtime": 84271.1905, "train_tokens_per_second": 6219.447 }, { "epoch": 1.3971282361732518, "grad_norm": 0.24017848074436188, "learning_rate": 5e-06, "loss": 0.9406, "num_input_tokens_seen": 524588968, "step": 1154, "train_runtime": 84348.8248, "train_tokens_per_second": 6219.28 }, { "epoch": 1.3983389929908532, "grad_norm": 0.24601654708385468, "learning_rate": 5e-06, "loss": 0.96, "num_input_tokens_seen": 525035616, "step": 1155, "train_runtime": 84422.8617, "train_tokens_per_second": 6219.117 }, { "epoch": 1.3995497498084544, "grad_norm": 0.22841405868530273, "learning_rate": 5e-06, "loss": 0.9359, "num_input_tokens_seen": 525495368, "step": 1156, "train_runtime": 84498.4629, "train_tokens_per_second": 6218.993 }, { "epoch": 1.4007605066260558, "grad_norm": 0.2503286302089691, "learning_rate": 5e-06, "loss": 0.9101, "num_input_tokens_seen": 525936104, "step": 1157, "train_runtime": 84571.5115, "train_tokens_per_second": 6218.833 }, { "epoch": 1.4019712634436572, "grad_norm": 0.24628864228725433, "learning_rate": 5e-06, "loss": 0.9777, "num_input_tokens_seen": 526383960, "step": 1158, "train_runtime": 84645.7016, "train_tokens_per_second": 6218.673 }, { "epoch": 1.4031820202612586, "grad_norm": 0.23224344849586487, "learning_rate": 5e-06, "loss": 0.9756, "num_input_tokens_seen": 526842064, "step": 1159, "train_runtime": 84721.4748, "train_tokens_per_second": 6218.519 }, { "epoch": 1.40439277707886, "grad_norm": 0.23669494688510895, "learning_rate": 5e-06, "loss": 0.9558, "num_input_tokens_seen": 527274984, "step": 1160, "train_runtime": 84792.9592, "train_tokens_per_second": 6218.382 }, { "epoch": 1.4056035338964614, "grad_norm": 0.2642204165458679, "learning_rate": 5e-06, "loss": 0.9927, "num_input_tokens_seen": 527706216, "step": 1161, "train_runtime": 84864.0145, "train_tokens_per_second": 6218.257 }, { "epoch": 1.4068142907140628, "grad_norm": 0.24115154147148132, "learning_rate": 5e-06, "loss": 0.9297, "num_input_tokens_seen": 528178144, "step": 1162, "train_runtime": 84942.3429, "train_tokens_per_second": 6218.078 }, { "epoch": 1.4080250475316642, "grad_norm": 0.23551017045974731, "learning_rate": 5e-06, "loss": 0.9862, "num_input_tokens_seen": 528630424, "step": 1163, "train_runtime": 85017.0419, "train_tokens_per_second": 6217.935 }, { "epoch": 1.4092358043492654, "grad_norm": 0.2298494577407837, "learning_rate": 5e-06, "loss": 0.9455, "num_input_tokens_seen": 529081184, "step": 1164, "train_runtime": 85091.8299, "train_tokens_per_second": 6217.767 }, { "epoch": 1.4104465611668668, "grad_norm": 0.22845524549484253, "learning_rate": 5e-06, "loss": 0.9526, "num_input_tokens_seen": 529559640, "step": 1165, "train_runtime": 85170.8828, "train_tokens_per_second": 6217.614 }, { "epoch": 1.4116573179844683, "grad_norm": 0.2308027297258377, "learning_rate": 5e-06, "loss": 0.8656, "num_input_tokens_seen": 530022552, "step": 1166, "train_runtime": 85244.8514, "train_tokens_per_second": 6217.649 }, { "epoch": 1.4128680748020697, "grad_norm": 0.2270365059375763, "learning_rate": 5e-06, "loss": 0.9853, "num_input_tokens_seen": 530470680, "step": 1167, "train_runtime": 85314.7105, "train_tokens_per_second": 6217.81 }, { "epoch": 1.414078831619671, "grad_norm": 0.23675860464572906, "learning_rate": 5e-06, "loss": 0.9707, "num_input_tokens_seen": 530923672, "step": 1168, "train_runtime": 85385.4857, "train_tokens_per_second": 6217.962 }, { "epoch": 1.4152895884372723, "grad_norm": 0.24494849145412445, "learning_rate": 5e-06, "loss": 1.0015, "num_input_tokens_seen": 531378672, "step": 1169, "train_runtime": 85457.4364, "train_tokens_per_second": 6218.051 }, { "epoch": 1.4165003452548737, "grad_norm": 0.2266804724931717, "learning_rate": 5e-06, "loss": 0.9243, "num_input_tokens_seen": 531833320, "step": 1170, "train_runtime": 85530.4497, "train_tokens_per_second": 6218.058 }, { "epoch": 1.417711102072475, "grad_norm": 0.25175556540489197, "learning_rate": 5e-06, "loss": 1.0028, "num_input_tokens_seen": 532288648, "step": 1171, "train_runtime": 85604.5221, "train_tokens_per_second": 6217.997 }, { "epoch": 1.4189218588900765, "grad_norm": 0.23558390140533447, "learning_rate": 5e-06, "loss": 0.9245, "num_input_tokens_seen": 532735600, "step": 1172, "train_runtime": 85676.0209, "train_tokens_per_second": 6218.025 }, { "epoch": 1.420132615707678, "grad_norm": 0.220907524228096, "learning_rate": 5e-06, "loss": 0.9561, "num_input_tokens_seen": 533220744, "step": 1173, "train_runtime": 85753.6767, "train_tokens_per_second": 6218.051 }, { "epoch": 1.4213433725252793, "grad_norm": 0.28133559226989746, "learning_rate": 5e-06, "loss": 0.9136, "num_input_tokens_seen": 533681856, "step": 1174, "train_runtime": 85830.2269, "train_tokens_per_second": 6217.878 }, { "epoch": 1.4225541293428807, "grad_norm": 0.2508618235588074, "learning_rate": 5e-06, "loss": 0.9846, "num_input_tokens_seen": 534131488, "step": 1175, "train_runtime": 85905.1182, "train_tokens_per_second": 6217.691 }, { "epoch": 1.4237648861604821, "grad_norm": 0.24241898953914642, "learning_rate": 5e-06, "loss": 0.948, "num_input_tokens_seen": 534587808, "step": 1176, "train_runtime": 85980.4455, "train_tokens_per_second": 6217.551 }, { "epoch": 1.4249756429780835, "grad_norm": 0.2333323061466217, "learning_rate": 5e-06, "loss": 0.9202, "num_input_tokens_seen": 535059256, "step": 1177, "train_runtime": 86059.1263, "train_tokens_per_second": 6217.345 }, { "epoch": 1.4261863997956847, "grad_norm": 0.2457004338502884, "learning_rate": 5e-06, "loss": 0.96, "num_input_tokens_seen": 535516680, "step": 1178, "train_runtime": 86134.7746, "train_tokens_per_second": 6217.195 }, { "epoch": 1.4273971566132861, "grad_norm": 0.2796451151371002, "learning_rate": 5e-06, "loss": 0.9294, "num_input_tokens_seen": 535977480, "step": 1179, "train_runtime": 86211.7662, "train_tokens_per_second": 6216.988 }, { "epoch": 1.4286079134308876, "grad_norm": 0.24755236506462097, "learning_rate": 5e-06, "loss": 0.9704, "num_input_tokens_seen": 536456680, "step": 1180, "train_runtime": 86291.5671, "train_tokens_per_second": 6216.791 }, { "epoch": 1.429818670248489, "grad_norm": 0.23514142632484436, "learning_rate": 5e-06, "loss": 0.9235, "num_input_tokens_seen": 536919736, "step": 1181, "train_runtime": 86368.5347, "train_tokens_per_second": 6216.613 }, { "epoch": 1.4310294270660904, "grad_norm": 0.2705405056476593, "learning_rate": 5e-06, "loss": 0.9898, "num_input_tokens_seen": 537369752, "step": 1182, "train_runtime": 86443.3868, "train_tokens_per_second": 6216.436 }, { "epoch": 1.4322401838836916, "grad_norm": 0.2713667154312134, "learning_rate": 5e-06, "loss": 0.931, "num_input_tokens_seen": 537808528, "step": 1183, "train_runtime": 86515.7051, "train_tokens_per_second": 6216.311 }, { "epoch": 1.433450940701293, "grad_norm": 0.2554599642753601, "learning_rate": 5e-06, "loss": 0.8981, "num_input_tokens_seen": 538237888, "step": 1184, "train_runtime": 86586.8158, "train_tokens_per_second": 6216.164 }, { "epoch": 1.4346616975188944, "grad_norm": 0.22345824539661407, "learning_rate": 5e-06, "loss": 0.9353, "num_input_tokens_seen": 538679104, "step": 1185, "train_runtime": 86659.9026, "train_tokens_per_second": 6216.013 }, { "epoch": 1.4358724543364958, "grad_norm": 0.25475722551345825, "learning_rate": 5e-06, "loss": 0.9775, "num_input_tokens_seen": 539122880, "step": 1186, "train_runtime": 86733.3283, "train_tokens_per_second": 6215.868 }, { "epoch": 1.4370832111540972, "grad_norm": 0.2426735758781433, "learning_rate": 5e-06, "loss": 0.9531, "num_input_tokens_seen": 539580016, "step": 1187, "train_runtime": 86809.5723, "train_tokens_per_second": 6215.674 }, { "epoch": 1.4382939679716986, "grad_norm": 0.24386319518089294, "learning_rate": 5e-06, "loss": 0.8981, "num_input_tokens_seen": 540035632, "step": 1188, "train_runtime": 86885.2572, "train_tokens_per_second": 6215.504 }, { "epoch": 1.4395047247893, "grad_norm": 0.25454631447792053, "learning_rate": 5e-06, "loss": 0.9745, "num_input_tokens_seen": 540481968, "step": 1189, "train_runtime": 86959.2044, "train_tokens_per_second": 6215.351 }, { "epoch": 1.4407154816069014, "grad_norm": 0.2664698660373688, "learning_rate": 5e-06, "loss": 0.9639, "num_input_tokens_seen": 540930360, "step": 1190, "train_runtime": 87033.5401, "train_tokens_per_second": 6215.194 }, { "epoch": 1.4419262384245026, "grad_norm": 0.24694858491420746, "learning_rate": 5e-06, "loss": 0.9673, "num_input_tokens_seen": 541419328, "step": 1191, "train_runtime": 87114.81, "train_tokens_per_second": 6215.009 }, { "epoch": 1.443136995242104, "grad_norm": 0.27929478883743286, "learning_rate": 5e-06, "loss": 0.891, "num_input_tokens_seen": 541886280, "step": 1192, "train_runtime": 87192.2112, "train_tokens_per_second": 6214.847 }, { "epoch": 1.4443477520597054, "grad_norm": 0.26354244351387024, "learning_rate": 5e-06, "loss": 0.9933, "num_input_tokens_seen": 542346832, "step": 1193, "train_runtime": 87268.6309, "train_tokens_per_second": 6214.682 }, { "epoch": 1.4455585088773069, "grad_norm": 0.2514925003051758, "learning_rate": 5e-06, "loss": 0.964, "num_input_tokens_seen": 542801256, "step": 1194, "train_runtime": 87344.2791, "train_tokens_per_second": 6214.503 }, { "epoch": 1.4467692656949083, "grad_norm": 0.24636778235435486, "learning_rate": 5e-06, "loss": 0.9314, "num_input_tokens_seen": 543242600, "step": 1195, "train_runtime": 87417.0709, "train_tokens_per_second": 6214.377 }, { "epoch": 1.4479800225125095, "grad_norm": 0.2630736529827118, "learning_rate": 5e-06, "loss": 0.9703, "num_input_tokens_seen": 543701408, "step": 1196, "train_runtime": 87493.5804, "train_tokens_per_second": 6214.186 }, { "epoch": 1.4491907793301109, "grad_norm": 0.2552695572376251, "learning_rate": 5e-06, "loss": 1.0058, "num_input_tokens_seen": 544170664, "step": 1197, "train_runtime": 87571.6365, "train_tokens_per_second": 6214.006 }, { "epoch": 1.4504015361477123, "grad_norm": 0.2683693468570709, "learning_rate": 5e-06, "loss": 0.9817, "num_input_tokens_seen": 544586024, "step": 1198, "train_runtime": 87639.8342, "train_tokens_per_second": 6213.91 }, { "epoch": 1.4516122929653137, "grad_norm": 0.24069073796272278, "learning_rate": 5e-06, "loss": 0.983, "num_input_tokens_seen": 545032400, "step": 1199, "train_runtime": 87713.2659, "train_tokens_per_second": 6213.797 }, { "epoch": 1.452823049782915, "grad_norm": 0.2466171234846115, "learning_rate": 5e-06, "loss": 0.9388, "num_input_tokens_seen": 545482808, "step": 1200, "train_runtime": 87788.0711, "train_tokens_per_second": 6213.632 }, { "epoch": 1.4540338066005165, "grad_norm": 0.310069739818573, "learning_rate": 5e-06, "loss": 0.964, "num_input_tokens_seen": 545948528, "step": 1201, "train_runtime": 87865.3722, "train_tokens_per_second": 6213.466 }, { "epoch": 1.455244563418118, "grad_norm": 0.23402269184589386, "learning_rate": 5e-06, "loss": 0.9666, "num_input_tokens_seen": 546406224, "step": 1202, "train_runtime": 87941.9012, "train_tokens_per_second": 6213.264 }, { "epoch": 1.4564553202357193, "grad_norm": 0.2361670583486557, "learning_rate": 5e-06, "loss": 0.9641, "num_input_tokens_seen": 546871256, "step": 1203, "train_runtime": 88019.3297, "train_tokens_per_second": 6213.081 }, { "epoch": 1.4576660770533205, "grad_norm": 0.21892577409744263, "learning_rate": 5e-06, "loss": 0.9509, "num_input_tokens_seen": 547331272, "step": 1204, "train_runtime": 88095.2024, "train_tokens_per_second": 6212.952 }, { "epoch": 1.458876833870922, "grad_norm": 0.276292085647583, "learning_rate": 5e-06, "loss": 0.9394, "num_input_tokens_seen": 547782312, "step": 1205, "train_runtime": 88170.4287, "train_tokens_per_second": 6212.767 }, { "epoch": 1.4600875906885233, "grad_norm": 0.24177499115467072, "learning_rate": 5e-06, "loss": 0.9166, "num_input_tokens_seen": 548266600, "step": 1206, "train_runtime": 88251.0882, "train_tokens_per_second": 6212.576 }, { "epoch": 1.4612983475061248, "grad_norm": 0.2835836112499237, "learning_rate": 5e-06, "loss": 0.995, "num_input_tokens_seen": 548717048, "step": 1207, "train_runtime": 88325.5766, "train_tokens_per_second": 6212.437 }, { "epoch": 1.4625091043237262, "grad_norm": 0.23038621246814728, "learning_rate": 5e-06, "loss": 0.9838, "num_input_tokens_seen": 549195520, "step": 1208, "train_runtime": 88405.6772, "train_tokens_per_second": 6212.22 }, { "epoch": 1.4637198611413273, "grad_norm": 0.2618058919906616, "learning_rate": 5e-06, "loss": 0.9669, "num_input_tokens_seen": 549643000, "step": 1209, "train_runtime": 88479.9776, "train_tokens_per_second": 6212.061 }, { "epoch": 1.4649306179589288, "grad_norm": 0.26815587282180786, "learning_rate": 5e-06, "loss": 0.9388, "num_input_tokens_seen": 550092200, "step": 1210, "train_runtime": 88554.698, "train_tokens_per_second": 6211.892 }, { "epoch": 1.4661413747765302, "grad_norm": 0.2662449777126312, "learning_rate": 5e-06, "loss": 0.9636, "num_input_tokens_seen": 550550512, "step": 1211, "train_runtime": 88629.3279, "train_tokens_per_second": 6211.832 }, { "epoch": 1.4673521315941316, "grad_norm": 0.23297056555747986, "learning_rate": 5e-06, "loss": 0.968, "num_input_tokens_seen": 551000744, "step": 1212, "train_runtime": 88700.7236, "train_tokens_per_second": 6211.908 }, { "epoch": 1.468562888411733, "grad_norm": 0.24942202866077423, "learning_rate": 5e-06, "loss": 0.9262, "num_input_tokens_seen": 551460280, "step": 1213, "train_runtime": 88773.122, "train_tokens_per_second": 6212.019 }, { "epoch": 1.4697736452293344, "grad_norm": 0.2555992901325226, "learning_rate": 5e-06, "loss": 0.9494, "num_input_tokens_seen": 551910888, "step": 1214, "train_runtime": 88844.5041, "train_tokens_per_second": 6212.099 }, { "epoch": 1.4709844020469358, "grad_norm": 0.2768413722515106, "learning_rate": 5e-06, "loss": 0.919, "num_input_tokens_seen": 552378856, "step": 1215, "train_runtime": 88918.5443, "train_tokens_per_second": 6212.19 }, { "epoch": 1.4721951588645372, "grad_norm": 0.24520625174045563, "learning_rate": 5e-06, "loss": 0.9503, "num_input_tokens_seen": 552847920, "step": 1216, "train_runtime": 88992.9488, "train_tokens_per_second": 6212.267 }, { "epoch": 1.4734059156821386, "grad_norm": 0.2534187436103821, "learning_rate": 5e-06, "loss": 0.9683, "num_input_tokens_seen": 553286272, "step": 1217, "train_runtime": 89061.8498, "train_tokens_per_second": 6212.382 }, { "epoch": 1.4746166724997398, "grad_norm": 0.2607842981815338, "learning_rate": 5e-06, "loss": 0.9375, "num_input_tokens_seen": 553730632, "step": 1218, "train_runtime": 89131.9721, "train_tokens_per_second": 6212.48 }, { "epoch": 1.4758274293173412, "grad_norm": 0.2503432333469391, "learning_rate": 5e-06, "loss": 0.9422, "num_input_tokens_seen": 554163400, "step": 1219, "train_runtime": 89200.3599, "train_tokens_per_second": 6212.569 }, { "epoch": 1.4770381861349426, "grad_norm": 0.27522653341293335, "learning_rate": 5e-06, "loss": 0.9007, "num_input_tokens_seen": 554616200, "step": 1220, "train_runtime": 89271.7622, "train_tokens_per_second": 6212.672 }, { "epoch": 1.478248942952544, "grad_norm": 0.29365551471710205, "learning_rate": 5e-06, "loss": 0.9819, "num_input_tokens_seen": 555069200, "step": 1221, "train_runtime": 89343.1896, "train_tokens_per_second": 6212.776 }, { "epoch": 1.4794596997701455, "grad_norm": 0.22803185880184174, "learning_rate": 5e-06, "loss": 0.929, "num_input_tokens_seen": 555522824, "step": 1222, "train_runtime": 89414.7038, "train_tokens_per_second": 6212.88 }, { "epoch": 1.4806704565877467, "grad_norm": 0.2833687663078308, "learning_rate": 5e-06, "loss": 0.9506, "num_input_tokens_seen": 555976920, "step": 1223, "train_runtime": 89486.7493, "train_tokens_per_second": 6212.952 }, { "epoch": 1.481881213405348, "grad_norm": 0.23040251433849335, "learning_rate": 5e-06, "loss": 0.9585, "num_input_tokens_seen": 556431480, "step": 1224, "train_runtime": 89564.565, "train_tokens_per_second": 6212.63 }, { "epoch": 1.4830919702229495, "grad_norm": 0.2419111281633377, "learning_rate": 5e-06, "loss": 0.9193, "num_input_tokens_seen": 556890152, "step": 1225, "train_runtime": 89648.8264, "train_tokens_per_second": 6211.907 }, { "epoch": 1.4843027270405509, "grad_norm": 0.29110512137413025, "learning_rate": 5e-06, "loss": 0.8697, "num_input_tokens_seen": 557355240, "step": 1226, "train_runtime": 89732.4654, "train_tokens_per_second": 6211.3 }, { "epoch": 1.4855134838581523, "grad_norm": 0.25912541151046753, "learning_rate": 5e-06, "loss": 0.9736, "num_input_tokens_seen": 557820032, "step": 1227, "train_runtime": 89810.0796, "train_tokens_per_second": 6211.107 }, { "epoch": 1.4867242406757537, "grad_norm": 0.29734542965888977, "learning_rate": 5e-06, "loss": 0.9779, "num_input_tokens_seen": 558247392, "step": 1228, "train_runtime": 89880.9268, "train_tokens_per_second": 6210.966 }, { "epoch": 1.4879349974933551, "grad_norm": 0.23052756488323212, "learning_rate": 5e-06, "loss": 0.9192, "num_input_tokens_seen": 558690584, "step": 1229, "train_runtime": 89956.4661, "train_tokens_per_second": 6210.677 }, { "epoch": 1.4891457543109565, "grad_norm": 0.24976183474063873, "learning_rate": 5e-06, "loss": 0.9726, "num_input_tokens_seen": 559158528, "step": 1230, "train_runtime": 90036.0752, "train_tokens_per_second": 6210.383 }, { "epoch": 1.4903565111285577, "grad_norm": 0.25929853320121765, "learning_rate": 5e-06, "loss": 0.8893, "num_input_tokens_seen": 559606536, "step": 1231, "train_runtime": 90111.8366, "train_tokens_per_second": 6210.134 }, { "epoch": 1.4915672679461591, "grad_norm": 0.2416425496339798, "learning_rate": 5e-06, "loss": 0.9223, "num_input_tokens_seen": 560047016, "step": 1232, "train_runtime": 90186.6064, "train_tokens_per_second": 6209.869 }, { "epoch": 1.4927780247637605, "grad_norm": 0.2509872019290924, "learning_rate": 5e-06, "loss": 0.9414, "num_input_tokens_seen": 560477352, "step": 1233, "train_runtime": 90259.5167, "train_tokens_per_second": 6209.621 }, { "epoch": 1.493988781581362, "grad_norm": 0.24654145538806915, "learning_rate": 5e-06, "loss": 0.9249, "num_input_tokens_seen": 560915928, "step": 1234, "train_runtime": 90333.8577, "train_tokens_per_second": 6209.365 }, { "epoch": 1.4951995383989634, "grad_norm": 0.2723659873008728, "learning_rate": 5e-06, "loss": 0.9631, "num_input_tokens_seen": 561326904, "step": 1235, "train_runtime": 90403.0323, "train_tokens_per_second": 6209.16 }, { "epoch": 1.4964102952165645, "grad_norm": 0.22693853080272675, "learning_rate": 5e-06, "loss": 0.9138, "num_input_tokens_seen": 561775144, "step": 1236, "train_runtime": 90478.8627, "train_tokens_per_second": 6208.91 }, { "epoch": 1.497621052034166, "grad_norm": 0.26430606842041016, "learning_rate": 5e-06, "loss": 0.9036, "num_input_tokens_seen": 562201328, "step": 1237, "train_runtime": 90551.0641, "train_tokens_per_second": 6208.666 }, { "epoch": 1.4988318088517674, "grad_norm": 0.24093542993068695, "learning_rate": 5e-06, "loss": 0.9329, "num_input_tokens_seen": 562665640, "step": 1238, "train_runtime": 90629.6091, "train_tokens_per_second": 6208.409 }, { "epoch": 1.5000425656693688, "grad_norm": 0.24133825302124023, "learning_rate": 5e-06, "loss": 0.9841, "num_input_tokens_seen": 563130560, "step": 1239, "train_runtime": 90707.3461, "train_tokens_per_second": 6208.213 }, { "epoch": 1.5012533224869702, "grad_norm": 0.23979146778583527, "learning_rate": 5e-06, "loss": 0.969, "num_input_tokens_seen": 563574224, "step": 1240, "train_runtime": 90780.5712, "train_tokens_per_second": 6208.093 }, { "epoch": 1.5024640793045716, "grad_norm": 0.2502334713935852, "learning_rate": 5e-06, "loss": 0.9544, "num_input_tokens_seen": 564005368, "step": 1241, "train_runtime": 90849.637, "train_tokens_per_second": 6208.119 }, { "epoch": 1.503674836122173, "grad_norm": 0.24188034236431122, "learning_rate": 5e-06, "loss": 0.9265, "num_input_tokens_seen": 564455488, "step": 1242, "train_runtime": 90922.8277, "train_tokens_per_second": 6208.072 }, { "epoch": 1.5048855929397744, "grad_norm": 0.2516622841358185, "learning_rate": 5e-06, "loss": 0.9798, "num_input_tokens_seen": 564908080, "step": 1243, "train_runtime": 90997.5049, "train_tokens_per_second": 6207.951 }, { "epoch": 1.5060963497573758, "grad_norm": 0.22442975640296936, "learning_rate": 5e-06, "loss": 0.9605, "num_input_tokens_seen": 565380080, "step": 1244, "train_runtime": 91075.2673, "train_tokens_per_second": 6207.833 }, { "epoch": 1.507307106574977, "grad_norm": 0.25572800636291504, "learning_rate": 5e-06, "loss": 1.0025, "num_input_tokens_seen": 565820720, "step": 1245, "train_runtime": 91148.0391, "train_tokens_per_second": 6207.711 }, { "epoch": 1.5085178633925784, "grad_norm": 0.24338506162166595, "learning_rate": 5e-06, "loss": 0.9596, "num_input_tokens_seen": 566254432, "step": 1246, "train_runtime": 91219.5681, "train_tokens_per_second": 6207.598 }, { "epoch": 1.5097286202101798, "grad_norm": 0.26078444719314575, "learning_rate": 5e-06, "loss": 0.9484, "num_input_tokens_seen": 566687608, "step": 1247, "train_runtime": 91291.6618, "train_tokens_per_second": 6207.441 }, { "epoch": 1.510939377027781, "grad_norm": 0.25328484177589417, "learning_rate": 5e-06, "loss": 0.9433, "num_input_tokens_seen": 567135480, "step": 1248, "train_runtime": 91365.8104, "train_tokens_per_second": 6207.305 }, { "epoch": 1.5121501338453824, "grad_norm": 0.2464897632598877, "learning_rate": 5e-06, "loss": 0.9207, "num_input_tokens_seen": 567570544, "step": 1249, "train_runtime": 91437.823, "train_tokens_per_second": 6207.175 }, { "epoch": 1.5133608906629838, "grad_norm": 0.232350155711174, "learning_rate": 5e-06, "loss": 0.9411, "num_input_tokens_seen": 568017064, "step": 1250, "train_runtime": 91511.6898, "train_tokens_per_second": 6207.044 }, { "epoch": 1.5145716474805853, "grad_norm": 0.22308504581451416, "learning_rate": 5e-06, "loss": 0.906, "num_input_tokens_seen": 568479648, "step": 1251, "train_runtime": 91588.8378, "train_tokens_per_second": 6206.866 }, { "epoch": 1.5157824042981867, "grad_norm": 0.23805969953536987, "learning_rate": 5e-06, "loss": 0.9744, "num_input_tokens_seen": 568934456, "step": 1252, "train_runtime": 91663.9685, "train_tokens_per_second": 6206.74 }, { "epoch": 1.516993161115788, "grad_norm": 0.2170308232307434, "learning_rate": 5e-06, "loss": 0.9288, "num_input_tokens_seen": 569397744, "step": 1253, "train_runtime": 91740.5472, "train_tokens_per_second": 6206.609 }, { "epoch": 1.5182039179333895, "grad_norm": 0.237321138381958, "learning_rate": 5e-06, "loss": 0.8996, "num_input_tokens_seen": 569848752, "step": 1254, "train_runtime": 91814.7504, "train_tokens_per_second": 6206.505 }, { "epoch": 1.519414674750991, "grad_norm": 0.25323814153671265, "learning_rate": 5e-06, "loss": 0.9405, "num_input_tokens_seen": 570280800, "step": 1255, "train_runtime": 91885.305, "train_tokens_per_second": 6206.442 }, { "epoch": 1.5206254315685923, "grad_norm": 0.24336665868759155, "learning_rate": 5e-06, "loss": 0.9383, "num_input_tokens_seen": 570733784, "step": 1256, "train_runtime": 91959.8375, "train_tokens_per_second": 6206.337 }, { "epoch": 1.5218361883861937, "grad_norm": 0.24592383205890656, "learning_rate": 5e-06, "loss": 0.9803, "num_input_tokens_seen": 571189672, "step": 1257, "train_runtime": 92034.0279, "train_tokens_per_second": 6206.288 }, { "epoch": 1.523046945203795, "grad_norm": 0.2351573407649994, "learning_rate": 5e-06, "loss": 0.9934, "num_input_tokens_seen": 571646112, "step": 1258, "train_runtime": 92108.3155, "train_tokens_per_second": 6206.238 }, { "epoch": 1.5242577020213963, "grad_norm": 0.25675877928733826, "learning_rate": 5e-06, "loss": 0.9665, "num_input_tokens_seen": 572084040, "step": 1259, "train_runtime": 92176.9792, "train_tokens_per_second": 6206.366 }, { "epoch": 1.5254684588389977, "grad_norm": 0.23532457649707794, "learning_rate": 5e-06, "loss": 0.9635, "num_input_tokens_seen": 572531232, "step": 1260, "train_runtime": 92244.6997, "train_tokens_per_second": 6206.657 }, { "epoch": 1.5266792156565991, "grad_norm": 0.23427313566207886, "learning_rate": 5e-06, "loss": 0.9456, "num_input_tokens_seen": 572985200, "step": 1261, "train_runtime": 92317.2757, "train_tokens_per_second": 6206.695 }, { "epoch": 1.5278899724742003, "grad_norm": 0.2370956540107727, "learning_rate": 5e-06, "loss": 0.9875, "num_input_tokens_seen": 573434448, "step": 1262, "train_runtime": 92388.1785, "train_tokens_per_second": 6206.795 }, { "epoch": 1.5291007292918017, "grad_norm": 0.2511068284511566, "learning_rate": 5e-06, "loss": 0.9408, "num_input_tokens_seen": 573888816, "step": 1263, "train_runtime": 92463.6049, "train_tokens_per_second": 6206.645 }, { "epoch": 1.5303114861094032, "grad_norm": 0.22451600432395935, "learning_rate": 5e-06, "loss": 0.9585, "num_input_tokens_seen": 574350424, "step": 1264, "train_runtime": 92541.1676, "train_tokens_per_second": 6206.432 }, { "epoch": 1.5315222429270046, "grad_norm": 0.23519355058670044, "learning_rate": 5e-06, "loss": 0.9484, "num_input_tokens_seen": 574821008, "step": 1265, "train_runtime": 92620.4674, "train_tokens_per_second": 6206.199 }, { "epoch": 1.532732999744606, "grad_norm": 0.2533230483531952, "learning_rate": 5e-06, "loss": 0.9393, "num_input_tokens_seen": 575257344, "step": 1266, "train_runtime": 92693.6836, "train_tokens_per_second": 6206.004 }, { "epoch": 1.5339437565622074, "grad_norm": 0.251905232667923, "learning_rate": 5e-06, "loss": 0.9898, "num_input_tokens_seen": 575695280, "step": 1267, "train_runtime": 92766.7232, "train_tokens_per_second": 6205.838 }, { "epoch": 1.5351545133798088, "grad_norm": 0.23301640152931213, "learning_rate": 5e-06, "loss": 0.949, "num_input_tokens_seen": 576144632, "step": 1268, "train_runtime": 92842.3959, "train_tokens_per_second": 6205.62 }, { "epoch": 1.5363652701974102, "grad_norm": 0.2319250851869583, "learning_rate": 5e-06, "loss": 0.9151, "num_input_tokens_seen": 576602952, "step": 1269, "train_runtime": 92917.6352, "train_tokens_per_second": 6205.528 }, { "epoch": 1.5375760270150116, "grad_norm": 0.23095951974391937, "learning_rate": 5e-06, "loss": 0.9466, "num_input_tokens_seen": 577064504, "step": 1270, "train_runtime": 92994.592, "train_tokens_per_second": 6205.356 }, { "epoch": 1.538786783832613, "grad_norm": 0.23852431774139404, "learning_rate": 5e-06, "loss": 0.9622, "num_input_tokens_seen": 577527016, "step": 1271, "train_runtime": 93072.0976, "train_tokens_per_second": 6205.157 }, { "epoch": 1.5399975406502142, "grad_norm": 0.22824853658676147, "learning_rate": 5e-06, "loss": 0.9495, "num_input_tokens_seen": 577987968, "step": 1272, "train_runtime": 93148.324, "train_tokens_per_second": 6205.028 }, { "epoch": 1.5412082974678156, "grad_norm": 0.23495082557201385, "learning_rate": 5e-06, "loss": 0.9997, "num_input_tokens_seen": 578430992, "step": 1273, "train_runtime": 93222.962, "train_tokens_per_second": 6204.812 }, { "epoch": 1.542419054285417, "grad_norm": 0.24541781842708588, "learning_rate": 5e-06, "loss": 0.9423, "num_input_tokens_seen": 578870056, "step": 1274, "train_runtime": 93296.3282, "train_tokens_per_second": 6204.639 }, { "epoch": 1.5436298111030182, "grad_norm": 0.24258604645729065, "learning_rate": 5e-06, "loss": 0.9238, "num_input_tokens_seen": 579339008, "step": 1275, "train_runtime": 93375.0592, "train_tokens_per_second": 6204.43 }, { "epoch": 1.5448405679206196, "grad_norm": 0.22991403937339783, "learning_rate": 5e-06, "loss": 0.9566, "num_input_tokens_seen": 579793544, "step": 1276, "train_runtime": 93450.4826, "train_tokens_per_second": 6204.286 }, { "epoch": 1.546051324738221, "grad_norm": 0.2381500005722046, "learning_rate": 5e-06, "loss": 0.9507, "num_input_tokens_seen": 580233824, "step": 1277, "train_runtime": 93524.278, "train_tokens_per_second": 6204.098 }, { "epoch": 1.5472620815558225, "grad_norm": 0.2665536105632782, "learning_rate": 5e-06, "loss": 0.9165, "num_input_tokens_seen": 580681840, "step": 1278, "train_runtime": 93598.8728, "train_tokens_per_second": 6203.941 }, { "epoch": 1.5484728383734239, "grad_norm": 0.25912097096443176, "learning_rate": 5e-06, "loss": 0.9159, "num_input_tokens_seen": 581134808, "step": 1279, "train_runtime": 93674.4264, "train_tokens_per_second": 6203.772 }, { "epoch": 1.5496835951910253, "grad_norm": 0.257059782743454, "learning_rate": 5e-06, "loss": 1.0082, "num_input_tokens_seen": 581578592, "step": 1280, "train_runtime": 93748.5874, "train_tokens_per_second": 6203.598 }, { "epoch": 1.5508943520086267, "grad_norm": 0.22761328518390656, "learning_rate": 5e-06, "loss": 0.918, "num_input_tokens_seen": 582043576, "step": 1281, "train_runtime": 93826.8246, "train_tokens_per_second": 6203.381 }, { "epoch": 1.552105108826228, "grad_norm": 0.23127709329128265, "learning_rate": 5e-06, "loss": 0.9535, "num_input_tokens_seen": 582491064, "step": 1282, "train_runtime": 93900.1492, "train_tokens_per_second": 6203.303 }, { "epoch": 1.5533158656438295, "grad_norm": 0.23334218561649323, "learning_rate": 5e-06, "loss": 0.9363, "num_input_tokens_seen": 582943336, "step": 1283, "train_runtime": 93971.7767, "train_tokens_per_second": 6203.387 }, { "epoch": 1.554526622461431, "grad_norm": 0.2526426613330841, "learning_rate": 5e-06, "loss": 0.9897, "num_input_tokens_seen": 583387120, "step": 1284, "train_runtime": 94045.0911, "train_tokens_per_second": 6203.27 }, { "epoch": 1.555737379279032, "grad_norm": 0.28767573833465576, "learning_rate": 5e-06, "loss": 0.9343, "num_input_tokens_seen": 583846136, "step": 1285, "train_runtime": 94125.4895, "train_tokens_per_second": 6202.848 }, { "epoch": 1.5569481360966335, "grad_norm": 0.22892381250858307, "learning_rate": 5e-06, "loss": 0.911, "num_input_tokens_seen": 584283936, "step": 1286, "train_runtime": 94197.0686, "train_tokens_per_second": 6202.783 }, { "epoch": 1.558158892914235, "grad_norm": 0.22896316647529602, "learning_rate": 5e-06, "loss": 0.9782, "num_input_tokens_seen": 584742536, "step": 1287, "train_runtime": 94271.4579, "train_tokens_per_second": 6202.753 }, { "epoch": 1.5593696497318363, "grad_norm": 0.2572176456451416, "learning_rate": 5e-06, "loss": 0.9574, "num_input_tokens_seen": 585188088, "step": 1288, "train_runtime": 94344.216, "train_tokens_per_second": 6202.692 }, { "epoch": 1.5605804065494375, "grad_norm": 0.23889631032943726, "learning_rate": 5e-06, "loss": 0.97, "num_input_tokens_seen": 585653328, "step": 1289, "train_runtime": 94418.8519, "train_tokens_per_second": 6202.716 }, { "epoch": 1.561791163367039, "grad_norm": 0.23102454841136932, "learning_rate": 5e-06, "loss": 0.9593, "num_input_tokens_seen": 586117272, "step": 1290, "train_runtime": 94493.2781, "train_tokens_per_second": 6202.74 }, { "epoch": 1.5630019201846403, "grad_norm": 0.2229638695716858, "learning_rate": 5e-06, "loss": 0.8971, "num_input_tokens_seen": 586549016, "step": 1291, "train_runtime": 94563.503, "train_tokens_per_second": 6202.7 }, { "epoch": 1.5642126770022418, "grad_norm": 0.258696585893631, "learning_rate": 5e-06, "loss": 0.9113, "num_input_tokens_seen": 587013368, "step": 1292, "train_runtime": 94641.4264, "train_tokens_per_second": 6202.499 }, { "epoch": 1.5654234338198432, "grad_norm": 0.23761804401874542, "learning_rate": 5e-06, "loss": 0.8907, "num_input_tokens_seen": 587491456, "step": 1293, "train_runtime": 94721.2593, "train_tokens_per_second": 6202.319 }, { "epoch": 1.5666341906374446, "grad_norm": 0.24647028744220734, "learning_rate": 5e-06, "loss": 0.9741, "num_input_tokens_seen": 587940688, "step": 1294, "train_runtime": 94794.5549, "train_tokens_per_second": 6202.262 }, { "epoch": 1.567844947455046, "grad_norm": 0.2338888794183731, "learning_rate": 5e-06, "loss": 0.9493, "num_input_tokens_seen": 588396192, "step": 1295, "train_runtime": 94867.988, "train_tokens_per_second": 6202.263 }, { "epoch": 1.5690557042726474, "grad_norm": 0.26478147506713867, "learning_rate": 5e-06, "loss": 0.9862, "num_input_tokens_seen": 588832744, "step": 1296, "train_runtime": 94938.6055, "train_tokens_per_second": 6202.248 }, { "epoch": 1.5702664610902488, "grad_norm": 0.23042112588882446, "learning_rate": 5e-06, "loss": 0.9421, "num_input_tokens_seen": 589284472, "step": 1297, "train_runtime": 95011.311, "train_tokens_per_second": 6202.256 }, { "epoch": 1.5714772179078502, "grad_norm": 0.2494785189628601, "learning_rate": 5e-06, "loss": 0.9467, "num_input_tokens_seen": 589747536, "step": 1298, "train_runtime": 95086.1322, "train_tokens_per_second": 6202.246 }, { "epoch": 1.5726879747254514, "grad_norm": 0.27761778235435486, "learning_rate": 5e-06, "loss": 0.9743, "num_input_tokens_seen": 590193016, "step": 1299, "train_runtime": 95158.7332, "train_tokens_per_second": 6202.195 }, { "epoch": 1.5738987315430528, "grad_norm": 0.2412542998790741, "learning_rate": 5e-06, "loss": 0.9015, "num_input_tokens_seen": 590671280, "step": 1300, "train_runtime": 95236.6476, "train_tokens_per_second": 6202.143 }, { "epoch": 1.5751094883606542, "grad_norm": 0.23688916862010956, "learning_rate": 5e-06, "loss": 0.9363, "num_input_tokens_seen": 591150592, "step": 1301, "train_runtime": 95315.0789, "train_tokens_per_second": 6202.068 }, { "epoch": 1.5763202451782554, "grad_norm": 0.2533585727214813, "learning_rate": 5e-06, "loss": 0.9026, "num_input_tokens_seen": 591598136, "step": 1302, "train_runtime": 95388.7088, "train_tokens_per_second": 6201.972 }, { "epoch": 1.5775310019958568, "grad_norm": 0.24774165451526642, "learning_rate": 5e-06, "loss": 0.9032, "num_input_tokens_seen": 592054200, "step": 1303, "train_runtime": 95461.419, "train_tokens_per_second": 6202.026 }, { "epoch": 1.5787417588134582, "grad_norm": 0.2428959310054779, "learning_rate": 5e-06, "loss": 0.926, "num_input_tokens_seen": 592513824, "step": 1304, "train_runtime": 95532.3576, "train_tokens_per_second": 6202.232 }, { "epoch": 1.5799525156310597, "grad_norm": 0.23670534789562225, "learning_rate": 5e-06, "loss": 0.9507, "num_input_tokens_seen": 592975696, "step": 1305, "train_runtime": 95604.7535, "train_tokens_per_second": 6202.366 }, { "epoch": 1.581163272448661, "grad_norm": 0.2287970781326294, "learning_rate": 5e-06, "loss": 0.881, "num_input_tokens_seen": 593419968, "step": 1306, "train_runtime": 95673.7974, "train_tokens_per_second": 6202.534 }, { "epoch": 1.5823740292662625, "grad_norm": 0.23797202110290527, "learning_rate": 5e-06, "loss": 0.9343, "num_input_tokens_seen": 593882816, "step": 1307, "train_runtime": 95745.9131, "train_tokens_per_second": 6202.696 }, { "epoch": 1.5835847860838639, "grad_norm": 0.24310339987277985, "learning_rate": 5e-06, "loss": 0.9403, "num_input_tokens_seen": 594341808, "step": 1308, "train_runtime": 95817.2622, "train_tokens_per_second": 6202.868 }, { "epoch": 1.5847955429014653, "grad_norm": 0.23264212906360626, "learning_rate": 5e-06, "loss": 0.9363, "num_input_tokens_seen": 594811616, "step": 1309, "train_runtime": 95890.2989, "train_tokens_per_second": 6203.043 }, { "epoch": 1.5860062997190667, "grad_norm": 0.2382027506828308, "learning_rate": 5e-06, "loss": 0.9259, "num_input_tokens_seen": 595259352, "step": 1310, "train_runtime": 95960.1069, "train_tokens_per_second": 6203.196 }, { "epoch": 1.5872170565366681, "grad_norm": 0.28391894698143005, "learning_rate": 5e-06, "loss": 0.9381, "num_input_tokens_seen": 595731272, "step": 1311, "train_runtime": 96034.005, "train_tokens_per_second": 6203.337 }, { "epoch": 1.5884278133542693, "grad_norm": 0.23430295288562775, "learning_rate": 5e-06, "loss": 0.9697, "num_input_tokens_seen": 596173736, "step": 1312, "train_runtime": 96102.8369, "train_tokens_per_second": 6203.498 }, { "epoch": 1.5896385701718707, "grad_norm": 0.22797590494155884, "learning_rate": 5e-06, "loss": 0.952, "num_input_tokens_seen": 596637800, "step": 1313, "train_runtime": 96174.9195, "train_tokens_per_second": 6203.674 }, { "epoch": 1.5908493269894721, "grad_norm": 0.23347218334674835, "learning_rate": 5e-06, "loss": 0.9375, "num_input_tokens_seen": 597079384, "step": 1314, "train_runtime": 96243.2882, "train_tokens_per_second": 6203.855 }, { "epoch": 1.5920600838070733, "grad_norm": 0.22693176567554474, "learning_rate": 5e-06, "loss": 0.9722, "num_input_tokens_seen": 597547256, "step": 1315, "train_runtime": 96315.9975, "train_tokens_per_second": 6204.029 }, { "epoch": 1.5932708406246747, "grad_norm": 0.2342706322669983, "learning_rate": 5e-06, "loss": 0.8936, "num_input_tokens_seen": 598005224, "step": 1316, "train_runtime": 96387.5398, "train_tokens_per_second": 6204.176 }, { "epoch": 1.5944815974422761, "grad_norm": 0.23413512110710144, "learning_rate": 5e-06, "loss": 0.9149, "num_input_tokens_seen": 598458528, "step": 1317, "train_runtime": 96458.3878, "train_tokens_per_second": 6204.318 }, { "epoch": 1.5956923542598775, "grad_norm": 0.2367754727602005, "learning_rate": 5e-06, "loss": 0.9854, "num_input_tokens_seen": 598919008, "step": 1318, "train_runtime": 96530.1929, "train_tokens_per_second": 6204.473 }, { "epoch": 1.596903111077479, "grad_norm": 0.23297631740570068, "learning_rate": 5e-06, "loss": 0.9285, "num_input_tokens_seen": 599369792, "step": 1319, "train_runtime": 96600.1615, "train_tokens_per_second": 6204.646 }, { "epoch": 1.5981138678950804, "grad_norm": 0.23420660197734833, "learning_rate": 5e-06, "loss": 0.9593, "num_input_tokens_seen": 599827040, "step": 1320, "train_runtime": 96672.0453, "train_tokens_per_second": 6204.762 }, { "epoch": 1.5993246247126818, "grad_norm": 0.2214992493391037, "learning_rate": 5e-06, "loss": 0.9336, "num_input_tokens_seen": 600298640, "step": 1321, "train_runtime": 96745.0125, "train_tokens_per_second": 6204.957 }, { "epoch": 1.6005353815302832, "grad_norm": 0.23480089008808136, "learning_rate": 5e-06, "loss": 0.8738, "num_input_tokens_seen": 600740872, "step": 1322, "train_runtime": 96813.8967, "train_tokens_per_second": 6205.11 }, { "epoch": 1.6017461383478846, "grad_norm": 0.2521512508392334, "learning_rate": 5e-06, "loss": 0.9435, "num_input_tokens_seen": 601197152, "step": 1323, "train_runtime": 96886.3997, "train_tokens_per_second": 6205.176 }, { "epoch": 1.602956895165486, "grad_norm": 0.23057833313941956, "learning_rate": 5e-06, "loss": 0.9161, "num_input_tokens_seen": 601645592, "step": 1324, "train_runtime": 96959.6364, "train_tokens_per_second": 6205.114 }, { "epoch": 1.6041676519830872, "grad_norm": 0.27399954199790955, "learning_rate": 5e-06, "loss": 0.9229, "num_input_tokens_seen": 602103456, "step": 1325, "train_runtime": 97034.3576, "train_tokens_per_second": 6205.054 }, { "epoch": 1.6053784088006886, "grad_norm": 0.2807023823261261, "learning_rate": 5e-06, "loss": 0.9729, "num_input_tokens_seen": 602557192, "step": 1326, "train_runtime": 97108.0688, "train_tokens_per_second": 6205.017 }, { "epoch": 1.60658916561829, "grad_norm": 0.24586202204227448, "learning_rate": 5e-06, "loss": 0.9555, "num_input_tokens_seen": 603006376, "step": 1327, "train_runtime": 97180.951, "train_tokens_per_second": 6204.985 }, { "epoch": 1.6077999224358914, "grad_norm": 0.23183618485927582, "learning_rate": 5e-06, "loss": 0.9219, "num_input_tokens_seen": 603457768, "step": 1328, "train_runtime": 97253.8889, "train_tokens_per_second": 6204.973 }, { "epoch": 1.6090106792534926, "grad_norm": 0.24499334394931793, "learning_rate": 5e-06, "loss": 0.9493, "num_input_tokens_seen": 603904864, "step": 1329, "train_runtime": 97325.8918, "train_tokens_per_second": 6204.976 }, { "epoch": 1.610221436071094, "grad_norm": 0.22572267055511475, "learning_rate": 5e-06, "loss": 0.9446, "num_input_tokens_seen": 604368296, "step": 1330, "train_runtime": 97400.8104, "train_tokens_per_second": 6204.962 }, { "epoch": 1.6114321928886954, "grad_norm": 0.24778367578983307, "learning_rate": 5e-06, "loss": 0.9465, "num_input_tokens_seen": 604816744, "step": 1331, "train_runtime": 97472.976, "train_tokens_per_second": 6204.968 }, { "epoch": 1.6126429497062968, "grad_norm": 0.23673632740974426, "learning_rate": 5e-06, "loss": 0.9609, "num_input_tokens_seen": 605278760, "step": 1332, "train_runtime": 97548.1753, "train_tokens_per_second": 6204.921 }, { "epoch": 1.6138537065238983, "grad_norm": 0.24265213310718536, "learning_rate": 5e-06, "loss": 1.009, "num_input_tokens_seen": 605740240, "step": 1333, "train_runtime": 97623.1268, "train_tokens_per_second": 6204.885 }, { "epoch": 1.6150644633414997, "grad_norm": 0.2499813735485077, "learning_rate": 5e-06, "loss": 0.906, "num_input_tokens_seen": 606188040, "step": 1334, "train_runtime": 97695.6817, "train_tokens_per_second": 6204.86 }, { "epoch": 1.616275220159101, "grad_norm": 0.23881113529205322, "learning_rate": 5e-06, "loss": 0.9569, "num_input_tokens_seen": 606636736, "step": 1335, "train_runtime": 97768.0449, "train_tokens_per_second": 6204.857 }, { "epoch": 1.6174859769767025, "grad_norm": 0.23513104021549225, "learning_rate": 5e-06, "loss": 0.9537, "num_input_tokens_seen": 607085392, "step": 1336, "train_runtime": 97840.6583, "train_tokens_per_second": 6204.838 }, { "epoch": 1.618696733794304, "grad_norm": 0.21942594647407532, "learning_rate": 5e-06, "loss": 0.9643, "num_input_tokens_seen": 607566272, "step": 1337, "train_runtime": 97917.8238, "train_tokens_per_second": 6204.859 }, { "epoch": 1.6199074906119053, "grad_norm": 0.24452783167362213, "learning_rate": 5e-06, "loss": 0.9954, "num_input_tokens_seen": 608026592, "step": 1338, "train_runtime": 97992.107, "train_tokens_per_second": 6204.853 }, { "epoch": 1.6211182474295065, "grad_norm": 0.2625705897808075, "learning_rate": 5e-06, "loss": 0.8922, "num_input_tokens_seen": 608489800, "step": 1339, "train_runtime": 98064.8771, "train_tokens_per_second": 6204.972 }, { "epoch": 1.622329004247108, "grad_norm": 0.23123782873153687, "learning_rate": 5e-06, "loss": 0.9478, "num_input_tokens_seen": 608948792, "step": 1340, "train_runtime": 98137.5426, "train_tokens_per_second": 6205.054 }, { "epoch": 1.6235397610647093, "grad_norm": 0.2373858541250229, "learning_rate": 5e-06, "loss": 0.9555, "num_input_tokens_seen": 609387720, "step": 1341, "train_runtime": 98209.4854, "train_tokens_per_second": 6204.978 }, { "epoch": 1.6247505178823105, "grad_norm": 0.26772409677505493, "learning_rate": 5e-06, "loss": 0.9705, "num_input_tokens_seen": 609835256, "step": 1342, "train_runtime": 98283.6431, "train_tokens_per_second": 6204.85 }, { "epoch": 1.625961274699912, "grad_norm": 0.26004475355148315, "learning_rate": 5e-06, "loss": 0.9613, "num_input_tokens_seen": 610280952, "step": 1343, "train_runtime": 98357.4587, "train_tokens_per_second": 6204.725 }, { "epoch": 1.6271720315175133, "grad_norm": 0.24032413959503174, "learning_rate": 5e-06, "loss": 0.9171, "num_input_tokens_seen": 610736608, "step": 1344, "train_runtime": 98432.191, "train_tokens_per_second": 6204.643 }, { "epoch": 1.6283827883351147, "grad_norm": 0.24109645187854767, "learning_rate": 5e-06, "loss": 0.9083, "num_input_tokens_seen": 611195912, "step": 1345, "train_runtime": 98507.5175, "train_tokens_per_second": 6204.561 }, { "epoch": 1.6295935451527161, "grad_norm": 0.23913376033306122, "learning_rate": 5e-06, "loss": 0.9131, "num_input_tokens_seen": 611621864, "step": 1346, "train_runtime": 98575.6226, "train_tokens_per_second": 6204.595 }, { "epoch": 1.6308043019703176, "grad_norm": 0.23697420954704285, "learning_rate": 5e-06, "loss": 0.9913, "num_input_tokens_seen": 612066184, "step": 1347, "train_runtime": 98646.9799, "train_tokens_per_second": 6204.611 }, { "epoch": 1.632015058787919, "grad_norm": 0.23569026589393616, "learning_rate": 5e-06, "loss": 0.9095, "num_input_tokens_seen": 612525336, "step": 1348, "train_runtime": 98721.3967, "train_tokens_per_second": 6204.585 }, { "epoch": 1.6332258156055204, "grad_norm": 0.25485959649086, "learning_rate": 5e-06, "loss": 0.9833, "num_input_tokens_seen": 612975160, "step": 1349, "train_runtime": 98800.9432, "train_tokens_per_second": 6204.143 }, { "epoch": 1.6344365724231218, "grad_norm": 0.2503267228603363, "learning_rate": 5e-06, "loss": 0.939, "num_input_tokens_seen": 613459824, "step": 1350, "train_runtime": 98884.1899, "train_tokens_per_second": 6203.821 }, { "epoch": 1.6356473292407232, "grad_norm": 0.23752045631408691, "learning_rate": 5e-06, "loss": 0.91, "num_input_tokens_seen": 613928136, "step": 1351, "train_runtime": 98962.7457, "train_tokens_per_second": 6203.629 }, { "epoch": 1.6368580860583244, "grad_norm": 0.23110365867614746, "learning_rate": 5e-06, "loss": 0.9469, "num_input_tokens_seen": 614374960, "step": 1352, "train_runtime": 99036.341, "train_tokens_per_second": 6203.53 }, { "epoch": 1.6380688428759258, "grad_norm": 0.24777059257030487, "learning_rate": 5e-06, "loss": 0.9272, "num_input_tokens_seen": 614823416, "step": 1353, "train_runtime": 99110.4637, "train_tokens_per_second": 6203.416 }, { "epoch": 1.6392795996935272, "grad_norm": 0.23056265711784363, "learning_rate": 5e-06, "loss": 0.9341, "num_input_tokens_seen": 615269264, "step": 1354, "train_runtime": 99177.7333, "train_tokens_per_second": 6203.704 }, { "epoch": 1.6404903565111284, "grad_norm": 0.24137234687805176, "learning_rate": 5e-06, "loss": 0.9345, "num_input_tokens_seen": 615705496, "step": 1355, "train_runtime": 99243.9752, "train_tokens_per_second": 6203.958 }, { "epoch": 1.6417011133287298, "grad_norm": 0.25345325469970703, "learning_rate": 5e-06, "loss": 0.9613, "num_input_tokens_seen": 616144368, "step": 1356, "train_runtime": 99309.7621, "train_tokens_per_second": 6204.268 }, { "epoch": 1.6429118701463312, "grad_norm": 0.24765293300151825, "learning_rate": 5e-06, "loss": 0.9674, "num_input_tokens_seen": 616584824, "step": 1357, "train_runtime": 99376.1136, "train_tokens_per_second": 6204.558 }, { "epoch": 1.6441226269639326, "grad_norm": 0.2568323612213135, "learning_rate": 5e-06, "loss": 0.9893, "num_input_tokens_seen": 617024680, "step": 1358, "train_runtime": 99443.2364, "train_tokens_per_second": 6204.793 }, { "epoch": 1.645333383781534, "grad_norm": 0.25249186158180237, "learning_rate": 5e-06, "loss": 0.9166, "num_input_tokens_seen": 617471792, "step": 1359, "train_runtime": 99510.1824, "train_tokens_per_second": 6205.112 }, { "epoch": 1.6465441405991355, "grad_norm": 0.2438102513551712, "learning_rate": 5e-06, "loss": 0.9972, "num_input_tokens_seen": 617917968, "step": 1360, "train_runtime": 99576.8247, "train_tokens_per_second": 6205.44 }, { "epoch": 1.6477548974167369, "grad_norm": 0.26252567768096924, "learning_rate": 5e-06, "loss": 0.9144, "num_input_tokens_seen": 618372880, "step": 1361, "train_runtime": 99646.255, "train_tokens_per_second": 6205.681 }, { "epoch": 1.6489656542343383, "grad_norm": 0.2307174950838089, "learning_rate": 5e-06, "loss": 0.9693, "num_input_tokens_seen": 618825688, "step": 1362, "train_runtime": 99721.3775, "train_tokens_per_second": 6205.547 }, { "epoch": 1.6501764110519397, "grad_norm": 0.23613497614860535, "learning_rate": 5e-06, "loss": 0.9993, "num_input_tokens_seen": 619266920, "step": 1363, "train_runtime": 99794.2824, "train_tokens_per_second": 6205.435 }, { "epoch": 1.651387167869541, "grad_norm": 0.2175440788269043, "learning_rate": 5e-06, "loss": 0.8754, "num_input_tokens_seen": 619744360, "step": 1364, "train_runtime": 99875.7248, "train_tokens_per_second": 6205.155 }, { "epoch": 1.6525979246871423, "grad_norm": 0.2543286681175232, "learning_rate": 5e-06, "loss": 1.0312, "num_input_tokens_seen": 620200248, "step": 1365, "train_runtime": 99953.0201, "train_tokens_per_second": 6204.918 }, { "epoch": 1.6538086815047437, "grad_norm": 0.23493343591690063, "learning_rate": 5e-06, "loss": 0.969, "num_input_tokens_seen": 620661640, "step": 1366, "train_runtime": 100028.4666, "train_tokens_per_second": 6204.85 }, { "epoch": 1.655019438322345, "grad_norm": 0.24430783092975616, "learning_rate": 5e-06, "loss": 0.9651, "num_input_tokens_seen": 621103232, "step": 1367, "train_runtime": 100099.1989, "train_tokens_per_second": 6204.877 }, { "epoch": 1.6562301951399465, "grad_norm": 0.24038319289684296, "learning_rate": 5e-06, "loss": 0.9906, "num_input_tokens_seen": 621550880, "step": 1368, "train_runtime": 100170.4126, "train_tokens_per_second": 6204.935 }, { "epoch": 1.6574409519575477, "grad_norm": 0.23617248237133026, "learning_rate": 5e-06, "loss": 0.943, "num_input_tokens_seen": 621993640, "step": 1369, "train_runtime": 100240.8957, "train_tokens_per_second": 6204.989 }, { "epoch": 1.6586517087751491, "grad_norm": 0.24460504949092865, "learning_rate": 5e-06, "loss": 0.9263, "num_input_tokens_seen": 622454504, "step": 1370, "train_runtime": 100316.4623, "train_tokens_per_second": 6204.909 }, { "epoch": 1.6598624655927505, "grad_norm": 0.24925386905670166, "learning_rate": 5e-06, "loss": 0.9425, "num_input_tokens_seen": 622893320, "step": 1371, "train_runtime": 100387.8797, "train_tokens_per_second": 6204.866 }, { "epoch": 1.661073222410352, "grad_norm": 0.2371699959039688, "learning_rate": 5e-06, "loss": 0.9302, "num_input_tokens_seen": 623339984, "step": 1372, "train_runtime": 100461.2817, "train_tokens_per_second": 6204.778 }, { "epoch": 1.6622839792279533, "grad_norm": 0.24603520333766937, "learning_rate": 5e-06, "loss": 0.9302, "num_input_tokens_seen": 623800888, "step": 1373, "train_runtime": 100537.6306, "train_tokens_per_second": 6204.651 }, { "epoch": 1.6634947360455548, "grad_norm": 0.23240868747234344, "learning_rate": 5e-06, "loss": 0.9293, "num_input_tokens_seen": 624248528, "step": 1374, "train_runtime": 100611.4073, "train_tokens_per_second": 6204.55 }, { "epoch": 1.6647054928631562, "grad_norm": 0.2440965324640274, "learning_rate": 5e-06, "loss": 0.9818, "num_input_tokens_seen": 624694936, "step": 1375, "train_runtime": 100684.7425, "train_tokens_per_second": 6204.465 }, { "epoch": 1.6659162496807576, "grad_norm": 0.24966219067573547, "learning_rate": 5e-06, "loss": 0.9338, "num_input_tokens_seen": 625155168, "step": 1376, "train_runtime": 100770.2757, "train_tokens_per_second": 6203.766 }, { "epoch": 1.667127006498359, "grad_norm": 0.24221491813659668, "learning_rate": 5e-06, "loss": 0.9025, "num_input_tokens_seen": 625605080, "step": 1377, "train_runtime": 100845.271, "train_tokens_per_second": 6203.613 }, { "epoch": 1.6683377633159604, "grad_norm": 0.23787087202072144, "learning_rate": 5e-06, "loss": 0.9104, "num_input_tokens_seen": 626052144, "step": 1378, "train_runtime": 100919.6785, "train_tokens_per_second": 6203.469 }, { "epoch": 1.6695485201335616, "grad_norm": 0.24397063255310059, "learning_rate": 5e-06, "loss": 0.9426, "num_input_tokens_seen": 626534576, "step": 1379, "train_runtime": 100999.7059, "train_tokens_per_second": 6203.331 }, { "epoch": 1.670759276951163, "grad_norm": 0.30004844069480896, "learning_rate": 5e-06, "loss": 0.9302, "num_input_tokens_seen": 626990528, "step": 1380, "train_runtime": 101077.577, "train_tokens_per_second": 6203.063 }, { "epoch": 1.6719700337687644, "grad_norm": 0.23161612451076508, "learning_rate": 5e-06, "loss": 0.9117, "num_input_tokens_seen": 627444888, "step": 1381, "train_runtime": 101154.9535, "train_tokens_per_second": 6202.809 }, { "epoch": 1.6731807905863656, "grad_norm": 0.29034850001335144, "learning_rate": 5e-06, "loss": 0.9122, "num_input_tokens_seen": 627887600, "step": 1382, "train_runtime": 101230.0349, "train_tokens_per_second": 6202.582 }, { "epoch": 1.674391547403967, "grad_norm": 0.23793677985668182, "learning_rate": 5e-06, "loss": 0.9402, "num_input_tokens_seen": 628335968, "step": 1383, "train_runtime": 101305.9047, "train_tokens_per_second": 6202.363 }, { "epoch": 1.6756023042215684, "grad_norm": 0.24347274005413055, "learning_rate": 5e-06, "loss": 0.9712, "num_input_tokens_seen": 628770224, "step": 1384, "train_runtime": 101369.8627, "train_tokens_per_second": 6202.733 }, { "epoch": 1.6768130610391698, "grad_norm": 0.26189595460891724, "learning_rate": 5e-06, "loss": 0.8935, "num_input_tokens_seen": 629217368, "step": 1385, "train_runtime": 101436.0596, "train_tokens_per_second": 6203.094 }, { "epoch": 1.6780238178567712, "grad_norm": 0.28286808729171753, "learning_rate": 5e-06, "loss": 0.9675, "num_input_tokens_seen": 629651968, "step": 1386, "train_runtime": 101500.3426, "train_tokens_per_second": 6203.447 }, { "epoch": 1.6792345746743726, "grad_norm": 0.24720792472362518, "learning_rate": 5e-06, "loss": 0.9567, "num_input_tokens_seen": 630106888, "step": 1387, "train_runtime": 101567.8488, "train_tokens_per_second": 6203.803 }, { "epoch": 1.680445331491974, "grad_norm": 0.2625053822994232, "learning_rate": 5e-06, "loss": 0.9209, "num_input_tokens_seen": 630562680, "step": 1388, "train_runtime": 101638.1718, "train_tokens_per_second": 6203.995 }, { "epoch": 1.6816560883095755, "grad_norm": 0.26049408316612244, "learning_rate": 5e-06, "loss": 1.0043, "num_input_tokens_seen": 631017728, "step": 1389, "train_runtime": 101711.0478, "train_tokens_per_second": 6204.023 }, { "epoch": 1.6828668451271769, "grad_norm": 0.24920783936977386, "learning_rate": 5e-06, "loss": 0.9278, "num_input_tokens_seen": 631473184, "step": 1390, "train_runtime": 101784.6458, "train_tokens_per_second": 6204.012 }, { "epoch": 1.6840776019447783, "grad_norm": 0.24204052984714508, "learning_rate": 5e-06, "loss": 0.8937, "num_input_tokens_seen": 631926200, "step": 1391, "train_runtime": 101857.4628, "train_tokens_per_second": 6204.025 }, { "epoch": 1.6852883587623795, "grad_norm": 0.27543655037879944, "learning_rate": 5e-06, "loss": 0.9788, "num_input_tokens_seen": 632376624, "step": 1392, "train_runtime": 101931.1872, "train_tokens_per_second": 6203.956 }, { "epoch": 1.686499115579981, "grad_norm": 0.24152293801307678, "learning_rate": 5e-06, "loss": 0.9632, "num_input_tokens_seen": 632826048, "step": 1393, "train_runtime": 102004.9311, "train_tokens_per_second": 6203.877 }, { "epoch": 1.6877098723975823, "grad_norm": 0.24093790352344513, "learning_rate": 5e-06, "loss": 0.9022, "num_input_tokens_seen": 633294632, "step": 1394, "train_runtime": 102083.4824, "train_tokens_per_second": 6203.693 }, { "epoch": 1.6889206292151835, "grad_norm": 0.23147398233413696, "learning_rate": 5e-06, "loss": 0.9156, "num_input_tokens_seen": 633761776, "step": 1395, "train_runtime": 102161.3889, "train_tokens_per_second": 6203.535 }, { "epoch": 1.690131386032785, "grad_norm": 0.23987317085266113, "learning_rate": 5e-06, "loss": 0.9063, "num_input_tokens_seen": 634226504, "step": 1396, "train_runtime": 102238.2689, "train_tokens_per_second": 6203.416 }, { "epoch": 1.6913421428503863, "grad_norm": 0.25991290807724, "learning_rate": 5e-06, "loss": 0.9713, "num_input_tokens_seen": 634673344, "step": 1397, "train_runtime": 102309.0901, "train_tokens_per_second": 6203.489 }, { "epoch": 1.6925528996679877, "grad_norm": 0.23085501790046692, "learning_rate": 5e-06, "loss": 0.8837, "num_input_tokens_seen": 635118792, "step": 1398, "train_runtime": 102383.7557, "train_tokens_per_second": 6203.316 }, { "epoch": 1.6937636564855891, "grad_norm": 0.2376517653465271, "learning_rate": 5e-06, "loss": 0.9406, "num_input_tokens_seen": 635574264, "step": 1399, "train_runtime": 102459.5006, "train_tokens_per_second": 6203.176 }, { "epoch": 1.6949744133031905, "grad_norm": 0.25487491488456726, "learning_rate": 5e-06, "loss": 0.9089, "num_input_tokens_seen": 636022216, "step": 1400, "train_runtime": 102534.4322, "train_tokens_per_second": 6203.011 }, { "epoch": 1.696185170120792, "grad_norm": 0.2450874000787735, "learning_rate": 5e-06, "loss": 0.9227, "num_input_tokens_seen": 636462544, "step": 1401, "train_runtime": 102607.6137, "train_tokens_per_second": 6202.878 }, { "epoch": 1.6973959269383934, "grad_norm": 0.24227704107761383, "learning_rate": 5e-06, "loss": 0.9495, "num_input_tokens_seen": 636916136, "step": 1402, "train_runtime": 102683.0063, "train_tokens_per_second": 6202.741 }, { "epoch": 1.6986066837559948, "grad_norm": 0.24646668136119843, "learning_rate": 5e-06, "loss": 0.9612, "num_input_tokens_seen": 637349120, "step": 1403, "train_runtime": 102754.4934, "train_tokens_per_second": 6202.64 }, { "epoch": 1.6998174405735962, "grad_norm": 0.23192055523395538, "learning_rate": 5e-06, "loss": 0.9195, "num_input_tokens_seen": 637790064, "step": 1404, "train_runtime": 102827.9317, "train_tokens_per_second": 6202.498 }, { "epoch": 1.7010281973911974, "grad_norm": 0.25445666909217834, "learning_rate": 5e-06, "loss": 0.9757, "num_input_tokens_seen": 638259320, "step": 1405, "train_runtime": 102906.6087, "train_tokens_per_second": 6202.316 }, { "epoch": 1.7022389542087988, "grad_norm": 0.23562908172607422, "learning_rate": 5e-06, "loss": 0.9021, "num_input_tokens_seen": 638743376, "step": 1406, "train_runtime": 102987.9705, "train_tokens_per_second": 6202.116 }, { "epoch": 1.7034497110264002, "grad_norm": 0.26519039273262024, "learning_rate": 5e-06, "loss": 0.943, "num_input_tokens_seen": 639190488, "step": 1407, "train_runtime": 103062.2706, "train_tokens_per_second": 6201.983 }, { "epoch": 1.7046604678440016, "grad_norm": 0.24398094415664673, "learning_rate": 5e-06, "loss": 0.9283, "num_input_tokens_seen": 639643104, "step": 1408, "train_runtime": 103137.9381, "train_tokens_per_second": 6201.822 }, { "epoch": 1.7058712246616028, "grad_norm": 0.2703668475151062, "learning_rate": 5e-06, "loss": 0.9343, "num_input_tokens_seen": 640089040, "step": 1409, "train_runtime": 103211.4765, "train_tokens_per_second": 6201.724 }, { "epoch": 1.7070819814792042, "grad_norm": 0.2557445168495178, "learning_rate": 5e-06, "loss": 0.9697, "num_input_tokens_seen": 640528208, "step": 1410, "train_runtime": 103284.178, "train_tokens_per_second": 6201.61 }, { "epoch": 1.7082927382968056, "grad_norm": 0.2544682025909424, "learning_rate": 5e-06, "loss": 0.9409, "num_input_tokens_seen": 640986416, "step": 1411, "train_runtime": 103359.9486, "train_tokens_per_second": 6201.497 }, { "epoch": 1.709503495114407, "grad_norm": 0.25841024518013, "learning_rate": 5e-06, "loss": 0.9823, "num_input_tokens_seen": 641422048, "step": 1412, "train_runtime": 103430.0233, "train_tokens_per_second": 6201.507 }, { "epoch": 1.7107142519320084, "grad_norm": 0.23430770635604858, "learning_rate": 5e-06, "loss": 0.9422, "num_input_tokens_seen": 641890512, "step": 1413, "train_runtime": 103504.2794, "train_tokens_per_second": 6201.584 }, { "epoch": 1.7119250087496098, "grad_norm": 0.25403422117233276, "learning_rate": 5e-06, "loss": 0.9378, "num_input_tokens_seen": 642351648, "step": 1414, "train_runtime": 103577.2489, "train_tokens_per_second": 6201.667 }, { "epoch": 1.7131357655672113, "grad_norm": 0.22510449588298798, "learning_rate": 5e-06, "loss": 0.9182, "num_input_tokens_seen": 642814728, "step": 1415, "train_runtime": 103650.3402, "train_tokens_per_second": 6201.762 }, { "epoch": 1.7143465223848127, "grad_norm": 0.24203039705753326, "learning_rate": 5e-06, "loss": 0.9479, "num_input_tokens_seen": 643279672, "step": 1416, "train_runtime": 103723.7474, "train_tokens_per_second": 6201.855 }, { "epoch": 1.715557279202414, "grad_norm": 0.3101445436477661, "learning_rate": 5e-06, "loss": 0.9404, "num_input_tokens_seen": 643718560, "step": 1417, "train_runtime": 103792.3388, "train_tokens_per_second": 6201.985 }, { "epoch": 1.7167680360200155, "grad_norm": 0.23789572715759277, "learning_rate": 5e-06, "loss": 0.9216, "num_input_tokens_seen": 644158072, "step": 1418, "train_runtime": 103864.8684, "train_tokens_per_second": 6201.886 }, { "epoch": 1.7179787928376167, "grad_norm": 0.22444191575050354, "learning_rate": 5e-06, "loss": 0.887, "num_input_tokens_seen": 644640400, "step": 1419, "train_runtime": 103943.7191, "train_tokens_per_second": 6201.822 }, { "epoch": 1.719189549655218, "grad_norm": 0.24372327327728271, "learning_rate": 5e-06, "loss": 1.0014, "num_input_tokens_seen": 645098072, "step": 1420, "train_runtime": 104019.5509, "train_tokens_per_second": 6201.7 }, { "epoch": 1.7204003064728195, "grad_norm": 0.2408047616481781, "learning_rate": 5e-06, "loss": 0.9455, "num_input_tokens_seen": 645547368, "step": 1421, "train_runtime": 104094.8084, "train_tokens_per_second": 6201.533 }, { "epoch": 1.7216110632904207, "grad_norm": 0.23340767621994019, "learning_rate": 5e-06, "loss": 0.9614, "num_input_tokens_seen": 646015000, "step": 1422, "train_runtime": 104173.3646, "train_tokens_per_second": 6201.345 }, { "epoch": 1.722821820108022, "grad_norm": 0.24374446272850037, "learning_rate": 5e-06, "loss": 0.955, "num_input_tokens_seen": 646457448, "step": 1423, "train_runtime": 104243.9382, "train_tokens_per_second": 6201.391 }, { "epoch": 1.7240325769256235, "grad_norm": 0.2410658746957779, "learning_rate": 5e-06, "loss": 0.9503, "num_input_tokens_seen": 646928080, "step": 1424, "train_runtime": 104319.3375, "train_tokens_per_second": 6201.421 }, { "epoch": 1.725243333743225, "grad_norm": 0.26561877131462097, "learning_rate": 5e-06, "loss": 0.9843, "num_input_tokens_seen": 647373424, "step": 1425, "train_runtime": 104390.2762, "train_tokens_per_second": 6201.472 }, { "epoch": 1.7264540905608263, "grad_norm": 0.24555157124996185, "learning_rate": 5e-06, "loss": 0.9066, "num_input_tokens_seen": 647839064, "step": 1426, "train_runtime": 104465.7848, "train_tokens_per_second": 6201.447 }, { "epoch": 1.7276648473784277, "grad_norm": 0.26610177755355835, "learning_rate": 5e-06, "loss": 0.9413, "num_input_tokens_seen": 648316536, "step": 1427, "train_runtime": 104544.7937, "train_tokens_per_second": 6201.328 }, { "epoch": 1.7288756041960291, "grad_norm": 0.23927830159664154, "learning_rate": 5e-06, "loss": 0.9217, "num_input_tokens_seen": 648750792, "step": 1428, "train_runtime": 104616.4109, "train_tokens_per_second": 6201.234 }, { "epoch": 1.7300863610136306, "grad_norm": 0.2528975009918213, "learning_rate": 5e-06, "loss": 1.0172, "num_input_tokens_seen": 649211056, "step": 1429, "train_runtime": 104691.6504, "train_tokens_per_second": 6201.173 }, { "epoch": 1.731297117831232, "grad_norm": 0.24375270307064056, "learning_rate": 5e-06, "loss": 0.9071, "num_input_tokens_seen": 649664160, "step": 1430, "train_runtime": 104761.4227, "train_tokens_per_second": 6201.368 }, { "epoch": 1.7325078746488334, "grad_norm": 0.2401747703552246, "learning_rate": 5e-06, "loss": 0.9436, "num_input_tokens_seen": 650120632, "step": 1431, "train_runtime": 104830.7549, "train_tokens_per_second": 6201.621 }, { "epoch": 1.7337186314664346, "grad_norm": 0.2560153901576996, "learning_rate": 5e-06, "loss": 0.9562, "num_input_tokens_seen": 650585600, "step": 1432, "train_runtime": 104901.1048, "train_tokens_per_second": 6201.895 }, { "epoch": 1.734929388284036, "grad_norm": 0.22828106582164764, "learning_rate": 5e-06, "loss": 0.9867, "num_input_tokens_seen": 651054272, "step": 1433, "train_runtime": 104971.5832, "train_tokens_per_second": 6202.195 }, { "epoch": 1.7361401451016374, "grad_norm": 0.2554665207862854, "learning_rate": 5e-06, "loss": 0.9303, "num_input_tokens_seen": 651531888, "step": 1434, "train_runtime": 105043.735, "train_tokens_per_second": 6202.482 }, { "epoch": 1.7373509019192386, "grad_norm": 0.23532572388648987, "learning_rate": 5e-06, "loss": 0.9333, "num_input_tokens_seen": 651989392, "step": 1435, "train_runtime": 105112.6174, "train_tokens_per_second": 6202.77 }, { "epoch": 1.73856165873684, "grad_norm": 0.22667627036571503, "learning_rate": 5e-06, "loss": 0.8893, "num_input_tokens_seen": 652450416, "step": 1436, "train_runtime": 105181.94, "train_tokens_per_second": 6203.065 }, { "epoch": 1.7397724155544414, "grad_norm": 0.2507862150669098, "learning_rate": 5e-06, "loss": 0.966, "num_input_tokens_seen": 652888200, "step": 1437, "train_runtime": 105248.017, "train_tokens_per_second": 6203.33 }, { "epoch": 1.7409831723720428, "grad_norm": 0.23755121231079102, "learning_rate": 5e-06, "loss": 0.9416, "num_input_tokens_seen": 653353944, "step": 1438, "train_runtime": 105318.4006, "train_tokens_per_second": 6203.607 }, { "epoch": 1.7421939291896442, "grad_norm": 0.25960105657577515, "learning_rate": 5e-06, "loss": 0.9615, "num_input_tokens_seen": 653799776, "step": 1439, "train_runtime": 105385.6269, "train_tokens_per_second": 6203.88 }, { "epoch": 1.7434046860072456, "grad_norm": 0.2501721680164337, "learning_rate": 5e-06, "loss": 0.961, "num_input_tokens_seen": 654277144, "step": 1440, "train_runtime": 105457.5792, "train_tokens_per_second": 6204.174 }, { "epoch": 1.744615442824847, "grad_norm": 0.26541006565093994, "learning_rate": 5e-06, "loss": 0.9524, "num_input_tokens_seen": 654704176, "step": 1441, "train_runtime": 105521.9793, "train_tokens_per_second": 6204.434 }, { "epoch": 1.7458261996424485, "grad_norm": 0.2498820275068283, "learning_rate": 5e-06, "loss": 0.9399, "num_input_tokens_seen": 655154280, "step": 1442, "train_runtime": 105589.5581, "train_tokens_per_second": 6204.726 }, { "epoch": 1.7470369564600499, "grad_norm": 0.2539311647415161, "learning_rate": 5e-06, "loss": 1.0376, "num_input_tokens_seen": 655598176, "step": 1443, "train_runtime": 105656.176, "train_tokens_per_second": 6205.015 }, { "epoch": 1.7482477132776513, "grad_norm": 0.2521834969520569, "learning_rate": 5e-06, "loss": 0.9695, "num_input_tokens_seen": 656049168, "step": 1444, "train_runtime": 105724.1835, "train_tokens_per_second": 6205.29 }, { "epoch": 1.7494584700952525, "grad_norm": 0.2886483073234558, "learning_rate": 5e-06, "loss": 0.9608, "num_input_tokens_seen": 656492336, "step": 1445, "train_runtime": 105790.6709, "train_tokens_per_second": 6205.579 }, { "epoch": 1.7506692269128539, "grad_norm": 0.2557690143585205, "learning_rate": 5e-06, "loss": 0.9915, "num_input_tokens_seen": 656921264, "step": 1446, "train_runtime": 105855.2716, "train_tokens_per_second": 6205.844 }, { "epoch": 1.7518799837304553, "grad_norm": 0.23341627418994904, "learning_rate": 5e-06, "loss": 0.956, "num_input_tokens_seen": 657390344, "step": 1447, "train_runtime": 105925.7109, "train_tokens_per_second": 6206.145 }, { "epoch": 1.7530907405480567, "grad_norm": 0.23532052338123322, "learning_rate": 5e-06, "loss": 0.9079, "num_input_tokens_seen": 657845536, "step": 1448, "train_runtime": 105994.3759, "train_tokens_per_second": 6206.419 }, { "epoch": 1.7543014973656579, "grad_norm": 0.2501102089881897, "learning_rate": 5e-06, "loss": 0.9369, "num_input_tokens_seen": 658305072, "step": 1449, "train_runtime": 106063.849, "train_tokens_per_second": 6206.687 }, { "epoch": 1.7555122541832593, "grad_norm": 0.22593450546264648, "learning_rate": 5e-06, "loss": 0.9144, "num_input_tokens_seen": 658774152, "step": 1450, "train_runtime": 106134.342, "train_tokens_per_second": 6206.984 }, { "epoch": 1.7567230110008607, "grad_norm": 0.23350222408771515, "learning_rate": 5e-06, "loss": 0.9024, "num_input_tokens_seen": 659224576, "step": 1451, "train_runtime": 106201.8802, "train_tokens_per_second": 6207.278 }, { "epoch": 1.7579337678184621, "grad_norm": 0.23016194999217987, "learning_rate": 5e-06, "loss": 0.9043, "num_input_tokens_seen": 659686856, "step": 1452, "train_runtime": 106272.1954, "train_tokens_per_second": 6207.521 }, { "epoch": 1.7591445246360635, "grad_norm": 0.21941740810871124, "learning_rate": 5e-06, "loss": 0.9029, "num_input_tokens_seen": 660168448, "step": 1453, "train_runtime": 106345.1484, "train_tokens_per_second": 6207.791 }, { "epoch": 1.760355281453665, "grad_norm": 0.2541714310646057, "learning_rate": 5e-06, "loss": 0.9459, "num_input_tokens_seen": 660618064, "step": 1454, "train_runtime": 106413.9117, "train_tokens_per_second": 6208.005 }, { "epoch": 1.7615660382712663, "grad_norm": 0.25230884552001953, "learning_rate": 5e-06, "loss": 0.9856, "num_input_tokens_seen": 661060488, "step": 1455, "train_runtime": 106480.6042, "train_tokens_per_second": 6208.271 }, { "epoch": 1.7627767950888678, "grad_norm": 0.23480939865112305, "learning_rate": 5e-06, "loss": 0.9658, "num_input_tokens_seen": 661510016, "step": 1456, "train_runtime": 106547.8374, "train_tokens_per_second": 6208.573 }, { "epoch": 1.7639875519064692, "grad_norm": 0.22851701080799103, "learning_rate": 5e-06, "loss": 0.9333, "num_input_tokens_seen": 661986792, "step": 1457, "train_runtime": 106619.9306, "train_tokens_per_second": 6208.847 }, { "epoch": 1.7651983087240706, "grad_norm": 0.24522744119167328, "learning_rate": 5e-06, "loss": 0.9374, "num_input_tokens_seen": 662466720, "step": 1458, "train_runtime": 106692.684, "train_tokens_per_second": 6209.111 }, { "epoch": 1.7664090655416718, "grad_norm": 0.2213152050971985, "learning_rate": 5e-06, "loss": 0.9099, "num_input_tokens_seen": 662937560, "step": 1459, "train_runtime": 106763.8514, "train_tokens_per_second": 6209.382 }, { "epoch": 1.7676198223592732, "grad_norm": 0.23350690305233002, "learning_rate": 5e-06, "loss": 0.9284, "num_input_tokens_seen": 663389520, "step": 1460, "train_runtime": 106831.6837, "train_tokens_per_second": 6209.67 }, { "epoch": 1.7688305791768746, "grad_norm": 0.26306286454200745, "learning_rate": 5e-06, "loss": 0.9014, "num_input_tokens_seen": 663850792, "step": 1461, "train_runtime": 106901.4033, "train_tokens_per_second": 6209.935 }, { "epoch": 1.7700413359944758, "grad_norm": 0.252805233001709, "learning_rate": 5e-06, "loss": 0.929, "num_input_tokens_seen": 664298992, "step": 1462, "train_runtime": 106968.6773, "train_tokens_per_second": 6210.22 }, { "epoch": 1.7712520928120772, "grad_norm": 0.25127750635147095, "learning_rate": 5e-06, "loss": 0.9607, "num_input_tokens_seen": 664755584, "step": 1463, "train_runtime": 107037.4522, "train_tokens_per_second": 6210.495 }, { "epoch": 1.7724628496296786, "grad_norm": 0.24411077797412872, "learning_rate": 5e-06, "loss": 0.882, "num_input_tokens_seen": 665225728, "step": 1464, "train_runtime": 107108.6446, "train_tokens_per_second": 6210.757 }, { "epoch": 1.77367360644728, "grad_norm": 0.25176945328712463, "learning_rate": 5e-06, "loss": 1.0087, "num_input_tokens_seen": 665670928, "step": 1465, "train_runtime": 107175.8114, "train_tokens_per_second": 6211.018 }, { "epoch": 1.7748843632648814, "grad_norm": 0.22492913901805878, "learning_rate": 5e-06, "loss": 0.9548, "num_input_tokens_seen": 666145592, "step": 1466, "train_runtime": 107247.756, "train_tokens_per_second": 6211.278 }, { "epoch": 1.7760951200824828, "grad_norm": 0.29126158356666565, "learning_rate": 5e-06, "loss": 0.9327, "num_input_tokens_seen": 666623936, "step": 1467, "train_runtime": 107320.1773, "train_tokens_per_second": 6211.543 }, { "epoch": 1.7773058769000842, "grad_norm": 0.2463548630475998, "learning_rate": 5e-06, "loss": 0.9628, "num_input_tokens_seen": 667069528, "step": 1468, "train_runtime": 107387.3958, "train_tokens_per_second": 6211.805 }, { "epoch": 1.7785166337176856, "grad_norm": 0.2515462040901184, "learning_rate": 5e-06, "loss": 0.9422, "num_input_tokens_seen": 667527784, "step": 1469, "train_runtime": 107456.5697, "train_tokens_per_second": 6212.07 }, { "epoch": 1.779727390535287, "grad_norm": 0.24735090136528015, "learning_rate": 5e-06, "loss": 0.9324, "num_input_tokens_seen": 668000200, "step": 1470, "train_runtime": 107528.1444, "train_tokens_per_second": 6212.329 }, { "epoch": 1.7809381473528885, "grad_norm": 0.2488315999507904, "learning_rate": 5e-06, "loss": 0.9859, "num_input_tokens_seen": 668449400, "step": 1471, "train_runtime": 107595.4881, "train_tokens_per_second": 6212.616 }, { "epoch": 1.7821489041704897, "grad_norm": 0.22948361933231354, "learning_rate": 5e-06, "loss": 0.9246, "num_input_tokens_seen": 668907592, "step": 1472, "train_runtime": 107664.6957, "train_tokens_per_second": 6212.878 }, { "epoch": 1.783359660988091, "grad_norm": 0.30683404207229614, "learning_rate": 5e-06, "loss": 0.9441, "num_input_tokens_seen": 669348744, "step": 1473, "train_runtime": 107730.9985, "train_tokens_per_second": 6213.149 }, { "epoch": 1.7845704178056925, "grad_norm": 0.2653786242008209, "learning_rate": 5e-06, "loss": 0.9284, "num_input_tokens_seen": 669793168, "step": 1474, "train_runtime": 107797.5952, "train_tokens_per_second": 6213.433 }, { "epoch": 1.7857811746232937, "grad_norm": 0.23417231440544128, "learning_rate": 5e-06, "loss": 0.926, "num_input_tokens_seen": 670254976, "step": 1475, "train_runtime": 107867.6938, "train_tokens_per_second": 6213.677 }, { "epoch": 1.786991931440895, "grad_norm": 0.23506613075733185, "learning_rate": 5e-06, "loss": 0.976, "num_input_tokens_seen": 670697576, "step": 1476, "train_runtime": 107934.2732, "train_tokens_per_second": 6213.944 }, { "epoch": 1.7882026882584965, "grad_norm": 0.29338982701301575, "learning_rate": 5e-06, "loss": 0.9601, "num_input_tokens_seen": 671162736, "step": 1477, "train_runtime": 108004.5346, "train_tokens_per_second": 6214.209 }, { "epoch": 1.789413445076098, "grad_norm": 0.26886627078056335, "learning_rate": 5e-06, "loss": 0.8709, "num_input_tokens_seen": 671637992, "step": 1478, "train_runtime": 108076.0457, "train_tokens_per_second": 6214.495 }, { "epoch": 1.7906242018936993, "grad_norm": 0.23638774454593658, "learning_rate": 5e-06, "loss": 0.9029, "num_input_tokens_seen": 672097800, "step": 1479, "train_runtime": 108145.3648, "train_tokens_per_second": 6214.763 }, { "epoch": 1.7918349587113007, "grad_norm": 0.23951123654842377, "learning_rate": 5e-06, "loss": 0.9188, "num_input_tokens_seen": 672551744, "step": 1480, "train_runtime": 108213.5008, "train_tokens_per_second": 6215.045 }, { "epoch": 1.7930457155289021, "grad_norm": 0.2542056739330292, "learning_rate": 5e-06, "loss": 0.9416, "num_input_tokens_seen": 673008736, "step": 1481, "train_runtime": 108281.9707, "train_tokens_per_second": 6215.335 }, { "epoch": 1.7942564723465035, "grad_norm": 0.2511388659477234, "learning_rate": 5e-06, "loss": 0.9082, "num_input_tokens_seen": 673477648, "step": 1482, "train_runtime": 108351.7157, "train_tokens_per_second": 6215.662 }, { "epoch": 1.795467229164105, "grad_norm": 0.23240311443805695, "learning_rate": 5e-06, "loss": 0.8896, "num_input_tokens_seen": 673924456, "step": 1483, "train_runtime": 108417.5016, "train_tokens_per_second": 6216.012 }, { "epoch": 1.7966779859817064, "grad_norm": 0.2410743683576584, "learning_rate": 5e-06, "loss": 0.9358, "num_input_tokens_seen": 674378912, "step": 1484, "train_runtime": 108484.6044, "train_tokens_per_second": 6216.356 }, { "epoch": 1.7978887427993075, "grad_norm": 0.2558565139770508, "learning_rate": 5e-06, "loss": 0.9506, "num_input_tokens_seen": 674838040, "step": 1485, "train_runtime": 108552.0714, "train_tokens_per_second": 6216.722 }, { "epoch": 1.799099499616909, "grad_norm": 0.2397555112838745, "learning_rate": 5e-06, "loss": 0.9175, "num_input_tokens_seen": 675289776, "step": 1486, "train_runtime": 108618.7287, "train_tokens_per_second": 6217.066 }, { "epoch": 1.8003102564345104, "grad_norm": 0.22383016347885132, "learning_rate": 5e-06, "loss": 0.9353, "num_input_tokens_seen": 675747848, "step": 1487, "train_runtime": 108686.2897, "train_tokens_per_second": 6217.416 }, { "epoch": 1.8015210132521118, "grad_norm": 0.2571597397327423, "learning_rate": 5e-06, "loss": 0.9263, "num_input_tokens_seen": 676171288, "step": 1488, "train_runtime": 108748.7096, "train_tokens_per_second": 6217.741 }, { "epoch": 1.802731770069713, "grad_norm": 0.25441011786460876, "learning_rate": 5e-06, "loss": 1.0048, "num_input_tokens_seen": 676630424, "step": 1489, "train_runtime": 108816.6465, "train_tokens_per_second": 6218.078 }, { "epoch": 1.8039425268873144, "grad_norm": 0.24836276471614838, "learning_rate": 5e-06, "loss": 0.9633, "num_input_tokens_seen": 677067784, "step": 1490, "train_runtime": 108881.3096, "train_tokens_per_second": 6218.402 }, { "epoch": 1.8051532837049158, "grad_norm": 0.24541418254375458, "learning_rate": 5e-06, "loss": 0.9088, "num_input_tokens_seen": 677518960, "step": 1491, "train_runtime": 108948.056, "train_tokens_per_second": 6218.734 }, { "epoch": 1.8063640405225172, "grad_norm": 0.2278079390525818, "learning_rate": 5e-06, "loss": 0.9275, "num_input_tokens_seen": 677983656, "step": 1492, "train_runtime": 109016.4926, "train_tokens_per_second": 6219.093 }, { "epoch": 1.8075747973401186, "grad_norm": 0.23876270651817322, "learning_rate": 5e-06, "loss": 0.926, "num_input_tokens_seen": 678441648, "step": 1493, "train_runtime": 109083.9738, "train_tokens_per_second": 6219.444 }, { "epoch": 1.80878555415772, "grad_norm": 0.25853845477104187, "learning_rate": 5e-06, "loss": 0.9695, "num_input_tokens_seen": 678904952, "step": 1494, "train_runtime": 109152.4011, "train_tokens_per_second": 6219.789 }, { "epoch": 1.8099963109753214, "grad_norm": 0.2297954559326172, "learning_rate": 5e-06, "loss": 0.8819, "num_input_tokens_seen": 679350488, "step": 1495, "train_runtime": 109217.9803, "train_tokens_per_second": 6220.134 }, { "epoch": 1.8112070677929228, "grad_norm": 0.2526834309101105, "learning_rate": 5e-06, "loss": 0.913, "num_input_tokens_seen": 679804296, "step": 1496, "train_runtime": 109285.0115, "train_tokens_per_second": 6220.471 }, { "epoch": 1.8124178246105243, "grad_norm": 0.25374501943588257, "learning_rate": 5e-06, "loss": 0.9266, "num_input_tokens_seen": 680253888, "step": 1497, "train_runtime": 109351.4643, "train_tokens_per_second": 6220.803 }, { "epoch": 1.8136285814281257, "grad_norm": 0.25926515460014343, "learning_rate": 5e-06, "loss": 0.9747, "num_input_tokens_seen": 680706808, "step": 1498, "train_runtime": 109417.9756, "train_tokens_per_second": 6221.161 }, { "epoch": 1.8148393382457269, "grad_norm": 0.24243789911270142, "learning_rate": 5e-06, "loss": 0.8814, "num_input_tokens_seen": 681162152, "step": 1499, "train_runtime": 109485.1447, "train_tokens_per_second": 6221.503 }, { "epoch": 1.8160500950633283, "grad_norm": 0.23476150631904602, "learning_rate": 5e-06, "loss": 0.8957, "num_input_tokens_seen": 681594912, "step": 1500, "train_runtime": 109548.882, "train_tokens_per_second": 6221.834 }, { "epoch": 1.8172608518809297, "grad_norm": 0.23926031589508057, "learning_rate": 5e-06, "loss": 0.9437, "num_input_tokens_seen": 682050248, "step": 1501, "train_runtime": 109615.8898, "train_tokens_per_second": 6222.184 }, { "epoch": 1.8184716086985309, "grad_norm": 0.23174121975898743, "learning_rate": 5e-06, "loss": 0.9173, "num_input_tokens_seen": 682487560, "step": 1502, "train_runtime": 109680.61, "train_tokens_per_second": 6222.5 }, { "epoch": 1.8196823655161323, "grad_norm": 0.23543839156627655, "learning_rate": 5e-06, "loss": 0.9421, "num_input_tokens_seen": 682928504, "step": 1503, "train_runtime": 109745.5472, "train_tokens_per_second": 6222.836 }, { "epoch": 1.8208931223337337, "grad_norm": 0.2303183227777481, "learning_rate": 5e-06, "loss": 0.9095, "num_input_tokens_seen": 683379408, "step": 1504, "train_runtime": 109812.3345, "train_tokens_per_second": 6223.157 }, { "epoch": 1.822103879151335, "grad_norm": 0.24201270937919617, "learning_rate": 5e-06, "loss": 0.9532, "num_input_tokens_seen": 683828072, "step": 1505, "train_runtime": 109878.4824, "train_tokens_per_second": 6223.494 }, { "epoch": 1.8233146359689365, "grad_norm": 0.24431242048740387, "learning_rate": 5e-06, "loss": 0.967, "num_input_tokens_seen": 684271776, "step": 1506, "train_runtime": 109945.1488, "train_tokens_per_second": 6223.756 }, { "epoch": 1.824525392786538, "grad_norm": 0.24904708564281464, "learning_rate": 5e-06, "loss": 0.9425, "num_input_tokens_seen": 684713936, "step": 1507, "train_runtime": 110011.8091, "train_tokens_per_second": 6224.004 }, { "epoch": 1.8257361496041393, "grad_norm": 0.24164269864559174, "learning_rate": 5e-06, "loss": 0.9442, "num_input_tokens_seen": 685180664, "step": 1508, "train_runtime": 110080.6343, "train_tokens_per_second": 6224.352 }, { "epoch": 1.8269469064217407, "grad_norm": 0.23471519351005554, "learning_rate": 5e-06, "loss": 0.9217, "num_input_tokens_seen": 685664952, "step": 1509, "train_runtime": 110151.978, "train_tokens_per_second": 6224.718 }, { "epoch": 1.8281576632393421, "grad_norm": 0.22866208851337433, "learning_rate": 5e-06, "loss": 0.9515, "num_input_tokens_seen": 686116056, "step": 1510, "train_runtime": 110217.8911, "train_tokens_per_second": 6225.088 }, { "epoch": 1.8293684200569436, "grad_norm": 0.24192233383655548, "learning_rate": 5e-06, "loss": 0.8977, "num_input_tokens_seen": 686558696, "step": 1511, "train_runtime": 110283.5539, "train_tokens_per_second": 6225.395 }, { "epoch": 1.8305791768745447, "grad_norm": 0.2574458718299866, "learning_rate": 5e-06, "loss": 0.9646, "num_input_tokens_seen": 687040920, "step": 1512, "train_runtime": 110355.252, "train_tokens_per_second": 6225.72 }, { "epoch": 1.8317899336921462, "grad_norm": 0.23180226981639862, "learning_rate": 5e-06, "loss": 0.9015, "num_input_tokens_seen": 687491464, "step": 1513, "train_runtime": 110421.6831, "train_tokens_per_second": 6226.055 }, { "epoch": 1.8330006905097476, "grad_norm": 0.2546160817146301, "learning_rate": 5e-06, "loss": 0.9606, "num_input_tokens_seen": 687938040, "step": 1514, "train_runtime": 110487.8498, "train_tokens_per_second": 6226.368 }, { "epoch": 1.8342114473273488, "grad_norm": 0.2605888545513153, "learning_rate": 5e-06, "loss": 0.9774, "num_input_tokens_seen": 688393944, "step": 1515, "train_runtime": 110554.8225, "train_tokens_per_second": 6226.72 }, { "epoch": 1.8354222041449502, "grad_norm": 0.24372106790542603, "learning_rate": 5e-06, "loss": 0.9394, "num_input_tokens_seen": 688853280, "step": 1516, "train_runtime": 110623.2037, "train_tokens_per_second": 6227.023 }, { "epoch": 1.8366329609625516, "grad_norm": 0.2518022656440735, "learning_rate": 5e-06, "loss": 0.9911, "num_input_tokens_seen": 689322800, "step": 1517, "train_runtime": 110692.3372, "train_tokens_per_second": 6227.376 }, { "epoch": 1.837843717780153, "grad_norm": 0.2612314522266388, "learning_rate": 5e-06, "loss": 0.9543, "num_input_tokens_seen": 689766792, "step": 1518, "train_runtime": 110757.7501, "train_tokens_per_second": 6227.707 }, { "epoch": 1.8390544745977544, "grad_norm": 0.2807573974132538, "learning_rate": 5e-06, "loss": 0.9692, "num_input_tokens_seen": 690187168, "step": 1519, "train_runtime": 110819.3816, "train_tokens_per_second": 6228.037 }, { "epoch": 1.8402652314153558, "grad_norm": 0.24547508358955383, "learning_rate": 5e-06, "loss": 0.8983, "num_input_tokens_seen": 690622672, "step": 1520, "train_runtime": 110883.3109, "train_tokens_per_second": 6228.373 }, { "epoch": 1.8414759882329572, "grad_norm": 0.25869348645210266, "learning_rate": 5e-06, "loss": 0.9573, "num_input_tokens_seen": 691070096, "step": 1521, "train_runtime": 110949.2534, "train_tokens_per_second": 6228.704 }, { "epoch": 1.8426867450505586, "grad_norm": 0.2718667685985565, "learning_rate": 5e-06, "loss": 0.943, "num_input_tokens_seen": 691528656, "step": 1522, "train_runtime": 111016.9813, "train_tokens_per_second": 6229.035 }, { "epoch": 1.84389750186816, "grad_norm": 0.23198598623275757, "learning_rate": 5e-06, "loss": 0.9308, "num_input_tokens_seen": 692005376, "step": 1523, "train_runtime": 111087.076, "train_tokens_per_second": 6229.396 }, { "epoch": 1.8451082586857614, "grad_norm": 0.2525101602077484, "learning_rate": 5e-06, "loss": 0.9888, "num_input_tokens_seen": 692443744, "step": 1524, "train_runtime": 111151.9146, "train_tokens_per_second": 6229.706 }, { "epoch": 1.8463190155033626, "grad_norm": 0.24284860491752625, "learning_rate": 5e-06, "loss": 0.9236, "num_input_tokens_seen": 692903640, "step": 1525, "train_runtime": 111219.6955, "train_tokens_per_second": 6230.044 }, { "epoch": 1.847529772320964, "grad_norm": 0.2588494122028351, "learning_rate": 5e-06, "loss": 0.9442, "num_input_tokens_seen": 693364616, "step": 1526, "train_runtime": 111287.6738, "train_tokens_per_second": 6230.381 }, { "epoch": 1.8487405291385655, "grad_norm": 0.25384098291397095, "learning_rate": 5e-06, "loss": 0.9606, "num_input_tokens_seen": 693820112, "step": 1527, "train_runtime": 111355.0786, "train_tokens_per_second": 6230.7 }, { "epoch": 1.8499512859561669, "grad_norm": 0.23675884306430817, "learning_rate": 5e-06, "loss": 0.9327, "num_input_tokens_seen": 694280840, "step": 1528, "train_runtime": 111423.4256, "train_tokens_per_second": 6231.013 }, { "epoch": 1.851162042773768, "grad_norm": 0.2325180619955063, "learning_rate": 5e-06, "loss": 0.9156, "num_input_tokens_seen": 694738776, "step": 1529, "train_runtime": 111490.9224, "train_tokens_per_second": 6231.348 }, { "epoch": 1.8523727995913695, "grad_norm": 0.25472497940063477, "learning_rate": 5e-06, "loss": 0.9057, "num_input_tokens_seen": 695188272, "step": 1530, "train_runtime": 111557.6013, "train_tokens_per_second": 6231.653 }, { "epoch": 1.8535835564089709, "grad_norm": 0.23478816449642181, "learning_rate": 5e-06, "loss": 0.8944, "num_input_tokens_seen": 695623080, "step": 1531, "train_runtime": 111621.743, "train_tokens_per_second": 6231.968 }, { "epoch": 1.8547943132265723, "grad_norm": 0.2601574659347534, "learning_rate": 5e-06, "loss": 0.9208, "num_input_tokens_seen": 696081624, "step": 1532, "train_runtime": 111689.1774, "train_tokens_per_second": 6232.31 }, { "epoch": 1.8560050700441737, "grad_norm": 0.26812466979026794, "learning_rate": 5e-06, "loss": 0.9187, "num_input_tokens_seen": 696557776, "step": 1533, "train_runtime": 111759.787, "train_tokens_per_second": 6232.633 }, { "epoch": 1.857215826861775, "grad_norm": 0.28780001401901245, "learning_rate": 5e-06, "loss": 0.8971, "num_input_tokens_seen": 697024808, "step": 1534, "train_runtime": 111828.8573, "train_tokens_per_second": 6232.96 }, { "epoch": 1.8584265836793765, "grad_norm": 0.23128759860992432, "learning_rate": 5e-06, "loss": 0.9624, "num_input_tokens_seen": 697506680, "step": 1535, "train_runtime": 111899.9207, "train_tokens_per_second": 6233.308 }, { "epoch": 1.859637340496978, "grad_norm": 0.2368602603673935, "learning_rate": 5e-06, "loss": 0.9405, "num_input_tokens_seen": 697965088, "step": 1536, "train_runtime": 111967.9468, "train_tokens_per_second": 6233.615 }, { "epoch": 1.8608480973145793, "grad_norm": 0.24492254853248596, "learning_rate": 5e-06, "loss": 0.9242, "num_input_tokens_seen": 698401104, "step": 1537, "train_runtime": 112032.4048, "train_tokens_per_second": 6233.92 }, { "epoch": 1.8620588541321808, "grad_norm": 0.26115724444389343, "learning_rate": 5e-06, "loss": 0.9314, "num_input_tokens_seen": 698857392, "step": 1538, "train_runtime": 112100.3102, "train_tokens_per_second": 6234.215 }, { "epoch": 1.863269610949782, "grad_norm": 0.23800967633724213, "learning_rate": 5e-06, "loss": 0.9329, "num_input_tokens_seen": 699332608, "step": 1539, "train_runtime": 112170.2291, "train_tokens_per_second": 6234.565 }, { "epoch": 1.8644803677673834, "grad_norm": 0.2537146210670471, "learning_rate": 5e-06, "loss": 0.9345, "num_input_tokens_seen": 699789840, "step": 1540, "train_runtime": 112238.3256, "train_tokens_per_second": 6234.856 }, { "epoch": 1.8656911245849848, "grad_norm": 0.23815041780471802, "learning_rate": 5e-06, "loss": 0.9216, "num_input_tokens_seen": 700250208, "step": 1541, "train_runtime": 112306.045, "train_tokens_per_second": 6235.196 }, { "epoch": 1.866901881402586, "grad_norm": 0.2275908887386322, "learning_rate": 5e-06, "loss": 0.9289, "num_input_tokens_seen": 700706192, "step": 1542, "train_runtime": 112373.3647, "train_tokens_per_second": 6235.518 }, { "epoch": 1.8681126382201874, "grad_norm": 0.24416327476501465, "learning_rate": 5e-06, "loss": 0.9387, "num_input_tokens_seen": 701172296, "step": 1543, "train_runtime": 112441.8, "train_tokens_per_second": 6235.869 }, { "epoch": 1.8693233950377888, "grad_norm": 0.23080092668533325, "learning_rate": 5e-06, "loss": 0.8832, "num_input_tokens_seen": 701636096, "step": 1544, "train_runtime": 112510.6891, "train_tokens_per_second": 6236.173 }, { "epoch": 1.8705341518553902, "grad_norm": 0.2627670466899872, "learning_rate": 5e-06, "loss": 0.9469, "num_input_tokens_seen": 702085912, "step": 1545, "train_runtime": 112577.5052, "train_tokens_per_second": 6236.467 }, { "epoch": 1.8717449086729916, "grad_norm": 0.2511466145515442, "learning_rate": 5e-06, "loss": 0.914, "num_input_tokens_seen": 702549976, "step": 1546, "train_runtime": 112645.979, "train_tokens_per_second": 6236.796 }, { "epoch": 1.872955665490593, "grad_norm": 0.23643608391284943, "learning_rate": 5e-06, "loss": 0.9381, "num_input_tokens_seen": 703007240, "step": 1547, "train_runtime": 112713.4614, "train_tokens_per_second": 6237.119 }, { "epoch": 1.8741664223081944, "grad_norm": 0.2743590474128723, "learning_rate": 5e-06, "loss": 1.048, "num_input_tokens_seen": 703458040, "step": 1548, "train_runtime": 112779.5769, "train_tokens_per_second": 6237.459 }, { "epoch": 1.8753771791257958, "grad_norm": 0.2364722192287445, "learning_rate": 5e-06, "loss": 0.9382, "num_input_tokens_seen": 703914896, "step": 1549, "train_runtime": 112846.9643, "train_tokens_per_second": 6237.783 }, { "epoch": 1.8765879359433972, "grad_norm": 0.23627513647079468, "learning_rate": 5e-06, "loss": 0.9412, "num_input_tokens_seen": 704351776, "step": 1550, "train_runtime": 112911.1758, "train_tokens_per_second": 6238.105 }, { "epoch": 1.8777986927609986, "grad_norm": 0.2537660002708435, "learning_rate": 5e-06, "loss": 0.9351, "num_input_tokens_seen": 704778064, "step": 1551, "train_runtime": 112973.6432, "train_tokens_per_second": 6238.429 }, { "epoch": 1.8790094495785998, "grad_norm": 0.2588886022567749, "learning_rate": 5e-06, "loss": 0.9646, "num_input_tokens_seen": 705228264, "step": 1552, "train_runtime": 113040.2615, "train_tokens_per_second": 6238.735 }, { "epoch": 1.8802202063962012, "grad_norm": 0.24146287143230438, "learning_rate": 5e-06, "loss": 0.8843, "num_input_tokens_seen": 705664728, "step": 1553, "train_runtime": 113105.0242, "train_tokens_per_second": 6239.022 }, { "epoch": 1.8814309632138027, "grad_norm": 0.2611408233642578, "learning_rate": 5e-06, "loss": 0.9507, "num_input_tokens_seen": 706140544, "step": 1554, "train_runtime": 113175.4725, "train_tokens_per_second": 6239.343 }, { "epoch": 1.8826417200314038, "grad_norm": 0.24252241849899292, "learning_rate": 5e-06, "loss": 0.8965, "num_input_tokens_seen": 706584696, "step": 1555, "train_runtime": 113240.9269, "train_tokens_per_second": 6239.658 }, { "epoch": 1.8838524768490053, "grad_norm": 0.24674955010414124, "learning_rate": 5e-06, "loss": 0.9562, "num_input_tokens_seen": 707041752, "step": 1556, "train_runtime": 113308.1873, "train_tokens_per_second": 6239.988 }, { "epoch": 1.8850632336666067, "grad_norm": 0.2411464899778366, "learning_rate": 5e-06, "loss": 0.9625, "num_input_tokens_seen": 707509880, "step": 1557, "train_runtime": 113377.0087, "train_tokens_per_second": 6240.329 }, { "epoch": 1.886273990484208, "grad_norm": 0.24759581685066223, "learning_rate": 5e-06, "loss": 0.9369, "num_input_tokens_seen": 707976528, "step": 1558, "train_runtime": 113445.9329, "train_tokens_per_second": 6240.651 }, { "epoch": 1.8874847473018095, "grad_norm": 0.28260865807533264, "learning_rate": 5e-06, "loss": 0.984, "num_input_tokens_seen": 708406264, "step": 1559, "train_runtime": 113509.0703, "train_tokens_per_second": 6240.966 }, { "epoch": 1.888695504119411, "grad_norm": 0.23383396863937378, "learning_rate": 5e-06, "loss": 0.895, "num_input_tokens_seen": 708860936, "step": 1560, "train_runtime": 113577.5988, "train_tokens_per_second": 6241.204 }, { "epoch": 1.8899062609370123, "grad_norm": 0.25613272190093994, "learning_rate": 5e-06, "loss": 0.9105, "num_input_tokens_seen": 709298440, "step": 1561, "train_runtime": 113642.3199, "train_tokens_per_second": 6241.499 }, { "epoch": 1.8911170177546137, "grad_norm": 0.23639342188835144, "learning_rate": 5e-06, "loss": 0.9206, "num_input_tokens_seen": 709772072, "step": 1562, "train_runtime": 113712.4809, "train_tokens_per_second": 6241.813 }, { "epoch": 1.8923277745722151, "grad_norm": 0.24744772911071777, "learning_rate": 5e-06, "loss": 0.9326, "num_input_tokens_seen": 710230928, "step": 1563, "train_runtime": 113780.3401, "train_tokens_per_second": 6242.123 }, { "epoch": 1.8935385313898165, "grad_norm": 0.24007609486579895, "learning_rate": 5e-06, "loss": 0.9552, "num_input_tokens_seen": 710673544, "step": 1564, "train_runtime": 113845.7354, "train_tokens_per_second": 6242.426 }, { "epoch": 1.8947492882074177, "grad_norm": 0.24338461458683014, "learning_rate": 5e-06, "loss": 0.8848, "num_input_tokens_seen": 711156448, "step": 1565, "train_runtime": 113916.9534, "train_tokens_per_second": 6242.762 }, { "epoch": 1.8959600450250191, "grad_norm": 0.25444409251213074, "learning_rate": 5e-06, "loss": 0.9297, "num_input_tokens_seen": 711614160, "step": 1566, "train_runtime": 113984.4746, "train_tokens_per_second": 6243.08 }, { "epoch": 1.8971708018426205, "grad_norm": 0.24998825788497925, "learning_rate": 5e-06, "loss": 0.9388, "num_input_tokens_seen": 712102984, "step": 1567, "train_runtime": 114056.6227, "train_tokens_per_second": 6243.416 }, { "epoch": 1.898381558660222, "grad_norm": 0.25563183426856995, "learning_rate": 5e-06, "loss": 0.9143, "num_input_tokens_seen": 712569488, "step": 1568, "train_runtime": 114125.4791, "train_tokens_per_second": 6243.737 }, { "epoch": 1.8995923154778231, "grad_norm": 0.2723662555217743, "learning_rate": 5e-06, "loss": 0.9361, "num_input_tokens_seen": 712982216, "step": 1569, "train_runtime": 114186.2224, "train_tokens_per_second": 6244.03 }, { "epoch": 1.9008030722954246, "grad_norm": 0.26646265387535095, "learning_rate": 5e-06, "loss": 0.9116, "num_input_tokens_seen": 713435760, "step": 1570, "train_runtime": 114253.013, "train_tokens_per_second": 6244.35 }, { "epoch": 1.902013829113026, "grad_norm": 0.22592444717884064, "learning_rate": 5e-06, "loss": 0.9129, "num_input_tokens_seen": 713905768, "step": 1571, "train_runtime": 114322.3992, "train_tokens_per_second": 6244.671 }, { "epoch": 1.9032245859306274, "grad_norm": 0.2544853985309601, "learning_rate": 5e-06, "loss": 0.9299, "num_input_tokens_seen": 714357704, "step": 1572, "train_runtime": 114389.1726, "train_tokens_per_second": 6244.977 }, { "epoch": 1.9044353427482288, "grad_norm": 0.2733955383300781, "learning_rate": 5e-06, "loss": 0.9863, "num_input_tokens_seen": 714815888, "step": 1573, "train_runtime": 114456.8908, "train_tokens_per_second": 6245.285 }, { "epoch": 1.9056460995658302, "grad_norm": 0.22590436041355133, "learning_rate": 5e-06, "loss": 0.8947, "num_input_tokens_seen": 715267248, "step": 1574, "train_runtime": 114524.1877, "train_tokens_per_second": 6245.556 }, { "epoch": 1.9068568563834316, "grad_norm": 0.2679465711116791, "learning_rate": 5e-06, "loss": 0.9425, "num_input_tokens_seen": 715731448, "step": 1575, "train_runtime": 114592.6515, "train_tokens_per_second": 6245.876 }, { "epoch": 1.908067613201033, "grad_norm": 0.2311072051525116, "learning_rate": 5e-06, "loss": 0.9282, "num_input_tokens_seen": 716197480, "step": 1576, "train_runtime": 114661.6647, "train_tokens_per_second": 6246.181 }, { "epoch": 1.9092783700186344, "grad_norm": 0.24477143585681915, "learning_rate": 5e-06, "loss": 0.9186, "num_input_tokens_seen": 716635848, "step": 1577, "train_runtime": 114726.4947, "train_tokens_per_second": 6246.472 }, { "epoch": 1.9104891268362358, "grad_norm": 0.26083871722221375, "learning_rate": 5e-06, "loss": 0.9086, "num_input_tokens_seen": 717095224, "step": 1578, "train_runtime": 114794.5934, "train_tokens_per_second": 6246.768 }, { "epoch": 1.911699883653837, "grad_norm": 0.29289036989212036, "learning_rate": 5e-06, "loss": 0.9533, "num_input_tokens_seen": 717574472, "step": 1579, "train_runtime": 114865.2956, "train_tokens_per_second": 6247.096 }, { "epoch": 1.9129106404714384, "grad_norm": 0.28024327754974365, "learning_rate": 5e-06, "loss": 0.9452, "num_input_tokens_seen": 718005536, "step": 1580, "train_runtime": 114928.7168, "train_tokens_per_second": 6247.399 }, { "epoch": 1.9141213972890398, "grad_norm": 0.28200191259384155, "learning_rate": 5e-06, "loss": 0.931, "num_input_tokens_seen": 718454008, "step": 1581, "train_runtime": 114994.4807, "train_tokens_per_second": 6247.726 }, { "epoch": 1.915332154106641, "grad_norm": 0.27790147066116333, "learning_rate": 5e-06, "loss": 0.9131, "num_input_tokens_seen": 718902000, "step": 1582, "train_runtime": 115060.9603, "train_tokens_per_second": 6248.01 }, { "epoch": 1.9165429109242424, "grad_norm": 0.2839493751525879, "learning_rate": 5e-06, "loss": 0.9395, "num_input_tokens_seen": 719363656, "step": 1583, "train_runtime": 115129.3761, "train_tokens_per_second": 6248.307 }, { "epoch": 1.9177536677418439, "grad_norm": 0.28969302773475647, "learning_rate": 5e-06, "loss": 0.939, "num_input_tokens_seen": 719824400, "step": 1584, "train_runtime": 115197.2332, "train_tokens_per_second": 6248.626 }, { "epoch": 1.9189644245594453, "grad_norm": 0.22786937654018402, "learning_rate": 5e-06, "loss": 0.9137, "num_input_tokens_seen": 720285288, "step": 1585, "train_runtime": 115264.6817, "train_tokens_per_second": 6248.968 }, { "epoch": 1.9201751813770467, "grad_norm": 0.3011467158794403, "learning_rate": 5e-06, "loss": 0.9738, "num_input_tokens_seen": 720726248, "step": 1586, "train_runtime": 115329.3874, "train_tokens_per_second": 6249.285 }, { "epoch": 1.921385938194648, "grad_norm": 0.25570541620254517, "learning_rate": 5e-06, "loss": 0.9943, "num_input_tokens_seen": 721189208, "step": 1587, "train_runtime": 115397.8713, "train_tokens_per_second": 6249.588 }, { "epoch": 1.9225966950122495, "grad_norm": 0.30256542563438416, "learning_rate": 5e-06, "loss": 0.9381, "num_input_tokens_seen": 721640744, "step": 1588, "train_runtime": 115464.5774, "train_tokens_per_second": 6249.889 }, { "epoch": 1.923807451829851, "grad_norm": 0.22470492124557495, "learning_rate": 5e-06, "loss": 0.9629, "num_input_tokens_seen": 722107928, "step": 1589, "train_runtime": 115533.4593, "train_tokens_per_second": 6250.206 }, { "epoch": 1.9250182086474523, "grad_norm": 0.26163867115974426, "learning_rate": 5e-06, "loss": 0.8809, "num_input_tokens_seen": 722566072, "step": 1590, "train_runtime": 115601.5317, "train_tokens_per_second": 6250.489 }, { "epoch": 1.9262289654650537, "grad_norm": 0.27157437801361084, "learning_rate": 5e-06, "loss": 0.924, "num_input_tokens_seen": 723026480, "step": 1591, "train_runtime": 115668.9514, "train_tokens_per_second": 6250.826 }, { "epoch": 1.927439722282655, "grad_norm": 0.2507987320423126, "learning_rate": 5e-06, "loss": 0.9413, "num_input_tokens_seen": 723498984, "step": 1592, "train_runtime": 115738.7049, "train_tokens_per_second": 6251.141 }, { "epoch": 1.9286504791002563, "grad_norm": 0.2356843203306198, "learning_rate": 5e-06, "loss": 0.9055, "num_input_tokens_seen": 723937808, "step": 1593, "train_runtime": 115803.4466, "train_tokens_per_second": 6251.436 }, { "epoch": 1.9298612359178577, "grad_norm": 0.2270326465368271, "learning_rate": 5e-06, "loss": 0.9157, "num_input_tokens_seen": 724385408, "step": 1594, "train_runtime": 115869.3052, "train_tokens_per_second": 6251.746 }, { "epoch": 1.931071992735459, "grad_norm": 0.2569643557071686, "learning_rate": 5e-06, "loss": 0.856, "num_input_tokens_seen": 724859232, "step": 1595, "train_runtime": 115939.4374, "train_tokens_per_second": 6252.051 }, { "epoch": 1.9322827495530603, "grad_norm": 0.22327809035778046, "learning_rate": 5e-06, "loss": 0.9313, "num_input_tokens_seen": 725331448, "step": 1596, "train_runtime": 116009.0348, "train_tokens_per_second": 6252.37 }, { "epoch": 1.9334935063706618, "grad_norm": 0.253885418176651, "learning_rate": 5e-06, "loss": 0.8917, "num_input_tokens_seen": 725790744, "step": 1597, "train_runtime": 116077.1839, "train_tokens_per_second": 6252.656 }, { "epoch": 1.9347042631882632, "grad_norm": 0.22820526361465454, "learning_rate": 5e-06, "loss": 0.9767, "num_input_tokens_seen": 726237104, "step": 1598, "train_runtime": 116143.0969, "train_tokens_per_second": 6252.951 }, { "epoch": 1.9359150200058646, "grad_norm": 0.24010008573532104, "learning_rate": 5e-06, "loss": 0.9393, "num_input_tokens_seen": 726690312, "step": 1599, "train_runtime": 116210.1064, "train_tokens_per_second": 6253.245 }, { "epoch": 1.937125776823466, "grad_norm": 0.23890480399131775, "learning_rate": 5e-06, "loss": 0.8982, "num_input_tokens_seen": 727157240, "step": 1600, "train_runtime": 116279.6038, "train_tokens_per_second": 6253.524 }, { "epoch": 1.9383365336410674, "grad_norm": 0.24424760043621063, "learning_rate": 5e-06, "loss": 0.9653, "num_input_tokens_seen": 727622656, "step": 1601, "train_runtime": 116348.2373, "train_tokens_per_second": 6253.835 }, { "epoch": 1.9395472904586688, "grad_norm": 0.2552737891674042, "learning_rate": 5e-06, "loss": 0.9497, "num_input_tokens_seen": 728062096, "step": 1602, "train_runtime": 116412.8385, "train_tokens_per_second": 6254.139 }, { "epoch": 1.9407580472762702, "grad_norm": 0.2567066252231598, "learning_rate": 5e-06, "loss": 0.8888, "num_input_tokens_seen": 728516320, "step": 1603, "train_runtime": 116479.7771, "train_tokens_per_second": 6254.445 }, { "epoch": 1.9419688040938716, "grad_norm": 0.26494523882865906, "learning_rate": 5e-06, "loss": 0.9411, "num_input_tokens_seen": 728967448, "step": 1604, "train_runtime": 116546.2892, "train_tokens_per_second": 6254.746 }, { "epoch": 1.9431795609114728, "grad_norm": 0.24419981241226196, "learning_rate": 5e-06, "loss": 0.9449, "num_input_tokens_seen": 729410712, "step": 1605, "train_runtime": 116611.7138, "train_tokens_per_second": 6255.038 }, { "epoch": 1.9443903177290742, "grad_norm": 0.24061161279678345, "learning_rate": 5e-06, "loss": 0.9237, "num_input_tokens_seen": 729864928, "step": 1606, "train_runtime": 116679.0895, "train_tokens_per_second": 6255.319 }, { "epoch": 1.9456010745466756, "grad_norm": 0.2652917802333832, "learning_rate": 5e-06, "loss": 0.9536, "num_input_tokens_seen": 730304432, "step": 1607, "train_runtime": 116743.7819, "train_tokens_per_second": 6255.617 }, { "epoch": 1.946811831364277, "grad_norm": 0.3060227632522583, "learning_rate": 5e-06, "loss": 0.9476, "num_input_tokens_seen": 730764008, "step": 1608, "train_runtime": 116811.6283, "train_tokens_per_second": 6255.918 }, { "epoch": 1.9480225881818782, "grad_norm": 0.24972648918628693, "learning_rate": 5e-06, "loss": 0.9286, "num_input_tokens_seen": 731207136, "step": 1609, "train_runtime": 116876.7851, "train_tokens_per_second": 6256.222 }, { "epoch": 1.9492333449994796, "grad_norm": 0.22679458558559418, "learning_rate": 5e-06, "loss": 0.9647, "num_input_tokens_seen": 731675192, "step": 1610, "train_runtime": 116946.392, "train_tokens_per_second": 6256.501 }, { "epoch": 1.950444101817081, "grad_norm": 0.24391289055347443, "learning_rate": 5e-06, "loss": 0.9443, "num_input_tokens_seen": 732136656, "step": 1611, "train_runtime": 117014.2749, "train_tokens_per_second": 6256.815 }, { "epoch": 1.9516548586346825, "grad_norm": 0.25399860739707947, "learning_rate": 5e-06, "loss": 0.9206, "num_input_tokens_seen": 732587312, "step": 1612, "train_runtime": 117080.9176, "train_tokens_per_second": 6257.103 }, { "epoch": 1.9528656154522839, "grad_norm": 0.2403707355260849, "learning_rate": 5e-06, "loss": 0.9548, "num_input_tokens_seen": 733055472, "step": 1613, "train_runtime": 117150.9773, "train_tokens_per_second": 6257.357 }, { "epoch": 1.9540763722698853, "grad_norm": 0.24824580550193787, "learning_rate": 5e-06, "loss": 0.9813, "num_input_tokens_seen": 733529576, "step": 1614, "train_runtime": 117222.2753, "train_tokens_per_second": 6257.595 }, { "epoch": 1.9552871290874867, "grad_norm": 0.25411248207092285, "learning_rate": 5e-06, "loss": 0.9657, "num_input_tokens_seen": 733989808, "step": 1615, "train_runtime": 117290.2995, "train_tokens_per_second": 6257.89 }, { "epoch": 1.956497885905088, "grad_norm": 0.244659423828125, "learning_rate": 5e-06, "loss": 0.9625, "num_input_tokens_seen": 734459672, "step": 1616, "train_runtime": 117359.2892, "train_tokens_per_second": 6258.215 }, { "epoch": 1.9577086427226895, "grad_norm": 0.2583770751953125, "learning_rate": 5e-06, "loss": 0.95, "num_input_tokens_seen": 734913456, "step": 1617, "train_runtime": 117425.8088, "train_tokens_per_second": 6258.534 }, { "epoch": 1.958919399540291, "grad_norm": 0.27326807379722595, "learning_rate": 5e-06, "loss": 0.9071, "num_input_tokens_seen": 735361360, "step": 1618, "train_runtime": 117492.1671, "train_tokens_per_second": 6258.812 }, { "epoch": 1.9601301563578921, "grad_norm": 0.2656486928462982, "learning_rate": 5e-06, "loss": 0.9275, "num_input_tokens_seen": 735820904, "step": 1619, "train_runtime": 117560.1966, "train_tokens_per_second": 6259.099 }, { "epoch": 1.9613409131754935, "grad_norm": 0.26864171028137207, "learning_rate": 5e-06, "loss": 0.9431, "num_input_tokens_seen": 736286088, "step": 1620, "train_runtime": 117628.7955, "train_tokens_per_second": 6259.403 }, { "epoch": 1.962551669993095, "grad_norm": 0.23168571293354034, "learning_rate": 5e-06, "loss": 0.9652, "num_input_tokens_seen": 736771304, "step": 1621, "train_runtime": 117700.5675, "train_tokens_per_second": 6259.709 }, { "epoch": 1.9637624268106961, "grad_norm": 0.3031046986579895, "learning_rate": 5e-06, "loss": 0.9284, "num_input_tokens_seen": 737223816, "step": 1622, "train_runtime": 117767.2938, "train_tokens_per_second": 6260.005 }, { "epoch": 1.9649731836282975, "grad_norm": 0.3055347800254822, "learning_rate": 5e-06, "loss": 0.8942, "num_input_tokens_seen": 737671328, "step": 1623, "train_runtime": 117833.4065, "train_tokens_per_second": 6260.29 }, { "epoch": 1.966183940445899, "grad_norm": 0.24057318270206451, "learning_rate": 5e-06, "loss": 0.9013, "num_input_tokens_seen": 738128264, "step": 1624, "train_runtime": 117900.8843, "train_tokens_per_second": 6260.583 }, { "epoch": 1.9673946972635004, "grad_norm": 0.28453585505485535, "learning_rate": 5e-06, "loss": 0.951, "num_input_tokens_seen": 738599800, "step": 1625, "train_runtime": 117970.9661, "train_tokens_per_second": 6260.861 }, { "epoch": 1.9686054540811018, "grad_norm": 0.2978310286998749, "learning_rate": 5e-06, "loss": 0.9524, "num_input_tokens_seen": 739061440, "step": 1626, "train_runtime": 118038.9189, "train_tokens_per_second": 6261.167 }, { "epoch": 1.9698162108987032, "grad_norm": 0.2525809109210968, "learning_rate": 5e-06, "loss": 0.9146, "num_input_tokens_seen": 739504888, "step": 1627, "train_runtime": 118104.044, "train_tokens_per_second": 6261.47 }, { "epoch": 1.9710269677163046, "grad_norm": 0.23271185159683228, "learning_rate": 5e-06, "loss": 0.879, "num_input_tokens_seen": 739959160, "step": 1628, "train_runtime": 118171.0917, "train_tokens_per_second": 6261.761 }, { "epoch": 1.972237724533906, "grad_norm": 0.2425994873046875, "learning_rate": 5e-06, "loss": 0.9498, "num_input_tokens_seen": 740392976, "step": 1629, "train_runtime": 118235.1733, "train_tokens_per_second": 6262.037 }, { "epoch": 1.9734484813515074, "grad_norm": 0.28858521580696106, "learning_rate": 5e-06, "loss": 0.9053, "num_input_tokens_seen": 740852000, "step": 1630, "train_runtime": 118303.084, "train_tokens_per_second": 6262.322 }, { "epoch": 1.9746592381691088, "grad_norm": 0.30428969860076904, "learning_rate": 5e-06, "loss": 0.9647, "num_input_tokens_seen": 741308544, "step": 1631, "train_runtime": 118370.3428, "train_tokens_per_second": 6262.621 }, { "epoch": 1.97586999498671, "grad_norm": 0.2601581811904907, "learning_rate": 5e-06, "loss": 0.9038, "num_input_tokens_seen": 741771776, "step": 1632, "train_runtime": 118438.7832, "train_tokens_per_second": 6262.913 }, { "epoch": 1.9770807518043114, "grad_norm": 0.2240893691778183, "learning_rate": 5e-06, "loss": 0.9744, "num_input_tokens_seen": 742234480, "step": 1633, "train_runtime": 118507.3035, "train_tokens_per_second": 6263.196 }, { "epoch": 1.9782915086219128, "grad_norm": 0.2555893063545227, "learning_rate": 5e-06, "loss": 0.9318, "num_input_tokens_seen": 742693528, "step": 1634, "train_runtime": 118575.2577, "train_tokens_per_second": 6263.478 }, { "epoch": 1.979502265439514, "grad_norm": 0.33006125688552856, "learning_rate": 5e-06, "loss": 0.9128, "num_input_tokens_seen": 743133448, "step": 1635, "train_runtime": 118639.8577, "train_tokens_per_second": 6263.776 }, { "epoch": 1.9807130222571154, "grad_norm": 0.24423004686832428, "learning_rate": 5e-06, "loss": 0.9242, "num_input_tokens_seen": 743587448, "step": 1636, "train_runtime": 118707.0303, "train_tokens_per_second": 6264.056 }, { "epoch": 1.9819237790747168, "grad_norm": 0.2284265011548996, "learning_rate": 5e-06, "loss": 0.9299, "num_input_tokens_seen": 744034000, "step": 1637, "train_runtime": 118772.4989, "train_tokens_per_second": 6264.363 }, { "epoch": 1.9831345358923183, "grad_norm": 0.24896208941936493, "learning_rate": 5e-06, "loss": 0.926, "num_input_tokens_seen": 744492584, "step": 1638, "train_runtime": 118840.6792, "train_tokens_per_second": 6264.627 }, { "epoch": 1.9843452927099197, "grad_norm": 0.27802956104278564, "learning_rate": 5e-06, "loss": 0.9311, "num_input_tokens_seen": 744949664, "step": 1639, "train_runtime": 118908.0072, "train_tokens_per_second": 6264.924 }, { "epoch": 1.985556049527521, "grad_norm": 0.2622906565666199, "learning_rate": 5e-06, "loss": 0.909, "num_input_tokens_seen": 745414920, "step": 1640, "train_runtime": 118976.34, "train_tokens_per_second": 6265.237 }, { "epoch": 1.9867668063451225, "grad_norm": 0.25892722606658936, "learning_rate": 5e-06, "loss": 0.9746, "num_input_tokens_seen": 745869072, "step": 1641, "train_runtime": 119043.2791, "train_tokens_per_second": 6265.529 }, { "epoch": 1.987977563162724, "grad_norm": 0.24062815308570862, "learning_rate": 5e-06, "loss": 0.9342, "num_input_tokens_seen": 746319848, "step": 1642, "train_runtime": 119109.7704, "train_tokens_per_second": 6265.816 }, { "epoch": 1.9891883199803253, "grad_norm": 0.3026382029056549, "learning_rate": 5e-06, "loss": 0.9328, "num_input_tokens_seen": 746765768, "step": 1643, "train_runtime": 119175.6132, "train_tokens_per_second": 6266.095 }, { "epoch": 1.9903990767979267, "grad_norm": 0.2536994516849518, "learning_rate": 5e-06, "loss": 0.8838, "num_input_tokens_seen": 747223184, "step": 1644, "train_runtime": 119243.0261, "train_tokens_per_second": 6266.389 }, { "epoch": 1.991609833615528, "grad_norm": 0.24464935064315796, "learning_rate": 5e-06, "loss": 0.9337, "num_input_tokens_seen": 747669984, "step": 1645, "train_runtime": 119309.5096, "train_tokens_per_second": 6266.642 }, { "epoch": 1.9928205904331293, "grad_norm": 0.24241983890533447, "learning_rate": 5e-06, "loss": 0.9195, "num_input_tokens_seen": 748096608, "step": 1646, "train_runtime": 119372.5329, "train_tokens_per_second": 6266.907 }, { "epoch": 1.9940313472507307, "grad_norm": 0.25340303778648376, "learning_rate": 5e-06, "loss": 0.9146, "num_input_tokens_seen": 748518912, "step": 1647, "train_runtime": 119434.6808, "train_tokens_per_second": 6267.182 }, { "epoch": 1.9952421040683321, "grad_norm": 0.24607083201408386, "learning_rate": 5e-06, "loss": 0.9389, "num_input_tokens_seen": 748982560, "step": 1648, "train_runtime": 119503.1742, "train_tokens_per_second": 6267.47 }, { "epoch": 1.9964528608859333, "grad_norm": 0.25516462326049805, "learning_rate": 5e-06, "loss": 0.9252, "num_input_tokens_seen": 749435416, "step": 1649, "train_runtime": 119569.6996, "train_tokens_per_second": 6267.77 }, { "epoch": 1.9976636177035347, "grad_norm": 0.23312324285507202, "learning_rate": 5e-06, "loss": 0.9199, "num_input_tokens_seen": 749880560, "step": 1650, "train_runtime": 119635.2823, "train_tokens_per_second": 6268.055 }, { "epoch": 1.9988743745211361, "grad_norm": 0.22469443082809448, "learning_rate": 5e-06, "loss": 0.9509, "num_input_tokens_seen": 750355760, "step": 1651, "train_runtime": 119705.3381, "train_tokens_per_second": 6268.357 }, { "epoch": 2.0, "grad_norm": 0.2671259641647339, "learning_rate": 5e-06, "loss": 0.923, "num_input_tokens_seen": 750767792, "step": 1652, "train_runtime": 119766.3563, "train_tokens_per_second": 6268.603 }, { "epoch": 2.0012107568176014, "grad_norm": 0.25913000106811523, "learning_rate": 5e-06, "loss": 0.9515, "num_input_tokens_seen": 751241384, "step": 1653, "train_runtime": 119836.2595, "train_tokens_per_second": 6268.899 }, { "epoch": 2.002421513635203, "grad_norm": 0.24619394540786743, "learning_rate": 5e-06, "loss": 0.8988, "num_input_tokens_seen": 751701224, "step": 1654, "train_runtime": 119904.0181, "train_tokens_per_second": 6269.191 }, { "epoch": 2.0036322704528042, "grad_norm": 0.245701402425766, "learning_rate": 5e-06, "loss": 0.9295, "num_input_tokens_seen": 752143752, "step": 1655, "train_runtime": 119969.2461, "train_tokens_per_second": 6269.471 }, { "epoch": 2.0048430272704056, "grad_norm": 0.23520943522453308, "learning_rate": 5e-06, "loss": 0.9266, "num_input_tokens_seen": 752602416, "step": 1656, "train_runtime": 120036.8828, "train_tokens_per_second": 6269.76 }, { "epoch": 2.006053784088007, "grad_norm": 0.2660825550556183, "learning_rate": 5e-06, "loss": 0.9354, "num_input_tokens_seen": 753037600, "step": 1657, "train_runtime": 120100.8699, "train_tokens_per_second": 6270.043 }, { "epoch": 2.0072645409056085, "grad_norm": 0.21745565533638, "learning_rate": 5e-06, "loss": 0.9324, "num_input_tokens_seen": 753522696, "step": 1658, "train_runtime": 120172.9009, "train_tokens_per_second": 6270.321 }, { "epoch": 2.0084752977232094, "grad_norm": 0.2518208920955658, "learning_rate": 5e-06, "loss": 0.9369, "num_input_tokens_seen": 753999256, "step": 1659, "train_runtime": 120243.3277, "train_tokens_per_second": 6270.612 }, { "epoch": 2.009686054540811, "grad_norm": 0.23979134857654572, "learning_rate": 5e-06, "loss": 0.9115, "num_input_tokens_seen": 754457208, "step": 1660, "train_runtime": 120311.1243, "train_tokens_per_second": 6270.885 }, { "epoch": 2.0108968113584123, "grad_norm": 0.25418299436569214, "learning_rate": 5e-06, "loss": 0.9742, "num_input_tokens_seen": 754909448, "step": 1661, "train_runtime": 120377.7091, "train_tokens_per_second": 6271.173 }, { "epoch": 2.0121075681760137, "grad_norm": 0.24330636858940125, "learning_rate": 5e-06, "loss": 0.8973, "num_input_tokens_seen": 755375656, "step": 1662, "train_runtime": 120446.4967, "train_tokens_per_second": 6271.462 }, { "epoch": 2.013318324993615, "grad_norm": 0.23138542473316193, "learning_rate": 5e-06, "loss": 0.9258, "num_input_tokens_seen": 755847280, "step": 1663, "train_runtime": 120515.7872, "train_tokens_per_second": 6271.77 }, { "epoch": 2.0145290818112165, "grad_norm": 0.22977809607982635, "learning_rate": 5e-06, "loss": 0.9414, "num_input_tokens_seen": 756307096, "step": 1664, "train_runtime": 120583.5284, "train_tokens_per_second": 6272.06 }, { "epoch": 2.015739838628818, "grad_norm": 0.23063069581985474, "learning_rate": 5e-06, "loss": 0.9489, "num_input_tokens_seen": 756782872, "step": 1665, "train_runtime": 120653.9002, "train_tokens_per_second": 6272.345 }, { "epoch": 2.0169505954464193, "grad_norm": 0.25464802980422974, "learning_rate": 5e-06, "loss": 0.9193, "num_input_tokens_seen": 757236128, "step": 1666, "train_runtime": 120720.8443, "train_tokens_per_second": 6272.621 }, { "epoch": 2.0181613522640207, "grad_norm": 0.26056936383247375, "learning_rate": 5e-06, "loss": 0.9261, "num_input_tokens_seen": 757706360, "step": 1667, "train_runtime": 120792.0573, "train_tokens_per_second": 6272.816 }, { "epoch": 2.019372109081622, "grad_norm": 0.24450352787971497, "learning_rate": 5e-06, "loss": 0.8846, "num_input_tokens_seen": 758155960, "step": 1668, "train_runtime": 120858.5813, "train_tokens_per_second": 6273.083 }, { "epoch": 2.0205828658992235, "grad_norm": 0.22889290750026703, "learning_rate": 5e-06, "loss": 0.9239, "num_input_tokens_seen": 758608128, "step": 1669, "train_runtime": 120924.9819, "train_tokens_per_second": 6273.378 }, { "epoch": 2.021793622716825, "grad_norm": 0.24720118939876556, "learning_rate": 5e-06, "loss": 0.9581, "num_input_tokens_seen": 759066936, "step": 1670, "train_runtime": 120993.0982, "train_tokens_per_second": 6273.638 }, { "epoch": 2.0230043795344264, "grad_norm": 0.2619543671607971, "learning_rate": 5e-06, "loss": 0.9421, "num_input_tokens_seen": 759523400, "step": 1671, "train_runtime": 121060.4719, "train_tokens_per_second": 6273.917 }, { "epoch": 2.0242151363520273, "grad_norm": 0.2532022297382355, "learning_rate": 5e-06, "loss": 0.9364, "num_input_tokens_seen": 759992104, "step": 1672, "train_runtime": 121129.84, "train_tokens_per_second": 6274.194 }, { "epoch": 2.0254258931696287, "grad_norm": 0.2386539727449417, "learning_rate": 5e-06, "loss": 0.9337, "num_input_tokens_seen": 760421816, "step": 1673, "train_runtime": 121193.2317, "train_tokens_per_second": 6274.458 }, { "epoch": 2.02663664998723, "grad_norm": 0.23992206156253815, "learning_rate": 5e-06, "loss": 0.9429, "num_input_tokens_seen": 760910216, "step": 1674, "train_runtime": 121265.0604, "train_tokens_per_second": 6274.769 }, { "epoch": 2.0278474068048316, "grad_norm": 0.27219098806381226, "learning_rate": 5e-06, "loss": 0.9159, "num_input_tokens_seen": 761358768, "step": 1675, "train_runtime": 121331.2035, "train_tokens_per_second": 6275.045 }, { "epoch": 2.029058163622433, "grad_norm": 0.26987916231155396, "learning_rate": 5e-06, "loss": 0.9063, "num_input_tokens_seen": 761819360, "step": 1676, "train_runtime": 121399.3914, "train_tokens_per_second": 6275.314 }, { "epoch": 2.0302689204400344, "grad_norm": 0.29206639528274536, "learning_rate": 5e-06, "loss": 0.8884, "num_input_tokens_seen": 762265192, "step": 1677, "train_runtime": 121465.4319, "train_tokens_per_second": 6275.573 }, { "epoch": 2.031479677257636, "grad_norm": 0.2402559518814087, "learning_rate": 5e-06, "loss": 0.967, "num_input_tokens_seen": 762727064, "step": 1678, "train_runtime": 121533.5732, "train_tokens_per_second": 6275.855 }, { "epoch": 2.032690434075237, "grad_norm": 0.2560024559497833, "learning_rate": 5e-06, "loss": 1.0016, "num_input_tokens_seen": 763173840, "step": 1679, "train_runtime": 121599.4982, "train_tokens_per_second": 6276.127 }, { "epoch": 2.0339011908928386, "grad_norm": 0.2566429078578949, "learning_rate": 5e-06, "loss": 0.9174, "num_input_tokens_seen": 763633104, "step": 1680, "train_runtime": 121667.3707, "train_tokens_per_second": 6276.4 }, { "epoch": 2.03511194771044, "grad_norm": 0.324238657951355, "learning_rate": 5e-06, "loss": 0.9757, "num_input_tokens_seen": 764078208, "step": 1681, "train_runtime": 121732.7093, "train_tokens_per_second": 6276.688 }, { "epoch": 2.0363227045280414, "grad_norm": 0.2558477818965912, "learning_rate": 5e-06, "loss": 0.9683, "num_input_tokens_seen": 764539136, "step": 1682, "train_runtime": 121800.7565, "train_tokens_per_second": 6276.965 }, { "epoch": 2.037533461345643, "grad_norm": 0.24341857433319092, "learning_rate": 5e-06, "loss": 0.9386, "num_input_tokens_seen": 764978424, "step": 1683, "train_runtime": 121864.9978, "train_tokens_per_second": 6277.261 }, { "epoch": 2.0387442181632442, "grad_norm": 0.2269880771636963, "learning_rate": 5e-06, "loss": 0.9277, "num_input_tokens_seen": 765432624, "step": 1684, "train_runtime": 121931.6988, "train_tokens_per_second": 6277.552 }, { "epoch": 2.0399549749808457, "grad_norm": 0.24292759597301483, "learning_rate": 5e-06, "loss": 0.9419, "num_input_tokens_seen": 765849656, "step": 1685, "train_runtime": 121992.9638, "train_tokens_per_second": 6277.818 }, { "epoch": 2.0411657317984466, "grad_norm": 0.2310955822467804, "learning_rate": 5e-06, "loss": 0.9353, "num_input_tokens_seen": 766298816, "step": 1686, "train_runtime": 122059.3898, "train_tokens_per_second": 6278.082 }, { "epoch": 2.042376488616048, "grad_norm": 0.23854534327983856, "learning_rate": 5e-06, "loss": 0.8951, "num_input_tokens_seen": 766758944, "step": 1687, "train_runtime": 122127.173, "train_tokens_per_second": 6278.365 }, { "epoch": 2.0435872454336494, "grad_norm": 0.2445819079875946, "learning_rate": 5e-06, "loss": 0.9005, "num_input_tokens_seen": 767215168, "step": 1688, "train_runtime": 122194.8356, "train_tokens_per_second": 6278.622 }, { "epoch": 2.044798002251251, "grad_norm": 0.24541962146759033, "learning_rate": 5e-06, "loss": 0.9422, "num_input_tokens_seen": 767661360, "step": 1689, "train_runtime": 122260.9211, "train_tokens_per_second": 6278.878 }, { "epoch": 2.0460087590688523, "grad_norm": 0.23069673776626587, "learning_rate": 5e-06, "loss": 0.937, "num_input_tokens_seen": 768110416, "step": 1690, "train_runtime": 122327.2995, "train_tokens_per_second": 6279.141 }, { "epoch": 2.0472195158864537, "grad_norm": 0.26259496808052063, "learning_rate": 5e-06, "loss": 0.9342, "num_input_tokens_seen": 768564600, "step": 1691, "train_runtime": 122394.1919, "train_tokens_per_second": 6279.421 }, { "epoch": 2.048430272704055, "grad_norm": 0.23317334055900574, "learning_rate": 5e-06, "loss": 0.9462, "num_input_tokens_seen": 769009144, "step": 1692, "train_runtime": 122459.6235, "train_tokens_per_second": 6279.695 }, { "epoch": 2.0496410295216565, "grad_norm": 0.24182665348052979, "learning_rate": 5e-06, "loss": 0.9408, "num_input_tokens_seen": 769483920, "step": 1693, "train_runtime": 122529.2607, "train_tokens_per_second": 6280.001 }, { "epoch": 2.050851786339258, "grad_norm": 0.2536557614803314, "learning_rate": 5e-06, "loss": 0.9511, "num_input_tokens_seen": 769932832, "step": 1694, "train_runtime": 122595.5738, "train_tokens_per_second": 6280.266 }, { "epoch": 2.0520625431568593, "grad_norm": 0.24656134843826294, "learning_rate": 5e-06, "loss": 0.9051, "num_input_tokens_seen": 770395072, "step": 1695, "train_runtime": 122663.6778, "train_tokens_per_second": 6280.548 }, { "epoch": 2.0532732999744607, "grad_norm": 0.2536466419696808, "learning_rate": 5e-06, "loss": 0.8947, "num_input_tokens_seen": 770853376, "step": 1696, "train_runtime": 122731.4511, "train_tokens_per_second": 6280.814 }, { "epoch": 2.054484056792062, "grad_norm": 0.23972494900226593, "learning_rate": 5e-06, "loss": 0.9722, "num_input_tokens_seen": 771293240, "step": 1697, "train_runtime": 122796.193, "train_tokens_per_second": 6281.084 }, { "epoch": 2.0556948136096636, "grad_norm": 0.22034522891044617, "learning_rate": 5e-06, "loss": 0.923, "num_input_tokens_seen": 771761160, "step": 1698, "train_runtime": 122865.4887, "train_tokens_per_second": 6281.35 }, { "epoch": 2.0569055704272645, "grad_norm": 0.24163363873958588, "learning_rate": 5e-06, "loss": 0.902, "num_input_tokens_seen": 772219904, "step": 1699, "train_runtime": 122932.45, "train_tokens_per_second": 6281.66 }, { "epoch": 2.058116327244866, "grad_norm": 0.24234162271022797, "learning_rate": 5e-06, "loss": 0.9049, "num_input_tokens_seen": 772676568, "step": 1700, "train_runtime": 123000.0639, "train_tokens_per_second": 6281.92 }, { "epoch": 2.0593270840624673, "grad_norm": 0.259397029876709, "learning_rate": 5e-06, "loss": 0.9477, "num_input_tokens_seen": 773113520, "step": 1701, "train_runtime": 123064.4333, "train_tokens_per_second": 6282.185 }, { "epoch": 2.0605378408800687, "grad_norm": 0.22705447673797607, "learning_rate": 5e-06, "loss": 0.9678, "num_input_tokens_seen": 773574592, "step": 1702, "train_runtime": 123132.6096, "train_tokens_per_second": 6282.451 }, { "epoch": 2.06174859769767, "grad_norm": 0.25759419798851013, "learning_rate": 5e-06, "loss": 0.8677, "num_input_tokens_seen": 774031056, "step": 1703, "train_runtime": 123199.736, "train_tokens_per_second": 6282.733 }, { "epoch": 2.0629593545152716, "grad_norm": 0.26892176270484924, "learning_rate": 5e-06, "loss": 0.9933, "num_input_tokens_seen": 774434376, "step": 1704, "train_runtime": 123258.9797, "train_tokens_per_second": 6282.985 }, { "epoch": 2.064170111332873, "grad_norm": 0.26564693450927734, "learning_rate": 5e-06, "loss": 0.9881, "num_input_tokens_seen": 774870928, "step": 1705, "train_runtime": 123323.0137, "train_tokens_per_second": 6283.263 }, { "epoch": 2.0653808681504744, "grad_norm": 0.23933526873588562, "learning_rate": 5e-06, "loss": 0.9081, "num_input_tokens_seen": 775321504, "step": 1706, "train_runtime": 123389.357, "train_tokens_per_second": 6283.536 }, { "epoch": 2.066591624968076, "grad_norm": 0.261411190032959, "learning_rate": 5e-06, "loss": 0.9314, "num_input_tokens_seen": 775797968, "step": 1707, "train_runtime": 123459.9465, "train_tokens_per_second": 6283.803 }, { "epoch": 2.067802381785677, "grad_norm": 0.24829885363578796, "learning_rate": 5e-06, "loss": 0.9948, "num_input_tokens_seen": 776232728, "step": 1708, "train_runtime": 123523.8479, "train_tokens_per_second": 6284.072 }, { "epoch": 2.0690131386032786, "grad_norm": 0.24466580152511597, "learning_rate": 5e-06, "loss": 0.9643, "num_input_tokens_seen": 776670512, "step": 1709, "train_runtime": 123588.3574, "train_tokens_per_second": 6284.334 }, { "epoch": 2.07022389542088, "grad_norm": 0.2513468265533447, "learning_rate": 5e-06, "loss": 0.8985, "num_input_tokens_seen": 777112832, "step": 1710, "train_runtime": 123653.4604, "train_tokens_per_second": 6284.602 }, { "epoch": 2.0714346522384814, "grad_norm": 0.2488190084695816, "learning_rate": 5e-06, "loss": 0.9549, "num_input_tokens_seen": 777563728, "step": 1711, "train_runtime": 123719.8041, "train_tokens_per_second": 6284.877 }, { "epoch": 2.072645409056083, "grad_norm": 0.2452920526266098, "learning_rate": 5e-06, "loss": 0.8695, "num_input_tokens_seen": 778016968, "step": 1712, "train_runtime": 123786.2393, "train_tokens_per_second": 6285.165 }, { "epoch": 2.073856165873684, "grad_norm": 0.24354714155197144, "learning_rate": 5e-06, "loss": 0.9446, "num_input_tokens_seen": 778467104, "step": 1713, "train_runtime": 123852.6969, "train_tokens_per_second": 6285.427 }, { "epoch": 2.0750669226912852, "grad_norm": 0.2566715180873871, "learning_rate": 5e-06, "loss": 0.9057, "num_input_tokens_seen": 778913672, "step": 1714, "train_runtime": 123918.4381, "train_tokens_per_second": 6285.696 }, { "epoch": 2.0762776795088866, "grad_norm": 0.23084959387779236, "learning_rate": 5e-06, "loss": 0.9132, "num_input_tokens_seen": 779369120, "step": 1715, "train_runtime": 123985.2855, "train_tokens_per_second": 6285.981 }, { "epoch": 2.077488436326488, "grad_norm": 0.24119411408901215, "learning_rate": 5e-06, "loss": 0.9102, "num_input_tokens_seen": 779831520, "step": 1716, "train_runtime": 124053.9529, "train_tokens_per_second": 6286.229 }, { "epoch": 2.0786991931440895, "grad_norm": 0.24514897167682648, "learning_rate": 5e-06, "loss": 0.895, "num_input_tokens_seen": 780292168, "step": 1717, "train_runtime": 124122.045, "train_tokens_per_second": 6286.491 }, { "epoch": 2.079909949961691, "grad_norm": 0.2566341459751129, "learning_rate": 5e-06, "loss": 0.9118, "num_input_tokens_seen": 780741608, "step": 1718, "train_runtime": 124188.5179, "train_tokens_per_second": 6286.746 }, { "epoch": 2.0811207067792923, "grad_norm": 0.2600558400154114, "learning_rate": 5e-06, "loss": 0.9192, "num_input_tokens_seen": 781191032, "step": 1719, "train_runtime": 124255.021, "train_tokens_per_second": 6286.998 }, { "epoch": 2.0823314635968937, "grad_norm": 0.23223178088665009, "learning_rate": 5e-06, "loss": 0.963, "num_input_tokens_seen": 781650672, "step": 1720, "train_runtime": 124322.857, "train_tokens_per_second": 6287.264 }, { "epoch": 2.083542220414495, "grad_norm": 0.24753454327583313, "learning_rate": 5e-06, "loss": 0.9398, "num_input_tokens_seen": 782098928, "step": 1721, "train_runtime": 124390.6893, "train_tokens_per_second": 6287.439 }, { "epoch": 2.0847529772320965, "grad_norm": 0.25024259090423584, "learning_rate": 5e-06, "loss": 0.9395, "num_input_tokens_seen": 782548672, "step": 1722, "train_runtime": 124457.3538, "train_tokens_per_second": 6287.685 }, { "epoch": 2.085963734049698, "grad_norm": 0.22619232535362244, "learning_rate": 5e-06, "loss": 0.9674, "num_input_tokens_seen": 783008520, "step": 1723, "train_runtime": 124525.0193, "train_tokens_per_second": 6287.961 }, { "epoch": 2.0871744908672993, "grad_norm": 0.2778150737285614, "learning_rate": 5e-06, "loss": 0.9461, "num_input_tokens_seen": 783477736, "step": 1724, "train_runtime": 124593.7565, "train_tokens_per_second": 6288.258 }, { "epoch": 2.0883852476849007, "grad_norm": 0.24901039898395538, "learning_rate": 5e-06, "loss": 0.995, "num_input_tokens_seen": 783923816, "step": 1725, "train_runtime": 124659.2402, "train_tokens_per_second": 6288.534 }, { "epoch": 2.0895960045025017, "grad_norm": 0.27725812792778015, "learning_rate": 5e-06, "loss": 0.8965, "num_input_tokens_seen": 784361728, "step": 1726, "train_runtime": 124723.8291, "train_tokens_per_second": 6288.788 }, { "epoch": 2.090806761320103, "grad_norm": 0.26983052492141724, "learning_rate": 5e-06, "loss": 0.8786, "num_input_tokens_seen": 784809792, "step": 1727, "train_runtime": 124789.805, "train_tokens_per_second": 6289.054 }, { "epoch": 2.0920175181377045, "grad_norm": 0.2453075647354126, "learning_rate": 5e-06, "loss": 0.893, "num_input_tokens_seen": 785274824, "step": 1728, "train_runtime": 124857.9453, "train_tokens_per_second": 6289.346 }, { "epoch": 2.093228274955306, "grad_norm": 0.2598790228366852, "learning_rate": 5e-06, "loss": 0.9416, "num_input_tokens_seen": 785730496, "step": 1729, "train_runtime": 124924.8416, "train_tokens_per_second": 6289.626 }, { "epoch": 2.0944390317729074, "grad_norm": 0.24463999271392822, "learning_rate": 5e-06, "loss": 0.9228, "num_input_tokens_seen": 786166712, "step": 1730, "train_runtime": 124989.1875, "train_tokens_per_second": 6289.878 }, { "epoch": 2.0956497885905088, "grad_norm": 0.2674955129623413, "learning_rate": 5e-06, "loss": 0.9096, "num_input_tokens_seen": 786592656, "step": 1731, "train_runtime": 125051.8506, "train_tokens_per_second": 6290.132 }, { "epoch": 2.09686054540811, "grad_norm": 0.25729501247406006, "learning_rate": 5e-06, "loss": 0.946, "num_input_tokens_seen": 787068024, "step": 1732, "train_runtime": 125122.4222, "train_tokens_per_second": 6290.384 }, { "epoch": 2.0980713022257116, "grad_norm": 0.25448042154312134, "learning_rate": 5e-06, "loss": 0.8971, "num_input_tokens_seen": 787513456, "step": 1733, "train_runtime": 125188.7353, "train_tokens_per_second": 6290.61 }, { "epoch": 2.099282059043313, "grad_norm": 0.22716376185417175, "learning_rate": 5e-06, "loss": 0.916, "num_input_tokens_seen": 787957768, "step": 1734, "train_runtime": 125253.9247, "train_tokens_per_second": 6290.883 }, { "epoch": 2.1004928158609144, "grad_norm": 0.2653203010559082, "learning_rate": 5e-06, "loss": 0.9091, "num_input_tokens_seen": 788385664, "step": 1735, "train_runtime": 125317.0041, "train_tokens_per_second": 6291.131 }, { "epoch": 2.101703572678516, "grad_norm": 0.2422814965248108, "learning_rate": 5e-06, "loss": 0.927, "num_input_tokens_seen": 788859096, "step": 1736, "train_runtime": 125386.8684, "train_tokens_per_second": 6291.401 }, { "epoch": 2.1029143294961172, "grad_norm": 0.2769072651863098, "learning_rate": 5e-06, "loss": 0.9175, "num_input_tokens_seen": 789313272, "step": 1737, "train_runtime": 125454.0354, "train_tokens_per_second": 6291.653 }, { "epoch": 2.1041250863137186, "grad_norm": 0.24323880672454834, "learning_rate": 5e-06, "loss": 0.8791, "num_input_tokens_seen": 789751800, "step": 1738, "train_runtime": 125518.7718, "train_tokens_per_second": 6291.902 }, { "epoch": 2.1053358431313196, "grad_norm": 0.23501011729240417, "learning_rate": 5e-06, "loss": 0.982, "num_input_tokens_seen": 790204272, "step": 1739, "train_runtime": 125585.6763, "train_tokens_per_second": 6292.153 }, { "epoch": 2.106546599948921, "grad_norm": 0.2527690529823303, "learning_rate": 5e-06, "loss": 0.9606, "num_input_tokens_seen": 790653104, "step": 1740, "train_runtime": 125651.8735, "train_tokens_per_second": 6292.41 }, { "epoch": 2.1077573567665224, "grad_norm": 0.23906171321868896, "learning_rate": 5e-06, "loss": 0.8936, "num_input_tokens_seen": 791130304, "step": 1741, "train_runtime": 125722.4514, "train_tokens_per_second": 6292.673 }, { "epoch": 2.108968113584124, "grad_norm": 0.26574084162712097, "learning_rate": 5e-06, "loss": 0.8722, "num_input_tokens_seen": 791579192, "step": 1742, "train_runtime": 125788.8875, "train_tokens_per_second": 6292.918 }, { "epoch": 2.1101788704017252, "grad_norm": 0.2502514123916626, "learning_rate": 5e-06, "loss": 0.9092, "num_input_tokens_seen": 792041488, "step": 1743, "train_runtime": 125857.0394, "train_tokens_per_second": 6293.184 }, { "epoch": 2.1113896272193267, "grad_norm": 0.23396193981170654, "learning_rate": 5e-06, "loss": 0.8746, "num_input_tokens_seen": 792517888, "step": 1744, "train_runtime": 125927.6031, "train_tokens_per_second": 6293.441 }, { "epoch": 2.112600384036928, "grad_norm": 0.23824480175971985, "learning_rate": 5e-06, "loss": 0.9406, "num_input_tokens_seen": 792972896, "step": 1745, "train_runtime": 125994.9999, "train_tokens_per_second": 6293.685 }, { "epoch": 2.1138111408545295, "grad_norm": 0.2447684109210968, "learning_rate": 5e-06, "loss": 0.9068, "num_input_tokens_seen": 793422832, "step": 1746, "train_runtime": 126061.4363, "train_tokens_per_second": 6293.938 }, { "epoch": 2.115021897672131, "grad_norm": 0.22363825142383575, "learning_rate": 5e-06, "loss": 0.8694, "num_input_tokens_seen": 793880960, "step": 1747, "train_runtime": 126129.4678, "train_tokens_per_second": 6294.175 }, { "epoch": 2.1162326544897323, "grad_norm": 0.23261670768260956, "learning_rate": 5e-06, "loss": 0.9484, "num_input_tokens_seen": 794328136, "step": 1748, "train_runtime": 126195.2383, "train_tokens_per_second": 6294.438 }, { "epoch": 2.1174434113073337, "grad_norm": 0.22803719341754913, "learning_rate": 5e-06, "loss": 0.8902, "num_input_tokens_seen": 794810736, "step": 1749, "train_runtime": 126266.3352, "train_tokens_per_second": 6294.716 }, { "epoch": 2.118654168124935, "grad_norm": 0.23991791903972626, "learning_rate": 5e-06, "loss": 0.9623, "num_input_tokens_seen": 795239912, "step": 1750, "train_runtime": 126329.1962, "train_tokens_per_second": 6294.981 }, { "epoch": 2.1198649249425365, "grad_norm": 0.2476852983236313, "learning_rate": 5e-06, "loss": 0.9741, "num_input_tokens_seen": 795711584, "step": 1751, "train_runtime": 126398.9592, "train_tokens_per_second": 6295.238 }, { "epoch": 2.1210756817601375, "grad_norm": 0.24314959347248077, "learning_rate": 5e-06, "loss": 0.9451, "num_input_tokens_seen": 796160352, "step": 1752, "train_runtime": 126464.8762, "train_tokens_per_second": 6295.506 }, { "epoch": 2.122286438577739, "grad_norm": 0.24649563431739807, "learning_rate": 5e-06, "loss": 0.9477, "num_input_tokens_seen": 796618904, "step": 1753, "train_runtime": 126532.6819, "train_tokens_per_second": 6295.756 }, { "epoch": 2.1234971953953403, "grad_norm": 0.2568952441215515, "learning_rate": 5e-06, "loss": 0.9223, "num_input_tokens_seen": 797063056, "step": 1754, "train_runtime": 126598.1871, "train_tokens_per_second": 6296.007 }, { "epoch": 2.1247079522129417, "grad_norm": 0.22107072174549103, "learning_rate": 5e-06, "loss": 0.8551, "num_input_tokens_seen": 797544816, "step": 1755, "train_runtime": 126669.3768, "train_tokens_per_second": 6296.272 }, { "epoch": 2.125918709030543, "grad_norm": 0.29669317603111267, "learning_rate": 5e-06, "loss": 0.9065, "num_input_tokens_seen": 797998352, "step": 1756, "train_runtime": 126736.3752, "train_tokens_per_second": 6296.522 }, { "epoch": 2.1271294658481446, "grad_norm": 0.2783910036087036, "learning_rate": 5e-06, "loss": 0.9066, "num_input_tokens_seen": 798444912, "step": 1757, "train_runtime": 126801.8963, "train_tokens_per_second": 6296.79 }, { "epoch": 2.128340222665746, "grad_norm": 0.2530405819416046, "learning_rate": 5e-06, "loss": 0.9181, "num_input_tokens_seen": 798908496, "step": 1758, "train_runtime": 126870.3737, "train_tokens_per_second": 6297.045 }, { "epoch": 2.1295509794833474, "grad_norm": 0.24973563849925995, "learning_rate": 5e-06, "loss": 0.9519, "num_input_tokens_seen": 799355104, "step": 1759, "train_runtime": 126936.2803, "train_tokens_per_second": 6297.294 }, { "epoch": 2.130761736300949, "grad_norm": 0.24954435229301453, "learning_rate": 5e-06, "loss": 0.9068, "num_input_tokens_seen": 799829328, "step": 1760, "train_runtime": 127006.5431, "train_tokens_per_second": 6297.544 }, { "epoch": 2.13197249311855, "grad_norm": 0.2468835860490799, "learning_rate": 5e-06, "loss": 0.905, "num_input_tokens_seen": 800297592, "step": 1761, "train_runtime": 127075.8817, "train_tokens_per_second": 6297.793 }, { "epoch": 2.1331832499361516, "grad_norm": 0.24968093633651733, "learning_rate": 5e-06, "loss": 0.9725, "num_input_tokens_seen": 800745464, "step": 1762, "train_runtime": 127141.6011, "train_tokens_per_second": 6298.06 }, { "epoch": 2.134394006753753, "grad_norm": 0.24861465394496918, "learning_rate": 5e-06, "loss": 0.9453, "num_input_tokens_seen": 801182160, "step": 1763, "train_runtime": 127205.6251, "train_tokens_per_second": 6298.323 }, { "epoch": 2.1356047635713544, "grad_norm": 0.2691054940223694, "learning_rate": 5e-06, "loss": 0.9744, "num_input_tokens_seen": 801626032, "step": 1764, "train_runtime": 127271.2495, "train_tokens_per_second": 6298.563 }, { "epoch": 2.136815520388956, "grad_norm": 0.2613939046859741, "learning_rate": 5e-06, "loss": 0.9207, "num_input_tokens_seen": 802080576, "step": 1765, "train_runtime": 127338.2741, "train_tokens_per_second": 6298.818 }, { "epoch": 2.138026277206557, "grad_norm": 0.2544805407524109, "learning_rate": 5e-06, "loss": 0.9109, "num_input_tokens_seen": 802543280, "step": 1766, "train_runtime": 127406.2366, "train_tokens_per_second": 6299.089 }, { "epoch": 2.139237034024158, "grad_norm": 0.25102829933166504, "learning_rate": 5e-06, "loss": 0.9391, "num_input_tokens_seen": 802991496, "step": 1767, "train_runtime": 127472.4744, "train_tokens_per_second": 6299.332 }, { "epoch": 2.1404477908417596, "grad_norm": 0.22922591865062714, "learning_rate": 5e-06, "loss": 0.8899, "num_input_tokens_seen": 803454416, "step": 1768, "train_runtime": 127540.4672, "train_tokens_per_second": 6299.604 }, { "epoch": 2.141658547659361, "grad_norm": 0.24334551393985748, "learning_rate": 5e-06, "loss": 0.9564, "num_input_tokens_seen": 803898848, "step": 1769, "train_runtime": 127606.0132, "train_tokens_per_second": 6299.851 }, { "epoch": 2.1428693044769624, "grad_norm": 0.26398375630378723, "learning_rate": 5e-06, "loss": 0.9162, "num_input_tokens_seen": 804346008, "step": 1770, "train_runtime": 127671.9825, "train_tokens_per_second": 6300.098 }, { "epoch": 2.144080061294564, "grad_norm": 0.26432764530181885, "learning_rate": 5e-06, "loss": 0.9098, "num_input_tokens_seen": 804800632, "step": 1771, "train_runtime": 127739.566, "train_tokens_per_second": 6300.324 }, { "epoch": 2.1452908181121653, "grad_norm": 0.24564692378044128, "learning_rate": 5e-06, "loss": 0.9261, "num_input_tokens_seen": 805243600, "step": 1772, "train_runtime": 127804.5626, "train_tokens_per_second": 6300.586 }, { "epoch": 2.1465015749297667, "grad_norm": 0.2491164207458496, "learning_rate": 5e-06, "loss": 0.9222, "num_input_tokens_seen": 805698520, "step": 1773, "train_runtime": 127871.3074, "train_tokens_per_second": 6300.855 }, { "epoch": 2.147712331747368, "grad_norm": 0.2387707233428955, "learning_rate": 5e-06, "loss": 0.971, "num_input_tokens_seen": 806151760, "step": 1774, "train_runtime": 127938.8195, "train_tokens_per_second": 6301.072 }, { "epoch": 2.1489230885649695, "grad_norm": 0.2344633936882019, "learning_rate": 5e-06, "loss": 0.8934, "num_input_tokens_seen": 806619560, "step": 1775, "train_runtime": 128009.9227, "train_tokens_per_second": 6301.227 }, { "epoch": 2.150133845382571, "grad_norm": 0.25677409768104553, "learning_rate": 5e-06, "loss": 0.9392, "num_input_tokens_seen": 807056520, "step": 1776, "train_runtime": 128074.5031, "train_tokens_per_second": 6301.461 }, { "epoch": 2.1513446022001723, "grad_norm": 0.24254010617733002, "learning_rate": 5e-06, "loss": 0.8995, "num_input_tokens_seen": 807505104, "step": 1777, "train_runtime": 128140.1656, "train_tokens_per_second": 6301.733 }, { "epoch": 2.1525553590177737, "grad_norm": 0.2752172350883484, "learning_rate": 5e-06, "loss": 0.885, "num_input_tokens_seen": 807940568, "step": 1778, "train_runtime": 128204.0096, "train_tokens_per_second": 6301.991 }, { "epoch": 2.153766115835375, "grad_norm": 0.25673961639404297, "learning_rate": 5e-06, "loss": 0.9249, "num_input_tokens_seen": 808380984, "step": 1779, "train_runtime": 128269.2141, "train_tokens_per_second": 6302.221 }, { "epoch": 2.154976872652976, "grad_norm": 0.24344174563884735, "learning_rate": 5e-06, "loss": 0.9452, "num_input_tokens_seen": 808834400, "step": 1780, "train_runtime": 128335.406, "train_tokens_per_second": 6302.504 }, { "epoch": 2.1561876294705775, "grad_norm": 0.23879307508468628, "learning_rate": 5e-06, "loss": 0.8993, "num_input_tokens_seen": 809277672, "step": 1781, "train_runtime": 128400.9451, "train_tokens_per_second": 6302.739 }, { "epoch": 2.157398386288179, "grad_norm": 0.24937401711940765, "learning_rate": 5e-06, "loss": 0.9594, "num_input_tokens_seen": 809742248, "step": 1782, "train_runtime": 128469.3459, "train_tokens_per_second": 6303.0 }, { "epoch": 2.1586091431057803, "grad_norm": 0.2503887414932251, "learning_rate": 5e-06, "loss": 0.9878, "num_input_tokens_seen": 810215920, "step": 1783, "train_runtime": 128539.3636, "train_tokens_per_second": 6303.251 }, { "epoch": 2.1598198999233817, "grad_norm": 0.2328265905380249, "learning_rate": 5e-06, "loss": 0.9034, "num_input_tokens_seen": 810667968, "step": 1784, "train_runtime": 128605.684, "train_tokens_per_second": 6303.516 }, { "epoch": 2.161030656740983, "grad_norm": 0.27375268936157227, "learning_rate": 5e-06, "loss": 0.9682, "num_input_tokens_seen": 811105360, "step": 1785, "train_runtime": 128670.1265, "train_tokens_per_second": 6303.758 }, { "epoch": 2.1622414135585846, "grad_norm": 0.24299326539039612, "learning_rate": 5e-06, "loss": 0.9181, "num_input_tokens_seen": 811562264, "step": 1786, "train_runtime": 128737.81, "train_tokens_per_second": 6303.993 }, { "epoch": 2.163452170376186, "grad_norm": 0.25592973828315735, "learning_rate": 5e-06, "loss": 0.9622, "num_input_tokens_seen": 812023096, "step": 1787, "train_runtime": 128805.4346, "train_tokens_per_second": 6304.261 }, { "epoch": 2.1646629271937874, "grad_norm": 0.27488279342651367, "learning_rate": 5e-06, "loss": 0.8852, "num_input_tokens_seen": 812487488, "step": 1788, "train_runtime": 128874.145, "train_tokens_per_second": 6304.503 }, { "epoch": 2.165873684011389, "grad_norm": 0.22598235309123993, "learning_rate": 5e-06, "loss": 0.8803, "num_input_tokens_seen": 812951528, "step": 1789, "train_runtime": 128942.4621, "train_tokens_per_second": 6304.762 }, { "epoch": 2.16708444082899, "grad_norm": 0.2569931149482727, "learning_rate": 5e-06, "loss": 0.9139, "num_input_tokens_seen": 813382096, "step": 1790, "train_runtime": 129005.9065, "train_tokens_per_second": 6304.999 }, { "epoch": 2.1682951976465916, "grad_norm": 0.24193847179412842, "learning_rate": 5e-06, "loss": 0.9344, "num_input_tokens_seen": 813843352, "step": 1791, "train_runtime": 129073.5943, "train_tokens_per_second": 6305.266 }, { "epoch": 2.169505954464193, "grad_norm": 0.23365779221057892, "learning_rate": 5e-06, "loss": 0.949, "num_input_tokens_seen": 814308480, "step": 1792, "train_runtime": 129142.3657, "train_tokens_per_second": 6305.51 }, { "epoch": 2.170716711281794, "grad_norm": 0.23331047594547272, "learning_rate": 5e-06, "loss": 0.9625, "num_input_tokens_seen": 814747656, "step": 1793, "train_runtime": 129207.0033, "train_tokens_per_second": 6305.755 }, { "epoch": 2.1719274680993954, "grad_norm": 0.26496171951293945, "learning_rate": 5e-06, "loss": 0.9196, "num_input_tokens_seen": 815206384, "step": 1794, "train_runtime": 129274.5471, "train_tokens_per_second": 6306.008 }, { "epoch": 2.173138224916997, "grad_norm": 0.23653088510036469, "learning_rate": 5e-06, "loss": 0.9278, "num_input_tokens_seen": 815662072, "step": 1795, "train_runtime": 129341.8841, "train_tokens_per_second": 6306.249 }, { "epoch": 2.1743489817345982, "grad_norm": 0.24810637533664703, "learning_rate": 5e-06, "loss": 0.9657, "num_input_tokens_seen": 816098360, "step": 1796, "train_runtime": 129406.0312, "train_tokens_per_second": 6306.494 }, { "epoch": 2.1755597385521996, "grad_norm": 0.23452900350093842, "learning_rate": 5e-06, "loss": 0.901, "num_input_tokens_seen": 816563568, "step": 1797, "train_runtime": 129474.7386, "train_tokens_per_second": 6306.74 }, { "epoch": 2.176770495369801, "grad_norm": 0.2348732203245163, "learning_rate": 5e-06, "loss": 0.9526, "num_input_tokens_seen": 817031280, "step": 1798, "train_runtime": 129544.0556, "train_tokens_per_second": 6306.976 }, { "epoch": 2.1779812521874025, "grad_norm": 0.2519684135913849, "learning_rate": 5e-06, "loss": 0.9246, "num_input_tokens_seen": 817503224, "step": 1799, "train_runtime": 129613.556, "train_tokens_per_second": 6307.236 }, { "epoch": 2.179192009005004, "grad_norm": 0.2337455451488495, "learning_rate": 5e-06, "loss": 0.9357, "num_input_tokens_seen": 817952256, "step": 1800, "train_runtime": 129679.9959, "train_tokens_per_second": 6307.467 }, { "epoch": 2.1804027658226053, "grad_norm": 0.22144410014152527, "learning_rate": 5e-06, "loss": 0.9059, "num_input_tokens_seen": 818411048, "step": 1801, "train_runtime": 129748.001, "train_tokens_per_second": 6307.697 }, { "epoch": 2.1816135226402067, "grad_norm": 0.23474140465259552, "learning_rate": 5e-06, "loss": 0.9692, "num_input_tokens_seen": 818854960, "step": 1802, "train_runtime": 129813.7428, "train_tokens_per_second": 6307.922 }, { "epoch": 2.182824279457808, "grad_norm": 0.2501378357410431, "learning_rate": 5e-06, "loss": 0.9379, "num_input_tokens_seen": 819305080, "step": 1803, "train_runtime": 129880.2273, "train_tokens_per_second": 6308.159 }, { "epoch": 2.1840350362754095, "grad_norm": 0.2469998002052307, "learning_rate": 5e-06, "loss": 0.9185, "num_input_tokens_seen": 819758552, "step": 1804, "train_runtime": 129947.1113, "train_tokens_per_second": 6308.402 }, { "epoch": 2.185245793093011, "grad_norm": 0.24533340334892273, "learning_rate": 5e-06, "loss": 0.9593, "num_input_tokens_seen": 820199856, "step": 1805, "train_runtime": 130012.1095, "train_tokens_per_second": 6308.642 }, { "epoch": 2.186456549910612, "grad_norm": 0.24642273783683777, "learning_rate": 5e-06, "loss": 0.934, "num_input_tokens_seen": 820657760, "step": 1806, "train_runtime": 130084.5055, "train_tokens_per_second": 6308.651 }, { "epoch": 2.1876673067282133, "grad_norm": 0.24866892397403717, "learning_rate": 5e-06, "loss": 0.9497, "num_input_tokens_seen": 821100408, "step": 1807, "train_runtime": 130154.3541, "train_tokens_per_second": 6308.666 }, { "epoch": 2.1888780635458147, "grad_norm": 0.24068208038806915, "learning_rate": 5e-06, "loss": 0.985, "num_input_tokens_seen": 821535944, "step": 1808, "train_runtime": 130223.3035, "train_tokens_per_second": 6308.671 }, { "epoch": 2.190088820363416, "grad_norm": 0.2489953488111496, "learning_rate": 5e-06, "loss": 0.9605, "num_input_tokens_seen": 822016328, "step": 1809, "train_runtime": 130299.4476, "train_tokens_per_second": 6308.671 }, { "epoch": 2.1912995771810175, "grad_norm": 0.2993757724761963, "learning_rate": 5e-06, "loss": 0.9075, "num_input_tokens_seen": 822468376, "step": 1810, "train_runtime": 130370.0272, "train_tokens_per_second": 6308.723 }, { "epoch": 2.192510333998619, "grad_norm": 0.23804070055484772, "learning_rate": 5e-06, "loss": 0.8944, "num_input_tokens_seen": 822908128, "step": 1811, "train_runtime": 130439.4385, "train_tokens_per_second": 6308.737 }, { "epoch": 2.1937210908162204, "grad_norm": 0.2520151734352112, "learning_rate": 5e-06, "loss": 0.9288, "num_input_tokens_seen": 823370440, "step": 1812, "train_runtime": 130512.4486, "train_tokens_per_second": 6308.75 }, { "epoch": 2.1949318476338218, "grad_norm": 0.27723604440689087, "learning_rate": 5e-06, "loss": 0.9607, "num_input_tokens_seen": 823827464, "step": 1813, "train_runtime": 130584.4904, "train_tokens_per_second": 6308.77 }, { "epoch": 2.196142604451423, "grad_norm": 0.24269568920135498, "learning_rate": 5e-06, "loss": 0.8963, "num_input_tokens_seen": 824292328, "step": 1814, "train_runtime": 130657.7113, "train_tokens_per_second": 6308.792 }, { "epoch": 2.1973533612690246, "grad_norm": 0.2714741826057434, "learning_rate": 5e-06, "loss": 0.9484, "num_input_tokens_seen": 824727776, "step": 1815, "train_runtime": 130725.9019, "train_tokens_per_second": 6308.832 }, { "epoch": 2.198564118086626, "grad_norm": 0.2618526518344879, "learning_rate": 5e-06, "loss": 0.88, "num_input_tokens_seen": 825185376, "step": 1816, "train_runtime": 130798.1284, "train_tokens_per_second": 6308.847 }, { "epoch": 2.1997748749042274, "grad_norm": 0.23151424527168274, "learning_rate": 5e-06, "loss": 0.9386, "num_input_tokens_seen": 825654456, "step": 1817, "train_runtime": 130871.6577, "train_tokens_per_second": 6308.887 }, { "epoch": 2.200985631721829, "grad_norm": 0.2615219056606293, "learning_rate": 5e-06, "loss": 0.9338, "num_input_tokens_seen": 826103496, "step": 1818, "train_runtime": 130942.5052, "train_tokens_per_second": 6308.902 }, { "epoch": 2.20219638853943, "grad_norm": 0.24982737004756927, "learning_rate": 5e-06, "loss": 0.9204, "num_input_tokens_seen": 826591216, "step": 1819, "train_runtime": 131020.0461, "train_tokens_per_second": 6308.891 }, { "epoch": 2.203407145357031, "grad_norm": 0.2572263479232788, "learning_rate": 5e-06, "loss": 0.8943, "num_input_tokens_seen": 827049808, "step": 1820, "train_runtime": 131088.1052, "train_tokens_per_second": 6309.114 }, { "epoch": 2.2046179021746326, "grad_norm": 0.2785727083683014, "learning_rate": 5e-06, "loss": 0.953, "num_input_tokens_seen": 827506440, "step": 1821, "train_runtime": 131155.3257, "train_tokens_per_second": 6309.362 }, { "epoch": 2.205828658992234, "grad_norm": 0.23997686803340912, "learning_rate": 5e-06, "loss": 0.9181, "num_input_tokens_seen": 827997808, "step": 1822, "train_runtime": 131228.0259, "train_tokens_per_second": 6309.611 }, { "epoch": 2.2070394158098354, "grad_norm": 0.2337905317544937, "learning_rate": 5e-06, "loss": 0.8995, "num_input_tokens_seen": 828443496, "step": 1823, "train_runtime": 131293.7615, "train_tokens_per_second": 6309.847 }, { "epoch": 2.208250172627437, "grad_norm": 0.22939272224903107, "learning_rate": 5e-06, "loss": 0.9455, "num_input_tokens_seen": 828911096, "step": 1824, "train_runtime": 131363.1117, "train_tokens_per_second": 6310.075 }, { "epoch": 2.2094609294450382, "grad_norm": 0.3020130693912506, "learning_rate": 5e-06, "loss": 0.9275, "num_input_tokens_seen": 829374400, "step": 1825, "train_runtime": 131431.3819, "train_tokens_per_second": 6310.322 }, { "epoch": 2.2106716862626397, "grad_norm": 0.25360703468322754, "learning_rate": 5e-06, "loss": 0.9106, "num_input_tokens_seen": 829831288, "step": 1826, "train_runtime": 131498.3697, "train_tokens_per_second": 6310.582 }, { "epoch": 2.211882443080241, "grad_norm": 0.24077729880809784, "learning_rate": 5e-06, "loss": 0.9154, "num_input_tokens_seen": 830266224, "step": 1827, "train_runtime": 131564.1036, "train_tokens_per_second": 6310.735 }, { "epoch": 2.2130931998978425, "grad_norm": 0.26646628975868225, "learning_rate": 5e-06, "loss": 0.9073, "num_input_tokens_seen": 830695456, "step": 1828, "train_runtime": 131628.3066, "train_tokens_per_second": 6310.918 }, { "epoch": 2.214303956715444, "grad_norm": 0.24032685160636902, "learning_rate": 5e-06, "loss": 0.9257, "num_input_tokens_seen": 831167968, "step": 1829, "train_runtime": 131697.9225, "train_tokens_per_second": 6311.17 }, { "epoch": 2.2155147135330453, "grad_norm": 0.24201683700084686, "learning_rate": 5e-06, "loss": 0.9432, "num_input_tokens_seen": 831603632, "step": 1830, "train_runtime": 131762.2198, "train_tokens_per_second": 6311.397 }, { "epoch": 2.2167254703506467, "grad_norm": 0.25040099024772644, "learning_rate": 5e-06, "loss": 0.9309, "num_input_tokens_seen": 832060792, "step": 1831, "train_runtime": 131829.2539, "train_tokens_per_second": 6311.655 }, { "epoch": 2.2179362271682477, "grad_norm": 0.2554630935192108, "learning_rate": 5e-06, "loss": 0.9374, "num_input_tokens_seen": 832517720, "step": 1832, "train_runtime": 131896.5149, "train_tokens_per_second": 6311.901 }, { "epoch": 2.219146983985849, "grad_norm": 0.2625337839126587, "learning_rate": 5e-06, "loss": 0.8854, "num_input_tokens_seen": 832952008, "step": 1833, "train_runtime": 131960.5471, "train_tokens_per_second": 6312.129 }, { "epoch": 2.2203577408034505, "grad_norm": 0.250442236661911, "learning_rate": 5e-06, "loss": 0.9404, "num_input_tokens_seen": 833400512, "step": 1834, "train_runtime": 132026.3765, "train_tokens_per_second": 6312.379 }, { "epoch": 2.221568497621052, "grad_norm": 0.24164512753486633, "learning_rate": 5e-06, "loss": 0.9089, "num_input_tokens_seen": 833865880, "step": 1835, "train_runtime": 132095.3899, "train_tokens_per_second": 6312.604 }, { "epoch": 2.2227792544386533, "grad_norm": 0.2589486837387085, "learning_rate": 5e-06, "loss": 0.9173, "num_input_tokens_seen": 834320256, "step": 1836, "train_runtime": 132162.8118, "train_tokens_per_second": 6312.822 }, { "epoch": 2.2239900112562547, "grad_norm": 0.26678481698036194, "learning_rate": 5e-06, "loss": 0.9453, "num_input_tokens_seen": 834793808, "step": 1837, "train_runtime": 132232.281, "train_tokens_per_second": 6313.086 }, { "epoch": 2.225200768073856, "grad_norm": 0.2960735261440277, "learning_rate": 5e-06, "loss": 0.8841, "num_input_tokens_seen": 835242864, "step": 1838, "train_runtime": 132298.5746, "train_tokens_per_second": 6313.317 }, { "epoch": 2.2264115248914575, "grad_norm": 0.24359485507011414, "learning_rate": 5e-06, "loss": 0.9434, "num_input_tokens_seen": 835713144, "step": 1839, "train_runtime": 132367.7406, "train_tokens_per_second": 6313.571 }, { "epoch": 2.227622281709059, "grad_norm": 0.24145717918872833, "learning_rate": 5e-06, "loss": 0.9528, "num_input_tokens_seen": 836169392, "step": 1840, "train_runtime": 132435.3509, "train_tokens_per_second": 6313.793 }, { "epoch": 2.2288330385266604, "grad_norm": 0.23885925114154816, "learning_rate": 5e-06, "loss": 0.9194, "num_input_tokens_seen": 836648992, "step": 1841, "train_runtime": 132506.0657, "train_tokens_per_second": 6314.043 }, { "epoch": 2.230043795344262, "grad_norm": 0.2691201865673065, "learning_rate": 5e-06, "loss": 0.9245, "num_input_tokens_seen": 837122792, "step": 1842, "train_runtime": 132575.9881, "train_tokens_per_second": 6314.287 }, { "epoch": 2.231254552161863, "grad_norm": 0.2495044767856598, "learning_rate": 5e-06, "loss": 0.9572, "num_input_tokens_seen": 837581056, "step": 1843, "train_runtime": 132643.3397, "train_tokens_per_second": 6314.535 }, { "epoch": 2.2324653089794646, "grad_norm": 0.2624557316303253, "learning_rate": 5e-06, "loss": 0.9458, "num_input_tokens_seen": 838029032, "step": 1844, "train_runtime": 132709.3351, "train_tokens_per_second": 6314.771 }, { "epoch": 2.233676065797066, "grad_norm": 0.23831219971179962, "learning_rate": 5e-06, "loss": 0.9305, "num_input_tokens_seen": 838473072, "step": 1845, "train_runtime": 132775.0436, "train_tokens_per_second": 6314.99 }, { "epoch": 2.234886822614667, "grad_norm": 0.2543146014213562, "learning_rate": 5e-06, "loss": 0.9035, "num_input_tokens_seen": 838923928, "step": 1846, "train_runtime": 132841.3512, "train_tokens_per_second": 6315.232 }, { "epoch": 2.2360975794322684, "grad_norm": 0.238714799284935, "learning_rate": 5e-06, "loss": 0.8885, "num_input_tokens_seen": 839396800, "step": 1847, "train_runtime": 132911.5196, "train_tokens_per_second": 6315.456 }, { "epoch": 2.23730833624987, "grad_norm": 0.22185099124908447, "learning_rate": 5e-06, "loss": 0.8943, "num_input_tokens_seen": 839870296, "step": 1848, "train_runtime": 132981.288, "train_tokens_per_second": 6315.703 }, { "epoch": 2.238519093067471, "grad_norm": 0.23457881808280945, "learning_rate": 5e-06, "loss": 0.9404, "num_input_tokens_seen": 840334928, "step": 1849, "train_runtime": 133049.8116, "train_tokens_per_second": 6315.942 }, { "epoch": 2.2397298498850726, "grad_norm": 0.22963935136795044, "learning_rate": 5e-06, "loss": 0.9318, "num_input_tokens_seen": 840803472, "step": 1850, "train_runtime": 133118.9729, "train_tokens_per_second": 6316.181 }, { "epoch": 2.240940606702674, "grad_norm": 0.24061468243598938, "learning_rate": 5e-06, "loss": 0.896, "num_input_tokens_seen": 841253104, "step": 1851, "train_runtime": 133185.6102, "train_tokens_per_second": 6316.396 }, { "epoch": 2.2421513635202754, "grad_norm": 0.28269779682159424, "learning_rate": 5e-06, "loss": 0.8956, "num_input_tokens_seen": 841712024, "step": 1852, "train_runtime": 133253.9849, "train_tokens_per_second": 6316.599 }, { "epoch": 2.243362120337877, "grad_norm": 0.2352578043937683, "learning_rate": 5e-06, "loss": 0.9073, "num_input_tokens_seen": 842172480, "step": 1853, "train_runtime": 133322.1946, "train_tokens_per_second": 6316.821 }, { "epoch": 2.2445728771554783, "grad_norm": 0.24535781145095825, "learning_rate": 5e-06, "loss": 0.9765, "num_input_tokens_seen": 842620904, "step": 1854, "train_runtime": 133388.3394, "train_tokens_per_second": 6317.051 }, { "epoch": 2.2457836339730797, "grad_norm": 0.24296994507312775, "learning_rate": 5e-06, "loss": 0.931, "num_input_tokens_seen": 843068456, "step": 1855, "train_runtime": 133454.0409, "train_tokens_per_second": 6317.294 }, { "epoch": 2.246994390790681, "grad_norm": 0.24628207087516785, "learning_rate": 5e-06, "loss": 0.9485, "num_input_tokens_seen": 843536640, "step": 1856, "train_runtime": 133523.3347, "train_tokens_per_second": 6317.522 }, { "epoch": 2.2482051476082825, "grad_norm": 0.2603435814380646, "learning_rate": 5e-06, "loss": 0.8639, "num_input_tokens_seen": 843998264, "step": 1857, "train_runtime": 133591.3388, "train_tokens_per_second": 6317.762 }, { "epoch": 2.249415904425884, "grad_norm": 0.2735736072063446, "learning_rate": 5e-06, "loss": 0.9561, "num_input_tokens_seen": 844454984, "step": 1858, "train_runtime": 133659.0053, "train_tokens_per_second": 6317.98 }, { "epoch": 2.2506266612434853, "grad_norm": 0.25031837821006775, "learning_rate": 5e-06, "loss": 0.9065, "num_input_tokens_seen": 844918424, "step": 1859, "train_runtime": 133727.5797, "train_tokens_per_second": 6318.206 }, { "epoch": 2.2518374180610863, "grad_norm": 0.24365690350532532, "learning_rate": 5e-06, "loss": 0.9319, "num_input_tokens_seen": 845370456, "step": 1860, "train_runtime": 133794.1892, "train_tokens_per_second": 6318.439 }, { "epoch": 2.2530481748786877, "grad_norm": 0.23625266551971436, "learning_rate": 5e-06, "loss": 0.9334, "num_input_tokens_seen": 845840944, "step": 1861, "train_runtime": 133864.0008, "train_tokens_per_second": 6318.659 }, { "epoch": 2.254258931696289, "grad_norm": 0.2634667456150055, "learning_rate": 5e-06, "loss": 0.9272, "num_input_tokens_seen": 846290816, "step": 1862, "train_runtime": 133930.3378, "train_tokens_per_second": 6318.888 }, { "epoch": 2.2554696885138905, "grad_norm": 0.2611207067966461, "learning_rate": 5e-06, "loss": 0.9475, "num_input_tokens_seen": 846744432, "step": 1863, "train_runtime": 133997.2146, "train_tokens_per_second": 6319.12 }, { "epoch": 2.256680445331492, "grad_norm": 0.2601044178009033, "learning_rate": 5e-06, "loss": 0.9298, "num_input_tokens_seen": 847201376, "step": 1864, "train_runtime": 134064.8678, "train_tokens_per_second": 6319.339 }, { "epoch": 2.2578912021490933, "grad_norm": 0.24679550528526306, "learning_rate": 5e-06, "loss": 0.9705, "num_input_tokens_seen": 847641792, "step": 1865, "train_runtime": 134129.2413, "train_tokens_per_second": 6319.59 }, { "epoch": 2.2591019589666947, "grad_norm": 0.23708128929138184, "learning_rate": 5e-06, "loss": 0.9267, "num_input_tokens_seen": 848110896, "step": 1866, "train_runtime": 134198.5814, "train_tokens_per_second": 6319.82 }, { "epoch": 2.260312715784296, "grad_norm": 0.2722652554512024, "learning_rate": 5e-06, "loss": 0.9508, "num_input_tokens_seen": 848564368, "step": 1867, "train_runtime": 134265.8682, "train_tokens_per_second": 6320.03 }, { "epoch": 2.2615234726018976, "grad_norm": 0.2940795123577118, "learning_rate": 5e-06, "loss": 0.9672, "num_input_tokens_seen": 849026776, "step": 1868, "train_runtime": 134334.2382, "train_tokens_per_second": 6320.256 }, { "epoch": 2.262734229419499, "grad_norm": 0.22633950412273407, "learning_rate": 5e-06, "loss": 0.9377, "num_input_tokens_seen": 849482712, "step": 1869, "train_runtime": 134401.8248, "train_tokens_per_second": 6320.47 }, { "epoch": 2.2639449862371004, "grad_norm": 0.24709929525852203, "learning_rate": 5e-06, "loss": 0.9169, "num_input_tokens_seen": 849939544, "step": 1870, "train_runtime": 134469.2516, "train_tokens_per_second": 6320.698 }, { "epoch": 2.265155743054702, "grad_norm": 0.2768784463405609, "learning_rate": 5e-06, "loss": 0.904, "num_input_tokens_seen": 850413688, "step": 1871, "train_runtime": 134539.3239, "train_tokens_per_second": 6320.93 }, { "epoch": 2.266366499872303, "grad_norm": 0.24461229145526886, "learning_rate": 5e-06, "loss": 0.9515, "num_input_tokens_seen": 850865528, "step": 1872, "train_runtime": 134605.9731, "train_tokens_per_second": 6321.157 }, { "epoch": 2.267577256689904, "grad_norm": 0.282145619392395, "learning_rate": 5e-06, "loss": 0.9657, "num_input_tokens_seen": 851305456, "step": 1873, "train_runtime": 134670.8221, "train_tokens_per_second": 6321.38 }, { "epoch": 2.2687880135075056, "grad_norm": 0.24732042849063873, "learning_rate": 5e-06, "loss": 0.9111, "num_input_tokens_seen": 851761888, "step": 1874, "train_runtime": 134738.0896, "train_tokens_per_second": 6321.612 }, { "epoch": 2.269998770325107, "grad_norm": 0.22736340761184692, "learning_rate": 5e-06, "loss": 0.9206, "num_input_tokens_seen": 852221120, "step": 1875, "train_runtime": 134806.452, "train_tokens_per_second": 6321.813 }, { "epoch": 2.2712095271427084, "grad_norm": 0.2657550275325775, "learning_rate": 5e-06, "loss": 0.9847, "num_input_tokens_seen": 852692200, "step": 1876, "train_runtime": 134875.6621, "train_tokens_per_second": 6322.061 }, { "epoch": 2.27242028396031, "grad_norm": 0.2386472225189209, "learning_rate": 5e-06, "loss": 0.9157, "num_input_tokens_seen": 853149952, "step": 1877, "train_runtime": 134943.8512, "train_tokens_per_second": 6322.259 }, { "epoch": 2.2736310407779112, "grad_norm": 0.2535218298435211, "learning_rate": 5e-06, "loss": 0.9699, "num_input_tokens_seen": 853585528, "step": 1878, "train_runtime": 135007.8471, "train_tokens_per_second": 6322.488 }, { "epoch": 2.2748417975955126, "grad_norm": 0.2574761211872101, "learning_rate": 5e-06, "loss": 0.9425, "num_input_tokens_seen": 854028664, "step": 1879, "train_runtime": 135073.4954, "train_tokens_per_second": 6322.696 }, { "epoch": 2.276052554413114, "grad_norm": 0.25591275095939636, "learning_rate": 5e-06, "loss": 0.9834, "num_input_tokens_seen": 854480232, "step": 1880, "train_runtime": 135141.1933, "train_tokens_per_second": 6322.87 }, { "epoch": 2.2772633112307155, "grad_norm": 0.2474929392337799, "learning_rate": 5e-06, "loss": 0.9189, "num_input_tokens_seen": 854957440, "step": 1881, "train_runtime": 135213.5472, "train_tokens_per_second": 6323.016 }, { "epoch": 2.278474068048317, "grad_norm": 0.24820934236049652, "learning_rate": 5e-06, "loss": 0.9193, "num_input_tokens_seen": 855410112, "step": 1882, "train_runtime": 135280.3287, "train_tokens_per_second": 6323.241 }, { "epoch": 2.2796848248659183, "grad_norm": 0.25758039951324463, "learning_rate": 5e-06, "loss": 0.9387, "num_input_tokens_seen": 855843248, "step": 1883, "train_runtime": 135343.7932, "train_tokens_per_second": 6323.476 }, { "epoch": 2.2808955816835197, "grad_norm": 0.32192301750183105, "learning_rate": 5e-06, "loss": 0.9572, "num_input_tokens_seen": 856285992, "step": 1884, "train_runtime": 135408.6894, "train_tokens_per_second": 6323.715 }, { "epoch": 2.282106338501121, "grad_norm": 0.2613389194011688, "learning_rate": 5e-06, "loss": 0.9244, "num_input_tokens_seen": 856720480, "step": 1885, "train_runtime": 135473.0216, "train_tokens_per_second": 6323.919 }, { "epoch": 2.283317095318722, "grad_norm": 0.2691548764705658, "learning_rate": 5e-06, "loss": 0.9155, "num_input_tokens_seen": 857171960, "step": 1886, "train_runtime": 135539.9108, "train_tokens_per_second": 6324.13 }, { "epoch": 2.2845278521363235, "grad_norm": 0.2469540685415268, "learning_rate": 5e-06, "loss": 0.914, "num_input_tokens_seen": 857629224, "step": 1887, "train_runtime": 135607.533, "train_tokens_per_second": 6324.348 }, { "epoch": 2.285738608953925, "grad_norm": 0.24443942308425903, "learning_rate": 5e-06, "loss": 0.8825, "num_input_tokens_seen": 858109800, "step": 1888, "train_runtime": 135678.9773, "train_tokens_per_second": 6324.56 }, { "epoch": 2.2869493657715263, "grad_norm": 0.2294890135526657, "learning_rate": 5e-06, "loss": 0.9354, "num_input_tokens_seen": 858577656, "step": 1889, "train_runtime": 135748.465, "train_tokens_per_second": 6324.769 }, { "epoch": 2.2881601225891277, "grad_norm": 0.23962783813476562, "learning_rate": 5e-06, "loss": 0.9559, "num_input_tokens_seen": 859030552, "step": 1890, "train_runtime": 135815.0908, "train_tokens_per_second": 6325.001 }, { "epoch": 2.289370879406729, "grad_norm": 0.28975754976272583, "learning_rate": 5e-06, "loss": 0.9192, "num_input_tokens_seen": 859488344, "step": 1891, "train_runtime": 135882.7476, "train_tokens_per_second": 6325.221 }, { "epoch": 2.2905816362243305, "grad_norm": 0.23890255391597748, "learning_rate": 5e-06, "loss": 0.8657, "num_input_tokens_seen": 859934904, "step": 1892, "train_runtime": 135949.0563, "train_tokens_per_second": 6325.42 }, { "epoch": 2.291792393041932, "grad_norm": 0.25363996624946594, "learning_rate": 5e-06, "loss": 0.9056, "num_input_tokens_seen": 860406744, "step": 1893, "train_runtime": 136018.9588, "train_tokens_per_second": 6325.638 }, { "epoch": 2.2930031498595334, "grad_norm": 0.26004326343536377, "learning_rate": 5e-06, "loss": 0.9289, "num_input_tokens_seen": 860848024, "step": 1894, "train_runtime": 136084.1855, "train_tokens_per_second": 6325.849 }, { "epoch": 2.2942139066771348, "grad_norm": 0.31975099444389343, "learning_rate": 5e-06, "loss": 0.9069, "num_input_tokens_seen": 861297312, "step": 1895, "train_runtime": 136150.7509, "train_tokens_per_second": 6326.056 }, { "epoch": 2.295424663494736, "grad_norm": 0.2796708345413208, "learning_rate": 5e-06, "loss": 0.8722, "num_input_tokens_seen": 861726552, "step": 1896, "train_runtime": 136214.2119, "train_tokens_per_second": 6326.26 }, { "epoch": 2.2966354203123376, "grad_norm": 0.32552340626716614, "learning_rate": 5e-06, "loss": 0.9589, "num_input_tokens_seen": 862175240, "step": 1897, "train_runtime": 136280.4971, "train_tokens_per_second": 6326.476 }, { "epoch": 2.297846177129939, "grad_norm": 0.2615937292575836, "learning_rate": 5e-06, "loss": 0.8747, "num_input_tokens_seen": 862622672, "step": 1898, "train_runtime": 136346.7206, "train_tokens_per_second": 6326.684 }, { "epoch": 2.29905693394754, "grad_norm": 0.27208948135375977, "learning_rate": 5e-06, "loss": 0.9689, "num_input_tokens_seen": 863047272, "step": 1899, "train_runtime": 136409.4794, "train_tokens_per_second": 6326.886 }, { "epoch": 2.3002676907651414, "grad_norm": 0.2440728098154068, "learning_rate": 5e-06, "loss": 0.9092, "num_input_tokens_seen": 863494008, "step": 1900, "train_runtime": 136475.8625, "train_tokens_per_second": 6327.082 }, { "epoch": 2.301478447582743, "grad_norm": 0.24035605788230896, "learning_rate": 5e-06, "loss": 0.9087, "num_input_tokens_seen": 863952264, "step": 1901, "train_runtime": 136543.8991, "train_tokens_per_second": 6327.286 }, { "epoch": 2.302689204400344, "grad_norm": 0.32341066002845764, "learning_rate": 5e-06, "loss": 0.8986, "num_input_tokens_seen": 864426280, "step": 1902, "train_runtime": 136614.3437, "train_tokens_per_second": 6327.493 }, { "epoch": 2.3038999612179456, "grad_norm": 0.28295764327049255, "learning_rate": 5e-06, "loss": 0.9257, "num_input_tokens_seen": 864905968, "step": 1903, "train_runtime": 136684.861, "train_tokens_per_second": 6327.738 }, { "epoch": 2.305110718035547, "grad_norm": 0.2590475380420685, "learning_rate": 5e-06, "loss": 0.9493, "num_input_tokens_seen": 865342848, "step": 1904, "train_runtime": 136749.3564, "train_tokens_per_second": 6327.948 }, { "epoch": 2.3063214748531484, "grad_norm": 0.25882232189178467, "learning_rate": 5e-06, "loss": 0.8907, "num_input_tokens_seen": 865794736, "step": 1905, "train_runtime": 136815.6825, "train_tokens_per_second": 6328.183 }, { "epoch": 2.30753223167075, "grad_norm": 0.26275938749313354, "learning_rate": 5e-06, "loss": 0.9502, "num_input_tokens_seen": 866250416, "step": 1906, "train_runtime": 136883.0305, "train_tokens_per_second": 6328.399 }, { "epoch": 2.3087429884883512, "grad_norm": 0.2927948236465454, "learning_rate": 5e-06, "loss": 0.8985, "num_input_tokens_seen": 866709976, "step": 1907, "train_runtime": 136951.2483, "train_tokens_per_second": 6328.602 }, { "epoch": 2.3099537453059527, "grad_norm": 0.2844955027103424, "learning_rate": 5e-06, "loss": 0.932, "num_input_tokens_seen": 867193176, "step": 1908, "train_runtime": 137022.6963, "train_tokens_per_second": 6328.829 }, { "epoch": 2.311164502123554, "grad_norm": 0.2642100155353546, "learning_rate": 5e-06, "loss": 0.9547, "num_input_tokens_seen": 867661560, "step": 1909, "train_runtime": 137092.1567, "train_tokens_per_second": 6329.039 }, { "epoch": 2.3123752589411555, "grad_norm": 0.2718662917613983, "learning_rate": 5e-06, "loss": 0.9502, "num_input_tokens_seen": 868124000, "step": 1910, "train_runtime": 137160.7427, "train_tokens_per_second": 6329.245 }, { "epoch": 2.313586015758757, "grad_norm": 0.2536037862300873, "learning_rate": 5e-06, "loss": 0.9193, "num_input_tokens_seen": 868588696, "step": 1911, "train_runtime": 137229.9509, "train_tokens_per_second": 6329.44 }, { "epoch": 2.314796772576358, "grad_norm": 0.249566450715065, "learning_rate": 5e-06, "loss": 0.949, "num_input_tokens_seen": 869053760, "step": 1912, "train_runtime": 137299.3417, "train_tokens_per_second": 6329.628 }, { "epoch": 2.3160075293939597, "grad_norm": 0.262437105178833, "learning_rate": 5e-06, "loss": 0.9716, "num_input_tokens_seen": 869515720, "step": 1913, "train_runtime": 137367.5168, "train_tokens_per_second": 6329.85 }, { "epoch": 2.3172182862115607, "grad_norm": 0.23782069981098175, "learning_rate": 5e-06, "loss": 0.9011, "num_input_tokens_seen": 869971160, "step": 1914, "train_runtime": 137434.8916, "train_tokens_per_second": 6330.06 }, { "epoch": 2.318429043029162, "grad_norm": 0.2517566978931427, "learning_rate": 5e-06, "loss": 0.91, "num_input_tokens_seen": 870399880, "step": 1915, "train_runtime": 137497.6211, "train_tokens_per_second": 6330.29 }, { "epoch": 2.3196397998467635, "grad_norm": 0.26012682914733887, "learning_rate": 5e-06, "loss": 0.9396, "num_input_tokens_seen": 870853040, "step": 1916, "train_runtime": 137564.414, "train_tokens_per_second": 6330.511 }, { "epoch": 2.320850556664365, "grad_norm": 0.24836315214633942, "learning_rate": 5e-06, "loss": 1.01, "num_input_tokens_seen": 871314416, "step": 1917, "train_runtime": 137632.4092, "train_tokens_per_second": 6330.736 }, { "epoch": 2.3220613134819663, "grad_norm": 0.26975148916244507, "learning_rate": 5e-06, "loss": 0.9313, "num_input_tokens_seen": 871770264, "step": 1918, "train_runtime": 137699.9741, "train_tokens_per_second": 6330.94 }, { "epoch": 2.3232720702995677, "grad_norm": 0.23591186106204987, "learning_rate": 5e-06, "loss": 0.9242, "num_input_tokens_seen": 872229016, "step": 1919, "train_runtime": 137767.7326, "train_tokens_per_second": 6331.156 }, { "epoch": 2.324482827117169, "grad_norm": 0.24724294245243073, "learning_rate": 5e-06, "loss": 0.9166, "num_input_tokens_seen": 872653144, "step": 1920, "train_runtime": 137830.5002, "train_tokens_per_second": 6331.35 }, { "epoch": 2.3256935839347705, "grad_norm": 0.23832382261753082, "learning_rate": 5e-06, "loss": 0.9463, "num_input_tokens_seen": 873110384, "step": 1921, "train_runtime": 137898.239, "train_tokens_per_second": 6331.556 }, { "epoch": 2.326904340752372, "grad_norm": 0.28097233176231384, "learning_rate": 5e-06, "loss": 0.9624, "num_input_tokens_seen": 873570488, "step": 1922, "train_runtime": 137966.7648, "train_tokens_per_second": 6331.746 }, { "epoch": 2.3281150975699734, "grad_norm": 0.23570659756660461, "learning_rate": 5e-06, "loss": 0.9168, "num_input_tokens_seen": 874019688, "step": 1923, "train_runtime": 138032.8862, "train_tokens_per_second": 6331.967 }, { "epoch": 2.329325854387575, "grad_norm": 0.2484421581029892, "learning_rate": 5e-06, "loss": 0.9224, "num_input_tokens_seen": 874476456, "step": 1924, "train_runtime": 138100.3145, "train_tokens_per_second": 6332.183 }, { "epoch": 2.3305366112051757, "grad_norm": 0.2436489313840866, "learning_rate": 5e-06, "loss": 0.9769, "num_input_tokens_seen": 874906120, "step": 1925, "train_runtime": 138163.392, "train_tokens_per_second": 6332.402 }, { "epoch": 2.3317473680227776, "grad_norm": 0.23818077147006989, "learning_rate": 5e-06, "loss": 0.9719, "num_input_tokens_seen": 875359880, "step": 1926, "train_runtime": 138230.2722, "train_tokens_per_second": 6332.621 }, { "epoch": 2.3329581248403786, "grad_norm": 0.2646999955177307, "learning_rate": 5e-06, "loss": 0.9759, "num_input_tokens_seen": 875813712, "step": 1927, "train_runtime": 138297.4023, "train_tokens_per_second": 6332.828 }, { "epoch": 2.33416888165798, "grad_norm": 0.24218083918094635, "learning_rate": 5e-06, "loss": 0.9444, "num_input_tokens_seen": 876275792, "step": 1928, "train_runtime": 138366.0473, "train_tokens_per_second": 6333.026 }, { "epoch": 2.3353796384755814, "grad_norm": 0.22336937487125397, "learning_rate": 5e-06, "loss": 0.8828, "num_input_tokens_seen": 876756576, "step": 1929, "train_runtime": 138437.719, "train_tokens_per_second": 6333.22 }, { "epoch": 2.336590395293183, "grad_norm": 0.21716539561748505, "learning_rate": 5e-06, "loss": 0.9001, "num_input_tokens_seen": 877237832, "step": 1930, "train_runtime": 138508.8901, "train_tokens_per_second": 6333.441 }, { "epoch": 2.337801152110784, "grad_norm": 0.24788719415664673, "learning_rate": 5e-06, "loss": 0.9169, "num_input_tokens_seen": 877681584, "step": 1931, "train_runtime": 138574.6608, "train_tokens_per_second": 6333.637 }, { "epoch": 2.3390119089283856, "grad_norm": 0.2476462423801422, "learning_rate": 5e-06, "loss": 0.9653, "num_input_tokens_seen": 878125368, "step": 1932, "train_runtime": 138639.9943, "train_tokens_per_second": 6333.853 }, { "epoch": 2.340222665745987, "grad_norm": 0.26290398836135864, "learning_rate": 5e-06, "loss": 0.9275, "num_input_tokens_seen": 878571608, "step": 1933, "train_runtime": 138706.0769, "train_tokens_per_second": 6334.053 }, { "epoch": 2.3414334225635884, "grad_norm": 0.26040390133857727, "learning_rate": 5e-06, "loss": 0.9176, "num_input_tokens_seen": 879012848, "step": 1934, "train_runtime": 138772.6778, "train_tokens_per_second": 6334.192 }, { "epoch": 2.34264417938119, "grad_norm": 0.22445742785930634, "learning_rate": 5e-06, "loss": 0.8518, "num_input_tokens_seen": 879501408, "step": 1935, "train_runtime": 138846.372, "train_tokens_per_second": 6334.349 }, { "epoch": 2.3438549361987913, "grad_norm": 0.2317107766866684, "learning_rate": 5e-06, "loss": 0.9035, "num_input_tokens_seen": 879957520, "step": 1936, "train_runtime": 138913.7202, "train_tokens_per_second": 6334.562 }, { "epoch": 2.3450656930163927, "grad_norm": 0.2713346481323242, "learning_rate": 5e-06, "loss": 0.9087, "num_input_tokens_seen": 880411232, "step": 1937, "train_runtime": 138981.0054, "train_tokens_per_second": 6334.759 }, { "epoch": 2.346276449833994, "grad_norm": 0.24011683464050293, "learning_rate": 5e-06, "loss": 0.9394, "num_input_tokens_seen": 880860792, "step": 1938, "train_runtime": 139047.4125, "train_tokens_per_second": 6334.967 }, { "epoch": 2.3474872066515955, "grad_norm": 0.2560282349586487, "learning_rate": 5e-06, "loss": 0.9062, "num_input_tokens_seen": 881332280, "step": 1939, "train_runtime": 139117.5608, "train_tokens_per_second": 6335.162 }, { "epoch": 2.3486979634691965, "grad_norm": 0.23384442925453186, "learning_rate": 5e-06, "loss": 0.879, "num_input_tokens_seen": 881765136, "step": 1940, "train_runtime": 139181.4369, "train_tokens_per_second": 6335.365 }, { "epoch": 2.349908720286798, "grad_norm": 0.23254314064979553, "learning_rate": 5e-06, "loss": 0.8938, "num_input_tokens_seen": 882225136, "step": 1941, "train_runtime": 139249.4601, "train_tokens_per_second": 6335.573 }, { "epoch": 2.3511194771043993, "grad_norm": 0.2877858281135559, "learning_rate": 5e-06, "loss": 0.9639, "num_input_tokens_seen": 882660080, "step": 1942, "train_runtime": 139313.9834, "train_tokens_per_second": 6335.761 }, { "epoch": 2.3523302339220007, "grad_norm": 0.24326159060001373, "learning_rate": 5e-06, "loss": 0.9099, "num_input_tokens_seen": 883082800, "step": 1943, "train_runtime": 139376.2313, "train_tokens_per_second": 6335.964 }, { "epoch": 2.353540990739602, "grad_norm": 0.23450767993927002, "learning_rate": 5e-06, "loss": 0.9258, "num_input_tokens_seen": 883532720, "step": 1944, "train_runtime": 139443.249, "train_tokens_per_second": 6336.146 }, { "epoch": 2.3547517475572035, "grad_norm": 0.25885146856307983, "learning_rate": 5e-06, "loss": 0.8931, "num_input_tokens_seen": 883983312, "step": 1945, "train_runtime": 139509.936, "train_tokens_per_second": 6336.347 }, { "epoch": 2.355962504374805, "grad_norm": 0.23597835004329681, "learning_rate": 5e-06, "loss": 0.9222, "num_input_tokens_seen": 884453544, "step": 1946, "train_runtime": 139579.5403, "train_tokens_per_second": 6336.556 }, { "epoch": 2.3571732611924063, "grad_norm": 0.2448599487543106, "learning_rate": 5e-06, "loss": 0.9685, "num_input_tokens_seen": 884895312, "step": 1947, "train_runtime": 139645.5434, "train_tokens_per_second": 6336.724 }, { "epoch": 2.3583840180100077, "grad_norm": 0.25267136096954346, "learning_rate": 5e-06, "loss": 0.9375, "num_input_tokens_seen": 885354208, "step": 1948, "train_runtime": 139713.3438, "train_tokens_per_second": 6336.934 }, { "epoch": 2.359594774827609, "grad_norm": 0.2259773463010788, "learning_rate": 5e-06, "loss": 0.8995, "num_input_tokens_seen": 885828656, "step": 1949, "train_runtime": 139783.4906, "train_tokens_per_second": 6337.148 }, { "epoch": 2.3608055316452106, "grad_norm": 0.25038328766822815, "learning_rate": 5e-06, "loss": 0.9297, "num_input_tokens_seen": 886262736, "step": 1950, "train_runtime": 139847.7611, "train_tokens_per_second": 6337.339 }, { "epoch": 2.362016288462812, "grad_norm": 0.23766860365867615, "learning_rate": 5e-06, "loss": 0.8919, "num_input_tokens_seen": 886724640, "step": 1951, "train_runtime": 139916.2008, "train_tokens_per_second": 6337.541 }, { "epoch": 2.3632270452804134, "grad_norm": 0.2303091287612915, "learning_rate": 5e-06, "loss": 0.8914, "num_input_tokens_seen": 887164120, "step": 1952, "train_runtime": 139981.2882, "train_tokens_per_second": 6337.734 }, { "epoch": 2.3644378020980144, "grad_norm": 0.24684786796569824, "learning_rate": 5e-06, "loss": 0.8894, "num_input_tokens_seen": 887620064, "step": 1953, "train_runtime": 140048.5002, "train_tokens_per_second": 6337.948 }, { "epoch": 2.3656485589156158, "grad_norm": 0.2284991294145584, "learning_rate": 5e-06, "loss": 0.9494, "num_input_tokens_seen": 888070904, "step": 1954, "train_runtime": 140114.8455, "train_tokens_per_second": 6338.164 }, { "epoch": 2.366859315733217, "grad_norm": 0.254375159740448, "learning_rate": 5e-06, "loss": 0.8982, "num_input_tokens_seen": 888519704, "step": 1955, "train_runtime": 140181.3735, "train_tokens_per_second": 6338.358 }, { "epoch": 2.3680700725508186, "grad_norm": 0.2587945759296417, "learning_rate": 5e-06, "loss": 0.926, "num_input_tokens_seen": 888953576, "step": 1956, "train_runtime": 140245.6256, "train_tokens_per_second": 6338.548 }, { "epoch": 2.36928082936842, "grad_norm": 0.263895183801651, "learning_rate": 5e-06, "loss": 0.8845, "num_input_tokens_seen": 889422280, "step": 1957, "train_runtime": 140315.4592, "train_tokens_per_second": 6338.733 }, { "epoch": 2.3704915861860214, "grad_norm": 0.22773973643779755, "learning_rate": 5e-06, "loss": 0.8691, "num_input_tokens_seen": 889895248, "step": 1958, "train_runtime": 140386.1495, "train_tokens_per_second": 6338.911 }, { "epoch": 2.371702343003623, "grad_norm": 0.26075223088264465, "learning_rate": 5e-06, "loss": 1.0117, "num_input_tokens_seen": 890333472, "step": 1959, "train_runtime": 140450.4448, "train_tokens_per_second": 6339.129 }, { "epoch": 2.3729130998212242, "grad_norm": 0.2427862286567688, "learning_rate": 5e-06, "loss": 0.9224, "num_input_tokens_seen": 890779816, "step": 1960, "train_runtime": 140516.3711, "train_tokens_per_second": 6339.331 }, { "epoch": 2.3741238566388256, "grad_norm": 0.24546240270137787, "learning_rate": 5e-06, "loss": 0.9299, "num_input_tokens_seen": 891243432, "step": 1961, "train_runtime": 140584.7256, "train_tokens_per_second": 6339.547 }, { "epoch": 2.375334613456427, "grad_norm": 0.24161502718925476, "learning_rate": 5e-06, "loss": 0.9268, "num_input_tokens_seen": 891695152, "step": 1962, "train_runtime": 140651.3388, "train_tokens_per_second": 6339.756 }, { "epoch": 2.3765453702740285, "grad_norm": 0.25115856528282166, "learning_rate": 5e-06, "loss": 0.9467, "num_input_tokens_seen": 892133512, "step": 1963, "train_runtime": 140716.2837, "train_tokens_per_second": 6339.945 }, { "epoch": 2.37775612709163, "grad_norm": 0.2623535692691803, "learning_rate": 5e-06, "loss": 0.9482, "num_input_tokens_seen": 892581496, "step": 1964, "train_runtime": 140782.484, "train_tokens_per_second": 6340.146 }, { "epoch": 2.3789668839092313, "grad_norm": 0.2622727155685425, "learning_rate": 5e-06, "loss": 0.9253, "num_input_tokens_seen": 893044504, "step": 1965, "train_runtime": 140850.9639, "train_tokens_per_second": 6340.351 }, { "epoch": 2.3801776407268322, "grad_norm": 0.2433587610721588, "learning_rate": 5e-06, "loss": 0.9352, "num_input_tokens_seen": 893498056, "step": 1966, "train_runtime": 140918.3136, "train_tokens_per_second": 6340.539 }, { "epoch": 2.3813883975444337, "grad_norm": 0.25335007905960083, "learning_rate": 5e-06, "loss": 0.9527, "num_input_tokens_seen": 893927120, "step": 1967, "train_runtime": 140981.7443, "train_tokens_per_second": 6340.73 }, { "epoch": 2.382599154362035, "grad_norm": 0.24073803424835205, "learning_rate": 5e-06, "loss": 0.964, "num_input_tokens_seen": 894397904, "step": 1968, "train_runtime": 141051.8869, "train_tokens_per_second": 6340.914 }, { "epoch": 2.3838099111796365, "grad_norm": 0.2417605221271515, "learning_rate": 5e-06, "loss": 0.8975, "num_input_tokens_seen": 894866064, "step": 1969, "train_runtime": 141121.0337, "train_tokens_per_second": 6341.125 }, { "epoch": 2.385020667997238, "grad_norm": 0.264097660779953, "learning_rate": 5e-06, "loss": 0.8888, "num_input_tokens_seen": 895325256, "step": 1970, "train_runtime": 141188.7929, "train_tokens_per_second": 6341.334 }, { "epoch": 2.3862314248148393, "grad_norm": 0.23401835560798645, "learning_rate": 5e-06, "loss": 0.914, "num_input_tokens_seen": 895793728, "step": 1971, "train_runtime": 141257.3012, "train_tokens_per_second": 6341.575 }, { "epoch": 2.3874421816324407, "grad_norm": 0.23458734154701233, "learning_rate": 5e-06, "loss": 0.9237, "num_input_tokens_seen": 896231368, "step": 1972, "train_runtime": 141321.9812, "train_tokens_per_second": 6341.769 }, { "epoch": 2.388652938450042, "grad_norm": 0.24703070521354675, "learning_rate": 5e-06, "loss": 0.8911, "num_input_tokens_seen": 896673912, "step": 1973, "train_runtime": 141387.431, "train_tokens_per_second": 6341.963 }, { "epoch": 2.3898636952676435, "grad_norm": 0.2715272009372711, "learning_rate": 5e-06, "loss": 0.9271, "num_input_tokens_seen": 897130160, "step": 1974, "train_runtime": 141455.1371, "train_tokens_per_second": 6342.153 }, { "epoch": 2.391074452085245, "grad_norm": 0.27634164690971375, "learning_rate": 5e-06, "loss": 0.9441, "num_input_tokens_seen": 897585568, "step": 1975, "train_runtime": 141522.4112, "train_tokens_per_second": 6342.356 }, { "epoch": 2.3922852089028463, "grad_norm": 0.2545999586582184, "learning_rate": 5e-06, "loss": 0.9379, "num_input_tokens_seen": 898024864, "step": 1976, "train_runtime": 141587.355, "train_tokens_per_second": 6342.55 }, { "epoch": 2.3934959657204478, "grad_norm": 0.23588921129703522, "learning_rate": 5e-06, "loss": 0.8916, "num_input_tokens_seen": 898487168, "step": 1977, "train_runtime": 141655.6271, "train_tokens_per_second": 6342.757 }, { "epoch": 2.394706722538049, "grad_norm": 0.28697672486305237, "learning_rate": 5e-06, "loss": 0.9252, "num_input_tokens_seen": 898948576, "step": 1978, "train_runtime": 141724.1485, "train_tokens_per_second": 6342.946 }, { "epoch": 2.39591747935565, "grad_norm": 0.2565509080886841, "learning_rate": 5e-06, "loss": 0.8936, "num_input_tokens_seen": 899411768, "step": 1979, "train_runtime": 141793.0536, "train_tokens_per_second": 6343.13 }, { "epoch": 2.3971282361732515, "grad_norm": 0.25218653678894043, "learning_rate": 5e-06, "loss": 0.9299, "num_input_tokens_seen": 899887136, "step": 1980, "train_runtime": 141862.8968, "train_tokens_per_second": 6343.358 }, { "epoch": 2.398338992990853, "grad_norm": 0.25139865279197693, "learning_rate": 5e-06, "loss": 0.9023, "num_input_tokens_seen": 900321552, "step": 1981, "train_runtime": 141927.0945, "train_tokens_per_second": 6343.55 }, { "epoch": 2.3995497498084544, "grad_norm": 0.2799871265888214, "learning_rate": 5e-06, "loss": 0.9437, "num_input_tokens_seen": 900784368, "step": 1982, "train_runtime": 141995.7321, "train_tokens_per_second": 6343.743 }, { "epoch": 2.400760506626056, "grad_norm": 0.25518181920051575, "learning_rate": 5e-06, "loss": 0.8679, "num_input_tokens_seen": 901240864, "step": 1983, "train_runtime": 142063.228, "train_tokens_per_second": 6343.942 }, { "epoch": 2.401971263443657, "grad_norm": 0.28088992834091187, "learning_rate": 5e-06, "loss": 0.9436, "num_input_tokens_seen": 901673304, "step": 1984, "train_runtime": 142126.5258, "train_tokens_per_second": 6344.159 }, { "epoch": 2.4031820202612586, "grad_norm": 0.23146390914916992, "learning_rate": 5e-06, "loss": 0.928, "num_input_tokens_seen": 902127568, "step": 1985, "train_runtime": 142193.8578, "train_tokens_per_second": 6344.35 }, { "epoch": 2.40439277707886, "grad_norm": 0.23194481432437897, "learning_rate": 5e-06, "loss": 0.9461, "num_input_tokens_seen": 902577560, "step": 1986, "train_runtime": 142260.1684, "train_tokens_per_second": 6344.556 }, { "epoch": 2.4056035338964614, "grad_norm": 0.2525422275066376, "learning_rate": 5e-06, "loss": 0.9334, "num_input_tokens_seen": 903038008, "step": 1987, "train_runtime": 142328.7983, "train_tokens_per_second": 6344.731 }, { "epoch": 2.406814290714063, "grad_norm": 0.258497953414917, "learning_rate": 5e-06, "loss": 0.9355, "num_input_tokens_seen": 903491080, "step": 1988, "train_runtime": 142397.3731, "train_tokens_per_second": 6344.858 }, { "epoch": 2.4080250475316642, "grad_norm": 0.24086523056030273, "learning_rate": 5e-06, "loss": 0.9504, "num_input_tokens_seen": 903940184, "step": 1989, "train_runtime": 142463.8782, "train_tokens_per_second": 6345.048 }, { "epoch": 2.4092358043492657, "grad_norm": 0.24795937538146973, "learning_rate": 5e-06, "loss": 0.9263, "num_input_tokens_seen": 904404544, "step": 1990, "train_runtime": 142532.6712, "train_tokens_per_second": 6345.244 }, { "epoch": 2.410446561166867, "grad_norm": 0.2638545632362366, "learning_rate": 5e-06, "loss": 1.017, "num_input_tokens_seen": 904832936, "step": 1991, "train_runtime": 142595.965, "train_tokens_per_second": 6345.432 }, { "epoch": 2.411657317984468, "grad_norm": 0.23633180558681488, "learning_rate": 5e-06, "loss": 0.9562, "num_input_tokens_seen": 905310056, "step": 1992, "train_runtime": 142666.5754, "train_tokens_per_second": 6345.635 }, { "epoch": 2.41286807480207, "grad_norm": 0.2632332444190979, "learning_rate": 5e-06, "loss": 0.9153, "num_input_tokens_seen": 905746048, "step": 1993, "train_runtime": 142730.9361, "train_tokens_per_second": 6345.829 }, { "epoch": 2.414078831619671, "grad_norm": 0.2594749927520752, "learning_rate": 5e-06, "loss": 0.9256, "num_input_tokens_seen": 906175336, "step": 1994, "train_runtime": 142794.2311, "train_tokens_per_second": 6346.022 }, { "epoch": 2.4152895884372723, "grad_norm": 0.24447709321975708, "learning_rate": 5e-06, "loss": 0.8639, "num_input_tokens_seen": 906625416, "step": 1995, "train_runtime": 142860.8603, "train_tokens_per_second": 6346.213 }, { "epoch": 2.4165003452548737, "grad_norm": 0.2638216018676758, "learning_rate": 5e-06, "loss": 0.9065, "num_input_tokens_seen": 907116368, "step": 1996, "train_runtime": 142934.0058, "train_tokens_per_second": 6346.4 }, { "epoch": 2.417711102072475, "grad_norm": 0.23817259073257446, "learning_rate": 5e-06, "loss": 0.9236, "num_input_tokens_seen": 907590384, "step": 1997, "train_runtime": 143004.1952, "train_tokens_per_second": 6346.6 }, { "epoch": 2.4189218588900765, "grad_norm": 0.2550632059574127, "learning_rate": 5e-06, "loss": 0.967, "num_input_tokens_seen": 908026040, "step": 1998, "train_runtime": 143068.2631, "train_tokens_per_second": 6346.803 }, { "epoch": 2.420132615707678, "grad_norm": 0.2464226633310318, "learning_rate": 5e-06, "loss": 0.9368, "num_input_tokens_seen": 908465552, "step": 1999, "train_runtime": 143133.3558, "train_tokens_per_second": 6346.987 }, { "epoch": 2.4213433725252793, "grad_norm": 0.2467721402645111, "learning_rate": 5e-06, "loss": 0.9368, "num_input_tokens_seen": 908920080, "step": 2000, "train_runtime": 143200.8915, "train_tokens_per_second": 6347.168 }, { "epoch": 2.4225541293428807, "grad_norm": 0.2405187487602234, "learning_rate": 5e-06, "loss": 0.9122, "num_input_tokens_seen": 909366536, "step": 2001, "train_runtime": 143268.6692, "train_tokens_per_second": 6347.281 }, { "epoch": 2.423764886160482, "grad_norm": 0.2485346794128418, "learning_rate": 5e-06, "loss": 0.9035, "num_input_tokens_seen": 909816392, "step": 2002, "train_runtime": 143335.0761, "train_tokens_per_second": 6347.479 }, { "epoch": 2.4249756429780835, "grad_norm": 0.24442023038864136, "learning_rate": 5e-06, "loss": 0.9275, "num_input_tokens_seen": 910258544, "step": 2003, "train_runtime": 143400.4166, "train_tokens_per_second": 6347.67 }, { "epoch": 2.426186399795685, "grad_norm": 0.229239359498024, "learning_rate": 5e-06, "loss": 0.9361, "num_input_tokens_seen": 910717192, "step": 2004, "train_runtime": 143468.3168, "train_tokens_per_second": 6347.863 }, { "epoch": 2.427397156613286, "grad_norm": 0.24774321913719177, "learning_rate": 5e-06, "loss": 0.9516, "num_input_tokens_seen": 911158464, "step": 2005, "train_runtime": 143533.5062, "train_tokens_per_second": 6348.054 }, { "epoch": 2.4286079134308878, "grad_norm": 0.22622303664684296, "learning_rate": 5e-06, "loss": 0.8989, "num_input_tokens_seen": 911625768, "step": 2006, "train_runtime": 143602.5212, "train_tokens_per_second": 6348.257 }, { "epoch": 2.4298186702484887, "grad_norm": 0.23639139533042908, "learning_rate": 5e-06, "loss": 0.8516, "num_input_tokens_seen": 912074848, "step": 2007, "train_runtime": 143668.9004, "train_tokens_per_second": 6348.45 }, { "epoch": 2.43102942706609, "grad_norm": 0.22876761853694916, "learning_rate": 5e-06, "loss": 0.9411, "num_input_tokens_seen": 912544480, "step": 2008, "train_runtime": 143738.708, "train_tokens_per_second": 6348.634 }, { "epoch": 2.4322401838836916, "grad_norm": 0.22991536557674408, "learning_rate": 5e-06, "loss": 0.9092, "num_input_tokens_seen": 912999640, "step": 2009, "train_runtime": 143805.9728, "train_tokens_per_second": 6348.83 }, { "epoch": 2.433450940701293, "grad_norm": 0.2503306567668915, "learning_rate": 5e-06, "loss": 0.9616, "num_input_tokens_seen": 913462304, "step": 2010, "train_runtime": 143874.5271, "train_tokens_per_second": 6349.02 }, { "epoch": 2.4346616975188944, "grad_norm": 0.2324354350566864, "learning_rate": 5e-06, "loss": 0.9464, "num_input_tokens_seen": 913940856, "step": 2011, "train_runtime": 143945.0382, "train_tokens_per_second": 6349.235 }, { "epoch": 2.435872454336496, "grad_norm": 0.2581816017627716, "learning_rate": 5e-06, "loss": 0.9299, "num_input_tokens_seen": 914400040, "step": 2012, "train_runtime": 144012.6324, "train_tokens_per_second": 6349.443 }, { "epoch": 2.437083211154097, "grad_norm": 0.22747096419334412, "learning_rate": 5e-06, "loss": 0.8607, "num_input_tokens_seen": 914857280, "step": 2013, "train_runtime": 144079.8825, "train_tokens_per_second": 6349.653 }, { "epoch": 2.4382939679716986, "grad_norm": 0.2326267808675766, "learning_rate": 5e-06, "loss": 0.9106, "num_input_tokens_seen": 915326624, "step": 2014, "train_runtime": 144149.1853, "train_tokens_per_second": 6349.856 }, { "epoch": 2.4395047247893, "grad_norm": 0.23372125625610352, "learning_rate": 5e-06, "loss": 0.9309, "num_input_tokens_seen": 915785008, "step": 2015, "train_runtime": 144216.7032, "train_tokens_per_second": 6350.062 }, { "epoch": 2.4407154816069014, "grad_norm": 0.22626471519470215, "learning_rate": 5e-06, "loss": 0.9278, "num_input_tokens_seen": 916238880, "step": 2016, "train_runtime": 144283.6442, "train_tokens_per_second": 6350.262 }, { "epoch": 2.441926238424503, "grad_norm": 0.25401008129119873, "learning_rate": 5e-06, "loss": 0.9314, "num_input_tokens_seen": 916695648, "step": 2017, "train_runtime": 144350.7873, "train_tokens_per_second": 6350.472 }, { "epoch": 2.4431369952421043, "grad_norm": 0.2441287785768509, "learning_rate": 5e-06, "loss": 0.9458, "num_input_tokens_seen": 917130712, "step": 2018, "train_runtime": 144414.7762, "train_tokens_per_second": 6350.671 }, { "epoch": 2.4443477520597057, "grad_norm": 0.2315591424703598, "learning_rate": 5e-06, "loss": 0.915, "num_input_tokens_seen": 917573008, "step": 2019, "train_runtime": 144480.3945, "train_tokens_per_second": 6350.848 }, { "epoch": 2.4455585088773066, "grad_norm": 0.24778081476688385, "learning_rate": 5e-06, "loss": 0.9151, "num_input_tokens_seen": 918042304, "step": 2020, "train_runtime": 144550.1954, "train_tokens_per_second": 6351.028 }, { "epoch": 2.446769265694908, "grad_norm": 0.24255888164043427, "learning_rate": 5e-06, "loss": 0.9598, "num_input_tokens_seen": 918472152, "step": 2021, "train_runtime": 144613.6113, "train_tokens_per_second": 6351.215 }, { "epoch": 2.4479800225125095, "grad_norm": 0.2313011735677719, "learning_rate": 5e-06, "loss": 0.8695, "num_input_tokens_seen": 918918720, "step": 2022, "train_runtime": 144679.5413, "train_tokens_per_second": 6351.407 }, { "epoch": 2.449190779330111, "grad_norm": 0.26603221893310547, "learning_rate": 5e-06, "loss": 0.9775, "num_input_tokens_seen": 919349928, "step": 2023, "train_runtime": 144743.4514, "train_tokens_per_second": 6351.582 }, { "epoch": 2.4504015361477123, "grad_norm": 0.2424296885728836, "learning_rate": 5e-06, "loss": 0.965, "num_input_tokens_seen": 919786592, "step": 2024, "train_runtime": 144807.5707, "train_tokens_per_second": 6351.785 }, { "epoch": 2.4516122929653137, "grad_norm": 0.24074110388755798, "learning_rate": 5e-06, "loss": 0.9531, "num_input_tokens_seen": 920241992, "step": 2025, "train_runtime": 144875.0746, "train_tokens_per_second": 6351.969 }, { "epoch": 2.452823049782915, "grad_norm": 0.2590942084789276, "learning_rate": 5e-06, "loss": 0.9863, "num_input_tokens_seen": 920659648, "step": 2026, "train_runtime": 144937.2065, "train_tokens_per_second": 6352.128 }, { "epoch": 2.4540338066005165, "grad_norm": 0.24782414734363556, "learning_rate": 5e-06, "loss": 0.9386, "num_input_tokens_seen": 921097432, "step": 2027, "train_runtime": 145001.8229, "train_tokens_per_second": 6352.316 }, { "epoch": 2.455244563418118, "grad_norm": 0.26278966665267944, "learning_rate": 5e-06, "loss": 0.9022, "num_input_tokens_seen": 921551424, "step": 2028, "train_runtime": 145068.6754, "train_tokens_per_second": 6352.518 }, { "epoch": 2.4564553202357193, "grad_norm": 0.2415189892053604, "learning_rate": 5e-06, "loss": 0.8962, "num_input_tokens_seen": 921997752, "step": 2029, "train_runtime": 145134.5917, "train_tokens_per_second": 6352.708 }, { "epoch": 2.4576660770533207, "grad_norm": 0.23638790845870972, "learning_rate": 5e-06, "loss": 0.8967, "num_input_tokens_seen": 922462192, "step": 2030, "train_runtime": 145203.6043, "train_tokens_per_second": 6352.888 }, { "epoch": 2.458876833870922, "grad_norm": 0.27461180090904236, "learning_rate": 5e-06, "loss": 0.9226, "num_input_tokens_seen": 922906560, "step": 2031, "train_runtime": 145269.2551, "train_tokens_per_second": 6353.076 }, { "epoch": 2.4600875906885236, "grad_norm": 0.24975448846817017, "learning_rate": 5e-06, "loss": 0.9153, "num_input_tokens_seen": 923371312, "step": 2032, "train_runtime": 145338.224, "train_tokens_per_second": 6353.259 }, { "epoch": 2.4612983475061245, "grad_norm": 0.23750756680965424, "learning_rate": 5e-06, "loss": 0.8991, "num_input_tokens_seen": 923838344, "step": 2033, "train_runtime": 145407.6286, "train_tokens_per_second": 6353.438 }, { "epoch": 2.462509104323726, "grad_norm": 0.2277829796075821, "learning_rate": 5e-06, "loss": 0.884, "num_input_tokens_seen": 924294392, "step": 2034, "train_runtime": 145475.5205, "train_tokens_per_second": 6353.608 }, { "epoch": 2.4637198611413273, "grad_norm": 0.2631170451641083, "learning_rate": 5e-06, "loss": 0.9468, "num_input_tokens_seen": 924720120, "step": 2035, "train_runtime": 145538.1668, "train_tokens_per_second": 6353.798 }, { "epoch": 2.4649306179589288, "grad_norm": 0.24038782715797424, "learning_rate": 5e-06, "loss": 0.8965, "num_input_tokens_seen": 925176312, "step": 2036, "train_runtime": 145605.7781, "train_tokens_per_second": 6353.981 }, { "epoch": 2.46614137477653, "grad_norm": 0.2566758692264557, "learning_rate": 5e-06, "loss": 0.9498, "num_input_tokens_seen": 925601920, "step": 2037, "train_runtime": 145668.7144, "train_tokens_per_second": 6354.157 }, { "epoch": 2.4673521315941316, "grad_norm": 0.24463194608688354, "learning_rate": 5e-06, "loss": 0.9263, "num_input_tokens_seen": 926052952, "step": 2038, "train_runtime": 145735.808, "train_tokens_per_second": 6354.327 }, { "epoch": 2.468562888411733, "grad_norm": 0.2460647076368332, "learning_rate": 5e-06, "loss": 0.8975, "num_input_tokens_seen": 926514368, "step": 2039, "train_runtime": 145804.5927, "train_tokens_per_second": 6354.494 }, { "epoch": 2.4697736452293344, "grad_norm": 0.2725125253200531, "learning_rate": 5e-06, "loss": 0.9171, "num_input_tokens_seen": 926978616, "step": 2040, "train_runtime": 145872.9181, "train_tokens_per_second": 6354.7 }, { "epoch": 2.470984402046936, "grad_norm": 0.245590940117836, "learning_rate": 5e-06, "loss": 0.9422, "num_input_tokens_seen": 927427896, "step": 2041, "train_runtime": 145940.2315, "train_tokens_per_second": 6354.847 }, { "epoch": 2.4721951588645372, "grad_norm": 0.22320939600467682, "learning_rate": 5e-06, "loss": 0.8796, "num_input_tokens_seen": 927878992, "step": 2042, "train_runtime": 146008.6109, "train_tokens_per_second": 6354.961 }, { "epoch": 2.4734059156821386, "grad_norm": 0.23028956353664398, "learning_rate": 5e-06, "loss": 0.8894, "num_input_tokens_seen": 928332296, "step": 2043, "train_runtime": 146075.3686, "train_tokens_per_second": 6355.16 }, { "epoch": 2.47461667249974, "grad_norm": 0.2439645677804947, "learning_rate": 5e-06, "loss": 0.9377, "num_input_tokens_seen": 928771176, "step": 2044, "train_runtime": 146140.3819, "train_tokens_per_second": 6355.336 }, { "epoch": 2.4758274293173415, "grad_norm": 0.26759254932403564, "learning_rate": 5e-06, "loss": 0.9297, "num_input_tokens_seen": 929245128, "step": 2045, "train_runtime": 146210.4408, "train_tokens_per_second": 6355.532 }, { "epoch": 2.4770381861349424, "grad_norm": 0.26013848185539246, "learning_rate": 5e-06, "loss": 0.8866, "num_input_tokens_seen": 929700048, "step": 2046, "train_runtime": 146277.6773, "train_tokens_per_second": 6355.721 }, { "epoch": 2.478248942952544, "grad_norm": 0.2415570616722107, "learning_rate": 5e-06, "loss": 0.9105, "num_input_tokens_seen": 930155200, "step": 2047, "train_runtime": 146345.2032, "train_tokens_per_second": 6355.898 }, { "epoch": 2.4794596997701452, "grad_norm": 0.23563383519649506, "learning_rate": 5e-06, "loss": 0.924, "num_input_tokens_seen": 930593920, "step": 2048, "train_runtime": 146410.2731, "train_tokens_per_second": 6356.07 }, { "epoch": 2.4806704565877467, "grad_norm": 0.2751082479953766, "learning_rate": 5e-06, "loss": 0.9136, "num_input_tokens_seen": 931067912, "step": 2049, "train_runtime": 146480.5438, "train_tokens_per_second": 6356.257 }, { "epoch": 2.481881213405348, "grad_norm": 0.24614796042442322, "learning_rate": 5e-06, "loss": 0.9161, "num_input_tokens_seen": 931535648, "step": 2050, "train_runtime": 146549.9494, "train_tokens_per_second": 6356.438 }, { "epoch": 2.4830919702229495, "grad_norm": 0.2446848303079605, "learning_rate": 5e-06, "loss": 0.9223, "num_input_tokens_seen": 931996664, "step": 2051, "train_runtime": 146617.8963, "train_tokens_per_second": 6356.636 }, { "epoch": 2.484302727040551, "grad_norm": 0.24272581934928894, "learning_rate": 5e-06, "loss": 0.9565, "num_input_tokens_seen": 932455392, "step": 2052, "train_runtime": 146685.9301, "train_tokens_per_second": 6356.815 }, { "epoch": 2.4855134838581523, "grad_norm": 0.24807094037532806, "learning_rate": 5e-06, "loss": 0.9369, "num_input_tokens_seen": 932892240, "step": 2053, "train_runtime": 146750.5502, "train_tokens_per_second": 6356.993 }, { "epoch": 2.4867242406757537, "grad_norm": 0.24745595455169678, "learning_rate": 5e-06, "loss": 0.9749, "num_input_tokens_seen": 933347928, "step": 2054, "train_runtime": 146817.8809, "train_tokens_per_second": 6357.182 }, { "epoch": 2.487934997493355, "grad_norm": 0.2522644102573395, "learning_rate": 5e-06, "loss": 0.9338, "num_input_tokens_seen": 933807000, "step": 2055, "train_runtime": 146885.7573, "train_tokens_per_second": 6357.369 }, { "epoch": 2.4891457543109565, "grad_norm": 0.24964258074760437, "learning_rate": 5e-06, "loss": 0.916, "num_input_tokens_seen": 934241376, "step": 2056, "train_runtime": 146949.703, "train_tokens_per_second": 6357.559 }, { "epoch": 2.490356511128558, "grad_norm": 0.24483104050159454, "learning_rate": 5e-06, "loss": 0.9004, "num_input_tokens_seen": 934681960, "step": 2057, "train_runtime": 147014.7916, "train_tokens_per_second": 6357.741 }, { "epoch": 2.4915672679461593, "grad_norm": 0.23873838782310486, "learning_rate": 5e-06, "loss": 0.9291, "num_input_tokens_seen": 935154456, "step": 2058, "train_runtime": 147084.9039, "train_tokens_per_second": 6357.923 }, { "epoch": 2.4927780247637603, "grad_norm": 0.23718827962875366, "learning_rate": 5e-06, "loss": 0.8689, "num_input_tokens_seen": 935605960, "step": 2059, "train_runtime": 147152.0711, "train_tokens_per_second": 6358.089 }, { "epoch": 2.4939887815813617, "grad_norm": 0.2440134733915329, "learning_rate": 5e-06, "loss": 0.9532, "num_input_tokens_seen": 936070728, "step": 2060, "train_runtime": 147221.0941, "train_tokens_per_second": 6358.265 }, { "epoch": 2.495199538398963, "grad_norm": 0.24385643005371094, "learning_rate": 5e-06, "loss": 0.8583, "num_input_tokens_seen": 936518568, "step": 2061, "train_runtime": 147287.537, "train_tokens_per_second": 6358.437 }, { "epoch": 2.4964102952165645, "grad_norm": 0.23779569566249847, "learning_rate": 5e-06, "loss": 0.9048, "num_input_tokens_seen": 936975120, "step": 2062, "train_runtime": 147355.2483, "train_tokens_per_second": 6358.614 }, { "epoch": 2.497621052034166, "grad_norm": 0.25262778997421265, "learning_rate": 5e-06, "loss": 0.9233, "num_input_tokens_seen": 937426104, "step": 2063, "train_runtime": 147422.2915, "train_tokens_per_second": 6358.781 }, { "epoch": 2.4988318088517674, "grad_norm": 0.24266116321086884, "learning_rate": 5e-06, "loss": 0.9427, "num_input_tokens_seen": 937903712, "step": 2064, "train_runtime": 147492.5493, "train_tokens_per_second": 6358.99 }, { "epoch": 2.5000425656693688, "grad_norm": 0.24687156081199646, "learning_rate": 5e-06, "loss": 0.9445, "num_input_tokens_seen": 938338192, "step": 2065, "train_runtime": 147556.7522, "train_tokens_per_second": 6359.168 }, { "epoch": 2.50125332248697, "grad_norm": 0.2338828295469284, "learning_rate": 5e-06, "loss": 0.9168, "num_input_tokens_seen": 938795784, "step": 2066, "train_runtime": 147625.0006, "train_tokens_per_second": 6359.328 }, { "epoch": 2.5024640793045716, "grad_norm": 0.22915604710578918, "learning_rate": 5e-06, "loss": 0.8868, "num_input_tokens_seen": 939266392, "step": 2067, "train_runtime": 147694.5446, "train_tokens_per_second": 6359.52 }, { "epoch": 2.503674836122173, "grad_norm": 0.23226921260356903, "learning_rate": 5e-06, "loss": 0.9124, "num_input_tokens_seen": 939729888, "step": 2068, "train_runtime": 147763.2806, "train_tokens_per_second": 6359.698 }, { "epoch": 2.5048855929397744, "grad_norm": 0.24283023178577423, "learning_rate": 5e-06, "loss": 0.9313, "num_input_tokens_seen": 940204240, "step": 2069, "train_runtime": 147833.4404, "train_tokens_per_second": 6359.889 }, { "epoch": 2.506096349757376, "grad_norm": 0.24000869691371918, "learning_rate": 5e-06, "loss": 0.9282, "num_input_tokens_seen": 940655272, "step": 2070, "train_runtime": 147899.9283, "train_tokens_per_second": 6360.079 }, { "epoch": 2.5073071065749772, "grad_norm": 0.2344265878200531, "learning_rate": 5e-06, "loss": 0.9358, "num_input_tokens_seen": 941102232, "step": 2071, "train_runtime": 147966.0637, "train_tokens_per_second": 6360.257 }, { "epoch": 2.508517863392578, "grad_norm": 0.22600269317626953, "learning_rate": 5e-06, "loss": 0.8906, "num_input_tokens_seen": 941580024, "step": 2072, "train_runtime": 148036.6621, "train_tokens_per_second": 6360.452 }, { "epoch": 2.50972862021018, "grad_norm": 0.23776832222938538, "learning_rate": 5e-06, "loss": 0.9121, "num_input_tokens_seen": 942039728, "step": 2073, "train_runtime": 148104.8937, "train_tokens_per_second": 6360.625 }, { "epoch": 2.510939377027781, "grad_norm": 0.22566133737564087, "learning_rate": 5e-06, "loss": 0.9624, "num_input_tokens_seen": 942489560, "step": 2074, "train_runtime": 148171.3227, "train_tokens_per_second": 6360.81 }, { "epoch": 2.5121501338453824, "grad_norm": 0.24746361374855042, "learning_rate": 5e-06, "loss": 0.8688, "num_input_tokens_seen": 942940600, "step": 2075, "train_runtime": 148237.979, "train_tokens_per_second": 6360.992 }, { "epoch": 2.513360890662984, "grad_norm": 0.2346951961517334, "learning_rate": 5e-06, "loss": 0.9307, "num_input_tokens_seen": 943417360, "step": 2076, "train_runtime": 148308.1213, "train_tokens_per_second": 6361.198 }, { "epoch": 2.5145716474805853, "grad_norm": 0.2295297533273697, "learning_rate": 5e-06, "loss": 0.958, "num_input_tokens_seen": 943867744, "step": 2077, "train_runtime": 148374.8559, "train_tokens_per_second": 6361.373 }, { "epoch": 2.5157824042981867, "grad_norm": 0.2399854212999344, "learning_rate": 5e-06, "loss": 0.8859, "num_input_tokens_seen": 944347752, "step": 2078, "train_runtime": 148445.8736, "train_tokens_per_second": 6361.563 }, { "epoch": 2.516993161115788, "grad_norm": 0.25821027159690857, "learning_rate": 5e-06, "loss": 0.9398, "num_input_tokens_seen": 944819984, "step": 2079, "train_runtime": 148515.6507, "train_tokens_per_second": 6361.754 }, { "epoch": 2.5182039179333895, "grad_norm": 0.24560266733169556, "learning_rate": 5e-06, "loss": 0.9439, "num_input_tokens_seen": 945260464, "step": 2080, "train_runtime": 148580.7954, "train_tokens_per_second": 6361.929 }, { "epoch": 2.519414674750991, "grad_norm": 0.23482368886470795, "learning_rate": 5e-06, "loss": 0.9254, "num_input_tokens_seen": 945718168, "step": 2081, "train_runtime": 148648.5841, "train_tokens_per_second": 6362.107 }, { "epoch": 2.5206254315685923, "grad_norm": 0.24930696189403534, "learning_rate": 5e-06, "loss": 0.9276, "num_input_tokens_seen": 946169448, "step": 2082, "train_runtime": 148715.3849, "train_tokens_per_second": 6362.284 }, { "epoch": 2.5218361883861937, "grad_norm": 0.27057182788848877, "learning_rate": 5e-06, "loss": 0.9687, "num_input_tokens_seen": 946606096, "step": 2083, "train_runtime": 148779.659, "train_tokens_per_second": 6362.47 }, { "epoch": 2.523046945203795, "grad_norm": 0.2677522301673889, "learning_rate": 5e-06, "loss": 0.955, "num_input_tokens_seen": 947064336, "step": 2084, "train_runtime": 148847.4866, "train_tokens_per_second": 6362.649 }, { "epoch": 2.524257702021396, "grad_norm": 0.24084171652793884, "learning_rate": 5e-06, "loss": 0.9752, "num_input_tokens_seen": 947511856, "step": 2085, "train_runtime": 148913.4171, "train_tokens_per_second": 6362.837 }, { "epoch": 2.525468458838998, "grad_norm": 0.2556767165660858, "learning_rate": 5e-06, "loss": 0.9244, "num_input_tokens_seen": 947972304, "step": 2086, "train_runtime": 148981.7706, "train_tokens_per_second": 6363.009 }, { "epoch": 2.526679215656599, "grad_norm": 0.25337082147598267, "learning_rate": 5e-06, "loss": 0.9532, "num_input_tokens_seen": 948416456, "step": 2087, "train_runtime": 149047.4661, "train_tokens_per_second": 6363.184 }, { "epoch": 2.5278899724742003, "grad_norm": 0.23191265761852264, "learning_rate": 5e-06, "loss": 0.9283, "num_input_tokens_seen": 948876472, "step": 2088, "train_runtime": 149115.6529, "train_tokens_per_second": 6363.359 }, { "epoch": 2.5291007292918017, "grad_norm": 0.24382182955741882, "learning_rate": 5e-06, "loss": 0.9737, "num_input_tokens_seen": 949310000, "step": 2089, "train_runtime": 149179.5981, "train_tokens_per_second": 6363.538 }, { "epoch": 2.530311486109403, "grad_norm": 0.25065913796424866, "learning_rate": 5e-06, "loss": 0.9474, "num_input_tokens_seen": 949744928, "step": 2090, "train_runtime": 149243.5236, "train_tokens_per_second": 6363.726 }, { "epoch": 2.5315222429270046, "grad_norm": 0.24396991729736328, "learning_rate": 5e-06, "loss": 0.895, "num_input_tokens_seen": 950192536, "step": 2091, "train_runtime": 149309.8845, "train_tokens_per_second": 6363.896 }, { "epoch": 2.532732999744606, "grad_norm": 0.2316816747188568, "learning_rate": 5e-06, "loss": 0.9147, "num_input_tokens_seen": 950658280, "step": 2092, "train_runtime": 149379.1212, "train_tokens_per_second": 6364.064 }, { "epoch": 2.5339437565622074, "grad_norm": 0.2816956043243408, "learning_rate": 5e-06, "loss": 0.9337, "num_input_tokens_seen": 951108960, "step": 2093, "train_runtime": 149445.9212, "train_tokens_per_second": 6364.235 }, { "epoch": 2.535154513379809, "grad_norm": 0.24696174263954163, "learning_rate": 5e-06, "loss": 0.9976, "num_input_tokens_seen": 951526808, "step": 2094, "train_runtime": 149507.8536, "train_tokens_per_second": 6364.393 }, { "epoch": 2.53636527019741, "grad_norm": 0.23096802830696106, "learning_rate": 5e-06, "loss": 0.9126, "num_input_tokens_seen": 951990024, "step": 2095, "train_runtime": 149578.1316, "train_tokens_per_second": 6364.5 }, { "epoch": 2.5375760270150116, "grad_norm": 0.2423764020204544, "learning_rate": 5e-06, "loss": 0.9433, "num_input_tokens_seen": 952442632, "step": 2096, "train_runtime": 149645.4898, "train_tokens_per_second": 6364.66 }, { "epoch": 2.538786783832613, "grad_norm": 0.2315463125705719, "learning_rate": 5e-06, "loss": 0.9252, "num_input_tokens_seen": 952889608, "step": 2097, "train_runtime": 149711.4793, "train_tokens_per_second": 6364.84 }, { "epoch": 2.539997540650214, "grad_norm": 0.23303987085819244, "learning_rate": 5e-06, "loss": 0.9057, "num_input_tokens_seen": 953335880, "step": 2098, "train_runtime": 149777.8167, "train_tokens_per_second": 6365.001 }, { "epoch": 2.541208297467816, "grad_norm": 0.22208106517791748, "learning_rate": 5e-06, "loss": 0.8773, "num_input_tokens_seen": 953792496, "step": 2099, "train_runtime": 149845.1928, "train_tokens_per_second": 6365.186 }, { "epoch": 2.542419054285417, "grad_norm": 0.27171242237091064, "learning_rate": 5e-06, "loss": 0.9345, "num_input_tokens_seen": 954254432, "step": 2100, "train_runtime": 149913.5984, "train_tokens_per_second": 6365.363 }, { "epoch": 2.5436298111030182, "grad_norm": 0.24096918106079102, "learning_rate": 5e-06, "loss": 0.9452, "num_input_tokens_seen": 954702408, "step": 2101, "train_runtime": 149979.536, "train_tokens_per_second": 6365.551 }, { "epoch": 2.5448405679206196, "grad_norm": 0.22997787594795227, "learning_rate": 5e-06, "loss": 0.882, "num_input_tokens_seen": 955166928, "step": 2102, "train_runtime": 150048.5149, "train_tokens_per_second": 6365.721 }, { "epoch": 2.546051324738221, "grad_norm": 0.23458710312843323, "learning_rate": 5e-06, "loss": 0.8823, "num_input_tokens_seen": 955611512, "step": 2103, "train_runtime": 150113.7698, "train_tokens_per_second": 6365.915 }, { "epoch": 2.5472620815558225, "grad_norm": 0.2433023899793625, "learning_rate": 5e-06, "loss": 0.959, "num_input_tokens_seen": 956040800, "step": 2104, "train_runtime": 150176.961, "train_tokens_per_second": 6366.095 }, { "epoch": 2.548472838373424, "grad_norm": 0.24917687475681305, "learning_rate": 5e-06, "loss": 0.9486, "num_input_tokens_seen": 956490904, "step": 2105, "train_runtime": 150243.2865, "train_tokens_per_second": 6366.28 }, { "epoch": 2.5496835951910253, "grad_norm": 0.24541530013084412, "learning_rate": 5e-06, "loss": 0.9464, "num_input_tokens_seen": 956941400, "step": 2106, "train_runtime": 150309.8385, "train_tokens_per_second": 6366.459 }, { "epoch": 2.5508943520086267, "grad_norm": 0.23959662020206451, "learning_rate": 5e-06, "loss": 0.884, "num_input_tokens_seen": 957382960, "step": 2107, "train_runtime": 150375.1278, "train_tokens_per_second": 6366.631 }, { "epoch": 2.552105108826228, "grad_norm": 0.24745765328407288, "learning_rate": 5e-06, "loss": 0.9583, "num_input_tokens_seen": 957828816, "step": 2108, "train_runtime": 150441.1512, "train_tokens_per_second": 6366.801 }, { "epoch": 2.5533158656438295, "grad_norm": 0.3009890019893646, "learning_rate": 5e-06, "loss": 0.9098, "num_input_tokens_seen": 958242640, "step": 2109, "train_runtime": 150502.4514, "train_tokens_per_second": 6366.957 }, { "epoch": 2.554526622461431, "grad_norm": 0.23632347583770752, "learning_rate": 5e-06, "loss": 0.9395, "num_input_tokens_seen": 958689592, "step": 2110, "train_runtime": 150568.8159, "train_tokens_per_second": 6367.119 }, { "epoch": 2.555737379279032, "grad_norm": 0.2490553855895996, "learning_rate": 5e-06, "loss": 0.8992, "num_input_tokens_seen": 959154704, "step": 2111, "train_runtime": 150637.4185, "train_tokens_per_second": 6367.307 }, { "epoch": 2.5569481360966337, "grad_norm": 0.2370673418045044, "learning_rate": 5e-06, "loss": 0.9111, "num_input_tokens_seen": 959608064, "step": 2112, "train_runtime": 150704.8437, "train_tokens_per_second": 6367.467 }, { "epoch": 2.5581588929142347, "grad_norm": 0.24848392605781555, "learning_rate": 5e-06, "loss": 0.9228, "num_input_tokens_seen": 960056176, "step": 2113, "train_runtime": 150771.1853, "train_tokens_per_second": 6367.637 }, { "epoch": 2.5593696497318366, "grad_norm": 0.22288735210895538, "learning_rate": 5e-06, "loss": 0.876, "num_input_tokens_seen": 960532728, "step": 2114, "train_runtime": 150842.3276, "train_tokens_per_second": 6367.793 }, { "epoch": 2.5605804065494375, "grad_norm": 0.2513042986392975, "learning_rate": 5e-06, "loss": 1.0092, "num_input_tokens_seen": 960985864, "step": 2115, "train_runtime": 150909.5334, "train_tokens_per_second": 6367.96 }, { "epoch": 2.561791163367039, "grad_norm": 0.28590673208236694, "learning_rate": 5e-06, "loss": 0.9171, "num_input_tokens_seen": 961433280, "step": 2116, "train_runtime": 150975.5143, "train_tokens_per_second": 6368.14 }, { "epoch": 2.5630019201846403, "grad_norm": 0.26200953125953674, "learning_rate": 5e-06, "loss": 0.9099, "num_input_tokens_seen": 961888288, "step": 2117, "train_runtime": 151042.961, "train_tokens_per_second": 6368.309 }, { "epoch": 2.5642126770022418, "grad_norm": 0.24091939628124237, "learning_rate": 5e-06, "loss": 0.905, "num_input_tokens_seen": 962325576, "step": 2118, "train_runtime": 151107.3832, "train_tokens_per_second": 6368.488 }, { "epoch": 2.565423433819843, "grad_norm": 0.305169016122818, "learning_rate": 5e-06, "loss": 0.9368, "num_input_tokens_seen": 962790480, "step": 2119, "train_runtime": 151176.0572, "train_tokens_per_second": 6368.67 }, { "epoch": 2.5666341906374446, "grad_norm": 0.23745113611221313, "learning_rate": 5e-06, "loss": 0.912, "num_input_tokens_seen": 963234680, "step": 2120, "train_runtime": 151241.8959, "train_tokens_per_second": 6368.835 }, { "epoch": 2.567844947455046, "grad_norm": 0.26773974299430847, "learning_rate": 5e-06, "loss": 0.9186, "num_input_tokens_seen": 963693880, "step": 2121, "train_runtime": 151309.6612, "train_tokens_per_second": 6369.017 }, { "epoch": 2.5690557042726474, "grad_norm": 0.23392970860004425, "learning_rate": 5e-06, "loss": 0.9131, "num_input_tokens_seen": 964156400, "step": 2122, "train_runtime": 151377.9456, "train_tokens_per_second": 6369.2 }, { "epoch": 2.570266461090249, "grad_norm": 0.24104426801204681, "learning_rate": 5e-06, "loss": 0.9098, "num_input_tokens_seen": 964603680, "step": 2123, "train_runtime": 151443.8686, "train_tokens_per_second": 6369.381 }, { "epoch": 2.57147721790785, "grad_norm": 0.27819520235061646, "learning_rate": 5e-06, "loss": 0.9433, "num_input_tokens_seen": 965051904, "step": 2124, "train_runtime": 151510.0321, "train_tokens_per_second": 6369.558 }, { "epoch": 2.5726879747254516, "grad_norm": 0.23141422867774963, "learning_rate": 5e-06, "loss": 0.9482, "num_input_tokens_seen": 965491424, "step": 2125, "train_runtime": 151574.8146, "train_tokens_per_second": 6369.735 }, { "epoch": 2.5738987315430526, "grad_norm": 0.2362852543592453, "learning_rate": 5e-06, "loss": 0.8856, "num_input_tokens_seen": 965942960, "step": 2126, "train_runtime": 151641.7588, "train_tokens_per_second": 6369.901 }, { "epoch": 2.5751094883606545, "grad_norm": 0.2561604380607605, "learning_rate": 5e-06, "loss": 0.931, "num_input_tokens_seen": 966402048, "step": 2127, "train_runtime": 151710.0948, "train_tokens_per_second": 6370.058 }, { "epoch": 2.5763202451782554, "grad_norm": 0.2245933711528778, "learning_rate": 5e-06, "loss": 0.8996, "num_input_tokens_seen": 966854136, "step": 2128, "train_runtime": 151777.079, "train_tokens_per_second": 6370.225 }, { "epoch": 2.577531001995857, "grad_norm": 0.2425384670495987, "learning_rate": 5e-06, "loss": 0.9431, "num_input_tokens_seen": 967327272, "step": 2129, "train_runtime": 151847.5021, "train_tokens_per_second": 6370.386 }, { "epoch": 2.5787417588134582, "grad_norm": 0.2366553246974945, "learning_rate": 5e-06, "loss": 0.9586, "num_input_tokens_seen": 967784400, "step": 2130, "train_runtime": 151915.0318, "train_tokens_per_second": 6370.564 }, { "epoch": 2.5799525156310597, "grad_norm": 0.24099212884902954, "learning_rate": 5e-06, "loss": 0.9046, "num_input_tokens_seen": 968237464, "step": 2131, "train_runtime": 151982.077, "train_tokens_per_second": 6370.735 }, { "epoch": 2.581163272448661, "grad_norm": 0.25684481859207153, "learning_rate": 5e-06, "loss": 0.9971, "num_input_tokens_seen": 968699672, "step": 2132, "train_runtime": 152050.2017, "train_tokens_per_second": 6370.92 }, { "epoch": 2.5823740292662625, "grad_norm": 0.23455548286437988, "learning_rate": 5e-06, "loss": 0.9301, "num_input_tokens_seen": 969139440, "step": 2133, "train_runtime": 152115.1846, "train_tokens_per_second": 6371.089 }, { "epoch": 2.583584786083864, "grad_norm": 0.2384224683046341, "learning_rate": 5e-06, "loss": 0.9473, "num_input_tokens_seen": 969581296, "step": 2134, "train_runtime": 152180.5835, "train_tokens_per_second": 6371.255 }, { "epoch": 2.5847955429014653, "grad_norm": 0.2409534454345703, "learning_rate": 5e-06, "loss": 0.88, "num_input_tokens_seen": 970046112, "step": 2135, "train_runtime": 152248.9143, "train_tokens_per_second": 6371.448 }, { "epoch": 2.5860062997190667, "grad_norm": 0.25234049558639526, "learning_rate": 5e-06, "loss": 0.952, "num_input_tokens_seen": 970487920, "step": 2136, "train_runtime": 152314.0961, "train_tokens_per_second": 6371.622 }, { "epoch": 2.587217056536668, "grad_norm": 0.22701537609100342, "learning_rate": 5e-06, "loss": 0.8846, "num_input_tokens_seen": 970944456, "step": 2137, "train_runtime": 152381.1973, "train_tokens_per_second": 6371.813 }, { "epoch": 2.5884278133542695, "grad_norm": 0.22347159683704376, "learning_rate": 5e-06, "loss": 0.896, "num_input_tokens_seen": 971420496, "step": 2138, "train_runtime": 152452.0329, "train_tokens_per_second": 6371.975 }, { "epoch": 2.5896385701718705, "grad_norm": 0.23158830404281616, "learning_rate": 5e-06, "loss": 0.9145, "num_input_tokens_seen": 971864936, "step": 2139, "train_runtime": 152517.7639, "train_tokens_per_second": 6372.143 }, { "epoch": 2.5908493269894723, "grad_norm": 0.24082769453525543, "learning_rate": 5e-06, "loss": 0.9517, "num_input_tokens_seen": 972316368, "step": 2140, "train_runtime": 152584.4859, "train_tokens_per_second": 6372.315 }, { "epoch": 2.5920600838070733, "grad_norm": 0.2324010580778122, "learning_rate": 5e-06, "loss": 0.9356, "num_input_tokens_seen": 972751424, "step": 2141, "train_runtime": 152648.8383, "train_tokens_per_second": 6372.478 }, { "epoch": 2.5932708406246747, "grad_norm": 0.24019919335842133, "learning_rate": 5e-06, "loss": 0.9627, "num_input_tokens_seen": 973202808, "step": 2142, "train_runtime": 152715.4644, "train_tokens_per_second": 6372.654 }, { "epoch": 2.594481597442276, "grad_norm": 0.23422782123088837, "learning_rate": 5e-06, "loss": 0.8628, "num_input_tokens_seen": 973689168, "step": 2143, "train_runtime": 152787.9495, "train_tokens_per_second": 6372.814 }, { "epoch": 2.5956923542598775, "grad_norm": 0.2257990539073944, "learning_rate": 5e-06, "loss": 0.8893, "num_input_tokens_seen": 974141104, "step": 2144, "train_runtime": 152854.4834, "train_tokens_per_second": 6372.997 }, { "epoch": 2.596903111077479, "grad_norm": 0.2405652552843094, "learning_rate": 5e-06, "loss": 0.9073, "num_input_tokens_seen": 974607168, "step": 2145, "train_runtime": 152923.3103, "train_tokens_per_second": 6373.176 }, { "epoch": 2.5981138678950804, "grad_norm": 0.25007355213165283, "learning_rate": 5e-06, "loss": 0.9286, "num_input_tokens_seen": 975054616, "step": 2146, "train_runtime": 152989.2769, "train_tokens_per_second": 6373.353 }, { "epoch": 2.5993246247126818, "grad_norm": 0.2606528103351593, "learning_rate": 5e-06, "loss": 0.9382, "num_input_tokens_seen": 975497336, "step": 2147, "train_runtime": 153054.2115, "train_tokens_per_second": 6373.541 }, { "epoch": 2.600535381530283, "grad_norm": 0.22446538507938385, "learning_rate": 5e-06, "loss": 0.8875, "num_input_tokens_seen": 975941776, "step": 2148, "train_runtime": 153120.0609, "train_tokens_per_second": 6373.703 }, { "epoch": 2.6017461383478846, "grad_norm": 0.2446034997701645, "learning_rate": 5e-06, "loss": 0.9442, "num_input_tokens_seen": 976394880, "step": 2149, "train_runtime": 153188.6693, "train_tokens_per_second": 6373.806 }, { "epoch": 2.602956895165486, "grad_norm": 0.22049270570278168, "learning_rate": 5e-06, "loss": 0.8881, "num_input_tokens_seen": 976866832, "step": 2150, "train_runtime": 153258.796, "train_tokens_per_second": 6373.969 }, { "epoch": 2.6041676519830874, "grad_norm": 0.22306476533412933, "learning_rate": 5e-06, "loss": 0.8982, "num_input_tokens_seen": 977324560, "step": 2151, "train_runtime": 153326.3775, "train_tokens_per_second": 6374.145 }, { "epoch": 2.6053784088006884, "grad_norm": 0.25735023617744446, "learning_rate": 5e-06, "loss": 0.9181, "num_input_tokens_seen": 977761376, "step": 2152, "train_runtime": 153390.9315, "train_tokens_per_second": 6374.31 }, { "epoch": 2.6065891656182902, "grad_norm": 0.23419278860092163, "learning_rate": 5e-06, "loss": 0.9264, "num_input_tokens_seen": 978205792, "step": 2153, "train_runtime": 153457.0095, "train_tokens_per_second": 6374.461 }, { "epoch": 2.607799922435891, "grad_norm": 0.23010873794555664, "learning_rate": 5e-06, "loss": 0.9316, "num_input_tokens_seen": 978649992, "step": 2154, "train_runtime": 153522.7683, "train_tokens_per_second": 6374.624 }, { "epoch": 2.6090106792534926, "grad_norm": 0.2428400218486786, "learning_rate": 5e-06, "loss": 0.9585, "num_input_tokens_seen": 979073824, "step": 2155, "train_runtime": 153585.4951, "train_tokens_per_second": 6374.781 }, { "epoch": 2.610221436071094, "grad_norm": 0.24073754251003265, "learning_rate": 5e-06, "loss": 0.9351, "num_input_tokens_seen": 979532472, "step": 2156, "train_runtime": 153653.4719, "train_tokens_per_second": 6374.945 }, { "epoch": 2.6114321928886954, "grad_norm": 0.24380506575107574, "learning_rate": 5e-06, "loss": 0.8991, "num_input_tokens_seen": 979999864, "step": 2157, "train_runtime": 153722.7985, "train_tokens_per_second": 6375.111 }, { "epoch": 2.612642949706297, "grad_norm": 0.23714521527290344, "learning_rate": 5e-06, "loss": 0.9145, "num_input_tokens_seen": 980473120, "step": 2158, "train_runtime": 153793.1517, "train_tokens_per_second": 6375.272 }, { "epoch": 2.6138537065238983, "grad_norm": 0.2587903141975403, "learning_rate": 5e-06, "loss": 0.919, "num_input_tokens_seen": 980913512, "step": 2159, "train_runtime": 153858.4779, "train_tokens_per_second": 6375.427 }, { "epoch": 2.6150644633414997, "grad_norm": 0.2620103359222412, "learning_rate": 5e-06, "loss": 0.9178, "num_input_tokens_seen": 981375896, "step": 2160, "train_runtime": 153927.0515, "train_tokens_per_second": 6375.591 }, { "epoch": 2.616275220159101, "grad_norm": 0.24088148772716522, "learning_rate": 5e-06, "loss": 0.9379, "num_input_tokens_seen": 981832360, "step": 2161, "train_runtime": 153994.5461, "train_tokens_per_second": 6375.761 }, { "epoch": 2.6174859769767025, "grad_norm": 0.22991852462291718, "learning_rate": 5e-06, "loss": 0.9617, "num_input_tokens_seen": 982288048, "step": 2162, "train_runtime": 154062.1977, "train_tokens_per_second": 6375.919 }, { "epoch": 2.618696733794304, "grad_norm": 0.24822624027729034, "learning_rate": 5e-06, "loss": 0.9686, "num_input_tokens_seen": 982743976, "step": 2163, "train_runtime": 154129.7489, "train_tokens_per_second": 6376.082 }, { "epoch": 2.6199074906119053, "grad_norm": 0.25701308250427246, "learning_rate": 5e-06, "loss": 0.9104, "num_input_tokens_seen": 983187152, "step": 2164, "train_runtime": 154195.5626, "train_tokens_per_second": 6376.235 }, { "epoch": 2.6211182474295063, "grad_norm": 0.2755506932735443, "learning_rate": 5e-06, "loss": 0.9725, "num_input_tokens_seen": 983623800, "step": 2165, "train_runtime": 154260.2571, "train_tokens_per_second": 6376.392 }, { "epoch": 2.622329004247108, "grad_norm": 0.25525012612342834, "learning_rate": 5e-06, "loss": 0.9259, "num_input_tokens_seen": 984073384, "step": 2166, "train_runtime": 154327.0854, "train_tokens_per_second": 6376.544 }, { "epoch": 2.623539761064709, "grad_norm": 0.26483437418937683, "learning_rate": 5e-06, "loss": 0.9128, "num_input_tokens_seen": 984513016, "step": 2167, "train_runtime": 154392.1294, "train_tokens_per_second": 6376.705 }, { "epoch": 2.6247505178823105, "grad_norm": 0.23384696245193481, "learning_rate": 5e-06, "loss": 0.957, "num_input_tokens_seen": 984968520, "step": 2168, "train_runtime": 154459.688, "train_tokens_per_second": 6376.865 }, { "epoch": 2.625961274699912, "grad_norm": 0.233501136302948, "learning_rate": 5e-06, "loss": 0.8981, "num_input_tokens_seen": 985445104, "step": 2169, "train_runtime": 154529.8475, "train_tokens_per_second": 6377.053 }, { "epoch": 2.6271720315175133, "grad_norm": 0.2487708330154419, "learning_rate": 5e-06, "loss": 0.8331, "num_input_tokens_seen": 985877536, "step": 2170, "train_runtime": 154593.5382, "train_tokens_per_second": 6377.223 }, { "epoch": 2.6283827883351147, "grad_norm": 0.23218314349651337, "learning_rate": 5e-06, "loss": 0.95, "num_input_tokens_seen": 986316704, "step": 2171, "train_runtime": 154657.7931, "train_tokens_per_second": 6377.414 }, { "epoch": 2.629593545152716, "grad_norm": 0.2570416033267975, "learning_rate": 5e-06, "loss": 0.8875, "num_input_tokens_seen": 986752352, "step": 2172, "train_runtime": 154721.7442, "train_tokens_per_second": 6377.593 }, { "epoch": 2.6308043019703176, "grad_norm": 0.2248729020357132, "learning_rate": 5e-06, "loss": 0.8952, "num_input_tokens_seen": 987227168, "step": 2173, "train_runtime": 154792.0587, "train_tokens_per_second": 6377.764 }, { "epoch": 2.632015058787919, "grad_norm": 0.23463650047779083, "learning_rate": 5e-06, "loss": 0.9381, "num_input_tokens_seen": 987681176, "step": 2174, "train_runtime": 154858.7972, "train_tokens_per_second": 6377.947 }, { "epoch": 2.6332258156055204, "grad_norm": 0.24335210025310516, "learning_rate": 5e-06, "loss": 0.9145, "num_input_tokens_seen": 988153656, "step": 2175, "train_runtime": 154928.7355, "train_tokens_per_second": 6378.117 }, { "epoch": 2.634436572423122, "grad_norm": 0.24974526464939117, "learning_rate": 5e-06, "loss": 0.9623, "num_input_tokens_seen": 988604512, "step": 2176, "train_runtime": 154995.4793, "train_tokens_per_second": 6378.28 }, { "epoch": 2.635647329240723, "grad_norm": 0.23491837084293365, "learning_rate": 5e-06, "loss": 0.925, "num_input_tokens_seen": 989041648, "step": 2177, "train_runtime": 155060.32, "train_tokens_per_second": 6378.432 }, { "epoch": 2.636858086058324, "grad_norm": 0.2458321899175644, "learning_rate": 5e-06, "loss": 0.9523, "num_input_tokens_seen": 989484256, "step": 2178, "train_runtime": 155126.0643, "train_tokens_per_second": 6378.582 }, { "epoch": 2.638068842875926, "grad_norm": 0.28002414107322693, "learning_rate": 5e-06, "loss": 0.9521, "num_input_tokens_seen": 989918248, "step": 2179, "train_runtime": 155190.3341, "train_tokens_per_second": 6378.736 }, { "epoch": 2.639279599693527, "grad_norm": 0.2300572544336319, "learning_rate": 5e-06, "loss": 0.9063, "num_input_tokens_seen": 990388592, "step": 2180, "train_runtime": 155259.9905, "train_tokens_per_second": 6378.904 }, { "epoch": 2.6404903565111284, "grad_norm": 0.23866574466228485, "learning_rate": 5e-06, "loss": 0.9428, "num_input_tokens_seen": 990856224, "step": 2181, "train_runtime": 155329.3298, "train_tokens_per_second": 6379.067 }, { "epoch": 2.64170111332873, "grad_norm": 0.2776472270488739, "learning_rate": 5e-06, "loss": 0.8795, "num_input_tokens_seen": 991300536, "step": 2182, "train_runtime": 155395.1153, "train_tokens_per_second": 6379.226 }, { "epoch": 2.642911870146331, "grad_norm": 0.24262697994709015, "learning_rate": 5e-06, "loss": 0.8967, "num_input_tokens_seen": 991741200, "step": 2183, "train_runtime": 155459.8715, "train_tokens_per_second": 6379.403 }, { "epoch": 2.6441226269639326, "grad_norm": 0.25825032591819763, "learning_rate": 5e-06, "loss": 0.9596, "num_input_tokens_seen": 992195600, "step": 2184, "train_runtime": 155526.9125, "train_tokens_per_second": 6379.575 }, { "epoch": 2.645333383781534, "grad_norm": 0.25043049454689026, "learning_rate": 5e-06, "loss": 0.9482, "num_input_tokens_seen": 992628344, "step": 2185, "train_runtime": 155590.6849, "train_tokens_per_second": 6379.741 }, { "epoch": 2.6465441405991355, "grad_norm": 0.24327807128429413, "learning_rate": 5e-06, "loss": 0.9272, "num_input_tokens_seen": 993091536, "step": 2186, "train_runtime": 155659.1615, "train_tokens_per_second": 6379.911 }, { "epoch": 2.647754897416737, "grad_norm": 0.26455357670783997, "learning_rate": 5e-06, "loss": 0.9241, "num_input_tokens_seen": 993539712, "step": 2187, "train_runtime": 155725.1034, "train_tokens_per_second": 6380.087 }, { "epoch": 2.6489656542343383, "grad_norm": 0.24340102076530457, "learning_rate": 5e-06, "loss": 0.9586, "num_input_tokens_seen": 993981440, "step": 2188, "train_runtime": 155790.4247, "train_tokens_per_second": 6380.247 }, { "epoch": 2.6501764110519397, "grad_norm": 0.24760021269321442, "learning_rate": 5e-06, "loss": 0.9187, "num_input_tokens_seen": 994453512, "step": 2189, "train_runtime": 155860.8542, "train_tokens_per_second": 6380.393 }, { "epoch": 2.651387167869541, "grad_norm": 0.25143691897392273, "learning_rate": 5e-06, "loss": 0.9341, "num_input_tokens_seen": 994882632, "step": 2190, "train_runtime": 155923.9754, "train_tokens_per_second": 6380.562 }, { "epoch": 2.652597924687142, "grad_norm": 0.2322501242160797, "learning_rate": 5e-06, "loss": 0.8817, "num_input_tokens_seen": 995341712, "step": 2191, "train_runtime": 155992.0321, "train_tokens_per_second": 6380.721 }, { "epoch": 2.653808681504744, "grad_norm": 0.23527227342128754, "learning_rate": 5e-06, "loss": 0.9028, "num_input_tokens_seen": 995823768, "step": 2192, "train_runtime": 156063.4226, "train_tokens_per_second": 6380.892 }, { "epoch": 2.655019438322345, "grad_norm": 0.2262798249721527, "learning_rate": 5e-06, "loss": 0.9217, "num_input_tokens_seen": 996285904, "step": 2193, "train_runtime": 156131.4608, "train_tokens_per_second": 6381.071 }, { "epoch": 2.6562301951399467, "grad_norm": 0.24012240767478943, "learning_rate": 5e-06, "loss": 0.9692, "num_input_tokens_seen": 996740120, "step": 2194, "train_runtime": 156198.4805, "train_tokens_per_second": 6381.241 }, { "epoch": 2.6574409519575477, "grad_norm": 0.2367800921201706, "learning_rate": 5e-06, "loss": 0.8777, "num_input_tokens_seen": 997206712, "step": 2195, "train_runtime": 156267.5628, "train_tokens_per_second": 6381.406 }, { "epoch": 2.658651708775149, "grad_norm": 0.22343586385250092, "learning_rate": 5e-06, "loss": 0.8786, "num_input_tokens_seen": 997680768, "step": 2196, "train_runtime": 156337.5139, "train_tokens_per_second": 6381.583 }, { "epoch": 2.6598624655927505, "grad_norm": 0.2318398803472519, "learning_rate": 5e-06, "loss": 0.9135, "num_input_tokens_seen": 998140896, "step": 2197, "train_runtime": 156405.329, "train_tokens_per_second": 6381.758 }, { "epoch": 2.661073222410352, "grad_norm": 0.23731204867362976, "learning_rate": 5e-06, "loss": 0.9568, "num_input_tokens_seen": 998581544, "step": 2198, "train_runtime": 156470.6646, "train_tokens_per_second": 6381.909 }, { "epoch": 2.6622839792279533, "grad_norm": 0.2556219696998596, "learning_rate": 5e-06, "loss": 0.9705, "num_input_tokens_seen": 999021544, "step": 2199, "train_runtime": 156535.5906, "train_tokens_per_second": 6382.073 }, { "epoch": 2.6634947360455548, "grad_norm": 0.25406965613365173, "learning_rate": 5e-06, "loss": 0.9043, "num_input_tokens_seen": 999456000, "step": 2200, "train_runtime": 156600.1967, "train_tokens_per_second": 6382.214 }, { "epoch": 2.664705492863156, "grad_norm": 0.2429000586271286, "learning_rate": 5e-06, "loss": 1.0054, "num_input_tokens_seen": 999917048, "step": 2201, "train_runtime": 156667.8114, "train_tokens_per_second": 6382.403 }, { "epoch": 2.6659162496807576, "grad_norm": 0.24307996034622192, "learning_rate": 5e-06, "loss": 0.9031, "num_input_tokens_seen": 1000345496, "step": 2202, "train_runtime": 156731.558, "train_tokens_per_second": 6382.54 }, { "epoch": 2.667127006498359, "grad_norm": 0.2613001763820648, "learning_rate": 5e-06, "loss": 0.9146, "num_input_tokens_seen": 1000775024, "step": 2203, "train_runtime": 156796.6198, "train_tokens_per_second": 6382.631 }, { "epoch": 2.6683377633159604, "grad_norm": 0.2521812617778778, "learning_rate": 5e-06, "loss": 0.9449, "num_input_tokens_seen": 1001214208, "step": 2204, "train_runtime": 156861.3258, "train_tokens_per_second": 6382.798 }, { "epoch": 2.669548520133562, "grad_norm": 0.22570690512657166, "learning_rate": 5e-06, "loss": 0.8841, "num_input_tokens_seen": 1001691928, "step": 2205, "train_runtime": 156931.9112, "train_tokens_per_second": 6382.972 }, { "epoch": 2.6707592769511628, "grad_norm": 0.2318730354309082, "learning_rate": 5e-06, "loss": 0.89, "num_input_tokens_seen": 1002167032, "step": 2206, "train_runtime": 157002.068, "train_tokens_per_second": 6383.145 }, { "epoch": 2.6719700337687646, "grad_norm": 0.26219817996025085, "learning_rate": 5e-06, "loss": 0.8752, "num_input_tokens_seen": 1002621280, "step": 2207, "train_runtime": 157069.3885, "train_tokens_per_second": 6383.302 }, { "epoch": 2.6731807905863656, "grad_norm": 0.23726919293403625, "learning_rate": 5e-06, "loss": 0.8883, "num_input_tokens_seen": 1003092128, "step": 2208, "train_runtime": 157139.0605, "train_tokens_per_second": 6383.468 }, { "epoch": 2.674391547403967, "grad_norm": 0.24438372254371643, "learning_rate": 5e-06, "loss": 0.9029, "num_input_tokens_seen": 1003551736, "step": 2209, "train_runtime": 157207.0617, "train_tokens_per_second": 6383.63 }, { "epoch": 2.6756023042215684, "grad_norm": 0.2408195585012436, "learning_rate": 5e-06, "loss": 0.9428, "num_input_tokens_seen": 1003992272, "step": 2210, "train_runtime": 157272.0435, "train_tokens_per_second": 6383.794 }, { "epoch": 2.67681306103917, "grad_norm": 0.23207896947860718, "learning_rate": 5e-06, "loss": 0.9647, "num_input_tokens_seen": 1004464200, "step": 2211, "train_runtime": 157341.891, "train_tokens_per_second": 6383.959 }, { "epoch": 2.6780238178567712, "grad_norm": 0.26204124093055725, "learning_rate": 5e-06, "loss": 0.911, "num_input_tokens_seen": 1004918912, "step": 2212, "train_runtime": 157409.1733, "train_tokens_per_second": 6384.119 }, { "epoch": 2.6792345746743726, "grad_norm": 0.24040430784225464, "learning_rate": 5e-06, "loss": 0.8979, "num_input_tokens_seen": 1005365120, "step": 2213, "train_runtime": 157475.3659, "train_tokens_per_second": 6384.269 }, { "epoch": 2.680445331491974, "grad_norm": 0.2542877495288849, "learning_rate": 5e-06, "loss": 0.9251, "num_input_tokens_seen": 1005804512, "step": 2214, "train_runtime": 157540.2892, "train_tokens_per_second": 6384.427 }, { "epoch": 2.6816560883095755, "grad_norm": 0.2377696931362152, "learning_rate": 5e-06, "loss": 0.8792, "num_input_tokens_seen": 1006260000, "step": 2215, "train_runtime": 157607.9347, "train_tokens_per_second": 6384.577 }, { "epoch": 2.682866845127177, "grad_norm": 0.23569363355636597, "learning_rate": 5e-06, "loss": 0.9299, "num_input_tokens_seen": 1006738640, "step": 2216, "train_runtime": 157678.5377, "train_tokens_per_second": 6384.754 }, { "epoch": 2.6840776019447783, "grad_norm": 0.2451499104499817, "learning_rate": 5e-06, "loss": 0.9518, "num_input_tokens_seen": 1007184656, "step": 2217, "train_runtime": 157744.1772, "train_tokens_per_second": 6384.924 }, { "epoch": 2.6852883587623797, "grad_norm": 0.2470230609178543, "learning_rate": 5e-06, "loss": 0.927, "num_input_tokens_seen": 1007627432, "step": 2218, "train_runtime": 157809.6053, "train_tokens_per_second": 6385.083 }, { "epoch": 2.6864991155799807, "grad_norm": 0.24628578126430511, "learning_rate": 5e-06, "loss": 0.9285, "num_input_tokens_seen": 1008077632, "step": 2219, "train_runtime": 157876.7498, "train_tokens_per_second": 6385.219 }, { "epoch": 2.6877098723975825, "grad_norm": 0.2287086695432663, "learning_rate": 5e-06, "loss": 0.9016, "num_input_tokens_seen": 1008546760, "step": 2220, "train_runtime": 157946.0167, "train_tokens_per_second": 6385.389 }, { "epoch": 2.6889206292151835, "grad_norm": 0.25545141100883484, "learning_rate": 5e-06, "loss": 0.9121, "num_input_tokens_seen": 1009016192, "step": 2221, "train_runtime": 158015.3011, "train_tokens_per_second": 6385.56 }, { "epoch": 2.690131386032785, "grad_norm": 0.22718414664268494, "learning_rate": 5e-06, "loss": 0.9043, "num_input_tokens_seen": 1009470936, "step": 2222, "train_runtime": 158082.415, "train_tokens_per_second": 6385.726 }, { "epoch": 2.6913421428503863, "grad_norm": 0.2230096310377121, "learning_rate": 5e-06, "loss": 0.9253, "num_input_tokens_seen": 1009952464, "step": 2223, "train_runtime": 158154.0362, "train_tokens_per_second": 6385.879 }, { "epoch": 2.6925528996679877, "grad_norm": 0.23132304847240448, "learning_rate": 5e-06, "loss": 0.9229, "num_input_tokens_seen": 1010405368, "step": 2224, "train_runtime": 158220.9026, "train_tokens_per_second": 6386.042 }, { "epoch": 2.693763656485589, "grad_norm": 0.22348402440547943, "learning_rate": 5e-06, "loss": 0.8622, "num_input_tokens_seen": 1010875144, "step": 2225, "train_runtime": 158290.9411, "train_tokens_per_second": 6386.184 }, { "epoch": 2.6949744133031905, "grad_norm": 0.25180912017822266, "learning_rate": 5e-06, "loss": 0.934, "num_input_tokens_seen": 1011328552, "step": 2226, "train_runtime": 158358.6024, "train_tokens_per_second": 6386.319 }, { "epoch": 2.696185170120792, "grad_norm": 0.23059271275997162, "learning_rate": 5e-06, "loss": 0.9463, "num_input_tokens_seen": 1011765960, "step": 2227, "train_runtime": 158423.3929, "train_tokens_per_second": 6386.468 }, { "epoch": 2.6973959269383934, "grad_norm": 0.23937790095806122, "learning_rate": 5e-06, "loss": 0.9551, "num_input_tokens_seen": 1012221776, "step": 2228, "train_runtime": 158490.551, "train_tokens_per_second": 6386.638 }, { "epoch": 2.6986066837559948, "grad_norm": 0.23659993708133698, "learning_rate": 5e-06, "loss": 0.8974, "num_input_tokens_seen": 1012684104, "step": 2229, "train_runtime": 158559.0434, "train_tokens_per_second": 6386.795 }, { "epoch": 2.699817440573596, "grad_norm": 0.2516309320926666, "learning_rate": 5e-06, "loss": 0.965, "num_input_tokens_seen": 1013148096, "step": 2230, "train_runtime": 158627.7065, "train_tokens_per_second": 6386.955 }, { "epoch": 2.7010281973911976, "grad_norm": 0.25234147906303406, "learning_rate": 5e-06, "loss": 0.8918, "num_input_tokens_seen": 1013602984, "step": 2231, "train_runtime": 158694.9766, "train_tokens_per_second": 6387.114 }, { "epoch": 2.7022389542087986, "grad_norm": 0.2293567955493927, "learning_rate": 5e-06, "loss": 0.9286, "num_input_tokens_seen": 1014069312, "step": 2232, "train_runtime": 158764.1805, "train_tokens_per_second": 6387.268 }, { "epoch": 2.7034497110264004, "grad_norm": 0.2538798153400421, "learning_rate": 5e-06, "loss": 0.9303, "num_input_tokens_seen": 1014506672, "step": 2233, "train_runtime": 158828.7542, "train_tokens_per_second": 6387.424 }, { "epoch": 2.7046604678440014, "grad_norm": 0.25850167870521545, "learning_rate": 5e-06, "loss": 0.9683, "num_input_tokens_seen": 1014949832, "step": 2234, "train_runtime": 158894.4091, "train_tokens_per_second": 6387.574 }, { "epoch": 2.705871224661603, "grad_norm": 0.23657569289207458, "learning_rate": 5e-06, "loss": 0.9358, "num_input_tokens_seen": 1015406592, "step": 2235, "train_runtime": 158962.0909, "train_tokens_per_second": 6387.728 }, { "epoch": 2.707081981479204, "grad_norm": 0.2708401381969452, "learning_rate": 5e-06, "loss": 0.8852, "num_input_tokens_seen": 1015841432, "step": 2236, "train_runtime": 159026.4419, "train_tokens_per_second": 6387.878 }, { "epoch": 2.7082927382968056, "grad_norm": 0.24284714460372925, "learning_rate": 5e-06, "loss": 0.9317, "num_input_tokens_seen": 1016289424, "step": 2237, "train_runtime": 159092.4722, "train_tokens_per_second": 6388.042 }, { "epoch": 2.709503495114407, "grad_norm": 0.25480154156684875, "learning_rate": 5e-06, "loss": 0.9192, "num_input_tokens_seen": 1016757504, "step": 2238, "train_runtime": 159161.1907, "train_tokens_per_second": 6388.225 }, { "epoch": 2.7107142519320084, "grad_norm": 0.24498331546783447, "learning_rate": 5e-06, "loss": 0.8707, "num_input_tokens_seen": 1017194928, "step": 2239, "train_runtime": 159226.0946, "train_tokens_per_second": 6388.368 }, { "epoch": 2.71192500874961, "grad_norm": 0.26153630018234253, "learning_rate": 5e-06, "loss": 0.9109, "num_input_tokens_seen": 1017660048, "step": 2240, "train_runtime": 159295.3482, "train_tokens_per_second": 6388.511 }, { "epoch": 2.7131357655672113, "grad_norm": 0.28027719259262085, "learning_rate": 5e-06, "loss": 0.9673, "num_input_tokens_seen": 1018086808, "step": 2241, "train_runtime": 159358.1538, "train_tokens_per_second": 6388.671 }, { "epoch": 2.7143465223848127, "grad_norm": 0.3030099868774414, "learning_rate": 5e-06, "loss": 0.9419, "num_input_tokens_seen": 1018528160, "step": 2242, "train_runtime": 159423.698, "train_tokens_per_second": 6388.813 }, { "epoch": 2.715557279202414, "grad_norm": 0.2761872112751007, "learning_rate": 5e-06, "loss": 0.9203, "num_input_tokens_seen": 1018979560, "step": 2243, "train_runtime": 159490.4403, "train_tokens_per_second": 6388.97 }, { "epoch": 2.7167680360200155, "grad_norm": 0.2496478110551834, "learning_rate": 5e-06, "loss": 0.9394, "num_input_tokens_seen": 1019421000, "step": 2244, "train_runtime": 159555.4925, "train_tokens_per_second": 6389.131 }, { "epoch": 2.7179787928376165, "grad_norm": 0.33068129420280457, "learning_rate": 5e-06, "loss": 0.917, "num_input_tokens_seen": 1019848624, "step": 2245, "train_runtime": 159619.1337, "train_tokens_per_second": 6389.263 }, { "epoch": 2.7191895496552183, "grad_norm": 0.26431363821029663, "learning_rate": 5e-06, "loss": 0.957, "num_input_tokens_seen": 1020290632, "step": 2246, "train_runtime": 159684.4414, "train_tokens_per_second": 6389.418 }, { "epoch": 2.7204003064728193, "grad_norm": 0.25907760858535767, "learning_rate": 5e-06, "loss": 0.9241, "num_input_tokens_seen": 1020772456, "step": 2247, "train_runtime": 159755.9211, "train_tokens_per_second": 6389.575 }, { "epoch": 2.7216110632904207, "grad_norm": 0.26979854702949524, "learning_rate": 5e-06, "loss": 0.9457, "num_input_tokens_seen": 1021232408, "step": 2248, "train_runtime": 159823.991, "train_tokens_per_second": 6389.732 }, { "epoch": 2.722821820108022, "grad_norm": 0.27133068442344666, "learning_rate": 5e-06, "loss": 0.9257, "num_input_tokens_seen": 1021703000, "step": 2249, "train_runtime": 159893.8238, "train_tokens_per_second": 6389.884 }, { "epoch": 2.7240325769256235, "grad_norm": 0.2623973786830902, "learning_rate": 5e-06, "loss": 0.9326, "num_input_tokens_seen": 1022143904, "step": 2250, "train_runtime": 159958.597, "train_tokens_per_second": 6390.053 }, { "epoch": 2.725243333743225, "grad_norm": 0.2727581262588501, "learning_rate": 5e-06, "loss": 0.8614, "num_input_tokens_seen": 1022607144, "step": 2251, "train_runtime": 160027.1697, "train_tokens_per_second": 6390.21 }, { "epoch": 2.7264540905608263, "grad_norm": 0.2389581948518753, "learning_rate": 5e-06, "loss": 0.9231, "num_input_tokens_seen": 1023053400, "step": 2252, "train_runtime": 160093.3628, "train_tokens_per_second": 6390.355 }, { "epoch": 2.7276648473784277, "grad_norm": 0.2514803409576416, "learning_rate": 5e-06, "loss": 0.9391, "num_input_tokens_seen": 1023527608, "step": 2253, "train_runtime": 160163.4694, "train_tokens_per_second": 6390.518 }, { "epoch": 2.728875604196029, "grad_norm": 0.24334073066711426, "learning_rate": 5e-06, "loss": 0.9595, "num_input_tokens_seen": 1023989968, "step": 2254, "train_runtime": 160231.8243, "train_tokens_per_second": 6390.678 }, { "epoch": 2.7300863610136306, "grad_norm": 0.2730535864830017, "learning_rate": 5e-06, "loss": 0.9543, "num_input_tokens_seen": 1024436392, "step": 2255, "train_runtime": 160298.2361, "train_tokens_per_second": 6390.815 }, { "epoch": 2.731297117831232, "grad_norm": 0.257646769285202, "learning_rate": 5e-06, "loss": 0.9654, "num_input_tokens_seen": 1024877272, "step": 2256, "train_runtime": 160364.5888, "train_tokens_per_second": 6390.92 }, { "epoch": 2.7325078746488334, "grad_norm": 0.2575959265232086, "learning_rate": 5e-06, "loss": 0.9102, "num_input_tokens_seen": 1025345304, "step": 2257, "train_runtime": 160434.8659, "train_tokens_per_second": 6391.038 }, { "epoch": 2.7337186314664343, "grad_norm": 0.24817879498004913, "learning_rate": 5e-06, "loss": 0.9336, "num_input_tokens_seen": 1025789800, "step": 2258, "train_runtime": 160500.5173, "train_tokens_per_second": 6391.193 }, { "epoch": 2.734929388284036, "grad_norm": 0.23942458629608154, "learning_rate": 5e-06, "loss": 0.9363, "num_input_tokens_seen": 1026251136, "step": 2259, "train_runtime": 160569.0846, "train_tokens_per_second": 6391.337 }, { "epoch": 2.736140145101637, "grad_norm": 0.24998879432678223, "learning_rate": 5e-06, "loss": 0.9369, "num_input_tokens_seen": 1026706720, "step": 2260, "train_runtime": 160636.2779, "train_tokens_per_second": 6391.5 }, { "epoch": 2.7373509019192386, "grad_norm": 0.2262594848871231, "learning_rate": 5e-06, "loss": 0.8807, "num_input_tokens_seen": 1027191856, "step": 2261, "train_runtime": 160708.6042, "train_tokens_per_second": 6391.642 }, { "epoch": 2.73856165873684, "grad_norm": 0.25167331099510193, "learning_rate": 5e-06, "loss": 0.9173, "num_input_tokens_seen": 1027631680, "step": 2262, "train_runtime": 160773.4123, "train_tokens_per_second": 6391.801 }, { "epoch": 2.7397724155544414, "grad_norm": 0.25715553760528564, "learning_rate": 5e-06, "loss": 0.8853, "num_input_tokens_seen": 1028082536, "step": 2263, "train_runtime": 160840.2689, "train_tokens_per_second": 6391.947 }, { "epoch": 2.740983172372043, "grad_norm": 0.23995672166347504, "learning_rate": 5e-06, "loss": 0.9307, "num_input_tokens_seen": 1028562968, "step": 2264, "train_runtime": 160911.4001, "train_tokens_per_second": 6392.108 }, { "epoch": 2.742193929189644, "grad_norm": 0.24075527489185333, "learning_rate": 5e-06, "loss": 0.8959, "num_input_tokens_seen": 1028999072, "step": 2265, "train_runtime": 160975.9822, "train_tokens_per_second": 6392.252 }, { "epoch": 2.7434046860072456, "grad_norm": 0.24111104011535645, "learning_rate": 5e-06, "loss": 0.9342, "num_input_tokens_seen": 1029436168, "step": 2266, "train_runtime": 161040.4703, "train_tokens_per_second": 6392.407 }, { "epoch": 2.744615442824847, "grad_norm": 0.24185071885585785, "learning_rate": 5e-06, "loss": 0.9124, "num_input_tokens_seen": 1029895528, "step": 2267, "train_runtime": 161108.4302, "train_tokens_per_second": 6392.561 }, { "epoch": 2.7458261996424485, "grad_norm": 0.2595217525959015, "learning_rate": 5e-06, "loss": 0.9333, "num_input_tokens_seen": 1030313776, "step": 2268, "train_runtime": 161170.4004, "train_tokens_per_second": 6392.698 }, { "epoch": 2.74703695646005, "grad_norm": 0.2608698308467865, "learning_rate": 5e-06, "loss": 0.9352, "num_input_tokens_seen": 1030773200, "step": 2269, "train_runtime": 161238.33, "train_tokens_per_second": 6392.855 }, { "epoch": 2.7482477132776513, "grad_norm": 0.2337881624698639, "learning_rate": 5e-06, "loss": 0.903, "num_input_tokens_seen": 1031251608, "step": 2270, "train_runtime": 161309.7518, "train_tokens_per_second": 6392.99 }, { "epoch": 2.7494584700952522, "grad_norm": 0.23849591612815857, "learning_rate": 5e-06, "loss": 0.9358, "num_input_tokens_seen": 1031721848, "step": 2271, "train_runtime": 161379.4792, "train_tokens_per_second": 6393.142 }, { "epoch": 2.750669226912854, "grad_norm": 0.2555829882621765, "learning_rate": 5e-06, "loss": 0.9064, "num_input_tokens_seen": 1032189752, "step": 2272, "train_runtime": 161448.9849, "train_tokens_per_second": 6393.287 }, { "epoch": 2.751879983730455, "grad_norm": 0.2797653079032898, "learning_rate": 5e-06, "loss": 0.8848, "num_input_tokens_seen": 1032639608, "step": 2273, "train_runtime": 161515.5469, "train_tokens_per_second": 6393.438 }, { "epoch": 2.753090740548057, "grad_norm": 0.23317913711071014, "learning_rate": 5e-06, "loss": 0.9238, "num_input_tokens_seen": 1033080176, "step": 2274, "train_runtime": 161580.6508, "train_tokens_per_second": 6393.588 }, { "epoch": 2.754301497365658, "grad_norm": 0.24477601051330566, "learning_rate": 5e-06, "loss": 0.9416, "num_input_tokens_seen": 1033520656, "step": 2275, "train_runtime": 161645.5728, "train_tokens_per_second": 6393.746 }, { "epoch": 2.7555122541832593, "grad_norm": 0.2364787459373474, "learning_rate": 5e-06, "loss": 0.9096, "num_input_tokens_seen": 1033971520, "step": 2276, "train_runtime": 161712.1981, "train_tokens_per_second": 6393.899 }, { "epoch": 2.7567230110008607, "grad_norm": 0.2605726718902588, "learning_rate": 5e-06, "loss": 0.8921, "num_input_tokens_seen": 1034395088, "step": 2277, "train_runtime": 161774.2815, "train_tokens_per_second": 6394.064 }, { "epoch": 2.757933767818462, "grad_norm": 0.25480276346206665, "learning_rate": 5e-06, "loss": 0.9177, "num_input_tokens_seen": 1034838552, "step": 2278, "train_runtime": 161839.7302, "train_tokens_per_second": 6394.218 }, { "epoch": 2.7591445246360635, "grad_norm": 0.23550397157669067, "learning_rate": 5e-06, "loss": 0.8798, "num_input_tokens_seen": 1035301976, "step": 2279, "train_runtime": 161908.0151, "train_tokens_per_second": 6394.384 }, { "epoch": 2.760355281453665, "grad_norm": 0.23179112374782562, "learning_rate": 5e-06, "loss": 0.8953, "num_input_tokens_seen": 1035774664, "step": 2280, "train_runtime": 161978.1853, "train_tokens_per_second": 6394.532 }, { "epoch": 2.7615660382712663, "grad_norm": 0.28745490312576294, "learning_rate": 5e-06, "loss": 0.9014, "num_input_tokens_seen": 1036207656, "step": 2281, "train_runtime": 162042.7376, "train_tokens_per_second": 6394.657 }, { "epoch": 2.7627767950888678, "grad_norm": 0.2519856095314026, "learning_rate": 5e-06, "loss": 0.906, "num_input_tokens_seen": 1036649640, "step": 2282, "train_runtime": 162107.9902, "train_tokens_per_second": 6394.809 }, { "epoch": 2.763987551906469, "grad_norm": 0.2556043863296509, "learning_rate": 5e-06, "loss": 0.9393, "num_input_tokens_seen": 1037124656, "step": 2283, "train_runtime": 162178.7126, "train_tokens_per_second": 6394.949 }, { "epoch": 2.7651983087240706, "grad_norm": 0.22922824323177338, "learning_rate": 5e-06, "loss": 0.8837, "num_input_tokens_seen": 1037595840, "step": 2284, "train_runtime": 162248.143, "train_tokens_per_second": 6395.117 }, { "epoch": 2.766409065541672, "grad_norm": 0.24666735529899597, "learning_rate": 5e-06, "loss": 0.9671, "num_input_tokens_seen": 1038036336, "step": 2285, "train_runtime": 162313.37, "train_tokens_per_second": 6395.261 }, { "epoch": 2.767619822359273, "grad_norm": 0.27610307931900024, "learning_rate": 5e-06, "loss": 0.8669, "num_input_tokens_seen": 1038496960, "step": 2286, "train_runtime": 162381.4019, "train_tokens_per_second": 6395.418 }, { "epoch": 2.768830579176875, "grad_norm": 0.2348206490278244, "learning_rate": 5e-06, "loss": 0.9206, "num_input_tokens_seen": 1038953384, "step": 2287, "train_runtime": 162448.8975, "train_tokens_per_second": 6395.571 }, { "epoch": 2.7700413359944758, "grad_norm": 0.2505703270435333, "learning_rate": 5e-06, "loss": 0.88, "num_input_tokens_seen": 1039421160, "step": 2288, "train_runtime": 162517.7659, "train_tokens_per_second": 6395.739 }, { "epoch": 2.771252092812077, "grad_norm": 0.23638983070850372, "learning_rate": 5e-06, "loss": 0.8718, "num_input_tokens_seen": 1039873192, "step": 2289, "train_runtime": 162584.5274, "train_tokens_per_second": 6395.893 }, { "epoch": 2.7724628496296786, "grad_norm": 0.23076026141643524, "learning_rate": 5e-06, "loss": 0.8999, "num_input_tokens_seen": 1040363160, "step": 2290, "train_runtime": 162657.0792, "train_tokens_per_second": 6396.052 }, { "epoch": 2.77367360644728, "grad_norm": 0.25392135977745056, "learning_rate": 5e-06, "loss": 0.9451, "num_input_tokens_seen": 1040790536, "step": 2291, "train_runtime": 162720.1261, "train_tokens_per_second": 6396.2 }, { "epoch": 2.7748843632648814, "grad_norm": 0.24415422976016998, "learning_rate": 5e-06, "loss": 0.8886, "num_input_tokens_seen": 1041230312, "step": 2292, "train_runtime": 162785.4545, "train_tokens_per_second": 6396.335 }, { "epoch": 2.776095120082483, "grad_norm": 0.2522631585597992, "learning_rate": 5e-06, "loss": 0.9343, "num_input_tokens_seen": 1041675088, "step": 2293, "train_runtime": 162851.2507, "train_tokens_per_second": 6396.482 }, { "epoch": 2.7773058769000842, "grad_norm": 0.23644526302814484, "learning_rate": 5e-06, "loss": 0.9147, "num_input_tokens_seen": 1042146576, "step": 2294, "train_runtime": 162920.8304, "train_tokens_per_second": 6396.644 }, { "epoch": 2.7785166337176856, "grad_norm": 0.25499239563941956, "learning_rate": 5e-06, "loss": 0.9219, "num_input_tokens_seen": 1042611024, "step": 2295, "train_runtime": 162989.4272, "train_tokens_per_second": 6396.802 }, { "epoch": 2.779727390535287, "grad_norm": 0.2416430562734604, "learning_rate": 5e-06, "loss": 0.9671, "num_input_tokens_seen": 1043078696, "step": 2296, "train_runtime": 163058.9679, "train_tokens_per_second": 6396.942 }, { "epoch": 2.7809381473528885, "grad_norm": 0.2677612900733948, "learning_rate": 5e-06, "loss": 0.8946, "num_input_tokens_seen": 1043533320, "step": 2297, "train_runtime": 163126.3482, "train_tokens_per_second": 6397.086 }, { "epoch": 2.78214890417049, "grad_norm": 0.25840872526168823, "learning_rate": 5e-06, "loss": 0.9096, "num_input_tokens_seen": 1043990936, "step": 2298, "train_runtime": 163194.0314, "train_tokens_per_second": 6397.237 }, { "epoch": 2.783359660988091, "grad_norm": 0.2687901556491852, "learning_rate": 5e-06, "loss": 0.9439, "num_input_tokens_seen": 1044449824, "step": 2299, "train_runtime": 163262.0571, "train_tokens_per_second": 6397.382 }, { "epoch": 2.7845704178056927, "grad_norm": 0.27363818883895874, "learning_rate": 5e-06, "loss": 0.9365, "num_input_tokens_seen": 1044906680, "step": 2300, "train_runtime": 163329.4795, "train_tokens_per_second": 6397.539 }, { "epoch": 2.7857811746232937, "grad_norm": 0.2355838119983673, "learning_rate": 5e-06, "loss": 0.8774, "num_input_tokens_seen": 1045377512, "step": 2301, "train_runtime": 163398.8973, "train_tokens_per_second": 6397.702 }, { "epoch": 2.786991931440895, "grad_norm": 0.24392828345298767, "learning_rate": 5e-06, "loss": 0.9311, "num_input_tokens_seen": 1045821864, "step": 2302, "train_runtime": 163465.0191, "train_tokens_per_second": 6397.833 }, { "epoch": 2.7882026882584965, "grad_norm": 0.22444923222064972, "learning_rate": 5e-06, "loss": 0.9476, "num_input_tokens_seen": 1046281000, "step": 2303, "train_runtime": 163532.6383, "train_tokens_per_second": 6397.995 }, { "epoch": 2.789413445076098, "grad_norm": 0.23800964653491974, "learning_rate": 5e-06, "loss": 0.9248, "num_input_tokens_seen": 1046761784, "step": 2304, "train_runtime": 163603.0082, "train_tokens_per_second": 6398.182 }, { "epoch": 2.7906242018936993, "grad_norm": 0.2515329420566559, "learning_rate": 5e-06, "loss": 0.8946, "num_input_tokens_seen": 1047209968, "step": 2305, "train_runtime": 163669.4581, "train_tokens_per_second": 6398.322 }, { "epoch": 2.7918349587113007, "grad_norm": 0.27853265404701233, "learning_rate": 5e-06, "loss": 1.03, "num_input_tokens_seen": 1047644576, "step": 2306, "train_runtime": 163733.387, "train_tokens_per_second": 6398.479 }, { "epoch": 2.793045715528902, "grad_norm": 0.23654435575008392, "learning_rate": 5e-06, "loss": 0.8594, "num_input_tokens_seen": 1048113472, "step": 2307, "train_runtime": 163802.8984, "train_tokens_per_second": 6398.626 }, { "epoch": 2.7942564723465035, "grad_norm": 0.24784903228282928, "learning_rate": 5e-06, "loss": 0.9906, "num_input_tokens_seen": 1048549400, "step": 2308, "train_runtime": 163867.387, "train_tokens_per_second": 6398.768 }, { "epoch": 2.795467229164105, "grad_norm": 0.24880841374397278, "learning_rate": 5e-06, "loss": 0.9033, "num_input_tokens_seen": 1048995416, "step": 2309, "train_runtime": 163934.0955, "train_tokens_per_second": 6398.885 }, { "epoch": 2.7966779859817064, "grad_norm": 0.24870271980762482, "learning_rate": 5e-06, "loss": 0.8977, "num_input_tokens_seen": 1049470880, "step": 2310, "train_runtime": 164006.5254, "train_tokens_per_second": 6398.958 }, { "epoch": 2.7978887427993078, "grad_norm": 0.23734253644943237, "learning_rate": 5e-06, "loss": 0.9153, "num_input_tokens_seen": 1049914560, "step": 2311, "train_runtime": 164071.9182, "train_tokens_per_second": 6399.112 }, { "epoch": 2.7990994996169087, "grad_norm": 0.25134560465812683, "learning_rate": 5e-06, "loss": 0.9128, "num_input_tokens_seen": 1050357984, "step": 2312, "train_runtime": 164137.1428, "train_tokens_per_second": 6399.271 }, { "epoch": 2.8003102564345106, "grad_norm": 0.2300664782524109, "learning_rate": 5e-06, "loss": 0.9098, "num_input_tokens_seen": 1050819792, "step": 2313, "train_runtime": 164205.6195, "train_tokens_per_second": 6399.414 }, { "epoch": 2.8015210132521116, "grad_norm": 0.22979731857776642, "learning_rate": 5e-06, "loss": 0.9221, "num_input_tokens_seen": 1051285056, "step": 2314, "train_runtime": 164274.736, "train_tokens_per_second": 6399.554 }, { "epoch": 2.802731770069713, "grad_norm": 0.23814600706100464, "learning_rate": 5e-06, "loss": 0.9126, "num_input_tokens_seen": 1051755944, "step": 2315, "train_runtime": 164344.435, "train_tokens_per_second": 6399.705 }, { "epoch": 2.8039425268873144, "grad_norm": 0.21878504753112793, "learning_rate": 5e-06, "loss": 0.9152, "num_input_tokens_seen": 1052229800, "step": 2316, "train_runtime": 164414.3881, "train_tokens_per_second": 6399.864 }, { "epoch": 2.805153283704916, "grad_norm": 0.24749340116977692, "learning_rate": 5e-06, "loss": 0.9028, "num_input_tokens_seen": 1052673152, "step": 2317, "train_runtime": 164480.1965, "train_tokens_per_second": 6399.999 }, { "epoch": 2.806364040522517, "grad_norm": 0.23459599912166595, "learning_rate": 5e-06, "loss": 0.9071, "num_input_tokens_seen": 1053111560, "step": 2318, "train_runtime": 164545.2472, "train_tokens_per_second": 6400.134 }, { "epoch": 2.8075747973401186, "grad_norm": 0.24665674567222595, "learning_rate": 5e-06, "loss": 0.9421, "num_input_tokens_seen": 1053560808, "step": 2319, "train_runtime": 164611.4841, "train_tokens_per_second": 6400.287 }, { "epoch": 2.80878555415772, "grad_norm": 0.24288515746593475, "learning_rate": 5e-06, "loss": 0.9171, "num_input_tokens_seen": 1054028536, "step": 2320, "train_runtime": 164680.3862, "train_tokens_per_second": 6400.45 }, { "epoch": 2.8099963109753214, "grad_norm": 0.24747171998023987, "learning_rate": 5e-06, "loss": 0.9457, "num_input_tokens_seen": 1054488288, "step": 2321, "train_runtime": 164748.1304, "train_tokens_per_second": 6400.609 }, { "epoch": 2.811207067792923, "grad_norm": 0.23851259052753448, "learning_rate": 5e-06, "loss": 0.9325, "num_input_tokens_seen": 1054961000, "step": 2322, "train_runtime": 164818.1256, "train_tokens_per_second": 6400.758 }, { "epoch": 2.8124178246105243, "grad_norm": 0.24482108652591705, "learning_rate": 5e-06, "loss": 0.8696, "num_input_tokens_seen": 1055393152, "step": 2323, "train_runtime": 164882.1388, "train_tokens_per_second": 6400.894 }, { "epoch": 2.8136285814281257, "grad_norm": 0.24118374288082123, "learning_rate": 5e-06, "loss": 0.9024, "num_input_tokens_seen": 1055852976, "step": 2324, "train_runtime": 164950.2745, "train_tokens_per_second": 6401.038 }, { "epoch": 2.8148393382457266, "grad_norm": 0.2338990718126297, "learning_rate": 5e-06, "loss": 0.8967, "num_input_tokens_seen": 1056302096, "step": 2325, "train_runtime": 165016.6859, "train_tokens_per_second": 6401.184 }, { "epoch": 2.8160500950633285, "grad_norm": 0.249686598777771, "learning_rate": 5e-06, "loss": 0.9129, "num_input_tokens_seen": 1056769208, "step": 2326, "train_runtime": 165086.0367, "train_tokens_per_second": 6401.324 }, { "epoch": 2.8172608518809295, "grad_norm": 0.24016061425209045, "learning_rate": 5e-06, "loss": 0.8875, "num_input_tokens_seen": 1057246168, "step": 2327, "train_runtime": 165156.8351, "train_tokens_per_second": 6401.468 }, { "epoch": 2.818471608698531, "grad_norm": 0.2340596616268158, "learning_rate": 5e-06, "loss": 0.871, "num_input_tokens_seen": 1057694632, "step": 2328, "train_runtime": 165223.3174, "train_tokens_per_second": 6401.606 }, { "epoch": 2.8196823655161323, "grad_norm": 0.23390509188175201, "learning_rate": 5e-06, "loss": 0.9014, "num_input_tokens_seen": 1058166320, "step": 2329, "train_runtime": 165292.9359, "train_tokens_per_second": 6401.764 }, { "epoch": 2.8208931223337337, "grad_norm": 0.240423783659935, "learning_rate": 5e-06, "loss": 0.9042, "num_input_tokens_seen": 1058610264, "step": 2330, "train_runtime": 165358.5343, "train_tokens_per_second": 6401.909 }, { "epoch": 2.822103879151335, "grad_norm": 0.23276259005069733, "learning_rate": 5e-06, "loss": 0.956, "num_input_tokens_seen": 1059081800, "step": 2331, "train_runtime": 165428.3498, "train_tokens_per_second": 6402.057 }, { "epoch": 2.8233146359689365, "grad_norm": 0.23118719458580017, "learning_rate": 5e-06, "loss": 0.9006, "num_input_tokens_seen": 1059516272, "step": 2332, "train_runtime": 165492.7144, "train_tokens_per_second": 6402.193 }, { "epoch": 2.824525392786538, "grad_norm": 0.24484090507030487, "learning_rate": 5e-06, "loss": 0.899, "num_input_tokens_seen": 1059975872, "step": 2333, "train_runtime": 165560.5144, "train_tokens_per_second": 6402.347 }, { "epoch": 2.8257361496041393, "grad_norm": 0.26148274540901184, "learning_rate": 5e-06, "loss": 0.8908, "num_input_tokens_seen": 1060424984, "step": 2334, "train_runtime": 165626.6035, "train_tokens_per_second": 6402.504 }, { "epoch": 2.8269469064217407, "grad_norm": 0.23295333981513977, "learning_rate": 5e-06, "loss": 0.9084, "num_input_tokens_seen": 1060878880, "step": 2335, "train_runtime": 165693.7989, "train_tokens_per_second": 6402.647 }, { "epoch": 2.828157663239342, "grad_norm": 0.2560044229030609, "learning_rate": 5e-06, "loss": 0.9266, "num_input_tokens_seen": 1061345072, "step": 2336, "train_runtime": 165762.4034, "train_tokens_per_second": 6402.809 }, { "epoch": 2.8293684200569436, "grad_norm": 0.293335497379303, "learning_rate": 5e-06, "loss": 0.9615, "num_input_tokens_seen": 1061791296, "step": 2337, "train_runtime": 165827.814, "train_tokens_per_second": 6402.975 }, { "epoch": 2.8305791768745445, "grad_norm": 0.250169038772583, "learning_rate": 5e-06, "loss": 0.8951, "num_input_tokens_seen": 1062258544, "step": 2338, "train_runtime": 165897.2325, "train_tokens_per_second": 6403.112 }, { "epoch": 2.8317899336921464, "grad_norm": 0.24368995428085327, "learning_rate": 5e-06, "loss": 0.8965, "num_input_tokens_seen": 1062713712, "step": 2339, "train_runtime": 165964.4766, "train_tokens_per_second": 6403.26 }, { "epoch": 2.8330006905097473, "grad_norm": 0.26345351338386536, "learning_rate": 5e-06, "loss": 0.9173, "num_input_tokens_seen": 1063171088, "step": 2340, "train_runtime": 166032.2225, "train_tokens_per_second": 6403.402 }, { "epoch": 2.8342114473273488, "grad_norm": 0.26429590582847595, "learning_rate": 5e-06, "loss": 0.9012, "num_input_tokens_seen": 1063627344, "step": 2341, "train_runtime": 166099.2017, "train_tokens_per_second": 6403.567 }, { "epoch": 2.83542220414495, "grad_norm": 0.2443588227033615, "learning_rate": 5e-06, "loss": 0.8997, "num_input_tokens_seen": 1064085144, "step": 2342, "train_runtime": 166167.5411, "train_tokens_per_second": 6403.688 }, { "epoch": 2.8366329609625516, "grad_norm": 0.2589036226272583, "learning_rate": 5e-06, "loss": 0.9135, "num_input_tokens_seen": 1064548048, "step": 2343, "train_runtime": 166236.1917, "train_tokens_per_second": 6403.828 }, { "epoch": 2.837843717780153, "grad_norm": 0.24053068459033966, "learning_rate": 5e-06, "loss": 0.9705, "num_input_tokens_seen": 1065004800, "step": 2344, "train_runtime": 166304.0312, "train_tokens_per_second": 6403.963 }, { "epoch": 2.8390544745977544, "grad_norm": 0.2785547971725464, "learning_rate": 5e-06, "loss": 0.9527, "num_input_tokens_seen": 1065466952, "step": 2345, "train_runtime": 166372.1789, "train_tokens_per_second": 6404.117 }, { "epoch": 2.840265231415356, "grad_norm": 0.2642555236816406, "learning_rate": 5e-06, "loss": 0.8824, "num_input_tokens_seen": 1065920384, "step": 2346, "train_runtime": 166439.2664, "train_tokens_per_second": 6404.26 }, { "epoch": 2.841475988232957, "grad_norm": 0.24180537462234497, "learning_rate": 5e-06, "loss": 0.9517, "num_input_tokens_seen": 1066358144, "step": 2347, "train_runtime": 166504.0514, "train_tokens_per_second": 6404.398 }, { "epoch": 2.8426867450505586, "grad_norm": 0.2499978095293045, "learning_rate": 5e-06, "loss": 0.9408, "num_input_tokens_seen": 1066833672, "step": 2348, "train_runtime": 166574.2519, "train_tokens_per_second": 6404.553 }, { "epoch": 2.84389750186816, "grad_norm": 0.24067756533622742, "learning_rate": 5e-06, "loss": 0.9379, "num_input_tokens_seen": 1067297648, "step": 2349, "train_runtime": 166642.8687, "train_tokens_per_second": 6404.7 }, { "epoch": 2.8451082586857614, "grad_norm": 0.242728590965271, "learning_rate": 5e-06, "loss": 0.9005, "num_input_tokens_seen": 1067762344, "step": 2350, "train_runtime": 166711.6274, "train_tokens_per_second": 6404.846 }, { "epoch": 2.8463190155033624, "grad_norm": 0.23392565548419952, "learning_rate": 5e-06, "loss": 0.9443, "num_input_tokens_seen": 1068211064, "step": 2351, "train_runtime": 166778.0705, "train_tokens_per_second": 6404.985 }, { "epoch": 2.8475297723209643, "grad_norm": 0.24133005738258362, "learning_rate": 5e-06, "loss": 0.9389, "num_input_tokens_seen": 1068666112, "step": 2352, "train_runtime": 166845.1732, "train_tokens_per_second": 6405.137 }, { "epoch": 2.8487405291385652, "grad_norm": 0.25078195333480835, "learning_rate": 5e-06, "loss": 0.8236, "num_input_tokens_seen": 1069135528, "step": 2353, "train_runtime": 166914.9802, "train_tokens_per_second": 6405.27 }, { "epoch": 2.849951285956167, "grad_norm": 0.2609178125858307, "learning_rate": 5e-06, "loss": 0.9244, "num_input_tokens_seen": 1069602576, "step": 2354, "train_runtime": 166984.0381, "train_tokens_per_second": 6405.418 }, { "epoch": 2.851162042773768, "grad_norm": 0.25105518102645874, "learning_rate": 5e-06, "loss": 0.907, "num_input_tokens_seen": 1070071800, "step": 2355, "train_runtime": 167053.8585, "train_tokens_per_second": 6405.55 }, { "epoch": 2.8523727995913695, "grad_norm": 0.2414802759885788, "learning_rate": 5e-06, "loss": 0.9481, "num_input_tokens_seen": 1070521528, "step": 2356, "train_runtime": 167120.4676, "train_tokens_per_second": 6405.688 }, { "epoch": 2.853583556408971, "grad_norm": 0.25838810205459595, "learning_rate": 5e-06, "loss": 0.9217, "num_input_tokens_seen": 1070945224, "step": 2357, "train_runtime": 167183.2228, "train_tokens_per_second": 6405.818 }, { "epoch": 2.8547943132265723, "grad_norm": 0.24007445573806763, "learning_rate": 5e-06, "loss": 0.9555, "num_input_tokens_seen": 1071419872, "step": 2358, "train_runtime": 167253.3886, "train_tokens_per_second": 6405.968 }, { "epoch": 2.8560050700441737, "grad_norm": 0.23948095738887787, "learning_rate": 5e-06, "loss": 0.9238, "num_input_tokens_seen": 1071871960, "step": 2359, "train_runtime": 167320.3656, "train_tokens_per_second": 6406.106 }, { "epoch": 2.857215826861775, "grad_norm": 0.24351708590984344, "learning_rate": 5e-06, "loss": 0.9199, "num_input_tokens_seen": 1072315552, "step": 2360, "train_runtime": 167386.0141, "train_tokens_per_second": 6406.243 }, { "epoch": 2.8584265836793765, "grad_norm": 0.2502671778202057, "learning_rate": 5e-06, "loss": 0.9532, "num_input_tokens_seen": 1072785312, "step": 2361, "train_runtime": 167455.5199, "train_tokens_per_second": 6406.39 }, { "epoch": 2.859637340496978, "grad_norm": 0.2391587793827057, "learning_rate": 5e-06, "loss": 0.9096, "num_input_tokens_seen": 1073222072, "step": 2362, "train_runtime": 167520.3372, "train_tokens_per_second": 6406.518 }, { "epoch": 2.8608480973145793, "grad_norm": 0.2657223641872406, "learning_rate": 5e-06, "loss": 0.9262, "num_input_tokens_seen": 1073678976, "step": 2363, "train_runtime": 167589.7726, "train_tokens_per_second": 6406.59 }, { "epoch": 2.8620588541321808, "grad_norm": 0.22655223309993744, "learning_rate": 5e-06, "loss": 0.9214, "num_input_tokens_seen": 1074141744, "step": 2364, "train_runtime": 167658.8468, "train_tokens_per_second": 6406.711 }, { "epoch": 2.863269610949782, "grad_norm": 0.22646227478981018, "learning_rate": 5e-06, "loss": 0.918, "num_input_tokens_seen": 1074598560, "step": 2365, "train_runtime": 167726.4287, "train_tokens_per_second": 6406.853 }, { "epoch": 2.864480367767383, "grad_norm": 0.23975032567977905, "learning_rate": 5e-06, "loss": 0.9442, "num_input_tokens_seen": 1075063080, "step": 2366, "train_runtime": 167794.848, "train_tokens_per_second": 6407.009 }, { "epoch": 2.865691124584985, "grad_norm": 0.2874181568622589, "learning_rate": 5e-06, "loss": 0.9366, "num_input_tokens_seen": 1075494480, "step": 2367, "train_runtime": 167858.4872, "train_tokens_per_second": 6407.15 }, { "epoch": 2.866901881402586, "grad_norm": 0.23288311064243317, "learning_rate": 5e-06, "loss": 0.8927, "num_input_tokens_seen": 1075946304, "step": 2368, "train_runtime": 167925.2658, "train_tokens_per_second": 6407.293 }, { "epoch": 2.8681126382201874, "grad_norm": 0.22665363550186157, "learning_rate": 5e-06, "loss": 0.9149, "num_input_tokens_seen": 1076407848, "step": 2369, "train_runtime": 167993.6545, "train_tokens_per_second": 6407.432 }, { "epoch": 2.8693233950377888, "grad_norm": 0.2529769539833069, "learning_rate": 5e-06, "loss": 0.9091, "num_input_tokens_seen": 1076849656, "step": 2370, "train_runtime": 168058.8499, "train_tokens_per_second": 6407.575 }, { "epoch": 2.87053415185539, "grad_norm": 0.2684330344200134, "learning_rate": 5e-06, "loss": 0.9756, "num_input_tokens_seen": 1077281544, "step": 2371, "train_runtime": 168122.0355, "train_tokens_per_second": 6407.736 }, { "epoch": 2.8717449086729916, "grad_norm": 0.2409277856349945, "learning_rate": 5e-06, "loss": 0.8855, "num_input_tokens_seen": 1077754688, "step": 2372, "train_runtime": 168192.2219, "train_tokens_per_second": 6407.875 }, { "epoch": 2.872955665490593, "grad_norm": 0.28829601407051086, "learning_rate": 5e-06, "loss": 0.9238, "num_input_tokens_seen": 1078223256, "step": 2373, "train_runtime": 168261.3837, "train_tokens_per_second": 6408.026 }, { "epoch": 2.8741664223081944, "grad_norm": 0.2507815361022949, "learning_rate": 5e-06, "loss": 0.8819, "num_input_tokens_seen": 1078704120, "step": 2374, "train_runtime": 168332.5096, "train_tokens_per_second": 6408.175 }, { "epoch": 2.875377179125796, "grad_norm": 0.24085399508476257, "learning_rate": 5e-06, "loss": 0.9693, "num_input_tokens_seen": 1079153752, "step": 2375, "train_runtime": 168398.802, "train_tokens_per_second": 6408.322 }, { "epoch": 2.8765879359433972, "grad_norm": 0.23706606030464172, "learning_rate": 5e-06, "loss": 0.931, "num_input_tokens_seen": 1079604888, "step": 2376, "train_runtime": 168465.9087, "train_tokens_per_second": 6408.447 }, { "epoch": 2.8777986927609986, "grad_norm": 0.2508695721626282, "learning_rate": 5e-06, "loss": 0.9629, "num_input_tokens_seen": 1080043568, "step": 2377, "train_runtime": 168530.2954, "train_tokens_per_second": 6408.602 }, { "epoch": 2.8790094495786, "grad_norm": 0.25791847705841064, "learning_rate": 5e-06, "loss": 0.9672, "num_input_tokens_seen": 1080495520, "step": 2378, "train_runtime": 168597.8644, "train_tokens_per_second": 6408.714 }, { "epoch": 2.880220206396201, "grad_norm": 0.24105577170848846, "learning_rate": 5e-06, "loss": 0.9289, "num_input_tokens_seen": 1080951824, "step": 2379, "train_runtime": 168665.4403, "train_tokens_per_second": 6408.852 }, { "epoch": 2.881430963213803, "grad_norm": 0.2576942443847656, "learning_rate": 5e-06, "loss": 0.953, "num_input_tokens_seen": 1081410648, "step": 2380, "train_runtime": 168733.6066, "train_tokens_per_second": 6408.982 }, { "epoch": 2.882641720031404, "grad_norm": 0.2406541258096695, "learning_rate": 5e-06, "loss": 0.96, "num_input_tokens_seen": 1081845248, "step": 2381, "train_runtime": 168797.6162, "train_tokens_per_second": 6409.126 }, { "epoch": 2.8838524768490053, "grad_norm": 0.24252809584140778, "learning_rate": 5e-06, "loss": 0.9154, "num_input_tokens_seen": 1082325608, "step": 2382, "train_runtime": 168868.6096, "train_tokens_per_second": 6409.276 }, { "epoch": 2.8850632336666067, "grad_norm": 0.23159775137901306, "learning_rate": 5e-06, "loss": 0.9114, "num_input_tokens_seen": 1082791040, "step": 2383, "train_runtime": 168937.7744, "train_tokens_per_second": 6409.408 }, { "epoch": 2.886273990484208, "grad_norm": 0.22753025591373444, "learning_rate": 5e-06, "loss": 0.8676, "num_input_tokens_seen": 1083225872, "step": 2384, "train_runtime": 169002.5358, "train_tokens_per_second": 6409.524 }, { "epoch": 2.8874847473018095, "grad_norm": 0.2409481555223465, "learning_rate": 5e-06, "loss": 0.9381, "num_input_tokens_seen": 1083672288, "step": 2385, "train_runtime": 169068.1993, "train_tokens_per_second": 6409.675 }, { "epoch": 2.888695504119411, "grad_norm": 0.2493268996477127, "learning_rate": 5e-06, "loss": 0.9092, "num_input_tokens_seen": 1084097152, "step": 2386, "train_runtime": 169130.9436, "train_tokens_per_second": 6409.81 }, { "epoch": 2.8899062609370123, "grad_norm": 0.23205333948135376, "learning_rate": 5e-06, "loss": 0.8994, "num_input_tokens_seen": 1084576688, "step": 2387, "train_runtime": 169202.0957, "train_tokens_per_second": 6409.948 }, { "epoch": 2.8911170177546137, "grad_norm": 0.2507234811782837, "learning_rate": 5e-06, "loss": 0.8826, "num_input_tokens_seen": 1085033800, "step": 2388, "train_runtime": 169269.553, "train_tokens_per_second": 6410.094 }, { "epoch": 2.892327774572215, "grad_norm": 0.23068372905254364, "learning_rate": 5e-06, "loss": 0.8812, "num_input_tokens_seen": 1085492936, "step": 2389, "train_runtime": 169337.2832, "train_tokens_per_second": 6410.242 }, { "epoch": 2.8935385313898165, "grad_norm": 0.2602866590023041, "learning_rate": 5e-06, "loss": 0.9405, "num_input_tokens_seen": 1085947584, "step": 2390, "train_runtime": 169404.5365, "train_tokens_per_second": 6410.381 }, { "epoch": 2.894749288207418, "grad_norm": 0.24214865267276764, "learning_rate": 5e-06, "loss": 0.9443, "num_input_tokens_seen": 1086393744, "step": 2391, "train_runtime": 169470.7089, "train_tokens_per_second": 6410.51 }, { "epoch": 2.895960045025019, "grad_norm": 0.24468237161636353, "learning_rate": 5e-06, "loss": 0.9122, "num_input_tokens_seen": 1086862296, "step": 2392, "train_runtime": 169540.7414, "train_tokens_per_second": 6410.626 }, { "epoch": 2.8971708018426208, "grad_norm": 0.23451176285743713, "learning_rate": 5e-06, "loss": 0.9174, "num_input_tokens_seen": 1087323160, "step": 2393, "train_runtime": 169608.3951, "train_tokens_per_second": 6410.786 }, { "epoch": 2.8983815586602217, "grad_norm": 0.2530493140220642, "learning_rate": 5e-06, "loss": 0.893, "num_input_tokens_seen": 1087787360, "step": 2394, "train_runtime": 169677.6888, "train_tokens_per_second": 6410.904 }, { "epoch": 2.899592315477823, "grad_norm": 0.24401098489761353, "learning_rate": 5e-06, "loss": 0.959, "num_input_tokens_seen": 1088250688, "step": 2395, "train_runtime": 169746.5348, "train_tokens_per_second": 6411.033 }, { "epoch": 2.9008030722954246, "grad_norm": 0.25914639234542847, "learning_rate": 5e-06, "loss": 0.9092, "num_input_tokens_seen": 1088720056, "step": 2396, "train_runtime": 169816.2236, "train_tokens_per_second": 6411.166 }, { "epoch": 2.902013829113026, "grad_norm": 0.24759583175182343, "learning_rate": 5e-06, "loss": 0.9658, "num_input_tokens_seen": 1089175536, "step": 2397, "train_runtime": 169884.2797, "train_tokens_per_second": 6411.279 }, { "epoch": 2.9032245859306274, "grad_norm": 0.23406663537025452, "learning_rate": 5e-06, "loss": 0.9147, "num_input_tokens_seen": 1089629544, "step": 2398, "train_runtime": 169951.7513, "train_tokens_per_second": 6411.405 }, { "epoch": 2.904435342748229, "grad_norm": 0.23380409181118011, "learning_rate": 5e-06, "loss": 0.8736, "num_input_tokens_seen": 1090090760, "step": 2399, "train_runtime": 170020.1649, "train_tokens_per_second": 6411.538 }, { "epoch": 2.90564609956583, "grad_norm": 0.2372436821460724, "learning_rate": 5e-06, "loss": 0.9888, "num_input_tokens_seen": 1090533784, "step": 2400, "train_runtime": 170085.5892, "train_tokens_per_second": 6411.677 }, { "epoch": 2.9068568563834316, "grad_norm": 0.23042653501033783, "learning_rate": 5e-06, "loss": 0.8597, "num_input_tokens_seen": 1090995800, "step": 2401, "train_runtime": 170153.6357, "train_tokens_per_second": 6411.828 }, { "epoch": 2.908067613201033, "grad_norm": 0.23908060789108276, "learning_rate": 5e-06, "loss": 0.9105, "num_input_tokens_seen": 1091454616, "step": 2402, "train_runtime": 170221.8303, "train_tokens_per_second": 6411.954 }, { "epoch": 2.9092783700186344, "grad_norm": 0.2331104278564453, "learning_rate": 5e-06, "loss": 0.8998, "num_input_tokens_seen": 1091927576, "step": 2403, "train_runtime": 170290.973, "train_tokens_per_second": 6412.128 }, { "epoch": 2.910489126836236, "grad_norm": 0.24983853101730347, "learning_rate": 5e-06, "loss": 0.9253, "num_input_tokens_seen": 1092399736, "step": 2404, "train_runtime": 170360.7078, "train_tokens_per_second": 6412.275 }, { "epoch": 2.911699883653837, "grad_norm": 0.2480890452861786, "learning_rate": 5e-06, "loss": 0.8912, "num_input_tokens_seen": 1092848752, "step": 2405, "train_runtime": 170427.5587, "train_tokens_per_second": 6412.395 }, { "epoch": 2.9129106404714387, "grad_norm": 0.27480000257492065, "learning_rate": 5e-06, "loss": 0.9469, "num_input_tokens_seen": 1093302472, "step": 2406, "train_runtime": 170494.7997, "train_tokens_per_second": 6412.527 }, { "epoch": 2.9141213972890396, "grad_norm": 0.23799914121627808, "learning_rate": 5e-06, "loss": 0.8758, "num_input_tokens_seen": 1093755192, "step": 2407, "train_runtime": 170561.4757, "train_tokens_per_second": 6412.674 }, { "epoch": 2.915332154106641, "grad_norm": 0.24148832261562347, "learning_rate": 5e-06, "loss": 0.9984, "num_input_tokens_seen": 1094210248, "step": 2408, "train_runtime": 170628.6094, "train_tokens_per_second": 6412.818 }, { "epoch": 2.9165429109242424, "grad_norm": 0.2666292190551758, "learning_rate": 5e-06, "loss": 0.9139, "num_input_tokens_seen": 1094648584, "step": 2409, "train_runtime": 170693.3721, "train_tokens_per_second": 6412.953 }, { "epoch": 2.917753667741844, "grad_norm": 0.26424267888069153, "learning_rate": 5e-06, "loss": 0.9111, "num_input_tokens_seen": 1095095048, "step": 2410, "train_runtime": 170759.9277, "train_tokens_per_second": 6413.068 }, { "epoch": 2.9189644245594453, "grad_norm": 0.24830442667007446, "learning_rate": 5e-06, "loss": 0.9129, "num_input_tokens_seen": 1095568936, "step": 2411, "train_runtime": 170830.0067, "train_tokens_per_second": 6413.211 }, { "epoch": 2.9201751813770467, "grad_norm": 0.2383262813091278, "learning_rate": 5e-06, "loss": 0.9718, "num_input_tokens_seen": 1096018672, "step": 2412, "train_runtime": 170896.6645, "train_tokens_per_second": 6413.342 }, { "epoch": 2.921385938194648, "grad_norm": 0.24399027228355408, "learning_rate": 5e-06, "loss": 0.8699, "num_input_tokens_seen": 1096467472, "step": 2413, "train_runtime": 170962.5005, "train_tokens_per_second": 6413.497 }, { "epoch": 2.9225966950122495, "grad_norm": 0.25469908118247986, "learning_rate": 5e-06, "loss": 0.9116, "num_input_tokens_seen": 1096916056, "step": 2414, "train_runtime": 171028.7924, "train_tokens_per_second": 6413.634 }, { "epoch": 2.923807451829851, "grad_norm": 0.24257248640060425, "learning_rate": 5e-06, "loss": 0.9307, "num_input_tokens_seen": 1097360792, "step": 2415, "train_runtime": 171094.5732, "train_tokens_per_second": 6413.767 }, { "epoch": 2.9250182086474523, "grad_norm": 0.2582697570323944, "learning_rate": 5e-06, "loss": 0.9129, "num_input_tokens_seen": 1097812096, "step": 2416, "train_runtime": 171162.4853, "train_tokens_per_second": 6413.859 }, { "epoch": 2.9262289654650537, "grad_norm": 0.25255024433135986, "learning_rate": 5e-06, "loss": 0.9374, "num_input_tokens_seen": 1098273232, "step": 2417, "train_runtime": 171231.7513, "train_tokens_per_second": 6413.958 }, { "epoch": 2.9274397222826547, "grad_norm": 0.2257550060749054, "learning_rate": 5e-06, "loss": 0.8717, "num_input_tokens_seen": 1098747232, "step": 2418, "train_runtime": 171301.7027, "train_tokens_per_second": 6414.106 }, { "epoch": 2.9286504791002566, "grad_norm": 0.2679274380207062, "learning_rate": 5e-06, "loss": 0.9413, "num_input_tokens_seen": 1099191672, "step": 2419, "train_runtime": 171368.2395, "train_tokens_per_second": 6414.209 }, { "epoch": 2.9298612359178575, "grad_norm": 0.2332017421722412, "learning_rate": 5e-06, "loss": 0.8987, "num_input_tokens_seen": 1099651328, "step": 2420, "train_runtime": 171440.7464, "train_tokens_per_second": 6414.177 }, { "epoch": 2.931071992735459, "grad_norm": 0.24200941622257233, "learning_rate": 5e-06, "loss": 0.901, "num_input_tokens_seen": 1100118920, "step": 2421, "train_runtime": 171514.9266, "train_tokens_per_second": 6414.129 }, { "epoch": 2.9322827495530603, "grad_norm": 0.25546711683273315, "learning_rate": 5e-06, "loss": 0.9191, "num_input_tokens_seen": 1100554296, "step": 2422, "train_runtime": 171583.4479, "train_tokens_per_second": 6414.105 }, { "epoch": 2.9334935063706618, "grad_norm": 0.25552940368652344, "learning_rate": 5e-06, "loss": 0.8448, "num_input_tokens_seen": 1101022976, "step": 2423, "train_runtime": 171657.7679, "train_tokens_per_second": 6414.059 }, { "epoch": 2.934704263188263, "grad_norm": 0.23404908180236816, "learning_rate": 5e-06, "loss": 0.8804, "num_input_tokens_seen": 1101486248, "step": 2424, "train_runtime": 171731.0221, "train_tokens_per_second": 6414.02 }, { "epoch": 2.9359150200058646, "grad_norm": 0.23336048424243927, "learning_rate": 5e-06, "loss": 0.8998, "num_input_tokens_seen": 1101941064, "step": 2425, "train_runtime": 171803.1314, "train_tokens_per_second": 6413.975 }, { "epoch": 2.937125776823466, "grad_norm": 0.24817214906215668, "learning_rate": 5e-06, "loss": 0.9435, "num_input_tokens_seen": 1102393528, "step": 2426, "train_runtime": 171874.1839, "train_tokens_per_second": 6413.956 }, { "epoch": 2.9383365336410674, "grad_norm": 0.25328731536865234, "learning_rate": 5e-06, "loss": 0.9053, "num_input_tokens_seen": 1102852280, "step": 2427, "train_runtime": 171946.8179, "train_tokens_per_second": 6413.915 }, { "epoch": 2.939547290458669, "grad_norm": 0.26048070192337036, "learning_rate": 5e-06, "loss": 0.9076, "num_input_tokens_seen": 1103317024, "step": 2428, "train_runtime": 172020.4148, "train_tokens_per_second": 6413.873 }, { "epoch": 2.94075804727627, "grad_norm": 0.262016236782074, "learning_rate": 5e-06, "loss": 0.9485, "num_input_tokens_seen": 1103764712, "step": 2429, "train_runtime": 172091.0589, "train_tokens_per_second": 6413.841 }, { "epoch": 2.9419688040938716, "grad_norm": 0.26942306756973267, "learning_rate": 5e-06, "loss": 0.955, "num_input_tokens_seen": 1104217608, "step": 2430, "train_runtime": 172162.6557, "train_tokens_per_second": 6413.804 }, { "epoch": 2.9431795609114726, "grad_norm": 0.23582886159420013, "learning_rate": 5e-06, "loss": 0.9273, "num_input_tokens_seen": 1104678152, "step": 2431, "train_runtime": 172234.9096, "train_tokens_per_second": 6413.788 }, { "epoch": 2.9443903177290744, "grad_norm": 0.23661422729492188, "learning_rate": 5e-06, "loss": 0.8838, "num_input_tokens_seen": 1105153976, "step": 2432, "train_runtime": 172310.273, "train_tokens_per_second": 6413.744 }, { "epoch": 2.9456010745466754, "grad_norm": 0.2636778652667999, "learning_rate": 5e-06, "loss": 0.9245, "num_input_tokens_seen": 1105621224, "step": 2433, "train_runtime": 172384.3139, "train_tokens_per_second": 6413.7 }, { "epoch": 2.9468118313642773, "grad_norm": 0.26190289855003357, "learning_rate": 5e-06, "loss": 0.9535, "num_input_tokens_seen": 1106056184, "step": 2434, "train_runtime": 172452.7145, "train_tokens_per_second": 6413.678 }, { "epoch": 2.9480225881818782, "grad_norm": 0.23981881141662598, "learning_rate": 5e-06, "loss": 0.9348, "num_input_tokens_seen": 1106502344, "step": 2435, "train_runtime": 172523.4959, "train_tokens_per_second": 6413.633 }, { "epoch": 2.9492333449994796, "grad_norm": 0.23015964031219482, "learning_rate": 5e-06, "loss": 0.9253, "num_input_tokens_seen": 1106974256, "step": 2436, "train_runtime": 172598.6333, "train_tokens_per_second": 6413.575 }, { "epoch": 2.950444101817081, "grad_norm": 0.22270654141902924, "learning_rate": 5e-06, "loss": 0.88, "num_input_tokens_seen": 1107434112, "step": 2437, "train_runtime": 172671.4841, "train_tokens_per_second": 6413.532 }, { "epoch": 2.9516548586346825, "grad_norm": 0.23962879180908203, "learning_rate": 5e-06, "loss": 0.9206, "num_input_tokens_seen": 1107898816, "step": 2438, "train_runtime": 172745.1854, "train_tokens_per_second": 6413.486 }, { "epoch": 2.952865615452284, "grad_norm": 0.2439015656709671, "learning_rate": 5e-06, "loss": 0.9122, "num_input_tokens_seen": 1108361840, "step": 2439, "train_runtime": 172818.5353, "train_tokens_per_second": 6413.443 }, { "epoch": 2.9540763722698853, "grad_norm": 0.22247134149074554, "learning_rate": 5e-06, "loss": 0.8976, "num_input_tokens_seen": 1108842328, "step": 2440, "train_runtime": 172894.6773, "train_tokens_per_second": 6413.398 }, { "epoch": 2.9552871290874867, "grad_norm": 0.22744810581207275, "learning_rate": 5e-06, "loss": 0.9221, "num_input_tokens_seen": 1109297992, "step": 2441, "train_runtime": 172966.821, "train_tokens_per_second": 6413.357 }, { "epoch": 2.956497885905088, "grad_norm": 0.2583228051662445, "learning_rate": 5e-06, "loss": 0.9173, "num_input_tokens_seen": 1109745168, "step": 2442, "train_runtime": 173037.3433, "train_tokens_per_second": 6413.328 }, { "epoch": 2.9577086427226895, "grad_norm": 0.2402677983045578, "learning_rate": 5e-06, "loss": 0.878, "num_input_tokens_seen": 1110207960, "step": 2443, "train_runtime": 173110.5564, "train_tokens_per_second": 6413.289 }, { "epoch": 2.958919399540291, "grad_norm": 0.23672647774219513, "learning_rate": 5e-06, "loss": 0.8765, "num_input_tokens_seen": 1110649312, "step": 2444, "train_runtime": 173180.6289, "train_tokens_per_second": 6413.242 }, { "epoch": 2.9601301563578923, "grad_norm": 0.2683030068874359, "learning_rate": 5e-06, "loss": 0.9375, "num_input_tokens_seen": 1111089320, "step": 2445, "train_runtime": 173250.1673, "train_tokens_per_second": 6413.208 }, { "epoch": 2.9613409131754933, "grad_norm": 0.25095537304878235, "learning_rate": 5e-06, "loss": 0.9354, "num_input_tokens_seen": 1111530136, "step": 2446, "train_runtime": 173320.2042, "train_tokens_per_second": 6413.16 }, { "epoch": 2.962551669993095, "grad_norm": 0.25182783603668213, "learning_rate": 5e-06, "loss": 0.9433, "num_input_tokens_seen": 1111995088, "step": 2447, "train_runtime": 173393.9827, "train_tokens_per_second": 6413.112 }, { "epoch": 2.963762426810696, "grad_norm": 0.270939439535141, "learning_rate": 5e-06, "loss": 0.8994, "num_input_tokens_seen": 1112439680, "step": 2448, "train_runtime": 173464.0985, "train_tokens_per_second": 6413.083 }, { "epoch": 2.9649731836282975, "grad_norm": 0.25373977422714233, "learning_rate": 5e-06, "loss": 0.9124, "num_input_tokens_seen": 1112881368, "step": 2449, "train_runtime": 173533.7189, "train_tokens_per_second": 6413.055 }, { "epoch": 2.966183940445899, "grad_norm": 0.24695639312267303, "learning_rate": 5e-06, "loss": 0.8949, "num_input_tokens_seen": 1113344440, "step": 2450, "train_runtime": 173606.9361, "train_tokens_per_second": 6413.018 }, { "epoch": 2.9673946972635004, "grad_norm": 0.24027635157108307, "learning_rate": 5e-06, "loss": 0.9779, "num_input_tokens_seen": 1113813744, "step": 2451, "train_runtime": 173681.1928, "train_tokens_per_second": 6412.978 }, { "epoch": 2.9686054540811018, "grad_norm": 0.2398044615983963, "learning_rate": 5e-06, "loss": 0.8824, "num_input_tokens_seen": 1114270416, "step": 2452, "train_runtime": 173753.6, "train_tokens_per_second": 6412.934 }, { "epoch": 2.969816210898703, "grad_norm": 0.27489855885505676, "learning_rate": 5e-06, "loss": 0.8882, "num_input_tokens_seen": 1114724384, "step": 2453, "train_runtime": 173825.6733, "train_tokens_per_second": 6412.887 }, { "epoch": 2.9710269677163046, "grad_norm": 0.26074662804603577, "learning_rate": 5e-06, "loss": 0.9524, "num_input_tokens_seen": 1115177168, "step": 2454, "train_runtime": 173897.6763, "train_tokens_per_second": 6412.835 }, { "epoch": 2.972237724533906, "grad_norm": 0.24579590559005737, "learning_rate": 5e-06, "loss": 0.9598, "num_input_tokens_seen": 1115634232, "step": 2455, "train_runtime": 173970.4648, "train_tokens_per_second": 6412.78 }, { "epoch": 2.9734484813515074, "grad_norm": 0.22661468386650085, "learning_rate": 5e-06, "loss": 0.8578, "num_input_tokens_seen": 1116124864, "step": 2456, "train_runtime": 174048.669, "train_tokens_per_second": 6412.717 }, { "epoch": 2.974659238169109, "grad_norm": 0.26638656854629517, "learning_rate": 5e-06, "loss": 0.9083, "num_input_tokens_seen": 1116575904, "step": 2457, "train_runtime": 174120.1916, "train_tokens_per_second": 6412.673 }, { "epoch": 2.9758699949867102, "grad_norm": 0.2577857971191406, "learning_rate": 5e-06, "loss": 0.8847, "num_input_tokens_seen": 1117030408, "step": 2458, "train_runtime": 174191.0573, "train_tokens_per_second": 6412.674 }, { "epoch": 2.977080751804311, "grad_norm": 0.23168501257896423, "learning_rate": 5e-06, "loss": 0.9056, "num_input_tokens_seen": 1117520192, "step": 2459, "train_runtime": 174267.8102, "train_tokens_per_second": 6412.66 }, { "epoch": 2.978291508621913, "grad_norm": 0.25029903650283813, "learning_rate": 5e-06, "loss": 0.9101, "num_input_tokens_seen": 1117976192, "step": 2460, "train_runtime": 174339.6479, "train_tokens_per_second": 6412.633 }, { "epoch": 2.979502265439514, "grad_norm": 0.23616862297058105, "learning_rate": 5e-06, "loss": 0.9479, "num_input_tokens_seen": 1118420800, "step": 2461, "train_runtime": 174409.7079, "train_tokens_per_second": 6412.606 }, { "epoch": 2.9807130222571154, "grad_norm": 0.24392381310462952, "learning_rate": 5e-06, "loss": 0.9767, "num_input_tokens_seen": 1118884408, "step": 2462, "train_runtime": 174483.4587, "train_tokens_per_second": 6412.553 }, { "epoch": 2.981923779074717, "grad_norm": 0.23490194976329803, "learning_rate": 5e-06, "loss": 0.8814, "num_input_tokens_seen": 1119381392, "step": 2463, "train_runtime": 174562.9683, "train_tokens_per_second": 6412.479 }, { "epoch": 2.9831345358923183, "grad_norm": 0.2503698170185089, "learning_rate": 5e-06, "loss": 0.9589, "num_input_tokens_seen": 1119802384, "step": 2464, "train_runtime": 174629.2291, "train_tokens_per_second": 6412.457 }, { "epoch": 2.9843452927099197, "grad_norm": 0.2408633977174759, "learning_rate": 5e-06, "loss": 0.967, "num_input_tokens_seen": 1120232368, "step": 2465, "train_runtime": 174696.7418, "train_tokens_per_second": 6412.44 }, { "epoch": 2.985556049527521, "grad_norm": 0.23038393259048462, "learning_rate": 5e-06, "loss": 0.8323, "num_input_tokens_seen": 1120694192, "step": 2466, "train_runtime": 174769.9103, "train_tokens_per_second": 6412.398 }, { "epoch": 2.9867668063451225, "grad_norm": 0.23888365924358368, "learning_rate": 5e-06, "loss": 0.9043, "num_input_tokens_seen": 1121142168, "step": 2467, "train_runtime": 174840.0255, "train_tokens_per_second": 6412.388 }, { "epoch": 2.987977563162724, "grad_norm": 0.2362690418958664, "learning_rate": 5e-06, "loss": 0.9153, "num_input_tokens_seen": 1121609584, "step": 2468, "train_runtime": 174913.8632, "train_tokens_per_second": 6412.354 }, { "epoch": 2.9891883199803253, "grad_norm": 0.2894575595855713, "learning_rate": 5e-06, "loss": 0.939, "num_input_tokens_seen": 1122044904, "step": 2469, "train_runtime": 174982.5392, "train_tokens_per_second": 6412.325 }, { "epoch": 2.9903990767979267, "grad_norm": 0.24327421188354492, "learning_rate": 5e-06, "loss": 0.8949, "num_input_tokens_seen": 1122496600, "step": 2470, "train_runtime": 175053.549, "train_tokens_per_second": 6412.304 }, { "epoch": 2.991609833615528, "grad_norm": 0.2470681220293045, "learning_rate": 5e-06, "loss": 0.8861, "num_input_tokens_seen": 1122965344, "step": 2471, "train_runtime": 175127.8031, "train_tokens_per_second": 6412.262 }, { "epoch": 2.992820590433129, "grad_norm": 0.2589993476867676, "learning_rate": 5e-06, "loss": 0.8901, "num_input_tokens_seen": 1123437960, "step": 2472, "train_runtime": 175202.8631, "train_tokens_per_second": 6412.212 }, { "epoch": 2.994031347250731, "grad_norm": 0.2532251179218292, "learning_rate": 5e-06, "loss": 0.902, "num_input_tokens_seen": 1123894464, "step": 2473, "train_runtime": 175275.4878, "train_tokens_per_second": 6412.16 }, { "epoch": 2.995242104068332, "grad_norm": 0.23978720605373383, "learning_rate": 5e-06, "loss": 0.892, "num_input_tokens_seen": 1124361120, "step": 2474, "train_runtime": 175348.7702, "train_tokens_per_second": 6412.141 }, { "epoch": 2.9964528608859333, "grad_norm": 0.24950125813484192, "learning_rate": 5e-06, "loss": 0.918, "num_input_tokens_seen": 1124811584, "step": 2475, "train_runtime": 175417.2461, "train_tokens_per_second": 6412.206 }, { "epoch": 2.9976636177035347, "grad_norm": 0.2536337971687317, "learning_rate": 5e-06, "loss": 0.9361, "num_input_tokens_seen": 1125259760, "step": 2476, "train_runtime": 175487.7712, "train_tokens_per_second": 6412.183 }, { "epoch": 2.998874374521136, "grad_norm": 0.23223650455474854, "learning_rate": 5e-06, "loss": 0.8352, "num_input_tokens_seen": 1125711656, "step": 2477, "train_runtime": 175556.6609, "train_tokens_per_second": 6412.241 }, { "epoch": 3.0, "grad_norm": 0.25418493151664734, "learning_rate": 5e-06, "loss": 0.9392, "num_input_tokens_seen": 1126151688, "step": 2478, "train_runtime": 175624.5645, "train_tokens_per_second": 6412.268 }, { "epoch": 3.0, "num_input_tokens_seen": 1126151688, "step": 2478, "total_flos": 2.4182853777648783e+18, "train_loss": 0.9592450147342836, "train_runtime": 175626.0126, "train_samples_per_second": 3.612, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 2478, "num_input_tokens_seen": 1126151688, "num_train_epochs": 3, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4182853777648783e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }