diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24824 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 2478, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012107568176013773, + "grad_norm": 1.6823863983154297, + "learning_rate": 0.0, + "loss": 1.3109, + "num_input_tokens_seen": 438344, + "step": 1, + "train_runtime": 66.1532, + "train_tokens_per_second": 6626.197 + }, + { + "epoch": 0.0024215136352027547, + "grad_norm": 1.6656533479690552, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.2706, + "num_input_tokens_seen": 897912, + "step": 2, + "train_runtime": 134.7812, + "train_tokens_per_second": 6661.997 + }, + { + "epoch": 0.003632270452804132, + "grad_norm": 1.694225788116455, + "learning_rate": 5.000000000000001e-07, + "loss": 1.3045, + "num_input_tokens_seen": 1322736, + "step": 3, + "train_runtime": 199.1648, + "train_tokens_per_second": 6641.415 + }, + { + "epoch": 0.004843027270405509, + "grad_norm": 1.6565794944763184, + "learning_rate": 7.5e-07, + "loss": 1.2432, + "num_input_tokens_seen": 1776136, + "step": 4, + "train_runtime": 267.5205, + "train_tokens_per_second": 6639.252 + }, + { + "epoch": 0.006053784088006886, + "grad_norm": 1.7125741243362427, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.2465, + "num_input_tokens_seen": 2216360, + "step": 5, + "train_runtime": 333.3919, + "train_tokens_per_second": 6647.912 + }, + { + "epoch": 0.007264540905608264, + "grad_norm": 1.627602219581604, + "learning_rate": 1.25e-06, + "loss": 1.2298, + "num_input_tokens_seen": 2672560, + "step": 6, + "train_runtime": 401.2707, + "train_tokens_per_second": 6660.242 + }, + { + "epoch": 0.008475297723209641, + "grad_norm": 1.677027702331543, + "learning_rate": 1.5e-06, + "loss": 1.2147, + "num_input_tokens_seen": 3146496, + "step": 7, + "train_runtime": 471.6412, + "train_tokens_per_second": 6671.376 + }, + { + "epoch": 0.009686054540811019, + "grad_norm": 1.6377135515213013, + "learning_rate": 1.75e-06, + "loss": 1.2696, + "num_input_tokens_seen": 3600320, + "step": 8, + "train_runtime": 539.1503, + "train_tokens_per_second": 6677.767 + }, + { + "epoch": 0.010896811358412395, + "grad_norm": 1.651689052581787, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.3134, + "num_input_tokens_seen": 4040512, + "step": 9, + "train_runtime": 604.8873, + "train_tokens_per_second": 6679.776 + }, + { + "epoch": 0.012107568176013772, + "grad_norm": 1.588644027709961, + "learning_rate": 2.25e-06, + "loss": 1.2148, + "num_input_tokens_seen": 4476856, + "step": 10, + "train_runtime": 669.9676, + "train_tokens_per_second": 6682.198 + }, + { + "epoch": 0.01331832499361515, + "grad_norm": 1.54507315158844, + "learning_rate": 2.5e-06, + "loss": 1.2446, + "num_input_tokens_seen": 4922928, + "step": 11, + "train_runtime": 738.3143, + "train_tokens_per_second": 6667.795 + }, + { + "epoch": 0.014529081811216527, + "grad_norm": 1.5578962564468384, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.2765, + "num_input_tokens_seen": 5366568, + "step": 12, + "train_runtime": 805.1876, + "train_tokens_per_second": 6664.991 + }, + { + "epoch": 0.015739838628817903, + "grad_norm": 1.2954202890396118, + "learning_rate": 3e-06, + "loss": 1.2142, + "num_input_tokens_seen": 5805136, + "step": 13, + "train_runtime": 875.9211, + "train_tokens_per_second": 6627.465 + }, + { + "epoch": 0.016950595446419282, + "grad_norm": 1.2545137405395508, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.1928, + "num_input_tokens_seen": 6269184, + "step": 14, + "train_runtime": 950.6377, + "train_tokens_per_second": 6594.714 + }, + { + "epoch": 0.018161352264020658, + "grad_norm": 1.2325160503387451, + "learning_rate": 3.5e-06, + "loss": 1.2354, + "num_input_tokens_seen": 6723088, + "step": 15, + "train_runtime": 1023.1734, + "train_tokens_per_second": 6570.82 + }, + { + "epoch": 0.019372109081622037, + "grad_norm": 1.1806299686431885, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.2273, + "num_input_tokens_seen": 7181184, + "step": 16, + "train_runtime": 1097.2469, + "train_tokens_per_second": 6544.729 + }, + { + "epoch": 0.020582865899223413, + "grad_norm": 1.1713570356369019, + "learning_rate": 4.000000000000001e-06, + "loss": 1.1806, + "num_input_tokens_seen": 7621296, + "step": 17, + "train_runtime": 1167.7968, + "train_tokens_per_second": 6526.218 + }, + { + "epoch": 0.02179362271682479, + "grad_norm": 0.7615021467208862, + "learning_rate": 4.25e-06, + "loss": 1.2043, + "num_input_tokens_seen": 8091120, + "step": 18, + "train_runtime": 1242.7454, + "train_tokens_per_second": 6510.682 + }, + { + "epoch": 0.02300437953442617, + "grad_norm": 0.7624850273132324, + "learning_rate": 4.5e-06, + "loss": 1.152, + "num_input_tokens_seen": 8549696, + "step": 19, + "train_runtime": 1316.11, + "train_tokens_per_second": 6496.187 + }, + { + "epoch": 0.024215136352027544, + "grad_norm": 0.753814160823822, + "learning_rate": 4.75e-06, + "loss": 1.1759, + "num_input_tokens_seen": 9015664, + "step": 20, + "train_runtime": 1390.9508, + "train_tokens_per_second": 6481.656 + }, + { + "epoch": 0.025425893169628924, + "grad_norm": 0.7244720458984375, + "learning_rate": 5e-06, + "loss": 1.1802, + "num_input_tokens_seen": 9456440, + "step": 21, + "train_runtime": 1461.1875, + "train_tokens_per_second": 6471.75 + }, + { + "epoch": 0.0266366499872303, + "grad_norm": 0.7659462094306946, + "learning_rate": 5e-06, + "loss": 1.1811, + "num_input_tokens_seen": 9899784, + "step": 22, + "train_runtime": 1530.8483, + "train_tokens_per_second": 6466.861 + }, + { + "epoch": 0.027847406804831675, + "grad_norm": 0.795974612236023, + "learning_rate": 5e-06, + "loss": 1.1331, + "num_input_tokens_seen": 10351040, + "step": 23, + "train_runtime": 1602.2727, + "train_tokens_per_second": 6460.224 + }, + { + "epoch": 0.029058163622433054, + "grad_norm": 0.7971475720405579, + "learning_rate": 5e-06, + "loss": 1.1689, + "num_input_tokens_seen": 10818480, + "step": 24, + "train_runtime": 1675.7391, + "train_tokens_per_second": 6455.945 + }, + { + "epoch": 0.03026892044003443, + "grad_norm": 0.9564425945281982, + "learning_rate": 5e-06, + "loss": 1.1556, + "num_input_tokens_seen": 11266424, + "step": 25, + "train_runtime": 1746.8035, + "train_tokens_per_second": 6449.737 + }, + { + "epoch": 0.031479677257635806, + "grad_norm": 1.0319074392318726, + "learning_rate": 5e-06, + "loss": 1.1656, + "num_input_tokens_seen": 11711824, + "step": 26, + "train_runtime": 1817.4337, + "train_tokens_per_second": 6444.155 + }, + { + "epoch": 0.03269043407523718, + "grad_norm": 0.9839694499969482, + "learning_rate": 5e-06, + "loss": 1.1934, + "num_input_tokens_seen": 12167904, + "step": 27, + "train_runtime": 1890.2289, + "train_tokens_per_second": 6437.265 + }, + { + "epoch": 0.033901190892838565, + "grad_norm": 1.0027241706848145, + "learning_rate": 5e-06, + "loss": 1.2015, + "num_input_tokens_seen": 12603712, + "step": 28, + "train_runtime": 1958.8389, + "train_tokens_per_second": 6434.277 + }, + { + "epoch": 0.03511194771043994, + "grad_norm": 1.0292820930480957, + "learning_rate": 5e-06, + "loss": 1.114, + "num_input_tokens_seen": 13045496, + "step": 29, + "train_runtime": 2028.3291, + "train_tokens_per_second": 6431.647 + }, + { + "epoch": 0.036322704528041316, + "grad_norm": 0.8715880513191223, + "learning_rate": 5e-06, + "loss": 1.1285, + "num_input_tokens_seen": 13489440, + "step": 30, + "train_runtime": 2099.3816, + "train_tokens_per_second": 6425.435 + }, + { + "epoch": 0.03753346134564269, + "grad_norm": 0.782746434211731, + "learning_rate": 5e-06, + "loss": 1.1129, + "num_input_tokens_seen": 13935984, + "step": 31, + "train_runtime": 2170.7426, + "train_tokens_per_second": 6419.916 + }, + { + "epoch": 0.038744218163244075, + "grad_norm": 0.6815439462661743, + "learning_rate": 5e-06, + "loss": 1.0844, + "num_input_tokens_seen": 14376552, + "step": 32, + "train_runtime": 2240.9154, + "train_tokens_per_second": 6415.482 + }, + { + "epoch": 0.03995497498084545, + "grad_norm": 0.5916255116462708, + "learning_rate": 5e-06, + "loss": 1.1448, + "num_input_tokens_seen": 14801448, + "step": 33, + "train_runtime": 2309.3718, + "train_tokens_per_second": 6409.296 + }, + { + "epoch": 0.04116573179844683, + "grad_norm": 0.5178527235984802, + "learning_rate": 5e-06, + "loss": 1.1492, + "num_input_tokens_seen": 15254568, + "step": 34, + "train_runtime": 2381.7952, + "train_tokens_per_second": 6404.651 + }, + { + "epoch": 0.0423764886160482, + "grad_norm": 0.4729219675064087, + "learning_rate": 5e-06, + "loss": 1.1012, + "num_input_tokens_seen": 15721704, + "step": 35, + "train_runtime": 2456.0312, + "train_tokens_per_second": 6401.264 + }, + { + "epoch": 0.04358724543364958, + "grad_norm": 0.4695061147212982, + "learning_rate": 5e-06, + "loss": 1.1549, + "num_input_tokens_seen": 16169016, + "step": 36, + "train_runtime": 2527.4009, + "train_tokens_per_second": 6397.488 + }, + { + "epoch": 0.04479800225125096, + "grad_norm": 0.4953579306602478, + "learning_rate": 5e-06, + "loss": 1.0791, + "num_input_tokens_seen": 16632416, + "step": 37, + "train_runtime": 2602.077, + "train_tokens_per_second": 6391.977 + }, + { + "epoch": 0.04600875906885234, + "grad_norm": 0.5437090396881104, + "learning_rate": 5e-06, + "loss": 1.166, + "num_input_tokens_seen": 17072064, + "step": 38, + "train_runtime": 2672.2934, + "train_tokens_per_second": 6388.544 + }, + { + "epoch": 0.04721951588645371, + "grad_norm": 0.49670565128326416, + "learning_rate": 5e-06, + "loss": 1.1278, + "num_input_tokens_seen": 17521824, + "step": 39, + "train_runtime": 2744.1029, + "train_tokens_per_second": 6385.265 + }, + { + "epoch": 0.04843027270405509, + "grad_norm": 0.5088937878608704, + "learning_rate": 5e-06, + "loss": 1.1204, + "num_input_tokens_seen": 17970488, + "step": 40, + "train_runtime": 2816.1612, + "train_tokens_per_second": 6381.2 + }, + { + "epoch": 0.049641029521656464, + "grad_norm": 0.45026877522468567, + "learning_rate": 5e-06, + "loss": 1.1271, + "num_input_tokens_seen": 18415968, + "step": 41, + "train_runtime": 2887.7895, + "train_tokens_per_second": 6377.185 + }, + { + "epoch": 0.05085178633925785, + "grad_norm": 0.450920969247818, + "learning_rate": 5e-06, + "loss": 1.1152, + "num_input_tokens_seen": 18872720, + "step": 42, + "train_runtime": 2961.0904, + "train_tokens_per_second": 6373.571 + }, + { + "epoch": 0.05206254315685922, + "grad_norm": 0.3853777050971985, + "learning_rate": 5e-06, + "loss": 1.0648, + "num_input_tokens_seen": 19341632, + "step": 43, + "train_runtime": 3036.043, + "train_tokens_per_second": 6370.671 + }, + { + "epoch": 0.0532732999744606, + "grad_norm": 0.37567827105522156, + "learning_rate": 5e-06, + "loss": 1.1503, + "num_input_tokens_seen": 19774192, + "step": 44, + "train_runtime": 3105.6604, + "train_tokens_per_second": 6367.146 + }, + { + "epoch": 0.054484056792061974, + "grad_norm": 0.3511997163295746, + "learning_rate": 5e-06, + "loss": 1.1209, + "num_input_tokens_seen": 20246224, + "step": 45, + "train_runtime": 3181.7883, + "train_tokens_per_second": 6363.159 + }, + { + "epoch": 0.05569481360966335, + "grad_norm": 0.3575429618358612, + "learning_rate": 5e-06, + "loss": 1.073, + "num_input_tokens_seen": 20721848, + "step": 46, + "train_runtime": 3258.4956, + "train_tokens_per_second": 6359.33 + }, + { + "epoch": 0.05690557042726473, + "grad_norm": 0.32883220911026, + "learning_rate": 5e-06, + "loss": 1.0287, + "num_input_tokens_seen": 21204880, + "step": 47, + "train_runtime": 3336.1547, + "train_tokens_per_second": 6356.084 + }, + { + "epoch": 0.05811632724486611, + "grad_norm": 0.3266335129737854, + "learning_rate": 5e-06, + "loss": 1.1667, + "num_input_tokens_seen": 21643792, + "step": 48, + "train_runtime": 3406.2845, + "train_tokens_per_second": 6354.076 + }, + { + "epoch": 0.059327084062467485, + "grad_norm": 0.32436686754226685, + "learning_rate": 5e-06, + "loss": 1.0979, + "num_input_tokens_seen": 22102080, + "step": 49, + "train_runtime": 3478.6822, + "train_tokens_per_second": 6353.578 + }, + { + "epoch": 0.06053784088006886, + "grad_norm": 0.3160610795021057, + "learning_rate": 5e-06, + "loss": 1.0757, + "num_input_tokens_seen": 22560080, + "step": 50, + "train_runtime": 3550.9743, + "train_tokens_per_second": 6353.209 + }, + { + "epoch": 0.061748597697670236, + "grad_norm": 0.3259732127189636, + "learning_rate": 5e-06, + "loss": 1.0585, + "num_input_tokens_seen": 23012792, + "step": 51, + "train_runtime": 3623.5659, + "train_tokens_per_second": 6350.869 + }, + { + "epoch": 0.06295935451527161, + "grad_norm": 0.3129977881908417, + "learning_rate": 5e-06, + "loss": 1.0892, + "num_input_tokens_seen": 23481928, + "step": 52, + "train_runtime": 3698.7937, + "train_tokens_per_second": 6348.537 + }, + { + "epoch": 0.064170111332873, + "grad_norm": 0.31302887201309204, + "learning_rate": 5e-06, + "loss": 1.107, + "num_input_tokens_seen": 23929328, + "step": 53, + "train_runtime": 3770.8893, + "train_tokens_per_second": 6345.805 + }, + { + "epoch": 0.06538086815047436, + "grad_norm": 0.30268368124961853, + "learning_rate": 5e-06, + "loss": 1.0769, + "num_input_tokens_seen": 24389344, + "step": 54, + "train_runtime": 3845.4071, + "train_tokens_per_second": 6342.461 + }, + { + "epoch": 0.06659162496807575, + "grad_norm": 0.3023386299610138, + "learning_rate": 5e-06, + "loss": 1.0904, + "num_input_tokens_seen": 24835992, + "step": 55, + "train_runtime": 3917.1748, + "train_tokens_per_second": 6340.282 + }, + { + "epoch": 0.06780238178567713, + "grad_norm": 0.3157775104045868, + "learning_rate": 5e-06, + "loss": 1.0838, + "num_input_tokens_seen": 25287800, + "step": 56, + "train_runtime": 3989.6387, + "train_tokens_per_second": 6338.368 + }, + { + "epoch": 0.0690131386032785, + "grad_norm": 0.3070801794528961, + "learning_rate": 5e-06, + "loss": 1.1042, + "num_input_tokens_seen": 25726600, + "step": 57, + "train_runtime": 4059.4983, + "train_tokens_per_second": 6337.384 + }, + { + "epoch": 0.07022389542087988, + "grad_norm": 0.2750767469406128, + "learning_rate": 5e-06, + "loss": 1.0938, + "num_input_tokens_seen": 26197136, + "step": 58, + "train_runtime": 4135.3446, + "train_tokens_per_second": 6334.934 + }, + { + "epoch": 0.07143465223848126, + "grad_norm": 0.32206404209136963, + "learning_rate": 5e-06, + "loss": 1.1449, + "num_input_tokens_seen": 26648344, + "step": 59, + "train_runtime": 4206.337, + "train_tokens_per_second": 6335.285 + }, + { + "epoch": 0.07264540905608263, + "grad_norm": 0.27299636602401733, + "learning_rate": 5e-06, + "loss": 1.063, + "num_input_tokens_seen": 27117640, + "step": 60, + "train_runtime": 4279.87, + "train_tokens_per_second": 6336.09 + }, + { + "epoch": 0.07385616587368402, + "grad_norm": 0.3023524582386017, + "learning_rate": 5e-06, + "loss": 1.0814, + "num_input_tokens_seen": 27558016, + "step": 61, + "train_runtime": 4349.9557, + "train_tokens_per_second": 6335.241 + }, + { + "epoch": 0.07506692269128538, + "grad_norm": 0.3390548527240753, + "learning_rate": 5e-06, + "loss": 1.0838, + "num_input_tokens_seen": 28010840, + "step": 62, + "train_runtime": 4423.2036, + "train_tokens_per_second": 6332.704 + }, + { + "epoch": 0.07627767950888677, + "grad_norm": 0.3006073832511902, + "learning_rate": 5e-06, + "loss": 1.0334, + "num_input_tokens_seen": 28461840, + "step": 63, + "train_runtime": 4496.9762, + "train_tokens_per_second": 6329.106 + }, + { + "epoch": 0.07748843632648815, + "grad_norm": 0.30531835556030273, + "learning_rate": 5e-06, + "loss": 1.1794, + "num_input_tokens_seen": 28903208, + "step": 64, + "train_runtime": 4568.0669, + "train_tokens_per_second": 6327.23 + }, + { + "epoch": 0.07869919314408952, + "grad_norm": 0.2855227589607239, + "learning_rate": 5e-06, + "loss": 1.0802, + "num_input_tokens_seen": 29372160, + "step": 65, + "train_runtime": 4642.0795, + "train_tokens_per_second": 6327.371 + }, + { + "epoch": 0.0799099499616909, + "grad_norm": 0.2859865725040436, + "learning_rate": 5e-06, + "loss": 1.1396, + "num_input_tokens_seen": 29835496, + "step": 66, + "train_runtime": 4715.6083, + "train_tokens_per_second": 6326.967 + }, + { + "epoch": 0.08112070677929227, + "grad_norm": 0.28807154297828674, + "learning_rate": 5e-06, + "loss": 1.0579, + "num_input_tokens_seen": 30301072, + "step": 67, + "train_runtime": 4789.8228, + "train_tokens_per_second": 6326.136 + }, + { + "epoch": 0.08233146359689365, + "grad_norm": 0.27400127053260803, + "learning_rate": 5e-06, + "loss": 1.0897, + "num_input_tokens_seen": 30761224, + "step": 68, + "train_runtime": 4863.205, + "train_tokens_per_second": 6325.299 + }, + { + "epoch": 0.08354222041449504, + "grad_norm": 0.27055230736732483, + "learning_rate": 5e-06, + "loss": 1.0776, + "num_input_tokens_seen": 31235312, + "step": 69, + "train_runtime": 4939.24, + "train_tokens_per_second": 6323.911 + }, + { + "epoch": 0.0847529772320964, + "grad_norm": 0.29049232602119446, + "learning_rate": 5e-06, + "loss": 1.0942, + "num_input_tokens_seen": 31715944, + "step": 70, + "train_runtime": 5016.1629, + "train_tokens_per_second": 6322.75 + }, + { + "epoch": 0.08596373404969779, + "grad_norm": 0.28521451354026794, + "learning_rate": 5e-06, + "loss": 1.1107, + "num_input_tokens_seen": 32153488, + "step": 71, + "train_runtime": 5087.4742, + "train_tokens_per_second": 6320.128 + }, + { + "epoch": 0.08717449086729916, + "grad_norm": 0.27909162640571594, + "learning_rate": 5e-06, + "loss": 1.1105, + "num_input_tokens_seen": 32614936, + "step": 72, + "train_runtime": 5163.3824, + "train_tokens_per_second": 6316.584 + }, + { + "epoch": 0.08838524768490054, + "grad_norm": 0.2773616909980774, + "learning_rate": 5e-06, + "loss": 1.0926, + "num_input_tokens_seen": 33066032, + "step": 73, + "train_runtime": 5236.3763, + "train_tokens_per_second": 6314.678 + }, + { + "epoch": 0.08959600450250192, + "grad_norm": 0.2607426047325134, + "learning_rate": 5e-06, + "loss": 1.0681, + "num_input_tokens_seen": 33519952, + "step": 74, + "train_runtime": 5309.8276, + "train_tokens_per_second": 6312.814 + }, + { + "epoch": 0.09080676132010329, + "grad_norm": 0.3017564117908478, + "learning_rate": 5e-06, + "loss": 1.1197, + "num_input_tokens_seen": 33979056, + "step": 75, + "train_runtime": 5383.9039, + "train_tokens_per_second": 6311.23 + }, + { + "epoch": 0.09201751813770467, + "grad_norm": 0.25366899371147156, + "learning_rate": 5e-06, + "loss": 1.0534, + "num_input_tokens_seen": 34445408, + "step": 76, + "train_runtime": 5459.5423, + "train_tokens_per_second": 6309.212 + }, + { + "epoch": 0.09322827495530604, + "grad_norm": 0.30008700489997864, + "learning_rate": 5e-06, + "loss": 1.0647, + "num_input_tokens_seen": 34883760, + "step": 77, + "train_runtime": 5529.7451, + "train_tokens_per_second": 6308.385 + }, + { + "epoch": 0.09443903177290743, + "grad_norm": 0.288265198469162, + "learning_rate": 5e-06, + "loss": 1.1079, + "num_input_tokens_seen": 35340528, + "step": 78, + "train_runtime": 5604.8498, + "train_tokens_per_second": 6305.348 + }, + { + "epoch": 0.09564978859050881, + "grad_norm": 0.27486133575439453, + "learning_rate": 5e-06, + "loss": 1.0545, + "num_input_tokens_seen": 35797704, + "step": 79, + "train_runtime": 5678.1881, + "train_tokens_per_second": 6304.424 + }, + { + "epoch": 0.09686054540811018, + "grad_norm": 0.2748127281665802, + "learning_rate": 5e-06, + "loss": 1.0813, + "num_input_tokens_seen": 36242296, + "step": 80, + "train_runtime": 5749.0438, + "train_tokens_per_second": 6304.056 + }, + { + "epoch": 0.09807130222571156, + "grad_norm": 0.25881466269493103, + "learning_rate": 5e-06, + "loss": 1.0469, + "num_input_tokens_seen": 36731608, + "step": 81, + "train_runtime": 5827.2786, + "train_tokens_per_second": 6303.39 + }, + { + "epoch": 0.09928205904331293, + "grad_norm": 0.25870904326438904, + "learning_rate": 5e-06, + "loss": 1.0211, + "num_input_tokens_seen": 37192232, + "step": 82, + "train_runtime": 5900.4297, + "train_tokens_per_second": 6303.309 + }, + { + "epoch": 0.10049281586091431, + "grad_norm": 0.2989208996295929, + "learning_rate": 5e-06, + "loss": 1.094, + "num_input_tokens_seen": 37644536, + "step": 83, + "train_runtime": 5972.5325, + "train_tokens_per_second": 6302.944 + }, + { + "epoch": 0.1017035726785157, + "grad_norm": 0.2510150671005249, + "learning_rate": 5e-06, + "loss": 1.0657, + "num_input_tokens_seen": 38088800, + "step": 84, + "train_runtime": 6043.5299, + "train_tokens_per_second": 6302.41 + }, + { + "epoch": 0.10291432949611706, + "grad_norm": 0.25874075293540955, + "learning_rate": 5e-06, + "loss": 1.083, + "num_input_tokens_seen": 38544872, + "step": 85, + "train_runtime": 6115.5221, + "train_tokens_per_second": 6302.793 + }, + { + "epoch": 0.10412508631371845, + "grad_norm": 0.2325299233198166, + "learning_rate": 5e-06, + "loss": 1.0115, + "num_input_tokens_seen": 39021888, + "step": 86, + "train_runtime": 6193.1133, + "train_tokens_per_second": 6300.852 + }, + { + "epoch": 0.10533584313131981, + "grad_norm": 0.24345119297504425, + "learning_rate": 5e-06, + "loss": 1.0333, + "num_input_tokens_seen": 39493872, + "step": 87, + "train_runtime": 6268.3853, + "train_tokens_per_second": 6300.486 + }, + { + "epoch": 0.1065465999489212, + "grad_norm": 0.26478031277656555, + "learning_rate": 5e-06, + "loss": 1.0525, + "num_input_tokens_seen": 39939016, + "step": 88, + "train_runtime": 6339.297, + "train_tokens_per_second": 6300.228 + }, + { + "epoch": 0.10775735676652258, + "grad_norm": 0.24371357262134552, + "learning_rate": 5e-06, + "loss": 1.0548, + "num_input_tokens_seen": 40423776, + "step": 89, + "train_runtime": 6415.9537, + "train_tokens_per_second": 6300.509 + }, + { + "epoch": 0.10896811358412395, + "grad_norm": 0.25180429220199585, + "learning_rate": 5e-06, + "loss": 1.0336, + "num_input_tokens_seen": 40869936, + "step": 90, + "train_runtime": 6486.3382, + "train_tokens_per_second": 6300.926 + }, + { + "epoch": 0.11017887040172533, + "grad_norm": 0.2390969842672348, + "learning_rate": 5e-06, + "loss": 1.0552, + "num_input_tokens_seen": 41317936, + "step": 91, + "train_runtime": 6557.8413, + "train_tokens_per_second": 6300.539 + }, + { + "epoch": 0.1113896272193267, + "grad_norm": 0.2268403321504593, + "learning_rate": 5e-06, + "loss": 1.0251, + "num_input_tokens_seen": 41807248, + "step": 92, + "train_runtime": 6635.8744, + "train_tokens_per_second": 6300.187 + }, + { + "epoch": 0.11260038403692808, + "grad_norm": 0.23852020502090454, + "learning_rate": 5e-06, + "loss": 1.0913, + "num_input_tokens_seen": 42253040, + "step": 93, + "train_runtime": 6706.5327, + "train_tokens_per_second": 6300.281 + }, + { + "epoch": 0.11381114085452947, + "grad_norm": 0.22914916276931763, + "learning_rate": 5e-06, + "loss": 1.0244, + "num_input_tokens_seen": 42729480, + "step": 94, + "train_runtime": 6783.568, + "train_tokens_per_second": 6298.968 + }, + { + "epoch": 0.11502189767213084, + "grad_norm": 0.24560000002384186, + "learning_rate": 5e-06, + "loss": 1.0941, + "num_input_tokens_seen": 43185408, + "step": 95, + "train_runtime": 6856.313, + "train_tokens_per_second": 6298.634 + }, + { + "epoch": 0.11623265448973222, + "grad_norm": 0.25429603457450867, + "learning_rate": 5e-06, + "loss": 1.0389, + "num_input_tokens_seen": 43633928, + "step": 96, + "train_runtime": 6927.8513, + "train_tokens_per_second": 6298.335 + }, + { + "epoch": 0.11744341130733359, + "grad_norm": 0.23469692468643188, + "learning_rate": 5e-06, + "loss": 1.0, + "num_input_tokens_seen": 44105840, + "step": 97, + "train_runtime": 7003.1186, + "train_tokens_per_second": 6298.028 + }, + { + "epoch": 0.11865416812493497, + "grad_norm": 0.2390899658203125, + "learning_rate": 5e-06, + "loss": 1.0403, + "num_input_tokens_seen": 44581704, + "step": 98, + "train_runtime": 7079.5384, + "train_tokens_per_second": 6297.261 + }, + { + "epoch": 0.11986492494253635, + "grad_norm": 0.2298881709575653, + "learning_rate": 5e-06, + "loss": 1.0538, + "num_input_tokens_seen": 45029648, + "step": 99, + "train_runtime": 7150.6445, + "train_tokens_per_second": 6297.285 + }, + { + "epoch": 0.12107568176013772, + "grad_norm": 0.23455004394054413, + "learning_rate": 5e-06, + "loss": 1.0106, + "num_input_tokens_seen": 45491992, + "step": 100, + "train_runtime": 7224.9828, + "train_tokens_per_second": 6296.484 + }, + { + "epoch": 0.1222864385777391, + "grad_norm": 0.27862685918807983, + "learning_rate": 5e-06, + "loss": 1.0551, + "num_input_tokens_seen": 45960216, + "step": 101, + "train_runtime": 7300.9867, + "train_tokens_per_second": 6295.069 + }, + { + "epoch": 0.12349719539534047, + "grad_norm": 0.2320939600467682, + "learning_rate": 5e-06, + "loss": 1.0258, + "num_input_tokens_seen": 46435864, + "step": 102, + "train_runtime": 7377.4853, + "train_tokens_per_second": 6294.267 + }, + { + "epoch": 0.12470795221294186, + "grad_norm": 0.2700980305671692, + "learning_rate": 5e-06, + "loss": 1.0397, + "num_input_tokens_seen": 46897016, + "step": 103, + "train_runtime": 7451.8901, + "train_tokens_per_second": 6293.305 + }, + { + "epoch": 0.12591870903054322, + "grad_norm": 0.2502821683883667, + "learning_rate": 5e-06, + "loss": 1.0432, + "num_input_tokens_seen": 47324336, + "step": 104, + "train_runtime": 7519.7181, + "train_tokens_per_second": 6293.366 + }, + { + "epoch": 0.1271294658481446, + "grad_norm": 0.23824240267276764, + "learning_rate": 5e-06, + "loss": 1.0777, + "num_input_tokens_seen": 47770912, + "step": 105, + "train_runtime": 7590.7707, + "train_tokens_per_second": 6293.289 + }, + { + "epoch": 0.128340222665746, + "grad_norm": 0.24816913902759552, + "learning_rate": 5e-06, + "loss": 1.0662, + "num_input_tokens_seen": 48215152, + "step": 106, + "train_runtime": 7662.9416, + "train_tokens_per_second": 6291.99 + }, + { + "epoch": 0.12955097948334737, + "grad_norm": 0.2386653572320938, + "learning_rate": 5e-06, + "loss": 1.0423, + "num_input_tokens_seen": 48688112, + "step": 107, + "train_runtime": 7739.3312, + "train_tokens_per_second": 6290.997 + }, + { + "epoch": 0.13076173630094873, + "grad_norm": 0.25550806522369385, + "learning_rate": 5e-06, + "loss": 1.1077, + "num_input_tokens_seen": 49130072, + "step": 108, + "train_runtime": 7810.4937, + "train_tokens_per_second": 6290.265 + }, + { + "epoch": 0.1319724931185501, + "grad_norm": 0.2418377846479416, + "learning_rate": 5e-06, + "loss": 1.0495, + "num_input_tokens_seen": 49589584, + "step": 109, + "train_runtime": 7883.9434, + "train_tokens_per_second": 6289.947 + }, + { + "epoch": 0.1331832499361515, + "grad_norm": 0.24783344566822052, + "learning_rate": 5e-06, + "loss": 1.0896, + "num_input_tokens_seen": 50020456, + "step": 110, + "train_runtime": 7953.0419, + "train_tokens_per_second": 6289.475 + }, + { + "epoch": 0.13439400675375288, + "grad_norm": 0.2944345474243164, + "learning_rate": 5e-06, + "loss": 1.0701, + "num_input_tokens_seen": 50460280, + "step": 111, + "train_runtime": 8024.2387, + "train_tokens_per_second": 6288.482 + }, + { + "epoch": 0.13560476357135426, + "grad_norm": 0.23773066699504852, + "learning_rate": 5e-06, + "loss": 1.0576, + "num_input_tokens_seen": 50923488, + "step": 112, + "train_runtime": 8098.5357, + "train_tokens_per_second": 6287.987 + }, + { + "epoch": 0.13681552038895564, + "grad_norm": 0.24989427626132965, + "learning_rate": 5e-06, + "loss": 1.0656, + "num_input_tokens_seen": 51357016, + "step": 113, + "train_runtime": 8169.1458, + "train_tokens_per_second": 6286.706 + }, + { + "epoch": 0.138026277206557, + "grad_norm": 0.2635020911693573, + "learning_rate": 5e-06, + "loss": 1.103, + "num_input_tokens_seen": 51832576, + "step": 114, + "train_runtime": 8245.3688, + "train_tokens_per_second": 6286.265 + }, + { + "epoch": 0.13923703402415838, + "grad_norm": 0.2522059977054596, + "learning_rate": 5e-06, + "loss": 1.0559, + "num_input_tokens_seen": 52261160, + "step": 115, + "train_runtime": 8314.0019, + "train_tokens_per_second": 6285.921 + }, + { + "epoch": 0.14044779084175976, + "grad_norm": 0.275611937046051, + "learning_rate": 5e-06, + "loss": 1.0655, + "num_input_tokens_seen": 52721512, + "step": 116, + "train_runtime": 8386.8183, + "train_tokens_per_second": 6286.235 + }, + { + "epoch": 0.14165854765936114, + "grad_norm": 0.2655342221260071, + "learning_rate": 5e-06, + "loss": 1.0463, + "num_input_tokens_seen": 53178752, + "step": 117, + "train_runtime": 8459.8957, + "train_tokens_per_second": 6285.982 + }, + { + "epoch": 0.14286930447696253, + "grad_norm": 0.24424339830875397, + "learning_rate": 5e-06, + "loss": 1.0743, + "num_input_tokens_seen": 53643504, + "step": 118, + "train_runtime": 8533.66, + "train_tokens_per_second": 6286.108 + }, + { + "epoch": 0.14408006129456388, + "grad_norm": 0.24213866889476776, + "learning_rate": 5e-06, + "loss": 1.0082, + "num_input_tokens_seen": 54096264, + "step": 119, + "train_runtime": 8606.2721, + "train_tokens_per_second": 6285.679 + }, + { + "epoch": 0.14529081811216527, + "grad_norm": 0.24612732231616974, + "learning_rate": 5e-06, + "loss": 1.0415, + "num_input_tokens_seen": 54542376, + "step": 120, + "train_runtime": 8677.4706, + "train_tokens_per_second": 6285.516 + }, + { + "epoch": 0.14650157492976665, + "grad_norm": 0.24935385584831238, + "learning_rate": 5e-06, + "loss": 1.0735, + "num_input_tokens_seen": 54986712, + "step": 121, + "train_runtime": 8748.1099, + "train_tokens_per_second": 6285.553 + }, + { + "epoch": 0.14771233174736803, + "grad_norm": 0.2938326597213745, + "learning_rate": 5e-06, + "loss": 1.0351, + "num_input_tokens_seen": 55448736, + "step": 122, + "train_runtime": 8822.6975, + "train_tokens_per_second": 6284.783 + }, + { + "epoch": 0.1489230885649694, + "grad_norm": 0.24213974177837372, + "learning_rate": 5e-06, + "loss": 1.0653, + "num_input_tokens_seen": 55891176, + "step": 123, + "train_runtime": 8894.389, + "train_tokens_per_second": 6283.869 + }, + { + "epoch": 0.15013384538257077, + "grad_norm": 0.27501124143600464, + "learning_rate": 5e-06, + "loss": 1.0679, + "num_input_tokens_seen": 56307976, + "step": 124, + "train_runtime": 8961.2502, + "train_tokens_per_second": 6283.496 + }, + { + "epoch": 0.15134460220017215, + "grad_norm": 0.2943986654281616, + "learning_rate": 5e-06, + "loss": 1.0693, + "num_input_tokens_seen": 56742320, + "step": 125, + "train_runtime": 9031.5273, + "train_tokens_per_second": 6282.694 + }, + { + "epoch": 0.15255535901777353, + "grad_norm": 0.2623043656349182, + "learning_rate": 5e-06, + "loss": 1.1028, + "num_input_tokens_seen": 57192864, + "step": 126, + "train_runtime": 9101.7095, + "train_tokens_per_second": 6283.75 + }, + { + "epoch": 0.15376611583537492, + "grad_norm": 0.2695028483867645, + "learning_rate": 5e-06, + "loss": 1.063, + "num_input_tokens_seen": 57618472, + "step": 127, + "train_runtime": 9168.6765, + "train_tokens_per_second": 6284.274 + }, + { + "epoch": 0.1549768726529763, + "grad_norm": 0.2590481638908386, + "learning_rate": 5e-06, + "loss": 1.0145, + "num_input_tokens_seen": 58066048, + "step": 128, + "train_runtime": 9239.0759, + "train_tokens_per_second": 6284.833 + }, + { + "epoch": 0.15618762947057765, + "grad_norm": 0.28023761510849, + "learning_rate": 5e-06, + "loss": 1.0559, + "num_input_tokens_seen": 58500528, + "step": 129, + "train_runtime": 9313.1627, + "train_tokens_per_second": 6281.489 + }, + { + "epoch": 0.15739838628817904, + "grad_norm": 0.24649831652641296, + "learning_rate": 5e-06, + "loss": 1.0134, + "num_input_tokens_seen": 58967280, + "step": 130, + "train_runtime": 9392.9204, + "train_tokens_per_second": 6277.843 + }, + { + "epoch": 0.15860914310578042, + "grad_norm": 0.2472827285528183, + "learning_rate": 5e-06, + "loss": 1.0178, + "num_input_tokens_seen": 59412456, + "step": 131, + "train_runtime": 9468.2871, + "train_tokens_per_second": 6274.89 + }, + { + "epoch": 0.1598198999233818, + "grad_norm": 0.2545448839664459, + "learning_rate": 5e-06, + "loss": 1.0606, + "num_input_tokens_seen": 59878688, + "step": 132, + "train_runtime": 9545.4506, + "train_tokens_per_second": 6273.008 + }, + { + "epoch": 0.16103065674098319, + "grad_norm": 0.2501581013202667, + "learning_rate": 5e-06, + "loss": 1.0483, + "num_input_tokens_seen": 60330256, + "step": 133, + "train_runtime": 9616.5112, + "train_tokens_per_second": 6273.612 + }, + { + "epoch": 0.16224141355858454, + "grad_norm": 0.29199784994125366, + "learning_rate": 5e-06, + "loss": 1.0331, + "num_input_tokens_seen": 60784672, + "step": 134, + "train_runtime": 9687.146, + "train_tokens_per_second": 6274.776 + }, + { + "epoch": 0.16345217037618592, + "grad_norm": 0.23874440789222717, + "learning_rate": 5e-06, + "loss": 1.0224, + "num_input_tokens_seen": 61251264, + "step": 135, + "train_runtime": 9761.2608, + "train_tokens_per_second": 6274.934 + }, + { + "epoch": 0.1646629271937873, + "grad_norm": 0.25831273198127747, + "learning_rate": 5e-06, + "loss": 1.0679, + "num_input_tokens_seen": 61709040, + "step": 136, + "train_runtime": 9835.0216, + "train_tokens_per_second": 6274.418 + }, + { + "epoch": 0.1658736840113887, + "grad_norm": 0.25276923179626465, + "learning_rate": 5e-06, + "loss": 1.0455, + "num_input_tokens_seen": 62160304, + "step": 137, + "train_runtime": 9909.0578, + "train_tokens_per_second": 6273.079 + }, + { + "epoch": 0.16708444082899007, + "grad_norm": 0.29279229044914246, + "learning_rate": 5e-06, + "loss": 1.0555, + "num_input_tokens_seen": 62591968, + "step": 138, + "train_runtime": 9979.8666, + "train_tokens_per_second": 6271.824 + }, + { + "epoch": 0.16829519764659143, + "grad_norm": 0.2797205448150635, + "learning_rate": 5e-06, + "loss": 0.9834, + "num_input_tokens_seen": 63045184, + "step": 139, + "train_runtime": 10056.584, + "train_tokens_per_second": 6269.046 + }, + { + "epoch": 0.1695059544641928, + "grad_norm": 0.2773694396018982, + "learning_rate": 5e-06, + "loss": 0.9939, + "num_input_tokens_seen": 63504472, + "step": 140, + "train_runtime": 10134.2006, + "train_tokens_per_second": 6266.352 + }, + { + "epoch": 0.1707167112817942, + "grad_norm": 0.22478176653385162, + "learning_rate": 5e-06, + "loss": 1.0418, + "num_input_tokens_seen": 63957288, + "step": 141, + "train_runtime": 10210.8231, + "train_tokens_per_second": 6263.676 + }, + { + "epoch": 0.17192746809939558, + "grad_norm": 0.24870216846466064, + "learning_rate": 5e-06, + "loss": 1.054, + "num_input_tokens_seen": 64428128, + "step": 142, + "train_runtime": 10288.8907, + "train_tokens_per_second": 6261.912 + }, + { + "epoch": 0.17313822491699696, + "grad_norm": 0.22447937726974487, + "learning_rate": 5e-06, + "loss": 1.0385, + "num_input_tokens_seen": 64891424, + "step": 143, + "train_runtime": 10358.86, + "train_tokens_per_second": 6264.34 + }, + { + "epoch": 0.1743489817345983, + "grad_norm": 0.25018176436424255, + "learning_rate": 5e-06, + "loss": 1.0231, + "num_input_tokens_seen": 65354392, + "step": 144, + "train_runtime": 10428.4387, + "train_tokens_per_second": 6266.939 + }, + { + "epoch": 0.1755597385521997, + "grad_norm": 0.2601490914821625, + "learning_rate": 5e-06, + "loss": 1.0256, + "num_input_tokens_seen": 65810304, + "step": 145, + "train_runtime": 10501.5228, + "train_tokens_per_second": 6266.739 + }, + { + "epoch": 0.17677049536980108, + "grad_norm": 0.24077767133712769, + "learning_rate": 5e-06, + "loss": 1.032, + "num_input_tokens_seen": 66289808, + "step": 146, + "train_runtime": 10580.6955, + "train_tokens_per_second": 6265.165 + }, + { + "epoch": 0.17798125218740246, + "grad_norm": 0.2406504601240158, + "learning_rate": 5e-06, + "loss": 1.0469, + "num_input_tokens_seen": 66738720, + "step": 147, + "train_runtime": 10650.666, + "train_tokens_per_second": 6266.155 + }, + { + "epoch": 0.17919200900500384, + "grad_norm": 0.22819995880126953, + "learning_rate": 5e-06, + "loss": 1.0674, + "num_input_tokens_seen": 67205640, + "step": 148, + "train_runtime": 10723.6866, + "train_tokens_per_second": 6267.028 + }, + { + "epoch": 0.1804027658226052, + "grad_norm": 0.2443617284297943, + "learning_rate": 5e-06, + "loss": 1.0194, + "num_input_tokens_seen": 67674720, + "step": 149, + "train_runtime": 10796.9457, + "train_tokens_per_second": 6267.95 + }, + { + "epoch": 0.18161352264020658, + "grad_norm": 0.30922770500183105, + "learning_rate": 5e-06, + "loss": 1.1078, + "num_input_tokens_seen": 68119800, + "step": 150, + "train_runtime": 10867.1223, + "train_tokens_per_second": 6268.43 + }, + { + "epoch": 0.18282427945780796, + "grad_norm": 0.24705801904201508, + "learning_rate": 5e-06, + "loss": 1.0226, + "num_input_tokens_seen": 68569240, + "step": 151, + "train_runtime": 10937.4488, + "train_tokens_per_second": 6269.217 + }, + { + "epoch": 0.18403503627540935, + "grad_norm": 0.2428549975156784, + "learning_rate": 5e-06, + "loss": 1.0586, + "num_input_tokens_seen": 69024352, + "step": 152, + "train_runtime": 11008.538, + "train_tokens_per_second": 6270.074 + }, + { + "epoch": 0.18524579309301073, + "grad_norm": 0.23061682283878326, + "learning_rate": 5e-06, + "loss": 1.0129, + "num_input_tokens_seen": 69487680, + "step": 153, + "train_runtime": 11081.4546, + "train_tokens_per_second": 6270.628 + }, + { + "epoch": 0.18645654991061208, + "grad_norm": 0.2367316484451294, + "learning_rate": 5e-06, + "loss": 1.0437, + "num_input_tokens_seen": 69923312, + "step": 154, + "train_runtime": 11149.4652, + "train_tokens_per_second": 6271.45 + }, + { + "epoch": 0.18766730672821347, + "grad_norm": 0.24783264100551605, + "learning_rate": 5e-06, + "loss": 1.0682, + "num_input_tokens_seen": 70375368, + "step": 155, + "train_runtime": 11219.9874, + "train_tokens_per_second": 6272.321 + }, + { + "epoch": 0.18887806354581485, + "grad_norm": 0.22279201447963715, + "learning_rate": 5e-06, + "loss": 1.0105, + "num_input_tokens_seen": 70836136, + "step": 156, + "train_runtime": 11292.4524, + "train_tokens_per_second": 6272.874 + }, + { + "epoch": 0.19008882036341623, + "grad_norm": 0.22752974927425385, + "learning_rate": 5e-06, + "loss": 1.008, + "num_input_tokens_seen": 71284208, + "step": 157, + "train_runtime": 11362.5883, + "train_tokens_per_second": 6273.589 + }, + { + "epoch": 0.19129957718101762, + "grad_norm": 0.21871839463710785, + "learning_rate": 5e-06, + "loss": 1.0518, + "num_input_tokens_seen": 71773848, + "step": 158, + "train_runtime": 11438.8409, + "train_tokens_per_second": 6274.573 + }, + { + "epoch": 0.19251033399861897, + "grad_norm": 0.23992645740509033, + "learning_rate": 5e-06, + "loss": 0.9932, + "num_input_tokens_seen": 72243136, + "step": 159, + "train_runtime": 11512.8218, + "train_tokens_per_second": 6275.016 + }, + { + "epoch": 0.19372109081622035, + "grad_norm": 0.25232523679733276, + "learning_rate": 5e-06, + "loss": 1.0152, + "num_input_tokens_seen": 72702040, + "step": 160, + "train_runtime": 11584.5481, + "train_tokens_per_second": 6275.777 + }, + { + "epoch": 0.19493184763382174, + "grad_norm": 0.2552812695503235, + "learning_rate": 5e-06, + "loss": 1.0435, + "num_input_tokens_seen": 73152944, + "step": 161, + "train_runtime": 11655.4728, + "train_tokens_per_second": 6276.274 + }, + { + "epoch": 0.19614260445142312, + "grad_norm": 0.24950732290744781, + "learning_rate": 5e-06, + "loss": 1.0205, + "num_input_tokens_seen": 73622448, + "step": 162, + "train_runtime": 11728.7466, + "train_tokens_per_second": 6277.094 + }, + { + "epoch": 0.1973533612690245, + "grad_norm": 0.23558129370212555, + "learning_rate": 5e-06, + "loss": 1.0464, + "num_input_tokens_seen": 74070512, + "step": 163, + "train_runtime": 11798.7407, + "train_tokens_per_second": 6277.832 + }, + { + "epoch": 0.19856411808662586, + "grad_norm": 0.2387412041425705, + "learning_rate": 5e-06, + "loss": 1.0509, + "num_input_tokens_seen": 74523176, + "step": 164, + "train_runtime": 11869.5779, + "train_tokens_per_second": 6278.503 + }, + { + "epoch": 0.19977487490422724, + "grad_norm": 0.2554919421672821, + "learning_rate": 5e-06, + "loss": 1.0716, + "num_input_tokens_seen": 74985568, + "step": 165, + "train_runtime": 11942.0706, + "train_tokens_per_second": 6279.109 + }, + { + "epoch": 0.20098563172182862, + "grad_norm": 0.24104657769203186, + "learning_rate": 5e-06, + "loss": 1.0549, + "num_input_tokens_seen": 75436832, + "step": 166, + "train_runtime": 12012.5743, + "train_tokens_per_second": 6279.822 + }, + { + "epoch": 0.20219638853943, + "grad_norm": 0.2571240961551666, + "learning_rate": 5e-06, + "loss": 1.0771, + "num_input_tokens_seen": 75895608, + "step": 167, + "train_runtime": 12084.2953, + "train_tokens_per_second": 6280.516 + }, + { + "epoch": 0.2034071453570314, + "grad_norm": 0.2907203733921051, + "learning_rate": 5e-06, + "loss": 1.0271, + "num_input_tokens_seen": 76343416, + "step": 168, + "train_runtime": 12154.2802, + "train_tokens_per_second": 6281.196 + }, + { + "epoch": 0.20461790217463274, + "grad_norm": 0.2559382915496826, + "learning_rate": 5e-06, + "loss": 1.0148, + "num_input_tokens_seen": 76810064, + "step": 169, + "train_runtime": 12226.7946, + "train_tokens_per_second": 6282.11 + }, + { + "epoch": 0.20582865899223413, + "grad_norm": 0.26620903611183167, + "learning_rate": 5e-06, + "loss": 1.0857, + "num_input_tokens_seen": 77255168, + "step": 170, + "train_runtime": 12296.1273, + "train_tokens_per_second": 6282.886 + }, + { + "epoch": 0.2070394158098355, + "grad_norm": 0.2579341530799866, + "learning_rate": 5e-06, + "loss": 1.0163, + "num_input_tokens_seen": 77712312, + "step": 171, + "train_runtime": 12367.9858, + "train_tokens_per_second": 6283.344 + }, + { + "epoch": 0.2082501726274369, + "grad_norm": 0.2516046166419983, + "learning_rate": 5e-06, + "loss": 1.0318, + "num_input_tokens_seen": 78158176, + "step": 172, + "train_runtime": 12437.6935, + "train_tokens_per_second": 6283.977 + }, + { + "epoch": 0.20946092944503827, + "grad_norm": 0.26422518491744995, + "learning_rate": 5e-06, + "loss": 1.0003, + "num_input_tokens_seen": 78631984, + "step": 173, + "train_runtime": 12512.0888, + "train_tokens_per_second": 6284.481 + }, + { + "epoch": 0.21067168626263963, + "grad_norm": 0.2679826617240906, + "learning_rate": 5e-06, + "loss": 1.01, + "num_input_tokens_seen": 79092368, + "step": 174, + "train_runtime": 12584.1353, + "train_tokens_per_second": 6285.086 + }, + { + "epoch": 0.211882443080241, + "grad_norm": 0.23957136273384094, + "learning_rate": 5e-06, + "loss": 1.0359, + "num_input_tokens_seen": 79562144, + "step": 175, + "train_runtime": 12657.2444, + "train_tokens_per_second": 6285.898 + }, + { + "epoch": 0.2130931998978424, + "grad_norm": 0.2504132091999054, + "learning_rate": 5e-06, + "loss": 1.0057, + "num_input_tokens_seen": 79997152, + "step": 176, + "train_runtime": 12725.4596, + "train_tokens_per_second": 6286.386 + }, + { + "epoch": 0.21430395671544378, + "grad_norm": 0.24493563175201416, + "learning_rate": 5e-06, + "loss": 1.0224, + "num_input_tokens_seen": 80452312, + "step": 177, + "train_runtime": 12796.7177, + "train_tokens_per_second": 6286.949 + }, + { + "epoch": 0.21551471353304516, + "grad_norm": 0.24307624995708466, + "learning_rate": 5e-06, + "loss": 1.0201, + "num_input_tokens_seen": 80895192, + "step": 178, + "train_runtime": 12866.1831, + "train_tokens_per_second": 6287.427 + }, + { + "epoch": 0.21672547035064652, + "grad_norm": 0.22720018029212952, + "learning_rate": 5e-06, + "loss": 0.9935, + "num_input_tokens_seen": 81373192, + "step": 179, + "train_runtime": 12941.1713, + "train_tokens_per_second": 6287.931 + }, + { + "epoch": 0.2179362271682479, + "grad_norm": 0.24937334656715393, + "learning_rate": 5e-06, + "loss": 0.9786, + "num_input_tokens_seen": 81840648, + "step": 180, + "train_runtime": 13013.8198, + "train_tokens_per_second": 6288.749 + }, + { + "epoch": 0.21914698398584928, + "grad_norm": 0.2576950490474701, + "learning_rate": 5e-06, + "loss": 1.0603, + "num_input_tokens_seen": 82297504, + "step": 181, + "train_runtime": 13084.9624, + "train_tokens_per_second": 6289.472 + }, + { + "epoch": 0.22035774080345066, + "grad_norm": 0.2821928560733795, + "learning_rate": 5e-06, + "loss": 1.0463, + "num_input_tokens_seen": 82729496, + "step": 182, + "train_runtime": 13152.2778, + "train_tokens_per_second": 6290.127 + }, + { + "epoch": 0.22156849762105205, + "grad_norm": 0.2612816095352173, + "learning_rate": 5e-06, + "loss": 0.9959, + "num_input_tokens_seen": 83169848, + "step": 183, + "train_runtime": 13221.1044, + "train_tokens_per_second": 6290.688 + }, + { + "epoch": 0.2227792544386534, + "grad_norm": 0.24119819700717926, + "learning_rate": 5e-06, + "loss": 1.0453, + "num_input_tokens_seen": 83621816, + "step": 184, + "train_runtime": 13291.5963, + "train_tokens_per_second": 6291.33 + }, + { + "epoch": 0.22399001125625478, + "grad_norm": 0.2350812554359436, + "learning_rate": 5e-06, + "loss": 1.0488, + "num_input_tokens_seen": 84093008, + "step": 185, + "train_runtime": 13365.1515, + "train_tokens_per_second": 6291.961 + }, + { + "epoch": 0.22520076807385617, + "grad_norm": 0.23204365372657776, + "learning_rate": 5e-06, + "loss": 1.0438, + "num_input_tokens_seen": 84548848, + "step": 186, + "train_runtime": 13436.6704, + "train_tokens_per_second": 6292.396 + }, + { + "epoch": 0.22641152489145755, + "grad_norm": 0.21973128616809845, + "learning_rate": 5e-06, + "loss": 1.0221, + "num_input_tokens_seen": 85006432, + "step": 187, + "train_runtime": 13508.1616, + "train_tokens_per_second": 6292.968 + }, + { + "epoch": 0.22762228170905893, + "grad_norm": 0.22889819741249084, + "learning_rate": 5e-06, + "loss": 1.0409, + "num_input_tokens_seen": 85473544, + "step": 188, + "train_runtime": 13580.8488, + "train_tokens_per_second": 6293.682 + }, + { + "epoch": 0.2288330385266603, + "grad_norm": 0.22178350389003754, + "learning_rate": 5e-06, + "loss": 0.9972, + "num_input_tokens_seen": 85935408, + "step": 189, + "train_runtime": 13653.0892, + "train_tokens_per_second": 6294.21 + }, + { + "epoch": 0.23004379534426167, + "grad_norm": 0.22922936081886292, + "learning_rate": 5e-06, + "loss": 1.1049, + "num_input_tokens_seen": 86403776, + "step": 190, + "train_runtime": 13726.5237, + "train_tokens_per_second": 6294.658 + }, + { + "epoch": 0.23125455216186305, + "grad_norm": 0.24582232534885406, + "learning_rate": 5e-06, + "loss": 1.0694, + "num_input_tokens_seen": 86866200, + "step": 191, + "train_runtime": 13799.1133, + "train_tokens_per_second": 6295.057 + }, + { + "epoch": 0.23246530897946444, + "grad_norm": 0.24143490195274353, + "learning_rate": 5e-06, + "loss": 1.0036, + "num_input_tokens_seen": 87327440, + "step": 192, + "train_runtime": 13871.561, + "train_tokens_per_second": 6295.43 + }, + { + "epoch": 0.23367606579706582, + "grad_norm": 0.2200412005186081, + "learning_rate": 5e-06, + "loss": 1.0321, + "num_input_tokens_seen": 87788752, + "step": 193, + "train_runtime": 13943.4418, + "train_tokens_per_second": 6296.06 + }, + { + "epoch": 0.23488682261466717, + "grad_norm": 0.24762044847011566, + "learning_rate": 5e-06, + "loss": 1.0363, + "num_input_tokens_seen": 88219736, + "step": 194, + "train_runtime": 14010.4722, + "train_tokens_per_second": 6296.7 + }, + { + "epoch": 0.23609757943226856, + "grad_norm": 0.23594461381435394, + "learning_rate": 5e-06, + "loss": 1.0704, + "num_input_tokens_seen": 88680528, + "step": 195, + "train_runtime": 14082.4487, + "train_tokens_per_second": 6297.238 + }, + { + "epoch": 0.23730833624986994, + "grad_norm": 0.24670927226543427, + "learning_rate": 5e-06, + "loss": 1.0181, + "num_input_tokens_seen": 89139152, + "step": 196, + "train_runtime": 14154.1641, + "train_tokens_per_second": 6297.733 + }, + { + "epoch": 0.23851909306747132, + "grad_norm": 0.2432672679424286, + "learning_rate": 5e-06, + "loss": 0.9717, + "num_input_tokens_seen": 89603368, + "step": 197, + "train_runtime": 14226.3618, + "train_tokens_per_second": 6298.404 + }, + { + "epoch": 0.2397298498850727, + "grad_norm": 0.2482805699110031, + "learning_rate": 5e-06, + "loss": 0.9996, + "num_input_tokens_seen": 90053800, + "step": 198, + "train_runtime": 14296.6962, + "train_tokens_per_second": 6298.924 + }, + { + "epoch": 0.24094060670267406, + "grad_norm": 0.2421431541442871, + "learning_rate": 5e-06, + "loss": 0.9607, + "num_input_tokens_seen": 90508144, + "step": 199, + "train_runtime": 14368.0885, + "train_tokens_per_second": 6299.247 + }, + { + "epoch": 0.24215136352027544, + "grad_norm": 0.21828782558441162, + "learning_rate": 5e-06, + "loss": 0.9602, + "num_input_tokens_seen": 90981216, + "step": 200, + "train_runtime": 14442.2784, + "train_tokens_per_second": 6299.644 + }, + { + "epoch": 0.24336212033787682, + "grad_norm": 0.25093552470207214, + "learning_rate": 5e-06, + "loss": 1.0223, + "num_input_tokens_seen": 91405344, + "step": 201, + "train_runtime": 14508.6653, + "train_tokens_per_second": 6300.052 + }, + { + "epoch": 0.2445728771554782, + "grad_norm": 0.2346261888742447, + "learning_rate": 5e-06, + "loss": 1.0457, + "num_input_tokens_seen": 91867920, + "step": 202, + "train_runtime": 14581.2729, + "train_tokens_per_second": 6300.405 + }, + { + "epoch": 0.2457836339730796, + "grad_norm": 0.2555064260959625, + "learning_rate": 5e-06, + "loss": 1.0239, + "num_input_tokens_seen": 92332376, + "step": 203, + "train_runtime": 14654.0594, + "train_tokens_per_second": 6300.805 + }, + { + "epoch": 0.24699439079068095, + "grad_norm": 0.24753707647323608, + "learning_rate": 5e-06, + "loss": 1.0078, + "num_input_tokens_seen": 92798256, + "step": 204, + "train_runtime": 14727.0741, + "train_tokens_per_second": 6301.201 + }, + { + "epoch": 0.24820514760828233, + "grad_norm": 0.22091752290725708, + "learning_rate": 5e-06, + "loss": 0.9881, + "num_input_tokens_seen": 93260920, + "step": 205, + "train_runtime": 14799.4511, + "train_tokens_per_second": 6301.647 + }, + { + "epoch": 0.2494159044258837, + "grad_norm": 0.23978286981582642, + "learning_rate": 5e-06, + "loss": 1.0102, + "num_input_tokens_seen": 93719680, + "step": 206, + "train_runtime": 14871.4509, + "train_tokens_per_second": 6301.986 + }, + { + "epoch": 0.2506266612434851, + "grad_norm": 0.2572280466556549, + "learning_rate": 5e-06, + "loss": 0.9997, + "num_input_tokens_seen": 94167864, + "step": 207, + "train_runtime": 14941.2387, + "train_tokens_per_second": 6302.547 + }, + { + "epoch": 0.25183741806108645, + "grad_norm": 0.22775068879127502, + "learning_rate": 5e-06, + "loss": 0.9997, + "num_input_tokens_seen": 94629576, + "step": 208, + "train_runtime": 15013.5212, + "train_tokens_per_second": 6302.957 + }, + { + "epoch": 0.25304817487868786, + "grad_norm": 0.24101892113685608, + "learning_rate": 5e-06, + "loss": 1.0547, + "num_input_tokens_seen": 95088576, + "step": 209, + "train_runtime": 15085.7957, + "train_tokens_per_second": 6303.186 + }, + { + "epoch": 0.2542589316962892, + "grad_norm": 0.23462055623531342, + "learning_rate": 5e-06, + "loss": 1.0055, + "num_input_tokens_seen": 95528848, + "step": 210, + "train_runtime": 15154.3744, + "train_tokens_per_second": 6303.714 + }, + { + "epoch": 0.25546968851389057, + "grad_norm": 0.21969425678253174, + "learning_rate": 5e-06, + "loss": 1.0179, + "num_input_tokens_seen": 96012504, + "step": 211, + "train_runtime": 15229.6926, + "train_tokens_per_second": 6304.297 + }, + { + "epoch": 0.256680445331492, + "grad_norm": 0.2324143946170807, + "learning_rate": 5e-06, + "loss": 1.0263, + "num_input_tokens_seen": 96478288, + "step": 212, + "train_runtime": 15302.7525, + "train_tokens_per_second": 6304.636 + }, + { + "epoch": 0.25789120214909333, + "grad_norm": 0.2410186231136322, + "learning_rate": 5e-06, + "loss": 1.0705, + "num_input_tokens_seen": 96927768, + "step": 213, + "train_runtime": 15372.6647, + "train_tokens_per_second": 6305.203 + }, + { + "epoch": 0.25910195896669475, + "grad_norm": 0.2557809352874756, + "learning_rate": 5e-06, + "loss": 1.0136, + "num_input_tokens_seen": 97369112, + "step": 214, + "train_runtime": 15441.8864, + "train_tokens_per_second": 6305.519 + }, + { + "epoch": 0.2603127157842961, + "grad_norm": 0.22955191135406494, + "learning_rate": 5e-06, + "loss": 0.9885, + "num_input_tokens_seen": 97837320, + "step": 215, + "train_runtime": 15515.0556, + "train_tokens_per_second": 6305.96 + }, + { + "epoch": 0.26152347260189746, + "grad_norm": 0.23326116800308228, + "learning_rate": 5e-06, + "loss": 1.0407, + "num_input_tokens_seen": 98273464, + "step": 216, + "train_runtime": 15582.8966, + "train_tokens_per_second": 6306.495 + }, + { + "epoch": 0.26273422941949887, + "grad_norm": 0.2623524069786072, + "learning_rate": 5e-06, + "loss": 0.9967, + "num_input_tokens_seen": 98734576, + "step": 217, + "train_runtime": 15654.599, + "train_tokens_per_second": 6307.065 + }, + { + "epoch": 0.2639449862371002, + "grad_norm": 0.2236497849225998, + "learning_rate": 5e-06, + "loss": 0.9819, + "num_input_tokens_seen": 99183248, + "step": 218, + "train_runtime": 15724.0156, + "train_tokens_per_second": 6307.756 + }, + { + "epoch": 0.26515574305470163, + "grad_norm": 0.2309817373752594, + "learning_rate": 5e-06, + "loss": 1.0123, + "num_input_tokens_seen": 99655480, + "step": 219, + "train_runtime": 15797.6975, + "train_tokens_per_second": 6308.228 + }, + { + "epoch": 0.266366499872303, + "grad_norm": 0.22036534547805786, + "learning_rate": 5e-06, + "loss": 1.0621, + "num_input_tokens_seen": 100126896, + "step": 220, + "train_runtime": 15875.4973, + "train_tokens_per_second": 6307.008 + }, + { + "epoch": 0.26757725668990434, + "grad_norm": 0.24294357001781464, + "learning_rate": 5e-06, + "loss": 1.0296, + "num_input_tokens_seen": 100585544, + "step": 221, + "train_runtime": 15948.2329, + "train_tokens_per_second": 6307.002 + }, + { + "epoch": 0.26878801350750575, + "grad_norm": 0.2395816445350647, + "learning_rate": 5e-06, + "loss": 1.0843, + "num_input_tokens_seen": 101027704, + "step": 222, + "train_runtime": 16017.0188, + "train_tokens_per_second": 6307.522 + }, + { + "epoch": 0.2699987703251071, + "grad_norm": 0.23171593248844147, + "learning_rate": 5e-06, + "loss": 1.0259, + "num_input_tokens_seen": 101494896, + "step": 223, + "train_runtime": 16089.9686, + "train_tokens_per_second": 6307.961 + }, + { + "epoch": 0.2712095271427085, + "grad_norm": 0.23881399631500244, + "learning_rate": 5e-06, + "loss": 1.0312, + "num_input_tokens_seen": 101945248, + "step": 224, + "train_runtime": 16161.1769, + "train_tokens_per_second": 6308.034 + }, + { + "epoch": 0.2724202839603099, + "grad_norm": 0.23741568624973297, + "learning_rate": 5e-06, + "loss": 1.0388, + "num_input_tokens_seen": 102381600, + "step": 225, + "train_runtime": 16233.0137, + "train_tokens_per_second": 6306.999 + }, + { + "epoch": 0.2736310407779113, + "grad_norm": 0.2587156295776367, + "learning_rate": 5e-06, + "loss": 1.053, + "num_input_tokens_seen": 102853016, + "step": 226, + "train_runtime": 16310.47, + "train_tokens_per_second": 6305.95 + }, + { + "epoch": 0.27484179759551264, + "grad_norm": 0.25893622636795044, + "learning_rate": 5e-06, + "loss": 1.0546, + "num_input_tokens_seen": 103292264, + "step": 227, + "train_runtime": 16382.2499, + "train_tokens_per_second": 6305.133 + }, + { + "epoch": 0.276052554413114, + "grad_norm": 0.235712468624115, + "learning_rate": 5e-06, + "loss": 1.0638, + "num_input_tokens_seen": 103744464, + "step": 228, + "train_runtime": 16455.8156, + "train_tokens_per_second": 6304.426 + }, + { + "epoch": 0.2772633112307154, + "grad_norm": 0.2683420181274414, + "learning_rate": 5e-06, + "loss": 1.0731, + "num_input_tokens_seen": 104191136, + "step": 229, + "train_runtime": 16528.5498, + "train_tokens_per_second": 6303.707 + }, + { + "epoch": 0.27847406804831676, + "grad_norm": 0.22673234343528748, + "learning_rate": 5e-06, + "loss": 0.996, + "num_input_tokens_seen": 104663120, + "step": 230, + "train_runtime": 16605.3871, + "train_tokens_per_second": 6302.962 + }, + { + "epoch": 0.27968482486591817, + "grad_norm": 0.2398988902568817, + "learning_rate": 5e-06, + "loss": 1.0543, + "num_input_tokens_seen": 105118544, + "step": 231, + "train_runtime": 16679.8469, + "train_tokens_per_second": 6302.129 + }, + { + "epoch": 0.2808955816835195, + "grad_norm": 0.2677454948425293, + "learning_rate": 5e-06, + "loss": 1.0094, + "num_input_tokens_seen": 105588584, + "step": 232, + "train_runtime": 16756.2821, + "train_tokens_per_second": 6301.433 + }, + { + "epoch": 0.2821063385011209, + "grad_norm": 0.2396971434354782, + "learning_rate": 5e-06, + "loss": 0.9976, + "num_input_tokens_seen": 106052280, + "step": 233, + "train_runtime": 16831.9341, + "train_tokens_per_second": 6300.659 + }, + { + "epoch": 0.2833170953187223, + "grad_norm": 0.2204187661409378, + "learning_rate": 5e-06, + "loss": 0.9871, + "num_input_tokens_seen": 106493280, + "step": 234, + "train_runtime": 16903.9239, + "train_tokens_per_second": 6299.915 + }, + { + "epoch": 0.28452785213632364, + "grad_norm": 0.2463349997997284, + "learning_rate": 5e-06, + "loss": 0.9915, + "num_input_tokens_seen": 106971464, + "step": 235, + "train_runtime": 16982.022, + "train_tokens_per_second": 6299.101 + }, + { + "epoch": 0.28573860895392506, + "grad_norm": 0.22036071121692657, + "learning_rate": 5e-06, + "loss": 0.9812, + "num_input_tokens_seen": 107447560, + "step": 236, + "train_runtime": 17059.9653, + "train_tokens_per_second": 6298.229 + }, + { + "epoch": 0.2869493657715264, + "grad_norm": 0.2353561669588089, + "learning_rate": 5e-06, + "loss": 0.9932, + "num_input_tokens_seen": 107889344, + "step": 237, + "train_runtime": 17132.2103, + "train_tokens_per_second": 6297.456 + }, + { + "epoch": 0.28816012258912777, + "grad_norm": 0.23488640785217285, + "learning_rate": 5e-06, + "loss": 1.0296, + "num_input_tokens_seen": 108347712, + "step": 238, + "train_runtime": 17206.8913, + "train_tokens_per_second": 6296.763 + }, + { + "epoch": 0.2893708794067292, + "grad_norm": 0.23872198164463043, + "learning_rate": 5e-06, + "loss": 1.0054, + "num_input_tokens_seen": 108790344, + "step": 239, + "train_runtime": 17277.6892, + "train_tokens_per_second": 6296.58 + }, + { + "epoch": 0.29058163622433053, + "grad_norm": 0.2371063083410263, + "learning_rate": 5e-06, + "loss": 1.009, + "num_input_tokens_seen": 109245808, + "step": 240, + "train_runtime": 17344.4981, + "train_tokens_per_second": 6298.586 + }, + { + "epoch": 0.29179239304193194, + "grad_norm": 0.2168145626783371, + "learning_rate": 5e-06, + "loss": 0.9897, + "num_input_tokens_seen": 109708688, + "step": 241, + "train_runtime": 17412.6292, + "train_tokens_per_second": 6300.524 + }, + { + "epoch": 0.2930031498595333, + "grad_norm": 0.230647012591362, + "learning_rate": 5e-06, + "loss": 1.0172, + "num_input_tokens_seen": 110166104, + "step": 242, + "train_runtime": 17480.5962, + "train_tokens_per_second": 6302.194 + }, + { + "epoch": 0.29421390667713465, + "grad_norm": 0.2462947964668274, + "learning_rate": 5e-06, + "loss": 1.0429, + "num_input_tokens_seen": 110603016, + "step": 243, + "train_runtime": 17550.9466, + "train_tokens_per_second": 6301.826 + }, + { + "epoch": 0.29542466349473606, + "grad_norm": 0.2439439445734024, + "learning_rate": 5e-06, + "loss": 0.9871, + "num_input_tokens_seen": 111038080, + "step": 244, + "train_runtime": 17621.5292, + "train_tokens_per_second": 6301.274 + }, + { + "epoch": 0.2966354203123374, + "grad_norm": 0.24288234114646912, + "learning_rate": 5e-06, + "loss": 1.0544, + "num_input_tokens_seen": 111474408, + "step": 245, + "train_runtime": 17694.1217, + "train_tokens_per_second": 6300.081 + }, + { + "epoch": 0.2978461771299388, + "grad_norm": 0.2557252049446106, + "learning_rate": 5e-06, + "loss": 1.0276, + "num_input_tokens_seen": 111923176, + "step": 246, + "train_runtime": 17768.7153, + "train_tokens_per_second": 6298.89 + }, + { + "epoch": 0.2990569339475402, + "grad_norm": 0.25596141815185547, + "learning_rate": 5e-06, + "loss": 0.9989, + "num_input_tokens_seen": 112397080, + "step": 247, + "train_runtime": 17850.0516, + "train_tokens_per_second": 6296.737 + }, + { + "epoch": 0.30026769076514154, + "grad_norm": 0.21673010289669037, + "learning_rate": 5e-06, + "loss": 0.9963, + "num_input_tokens_seen": 112862520, + "step": 248, + "train_runtime": 17937.4215, + "train_tokens_per_second": 6292.015 + }, + { + "epoch": 0.30147844758274295, + "grad_norm": 0.26896172761917114, + "learning_rate": 5e-06, + "loss": 1.0335, + "num_input_tokens_seen": 113290872, + "step": 249, + "train_runtime": 18018.1765, + "train_tokens_per_second": 6287.588 + }, + { + "epoch": 0.3026892044003443, + "grad_norm": 0.2385682761669159, + "learning_rate": 5e-06, + "loss": 0.9976, + "num_input_tokens_seen": 113759016, + "step": 250, + "train_runtime": 18107.0103, + "train_tokens_per_second": 6282.595 + }, + { + "epoch": 0.3038999612179457, + "grad_norm": 0.22848090529441833, + "learning_rate": 5e-06, + "loss": 0.9864, + "num_input_tokens_seen": 114208944, + "step": 251, + "train_runtime": 18191.4232, + "train_tokens_per_second": 6278.175 + }, + { + "epoch": 0.30511071803554707, + "grad_norm": 0.23898521065711975, + "learning_rate": 5e-06, + "loss": 0.9731, + "num_input_tokens_seen": 114673264, + "step": 252, + "train_runtime": 18278.8132, + "train_tokens_per_second": 6273.562 + }, + { + "epoch": 0.3063214748531484, + "grad_norm": 0.23195713758468628, + "learning_rate": 5e-06, + "loss": 1.0309, + "num_input_tokens_seen": 115123744, + "step": 253, + "train_runtime": 18364.6246, + "train_tokens_per_second": 6268.777 + }, + { + "epoch": 0.30753223167074983, + "grad_norm": 0.257159948348999, + "learning_rate": 5e-06, + "loss": 1.0214, + "num_input_tokens_seen": 115555688, + "step": 254, + "train_runtime": 18445.2087, + "train_tokens_per_second": 6264.808 + }, + { + "epoch": 0.3087429884883512, + "grad_norm": 0.2846441864967346, + "learning_rate": 5e-06, + "loss": 0.9902, + "num_input_tokens_seen": 116006248, + "step": 255, + "train_runtime": 18529.8186, + "train_tokens_per_second": 6260.517 + }, + { + "epoch": 0.3099537453059526, + "grad_norm": 0.21537640690803528, + "learning_rate": 5e-06, + "loss": 0.9767, + "num_input_tokens_seen": 116464472, + "step": 256, + "train_runtime": 18601.195, + "train_tokens_per_second": 6261.128 + }, + { + "epoch": 0.31116450212355395, + "grad_norm": 0.2560320496559143, + "learning_rate": 5e-06, + "loss": 1.0474, + "num_input_tokens_seen": 116881080, + "step": 257, + "train_runtime": 18665.8155, + "train_tokens_per_second": 6261.772 + }, + { + "epoch": 0.3123752589411553, + "grad_norm": 0.23951588571071625, + "learning_rate": 5e-06, + "loss": 1.0078, + "num_input_tokens_seen": 117334528, + "step": 258, + "train_runtime": 18736.5275, + "train_tokens_per_second": 6262.341 + }, + { + "epoch": 0.3135860157587567, + "grad_norm": 0.233546182513237, + "learning_rate": 5e-06, + "loss": 1.0615, + "num_input_tokens_seen": 117794752, + "step": 259, + "train_runtime": 18808.0733, + "train_tokens_per_second": 6262.989 + }, + { + "epoch": 0.3147967725763581, + "grad_norm": 0.21725581586360931, + "learning_rate": 5e-06, + "loss": 0.9617, + "num_input_tokens_seen": 118273288, + "step": 260, + "train_runtime": 18883.1278, + "train_tokens_per_second": 6263.437 + }, + { + "epoch": 0.3160075293939595, + "grad_norm": 0.2287113070487976, + "learning_rate": 5e-06, + "loss": 0.9977, + "num_input_tokens_seen": 118730128, + "step": 261, + "train_runtime": 18954.4265, + "train_tokens_per_second": 6263.979 + }, + { + "epoch": 0.31721828621156084, + "grad_norm": 0.2280893474817276, + "learning_rate": 5e-06, + "loss": 1.0171, + "num_input_tokens_seen": 119186808, + "step": 262, + "train_runtime": 19025.8236, + "train_tokens_per_second": 6264.476 + }, + { + "epoch": 0.3184290430291622, + "grad_norm": 0.2364167869091034, + "learning_rate": 5e-06, + "loss": 1.0681, + "num_input_tokens_seen": 119627160, + "step": 263, + "train_runtime": 19094.4088, + "train_tokens_per_second": 6265.036 + }, + { + "epoch": 0.3196397998467636, + "grad_norm": 0.2416498214006424, + "learning_rate": 5e-06, + "loss": 1.0912, + "num_input_tokens_seen": 120077824, + "step": 264, + "train_runtime": 19164.669, + "train_tokens_per_second": 6265.583 + }, + { + "epoch": 0.32085055666436496, + "grad_norm": 0.23011499643325806, + "learning_rate": 5e-06, + "loss": 1.034, + "num_input_tokens_seen": 120521112, + "step": 265, + "train_runtime": 19233.6771, + "train_tokens_per_second": 6266.15 + }, + { + "epoch": 0.32206131348196637, + "grad_norm": 0.2434847503900528, + "learning_rate": 5e-06, + "loss": 1.0225, + "num_input_tokens_seen": 120972064, + "step": 266, + "train_runtime": 19304.0456, + "train_tokens_per_second": 6266.669 + }, + { + "epoch": 0.3232720702995677, + "grad_norm": 0.2501772344112396, + "learning_rate": 5e-06, + "loss": 1.0575, + "num_input_tokens_seen": 121426920, + "step": 267, + "train_runtime": 19374.5798, + "train_tokens_per_second": 6267.332 + }, + { + "epoch": 0.3244828271171691, + "grad_norm": 0.2651502788066864, + "learning_rate": 5e-06, + "loss": 1.0499, + "num_input_tokens_seen": 121873184, + "step": 268, + "train_runtime": 19443.9026, + "train_tokens_per_second": 6267.938 + }, + { + "epoch": 0.3256935839347705, + "grad_norm": 0.2908613979816437, + "learning_rate": 5e-06, + "loss": 1.0486, + "num_input_tokens_seen": 122327152, + "step": 269, + "train_runtime": 19516.72, + "train_tokens_per_second": 6267.813 + }, + { + "epoch": 0.32690434075237185, + "grad_norm": 0.23566846549510956, + "learning_rate": 5e-06, + "loss": 1.017, + "num_input_tokens_seen": 122784960, + "step": 270, + "train_runtime": 19592.2753, + "train_tokens_per_second": 6267.009 + }, + { + "epoch": 0.32811509756997326, + "grad_norm": 0.2798844575881958, + "learning_rate": 5e-06, + "loss": 1.0546, + "num_input_tokens_seen": 123240512, + "step": 271, + "train_runtime": 19666.8911, + "train_tokens_per_second": 6266.395 + }, + { + "epoch": 0.3293258543875746, + "grad_norm": 0.21824029088020325, + "learning_rate": 5e-06, + "loss": 0.9848, + "num_input_tokens_seen": 123704960, + "step": 272, + "train_runtime": 19743.7002, + "train_tokens_per_second": 6265.541 + }, + { + "epoch": 0.33053661120517597, + "grad_norm": 0.2295370250940323, + "learning_rate": 5e-06, + "loss": 1.0064, + "num_input_tokens_seen": 124147728, + "step": 273, + "train_runtime": 19815.9537, + "train_tokens_per_second": 6265.039 + }, + { + "epoch": 0.3317473680227774, + "grad_norm": 0.2582823932170868, + "learning_rate": 5e-06, + "loss": 1.0589, + "num_input_tokens_seen": 124588360, + "step": 274, + "train_runtime": 19888.5652, + "train_tokens_per_second": 6264.321 + }, + { + "epoch": 0.33295812484037873, + "grad_norm": 0.2539482116699219, + "learning_rate": 5e-06, + "loss": 0.9915, + "num_input_tokens_seen": 125054408, + "step": 275, + "train_runtime": 19965.1487, + "train_tokens_per_second": 6263.635 + }, + { + "epoch": 0.33416888165798014, + "grad_norm": 0.2645561695098877, + "learning_rate": 5e-06, + "loss": 0.9986, + "num_input_tokens_seen": 125507560, + "step": 276, + "train_runtime": 20039.2339, + "train_tokens_per_second": 6263.092 + }, + { + "epoch": 0.3353796384755815, + "grad_norm": 0.22155457735061646, + "learning_rate": 5e-06, + "loss": 0.9988, + "num_input_tokens_seen": 125994072, + "step": 277, + "train_runtime": 20119.5235, + "train_tokens_per_second": 6262.279 + }, + { + "epoch": 0.33659039529318285, + "grad_norm": 0.2287885844707489, + "learning_rate": 5e-06, + "loss": 1.0277, + "num_input_tokens_seen": 126467528, + "step": 278, + "train_runtime": 20197.6843, + "train_tokens_per_second": 6261.487 + }, + { + "epoch": 0.33780115211078426, + "grad_norm": 0.2466982901096344, + "learning_rate": 5e-06, + "loss": 1.0205, + "num_input_tokens_seen": 126892704, + "step": 279, + "train_runtime": 20269.3446, + "train_tokens_per_second": 6260.326 + }, + { + "epoch": 0.3390119089283856, + "grad_norm": 0.23624956607818604, + "learning_rate": 5e-06, + "loss": 0.9984, + "num_input_tokens_seen": 127356472, + "step": 280, + "train_runtime": 20345.7248, + "train_tokens_per_second": 6259.618 + }, + { + "epoch": 0.34022266574598703, + "grad_norm": 0.23681671917438507, + "learning_rate": 5e-06, + "loss": 1.0242, + "num_input_tokens_seen": 127815984, + "step": 281, + "train_runtime": 20421.2658, + "train_tokens_per_second": 6258.965 + }, + { + "epoch": 0.3414334225635884, + "grad_norm": 0.22485695779323578, + "learning_rate": 5e-06, + "loss": 1.0365, + "num_input_tokens_seen": 128272520, + "step": 282, + "train_runtime": 20496.3233, + "train_tokens_per_second": 6258.319 + }, + { + "epoch": 0.34264417938118974, + "grad_norm": 0.2541932165622711, + "learning_rate": 5e-06, + "loss": 0.9823, + "num_input_tokens_seen": 128699760, + "step": 283, + "train_runtime": 20566.2431, + "train_tokens_per_second": 6257.816 + }, + { + "epoch": 0.34385493619879115, + "grad_norm": 0.22476626932621002, + "learning_rate": 5e-06, + "loss": 1.0021, + "num_input_tokens_seen": 129169888, + "step": 284, + "train_runtime": 20643.7442, + "train_tokens_per_second": 6257.096 + }, + { + "epoch": 0.3450656930163925, + "grad_norm": 0.24232985079288483, + "learning_rate": 5e-06, + "loss": 0.9942, + "num_input_tokens_seen": 129627816, + "step": 285, + "train_runtime": 20724.2188, + "train_tokens_per_second": 6254.895 + }, + { + "epoch": 0.3462764498339939, + "grad_norm": 0.23191998898983002, + "learning_rate": 5e-06, + "loss": 1.0246, + "num_input_tokens_seen": 130087264, + "step": 286, + "train_runtime": 20798.7497, + "train_tokens_per_second": 6254.571 + }, + { + "epoch": 0.34748720665159527, + "grad_norm": 0.2423601895570755, + "learning_rate": 5e-06, + "loss": 0.9571, + "num_input_tokens_seen": 130552064, + "step": 287, + "train_runtime": 20871.3382, + "train_tokens_per_second": 6255.088 + }, + { + "epoch": 0.3486979634691966, + "grad_norm": 0.3263372480869293, + "learning_rate": 5e-06, + "loss": 1.0073, + "num_input_tokens_seen": 131001008, + "step": 288, + "train_runtime": 20941.405, + "train_tokens_per_second": 6255.598 + }, + { + "epoch": 0.34990872028679804, + "grad_norm": 0.2425222098827362, + "learning_rate": 5e-06, + "loss": 1.0397, + "num_input_tokens_seen": 131456016, + "step": 289, + "train_runtime": 21012.6636, + "train_tokens_per_second": 6256.038 + }, + { + "epoch": 0.3511194771043994, + "grad_norm": 0.24094624817371368, + "learning_rate": 5e-06, + "loss": 0.9737, + "num_input_tokens_seen": 131911216, + "step": 290, + "train_runtime": 21083.6278, + "train_tokens_per_second": 6256.571 + }, + { + "epoch": 0.3523302339220008, + "grad_norm": 0.2286059558391571, + "learning_rate": 5e-06, + "loss": 0.9598, + "num_input_tokens_seen": 132364240, + "step": 291, + "train_runtime": 21153.9443, + "train_tokens_per_second": 6257.19 + }, + { + "epoch": 0.35354099073960216, + "grad_norm": 0.22142821550369263, + "learning_rate": 5e-06, + "loss": 0.9791, + "num_input_tokens_seen": 132820456, + "step": 292, + "train_runtime": 21225.0777, + "train_tokens_per_second": 6257.714 + }, + { + "epoch": 0.3547517475572035, + "grad_norm": 0.25561171770095825, + "learning_rate": 5e-06, + "loss": 1.0434, + "num_input_tokens_seen": 133258016, + "step": 293, + "train_runtime": 21293.1518, + "train_tokens_per_second": 6258.257 + }, + { + "epoch": 0.3559625043748049, + "grad_norm": 0.23531781136989594, + "learning_rate": 5e-06, + "loss": 1.0001, + "num_input_tokens_seen": 133723576, + "step": 294, + "train_runtime": 21366.0225, + "train_tokens_per_second": 6258.702 + }, + { + "epoch": 0.3571732611924063, + "grad_norm": 0.22105760872364044, + "learning_rate": 5e-06, + "loss": 1.0213, + "num_input_tokens_seen": 134179400, + "step": 295, + "train_runtime": 21437.4884, + "train_tokens_per_second": 6259.101 + }, + { + "epoch": 0.3583840180100077, + "grad_norm": 0.26079460978507996, + "learning_rate": 5e-06, + "loss": 1.0322, + "num_input_tokens_seen": 134624872, + "step": 296, + "train_runtime": 21507.5955, + "train_tokens_per_second": 6259.411 + }, + { + "epoch": 0.35959477482760904, + "grad_norm": 0.2267124503850937, + "learning_rate": 5e-06, + "loss": 0.9923, + "num_input_tokens_seen": 135108584, + "step": 297, + "train_runtime": 21582.4999, + "train_tokens_per_second": 6260.099 + }, + { + "epoch": 0.3608055316452104, + "grad_norm": 0.247776061296463, + "learning_rate": 5e-06, + "loss": 0.9913, + "num_input_tokens_seen": 135578000, + "step": 298, + "train_runtime": 21655.9937, + "train_tokens_per_second": 6260.53 + }, + { + "epoch": 0.3620162884628118, + "grad_norm": 0.23508575558662415, + "learning_rate": 5e-06, + "loss": 0.9601, + "num_input_tokens_seen": 136004656, + "step": 299, + "train_runtime": 21722.55, + "train_tokens_per_second": 6260.989 + }, + { + "epoch": 0.36322704528041316, + "grad_norm": 0.25533682107925415, + "learning_rate": 5e-06, + "loss": 1.0357, + "num_input_tokens_seen": 136462536, + "step": 300, + "train_runtime": 21793.1578, + "train_tokens_per_second": 6261.715 + }, + { + "epoch": 0.3644378020980146, + "grad_norm": 0.2101793736219406, + "learning_rate": 5e-06, + "loss": 0.9743, + "num_input_tokens_seen": 136933744, + "step": 301, + "train_runtime": 21866.9264, + "train_tokens_per_second": 6262.14 + }, + { + "epoch": 0.36564855891561593, + "grad_norm": 0.2493451088666916, + "learning_rate": 5e-06, + "loss": 0.973, + "num_input_tokens_seen": 137387912, + "step": 302, + "train_runtime": 21938.1245, + "train_tokens_per_second": 6262.519 + }, + { + "epoch": 0.3668593157332173, + "grad_norm": 0.23311975598335266, + "learning_rate": 5e-06, + "loss": 1.0086, + "num_input_tokens_seen": 137841208, + "step": 303, + "train_runtime": 22008.7483, + "train_tokens_per_second": 6263.019 + }, + { + "epoch": 0.3680700725508187, + "grad_norm": 0.2377161979675293, + "learning_rate": 5e-06, + "loss": 1.0391, + "num_input_tokens_seen": 138299912, + "step": 304, + "train_runtime": 22080.6387, + "train_tokens_per_second": 6263.402 + }, + { + "epoch": 0.36928082936842005, + "grad_norm": 0.23572410643100739, + "learning_rate": 5e-06, + "loss": 1.0402, + "num_input_tokens_seen": 138739904, + "step": 305, + "train_runtime": 22152.1917, + "train_tokens_per_second": 6263.033 + }, + { + "epoch": 0.37049158618602146, + "grad_norm": 0.24770863354206085, + "learning_rate": 5e-06, + "loss": 1.0045, + "num_input_tokens_seen": 139197120, + "step": 306, + "train_runtime": 22227.4884, + "train_tokens_per_second": 6262.386 + }, + { + "epoch": 0.3717023430036228, + "grad_norm": 0.2456834316253662, + "learning_rate": 5e-06, + "loss": 1.0049, + "num_input_tokens_seen": 139637016, + "step": 307, + "train_runtime": 22300.7598, + "train_tokens_per_second": 6261.536 + }, + { + "epoch": 0.37291309982122417, + "grad_norm": 0.23433266580104828, + "learning_rate": 5e-06, + "loss": 0.9637, + "num_input_tokens_seen": 140089624, + "step": 308, + "train_runtime": 22375.468, + "train_tokens_per_second": 6260.858 + }, + { + "epoch": 0.3741238566388256, + "grad_norm": 0.28043490648269653, + "learning_rate": 5e-06, + "loss": 1.0334, + "num_input_tokens_seen": 140517536, + "step": 309, + "train_runtime": 22445.9968, + "train_tokens_per_second": 6260.249 + }, + { + "epoch": 0.37533461345642694, + "grad_norm": 0.26074591279029846, + "learning_rate": 5e-06, + "loss": 0.9666, + "num_input_tokens_seen": 140988184, + "step": 310, + "train_runtime": 22523.6032, + "train_tokens_per_second": 6259.575 + }, + { + "epoch": 0.37654537027402835, + "grad_norm": 0.2182447761297226, + "learning_rate": 5e-06, + "loss": 1.0189, + "num_input_tokens_seen": 141453976, + "step": 311, + "train_runtime": 22600.0411, + "train_tokens_per_second": 6259.014 + }, + { + "epoch": 0.3777561270916297, + "grad_norm": 0.30261749029159546, + "learning_rate": 5e-06, + "loss": 0.9974, + "num_input_tokens_seen": 141907888, + "step": 312, + "train_runtime": 22674.4979, + "train_tokens_per_second": 6258.48 + }, + { + "epoch": 0.37896688390923106, + "grad_norm": 0.2571166753768921, + "learning_rate": 5e-06, + "loss": 1.0201, + "num_input_tokens_seen": 142370032, + "step": 313, + "train_runtime": 22750.3178, + "train_tokens_per_second": 6257.936 + }, + { + "epoch": 0.38017764072683247, + "grad_norm": 0.23346489667892456, + "learning_rate": 5e-06, + "loss": 0.9982, + "num_input_tokens_seen": 142807320, + "step": 314, + "train_runtime": 22822.3169, + "train_tokens_per_second": 6257.354 + }, + { + "epoch": 0.3813883975444338, + "grad_norm": 0.23612311482429504, + "learning_rate": 5e-06, + "loss": 1.0125, + "num_input_tokens_seen": 143261672, + "step": 315, + "train_runtime": 22897.0863, + "train_tokens_per_second": 6256.764 + }, + { + "epoch": 0.38259915436203523, + "grad_norm": 0.26001793146133423, + "learning_rate": 5e-06, + "loss": 0.9806, + "num_input_tokens_seen": 143708208, + "step": 316, + "train_runtime": 22971.0115, + "train_tokens_per_second": 6256.068 + }, + { + "epoch": 0.3838099111796366, + "grad_norm": 0.26588013768196106, + "learning_rate": 5e-06, + "loss": 1.043, + "num_input_tokens_seen": 144159888, + "step": 317, + "train_runtime": 23045.4895, + "train_tokens_per_second": 6255.449 + }, + { + "epoch": 0.38502066799723794, + "grad_norm": 0.24810902774333954, + "learning_rate": 5e-06, + "loss": 1.0293, + "num_input_tokens_seen": 144607736, + "step": 318, + "train_runtime": 23121.9728, + "train_tokens_per_second": 6254.126 + }, + { + "epoch": 0.38623142481483935, + "grad_norm": 0.25210660696029663, + "learning_rate": 5e-06, + "loss": 1.0545, + "num_input_tokens_seen": 145045848, + "step": 319, + "train_runtime": 23191.7625, + "train_tokens_per_second": 6254.197 + }, + { + "epoch": 0.3874421816324407, + "grad_norm": 0.2451591044664383, + "learning_rate": 5e-06, + "loss": 0.9838, + "num_input_tokens_seen": 145531920, + "step": 320, + "train_runtime": 23271.4709, + "train_tokens_per_second": 6253.662 + }, + { + "epoch": 0.3886529384500421, + "grad_norm": 0.29514279961586, + "learning_rate": 5e-06, + "loss": 0.9623, + "num_input_tokens_seen": 145970160, + "step": 321, + "train_runtime": 23345.6996, + "train_tokens_per_second": 6252.55 + }, + { + "epoch": 0.3898636952676435, + "grad_norm": 0.270550936460495, + "learning_rate": 5e-06, + "loss": 1.0351, + "num_input_tokens_seen": 146432040, + "step": 322, + "train_runtime": 23422.6662, + "train_tokens_per_second": 6251.724 + }, + { + "epoch": 0.39107445208524483, + "grad_norm": 0.23111458122730255, + "learning_rate": 5e-06, + "loss": 0.9737, + "num_input_tokens_seen": 146886712, + "step": 323, + "train_runtime": 23498.9622, + "train_tokens_per_second": 6250.774 + }, + { + "epoch": 0.39228520890284624, + "grad_norm": 0.22839005291461945, + "learning_rate": 5e-06, + "loss": 0.9722, + "num_input_tokens_seen": 147355208, + "step": 324, + "train_runtime": 23576.9231, + "train_tokens_per_second": 6249.976 + }, + { + "epoch": 0.3934959657204476, + "grad_norm": 0.24810221791267395, + "learning_rate": 5e-06, + "loss": 0.9866, + "num_input_tokens_seen": 147805720, + "step": 325, + "train_runtime": 23652.1704, + "train_tokens_per_second": 6249.14 + }, + { + "epoch": 0.394706722538049, + "grad_norm": 0.23154482245445251, + "learning_rate": 5e-06, + "loss": 0.9954, + "num_input_tokens_seen": 148273872, + "step": 326, + "train_runtime": 23729.723, + "train_tokens_per_second": 6248.445 + }, + { + "epoch": 0.39591747935565036, + "grad_norm": 0.3031870126724243, + "learning_rate": 5e-06, + "loss": 1.0066, + "num_input_tokens_seen": 148721464, + "step": 327, + "train_runtime": 23801.2103, + "train_tokens_per_second": 6248.483 + }, + { + "epoch": 0.3971282361732517, + "grad_norm": 0.2704046666622162, + "learning_rate": 5e-06, + "loss": 0.9968, + "num_input_tokens_seen": 149168824, + "step": 328, + "train_runtime": 23872.2639, + "train_tokens_per_second": 6248.625 + }, + { + "epoch": 0.3983389929908531, + "grad_norm": 0.2855125069618225, + "learning_rate": 5e-06, + "loss": 1.041, + "num_input_tokens_seen": 149608712, + "step": 329, + "train_runtime": 23941.7145, + "train_tokens_per_second": 6248.872 + }, + { + "epoch": 0.3995497498084545, + "grad_norm": 0.24565830826759338, + "learning_rate": 5e-06, + "loss": 0.9534, + "num_input_tokens_seen": 150072952, + "step": 330, + "train_runtime": 24015.5471, + "train_tokens_per_second": 6248.992 + }, + { + "epoch": 0.4007605066260559, + "grad_norm": 0.22240781784057617, + "learning_rate": 5e-06, + "loss": 1.0183, + "num_input_tokens_seen": 150529952, + "step": 331, + "train_runtime": 24088.1498, + "train_tokens_per_second": 6249.129 + }, + { + "epoch": 0.40197126344365725, + "grad_norm": 0.25719258189201355, + "learning_rate": 5e-06, + "loss": 0.994, + "num_input_tokens_seen": 150974144, + "step": 332, + "train_runtime": 24159.1836, + "train_tokens_per_second": 6249.141 + }, + { + "epoch": 0.4031820202612586, + "grad_norm": 0.23377108573913574, + "learning_rate": 5e-06, + "loss": 1.0007, + "num_input_tokens_seen": 151437784, + "step": 333, + "train_runtime": 24232.9718, + "train_tokens_per_second": 6249.245 + }, + { + "epoch": 0.40439277707886, + "grad_norm": 0.256849467754364, + "learning_rate": 5e-06, + "loss": 1.0027, + "num_input_tokens_seen": 151890232, + "step": 334, + "train_runtime": 24304.7562, + "train_tokens_per_second": 6249.404 + }, + { + "epoch": 0.40560353389646137, + "grad_norm": 0.23702043294906616, + "learning_rate": 5e-06, + "loss": 1.0201, + "num_input_tokens_seen": 152334376, + "step": 335, + "train_runtime": 24375.494, + "train_tokens_per_second": 6249.489 + }, + { + "epoch": 0.4068142907140628, + "grad_norm": 0.23365221917629242, + "learning_rate": 5e-06, + "loss": 1.0297, + "num_input_tokens_seen": 152790664, + "step": 336, + "train_runtime": 24447.9699, + "train_tokens_per_second": 6249.626 + }, + { + "epoch": 0.40802504753166413, + "grad_norm": 0.23382526636123657, + "learning_rate": 5e-06, + "loss": 1.0004, + "num_input_tokens_seen": 153244056, + "step": 337, + "train_runtime": 24520.5163, + "train_tokens_per_second": 6249.626 + }, + { + "epoch": 0.4092358043492655, + "grad_norm": 0.22636200487613678, + "learning_rate": 5e-06, + "loss": 1.0144, + "num_input_tokens_seen": 153706824, + "step": 338, + "train_runtime": 24594.0224, + "train_tokens_per_second": 6249.764 + }, + { + "epoch": 0.4104465611668669, + "grad_norm": 0.23086538910865784, + "learning_rate": 5e-06, + "loss": 0.9871, + "num_input_tokens_seen": 154147192, + "step": 339, + "train_runtime": 24663.9908, + "train_tokens_per_second": 6249.888 + }, + { + "epoch": 0.41165731798446825, + "grad_norm": 0.25210967659950256, + "learning_rate": 5e-06, + "loss": 1.02, + "num_input_tokens_seen": 154601576, + "step": 340, + "train_runtime": 24736.4881, + "train_tokens_per_second": 6249.94 + }, + { + "epoch": 0.41286807480206966, + "grad_norm": 0.24582870304584503, + "learning_rate": 5e-06, + "loss": 1.0406, + "num_input_tokens_seen": 155050072, + "step": 341, + "train_runtime": 24807.9176, + "train_tokens_per_second": 6250.024 + }, + { + "epoch": 0.414078831619671, + "grad_norm": 0.2524389326572418, + "learning_rate": 5e-06, + "loss": 1.0108, + "num_input_tokens_seen": 155484872, + "step": 342, + "train_runtime": 24876.7608, + "train_tokens_per_second": 6250.206 + }, + { + "epoch": 0.4152895884372724, + "grad_norm": 0.24597734212875366, + "learning_rate": 5e-06, + "loss": 1.0103, + "num_input_tokens_seen": 155935768, + "step": 343, + "train_runtime": 24948.3799, + "train_tokens_per_second": 6250.336 + }, + { + "epoch": 0.4165003452548738, + "grad_norm": 0.2275368720293045, + "learning_rate": 5e-06, + "loss": 1.0013, + "num_input_tokens_seen": 156401120, + "step": 344, + "train_runtime": 25022.1853, + "train_tokens_per_second": 6250.498 + }, + { + "epoch": 0.41771110207247514, + "grad_norm": 0.22949494421482086, + "learning_rate": 5e-06, + "loss": 1.0111, + "num_input_tokens_seen": 156864272, + "step": 345, + "train_runtime": 25095.8034, + "train_tokens_per_second": 6250.618 + }, + { + "epoch": 0.41892185889007655, + "grad_norm": 0.23165899515151978, + "learning_rate": 5e-06, + "loss": 1.0014, + "num_input_tokens_seen": 157317000, + "step": 346, + "train_runtime": 25167.8027, + "train_tokens_per_second": 6250.724 + }, + { + "epoch": 0.4201326157076779, + "grad_norm": 0.23215775191783905, + "learning_rate": 5e-06, + "loss": 0.9639, + "num_input_tokens_seen": 157785416, + "step": 347, + "train_runtime": 25242.0286, + "train_tokens_per_second": 6250.901 + }, + { + "epoch": 0.42134337252527926, + "grad_norm": 0.23086605966091156, + "learning_rate": 5e-06, + "loss": 1.034, + "num_input_tokens_seen": 158244744, + "step": 348, + "train_runtime": 25315.4472, + "train_tokens_per_second": 6250.916 + }, + { + "epoch": 0.42255412934288067, + "grad_norm": 0.2317984402179718, + "learning_rate": 5e-06, + "loss": 1.0313, + "num_input_tokens_seen": 158699784, + "step": 349, + "train_runtime": 25387.642, + "train_tokens_per_second": 6251.064 + }, + { + "epoch": 0.423764886160482, + "grad_norm": 0.2463163435459137, + "learning_rate": 5e-06, + "loss": 0.9835, + "num_input_tokens_seen": 159153432, + "step": 350, + "train_runtime": 25459.746, + "train_tokens_per_second": 6251.179 + }, + { + "epoch": 0.42497564297808343, + "grad_norm": 0.2302168309688568, + "learning_rate": 5e-06, + "loss": 0.9666, + "num_input_tokens_seen": 159597184, + "step": 351, + "train_runtime": 25530.6729, + "train_tokens_per_second": 6251.194 + }, + { + "epoch": 0.4261863997956848, + "grad_norm": 0.24311944842338562, + "learning_rate": 5e-06, + "loss": 0.9938, + "num_input_tokens_seen": 160056872, + "step": 352, + "train_runtime": 25603.679, + "train_tokens_per_second": 6251.323 + }, + { + "epoch": 0.42739715661328614, + "grad_norm": 0.24332423508167267, + "learning_rate": 5e-06, + "loss": 1.0026, + "num_input_tokens_seen": 160498144, + "step": 353, + "train_runtime": 25673.5562, + "train_tokens_per_second": 6251.496 + }, + { + "epoch": 0.42860791343088755, + "grad_norm": 0.2577798664569855, + "learning_rate": 5e-06, + "loss": 0.946, + "num_input_tokens_seen": 160952520, + "step": 354, + "train_runtime": 25746.1174, + "train_tokens_per_second": 6251.526 + }, + { + "epoch": 0.4298186702484889, + "grad_norm": 0.24245211482048035, + "learning_rate": 5e-06, + "loss": 1.0504, + "num_input_tokens_seen": 161409016, + "step": 355, + "train_runtime": 25819.2295, + "train_tokens_per_second": 6251.504 + }, + { + "epoch": 0.4310294270660903, + "grad_norm": 0.23425163328647614, + "learning_rate": 5e-06, + "loss": 1.0225, + "num_input_tokens_seen": 161870752, + "step": 356, + "train_runtime": 25892.4963, + "train_tokens_per_second": 6251.647 + }, + { + "epoch": 0.4322401838836917, + "grad_norm": 0.2525038421154022, + "learning_rate": 5e-06, + "loss": 1.0433, + "num_input_tokens_seen": 162296568, + "step": 357, + "train_runtime": 25959.8505, + "train_tokens_per_second": 6251.83 + }, + { + "epoch": 0.43345094070129303, + "grad_norm": 0.2417079657316208, + "learning_rate": 5e-06, + "loss": 1.0091, + "num_input_tokens_seen": 162731720, + "step": 358, + "train_runtime": 26028.6681, + "train_tokens_per_second": 6252.019 + }, + { + "epoch": 0.43466169751889444, + "grad_norm": 0.24416188895702362, + "learning_rate": 5e-06, + "loss": 0.9745, + "num_input_tokens_seen": 163191568, + "step": 359, + "train_runtime": 26101.7213, + "train_tokens_per_second": 6252.138 + }, + { + "epoch": 0.4358724543364958, + "grad_norm": 0.2705591022968292, + "learning_rate": 5e-06, + "loss": 1.005, + "num_input_tokens_seen": 163643952, + "step": 360, + "train_runtime": 26173.8821, + "train_tokens_per_second": 6252.185 + }, + { + "epoch": 0.4370832111540972, + "grad_norm": 0.23336398601531982, + "learning_rate": 5e-06, + "loss": 0.9792, + "num_input_tokens_seen": 164093776, + "step": 361, + "train_runtime": 26245.424, + "train_tokens_per_second": 6252.281 + }, + { + "epoch": 0.43829396797169856, + "grad_norm": 0.22414255142211914, + "learning_rate": 5e-06, + "loss": 0.9672, + "num_input_tokens_seen": 164558224, + "step": 362, + "train_runtime": 26319.3494, + "train_tokens_per_second": 6252.367 + }, + { + "epoch": 0.4395047247892999, + "grad_norm": 0.22132380306720734, + "learning_rate": 5e-06, + "loss": 0.9457, + "num_input_tokens_seen": 165025024, + "step": 363, + "train_runtime": 26394.08, + "train_tokens_per_second": 6252.35 + }, + { + "epoch": 0.4407154816069013, + "grad_norm": 0.2500600814819336, + "learning_rate": 5e-06, + "loss": 1.0243, + "num_input_tokens_seen": 165495112, + "step": 364, + "train_runtime": 26469.3304, + "train_tokens_per_second": 6252.335 + }, + { + "epoch": 0.4419262384245027, + "grad_norm": 0.24437642097473145, + "learning_rate": 5e-06, + "loss": 0.9971, + "num_input_tokens_seen": 165959744, + "step": 365, + "train_runtime": 26543.6441, + "train_tokens_per_second": 6252.335 + }, + { + "epoch": 0.4431369952421041, + "grad_norm": 0.2317400872707367, + "learning_rate": 5e-06, + "loss": 0.9962, + "num_input_tokens_seen": 166425752, + "step": 366, + "train_runtime": 26617.8184, + "train_tokens_per_second": 6252.419 + }, + { + "epoch": 0.44434775205970545, + "grad_norm": 0.22997960448265076, + "learning_rate": 5e-06, + "loss": 1.0282, + "num_input_tokens_seen": 166881504, + "step": 367, + "train_runtime": 26690.5854, + "train_tokens_per_second": 6252.448 + }, + { + "epoch": 0.4455585088773068, + "grad_norm": 0.2334347665309906, + "learning_rate": 5e-06, + "loss": 1.0253, + "num_input_tokens_seen": 167353128, + "step": 368, + "train_runtime": 26766.4126, + "train_tokens_per_second": 6252.356 + }, + { + "epoch": 0.4467692656949082, + "grad_norm": 0.23148049414157867, + "learning_rate": 5e-06, + "loss": 0.9588, + "num_input_tokens_seen": 167805976, + "step": 369, + "train_runtime": 26837.3465, + "train_tokens_per_second": 6252.704 + }, + { + "epoch": 0.44798002251250957, + "grad_norm": 0.2629753649234772, + "learning_rate": 5e-06, + "loss": 0.9514, + "num_input_tokens_seen": 168248368, + "step": 370, + "train_runtime": 26906.0429, + "train_tokens_per_second": 6253.181 + }, + { + "epoch": 0.449190779330111, + "grad_norm": 0.2621021568775177, + "learning_rate": 5e-06, + "loss": 0.991, + "num_input_tokens_seen": 168693912, + "step": 371, + "train_runtime": 26975.4421, + "train_tokens_per_second": 6253.611 + }, + { + "epoch": 0.45040153614771233, + "grad_norm": 0.2458389848470688, + "learning_rate": 5e-06, + "loss": 0.9837, + "num_input_tokens_seen": 169157520, + "step": 372, + "train_runtime": 27047.241, + "train_tokens_per_second": 6254.151 + }, + { + "epoch": 0.4516122929653137, + "grad_norm": 0.22616301476955414, + "learning_rate": 5e-06, + "loss": 0.9584, + "num_input_tokens_seen": 169629008, + "step": 373, + "train_runtime": 27120.6769, + "train_tokens_per_second": 6254.601 + }, + { + "epoch": 0.4528230497829151, + "grad_norm": 0.28033509850502014, + "learning_rate": 5e-06, + "loss": 0.9869, + "num_input_tokens_seen": 170096328, + "step": 374, + "train_runtime": 27193.4201, + "train_tokens_per_second": 6255.055 + }, + { + "epoch": 0.45403380660051645, + "grad_norm": 0.2582356035709381, + "learning_rate": 5e-06, + "loss": 0.9658, + "num_input_tokens_seen": 170530368, + "step": 375, + "train_runtime": 27261.0916, + "train_tokens_per_second": 6255.449 + }, + { + "epoch": 0.45524456341811786, + "grad_norm": 0.26356765627861023, + "learning_rate": 5e-06, + "loss": 0.954, + "num_input_tokens_seen": 170977704, + "step": 376, + "train_runtime": 27330.4355, + "train_tokens_per_second": 6255.945 + }, + { + "epoch": 0.4564553202357192, + "grad_norm": 0.2806834280490875, + "learning_rate": 5e-06, + "loss": 0.9188, + "num_input_tokens_seen": 171404840, + "step": 377, + "train_runtime": 27396.163, + "train_tokens_per_second": 6256.527 + }, + { + "epoch": 0.4576660770533206, + "grad_norm": 0.24835824966430664, + "learning_rate": 5e-06, + "loss": 1.0034, + "num_input_tokens_seen": 171863528, + "step": 378, + "train_runtime": 27467.6453, + "train_tokens_per_second": 6256.944 + }, + { + "epoch": 0.458876833870922, + "grad_norm": 0.24917422235012054, + "learning_rate": 5e-06, + "loss": 1.0091, + "num_input_tokens_seen": 172306160, + "step": 379, + "train_runtime": 27537.144, + "train_tokens_per_second": 6257.227 + }, + { + "epoch": 0.46008759068852334, + "grad_norm": 0.24879835546016693, + "learning_rate": 5e-06, + "loss": 1.0534, + "num_input_tokens_seen": 172762776, + "step": 380, + "train_runtime": 27607.7602, + "train_tokens_per_second": 6257.761 + }, + { + "epoch": 0.46129834750612475, + "grad_norm": 0.2425055056810379, + "learning_rate": 5e-06, + "loss": 0.9974, + "num_input_tokens_seen": 173211600, + "step": 381, + "train_runtime": 27677.4709, + "train_tokens_per_second": 6258.216 + }, + { + "epoch": 0.4625091043237261, + "grad_norm": 0.23279421031475067, + "learning_rate": 5e-06, + "loss": 1.0132, + "num_input_tokens_seen": 173650080, + "step": 382, + "train_runtime": 27745.2426, + "train_tokens_per_second": 6258.734 + }, + { + "epoch": 0.46371986114132746, + "grad_norm": 0.23731283843517303, + "learning_rate": 5e-06, + "loss": 1.0208, + "num_input_tokens_seen": 174115976, + "step": 383, + "train_runtime": 27817.775, + "train_tokens_per_second": 6259.163 + }, + { + "epoch": 0.46493061795892887, + "grad_norm": 0.2498994767665863, + "learning_rate": 5e-06, + "loss": 0.9958, + "num_input_tokens_seen": 174583112, + "step": 384, + "train_runtime": 27890.6884, + "train_tokens_per_second": 6259.548 + }, + { + "epoch": 0.4661413747765302, + "grad_norm": 0.21462289988994598, + "learning_rate": 5e-06, + "loss": 0.957, + "num_input_tokens_seen": 175059472, + "step": 385, + "train_runtime": 27965.0646, + "train_tokens_per_second": 6259.934 + }, + { + "epoch": 0.46735213159413164, + "grad_norm": 0.2454395592212677, + "learning_rate": 5e-06, + "loss": 0.9569, + "num_input_tokens_seen": 175520768, + "step": 386, + "train_runtime": 28036.9974, + "train_tokens_per_second": 6260.327 + }, + { + "epoch": 0.468562888411733, + "grad_norm": 0.2549636960029602, + "learning_rate": 5e-06, + "loss": 0.9632, + "num_input_tokens_seen": 175947120, + "step": 387, + "train_runtime": 28103.0751, + "train_tokens_per_second": 6260.778 + }, + { + "epoch": 0.46977364522933435, + "grad_norm": 0.22117368876934052, + "learning_rate": 5e-06, + "loss": 1.0324, + "num_input_tokens_seen": 176416712, + "step": 388, + "train_runtime": 28176.0037, + "train_tokens_per_second": 6261.24 + }, + { + "epoch": 0.47098440204693576, + "grad_norm": 0.24724611639976501, + "learning_rate": 5e-06, + "loss": 0.9896, + "num_input_tokens_seen": 176866424, + "step": 389, + "train_runtime": 28246.1355, + "train_tokens_per_second": 6261.615 + }, + { + "epoch": 0.4721951588645371, + "grad_norm": 0.23016729950904846, + "learning_rate": 5e-06, + "loss": 0.9615, + "num_input_tokens_seen": 177343216, + "step": 390, + "train_runtime": 28320.9312, + "train_tokens_per_second": 6261.913 + }, + { + "epoch": 0.4734059156821385, + "grad_norm": 0.2248724400997162, + "learning_rate": 5e-06, + "loss": 0.9356, + "num_input_tokens_seen": 177768280, + "step": 391, + "train_runtime": 28386.5413, + "train_tokens_per_second": 6262.414 + }, + { + "epoch": 0.4746166724997399, + "grad_norm": 0.26315781474113464, + "learning_rate": 5e-06, + "loss": 0.9978, + "num_input_tokens_seen": 178212008, + "step": 392, + "train_runtime": 28456.242, + "train_tokens_per_second": 6262.668 + }, + { + "epoch": 0.47582742931734123, + "grad_norm": 0.24355779588222504, + "learning_rate": 5e-06, + "loss": 1.0061, + "num_input_tokens_seen": 178671232, + "step": 393, + "train_runtime": 28527.8799, + "train_tokens_per_second": 6263.039 + }, + { + "epoch": 0.47703818613494264, + "grad_norm": 0.21970634162425995, + "learning_rate": 5e-06, + "loss": 0.9588, + "num_input_tokens_seen": 179130472, + "step": 394, + "train_runtime": 28599.515, + "train_tokens_per_second": 6263.409 + }, + { + "epoch": 0.478248942952544, + "grad_norm": 0.25734594464302063, + "learning_rate": 5e-06, + "loss": 1.1103, + "num_input_tokens_seen": 179589296, + "step": 395, + "train_runtime": 28671.1821, + "train_tokens_per_second": 6263.756 + }, + { + "epoch": 0.4794596997701454, + "grad_norm": 0.22498640418052673, + "learning_rate": 5e-06, + "loss": 0.9638, + "num_input_tokens_seen": 180039760, + "step": 396, + "train_runtime": 28741.3132, + "train_tokens_per_second": 6264.145 + }, + { + "epoch": 0.48067045658774676, + "grad_norm": 0.23484832048416138, + "learning_rate": 5e-06, + "loss": 0.9852, + "num_input_tokens_seen": 180466416, + "step": 397, + "train_runtime": 28807.6188, + "train_tokens_per_second": 6264.538 + }, + { + "epoch": 0.4818812134053481, + "grad_norm": 0.23096151649951935, + "learning_rate": 5e-06, + "loss": 0.9901, + "num_input_tokens_seen": 180934584, + "step": 398, + "train_runtime": 28880.5387, + "train_tokens_per_second": 6264.931 + }, + { + "epoch": 0.48309197022294953, + "grad_norm": 0.29461684823036194, + "learning_rate": 5e-06, + "loss": 0.9985, + "num_input_tokens_seen": 181383840, + "step": 399, + "train_runtime": 28950.6138, + "train_tokens_per_second": 6265.285 + }, + { + "epoch": 0.4843027270405509, + "grad_norm": 0.24854110181331635, + "learning_rate": 5e-06, + "loss": 1.022, + "num_input_tokens_seen": 181833776, + "step": 400, + "train_runtime": 29020.8816, + "train_tokens_per_second": 6265.619 + }, + { + "epoch": 0.4855134838581523, + "grad_norm": 0.22923749685287476, + "learning_rate": 5e-06, + "loss": 0.9995, + "num_input_tokens_seen": 182289656, + "step": 401, + "train_runtime": 29091.6013, + "train_tokens_per_second": 6266.058 + }, + { + "epoch": 0.48672424067575365, + "grad_norm": 0.23606517910957336, + "learning_rate": 5e-06, + "loss": 0.9335, + "num_input_tokens_seen": 182741656, + "step": 402, + "train_runtime": 29162.397, + "train_tokens_per_second": 6266.346 + }, + { + "epoch": 0.487934997493355, + "grad_norm": 0.2514527142047882, + "learning_rate": 5e-06, + "loss": 0.95, + "num_input_tokens_seen": 183208448, + "step": 403, + "train_runtime": 29235.1663, + "train_tokens_per_second": 6266.715 + }, + { + "epoch": 0.4891457543109564, + "grad_norm": 0.23453983664512634, + "learning_rate": 5e-06, + "loss": 0.9837, + "num_input_tokens_seen": 183665416, + "step": 404, + "train_runtime": 29306.3034, + "train_tokens_per_second": 6267.096 + }, + { + "epoch": 0.49035651112855777, + "grad_norm": 0.23354077339172363, + "learning_rate": 5e-06, + "loss": 1.0109, + "num_input_tokens_seen": 184118688, + "step": 405, + "train_runtime": 29376.2962, + "train_tokens_per_second": 6267.594 + }, + { + "epoch": 0.4915672679461592, + "grad_norm": 0.2359265685081482, + "learning_rate": 5e-06, + "loss": 0.9647, + "num_input_tokens_seen": 184576128, + "step": 406, + "train_runtime": 29447.5204, + "train_tokens_per_second": 6267.968 + }, + { + "epoch": 0.49277802476376054, + "grad_norm": 0.23804575204849243, + "learning_rate": 5e-06, + "loss": 1.0444, + "num_input_tokens_seen": 185032912, + "step": 407, + "train_runtime": 29518.842, + "train_tokens_per_second": 6268.298 + }, + { + "epoch": 0.4939887815813619, + "grad_norm": 0.26842811703681946, + "learning_rate": 5e-06, + "loss": 1.0753, + "num_input_tokens_seen": 185474400, + "step": 408, + "train_runtime": 29588.0213, + "train_tokens_per_second": 6268.564 + }, + { + "epoch": 0.4951995383989633, + "grad_norm": 0.2470535784959793, + "learning_rate": 5e-06, + "loss": 1.0522, + "num_input_tokens_seen": 185926928, + "step": 409, + "train_runtime": 29658.4068, + "train_tokens_per_second": 6268.945 + }, + { + "epoch": 0.49641029521656466, + "grad_norm": 0.2313876450061798, + "learning_rate": 5e-06, + "loss": 0.9896, + "num_input_tokens_seen": 186395976, + "step": 410, + "train_runtime": 29731.4655, + "train_tokens_per_second": 6269.317 + }, + { + "epoch": 0.49762105203416607, + "grad_norm": 0.2276448905467987, + "learning_rate": 5e-06, + "loss": 1.0273, + "num_input_tokens_seen": 186855720, + "step": 411, + "train_runtime": 29803.2261, + "train_tokens_per_second": 6269.647 + }, + { + "epoch": 0.4988318088517674, + "grad_norm": 0.24273553490638733, + "learning_rate": 5e-06, + "loss": 0.9887, + "num_input_tokens_seen": 187303704, + "step": 412, + "train_runtime": 29872.3789, + "train_tokens_per_second": 6270.13 + }, + { + "epoch": 0.5000425656693688, + "grad_norm": 0.22893160581588745, + "learning_rate": 5e-06, + "loss": 0.9927, + "num_input_tokens_seen": 187767288, + "step": 413, + "train_runtime": 29944.5337, + "train_tokens_per_second": 6270.503 + }, + { + "epoch": 0.5012533224869702, + "grad_norm": 0.24135759472846985, + "learning_rate": 5e-06, + "loss": 1.02, + "num_input_tokens_seen": 188215576, + "step": 414, + "train_runtime": 30014.0007, + "train_tokens_per_second": 6270.926 + }, + { + "epoch": 0.5024640793045716, + "grad_norm": 0.22361376881599426, + "learning_rate": 5e-06, + "loss": 0.9666, + "num_input_tokens_seen": 188667128, + "step": 415, + "train_runtime": 30084.01, + "train_tokens_per_second": 6271.342 + }, + { + "epoch": 0.5036748361221729, + "grad_norm": 0.27765095233917236, + "learning_rate": 5e-06, + "loss": 0.9637, + "num_input_tokens_seen": 189113312, + "step": 416, + "train_runtime": 30153.5994, + "train_tokens_per_second": 6271.666 + }, + { + "epoch": 0.5048855929397743, + "grad_norm": 0.2431006133556366, + "learning_rate": 5e-06, + "loss": 1.023, + "num_input_tokens_seen": 189552168, + "step": 417, + "train_runtime": 30221.737, + "train_tokens_per_second": 6272.047 + }, + { + "epoch": 0.5060963497573757, + "grad_norm": 0.23247578740119934, + "learning_rate": 5e-06, + "loss": 0.96, + "num_input_tokens_seen": 190031496, + "step": 418, + "train_runtime": 30296.4767, + "train_tokens_per_second": 6272.396 + }, + { + "epoch": 0.507307106574977, + "grad_norm": 0.2316485345363617, + "learning_rate": 5e-06, + "loss": 1.0319, + "num_input_tokens_seen": 190468688, + "step": 419, + "train_runtime": 30366.2489, + "train_tokens_per_second": 6272.381 + }, + { + "epoch": 0.5085178633925784, + "grad_norm": 0.24219174683094025, + "learning_rate": 5e-06, + "loss": 0.9843, + "num_input_tokens_seen": 190902848, + "step": 420, + "train_runtime": 30433.8527, + "train_tokens_per_second": 6272.714 + }, + { + "epoch": 0.5097286202101798, + "grad_norm": 0.22331832349300385, + "learning_rate": 5e-06, + "loss": 0.9295, + "num_input_tokens_seen": 191365624, + "step": 421, + "train_runtime": 30506.1907, + "train_tokens_per_second": 6273.009 + }, + { + "epoch": 0.5109393770277811, + "grad_norm": 0.24295338988304138, + "learning_rate": 5e-06, + "loss": 0.9981, + "num_input_tokens_seen": 191812256, + "step": 422, + "train_runtime": 30575.7222, + "train_tokens_per_second": 6273.352 + }, + { + "epoch": 0.5121501338453825, + "grad_norm": 0.23116403818130493, + "learning_rate": 5e-06, + "loss": 0.9845, + "num_input_tokens_seen": 192275296, + "step": 423, + "train_runtime": 30647.9598, + "train_tokens_per_second": 6273.674 + }, + { + "epoch": 0.513360890662984, + "grad_norm": 0.38395291566848755, + "learning_rate": 5e-06, + "loss": 0.9968, + "num_input_tokens_seen": 192729088, + "step": 424, + "train_runtime": 30718.5265, + "train_tokens_per_second": 6274.034 + }, + { + "epoch": 0.5145716474805854, + "grad_norm": 0.21122363209724426, + "learning_rate": 5e-06, + "loss": 0.9741, + "num_input_tokens_seen": 193218400, + "step": 425, + "train_runtime": 30795.0296, + "train_tokens_per_second": 6274.337 + }, + { + "epoch": 0.5157824042981867, + "grad_norm": 0.22073934972286224, + "learning_rate": 5e-06, + "loss": 0.9598, + "num_input_tokens_seen": 193675104, + "step": 426, + "train_runtime": 30865.8936, + "train_tokens_per_second": 6274.729 + }, + { + "epoch": 0.5169931611157881, + "grad_norm": 0.2508212924003601, + "learning_rate": 5e-06, + "loss": 0.9413, + "num_input_tokens_seen": 194152520, + "step": 427, + "train_runtime": 30940.0715, + "train_tokens_per_second": 6275.115 + }, + { + "epoch": 0.5182039179333895, + "grad_norm": 0.24162203073501587, + "learning_rate": 5e-06, + "loss": 0.9986, + "num_input_tokens_seen": 194594776, + "step": 428, + "train_runtime": 31008.6981, + "train_tokens_per_second": 6275.49 + }, + { + "epoch": 0.5194146747509908, + "grad_norm": 0.22889398038387299, + "learning_rate": 5e-06, + "loss": 1.006, + "num_input_tokens_seen": 195045984, + "step": 429, + "train_runtime": 31078.8709, + "train_tokens_per_second": 6275.839 + }, + { + "epoch": 0.5206254315685922, + "grad_norm": 0.2539101243019104, + "learning_rate": 5e-06, + "loss": 1.0005, + "num_input_tokens_seen": 195479240, + "step": 430, + "train_runtime": 31146.6426, + "train_tokens_per_second": 6276.093 + }, + { + "epoch": 0.5218361883861936, + "grad_norm": 0.21705974638462067, + "learning_rate": 5e-06, + "loss": 0.9376, + "num_input_tokens_seen": 195959064, + "step": 431, + "train_runtime": 31221.7672, + "train_tokens_per_second": 6276.36 + }, + { + "epoch": 0.5230469452037949, + "grad_norm": 0.22790437936782837, + "learning_rate": 5e-06, + "loss": 0.9948, + "num_input_tokens_seen": 196401264, + "step": 432, + "train_runtime": 31290.4429, + "train_tokens_per_second": 6276.717 + }, + { + "epoch": 0.5242577020213963, + "grad_norm": 0.26201656460762024, + "learning_rate": 5e-06, + "loss": 1.0204, + "num_input_tokens_seen": 196831632, + "step": 433, + "train_runtime": 31357.4731, + "train_tokens_per_second": 6277.025 + }, + { + "epoch": 0.5254684588389977, + "grad_norm": 0.23872381448745728, + "learning_rate": 5e-06, + "loss": 0.98, + "num_input_tokens_seen": 197280792, + "step": 434, + "train_runtime": 31427.348, + "train_tokens_per_second": 6277.36 + }, + { + "epoch": 0.5266792156565991, + "grad_norm": 0.23127026855945587, + "learning_rate": 5e-06, + "loss": 1.0302, + "num_input_tokens_seen": 197738976, + "step": 435, + "train_runtime": 31498.5952, + "train_tokens_per_second": 6277.708 + }, + { + "epoch": 0.5278899724742004, + "grad_norm": 0.23606155812740326, + "learning_rate": 5e-06, + "loss": 1.0139, + "num_input_tokens_seen": 198192464, + "step": 436, + "train_runtime": 31569.309, + "train_tokens_per_second": 6278.011 + }, + { + "epoch": 0.5291007292918019, + "grad_norm": 0.23491834104061127, + "learning_rate": 5e-06, + "loss": 0.9967, + "num_input_tokens_seen": 198667936, + "step": 437, + "train_runtime": 31643.9079, + "train_tokens_per_second": 6278.236 + }, + { + "epoch": 0.5303114861094033, + "grad_norm": 0.21920163929462433, + "learning_rate": 5e-06, + "loss": 0.9938, + "num_input_tokens_seen": 199128912, + "step": 438, + "train_runtime": 31715.309, + "train_tokens_per_second": 6278.637 + }, + { + "epoch": 0.5315222429270046, + "grad_norm": 0.24721209704875946, + "learning_rate": 5e-06, + "loss": 0.9461, + "num_input_tokens_seen": 199581136, + "step": 439, + "train_runtime": 31785.518, + "train_tokens_per_second": 6278.996 + }, + { + "epoch": 0.532732999744606, + "grad_norm": 0.2280053347349167, + "learning_rate": 5e-06, + "loss": 0.9607, + "num_input_tokens_seen": 200043376, + "step": 440, + "train_runtime": 31857.7901, + "train_tokens_per_second": 6279.261 + }, + { + "epoch": 0.5339437565622074, + "grad_norm": 0.23798179626464844, + "learning_rate": 5e-06, + "loss": 1.0175, + "num_input_tokens_seen": 200477576, + "step": 441, + "train_runtime": 31924.9328, + "train_tokens_per_second": 6279.655 + }, + { + "epoch": 0.5351545133798087, + "grad_norm": 0.24441802501678467, + "learning_rate": 5e-06, + "loss": 0.9864, + "num_input_tokens_seen": 200902872, + "step": 442, + "train_runtime": 31991.1171, + "train_tokens_per_second": 6279.958 + }, + { + "epoch": 0.5363652701974101, + "grad_norm": 0.22049540281295776, + "learning_rate": 5e-06, + "loss": 0.9682, + "num_input_tokens_seen": 201374768, + "step": 443, + "train_runtime": 32064.8823, + "train_tokens_per_second": 6280.228 + }, + { + "epoch": 0.5375760270150115, + "grad_norm": 0.26407957077026367, + "learning_rate": 5e-06, + "loss": 1.0439, + "num_input_tokens_seen": 201833576, + "step": 444, + "train_runtime": 32135.9258, + "train_tokens_per_second": 6280.621 + }, + { + "epoch": 0.5387867838326129, + "grad_norm": 0.23320509493350983, + "learning_rate": 5e-06, + "loss": 0.9675, + "num_input_tokens_seen": 202288056, + "step": 445, + "train_runtime": 32207.1376, + "train_tokens_per_second": 6280.846 + }, + { + "epoch": 0.5399975406502142, + "grad_norm": 0.2530595362186432, + "learning_rate": 5e-06, + "loss": 0.9806, + "num_input_tokens_seen": 202734456, + "step": 446, + "train_runtime": 32276.5609, + "train_tokens_per_second": 6281.167 + }, + { + "epoch": 0.5412082974678156, + "grad_norm": 0.24577440321445465, + "learning_rate": 5e-06, + "loss": 1.032, + "num_input_tokens_seen": 203175032, + "step": 447, + "train_runtime": 32345.0646, + "train_tokens_per_second": 6281.485 + }, + { + "epoch": 0.542419054285417, + "grad_norm": 0.24135351181030273, + "learning_rate": 5e-06, + "loss": 0.9941, + "num_input_tokens_seen": 203613664, + "step": 448, + "train_runtime": 32413.2576, + "train_tokens_per_second": 6281.802 + }, + { + "epoch": 0.5436298111030183, + "grad_norm": 0.2334894835948944, + "learning_rate": 5e-06, + "loss": 0.9561, + "num_input_tokens_seen": 204069960, + "step": 449, + "train_runtime": 32484.3465, + "train_tokens_per_second": 6282.101 + }, + { + "epoch": 0.5448405679206197, + "grad_norm": 0.23215444386005402, + "learning_rate": 5e-06, + "loss": 0.9621, + "num_input_tokens_seen": 204514576, + "step": 450, + "train_runtime": 32553.403, + "train_tokens_per_second": 6282.433 + }, + { + "epoch": 0.5460513247382212, + "grad_norm": 0.22942085564136505, + "learning_rate": 5e-06, + "loss": 1.0227, + "num_input_tokens_seen": 204978768, + "step": 451, + "train_runtime": 32626.2536, + "train_tokens_per_second": 6282.633 + }, + { + "epoch": 0.5472620815558226, + "grad_norm": 0.24713215231895447, + "learning_rate": 5e-06, + "loss": 0.9849, + "num_input_tokens_seen": 205433104, + "step": 452, + "train_runtime": 32697.6427, + "train_tokens_per_second": 6282.811 + }, + { + "epoch": 0.5484728383734239, + "grad_norm": 0.23457272350788116, + "learning_rate": 5e-06, + "loss": 0.9856, + "num_input_tokens_seen": 205899472, + "step": 453, + "train_runtime": 32770.9009, + "train_tokens_per_second": 6282.997 + }, + { + "epoch": 0.5496835951910253, + "grad_norm": 0.25106683373451233, + "learning_rate": 5e-06, + "loss": 1.0003, + "num_input_tokens_seen": 206350824, + "step": 454, + "train_runtime": 32840.7984, + "train_tokens_per_second": 6283.368 + }, + { + "epoch": 0.5508943520086267, + "grad_norm": 0.27677810192108154, + "learning_rate": 5e-06, + "loss": 0.9914, + "num_input_tokens_seen": 206805936, + "step": 455, + "train_runtime": 32911.5142, + "train_tokens_per_second": 6283.696 + }, + { + "epoch": 0.552105108826228, + "grad_norm": 0.23585183918476105, + "learning_rate": 5e-06, + "loss": 0.976, + "num_input_tokens_seen": 207258416, + "step": 456, + "train_runtime": 32981.8418, + "train_tokens_per_second": 6284.016 + }, + { + "epoch": 0.5533158656438294, + "grad_norm": 0.2358681708574295, + "learning_rate": 5e-06, + "loss": 1.028, + "num_input_tokens_seen": 207695392, + "step": 457, + "train_runtime": 33049.5443, + "train_tokens_per_second": 6284.365 + }, + { + "epoch": 0.5545266224614308, + "grad_norm": 0.24082793295383453, + "learning_rate": 5e-06, + "loss": 1.0006, + "num_input_tokens_seen": 208131184, + "step": 458, + "train_runtime": 33117.4035, + "train_tokens_per_second": 6284.647 + }, + { + "epoch": 0.5557373792790321, + "grad_norm": 0.22506728768348694, + "learning_rate": 5e-06, + "loss": 0.9307, + "num_input_tokens_seen": 208586944, + "step": 459, + "train_runtime": 33188.5929, + "train_tokens_per_second": 6284.899 + }, + { + "epoch": 0.5569481360966335, + "grad_norm": 0.22801756858825684, + "learning_rate": 5e-06, + "loss": 0.9355, + "num_input_tokens_seen": 209044824, + "step": 460, + "train_runtime": 33260.759, + "train_tokens_per_second": 6285.029 + }, + { + "epoch": 0.5581588929142349, + "grad_norm": 0.2215615212917328, + "learning_rate": 5e-06, + "loss": 0.963, + "num_input_tokens_seen": 209511008, + "step": 461, + "train_runtime": 33333.5062, + "train_tokens_per_second": 6285.298 + }, + { + "epoch": 0.5593696497318363, + "grad_norm": 0.24020282924175262, + "learning_rate": 5e-06, + "loss": 0.9947, + "num_input_tokens_seen": 209962056, + "step": 462, + "train_runtime": 33403.9215, + "train_tokens_per_second": 6285.551 + }, + { + "epoch": 0.5605804065494376, + "grad_norm": 0.23402798175811768, + "learning_rate": 5e-06, + "loss": 0.9612, + "num_input_tokens_seen": 210405272, + "step": 463, + "train_runtime": 33472.9498, + "train_tokens_per_second": 6285.83 + }, + { + "epoch": 0.561791163367039, + "grad_norm": 0.2381797432899475, + "learning_rate": 5e-06, + "loss": 1.0089, + "num_input_tokens_seen": 210845616, + "step": 464, + "train_runtime": 33541.6057, + "train_tokens_per_second": 6286.092 + }, + { + "epoch": 0.5630019201846405, + "grad_norm": 0.2647024989128113, + "learning_rate": 5e-06, + "loss": 1.0405, + "num_input_tokens_seen": 211311336, + "step": 465, + "train_runtime": 33613.9005, + "train_tokens_per_second": 6286.427 + }, + { + "epoch": 0.5642126770022418, + "grad_norm": 0.2484758347272873, + "learning_rate": 5e-06, + "loss": 1.0995, + "num_input_tokens_seen": 211755424, + "step": 466, + "train_runtime": 33682.7067, + "train_tokens_per_second": 6286.77 + }, + { + "epoch": 0.5654234338198432, + "grad_norm": 0.2419258952140808, + "learning_rate": 5e-06, + "loss": 0.9984, + "num_input_tokens_seen": 212211880, + "step": 467, + "train_runtime": 33753.5833, + "train_tokens_per_second": 6287.092 + }, + { + "epoch": 0.5666341906374446, + "grad_norm": 0.24377140402793884, + "learning_rate": 5e-06, + "loss": 1.0266, + "num_input_tokens_seen": 212661224, + "step": 468, + "train_runtime": 33823.6337, + "train_tokens_per_second": 6287.356 + }, + { + "epoch": 0.5678449474550459, + "grad_norm": 0.24141238629817963, + "learning_rate": 5e-06, + "loss": 0.9976, + "num_input_tokens_seen": 213120728, + "step": 469, + "train_runtime": 33895.1784, + "train_tokens_per_second": 6287.641 + }, + { + "epoch": 0.5690557042726473, + "grad_norm": 0.2776244580745697, + "learning_rate": 5e-06, + "loss": 1.0136, + "num_input_tokens_seen": 213581176, + "step": 470, + "train_runtime": 33968.5316, + "train_tokens_per_second": 6287.619 + }, + { + "epoch": 0.5702664610902487, + "grad_norm": 0.2289768010377884, + "learning_rate": 5e-06, + "loss": 0.9458, + "num_input_tokens_seen": 214039216, + "step": 471, + "train_runtime": 34039.8568, + "train_tokens_per_second": 6287.9 + }, + { + "epoch": 0.5714772179078501, + "grad_norm": 0.24029488861560822, + "learning_rate": 5e-06, + "loss": 0.9919, + "num_input_tokens_seen": 214486744, + "step": 472, + "train_runtime": 34109.5176, + "train_tokens_per_second": 6288.179 + }, + { + "epoch": 0.5726879747254514, + "grad_norm": 0.24775657057762146, + "learning_rate": 5e-06, + "loss": 1.0085, + "num_input_tokens_seen": 214929224, + "step": 473, + "train_runtime": 34177.8306, + "train_tokens_per_second": 6288.557 + }, + { + "epoch": 0.5738987315430528, + "grad_norm": 0.257894903421402, + "learning_rate": 5e-06, + "loss": 1.0131, + "num_input_tokens_seen": 215364952, + "step": 474, + "train_runtime": 34245.3987, + "train_tokens_per_second": 6288.873 + }, + { + "epoch": 0.5751094883606542, + "grad_norm": 0.22365638613700867, + "learning_rate": 5e-06, + "loss": 0.9081, + "num_input_tokens_seen": 215810336, + "step": 475, + "train_runtime": 34314.325, + "train_tokens_per_second": 6289.22 + }, + { + "epoch": 0.5763202451782555, + "grad_norm": 0.222572922706604, + "learning_rate": 5e-06, + "loss": 0.997, + "num_input_tokens_seen": 216285368, + "step": 476, + "train_runtime": 34388.3445, + "train_tokens_per_second": 6289.496 + }, + { + "epoch": 0.5775310019958569, + "grad_norm": 0.24267543852329254, + "learning_rate": 5e-06, + "loss": 1.0052, + "num_input_tokens_seen": 216712736, + "step": 477, + "train_runtime": 34454.3641, + "train_tokens_per_second": 6289.849 + }, + { + "epoch": 0.5787417588134584, + "grad_norm": 0.2833351790904999, + "learning_rate": 5e-06, + "loss": 0.9996, + "num_input_tokens_seen": 217152592, + "step": 478, + "train_runtime": 34522.6931, + "train_tokens_per_second": 6290.141 + }, + { + "epoch": 0.5799525156310597, + "grad_norm": 0.22266528010368347, + "learning_rate": 5e-06, + "loss": 0.9858, + "num_input_tokens_seen": 217617320, + "step": 479, + "train_runtime": 34595.0498, + "train_tokens_per_second": 6290.418 + }, + { + "epoch": 0.5811632724486611, + "grad_norm": 0.23907960951328278, + "learning_rate": 5e-06, + "loss": 0.9656, + "num_input_tokens_seen": 218085936, + "step": 480, + "train_runtime": 34667.9296, + "train_tokens_per_second": 6290.711 + }, + { + "epoch": 0.5823740292662625, + "grad_norm": 0.2604992985725403, + "learning_rate": 5e-06, + "loss": 0.9883, + "num_input_tokens_seen": 218529216, + "step": 481, + "train_runtime": 34736.6853, + "train_tokens_per_second": 6291.021 + }, + { + "epoch": 0.5835847860838639, + "grad_norm": 0.24895359575748444, + "learning_rate": 5e-06, + "loss": 0.9568, + "num_input_tokens_seen": 218971904, + "step": 482, + "train_runtime": 34805.5529, + "train_tokens_per_second": 6291.292 + }, + { + "epoch": 0.5847955429014652, + "grad_norm": 0.24118000268936157, + "learning_rate": 5e-06, + "loss": 0.9923, + "num_input_tokens_seen": 219405992, + "step": 483, + "train_runtime": 34873.0797, + "train_tokens_per_second": 6291.558 + }, + { + "epoch": 0.5860062997190666, + "grad_norm": 0.245997354388237, + "learning_rate": 5e-06, + "loss": 0.9438, + "num_input_tokens_seen": 219865712, + "step": 484, + "train_runtime": 34944.6761, + "train_tokens_per_second": 6291.823 + }, + { + "epoch": 0.587217056536668, + "grad_norm": 0.2530381679534912, + "learning_rate": 5e-06, + "loss": 0.9391, + "num_input_tokens_seen": 220315800, + "step": 485, + "train_runtime": 35014.4814, + "train_tokens_per_second": 6292.134 + }, + { + "epoch": 0.5884278133542693, + "grad_norm": 0.2256454974412918, + "learning_rate": 5e-06, + "loss": 0.9632, + "num_input_tokens_seen": 220788832, + "step": 486, + "train_runtime": 35088.2705, + "train_tokens_per_second": 6292.383 + }, + { + "epoch": 0.5896385701718707, + "grad_norm": 0.23818935453891754, + "learning_rate": 5e-06, + "loss": 0.9615, + "num_input_tokens_seen": 221257600, + "step": 487, + "train_runtime": 35161.1927, + "train_tokens_per_second": 6292.665 + }, + { + "epoch": 0.5908493269894721, + "grad_norm": 0.22735600173473358, + "learning_rate": 5e-06, + "loss": 0.983, + "num_input_tokens_seen": 221697208, + "step": 488, + "train_runtime": 35229.7059, + "train_tokens_per_second": 6292.905 + }, + { + "epoch": 0.5920600838070734, + "grad_norm": 0.22348052263259888, + "learning_rate": 5e-06, + "loss": 1.0074, + "num_input_tokens_seen": 222148296, + "step": 489, + "train_runtime": 35300.089, + "train_tokens_per_second": 6293.137 + }, + { + "epoch": 0.5932708406246748, + "grad_norm": 0.26825666427612305, + "learning_rate": 5e-06, + "loss": 0.9512, + "num_input_tokens_seen": 222612856, + "step": 490, + "train_runtime": 35372.8665, + "train_tokens_per_second": 6293.322 + }, + { + "epoch": 0.5944815974422762, + "grad_norm": 0.23904314637184143, + "learning_rate": 5e-06, + "loss": 0.9923, + "num_input_tokens_seen": 223081152, + "step": 491, + "train_runtime": 35446.0463, + "train_tokens_per_second": 6293.541 + }, + { + "epoch": 0.5956923542598777, + "grad_norm": 0.2582261860370636, + "learning_rate": 5e-06, + "loss": 1.0427, + "num_input_tokens_seen": 223537072, + "step": 492, + "train_runtime": 35517.0699, + "train_tokens_per_second": 6293.793 + }, + { + "epoch": 0.596903111077479, + "grad_norm": 0.22952939569950104, + "learning_rate": 5e-06, + "loss": 0.9664, + "num_input_tokens_seen": 223980672, + "step": 493, + "train_runtime": 35585.9954, + "train_tokens_per_second": 6294.068 + }, + { + "epoch": 0.5981138678950804, + "grad_norm": 0.26730042695999146, + "learning_rate": 5e-06, + "loss": 1.0279, + "num_input_tokens_seen": 224426280, + "step": 494, + "train_runtime": 35655.3839, + "train_tokens_per_second": 6294.317 + }, + { + "epoch": 0.5993246247126818, + "grad_norm": 0.25793856382369995, + "learning_rate": 5e-06, + "loss": 1.0055, + "num_input_tokens_seen": 224884152, + "step": 495, + "train_runtime": 35726.9034, + "train_tokens_per_second": 6294.532 + }, + { + "epoch": 0.6005353815302831, + "grad_norm": 0.22298921644687653, + "learning_rate": 5e-06, + "loss": 0.9601, + "num_input_tokens_seen": 225350592, + "step": 496, + "train_runtime": 35799.441, + "train_tokens_per_second": 6294.808 + }, + { + "epoch": 0.6017461383478845, + "grad_norm": 0.26628899574279785, + "learning_rate": 5e-06, + "loss": 0.9805, + "num_input_tokens_seen": 225807448, + "step": 497, + "train_runtime": 35870.8365, + "train_tokens_per_second": 6295.015 + }, + { + "epoch": 0.6029568951654859, + "grad_norm": 0.22120925784111023, + "learning_rate": 5e-06, + "loss": 0.9208, + "num_input_tokens_seen": 226266528, + "step": 498, + "train_runtime": 35942.5822, + "train_tokens_per_second": 6295.222 + }, + { + "epoch": 0.6041676519830872, + "grad_norm": 0.24458245933055878, + "learning_rate": 5e-06, + "loss": 0.9881, + "num_input_tokens_seen": 226707672, + "step": 499, + "train_runtime": 36011.3021, + "train_tokens_per_second": 6295.459 + }, + { + "epoch": 0.6053784088006886, + "grad_norm": 0.24703119695186615, + "learning_rate": 5e-06, + "loss": 1.0137, + "num_input_tokens_seen": 227145656, + "step": 500, + "train_runtime": 36079.2522, + "train_tokens_per_second": 6295.742 + }, + { + "epoch": 0.60658916561829, + "grad_norm": 0.24142247438430786, + "learning_rate": 5e-06, + "loss": 0.9205, + "num_input_tokens_seen": 227611200, + "step": 501, + "train_runtime": 36151.1703, + "train_tokens_per_second": 6296.095 + }, + { + "epoch": 0.6077999224358914, + "grad_norm": 0.2489280104637146, + "learning_rate": 5e-06, + "loss": 1.0177, + "num_input_tokens_seen": 228059544, + "step": 502, + "train_runtime": 36220.9972, + "train_tokens_per_second": 6296.335 + }, + { + "epoch": 0.6090106792534927, + "grad_norm": 0.23111343383789062, + "learning_rate": 5e-06, + "loss": 0.962, + "num_input_tokens_seen": 228526064, + "step": 503, + "train_runtime": 36293.9384, + "train_tokens_per_second": 6296.535 + }, + { + "epoch": 0.6102214360710941, + "grad_norm": 0.24690377712249756, + "learning_rate": 5e-06, + "loss": 0.9986, + "num_input_tokens_seen": 228981232, + "step": 504, + "train_runtime": 36364.2956, + "train_tokens_per_second": 6296.87 + }, + { + "epoch": 0.6114321928886955, + "grad_norm": 0.2393392026424408, + "learning_rate": 5e-06, + "loss": 0.9866, + "num_input_tokens_seen": 229439688, + "step": 505, + "train_runtime": 36435.751, + "train_tokens_per_second": 6297.103 + }, + { + "epoch": 0.6126429497062968, + "grad_norm": 0.24542857706546783, + "learning_rate": 5e-06, + "loss": 0.9987, + "num_input_tokens_seen": 229910688, + "step": 506, + "train_runtime": 36509.419, + "train_tokens_per_second": 6297.298 + }, + { + "epoch": 0.6138537065238983, + "grad_norm": 0.24054135382175446, + "learning_rate": 5e-06, + "loss": 0.9829, + "num_input_tokens_seen": 230361040, + "step": 507, + "train_runtime": 36579.4262, + "train_tokens_per_second": 6297.558 + }, + { + "epoch": 0.6150644633414997, + "grad_norm": 0.24931353330612183, + "learning_rate": 5e-06, + "loss": 1.0582, + "num_input_tokens_seen": 230795008, + "step": 508, + "train_runtime": 36646.8551, + "train_tokens_per_second": 6297.812 + }, + { + "epoch": 0.616275220159101, + "grad_norm": 0.28090900182724, + "learning_rate": 5e-06, + "loss": 1.0016, + "num_input_tokens_seen": 231239392, + "step": 509, + "train_runtime": 36716.4303, + "train_tokens_per_second": 6297.981 + }, + { + "epoch": 0.6174859769767024, + "grad_norm": 0.2591536045074463, + "learning_rate": 5e-06, + "loss": 0.9496, + "num_input_tokens_seen": 231693192, + "step": 510, + "train_runtime": 36786.8129, + "train_tokens_per_second": 6298.268 + }, + { + "epoch": 0.6186967337943038, + "grad_norm": 0.24983936548233032, + "learning_rate": 5e-06, + "loss": 0.9667, + "num_input_tokens_seen": 232139528, + "step": 511, + "train_runtime": 36856.2058, + "train_tokens_per_second": 6298.519 + }, + { + "epoch": 0.6199074906119052, + "grad_norm": 0.23879870772361755, + "learning_rate": 5e-06, + "loss": 1.0292, + "num_input_tokens_seen": 232572720, + "step": 512, + "train_runtime": 36923.1252, + "train_tokens_per_second": 6298.836 + }, + { + "epoch": 0.6211182474295065, + "grad_norm": 0.24429570138454437, + "learning_rate": 5e-06, + "loss": 1.0124, + "num_input_tokens_seen": 233019736, + "step": 513, + "train_runtime": 36992.5361, + "train_tokens_per_second": 6299.101 + }, + { + "epoch": 0.6223290042471079, + "grad_norm": 0.24088793992996216, + "learning_rate": 5e-06, + "loss": 0.9629, + "num_input_tokens_seen": 233483920, + "step": 514, + "train_runtime": 37064.9551, + "train_tokens_per_second": 6299.317 + }, + { + "epoch": 0.6235397610647093, + "grad_norm": 0.2581544816493988, + "learning_rate": 5e-06, + "loss": 0.9591, + "num_input_tokens_seen": 233939496, + "step": 515, + "train_runtime": 37136.0969, + "train_tokens_per_second": 6299.518 + }, + { + "epoch": 0.6247505178823106, + "grad_norm": 0.2298753410577774, + "learning_rate": 5e-06, + "loss": 0.9898, + "num_input_tokens_seen": 234398968, + "step": 516, + "train_runtime": 37207.7335, + "train_tokens_per_second": 6299.738 + }, + { + "epoch": 0.625961274699912, + "grad_norm": 0.2409614771604538, + "learning_rate": 5e-06, + "loss": 0.9524, + "num_input_tokens_seen": 234840984, + "step": 517, + "train_runtime": 37276.4815, + "train_tokens_per_second": 6299.977 + }, + { + "epoch": 0.6271720315175134, + "grad_norm": 0.24182307720184326, + "learning_rate": 5e-06, + "loss": 0.9592, + "num_input_tokens_seen": 235265976, + "step": 518, + "train_runtime": 37342.2003, + "train_tokens_per_second": 6300.271 + }, + { + "epoch": 0.6283827883351147, + "grad_norm": 0.25573626160621643, + "learning_rate": 5e-06, + "loss": 0.9746, + "num_input_tokens_seen": 235751176, + "step": 519, + "train_runtime": 37417.9534, + "train_tokens_per_second": 6300.483 + }, + { + "epoch": 0.6295935451527161, + "grad_norm": 0.27016371488571167, + "learning_rate": 5e-06, + "loss": 0.9674, + "num_input_tokens_seen": 236213584, + "step": 520, + "train_runtime": 37490.1733, + "train_tokens_per_second": 6300.68 + }, + { + "epoch": 0.6308043019703176, + "grad_norm": 0.24281057715415955, + "learning_rate": 5e-06, + "loss": 1.0085, + "num_input_tokens_seen": 236663272, + "step": 521, + "train_runtime": 37561.7456, + "train_tokens_per_second": 6300.646 + }, + { + "epoch": 0.632015058787919, + "grad_norm": 0.2382790446281433, + "learning_rate": 5e-06, + "loss": 1.0166, + "num_input_tokens_seen": 237104624, + "step": 522, + "train_runtime": 37630.4462, + "train_tokens_per_second": 6300.872 + }, + { + "epoch": 0.6332258156055203, + "grad_norm": 0.24074813723564148, + "learning_rate": 5e-06, + "loss": 1.0121, + "num_input_tokens_seen": 237572832, + "step": 523, + "train_runtime": 37703.3574, + "train_tokens_per_second": 6301.105 + }, + { + "epoch": 0.6344365724231217, + "grad_norm": 0.26703017950057983, + "learning_rate": 5e-06, + "loss": 0.99, + "num_input_tokens_seen": 238034928, + "step": 524, + "train_runtime": 37775.4137, + "train_tokens_per_second": 6301.319 + }, + { + "epoch": 0.6356473292407231, + "grad_norm": 0.31544211506843567, + "learning_rate": 5e-06, + "loss": 0.9136, + "num_input_tokens_seen": 238489256, + "step": 525, + "train_runtime": 37846.4867, + "train_tokens_per_second": 6301.49 + }, + { + "epoch": 0.6368580860583244, + "grad_norm": 0.2323281615972519, + "learning_rate": 5e-06, + "loss": 0.9803, + "num_input_tokens_seen": 238937608, + "step": 526, + "train_runtime": 37916.1928, + "train_tokens_per_second": 6301.73 + }, + { + "epoch": 0.6380688428759258, + "grad_norm": 0.22566953301429749, + "learning_rate": 5e-06, + "loss": 0.9496, + "num_input_tokens_seen": 239408904, + "step": 527, + "train_runtime": 37989.8164, + "train_tokens_per_second": 6301.923 + }, + { + "epoch": 0.6392795996935272, + "grad_norm": 0.24885083734989166, + "learning_rate": 5e-06, + "loss": 0.9671, + "num_input_tokens_seen": 239856616, + "step": 528, + "train_runtime": 38059.5861, + "train_tokens_per_second": 6302.134 + }, + { + "epoch": 0.6404903565111285, + "grad_norm": 0.2867506742477417, + "learning_rate": 5e-06, + "loss": 1.02, + "num_input_tokens_seen": 240342880, + "step": 529, + "train_runtime": 38136.0338, + "train_tokens_per_second": 6302.252 + }, + { + "epoch": 0.6417011133287299, + "grad_norm": 0.23189502954483032, + "learning_rate": 5e-06, + "loss": 1.0122, + "num_input_tokens_seen": 240799360, + "step": 530, + "train_runtime": 38207.0698, + "train_tokens_per_second": 6302.482 + }, + { + "epoch": 0.6429118701463313, + "grad_norm": 0.2151128053665161, + "learning_rate": 5e-06, + "loss": 0.9502, + "num_input_tokens_seen": 241285344, + "step": 531, + "train_runtime": 38282.6957, + "train_tokens_per_second": 6302.726 + }, + { + "epoch": 0.6441226269639327, + "grad_norm": 0.23497872054576874, + "learning_rate": 5e-06, + "loss": 1.0372, + "num_input_tokens_seen": 241748624, + "step": 532, + "train_runtime": 38355.235, + "train_tokens_per_second": 6302.885 + }, + { + "epoch": 0.645333383781534, + "grad_norm": 0.22813764214515686, + "learning_rate": 5e-06, + "loss": 0.936, + "num_input_tokens_seen": 242200600, + "step": 533, + "train_runtime": 38425.9554, + "train_tokens_per_second": 6303.047 + }, + { + "epoch": 0.6465441405991355, + "grad_norm": 0.2369297444820404, + "learning_rate": 5e-06, + "loss": 0.9499, + "num_input_tokens_seen": 242661128, + "step": 534, + "train_runtime": 38497.3268, + "train_tokens_per_second": 6303.324 + }, + { + "epoch": 0.6477548974167369, + "grad_norm": 0.2485128939151764, + "learning_rate": 5e-06, + "loss": 0.984, + "num_input_tokens_seen": 243128592, + "step": 535, + "train_runtime": 38570.5127, + "train_tokens_per_second": 6303.484 + }, + { + "epoch": 0.6489656542343382, + "grad_norm": 0.23329830169677734, + "learning_rate": 5e-06, + "loss": 0.9638, + "num_input_tokens_seen": 243580072, + "step": 536, + "train_runtime": 38640.2643, + "train_tokens_per_second": 6303.789 + }, + { + "epoch": 0.6501764110519396, + "grad_norm": 0.2227838784456253, + "learning_rate": 5e-06, + "loss": 0.9573, + "num_input_tokens_seen": 244042008, + "step": 537, + "train_runtime": 38712.1042, + "train_tokens_per_second": 6304.023 + }, + { + "epoch": 0.651387167869541, + "grad_norm": 0.22910352051258087, + "learning_rate": 5e-06, + "loss": 0.9834, + "num_input_tokens_seen": 244490152, + "step": 538, + "train_runtime": 38782.3373, + "train_tokens_per_second": 6304.162 + }, + { + "epoch": 0.6525979246871423, + "grad_norm": 0.24009035527706146, + "learning_rate": 5e-06, + "loss": 0.9842, + "num_input_tokens_seen": 244958928, + "step": 539, + "train_runtime": 38855.1605, + "train_tokens_per_second": 6304.412 + }, + { + "epoch": 0.6538086815047437, + "grad_norm": 0.232088103890419, + "learning_rate": 5e-06, + "loss": 0.9951, + "num_input_tokens_seen": 245409888, + "step": 540, + "train_runtime": 38925.8637, + "train_tokens_per_second": 6304.546 + }, + { + "epoch": 0.6550194383223451, + "grad_norm": 0.27717524766921997, + "learning_rate": 5e-06, + "loss": 1.0204, + "num_input_tokens_seen": 245860728, + "step": 541, + "train_runtime": 38995.7398, + "train_tokens_per_second": 6304.81 + }, + { + "epoch": 0.6562301951399465, + "grad_norm": 0.22988007962703705, + "learning_rate": 5e-06, + "loss": 0.9889, + "num_input_tokens_seen": 246299864, + "step": 542, + "train_runtime": 39064.3271, + "train_tokens_per_second": 6304.982 + }, + { + "epoch": 0.6574409519575478, + "grad_norm": 0.21664994955062866, + "learning_rate": 5e-06, + "loss": 0.9719, + "num_input_tokens_seen": 246777792, + "step": 543, + "train_runtime": 39139.2212, + "train_tokens_per_second": 6305.128 + }, + { + "epoch": 0.6586517087751492, + "grad_norm": 0.23201525211334229, + "learning_rate": 5e-06, + "loss": 0.9516, + "num_input_tokens_seen": 247213536, + "step": 544, + "train_runtime": 39206.717, + "train_tokens_per_second": 6305.387 + }, + { + "epoch": 0.6598624655927506, + "grad_norm": 0.2412644624710083, + "learning_rate": 5e-06, + "loss": 1.0171, + "num_input_tokens_seen": 247655632, + "step": 545, + "train_runtime": 39275.488, + "train_tokens_per_second": 6305.603 + }, + { + "epoch": 0.6610732224103519, + "grad_norm": 0.2807646691799164, + "learning_rate": 5e-06, + "loss": 0.9558, + "num_input_tokens_seen": 248112512, + "step": 546, + "train_runtime": 39346.817, + "train_tokens_per_second": 6305.784 + }, + { + "epoch": 0.6622839792279533, + "grad_norm": 0.2552436888217926, + "learning_rate": 5e-06, + "loss": 0.9419, + "num_input_tokens_seen": 248543176, + "step": 547, + "train_runtime": 39413.1641, + "train_tokens_per_second": 6306.095 + }, + { + "epoch": 0.6634947360455548, + "grad_norm": 0.2214186191558838, + "learning_rate": 5e-06, + "loss": 0.9624, + "num_input_tokens_seen": 248985160, + "step": 548, + "train_runtime": 39482.1921, + "train_tokens_per_second": 6306.265 + }, + { + "epoch": 0.6647054928631561, + "grad_norm": 0.24030745029449463, + "learning_rate": 5e-06, + "loss": 0.9839, + "num_input_tokens_seen": 249429704, + "step": 549, + "train_runtime": 39551.3491, + "train_tokens_per_second": 6306.478 + }, + { + "epoch": 0.6659162496807575, + "grad_norm": 0.23489521443843842, + "learning_rate": 5e-06, + "loss": 0.9455, + "num_input_tokens_seen": 249889432, + "step": 550, + "train_runtime": 39622.7881, + "train_tokens_per_second": 6306.71 + }, + { + "epoch": 0.6671270064983589, + "grad_norm": 0.24063046276569366, + "learning_rate": 5e-06, + "loss": 1.0, + "num_input_tokens_seen": 250325736, + "step": 551, + "train_runtime": 39690.0826, + "train_tokens_per_second": 6307.01 + }, + { + "epoch": 0.6683377633159603, + "grad_norm": 0.22540496289730072, + "learning_rate": 5e-06, + "loss": 0.9921, + "num_input_tokens_seen": 250785944, + "step": 552, + "train_runtime": 39761.8576, + "train_tokens_per_second": 6307.199 + }, + { + "epoch": 0.6695485201335616, + "grad_norm": 0.2306659072637558, + "learning_rate": 5e-06, + "loss": 0.9541, + "num_input_tokens_seen": 251249584, + "step": 553, + "train_runtime": 39834.0319, + "train_tokens_per_second": 6307.41 + }, + { + "epoch": 0.670759276951163, + "grad_norm": 0.24347856640815735, + "learning_rate": 5e-06, + "loss": 0.9769, + "num_input_tokens_seen": 251697752, + "step": 554, + "train_runtime": 39904.0654, + "train_tokens_per_second": 6307.572 + }, + { + "epoch": 0.6719700337687644, + "grad_norm": 0.2558618187904358, + "learning_rate": 5e-06, + "loss": 0.9452, + "num_input_tokens_seen": 252157024, + "step": 555, + "train_runtime": 39975.4875, + "train_tokens_per_second": 6307.791 + }, + { + "epoch": 0.6731807905863657, + "grad_norm": 0.2455194890499115, + "learning_rate": 5e-06, + "loss": 1.0343, + "num_input_tokens_seen": 252624768, + "step": 556, + "train_runtime": 40048.3193, + "train_tokens_per_second": 6307.999 + }, + { + "epoch": 0.6743915474039671, + "grad_norm": 0.2299470454454422, + "learning_rate": 5e-06, + "loss": 0.982, + "num_input_tokens_seen": 253113208, + "step": 557, + "train_runtime": 40124.0351, + "train_tokens_per_second": 6308.269 + }, + { + "epoch": 0.6756023042215685, + "grad_norm": 0.2273668348789215, + "learning_rate": 5e-06, + "loss": 0.9321, + "num_input_tokens_seen": 253548664, + "step": 558, + "train_runtime": 40192.1439, + "train_tokens_per_second": 6308.414 + }, + { + "epoch": 0.6768130610391698, + "grad_norm": 0.2353869080543518, + "learning_rate": 5e-06, + "loss": 0.9414, + "num_input_tokens_seen": 254002376, + "step": 559, + "train_runtime": 40262.9517, + "train_tokens_per_second": 6308.588 + }, + { + "epoch": 0.6780238178567712, + "grad_norm": 0.22576971352100372, + "learning_rate": 5e-06, + "loss": 0.9887, + "num_input_tokens_seen": 254443952, + "step": 560, + "train_runtime": 40331.6289, + "train_tokens_per_second": 6308.794 + }, + { + "epoch": 0.6792345746743726, + "grad_norm": 0.22624272108078003, + "learning_rate": 5e-06, + "loss": 0.9121, + "num_input_tokens_seen": 254911344, + "step": 561, + "train_runtime": 40404.3092, + "train_tokens_per_second": 6309.014 + }, + { + "epoch": 0.6804453314919741, + "grad_norm": 0.2185974419116974, + "learning_rate": 5e-06, + "loss": 0.9243, + "num_input_tokens_seen": 255377664, + "step": 562, + "train_runtime": 40477.1115, + "train_tokens_per_second": 6309.187 + }, + { + "epoch": 0.6816560883095754, + "grad_norm": 0.22251008450984955, + "learning_rate": 5e-06, + "loss": 0.9011, + "num_input_tokens_seen": 255833336, + "step": 563, + "train_runtime": 40548.4408, + "train_tokens_per_second": 6309.326 + }, + { + "epoch": 0.6828668451271768, + "grad_norm": 0.23521266877651215, + "learning_rate": 5e-06, + "loss": 0.9787, + "num_input_tokens_seen": 256301336, + "step": 564, + "train_runtime": 40621.3067, + "train_tokens_per_second": 6309.53 + }, + { + "epoch": 0.6840776019447782, + "grad_norm": 0.2637956440448761, + "learning_rate": 5e-06, + "loss": 0.9593, + "num_input_tokens_seen": 256761416, + "step": 565, + "train_runtime": 40693.2274, + "train_tokens_per_second": 6309.684 + }, + { + "epoch": 0.6852883587623795, + "grad_norm": 0.23881720006465912, + "learning_rate": 5e-06, + "loss": 1.0131, + "num_input_tokens_seen": 257211376, + "step": 566, + "train_runtime": 40763.5757, + "train_tokens_per_second": 6309.834 + }, + { + "epoch": 0.6864991155799809, + "grad_norm": 0.23504596948623657, + "learning_rate": 5e-06, + "loss": 0.9946, + "num_input_tokens_seen": 257669744, + "step": 567, + "train_runtime": 40834.9767, + "train_tokens_per_second": 6310.025 + }, + { + "epoch": 0.6877098723975823, + "grad_norm": 0.22577445209026337, + "learning_rate": 5e-06, + "loss": 0.9593, + "num_input_tokens_seen": 258128040, + "step": 568, + "train_runtime": 40906.4809, + "train_tokens_per_second": 6310.199 + }, + { + "epoch": 0.6889206292151836, + "grad_norm": 0.24191945791244507, + "learning_rate": 5e-06, + "loss": 0.9935, + "num_input_tokens_seen": 258575712, + "step": 569, + "train_runtime": 40976.425, + "train_tokens_per_second": 6310.353 + }, + { + "epoch": 0.690131386032785, + "grad_norm": 0.23592589795589447, + "learning_rate": 5e-06, + "loss": 0.9547, + "num_input_tokens_seen": 259039864, + "step": 570, + "train_runtime": 41048.2903, + "train_tokens_per_second": 6310.613 + }, + { + "epoch": 0.6913421428503864, + "grad_norm": 0.23204831779003143, + "learning_rate": 5e-06, + "loss": 0.9409, + "num_input_tokens_seen": 259505856, + "step": 571, + "train_runtime": 41121.4188, + "train_tokens_per_second": 6310.722 + }, + { + "epoch": 0.6925528996679878, + "grad_norm": 0.23110359907150269, + "learning_rate": 5e-06, + "loss": 0.9908, + "num_input_tokens_seen": 259970968, + "step": 572, + "train_runtime": 41194.822, + "train_tokens_per_second": 6310.768 + }, + { + "epoch": 0.6937636564855891, + "grad_norm": 0.2301538735628128, + "learning_rate": 5e-06, + "loss": 0.9846, + "num_input_tokens_seen": 260449376, + "step": 573, + "train_runtime": 41269.549, + "train_tokens_per_second": 6310.933 + }, + { + "epoch": 0.6949744133031905, + "grad_norm": 0.23412424325942993, + "learning_rate": 5e-06, + "loss": 0.9713, + "num_input_tokens_seen": 260910664, + "step": 574, + "train_runtime": 41341.5032, + "train_tokens_per_second": 6311.107 + }, + { + "epoch": 0.696185170120792, + "grad_norm": 0.2345420867204666, + "learning_rate": 5e-06, + "loss": 1.0265, + "num_input_tokens_seen": 261349104, + "step": 575, + "train_runtime": 41409.221, + "train_tokens_per_second": 6311.375 + }, + { + "epoch": 0.6973959269383933, + "grad_norm": 0.2388794869184494, + "learning_rate": 5e-06, + "loss": 0.9976, + "num_input_tokens_seen": 261807744, + "step": 576, + "train_runtime": 41480.6661, + "train_tokens_per_second": 6311.561 + }, + { + "epoch": 0.6986066837559947, + "grad_norm": 0.27313232421875, + "learning_rate": 5e-06, + "loss": 0.9973, + "num_input_tokens_seen": 262231864, + "step": 577, + "train_runtime": 41546.1729, + "train_tokens_per_second": 6311.818 + }, + { + "epoch": 0.6998174405735961, + "grad_norm": 0.2250782698392868, + "learning_rate": 5e-06, + "loss": 0.9869, + "num_input_tokens_seen": 262715224, + "step": 578, + "train_runtime": 41621.5112, + "train_tokens_per_second": 6312.006 + }, + { + "epoch": 0.7010281973911974, + "grad_norm": 0.2509269714355469, + "learning_rate": 5e-06, + "loss": 0.9775, + "num_input_tokens_seen": 263176488, + "step": 579, + "train_runtime": 41693.4808, + "train_tokens_per_second": 6312.174 + }, + { + "epoch": 0.7022389542087988, + "grad_norm": 0.2787635624408722, + "learning_rate": 5e-06, + "loss": 1.002, + "num_input_tokens_seen": 263610608, + "step": 580, + "train_runtime": 41761.0277, + "train_tokens_per_second": 6312.359 + }, + { + "epoch": 0.7034497110264002, + "grad_norm": 0.23429201543331146, + "learning_rate": 5e-06, + "loss": 0.9957, + "num_input_tokens_seen": 264088768, + "step": 581, + "train_runtime": 41835.8378, + "train_tokens_per_second": 6312.501 + }, + { + "epoch": 0.7046604678440016, + "grad_norm": 0.21760432422161102, + "learning_rate": 5e-06, + "loss": 0.9657, + "num_input_tokens_seen": 264544496, + "step": 582, + "train_runtime": 41907.0545, + "train_tokens_per_second": 6312.648 + }, + { + "epoch": 0.7058712246616029, + "grad_norm": 0.248090460896492, + "learning_rate": 5e-06, + "loss": 1.0259, + "num_input_tokens_seen": 264988656, + "step": 583, + "train_runtime": 41976.0921, + "train_tokens_per_second": 6312.847 + }, + { + "epoch": 0.7070819814792043, + "grad_norm": 0.23136785626411438, + "learning_rate": 5e-06, + "loss": 0.9822, + "num_input_tokens_seen": 265423208, + "step": 584, + "train_runtime": 42043.4666, + "train_tokens_per_second": 6313.067 + }, + { + "epoch": 0.7082927382968057, + "grad_norm": 0.2529706358909607, + "learning_rate": 5e-06, + "loss": 1.0446, + "num_input_tokens_seen": 265854688, + "step": 585, + "train_runtime": 42110.6206, + "train_tokens_per_second": 6313.246 + }, + { + "epoch": 0.709503495114407, + "grad_norm": 0.24560308456420898, + "learning_rate": 5e-06, + "loss": 0.9233, + "num_input_tokens_seen": 266311664, + "step": 586, + "train_runtime": 42182.4609, + "train_tokens_per_second": 6313.327 + }, + { + "epoch": 0.7107142519320084, + "grad_norm": 0.24339045584201813, + "learning_rate": 5e-06, + "loss": 0.9799, + "num_input_tokens_seen": 266756048, + "step": 587, + "train_runtime": 42251.4986, + "train_tokens_per_second": 6313.529 + }, + { + "epoch": 0.7119250087496098, + "grad_norm": 0.22854940593242645, + "learning_rate": 5e-06, + "loss": 0.9886, + "num_input_tokens_seen": 267210376, + "step": 588, + "train_runtime": 42321.7976, + "train_tokens_per_second": 6313.777 + }, + { + "epoch": 0.7131357655672111, + "grad_norm": 0.24025574326515198, + "learning_rate": 5e-06, + "loss": 0.92, + "num_input_tokens_seen": 267660896, + "step": 589, + "train_runtime": 42392.4873, + "train_tokens_per_second": 6313.876 + }, + { + "epoch": 0.7143465223848126, + "grad_norm": 0.2508932054042816, + "learning_rate": 5e-06, + "loss": 1.0165, + "num_input_tokens_seen": 268113168, + "step": 590, + "train_runtime": 42462.8573, + "train_tokens_per_second": 6314.063 + }, + { + "epoch": 0.715557279202414, + "grad_norm": 0.24230146408081055, + "learning_rate": 5e-06, + "loss": 0.9882, + "num_input_tokens_seen": 268574272, + "step": 591, + "train_runtime": 42534.2219, + "train_tokens_per_second": 6314.31 + }, + { + "epoch": 0.7167680360200154, + "grad_norm": 0.23562973737716675, + "learning_rate": 5e-06, + "loss": 1.012, + "num_input_tokens_seen": 269043528, + "step": 592, + "train_runtime": 42606.947, + "train_tokens_per_second": 6314.546 + }, + { + "epoch": 0.7179787928376167, + "grad_norm": 0.2341059297323227, + "learning_rate": 5e-06, + "loss": 1.0073, + "num_input_tokens_seen": 269523064, + "step": 593, + "train_runtime": 42681.8077, + "train_tokens_per_second": 6314.706 + }, + { + "epoch": 0.7191895496552181, + "grad_norm": 0.2380225509405136, + "learning_rate": 5e-06, + "loss": 0.9657, + "num_input_tokens_seen": 269981784, + "step": 594, + "train_runtime": 42753.4941, + "train_tokens_per_second": 6314.847 + }, + { + "epoch": 0.7204003064728195, + "grad_norm": 0.2389514148235321, + "learning_rate": 5e-06, + "loss": 0.9869, + "num_input_tokens_seen": 270421760, + "step": 595, + "train_runtime": 42822.2596, + "train_tokens_per_second": 6314.981 + }, + { + "epoch": 0.7216110632904208, + "grad_norm": 0.24948102235794067, + "learning_rate": 5e-06, + "loss": 0.9544, + "num_input_tokens_seen": 270861744, + "step": 596, + "train_runtime": 42891.0541, + "train_tokens_per_second": 6315.11 + }, + { + "epoch": 0.7228218201080222, + "grad_norm": 0.25714853405952454, + "learning_rate": 5e-06, + "loss": 0.994, + "num_input_tokens_seen": 271296104, + "step": 597, + "train_runtime": 42958.6929, + "train_tokens_per_second": 6315.278 + }, + { + "epoch": 0.7240325769256236, + "grad_norm": 0.23045891523361206, + "learning_rate": 5e-06, + "loss": 0.9594, + "num_input_tokens_seen": 271751872, + "step": 598, + "train_runtime": 43029.9594, + "train_tokens_per_second": 6315.411 + }, + { + "epoch": 0.7252433337432249, + "grad_norm": 0.23973950743675232, + "learning_rate": 5e-06, + "loss": 1.0007, + "num_input_tokens_seen": 272182760, + "step": 599, + "train_runtime": 43096.7924, + "train_tokens_per_second": 6315.615 + }, + { + "epoch": 0.7264540905608263, + "grad_norm": 0.23554377257823944, + "learning_rate": 5e-06, + "loss": 0.9565, + "num_input_tokens_seen": 272631360, + "step": 600, + "train_runtime": 43166.6123, + "train_tokens_per_second": 6315.792 + }, + { + "epoch": 0.7276648473784277, + "grad_norm": 0.24061642587184906, + "learning_rate": 5e-06, + "loss": 0.9714, + "num_input_tokens_seen": 273087768, + "step": 601, + "train_runtime": 43237.8394, + "train_tokens_per_second": 6315.944 + }, + { + "epoch": 0.7288756041960291, + "grad_norm": 0.23701608180999756, + "learning_rate": 5e-06, + "loss": 0.9918, + "num_input_tokens_seen": 273545024, + "step": 602, + "train_runtime": 43309.1444, + "train_tokens_per_second": 6316.103 + }, + { + "epoch": 0.7300863610136304, + "grad_norm": 0.23831920325756073, + "learning_rate": 5e-06, + "loss": 0.9747, + "num_input_tokens_seen": 273993320, + "step": 603, + "train_runtime": 43379.1503, + "train_tokens_per_second": 6316.245 + }, + { + "epoch": 0.7312971178312319, + "grad_norm": 0.22237437963485718, + "learning_rate": 5e-06, + "loss": 0.9567, + "num_input_tokens_seen": 274442800, + "step": 604, + "train_runtime": 43448.8521, + "train_tokens_per_second": 6316.457 + }, + { + "epoch": 0.7325078746488333, + "grad_norm": 0.30931001901626587, + "learning_rate": 5e-06, + "loss": 0.9831, + "num_input_tokens_seen": 274887544, + "step": 605, + "train_runtime": 43517.5812, + "train_tokens_per_second": 6316.701 + }, + { + "epoch": 0.7337186314664346, + "grad_norm": 0.23581911623477936, + "learning_rate": 5e-06, + "loss": 0.9202, + "num_input_tokens_seen": 275352664, + "step": 606, + "train_runtime": 43590.2263, + "train_tokens_per_second": 6316.844 + }, + { + "epoch": 0.734929388284036, + "grad_norm": 0.2689816951751709, + "learning_rate": 5e-06, + "loss": 1.0085, + "num_input_tokens_seen": 275804984, + "step": 607, + "train_runtime": 43660.6992, + "train_tokens_per_second": 6317.008 + }, + { + "epoch": 0.7361401451016374, + "grad_norm": 0.2378932386636734, + "learning_rate": 5e-06, + "loss": 0.9458, + "num_input_tokens_seen": 276247352, + "step": 608, + "train_runtime": 43729.3613, + "train_tokens_per_second": 6317.205 + }, + { + "epoch": 0.7373509019192387, + "grad_norm": 0.22164365649223328, + "learning_rate": 5e-06, + "loss": 0.9847, + "num_input_tokens_seen": 276707568, + "step": 609, + "train_runtime": 43800.9856, + "train_tokens_per_second": 6317.382 + }, + { + "epoch": 0.7385616587368401, + "grad_norm": 0.23126821219921112, + "learning_rate": 5e-06, + "loss": 0.9935, + "num_input_tokens_seen": 277170312, + "step": 610, + "train_runtime": 43873.0149, + "train_tokens_per_second": 6317.558 + }, + { + "epoch": 0.7397724155544415, + "grad_norm": 0.23724284768104553, + "learning_rate": 5e-06, + "loss": 0.9627, + "num_input_tokens_seen": 277620736, + "step": 611, + "train_runtime": 43943.152, + "train_tokens_per_second": 6317.725 + }, + { + "epoch": 0.7409831723720429, + "grad_norm": 0.2428486943244934, + "learning_rate": 5e-06, + "loss": 0.9938, + "num_input_tokens_seen": 278074992, + "step": 612, + "train_runtime": 44013.5354, + "train_tokens_per_second": 6317.943 + }, + { + "epoch": 0.7421939291896442, + "grad_norm": 0.24035997688770294, + "learning_rate": 5e-06, + "loss": 0.9386, + "num_input_tokens_seen": 278525848, + "step": 613, + "train_runtime": 44083.3847, + "train_tokens_per_second": 6318.159 + }, + { + "epoch": 0.7434046860072456, + "grad_norm": 0.23970334231853485, + "learning_rate": 5e-06, + "loss": 0.9874, + "num_input_tokens_seen": 278966440, + "step": 614, + "train_runtime": 44151.8915, + "train_tokens_per_second": 6318.335 + }, + { + "epoch": 0.744615442824847, + "grad_norm": 0.213746577501297, + "learning_rate": 5e-06, + "loss": 0.9763, + "num_input_tokens_seen": 279449936, + "step": 615, + "train_runtime": 44227.7947, + "train_tokens_per_second": 6318.423 + }, + { + "epoch": 0.7458261996424483, + "grad_norm": 0.2598293423652649, + "learning_rate": 5e-06, + "loss": 0.9765, + "num_input_tokens_seen": 279890568, + "step": 616, + "train_runtime": 44296.6434, + "train_tokens_per_second": 6318.55 + }, + { + "epoch": 0.7470369564600498, + "grad_norm": 0.2453431487083435, + "learning_rate": 5e-06, + "loss": 0.9433, + "num_input_tokens_seen": 280349072, + "step": 617, + "train_runtime": 44367.5664, + "train_tokens_per_second": 6318.784 + }, + { + "epoch": 0.7482477132776512, + "grad_norm": 0.23078188300132751, + "learning_rate": 5e-06, + "loss": 0.9357, + "num_input_tokens_seen": 280821128, + "step": 618, + "train_runtime": 44441.7787, + "train_tokens_per_second": 6318.854 + }, + { + "epoch": 0.7494584700952525, + "grad_norm": 0.23313450813293457, + "learning_rate": 5e-06, + "loss": 0.9628, + "num_input_tokens_seen": 281279568, + "step": 619, + "train_runtime": 44513.2227, + "train_tokens_per_second": 6319.012 + }, + { + "epoch": 0.7506692269128539, + "grad_norm": 0.21814242005348206, + "learning_rate": 5e-06, + "loss": 0.9312, + "num_input_tokens_seen": 281738128, + "step": 620, + "train_runtime": 44584.8919, + "train_tokens_per_second": 6319.139 + }, + { + "epoch": 0.7518799837304553, + "grad_norm": 0.2563712000846863, + "learning_rate": 5e-06, + "loss": 0.9814, + "num_input_tokens_seen": 282181152, + "step": 621, + "train_runtime": 44653.7827, + "train_tokens_per_second": 6319.311 + }, + { + "epoch": 0.7530907405480567, + "grad_norm": 0.2649373412132263, + "learning_rate": 5e-06, + "loss": 1.0402, + "num_input_tokens_seen": 282631384, + "step": 622, + "train_runtime": 44724.7127, + "train_tokens_per_second": 6319.356 + }, + { + "epoch": 0.754301497365658, + "grad_norm": 0.22597451508045197, + "learning_rate": 5e-06, + "loss": 0.9611, + "num_input_tokens_seen": 283073568, + "step": 623, + "train_runtime": 44794.6422, + "train_tokens_per_second": 6319.362 + }, + { + "epoch": 0.7555122541832594, + "grad_norm": 0.24213433265686035, + "learning_rate": 5e-06, + "loss": 0.9404, + "num_input_tokens_seen": 283512448, + "step": 624, + "train_runtime": 44862.7569, + "train_tokens_per_second": 6319.55 + }, + { + "epoch": 0.7567230110008608, + "grad_norm": 0.2487850785255432, + "learning_rate": 5e-06, + "loss": 0.9387, + "num_input_tokens_seen": 283970752, + "step": 625, + "train_runtime": 44934.4627, + "train_tokens_per_second": 6319.665 + }, + { + "epoch": 0.7579337678184621, + "grad_norm": 0.2626650333404541, + "learning_rate": 5e-06, + "loss": 0.9924, + "num_input_tokens_seen": 284447624, + "step": 626, + "train_runtime": 45008.9717, + "train_tokens_per_second": 6319.798 + }, + { + "epoch": 0.7591445246360635, + "grad_norm": 0.24381890892982483, + "learning_rate": 5e-06, + "loss": 0.9668, + "num_input_tokens_seen": 284896224, + "step": 627, + "train_runtime": 45078.7734, + "train_tokens_per_second": 6319.964 + }, + { + "epoch": 0.7603552814536649, + "grad_norm": 0.22962401807308197, + "learning_rate": 5e-06, + "loss": 1.0086, + "num_input_tokens_seen": 285355952, + "step": 628, + "train_runtime": 45150.4275, + "train_tokens_per_second": 6320.116 + }, + { + "epoch": 0.7615660382712662, + "grad_norm": 0.23533271253108978, + "learning_rate": 5e-06, + "loss": 0.9176, + "num_input_tokens_seen": 285807352, + "step": 629, + "train_runtime": 45220.9764, + "train_tokens_per_second": 6320.238 + }, + { + "epoch": 0.7627767950888676, + "grad_norm": 0.24772769212722778, + "learning_rate": 5e-06, + "loss": 0.9566, + "num_input_tokens_seen": 286283944, + "step": 630, + "train_runtime": 45294.8924, + "train_tokens_per_second": 6320.447 + }, + { + "epoch": 0.763987551906469, + "grad_norm": 0.2620101571083069, + "learning_rate": 5e-06, + "loss": 0.9959, + "num_input_tokens_seen": 286733384, + "step": 631, + "train_runtime": 45364.8652, + "train_tokens_per_second": 6320.605 + }, + { + "epoch": 0.7651983087240705, + "grad_norm": 0.23930427432060242, + "learning_rate": 5e-06, + "loss": 0.9519, + "num_input_tokens_seen": 287216280, + "step": 632, + "train_runtime": 45439.9774, + "train_tokens_per_second": 6320.784 + }, + { + "epoch": 0.7664090655416718, + "grad_norm": 0.24364081025123596, + "learning_rate": 5e-06, + "loss": 1.033, + "num_input_tokens_seen": 287686200, + "step": 633, + "train_runtime": 45513.7576, + "train_tokens_per_second": 6320.862 + }, + { + "epoch": 0.7676198223592732, + "grad_norm": 0.2459454983472824, + "learning_rate": 5e-06, + "loss": 0.9851, + "num_input_tokens_seen": 288132976, + "step": 634, + "train_runtime": 45583.2033, + "train_tokens_per_second": 6321.034 + }, + { + "epoch": 0.7688305791768746, + "grad_norm": 0.2267904430627823, + "learning_rate": 5e-06, + "loss": 0.9701, + "num_input_tokens_seen": 288590576, + "step": 635, + "train_runtime": 45654.2868, + "train_tokens_per_second": 6321.215 + }, + { + "epoch": 0.7700413359944759, + "grad_norm": 0.2215666025876999, + "learning_rate": 5e-06, + "loss": 0.9455, + "num_input_tokens_seen": 289047888, + "step": 636, + "train_runtime": 45725.6961, + "train_tokens_per_second": 6321.345 + }, + { + "epoch": 0.7712520928120773, + "grad_norm": 0.23759250342845917, + "learning_rate": 5e-06, + "loss": 0.9361, + "num_input_tokens_seen": 289499480, + "step": 637, + "train_runtime": 45796.0887, + "train_tokens_per_second": 6321.489 + }, + { + "epoch": 0.7724628496296787, + "grad_norm": 0.23697270452976227, + "learning_rate": 5e-06, + "loss": 0.9343, + "num_input_tokens_seen": 289956856, + "step": 638, + "train_runtime": 45867.4221, + "train_tokens_per_second": 6321.63 + }, + { + "epoch": 0.77367360644728, + "grad_norm": 0.2574046552181244, + "learning_rate": 5e-06, + "loss": 1.0023, + "num_input_tokens_seen": 290374696, + "step": 639, + "train_runtime": 45932.2172, + "train_tokens_per_second": 6321.809 + }, + { + "epoch": 0.7748843632648814, + "grad_norm": 0.2575940489768982, + "learning_rate": 5e-06, + "loss": 0.9831, + "num_input_tokens_seen": 290844728, + "step": 640, + "train_runtime": 46005.7775, + "train_tokens_per_second": 6321.917 + }, + { + "epoch": 0.7760951200824828, + "grad_norm": 0.2475946545600891, + "learning_rate": 5e-06, + "loss": 1.0082, + "num_input_tokens_seen": 291292768, + "step": 641, + "train_runtime": 46075.669, + "train_tokens_per_second": 6322.052 + }, + { + "epoch": 0.7773058769000842, + "grad_norm": 0.27857834100723267, + "learning_rate": 5e-06, + "loss": 0.9734, + "num_input_tokens_seen": 291733344, + "step": 642, + "train_runtime": 46144.534, + "train_tokens_per_second": 6322.165 + }, + { + "epoch": 0.7785166337176855, + "grad_norm": 0.25765910744667053, + "learning_rate": 5e-06, + "loss": 0.9982, + "num_input_tokens_seen": 292171360, + "step": 643, + "train_runtime": 46212.6022, + "train_tokens_per_second": 6322.331 + }, + { + "epoch": 0.779727390535287, + "grad_norm": 0.2572195827960968, + "learning_rate": 5e-06, + "loss": 0.963, + "num_input_tokens_seen": 292612640, + "step": 644, + "train_runtime": 46281.1163, + "train_tokens_per_second": 6322.506 + }, + { + "epoch": 0.7809381473528884, + "grad_norm": 0.24165485799312592, + "learning_rate": 5e-06, + "loss": 1.0424, + "num_input_tokens_seen": 293053416, + "step": 645, + "train_runtime": 46349.6476, + "train_tokens_per_second": 6322.668 + }, + { + "epoch": 0.7821489041704897, + "grad_norm": 0.2371072620153427, + "learning_rate": 5e-06, + "loss": 0.9891, + "num_input_tokens_seen": 293522488, + "step": 646, + "train_runtime": 46422.9917, + "train_tokens_per_second": 6322.783 + }, + { + "epoch": 0.7833596609880911, + "grad_norm": 0.26184481382369995, + "learning_rate": 5e-06, + "loss": 0.9674, + "num_input_tokens_seen": 293932816, + "step": 647, + "train_runtime": 46486.5204, + "train_tokens_per_second": 6322.969 + }, + { + "epoch": 0.7845704178056925, + "grad_norm": 0.2628776431083679, + "learning_rate": 5e-06, + "loss": 0.9613, + "num_input_tokens_seen": 294392880, + "step": 648, + "train_runtime": 46558.3764, + "train_tokens_per_second": 6323.092 + }, + { + "epoch": 0.7857811746232938, + "grad_norm": 0.2746836245059967, + "learning_rate": 5e-06, + "loss": 1.0326, + "num_input_tokens_seen": 294829032, + "step": 649, + "train_runtime": 46626.1232, + "train_tokens_per_second": 6323.259 + }, + { + "epoch": 0.7869919314408952, + "grad_norm": 0.23179323971271515, + "learning_rate": 5e-06, + "loss": 0.959, + "num_input_tokens_seen": 295266440, + "step": 650, + "train_runtime": 46694.3296, + "train_tokens_per_second": 6323.39 + }, + { + "epoch": 0.7882026882584966, + "grad_norm": 0.29162031412124634, + "learning_rate": 5e-06, + "loss": 1.0614, + "num_input_tokens_seen": 295707072, + "step": 651, + "train_runtime": 46763.2072, + "train_tokens_per_second": 6323.499 + }, + { + "epoch": 0.789413445076098, + "grad_norm": 0.24644595384597778, + "learning_rate": 5e-06, + "loss": 0.9355, + "num_input_tokens_seen": 296168304, + "step": 652, + "train_runtime": 46835.1671, + "train_tokens_per_second": 6323.631 + }, + { + "epoch": 0.7906242018936993, + "grad_norm": 0.22973157465457916, + "learning_rate": 5e-06, + "loss": 1.0039, + "num_input_tokens_seen": 296626648, + "step": 653, + "train_runtime": 46906.6535, + "train_tokens_per_second": 6323.765 + }, + { + "epoch": 0.7918349587113007, + "grad_norm": 0.22654931247234344, + "learning_rate": 5e-06, + "loss": 0.9769, + "num_input_tokens_seen": 297077224, + "step": 654, + "train_runtime": 46976.9907, + "train_tokens_per_second": 6323.888 + }, + { + "epoch": 0.7930457155289021, + "grad_norm": 0.25695592164993286, + "learning_rate": 5e-06, + "loss": 0.9681, + "num_input_tokens_seen": 297509888, + "step": 655, + "train_runtime": 47043.9093, + "train_tokens_per_second": 6324.089 + }, + { + "epoch": 0.7942564723465034, + "grad_norm": 0.2581423819065094, + "learning_rate": 5e-06, + "loss": 0.9989, + "num_input_tokens_seen": 297939024, + "step": 656, + "train_runtime": 47110.28, + "train_tokens_per_second": 6324.289 + }, + { + "epoch": 0.7954672291641048, + "grad_norm": 0.23372498154640198, + "learning_rate": 5e-06, + "loss": 1.0305, + "num_input_tokens_seen": 298407680, + "step": 657, + "train_runtime": 47183.4049, + "train_tokens_per_second": 6324.42 + }, + { + "epoch": 0.7966779859817062, + "grad_norm": 0.2330416738986969, + "learning_rate": 5e-06, + "loss": 0.9725, + "num_input_tokens_seen": 298854728, + "step": 658, + "train_runtime": 47253.2725, + "train_tokens_per_second": 6324.53 + }, + { + "epoch": 0.7978887427993075, + "grad_norm": 0.23654578626155853, + "learning_rate": 5e-06, + "loss": 0.9963, + "num_input_tokens_seen": 299300440, + "step": 659, + "train_runtime": 47322.7649, + "train_tokens_per_second": 6324.661 + }, + { + "epoch": 0.799099499616909, + "grad_norm": 0.2542232275009155, + "learning_rate": 5e-06, + "loss": 1.0448, + "num_input_tokens_seen": 299766136, + "step": 660, + "train_runtime": 47394.9974, + "train_tokens_per_second": 6324.848 + }, + { + "epoch": 0.8003102564345104, + "grad_norm": 0.24160121381282806, + "learning_rate": 5e-06, + "loss": 0.9603, + "num_input_tokens_seen": 300221640, + "step": 661, + "train_runtime": 47466.4669, + "train_tokens_per_second": 6324.921 + }, + { + "epoch": 0.8015210132521118, + "grad_norm": 0.22822356224060059, + "learning_rate": 5e-06, + "loss": 0.9958, + "num_input_tokens_seen": 300687184, + "step": 662, + "train_runtime": 47539.0298, + "train_tokens_per_second": 6325.059 + }, + { + "epoch": 0.8027317700697131, + "grad_norm": 0.2521500587463379, + "learning_rate": 5e-06, + "loss": 0.972, + "num_input_tokens_seen": 301123864, + "step": 663, + "train_runtime": 47607.2637, + "train_tokens_per_second": 6325.166 + }, + { + "epoch": 0.8039425268873145, + "grad_norm": 0.23535515367984772, + "learning_rate": 5e-06, + "loss": 0.9973, + "num_input_tokens_seen": 301584744, + "step": 664, + "train_runtime": 47679.0072, + "train_tokens_per_second": 6325.315 + }, + { + "epoch": 0.8051532837049159, + "grad_norm": 0.22911347448825836, + "learning_rate": 5e-06, + "loss": 0.964, + "num_input_tokens_seen": 302032008, + "step": 665, + "train_runtime": 47748.6252, + "train_tokens_per_second": 6325.46 + }, + { + "epoch": 0.8063640405225172, + "grad_norm": 0.2548276484012604, + "learning_rate": 5e-06, + "loss": 0.9764, + "num_input_tokens_seen": 302498592, + "step": 666, + "train_runtime": 47821.6069, + "train_tokens_per_second": 6325.563 + }, + { + "epoch": 0.8075747973401186, + "grad_norm": 0.24845871329307556, + "learning_rate": 5e-06, + "loss": 0.9747, + "num_input_tokens_seen": 302948968, + "step": 667, + "train_runtime": 47891.6883, + "train_tokens_per_second": 6325.711 + }, + { + "epoch": 0.80878555415772, + "grad_norm": 0.27543285489082336, + "learning_rate": 5e-06, + "loss": 1.0221, + "num_input_tokens_seen": 303394536, + "step": 668, + "train_runtime": 47961.0975, + "train_tokens_per_second": 6325.846 + }, + { + "epoch": 0.8099963109753213, + "grad_norm": 0.22285109758377075, + "learning_rate": 5e-06, + "loss": 0.9823, + "num_input_tokens_seen": 303869992, + "step": 669, + "train_runtime": 48035.6906, + "train_tokens_per_second": 6325.921 + }, + { + "epoch": 0.8112070677929227, + "grad_norm": 0.2208424061536789, + "learning_rate": 5e-06, + "loss": 0.916, + "num_input_tokens_seen": 304338112, + "step": 670, + "train_runtime": 48108.8012, + "train_tokens_per_second": 6326.038 + }, + { + "epoch": 0.8124178246105241, + "grad_norm": 0.23547379672527313, + "learning_rate": 5e-06, + "loss": 0.9419, + "num_input_tokens_seen": 304770896, + "step": 671, + "train_runtime": 48175.679, + "train_tokens_per_second": 6326.24 + }, + { + "epoch": 0.8136285814281256, + "grad_norm": 0.3341003656387329, + "learning_rate": 5e-06, + "loss": 0.9109, + "num_input_tokens_seen": 305218712, + "step": 672, + "train_runtime": 48245.4679, + "train_tokens_per_second": 6326.371 + }, + { + "epoch": 0.8148393382457269, + "grad_norm": 0.3057156205177307, + "learning_rate": 5e-06, + "loss": 0.9952, + "num_input_tokens_seen": 305656008, + "step": 673, + "train_runtime": 48313.4759, + "train_tokens_per_second": 6326.517 + }, + { + "epoch": 0.8160500950633283, + "grad_norm": 0.2505541741847992, + "learning_rate": 5e-06, + "loss": 0.9644, + "num_input_tokens_seen": 306112992, + "step": 674, + "train_runtime": 48386.3399, + "train_tokens_per_second": 6326.434 + }, + { + "epoch": 0.8172608518809297, + "grad_norm": 0.28934425115585327, + "learning_rate": 5e-06, + "loss": 0.9361, + "num_input_tokens_seen": 306569648, + "step": 675, + "train_runtime": 48457.067, + "train_tokens_per_second": 6326.624 + }, + { + "epoch": 0.818471608698531, + "grad_norm": 0.24182599782943726, + "learning_rate": 5e-06, + "loss": 0.9022, + "num_input_tokens_seen": 307035984, + "step": 676, + "train_runtime": 48529.7381, + "train_tokens_per_second": 6326.76 + }, + { + "epoch": 0.8196823655161324, + "grad_norm": 0.23037275671958923, + "learning_rate": 5e-06, + "loss": 0.9832, + "num_input_tokens_seen": 307477064, + "step": 677, + "train_runtime": 48598.2501, + "train_tokens_per_second": 6326.916 + }, + { + "epoch": 0.8208931223337338, + "grad_norm": 0.26939913630485535, + "learning_rate": 5e-06, + "loss": 0.9765, + "num_input_tokens_seen": 307930568, + "step": 678, + "train_runtime": 48668.4558, + "train_tokens_per_second": 6327.108 + }, + { + "epoch": 0.8221038791513351, + "grad_norm": 0.2629682719707489, + "learning_rate": 5e-06, + "loss": 0.9513, + "num_input_tokens_seen": 308377608, + "step": 679, + "train_runtime": 48737.7561, + "train_tokens_per_second": 6327.284 + }, + { + "epoch": 0.8233146359689365, + "grad_norm": 0.2294158786535263, + "learning_rate": 5e-06, + "loss": 0.972, + "num_input_tokens_seen": 308838600, + "step": 680, + "train_runtime": 48810.1043, + "train_tokens_per_second": 6327.35 + }, + { + "epoch": 0.8245253927865379, + "grad_norm": 0.22822599112987518, + "learning_rate": 5e-06, + "loss": 0.936, + "num_input_tokens_seen": 309318664, + "step": 681, + "train_runtime": 48887.5021, + "train_tokens_per_second": 6327.152 + }, + { + "epoch": 0.8257361496041393, + "grad_norm": 0.24199745059013367, + "learning_rate": 5e-06, + "loss": 1.0328, + "num_input_tokens_seen": 309745744, + "step": 682, + "train_runtime": 48957.3644, + "train_tokens_per_second": 6326.847 + }, + { + "epoch": 0.8269469064217406, + "grad_norm": 0.23330043256282806, + "learning_rate": 5e-06, + "loss": 0.9579, + "num_input_tokens_seen": 310184288, + "step": 683, + "train_runtime": 49029.1375, + "train_tokens_per_second": 6326.53 + }, + { + "epoch": 0.828157663239342, + "grad_norm": 0.22911278903484344, + "learning_rate": 5e-06, + "loss": 1.0027, + "num_input_tokens_seen": 310639208, + "step": 684, + "train_runtime": 49102.2981, + "train_tokens_per_second": 6326.368 + }, + { + "epoch": 0.8293684200569434, + "grad_norm": 0.23975107073783875, + "learning_rate": 5e-06, + "loss": 0.9599, + "num_input_tokens_seen": 311102208, + "step": 685, + "train_runtime": 49177.6776, + "train_tokens_per_second": 6326.086 + }, + { + "epoch": 0.8305791768745447, + "grad_norm": 0.271192729473114, + "learning_rate": 5e-06, + "loss": 0.9861, + "num_input_tokens_seen": 311544336, + "step": 686, + "train_runtime": 49250.6236, + "train_tokens_per_second": 6325.693 + }, + { + "epoch": 0.8317899336921462, + "grad_norm": 0.2387111783027649, + "learning_rate": 5e-06, + "loss": 0.9506, + "num_input_tokens_seen": 311991128, + "step": 687, + "train_runtime": 49323.9975, + "train_tokens_per_second": 6325.341 + }, + { + "epoch": 0.8330006905097476, + "grad_norm": 0.24908512830734253, + "learning_rate": 5e-06, + "loss": 0.9557, + "num_input_tokens_seen": 312461408, + "step": 688, + "train_runtime": 49402.3748, + "train_tokens_per_second": 6324.826 + }, + { + "epoch": 0.8342114473273489, + "grad_norm": 0.2281702756881714, + "learning_rate": 5e-06, + "loss": 0.9523, + "num_input_tokens_seen": 312916448, + "step": 689, + "train_runtime": 49476.816, + "train_tokens_per_second": 6324.507 + }, + { + "epoch": 0.8354222041449503, + "grad_norm": 0.23677456378936768, + "learning_rate": 5e-06, + "loss": 0.9834, + "num_input_tokens_seen": 313376968, + "step": 690, + "train_runtime": 49552.5358, + "train_tokens_per_second": 6324.136 + }, + { + "epoch": 0.8366329609625517, + "grad_norm": 0.23301224410533905, + "learning_rate": 5e-06, + "loss": 0.9741, + "num_input_tokens_seen": 313834800, + "step": 691, + "train_runtime": 49627.9277, + "train_tokens_per_second": 6323.754 + }, + { + "epoch": 0.8378437177801531, + "grad_norm": 0.28598400950431824, + "learning_rate": 5e-06, + "loss": 0.9854, + "num_input_tokens_seen": 314290560, + "step": 692, + "train_runtime": 49703.1803, + "train_tokens_per_second": 6323.349 + }, + { + "epoch": 0.8390544745977544, + "grad_norm": 0.24205778539180756, + "learning_rate": 5e-06, + "loss": 0.9253, + "num_input_tokens_seen": 314767360, + "step": 693, + "train_runtime": 49781.9765, + "train_tokens_per_second": 6322.918 + }, + { + "epoch": 0.8402652314153558, + "grad_norm": 0.25196146965026855, + "learning_rate": 5e-06, + "loss": 0.9518, + "num_input_tokens_seen": 315210832, + "step": 694, + "train_runtime": 49854.8652, + "train_tokens_per_second": 6322.569 + }, + { + "epoch": 0.8414759882329572, + "grad_norm": 0.2362397164106369, + "learning_rate": 5e-06, + "loss": 0.9407, + "num_input_tokens_seen": 315669384, + "step": 695, + "train_runtime": 49930.4912, + "train_tokens_per_second": 6322.177 + }, + { + "epoch": 0.8426867450505585, + "grad_norm": 0.23664698004722595, + "learning_rate": 5e-06, + "loss": 0.9684, + "num_input_tokens_seen": 316108928, + "step": 696, + "train_runtime": 50002.7456, + "train_tokens_per_second": 6321.831 + }, + { + "epoch": 0.8438975018681599, + "grad_norm": 0.23944173753261566, + "learning_rate": 5e-06, + "loss": 0.9395, + "num_input_tokens_seen": 316566328, + "step": 697, + "train_runtime": 50077.929, + "train_tokens_per_second": 6321.474 + }, + { + "epoch": 0.8451082586857613, + "grad_norm": 0.22662094235420227, + "learning_rate": 5e-06, + "loss": 0.9686, + "num_input_tokens_seen": 317031176, + "step": 698, + "train_runtime": 50154.9704, + "train_tokens_per_second": 6321.032 + }, + { + "epoch": 0.8463190155033626, + "grad_norm": 0.22922109067440033, + "learning_rate": 5e-06, + "loss": 0.9671, + "num_input_tokens_seen": 317478472, + "step": 699, + "train_runtime": 50228.3177, + "train_tokens_per_second": 6320.707 + }, + { + "epoch": 0.847529772320964, + "grad_norm": 0.26036337018013, + "learning_rate": 5e-06, + "loss": 0.9642, + "num_input_tokens_seen": 317940344, + "step": 700, + "train_runtime": 50304.238, + "train_tokens_per_second": 6320.349 + }, + { + "epoch": 0.8487405291385655, + "grad_norm": 0.22809621691703796, + "learning_rate": 5e-06, + "loss": 0.952, + "num_input_tokens_seen": 318407112, + "step": 701, + "train_runtime": 50381.1432, + "train_tokens_per_second": 6319.966 + }, + { + "epoch": 0.8499512859561669, + "grad_norm": 0.228465273976326, + "learning_rate": 5e-06, + "loss": 0.9005, + "num_input_tokens_seen": 318858464, + "step": 702, + "train_runtime": 50454.8922, + "train_tokens_per_second": 6319.674 + }, + { + "epoch": 0.8511620427737682, + "grad_norm": 0.2660825550556183, + "learning_rate": 5e-06, + "loss": 0.9698, + "num_input_tokens_seen": 319300520, + "step": 703, + "train_runtime": 50527.48, + "train_tokens_per_second": 6319.344 + }, + { + "epoch": 0.8523727995913696, + "grad_norm": 0.2166413962841034, + "learning_rate": 5e-06, + "loss": 0.9611, + "num_input_tokens_seen": 319756696, + "step": 704, + "train_runtime": 50602.7456, + "train_tokens_per_second": 6318.959 + }, + { + "epoch": 0.853583556408971, + "grad_norm": 0.22596792876720428, + "learning_rate": 5e-06, + "loss": 0.9911, + "num_input_tokens_seen": 320225520, + "step": 705, + "train_runtime": 50680.0879, + "train_tokens_per_second": 6318.567 + }, + { + "epoch": 0.8547943132265723, + "grad_norm": 0.24560396373271942, + "learning_rate": 5e-06, + "loss": 0.9301, + "num_input_tokens_seen": 320689552, + "step": 706, + "train_runtime": 50756.6212, + "train_tokens_per_second": 6318.182 + }, + { + "epoch": 0.8560050700441737, + "grad_norm": 0.2799171805381775, + "learning_rate": 5e-06, + "loss": 0.9853, + "num_input_tokens_seen": 321135128, + "step": 707, + "train_runtime": 50829.7956, + "train_tokens_per_second": 6317.852 + }, + { + "epoch": 0.8572158268617751, + "grad_norm": 0.24234268069267273, + "learning_rate": 5e-06, + "loss": 0.9667, + "num_input_tokens_seen": 321592464, + "step": 708, + "train_runtime": 50905.3416, + "train_tokens_per_second": 6317.46 + }, + { + "epoch": 0.8584265836793764, + "grad_norm": 0.24366381764411926, + "learning_rate": 5e-06, + "loss": 0.9408, + "num_input_tokens_seen": 322030856, + "step": 709, + "train_runtime": 50977.5327, + "train_tokens_per_second": 6317.113 + }, + { + "epoch": 0.8596373404969778, + "grad_norm": 0.23462019860744476, + "learning_rate": 5e-06, + "loss": 0.9617, + "num_input_tokens_seen": 322488840, + "step": 710, + "train_runtime": 51053.0683, + "train_tokens_per_second": 6316.738 + }, + { + "epoch": 0.8608480973145792, + "grad_norm": 0.23850728571414948, + "learning_rate": 5e-06, + "loss": 0.9618, + "num_input_tokens_seen": 322956064, + "step": 711, + "train_runtime": 51130.2729, + "train_tokens_per_second": 6316.338 + }, + { + "epoch": 0.8620588541321806, + "grad_norm": 0.24310947954654694, + "learning_rate": 5e-06, + "loss": 0.9352, + "num_input_tokens_seen": 323404816, + "step": 712, + "train_runtime": 51204.0732, + "train_tokens_per_second": 6315.998 + }, + { + "epoch": 0.8632696109497819, + "grad_norm": 0.2608128786087036, + "learning_rate": 5e-06, + "loss": 0.9402, + "num_input_tokens_seen": 323876440, + "step": 713, + "train_runtime": 51281.1334, + "train_tokens_per_second": 6315.704 + }, + { + "epoch": 0.8644803677673834, + "grad_norm": 0.2339504510164261, + "learning_rate": 5e-06, + "loss": 0.9117, + "num_input_tokens_seen": 324335024, + "step": 714, + "train_runtime": 51356.6378, + "train_tokens_per_second": 6315.348 + }, + { + "epoch": 0.8656911245849848, + "grad_norm": 0.23948872089385986, + "learning_rate": 5e-06, + "loss": 0.9073, + "num_input_tokens_seen": 324788680, + "step": 715, + "train_runtime": 51430.9403, + "train_tokens_per_second": 6315.045 + }, + { + "epoch": 0.8669018814025861, + "grad_norm": 0.23080047965049744, + "learning_rate": 5e-06, + "loss": 0.9569, + "num_input_tokens_seen": 325265904, + "step": 716, + "train_runtime": 51509.6255, + "train_tokens_per_second": 6314.663 + }, + { + "epoch": 0.8681126382201875, + "grad_norm": 0.2425810992717743, + "learning_rate": 5e-06, + "loss": 0.9919, + "num_input_tokens_seen": 325749024, + "step": 717, + "train_runtime": 51589.2266, + "train_tokens_per_second": 6314.284 + }, + { + "epoch": 0.8693233950377889, + "grad_norm": 0.24254527688026428, + "learning_rate": 5e-06, + "loss": 0.9758, + "num_input_tokens_seen": 326205360, + "step": 718, + "train_runtime": 51663.9314, + "train_tokens_per_second": 6313.986 + }, + { + "epoch": 0.8705341518553902, + "grad_norm": 0.2271261066198349, + "learning_rate": 5e-06, + "loss": 0.9523, + "num_input_tokens_seen": 326656568, + "step": 719, + "train_runtime": 51737.7083, + "train_tokens_per_second": 6313.704 + }, + { + "epoch": 0.8717449086729916, + "grad_norm": 0.28275179862976074, + "learning_rate": 5e-06, + "loss": 0.9955, + "num_input_tokens_seen": 327100936, + "step": 720, + "train_runtime": 51810.5603, + "train_tokens_per_second": 6313.403 + }, + { + "epoch": 0.872955665490593, + "grad_norm": 0.2485657036304474, + "learning_rate": 5e-06, + "loss": 0.9674, + "num_input_tokens_seen": 327555136, + "step": 721, + "train_runtime": 51884.7425, + "train_tokens_per_second": 6313.13 + }, + { + "epoch": 0.8741664223081944, + "grad_norm": 0.26533499360084534, + "learning_rate": 5e-06, + "loss": 0.9911, + "num_input_tokens_seen": 328014560, + "step": 722, + "train_runtime": 51960.0639, + "train_tokens_per_second": 6312.821 + }, + { + "epoch": 0.8753771791257957, + "grad_norm": 0.21716679632663727, + "learning_rate": 5e-06, + "loss": 0.9419, + "num_input_tokens_seen": 328476336, + "step": 723, + "train_runtime": 52035.9362, + "train_tokens_per_second": 6312.49 + }, + { + "epoch": 0.8765879359433971, + "grad_norm": 0.238169863820076, + "learning_rate": 5e-06, + "loss": 0.9927, + "num_input_tokens_seen": 328903336, + "step": 724, + "train_runtime": 52106.2514, + "train_tokens_per_second": 6312.167 + }, + { + "epoch": 0.8777986927609985, + "grad_norm": 0.23495762050151825, + "learning_rate": 5e-06, + "loss": 0.9438, + "num_input_tokens_seen": 329351280, + "step": 725, + "train_runtime": 52179.605, + "train_tokens_per_second": 6311.878 + }, + { + "epoch": 0.8790094495785998, + "grad_norm": 0.2645426094532013, + "learning_rate": 5e-06, + "loss": 0.9552, + "num_input_tokens_seen": 329810048, + "step": 726, + "train_runtime": 52255.2056, + "train_tokens_per_second": 6311.525 + }, + { + "epoch": 0.8802202063962012, + "grad_norm": 0.2440696507692337, + "learning_rate": 5e-06, + "loss": 0.9443, + "num_input_tokens_seen": 330266952, + "step": 727, + "train_runtime": 52330.1379, + "train_tokens_per_second": 6311.219 + }, + { + "epoch": 0.8814309632138027, + "grad_norm": 0.28334546089172363, + "learning_rate": 5e-06, + "loss": 0.9749, + "num_input_tokens_seen": 330725072, + "step": 728, + "train_runtime": 52405.845, + "train_tokens_per_second": 6310.843 + }, + { + "epoch": 0.882641720031404, + "grad_norm": 0.25327134132385254, + "learning_rate": 5e-06, + "loss": 0.9531, + "num_input_tokens_seen": 331165624, + "step": 729, + "train_runtime": 52477.7764, + "train_tokens_per_second": 6310.588 + }, + { + "epoch": 0.8838524768490054, + "grad_norm": 0.23178185522556305, + "learning_rate": 5e-06, + "loss": 0.9606, + "num_input_tokens_seen": 331625256, + "step": 730, + "train_runtime": 52553.8007, + "train_tokens_per_second": 6310.205 + }, + { + "epoch": 0.8850632336666068, + "grad_norm": 0.23952724039554596, + "learning_rate": 5e-06, + "loss": 0.9585, + "num_input_tokens_seen": 332060176, + "step": 731, + "train_runtime": 52625.0909, + "train_tokens_per_second": 6309.921 + }, + { + "epoch": 0.8862739904842082, + "grad_norm": 0.23698952794075012, + "learning_rate": 5e-06, + "loss": 0.8797, + "num_input_tokens_seen": 332509640, + "step": 732, + "train_runtime": 52699.071, + "train_tokens_per_second": 6309.592 + }, + { + "epoch": 0.8874847473018095, + "grad_norm": 0.22318892180919647, + "learning_rate": 5e-06, + "loss": 0.9486, + "num_input_tokens_seen": 332946752, + "step": 733, + "train_runtime": 52771.1507, + "train_tokens_per_second": 6309.257 + }, + { + "epoch": 0.8886955041194109, + "grad_norm": 0.22661879658699036, + "learning_rate": 5e-06, + "loss": 0.9622, + "num_input_tokens_seen": 333408976, + "step": 734, + "train_runtime": 52847.3029, + "train_tokens_per_second": 6308.912 + }, + { + "epoch": 0.8899062609370123, + "grad_norm": 0.22661250829696655, + "learning_rate": 5e-06, + "loss": 1.0203, + "num_input_tokens_seen": 333864040, + "step": 735, + "train_runtime": 52921.7765, + "train_tokens_per_second": 6308.633 + }, + { + "epoch": 0.8911170177546136, + "grad_norm": 0.23365598917007446, + "learning_rate": 5e-06, + "loss": 1.0197, + "num_input_tokens_seen": 334328000, + "step": 736, + "train_runtime": 52997.7307, + "train_tokens_per_second": 6308.346 + }, + { + "epoch": 0.892327774572215, + "grad_norm": 0.25835007429122925, + "learning_rate": 5e-06, + "loss": 1.001, + "num_input_tokens_seen": 334761336, + "step": 737, + "train_runtime": 53068.9411, + "train_tokens_per_second": 6308.046 + }, + { + "epoch": 0.8935385313898164, + "grad_norm": 0.23492054641246796, + "learning_rate": 5e-06, + "loss": 0.9808, + "num_input_tokens_seen": 335218464, + "step": 738, + "train_runtime": 53143.7831, + "train_tokens_per_second": 6307.764 + }, + { + "epoch": 0.8947492882074177, + "grad_norm": 0.23521077632904053, + "learning_rate": 5e-06, + "loss": 0.9753, + "num_input_tokens_seen": 335663288, + "step": 739, + "train_runtime": 53216.7657, + "train_tokens_per_second": 6307.473 + }, + { + "epoch": 0.8959600450250191, + "grad_norm": 0.24233105778694153, + "learning_rate": 5e-06, + "loss": 0.9669, + "num_input_tokens_seen": 336143112, + "step": 740, + "train_runtime": 53295.4261, + "train_tokens_per_second": 6307.166 + }, + { + "epoch": 0.8971708018426205, + "grad_norm": 0.22769199311733246, + "learning_rate": 5e-06, + "loss": 0.9462, + "num_input_tokens_seen": 336620632, + "step": 741, + "train_runtime": 53373.6272, + "train_tokens_per_second": 6306.872 + }, + { + "epoch": 0.898381558660222, + "grad_norm": 0.2259906679391861, + "learning_rate": 5e-06, + "loss": 0.9976, + "num_input_tokens_seen": 337087384, + "step": 742, + "train_runtime": 53450.3668, + "train_tokens_per_second": 6306.55 + }, + { + "epoch": 0.8995923154778233, + "grad_norm": 0.24663208425045013, + "learning_rate": 5e-06, + "loss": 0.9739, + "num_input_tokens_seen": 337516840, + "step": 743, + "train_runtime": 53521.0012, + "train_tokens_per_second": 6306.25 + }, + { + "epoch": 0.9008030722954247, + "grad_norm": 0.24597318470478058, + "learning_rate": 5e-06, + "loss": 0.9405, + "num_input_tokens_seen": 337978032, + "step": 744, + "train_runtime": 53596.7412, + "train_tokens_per_second": 6305.944 + }, + { + "epoch": 0.9020138291130261, + "grad_norm": 0.24945300817489624, + "learning_rate": 5e-06, + "loss": 0.9487, + "num_input_tokens_seen": 338429744, + "step": 745, + "train_runtime": 53671.4992, + "train_tokens_per_second": 6305.576 + }, + { + "epoch": 0.9032245859306274, + "grad_norm": 0.26230573654174805, + "learning_rate": 5e-06, + "loss": 0.9708, + "num_input_tokens_seen": 338908032, + "step": 746, + "train_runtime": 53750.8266, + "train_tokens_per_second": 6305.169 + }, + { + "epoch": 0.9044353427482288, + "grad_norm": 0.23558348417282104, + "learning_rate": 5e-06, + "loss": 1.0117, + "num_input_tokens_seen": 339376168, + "step": 747, + "train_runtime": 53828.3567, + "train_tokens_per_second": 6304.784 + }, + { + "epoch": 0.9056460995658302, + "grad_norm": 0.23316293954849243, + "learning_rate": 5e-06, + "loss": 0.962, + "num_input_tokens_seen": 339825168, + "step": 748, + "train_runtime": 53902.2806, + "train_tokens_per_second": 6304.467 + }, + { + "epoch": 0.9068568563834315, + "grad_norm": 0.243992418050766, + "learning_rate": 5e-06, + "loss": 0.983, + "num_input_tokens_seen": 340304336, + "step": 749, + "train_runtime": 53981.4498, + "train_tokens_per_second": 6304.098 + }, + { + "epoch": 0.9080676132010329, + "grad_norm": 0.2598229646682739, + "learning_rate": 5e-06, + "loss": 0.9674, + "num_input_tokens_seen": 340743064, + "step": 750, + "train_runtime": 54053.3316, + "train_tokens_per_second": 6303.831 + }, + { + "epoch": 0.9092783700186343, + "grad_norm": 0.24613091349601746, + "learning_rate": 5e-06, + "loss": 0.9527, + "num_input_tokens_seen": 341210112, + "step": 751, + "train_runtime": 54130.9437, + "train_tokens_per_second": 6303.421 + }, + { + "epoch": 0.9104891268362357, + "grad_norm": 0.24349863827228546, + "learning_rate": 5e-06, + "loss": 0.9261, + "num_input_tokens_seen": 341671160, + "step": 752, + "train_runtime": 54207.4785, + "train_tokens_per_second": 6303.026 + }, + { + "epoch": 0.911699883653837, + "grad_norm": 0.22625428438186646, + "learning_rate": 5e-06, + "loss": 0.9711, + "num_input_tokens_seen": 342126008, + "step": 753, + "train_runtime": 54282.3751, + "train_tokens_per_second": 6302.709 + }, + { + "epoch": 0.9129106404714384, + "grad_norm": 0.2454047054052353, + "learning_rate": 5e-06, + "loss": 0.9962, + "num_input_tokens_seen": 342589536, + "step": 754, + "train_runtime": 54359.1523, + "train_tokens_per_second": 6302.334 + }, + { + "epoch": 0.9141213972890398, + "grad_norm": 0.2505525052547455, + "learning_rate": 5e-06, + "loss": 0.9888, + "num_input_tokens_seen": 343050616, + "step": 755, + "train_runtime": 54434.9812, + "train_tokens_per_second": 6302.025 + }, + { + "epoch": 0.9153321541066411, + "grad_norm": 0.24787583947181702, + "learning_rate": 5e-06, + "loss": 0.9694, + "num_input_tokens_seen": 343502480, + "step": 756, + "train_runtime": 54509.554, + "train_tokens_per_second": 6301.693 + }, + { + "epoch": 0.9165429109242426, + "grad_norm": 0.2458108365535736, + "learning_rate": 5e-06, + "loss": 0.9712, + "num_input_tokens_seen": 343979832, + "step": 757, + "train_runtime": 54588.3291, + "train_tokens_per_second": 6301.344 + }, + { + "epoch": 0.917753667741844, + "grad_norm": 0.2579139173030853, + "learning_rate": 5e-06, + "loss": 0.9966, + "num_input_tokens_seen": 344423888, + "step": 758, + "train_runtime": 54661.823, + "train_tokens_per_second": 6300.995 + }, + { + "epoch": 0.9189644245594453, + "grad_norm": 0.24939359724521637, + "learning_rate": 5e-06, + "loss": 0.9396, + "num_input_tokens_seen": 344888696, + "step": 759, + "train_runtime": 54738.5951, + "train_tokens_per_second": 6300.649 + }, + { + "epoch": 0.9201751813770467, + "grad_norm": 0.25321266055107117, + "learning_rate": 5e-06, + "loss": 0.9294, + "num_input_tokens_seen": 345349032, + "step": 760, + "train_runtime": 54814.3689, + "train_tokens_per_second": 6300.338 + }, + { + "epoch": 0.9213859381946481, + "grad_norm": 0.25097349286079407, + "learning_rate": 5e-06, + "loss": 0.9652, + "num_input_tokens_seen": 345787136, + "step": 761, + "train_runtime": 54886.4068, + "train_tokens_per_second": 6300.051 + }, + { + "epoch": 0.9225966950122495, + "grad_norm": 0.2374579906463623, + "learning_rate": 5e-06, + "loss": 0.9305, + "num_input_tokens_seen": 346227736, + "step": 762, + "train_runtime": 54958.6615, + "train_tokens_per_second": 6299.785 + }, + { + "epoch": 0.9238074518298508, + "grad_norm": 0.24115844070911407, + "learning_rate": 5e-06, + "loss": 0.9923, + "num_input_tokens_seen": 346675680, + "step": 763, + "train_runtime": 55032.5683, + "train_tokens_per_second": 6299.464 + }, + { + "epoch": 0.9250182086474522, + "grad_norm": 0.24648192524909973, + "learning_rate": 5e-06, + "loss": 0.9706, + "num_input_tokens_seen": 347127480, + "step": 764, + "train_runtime": 55106.8858, + "train_tokens_per_second": 6299.167 + }, + { + "epoch": 0.9262289654650536, + "grad_norm": 0.22947219014167786, + "learning_rate": 5e-06, + "loss": 1.0213, + "num_input_tokens_seen": 347585736, + "step": 765, + "train_runtime": 55183.0077, + "train_tokens_per_second": 6298.782 + }, + { + "epoch": 0.9274397222826549, + "grad_norm": 0.23317500948905945, + "learning_rate": 5e-06, + "loss": 0.9755, + "num_input_tokens_seen": 348040224, + "step": 766, + "train_runtime": 55258.3609, + "train_tokens_per_second": 6298.417 + }, + { + "epoch": 0.9286504791002563, + "grad_norm": 0.24681779742240906, + "learning_rate": 5e-06, + "loss": 0.9838, + "num_input_tokens_seen": 348489688, + "step": 767, + "train_runtime": 55332.4442, + "train_tokens_per_second": 6298.108 + }, + { + "epoch": 0.9298612359178577, + "grad_norm": 0.2522102892398834, + "learning_rate": 5e-06, + "loss": 1.0349, + "num_input_tokens_seen": 348958600, + "step": 768, + "train_runtime": 55410.2472, + "train_tokens_per_second": 6297.727 + }, + { + "epoch": 0.931071992735459, + "grad_norm": 0.2546612024307251, + "learning_rate": 5e-06, + "loss": 0.9432, + "num_input_tokens_seen": 349397976, + "step": 769, + "train_runtime": 55482.6793, + "train_tokens_per_second": 6297.424 + }, + { + "epoch": 0.9322827495530605, + "grad_norm": 0.2435491979122162, + "learning_rate": 5e-06, + "loss": 0.9651, + "num_input_tokens_seen": 349854280, + "step": 770, + "train_runtime": 55558.3888, + "train_tokens_per_second": 6297.056 + }, + { + "epoch": 0.9334935063706619, + "grad_norm": 0.2178066521883011, + "learning_rate": 5e-06, + "loss": 0.9513, + "num_input_tokens_seen": 350322312, + "step": 771, + "train_runtime": 55636.2099, + "train_tokens_per_second": 6296.66 + }, + { + "epoch": 0.9347042631882633, + "grad_norm": 0.23473484814167023, + "learning_rate": 5e-06, + "loss": 0.9474, + "num_input_tokens_seen": 350791976, + "step": 772, + "train_runtime": 55713.2299, + "train_tokens_per_second": 6296.386 + }, + { + "epoch": 0.9359150200058646, + "grad_norm": 0.2700430750846863, + "learning_rate": 5e-06, + "loss": 0.9686, + "num_input_tokens_seen": 351249760, + "step": 773, + "train_runtime": 55789.3346, + "train_tokens_per_second": 6296.002 + }, + { + "epoch": 0.937125776823466, + "grad_norm": 0.22990594804286957, + "learning_rate": 5e-06, + "loss": 0.9592, + "num_input_tokens_seen": 351709136, + "step": 774, + "train_runtime": 55865.6502, + "train_tokens_per_second": 6295.624 + }, + { + "epoch": 0.9383365336410674, + "grad_norm": 0.23456795513629913, + "learning_rate": 5e-06, + "loss": 0.9531, + "num_input_tokens_seen": 352185984, + "step": 775, + "train_runtime": 55944.6884, + "train_tokens_per_second": 6295.253 + }, + { + "epoch": 0.9395472904586687, + "grad_norm": 0.2309848964214325, + "learning_rate": 5e-06, + "loss": 0.9881, + "num_input_tokens_seen": 352631888, + "step": 776, + "train_runtime": 56018.3574, + "train_tokens_per_second": 6294.934 + }, + { + "epoch": 0.9407580472762701, + "grad_norm": 0.2821614146232605, + "learning_rate": 5e-06, + "loss": 1.0289, + "num_input_tokens_seen": 353056656, + "step": 777, + "train_runtime": 56088.0171, + "train_tokens_per_second": 6294.69 + }, + { + "epoch": 0.9419688040938715, + "grad_norm": 0.24919262528419495, + "learning_rate": 5e-06, + "loss": 0.9536, + "num_input_tokens_seen": 353529936, + "step": 778, + "train_runtime": 56166.3971, + "train_tokens_per_second": 6294.332 + }, + { + "epoch": 0.9431795609114728, + "grad_norm": 0.23871028423309326, + "learning_rate": 5e-06, + "loss": 0.9919, + "num_input_tokens_seen": 353990320, + "step": 779, + "train_runtime": 56242.127, + "train_tokens_per_second": 6294.042 + }, + { + "epoch": 0.9443903177290742, + "grad_norm": 0.23189355432987213, + "learning_rate": 5e-06, + "loss": 0.9835, + "num_input_tokens_seen": 354452392, + "step": 780, + "train_runtime": 56318.8627, + "train_tokens_per_second": 6293.671 + }, + { + "epoch": 0.9456010745466756, + "grad_norm": 0.2740236520767212, + "learning_rate": 5e-06, + "loss": 0.9532, + "num_input_tokens_seen": 354899024, + "step": 781, + "train_runtime": 56393.0697, + "train_tokens_per_second": 6293.309 + }, + { + "epoch": 0.946811831364277, + "grad_norm": 0.2556408643722534, + "learning_rate": 5e-06, + "loss": 0.9444, + "num_input_tokens_seen": 355343056, + "step": 782, + "train_runtime": 56466.3117, + "train_tokens_per_second": 6293.01 + }, + { + "epoch": 0.9480225881818783, + "grad_norm": 0.23555780947208405, + "learning_rate": 5e-06, + "loss": 0.9397, + "num_input_tokens_seen": 355818304, + "step": 783, + "train_runtime": 56545.0247, + "train_tokens_per_second": 6292.654 + }, + { + "epoch": 0.9492333449994798, + "grad_norm": 0.25604984164237976, + "learning_rate": 5e-06, + "loss": 0.9677, + "num_input_tokens_seen": 356251568, + "step": 784, + "train_runtime": 56616.3994, + "train_tokens_per_second": 6292.374 + }, + { + "epoch": 0.9504441018170812, + "grad_norm": 0.24111999571323395, + "learning_rate": 5e-06, + "loss": 0.957, + "num_input_tokens_seen": 356696296, + "step": 785, + "train_runtime": 56689.6311, + "train_tokens_per_second": 6292.091 + }, + { + "epoch": 0.9516548586346825, + "grad_norm": 0.22817663848400116, + "learning_rate": 5e-06, + "loss": 0.9279, + "num_input_tokens_seen": 357149968, + "step": 786, + "train_runtime": 56764.8748, + "train_tokens_per_second": 6291.742 + }, + { + "epoch": 0.9528656154522839, + "grad_norm": 0.256910115480423, + "learning_rate": 5e-06, + "loss": 0.9302, + "num_input_tokens_seen": 357599288, + "step": 787, + "train_runtime": 56839.306, + "train_tokens_per_second": 6291.408 + }, + { + "epoch": 0.9540763722698853, + "grad_norm": 0.2196292132139206, + "learning_rate": 5e-06, + "loss": 0.9459, + "num_input_tokens_seen": 358069328, + "step": 788, + "train_runtime": 56917.2118, + "train_tokens_per_second": 6291.055 + }, + { + "epoch": 0.9552871290874866, + "grad_norm": 0.22421136498451233, + "learning_rate": 5e-06, + "loss": 0.9084, + "num_input_tokens_seen": 358534160, + "step": 789, + "train_runtime": 56994.4255, + "train_tokens_per_second": 6290.688 + }, + { + "epoch": 0.956497885905088, + "grad_norm": 0.22506392002105713, + "learning_rate": 5e-06, + "loss": 0.8446, + "num_input_tokens_seen": 358973624, + "step": 790, + "train_runtime": 57067.011, + "train_tokens_per_second": 6290.388 + }, + { + "epoch": 0.9577086427226894, + "grad_norm": 0.2432793378829956, + "learning_rate": 5e-06, + "loss": 0.9813, + "num_input_tokens_seen": 359419408, + "step": 791, + "train_runtime": 57140.9306, + "train_tokens_per_second": 6290.052 + }, + { + "epoch": 0.9589193995402908, + "grad_norm": 0.2352157086133957, + "learning_rate": 5e-06, + "loss": 0.9851, + "num_input_tokens_seen": 359883088, + "step": 792, + "train_runtime": 57218.0451, + "train_tokens_per_second": 6289.678 + }, + { + "epoch": 0.9601301563578921, + "grad_norm": 0.2471296638250351, + "learning_rate": 5e-06, + "loss": 0.9857, + "num_input_tokens_seen": 360354088, + "step": 793, + "train_runtime": 57295.7766, + "train_tokens_per_second": 6289.366 + }, + { + "epoch": 0.9613409131754935, + "grad_norm": 0.24908725917339325, + "learning_rate": 5e-06, + "loss": 0.9422, + "num_input_tokens_seen": 360816256, + "step": 794, + "train_runtime": 57371.6725, + "train_tokens_per_second": 6289.101 + }, + { + "epoch": 0.9625516699930949, + "grad_norm": 0.24670016765594482, + "learning_rate": 5e-06, + "loss": 0.9673, + "num_input_tokens_seen": 361274880, + "step": 795, + "train_runtime": 57447.7271, + "train_tokens_per_second": 6288.758 + }, + { + "epoch": 0.9637624268106962, + "grad_norm": 0.23842549324035645, + "learning_rate": 5e-06, + "loss": 0.9775, + "num_input_tokens_seen": 361733008, + "step": 796, + "train_runtime": 57523.5615, + "train_tokens_per_second": 6288.432 + }, + { + "epoch": 0.9649731836282976, + "grad_norm": 0.24963422119617462, + "learning_rate": 5e-06, + "loss": 0.9221, + "num_input_tokens_seen": 362200552, + "step": 797, + "train_runtime": 57601.2521, + "train_tokens_per_second": 6288.067 + }, + { + "epoch": 0.9661839404458991, + "grad_norm": 0.2490622103214264, + "learning_rate": 5e-06, + "loss": 0.9485, + "num_input_tokens_seen": 362658336, + "step": 798, + "train_runtime": 57676.4994, + "train_tokens_per_second": 6287.801 + }, + { + "epoch": 0.9673946972635004, + "grad_norm": 0.2377602905035019, + "learning_rate": 5e-06, + "loss": 0.9424, + "num_input_tokens_seen": 363103008, + "step": 799, + "train_runtime": 57749.703, + "train_tokens_per_second": 6287.53 + }, + { + "epoch": 0.9686054540811018, + "grad_norm": 0.24257516860961914, + "learning_rate": 5e-06, + "loss": 0.9765, + "num_input_tokens_seen": 363561496, + "step": 800, + "train_runtime": 57825.6172, + "train_tokens_per_second": 6287.205 + }, + { + "epoch": 0.9698162108987032, + "grad_norm": 0.22745341062545776, + "learning_rate": 5e-06, + "loss": 0.9451, + "num_input_tokens_seen": 364027560, + "step": 801, + "train_runtime": 57902.6387, + "train_tokens_per_second": 6286.891 + }, + { + "epoch": 0.9710269677163046, + "grad_norm": 0.24128001928329468, + "learning_rate": 5e-06, + "loss": 0.9569, + "num_input_tokens_seen": 364476736, + "step": 802, + "train_runtime": 57977.0904, + "train_tokens_per_second": 6286.565 + }, + { + "epoch": 0.9722377245339059, + "grad_norm": 0.2616693675518036, + "learning_rate": 5e-06, + "loss": 1.0019, + "num_input_tokens_seen": 364907784, + "step": 803, + "train_runtime": 58048.2061, + "train_tokens_per_second": 6286.289 + }, + { + "epoch": 0.9734484813515073, + "grad_norm": 0.2624351680278778, + "learning_rate": 5e-06, + "loss": 0.9582, + "num_input_tokens_seen": 365354112, + "step": 804, + "train_runtime": 58121.7175, + "train_tokens_per_second": 6286.017 + }, + { + "epoch": 0.9746592381691087, + "grad_norm": 0.24158768355846405, + "learning_rate": 5e-06, + "loss": 0.9769, + "num_input_tokens_seen": 365795992, + "step": 805, + "train_runtime": 58194.7377, + "train_tokens_per_second": 6285.723 + }, + { + "epoch": 0.97586999498671, + "grad_norm": 0.23048560321331024, + "learning_rate": 5e-06, + "loss": 0.955, + "num_input_tokens_seen": 366247480, + "step": 806, + "train_runtime": 58269.4576, + "train_tokens_per_second": 6285.411 + }, + { + "epoch": 0.9770807518043114, + "grad_norm": 0.23612691462039948, + "learning_rate": 5e-06, + "loss": 0.969, + "num_input_tokens_seen": 366707864, + "step": 807, + "train_runtime": 58345.6241, + "train_tokens_per_second": 6285.096 + }, + { + "epoch": 0.9782915086219128, + "grad_norm": 0.23956720530986786, + "learning_rate": 5e-06, + "loss": 0.9427, + "num_input_tokens_seen": 367171912, + "step": 808, + "train_runtime": 58421.8288, + "train_tokens_per_second": 6284.841 + }, + { + "epoch": 0.9795022654395141, + "grad_norm": 0.2306690812110901, + "learning_rate": 5e-06, + "loss": 0.9312, + "num_input_tokens_seen": 367626528, + "step": 809, + "train_runtime": 58497.4965, + "train_tokens_per_second": 6284.483 + }, + { + "epoch": 0.9807130222571155, + "grad_norm": 0.23108424246311188, + "learning_rate": 5e-06, + "loss": 0.9354, + "num_input_tokens_seen": 368110672, + "step": 810, + "train_runtime": 58577.9152, + "train_tokens_per_second": 6284.12 + }, + { + "epoch": 0.981923779074717, + "grad_norm": 0.2248297929763794, + "learning_rate": 5e-06, + "loss": 0.9036, + "num_input_tokens_seen": 368567152, + "step": 811, + "train_runtime": 58653.4012, + "train_tokens_per_second": 6283.816 + }, + { + "epoch": 0.9831345358923184, + "grad_norm": 0.24311695992946625, + "learning_rate": 5e-06, + "loss": 1.0502, + "num_input_tokens_seen": 369025104, + "step": 812, + "train_runtime": 58729.1413, + "train_tokens_per_second": 6283.509 + }, + { + "epoch": 0.9843452927099197, + "grad_norm": 0.24215175211429596, + "learning_rate": 5e-06, + "loss": 0.9737, + "num_input_tokens_seen": 369475024, + "step": 813, + "train_runtime": 58803.2136, + "train_tokens_per_second": 6283.245 + }, + { + "epoch": 0.9855560495275211, + "grad_norm": 0.253462016582489, + "learning_rate": 5e-06, + "loss": 0.9832, + "num_input_tokens_seen": 369906600, + "step": 814, + "train_runtime": 58874.4874, + "train_tokens_per_second": 6282.969 + }, + { + "epoch": 0.9867668063451225, + "grad_norm": 0.23864710330963135, + "learning_rate": 5e-06, + "loss": 0.9919, + "num_input_tokens_seen": 370367360, + "step": 815, + "train_runtime": 58950.3738, + "train_tokens_per_second": 6282.697 + }, + { + "epoch": 0.9879775631627238, + "grad_norm": 0.26924240589141846, + "learning_rate": 5e-06, + "loss": 0.9717, + "num_input_tokens_seen": 370837544, + "step": 816, + "train_runtime": 59028.3463, + "train_tokens_per_second": 6282.364 + }, + { + "epoch": 0.9891883199803252, + "grad_norm": 0.25375184416770935, + "learning_rate": 5e-06, + "loss": 0.9292, + "num_input_tokens_seen": 371321120, + "step": 817, + "train_runtime": 59108.6748, + "train_tokens_per_second": 6282.007 + }, + { + "epoch": 0.9903990767979266, + "grad_norm": 0.24142777919769287, + "learning_rate": 5e-06, + "loss": 0.9723, + "num_input_tokens_seen": 371770736, + "step": 818, + "train_runtime": 59183.2687, + "train_tokens_per_second": 6281.686 + }, + { + "epoch": 0.9916098336155279, + "grad_norm": 0.2367551475763321, + "learning_rate": 5e-06, + "loss": 1.0059, + "num_input_tokens_seen": 372212144, + "step": 819, + "train_runtime": 59256.3298, + "train_tokens_per_second": 6281.39 + }, + { + "epoch": 0.9928205904331293, + "grad_norm": 0.2153656780719757, + "learning_rate": 5e-06, + "loss": 0.922, + "num_input_tokens_seen": 372689824, + "step": 820, + "train_runtime": 59335.7827, + "train_tokens_per_second": 6281.03 + }, + { + "epoch": 0.9940313472507307, + "grad_norm": 0.25366196036338806, + "learning_rate": 5e-06, + "loss": 0.9453, + "num_input_tokens_seen": 373132840, + "step": 821, + "train_runtime": 59408.7438, + "train_tokens_per_second": 6280.773 + }, + { + "epoch": 0.9952421040683321, + "grad_norm": 0.2794412076473236, + "learning_rate": 5e-06, + "loss": 0.9247, + "num_input_tokens_seen": 373539032, + "step": 822, + "train_runtime": 59475.4747, + "train_tokens_per_second": 6280.556 + }, + { + "epoch": 0.9964528608859334, + "grad_norm": 0.24487674236297607, + "learning_rate": 5e-06, + "loss": 0.9423, + "num_input_tokens_seen": 374024816, + "step": 823, + "train_runtime": 59555.7552, + "train_tokens_per_second": 6280.246 + }, + { + "epoch": 0.9976636177035348, + "grad_norm": 0.2563667595386505, + "learning_rate": 5e-06, + "loss": 1.0405, + "num_input_tokens_seen": 374474376, + "step": 824, + "train_runtime": 59629.3553, + "train_tokens_per_second": 6280.034 + }, + { + "epoch": 0.9988743745211363, + "grad_norm": 0.23731544613838196, + "learning_rate": 5e-06, + "loss": 0.9858, + "num_input_tokens_seen": 374932816, + "step": 825, + "train_runtime": 59705.0846, + "train_tokens_per_second": 6279.747 + }, + { + "epoch": 1.0, + "grad_norm": 0.2806185185909271, + "learning_rate": 5e-06, + "loss": 0.9485, + "num_input_tokens_seen": 375383896, + "step": 826, + "train_runtime": 59779.2015, + "train_tokens_per_second": 6279.507 + }, + { + "epoch": 1.0012107568176014, + "grad_norm": 0.32343029975891113, + "learning_rate": 5e-06, + "loss": 0.9351, + "num_input_tokens_seen": 375812944, + "step": 827, + "train_runtime": 59849.0654, + "train_tokens_per_second": 6279.345 + }, + { + "epoch": 1.0024215136352028, + "grad_norm": 0.26928800344467163, + "learning_rate": 5e-06, + "loss": 0.9809, + "num_input_tokens_seen": 376259432, + "step": 828, + "train_runtime": 59921.9342, + "train_tokens_per_second": 6279.16 + }, + { + "epoch": 1.0036322704528042, + "grad_norm": 0.25450897216796875, + "learning_rate": 5e-06, + "loss": 0.9789, + "num_input_tokens_seen": 376714864, + "step": 829, + "train_runtime": 59996.7049, + "train_tokens_per_second": 6278.926 + }, + { + "epoch": 1.0048430272704054, + "grad_norm": 0.28886231780052185, + "learning_rate": 5e-06, + "loss": 0.9311, + "num_input_tokens_seen": 377169072, + "step": 830, + "train_runtime": 60071.6622, + "train_tokens_per_second": 6278.652 + }, + { + "epoch": 1.0060537840880068, + "grad_norm": 0.24842868745326996, + "learning_rate": 5e-06, + "loss": 0.962, + "num_input_tokens_seen": 377620616, + "step": 831, + "train_runtime": 60145.504, + "train_tokens_per_second": 6278.451 + }, + { + "epoch": 1.0072645409056082, + "grad_norm": 0.25559040904045105, + "learning_rate": 5e-06, + "loss": 0.9614, + "num_input_tokens_seen": 378076536, + "step": 832, + "train_runtime": 60220.5226, + "train_tokens_per_second": 6278.201 + }, + { + "epoch": 1.0084752977232097, + "grad_norm": 0.25331735610961914, + "learning_rate": 5e-06, + "loss": 1.0003, + "num_input_tokens_seen": 378514920, + "step": 833, + "train_runtime": 60292.2197, + "train_tokens_per_second": 6278.006 + }, + { + "epoch": 1.009686054540811, + "grad_norm": 0.24362653493881226, + "learning_rate": 5e-06, + "loss": 0.9511, + "num_input_tokens_seen": 378977264, + "step": 834, + "train_runtime": 60368.2898, + "train_tokens_per_second": 6277.754 + }, + { + "epoch": 1.0108968113584125, + "grad_norm": 0.28384852409362793, + "learning_rate": 5e-06, + "loss": 0.9329, + "num_input_tokens_seen": 379436816, + "step": 835, + "train_runtime": 60444.0882, + "train_tokens_per_second": 6277.484 + }, + { + "epoch": 1.0121075681760137, + "grad_norm": 0.2487291693687439, + "learning_rate": 5e-06, + "loss": 0.9456, + "num_input_tokens_seen": 379905488, + "step": 836, + "train_runtime": 60521.0892, + "train_tokens_per_second": 6277.241 + }, + { + "epoch": 1.013318324993615, + "grad_norm": 0.23668697476387024, + "learning_rate": 5e-06, + "loss": 0.9173, + "num_input_tokens_seen": 380351432, + "step": 837, + "train_runtime": 60594.3175, + "train_tokens_per_second": 6277.015 + }, + { + "epoch": 1.0145290818112165, + "grad_norm": 0.325173944234848, + "learning_rate": 5e-06, + "loss": 0.9809, + "num_input_tokens_seen": 380803728, + "step": 838, + "train_runtime": 60668.8616, + "train_tokens_per_second": 6276.757 + }, + { + "epoch": 1.015739838628818, + "grad_norm": 0.23116804659366608, + "learning_rate": 5e-06, + "loss": 0.9244, + "num_input_tokens_seen": 381263144, + "step": 839, + "train_runtime": 60744.5507, + "train_tokens_per_second": 6276.5 + }, + { + "epoch": 1.0169505954464193, + "grad_norm": 0.23826251924037933, + "learning_rate": 5e-06, + "loss": 0.9382, + "num_input_tokens_seen": 381700768, + "step": 840, + "train_runtime": 60816.1107, + "train_tokens_per_second": 6276.31 + }, + { + "epoch": 1.0181613522640207, + "grad_norm": 0.2259143888950348, + "learning_rate": 5e-06, + "loss": 0.9372, + "num_input_tokens_seen": 382148144, + "step": 841, + "train_runtime": 60889.5245, + "train_tokens_per_second": 6276.09 + }, + { + "epoch": 1.0193721090816221, + "grad_norm": 0.254041463136673, + "learning_rate": 5e-06, + "loss": 0.9983, + "num_input_tokens_seen": 382596744, + "step": 842, + "train_runtime": 60963.3714, + "train_tokens_per_second": 6275.846 + }, + { + "epoch": 1.0205828658992233, + "grad_norm": 0.2320503443479538, + "learning_rate": 5e-06, + "loss": 0.9858, + "num_input_tokens_seen": 383049456, + "step": 843, + "train_runtime": 61037.609, + "train_tokens_per_second": 6275.63 + }, + { + "epoch": 1.0217936227168247, + "grad_norm": 0.2377566397190094, + "learning_rate": 5e-06, + "loss": 0.9202, + "num_input_tokens_seen": 383507960, + "step": 844, + "train_runtime": 61113.2522, + "train_tokens_per_second": 6275.365 + }, + { + "epoch": 1.0230043795344261, + "grad_norm": 0.23518335819244385, + "learning_rate": 5e-06, + "loss": 0.9178, + "num_input_tokens_seen": 383957320, + "step": 845, + "train_runtime": 61187.3572, + "train_tokens_per_second": 6275.109 + }, + { + "epoch": 1.0242151363520275, + "grad_norm": 0.2533697187900543, + "learning_rate": 5e-06, + "loss": 0.9678, + "num_input_tokens_seen": 384423264, + "step": 846, + "train_runtime": 61264.2471, + "train_tokens_per_second": 6274.839 + }, + { + "epoch": 1.025425893169629, + "grad_norm": 0.23266910016536713, + "learning_rate": 5e-06, + "loss": 0.9683, + "num_input_tokens_seen": 384880984, + "step": 847, + "train_runtime": 61339.9009, + "train_tokens_per_second": 6274.562 + }, + { + "epoch": 1.0266366499872304, + "grad_norm": 0.26946571469306946, + "learning_rate": 5e-06, + "loss": 0.9402, + "num_input_tokens_seen": 385329376, + "step": 848, + "train_runtime": 61413.5181, + "train_tokens_per_second": 6274.341 + }, + { + "epoch": 1.0278474068048318, + "grad_norm": 0.24856071174144745, + "learning_rate": 5e-06, + "loss": 0.9894, + "num_input_tokens_seen": 385777592, + "step": 849, + "train_runtime": 61487.4406, + "train_tokens_per_second": 6274.088 + }, + { + "epoch": 1.029058163622433, + "grad_norm": 0.2351611852645874, + "learning_rate": 5e-06, + "loss": 1.0025, + "num_input_tokens_seen": 386234720, + "step": 850, + "train_runtime": 61562.9352, + "train_tokens_per_second": 6273.819 + }, + { + "epoch": 1.0302689204400344, + "grad_norm": 0.2401961088180542, + "learning_rate": 5e-06, + "loss": 0.9382, + "num_input_tokens_seen": 386694424, + "step": 851, + "train_runtime": 61638.1399, + "train_tokens_per_second": 6273.623 + }, + { + "epoch": 1.0314796772576358, + "grad_norm": 0.22459951043128967, + "learning_rate": 5e-06, + "loss": 0.9601, + "num_input_tokens_seen": 387157680, + "step": 852, + "train_runtime": 61714.3484, + "train_tokens_per_second": 6273.382 + }, + { + "epoch": 1.0326904340752372, + "grad_norm": 0.234735906124115, + "learning_rate": 5e-06, + "loss": 0.9919, + "num_input_tokens_seen": 387600544, + "step": 853, + "train_runtime": 61787.81, + "train_tokens_per_second": 6273.091 + }, + { + "epoch": 1.0339011908928386, + "grad_norm": 0.25244709849357605, + "learning_rate": 5e-06, + "loss": 0.9456, + "num_input_tokens_seen": 388051704, + "step": 854, + "train_runtime": 61862.1932, + "train_tokens_per_second": 6272.841 + }, + { + "epoch": 1.03511194771044, + "grad_norm": 0.2344299554824829, + "learning_rate": 5e-06, + "loss": 0.9304, + "num_input_tokens_seen": 388502672, + "step": 855, + "train_runtime": 61936.0349, + "train_tokens_per_second": 6272.644 + }, + { + "epoch": 1.0363227045280414, + "grad_norm": 0.23790518939495087, + "learning_rate": 5e-06, + "loss": 0.9557, + "num_input_tokens_seen": 388966360, + "step": 856, + "train_runtime": 62013.0899, + "train_tokens_per_second": 6272.327 + }, + { + "epoch": 1.0375334613456426, + "grad_norm": 0.227335587143898, + "learning_rate": 5e-06, + "loss": 0.9626, + "num_input_tokens_seen": 389459840, + "step": 857, + "train_runtime": 62094.4481, + "train_tokens_per_second": 6272.056 + }, + { + "epoch": 1.038744218163244, + "grad_norm": 0.24627360701560974, + "learning_rate": 5e-06, + "loss": 0.9593, + "num_input_tokens_seen": 389920728, + "step": 858, + "train_runtime": 62170.1606, + "train_tokens_per_second": 6271.831 + }, + { + "epoch": 1.0399549749808454, + "grad_norm": 0.23155222833156586, + "learning_rate": 5e-06, + "loss": 0.9678, + "num_input_tokens_seen": 390406000, + "step": 859, + "train_runtime": 62250.3499, + "train_tokens_per_second": 6271.547 + }, + { + "epoch": 1.0411657317984468, + "grad_norm": 0.24751697480678558, + "learning_rate": 5e-06, + "loss": 0.8877, + "num_input_tokens_seen": 390852104, + "step": 860, + "train_runtime": 62324.5278, + "train_tokens_per_second": 6271.241 + }, + { + "epoch": 1.0423764886160483, + "grad_norm": 0.24071338772773743, + "learning_rate": 5e-06, + "loss": 0.9907, + "num_input_tokens_seen": 391293032, + "step": 861, + "train_runtime": 62397.4491, + "train_tokens_per_second": 6270.978 + }, + { + "epoch": 1.0435872454336497, + "grad_norm": 0.22940731048583984, + "learning_rate": 5e-06, + "loss": 0.9161, + "num_input_tokens_seen": 391770176, + "step": 862, + "train_runtime": 62476.2215, + "train_tokens_per_second": 6270.709 + }, + { + "epoch": 1.0447980022512509, + "grad_norm": 0.2349405437707901, + "learning_rate": 5e-06, + "loss": 0.9215, + "num_input_tokens_seen": 392244600, + "step": 863, + "train_runtime": 62555.2897, + "train_tokens_per_second": 6270.367 + }, + { + "epoch": 1.0460087590688523, + "grad_norm": 0.24631568789482117, + "learning_rate": 5e-06, + "loss": 1.0027, + "num_input_tokens_seen": 392696832, + "step": 864, + "train_runtime": 62629.7506, + "train_tokens_per_second": 6270.132 + }, + { + "epoch": 1.0472195158864537, + "grad_norm": 0.22788004577159882, + "learning_rate": 5e-06, + "loss": 0.9622, + "num_input_tokens_seen": 393167904, + "step": 865, + "train_runtime": 62707.7818, + "train_tokens_per_second": 6269.842 + }, + { + "epoch": 1.048430272704055, + "grad_norm": 0.25337284803390503, + "learning_rate": 5e-06, + "loss": 0.9452, + "num_input_tokens_seen": 393614280, + "step": 866, + "train_runtime": 62781.9156, + "train_tokens_per_second": 6269.549 + }, + { + "epoch": 1.0496410295216565, + "grad_norm": 0.24765488505363464, + "learning_rate": 5e-06, + "loss": 0.9575, + "num_input_tokens_seen": 394048632, + "step": 867, + "train_runtime": 62853.869, + "train_tokens_per_second": 6269.282 + }, + { + "epoch": 1.050851786339258, + "grad_norm": 0.2693709135055542, + "learning_rate": 5e-06, + "loss": 0.922, + "num_input_tokens_seen": 394509160, + "step": 868, + "train_runtime": 62930.1041, + "train_tokens_per_second": 6269.005 + }, + { + "epoch": 1.0520625431568593, + "grad_norm": 0.2373555600643158, + "learning_rate": 5e-06, + "loss": 0.9446, + "num_input_tokens_seen": 394987880, + "step": 869, + "train_runtime": 63009.5701, + "train_tokens_per_second": 6268.697 + }, + { + "epoch": 1.0532732999744605, + "grad_norm": 0.22769400477409363, + "learning_rate": 5e-06, + "loss": 0.9221, + "num_input_tokens_seen": 395451368, + "step": 870, + "train_runtime": 63086.1339, + "train_tokens_per_second": 6268.436 + }, + { + "epoch": 1.054484056792062, + "grad_norm": 0.27482476830482483, + "learning_rate": 5e-06, + "loss": 1.0033, + "num_input_tokens_seen": 395906456, + "step": 871, + "train_runtime": 63161.4527, + "train_tokens_per_second": 6268.166 + }, + { + "epoch": 1.0556948136096633, + "grad_norm": 0.3092348873615265, + "learning_rate": 5e-06, + "loss": 0.925, + "num_input_tokens_seen": 396368728, + "step": 872, + "train_runtime": 63237.3574, + "train_tokens_per_second": 6267.952 + }, + { + "epoch": 1.0569055704272647, + "grad_norm": 0.24406789243221283, + "learning_rate": 5e-06, + "loss": 0.9352, + "num_input_tokens_seen": 396819384, + "step": 873, + "train_runtime": 63311.4228, + "train_tokens_per_second": 6267.738 + }, + { + "epoch": 1.0581163272448662, + "grad_norm": 0.23081360757350922, + "learning_rate": 5e-06, + "loss": 0.9675, + "num_input_tokens_seen": 397266008, + "step": 874, + "train_runtime": 63385.1132, + "train_tokens_per_second": 6267.497 + }, + { + "epoch": 1.0593270840624676, + "grad_norm": 0.23777136206626892, + "learning_rate": 5e-06, + "loss": 0.9109, + "num_input_tokens_seen": 397710296, + "step": 875, + "train_runtime": 63458.1334, + "train_tokens_per_second": 6267.286 + }, + { + "epoch": 1.0605378408800687, + "grad_norm": 0.27890682220458984, + "learning_rate": 5e-06, + "loss": 0.9501, + "num_input_tokens_seen": 398160280, + "step": 876, + "train_runtime": 63532.2973, + "train_tokens_per_second": 6267.053 + }, + { + "epoch": 1.0617485976976702, + "grad_norm": 0.31578439474105835, + "learning_rate": 5e-06, + "loss": 0.9675, + "num_input_tokens_seen": 398617704, + "step": 877, + "train_runtime": 63607.8411, + "train_tokens_per_second": 6266.801 + }, + { + "epoch": 1.0629593545152716, + "grad_norm": 0.265449583530426, + "learning_rate": 5e-06, + "loss": 0.9622, + "num_input_tokens_seen": 399065392, + "step": 878, + "train_runtime": 63681.9925, + "train_tokens_per_second": 6266.534 + }, + { + "epoch": 1.064170111332873, + "grad_norm": 0.23809348046779633, + "learning_rate": 5e-06, + "loss": 0.9394, + "num_input_tokens_seen": 399511384, + "step": 879, + "train_runtime": 63755.2338, + "train_tokens_per_second": 6266.331 + }, + { + "epoch": 1.0653808681504744, + "grad_norm": 0.23853924870491028, + "learning_rate": 5e-06, + "loss": 0.9333, + "num_input_tokens_seen": 399962128, + "step": 880, + "train_runtime": 63829.5915, + "train_tokens_per_second": 6266.093 + }, + { + "epoch": 1.0665916249680758, + "grad_norm": 0.2612011432647705, + "learning_rate": 5e-06, + "loss": 0.9688, + "num_input_tokens_seen": 400415080, + "step": 881, + "train_runtime": 63904.2631, + "train_tokens_per_second": 6265.859 + }, + { + "epoch": 1.0678023817856772, + "grad_norm": 0.24397185444831848, + "learning_rate": 5e-06, + "loss": 0.9522, + "num_input_tokens_seen": 400891320, + "step": 882, + "train_runtime": 63982.8326, + "train_tokens_per_second": 6265.608 + }, + { + "epoch": 1.0690131386032784, + "grad_norm": 0.22875207662582397, + "learning_rate": 5e-06, + "loss": 0.8692, + "num_input_tokens_seen": 401342120, + "step": 883, + "train_runtime": 64057.1181, + "train_tokens_per_second": 6265.379 + }, + { + "epoch": 1.0702238954208798, + "grad_norm": 0.2462654709815979, + "learning_rate": 5e-06, + "loss": 0.9387, + "num_input_tokens_seen": 401803896, + "step": 884, + "train_runtime": 64132.7561, + "train_tokens_per_second": 6265.19 + }, + { + "epoch": 1.0714346522384812, + "grad_norm": 0.24718287587165833, + "learning_rate": 5e-06, + "loss": 0.9991, + "num_input_tokens_seen": 402272664, + "step": 885, + "train_runtime": 64209.9502, + "train_tokens_per_second": 6264.958 + }, + { + "epoch": 1.0726454090560826, + "grad_norm": 0.24072563648223877, + "learning_rate": 5e-06, + "loss": 0.9287, + "num_input_tokens_seen": 402723056, + "step": 886, + "train_runtime": 64284.2525, + "train_tokens_per_second": 6264.723 + }, + { + "epoch": 1.073856165873684, + "grad_norm": 0.2594250440597534, + "learning_rate": 5e-06, + "loss": 0.96, + "num_input_tokens_seen": 403187280, + "step": 887, + "train_runtime": 64360.8895, + "train_tokens_per_second": 6264.477 + }, + { + "epoch": 1.0750669226912855, + "grad_norm": 0.23461049795150757, + "learning_rate": 5e-06, + "loss": 0.9394, + "num_input_tokens_seen": 403634976, + "step": 888, + "train_runtime": 64434.6937, + "train_tokens_per_second": 6264.249 + }, + { + "epoch": 1.0762776795088869, + "grad_norm": 0.26398470997810364, + "learning_rate": 5e-06, + "loss": 0.9979, + "num_input_tokens_seen": 404074752, + "step": 889, + "train_runtime": 64506.9752, + "train_tokens_per_second": 6264.047 + }, + { + "epoch": 1.077488436326488, + "grad_norm": 0.22275783121585846, + "learning_rate": 5e-06, + "loss": 0.9295, + "num_input_tokens_seen": 404526048, + "step": 890, + "train_runtime": 64581.6885, + "train_tokens_per_second": 6263.789 + }, + { + "epoch": 1.0786991931440895, + "grad_norm": 0.24403129518032074, + "learning_rate": 5e-06, + "loss": 0.988, + "num_input_tokens_seen": 404977352, + "step": 891, + "train_runtime": 64656.3632, + "train_tokens_per_second": 6263.534 + }, + { + "epoch": 1.0799099499616909, + "grad_norm": 0.22611185908317566, + "learning_rate": 5e-06, + "loss": 0.9323, + "num_input_tokens_seen": 405472656, + "step": 892, + "train_runtime": 64738.2809, + "train_tokens_per_second": 6263.26 + }, + { + "epoch": 1.0811207067792923, + "grad_norm": 0.24935585260391235, + "learning_rate": 5e-06, + "loss": 0.9754, + "num_input_tokens_seen": 405915216, + "step": 893, + "train_runtime": 64811.4632, + "train_tokens_per_second": 6263.016 + }, + { + "epoch": 1.0823314635968937, + "grad_norm": 0.23537464439868927, + "learning_rate": 5e-06, + "loss": 0.9882, + "num_input_tokens_seen": 406358328, + "step": 894, + "train_runtime": 64884.6774, + "train_tokens_per_second": 6262.778 + }, + { + "epoch": 1.083542220414495, + "grad_norm": 0.25859230756759644, + "learning_rate": 5e-06, + "loss": 1.0094, + "num_input_tokens_seen": 406792080, + "step": 895, + "train_runtime": 64956.3923, + "train_tokens_per_second": 6262.541 + }, + { + "epoch": 1.0847529772320965, + "grad_norm": 0.2601807117462158, + "learning_rate": 5e-06, + "loss": 1.0062, + "num_input_tokens_seen": 407236568, + "step": 896, + "train_runtime": 65029.8834, + "train_tokens_per_second": 6262.299 + }, + { + "epoch": 1.0859637340496977, + "grad_norm": 0.25152677297592163, + "learning_rate": 5e-06, + "loss": 0.9604, + "num_input_tokens_seen": 407694016, + "step": 897, + "train_runtime": 65105.3988, + "train_tokens_per_second": 6262.062 + }, + { + "epoch": 1.0871744908672991, + "grad_norm": 0.2490074634552002, + "learning_rate": 5e-06, + "loss": 0.9767, + "num_input_tokens_seen": 408139040, + "step": 898, + "train_runtime": 65178.6583, + "train_tokens_per_second": 6261.851 + }, + { + "epoch": 1.0883852476849005, + "grad_norm": 0.2619398534297943, + "learning_rate": 5e-06, + "loss": 0.9195, + "num_input_tokens_seen": 408609256, + "step": 899, + "train_runtime": 65255.5022, + "train_tokens_per_second": 6261.683 + }, + { + "epoch": 1.089596004502502, + "grad_norm": 0.22217896580696106, + "learning_rate": 5e-06, + "loss": 0.9578, + "num_input_tokens_seen": 409081904, + "step": 900, + "train_runtime": 65333.4837, + "train_tokens_per_second": 6261.443 + }, + { + "epoch": 1.0908067613201033, + "grad_norm": 0.26266419887542725, + "learning_rate": 5e-06, + "loss": 0.9423, + "num_input_tokens_seen": 409534064, + "step": 901, + "train_runtime": 65407.6873, + "train_tokens_per_second": 6261.253 + }, + { + "epoch": 1.0920175181377048, + "grad_norm": 0.23616282641887665, + "learning_rate": 5e-06, + "loss": 0.9667, + "num_input_tokens_seen": 410013440, + "step": 902, + "train_runtime": 65486.7075, + "train_tokens_per_second": 6261.018 + }, + { + "epoch": 1.093228274955306, + "grad_norm": 0.2340526580810547, + "learning_rate": 5e-06, + "loss": 0.9399, + "num_input_tokens_seen": 410466096, + "step": 903, + "train_runtime": 65561.0012, + "train_tokens_per_second": 6260.827 + }, + { + "epoch": 1.0944390317729074, + "grad_norm": 0.22588470578193665, + "learning_rate": 5e-06, + "loss": 0.9486, + "num_input_tokens_seen": 410910864, + "step": 904, + "train_runtime": 65634.0777, + "train_tokens_per_second": 6260.633 + }, + { + "epoch": 1.0956497885905088, + "grad_norm": 0.22636951506137848, + "learning_rate": 5e-06, + "loss": 0.934, + "num_input_tokens_seen": 411365264, + "step": 905, + "train_runtime": 65708.6219, + "train_tokens_per_second": 6260.446 + }, + { + "epoch": 1.0968605454081102, + "grad_norm": 0.2439277172088623, + "learning_rate": 5e-06, + "loss": 0.8909, + "num_input_tokens_seen": 411829656, + "step": 906, + "train_runtime": 65785.1231, + "train_tokens_per_second": 6260.225 + }, + { + "epoch": 1.0980713022257116, + "grad_norm": 0.24524036049842834, + "learning_rate": 5e-06, + "loss": 0.9994, + "num_input_tokens_seen": 412289888, + "step": 907, + "train_runtime": 65860.7063, + "train_tokens_per_second": 6260.028 + }, + { + "epoch": 1.099282059043313, + "grad_norm": 0.23185384273529053, + "learning_rate": 5e-06, + "loss": 0.9854, + "num_input_tokens_seen": 412763840, + "step": 908, + "train_runtime": 65938.5276, + "train_tokens_per_second": 6259.828 + }, + { + "epoch": 1.1004928158609144, + "grad_norm": 0.22845549881458282, + "learning_rate": 5e-06, + "loss": 0.9688, + "num_input_tokens_seen": 413225160, + "step": 909, + "train_runtime": 66014.5341, + "train_tokens_per_second": 6259.609 + }, + { + "epoch": 1.1017035726785156, + "grad_norm": 0.24248257279396057, + "learning_rate": 5e-06, + "loss": 0.9407, + "num_input_tokens_seen": 413667032, + "step": 910, + "train_runtime": 66086.6471, + "train_tokens_per_second": 6259.465 + }, + { + "epoch": 1.102914329496117, + "grad_norm": 0.2400379329919815, + "learning_rate": 5e-06, + "loss": 0.9766, + "num_input_tokens_seen": 414113888, + "step": 911, + "train_runtime": 66159.6556, + "train_tokens_per_second": 6259.311 + }, + { + "epoch": 1.1041250863137184, + "grad_norm": 0.2528563439846039, + "learning_rate": 5e-06, + "loss": 0.9031, + "num_input_tokens_seen": 414556384, + "step": 912, + "train_runtime": 66232.8368, + "train_tokens_per_second": 6259.076 + }, + { + "epoch": 1.1053358431313198, + "grad_norm": 0.23828411102294922, + "learning_rate": 5e-06, + "loss": 0.9605, + "num_input_tokens_seen": 415033736, + "step": 913, + "train_runtime": 66311.6819, + "train_tokens_per_second": 6258.833 + }, + { + "epoch": 1.1065465999489212, + "grad_norm": 0.2361602932214737, + "learning_rate": 5e-06, + "loss": 0.9757, + "num_input_tokens_seen": 415471768, + "step": 914, + "train_runtime": 66383.8663, + "train_tokens_per_second": 6258.626 + }, + { + "epoch": 1.1077573567665226, + "grad_norm": 0.25282710790634155, + "learning_rate": 5e-06, + "loss": 0.9595, + "num_input_tokens_seen": 415920848, + "step": 915, + "train_runtime": 66457.5615, + "train_tokens_per_second": 6258.443 + }, + { + "epoch": 1.1089681135841238, + "grad_norm": 0.24360793828964233, + "learning_rate": 5e-06, + "loss": 0.9652, + "num_input_tokens_seen": 416359384, + "step": 916, + "train_runtime": 66529.0151, + "train_tokens_per_second": 6258.313 + }, + { + "epoch": 1.1101788704017252, + "grad_norm": 0.24343234300613403, + "learning_rate": 5e-06, + "loss": 0.9539, + "num_input_tokens_seen": 416791408, + "step": 917, + "train_runtime": 66599.6517, + "train_tokens_per_second": 6258.162 + }, + { + "epoch": 1.1113896272193267, + "grad_norm": 0.22756776213645935, + "learning_rate": 5e-06, + "loss": 0.9909, + "num_input_tokens_seen": 417240208, + "step": 918, + "train_runtime": 66673.3949, + "train_tokens_per_second": 6257.972 + }, + { + "epoch": 1.112600384036928, + "grad_norm": 0.24931581318378448, + "learning_rate": 5e-06, + "loss": 0.9645, + "num_input_tokens_seen": 417696072, + "step": 919, + "train_runtime": 66747.9073, + "train_tokens_per_second": 6257.815 + }, + { + "epoch": 1.1138111408545295, + "grad_norm": 0.2384309619665146, + "learning_rate": 5e-06, + "loss": 0.9136, + "num_input_tokens_seen": 418138568, + "step": 920, + "train_runtime": 66820.2813, + "train_tokens_per_second": 6257.659 + }, + { + "epoch": 1.115021897672131, + "grad_norm": 0.2728740870952606, + "learning_rate": 5e-06, + "loss": 0.9831, + "num_input_tokens_seen": 418582560, + "step": 921, + "train_runtime": 66893.2348, + "train_tokens_per_second": 6257.472 + }, + { + "epoch": 1.1162326544897323, + "grad_norm": 0.22459077835083008, + "learning_rate": 5e-06, + "loss": 0.9284, + "num_input_tokens_seen": 419046112, + "step": 922, + "train_runtime": 66969.9731, + "train_tokens_per_second": 6257.224 + }, + { + "epoch": 1.1174434113073335, + "grad_norm": 0.22039759159088135, + "learning_rate": 5e-06, + "loss": 0.9443, + "num_input_tokens_seen": 419517280, + "step": 923, + "train_runtime": 67047.7728, + "train_tokens_per_second": 6256.991 + }, + { + "epoch": 1.118654168124935, + "grad_norm": 0.251267671585083, + "learning_rate": 5e-06, + "loss": 0.9654, + "num_input_tokens_seen": 419968712, + "step": 924, + "train_runtime": 67121.4437, + "train_tokens_per_second": 6256.849 + }, + { + "epoch": 1.1198649249425363, + "grad_norm": 0.24382558465003967, + "learning_rate": 5e-06, + "loss": 0.9589, + "num_input_tokens_seen": 420406984, + "step": 925, + "train_runtime": 67193.015, + "train_tokens_per_second": 6256.707 + }, + { + "epoch": 1.1210756817601377, + "grad_norm": 0.22386138141155243, + "learning_rate": 5e-06, + "loss": 0.9313, + "num_input_tokens_seen": 420861848, + "step": 926, + "train_runtime": 67267.6486, + "train_tokens_per_second": 6256.527 + }, + { + "epoch": 1.1222864385777391, + "grad_norm": 0.21948383748531342, + "learning_rate": 5e-06, + "loss": 0.9729, + "num_input_tokens_seen": 421331168, + "step": 927, + "train_runtime": 67345.0933, + "train_tokens_per_second": 6256.301 + }, + { + "epoch": 1.1234971953953405, + "grad_norm": 0.2778039574623108, + "learning_rate": 5e-06, + "loss": 1.016, + "num_input_tokens_seen": 421758736, + "step": 928, + "train_runtime": 67414.5044, + "train_tokens_per_second": 6256.202 + }, + { + "epoch": 1.124707952212942, + "grad_norm": 0.2170412689447403, + "learning_rate": 5e-06, + "loss": 0.9539, + "num_input_tokens_seen": 422211592, + "step": 929, + "train_runtime": 67488.8275, + "train_tokens_per_second": 6256.022 + }, + { + "epoch": 1.1259187090305431, + "grad_norm": 0.25213587284088135, + "learning_rate": 5e-06, + "loss": 0.8931, + "num_input_tokens_seen": 422680032, + "step": 930, + "train_runtime": 67566.688, + "train_tokens_per_second": 6255.746 + }, + { + "epoch": 1.1271294658481446, + "grad_norm": 0.23005911707878113, + "learning_rate": 5e-06, + "loss": 0.9915, + "num_input_tokens_seen": 423140064, + "step": 931, + "train_runtime": 67642.6868, + "train_tokens_per_second": 6255.518 + }, + { + "epoch": 1.128340222665746, + "grad_norm": 0.25569239258766174, + "learning_rate": 5e-06, + "loss": 0.9817, + "num_input_tokens_seen": 423618400, + "step": 932, + "train_runtime": 67721.4977, + "train_tokens_per_second": 6255.302 + }, + { + "epoch": 1.1295509794833474, + "grad_norm": 0.2626954913139343, + "learning_rate": 5e-06, + "loss": 0.9452, + "num_input_tokens_seen": 424089544, + "step": 933, + "train_runtime": 67799.5609, + "train_tokens_per_second": 6255.049 + }, + { + "epoch": 1.1307617363009488, + "grad_norm": 0.2500688135623932, + "learning_rate": 5e-06, + "loss": 1.0022, + "num_input_tokens_seen": 424567696, + "step": 934, + "train_runtime": 67878.4197, + "train_tokens_per_second": 6254.826 + }, + { + "epoch": 1.1319724931185502, + "grad_norm": 0.23637151718139648, + "learning_rate": 5e-06, + "loss": 1.0038, + "num_input_tokens_seen": 425008992, + "step": 935, + "train_runtime": 67951.2851, + "train_tokens_per_second": 6254.613 + }, + { + "epoch": 1.1331832499361516, + "grad_norm": 0.22515641152858734, + "learning_rate": 5e-06, + "loss": 0.9132, + "num_input_tokens_seen": 425487232, + "step": 936, + "train_runtime": 68030.5016, + "train_tokens_per_second": 6254.36 + }, + { + "epoch": 1.1343940067537528, + "grad_norm": 0.22837060689926147, + "learning_rate": 5e-06, + "loss": 0.9165, + "num_input_tokens_seen": 425957792, + "step": 937, + "train_runtime": 68107.9582, + "train_tokens_per_second": 6254.156 + }, + { + "epoch": 1.1356047635713542, + "grad_norm": 0.2596193552017212, + "learning_rate": 5e-06, + "loss": 0.95, + "num_input_tokens_seen": 426428528, + "step": 938, + "train_runtime": 68185.8307, + "train_tokens_per_second": 6253.917 + }, + { + "epoch": 1.1368155203889556, + "grad_norm": 0.23047588765621185, + "learning_rate": 5e-06, + "loss": 0.9153, + "num_input_tokens_seen": 426877936, + "step": 939, + "train_runtime": 68259.9737, + "train_tokens_per_second": 6253.708 + }, + { + "epoch": 1.138026277206557, + "grad_norm": 0.22194674611091614, + "learning_rate": 5e-06, + "loss": 0.9353, + "num_input_tokens_seen": 427332784, + "step": 940, + "train_runtime": 68334.9772, + "train_tokens_per_second": 6253.5 + }, + { + "epoch": 1.1392370340241584, + "grad_norm": 0.2305593639612198, + "learning_rate": 5e-06, + "loss": 0.9578, + "num_input_tokens_seen": 427770848, + "step": 941, + "train_runtime": 68406.5809, + "train_tokens_per_second": 6253.358 + }, + { + "epoch": 1.1404477908417598, + "grad_norm": 0.2662777900695801, + "learning_rate": 5e-06, + "loss": 0.9606, + "num_input_tokens_seen": 428218112, + "step": 942, + "train_runtime": 68479.8162, + "train_tokens_per_second": 6253.202 + }, + { + "epoch": 1.141658547659361, + "grad_norm": 0.26229748129844666, + "learning_rate": 5e-06, + "loss": 0.9166, + "num_input_tokens_seen": 428684216, + "step": 943, + "train_runtime": 68556.798, + "train_tokens_per_second": 6252.979 + }, + { + "epoch": 1.1428693044769624, + "grad_norm": 0.22433774173259735, + "learning_rate": 5e-06, + "loss": 0.9171, + "num_input_tokens_seen": 429177656, + "step": 944, + "train_runtime": 68638.2099, + "train_tokens_per_second": 6252.751 + }, + { + "epoch": 1.1440800612945639, + "grad_norm": 0.23602762818336487, + "learning_rate": 5e-06, + "loss": 0.9273, + "num_input_tokens_seen": 429659776, + "step": 945, + "train_runtime": 68717.5939, + "train_tokens_per_second": 6252.544 + }, + { + "epoch": 1.1452908181121653, + "grad_norm": 0.246641144156456, + "learning_rate": 5e-06, + "loss": 0.9523, + "num_input_tokens_seen": 430124728, + "step": 946, + "train_runtime": 68793.6291, + "train_tokens_per_second": 6252.392 + }, + { + "epoch": 1.1465015749297667, + "grad_norm": 0.2760850787162781, + "learning_rate": 5e-06, + "loss": 0.9436, + "num_input_tokens_seen": 430575992, + "step": 947, + "train_runtime": 68868.1445, + "train_tokens_per_second": 6252.179 + }, + { + "epoch": 1.147712331747368, + "grad_norm": 0.24327822029590607, + "learning_rate": 5e-06, + "loss": 0.9575, + "num_input_tokens_seen": 431013928, + "step": 948, + "train_runtime": 68940.3142, + "train_tokens_per_second": 6251.987 + }, + { + "epoch": 1.1489230885649695, + "grad_norm": 0.24040260910987854, + "learning_rate": 5e-06, + "loss": 0.9448, + "num_input_tokens_seen": 431462608, + "step": 949, + "train_runtime": 69014.5475, + "train_tokens_per_second": 6251.763 + }, + { + "epoch": 1.1501338453825707, + "grad_norm": 0.23738116025924683, + "learning_rate": 5e-06, + "loss": 0.9312, + "num_input_tokens_seen": 431919616, + "step": 950, + "train_runtime": 69089.8288, + "train_tokens_per_second": 6251.566 + }, + { + "epoch": 1.151344602200172, + "grad_norm": 0.26888352632522583, + "learning_rate": 5e-06, + "loss": 1.0274, + "num_input_tokens_seen": 432378360, + "step": 951, + "train_runtime": 69166.0174, + "train_tokens_per_second": 6251.312 + }, + { + "epoch": 1.1525553590177735, + "grad_norm": 0.3020702600479126, + "learning_rate": 5e-06, + "loss": 0.966, + "num_input_tokens_seen": 432815336, + "step": 952, + "train_runtime": 69238.0188, + "train_tokens_per_second": 6251.122 + }, + { + "epoch": 1.153766115835375, + "grad_norm": 0.23694109916687012, + "learning_rate": 5e-06, + "loss": 0.9676, + "num_input_tokens_seen": 433256224, + "step": 953, + "train_runtime": 69310.4665, + "train_tokens_per_second": 6250.949 + }, + { + "epoch": 1.1549768726529763, + "grad_norm": 0.26480624079704285, + "learning_rate": 5e-06, + "loss": 0.9864, + "num_input_tokens_seen": 433691568, + "step": 954, + "train_runtime": 69382.337, + "train_tokens_per_second": 6250.749 + }, + { + "epoch": 1.1561876294705777, + "grad_norm": 0.2512606382369995, + "learning_rate": 5e-06, + "loss": 0.9802, + "num_input_tokens_seen": 434141344, + "step": 955, + "train_runtime": 69456.6448, + "train_tokens_per_second": 6250.537 + }, + { + "epoch": 1.157398386288179, + "grad_norm": 0.2603987455368042, + "learning_rate": 5e-06, + "loss": 0.9443, + "num_input_tokens_seen": 434571288, + "step": 956, + "train_runtime": 69527.4526, + "train_tokens_per_second": 6250.355 + }, + { + "epoch": 1.1586091431057803, + "grad_norm": 0.2712121903896332, + "learning_rate": 5e-06, + "loss": 0.9299, + "num_input_tokens_seen": 435010000, + "step": 957, + "train_runtime": 69599.8559, + "train_tokens_per_second": 6250.157 + }, + { + "epoch": 1.1598198999233817, + "grad_norm": 0.2328772246837616, + "learning_rate": 5e-06, + "loss": 0.9054, + "num_input_tokens_seen": 435464528, + "step": 958, + "train_runtime": 69673.8246, + "train_tokens_per_second": 6250.045 + }, + { + "epoch": 1.1610306567409832, + "grad_norm": 0.23724646866321564, + "learning_rate": 5e-06, + "loss": 0.9686, + "num_input_tokens_seen": 435929064, + "step": 959, + "train_runtime": 69747.3951, + "train_tokens_per_second": 6250.112 + }, + { + "epoch": 1.1622414135585846, + "grad_norm": 0.2300594449043274, + "learning_rate": 5e-06, + "loss": 0.9641, + "num_input_tokens_seen": 436379864, + "step": 960, + "train_runtime": 69821.0669, + "train_tokens_per_second": 6249.974 + }, + { + "epoch": 1.163452170376186, + "grad_norm": 0.24695640802383423, + "learning_rate": 5e-06, + "loss": 0.985, + "num_input_tokens_seen": 436818672, + "step": 961, + "train_runtime": 69893.2539, + "train_tokens_per_second": 6249.797 + }, + { + "epoch": 1.1646629271937874, + "grad_norm": 0.25464367866516113, + "learning_rate": 5e-06, + "loss": 0.955, + "num_input_tokens_seen": 437256528, + "step": 962, + "train_runtime": 69965.6023, + "train_tokens_per_second": 6249.593 + }, + { + "epoch": 1.1658736840113888, + "grad_norm": 0.23890846967697144, + "learning_rate": 5e-06, + "loss": 0.9812, + "num_input_tokens_seen": 437704184, + "step": 963, + "train_runtime": 70039.3875, + "train_tokens_per_second": 6249.401 + }, + { + "epoch": 1.16708444082899, + "grad_norm": 0.22985456883907318, + "learning_rate": 5e-06, + "loss": 0.9547, + "num_input_tokens_seen": 438157616, + "step": 964, + "train_runtime": 70114.4049, + "train_tokens_per_second": 6249.181 + }, + { + "epoch": 1.1682951976465914, + "grad_norm": 0.2481573075056076, + "learning_rate": 5e-06, + "loss": 0.9783, + "num_input_tokens_seen": 438604112, + "step": 965, + "train_runtime": 70187.4586, + "train_tokens_per_second": 6249.038 + }, + { + "epoch": 1.1695059544641928, + "grad_norm": 0.23532527685165405, + "learning_rate": 5e-06, + "loss": 0.9783, + "num_input_tokens_seen": 439082896, + "step": 966, + "train_runtime": 70266.4555, + "train_tokens_per_second": 6248.827 + }, + { + "epoch": 1.1707167112817942, + "grad_norm": 0.2518933117389679, + "learning_rate": 5e-06, + "loss": 0.9852, + "num_input_tokens_seen": 439533512, + "step": 967, + "train_runtime": 70340.9795, + "train_tokens_per_second": 6248.612 + }, + { + "epoch": 1.1719274680993956, + "grad_norm": 0.22327609360218048, + "learning_rate": 5e-06, + "loss": 0.9746, + "num_input_tokens_seen": 440017968, + "step": 968, + "train_runtime": 70421.2059, + "train_tokens_per_second": 6248.373 + }, + { + "epoch": 1.173138224916997, + "grad_norm": 0.21766787767410278, + "learning_rate": 5e-06, + "loss": 0.9427, + "num_input_tokens_seen": 440485848, + "step": 969, + "train_runtime": 70498.8622, + "train_tokens_per_second": 6248.127 + }, + { + "epoch": 1.1743489817345982, + "grad_norm": 0.24497343599796295, + "learning_rate": 5e-06, + "loss": 0.9164, + "num_input_tokens_seen": 440958848, + "step": 970, + "train_runtime": 70577.0563, + "train_tokens_per_second": 6247.906 + }, + { + "epoch": 1.1755597385521996, + "grad_norm": 0.24692267179489136, + "learning_rate": 5e-06, + "loss": 0.9531, + "num_input_tokens_seen": 441430568, + "step": 971, + "train_runtime": 70654.5666, + "train_tokens_per_second": 6247.729 + }, + { + "epoch": 1.176770495369801, + "grad_norm": 0.2446671724319458, + "learning_rate": 5e-06, + "loss": 1.0036, + "num_input_tokens_seen": 441870464, + "step": 972, + "train_runtime": 70726.7808, + "train_tokens_per_second": 6247.569 + }, + { + "epoch": 1.1779812521874025, + "grad_norm": 0.22598214447498322, + "learning_rate": 5e-06, + "loss": 0.9359, + "num_input_tokens_seen": 442325960, + "step": 973, + "train_runtime": 70802.4686, + "train_tokens_per_second": 6247.324 + }, + { + "epoch": 1.1791920090050039, + "grad_norm": 0.23768270015716553, + "learning_rate": 5e-06, + "loss": 0.9584, + "num_input_tokens_seen": 442783712, + "step": 974, + "train_runtime": 70878.7415, + "train_tokens_per_second": 6247.059 + }, + { + "epoch": 1.1804027658226053, + "grad_norm": 0.2732614576816559, + "learning_rate": 5e-06, + "loss": 0.992, + "num_input_tokens_seen": 443235984, + "step": 975, + "train_runtime": 70953.8006, + "train_tokens_per_second": 6246.825 + }, + { + "epoch": 1.1816135226402067, + "grad_norm": 0.22531206905841827, + "learning_rate": 5e-06, + "loss": 0.9176, + "num_input_tokens_seen": 443689408, + "step": 976, + "train_runtime": 71028.2155, + "train_tokens_per_second": 6246.664 + }, + { + "epoch": 1.1828242794578079, + "grad_norm": 0.246334969997406, + "learning_rate": 5e-06, + "loss": 0.9678, + "num_input_tokens_seen": 444127032, + "step": 977, + "train_runtime": 71099.99, + "train_tokens_per_second": 6246.513 + }, + { + "epoch": 1.1840350362754093, + "grad_norm": 0.2669452428817749, + "learning_rate": 5e-06, + "loss": 0.9531, + "num_input_tokens_seen": 444581568, + "step": 978, + "train_runtime": 71175.2703, + "train_tokens_per_second": 6246.293 + }, + { + "epoch": 1.1852457930930107, + "grad_norm": 0.24605169892311096, + "learning_rate": 5e-06, + "loss": 0.9629, + "num_input_tokens_seen": 445041736, + "step": 979, + "train_runtime": 71251.1753, + "train_tokens_per_second": 6246.097 + }, + { + "epoch": 1.1864565499106121, + "grad_norm": 0.2738041877746582, + "learning_rate": 5e-06, + "loss": 0.9536, + "num_input_tokens_seen": 445507072, + "step": 980, + "train_runtime": 71328.1089, + "train_tokens_per_second": 6245.884 + }, + { + "epoch": 1.1876673067282135, + "grad_norm": 0.23345020413398743, + "learning_rate": 5e-06, + "loss": 0.9208, + "num_input_tokens_seen": 445989088, + "step": 981, + "train_runtime": 71407.5879, + "train_tokens_per_second": 6245.682 + }, + { + "epoch": 1.188878063545815, + "grad_norm": 0.24863320589065552, + "learning_rate": 5e-06, + "loss": 1.0073, + "num_input_tokens_seen": 446459192, + "step": 982, + "train_runtime": 71485.3609, + "train_tokens_per_second": 6245.463 + }, + { + "epoch": 1.1900888203634161, + "grad_norm": 0.230724036693573, + "learning_rate": 5e-06, + "loss": 0.9858, + "num_input_tokens_seen": 446904400, + "step": 983, + "train_runtime": 71558.8839, + "train_tokens_per_second": 6245.268 + }, + { + "epoch": 1.1912995771810175, + "grad_norm": 0.26054030656814575, + "learning_rate": 5e-06, + "loss": 0.9527, + "num_input_tokens_seen": 447363192, + "step": 984, + "train_runtime": 71634.9114, + "train_tokens_per_second": 6245.044 + }, + { + "epoch": 1.192510333998619, + "grad_norm": 0.24276606738567352, + "learning_rate": 5e-06, + "loss": 0.9529, + "num_input_tokens_seen": 447817104, + "step": 985, + "train_runtime": 71709.3132, + "train_tokens_per_second": 6244.895 + }, + { + "epoch": 1.1937210908162204, + "grad_norm": 0.24462191760540009, + "learning_rate": 5e-06, + "loss": 0.9536, + "num_input_tokens_seen": 448278224, + "step": 986, + "train_runtime": 71785.9847, + "train_tokens_per_second": 6244.648 + }, + { + "epoch": 1.1949318476338218, + "grad_norm": 0.2560247778892517, + "learning_rate": 5e-06, + "loss": 0.9833, + "num_input_tokens_seen": 448739320, + "step": 987, + "train_runtime": 71861.9391, + "train_tokens_per_second": 6244.464 + }, + { + "epoch": 1.1961426044514232, + "grad_norm": 0.24045203626155853, + "learning_rate": 5e-06, + "loss": 0.9363, + "num_input_tokens_seen": 449186640, + "step": 988, + "train_runtime": 71935.9522, + "train_tokens_per_second": 6244.258 + }, + { + "epoch": 1.1973533612690246, + "grad_norm": 0.23872441053390503, + "learning_rate": 5e-06, + "loss": 0.8979, + "num_input_tokens_seen": 449653848, + "step": 989, + "train_runtime": 72012.385, + "train_tokens_per_second": 6244.118 + }, + { + "epoch": 1.1985641180866258, + "grad_norm": 0.28531908988952637, + "learning_rate": 5e-06, + "loss": 0.914, + "num_input_tokens_seen": 450080552, + "step": 990, + "train_runtime": 72082.6578, + "train_tokens_per_second": 6243.951 + }, + { + "epoch": 1.1997748749042272, + "grad_norm": 0.2463030368089676, + "learning_rate": 5e-06, + "loss": 0.9246, + "num_input_tokens_seen": 450513000, + "step": 991, + "train_runtime": 72153.7158, + "train_tokens_per_second": 6243.795 + }, + { + "epoch": 1.2009856317218286, + "grad_norm": 0.23530061542987823, + "learning_rate": 5e-06, + "loss": 0.9557, + "num_input_tokens_seen": 450955120, + "step": 992, + "train_runtime": 72226.4633, + "train_tokens_per_second": 6243.627 + }, + { + "epoch": 1.20219638853943, + "grad_norm": 0.246900275349617, + "learning_rate": 5e-06, + "loss": 0.8806, + "num_input_tokens_seen": 451394736, + "step": 993, + "train_runtime": 72298.4949, + "train_tokens_per_second": 6243.487 + }, + { + "epoch": 1.2034071453570314, + "grad_norm": 0.25331759452819824, + "learning_rate": 5e-06, + "loss": 1.0287, + "num_input_tokens_seen": 451851352, + "step": 994, + "train_runtime": 72373.4673, + "train_tokens_per_second": 6243.329 + }, + { + "epoch": 1.2046179021746328, + "grad_norm": 0.23674815893173218, + "learning_rate": 5e-06, + "loss": 0.9687, + "num_input_tokens_seen": 452305600, + "step": 995, + "train_runtime": 72448.5085, + "train_tokens_per_second": 6243.132 + }, + { + "epoch": 1.205828658992234, + "grad_norm": 0.24373270571231842, + "learning_rate": 5e-06, + "loss": 0.9765, + "num_input_tokens_seen": 452773312, + "step": 996, + "train_runtime": 72525.8954, + "train_tokens_per_second": 6242.919 + }, + { + "epoch": 1.2070394158098354, + "grad_norm": 0.2752549350261688, + "learning_rate": 5e-06, + "loss": 0.9142, + "num_input_tokens_seen": 453228432, + "step": 997, + "train_runtime": 72600.7249, + "train_tokens_per_second": 6242.754 + }, + { + "epoch": 1.2082501726274368, + "grad_norm": 0.2349204123020172, + "learning_rate": 5e-06, + "loss": 0.9476, + "num_input_tokens_seen": 453688736, + "step": 998, + "train_runtime": 72676.6277, + "train_tokens_per_second": 6242.567 + }, + { + "epoch": 1.2094609294450382, + "grad_norm": 0.23312972486019135, + "learning_rate": 5e-06, + "loss": 0.9553, + "num_input_tokens_seen": 454156576, + "step": 999, + "train_runtime": 72754.6022, + "train_tokens_per_second": 6242.307 + }, + { + "epoch": 1.2106716862626397, + "grad_norm": 0.24874716997146606, + "learning_rate": 5e-06, + "loss": 0.9416, + "num_input_tokens_seen": 454614576, + "step": 1000, + "train_runtime": 72830.4183, + "train_tokens_per_second": 6242.098 + }, + { + "epoch": 1.211882443080241, + "grad_norm": 0.21960654854774475, + "learning_rate": 5e-06, + "loss": 0.9301, + "num_input_tokens_seen": 455060760, + "step": 1001, + "train_runtime": 72904.2157, + "train_tokens_per_second": 6241.899 + }, + { + "epoch": 1.2130931998978425, + "grad_norm": 0.23251725733280182, + "learning_rate": 5e-06, + "loss": 0.9488, + "num_input_tokens_seen": 455524632, + "step": 1002, + "train_runtime": 72981.2956, + "train_tokens_per_second": 6241.663 + }, + { + "epoch": 1.2143039567154439, + "grad_norm": 0.2484462857246399, + "learning_rate": 5e-06, + "loss": 0.9695, + "num_input_tokens_seen": 455985256, + "step": 1003, + "train_runtime": 73057.0852, + "train_tokens_per_second": 6241.493 + }, + { + "epoch": 1.215514713533045, + "grad_norm": 0.23444589972496033, + "learning_rate": 5e-06, + "loss": 0.9248, + "num_input_tokens_seen": 456447872, + "step": 1004, + "train_runtime": 73133.7066, + "train_tokens_per_second": 6241.279 + }, + { + "epoch": 1.2167254703506465, + "grad_norm": 0.23066623508930206, + "learning_rate": 5e-06, + "loss": 0.9341, + "num_input_tokens_seen": 456918240, + "step": 1005, + "train_runtime": 73211.6327, + "train_tokens_per_second": 6241.061 + }, + { + "epoch": 1.217936227168248, + "grad_norm": 0.26110243797302246, + "learning_rate": 5e-06, + "loss": 0.9673, + "num_input_tokens_seen": 457361960, + "step": 1006, + "train_runtime": 73284.6571, + "train_tokens_per_second": 6240.896 + }, + { + "epoch": 1.2191469839858493, + "grad_norm": 0.22857554256916046, + "learning_rate": 5e-06, + "loss": 0.9429, + "num_input_tokens_seen": 457829616, + "step": 1007, + "train_runtime": 73362.2686, + "train_tokens_per_second": 6240.669 + }, + { + "epoch": 1.2203577408034507, + "grad_norm": 0.21707653999328613, + "learning_rate": 5e-06, + "loss": 0.9058, + "num_input_tokens_seen": 458299576, + "step": 1008, + "train_runtime": 73440.2904, + "train_tokens_per_second": 6240.438 + }, + { + "epoch": 1.2215684976210521, + "grad_norm": 0.21953126788139343, + "learning_rate": 5e-06, + "loss": 0.9552, + "num_input_tokens_seen": 458744008, + "step": 1009, + "train_runtime": 73513.7035, + "train_tokens_per_second": 6240.252 + }, + { + "epoch": 1.2227792544386533, + "grad_norm": 0.24714279174804688, + "learning_rate": 5e-06, + "loss": 0.9093, + "num_input_tokens_seen": 459194536, + "step": 1010, + "train_runtime": 73589.1216, + "train_tokens_per_second": 6239.978 + }, + { + "epoch": 1.2239900112562547, + "grad_norm": 0.2624055743217468, + "learning_rate": 5e-06, + "loss": 0.9537, + "num_input_tokens_seen": 459658064, + "step": 1011, + "train_runtime": 73665.7012, + "train_tokens_per_second": 6239.784 + }, + { + "epoch": 1.2252007680738561, + "grad_norm": 0.24378705024719238, + "learning_rate": 5e-06, + "loss": 0.9647, + "num_input_tokens_seen": 460102312, + "step": 1012, + "train_runtime": 73739.1174, + "train_tokens_per_second": 6239.596 + }, + { + "epoch": 1.2264115248914575, + "grad_norm": 0.2524285316467285, + "learning_rate": 5e-06, + "loss": 1.008, + "num_input_tokens_seen": 460530568, + "step": 1013, + "train_runtime": 73809.5347, + "train_tokens_per_second": 6239.445 + }, + { + "epoch": 1.227622281709059, + "grad_norm": 0.22694693505764008, + "learning_rate": 5e-06, + "loss": 0.9336, + "num_input_tokens_seen": 460976752, + "step": 1014, + "train_runtime": 73882.6824, + "train_tokens_per_second": 6239.307 + }, + { + "epoch": 1.2288330385266604, + "grad_norm": 0.24876870214939117, + "learning_rate": 5e-06, + "loss": 0.9167, + "num_input_tokens_seen": 461412360, + "step": 1015, + "train_runtime": 73954.7175, + "train_tokens_per_second": 6239.12 + }, + { + "epoch": 1.2300437953442618, + "grad_norm": 0.23304542899131775, + "learning_rate": 5e-06, + "loss": 0.937, + "num_input_tokens_seen": 461881936, + "step": 1016, + "train_runtime": 74032.0414, + "train_tokens_per_second": 6238.946 + }, + { + "epoch": 1.231254552161863, + "grad_norm": 0.2319115698337555, + "learning_rate": 5e-06, + "loss": 1.0273, + "num_input_tokens_seen": 462327848, + "step": 1017, + "train_runtime": 74105.762, + "train_tokens_per_second": 6238.757 + }, + { + "epoch": 1.2324653089794644, + "grad_norm": 0.2387470155954361, + "learning_rate": 5e-06, + "loss": 0.9402, + "num_input_tokens_seen": 462801768, + "step": 1018, + "train_runtime": 74184.7163, + "train_tokens_per_second": 6238.506 + }, + { + "epoch": 1.2336760657970658, + "grad_norm": 0.23503154516220093, + "learning_rate": 5e-06, + "loss": 0.9537, + "num_input_tokens_seen": 463250128, + "step": 1019, + "train_runtime": 74258.5041, + "train_tokens_per_second": 6238.344 + }, + { + "epoch": 1.2348868226146672, + "grad_norm": 0.2387133687734604, + "learning_rate": 5e-06, + "loss": 0.9841, + "num_input_tokens_seen": 463712976, + "step": 1020, + "train_runtime": 74335.47, + "train_tokens_per_second": 6238.112 + }, + { + "epoch": 1.2360975794322686, + "grad_norm": 0.2348925918340683, + "learning_rate": 5e-06, + "loss": 0.9325, + "num_input_tokens_seen": 464170752, + "step": 1021, + "train_runtime": 74411.2711, + "train_tokens_per_second": 6237.909 + }, + { + "epoch": 1.23730833624987, + "grad_norm": 0.2409505844116211, + "learning_rate": 5e-06, + "loss": 1.0182, + "num_input_tokens_seen": 464620440, + "step": 1022, + "train_runtime": 74485.4943, + "train_tokens_per_second": 6237.731 + }, + { + "epoch": 1.2385190930674712, + "grad_norm": 0.2405453324317932, + "learning_rate": 5e-06, + "loss": 0.9672, + "num_input_tokens_seen": 465074072, + "step": 1023, + "train_runtime": 74560.4666, + "train_tokens_per_second": 6237.542 + }, + { + "epoch": 1.2397298498850726, + "grad_norm": 0.2541082799434662, + "learning_rate": 5e-06, + "loss": 0.9502, + "num_input_tokens_seen": 465526872, + "step": 1024, + "train_runtime": 74635.3173, + "train_tokens_per_second": 6237.354 + }, + { + "epoch": 1.240940606702674, + "grad_norm": 0.233840674161911, + "learning_rate": 5e-06, + "loss": 0.9982, + "num_input_tokens_seen": 465969728, + "step": 1025, + "train_runtime": 74708.3224, + "train_tokens_per_second": 6237.186 + }, + { + "epoch": 1.2421513635202754, + "grad_norm": 0.2615164518356323, + "learning_rate": 5e-06, + "loss": 0.9789, + "num_input_tokens_seen": 466432952, + "step": 1026, + "train_runtime": 74784.9859, + "train_tokens_per_second": 6236.987 + }, + { + "epoch": 1.2433621203378769, + "grad_norm": 0.25451064109802246, + "learning_rate": 5e-06, + "loss": 0.9876, + "num_input_tokens_seen": 466859880, + "step": 1027, + "train_runtime": 74855.3509, + "train_tokens_per_second": 6236.827 + }, + { + "epoch": 1.2445728771554783, + "grad_norm": 0.23738832771778107, + "learning_rate": 5e-06, + "loss": 0.9718, + "num_input_tokens_seen": 467312216, + "step": 1028, + "train_runtime": 74929.7678, + "train_tokens_per_second": 6236.67 + }, + { + "epoch": 1.2457836339730797, + "grad_norm": 0.23887260258197784, + "learning_rate": 5e-06, + "loss": 0.964, + "num_input_tokens_seen": 467762744, + "step": 1029, + "train_runtime": 75003.9938, + "train_tokens_per_second": 6236.504 + }, + { + "epoch": 1.2469943907906809, + "grad_norm": 0.2599722743034363, + "learning_rate": 5e-06, + "loss": 0.9842, + "num_input_tokens_seen": 468221536, + "step": 1030, + "train_runtime": 75079.9409, + "train_tokens_per_second": 6236.307 + }, + { + "epoch": 1.2482051476082823, + "grad_norm": 0.2669295072555542, + "learning_rate": 5e-06, + "loss": 0.9675, + "num_input_tokens_seen": 468654896, + "step": 1031, + "train_runtime": 75151.0551, + "train_tokens_per_second": 6236.172 + }, + { + "epoch": 1.2494159044258837, + "grad_norm": 0.23142068088054657, + "learning_rate": 5e-06, + "loss": 0.9743, + "num_input_tokens_seen": 469100424, + "step": 1032, + "train_runtime": 75224.2634, + "train_tokens_per_second": 6236.025 + }, + { + "epoch": 1.250626661243485, + "grad_norm": 0.24564848840236664, + "learning_rate": 5e-06, + "loss": 0.9775, + "num_input_tokens_seen": 469557312, + "step": 1033, + "train_runtime": 75299.8089, + "train_tokens_per_second": 6235.837 + }, + { + "epoch": 1.2518374180610865, + "grad_norm": 0.2531740069389343, + "learning_rate": 5e-06, + "loss": 0.9778, + "num_input_tokens_seen": 469993728, + "step": 1034, + "train_runtime": 75371.0171, + "train_tokens_per_second": 6235.736 + }, + { + "epoch": 1.253048174878688, + "grad_norm": 0.2566632330417633, + "learning_rate": 5e-06, + "loss": 0.9607, + "num_input_tokens_seen": 470435832, + "step": 1035, + "train_runtime": 75444.0382, + "train_tokens_per_second": 6235.56 + }, + { + "epoch": 1.254258931696289, + "grad_norm": 0.2733544111251831, + "learning_rate": 5e-06, + "loss": 0.9229, + "num_input_tokens_seen": 470881848, + "step": 1036, + "train_runtime": 75517.9642, + "train_tokens_per_second": 6235.362 + }, + { + "epoch": 1.2554696885138905, + "grad_norm": 0.22786258161067963, + "learning_rate": 5e-06, + "loss": 0.9479, + "num_input_tokens_seen": 471347472, + "step": 1037, + "train_runtime": 75594.9234, + "train_tokens_per_second": 6235.174 + }, + { + "epoch": 1.256680445331492, + "grad_norm": 0.2446991503238678, + "learning_rate": 5e-06, + "loss": 0.9554, + "num_input_tokens_seen": 471795376, + "step": 1038, + "train_runtime": 75668.7229, + "train_tokens_per_second": 6235.012 + }, + { + "epoch": 1.2578912021490933, + "grad_norm": 0.26110076904296875, + "learning_rate": 5e-06, + "loss": 0.9515, + "num_input_tokens_seen": 472220096, + "step": 1039, + "train_runtime": 75737.7809, + "train_tokens_per_second": 6234.934 + }, + { + "epoch": 1.2591019589666947, + "grad_norm": 0.24883201718330383, + "learning_rate": 5e-06, + "loss": 0.9497, + "num_input_tokens_seen": 472677208, + "step": 1040, + "train_runtime": 75812.9693, + "train_tokens_per_second": 6234.78 + }, + { + "epoch": 1.2603127157842962, + "grad_norm": 0.2285858392715454, + "learning_rate": 5e-06, + "loss": 0.9709, + "num_input_tokens_seen": 473140032, + "step": 1041, + "train_runtime": 75889.2935, + "train_tokens_per_second": 6234.608 + }, + { + "epoch": 1.2615234726018976, + "grad_norm": 0.2190844714641571, + "learning_rate": 5e-06, + "loss": 0.9493, + "num_input_tokens_seen": 473600272, + "step": 1042, + "train_runtime": 75964.9162, + "train_tokens_per_second": 6234.461 + }, + { + "epoch": 1.262734229419499, + "grad_norm": 0.2370315045118332, + "learning_rate": 5e-06, + "loss": 0.971, + "num_input_tokens_seen": 474054944, + "step": 1043, + "train_runtime": 76039.0777, + "train_tokens_per_second": 6234.359 + }, + { + "epoch": 1.2639449862371002, + "grad_norm": 0.22360284626483917, + "learning_rate": 5e-06, + "loss": 0.9638, + "num_input_tokens_seen": 474536696, + "step": 1044, + "train_runtime": 76118.2238, + "train_tokens_per_second": 6234.206 + }, + { + "epoch": 1.2651557430547016, + "grad_norm": 0.25233903527259827, + "learning_rate": 5e-06, + "loss": 0.8986, + "num_input_tokens_seen": 474994584, + "step": 1045, + "train_runtime": 76194.1156, + "train_tokens_per_second": 6234.006 + }, + { + "epoch": 1.266366499872303, + "grad_norm": 0.2806606888771057, + "learning_rate": 5e-06, + "loss": 0.9721, + "num_input_tokens_seen": 475437456, + "step": 1046, + "train_runtime": 76266.4104, + "train_tokens_per_second": 6233.904 + }, + { + "epoch": 1.2675772566899044, + "grad_norm": 0.23013675212860107, + "learning_rate": 5e-06, + "loss": 0.9835, + "num_input_tokens_seen": 475885056, + "step": 1047, + "train_runtime": 76340.2753, + "train_tokens_per_second": 6233.735 + }, + { + "epoch": 1.2687880135075058, + "grad_norm": 0.2585345208644867, + "learning_rate": 5e-06, + "loss": 0.9556, + "num_input_tokens_seen": 476330056, + "step": 1048, + "train_runtime": 76413.1727, + "train_tokens_per_second": 6233.612 + }, + { + "epoch": 1.269998770325107, + "grad_norm": 0.27313679456710815, + "learning_rate": 5e-06, + "loss": 0.9655, + "num_input_tokens_seen": 476790288, + "step": 1049, + "train_runtime": 76488.7274, + "train_tokens_per_second": 6233.471 + }, + { + "epoch": 1.2712095271427084, + "grad_norm": 0.22804471850395203, + "learning_rate": 5e-06, + "loss": 0.9798, + "num_input_tokens_seen": 477237680, + "step": 1050, + "train_runtime": 76562.66, + "train_tokens_per_second": 6233.295 + }, + { + "epoch": 1.2724202839603098, + "grad_norm": 0.23282477259635925, + "learning_rate": 5e-06, + "loss": 0.9485, + "num_input_tokens_seen": 477711928, + "step": 1051, + "train_runtime": 76641.8583, + "train_tokens_per_second": 6233.042 + }, + { + "epoch": 1.2736310407779112, + "grad_norm": 0.2197505533695221, + "learning_rate": 5e-06, + "loss": 0.9604, + "num_input_tokens_seen": 478171336, + "step": 1052, + "train_runtime": 76717.5391, + "train_tokens_per_second": 6232.882 + }, + { + "epoch": 1.2748417975955126, + "grad_norm": 0.2753906846046448, + "learning_rate": 5e-06, + "loss": 0.9519, + "num_input_tokens_seen": 478594376, + "step": 1053, + "train_runtime": 76786.3869, + "train_tokens_per_second": 6232.802 + }, + { + "epoch": 1.276052554413114, + "grad_norm": 0.23567403852939606, + "learning_rate": 5e-06, + "loss": 0.9258, + "num_input_tokens_seen": 479057744, + "step": 1054, + "train_runtime": 76862.8595, + "train_tokens_per_second": 6232.63 + }, + { + "epoch": 1.2772633112307155, + "grad_norm": 0.2323777824640274, + "learning_rate": 5e-06, + "loss": 0.9507, + "num_input_tokens_seen": 479530368, + "step": 1055, + "train_runtime": 76940.9952, + "train_tokens_per_second": 6232.443 + }, + { + "epoch": 1.2784740680483169, + "grad_norm": 0.24186258018016815, + "learning_rate": 5e-06, + "loss": 0.9227, + "num_input_tokens_seen": 479994304, + "step": 1056, + "train_runtime": 77017.947, + "train_tokens_per_second": 6232.24 + }, + { + "epoch": 1.2796848248659183, + "grad_norm": 0.2798727750778198, + "learning_rate": 5e-06, + "loss": 0.9632, + "num_input_tokens_seen": 480447768, + "step": 1057, + "train_runtime": 77093.054, + "train_tokens_per_second": 6232.05 + }, + { + "epoch": 1.2808955816835195, + "grad_norm": 0.2540852427482605, + "learning_rate": 5e-06, + "loss": 0.9633, + "num_input_tokens_seen": 480890376, + "step": 1058, + "train_runtime": 77166.9256, + "train_tokens_per_second": 6231.82 + }, + { + "epoch": 1.2821063385011209, + "grad_norm": 0.23041221499443054, + "learning_rate": 5e-06, + "loss": 0.9052, + "num_input_tokens_seen": 481360496, + "step": 1059, + "train_runtime": 77244.961, + "train_tokens_per_second": 6231.61 + }, + { + "epoch": 1.2833170953187223, + "grad_norm": 0.24767398834228516, + "learning_rate": 5e-06, + "loss": 0.9332, + "num_input_tokens_seen": 481821264, + "step": 1060, + "train_runtime": 77321.082, + "train_tokens_per_second": 6231.435 + }, + { + "epoch": 1.2845278521363237, + "grad_norm": 0.25022172927856445, + "learning_rate": 5e-06, + "loss": 0.9481, + "num_input_tokens_seen": 482278160, + "step": 1061, + "train_runtime": 77396.3485, + "train_tokens_per_second": 6231.278 + }, + { + "epoch": 1.285738608953925, + "grad_norm": 0.25090205669403076, + "learning_rate": 5e-06, + "loss": 0.9319, + "num_input_tokens_seen": 482732096, + "step": 1062, + "train_runtime": 77471.2094, + "train_tokens_per_second": 6231.116 + }, + { + "epoch": 1.2869493657715263, + "grad_norm": 0.24102523922920227, + "learning_rate": 5e-06, + "loss": 0.9033, + "num_input_tokens_seen": 483182128, + "step": 1063, + "train_runtime": 77545.2333, + "train_tokens_per_second": 6230.971 + }, + { + "epoch": 1.2881601225891277, + "grad_norm": 0.22408998012542725, + "learning_rate": 5e-06, + "loss": 0.9583, + "num_input_tokens_seen": 483634912, + "step": 1064, + "train_runtime": 77619.6948, + "train_tokens_per_second": 6230.827 + }, + { + "epoch": 1.2893708794067291, + "grad_norm": 0.22242091596126556, + "learning_rate": 5e-06, + "loss": 0.8963, + "num_input_tokens_seen": 484082184, + "step": 1065, + "train_runtime": 77693.1389, + "train_tokens_per_second": 6230.694 + }, + { + "epoch": 1.2905816362243305, + "grad_norm": 0.24296538531780243, + "learning_rate": 5e-06, + "loss": 0.9512, + "num_input_tokens_seen": 484538336, + "step": 1066, + "train_runtime": 77767.9496, + "train_tokens_per_second": 6230.566 + }, + { + "epoch": 1.291792393041932, + "grad_norm": 0.2800133526325226, + "learning_rate": 5e-06, + "loss": 1.0084, + "num_input_tokens_seen": 484979760, + "step": 1067, + "train_runtime": 77840.1247, + "train_tokens_per_second": 6230.46 + }, + { + "epoch": 1.2930031498595334, + "grad_norm": 0.26364296674728394, + "learning_rate": 5e-06, + "loss": 0.9158, + "num_input_tokens_seen": 485451992, + "step": 1068, + "train_runtime": 77919.0701, + "train_tokens_per_second": 6230.208 + }, + { + "epoch": 1.2942139066771348, + "grad_norm": 0.23616540431976318, + "learning_rate": 5e-06, + "loss": 0.9675, + "num_input_tokens_seen": 485907896, + "step": 1069, + "train_runtime": 77994.4035, + "train_tokens_per_second": 6230.035 + }, + { + "epoch": 1.2954246634947362, + "grad_norm": 0.2279627025127411, + "learning_rate": 5e-06, + "loss": 0.9279, + "num_input_tokens_seen": 486374992, + "step": 1070, + "train_runtime": 78071.146, + "train_tokens_per_second": 6229.894 + }, + { + "epoch": 1.2966354203123374, + "grad_norm": 0.2602773904800415, + "learning_rate": 5e-06, + "loss": 0.9288, + "num_input_tokens_seen": 486846584, + "step": 1071, + "train_runtime": 78149.9854, + "train_tokens_per_second": 6229.644 + }, + { + "epoch": 1.2978461771299388, + "grad_norm": 0.2592213451862335, + "learning_rate": 5e-06, + "loss": 0.9707, + "num_input_tokens_seen": 487299176, + "step": 1072, + "train_runtime": 78224.87, + "train_tokens_per_second": 6229.466 + }, + { + "epoch": 1.2990569339475402, + "grad_norm": 0.23838956654071808, + "learning_rate": 5e-06, + "loss": 0.9459, + "num_input_tokens_seen": 487738752, + "step": 1073, + "train_runtime": 78297.4076, + "train_tokens_per_second": 6229.309 + }, + { + "epoch": 1.3002676907651416, + "grad_norm": 0.2431815266609192, + "learning_rate": 5e-06, + "loss": 1.0018, + "num_input_tokens_seen": 488179592, + "step": 1074, + "train_runtime": 78369.9512, + "train_tokens_per_second": 6229.168 + }, + { + "epoch": 1.301478447582743, + "grad_norm": 0.2688054144382477, + "learning_rate": 5e-06, + "loss": 0.9754, + "num_input_tokens_seen": 488624232, + "step": 1075, + "train_runtime": 78442.9355, + "train_tokens_per_second": 6229.041 + }, + { + "epoch": 1.3026892044003442, + "grad_norm": 0.2385970801115036, + "learning_rate": 5e-06, + "loss": 0.9063, + "num_input_tokens_seen": 489046568, + "step": 1076, + "train_runtime": 78511.9197, + "train_tokens_per_second": 6228.947 + }, + { + "epoch": 1.3038999612179456, + "grad_norm": 0.23294121026992798, + "learning_rate": 5e-06, + "loss": 0.9876, + "num_input_tokens_seen": 489514704, + "step": 1077, + "train_runtime": 78589.3633, + "train_tokens_per_second": 6228.765 + }, + { + "epoch": 1.305110718035547, + "grad_norm": 0.2477468103170395, + "learning_rate": 5e-06, + "loss": 0.9493, + "num_input_tokens_seen": 489948088, + "step": 1078, + "train_runtime": 78660.5285, + "train_tokens_per_second": 6228.64 + }, + { + "epoch": 1.3063214748531484, + "grad_norm": 0.2480383664369583, + "learning_rate": 5e-06, + "loss": 0.9577, + "num_input_tokens_seen": 490385808, + "step": 1079, + "train_runtime": 78732.9706, + "train_tokens_per_second": 6228.468 + }, + { + "epoch": 1.3075322316707498, + "grad_norm": 0.2859964668750763, + "learning_rate": 5e-06, + "loss": 0.9368, + "num_input_tokens_seen": 490856832, + "step": 1080, + "train_runtime": 78811.3881, + "train_tokens_per_second": 6228.248 + }, + { + "epoch": 1.3087429884883512, + "grad_norm": 0.2931101620197296, + "learning_rate": 5e-06, + "loss": 0.9727, + "num_input_tokens_seen": 491314392, + "step": 1081, + "train_runtime": 78887.1437, + "train_tokens_per_second": 6228.067 + }, + { + "epoch": 1.3099537453059527, + "grad_norm": 0.27014395594596863, + "learning_rate": 5e-06, + "loss": 0.9503, + "num_input_tokens_seen": 491759208, + "step": 1082, + "train_runtime": 78960.4837, + "train_tokens_per_second": 6227.915 + }, + { + "epoch": 1.311164502123554, + "grad_norm": 0.2364778369665146, + "learning_rate": 5e-06, + "loss": 0.9087, + "num_input_tokens_seen": 492218016, + "step": 1083, + "train_runtime": 79036.6786, + "train_tokens_per_second": 6227.716 + }, + { + "epoch": 1.3123752589411553, + "grad_norm": 0.2594203054904938, + "learning_rate": 5e-06, + "loss": 0.9674, + "num_input_tokens_seen": 492683488, + "step": 1084, + "train_runtime": 79112.9413, + "train_tokens_per_second": 6227.597 + }, + { + "epoch": 1.3135860157587567, + "grad_norm": 0.2824831008911133, + "learning_rate": 5e-06, + "loss": 0.9782, + "num_input_tokens_seen": 493126536, + "step": 1085, + "train_runtime": 79186.2287, + "train_tokens_per_second": 6227.428 + }, + { + "epoch": 1.314796772576358, + "grad_norm": 0.2868604063987732, + "learning_rate": 5e-06, + "loss": 0.9473, + "num_input_tokens_seen": 493574776, + "step": 1086, + "train_runtime": 79260.2797, + "train_tokens_per_second": 6227.265 + }, + { + "epoch": 1.3160075293939595, + "grad_norm": 0.24373245239257812, + "learning_rate": 5e-06, + "loss": 0.9111, + "num_input_tokens_seen": 494018800, + "step": 1087, + "train_runtime": 79333.7195, + "train_tokens_per_second": 6227.097 + }, + { + "epoch": 1.317218286211561, + "grad_norm": 0.23148846626281738, + "learning_rate": 5e-06, + "loss": 0.9671, + "num_input_tokens_seen": 494459824, + "step": 1088, + "train_runtime": 79406.5726, + "train_tokens_per_second": 6226.938 + }, + { + "epoch": 1.318429043029162, + "grad_norm": 0.2403024286031723, + "learning_rate": 5e-06, + "loss": 0.9115, + "num_input_tokens_seen": 494928432, + "step": 1089, + "train_runtime": 79484.2329, + "train_tokens_per_second": 6226.75 + }, + { + "epoch": 1.3196397998467635, + "grad_norm": 0.2649286389350891, + "learning_rate": 5e-06, + "loss": 0.9377, + "num_input_tokens_seen": 495391952, + "step": 1090, + "train_runtime": 79560.7838, + "train_tokens_per_second": 6226.585 + }, + { + "epoch": 1.320850556664365, + "grad_norm": 0.24317079782485962, + "learning_rate": 5e-06, + "loss": 0.9451, + "num_input_tokens_seen": 495859560, + "step": 1091, + "train_runtime": 79637.6715, + "train_tokens_per_second": 6226.445 + }, + { + "epoch": 1.3220613134819663, + "grad_norm": 0.25734710693359375, + "learning_rate": 5e-06, + "loss": 0.9564, + "num_input_tokens_seen": 496335008, + "step": 1092, + "train_runtime": 79716.5771, + "train_tokens_per_second": 6226.246 + }, + { + "epoch": 1.3232720702995677, + "grad_norm": 0.230266273021698, + "learning_rate": 5e-06, + "loss": 1.0006, + "num_input_tokens_seen": 496792520, + "step": 1093, + "train_runtime": 79791.7859, + "train_tokens_per_second": 6226.111 + }, + { + "epoch": 1.3244828271171691, + "grad_norm": 0.2398468255996704, + "learning_rate": 5e-06, + "loss": 0.9919, + "num_input_tokens_seen": 497244656, + "step": 1094, + "train_runtime": 79866.2963, + "train_tokens_per_second": 6225.964 + }, + { + "epoch": 1.3256935839347705, + "grad_norm": 0.25273364782333374, + "learning_rate": 5e-06, + "loss": 0.9356, + "num_input_tokens_seen": 497737648, + "step": 1095, + "train_runtime": 79947.6758, + "train_tokens_per_second": 6225.793 + }, + { + "epoch": 1.326904340752372, + "grad_norm": 0.2629864513874054, + "learning_rate": 5e-06, + "loss": 0.9285, + "num_input_tokens_seen": 498211544, + "step": 1096, + "train_runtime": 80026.228, + "train_tokens_per_second": 6225.603 + }, + { + "epoch": 1.3281150975699734, + "grad_norm": 0.24348442256450653, + "learning_rate": 5e-06, + "loss": 0.9928, + "num_input_tokens_seen": 498667784, + "step": 1097, + "train_runtime": 80101.4968, + "train_tokens_per_second": 6225.449 + }, + { + "epoch": 1.3293258543875746, + "grad_norm": 0.24186153709888458, + "learning_rate": 5e-06, + "loss": 0.9611, + "num_input_tokens_seen": 499107448, + "step": 1098, + "train_runtime": 80174.2686, + "train_tokens_per_second": 6225.282 + }, + { + "epoch": 1.330536611205176, + "grad_norm": 0.28597867488861084, + "learning_rate": 5e-06, + "loss": 0.9198, + "num_input_tokens_seen": 499571536, + "step": 1099, + "train_runtime": 80251.374, + "train_tokens_per_second": 6225.084 + }, + { + "epoch": 1.3317473680227774, + "grad_norm": 0.25400543212890625, + "learning_rate": 5e-06, + "loss": 0.9536, + "num_input_tokens_seen": 500007376, + "step": 1100, + "train_runtime": 80323.5809, + "train_tokens_per_second": 6224.914 + }, + { + "epoch": 1.3329581248403788, + "grad_norm": 0.26500222086906433, + "learning_rate": 5e-06, + "loss": 0.998, + "num_input_tokens_seen": 500462880, + "step": 1101, + "train_runtime": 80398.5711, + "train_tokens_per_second": 6224.773 + }, + { + "epoch": 1.3341688816579802, + "grad_norm": 0.28662461042404175, + "learning_rate": 5e-06, + "loss": 0.9472, + "num_input_tokens_seen": 500914736, + "step": 1102, + "train_runtime": 80473.0011, + "train_tokens_per_second": 6224.631 + }, + { + "epoch": 1.3353796384755814, + "grad_norm": 0.2489413022994995, + "learning_rate": 5e-06, + "loss": 0.9416, + "num_input_tokens_seen": 501386272, + "step": 1103, + "train_runtime": 80550.876, + "train_tokens_per_second": 6224.467 + }, + { + "epoch": 1.3365903952931828, + "grad_norm": 0.22808928787708282, + "learning_rate": 5e-06, + "loss": 0.9119, + "num_input_tokens_seen": 501848592, + "step": 1104, + "train_runtime": 80627.7778, + "train_tokens_per_second": 6224.264 + }, + { + "epoch": 1.3378011521107842, + "grad_norm": 0.23136869072914124, + "learning_rate": 5e-06, + "loss": 0.958, + "num_input_tokens_seen": 502286176, + "step": 1105, + "train_runtime": 80699.5576, + "train_tokens_per_second": 6224.15 + }, + { + "epoch": 1.3390119089283856, + "grad_norm": 0.22823567688465118, + "learning_rate": 5e-06, + "loss": 0.9324, + "num_input_tokens_seen": 502742112, + "step": 1106, + "train_runtime": 80775.4724, + "train_tokens_per_second": 6223.945 + }, + { + "epoch": 1.340222665745987, + "grad_norm": 0.2484605759382248, + "learning_rate": 5e-06, + "loss": 0.9735, + "num_input_tokens_seen": 503197712, + "step": 1107, + "train_runtime": 80850.7279, + "train_tokens_per_second": 6223.787 + }, + { + "epoch": 1.3414334225635884, + "grad_norm": 0.25765275955200195, + "learning_rate": 5e-06, + "loss": 0.953, + "num_input_tokens_seen": 503655864, + "step": 1108, + "train_runtime": 80926.4497, + "train_tokens_per_second": 6223.625 + }, + { + "epoch": 1.3426441793811899, + "grad_norm": 0.23261244595050812, + "learning_rate": 5e-06, + "loss": 0.923, + "num_input_tokens_seen": 504117992, + "step": 1109, + "train_runtime": 81002.8373, + "train_tokens_per_second": 6223.461 + }, + { + "epoch": 1.3438549361987913, + "grad_norm": 0.23450727760791779, + "learning_rate": 5e-06, + "loss": 0.9273, + "num_input_tokens_seen": 504574512, + "step": 1110, + "train_runtime": 81077.9846, + "train_tokens_per_second": 6223.323 + }, + { + "epoch": 1.3450656930163924, + "grad_norm": 0.2521567940711975, + "learning_rate": 5e-06, + "loss": 0.9632, + "num_input_tokens_seen": 505004192, + "step": 1111, + "train_runtime": 81148.4492, + "train_tokens_per_second": 6223.214 + }, + { + "epoch": 1.3462764498339939, + "grad_norm": 0.2506852447986603, + "learning_rate": 5e-06, + "loss": 0.9951, + "num_input_tokens_seen": 505460352, + "step": 1112, + "train_runtime": 81223.6058, + "train_tokens_per_second": 6223.072 + }, + { + "epoch": 1.3474872066515953, + "grad_norm": 0.2718031704425812, + "learning_rate": 5e-06, + "loss": 0.9664, + "num_input_tokens_seen": 505924544, + "step": 1113, + "train_runtime": 81300.2242, + "train_tokens_per_second": 6222.917 + }, + { + "epoch": 1.3486979634691967, + "grad_norm": 0.26461461186408997, + "learning_rate": 5e-06, + "loss": 0.9479, + "num_input_tokens_seen": 506374000, + "step": 1114, + "train_runtime": 81374.867, + "train_tokens_per_second": 6222.732 + }, + { + "epoch": 1.349908720286798, + "grad_norm": 0.23874284327030182, + "learning_rate": 5e-06, + "loss": 0.9868, + "num_input_tokens_seen": 506851568, + "step": 1115, + "train_runtime": 81454.5159, + "train_tokens_per_second": 6222.51 + }, + { + "epoch": 1.3511194771043993, + "grad_norm": 0.2469114065170288, + "learning_rate": 5e-06, + "loss": 0.9355, + "num_input_tokens_seen": 507321040, + "step": 1116, + "train_runtime": 81532.2647, + "train_tokens_per_second": 6222.335 + }, + { + "epoch": 1.3523302339220007, + "grad_norm": 0.2748368978500366, + "learning_rate": 5e-06, + "loss": 0.8878, + "num_input_tokens_seen": 507785192, + "step": 1117, + "train_runtime": 81608.7161, + "train_tokens_per_second": 6222.193 + }, + { + "epoch": 1.353540990739602, + "grad_norm": 0.25142693519592285, + "learning_rate": 5e-06, + "loss": 0.9127, + "num_input_tokens_seen": 508241704, + "step": 1118, + "train_runtime": 81683.6479, + "train_tokens_per_second": 6222.074 + }, + { + "epoch": 1.3547517475572035, + "grad_norm": 0.23072993755340576, + "learning_rate": 5e-06, + "loss": 0.9419, + "num_input_tokens_seen": 508692400, + "step": 1119, + "train_runtime": 81758.1595, + "train_tokens_per_second": 6221.916 + }, + { + "epoch": 1.355962504374805, + "grad_norm": 0.22448928654193878, + "learning_rate": 5e-06, + "loss": 0.937, + "num_input_tokens_seen": 509133480, + "step": 1120, + "train_runtime": 81830.545, + "train_tokens_per_second": 6221.802 + }, + { + "epoch": 1.3571732611924063, + "grad_norm": 0.2378361076116562, + "learning_rate": 5e-06, + "loss": 0.9702, + "num_input_tokens_seen": 509569360, + "step": 1121, + "train_runtime": 81902.3306, + "train_tokens_per_second": 6221.671 + }, + { + "epoch": 1.3583840180100077, + "grad_norm": 0.23400956392288208, + "learning_rate": 5e-06, + "loss": 0.909, + "num_input_tokens_seen": 510010536, + "step": 1122, + "train_runtime": 81975.1662, + "train_tokens_per_second": 6221.525 + }, + { + "epoch": 1.3595947748276092, + "grad_norm": 0.24939168989658356, + "learning_rate": 5e-06, + "loss": 0.9008, + "num_input_tokens_seen": 510470824, + "step": 1123, + "train_runtime": 82051.5387, + "train_tokens_per_second": 6221.344 + }, + { + "epoch": 1.3608055316452103, + "grad_norm": 0.23065564036369324, + "learning_rate": 5e-06, + "loss": 0.9217, + "num_input_tokens_seen": 510941664, + "step": 1124, + "train_runtime": 82129.5737, + "train_tokens_per_second": 6221.165 + }, + { + "epoch": 1.3620162884628118, + "grad_norm": 0.270669162273407, + "learning_rate": 5e-06, + "loss": 0.9314, + "num_input_tokens_seen": 511387848, + "step": 1125, + "train_runtime": 82202.6173, + "train_tokens_per_second": 6221.065 + }, + { + "epoch": 1.3632270452804132, + "grad_norm": 0.2493094503879547, + "learning_rate": 5e-06, + "loss": 0.9661, + "num_input_tokens_seen": 511829632, + "step": 1126, + "train_runtime": 82275.3767, + "train_tokens_per_second": 6220.933 + }, + { + "epoch": 1.3644378020980146, + "grad_norm": 0.24099677801132202, + "learning_rate": 5e-06, + "loss": 0.9565, + "num_input_tokens_seen": 512284456, + "step": 1127, + "train_runtime": 82350.5397, + "train_tokens_per_second": 6220.778 + }, + { + "epoch": 1.365648558915616, + "grad_norm": 0.28274643421173096, + "learning_rate": 5e-06, + "loss": 0.9409, + "num_input_tokens_seen": 512735624, + "step": 1128, + "train_runtime": 82424.8965, + "train_tokens_per_second": 6220.64 + }, + { + "epoch": 1.3668593157332172, + "grad_norm": 0.24693673849105835, + "learning_rate": 5e-06, + "loss": 0.9281, + "num_input_tokens_seen": 513189840, + "step": 1129, + "train_runtime": 82499.5885, + "train_tokens_per_second": 6220.514 + }, + { + "epoch": 1.3680700725508186, + "grad_norm": 0.23583988845348358, + "learning_rate": 5e-06, + "loss": 0.858, + "num_input_tokens_seen": 513681000, + "step": 1130, + "train_runtime": 82581.1309, + "train_tokens_per_second": 6220.319 + }, + { + "epoch": 1.36928082936842, + "grad_norm": 0.23430530726909637, + "learning_rate": 5e-06, + "loss": 0.9629, + "num_input_tokens_seen": 514139520, + "step": 1131, + "train_runtime": 82656.5045, + "train_tokens_per_second": 6220.194 + }, + { + "epoch": 1.3704915861860214, + "grad_norm": 0.2671928405761719, + "learning_rate": 5e-06, + "loss": 0.9183, + "num_input_tokens_seen": 514585024, + "step": 1132, + "train_runtime": 82730.3681, + "train_tokens_per_second": 6220.026 + }, + { + "epoch": 1.3717023430036228, + "grad_norm": 0.2957673668861389, + "learning_rate": 5e-06, + "loss": 0.9404, + "num_input_tokens_seen": 515044760, + "step": 1133, + "train_runtime": 82806.7902, + "train_tokens_per_second": 6219.837 + }, + { + "epoch": 1.3729130998212242, + "grad_norm": 0.24210570752620697, + "learning_rate": 5e-06, + "loss": 0.9729, + "num_input_tokens_seen": 515503432, + "step": 1134, + "train_runtime": 82882.7978, + "train_tokens_per_second": 6219.667 + }, + { + "epoch": 1.3741238566388256, + "grad_norm": 0.25204458832740784, + "learning_rate": 5e-06, + "loss": 0.9571, + "num_input_tokens_seen": 515950480, + "step": 1135, + "train_runtime": 82956.8397, + "train_tokens_per_second": 6219.505 + }, + { + "epoch": 1.375334613456427, + "grad_norm": 0.25100481510162354, + "learning_rate": 5e-06, + "loss": 0.93, + "num_input_tokens_seen": 516403560, + "step": 1136, + "train_runtime": 83031.7598, + "train_tokens_per_second": 6219.35 + }, + { + "epoch": 1.3765453702740285, + "grad_norm": 0.2839900255203247, + "learning_rate": 5e-06, + "loss": 0.9969, + "num_input_tokens_seen": 516860512, + "step": 1137, + "train_runtime": 83107.2043, + "train_tokens_per_second": 6219.202 + }, + { + "epoch": 1.3777561270916296, + "grad_norm": 0.24296337366104126, + "learning_rate": 5e-06, + "loss": 0.9908, + "num_input_tokens_seen": 517309384, + "step": 1138, + "train_runtime": 83178.397, + "train_tokens_per_second": 6219.276 + }, + { + "epoch": 1.378966883909231, + "grad_norm": 0.2473958134651184, + "learning_rate": 5e-06, + "loss": 0.991, + "num_input_tokens_seen": 517764120, + "step": 1139, + "train_runtime": 83249.0866, + "train_tokens_per_second": 6219.457 + }, + { + "epoch": 1.3801776407268325, + "grad_norm": 0.26322364807128906, + "learning_rate": 5e-06, + "loss": 0.9685, + "num_input_tokens_seen": 518204792, + "step": 1140, + "train_runtime": 83317.1594, + "train_tokens_per_second": 6219.665 + }, + { + "epoch": 1.3813883975444339, + "grad_norm": 0.27684542536735535, + "learning_rate": 5e-06, + "loss": 0.9655, + "num_input_tokens_seen": 518647512, + "step": 1141, + "train_runtime": 83386.2555, + "train_tokens_per_second": 6219.82 + }, + { + "epoch": 1.3825991543620353, + "grad_norm": 0.24537670612335205, + "learning_rate": 5e-06, + "loss": 0.9299, + "num_input_tokens_seen": 519100408, + "step": 1142, + "train_runtime": 83457.0116, + "train_tokens_per_second": 6219.974 + }, + { + "epoch": 1.3838099111796365, + "grad_norm": 0.23837308585643768, + "learning_rate": 5e-06, + "loss": 0.9082, + "num_input_tokens_seen": 519546976, + "step": 1143, + "train_runtime": 83526.2221, + "train_tokens_per_second": 6220.166 + }, + { + "epoch": 1.3850206679972379, + "grad_norm": 0.2371511310338974, + "learning_rate": 5e-06, + "loss": 0.9283, + "num_input_tokens_seen": 520011528, + "step": 1144, + "train_runtime": 83598.9547, + "train_tokens_per_second": 6220.311 + }, + { + "epoch": 1.3862314248148393, + "grad_norm": 0.22656875848770142, + "learning_rate": 5e-06, + "loss": 0.9689, + "num_input_tokens_seen": 520470056, + "step": 1145, + "train_runtime": 83670.3971, + "train_tokens_per_second": 6220.48 + }, + { + "epoch": 1.3874421816324407, + "grad_norm": 0.23803792893886566, + "learning_rate": 5e-06, + "loss": 0.9774, + "num_input_tokens_seen": 520904032, + "step": 1146, + "train_runtime": 83740.3252, + "train_tokens_per_second": 6220.468 + }, + { + "epoch": 1.3886529384500421, + "grad_norm": 0.21631726622581482, + "learning_rate": 5e-06, + "loss": 0.92, + "num_input_tokens_seen": 521369744, + "step": 1147, + "train_runtime": 83816.7878, + "train_tokens_per_second": 6220.35 + }, + { + "epoch": 1.3898636952676435, + "grad_norm": 0.237714946269989, + "learning_rate": 5e-06, + "loss": 0.9022, + "num_input_tokens_seen": 521831576, + "step": 1148, + "train_runtime": 83893.3078, + "train_tokens_per_second": 6220.181 + }, + { + "epoch": 1.391074452085245, + "grad_norm": 0.2461657077074051, + "learning_rate": 5e-06, + "loss": 0.955, + "num_input_tokens_seen": 522271136, + "step": 1149, + "train_runtime": 83965.4254, + "train_tokens_per_second": 6220.074 + }, + { + "epoch": 1.3922852089028463, + "grad_norm": 0.23177474737167358, + "learning_rate": 5e-06, + "loss": 0.9326, + "num_input_tokens_seen": 522723640, + "step": 1150, + "train_runtime": 84039.8413, + "train_tokens_per_second": 6219.95 + }, + { + "epoch": 1.3934959657204475, + "grad_norm": 0.24760431051254272, + "learning_rate": 5e-06, + "loss": 0.9544, + "num_input_tokens_seen": 523187496, + "step": 1151, + "train_runtime": 84116.6652, + "train_tokens_per_second": 6219.784 + }, + { + "epoch": 1.394706722538049, + "grad_norm": 0.24664926528930664, + "learning_rate": 5e-06, + "loss": 0.9197, + "num_input_tokens_seen": 523653368, + "step": 1152, + "train_runtime": 84193.8554, + "train_tokens_per_second": 6219.615 + }, + { + "epoch": 1.3959174793556504, + "grad_norm": 0.22697068750858307, + "learning_rate": 5e-06, + "loss": 0.9622, + "num_input_tokens_seen": 524120216, + "step": 1153, + "train_runtime": 84271.1905, + "train_tokens_per_second": 6219.447 + }, + { + "epoch": 1.3971282361732518, + "grad_norm": 0.24017848074436188, + "learning_rate": 5e-06, + "loss": 0.9406, + "num_input_tokens_seen": 524588968, + "step": 1154, + "train_runtime": 84348.8248, + "train_tokens_per_second": 6219.28 + }, + { + "epoch": 1.3983389929908532, + "grad_norm": 0.24601654708385468, + "learning_rate": 5e-06, + "loss": 0.96, + "num_input_tokens_seen": 525035616, + "step": 1155, + "train_runtime": 84422.8617, + "train_tokens_per_second": 6219.117 + }, + { + "epoch": 1.3995497498084544, + "grad_norm": 0.22841405868530273, + "learning_rate": 5e-06, + "loss": 0.9359, + "num_input_tokens_seen": 525495368, + "step": 1156, + "train_runtime": 84498.4629, + "train_tokens_per_second": 6218.993 + }, + { + "epoch": 1.4007605066260558, + "grad_norm": 0.2503286302089691, + "learning_rate": 5e-06, + "loss": 0.9101, + "num_input_tokens_seen": 525936104, + "step": 1157, + "train_runtime": 84571.5115, + "train_tokens_per_second": 6218.833 + }, + { + "epoch": 1.4019712634436572, + "grad_norm": 0.24628864228725433, + "learning_rate": 5e-06, + "loss": 0.9777, + "num_input_tokens_seen": 526383960, + "step": 1158, + "train_runtime": 84645.7016, + "train_tokens_per_second": 6218.673 + }, + { + "epoch": 1.4031820202612586, + "grad_norm": 0.23224344849586487, + "learning_rate": 5e-06, + "loss": 0.9756, + "num_input_tokens_seen": 526842064, + "step": 1159, + "train_runtime": 84721.4748, + "train_tokens_per_second": 6218.519 + }, + { + "epoch": 1.40439277707886, + "grad_norm": 0.23669494688510895, + "learning_rate": 5e-06, + "loss": 0.9558, + "num_input_tokens_seen": 527274984, + "step": 1160, + "train_runtime": 84792.9592, + "train_tokens_per_second": 6218.382 + }, + { + "epoch": 1.4056035338964614, + "grad_norm": 0.2642204165458679, + "learning_rate": 5e-06, + "loss": 0.9927, + "num_input_tokens_seen": 527706216, + "step": 1161, + "train_runtime": 84864.0145, + "train_tokens_per_second": 6218.257 + }, + { + "epoch": 1.4068142907140628, + "grad_norm": 0.24115154147148132, + "learning_rate": 5e-06, + "loss": 0.9297, + "num_input_tokens_seen": 528178144, + "step": 1162, + "train_runtime": 84942.3429, + "train_tokens_per_second": 6218.078 + }, + { + "epoch": 1.4080250475316642, + "grad_norm": 0.23551017045974731, + "learning_rate": 5e-06, + "loss": 0.9862, + "num_input_tokens_seen": 528630424, + "step": 1163, + "train_runtime": 85017.0419, + "train_tokens_per_second": 6217.935 + }, + { + "epoch": 1.4092358043492654, + "grad_norm": 0.2298494577407837, + "learning_rate": 5e-06, + "loss": 0.9455, + "num_input_tokens_seen": 529081184, + "step": 1164, + "train_runtime": 85091.8299, + "train_tokens_per_second": 6217.767 + }, + { + "epoch": 1.4104465611668668, + "grad_norm": 0.22845524549484253, + "learning_rate": 5e-06, + "loss": 0.9526, + "num_input_tokens_seen": 529559640, + "step": 1165, + "train_runtime": 85170.8828, + "train_tokens_per_second": 6217.614 + }, + { + "epoch": 1.4116573179844683, + "grad_norm": 0.2308027297258377, + "learning_rate": 5e-06, + "loss": 0.8656, + "num_input_tokens_seen": 530022552, + "step": 1166, + "train_runtime": 85244.8514, + "train_tokens_per_second": 6217.649 + }, + { + "epoch": 1.4128680748020697, + "grad_norm": 0.2270365059375763, + "learning_rate": 5e-06, + "loss": 0.9853, + "num_input_tokens_seen": 530470680, + "step": 1167, + "train_runtime": 85314.7105, + "train_tokens_per_second": 6217.81 + }, + { + "epoch": 1.414078831619671, + "grad_norm": 0.23675860464572906, + "learning_rate": 5e-06, + "loss": 0.9707, + "num_input_tokens_seen": 530923672, + "step": 1168, + "train_runtime": 85385.4857, + "train_tokens_per_second": 6217.962 + }, + { + "epoch": 1.4152895884372723, + "grad_norm": 0.24494849145412445, + "learning_rate": 5e-06, + "loss": 1.0015, + "num_input_tokens_seen": 531378672, + "step": 1169, + "train_runtime": 85457.4364, + "train_tokens_per_second": 6218.051 + }, + { + "epoch": 1.4165003452548737, + "grad_norm": 0.2266804724931717, + "learning_rate": 5e-06, + "loss": 0.9243, + "num_input_tokens_seen": 531833320, + "step": 1170, + "train_runtime": 85530.4497, + "train_tokens_per_second": 6218.058 + }, + { + "epoch": 1.417711102072475, + "grad_norm": 0.25175556540489197, + "learning_rate": 5e-06, + "loss": 1.0028, + "num_input_tokens_seen": 532288648, + "step": 1171, + "train_runtime": 85604.5221, + "train_tokens_per_second": 6217.997 + }, + { + "epoch": 1.4189218588900765, + "grad_norm": 0.23558390140533447, + "learning_rate": 5e-06, + "loss": 0.9245, + "num_input_tokens_seen": 532735600, + "step": 1172, + "train_runtime": 85676.0209, + "train_tokens_per_second": 6218.025 + }, + { + "epoch": 1.420132615707678, + "grad_norm": 0.220907524228096, + "learning_rate": 5e-06, + "loss": 0.9561, + "num_input_tokens_seen": 533220744, + "step": 1173, + "train_runtime": 85753.6767, + "train_tokens_per_second": 6218.051 + }, + { + "epoch": 1.4213433725252793, + "grad_norm": 0.28133559226989746, + "learning_rate": 5e-06, + "loss": 0.9136, + "num_input_tokens_seen": 533681856, + "step": 1174, + "train_runtime": 85830.2269, + "train_tokens_per_second": 6217.878 + }, + { + "epoch": 1.4225541293428807, + "grad_norm": 0.2508618235588074, + "learning_rate": 5e-06, + "loss": 0.9846, + "num_input_tokens_seen": 534131488, + "step": 1175, + "train_runtime": 85905.1182, + "train_tokens_per_second": 6217.691 + }, + { + "epoch": 1.4237648861604821, + "grad_norm": 0.24241898953914642, + "learning_rate": 5e-06, + "loss": 0.948, + "num_input_tokens_seen": 534587808, + "step": 1176, + "train_runtime": 85980.4455, + "train_tokens_per_second": 6217.551 + }, + { + "epoch": 1.4249756429780835, + "grad_norm": 0.2333323061466217, + "learning_rate": 5e-06, + "loss": 0.9202, + "num_input_tokens_seen": 535059256, + "step": 1177, + "train_runtime": 86059.1263, + "train_tokens_per_second": 6217.345 + }, + { + "epoch": 1.4261863997956847, + "grad_norm": 0.2457004338502884, + "learning_rate": 5e-06, + "loss": 0.96, + "num_input_tokens_seen": 535516680, + "step": 1178, + "train_runtime": 86134.7746, + "train_tokens_per_second": 6217.195 + }, + { + "epoch": 1.4273971566132861, + "grad_norm": 0.2796451151371002, + "learning_rate": 5e-06, + "loss": 0.9294, + "num_input_tokens_seen": 535977480, + "step": 1179, + "train_runtime": 86211.7662, + "train_tokens_per_second": 6216.988 + }, + { + "epoch": 1.4286079134308876, + "grad_norm": 0.24755236506462097, + "learning_rate": 5e-06, + "loss": 0.9704, + "num_input_tokens_seen": 536456680, + "step": 1180, + "train_runtime": 86291.5671, + "train_tokens_per_second": 6216.791 + }, + { + "epoch": 1.429818670248489, + "grad_norm": 0.23514142632484436, + "learning_rate": 5e-06, + "loss": 0.9235, + "num_input_tokens_seen": 536919736, + "step": 1181, + "train_runtime": 86368.5347, + "train_tokens_per_second": 6216.613 + }, + { + "epoch": 1.4310294270660904, + "grad_norm": 0.2705405056476593, + "learning_rate": 5e-06, + "loss": 0.9898, + "num_input_tokens_seen": 537369752, + "step": 1182, + "train_runtime": 86443.3868, + "train_tokens_per_second": 6216.436 + }, + { + "epoch": 1.4322401838836916, + "grad_norm": 0.2713667154312134, + "learning_rate": 5e-06, + "loss": 0.931, + "num_input_tokens_seen": 537808528, + "step": 1183, + "train_runtime": 86515.7051, + "train_tokens_per_second": 6216.311 + }, + { + "epoch": 1.433450940701293, + "grad_norm": 0.2554599642753601, + "learning_rate": 5e-06, + "loss": 0.8981, + "num_input_tokens_seen": 538237888, + "step": 1184, + "train_runtime": 86586.8158, + "train_tokens_per_second": 6216.164 + }, + { + "epoch": 1.4346616975188944, + "grad_norm": 0.22345824539661407, + "learning_rate": 5e-06, + "loss": 0.9353, + "num_input_tokens_seen": 538679104, + "step": 1185, + "train_runtime": 86659.9026, + "train_tokens_per_second": 6216.013 + }, + { + "epoch": 1.4358724543364958, + "grad_norm": 0.25475722551345825, + "learning_rate": 5e-06, + "loss": 0.9775, + "num_input_tokens_seen": 539122880, + "step": 1186, + "train_runtime": 86733.3283, + "train_tokens_per_second": 6215.868 + }, + { + "epoch": 1.4370832111540972, + "grad_norm": 0.2426735758781433, + "learning_rate": 5e-06, + "loss": 0.9531, + "num_input_tokens_seen": 539580016, + "step": 1187, + "train_runtime": 86809.5723, + "train_tokens_per_second": 6215.674 + }, + { + "epoch": 1.4382939679716986, + "grad_norm": 0.24386319518089294, + "learning_rate": 5e-06, + "loss": 0.8981, + "num_input_tokens_seen": 540035632, + "step": 1188, + "train_runtime": 86885.2572, + "train_tokens_per_second": 6215.504 + }, + { + "epoch": 1.4395047247893, + "grad_norm": 0.25454631447792053, + "learning_rate": 5e-06, + "loss": 0.9745, + "num_input_tokens_seen": 540481968, + "step": 1189, + "train_runtime": 86959.2044, + "train_tokens_per_second": 6215.351 + }, + { + "epoch": 1.4407154816069014, + "grad_norm": 0.2664698660373688, + "learning_rate": 5e-06, + "loss": 0.9639, + "num_input_tokens_seen": 540930360, + "step": 1190, + "train_runtime": 87033.5401, + "train_tokens_per_second": 6215.194 + }, + { + "epoch": 1.4419262384245026, + "grad_norm": 0.24694858491420746, + "learning_rate": 5e-06, + "loss": 0.9673, + "num_input_tokens_seen": 541419328, + "step": 1191, + "train_runtime": 87114.81, + "train_tokens_per_second": 6215.009 + }, + { + "epoch": 1.443136995242104, + "grad_norm": 0.27929478883743286, + "learning_rate": 5e-06, + "loss": 0.891, + "num_input_tokens_seen": 541886280, + "step": 1192, + "train_runtime": 87192.2112, + "train_tokens_per_second": 6214.847 + }, + { + "epoch": 1.4443477520597054, + "grad_norm": 0.26354244351387024, + "learning_rate": 5e-06, + "loss": 0.9933, + "num_input_tokens_seen": 542346832, + "step": 1193, + "train_runtime": 87268.6309, + "train_tokens_per_second": 6214.682 + }, + { + "epoch": 1.4455585088773069, + "grad_norm": 0.2514925003051758, + "learning_rate": 5e-06, + "loss": 0.964, + "num_input_tokens_seen": 542801256, + "step": 1194, + "train_runtime": 87344.2791, + "train_tokens_per_second": 6214.503 + }, + { + "epoch": 1.4467692656949083, + "grad_norm": 0.24636778235435486, + "learning_rate": 5e-06, + "loss": 0.9314, + "num_input_tokens_seen": 543242600, + "step": 1195, + "train_runtime": 87417.0709, + "train_tokens_per_second": 6214.377 + }, + { + "epoch": 1.4479800225125095, + "grad_norm": 0.2630736529827118, + "learning_rate": 5e-06, + "loss": 0.9703, + "num_input_tokens_seen": 543701408, + "step": 1196, + "train_runtime": 87493.5804, + "train_tokens_per_second": 6214.186 + }, + { + "epoch": 1.4491907793301109, + "grad_norm": 0.2552695572376251, + "learning_rate": 5e-06, + "loss": 1.0058, + "num_input_tokens_seen": 544170664, + "step": 1197, + "train_runtime": 87571.6365, + "train_tokens_per_second": 6214.006 + }, + { + "epoch": 1.4504015361477123, + "grad_norm": 0.2683693468570709, + "learning_rate": 5e-06, + "loss": 0.9817, + "num_input_tokens_seen": 544586024, + "step": 1198, + "train_runtime": 87639.8342, + "train_tokens_per_second": 6213.91 + }, + { + "epoch": 1.4516122929653137, + "grad_norm": 0.24069073796272278, + "learning_rate": 5e-06, + "loss": 0.983, + "num_input_tokens_seen": 545032400, + "step": 1199, + "train_runtime": 87713.2659, + "train_tokens_per_second": 6213.797 + }, + { + "epoch": 1.452823049782915, + "grad_norm": 0.2466171234846115, + "learning_rate": 5e-06, + "loss": 0.9388, + "num_input_tokens_seen": 545482808, + "step": 1200, + "train_runtime": 87788.0711, + "train_tokens_per_second": 6213.632 + }, + { + "epoch": 1.4540338066005165, + "grad_norm": 0.310069739818573, + "learning_rate": 5e-06, + "loss": 0.964, + "num_input_tokens_seen": 545948528, + "step": 1201, + "train_runtime": 87865.3722, + "train_tokens_per_second": 6213.466 + }, + { + "epoch": 1.455244563418118, + "grad_norm": 0.23402269184589386, + "learning_rate": 5e-06, + "loss": 0.9666, + "num_input_tokens_seen": 546406224, + "step": 1202, + "train_runtime": 87941.9012, + "train_tokens_per_second": 6213.264 + }, + { + "epoch": 1.4564553202357193, + "grad_norm": 0.2361670583486557, + "learning_rate": 5e-06, + "loss": 0.9641, + "num_input_tokens_seen": 546871256, + "step": 1203, + "train_runtime": 88019.3297, + "train_tokens_per_second": 6213.081 + }, + { + "epoch": 1.4576660770533205, + "grad_norm": 0.21892577409744263, + "learning_rate": 5e-06, + "loss": 0.9509, + "num_input_tokens_seen": 547331272, + "step": 1204, + "train_runtime": 88095.2024, + "train_tokens_per_second": 6212.952 + }, + { + "epoch": 1.458876833870922, + "grad_norm": 0.276292085647583, + "learning_rate": 5e-06, + "loss": 0.9394, + "num_input_tokens_seen": 547782312, + "step": 1205, + "train_runtime": 88170.4287, + "train_tokens_per_second": 6212.767 + }, + { + "epoch": 1.4600875906885233, + "grad_norm": 0.24177499115467072, + "learning_rate": 5e-06, + "loss": 0.9166, + "num_input_tokens_seen": 548266600, + "step": 1206, + "train_runtime": 88251.0882, + "train_tokens_per_second": 6212.576 + }, + { + "epoch": 1.4612983475061248, + "grad_norm": 0.2835836112499237, + "learning_rate": 5e-06, + "loss": 0.995, + "num_input_tokens_seen": 548717048, + "step": 1207, + "train_runtime": 88325.5766, + "train_tokens_per_second": 6212.437 + }, + { + "epoch": 1.4625091043237262, + "grad_norm": 0.23038621246814728, + "learning_rate": 5e-06, + "loss": 0.9838, + "num_input_tokens_seen": 549195520, + "step": 1208, + "train_runtime": 88405.6772, + "train_tokens_per_second": 6212.22 + }, + { + "epoch": 1.4637198611413273, + "grad_norm": 0.2618058919906616, + "learning_rate": 5e-06, + "loss": 0.9669, + "num_input_tokens_seen": 549643000, + "step": 1209, + "train_runtime": 88479.9776, + "train_tokens_per_second": 6212.061 + }, + { + "epoch": 1.4649306179589288, + "grad_norm": 0.26815587282180786, + "learning_rate": 5e-06, + "loss": 0.9388, + "num_input_tokens_seen": 550092200, + "step": 1210, + "train_runtime": 88554.698, + "train_tokens_per_second": 6211.892 + }, + { + "epoch": 1.4661413747765302, + "grad_norm": 0.2662449777126312, + "learning_rate": 5e-06, + "loss": 0.9636, + "num_input_tokens_seen": 550550512, + "step": 1211, + "train_runtime": 88629.3279, + "train_tokens_per_second": 6211.832 + }, + { + "epoch": 1.4673521315941316, + "grad_norm": 0.23297056555747986, + "learning_rate": 5e-06, + "loss": 0.968, + "num_input_tokens_seen": 551000744, + "step": 1212, + "train_runtime": 88700.7236, + "train_tokens_per_second": 6211.908 + }, + { + "epoch": 1.468562888411733, + "grad_norm": 0.24942202866077423, + "learning_rate": 5e-06, + "loss": 0.9262, + "num_input_tokens_seen": 551460280, + "step": 1213, + "train_runtime": 88773.122, + "train_tokens_per_second": 6212.019 + }, + { + "epoch": 1.4697736452293344, + "grad_norm": 0.2555992901325226, + "learning_rate": 5e-06, + "loss": 0.9494, + "num_input_tokens_seen": 551910888, + "step": 1214, + "train_runtime": 88844.5041, + "train_tokens_per_second": 6212.099 + }, + { + "epoch": 1.4709844020469358, + "grad_norm": 0.2768413722515106, + "learning_rate": 5e-06, + "loss": 0.919, + "num_input_tokens_seen": 552378856, + "step": 1215, + "train_runtime": 88918.5443, + "train_tokens_per_second": 6212.19 + }, + { + "epoch": 1.4721951588645372, + "grad_norm": 0.24520625174045563, + "learning_rate": 5e-06, + "loss": 0.9503, + "num_input_tokens_seen": 552847920, + "step": 1216, + "train_runtime": 88992.9488, + "train_tokens_per_second": 6212.267 + }, + { + "epoch": 1.4734059156821386, + "grad_norm": 0.2534187436103821, + "learning_rate": 5e-06, + "loss": 0.9683, + "num_input_tokens_seen": 553286272, + "step": 1217, + "train_runtime": 89061.8498, + "train_tokens_per_second": 6212.382 + }, + { + "epoch": 1.4746166724997398, + "grad_norm": 0.2607842981815338, + "learning_rate": 5e-06, + "loss": 0.9375, + "num_input_tokens_seen": 553730632, + "step": 1218, + "train_runtime": 89131.9721, + "train_tokens_per_second": 6212.48 + }, + { + "epoch": 1.4758274293173412, + "grad_norm": 0.2503432333469391, + "learning_rate": 5e-06, + "loss": 0.9422, + "num_input_tokens_seen": 554163400, + "step": 1219, + "train_runtime": 89200.3599, + "train_tokens_per_second": 6212.569 + }, + { + "epoch": 1.4770381861349426, + "grad_norm": 0.27522653341293335, + "learning_rate": 5e-06, + "loss": 0.9007, + "num_input_tokens_seen": 554616200, + "step": 1220, + "train_runtime": 89271.7622, + "train_tokens_per_second": 6212.672 + }, + { + "epoch": 1.478248942952544, + "grad_norm": 0.29365551471710205, + "learning_rate": 5e-06, + "loss": 0.9819, + "num_input_tokens_seen": 555069200, + "step": 1221, + "train_runtime": 89343.1896, + "train_tokens_per_second": 6212.776 + }, + { + "epoch": 1.4794596997701455, + "grad_norm": 0.22803185880184174, + "learning_rate": 5e-06, + "loss": 0.929, + "num_input_tokens_seen": 555522824, + "step": 1222, + "train_runtime": 89414.7038, + "train_tokens_per_second": 6212.88 + }, + { + "epoch": 1.4806704565877467, + "grad_norm": 0.2833687663078308, + "learning_rate": 5e-06, + "loss": 0.9506, + "num_input_tokens_seen": 555976920, + "step": 1223, + "train_runtime": 89486.7493, + "train_tokens_per_second": 6212.952 + }, + { + "epoch": 1.481881213405348, + "grad_norm": 0.23040251433849335, + "learning_rate": 5e-06, + "loss": 0.9585, + "num_input_tokens_seen": 556431480, + "step": 1224, + "train_runtime": 89564.565, + "train_tokens_per_second": 6212.63 + }, + { + "epoch": 1.4830919702229495, + "grad_norm": 0.2419111281633377, + "learning_rate": 5e-06, + "loss": 0.9193, + "num_input_tokens_seen": 556890152, + "step": 1225, + "train_runtime": 89648.8264, + "train_tokens_per_second": 6211.907 + }, + { + "epoch": 1.4843027270405509, + "grad_norm": 0.29110512137413025, + "learning_rate": 5e-06, + "loss": 0.8697, + "num_input_tokens_seen": 557355240, + "step": 1226, + "train_runtime": 89732.4654, + "train_tokens_per_second": 6211.3 + }, + { + "epoch": 1.4855134838581523, + "grad_norm": 0.25912541151046753, + "learning_rate": 5e-06, + "loss": 0.9736, + "num_input_tokens_seen": 557820032, + "step": 1227, + "train_runtime": 89810.0796, + "train_tokens_per_second": 6211.107 + }, + { + "epoch": 1.4867242406757537, + "grad_norm": 0.29734542965888977, + "learning_rate": 5e-06, + "loss": 0.9779, + "num_input_tokens_seen": 558247392, + "step": 1228, + "train_runtime": 89880.9268, + "train_tokens_per_second": 6210.966 + }, + { + "epoch": 1.4879349974933551, + "grad_norm": 0.23052756488323212, + "learning_rate": 5e-06, + "loss": 0.9192, + "num_input_tokens_seen": 558690584, + "step": 1229, + "train_runtime": 89956.4661, + "train_tokens_per_second": 6210.677 + }, + { + "epoch": 1.4891457543109565, + "grad_norm": 0.24976183474063873, + "learning_rate": 5e-06, + "loss": 0.9726, + "num_input_tokens_seen": 559158528, + "step": 1230, + "train_runtime": 90036.0752, + "train_tokens_per_second": 6210.383 + }, + { + "epoch": 1.4903565111285577, + "grad_norm": 0.25929853320121765, + "learning_rate": 5e-06, + "loss": 0.8893, + "num_input_tokens_seen": 559606536, + "step": 1231, + "train_runtime": 90111.8366, + "train_tokens_per_second": 6210.134 + }, + { + "epoch": 1.4915672679461591, + "grad_norm": 0.2416425496339798, + "learning_rate": 5e-06, + "loss": 0.9223, + "num_input_tokens_seen": 560047016, + "step": 1232, + "train_runtime": 90186.6064, + "train_tokens_per_second": 6209.869 + }, + { + "epoch": 1.4927780247637605, + "grad_norm": 0.2509872019290924, + "learning_rate": 5e-06, + "loss": 0.9414, + "num_input_tokens_seen": 560477352, + "step": 1233, + "train_runtime": 90259.5167, + "train_tokens_per_second": 6209.621 + }, + { + "epoch": 1.493988781581362, + "grad_norm": 0.24654145538806915, + "learning_rate": 5e-06, + "loss": 0.9249, + "num_input_tokens_seen": 560915928, + "step": 1234, + "train_runtime": 90333.8577, + "train_tokens_per_second": 6209.365 + }, + { + "epoch": 1.4951995383989634, + "grad_norm": 0.2723659873008728, + "learning_rate": 5e-06, + "loss": 0.9631, + "num_input_tokens_seen": 561326904, + "step": 1235, + "train_runtime": 90403.0323, + "train_tokens_per_second": 6209.16 + }, + { + "epoch": 1.4964102952165645, + "grad_norm": 0.22693853080272675, + "learning_rate": 5e-06, + "loss": 0.9138, + "num_input_tokens_seen": 561775144, + "step": 1236, + "train_runtime": 90478.8627, + "train_tokens_per_second": 6208.91 + }, + { + "epoch": 1.497621052034166, + "grad_norm": 0.26430606842041016, + "learning_rate": 5e-06, + "loss": 0.9036, + "num_input_tokens_seen": 562201328, + "step": 1237, + "train_runtime": 90551.0641, + "train_tokens_per_second": 6208.666 + }, + { + "epoch": 1.4988318088517674, + "grad_norm": 0.24093542993068695, + "learning_rate": 5e-06, + "loss": 0.9329, + "num_input_tokens_seen": 562665640, + "step": 1238, + "train_runtime": 90629.6091, + "train_tokens_per_second": 6208.409 + }, + { + "epoch": 1.5000425656693688, + "grad_norm": 0.24133825302124023, + "learning_rate": 5e-06, + "loss": 0.9841, + "num_input_tokens_seen": 563130560, + "step": 1239, + "train_runtime": 90707.3461, + "train_tokens_per_second": 6208.213 + }, + { + "epoch": 1.5012533224869702, + "grad_norm": 0.23979146778583527, + "learning_rate": 5e-06, + "loss": 0.969, + "num_input_tokens_seen": 563574224, + "step": 1240, + "train_runtime": 90780.5712, + "train_tokens_per_second": 6208.093 + }, + { + "epoch": 1.5024640793045716, + "grad_norm": 0.2502334713935852, + "learning_rate": 5e-06, + "loss": 0.9544, + "num_input_tokens_seen": 564005368, + "step": 1241, + "train_runtime": 90849.637, + "train_tokens_per_second": 6208.119 + }, + { + "epoch": 1.503674836122173, + "grad_norm": 0.24188034236431122, + "learning_rate": 5e-06, + "loss": 0.9265, + "num_input_tokens_seen": 564455488, + "step": 1242, + "train_runtime": 90922.8277, + "train_tokens_per_second": 6208.072 + }, + { + "epoch": 1.5048855929397744, + "grad_norm": 0.2516622841358185, + "learning_rate": 5e-06, + "loss": 0.9798, + "num_input_tokens_seen": 564908080, + "step": 1243, + "train_runtime": 90997.5049, + "train_tokens_per_second": 6207.951 + }, + { + "epoch": 1.5060963497573758, + "grad_norm": 0.22442975640296936, + "learning_rate": 5e-06, + "loss": 0.9605, + "num_input_tokens_seen": 565380080, + "step": 1244, + "train_runtime": 91075.2673, + "train_tokens_per_second": 6207.833 + }, + { + "epoch": 1.507307106574977, + "grad_norm": 0.25572800636291504, + "learning_rate": 5e-06, + "loss": 1.0025, + "num_input_tokens_seen": 565820720, + "step": 1245, + "train_runtime": 91148.0391, + "train_tokens_per_second": 6207.711 + }, + { + "epoch": 1.5085178633925784, + "grad_norm": 0.24338506162166595, + "learning_rate": 5e-06, + "loss": 0.9596, + "num_input_tokens_seen": 566254432, + "step": 1246, + "train_runtime": 91219.5681, + "train_tokens_per_second": 6207.598 + }, + { + "epoch": 1.5097286202101798, + "grad_norm": 0.26078444719314575, + "learning_rate": 5e-06, + "loss": 0.9484, + "num_input_tokens_seen": 566687608, + "step": 1247, + "train_runtime": 91291.6618, + "train_tokens_per_second": 6207.441 + }, + { + "epoch": 1.510939377027781, + "grad_norm": 0.25328484177589417, + "learning_rate": 5e-06, + "loss": 0.9433, + "num_input_tokens_seen": 567135480, + "step": 1248, + "train_runtime": 91365.8104, + "train_tokens_per_second": 6207.305 + }, + { + "epoch": 1.5121501338453824, + "grad_norm": 0.2464897632598877, + "learning_rate": 5e-06, + "loss": 0.9207, + "num_input_tokens_seen": 567570544, + "step": 1249, + "train_runtime": 91437.823, + "train_tokens_per_second": 6207.175 + }, + { + "epoch": 1.5133608906629838, + "grad_norm": 0.232350155711174, + "learning_rate": 5e-06, + "loss": 0.9411, + "num_input_tokens_seen": 568017064, + "step": 1250, + "train_runtime": 91511.6898, + "train_tokens_per_second": 6207.044 + }, + { + "epoch": 1.5145716474805853, + "grad_norm": 0.22308504581451416, + "learning_rate": 5e-06, + "loss": 0.906, + "num_input_tokens_seen": 568479648, + "step": 1251, + "train_runtime": 91588.8378, + "train_tokens_per_second": 6206.866 + }, + { + "epoch": 1.5157824042981867, + "grad_norm": 0.23805969953536987, + "learning_rate": 5e-06, + "loss": 0.9744, + "num_input_tokens_seen": 568934456, + "step": 1252, + "train_runtime": 91663.9685, + "train_tokens_per_second": 6206.74 + }, + { + "epoch": 1.516993161115788, + "grad_norm": 0.2170308232307434, + "learning_rate": 5e-06, + "loss": 0.9288, + "num_input_tokens_seen": 569397744, + "step": 1253, + "train_runtime": 91740.5472, + "train_tokens_per_second": 6206.609 + }, + { + "epoch": 1.5182039179333895, + "grad_norm": 0.237321138381958, + "learning_rate": 5e-06, + "loss": 0.8996, + "num_input_tokens_seen": 569848752, + "step": 1254, + "train_runtime": 91814.7504, + "train_tokens_per_second": 6206.505 + }, + { + "epoch": 1.519414674750991, + "grad_norm": 0.25323814153671265, + "learning_rate": 5e-06, + "loss": 0.9405, + "num_input_tokens_seen": 570280800, + "step": 1255, + "train_runtime": 91885.305, + "train_tokens_per_second": 6206.442 + }, + { + "epoch": 1.5206254315685923, + "grad_norm": 0.24336665868759155, + "learning_rate": 5e-06, + "loss": 0.9383, + "num_input_tokens_seen": 570733784, + "step": 1256, + "train_runtime": 91959.8375, + "train_tokens_per_second": 6206.337 + }, + { + "epoch": 1.5218361883861937, + "grad_norm": 0.24592383205890656, + "learning_rate": 5e-06, + "loss": 0.9803, + "num_input_tokens_seen": 571189672, + "step": 1257, + "train_runtime": 92034.0279, + "train_tokens_per_second": 6206.288 + }, + { + "epoch": 1.523046945203795, + "grad_norm": 0.2351573407649994, + "learning_rate": 5e-06, + "loss": 0.9934, + "num_input_tokens_seen": 571646112, + "step": 1258, + "train_runtime": 92108.3155, + "train_tokens_per_second": 6206.238 + }, + { + "epoch": 1.5242577020213963, + "grad_norm": 0.25675877928733826, + "learning_rate": 5e-06, + "loss": 0.9665, + "num_input_tokens_seen": 572084040, + "step": 1259, + "train_runtime": 92176.9792, + "train_tokens_per_second": 6206.366 + }, + { + "epoch": 1.5254684588389977, + "grad_norm": 0.23532457649707794, + "learning_rate": 5e-06, + "loss": 0.9635, + "num_input_tokens_seen": 572531232, + "step": 1260, + "train_runtime": 92244.6997, + "train_tokens_per_second": 6206.657 + }, + { + "epoch": 1.5266792156565991, + "grad_norm": 0.23427313566207886, + "learning_rate": 5e-06, + "loss": 0.9456, + "num_input_tokens_seen": 572985200, + "step": 1261, + "train_runtime": 92317.2757, + "train_tokens_per_second": 6206.695 + }, + { + "epoch": 1.5278899724742003, + "grad_norm": 0.2370956540107727, + "learning_rate": 5e-06, + "loss": 0.9875, + "num_input_tokens_seen": 573434448, + "step": 1262, + "train_runtime": 92388.1785, + "train_tokens_per_second": 6206.795 + }, + { + "epoch": 1.5291007292918017, + "grad_norm": 0.2511068284511566, + "learning_rate": 5e-06, + "loss": 0.9408, + "num_input_tokens_seen": 573888816, + "step": 1263, + "train_runtime": 92463.6049, + "train_tokens_per_second": 6206.645 + }, + { + "epoch": 1.5303114861094032, + "grad_norm": 0.22451600432395935, + "learning_rate": 5e-06, + "loss": 0.9585, + "num_input_tokens_seen": 574350424, + "step": 1264, + "train_runtime": 92541.1676, + "train_tokens_per_second": 6206.432 + }, + { + "epoch": 1.5315222429270046, + "grad_norm": 0.23519355058670044, + "learning_rate": 5e-06, + "loss": 0.9484, + "num_input_tokens_seen": 574821008, + "step": 1265, + "train_runtime": 92620.4674, + "train_tokens_per_second": 6206.199 + }, + { + "epoch": 1.532732999744606, + "grad_norm": 0.2533230483531952, + "learning_rate": 5e-06, + "loss": 0.9393, + "num_input_tokens_seen": 575257344, + "step": 1266, + "train_runtime": 92693.6836, + "train_tokens_per_second": 6206.004 + }, + { + "epoch": 1.5339437565622074, + "grad_norm": 0.251905232667923, + "learning_rate": 5e-06, + "loss": 0.9898, + "num_input_tokens_seen": 575695280, + "step": 1267, + "train_runtime": 92766.7232, + "train_tokens_per_second": 6205.838 + }, + { + "epoch": 1.5351545133798088, + "grad_norm": 0.23301640152931213, + "learning_rate": 5e-06, + "loss": 0.949, + "num_input_tokens_seen": 576144632, + "step": 1268, + "train_runtime": 92842.3959, + "train_tokens_per_second": 6205.62 + }, + { + "epoch": 1.5363652701974102, + "grad_norm": 0.2319250851869583, + "learning_rate": 5e-06, + "loss": 0.9151, + "num_input_tokens_seen": 576602952, + "step": 1269, + "train_runtime": 92917.6352, + "train_tokens_per_second": 6205.528 + }, + { + "epoch": 1.5375760270150116, + "grad_norm": 0.23095951974391937, + "learning_rate": 5e-06, + "loss": 0.9466, + "num_input_tokens_seen": 577064504, + "step": 1270, + "train_runtime": 92994.592, + "train_tokens_per_second": 6205.356 + }, + { + "epoch": 1.538786783832613, + "grad_norm": 0.23852431774139404, + "learning_rate": 5e-06, + "loss": 0.9622, + "num_input_tokens_seen": 577527016, + "step": 1271, + "train_runtime": 93072.0976, + "train_tokens_per_second": 6205.157 + }, + { + "epoch": 1.5399975406502142, + "grad_norm": 0.22824853658676147, + "learning_rate": 5e-06, + "loss": 0.9495, + "num_input_tokens_seen": 577987968, + "step": 1272, + "train_runtime": 93148.324, + "train_tokens_per_second": 6205.028 + }, + { + "epoch": 1.5412082974678156, + "grad_norm": 0.23495082557201385, + "learning_rate": 5e-06, + "loss": 0.9997, + "num_input_tokens_seen": 578430992, + "step": 1273, + "train_runtime": 93222.962, + "train_tokens_per_second": 6204.812 + }, + { + "epoch": 1.542419054285417, + "grad_norm": 0.24541781842708588, + "learning_rate": 5e-06, + "loss": 0.9423, + "num_input_tokens_seen": 578870056, + "step": 1274, + "train_runtime": 93296.3282, + "train_tokens_per_second": 6204.639 + }, + { + "epoch": 1.5436298111030182, + "grad_norm": 0.24258604645729065, + "learning_rate": 5e-06, + "loss": 0.9238, + "num_input_tokens_seen": 579339008, + "step": 1275, + "train_runtime": 93375.0592, + "train_tokens_per_second": 6204.43 + }, + { + "epoch": 1.5448405679206196, + "grad_norm": 0.22991403937339783, + "learning_rate": 5e-06, + "loss": 0.9566, + "num_input_tokens_seen": 579793544, + "step": 1276, + "train_runtime": 93450.4826, + "train_tokens_per_second": 6204.286 + }, + { + "epoch": 1.546051324738221, + "grad_norm": 0.2381500005722046, + "learning_rate": 5e-06, + "loss": 0.9507, + "num_input_tokens_seen": 580233824, + "step": 1277, + "train_runtime": 93524.278, + "train_tokens_per_second": 6204.098 + }, + { + "epoch": 1.5472620815558225, + "grad_norm": 0.2665536105632782, + "learning_rate": 5e-06, + "loss": 0.9165, + "num_input_tokens_seen": 580681840, + "step": 1278, + "train_runtime": 93598.8728, + "train_tokens_per_second": 6203.941 + }, + { + "epoch": 1.5484728383734239, + "grad_norm": 0.25912097096443176, + "learning_rate": 5e-06, + "loss": 0.9159, + "num_input_tokens_seen": 581134808, + "step": 1279, + "train_runtime": 93674.4264, + "train_tokens_per_second": 6203.772 + }, + { + "epoch": 1.5496835951910253, + "grad_norm": 0.257059782743454, + "learning_rate": 5e-06, + "loss": 1.0082, + "num_input_tokens_seen": 581578592, + "step": 1280, + "train_runtime": 93748.5874, + "train_tokens_per_second": 6203.598 + }, + { + "epoch": 1.5508943520086267, + "grad_norm": 0.22761328518390656, + "learning_rate": 5e-06, + "loss": 0.918, + "num_input_tokens_seen": 582043576, + "step": 1281, + "train_runtime": 93826.8246, + "train_tokens_per_second": 6203.381 + }, + { + "epoch": 1.552105108826228, + "grad_norm": 0.23127709329128265, + "learning_rate": 5e-06, + "loss": 0.9535, + "num_input_tokens_seen": 582491064, + "step": 1282, + "train_runtime": 93900.1492, + "train_tokens_per_second": 6203.303 + }, + { + "epoch": 1.5533158656438295, + "grad_norm": 0.23334218561649323, + "learning_rate": 5e-06, + "loss": 0.9363, + "num_input_tokens_seen": 582943336, + "step": 1283, + "train_runtime": 93971.7767, + "train_tokens_per_second": 6203.387 + }, + { + "epoch": 1.554526622461431, + "grad_norm": 0.2526426613330841, + "learning_rate": 5e-06, + "loss": 0.9897, + "num_input_tokens_seen": 583387120, + "step": 1284, + "train_runtime": 94045.0911, + "train_tokens_per_second": 6203.27 + }, + { + "epoch": 1.555737379279032, + "grad_norm": 0.28767573833465576, + "learning_rate": 5e-06, + "loss": 0.9343, + "num_input_tokens_seen": 583846136, + "step": 1285, + "train_runtime": 94125.4895, + "train_tokens_per_second": 6202.848 + }, + { + "epoch": 1.5569481360966335, + "grad_norm": 0.22892381250858307, + "learning_rate": 5e-06, + "loss": 0.911, + "num_input_tokens_seen": 584283936, + "step": 1286, + "train_runtime": 94197.0686, + "train_tokens_per_second": 6202.783 + }, + { + "epoch": 1.558158892914235, + "grad_norm": 0.22896316647529602, + "learning_rate": 5e-06, + "loss": 0.9782, + "num_input_tokens_seen": 584742536, + "step": 1287, + "train_runtime": 94271.4579, + "train_tokens_per_second": 6202.753 + }, + { + "epoch": 1.5593696497318363, + "grad_norm": 0.2572176456451416, + "learning_rate": 5e-06, + "loss": 0.9574, + "num_input_tokens_seen": 585188088, + "step": 1288, + "train_runtime": 94344.216, + "train_tokens_per_second": 6202.692 + }, + { + "epoch": 1.5605804065494375, + "grad_norm": 0.23889631032943726, + "learning_rate": 5e-06, + "loss": 0.97, + "num_input_tokens_seen": 585653328, + "step": 1289, + "train_runtime": 94418.8519, + "train_tokens_per_second": 6202.716 + }, + { + "epoch": 1.561791163367039, + "grad_norm": 0.23102454841136932, + "learning_rate": 5e-06, + "loss": 0.9593, + "num_input_tokens_seen": 586117272, + "step": 1290, + "train_runtime": 94493.2781, + "train_tokens_per_second": 6202.74 + }, + { + "epoch": 1.5630019201846403, + "grad_norm": 0.2229638695716858, + "learning_rate": 5e-06, + "loss": 0.8971, + "num_input_tokens_seen": 586549016, + "step": 1291, + "train_runtime": 94563.503, + "train_tokens_per_second": 6202.7 + }, + { + "epoch": 1.5642126770022418, + "grad_norm": 0.258696585893631, + "learning_rate": 5e-06, + "loss": 0.9113, + "num_input_tokens_seen": 587013368, + "step": 1292, + "train_runtime": 94641.4264, + "train_tokens_per_second": 6202.499 + }, + { + "epoch": 1.5654234338198432, + "grad_norm": 0.23761804401874542, + "learning_rate": 5e-06, + "loss": 0.8907, + "num_input_tokens_seen": 587491456, + "step": 1293, + "train_runtime": 94721.2593, + "train_tokens_per_second": 6202.319 + }, + { + "epoch": 1.5666341906374446, + "grad_norm": 0.24647028744220734, + "learning_rate": 5e-06, + "loss": 0.9741, + "num_input_tokens_seen": 587940688, + "step": 1294, + "train_runtime": 94794.5549, + "train_tokens_per_second": 6202.262 + }, + { + "epoch": 1.567844947455046, + "grad_norm": 0.2338888794183731, + "learning_rate": 5e-06, + "loss": 0.9493, + "num_input_tokens_seen": 588396192, + "step": 1295, + "train_runtime": 94867.988, + "train_tokens_per_second": 6202.263 + }, + { + "epoch": 1.5690557042726474, + "grad_norm": 0.26478147506713867, + "learning_rate": 5e-06, + "loss": 0.9862, + "num_input_tokens_seen": 588832744, + "step": 1296, + "train_runtime": 94938.6055, + "train_tokens_per_second": 6202.248 + }, + { + "epoch": 1.5702664610902488, + "grad_norm": 0.23042112588882446, + "learning_rate": 5e-06, + "loss": 0.9421, + "num_input_tokens_seen": 589284472, + "step": 1297, + "train_runtime": 95011.311, + "train_tokens_per_second": 6202.256 + }, + { + "epoch": 1.5714772179078502, + "grad_norm": 0.2494785189628601, + "learning_rate": 5e-06, + "loss": 0.9467, + "num_input_tokens_seen": 589747536, + "step": 1298, + "train_runtime": 95086.1322, + "train_tokens_per_second": 6202.246 + }, + { + "epoch": 1.5726879747254514, + "grad_norm": 0.27761778235435486, + "learning_rate": 5e-06, + "loss": 0.9743, + "num_input_tokens_seen": 590193016, + "step": 1299, + "train_runtime": 95158.7332, + "train_tokens_per_second": 6202.195 + }, + { + "epoch": 1.5738987315430528, + "grad_norm": 0.2412542998790741, + "learning_rate": 5e-06, + "loss": 0.9015, + "num_input_tokens_seen": 590671280, + "step": 1300, + "train_runtime": 95236.6476, + "train_tokens_per_second": 6202.143 + }, + { + "epoch": 1.5751094883606542, + "grad_norm": 0.23688916862010956, + "learning_rate": 5e-06, + "loss": 0.9363, + "num_input_tokens_seen": 591150592, + "step": 1301, + "train_runtime": 95315.0789, + "train_tokens_per_second": 6202.068 + }, + { + "epoch": 1.5763202451782554, + "grad_norm": 0.2533585727214813, + "learning_rate": 5e-06, + "loss": 0.9026, + "num_input_tokens_seen": 591598136, + "step": 1302, + "train_runtime": 95388.7088, + "train_tokens_per_second": 6201.972 + }, + { + "epoch": 1.5775310019958568, + "grad_norm": 0.24774165451526642, + "learning_rate": 5e-06, + "loss": 0.9032, + "num_input_tokens_seen": 592054200, + "step": 1303, + "train_runtime": 95461.419, + "train_tokens_per_second": 6202.026 + }, + { + "epoch": 1.5787417588134582, + "grad_norm": 0.2428959310054779, + "learning_rate": 5e-06, + "loss": 0.926, + "num_input_tokens_seen": 592513824, + "step": 1304, + "train_runtime": 95532.3576, + "train_tokens_per_second": 6202.232 + }, + { + "epoch": 1.5799525156310597, + "grad_norm": 0.23670534789562225, + "learning_rate": 5e-06, + "loss": 0.9507, + "num_input_tokens_seen": 592975696, + "step": 1305, + "train_runtime": 95604.7535, + "train_tokens_per_second": 6202.366 + }, + { + "epoch": 1.581163272448661, + "grad_norm": 0.2287970781326294, + "learning_rate": 5e-06, + "loss": 0.881, + "num_input_tokens_seen": 593419968, + "step": 1306, + "train_runtime": 95673.7974, + "train_tokens_per_second": 6202.534 + }, + { + "epoch": 1.5823740292662625, + "grad_norm": 0.23797202110290527, + "learning_rate": 5e-06, + "loss": 0.9343, + "num_input_tokens_seen": 593882816, + "step": 1307, + "train_runtime": 95745.9131, + "train_tokens_per_second": 6202.696 + }, + { + "epoch": 1.5835847860838639, + "grad_norm": 0.24310339987277985, + "learning_rate": 5e-06, + "loss": 0.9403, + "num_input_tokens_seen": 594341808, + "step": 1308, + "train_runtime": 95817.2622, + "train_tokens_per_second": 6202.868 + }, + { + "epoch": 1.5847955429014653, + "grad_norm": 0.23264212906360626, + "learning_rate": 5e-06, + "loss": 0.9363, + "num_input_tokens_seen": 594811616, + "step": 1309, + "train_runtime": 95890.2989, + "train_tokens_per_second": 6203.043 + }, + { + "epoch": 1.5860062997190667, + "grad_norm": 0.2382027506828308, + "learning_rate": 5e-06, + "loss": 0.9259, + "num_input_tokens_seen": 595259352, + "step": 1310, + "train_runtime": 95960.1069, + "train_tokens_per_second": 6203.196 + }, + { + "epoch": 1.5872170565366681, + "grad_norm": 0.28391894698143005, + "learning_rate": 5e-06, + "loss": 0.9381, + "num_input_tokens_seen": 595731272, + "step": 1311, + "train_runtime": 96034.005, + "train_tokens_per_second": 6203.337 + }, + { + "epoch": 1.5884278133542693, + "grad_norm": 0.23430295288562775, + "learning_rate": 5e-06, + "loss": 0.9697, + "num_input_tokens_seen": 596173736, + "step": 1312, + "train_runtime": 96102.8369, + "train_tokens_per_second": 6203.498 + }, + { + "epoch": 1.5896385701718707, + "grad_norm": 0.22797590494155884, + "learning_rate": 5e-06, + "loss": 0.952, + "num_input_tokens_seen": 596637800, + "step": 1313, + "train_runtime": 96174.9195, + "train_tokens_per_second": 6203.674 + }, + { + "epoch": 1.5908493269894721, + "grad_norm": 0.23347218334674835, + "learning_rate": 5e-06, + "loss": 0.9375, + "num_input_tokens_seen": 597079384, + "step": 1314, + "train_runtime": 96243.2882, + "train_tokens_per_second": 6203.855 + }, + { + "epoch": 1.5920600838070733, + "grad_norm": 0.22693176567554474, + "learning_rate": 5e-06, + "loss": 0.9722, + "num_input_tokens_seen": 597547256, + "step": 1315, + "train_runtime": 96315.9975, + "train_tokens_per_second": 6204.029 + }, + { + "epoch": 1.5932708406246747, + "grad_norm": 0.2342706322669983, + "learning_rate": 5e-06, + "loss": 0.8936, + "num_input_tokens_seen": 598005224, + "step": 1316, + "train_runtime": 96387.5398, + "train_tokens_per_second": 6204.176 + }, + { + "epoch": 1.5944815974422761, + "grad_norm": 0.23413512110710144, + "learning_rate": 5e-06, + "loss": 0.9149, + "num_input_tokens_seen": 598458528, + "step": 1317, + "train_runtime": 96458.3878, + "train_tokens_per_second": 6204.318 + }, + { + "epoch": 1.5956923542598775, + "grad_norm": 0.2367754727602005, + "learning_rate": 5e-06, + "loss": 0.9854, + "num_input_tokens_seen": 598919008, + "step": 1318, + "train_runtime": 96530.1929, + "train_tokens_per_second": 6204.473 + }, + { + "epoch": 1.596903111077479, + "grad_norm": 0.23297631740570068, + "learning_rate": 5e-06, + "loss": 0.9285, + "num_input_tokens_seen": 599369792, + "step": 1319, + "train_runtime": 96600.1615, + "train_tokens_per_second": 6204.646 + }, + { + "epoch": 1.5981138678950804, + "grad_norm": 0.23420660197734833, + "learning_rate": 5e-06, + "loss": 0.9593, + "num_input_tokens_seen": 599827040, + "step": 1320, + "train_runtime": 96672.0453, + "train_tokens_per_second": 6204.762 + }, + { + "epoch": 1.5993246247126818, + "grad_norm": 0.2214992493391037, + "learning_rate": 5e-06, + "loss": 0.9336, + "num_input_tokens_seen": 600298640, + "step": 1321, + "train_runtime": 96745.0125, + "train_tokens_per_second": 6204.957 + }, + { + "epoch": 1.6005353815302832, + "grad_norm": 0.23480089008808136, + "learning_rate": 5e-06, + "loss": 0.8738, + "num_input_tokens_seen": 600740872, + "step": 1322, + "train_runtime": 96813.8967, + "train_tokens_per_second": 6205.11 + }, + { + "epoch": 1.6017461383478846, + "grad_norm": 0.2521512508392334, + "learning_rate": 5e-06, + "loss": 0.9435, + "num_input_tokens_seen": 601197152, + "step": 1323, + "train_runtime": 96886.3997, + "train_tokens_per_second": 6205.176 + }, + { + "epoch": 1.602956895165486, + "grad_norm": 0.23057833313941956, + "learning_rate": 5e-06, + "loss": 0.9161, + "num_input_tokens_seen": 601645592, + "step": 1324, + "train_runtime": 96959.6364, + "train_tokens_per_second": 6205.114 + }, + { + "epoch": 1.6041676519830872, + "grad_norm": 0.27399954199790955, + "learning_rate": 5e-06, + "loss": 0.9229, + "num_input_tokens_seen": 602103456, + "step": 1325, + "train_runtime": 97034.3576, + "train_tokens_per_second": 6205.054 + }, + { + "epoch": 1.6053784088006886, + "grad_norm": 0.2807023823261261, + "learning_rate": 5e-06, + "loss": 0.9729, + "num_input_tokens_seen": 602557192, + "step": 1326, + "train_runtime": 97108.0688, + "train_tokens_per_second": 6205.017 + }, + { + "epoch": 1.60658916561829, + "grad_norm": 0.24586202204227448, + "learning_rate": 5e-06, + "loss": 0.9555, + "num_input_tokens_seen": 603006376, + "step": 1327, + "train_runtime": 97180.951, + "train_tokens_per_second": 6204.985 + }, + { + "epoch": 1.6077999224358914, + "grad_norm": 0.23183618485927582, + "learning_rate": 5e-06, + "loss": 0.9219, + "num_input_tokens_seen": 603457768, + "step": 1328, + "train_runtime": 97253.8889, + "train_tokens_per_second": 6204.973 + }, + { + "epoch": 1.6090106792534926, + "grad_norm": 0.24499334394931793, + "learning_rate": 5e-06, + "loss": 0.9493, + "num_input_tokens_seen": 603904864, + "step": 1329, + "train_runtime": 97325.8918, + "train_tokens_per_second": 6204.976 + }, + { + "epoch": 1.610221436071094, + "grad_norm": 0.22572267055511475, + "learning_rate": 5e-06, + "loss": 0.9446, + "num_input_tokens_seen": 604368296, + "step": 1330, + "train_runtime": 97400.8104, + "train_tokens_per_second": 6204.962 + }, + { + "epoch": 1.6114321928886954, + "grad_norm": 0.24778367578983307, + "learning_rate": 5e-06, + "loss": 0.9465, + "num_input_tokens_seen": 604816744, + "step": 1331, + "train_runtime": 97472.976, + "train_tokens_per_second": 6204.968 + }, + { + "epoch": 1.6126429497062968, + "grad_norm": 0.23673632740974426, + "learning_rate": 5e-06, + "loss": 0.9609, + "num_input_tokens_seen": 605278760, + "step": 1332, + "train_runtime": 97548.1753, + "train_tokens_per_second": 6204.921 + }, + { + "epoch": 1.6138537065238983, + "grad_norm": 0.24265213310718536, + "learning_rate": 5e-06, + "loss": 1.009, + "num_input_tokens_seen": 605740240, + "step": 1333, + "train_runtime": 97623.1268, + "train_tokens_per_second": 6204.885 + }, + { + "epoch": 1.6150644633414997, + "grad_norm": 0.2499813735485077, + "learning_rate": 5e-06, + "loss": 0.906, + "num_input_tokens_seen": 606188040, + "step": 1334, + "train_runtime": 97695.6817, + "train_tokens_per_second": 6204.86 + }, + { + "epoch": 1.616275220159101, + "grad_norm": 0.23881113529205322, + "learning_rate": 5e-06, + "loss": 0.9569, + "num_input_tokens_seen": 606636736, + "step": 1335, + "train_runtime": 97768.0449, + "train_tokens_per_second": 6204.857 + }, + { + "epoch": 1.6174859769767025, + "grad_norm": 0.23513104021549225, + "learning_rate": 5e-06, + "loss": 0.9537, + "num_input_tokens_seen": 607085392, + "step": 1336, + "train_runtime": 97840.6583, + "train_tokens_per_second": 6204.838 + }, + { + "epoch": 1.618696733794304, + "grad_norm": 0.21942594647407532, + "learning_rate": 5e-06, + "loss": 0.9643, + "num_input_tokens_seen": 607566272, + "step": 1337, + "train_runtime": 97917.8238, + "train_tokens_per_second": 6204.859 + }, + { + "epoch": 1.6199074906119053, + "grad_norm": 0.24452783167362213, + "learning_rate": 5e-06, + "loss": 0.9954, + "num_input_tokens_seen": 608026592, + "step": 1338, + "train_runtime": 97992.107, + "train_tokens_per_second": 6204.853 + }, + { + "epoch": 1.6211182474295065, + "grad_norm": 0.2625705897808075, + "learning_rate": 5e-06, + "loss": 0.8922, + "num_input_tokens_seen": 608489800, + "step": 1339, + "train_runtime": 98064.8771, + "train_tokens_per_second": 6204.972 + }, + { + "epoch": 1.622329004247108, + "grad_norm": 0.23123782873153687, + "learning_rate": 5e-06, + "loss": 0.9478, + "num_input_tokens_seen": 608948792, + "step": 1340, + "train_runtime": 98137.5426, + "train_tokens_per_second": 6205.054 + }, + { + "epoch": 1.6235397610647093, + "grad_norm": 0.2373858541250229, + "learning_rate": 5e-06, + "loss": 0.9555, + "num_input_tokens_seen": 609387720, + "step": 1341, + "train_runtime": 98209.4854, + "train_tokens_per_second": 6204.978 + }, + { + "epoch": 1.6247505178823105, + "grad_norm": 0.26772409677505493, + "learning_rate": 5e-06, + "loss": 0.9705, + "num_input_tokens_seen": 609835256, + "step": 1342, + "train_runtime": 98283.6431, + "train_tokens_per_second": 6204.85 + }, + { + "epoch": 1.625961274699912, + "grad_norm": 0.26004475355148315, + "learning_rate": 5e-06, + "loss": 0.9613, + "num_input_tokens_seen": 610280952, + "step": 1343, + "train_runtime": 98357.4587, + "train_tokens_per_second": 6204.725 + }, + { + "epoch": 1.6271720315175133, + "grad_norm": 0.24032413959503174, + "learning_rate": 5e-06, + "loss": 0.9171, + "num_input_tokens_seen": 610736608, + "step": 1344, + "train_runtime": 98432.191, + "train_tokens_per_second": 6204.643 + }, + { + "epoch": 1.6283827883351147, + "grad_norm": 0.24109645187854767, + "learning_rate": 5e-06, + "loss": 0.9083, + "num_input_tokens_seen": 611195912, + "step": 1345, + "train_runtime": 98507.5175, + "train_tokens_per_second": 6204.561 + }, + { + "epoch": 1.6295935451527161, + "grad_norm": 0.23913376033306122, + "learning_rate": 5e-06, + "loss": 0.9131, + "num_input_tokens_seen": 611621864, + "step": 1346, + "train_runtime": 98575.6226, + "train_tokens_per_second": 6204.595 + }, + { + "epoch": 1.6308043019703176, + "grad_norm": 0.23697420954704285, + "learning_rate": 5e-06, + "loss": 0.9913, + "num_input_tokens_seen": 612066184, + "step": 1347, + "train_runtime": 98646.9799, + "train_tokens_per_second": 6204.611 + }, + { + "epoch": 1.632015058787919, + "grad_norm": 0.23569026589393616, + "learning_rate": 5e-06, + "loss": 0.9095, + "num_input_tokens_seen": 612525336, + "step": 1348, + "train_runtime": 98721.3967, + "train_tokens_per_second": 6204.585 + }, + { + "epoch": 1.6332258156055204, + "grad_norm": 0.25485959649086, + "learning_rate": 5e-06, + "loss": 0.9833, + "num_input_tokens_seen": 612975160, + "step": 1349, + "train_runtime": 98800.9432, + "train_tokens_per_second": 6204.143 + }, + { + "epoch": 1.6344365724231218, + "grad_norm": 0.2503267228603363, + "learning_rate": 5e-06, + "loss": 0.939, + "num_input_tokens_seen": 613459824, + "step": 1350, + "train_runtime": 98884.1899, + "train_tokens_per_second": 6203.821 + }, + { + "epoch": 1.6356473292407232, + "grad_norm": 0.23752045631408691, + "learning_rate": 5e-06, + "loss": 0.91, + "num_input_tokens_seen": 613928136, + "step": 1351, + "train_runtime": 98962.7457, + "train_tokens_per_second": 6203.629 + }, + { + "epoch": 1.6368580860583244, + "grad_norm": 0.23110365867614746, + "learning_rate": 5e-06, + "loss": 0.9469, + "num_input_tokens_seen": 614374960, + "step": 1352, + "train_runtime": 99036.341, + "train_tokens_per_second": 6203.53 + }, + { + "epoch": 1.6380688428759258, + "grad_norm": 0.24777059257030487, + "learning_rate": 5e-06, + "loss": 0.9272, + "num_input_tokens_seen": 614823416, + "step": 1353, + "train_runtime": 99110.4637, + "train_tokens_per_second": 6203.416 + }, + { + "epoch": 1.6392795996935272, + "grad_norm": 0.23056265711784363, + "learning_rate": 5e-06, + "loss": 0.9341, + "num_input_tokens_seen": 615269264, + "step": 1354, + "train_runtime": 99177.7333, + "train_tokens_per_second": 6203.704 + }, + { + "epoch": 1.6404903565111284, + "grad_norm": 0.24137234687805176, + "learning_rate": 5e-06, + "loss": 0.9345, + "num_input_tokens_seen": 615705496, + "step": 1355, + "train_runtime": 99243.9752, + "train_tokens_per_second": 6203.958 + }, + { + "epoch": 1.6417011133287298, + "grad_norm": 0.25345325469970703, + "learning_rate": 5e-06, + "loss": 0.9613, + "num_input_tokens_seen": 616144368, + "step": 1356, + "train_runtime": 99309.7621, + "train_tokens_per_second": 6204.268 + }, + { + "epoch": 1.6429118701463312, + "grad_norm": 0.24765293300151825, + "learning_rate": 5e-06, + "loss": 0.9674, + "num_input_tokens_seen": 616584824, + "step": 1357, + "train_runtime": 99376.1136, + "train_tokens_per_second": 6204.558 + }, + { + "epoch": 1.6441226269639326, + "grad_norm": 0.2568323612213135, + "learning_rate": 5e-06, + "loss": 0.9893, + "num_input_tokens_seen": 617024680, + "step": 1358, + "train_runtime": 99443.2364, + "train_tokens_per_second": 6204.793 + }, + { + "epoch": 1.645333383781534, + "grad_norm": 0.25249186158180237, + "learning_rate": 5e-06, + "loss": 0.9166, + "num_input_tokens_seen": 617471792, + "step": 1359, + "train_runtime": 99510.1824, + "train_tokens_per_second": 6205.112 + }, + { + "epoch": 1.6465441405991355, + "grad_norm": 0.2438102513551712, + "learning_rate": 5e-06, + "loss": 0.9972, + "num_input_tokens_seen": 617917968, + "step": 1360, + "train_runtime": 99576.8247, + "train_tokens_per_second": 6205.44 + }, + { + "epoch": 1.6477548974167369, + "grad_norm": 0.26252567768096924, + "learning_rate": 5e-06, + "loss": 0.9144, + "num_input_tokens_seen": 618372880, + "step": 1361, + "train_runtime": 99646.255, + "train_tokens_per_second": 6205.681 + }, + { + "epoch": 1.6489656542343383, + "grad_norm": 0.2307174950838089, + "learning_rate": 5e-06, + "loss": 0.9693, + "num_input_tokens_seen": 618825688, + "step": 1362, + "train_runtime": 99721.3775, + "train_tokens_per_second": 6205.547 + }, + { + "epoch": 1.6501764110519397, + "grad_norm": 0.23613497614860535, + "learning_rate": 5e-06, + "loss": 0.9993, + "num_input_tokens_seen": 619266920, + "step": 1363, + "train_runtime": 99794.2824, + "train_tokens_per_second": 6205.435 + }, + { + "epoch": 1.651387167869541, + "grad_norm": 0.2175440788269043, + "learning_rate": 5e-06, + "loss": 0.8754, + "num_input_tokens_seen": 619744360, + "step": 1364, + "train_runtime": 99875.7248, + "train_tokens_per_second": 6205.155 + }, + { + "epoch": 1.6525979246871423, + "grad_norm": 0.2543286681175232, + "learning_rate": 5e-06, + "loss": 1.0312, + "num_input_tokens_seen": 620200248, + "step": 1365, + "train_runtime": 99953.0201, + "train_tokens_per_second": 6204.918 + }, + { + "epoch": 1.6538086815047437, + "grad_norm": 0.23493343591690063, + "learning_rate": 5e-06, + "loss": 0.969, + "num_input_tokens_seen": 620661640, + "step": 1366, + "train_runtime": 100028.4666, + "train_tokens_per_second": 6204.85 + }, + { + "epoch": 1.655019438322345, + "grad_norm": 0.24430783092975616, + "learning_rate": 5e-06, + "loss": 0.9651, + "num_input_tokens_seen": 621103232, + "step": 1367, + "train_runtime": 100099.1989, + "train_tokens_per_second": 6204.877 + }, + { + "epoch": 1.6562301951399465, + "grad_norm": 0.24038319289684296, + "learning_rate": 5e-06, + "loss": 0.9906, + "num_input_tokens_seen": 621550880, + "step": 1368, + "train_runtime": 100170.4126, + "train_tokens_per_second": 6204.935 + }, + { + "epoch": 1.6574409519575477, + "grad_norm": 0.23617248237133026, + "learning_rate": 5e-06, + "loss": 0.943, + "num_input_tokens_seen": 621993640, + "step": 1369, + "train_runtime": 100240.8957, + "train_tokens_per_second": 6204.989 + }, + { + "epoch": 1.6586517087751491, + "grad_norm": 0.24460504949092865, + "learning_rate": 5e-06, + "loss": 0.9263, + "num_input_tokens_seen": 622454504, + "step": 1370, + "train_runtime": 100316.4623, + "train_tokens_per_second": 6204.909 + }, + { + "epoch": 1.6598624655927505, + "grad_norm": 0.24925386905670166, + "learning_rate": 5e-06, + "loss": 0.9425, + "num_input_tokens_seen": 622893320, + "step": 1371, + "train_runtime": 100387.8797, + "train_tokens_per_second": 6204.866 + }, + { + "epoch": 1.661073222410352, + "grad_norm": 0.2371699959039688, + "learning_rate": 5e-06, + "loss": 0.9302, + "num_input_tokens_seen": 623339984, + "step": 1372, + "train_runtime": 100461.2817, + "train_tokens_per_second": 6204.778 + }, + { + "epoch": 1.6622839792279533, + "grad_norm": 0.24603520333766937, + "learning_rate": 5e-06, + "loss": 0.9302, + "num_input_tokens_seen": 623800888, + "step": 1373, + "train_runtime": 100537.6306, + "train_tokens_per_second": 6204.651 + }, + { + "epoch": 1.6634947360455548, + "grad_norm": 0.23240868747234344, + "learning_rate": 5e-06, + "loss": 0.9293, + "num_input_tokens_seen": 624248528, + "step": 1374, + "train_runtime": 100611.4073, + "train_tokens_per_second": 6204.55 + }, + { + "epoch": 1.6647054928631562, + "grad_norm": 0.2440965324640274, + "learning_rate": 5e-06, + "loss": 0.9818, + "num_input_tokens_seen": 624694936, + "step": 1375, + "train_runtime": 100684.7425, + "train_tokens_per_second": 6204.465 + }, + { + "epoch": 1.6659162496807576, + "grad_norm": 0.24966219067573547, + "learning_rate": 5e-06, + "loss": 0.9338, + "num_input_tokens_seen": 625155168, + "step": 1376, + "train_runtime": 100770.2757, + "train_tokens_per_second": 6203.766 + }, + { + "epoch": 1.667127006498359, + "grad_norm": 0.24221491813659668, + "learning_rate": 5e-06, + "loss": 0.9025, + "num_input_tokens_seen": 625605080, + "step": 1377, + "train_runtime": 100845.271, + "train_tokens_per_second": 6203.613 + }, + { + "epoch": 1.6683377633159604, + "grad_norm": 0.23787087202072144, + "learning_rate": 5e-06, + "loss": 0.9104, + "num_input_tokens_seen": 626052144, + "step": 1378, + "train_runtime": 100919.6785, + "train_tokens_per_second": 6203.469 + }, + { + "epoch": 1.6695485201335616, + "grad_norm": 0.24397063255310059, + "learning_rate": 5e-06, + "loss": 0.9426, + "num_input_tokens_seen": 626534576, + "step": 1379, + "train_runtime": 100999.7059, + "train_tokens_per_second": 6203.331 + }, + { + "epoch": 1.670759276951163, + "grad_norm": 0.30004844069480896, + "learning_rate": 5e-06, + "loss": 0.9302, + "num_input_tokens_seen": 626990528, + "step": 1380, + "train_runtime": 101077.577, + "train_tokens_per_second": 6203.063 + }, + { + "epoch": 1.6719700337687644, + "grad_norm": 0.23161612451076508, + "learning_rate": 5e-06, + "loss": 0.9117, + "num_input_tokens_seen": 627444888, + "step": 1381, + "train_runtime": 101154.9535, + "train_tokens_per_second": 6202.809 + }, + { + "epoch": 1.6731807905863656, + "grad_norm": 0.29034850001335144, + "learning_rate": 5e-06, + "loss": 0.9122, + "num_input_tokens_seen": 627887600, + "step": 1382, + "train_runtime": 101230.0349, + "train_tokens_per_second": 6202.582 + }, + { + "epoch": 1.674391547403967, + "grad_norm": 0.23793677985668182, + "learning_rate": 5e-06, + "loss": 0.9402, + "num_input_tokens_seen": 628335968, + "step": 1383, + "train_runtime": 101305.9047, + "train_tokens_per_second": 6202.363 + }, + { + "epoch": 1.6756023042215684, + "grad_norm": 0.24347274005413055, + "learning_rate": 5e-06, + "loss": 0.9712, + "num_input_tokens_seen": 628770224, + "step": 1384, + "train_runtime": 101369.8627, + "train_tokens_per_second": 6202.733 + }, + { + "epoch": 1.6768130610391698, + "grad_norm": 0.26189595460891724, + "learning_rate": 5e-06, + "loss": 0.8935, + "num_input_tokens_seen": 629217368, + "step": 1385, + "train_runtime": 101436.0596, + "train_tokens_per_second": 6203.094 + }, + { + "epoch": 1.6780238178567712, + "grad_norm": 0.28286808729171753, + "learning_rate": 5e-06, + "loss": 0.9675, + "num_input_tokens_seen": 629651968, + "step": 1386, + "train_runtime": 101500.3426, + "train_tokens_per_second": 6203.447 + }, + { + "epoch": 1.6792345746743726, + "grad_norm": 0.24720792472362518, + "learning_rate": 5e-06, + "loss": 0.9567, + "num_input_tokens_seen": 630106888, + "step": 1387, + "train_runtime": 101567.8488, + "train_tokens_per_second": 6203.803 + }, + { + "epoch": 1.680445331491974, + "grad_norm": 0.2625053822994232, + "learning_rate": 5e-06, + "loss": 0.9209, + "num_input_tokens_seen": 630562680, + "step": 1388, + "train_runtime": 101638.1718, + "train_tokens_per_second": 6203.995 + }, + { + "epoch": 1.6816560883095755, + "grad_norm": 0.26049408316612244, + "learning_rate": 5e-06, + "loss": 1.0043, + "num_input_tokens_seen": 631017728, + "step": 1389, + "train_runtime": 101711.0478, + "train_tokens_per_second": 6204.023 + }, + { + "epoch": 1.6828668451271769, + "grad_norm": 0.24920783936977386, + "learning_rate": 5e-06, + "loss": 0.9278, + "num_input_tokens_seen": 631473184, + "step": 1390, + "train_runtime": 101784.6458, + "train_tokens_per_second": 6204.012 + }, + { + "epoch": 1.6840776019447783, + "grad_norm": 0.24204052984714508, + "learning_rate": 5e-06, + "loss": 0.8937, + "num_input_tokens_seen": 631926200, + "step": 1391, + "train_runtime": 101857.4628, + "train_tokens_per_second": 6204.025 + }, + { + "epoch": 1.6852883587623795, + "grad_norm": 0.27543655037879944, + "learning_rate": 5e-06, + "loss": 0.9788, + "num_input_tokens_seen": 632376624, + "step": 1392, + "train_runtime": 101931.1872, + "train_tokens_per_second": 6203.956 + }, + { + "epoch": 1.686499115579981, + "grad_norm": 0.24152293801307678, + "learning_rate": 5e-06, + "loss": 0.9632, + "num_input_tokens_seen": 632826048, + "step": 1393, + "train_runtime": 102004.9311, + "train_tokens_per_second": 6203.877 + }, + { + "epoch": 1.6877098723975823, + "grad_norm": 0.24093790352344513, + "learning_rate": 5e-06, + "loss": 0.9022, + "num_input_tokens_seen": 633294632, + "step": 1394, + "train_runtime": 102083.4824, + "train_tokens_per_second": 6203.693 + }, + { + "epoch": 1.6889206292151835, + "grad_norm": 0.23147398233413696, + "learning_rate": 5e-06, + "loss": 0.9156, + "num_input_tokens_seen": 633761776, + "step": 1395, + "train_runtime": 102161.3889, + "train_tokens_per_second": 6203.535 + }, + { + "epoch": 1.690131386032785, + "grad_norm": 0.23987317085266113, + "learning_rate": 5e-06, + "loss": 0.9063, + "num_input_tokens_seen": 634226504, + "step": 1396, + "train_runtime": 102238.2689, + "train_tokens_per_second": 6203.416 + }, + { + "epoch": 1.6913421428503863, + "grad_norm": 0.25991290807724, + "learning_rate": 5e-06, + "loss": 0.9713, + "num_input_tokens_seen": 634673344, + "step": 1397, + "train_runtime": 102309.0901, + "train_tokens_per_second": 6203.489 + }, + { + "epoch": 1.6925528996679877, + "grad_norm": 0.23085501790046692, + "learning_rate": 5e-06, + "loss": 0.8837, + "num_input_tokens_seen": 635118792, + "step": 1398, + "train_runtime": 102383.7557, + "train_tokens_per_second": 6203.316 + }, + { + "epoch": 1.6937636564855891, + "grad_norm": 0.2376517653465271, + "learning_rate": 5e-06, + "loss": 0.9406, + "num_input_tokens_seen": 635574264, + "step": 1399, + "train_runtime": 102459.5006, + "train_tokens_per_second": 6203.176 + }, + { + "epoch": 1.6949744133031905, + "grad_norm": 0.25487491488456726, + "learning_rate": 5e-06, + "loss": 0.9089, + "num_input_tokens_seen": 636022216, + "step": 1400, + "train_runtime": 102534.4322, + "train_tokens_per_second": 6203.011 + }, + { + "epoch": 1.696185170120792, + "grad_norm": 0.2450874000787735, + "learning_rate": 5e-06, + "loss": 0.9227, + "num_input_tokens_seen": 636462544, + "step": 1401, + "train_runtime": 102607.6137, + "train_tokens_per_second": 6202.878 + }, + { + "epoch": 1.6973959269383934, + "grad_norm": 0.24227704107761383, + "learning_rate": 5e-06, + "loss": 0.9495, + "num_input_tokens_seen": 636916136, + "step": 1402, + "train_runtime": 102683.0063, + "train_tokens_per_second": 6202.741 + }, + { + "epoch": 1.6986066837559948, + "grad_norm": 0.24646668136119843, + "learning_rate": 5e-06, + "loss": 0.9612, + "num_input_tokens_seen": 637349120, + "step": 1403, + "train_runtime": 102754.4934, + "train_tokens_per_second": 6202.64 + }, + { + "epoch": 1.6998174405735962, + "grad_norm": 0.23192055523395538, + "learning_rate": 5e-06, + "loss": 0.9195, + "num_input_tokens_seen": 637790064, + "step": 1404, + "train_runtime": 102827.9317, + "train_tokens_per_second": 6202.498 + }, + { + "epoch": 1.7010281973911974, + "grad_norm": 0.25445666909217834, + "learning_rate": 5e-06, + "loss": 0.9757, + "num_input_tokens_seen": 638259320, + "step": 1405, + "train_runtime": 102906.6087, + "train_tokens_per_second": 6202.316 + }, + { + "epoch": 1.7022389542087988, + "grad_norm": 0.23562908172607422, + "learning_rate": 5e-06, + "loss": 0.9021, + "num_input_tokens_seen": 638743376, + "step": 1406, + "train_runtime": 102987.9705, + "train_tokens_per_second": 6202.116 + }, + { + "epoch": 1.7034497110264002, + "grad_norm": 0.26519039273262024, + "learning_rate": 5e-06, + "loss": 0.943, + "num_input_tokens_seen": 639190488, + "step": 1407, + "train_runtime": 103062.2706, + "train_tokens_per_second": 6201.983 + }, + { + "epoch": 1.7046604678440016, + "grad_norm": 0.24398094415664673, + "learning_rate": 5e-06, + "loss": 0.9283, + "num_input_tokens_seen": 639643104, + "step": 1408, + "train_runtime": 103137.9381, + "train_tokens_per_second": 6201.822 + }, + { + "epoch": 1.7058712246616028, + "grad_norm": 0.2703668475151062, + "learning_rate": 5e-06, + "loss": 0.9343, + "num_input_tokens_seen": 640089040, + "step": 1409, + "train_runtime": 103211.4765, + "train_tokens_per_second": 6201.724 + }, + { + "epoch": 1.7070819814792042, + "grad_norm": 0.2557445168495178, + "learning_rate": 5e-06, + "loss": 0.9697, + "num_input_tokens_seen": 640528208, + "step": 1410, + "train_runtime": 103284.178, + "train_tokens_per_second": 6201.61 + }, + { + "epoch": 1.7082927382968056, + "grad_norm": 0.2544682025909424, + "learning_rate": 5e-06, + "loss": 0.9409, + "num_input_tokens_seen": 640986416, + "step": 1411, + "train_runtime": 103359.9486, + "train_tokens_per_second": 6201.497 + }, + { + "epoch": 1.709503495114407, + "grad_norm": 0.25841024518013, + "learning_rate": 5e-06, + "loss": 0.9823, + "num_input_tokens_seen": 641422048, + "step": 1412, + "train_runtime": 103430.0233, + "train_tokens_per_second": 6201.507 + }, + { + "epoch": 1.7107142519320084, + "grad_norm": 0.23430770635604858, + "learning_rate": 5e-06, + "loss": 0.9422, + "num_input_tokens_seen": 641890512, + "step": 1413, + "train_runtime": 103504.2794, + "train_tokens_per_second": 6201.584 + }, + { + "epoch": 1.7119250087496098, + "grad_norm": 0.25403422117233276, + "learning_rate": 5e-06, + "loss": 0.9378, + "num_input_tokens_seen": 642351648, + "step": 1414, + "train_runtime": 103577.2489, + "train_tokens_per_second": 6201.667 + }, + { + "epoch": 1.7131357655672113, + "grad_norm": 0.22510449588298798, + "learning_rate": 5e-06, + "loss": 0.9182, + "num_input_tokens_seen": 642814728, + "step": 1415, + "train_runtime": 103650.3402, + "train_tokens_per_second": 6201.762 + }, + { + "epoch": 1.7143465223848127, + "grad_norm": 0.24203039705753326, + "learning_rate": 5e-06, + "loss": 0.9479, + "num_input_tokens_seen": 643279672, + "step": 1416, + "train_runtime": 103723.7474, + "train_tokens_per_second": 6201.855 + }, + { + "epoch": 1.715557279202414, + "grad_norm": 0.3101445436477661, + "learning_rate": 5e-06, + "loss": 0.9404, + "num_input_tokens_seen": 643718560, + "step": 1417, + "train_runtime": 103792.3388, + "train_tokens_per_second": 6201.985 + }, + { + "epoch": 1.7167680360200155, + "grad_norm": 0.23789572715759277, + "learning_rate": 5e-06, + "loss": 0.9216, + "num_input_tokens_seen": 644158072, + "step": 1418, + "train_runtime": 103864.8684, + "train_tokens_per_second": 6201.886 + }, + { + "epoch": 1.7179787928376167, + "grad_norm": 0.22444191575050354, + "learning_rate": 5e-06, + "loss": 0.887, + "num_input_tokens_seen": 644640400, + "step": 1419, + "train_runtime": 103943.7191, + "train_tokens_per_second": 6201.822 + }, + { + "epoch": 1.719189549655218, + "grad_norm": 0.24372327327728271, + "learning_rate": 5e-06, + "loss": 1.0014, + "num_input_tokens_seen": 645098072, + "step": 1420, + "train_runtime": 104019.5509, + "train_tokens_per_second": 6201.7 + }, + { + "epoch": 1.7204003064728195, + "grad_norm": 0.2408047616481781, + "learning_rate": 5e-06, + "loss": 0.9455, + "num_input_tokens_seen": 645547368, + "step": 1421, + "train_runtime": 104094.8084, + "train_tokens_per_second": 6201.533 + }, + { + "epoch": 1.7216110632904207, + "grad_norm": 0.23340767621994019, + "learning_rate": 5e-06, + "loss": 0.9614, + "num_input_tokens_seen": 646015000, + "step": 1422, + "train_runtime": 104173.3646, + "train_tokens_per_second": 6201.345 + }, + { + "epoch": 1.722821820108022, + "grad_norm": 0.24374446272850037, + "learning_rate": 5e-06, + "loss": 0.955, + "num_input_tokens_seen": 646457448, + "step": 1423, + "train_runtime": 104243.9382, + "train_tokens_per_second": 6201.391 + }, + { + "epoch": 1.7240325769256235, + "grad_norm": 0.2410658746957779, + "learning_rate": 5e-06, + "loss": 0.9503, + "num_input_tokens_seen": 646928080, + "step": 1424, + "train_runtime": 104319.3375, + "train_tokens_per_second": 6201.421 + }, + { + "epoch": 1.725243333743225, + "grad_norm": 0.26561877131462097, + "learning_rate": 5e-06, + "loss": 0.9843, + "num_input_tokens_seen": 647373424, + "step": 1425, + "train_runtime": 104390.2762, + "train_tokens_per_second": 6201.472 + }, + { + "epoch": 1.7264540905608263, + "grad_norm": 0.24555157124996185, + "learning_rate": 5e-06, + "loss": 0.9066, + "num_input_tokens_seen": 647839064, + "step": 1426, + "train_runtime": 104465.7848, + "train_tokens_per_second": 6201.447 + }, + { + "epoch": 1.7276648473784277, + "grad_norm": 0.26610177755355835, + "learning_rate": 5e-06, + "loss": 0.9413, + "num_input_tokens_seen": 648316536, + "step": 1427, + "train_runtime": 104544.7937, + "train_tokens_per_second": 6201.328 + }, + { + "epoch": 1.7288756041960291, + "grad_norm": 0.23927830159664154, + "learning_rate": 5e-06, + "loss": 0.9217, + "num_input_tokens_seen": 648750792, + "step": 1428, + "train_runtime": 104616.4109, + "train_tokens_per_second": 6201.234 + }, + { + "epoch": 1.7300863610136306, + "grad_norm": 0.2528975009918213, + "learning_rate": 5e-06, + "loss": 1.0172, + "num_input_tokens_seen": 649211056, + "step": 1429, + "train_runtime": 104691.6504, + "train_tokens_per_second": 6201.173 + }, + { + "epoch": 1.731297117831232, + "grad_norm": 0.24375270307064056, + "learning_rate": 5e-06, + "loss": 0.9071, + "num_input_tokens_seen": 649664160, + "step": 1430, + "train_runtime": 104761.4227, + "train_tokens_per_second": 6201.368 + }, + { + "epoch": 1.7325078746488334, + "grad_norm": 0.2401747703552246, + "learning_rate": 5e-06, + "loss": 0.9436, + "num_input_tokens_seen": 650120632, + "step": 1431, + "train_runtime": 104830.7549, + "train_tokens_per_second": 6201.621 + }, + { + "epoch": 1.7337186314664346, + "grad_norm": 0.2560153901576996, + "learning_rate": 5e-06, + "loss": 0.9562, + "num_input_tokens_seen": 650585600, + "step": 1432, + "train_runtime": 104901.1048, + "train_tokens_per_second": 6201.895 + }, + { + "epoch": 1.734929388284036, + "grad_norm": 0.22828106582164764, + "learning_rate": 5e-06, + "loss": 0.9867, + "num_input_tokens_seen": 651054272, + "step": 1433, + "train_runtime": 104971.5832, + "train_tokens_per_second": 6202.195 + }, + { + "epoch": 1.7361401451016374, + "grad_norm": 0.2554665207862854, + "learning_rate": 5e-06, + "loss": 0.9303, + "num_input_tokens_seen": 651531888, + "step": 1434, + "train_runtime": 105043.735, + "train_tokens_per_second": 6202.482 + }, + { + "epoch": 1.7373509019192386, + "grad_norm": 0.23532572388648987, + "learning_rate": 5e-06, + "loss": 0.9333, + "num_input_tokens_seen": 651989392, + "step": 1435, + "train_runtime": 105112.6174, + "train_tokens_per_second": 6202.77 + }, + { + "epoch": 1.73856165873684, + "grad_norm": 0.22667627036571503, + "learning_rate": 5e-06, + "loss": 0.8893, + "num_input_tokens_seen": 652450416, + "step": 1436, + "train_runtime": 105181.94, + "train_tokens_per_second": 6203.065 + }, + { + "epoch": 1.7397724155544414, + "grad_norm": 0.2507862150669098, + "learning_rate": 5e-06, + "loss": 0.966, + "num_input_tokens_seen": 652888200, + "step": 1437, + "train_runtime": 105248.017, + "train_tokens_per_second": 6203.33 + }, + { + "epoch": 1.7409831723720428, + "grad_norm": 0.23755121231079102, + "learning_rate": 5e-06, + "loss": 0.9416, + "num_input_tokens_seen": 653353944, + "step": 1438, + "train_runtime": 105318.4006, + "train_tokens_per_second": 6203.607 + }, + { + "epoch": 1.7421939291896442, + "grad_norm": 0.25960105657577515, + "learning_rate": 5e-06, + "loss": 0.9615, + "num_input_tokens_seen": 653799776, + "step": 1439, + "train_runtime": 105385.6269, + "train_tokens_per_second": 6203.88 + }, + { + "epoch": 1.7434046860072456, + "grad_norm": 0.2501721680164337, + "learning_rate": 5e-06, + "loss": 0.961, + "num_input_tokens_seen": 654277144, + "step": 1440, + "train_runtime": 105457.5792, + "train_tokens_per_second": 6204.174 + }, + { + "epoch": 1.744615442824847, + "grad_norm": 0.26541006565093994, + "learning_rate": 5e-06, + "loss": 0.9524, + "num_input_tokens_seen": 654704176, + "step": 1441, + "train_runtime": 105521.9793, + "train_tokens_per_second": 6204.434 + }, + { + "epoch": 1.7458261996424485, + "grad_norm": 0.2498820275068283, + "learning_rate": 5e-06, + "loss": 0.9399, + "num_input_tokens_seen": 655154280, + "step": 1442, + "train_runtime": 105589.5581, + "train_tokens_per_second": 6204.726 + }, + { + "epoch": 1.7470369564600499, + "grad_norm": 0.2539311647415161, + "learning_rate": 5e-06, + "loss": 1.0376, + "num_input_tokens_seen": 655598176, + "step": 1443, + "train_runtime": 105656.176, + "train_tokens_per_second": 6205.015 + }, + { + "epoch": 1.7482477132776513, + "grad_norm": 0.2521834969520569, + "learning_rate": 5e-06, + "loss": 0.9695, + "num_input_tokens_seen": 656049168, + "step": 1444, + "train_runtime": 105724.1835, + "train_tokens_per_second": 6205.29 + }, + { + "epoch": 1.7494584700952525, + "grad_norm": 0.2886483073234558, + "learning_rate": 5e-06, + "loss": 0.9608, + "num_input_tokens_seen": 656492336, + "step": 1445, + "train_runtime": 105790.6709, + "train_tokens_per_second": 6205.579 + }, + { + "epoch": 1.7506692269128539, + "grad_norm": 0.2557690143585205, + "learning_rate": 5e-06, + "loss": 0.9915, + "num_input_tokens_seen": 656921264, + "step": 1446, + "train_runtime": 105855.2716, + "train_tokens_per_second": 6205.844 + }, + { + "epoch": 1.7518799837304553, + "grad_norm": 0.23341627418994904, + "learning_rate": 5e-06, + "loss": 0.956, + "num_input_tokens_seen": 657390344, + "step": 1447, + "train_runtime": 105925.7109, + "train_tokens_per_second": 6206.145 + }, + { + "epoch": 1.7530907405480567, + "grad_norm": 0.23532052338123322, + "learning_rate": 5e-06, + "loss": 0.9079, + "num_input_tokens_seen": 657845536, + "step": 1448, + "train_runtime": 105994.3759, + "train_tokens_per_second": 6206.419 + }, + { + "epoch": 1.7543014973656579, + "grad_norm": 0.2501102089881897, + "learning_rate": 5e-06, + "loss": 0.9369, + "num_input_tokens_seen": 658305072, + "step": 1449, + "train_runtime": 106063.849, + "train_tokens_per_second": 6206.687 + }, + { + "epoch": 1.7555122541832593, + "grad_norm": 0.22593450546264648, + "learning_rate": 5e-06, + "loss": 0.9144, + "num_input_tokens_seen": 658774152, + "step": 1450, + "train_runtime": 106134.342, + "train_tokens_per_second": 6206.984 + }, + { + "epoch": 1.7567230110008607, + "grad_norm": 0.23350222408771515, + "learning_rate": 5e-06, + "loss": 0.9024, + "num_input_tokens_seen": 659224576, + "step": 1451, + "train_runtime": 106201.8802, + "train_tokens_per_second": 6207.278 + }, + { + "epoch": 1.7579337678184621, + "grad_norm": 0.23016194999217987, + "learning_rate": 5e-06, + "loss": 0.9043, + "num_input_tokens_seen": 659686856, + "step": 1452, + "train_runtime": 106272.1954, + "train_tokens_per_second": 6207.521 + }, + { + "epoch": 1.7591445246360635, + "grad_norm": 0.21941740810871124, + "learning_rate": 5e-06, + "loss": 0.9029, + "num_input_tokens_seen": 660168448, + "step": 1453, + "train_runtime": 106345.1484, + "train_tokens_per_second": 6207.791 + }, + { + "epoch": 1.760355281453665, + "grad_norm": 0.2541714310646057, + "learning_rate": 5e-06, + "loss": 0.9459, + "num_input_tokens_seen": 660618064, + "step": 1454, + "train_runtime": 106413.9117, + "train_tokens_per_second": 6208.005 + }, + { + "epoch": 1.7615660382712663, + "grad_norm": 0.25230884552001953, + "learning_rate": 5e-06, + "loss": 0.9856, + "num_input_tokens_seen": 661060488, + "step": 1455, + "train_runtime": 106480.6042, + "train_tokens_per_second": 6208.271 + }, + { + "epoch": 1.7627767950888678, + "grad_norm": 0.23480939865112305, + "learning_rate": 5e-06, + "loss": 0.9658, + "num_input_tokens_seen": 661510016, + "step": 1456, + "train_runtime": 106547.8374, + "train_tokens_per_second": 6208.573 + }, + { + "epoch": 1.7639875519064692, + "grad_norm": 0.22851701080799103, + "learning_rate": 5e-06, + "loss": 0.9333, + "num_input_tokens_seen": 661986792, + "step": 1457, + "train_runtime": 106619.9306, + "train_tokens_per_second": 6208.847 + }, + { + "epoch": 1.7651983087240706, + "grad_norm": 0.24522744119167328, + "learning_rate": 5e-06, + "loss": 0.9374, + "num_input_tokens_seen": 662466720, + "step": 1458, + "train_runtime": 106692.684, + "train_tokens_per_second": 6209.111 + }, + { + "epoch": 1.7664090655416718, + "grad_norm": 0.2213152050971985, + "learning_rate": 5e-06, + "loss": 0.9099, + "num_input_tokens_seen": 662937560, + "step": 1459, + "train_runtime": 106763.8514, + "train_tokens_per_second": 6209.382 + }, + { + "epoch": 1.7676198223592732, + "grad_norm": 0.23350690305233002, + "learning_rate": 5e-06, + "loss": 0.9284, + "num_input_tokens_seen": 663389520, + "step": 1460, + "train_runtime": 106831.6837, + "train_tokens_per_second": 6209.67 + }, + { + "epoch": 1.7688305791768746, + "grad_norm": 0.26306286454200745, + "learning_rate": 5e-06, + "loss": 0.9014, + "num_input_tokens_seen": 663850792, + "step": 1461, + "train_runtime": 106901.4033, + "train_tokens_per_second": 6209.935 + }, + { + "epoch": 1.7700413359944758, + "grad_norm": 0.252805233001709, + "learning_rate": 5e-06, + "loss": 0.929, + "num_input_tokens_seen": 664298992, + "step": 1462, + "train_runtime": 106968.6773, + "train_tokens_per_second": 6210.22 + }, + { + "epoch": 1.7712520928120772, + "grad_norm": 0.25127750635147095, + "learning_rate": 5e-06, + "loss": 0.9607, + "num_input_tokens_seen": 664755584, + "step": 1463, + "train_runtime": 107037.4522, + "train_tokens_per_second": 6210.495 + }, + { + "epoch": 1.7724628496296786, + "grad_norm": 0.24411077797412872, + "learning_rate": 5e-06, + "loss": 0.882, + "num_input_tokens_seen": 665225728, + "step": 1464, + "train_runtime": 107108.6446, + "train_tokens_per_second": 6210.757 + }, + { + "epoch": 1.77367360644728, + "grad_norm": 0.25176945328712463, + "learning_rate": 5e-06, + "loss": 1.0087, + "num_input_tokens_seen": 665670928, + "step": 1465, + "train_runtime": 107175.8114, + "train_tokens_per_second": 6211.018 + }, + { + "epoch": 1.7748843632648814, + "grad_norm": 0.22492913901805878, + "learning_rate": 5e-06, + "loss": 0.9548, + "num_input_tokens_seen": 666145592, + "step": 1466, + "train_runtime": 107247.756, + "train_tokens_per_second": 6211.278 + }, + { + "epoch": 1.7760951200824828, + "grad_norm": 0.29126158356666565, + "learning_rate": 5e-06, + "loss": 0.9327, + "num_input_tokens_seen": 666623936, + "step": 1467, + "train_runtime": 107320.1773, + "train_tokens_per_second": 6211.543 + }, + { + "epoch": 1.7773058769000842, + "grad_norm": 0.2463548630475998, + "learning_rate": 5e-06, + "loss": 0.9628, + "num_input_tokens_seen": 667069528, + "step": 1468, + "train_runtime": 107387.3958, + "train_tokens_per_second": 6211.805 + }, + { + "epoch": 1.7785166337176856, + "grad_norm": 0.2515462040901184, + "learning_rate": 5e-06, + "loss": 0.9422, + "num_input_tokens_seen": 667527784, + "step": 1469, + "train_runtime": 107456.5697, + "train_tokens_per_second": 6212.07 + }, + { + "epoch": 1.779727390535287, + "grad_norm": 0.24735090136528015, + "learning_rate": 5e-06, + "loss": 0.9324, + "num_input_tokens_seen": 668000200, + "step": 1470, + "train_runtime": 107528.1444, + "train_tokens_per_second": 6212.329 + }, + { + "epoch": 1.7809381473528885, + "grad_norm": 0.2488315999507904, + "learning_rate": 5e-06, + "loss": 0.9859, + "num_input_tokens_seen": 668449400, + "step": 1471, + "train_runtime": 107595.4881, + "train_tokens_per_second": 6212.616 + }, + { + "epoch": 1.7821489041704897, + "grad_norm": 0.22948361933231354, + "learning_rate": 5e-06, + "loss": 0.9246, + "num_input_tokens_seen": 668907592, + "step": 1472, + "train_runtime": 107664.6957, + "train_tokens_per_second": 6212.878 + }, + { + "epoch": 1.783359660988091, + "grad_norm": 0.30683404207229614, + "learning_rate": 5e-06, + "loss": 0.9441, + "num_input_tokens_seen": 669348744, + "step": 1473, + "train_runtime": 107730.9985, + "train_tokens_per_second": 6213.149 + }, + { + "epoch": 1.7845704178056925, + "grad_norm": 0.2653786242008209, + "learning_rate": 5e-06, + "loss": 0.9284, + "num_input_tokens_seen": 669793168, + "step": 1474, + "train_runtime": 107797.5952, + "train_tokens_per_second": 6213.433 + }, + { + "epoch": 1.7857811746232937, + "grad_norm": 0.23417231440544128, + "learning_rate": 5e-06, + "loss": 0.926, + "num_input_tokens_seen": 670254976, + "step": 1475, + "train_runtime": 107867.6938, + "train_tokens_per_second": 6213.677 + }, + { + "epoch": 1.786991931440895, + "grad_norm": 0.23506613075733185, + "learning_rate": 5e-06, + "loss": 0.976, + "num_input_tokens_seen": 670697576, + "step": 1476, + "train_runtime": 107934.2732, + "train_tokens_per_second": 6213.944 + }, + { + "epoch": 1.7882026882584965, + "grad_norm": 0.29338982701301575, + "learning_rate": 5e-06, + "loss": 0.9601, + "num_input_tokens_seen": 671162736, + "step": 1477, + "train_runtime": 108004.5346, + "train_tokens_per_second": 6214.209 + }, + { + "epoch": 1.789413445076098, + "grad_norm": 0.26886627078056335, + "learning_rate": 5e-06, + "loss": 0.8709, + "num_input_tokens_seen": 671637992, + "step": 1478, + "train_runtime": 108076.0457, + "train_tokens_per_second": 6214.495 + }, + { + "epoch": 1.7906242018936993, + "grad_norm": 0.23638774454593658, + "learning_rate": 5e-06, + "loss": 0.9029, + "num_input_tokens_seen": 672097800, + "step": 1479, + "train_runtime": 108145.3648, + "train_tokens_per_second": 6214.763 + }, + { + "epoch": 1.7918349587113007, + "grad_norm": 0.23951123654842377, + "learning_rate": 5e-06, + "loss": 0.9188, + "num_input_tokens_seen": 672551744, + "step": 1480, + "train_runtime": 108213.5008, + "train_tokens_per_second": 6215.045 + }, + { + "epoch": 1.7930457155289021, + "grad_norm": 0.2542056739330292, + "learning_rate": 5e-06, + "loss": 0.9416, + "num_input_tokens_seen": 673008736, + "step": 1481, + "train_runtime": 108281.9707, + "train_tokens_per_second": 6215.335 + }, + { + "epoch": 1.7942564723465035, + "grad_norm": 0.2511388659477234, + "learning_rate": 5e-06, + "loss": 0.9082, + "num_input_tokens_seen": 673477648, + "step": 1482, + "train_runtime": 108351.7157, + "train_tokens_per_second": 6215.662 + }, + { + "epoch": 1.795467229164105, + "grad_norm": 0.23240311443805695, + "learning_rate": 5e-06, + "loss": 0.8896, + "num_input_tokens_seen": 673924456, + "step": 1483, + "train_runtime": 108417.5016, + "train_tokens_per_second": 6216.012 + }, + { + "epoch": 1.7966779859817064, + "grad_norm": 0.2410743683576584, + "learning_rate": 5e-06, + "loss": 0.9358, + "num_input_tokens_seen": 674378912, + "step": 1484, + "train_runtime": 108484.6044, + "train_tokens_per_second": 6216.356 + }, + { + "epoch": 1.7978887427993075, + "grad_norm": 0.2558565139770508, + "learning_rate": 5e-06, + "loss": 0.9506, + "num_input_tokens_seen": 674838040, + "step": 1485, + "train_runtime": 108552.0714, + "train_tokens_per_second": 6216.722 + }, + { + "epoch": 1.799099499616909, + "grad_norm": 0.2397555112838745, + "learning_rate": 5e-06, + "loss": 0.9175, + "num_input_tokens_seen": 675289776, + "step": 1486, + "train_runtime": 108618.7287, + "train_tokens_per_second": 6217.066 + }, + { + "epoch": 1.8003102564345104, + "grad_norm": 0.22383016347885132, + "learning_rate": 5e-06, + "loss": 0.9353, + "num_input_tokens_seen": 675747848, + "step": 1487, + "train_runtime": 108686.2897, + "train_tokens_per_second": 6217.416 + }, + { + "epoch": 1.8015210132521118, + "grad_norm": 0.2571597397327423, + "learning_rate": 5e-06, + "loss": 0.9263, + "num_input_tokens_seen": 676171288, + "step": 1488, + "train_runtime": 108748.7096, + "train_tokens_per_second": 6217.741 + }, + { + "epoch": 1.802731770069713, + "grad_norm": 0.25441011786460876, + "learning_rate": 5e-06, + "loss": 1.0048, + "num_input_tokens_seen": 676630424, + "step": 1489, + "train_runtime": 108816.6465, + "train_tokens_per_second": 6218.078 + }, + { + "epoch": 1.8039425268873144, + "grad_norm": 0.24836276471614838, + "learning_rate": 5e-06, + "loss": 0.9633, + "num_input_tokens_seen": 677067784, + "step": 1490, + "train_runtime": 108881.3096, + "train_tokens_per_second": 6218.402 + }, + { + "epoch": 1.8051532837049158, + "grad_norm": 0.24541418254375458, + "learning_rate": 5e-06, + "loss": 0.9088, + "num_input_tokens_seen": 677518960, + "step": 1491, + "train_runtime": 108948.056, + "train_tokens_per_second": 6218.734 + }, + { + "epoch": 1.8063640405225172, + "grad_norm": 0.2278079390525818, + "learning_rate": 5e-06, + "loss": 0.9275, + "num_input_tokens_seen": 677983656, + "step": 1492, + "train_runtime": 109016.4926, + "train_tokens_per_second": 6219.093 + }, + { + "epoch": 1.8075747973401186, + "grad_norm": 0.23876270651817322, + "learning_rate": 5e-06, + "loss": 0.926, + "num_input_tokens_seen": 678441648, + "step": 1493, + "train_runtime": 109083.9738, + "train_tokens_per_second": 6219.444 + }, + { + "epoch": 1.80878555415772, + "grad_norm": 0.25853845477104187, + "learning_rate": 5e-06, + "loss": 0.9695, + "num_input_tokens_seen": 678904952, + "step": 1494, + "train_runtime": 109152.4011, + "train_tokens_per_second": 6219.789 + }, + { + "epoch": 1.8099963109753214, + "grad_norm": 0.2297954559326172, + "learning_rate": 5e-06, + "loss": 0.8819, + "num_input_tokens_seen": 679350488, + "step": 1495, + "train_runtime": 109217.9803, + "train_tokens_per_second": 6220.134 + }, + { + "epoch": 1.8112070677929228, + "grad_norm": 0.2526834309101105, + "learning_rate": 5e-06, + "loss": 0.913, + "num_input_tokens_seen": 679804296, + "step": 1496, + "train_runtime": 109285.0115, + "train_tokens_per_second": 6220.471 + }, + { + "epoch": 1.8124178246105243, + "grad_norm": 0.25374501943588257, + "learning_rate": 5e-06, + "loss": 0.9266, + "num_input_tokens_seen": 680253888, + "step": 1497, + "train_runtime": 109351.4643, + "train_tokens_per_second": 6220.803 + }, + { + "epoch": 1.8136285814281257, + "grad_norm": 0.25926515460014343, + "learning_rate": 5e-06, + "loss": 0.9747, + "num_input_tokens_seen": 680706808, + "step": 1498, + "train_runtime": 109417.9756, + "train_tokens_per_second": 6221.161 + }, + { + "epoch": 1.8148393382457269, + "grad_norm": 0.24243789911270142, + "learning_rate": 5e-06, + "loss": 0.8814, + "num_input_tokens_seen": 681162152, + "step": 1499, + "train_runtime": 109485.1447, + "train_tokens_per_second": 6221.503 + }, + { + "epoch": 1.8160500950633283, + "grad_norm": 0.23476150631904602, + "learning_rate": 5e-06, + "loss": 0.8957, + "num_input_tokens_seen": 681594912, + "step": 1500, + "train_runtime": 109548.882, + "train_tokens_per_second": 6221.834 + }, + { + "epoch": 1.8172608518809297, + "grad_norm": 0.23926031589508057, + "learning_rate": 5e-06, + "loss": 0.9437, + "num_input_tokens_seen": 682050248, + "step": 1501, + "train_runtime": 109615.8898, + "train_tokens_per_second": 6222.184 + }, + { + "epoch": 1.8184716086985309, + "grad_norm": 0.23174121975898743, + "learning_rate": 5e-06, + "loss": 0.9173, + "num_input_tokens_seen": 682487560, + "step": 1502, + "train_runtime": 109680.61, + "train_tokens_per_second": 6222.5 + }, + { + "epoch": 1.8196823655161323, + "grad_norm": 0.23543839156627655, + "learning_rate": 5e-06, + "loss": 0.9421, + "num_input_tokens_seen": 682928504, + "step": 1503, + "train_runtime": 109745.5472, + "train_tokens_per_second": 6222.836 + }, + { + "epoch": 1.8208931223337337, + "grad_norm": 0.2303183227777481, + "learning_rate": 5e-06, + "loss": 0.9095, + "num_input_tokens_seen": 683379408, + "step": 1504, + "train_runtime": 109812.3345, + "train_tokens_per_second": 6223.157 + }, + { + "epoch": 1.822103879151335, + "grad_norm": 0.24201270937919617, + "learning_rate": 5e-06, + "loss": 0.9532, + "num_input_tokens_seen": 683828072, + "step": 1505, + "train_runtime": 109878.4824, + "train_tokens_per_second": 6223.494 + }, + { + "epoch": 1.8233146359689365, + "grad_norm": 0.24431242048740387, + "learning_rate": 5e-06, + "loss": 0.967, + "num_input_tokens_seen": 684271776, + "step": 1506, + "train_runtime": 109945.1488, + "train_tokens_per_second": 6223.756 + }, + { + "epoch": 1.824525392786538, + "grad_norm": 0.24904708564281464, + "learning_rate": 5e-06, + "loss": 0.9425, + "num_input_tokens_seen": 684713936, + "step": 1507, + "train_runtime": 110011.8091, + "train_tokens_per_second": 6224.004 + }, + { + "epoch": 1.8257361496041393, + "grad_norm": 0.24164269864559174, + "learning_rate": 5e-06, + "loss": 0.9442, + "num_input_tokens_seen": 685180664, + "step": 1508, + "train_runtime": 110080.6343, + "train_tokens_per_second": 6224.352 + }, + { + "epoch": 1.8269469064217407, + "grad_norm": 0.23471519351005554, + "learning_rate": 5e-06, + "loss": 0.9217, + "num_input_tokens_seen": 685664952, + "step": 1509, + "train_runtime": 110151.978, + "train_tokens_per_second": 6224.718 + }, + { + "epoch": 1.8281576632393421, + "grad_norm": 0.22866208851337433, + "learning_rate": 5e-06, + "loss": 0.9515, + "num_input_tokens_seen": 686116056, + "step": 1510, + "train_runtime": 110217.8911, + "train_tokens_per_second": 6225.088 + }, + { + "epoch": 1.8293684200569436, + "grad_norm": 0.24192233383655548, + "learning_rate": 5e-06, + "loss": 0.8977, + "num_input_tokens_seen": 686558696, + "step": 1511, + "train_runtime": 110283.5539, + "train_tokens_per_second": 6225.395 + }, + { + "epoch": 1.8305791768745447, + "grad_norm": 0.2574458718299866, + "learning_rate": 5e-06, + "loss": 0.9646, + "num_input_tokens_seen": 687040920, + "step": 1512, + "train_runtime": 110355.252, + "train_tokens_per_second": 6225.72 + }, + { + "epoch": 1.8317899336921462, + "grad_norm": 0.23180226981639862, + "learning_rate": 5e-06, + "loss": 0.9015, + "num_input_tokens_seen": 687491464, + "step": 1513, + "train_runtime": 110421.6831, + "train_tokens_per_second": 6226.055 + }, + { + "epoch": 1.8330006905097476, + "grad_norm": 0.2546160817146301, + "learning_rate": 5e-06, + "loss": 0.9606, + "num_input_tokens_seen": 687938040, + "step": 1514, + "train_runtime": 110487.8498, + "train_tokens_per_second": 6226.368 + }, + { + "epoch": 1.8342114473273488, + "grad_norm": 0.2605888545513153, + "learning_rate": 5e-06, + "loss": 0.9774, + "num_input_tokens_seen": 688393944, + "step": 1515, + "train_runtime": 110554.8225, + "train_tokens_per_second": 6226.72 + }, + { + "epoch": 1.8354222041449502, + "grad_norm": 0.24372106790542603, + "learning_rate": 5e-06, + "loss": 0.9394, + "num_input_tokens_seen": 688853280, + "step": 1516, + "train_runtime": 110623.2037, + "train_tokens_per_second": 6227.023 + }, + { + "epoch": 1.8366329609625516, + "grad_norm": 0.2518022656440735, + "learning_rate": 5e-06, + "loss": 0.9911, + "num_input_tokens_seen": 689322800, + "step": 1517, + "train_runtime": 110692.3372, + "train_tokens_per_second": 6227.376 + }, + { + "epoch": 1.837843717780153, + "grad_norm": 0.2612314522266388, + "learning_rate": 5e-06, + "loss": 0.9543, + "num_input_tokens_seen": 689766792, + "step": 1518, + "train_runtime": 110757.7501, + "train_tokens_per_second": 6227.707 + }, + { + "epoch": 1.8390544745977544, + "grad_norm": 0.2807573974132538, + "learning_rate": 5e-06, + "loss": 0.9692, + "num_input_tokens_seen": 690187168, + "step": 1519, + "train_runtime": 110819.3816, + "train_tokens_per_second": 6228.037 + }, + { + "epoch": 1.8402652314153558, + "grad_norm": 0.24547508358955383, + "learning_rate": 5e-06, + "loss": 0.8983, + "num_input_tokens_seen": 690622672, + "step": 1520, + "train_runtime": 110883.3109, + "train_tokens_per_second": 6228.373 + }, + { + "epoch": 1.8414759882329572, + "grad_norm": 0.25869348645210266, + "learning_rate": 5e-06, + "loss": 0.9573, + "num_input_tokens_seen": 691070096, + "step": 1521, + "train_runtime": 110949.2534, + "train_tokens_per_second": 6228.704 + }, + { + "epoch": 1.8426867450505586, + "grad_norm": 0.2718667685985565, + "learning_rate": 5e-06, + "loss": 0.943, + "num_input_tokens_seen": 691528656, + "step": 1522, + "train_runtime": 111016.9813, + "train_tokens_per_second": 6229.035 + }, + { + "epoch": 1.84389750186816, + "grad_norm": 0.23198598623275757, + "learning_rate": 5e-06, + "loss": 0.9308, + "num_input_tokens_seen": 692005376, + "step": 1523, + "train_runtime": 111087.076, + "train_tokens_per_second": 6229.396 + }, + { + "epoch": 1.8451082586857614, + "grad_norm": 0.2525101602077484, + "learning_rate": 5e-06, + "loss": 0.9888, + "num_input_tokens_seen": 692443744, + "step": 1524, + "train_runtime": 111151.9146, + "train_tokens_per_second": 6229.706 + }, + { + "epoch": 1.8463190155033626, + "grad_norm": 0.24284860491752625, + "learning_rate": 5e-06, + "loss": 0.9236, + "num_input_tokens_seen": 692903640, + "step": 1525, + "train_runtime": 111219.6955, + "train_tokens_per_second": 6230.044 + }, + { + "epoch": 1.847529772320964, + "grad_norm": 0.2588494122028351, + "learning_rate": 5e-06, + "loss": 0.9442, + "num_input_tokens_seen": 693364616, + "step": 1526, + "train_runtime": 111287.6738, + "train_tokens_per_second": 6230.381 + }, + { + "epoch": 1.8487405291385655, + "grad_norm": 0.25384098291397095, + "learning_rate": 5e-06, + "loss": 0.9606, + "num_input_tokens_seen": 693820112, + "step": 1527, + "train_runtime": 111355.0786, + "train_tokens_per_second": 6230.7 + }, + { + "epoch": 1.8499512859561669, + "grad_norm": 0.23675884306430817, + "learning_rate": 5e-06, + "loss": 0.9327, + "num_input_tokens_seen": 694280840, + "step": 1528, + "train_runtime": 111423.4256, + "train_tokens_per_second": 6231.013 + }, + { + "epoch": 1.851162042773768, + "grad_norm": 0.2325180619955063, + "learning_rate": 5e-06, + "loss": 0.9156, + "num_input_tokens_seen": 694738776, + "step": 1529, + "train_runtime": 111490.9224, + "train_tokens_per_second": 6231.348 + }, + { + "epoch": 1.8523727995913695, + "grad_norm": 0.25472497940063477, + "learning_rate": 5e-06, + "loss": 0.9057, + "num_input_tokens_seen": 695188272, + "step": 1530, + "train_runtime": 111557.6013, + "train_tokens_per_second": 6231.653 + }, + { + "epoch": 1.8535835564089709, + "grad_norm": 0.23478816449642181, + "learning_rate": 5e-06, + "loss": 0.8944, + "num_input_tokens_seen": 695623080, + "step": 1531, + "train_runtime": 111621.743, + "train_tokens_per_second": 6231.968 + }, + { + "epoch": 1.8547943132265723, + "grad_norm": 0.2601574659347534, + "learning_rate": 5e-06, + "loss": 0.9208, + "num_input_tokens_seen": 696081624, + "step": 1532, + "train_runtime": 111689.1774, + "train_tokens_per_second": 6232.31 + }, + { + "epoch": 1.8560050700441737, + "grad_norm": 0.26812466979026794, + "learning_rate": 5e-06, + "loss": 0.9187, + "num_input_tokens_seen": 696557776, + "step": 1533, + "train_runtime": 111759.787, + "train_tokens_per_second": 6232.633 + }, + { + "epoch": 1.857215826861775, + "grad_norm": 0.28780001401901245, + "learning_rate": 5e-06, + "loss": 0.8971, + "num_input_tokens_seen": 697024808, + "step": 1534, + "train_runtime": 111828.8573, + "train_tokens_per_second": 6232.96 + }, + { + "epoch": 1.8584265836793765, + "grad_norm": 0.23128759860992432, + "learning_rate": 5e-06, + "loss": 0.9624, + "num_input_tokens_seen": 697506680, + "step": 1535, + "train_runtime": 111899.9207, + "train_tokens_per_second": 6233.308 + }, + { + "epoch": 1.859637340496978, + "grad_norm": 0.2368602603673935, + "learning_rate": 5e-06, + "loss": 0.9405, + "num_input_tokens_seen": 697965088, + "step": 1536, + "train_runtime": 111967.9468, + "train_tokens_per_second": 6233.615 + }, + { + "epoch": 1.8608480973145793, + "grad_norm": 0.24492254853248596, + "learning_rate": 5e-06, + "loss": 0.9242, + "num_input_tokens_seen": 698401104, + "step": 1537, + "train_runtime": 112032.4048, + "train_tokens_per_second": 6233.92 + }, + { + "epoch": 1.8620588541321808, + "grad_norm": 0.26115724444389343, + "learning_rate": 5e-06, + "loss": 0.9314, + "num_input_tokens_seen": 698857392, + "step": 1538, + "train_runtime": 112100.3102, + "train_tokens_per_second": 6234.215 + }, + { + "epoch": 1.863269610949782, + "grad_norm": 0.23800967633724213, + "learning_rate": 5e-06, + "loss": 0.9329, + "num_input_tokens_seen": 699332608, + "step": 1539, + "train_runtime": 112170.2291, + "train_tokens_per_second": 6234.565 + }, + { + "epoch": 1.8644803677673834, + "grad_norm": 0.2537146210670471, + "learning_rate": 5e-06, + "loss": 0.9345, + "num_input_tokens_seen": 699789840, + "step": 1540, + "train_runtime": 112238.3256, + "train_tokens_per_second": 6234.856 + }, + { + "epoch": 1.8656911245849848, + "grad_norm": 0.23815041780471802, + "learning_rate": 5e-06, + "loss": 0.9216, + "num_input_tokens_seen": 700250208, + "step": 1541, + "train_runtime": 112306.045, + "train_tokens_per_second": 6235.196 + }, + { + "epoch": 1.866901881402586, + "grad_norm": 0.2275908887386322, + "learning_rate": 5e-06, + "loss": 0.9289, + "num_input_tokens_seen": 700706192, + "step": 1542, + "train_runtime": 112373.3647, + "train_tokens_per_second": 6235.518 + }, + { + "epoch": 1.8681126382201874, + "grad_norm": 0.24416327476501465, + "learning_rate": 5e-06, + "loss": 0.9387, + "num_input_tokens_seen": 701172296, + "step": 1543, + "train_runtime": 112441.8, + "train_tokens_per_second": 6235.869 + }, + { + "epoch": 1.8693233950377888, + "grad_norm": 0.23080092668533325, + "learning_rate": 5e-06, + "loss": 0.8832, + "num_input_tokens_seen": 701636096, + "step": 1544, + "train_runtime": 112510.6891, + "train_tokens_per_second": 6236.173 + }, + { + "epoch": 1.8705341518553902, + "grad_norm": 0.2627670466899872, + "learning_rate": 5e-06, + "loss": 0.9469, + "num_input_tokens_seen": 702085912, + "step": 1545, + "train_runtime": 112577.5052, + "train_tokens_per_second": 6236.467 + }, + { + "epoch": 1.8717449086729916, + "grad_norm": 0.2511466145515442, + "learning_rate": 5e-06, + "loss": 0.914, + "num_input_tokens_seen": 702549976, + "step": 1546, + "train_runtime": 112645.979, + "train_tokens_per_second": 6236.796 + }, + { + "epoch": 1.872955665490593, + "grad_norm": 0.23643608391284943, + "learning_rate": 5e-06, + "loss": 0.9381, + "num_input_tokens_seen": 703007240, + "step": 1547, + "train_runtime": 112713.4614, + "train_tokens_per_second": 6237.119 + }, + { + "epoch": 1.8741664223081944, + "grad_norm": 0.2743590474128723, + "learning_rate": 5e-06, + "loss": 1.048, + "num_input_tokens_seen": 703458040, + "step": 1548, + "train_runtime": 112779.5769, + "train_tokens_per_second": 6237.459 + }, + { + "epoch": 1.8753771791257958, + "grad_norm": 0.2364722192287445, + "learning_rate": 5e-06, + "loss": 0.9382, + "num_input_tokens_seen": 703914896, + "step": 1549, + "train_runtime": 112846.9643, + "train_tokens_per_second": 6237.783 + }, + { + "epoch": 1.8765879359433972, + "grad_norm": 0.23627513647079468, + "learning_rate": 5e-06, + "loss": 0.9412, + "num_input_tokens_seen": 704351776, + "step": 1550, + "train_runtime": 112911.1758, + "train_tokens_per_second": 6238.105 + }, + { + "epoch": 1.8777986927609986, + "grad_norm": 0.2537660002708435, + "learning_rate": 5e-06, + "loss": 0.9351, + "num_input_tokens_seen": 704778064, + "step": 1551, + "train_runtime": 112973.6432, + "train_tokens_per_second": 6238.429 + }, + { + "epoch": 1.8790094495785998, + "grad_norm": 0.2588886022567749, + "learning_rate": 5e-06, + "loss": 0.9646, + "num_input_tokens_seen": 705228264, + "step": 1552, + "train_runtime": 113040.2615, + "train_tokens_per_second": 6238.735 + }, + { + "epoch": 1.8802202063962012, + "grad_norm": 0.24146287143230438, + "learning_rate": 5e-06, + "loss": 0.8843, + "num_input_tokens_seen": 705664728, + "step": 1553, + "train_runtime": 113105.0242, + "train_tokens_per_second": 6239.022 + }, + { + "epoch": 1.8814309632138027, + "grad_norm": 0.2611408233642578, + "learning_rate": 5e-06, + "loss": 0.9507, + "num_input_tokens_seen": 706140544, + "step": 1554, + "train_runtime": 113175.4725, + "train_tokens_per_second": 6239.343 + }, + { + "epoch": 1.8826417200314038, + "grad_norm": 0.24252241849899292, + "learning_rate": 5e-06, + "loss": 0.8965, + "num_input_tokens_seen": 706584696, + "step": 1555, + "train_runtime": 113240.9269, + "train_tokens_per_second": 6239.658 + }, + { + "epoch": 1.8838524768490053, + "grad_norm": 0.24674955010414124, + "learning_rate": 5e-06, + "loss": 0.9562, + "num_input_tokens_seen": 707041752, + "step": 1556, + "train_runtime": 113308.1873, + "train_tokens_per_second": 6239.988 + }, + { + "epoch": 1.8850632336666067, + "grad_norm": 0.2411464899778366, + "learning_rate": 5e-06, + "loss": 0.9625, + "num_input_tokens_seen": 707509880, + "step": 1557, + "train_runtime": 113377.0087, + "train_tokens_per_second": 6240.329 + }, + { + "epoch": 1.886273990484208, + "grad_norm": 0.24759581685066223, + "learning_rate": 5e-06, + "loss": 0.9369, + "num_input_tokens_seen": 707976528, + "step": 1558, + "train_runtime": 113445.9329, + "train_tokens_per_second": 6240.651 + }, + { + "epoch": 1.8874847473018095, + "grad_norm": 0.28260865807533264, + "learning_rate": 5e-06, + "loss": 0.984, + "num_input_tokens_seen": 708406264, + "step": 1559, + "train_runtime": 113509.0703, + "train_tokens_per_second": 6240.966 + }, + { + "epoch": 1.888695504119411, + "grad_norm": 0.23383396863937378, + "learning_rate": 5e-06, + "loss": 0.895, + "num_input_tokens_seen": 708860936, + "step": 1560, + "train_runtime": 113577.5988, + "train_tokens_per_second": 6241.204 + }, + { + "epoch": 1.8899062609370123, + "grad_norm": 0.25613272190093994, + "learning_rate": 5e-06, + "loss": 0.9105, + "num_input_tokens_seen": 709298440, + "step": 1561, + "train_runtime": 113642.3199, + "train_tokens_per_second": 6241.499 + }, + { + "epoch": 1.8911170177546137, + "grad_norm": 0.23639342188835144, + "learning_rate": 5e-06, + "loss": 0.9206, + "num_input_tokens_seen": 709772072, + "step": 1562, + "train_runtime": 113712.4809, + "train_tokens_per_second": 6241.813 + }, + { + "epoch": 1.8923277745722151, + "grad_norm": 0.24744772911071777, + "learning_rate": 5e-06, + "loss": 0.9326, + "num_input_tokens_seen": 710230928, + "step": 1563, + "train_runtime": 113780.3401, + "train_tokens_per_second": 6242.123 + }, + { + "epoch": 1.8935385313898165, + "grad_norm": 0.24007609486579895, + "learning_rate": 5e-06, + "loss": 0.9552, + "num_input_tokens_seen": 710673544, + "step": 1564, + "train_runtime": 113845.7354, + "train_tokens_per_second": 6242.426 + }, + { + "epoch": 1.8947492882074177, + "grad_norm": 0.24338461458683014, + "learning_rate": 5e-06, + "loss": 0.8848, + "num_input_tokens_seen": 711156448, + "step": 1565, + "train_runtime": 113916.9534, + "train_tokens_per_second": 6242.762 + }, + { + "epoch": 1.8959600450250191, + "grad_norm": 0.25444409251213074, + "learning_rate": 5e-06, + "loss": 0.9297, + "num_input_tokens_seen": 711614160, + "step": 1566, + "train_runtime": 113984.4746, + "train_tokens_per_second": 6243.08 + }, + { + "epoch": 1.8971708018426205, + "grad_norm": 0.24998825788497925, + "learning_rate": 5e-06, + "loss": 0.9388, + "num_input_tokens_seen": 712102984, + "step": 1567, + "train_runtime": 114056.6227, + "train_tokens_per_second": 6243.416 + }, + { + "epoch": 1.898381558660222, + "grad_norm": 0.25563183426856995, + "learning_rate": 5e-06, + "loss": 0.9143, + "num_input_tokens_seen": 712569488, + "step": 1568, + "train_runtime": 114125.4791, + "train_tokens_per_second": 6243.737 + }, + { + "epoch": 1.8995923154778231, + "grad_norm": 0.2723662555217743, + "learning_rate": 5e-06, + "loss": 0.9361, + "num_input_tokens_seen": 712982216, + "step": 1569, + "train_runtime": 114186.2224, + "train_tokens_per_second": 6244.03 + }, + { + "epoch": 1.9008030722954246, + "grad_norm": 0.26646265387535095, + "learning_rate": 5e-06, + "loss": 0.9116, + "num_input_tokens_seen": 713435760, + "step": 1570, + "train_runtime": 114253.013, + "train_tokens_per_second": 6244.35 + }, + { + "epoch": 1.902013829113026, + "grad_norm": 0.22592444717884064, + "learning_rate": 5e-06, + "loss": 0.9129, + "num_input_tokens_seen": 713905768, + "step": 1571, + "train_runtime": 114322.3992, + "train_tokens_per_second": 6244.671 + }, + { + "epoch": 1.9032245859306274, + "grad_norm": 0.2544853985309601, + "learning_rate": 5e-06, + "loss": 0.9299, + "num_input_tokens_seen": 714357704, + "step": 1572, + "train_runtime": 114389.1726, + "train_tokens_per_second": 6244.977 + }, + { + "epoch": 1.9044353427482288, + "grad_norm": 0.2733955383300781, + "learning_rate": 5e-06, + "loss": 0.9863, + "num_input_tokens_seen": 714815888, + "step": 1573, + "train_runtime": 114456.8908, + "train_tokens_per_second": 6245.285 + }, + { + "epoch": 1.9056460995658302, + "grad_norm": 0.22590436041355133, + "learning_rate": 5e-06, + "loss": 0.8947, + "num_input_tokens_seen": 715267248, + "step": 1574, + "train_runtime": 114524.1877, + "train_tokens_per_second": 6245.556 + }, + { + "epoch": 1.9068568563834316, + "grad_norm": 0.2679465711116791, + "learning_rate": 5e-06, + "loss": 0.9425, + "num_input_tokens_seen": 715731448, + "step": 1575, + "train_runtime": 114592.6515, + "train_tokens_per_second": 6245.876 + }, + { + "epoch": 1.908067613201033, + "grad_norm": 0.2311072051525116, + "learning_rate": 5e-06, + "loss": 0.9282, + "num_input_tokens_seen": 716197480, + "step": 1576, + "train_runtime": 114661.6647, + "train_tokens_per_second": 6246.181 + }, + { + "epoch": 1.9092783700186344, + "grad_norm": 0.24477143585681915, + "learning_rate": 5e-06, + "loss": 0.9186, + "num_input_tokens_seen": 716635848, + "step": 1577, + "train_runtime": 114726.4947, + "train_tokens_per_second": 6246.472 + }, + { + "epoch": 1.9104891268362358, + "grad_norm": 0.26083871722221375, + "learning_rate": 5e-06, + "loss": 0.9086, + "num_input_tokens_seen": 717095224, + "step": 1578, + "train_runtime": 114794.5934, + "train_tokens_per_second": 6246.768 + }, + { + "epoch": 1.911699883653837, + "grad_norm": 0.29289036989212036, + "learning_rate": 5e-06, + "loss": 0.9533, + "num_input_tokens_seen": 717574472, + "step": 1579, + "train_runtime": 114865.2956, + "train_tokens_per_second": 6247.096 + }, + { + "epoch": 1.9129106404714384, + "grad_norm": 0.28024327754974365, + "learning_rate": 5e-06, + "loss": 0.9452, + "num_input_tokens_seen": 718005536, + "step": 1580, + "train_runtime": 114928.7168, + "train_tokens_per_second": 6247.399 + }, + { + "epoch": 1.9141213972890398, + "grad_norm": 0.28200191259384155, + "learning_rate": 5e-06, + "loss": 0.931, + "num_input_tokens_seen": 718454008, + "step": 1581, + "train_runtime": 114994.4807, + "train_tokens_per_second": 6247.726 + }, + { + "epoch": 1.915332154106641, + "grad_norm": 0.27790147066116333, + "learning_rate": 5e-06, + "loss": 0.9131, + "num_input_tokens_seen": 718902000, + "step": 1582, + "train_runtime": 115060.9603, + "train_tokens_per_second": 6248.01 + }, + { + "epoch": 1.9165429109242424, + "grad_norm": 0.2839493751525879, + "learning_rate": 5e-06, + "loss": 0.9395, + "num_input_tokens_seen": 719363656, + "step": 1583, + "train_runtime": 115129.3761, + "train_tokens_per_second": 6248.307 + }, + { + "epoch": 1.9177536677418439, + "grad_norm": 0.28969302773475647, + "learning_rate": 5e-06, + "loss": 0.939, + "num_input_tokens_seen": 719824400, + "step": 1584, + "train_runtime": 115197.2332, + "train_tokens_per_second": 6248.626 + }, + { + "epoch": 1.9189644245594453, + "grad_norm": 0.22786937654018402, + "learning_rate": 5e-06, + "loss": 0.9137, + "num_input_tokens_seen": 720285288, + "step": 1585, + "train_runtime": 115264.6817, + "train_tokens_per_second": 6248.968 + }, + { + "epoch": 1.9201751813770467, + "grad_norm": 0.3011467158794403, + "learning_rate": 5e-06, + "loss": 0.9738, + "num_input_tokens_seen": 720726248, + "step": 1586, + "train_runtime": 115329.3874, + "train_tokens_per_second": 6249.285 + }, + { + "epoch": 1.921385938194648, + "grad_norm": 0.25570541620254517, + "learning_rate": 5e-06, + "loss": 0.9943, + "num_input_tokens_seen": 721189208, + "step": 1587, + "train_runtime": 115397.8713, + "train_tokens_per_second": 6249.588 + }, + { + "epoch": 1.9225966950122495, + "grad_norm": 0.30256542563438416, + "learning_rate": 5e-06, + "loss": 0.9381, + "num_input_tokens_seen": 721640744, + "step": 1588, + "train_runtime": 115464.5774, + "train_tokens_per_second": 6249.889 + }, + { + "epoch": 1.923807451829851, + "grad_norm": 0.22470492124557495, + "learning_rate": 5e-06, + "loss": 0.9629, + "num_input_tokens_seen": 722107928, + "step": 1589, + "train_runtime": 115533.4593, + "train_tokens_per_second": 6250.206 + }, + { + "epoch": 1.9250182086474523, + "grad_norm": 0.26163867115974426, + "learning_rate": 5e-06, + "loss": 0.8809, + "num_input_tokens_seen": 722566072, + "step": 1590, + "train_runtime": 115601.5317, + "train_tokens_per_second": 6250.489 + }, + { + "epoch": 1.9262289654650537, + "grad_norm": 0.27157437801361084, + "learning_rate": 5e-06, + "loss": 0.924, + "num_input_tokens_seen": 723026480, + "step": 1591, + "train_runtime": 115668.9514, + "train_tokens_per_second": 6250.826 + }, + { + "epoch": 1.927439722282655, + "grad_norm": 0.2507987320423126, + "learning_rate": 5e-06, + "loss": 0.9413, + "num_input_tokens_seen": 723498984, + "step": 1592, + "train_runtime": 115738.7049, + "train_tokens_per_second": 6251.141 + }, + { + "epoch": 1.9286504791002563, + "grad_norm": 0.2356843203306198, + "learning_rate": 5e-06, + "loss": 0.9055, + "num_input_tokens_seen": 723937808, + "step": 1593, + "train_runtime": 115803.4466, + "train_tokens_per_second": 6251.436 + }, + { + "epoch": 1.9298612359178577, + "grad_norm": 0.2270326465368271, + "learning_rate": 5e-06, + "loss": 0.9157, + "num_input_tokens_seen": 724385408, + "step": 1594, + "train_runtime": 115869.3052, + "train_tokens_per_second": 6251.746 + }, + { + "epoch": 1.931071992735459, + "grad_norm": 0.2569643557071686, + "learning_rate": 5e-06, + "loss": 0.856, + "num_input_tokens_seen": 724859232, + "step": 1595, + "train_runtime": 115939.4374, + "train_tokens_per_second": 6252.051 + }, + { + "epoch": 1.9322827495530603, + "grad_norm": 0.22327809035778046, + "learning_rate": 5e-06, + "loss": 0.9313, + "num_input_tokens_seen": 725331448, + "step": 1596, + "train_runtime": 116009.0348, + "train_tokens_per_second": 6252.37 + }, + { + "epoch": 1.9334935063706618, + "grad_norm": 0.253885418176651, + "learning_rate": 5e-06, + "loss": 0.8917, + "num_input_tokens_seen": 725790744, + "step": 1597, + "train_runtime": 116077.1839, + "train_tokens_per_second": 6252.656 + }, + { + "epoch": 1.9347042631882632, + "grad_norm": 0.22820526361465454, + "learning_rate": 5e-06, + "loss": 0.9767, + "num_input_tokens_seen": 726237104, + "step": 1598, + "train_runtime": 116143.0969, + "train_tokens_per_second": 6252.951 + }, + { + "epoch": 1.9359150200058646, + "grad_norm": 0.24010008573532104, + "learning_rate": 5e-06, + "loss": 0.9393, + "num_input_tokens_seen": 726690312, + "step": 1599, + "train_runtime": 116210.1064, + "train_tokens_per_second": 6253.245 + }, + { + "epoch": 1.937125776823466, + "grad_norm": 0.23890480399131775, + "learning_rate": 5e-06, + "loss": 0.8982, + "num_input_tokens_seen": 727157240, + "step": 1600, + "train_runtime": 116279.6038, + "train_tokens_per_second": 6253.524 + }, + { + "epoch": 1.9383365336410674, + "grad_norm": 0.24424760043621063, + "learning_rate": 5e-06, + "loss": 0.9653, + "num_input_tokens_seen": 727622656, + "step": 1601, + "train_runtime": 116348.2373, + "train_tokens_per_second": 6253.835 + }, + { + "epoch": 1.9395472904586688, + "grad_norm": 0.2552737891674042, + "learning_rate": 5e-06, + "loss": 0.9497, + "num_input_tokens_seen": 728062096, + "step": 1602, + "train_runtime": 116412.8385, + "train_tokens_per_second": 6254.139 + }, + { + "epoch": 1.9407580472762702, + "grad_norm": 0.2567066252231598, + "learning_rate": 5e-06, + "loss": 0.8888, + "num_input_tokens_seen": 728516320, + "step": 1603, + "train_runtime": 116479.7771, + "train_tokens_per_second": 6254.445 + }, + { + "epoch": 1.9419688040938716, + "grad_norm": 0.26494523882865906, + "learning_rate": 5e-06, + "loss": 0.9411, + "num_input_tokens_seen": 728967448, + "step": 1604, + "train_runtime": 116546.2892, + "train_tokens_per_second": 6254.746 + }, + { + "epoch": 1.9431795609114728, + "grad_norm": 0.24419981241226196, + "learning_rate": 5e-06, + "loss": 0.9449, + "num_input_tokens_seen": 729410712, + "step": 1605, + "train_runtime": 116611.7138, + "train_tokens_per_second": 6255.038 + }, + { + "epoch": 1.9443903177290742, + "grad_norm": 0.24061161279678345, + "learning_rate": 5e-06, + "loss": 0.9237, + "num_input_tokens_seen": 729864928, + "step": 1606, + "train_runtime": 116679.0895, + "train_tokens_per_second": 6255.319 + }, + { + "epoch": 1.9456010745466756, + "grad_norm": 0.2652917802333832, + "learning_rate": 5e-06, + "loss": 0.9536, + "num_input_tokens_seen": 730304432, + "step": 1607, + "train_runtime": 116743.7819, + "train_tokens_per_second": 6255.617 + }, + { + "epoch": 1.946811831364277, + "grad_norm": 0.3060227632522583, + "learning_rate": 5e-06, + "loss": 0.9476, + "num_input_tokens_seen": 730764008, + "step": 1608, + "train_runtime": 116811.6283, + "train_tokens_per_second": 6255.918 + }, + { + "epoch": 1.9480225881818782, + "grad_norm": 0.24972648918628693, + "learning_rate": 5e-06, + "loss": 0.9286, + "num_input_tokens_seen": 731207136, + "step": 1609, + "train_runtime": 116876.7851, + "train_tokens_per_second": 6256.222 + }, + { + "epoch": 1.9492333449994796, + "grad_norm": 0.22679458558559418, + "learning_rate": 5e-06, + "loss": 0.9647, + "num_input_tokens_seen": 731675192, + "step": 1610, + "train_runtime": 116946.392, + "train_tokens_per_second": 6256.501 + }, + { + "epoch": 1.950444101817081, + "grad_norm": 0.24391289055347443, + "learning_rate": 5e-06, + "loss": 0.9443, + "num_input_tokens_seen": 732136656, + "step": 1611, + "train_runtime": 117014.2749, + "train_tokens_per_second": 6256.815 + }, + { + "epoch": 1.9516548586346825, + "grad_norm": 0.25399860739707947, + "learning_rate": 5e-06, + "loss": 0.9206, + "num_input_tokens_seen": 732587312, + "step": 1612, + "train_runtime": 117080.9176, + "train_tokens_per_second": 6257.103 + }, + { + "epoch": 1.9528656154522839, + "grad_norm": 0.2403707355260849, + "learning_rate": 5e-06, + "loss": 0.9548, + "num_input_tokens_seen": 733055472, + "step": 1613, + "train_runtime": 117150.9773, + "train_tokens_per_second": 6257.357 + }, + { + "epoch": 1.9540763722698853, + "grad_norm": 0.24824580550193787, + "learning_rate": 5e-06, + "loss": 0.9813, + "num_input_tokens_seen": 733529576, + "step": 1614, + "train_runtime": 117222.2753, + "train_tokens_per_second": 6257.595 + }, + { + "epoch": 1.9552871290874867, + "grad_norm": 0.25411248207092285, + "learning_rate": 5e-06, + "loss": 0.9657, + "num_input_tokens_seen": 733989808, + "step": 1615, + "train_runtime": 117290.2995, + "train_tokens_per_second": 6257.89 + }, + { + "epoch": 1.956497885905088, + "grad_norm": 0.244659423828125, + "learning_rate": 5e-06, + "loss": 0.9625, + "num_input_tokens_seen": 734459672, + "step": 1616, + "train_runtime": 117359.2892, + "train_tokens_per_second": 6258.215 + }, + { + "epoch": 1.9577086427226895, + "grad_norm": 0.2583770751953125, + "learning_rate": 5e-06, + "loss": 0.95, + "num_input_tokens_seen": 734913456, + "step": 1617, + "train_runtime": 117425.8088, + "train_tokens_per_second": 6258.534 + }, + { + "epoch": 1.958919399540291, + "grad_norm": 0.27326807379722595, + "learning_rate": 5e-06, + "loss": 0.9071, + "num_input_tokens_seen": 735361360, + "step": 1618, + "train_runtime": 117492.1671, + "train_tokens_per_second": 6258.812 + }, + { + "epoch": 1.9601301563578921, + "grad_norm": 0.2656486928462982, + "learning_rate": 5e-06, + "loss": 0.9275, + "num_input_tokens_seen": 735820904, + "step": 1619, + "train_runtime": 117560.1966, + "train_tokens_per_second": 6259.099 + }, + { + "epoch": 1.9613409131754935, + "grad_norm": 0.26864171028137207, + "learning_rate": 5e-06, + "loss": 0.9431, + "num_input_tokens_seen": 736286088, + "step": 1620, + "train_runtime": 117628.7955, + "train_tokens_per_second": 6259.403 + }, + { + "epoch": 1.962551669993095, + "grad_norm": 0.23168571293354034, + "learning_rate": 5e-06, + "loss": 0.9652, + "num_input_tokens_seen": 736771304, + "step": 1621, + "train_runtime": 117700.5675, + "train_tokens_per_second": 6259.709 + }, + { + "epoch": 1.9637624268106961, + "grad_norm": 0.3031046986579895, + "learning_rate": 5e-06, + "loss": 0.9284, + "num_input_tokens_seen": 737223816, + "step": 1622, + "train_runtime": 117767.2938, + "train_tokens_per_second": 6260.005 + }, + { + "epoch": 1.9649731836282975, + "grad_norm": 0.3055347800254822, + "learning_rate": 5e-06, + "loss": 0.8942, + "num_input_tokens_seen": 737671328, + "step": 1623, + "train_runtime": 117833.4065, + "train_tokens_per_second": 6260.29 + }, + { + "epoch": 1.966183940445899, + "grad_norm": 0.24057318270206451, + "learning_rate": 5e-06, + "loss": 0.9013, + "num_input_tokens_seen": 738128264, + "step": 1624, + "train_runtime": 117900.8843, + "train_tokens_per_second": 6260.583 + }, + { + "epoch": 1.9673946972635004, + "grad_norm": 0.28453585505485535, + "learning_rate": 5e-06, + "loss": 0.951, + "num_input_tokens_seen": 738599800, + "step": 1625, + "train_runtime": 117970.9661, + "train_tokens_per_second": 6260.861 + }, + { + "epoch": 1.9686054540811018, + "grad_norm": 0.2978310286998749, + "learning_rate": 5e-06, + "loss": 0.9524, + "num_input_tokens_seen": 739061440, + "step": 1626, + "train_runtime": 118038.9189, + "train_tokens_per_second": 6261.167 + }, + { + "epoch": 1.9698162108987032, + "grad_norm": 0.2525809109210968, + "learning_rate": 5e-06, + "loss": 0.9146, + "num_input_tokens_seen": 739504888, + "step": 1627, + "train_runtime": 118104.044, + "train_tokens_per_second": 6261.47 + }, + { + "epoch": 1.9710269677163046, + "grad_norm": 0.23271185159683228, + "learning_rate": 5e-06, + "loss": 0.879, + "num_input_tokens_seen": 739959160, + "step": 1628, + "train_runtime": 118171.0917, + "train_tokens_per_second": 6261.761 + }, + { + "epoch": 1.972237724533906, + "grad_norm": 0.2425994873046875, + "learning_rate": 5e-06, + "loss": 0.9498, + "num_input_tokens_seen": 740392976, + "step": 1629, + "train_runtime": 118235.1733, + "train_tokens_per_second": 6262.037 + }, + { + "epoch": 1.9734484813515074, + "grad_norm": 0.28858521580696106, + "learning_rate": 5e-06, + "loss": 0.9053, + "num_input_tokens_seen": 740852000, + "step": 1630, + "train_runtime": 118303.084, + "train_tokens_per_second": 6262.322 + }, + { + "epoch": 1.9746592381691088, + "grad_norm": 0.30428969860076904, + "learning_rate": 5e-06, + "loss": 0.9647, + "num_input_tokens_seen": 741308544, + "step": 1631, + "train_runtime": 118370.3428, + "train_tokens_per_second": 6262.621 + }, + { + "epoch": 1.97586999498671, + "grad_norm": 0.2601581811904907, + "learning_rate": 5e-06, + "loss": 0.9038, + "num_input_tokens_seen": 741771776, + "step": 1632, + "train_runtime": 118438.7832, + "train_tokens_per_second": 6262.913 + }, + { + "epoch": 1.9770807518043114, + "grad_norm": 0.2240893691778183, + "learning_rate": 5e-06, + "loss": 0.9744, + "num_input_tokens_seen": 742234480, + "step": 1633, + "train_runtime": 118507.3035, + "train_tokens_per_second": 6263.196 + }, + { + "epoch": 1.9782915086219128, + "grad_norm": 0.2555893063545227, + "learning_rate": 5e-06, + "loss": 0.9318, + "num_input_tokens_seen": 742693528, + "step": 1634, + "train_runtime": 118575.2577, + "train_tokens_per_second": 6263.478 + }, + { + "epoch": 1.979502265439514, + "grad_norm": 0.33006125688552856, + "learning_rate": 5e-06, + "loss": 0.9128, + "num_input_tokens_seen": 743133448, + "step": 1635, + "train_runtime": 118639.8577, + "train_tokens_per_second": 6263.776 + }, + { + "epoch": 1.9807130222571154, + "grad_norm": 0.24423004686832428, + "learning_rate": 5e-06, + "loss": 0.9242, + "num_input_tokens_seen": 743587448, + "step": 1636, + "train_runtime": 118707.0303, + "train_tokens_per_second": 6264.056 + }, + { + "epoch": 1.9819237790747168, + "grad_norm": 0.2284265011548996, + "learning_rate": 5e-06, + "loss": 0.9299, + "num_input_tokens_seen": 744034000, + "step": 1637, + "train_runtime": 118772.4989, + "train_tokens_per_second": 6264.363 + }, + { + "epoch": 1.9831345358923183, + "grad_norm": 0.24896208941936493, + "learning_rate": 5e-06, + "loss": 0.926, + "num_input_tokens_seen": 744492584, + "step": 1638, + "train_runtime": 118840.6792, + "train_tokens_per_second": 6264.627 + }, + { + "epoch": 1.9843452927099197, + "grad_norm": 0.27802956104278564, + "learning_rate": 5e-06, + "loss": 0.9311, + "num_input_tokens_seen": 744949664, + "step": 1639, + "train_runtime": 118908.0072, + "train_tokens_per_second": 6264.924 + }, + { + "epoch": 1.985556049527521, + "grad_norm": 0.2622906565666199, + "learning_rate": 5e-06, + "loss": 0.909, + "num_input_tokens_seen": 745414920, + "step": 1640, + "train_runtime": 118976.34, + "train_tokens_per_second": 6265.237 + }, + { + "epoch": 1.9867668063451225, + "grad_norm": 0.25892722606658936, + "learning_rate": 5e-06, + "loss": 0.9746, + "num_input_tokens_seen": 745869072, + "step": 1641, + "train_runtime": 119043.2791, + "train_tokens_per_second": 6265.529 + }, + { + "epoch": 1.987977563162724, + "grad_norm": 0.24062815308570862, + "learning_rate": 5e-06, + "loss": 0.9342, + "num_input_tokens_seen": 746319848, + "step": 1642, + "train_runtime": 119109.7704, + "train_tokens_per_second": 6265.816 + }, + { + "epoch": 1.9891883199803253, + "grad_norm": 0.3026382029056549, + "learning_rate": 5e-06, + "loss": 0.9328, + "num_input_tokens_seen": 746765768, + "step": 1643, + "train_runtime": 119175.6132, + "train_tokens_per_second": 6266.095 + }, + { + "epoch": 1.9903990767979267, + "grad_norm": 0.2536994516849518, + "learning_rate": 5e-06, + "loss": 0.8838, + "num_input_tokens_seen": 747223184, + "step": 1644, + "train_runtime": 119243.0261, + "train_tokens_per_second": 6266.389 + }, + { + "epoch": 1.991609833615528, + "grad_norm": 0.24464935064315796, + "learning_rate": 5e-06, + "loss": 0.9337, + "num_input_tokens_seen": 747669984, + "step": 1645, + "train_runtime": 119309.5096, + "train_tokens_per_second": 6266.642 + }, + { + "epoch": 1.9928205904331293, + "grad_norm": 0.24241983890533447, + "learning_rate": 5e-06, + "loss": 0.9195, + "num_input_tokens_seen": 748096608, + "step": 1646, + "train_runtime": 119372.5329, + "train_tokens_per_second": 6266.907 + }, + { + "epoch": 1.9940313472507307, + "grad_norm": 0.25340303778648376, + "learning_rate": 5e-06, + "loss": 0.9146, + "num_input_tokens_seen": 748518912, + "step": 1647, + "train_runtime": 119434.6808, + "train_tokens_per_second": 6267.182 + }, + { + "epoch": 1.9952421040683321, + "grad_norm": 0.24607083201408386, + "learning_rate": 5e-06, + "loss": 0.9389, + "num_input_tokens_seen": 748982560, + "step": 1648, + "train_runtime": 119503.1742, + "train_tokens_per_second": 6267.47 + }, + { + "epoch": 1.9964528608859333, + "grad_norm": 0.25516462326049805, + "learning_rate": 5e-06, + "loss": 0.9252, + "num_input_tokens_seen": 749435416, + "step": 1649, + "train_runtime": 119569.6996, + "train_tokens_per_second": 6267.77 + }, + { + "epoch": 1.9976636177035347, + "grad_norm": 0.23312324285507202, + "learning_rate": 5e-06, + "loss": 0.9199, + "num_input_tokens_seen": 749880560, + "step": 1650, + "train_runtime": 119635.2823, + "train_tokens_per_second": 6268.055 + }, + { + "epoch": 1.9988743745211361, + "grad_norm": 0.22469443082809448, + "learning_rate": 5e-06, + "loss": 0.9509, + "num_input_tokens_seen": 750355760, + "step": 1651, + "train_runtime": 119705.3381, + "train_tokens_per_second": 6268.357 + }, + { + "epoch": 2.0, + "grad_norm": 0.2671259641647339, + "learning_rate": 5e-06, + "loss": 0.923, + "num_input_tokens_seen": 750767792, + "step": 1652, + "train_runtime": 119766.3563, + "train_tokens_per_second": 6268.603 + }, + { + "epoch": 2.0012107568176014, + "grad_norm": 0.25913000106811523, + "learning_rate": 5e-06, + "loss": 0.9515, + "num_input_tokens_seen": 751241384, + "step": 1653, + "train_runtime": 119836.2595, + "train_tokens_per_second": 6268.899 + }, + { + "epoch": 2.002421513635203, + "grad_norm": 0.24619394540786743, + "learning_rate": 5e-06, + "loss": 0.8988, + "num_input_tokens_seen": 751701224, + "step": 1654, + "train_runtime": 119904.0181, + "train_tokens_per_second": 6269.191 + }, + { + "epoch": 2.0036322704528042, + "grad_norm": 0.245701402425766, + "learning_rate": 5e-06, + "loss": 0.9295, + "num_input_tokens_seen": 752143752, + "step": 1655, + "train_runtime": 119969.2461, + "train_tokens_per_second": 6269.471 + }, + { + "epoch": 2.0048430272704056, + "grad_norm": 0.23520943522453308, + "learning_rate": 5e-06, + "loss": 0.9266, + "num_input_tokens_seen": 752602416, + "step": 1656, + "train_runtime": 120036.8828, + "train_tokens_per_second": 6269.76 + }, + { + "epoch": 2.006053784088007, + "grad_norm": 0.2660825550556183, + "learning_rate": 5e-06, + "loss": 0.9354, + "num_input_tokens_seen": 753037600, + "step": 1657, + "train_runtime": 120100.8699, + "train_tokens_per_second": 6270.043 + }, + { + "epoch": 2.0072645409056085, + "grad_norm": 0.21745565533638, + "learning_rate": 5e-06, + "loss": 0.9324, + "num_input_tokens_seen": 753522696, + "step": 1658, + "train_runtime": 120172.9009, + "train_tokens_per_second": 6270.321 + }, + { + "epoch": 2.0084752977232094, + "grad_norm": 0.2518208920955658, + "learning_rate": 5e-06, + "loss": 0.9369, + "num_input_tokens_seen": 753999256, + "step": 1659, + "train_runtime": 120243.3277, + "train_tokens_per_second": 6270.612 + }, + { + "epoch": 2.009686054540811, + "grad_norm": 0.23979134857654572, + "learning_rate": 5e-06, + "loss": 0.9115, + "num_input_tokens_seen": 754457208, + "step": 1660, + "train_runtime": 120311.1243, + "train_tokens_per_second": 6270.885 + }, + { + "epoch": 2.0108968113584123, + "grad_norm": 0.25418299436569214, + "learning_rate": 5e-06, + "loss": 0.9742, + "num_input_tokens_seen": 754909448, + "step": 1661, + "train_runtime": 120377.7091, + "train_tokens_per_second": 6271.173 + }, + { + "epoch": 2.0121075681760137, + "grad_norm": 0.24330636858940125, + "learning_rate": 5e-06, + "loss": 0.8973, + "num_input_tokens_seen": 755375656, + "step": 1662, + "train_runtime": 120446.4967, + "train_tokens_per_second": 6271.462 + }, + { + "epoch": 2.013318324993615, + "grad_norm": 0.23138542473316193, + "learning_rate": 5e-06, + "loss": 0.9258, + "num_input_tokens_seen": 755847280, + "step": 1663, + "train_runtime": 120515.7872, + "train_tokens_per_second": 6271.77 + }, + { + "epoch": 2.0145290818112165, + "grad_norm": 0.22977809607982635, + "learning_rate": 5e-06, + "loss": 0.9414, + "num_input_tokens_seen": 756307096, + "step": 1664, + "train_runtime": 120583.5284, + "train_tokens_per_second": 6272.06 + }, + { + "epoch": 2.015739838628818, + "grad_norm": 0.23063069581985474, + "learning_rate": 5e-06, + "loss": 0.9489, + "num_input_tokens_seen": 756782872, + "step": 1665, + "train_runtime": 120653.9002, + "train_tokens_per_second": 6272.345 + }, + { + "epoch": 2.0169505954464193, + "grad_norm": 0.25464802980422974, + "learning_rate": 5e-06, + "loss": 0.9193, + "num_input_tokens_seen": 757236128, + "step": 1666, + "train_runtime": 120720.8443, + "train_tokens_per_second": 6272.621 + }, + { + "epoch": 2.0181613522640207, + "grad_norm": 0.26056936383247375, + "learning_rate": 5e-06, + "loss": 0.9261, + "num_input_tokens_seen": 757706360, + "step": 1667, + "train_runtime": 120792.0573, + "train_tokens_per_second": 6272.816 + }, + { + "epoch": 2.019372109081622, + "grad_norm": 0.24450352787971497, + "learning_rate": 5e-06, + "loss": 0.8846, + "num_input_tokens_seen": 758155960, + "step": 1668, + "train_runtime": 120858.5813, + "train_tokens_per_second": 6273.083 + }, + { + "epoch": 2.0205828658992235, + "grad_norm": 0.22889290750026703, + "learning_rate": 5e-06, + "loss": 0.9239, + "num_input_tokens_seen": 758608128, + "step": 1669, + "train_runtime": 120924.9819, + "train_tokens_per_second": 6273.378 + }, + { + "epoch": 2.021793622716825, + "grad_norm": 0.24720118939876556, + "learning_rate": 5e-06, + "loss": 0.9581, + "num_input_tokens_seen": 759066936, + "step": 1670, + "train_runtime": 120993.0982, + "train_tokens_per_second": 6273.638 + }, + { + "epoch": 2.0230043795344264, + "grad_norm": 0.2619543671607971, + "learning_rate": 5e-06, + "loss": 0.9421, + "num_input_tokens_seen": 759523400, + "step": 1671, + "train_runtime": 121060.4719, + "train_tokens_per_second": 6273.917 + }, + { + "epoch": 2.0242151363520273, + "grad_norm": 0.2532022297382355, + "learning_rate": 5e-06, + "loss": 0.9364, + "num_input_tokens_seen": 759992104, + "step": 1672, + "train_runtime": 121129.84, + "train_tokens_per_second": 6274.194 + }, + { + "epoch": 2.0254258931696287, + "grad_norm": 0.2386539727449417, + "learning_rate": 5e-06, + "loss": 0.9337, + "num_input_tokens_seen": 760421816, + "step": 1673, + "train_runtime": 121193.2317, + "train_tokens_per_second": 6274.458 + }, + { + "epoch": 2.02663664998723, + "grad_norm": 0.23992206156253815, + "learning_rate": 5e-06, + "loss": 0.9429, + "num_input_tokens_seen": 760910216, + "step": 1674, + "train_runtime": 121265.0604, + "train_tokens_per_second": 6274.769 + }, + { + "epoch": 2.0278474068048316, + "grad_norm": 0.27219098806381226, + "learning_rate": 5e-06, + "loss": 0.9159, + "num_input_tokens_seen": 761358768, + "step": 1675, + "train_runtime": 121331.2035, + "train_tokens_per_second": 6275.045 + }, + { + "epoch": 2.029058163622433, + "grad_norm": 0.26987916231155396, + "learning_rate": 5e-06, + "loss": 0.9063, + "num_input_tokens_seen": 761819360, + "step": 1676, + "train_runtime": 121399.3914, + "train_tokens_per_second": 6275.314 + }, + { + "epoch": 2.0302689204400344, + "grad_norm": 0.29206639528274536, + "learning_rate": 5e-06, + "loss": 0.8884, + "num_input_tokens_seen": 762265192, + "step": 1677, + "train_runtime": 121465.4319, + "train_tokens_per_second": 6275.573 + }, + { + "epoch": 2.031479677257636, + "grad_norm": 0.2402559518814087, + "learning_rate": 5e-06, + "loss": 0.967, + "num_input_tokens_seen": 762727064, + "step": 1678, + "train_runtime": 121533.5732, + "train_tokens_per_second": 6275.855 + }, + { + "epoch": 2.032690434075237, + "grad_norm": 0.2560024559497833, + "learning_rate": 5e-06, + "loss": 1.0016, + "num_input_tokens_seen": 763173840, + "step": 1679, + "train_runtime": 121599.4982, + "train_tokens_per_second": 6276.127 + }, + { + "epoch": 2.0339011908928386, + "grad_norm": 0.2566429078578949, + "learning_rate": 5e-06, + "loss": 0.9174, + "num_input_tokens_seen": 763633104, + "step": 1680, + "train_runtime": 121667.3707, + "train_tokens_per_second": 6276.4 + }, + { + "epoch": 2.03511194771044, + "grad_norm": 0.324238657951355, + "learning_rate": 5e-06, + "loss": 0.9757, + "num_input_tokens_seen": 764078208, + "step": 1681, + "train_runtime": 121732.7093, + "train_tokens_per_second": 6276.688 + }, + { + "epoch": 2.0363227045280414, + "grad_norm": 0.2558477818965912, + "learning_rate": 5e-06, + "loss": 0.9683, + "num_input_tokens_seen": 764539136, + "step": 1682, + "train_runtime": 121800.7565, + "train_tokens_per_second": 6276.965 + }, + { + "epoch": 2.037533461345643, + "grad_norm": 0.24341857433319092, + "learning_rate": 5e-06, + "loss": 0.9386, + "num_input_tokens_seen": 764978424, + "step": 1683, + "train_runtime": 121864.9978, + "train_tokens_per_second": 6277.261 + }, + { + "epoch": 2.0387442181632442, + "grad_norm": 0.2269880771636963, + "learning_rate": 5e-06, + "loss": 0.9277, + "num_input_tokens_seen": 765432624, + "step": 1684, + "train_runtime": 121931.6988, + "train_tokens_per_second": 6277.552 + }, + { + "epoch": 2.0399549749808457, + "grad_norm": 0.24292759597301483, + "learning_rate": 5e-06, + "loss": 0.9419, + "num_input_tokens_seen": 765849656, + "step": 1685, + "train_runtime": 121992.9638, + "train_tokens_per_second": 6277.818 + }, + { + "epoch": 2.0411657317984466, + "grad_norm": 0.2310955822467804, + "learning_rate": 5e-06, + "loss": 0.9353, + "num_input_tokens_seen": 766298816, + "step": 1686, + "train_runtime": 122059.3898, + "train_tokens_per_second": 6278.082 + }, + { + "epoch": 2.042376488616048, + "grad_norm": 0.23854534327983856, + "learning_rate": 5e-06, + "loss": 0.8951, + "num_input_tokens_seen": 766758944, + "step": 1687, + "train_runtime": 122127.173, + "train_tokens_per_second": 6278.365 + }, + { + "epoch": 2.0435872454336494, + "grad_norm": 0.2445819079875946, + "learning_rate": 5e-06, + "loss": 0.9005, + "num_input_tokens_seen": 767215168, + "step": 1688, + "train_runtime": 122194.8356, + "train_tokens_per_second": 6278.622 + }, + { + "epoch": 2.044798002251251, + "grad_norm": 0.24541962146759033, + "learning_rate": 5e-06, + "loss": 0.9422, + "num_input_tokens_seen": 767661360, + "step": 1689, + "train_runtime": 122260.9211, + "train_tokens_per_second": 6278.878 + }, + { + "epoch": 2.0460087590688523, + "grad_norm": 0.23069673776626587, + "learning_rate": 5e-06, + "loss": 0.937, + "num_input_tokens_seen": 768110416, + "step": 1690, + "train_runtime": 122327.2995, + "train_tokens_per_second": 6279.141 + }, + { + "epoch": 2.0472195158864537, + "grad_norm": 0.26259496808052063, + "learning_rate": 5e-06, + "loss": 0.9342, + "num_input_tokens_seen": 768564600, + "step": 1691, + "train_runtime": 122394.1919, + "train_tokens_per_second": 6279.421 + }, + { + "epoch": 2.048430272704055, + "grad_norm": 0.23317334055900574, + "learning_rate": 5e-06, + "loss": 0.9462, + "num_input_tokens_seen": 769009144, + "step": 1692, + "train_runtime": 122459.6235, + "train_tokens_per_second": 6279.695 + }, + { + "epoch": 2.0496410295216565, + "grad_norm": 0.24182665348052979, + "learning_rate": 5e-06, + "loss": 0.9408, + "num_input_tokens_seen": 769483920, + "step": 1693, + "train_runtime": 122529.2607, + "train_tokens_per_second": 6280.001 + }, + { + "epoch": 2.050851786339258, + "grad_norm": 0.2536557614803314, + "learning_rate": 5e-06, + "loss": 0.9511, + "num_input_tokens_seen": 769932832, + "step": 1694, + "train_runtime": 122595.5738, + "train_tokens_per_second": 6280.266 + }, + { + "epoch": 2.0520625431568593, + "grad_norm": 0.24656134843826294, + "learning_rate": 5e-06, + "loss": 0.9051, + "num_input_tokens_seen": 770395072, + "step": 1695, + "train_runtime": 122663.6778, + "train_tokens_per_second": 6280.548 + }, + { + "epoch": 2.0532732999744607, + "grad_norm": 0.2536466419696808, + "learning_rate": 5e-06, + "loss": 0.8947, + "num_input_tokens_seen": 770853376, + "step": 1696, + "train_runtime": 122731.4511, + "train_tokens_per_second": 6280.814 + }, + { + "epoch": 2.054484056792062, + "grad_norm": 0.23972494900226593, + "learning_rate": 5e-06, + "loss": 0.9722, + "num_input_tokens_seen": 771293240, + "step": 1697, + "train_runtime": 122796.193, + "train_tokens_per_second": 6281.084 + }, + { + "epoch": 2.0556948136096636, + "grad_norm": 0.22034522891044617, + "learning_rate": 5e-06, + "loss": 0.923, + "num_input_tokens_seen": 771761160, + "step": 1698, + "train_runtime": 122865.4887, + "train_tokens_per_second": 6281.35 + }, + { + "epoch": 2.0569055704272645, + "grad_norm": 0.24163363873958588, + "learning_rate": 5e-06, + "loss": 0.902, + "num_input_tokens_seen": 772219904, + "step": 1699, + "train_runtime": 122932.45, + "train_tokens_per_second": 6281.66 + }, + { + "epoch": 2.058116327244866, + "grad_norm": 0.24234162271022797, + "learning_rate": 5e-06, + "loss": 0.9049, + "num_input_tokens_seen": 772676568, + "step": 1700, + "train_runtime": 123000.0639, + "train_tokens_per_second": 6281.92 + }, + { + "epoch": 2.0593270840624673, + "grad_norm": 0.259397029876709, + "learning_rate": 5e-06, + "loss": 0.9477, + "num_input_tokens_seen": 773113520, + "step": 1701, + "train_runtime": 123064.4333, + "train_tokens_per_second": 6282.185 + }, + { + "epoch": 2.0605378408800687, + "grad_norm": 0.22705447673797607, + "learning_rate": 5e-06, + "loss": 0.9678, + "num_input_tokens_seen": 773574592, + "step": 1702, + "train_runtime": 123132.6096, + "train_tokens_per_second": 6282.451 + }, + { + "epoch": 2.06174859769767, + "grad_norm": 0.25759419798851013, + "learning_rate": 5e-06, + "loss": 0.8677, + "num_input_tokens_seen": 774031056, + "step": 1703, + "train_runtime": 123199.736, + "train_tokens_per_second": 6282.733 + }, + { + "epoch": 2.0629593545152716, + "grad_norm": 0.26892176270484924, + "learning_rate": 5e-06, + "loss": 0.9933, + "num_input_tokens_seen": 774434376, + "step": 1704, + "train_runtime": 123258.9797, + "train_tokens_per_second": 6282.985 + }, + { + "epoch": 2.064170111332873, + "grad_norm": 0.26564693450927734, + "learning_rate": 5e-06, + "loss": 0.9881, + "num_input_tokens_seen": 774870928, + "step": 1705, + "train_runtime": 123323.0137, + "train_tokens_per_second": 6283.263 + }, + { + "epoch": 2.0653808681504744, + "grad_norm": 0.23933526873588562, + "learning_rate": 5e-06, + "loss": 0.9081, + "num_input_tokens_seen": 775321504, + "step": 1706, + "train_runtime": 123389.357, + "train_tokens_per_second": 6283.536 + }, + { + "epoch": 2.066591624968076, + "grad_norm": 0.261411190032959, + "learning_rate": 5e-06, + "loss": 0.9314, + "num_input_tokens_seen": 775797968, + "step": 1707, + "train_runtime": 123459.9465, + "train_tokens_per_second": 6283.803 + }, + { + "epoch": 2.067802381785677, + "grad_norm": 0.24829885363578796, + "learning_rate": 5e-06, + "loss": 0.9948, + "num_input_tokens_seen": 776232728, + "step": 1708, + "train_runtime": 123523.8479, + "train_tokens_per_second": 6284.072 + }, + { + "epoch": 2.0690131386032786, + "grad_norm": 0.24466580152511597, + "learning_rate": 5e-06, + "loss": 0.9643, + "num_input_tokens_seen": 776670512, + "step": 1709, + "train_runtime": 123588.3574, + "train_tokens_per_second": 6284.334 + }, + { + "epoch": 2.07022389542088, + "grad_norm": 0.2513468265533447, + "learning_rate": 5e-06, + "loss": 0.8985, + "num_input_tokens_seen": 777112832, + "step": 1710, + "train_runtime": 123653.4604, + "train_tokens_per_second": 6284.602 + }, + { + "epoch": 2.0714346522384814, + "grad_norm": 0.2488190084695816, + "learning_rate": 5e-06, + "loss": 0.9549, + "num_input_tokens_seen": 777563728, + "step": 1711, + "train_runtime": 123719.8041, + "train_tokens_per_second": 6284.877 + }, + { + "epoch": 2.072645409056083, + "grad_norm": 0.2452920526266098, + "learning_rate": 5e-06, + "loss": 0.8695, + "num_input_tokens_seen": 778016968, + "step": 1712, + "train_runtime": 123786.2393, + "train_tokens_per_second": 6285.165 + }, + { + "epoch": 2.073856165873684, + "grad_norm": 0.24354714155197144, + "learning_rate": 5e-06, + "loss": 0.9446, + "num_input_tokens_seen": 778467104, + "step": 1713, + "train_runtime": 123852.6969, + "train_tokens_per_second": 6285.427 + }, + { + "epoch": 2.0750669226912852, + "grad_norm": 0.2566715180873871, + "learning_rate": 5e-06, + "loss": 0.9057, + "num_input_tokens_seen": 778913672, + "step": 1714, + "train_runtime": 123918.4381, + "train_tokens_per_second": 6285.696 + }, + { + "epoch": 2.0762776795088866, + "grad_norm": 0.23084959387779236, + "learning_rate": 5e-06, + "loss": 0.9132, + "num_input_tokens_seen": 779369120, + "step": 1715, + "train_runtime": 123985.2855, + "train_tokens_per_second": 6285.981 + }, + { + "epoch": 2.077488436326488, + "grad_norm": 0.24119411408901215, + "learning_rate": 5e-06, + "loss": 0.9102, + "num_input_tokens_seen": 779831520, + "step": 1716, + "train_runtime": 124053.9529, + "train_tokens_per_second": 6286.229 + }, + { + "epoch": 2.0786991931440895, + "grad_norm": 0.24514897167682648, + "learning_rate": 5e-06, + "loss": 0.895, + "num_input_tokens_seen": 780292168, + "step": 1717, + "train_runtime": 124122.045, + "train_tokens_per_second": 6286.491 + }, + { + "epoch": 2.079909949961691, + "grad_norm": 0.2566341459751129, + "learning_rate": 5e-06, + "loss": 0.9118, + "num_input_tokens_seen": 780741608, + "step": 1718, + "train_runtime": 124188.5179, + "train_tokens_per_second": 6286.746 + }, + { + "epoch": 2.0811207067792923, + "grad_norm": 0.2600558400154114, + "learning_rate": 5e-06, + "loss": 0.9192, + "num_input_tokens_seen": 781191032, + "step": 1719, + "train_runtime": 124255.021, + "train_tokens_per_second": 6286.998 + }, + { + "epoch": 2.0823314635968937, + "grad_norm": 0.23223178088665009, + "learning_rate": 5e-06, + "loss": 0.963, + "num_input_tokens_seen": 781650672, + "step": 1720, + "train_runtime": 124322.857, + "train_tokens_per_second": 6287.264 + }, + { + "epoch": 2.083542220414495, + "grad_norm": 0.24753454327583313, + "learning_rate": 5e-06, + "loss": 0.9398, + "num_input_tokens_seen": 782098928, + "step": 1721, + "train_runtime": 124390.6893, + "train_tokens_per_second": 6287.439 + }, + { + "epoch": 2.0847529772320965, + "grad_norm": 0.25024259090423584, + "learning_rate": 5e-06, + "loss": 0.9395, + "num_input_tokens_seen": 782548672, + "step": 1722, + "train_runtime": 124457.3538, + "train_tokens_per_second": 6287.685 + }, + { + "epoch": 2.085963734049698, + "grad_norm": 0.22619232535362244, + "learning_rate": 5e-06, + "loss": 0.9674, + "num_input_tokens_seen": 783008520, + "step": 1723, + "train_runtime": 124525.0193, + "train_tokens_per_second": 6287.961 + }, + { + "epoch": 2.0871744908672993, + "grad_norm": 0.2778150737285614, + "learning_rate": 5e-06, + "loss": 0.9461, + "num_input_tokens_seen": 783477736, + "step": 1724, + "train_runtime": 124593.7565, + "train_tokens_per_second": 6288.258 + }, + { + "epoch": 2.0883852476849007, + "grad_norm": 0.24901039898395538, + "learning_rate": 5e-06, + "loss": 0.995, + "num_input_tokens_seen": 783923816, + "step": 1725, + "train_runtime": 124659.2402, + "train_tokens_per_second": 6288.534 + }, + { + "epoch": 2.0895960045025017, + "grad_norm": 0.27725812792778015, + "learning_rate": 5e-06, + "loss": 0.8965, + "num_input_tokens_seen": 784361728, + "step": 1726, + "train_runtime": 124723.8291, + "train_tokens_per_second": 6288.788 + }, + { + "epoch": 2.090806761320103, + "grad_norm": 0.26983052492141724, + "learning_rate": 5e-06, + "loss": 0.8786, + "num_input_tokens_seen": 784809792, + "step": 1727, + "train_runtime": 124789.805, + "train_tokens_per_second": 6289.054 + }, + { + "epoch": 2.0920175181377045, + "grad_norm": 0.2453075647354126, + "learning_rate": 5e-06, + "loss": 0.893, + "num_input_tokens_seen": 785274824, + "step": 1728, + "train_runtime": 124857.9453, + "train_tokens_per_second": 6289.346 + }, + { + "epoch": 2.093228274955306, + "grad_norm": 0.2598790228366852, + "learning_rate": 5e-06, + "loss": 0.9416, + "num_input_tokens_seen": 785730496, + "step": 1729, + "train_runtime": 124924.8416, + "train_tokens_per_second": 6289.626 + }, + { + "epoch": 2.0944390317729074, + "grad_norm": 0.24463999271392822, + "learning_rate": 5e-06, + "loss": 0.9228, + "num_input_tokens_seen": 786166712, + "step": 1730, + "train_runtime": 124989.1875, + "train_tokens_per_second": 6289.878 + }, + { + "epoch": 2.0956497885905088, + "grad_norm": 0.2674955129623413, + "learning_rate": 5e-06, + "loss": 0.9096, + "num_input_tokens_seen": 786592656, + "step": 1731, + "train_runtime": 125051.8506, + "train_tokens_per_second": 6290.132 + }, + { + "epoch": 2.09686054540811, + "grad_norm": 0.25729501247406006, + "learning_rate": 5e-06, + "loss": 0.946, + "num_input_tokens_seen": 787068024, + "step": 1732, + "train_runtime": 125122.4222, + "train_tokens_per_second": 6290.384 + }, + { + "epoch": 2.0980713022257116, + "grad_norm": 0.25448042154312134, + "learning_rate": 5e-06, + "loss": 0.8971, + "num_input_tokens_seen": 787513456, + "step": 1733, + "train_runtime": 125188.7353, + "train_tokens_per_second": 6290.61 + }, + { + "epoch": 2.099282059043313, + "grad_norm": 0.22716376185417175, + "learning_rate": 5e-06, + "loss": 0.916, + "num_input_tokens_seen": 787957768, + "step": 1734, + "train_runtime": 125253.9247, + "train_tokens_per_second": 6290.883 + }, + { + "epoch": 2.1004928158609144, + "grad_norm": 0.2653203010559082, + "learning_rate": 5e-06, + "loss": 0.9091, + "num_input_tokens_seen": 788385664, + "step": 1735, + "train_runtime": 125317.0041, + "train_tokens_per_second": 6291.131 + }, + { + "epoch": 2.101703572678516, + "grad_norm": 0.2422814965248108, + "learning_rate": 5e-06, + "loss": 0.927, + "num_input_tokens_seen": 788859096, + "step": 1736, + "train_runtime": 125386.8684, + "train_tokens_per_second": 6291.401 + }, + { + "epoch": 2.1029143294961172, + "grad_norm": 0.2769072651863098, + "learning_rate": 5e-06, + "loss": 0.9175, + "num_input_tokens_seen": 789313272, + "step": 1737, + "train_runtime": 125454.0354, + "train_tokens_per_second": 6291.653 + }, + { + "epoch": 2.1041250863137186, + "grad_norm": 0.24323880672454834, + "learning_rate": 5e-06, + "loss": 0.8791, + "num_input_tokens_seen": 789751800, + "step": 1738, + "train_runtime": 125518.7718, + "train_tokens_per_second": 6291.902 + }, + { + "epoch": 2.1053358431313196, + "grad_norm": 0.23501011729240417, + "learning_rate": 5e-06, + "loss": 0.982, + "num_input_tokens_seen": 790204272, + "step": 1739, + "train_runtime": 125585.6763, + "train_tokens_per_second": 6292.153 + }, + { + "epoch": 2.106546599948921, + "grad_norm": 0.2527690529823303, + "learning_rate": 5e-06, + "loss": 0.9606, + "num_input_tokens_seen": 790653104, + "step": 1740, + "train_runtime": 125651.8735, + "train_tokens_per_second": 6292.41 + }, + { + "epoch": 2.1077573567665224, + "grad_norm": 0.23906171321868896, + "learning_rate": 5e-06, + "loss": 0.8936, + "num_input_tokens_seen": 791130304, + "step": 1741, + "train_runtime": 125722.4514, + "train_tokens_per_second": 6292.673 + }, + { + "epoch": 2.108968113584124, + "grad_norm": 0.26574084162712097, + "learning_rate": 5e-06, + "loss": 0.8722, + "num_input_tokens_seen": 791579192, + "step": 1742, + "train_runtime": 125788.8875, + "train_tokens_per_second": 6292.918 + }, + { + "epoch": 2.1101788704017252, + "grad_norm": 0.2502514123916626, + "learning_rate": 5e-06, + "loss": 0.9092, + "num_input_tokens_seen": 792041488, + "step": 1743, + "train_runtime": 125857.0394, + "train_tokens_per_second": 6293.184 + }, + { + "epoch": 2.1113896272193267, + "grad_norm": 0.23396193981170654, + "learning_rate": 5e-06, + "loss": 0.8746, + "num_input_tokens_seen": 792517888, + "step": 1744, + "train_runtime": 125927.6031, + "train_tokens_per_second": 6293.441 + }, + { + "epoch": 2.112600384036928, + "grad_norm": 0.23824480175971985, + "learning_rate": 5e-06, + "loss": 0.9406, + "num_input_tokens_seen": 792972896, + "step": 1745, + "train_runtime": 125994.9999, + "train_tokens_per_second": 6293.685 + }, + { + "epoch": 2.1138111408545295, + "grad_norm": 0.2447684109210968, + "learning_rate": 5e-06, + "loss": 0.9068, + "num_input_tokens_seen": 793422832, + "step": 1746, + "train_runtime": 126061.4363, + "train_tokens_per_second": 6293.938 + }, + { + "epoch": 2.115021897672131, + "grad_norm": 0.22363825142383575, + "learning_rate": 5e-06, + "loss": 0.8694, + "num_input_tokens_seen": 793880960, + "step": 1747, + "train_runtime": 126129.4678, + "train_tokens_per_second": 6294.175 + }, + { + "epoch": 2.1162326544897323, + "grad_norm": 0.23261670768260956, + "learning_rate": 5e-06, + "loss": 0.9484, + "num_input_tokens_seen": 794328136, + "step": 1748, + "train_runtime": 126195.2383, + "train_tokens_per_second": 6294.438 + }, + { + "epoch": 2.1174434113073337, + "grad_norm": 0.22803719341754913, + "learning_rate": 5e-06, + "loss": 0.8902, + "num_input_tokens_seen": 794810736, + "step": 1749, + "train_runtime": 126266.3352, + "train_tokens_per_second": 6294.716 + }, + { + "epoch": 2.118654168124935, + "grad_norm": 0.23991791903972626, + "learning_rate": 5e-06, + "loss": 0.9623, + "num_input_tokens_seen": 795239912, + "step": 1750, + "train_runtime": 126329.1962, + "train_tokens_per_second": 6294.981 + }, + { + "epoch": 2.1198649249425365, + "grad_norm": 0.2476852983236313, + "learning_rate": 5e-06, + "loss": 0.9741, + "num_input_tokens_seen": 795711584, + "step": 1751, + "train_runtime": 126398.9592, + "train_tokens_per_second": 6295.238 + }, + { + "epoch": 2.1210756817601375, + "grad_norm": 0.24314959347248077, + "learning_rate": 5e-06, + "loss": 0.9451, + "num_input_tokens_seen": 796160352, + "step": 1752, + "train_runtime": 126464.8762, + "train_tokens_per_second": 6295.506 + }, + { + "epoch": 2.122286438577739, + "grad_norm": 0.24649563431739807, + "learning_rate": 5e-06, + "loss": 0.9477, + "num_input_tokens_seen": 796618904, + "step": 1753, + "train_runtime": 126532.6819, + "train_tokens_per_second": 6295.756 + }, + { + "epoch": 2.1234971953953403, + "grad_norm": 0.2568952441215515, + "learning_rate": 5e-06, + "loss": 0.9223, + "num_input_tokens_seen": 797063056, + "step": 1754, + "train_runtime": 126598.1871, + "train_tokens_per_second": 6296.007 + }, + { + "epoch": 2.1247079522129417, + "grad_norm": 0.22107072174549103, + "learning_rate": 5e-06, + "loss": 0.8551, + "num_input_tokens_seen": 797544816, + "step": 1755, + "train_runtime": 126669.3768, + "train_tokens_per_second": 6296.272 + }, + { + "epoch": 2.125918709030543, + "grad_norm": 0.29669317603111267, + "learning_rate": 5e-06, + "loss": 0.9065, + "num_input_tokens_seen": 797998352, + "step": 1756, + "train_runtime": 126736.3752, + "train_tokens_per_second": 6296.522 + }, + { + "epoch": 2.1271294658481446, + "grad_norm": 0.2783910036087036, + "learning_rate": 5e-06, + "loss": 0.9066, + "num_input_tokens_seen": 798444912, + "step": 1757, + "train_runtime": 126801.8963, + "train_tokens_per_second": 6296.79 + }, + { + "epoch": 2.128340222665746, + "grad_norm": 0.2530405819416046, + "learning_rate": 5e-06, + "loss": 0.9181, + "num_input_tokens_seen": 798908496, + "step": 1758, + "train_runtime": 126870.3737, + "train_tokens_per_second": 6297.045 + }, + { + "epoch": 2.1295509794833474, + "grad_norm": 0.24973563849925995, + "learning_rate": 5e-06, + "loss": 0.9519, + "num_input_tokens_seen": 799355104, + "step": 1759, + "train_runtime": 126936.2803, + "train_tokens_per_second": 6297.294 + }, + { + "epoch": 2.130761736300949, + "grad_norm": 0.24954435229301453, + "learning_rate": 5e-06, + "loss": 0.9068, + "num_input_tokens_seen": 799829328, + "step": 1760, + "train_runtime": 127006.5431, + "train_tokens_per_second": 6297.544 + }, + { + "epoch": 2.13197249311855, + "grad_norm": 0.2468835860490799, + "learning_rate": 5e-06, + "loss": 0.905, + "num_input_tokens_seen": 800297592, + "step": 1761, + "train_runtime": 127075.8817, + "train_tokens_per_second": 6297.793 + }, + { + "epoch": 2.1331832499361516, + "grad_norm": 0.24968093633651733, + "learning_rate": 5e-06, + "loss": 0.9725, + "num_input_tokens_seen": 800745464, + "step": 1762, + "train_runtime": 127141.6011, + "train_tokens_per_second": 6298.06 + }, + { + "epoch": 2.134394006753753, + "grad_norm": 0.24861465394496918, + "learning_rate": 5e-06, + "loss": 0.9453, + "num_input_tokens_seen": 801182160, + "step": 1763, + "train_runtime": 127205.6251, + "train_tokens_per_second": 6298.323 + }, + { + "epoch": 2.1356047635713544, + "grad_norm": 0.2691054940223694, + "learning_rate": 5e-06, + "loss": 0.9744, + "num_input_tokens_seen": 801626032, + "step": 1764, + "train_runtime": 127271.2495, + "train_tokens_per_second": 6298.563 + }, + { + "epoch": 2.136815520388956, + "grad_norm": 0.2613939046859741, + "learning_rate": 5e-06, + "loss": 0.9207, + "num_input_tokens_seen": 802080576, + "step": 1765, + "train_runtime": 127338.2741, + "train_tokens_per_second": 6298.818 + }, + { + "epoch": 2.138026277206557, + "grad_norm": 0.2544805407524109, + "learning_rate": 5e-06, + "loss": 0.9109, + "num_input_tokens_seen": 802543280, + "step": 1766, + "train_runtime": 127406.2366, + "train_tokens_per_second": 6299.089 + }, + { + "epoch": 2.139237034024158, + "grad_norm": 0.25102829933166504, + "learning_rate": 5e-06, + "loss": 0.9391, + "num_input_tokens_seen": 802991496, + "step": 1767, + "train_runtime": 127472.4744, + "train_tokens_per_second": 6299.332 + }, + { + "epoch": 2.1404477908417596, + "grad_norm": 0.22922591865062714, + "learning_rate": 5e-06, + "loss": 0.8899, + "num_input_tokens_seen": 803454416, + "step": 1768, + "train_runtime": 127540.4672, + "train_tokens_per_second": 6299.604 + }, + { + "epoch": 2.141658547659361, + "grad_norm": 0.24334551393985748, + "learning_rate": 5e-06, + "loss": 0.9564, + "num_input_tokens_seen": 803898848, + "step": 1769, + "train_runtime": 127606.0132, + "train_tokens_per_second": 6299.851 + }, + { + "epoch": 2.1428693044769624, + "grad_norm": 0.26398375630378723, + "learning_rate": 5e-06, + "loss": 0.9162, + "num_input_tokens_seen": 804346008, + "step": 1770, + "train_runtime": 127671.9825, + "train_tokens_per_second": 6300.098 + }, + { + "epoch": 2.144080061294564, + "grad_norm": 0.26432764530181885, + "learning_rate": 5e-06, + "loss": 0.9098, + "num_input_tokens_seen": 804800632, + "step": 1771, + "train_runtime": 127739.566, + "train_tokens_per_second": 6300.324 + }, + { + "epoch": 2.1452908181121653, + "grad_norm": 0.24564692378044128, + "learning_rate": 5e-06, + "loss": 0.9261, + "num_input_tokens_seen": 805243600, + "step": 1772, + "train_runtime": 127804.5626, + "train_tokens_per_second": 6300.586 + }, + { + "epoch": 2.1465015749297667, + "grad_norm": 0.2491164207458496, + "learning_rate": 5e-06, + "loss": 0.9222, + "num_input_tokens_seen": 805698520, + "step": 1773, + "train_runtime": 127871.3074, + "train_tokens_per_second": 6300.855 + }, + { + "epoch": 2.147712331747368, + "grad_norm": 0.2387707233428955, + "learning_rate": 5e-06, + "loss": 0.971, + "num_input_tokens_seen": 806151760, + "step": 1774, + "train_runtime": 127938.8195, + "train_tokens_per_second": 6301.072 + }, + { + "epoch": 2.1489230885649695, + "grad_norm": 0.2344633936882019, + "learning_rate": 5e-06, + "loss": 0.8934, + "num_input_tokens_seen": 806619560, + "step": 1775, + "train_runtime": 128009.9227, + "train_tokens_per_second": 6301.227 + }, + { + "epoch": 2.150133845382571, + "grad_norm": 0.25677409768104553, + "learning_rate": 5e-06, + "loss": 0.9392, + "num_input_tokens_seen": 807056520, + "step": 1776, + "train_runtime": 128074.5031, + "train_tokens_per_second": 6301.461 + }, + { + "epoch": 2.1513446022001723, + "grad_norm": 0.24254010617733002, + "learning_rate": 5e-06, + "loss": 0.8995, + "num_input_tokens_seen": 807505104, + "step": 1777, + "train_runtime": 128140.1656, + "train_tokens_per_second": 6301.733 + }, + { + "epoch": 2.1525553590177737, + "grad_norm": 0.2752172350883484, + "learning_rate": 5e-06, + "loss": 0.885, + "num_input_tokens_seen": 807940568, + "step": 1778, + "train_runtime": 128204.0096, + "train_tokens_per_second": 6301.991 + }, + { + "epoch": 2.153766115835375, + "grad_norm": 0.25673961639404297, + "learning_rate": 5e-06, + "loss": 0.9249, + "num_input_tokens_seen": 808380984, + "step": 1779, + "train_runtime": 128269.2141, + "train_tokens_per_second": 6302.221 + }, + { + "epoch": 2.154976872652976, + "grad_norm": 0.24344174563884735, + "learning_rate": 5e-06, + "loss": 0.9452, + "num_input_tokens_seen": 808834400, + "step": 1780, + "train_runtime": 128335.406, + "train_tokens_per_second": 6302.504 + }, + { + "epoch": 2.1561876294705775, + "grad_norm": 0.23879307508468628, + "learning_rate": 5e-06, + "loss": 0.8993, + "num_input_tokens_seen": 809277672, + "step": 1781, + "train_runtime": 128400.9451, + "train_tokens_per_second": 6302.739 + }, + { + "epoch": 2.157398386288179, + "grad_norm": 0.24937401711940765, + "learning_rate": 5e-06, + "loss": 0.9594, + "num_input_tokens_seen": 809742248, + "step": 1782, + "train_runtime": 128469.3459, + "train_tokens_per_second": 6303.0 + }, + { + "epoch": 2.1586091431057803, + "grad_norm": 0.2503887414932251, + "learning_rate": 5e-06, + "loss": 0.9878, + "num_input_tokens_seen": 810215920, + "step": 1783, + "train_runtime": 128539.3636, + "train_tokens_per_second": 6303.251 + }, + { + "epoch": 2.1598198999233817, + "grad_norm": 0.2328265905380249, + "learning_rate": 5e-06, + "loss": 0.9034, + "num_input_tokens_seen": 810667968, + "step": 1784, + "train_runtime": 128605.684, + "train_tokens_per_second": 6303.516 + }, + { + "epoch": 2.161030656740983, + "grad_norm": 0.27375268936157227, + "learning_rate": 5e-06, + "loss": 0.9682, + "num_input_tokens_seen": 811105360, + "step": 1785, + "train_runtime": 128670.1265, + "train_tokens_per_second": 6303.758 + }, + { + "epoch": 2.1622414135585846, + "grad_norm": 0.24299326539039612, + "learning_rate": 5e-06, + "loss": 0.9181, + "num_input_tokens_seen": 811562264, + "step": 1786, + "train_runtime": 128737.81, + "train_tokens_per_second": 6303.993 + }, + { + "epoch": 2.163452170376186, + "grad_norm": 0.25592973828315735, + "learning_rate": 5e-06, + "loss": 0.9622, + "num_input_tokens_seen": 812023096, + "step": 1787, + "train_runtime": 128805.4346, + "train_tokens_per_second": 6304.261 + }, + { + "epoch": 2.1646629271937874, + "grad_norm": 0.27488279342651367, + "learning_rate": 5e-06, + "loss": 0.8852, + "num_input_tokens_seen": 812487488, + "step": 1788, + "train_runtime": 128874.145, + "train_tokens_per_second": 6304.503 + }, + { + "epoch": 2.165873684011389, + "grad_norm": 0.22598235309123993, + "learning_rate": 5e-06, + "loss": 0.8803, + "num_input_tokens_seen": 812951528, + "step": 1789, + "train_runtime": 128942.4621, + "train_tokens_per_second": 6304.762 + }, + { + "epoch": 2.16708444082899, + "grad_norm": 0.2569931149482727, + "learning_rate": 5e-06, + "loss": 0.9139, + "num_input_tokens_seen": 813382096, + "step": 1790, + "train_runtime": 129005.9065, + "train_tokens_per_second": 6304.999 + }, + { + "epoch": 2.1682951976465916, + "grad_norm": 0.24193847179412842, + "learning_rate": 5e-06, + "loss": 0.9344, + "num_input_tokens_seen": 813843352, + "step": 1791, + "train_runtime": 129073.5943, + "train_tokens_per_second": 6305.266 + }, + { + "epoch": 2.169505954464193, + "grad_norm": 0.23365779221057892, + "learning_rate": 5e-06, + "loss": 0.949, + "num_input_tokens_seen": 814308480, + "step": 1792, + "train_runtime": 129142.3657, + "train_tokens_per_second": 6305.51 + }, + { + "epoch": 2.170716711281794, + "grad_norm": 0.23331047594547272, + "learning_rate": 5e-06, + "loss": 0.9625, + "num_input_tokens_seen": 814747656, + "step": 1793, + "train_runtime": 129207.0033, + "train_tokens_per_second": 6305.755 + }, + { + "epoch": 2.1719274680993954, + "grad_norm": 0.26496171951293945, + "learning_rate": 5e-06, + "loss": 0.9196, + "num_input_tokens_seen": 815206384, + "step": 1794, + "train_runtime": 129274.5471, + "train_tokens_per_second": 6306.008 + }, + { + "epoch": 2.173138224916997, + "grad_norm": 0.23653088510036469, + "learning_rate": 5e-06, + "loss": 0.9278, + "num_input_tokens_seen": 815662072, + "step": 1795, + "train_runtime": 129341.8841, + "train_tokens_per_second": 6306.249 + }, + { + "epoch": 2.1743489817345982, + "grad_norm": 0.24810637533664703, + "learning_rate": 5e-06, + "loss": 0.9657, + "num_input_tokens_seen": 816098360, + "step": 1796, + "train_runtime": 129406.0312, + "train_tokens_per_second": 6306.494 + }, + { + "epoch": 2.1755597385521996, + "grad_norm": 0.23452900350093842, + "learning_rate": 5e-06, + "loss": 0.901, + "num_input_tokens_seen": 816563568, + "step": 1797, + "train_runtime": 129474.7386, + "train_tokens_per_second": 6306.74 + }, + { + "epoch": 2.176770495369801, + "grad_norm": 0.2348732203245163, + "learning_rate": 5e-06, + "loss": 0.9526, + "num_input_tokens_seen": 817031280, + "step": 1798, + "train_runtime": 129544.0556, + "train_tokens_per_second": 6306.976 + }, + { + "epoch": 2.1779812521874025, + "grad_norm": 0.2519684135913849, + "learning_rate": 5e-06, + "loss": 0.9246, + "num_input_tokens_seen": 817503224, + "step": 1799, + "train_runtime": 129613.556, + "train_tokens_per_second": 6307.236 + }, + { + "epoch": 2.179192009005004, + "grad_norm": 0.2337455451488495, + "learning_rate": 5e-06, + "loss": 0.9357, + "num_input_tokens_seen": 817952256, + "step": 1800, + "train_runtime": 129679.9959, + "train_tokens_per_second": 6307.467 + }, + { + "epoch": 2.1804027658226053, + "grad_norm": 0.22144410014152527, + "learning_rate": 5e-06, + "loss": 0.9059, + "num_input_tokens_seen": 818411048, + "step": 1801, + "train_runtime": 129748.001, + "train_tokens_per_second": 6307.697 + }, + { + "epoch": 2.1816135226402067, + "grad_norm": 0.23474140465259552, + "learning_rate": 5e-06, + "loss": 0.9692, + "num_input_tokens_seen": 818854960, + "step": 1802, + "train_runtime": 129813.7428, + "train_tokens_per_second": 6307.922 + }, + { + "epoch": 2.182824279457808, + "grad_norm": 0.2501378357410431, + "learning_rate": 5e-06, + "loss": 0.9379, + "num_input_tokens_seen": 819305080, + "step": 1803, + "train_runtime": 129880.2273, + "train_tokens_per_second": 6308.159 + }, + { + "epoch": 2.1840350362754095, + "grad_norm": 0.2469998002052307, + "learning_rate": 5e-06, + "loss": 0.9185, + "num_input_tokens_seen": 819758552, + "step": 1804, + "train_runtime": 129947.1113, + "train_tokens_per_second": 6308.402 + }, + { + "epoch": 2.185245793093011, + "grad_norm": 0.24533340334892273, + "learning_rate": 5e-06, + "loss": 0.9593, + "num_input_tokens_seen": 820199856, + "step": 1805, + "train_runtime": 130012.1095, + "train_tokens_per_second": 6308.642 + }, + { + "epoch": 2.186456549910612, + "grad_norm": 0.24642273783683777, + "learning_rate": 5e-06, + "loss": 0.934, + "num_input_tokens_seen": 820657760, + "step": 1806, + "train_runtime": 130084.5055, + "train_tokens_per_second": 6308.651 + }, + { + "epoch": 2.1876673067282133, + "grad_norm": 0.24866892397403717, + "learning_rate": 5e-06, + "loss": 0.9497, + "num_input_tokens_seen": 821100408, + "step": 1807, + "train_runtime": 130154.3541, + "train_tokens_per_second": 6308.666 + }, + { + "epoch": 2.1888780635458147, + "grad_norm": 0.24068208038806915, + "learning_rate": 5e-06, + "loss": 0.985, + "num_input_tokens_seen": 821535944, + "step": 1808, + "train_runtime": 130223.3035, + "train_tokens_per_second": 6308.671 + }, + { + "epoch": 2.190088820363416, + "grad_norm": 0.2489953488111496, + "learning_rate": 5e-06, + "loss": 0.9605, + "num_input_tokens_seen": 822016328, + "step": 1809, + "train_runtime": 130299.4476, + "train_tokens_per_second": 6308.671 + }, + { + "epoch": 2.1912995771810175, + "grad_norm": 0.2993757724761963, + "learning_rate": 5e-06, + "loss": 0.9075, + "num_input_tokens_seen": 822468376, + "step": 1810, + "train_runtime": 130370.0272, + "train_tokens_per_second": 6308.723 + }, + { + "epoch": 2.192510333998619, + "grad_norm": 0.23804070055484772, + "learning_rate": 5e-06, + "loss": 0.8944, + "num_input_tokens_seen": 822908128, + "step": 1811, + "train_runtime": 130439.4385, + "train_tokens_per_second": 6308.737 + }, + { + "epoch": 2.1937210908162204, + "grad_norm": 0.2520151734352112, + "learning_rate": 5e-06, + "loss": 0.9288, + "num_input_tokens_seen": 823370440, + "step": 1812, + "train_runtime": 130512.4486, + "train_tokens_per_second": 6308.75 + }, + { + "epoch": 2.1949318476338218, + "grad_norm": 0.27723604440689087, + "learning_rate": 5e-06, + "loss": 0.9607, + "num_input_tokens_seen": 823827464, + "step": 1813, + "train_runtime": 130584.4904, + "train_tokens_per_second": 6308.77 + }, + { + "epoch": 2.196142604451423, + "grad_norm": 0.24269568920135498, + "learning_rate": 5e-06, + "loss": 0.8963, + "num_input_tokens_seen": 824292328, + "step": 1814, + "train_runtime": 130657.7113, + "train_tokens_per_second": 6308.792 + }, + { + "epoch": 2.1973533612690246, + "grad_norm": 0.2714741826057434, + "learning_rate": 5e-06, + "loss": 0.9484, + "num_input_tokens_seen": 824727776, + "step": 1815, + "train_runtime": 130725.9019, + "train_tokens_per_second": 6308.832 + }, + { + "epoch": 2.198564118086626, + "grad_norm": 0.2618526518344879, + "learning_rate": 5e-06, + "loss": 0.88, + "num_input_tokens_seen": 825185376, + "step": 1816, + "train_runtime": 130798.1284, + "train_tokens_per_second": 6308.847 + }, + { + "epoch": 2.1997748749042274, + "grad_norm": 0.23151424527168274, + "learning_rate": 5e-06, + "loss": 0.9386, + "num_input_tokens_seen": 825654456, + "step": 1817, + "train_runtime": 130871.6577, + "train_tokens_per_second": 6308.887 + }, + { + "epoch": 2.200985631721829, + "grad_norm": 0.2615219056606293, + "learning_rate": 5e-06, + "loss": 0.9338, + "num_input_tokens_seen": 826103496, + "step": 1818, + "train_runtime": 130942.5052, + "train_tokens_per_second": 6308.902 + }, + { + "epoch": 2.20219638853943, + "grad_norm": 0.24982737004756927, + "learning_rate": 5e-06, + "loss": 0.9204, + "num_input_tokens_seen": 826591216, + "step": 1819, + "train_runtime": 131020.0461, + "train_tokens_per_second": 6308.891 + }, + { + "epoch": 2.203407145357031, + "grad_norm": 0.2572263479232788, + "learning_rate": 5e-06, + "loss": 0.8943, + "num_input_tokens_seen": 827049808, + "step": 1820, + "train_runtime": 131088.1052, + "train_tokens_per_second": 6309.114 + }, + { + "epoch": 2.2046179021746326, + "grad_norm": 0.2785727083683014, + "learning_rate": 5e-06, + "loss": 0.953, + "num_input_tokens_seen": 827506440, + "step": 1821, + "train_runtime": 131155.3257, + "train_tokens_per_second": 6309.362 + }, + { + "epoch": 2.205828658992234, + "grad_norm": 0.23997686803340912, + "learning_rate": 5e-06, + "loss": 0.9181, + "num_input_tokens_seen": 827997808, + "step": 1822, + "train_runtime": 131228.0259, + "train_tokens_per_second": 6309.611 + }, + { + "epoch": 2.2070394158098354, + "grad_norm": 0.2337905317544937, + "learning_rate": 5e-06, + "loss": 0.8995, + "num_input_tokens_seen": 828443496, + "step": 1823, + "train_runtime": 131293.7615, + "train_tokens_per_second": 6309.847 + }, + { + "epoch": 2.208250172627437, + "grad_norm": 0.22939272224903107, + "learning_rate": 5e-06, + "loss": 0.9455, + "num_input_tokens_seen": 828911096, + "step": 1824, + "train_runtime": 131363.1117, + "train_tokens_per_second": 6310.075 + }, + { + "epoch": 2.2094609294450382, + "grad_norm": 0.3020130693912506, + "learning_rate": 5e-06, + "loss": 0.9275, + "num_input_tokens_seen": 829374400, + "step": 1825, + "train_runtime": 131431.3819, + "train_tokens_per_second": 6310.322 + }, + { + "epoch": 2.2106716862626397, + "grad_norm": 0.25360703468322754, + "learning_rate": 5e-06, + "loss": 0.9106, + "num_input_tokens_seen": 829831288, + "step": 1826, + "train_runtime": 131498.3697, + "train_tokens_per_second": 6310.582 + }, + { + "epoch": 2.211882443080241, + "grad_norm": 0.24077729880809784, + "learning_rate": 5e-06, + "loss": 0.9154, + "num_input_tokens_seen": 830266224, + "step": 1827, + "train_runtime": 131564.1036, + "train_tokens_per_second": 6310.735 + }, + { + "epoch": 2.2130931998978425, + "grad_norm": 0.26646628975868225, + "learning_rate": 5e-06, + "loss": 0.9073, + "num_input_tokens_seen": 830695456, + "step": 1828, + "train_runtime": 131628.3066, + "train_tokens_per_second": 6310.918 + }, + { + "epoch": 2.214303956715444, + "grad_norm": 0.24032685160636902, + "learning_rate": 5e-06, + "loss": 0.9257, + "num_input_tokens_seen": 831167968, + "step": 1829, + "train_runtime": 131697.9225, + "train_tokens_per_second": 6311.17 + }, + { + "epoch": 2.2155147135330453, + "grad_norm": 0.24201683700084686, + "learning_rate": 5e-06, + "loss": 0.9432, + "num_input_tokens_seen": 831603632, + "step": 1830, + "train_runtime": 131762.2198, + "train_tokens_per_second": 6311.397 + }, + { + "epoch": 2.2167254703506467, + "grad_norm": 0.25040099024772644, + "learning_rate": 5e-06, + "loss": 0.9309, + "num_input_tokens_seen": 832060792, + "step": 1831, + "train_runtime": 131829.2539, + "train_tokens_per_second": 6311.655 + }, + { + "epoch": 2.2179362271682477, + "grad_norm": 0.2554630935192108, + "learning_rate": 5e-06, + "loss": 0.9374, + "num_input_tokens_seen": 832517720, + "step": 1832, + "train_runtime": 131896.5149, + "train_tokens_per_second": 6311.901 + }, + { + "epoch": 2.219146983985849, + "grad_norm": 0.2625337839126587, + "learning_rate": 5e-06, + "loss": 0.8854, + "num_input_tokens_seen": 832952008, + "step": 1833, + "train_runtime": 131960.5471, + "train_tokens_per_second": 6312.129 + }, + { + "epoch": 2.2203577408034505, + "grad_norm": 0.250442236661911, + "learning_rate": 5e-06, + "loss": 0.9404, + "num_input_tokens_seen": 833400512, + "step": 1834, + "train_runtime": 132026.3765, + "train_tokens_per_second": 6312.379 + }, + { + "epoch": 2.221568497621052, + "grad_norm": 0.24164512753486633, + "learning_rate": 5e-06, + "loss": 0.9089, + "num_input_tokens_seen": 833865880, + "step": 1835, + "train_runtime": 132095.3899, + "train_tokens_per_second": 6312.604 + }, + { + "epoch": 2.2227792544386533, + "grad_norm": 0.2589486837387085, + "learning_rate": 5e-06, + "loss": 0.9173, + "num_input_tokens_seen": 834320256, + "step": 1836, + "train_runtime": 132162.8118, + "train_tokens_per_second": 6312.822 + }, + { + "epoch": 2.2239900112562547, + "grad_norm": 0.26678481698036194, + "learning_rate": 5e-06, + "loss": 0.9453, + "num_input_tokens_seen": 834793808, + "step": 1837, + "train_runtime": 132232.281, + "train_tokens_per_second": 6313.086 + }, + { + "epoch": 2.225200768073856, + "grad_norm": 0.2960735261440277, + "learning_rate": 5e-06, + "loss": 0.8841, + "num_input_tokens_seen": 835242864, + "step": 1838, + "train_runtime": 132298.5746, + "train_tokens_per_second": 6313.317 + }, + { + "epoch": 2.2264115248914575, + "grad_norm": 0.24359485507011414, + "learning_rate": 5e-06, + "loss": 0.9434, + "num_input_tokens_seen": 835713144, + "step": 1839, + "train_runtime": 132367.7406, + "train_tokens_per_second": 6313.571 + }, + { + "epoch": 2.227622281709059, + "grad_norm": 0.24145717918872833, + "learning_rate": 5e-06, + "loss": 0.9528, + "num_input_tokens_seen": 836169392, + "step": 1840, + "train_runtime": 132435.3509, + "train_tokens_per_second": 6313.793 + }, + { + "epoch": 2.2288330385266604, + "grad_norm": 0.23885925114154816, + "learning_rate": 5e-06, + "loss": 0.9194, + "num_input_tokens_seen": 836648992, + "step": 1841, + "train_runtime": 132506.0657, + "train_tokens_per_second": 6314.043 + }, + { + "epoch": 2.230043795344262, + "grad_norm": 0.2691201865673065, + "learning_rate": 5e-06, + "loss": 0.9245, + "num_input_tokens_seen": 837122792, + "step": 1842, + "train_runtime": 132575.9881, + "train_tokens_per_second": 6314.287 + }, + { + "epoch": 2.231254552161863, + "grad_norm": 0.2495044767856598, + "learning_rate": 5e-06, + "loss": 0.9572, + "num_input_tokens_seen": 837581056, + "step": 1843, + "train_runtime": 132643.3397, + "train_tokens_per_second": 6314.535 + }, + { + "epoch": 2.2324653089794646, + "grad_norm": 0.2624557316303253, + "learning_rate": 5e-06, + "loss": 0.9458, + "num_input_tokens_seen": 838029032, + "step": 1844, + "train_runtime": 132709.3351, + "train_tokens_per_second": 6314.771 + }, + { + "epoch": 2.233676065797066, + "grad_norm": 0.23831219971179962, + "learning_rate": 5e-06, + "loss": 0.9305, + "num_input_tokens_seen": 838473072, + "step": 1845, + "train_runtime": 132775.0436, + "train_tokens_per_second": 6314.99 + }, + { + "epoch": 2.234886822614667, + "grad_norm": 0.2543146014213562, + "learning_rate": 5e-06, + "loss": 0.9035, + "num_input_tokens_seen": 838923928, + "step": 1846, + "train_runtime": 132841.3512, + "train_tokens_per_second": 6315.232 + }, + { + "epoch": 2.2360975794322684, + "grad_norm": 0.238714799284935, + "learning_rate": 5e-06, + "loss": 0.8885, + "num_input_tokens_seen": 839396800, + "step": 1847, + "train_runtime": 132911.5196, + "train_tokens_per_second": 6315.456 + }, + { + "epoch": 2.23730833624987, + "grad_norm": 0.22185099124908447, + "learning_rate": 5e-06, + "loss": 0.8943, + "num_input_tokens_seen": 839870296, + "step": 1848, + "train_runtime": 132981.288, + "train_tokens_per_second": 6315.703 + }, + { + "epoch": 2.238519093067471, + "grad_norm": 0.23457881808280945, + "learning_rate": 5e-06, + "loss": 0.9404, + "num_input_tokens_seen": 840334928, + "step": 1849, + "train_runtime": 133049.8116, + "train_tokens_per_second": 6315.942 + }, + { + "epoch": 2.2397298498850726, + "grad_norm": 0.22963935136795044, + "learning_rate": 5e-06, + "loss": 0.9318, + "num_input_tokens_seen": 840803472, + "step": 1850, + "train_runtime": 133118.9729, + "train_tokens_per_second": 6316.181 + }, + { + "epoch": 2.240940606702674, + "grad_norm": 0.24061468243598938, + "learning_rate": 5e-06, + "loss": 0.896, + "num_input_tokens_seen": 841253104, + "step": 1851, + "train_runtime": 133185.6102, + "train_tokens_per_second": 6316.396 + }, + { + "epoch": 2.2421513635202754, + "grad_norm": 0.28269779682159424, + "learning_rate": 5e-06, + "loss": 0.8956, + "num_input_tokens_seen": 841712024, + "step": 1852, + "train_runtime": 133253.9849, + "train_tokens_per_second": 6316.599 + }, + { + "epoch": 2.243362120337877, + "grad_norm": 0.2352578043937683, + "learning_rate": 5e-06, + "loss": 0.9073, + "num_input_tokens_seen": 842172480, + "step": 1853, + "train_runtime": 133322.1946, + "train_tokens_per_second": 6316.821 + }, + { + "epoch": 2.2445728771554783, + "grad_norm": 0.24535781145095825, + "learning_rate": 5e-06, + "loss": 0.9765, + "num_input_tokens_seen": 842620904, + "step": 1854, + "train_runtime": 133388.3394, + "train_tokens_per_second": 6317.051 + }, + { + "epoch": 2.2457836339730797, + "grad_norm": 0.24296994507312775, + "learning_rate": 5e-06, + "loss": 0.931, + "num_input_tokens_seen": 843068456, + "step": 1855, + "train_runtime": 133454.0409, + "train_tokens_per_second": 6317.294 + }, + { + "epoch": 2.246994390790681, + "grad_norm": 0.24628207087516785, + "learning_rate": 5e-06, + "loss": 0.9485, + "num_input_tokens_seen": 843536640, + "step": 1856, + "train_runtime": 133523.3347, + "train_tokens_per_second": 6317.522 + }, + { + "epoch": 2.2482051476082825, + "grad_norm": 0.2603435814380646, + "learning_rate": 5e-06, + "loss": 0.8639, + "num_input_tokens_seen": 843998264, + "step": 1857, + "train_runtime": 133591.3388, + "train_tokens_per_second": 6317.762 + }, + { + "epoch": 2.249415904425884, + "grad_norm": 0.2735736072063446, + "learning_rate": 5e-06, + "loss": 0.9561, + "num_input_tokens_seen": 844454984, + "step": 1858, + "train_runtime": 133659.0053, + "train_tokens_per_second": 6317.98 + }, + { + "epoch": 2.2506266612434853, + "grad_norm": 0.25031837821006775, + "learning_rate": 5e-06, + "loss": 0.9065, + "num_input_tokens_seen": 844918424, + "step": 1859, + "train_runtime": 133727.5797, + "train_tokens_per_second": 6318.206 + }, + { + "epoch": 2.2518374180610863, + "grad_norm": 0.24365690350532532, + "learning_rate": 5e-06, + "loss": 0.9319, + "num_input_tokens_seen": 845370456, + "step": 1860, + "train_runtime": 133794.1892, + "train_tokens_per_second": 6318.439 + }, + { + "epoch": 2.2530481748786877, + "grad_norm": 0.23625266551971436, + "learning_rate": 5e-06, + "loss": 0.9334, + "num_input_tokens_seen": 845840944, + "step": 1861, + "train_runtime": 133864.0008, + "train_tokens_per_second": 6318.659 + }, + { + "epoch": 2.254258931696289, + "grad_norm": 0.2634667456150055, + "learning_rate": 5e-06, + "loss": 0.9272, + "num_input_tokens_seen": 846290816, + "step": 1862, + "train_runtime": 133930.3378, + "train_tokens_per_second": 6318.888 + }, + { + "epoch": 2.2554696885138905, + "grad_norm": 0.2611207067966461, + "learning_rate": 5e-06, + "loss": 0.9475, + "num_input_tokens_seen": 846744432, + "step": 1863, + "train_runtime": 133997.2146, + "train_tokens_per_second": 6319.12 + }, + { + "epoch": 2.256680445331492, + "grad_norm": 0.2601044178009033, + "learning_rate": 5e-06, + "loss": 0.9298, + "num_input_tokens_seen": 847201376, + "step": 1864, + "train_runtime": 134064.8678, + "train_tokens_per_second": 6319.339 + }, + { + "epoch": 2.2578912021490933, + "grad_norm": 0.24679550528526306, + "learning_rate": 5e-06, + "loss": 0.9705, + "num_input_tokens_seen": 847641792, + "step": 1865, + "train_runtime": 134129.2413, + "train_tokens_per_second": 6319.59 + }, + { + "epoch": 2.2591019589666947, + "grad_norm": 0.23708128929138184, + "learning_rate": 5e-06, + "loss": 0.9267, + "num_input_tokens_seen": 848110896, + "step": 1866, + "train_runtime": 134198.5814, + "train_tokens_per_second": 6319.82 + }, + { + "epoch": 2.260312715784296, + "grad_norm": 0.2722652554512024, + "learning_rate": 5e-06, + "loss": 0.9508, + "num_input_tokens_seen": 848564368, + "step": 1867, + "train_runtime": 134265.8682, + "train_tokens_per_second": 6320.03 + }, + { + "epoch": 2.2615234726018976, + "grad_norm": 0.2940795123577118, + "learning_rate": 5e-06, + "loss": 0.9672, + "num_input_tokens_seen": 849026776, + "step": 1868, + "train_runtime": 134334.2382, + "train_tokens_per_second": 6320.256 + }, + { + "epoch": 2.262734229419499, + "grad_norm": 0.22633950412273407, + "learning_rate": 5e-06, + "loss": 0.9377, + "num_input_tokens_seen": 849482712, + "step": 1869, + "train_runtime": 134401.8248, + "train_tokens_per_second": 6320.47 + }, + { + "epoch": 2.2639449862371004, + "grad_norm": 0.24709929525852203, + "learning_rate": 5e-06, + "loss": 0.9169, + "num_input_tokens_seen": 849939544, + "step": 1870, + "train_runtime": 134469.2516, + "train_tokens_per_second": 6320.698 + }, + { + "epoch": 2.265155743054702, + "grad_norm": 0.2768784463405609, + "learning_rate": 5e-06, + "loss": 0.904, + "num_input_tokens_seen": 850413688, + "step": 1871, + "train_runtime": 134539.3239, + "train_tokens_per_second": 6320.93 + }, + { + "epoch": 2.266366499872303, + "grad_norm": 0.24461229145526886, + "learning_rate": 5e-06, + "loss": 0.9515, + "num_input_tokens_seen": 850865528, + "step": 1872, + "train_runtime": 134605.9731, + "train_tokens_per_second": 6321.157 + }, + { + "epoch": 2.267577256689904, + "grad_norm": 0.282145619392395, + "learning_rate": 5e-06, + "loss": 0.9657, + "num_input_tokens_seen": 851305456, + "step": 1873, + "train_runtime": 134670.8221, + "train_tokens_per_second": 6321.38 + }, + { + "epoch": 2.2687880135075056, + "grad_norm": 0.24732042849063873, + "learning_rate": 5e-06, + "loss": 0.9111, + "num_input_tokens_seen": 851761888, + "step": 1874, + "train_runtime": 134738.0896, + "train_tokens_per_second": 6321.612 + }, + { + "epoch": 2.269998770325107, + "grad_norm": 0.22736340761184692, + "learning_rate": 5e-06, + "loss": 0.9206, + "num_input_tokens_seen": 852221120, + "step": 1875, + "train_runtime": 134806.452, + "train_tokens_per_second": 6321.813 + }, + { + "epoch": 2.2712095271427084, + "grad_norm": 0.2657550275325775, + "learning_rate": 5e-06, + "loss": 0.9847, + "num_input_tokens_seen": 852692200, + "step": 1876, + "train_runtime": 134875.6621, + "train_tokens_per_second": 6322.061 + }, + { + "epoch": 2.27242028396031, + "grad_norm": 0.2386472225189209, + "learning_rate": 5e-06, + "loss": 0.9157, + "num_input_tokens_seen": 853149952, + "step": 1877, + "train_runtime": 134943.8512, + "train_tokens_per_second": 6322.259 + }, + { + "epoch": 2.2736310407779112, + "grad_norm": 0.2535218298435211, + "learning_rate": 5e-06, + "loss": 0.9699, + "num_input_tokens_seen": 853585528, + "step": 1878, + "train_runtime": 135007.8471, + "train_tokens_per_second": 6322.488 + }, + { + "epoch": 2.2748417975955126, + "grad_norm": 0.2574761211872101, + "learning_rate": 5e-06, + "loss": 0.9425, + "num_input_tokens_seen": 854028664, + "step": 1879, + "train_runtime": 135073.4954, + "train_tokens_per_second": 6322.696 + }, + { + "epoch": 2.276052554413114, + "grad_norm": 0.25591275095939636, + "learning_rate": 5e-06, + "loss": 0.9834, + "num_input_tokens_seen": 854480232, + "step": 1880, + "train_runtime": 135141.1933, + "train_tokens_per_second": 6322.87 + }, + { + "epoch": 2.2772633112307155, + "grad_norm": 0.2474929392337799, + "learning_rate": 5e-06, + "loss": 0.9189, + "num_input_tokens_seen": 854957440, + "step": 1881, + "train_runtime": 135213.5472, + "train_tokens_per_second": 6323.016 + }, + { + "epoch": 2.278474068048317, + "grad_norm": 0.24820934236049652, + "learning_rate": 5e-06, + "loss": 0.9193, + "num_input_tokens_seen": 855410112, + "step": 1882, + "train_runtime": 135280.3287, + "train_tokens_per_second": 6323.241 + }, + { + "epoch": 2.2796848248659183, + "grad_norm": 0.25758039951324463, + "learning_rate": 5e-06, + "loss": 0.9387, + "num_input_tokens_seen": 855843248, + "step": 1883, + "train_runtime": 135343.7932, + "train_tokens_per_second": 6323.476 + }, + { + "epoch": 2.2808955816835197, + "grad_norm": 0.32192301750183105, + "learning_rate": 5e-06, + "loss": 0.9572, + "num_input_tokens_seen": 856285992, + "step": 1884, + "train_runtime": 135408.6894, + "train_tokens_per_second": 6323.715 + }, + { + "epoch": 2.282106338501121, + "grad_norm": 0.2613389194011688, + "learning_rate": 5e-06, + "loss": 0.9244, + "num_input_tokens_seen": 856720480, + "step": 1885, + "train_runtime": 135473.0216, + "train_tokens_per_second": 6323.919 + }, + { + "epoch": 2.283317095318722, + "grad_norm": 0.2691548764705658, + "learning_rate": 5e-06, + "loss": 0.9155, + "num_input_tokens_seen": 857171960, + "step": 1886, + "train_runtime": 135539.9108, + "train_tokens_per_second": 6324.13 + }, + { + "epoch": 2.2845278521363235, + "grad_norm": 0.2469540685415268, + "learning_rate": 5e-06, + "loss": 0.914, + "num_input_tokens_seen": 857629224, + "step": 1887, + "train_runtime": 135607.533, + "train_tokens_per_second": 6324.348 + }, + { + "epoch": 2.285738608953925, + "grad_norm": 0.24443942308425903, + "learning_rate": 5e-06, + "loss": 0.8825, + "num_input_tokens_seen": 858109800, + "step": 1888, + "train_runtime": 135678.9773, + "train_tokens_per_second": 6324.56 + }, + { + "epoch": 2.2869493657715263, + "grad_norm": 0.2294890135526657, + "learning_rate": 5e-06, + "loss": 0.9354, + "num_input_tokens_seen": 858577656, + "step": 1889, + "train_runtime": 135748.465, + "train_tokens_per_second": 6324.769 + }, + { + "epoch": 2.2881601225891277, + "grad_norm": 0.23962783813476562, + "learning_rate": 5e-06, + "loss": 0.9559, + "num_input_tokens_seen": 859030552, + "step": 1890, + "train_runtime": 135815.0908, + "train_tokens_per_second": 6325.001 + }, + { + "epoch": 2.289370879406729, + "grad_norm": 0.28975754976272583, + "learning_rate": 5e-06, + "loss": 0.9192, + "num_input_tokens_seen": 859488344, + "step": 1891, + "train_runtime": 135882.7476, + "train_tokens_per_second": 6325.221 + }, + { + "epoch": 2.2905816362243305, + "grad_norm": 0.23890255391597748, + "learning_rate": 5e-06, + "loss": 0.8657, + "num_input_tokens_seen": 859934904, + "step": 1892, + "train_runtime": 135949.0563, + "train_tokens_per_second": 6325.42 + }, + { + "epoch": 2.291792393041932, + "grad_norm": 0.25363996624946594, + "learning_rate": 5e-06, + "loss": 0.9056, + "num_input_tokens_seen": 860406744, + "step": 1893, + "train_runtime": 136018.9588, + "train_tokens_per_second": 6325.638 + }, + { + "epoch": 2.2930031498595334, + "grad_norm": 0.26004326343536377, + "learning_rate": 5e-06, + "loss": 0.9289, + "num_input_tokens_seen": 860848024, + "step": 1894, + "train_runtime": 136084.1855, + "train_tokens_per_second": 6325.849 + }, + { + "epoch": 2.2942139066771348, + "grad_norm": 0.31975099444389343, + "learning_rate": 5e-06, + "loss": 0.9069, + "num_input_tokens_seen": 861297312, + "step": 1895, + "train_runtime": 136150.7509, + "train_tokens_per_second": 6326.056 + }, + { + "epoch": 2.295424663494736, + "grad_norm": 0.2796708345413208, + "learning_rate": 5e-06, + "loss": 0.8722, + "num_input_tokens_seen": 861726552, + "step": 1896, + "train_runtime": 136214.2119, + "train_tokens_per_second": 6326.26 + }, + { + "epoch": 2.2966354203123376, + "grad_norm": 0.32552340626716614, + "learning_rate": 5e-06, + "loss": 0.9589, + "num_input_tokens_seen": 862175240, + "step": 1897, + "train_runtime": 136280.4971, + "train_tokens_per_second": 6326.476 + }, + { + "epoch": 2.297846177129939, + "grad_norm": 0.2615937292575836, + "learning_rate": 5e-06, + "loss": 0.8747, + "num_input_tokens_seen": 862622672, + "step": 1898, + "train_runtime": 136346.7206, + "train_tokens_per_second": 6326.684 + }, + { + "epoch": 2.29905693394754, + "grad_norm": 0.27208948135375977, + "learning_rate": 5e-06, + "loss": 0.9689, + "num_input_tokens_seen": 863047272, + "step": 1899, + "train_runtime": 136409.4794, + "train_tokens_per_second": 6326.886 + }, + { + "epoch": 2.3002676907651414, + "grad_norm": 0.2440728098154068, + "learning_rate": 5e-06, + "loss": 0.9092, + "num_input_tokens_seen": 863494008, + "step": 1900, + "train_runtime": 136475.8625, + "train_tokens_per_second": 6327.082 + }, + { + "epoch": 2.301478447582743, + "grad_norm": 0.24035605788230896, + "learning_rate": 5e-06, + "loss": 0.9087, + "num_input_tokens_seen": 863952264, + "step": 1901, + "train_runtime": 136543.8991, + "train_tokens_per_second": 6327.286 + }, + { + "epoch": 2.302689204400344, + "grad_norm": 0.32341066002845764, + "learning_rate": 5e-06, + "loss": 0.8986, + "num_input_tokens_seen": 864426280, + "step": 1902, + "train_runtime": 136614.3437, + "train_tokens_per_second": 6327.493 + }, + { + "epoch": 2.3038999612179456, + "grad_norm": 0.28295764327049255, + "learning_rate": 5e-06, + "loss": 0.9257, + "num_input_tokens_seen": 864905968, + "step": 1903, + "train_runtime": 136684.861, + "train_tokens_per_second": 6327.738 + }, + { + "epoch": 2.305110718035547, + "grad_norm": 0.2590475380420685, + "learning_rate": 5e-06, + "loss": 0.9493, + "num_input_tokens_seen": 865342848, + "step": 1904, + "train_runtime": 136749.3564, + "train_tokens_per_second": 6327.948 + }, + { + "epoch": 2.3063214748531484, + "grad_norm": 0.25882232189178467, + "learning_rate": 5e-06, + "loss": 0.8907, + "num_input_tokens_seen": 865794736, + "step": 1905, + "train_runtime": 136815.6825, + "train_tokens_per_second": 6328.183 + }, + { + "epoch": 2.30753223167075, + "grad_norm": 0.26275938749313354, + "learning_rate": 5e-06, + "loss": 0.9502, + "num_input_tokens_seen": 866250416, + "step": 1906, + "train_runtime": 136883.0305, + "train_tokens_per_second": 6328.399 + }, + { + "epoch": 2.3087429884883512, + "grad_norm": 0.2927948236465454, + "learning_rate": 5e-06, + "loss": 0.8985, + "num_input_tokens_seen": 866709976, + "step": 1907, + "train_runtime": 136951.2483, + "train_tokens_per_second": 6328.602 + }, + { + "epoch": 2.3099537453059527, + "grad_norm": 0.2844955027103424, + "learning_rate": 5e-06, + "loss": 0.932, + "num_input_tokens_seen": 867193176, + "step": 1908, + "train_runtime": 137022.6963, + "train_tokens_per_second": 6328.829 + }, + { + "epoch": 2.311164502123554, + "grad_norm": 0.2642100155353546, + "learning_rate": 5e-06, + "loss": 0.9547, + "num_input_tokens_seen": 867661560, + "step": 1909, + "train_runtime": 137092.1567, + "train_tokens_per_second": 6329.039 + }, + { + "epoch": 2.3123752589411555, + "grad_norm": 0.2718662917613983, + "learning_rate": 5e-06, + "loss": 0.9502, + "num_input_tokens_seen": 868124000, + "step": 1910, + "train_runtime": 137160.7427, + "train_tokens_per_second": 6329.245 + }, + { + "epoch": 2.313586015758757, + "grad_norm": 0.2536037862300873, + "learning_rate": 5e-06, + "loss": 0.9193, + "num_input_tokens_seen": 868588696, + "step": 1911, + "train_runtime": 137229.9509, + "train_tokens_per_second": 6329.44 + }, + { + "epoch": 2.314796772576358, + "grad_norm": 0.249566450715065, + "learning_rate": 5e-06, + "loss": 0.949, + "num_input_tokens_seen": 869053760, + "step": 1912, + "train_runtime": 137299.3417, + "train_tokens_per_second": 6329.628 + }, + { + "epoch": 2.3160075293939597, + "grad_norm": 0.262437105178833, + "learning_rate": 5e-06, + "loss": 0.9716, + "num_input_tokens_seen": 869515720, + "step": 1913, + "train_runtime": 137367.5168, + "train_tokens_per_second": 6329.85 + }, + { + "epoch": 2.3172182862115607, + "grad_norm": 0.23782069981098175, + "learning_rate": 5e-06, + "loss": 0.9011, + "num_input_tokens_seen": 869971160, + "step": 1914, + "train_runtime": 137434.8916, + "train_tokens_per_second": 6330.06 + }, + { + "epoch": 2.318429043029162, + "grad_norm": 0.2517566978931427, + "learning_rate": 5e-06, + "loss": 0.91, + "num_input_tokens_seen": 870399880, + "step": 1915, + "train_runtime": 137497.6211, + "train_tokens_per_second": 6330.29 + }, + { + "epoch": 2.3196397998467635, + "grad_norm": 0.26012682914733887, + "learning_rate": 5e-06, + "loss": 0.9396, + "num_input_tokens_seen": 870853040, + "step": 1916, + "train_runtime": 137564.414, + "train_tokens_per_second": 6330.511 + }, + { + "epoch": 2.320850556664365, + "grad_norm": 0.24836315214633942, + "learning_rate": 5e-06, + "loss": 1.01, + "num_input_tokens_seen": 871314416, + "step": 1917, + "train_runtime": 137632.4092, + "train_tokens_per_second": 6330.736 + }, + { + "epoch": 2.3220613134819663, + "grad_norm": 0.26975148916244507, + "learning_rate": 5e-06, + "loss": 0.9313, + "num_input_tokens_seen": 871770264, + "step": 1918, + "train_runtime": 137699.9741, + "train_tokens_per_second": 6330.94 + }, + { + "epoch": 2.3232720702995677, + "grad_norm": 0.23591186106204987, + "learning_rate": 5e-06, + "loss": 0.9242, + "num_input_tokens_seen": 872229016, + "step": 1919, + "train_runtime": 137767.7326, + "train_tokens_per_second": 6331.156 + }, + { + "epoch": 2.324482827117169, + "grad_norm": 0.24724294245243073, + "learning_rate": 5e-06, + "loss": 0.9166, + "num_input_tokens_seen": 872653144, + "step": 1920, + "train_runtime": 137830.5002, + "train_tokens_per_second": 6331.35 + }, + { + "epoch": 2.3256935839347705, + "grad_norm": 0.23832382261753082, + "learning_rate": 5e-06, + "loss": 0.9463, + "num_input_tokens_seen": 873110384, + "step": 1921, + "train_runtime": 137898.239, + "train_tokens_per_second": 6331.556 + }, + { + "epoch": 2.326904340752372, + "grad_norm": 0.28097233176231384, + "learning_rate": 5e-06, + "loss": 0.9624, + "num_input_tokens_seen": 873570488, + "step": 1922, + "train_runtime": 137966.7648, + "train_tokens_per_second": 6331.746 + }, + { + "epoch": 2.3281150975699734, + "grad_norm": 0.23570659756660461, + "learning_rate": 5e-06, + "loss": 0.9168, + "num_input_tokens_seen": 874019688, + "step": 1923, + "train_runtime": 138032.8862, + "train_tokens_per_second": 6331.967 + }, + { + "epoch": 2.329325854387575, + "grad_norm": 0.2484421581029892, + "learning_rate": 5e-06, + "loss": 0.9224, + "num_input_tokens_seen": 874476456, + "step": 1924, + "train_runtime": 138100.3145, + "train_tokens_per_second": 6332.183 + }, + { + "epoch": 2.3305366112051757, + "grad_norm": 0.2436489313840866, + "learning_rate": 5e-06, + "loss": 0.9769, + "num_input_tokens_seen": 874906120, + "step": 1925, + "train_runtime": 138163.392, + "train_tokens_per_second": 6332.402 + }, + { + "epoch": 2.3317473680227776, + "grad_norm": 0.23818077147006989, + "learning_rate": 5e-06, + "loss": 0.9719, + "num_input_tokens_seen": 875359880, + "step": 1926, + "train_runtime": 138230.2722, + "train_tokens_per_second": 6332.621 + }, + { + "epoch": 2.3329581248403786, + "grad_norm": 0.2646999955177307, + "learning_rate": 5e-06, + "loss": 0.9759, + "num_input_tokens_seen": 875813712, + "step": 1927, + "train_runtime": 138297.4023, + "train_tokens_per_second": 6332.828 + }, + { + "epoch": 2.33416888165798, + "grad_norm": 0.24218083918094635, + "learning_rate": 5e-06, + "loss": 0.9444, + "num_input_tokens_seen": 876275792, + "step": 1928, + "train_runtime": 138366.0473, + "train_tokens_per_second": 6333.026 + }, + { + "epoch": 2.3353796384755814, + "grad_norm": 0.22336937487125397, + "learning_rate": 5e-06, + "loss": 0.8828, + "num_input_tokens_seen": 876756576, + "step": 1929, + "train_runtime": 138437.719, + "train_tokens_per_second": 6333.22 + }, + { + "epoch": 2.336590395293183, + "grad_norm": 0.21716539561748505, + "learning_rate": 5e-06, + "loss": 0.9001, + "num_input_tokens_seen": 877237832, + "step": 1930, + "train_runtime": 138508.8901, + "train_tokens_per_second": 6333.441 + }, + { + "epoch": 2.337801152110784, + "grad_norm": 0.24788719415664673, + "learning_rate": 5e-06, + "loss": 0.9169, + "num_input_tokens_seen": 877681584, + "step": 1931, + "train_runtime": 138574.6608, + "train_tokens_per_second": 6333.637 + }, + { + "epoch": 2.3390119089283856, + "grad_norm": 0.2476462423801422, + "learning_rate": 5e-06, + "loss": 0.9653, + "num_input_tokens_seen": 878125368, + "step": 1932, + "train_runtime": 138639.9943, + "train_tokens_per_second": 6333.853 + }, + { + "epoch": 2.340222665745987, + "grad_norm": 0.26290398836135864, + "learning_rate": 5e-06, + "loss": 0.9275, + "num_input_tokens_seen": 878571608, + "step": 1933, + "train_runtime": 138706.0769, + "train_tokens_per_second": 6334.053 + }, + { + "epoch": 2.3414334225635884, + "grad_norm": 0.26040390133857727, + "learning_rate": 5e-06, + "loss": 0.9176, + "num_input_tokens_seen": 879012848, + "step": 1934, + "train_runtime": 138772.6778, + "train_tokens_per_second": 6334.192 + }, + { + "epoch": 2.34264417938119, + "grad_norm": 0.22445742785930634, + "learning_rate": 5e-06, + "loss": 0.8518, + "num_input_tokens_seen": 879501408, + "step": 1935, + "train_runtime": 138846.372, + "train_tokens_per_second": 6334.349 + }, + { + "epoch": 2.3438549361987913, + "grad_norm": 0.2317107766866684, + "learning_rate": 5e-06, + "loss": 0.9035, + "num_input_tokens_seen": 879957520, + "step": 1936, + "train_runtime": 138913.7202, + "train_tokens_per_second": 6334.562 + }, + { + "epoch": 2.3450656930163927, + "grad_norm": 0.2713346481323242, + "learning_rate": 5e-06, + "loss": 0.9087, + "num_input_tokens_seen": 880411232, + "step": 1937, + "train_runtime": 138981.0054, + "train_tokens_per_second": 6334.759 + }, + { + "epoch": 2.346276449833994, + "grad_norm": 0.24011683464050293, + "learning_rate": 5e-06, + "loss": 0.9394, + "num_input_tokens_seen": 880860792, + "step": 1938, + "train_runtime": 139047.4125, + "train_tokens_per_second": 6334.967 + }, + { + "epoch": 2.3474872066515955, + "grad_norm": 0.2560282349586487, + "learning_rate": 5e-06, + "loss": 0.9062, + "num_input_tokens_seen": 881332280, + "step": 1939, + "train_runtime": 139117.5608, + "train_tokens_per_second": 6335.162 + }, + { + "epoch": 2.3486979634691965, + "grad_norm": 0.23384442925453186, + "learning_rate": 5e-06, + "loss": 0.879, + "num_input_tokens_seen": 881765136, + "step": 1940, + "train_runtime": 139181.4369, + "train_tokens_per_second": 6335.365 + }, + { + "epoch": 2.349908720286798, + "grad_norm": 0.23254314064979553, + "learning_rate": 5e-06, + "loss": 0.8938, + "num_input_tokens_seen": 882225136, + "step": 1941, + "train_runtime": 139249.4601, + "train_tokens_per_second": 6335.573 + }, + { + "epoch": 2.3511194771043993, + "grad_norm": 0.2877858281135559, + "learning_rate": 5e-06, + "loss": 0.9639, + "num_input_tokens_seen": 882660080, + "step": 1942, + "train_runtime": 139313.9834, + "train_tokens_per_second": 6335.761 + }, + { + "epoch": 2.3523302339220007, + "grad_norm": 0.24326159060001373, + "learning_rate": 5e-06, + "loss": 0.9099, + "num_input_tokens_seen": 883082800, + "step": 1943, + "train_runtime": 139376.2313, + "train_tokens_per_second": 6335.964 + }, + { + "epoch": 2.353540990739602, + "grad_norm": 0.23450767993927002, + "learning_rate": 5e-06, + "loss": 0.9258, + "num_input_tokens_seen": 883532720, + "step": 1944, + "train_runtime": 139443.249, + "train_tokens_per_second": 6336.146 + }, + { + "epoch": 2.3547517475572035, + "grad_norm": 0.25885146856307983, + "learning_rate": 5e-06, + "loss": 0.8931, + "num_input_tokens_seen": 883983312, + "step": 1945, + "train_runtime": 139509.936, + "train_tokens_per_second": 6336.347 + }, + { + "epoch": 2.355962504374805, + "grad_norm": 0.23597835004329681, + "learning_rate": 5e-06, + "loss": 0.9222, + "num_input_tokens_seen": 884453544, + "step": 1946, + "train_runtime": 139579.5403, + "train_tokens_per_second": 6336.556 + }, + { + "epoch": 2.3571732611924063, + "grad_norm": 0.2448599487543106, + "learning_rate": 5e-06, + "loss": 0.9685, + "num_input_tokens_seen": 884895312, + "step": 1947, + "train_runtime": 139645.5434, + "train_tokens_per_second": 6336.724 + }, + { + "epoch": 2.3583840180100077, + "grad_norm": 0.25267136096954346, + "learning_rate": 5e-06, + "loss": 0.9375, + "num_input_tokens_seen": 885354208, + "step": 1948, + "train_runtime": 139713.3438, + "train_tokens_per_second": 6336.934 + }, + { + "epoch": 2.359594774827609, + "grad_norm": 0.2259773463010788, + "learning_rate": 5e-06, + "loss": 0.8995, + "num_input_tokens_seen": 885828656, + "step": 1949, + "train_runtime": 139783.4906, + "train_tokens_per_second": 6337.148 + }, + { + "epoch": 2.3608055316452106, + "grad_norm": 0.25038328766822815, + "learning_rate": 5e-06, + "loss": 0.9297, + "num_input_tokens_seen": 886262736, + "step": 1950, + "train_runtime": 139847.7611, + "train_tokens_per_second": 6337.339 + }, + { + "epoch": 2.362016288462812, + "grad_norm": 0.23766860365867615, + "learning_rate": 5e-06, + "loss": 0.8919, + "num_input_tokens_seen": 886724640, + "step": 1951, + "train_runtime": 139916.2008, + "train_tokens_per_second": 6337.541 + }, + { + "epoch": 2.3632270452804134, + "grad_norm": 0.2303091287612915, + "learning_rate": 5e-06, + "loss": 0.8914, + "num_input_tokens_seen": 887164120, + "step": 1952, + "train_runtime": 139981.2882, + "train_tokens_per_second": 6337.734 + }, + { + "epoch": 2.3644378020980144, + "grad_norm": 0.24684786796569824, + "learning_rate": 5e-06, + "loss": 0.8894, + "num_input_tokens_seen": 887620064, + "step": 1953, + "train_runtime": 140048.5002, + "train_tokens_per_second": 6337.948 + }, + { + "epoch": 2.3656485589156158, + "grad_norm": 0.2284991294145584, + "learning_rate": 5e-06, + "loss": 0.9494, + "num_input_tokens_seen": 888070904, + "step": 1954, + "train_runtime": 140114.8455, + "train_tokens_per_second": 6338.164 + }, + { + "epoch": 2.366859315733217, + "grad_norm": 0.254375159740448, + "learning_rate": 5e-06, + "loss": 0.8982, + "num_input_tokens_seen": 888519704, + "step": 1955, + "train_runtime": 140181.3735, + "train_tokens_per_second": 6338.358 + }, + { + "epoch": 2.3680700725508186, + "grad_norm": 0.2587945759296417, + "learning_rate": 5e-06, + "loss": 0.926, + "num_input_tokens_seen": 888953576, + "step": 1956, + "train_runtime": 140245.6256, + "train_tokens_per_second": 6338.548 + }, + { + "epoch": 2.36928082936842, + "grad_norm": 0.263895183801651, + "learning_rate": 5e-06, + "loss": 0.8845, + "num_input_tokens_seen": 889422280, + "step": 1957, + "train_runtime": 140315.4592, + "train_tokens_per_second": 6338.733 + }, + { + "epoch": 2.3704915861860214, + "grad_norm": 0.22773973643779755, + "learning_rate": 5e-06, + "loss": 0.8691, + "num_input_tokens_seen": 889895248, + "step": 1958, + "train_runtime": 140386.1495, + "train_tokens_per_second": 6338.911 + }, + { + "epoch": 2.371702343003623, + "grad_norm": 0.26075223088264465, + "learning_rate": 5e-06, + "loss": 1.0117, + "num_input_tokens_seen": 890333472, + "step": 1959, + "train_runtime": 140450.4448, + "train_tokens_per_second": 6339.129 + }, + { + "epoch": 2.3729130998212242, + "grad_norm": 0.2427862286567688, + "learning_rate": 5e-06, + "loss": 0.9224, + "num_input_tokens_seen": 890779816, + "step": 1960, + "train_runtime": 140516.3711, + "train_tokens_per_second": 6339.331 + }, + { + "epoch": 2.3741238566388256, + "grad_norm": 0.24546240270137787, + "learning_rate": 5e-06, + "loss": 0.9299, + "num_input_tokens_seen": 891243432, + "step": 1961, + "train_runtime": 140584.7256, + "train_tokens_per_second": 6339.547 + }, + { + "epoch": 2.375334613456427, + "grad_norm": 0.24161502718925476, + "learning_rate": 5e-06, + "loss": 0.9268, + "num_input_tokens_seen": 891695152, + "step": 1962, + "train_runtime": 140651.3388, + "train_tokens_per_second": 6339.756 + }, + { + "epoch": 2.3765453702740285, + "grad_norm": 0.25115856528282166, + "learning_rate": 5e-06, + "loss": 0.9467, + "num_input_tokens_seen": 892133512, + "step": 1963, + "train_runtime": 140716.2837, + "train_tokens_per_second": 6339.945 + }, + { + "epoch": 2.37775612709163, + "grad_norm": 0.2623535692691803, + "learning_rate": 5e-06, + "loss": 0.9482, + "num_input_tokens_seen": 892581496, + "step": 1964, + "train_runtime": 140782.484, + "train_tokens_per_second": 6340.146 + }, + { + "epoch": 2.3789668839092313, + "grad_norm": 0.2622727155685425, + "learning_rate": 5e-06, + "loss": 0.9253, + "num_input_tokens_seen": 893044504, + "step": 1965, + "train_runtime": 140850.9639, + "train_tokens_per_second": 6340.351 + }, + { + "epoch": 2.3801776407268322, + "grad_norm": 0.2433587610721588, + "learning_rate": 5e-06, + "loss": 0.9352, + "num_input_tokens_seen": 893498056, + "step": 1966, + "train_runtime": 140918.3136, + "train_tokens_per_second": 6340.539 + }, + { + "epoch": 2.3813883975444337, + "grad_norm": 0.25335007905960083, + "learning_rate": 5e-06, + "loss": 0.9527, + "num_input_tokens_seen": 893927120, + "step": 1967, + "train_runtime": 140981.7443, + "train_tokens_per_second": 6340.73 + }, + { + "epoch": 2.382599154362035, + "grad_norm": 0.24073803424835205, + "learning_rate": 5e-06, + "loss": 0.964, + "num_input_tokens_seen": 894397904, + "step": 1968, + "train_runtime": 141051.8869, + "train_tokens_per_second": 6340.914 + }, + { + "epoch": 2.3838099111796365, + "grad_norm": 0.2417605221271515, + "learning_rate": 5e-06, + "loss": 0.8975, + "num_input_tokens_seen": 894866064, + "step": 1969, + "train_runtime": 141121.0337, + "train_tokens_per_second": 6341.125 + }, + { + "epoch": 2.385020667997238, + "grad_norm": 0.264097660779953, + "learning_rate": 5e-06, + "loss": 0.8888, + "num_input_tokens_seen": 895325256, + "step": 1970, + "train_runtime": 141188.7929, + "train_tokens_per_second": 6341.334 + }, + { + "epoch": 2.3862314248148393, + "grad_norm": 0.23401835560798645, + "learning_rate": 5e-06, + "loss": 0.914, + "num_input_tokens_seen": 895793728, + "step": 1971, + "train_runtime": 141257.3012, + "train_tokens_per_second": 6341.575 + }, + { + "epoch": 2.3874421816324407, + "grad_norm": 0.23458734154701233, + "learning_rate": 5e-06, + "loss": 0.9237, + "num_input_tokens_seen": 896231368, + "step": 1972, + "train_runtime": 141321.9812, + "train_tokens_per_second": 6341.769 + }, + { + "epoch": 2.388652938450042, + "grad_norm": 0.24703070521354675, + "learning_rate": 5e-06, + "loss": 0.8911, + "num_input_tokens_seen": 896673912, + "step": 1973, + "train_runtime": 141387.431, + "train_tokens_per_second": 6341.963 + }, + { + "epoch": 2.3898636952676435, + "grad_norm": 0.2715272009372711, + "learning_rate": 5e-06, + "loss": 0.9271, + "num_input_tokens_seen": 897130160, + "step": 1974, + "train_runtime": 141455.1371, + "train_tokens_per_second": 6342.153 + }, + { + "epoch": 2.391074452085245, + "grad_norm": 0.27634164690971375, + "learning_rate": 5e-06, + "loss": 0.9441, + "num_input_tokens_seen": 897585568, + "step": 1975, + "train_runtime": 141522.4112, + "train_tokens_per_second": 6342.356 + }, + { + "epoch": 2.3922852089028463, + "grad_norm": 0.2545999586582184, + "learning_rate": 5e-06, + "loss": 0.9379, + "num_input_tokens_seen": 898024864, + "step": 1976, + "train_runtime": 141587.355, + "train_tokens_per_second": 6342.55 + }, + { + "epoch": 2.3934959657204478, + "grad_norm": 0.23588921129703522, + "learning_rate": 5e-06, + "loss": 0.8916, + "num_input_tokens_seen": 898487168, + "step": 1977, + "train_runtime": 141655.6271, + "train_tokens_per_second": 6342.757 + }, + { + "epoch": 2.394706722538049, + "grad_norm": 0.28697672486305237, + "learning_rate": 5e-06, + "loss": 0.9252, + "num_input_tokens_seen": 898948576, + "step": 1978, + "train_runtime": 141724.1485, + "train_tokens_per_second": 6342.946 + }, + { + "epoch": 2.39591747935565, + "grad_norm": 0.2565509080886841, + "learning_rate": 5e-06, + "loss": 0.8936, + "num_input_tokens_seen": 899411768, + "step": 1979, + "train_runtime": 141793.0536, + "train_tokens_per_second": 6343.13 + }, + { + "epoch": 2.3971282361732515, + "grad_norm": 0.25218653678894043, + "learning_rate": 5e-06, + "loss": 0.9299, + "num_input_tokens_seen": 899887136, + "step": 1980, + "train_runtime": 141862.8968, + "train_tokens_per_second": 6343.358 + }, + { + "epoch": 2.398338992990853, + "grad_norm": 0.25139865279197693, + "learning_rate": 5e-06, + "loss": 0.9023, + "num_input_tokens_seen": 900321552, + "step": 1981, + "train_runtime": 141927.0945, + "train_tokens_per_second": 6343.55 + }, + { + "epoch": 2.3995497498084544, + "grad_norm": 0.2799871265888214, + "learning_rate": 5e-06, + "loss": 0.9437, + "num_input_tokens_seen": 900784368, + "step": 1982, + "train_runtime": 141995.7321, + "train_tokens_per_second": 6343.743 + }, + { + "epoch": 2.400760506626056, + "grad_norm": 0.25518181920051575, + "learning_rate": 5e-06, + "loss": 0.8679, + "num_input_tokens_seen": 901240864, + "step": 1983, + "train_runtime": 142063.228, + "train_tokens_per_second": 6343.942 + }, + { + "epoch": 2.401971263443657, + "grad_norm": 0.28088992834091187, + "learning_rate": 5e-06, + "loss": 0.9436, + "num_input_tokens_seen": 901673304, + "step": 1984, + "train_runtime": 142126.5258, + "train_tokens_per_second": 6344.159 + }, + { + "epoch": 2.4031820202612586, + "grad_norm": 0.23146390914916992, + "learning_rate": 5e-06, + "loss": 0.928, + "num_input_tokens_seen": 902127568, + "step": 1985, + "train_runtime": 142193.8578, + "train_tokens_per_second": 6344.35 + }, + { + "epoch": 2.40439277707886, + "grad_norm": 0.23194481432437897, + "learning_rate": 5e-06, + "loss": 0.9461, + "num_input_tokens_seen": 902577560, + "step": 1986, + "train_runtime": 142260.1684, + "train_tokens_per_second": 6344.556 + }, + { + "epoch": 2.4056035338964614, + "grad_norm": 0.2525422275066376, + "learning_rate": 5e-06, + "loss": 0.9334, + "num_input_tokens_seen": 903038008, + "step": 1987, + "train_runtime": 142328.7983, + "train_tokens_per_second": 6344.731 + }, + { + "epoch": 2.406814290714063, + "grad_norm": 0.258497953414917, + "learning_rate": 5e-06, + "loss": 0.9355, + "num_input_tokens_seen": 903491080, + "step": 1988, + "train_runtime": 142397.3731, + "train_tokens_per_second": 6344.858 + }, + { + "epoch": 2.4080250475316642, + "grad_norm": 0.24086523056030273, + "learning_rate": 5e-06, + "loss": 0.9504, + "num_input_tokens_seen": 903940184, + "step": 1989, + "train_runtime": 142463.8782, + "train_tokens_per_second": 6345.048 + }, + { + "epoch": 2.4092358043492657, + "grad_norm": 0.24795937538146973, + "learning_rate": 5e-06, + "loss": 0.9263, + "num_input_tokens_seen": 904404544, + "step": 1990, + "train_runtime": 142532.6712, + "train_tokens_per_second": 6345.244 + }, + { + "epoch": 2.410446561166867, + "grad_norm": 0.2638545632362366, + "learning_rate": 5e-06, + "loss": 1.017, + "num_input_tokens_seen": 904832936, + "step": 1991, + "train_runtime": 142595.965, + "train_tokens_per_second": 6345.432 + }, + { + "epoch": 2.411657317984468, + "grad_norm": 0.23633180558681488, + "learning_rate": 5e-06, + "loss": 0.9562, + "num_input_tokens_seen": 905310056, + "step": 1992, + "train_runtime": 142666.5754, + "train_tokens_per_second": 6345.635 + }, + { + "epoch": 2.41286807480207, + "grad_norm": 0.2632332444190979, + "learning_rate": 5e-06, + "loss": 0.9153, + "num_input_tokens_seen": 905746048, + "step": 1993, + "train_runtime": 142730.9361, + "train_tokens_per_second": 6345.829 + }, + { + "epoch": 2.414078831619671, + "grad_norm": 0.2594749927520752, + "learning_rate": 5e-06, + "loss": 0.9256, + "num_input_tokens_seen": 906175336, + "step": 1994, + "train_runtime": 142794.2311, + "train_tokens_per_second": 6346.022 + }, + { + "epoch": 2.4152895884372723, + "grad_norm": 0.24447709321975708, + "learning_rate": 5e-06, + "loss": 0.8639, + "num_input_tokens_seen": 906625416, + "step": 1995, + "train_runtime": 142860.8603, + "train_tokens_per_second": 6346.213 + }, + { + "epoch": 2.4165003452548737, + "grad_norm": 0.2638216018676758, + "learning_rate": 5e-06, + "loss": 0.9065, + "num_input_tokens_seen": 907116368, + "step": 1996, + "train_runtime": 142934.0058, + "train_tokens_per_second": 6346.4 + }, + { + "epoch": 2.417711102072475, + "grad_norm": 0.23817259073257446, + "learning_rate": 5e-06, + "loss": 0.9236, + "num_input_tokens_seen": 907590384, + "step": 1997, + "train_runtime": 143004.1952, + "train_tokens_per_second": 6346.6 + }, + { + "epoch": 2.4189218588900765, + "grad_norm": 0.2550632059574127, + "learning_rate": 5e-06, + "loss": 0.967, + "num_input_tokens_seen": 908026040, + "step": 1998, + "train_runtime": 143068.2631, + "train_tokens_per_second": 6346.803 + }, + { + "epoch": 2.420132615707678, + "grad_norm": 0.2464226633310318, + "learning_rate": 5e-06, + "loss": 0.9368, + "num_input_tokens_seen": 908465552, + "step": 1999, + "train_runtime": 143133.3558, + "train_tokens_per_second": 6346.987 + }, + { + "epoch": 2.4213433725252793, + "grad_norm": 0.2467721402645111, + "learning_rate": 5e-06, + "loss": 0.9368, + "num_input_tokens_seen": 908920080, + "step": 2000, + "train_runtime": 143200.8915, + "train_tokens_per_second": 6347.168 + }, + { + "epoch": 2.4225541293428807, + "grad_norm": 0.2405187487602234, + "learning_rate": 5e-06, + "loss": 0.9122, + "num_input_tokens_seen": 909366536, + "step": 2001, + "train_runtime": 143268.6692, + "train_tokens_per_second": 6347.281 + }, + { + "epoch": 2.423764886160482, + "grad_norm": 0.2485346794128418, + "learning_rate": 5e-06, + "loss": 0.9035, + "num_input_tokens_seen": 909816392, + "step": 2002, + "train_runtime": 143335.0761, + "train_tokens_per_second": 6347.479 + }, + { + "epoch": 2.4249756429780835, + "grad_norm": 0.24442023038864136, + "learning_rate": 5e-06, + "loss": 0.9275, + "num_input_tokens_seen": 910258544, + "step": 2003, + "train_runtime": 143400.4166, + "train_tokens_per_second": 6347.67 + }, + { + "epoch": 2.426186399795685, + "grad_norm": 0.229239359498024, + "learning_rate": 5e-06, + "loss": 0.9361, + "num_input_tokens_seen": 910717192, + "step": 2004, + "train_runtime": 143468.3168, + "train_tokens_per_second": 6347.863 + }, + { + "epoch": 2.427397156613286, + "grad_norm": 0.24774321913719177, + "learning_rate": 5e-06, + "loss": 0.9516, + "num_input_tokens_seen": 911158464, + "step": 2005, + "train_runtime": 143533.5062, + "train_tokens_per_second": 6348.054 + }, + { + "epoch": 2.4286079134308878, + "grad_norm": 0.22622303664684296, + "learning_rate": 5e-06, + "loss": 0.8989, + "num_input_tokens_seen": 911625768, + "step": 2006, + "train_runtime": 143602.5212, + "train_tokens_per_second": 6348.257 + }, + { + "epoch": 2.4298186702484887, + "grad_norm": 0.23639139533042908, + "learning_rate": 5e-06, + "loss": 0.8516, + "num_input_tokens_seen": 912074848, + "step": 2007, + "train_runtime": 143668.9004, + "train_tokens_per_second": 6348.45 + }, + { + "epoch": 2.43102942706609, + "grad_norm": 0.22876761853694916, + "learning_rate": 5e-06, + "loss": 0.9411, + "num_input_tokens_seen": 912544480, + "step": 2008, + "train_runtime": 143738.708, + "train_tokens_per_second": 6348.634 + }, + { + "epoch": 2.4322401838836916, + "grad_norm": 0.22991536557674408, + "learning_rate": 5e-06, + "loss": 0.9092, + "num_input_tokens_seen": 912999640, + "step": 2009, + "train_runtime": 143805.9728, + "train_tokens_per_second": 6348.83 + }, + { + "epoch": 2.433450940701293, + "grad_norm": 0.2503306567668915, + "learning_rate": 5e-06, + "loss": 0.9616, + "num_input_tokens_seen": 913462304, + "step": 2010, + "train_runtime": 143874.5271, + "train_tokens_per_second": 6349.02 + }, + { + "epoch": 2.4346616975188944, + "grad_norm": 0.2324354350566864, + "learning_rate": 5e-06, + "loss": 0.9464, + "num_input_tokens_seen": 913940856, + "step": 2011, + "train_runtime": 143945.0382, + "train_tokens_per_second": 6349.235 + }, + { + "epoch": 2.435872454336496, + "grad_norm": 0.2581816017627716, + "learning_rate": 5e-06, + "loss": 0.9299, + "num_input_tokens_seen": 914400040, + "step": 2012, + "train_runtime": 144012.6324, + "train_tokens_per_second": 6349.443 + }, + { + "epoch": 2.437083211154097, + "grad_norm": 0.22747096419334412, + "learning_rate": 5e-06, + "loss": 0.8607, + "num_input_tokens_seen": 914857280, + "step": 2013, + "train_runtime": 144079.8825, + "train_tokens_per_second": 6349.653 + }, + { + "epoch": 2.4382939679716986, + "grad_norm": 0.2326267808675766, + "learning_rate": 5e-06, + "loss": 0.9106, + "num_input_tokens_seen": 915326624, + "step": 2014, + "train_runtime": 144149.1853, + "train_tokens_per_second": 6349.856 + }, + { + "epoch": 2.4395047247893, + "grad_norm": 0.23372125625610352, + "learning_rate": 5e-06, + "loss": 0.9309, + "num_input_tokens_seen": 915785008, + "step": 2015, + "train_runtime": 144216.7032, + "train_tokens_per_second": 6350.062 + }, + { + "epoch": 2.4407154816069014, + "grad_norm": 0.22626471519470215, + "learning_rate": 5e-06, + "loss": 0.9278, + "num_input_tokens_seen": 916238880, + "step": 2016, + "train_runtime": 144283.6442, + "train_tokens_per_second": 6350.262 + }, + { + "epoch": 2.441926238424503, + "grad_norm": 0.25401008129119873, + "learning_rate": 5e-06, + "loss": 0.9314, + "num_input_tokens_seen": 916695648, + "step": 2017, + "train_runtime": 144350.7873, + "train_tokens_per_second": 6350.472 + }, + { + "epoch": 2.4431369952421043, + "grad_norm": 0.2441287785768509, + "learning_rate": 5e-06, + "loss": 0.9458, + "num_input_tokens_seen": 917130712, + "step": 2018, + "train_runtime": 144414.7762, + "train_tokens_per_second": 6350.671 + }, + { + "epoch": 2.4443477520597057, + "grad_norm": 0.2315591424703598, + "learning_rate": 5e-06, + "loss": 0.915, + "num_input_tokens_seen": 917573008, + "step": 2019, + "train_runtime": 144480.3945, + "train_tokens_per_second": 6350.848 + }, + { + "epoch": 2.4455585088773066, + "grad_norm": 0.24778081476688385, + "learning_rate": 5e-06, + "loss": 0.9151, + "num_input_tokens_seen": 918042304, + "step": 2020, + "train_runtime": 144550.1954, + "train_tokens_per_second": 6351.028 + }, + { + "epoch": 2.446769265694908, + "grad_norm": 0.24255888164043427, + "learning_rate": 5e-06, + "loss": 0.9598, + "num_input_tokens_seen": 918472152, + "step": 2021, + "train_runtime": 144613.6113, + "train_tokens_per_second": 6351.215 + }, + { + "epoch": 2.4479800225125095, + "grad_norm": 0.2313011735677719, + "learning_rate": 5e-06, + "loss": 0.8695, + "num_input_tokens_seen": 918918720, + "step": 2022, + "train_runtime": 144679.5413, + "train_tokens_per_second": 6351.407 + }, + { + "epoch": 2.449190779330111, + "grad_norm": 0.26603221893310547, + "learning_rate": 5e-06, + "loss": 0.9775, + "num_input_tokens_seen": 919349928, + "step": 2023, + "train_runtime": 144743.4514, + "train_tokens_per_second": 6351.582 + }, + { + "epoch": 2.4504015361477123, + "grad_norm": 0.2424296885728836, + "learning_rate": 5e-06, + "loss": 0.965, + "num_input_tokens_seen": 919786592, + "step": 2024, + "train_runtime": 144807.5707, + "train_tokens_per_second": 6351.785 + }, + { + "epoch": 2.4516122929653137, + "grad_norm": 0.24074110388755798, + "learning_rate": 5e-06, + "loss": 0.9531, + "num_input_tokens_seen": 920241992, + "step": 2025, + "train_runtime": 144875.0746, + "train_tokens_per_second": 6351.969 + }, + { + "epoch": 2.452823049782915, + "grad_norm": 0.2590942084789276, + "learning_rate": 5e-06, + "loss": 0.9863, + "num_input_tokens_seen": 920659648, + "step": 2026, + "train_runtime": 144937.2065, + "train_tokens_per_second": 6352.128 + }, + { + "epoch": 2.4540338066005165, + "grad_norm": 0.24782414734363556, + "learning_rate": 5e-06, + "loss": 0.9386, + "num_input_tokens_seen": 921097432, + "step": 2027, + "train_runtime": 145001.8229, + "train_tokens_per_second": 6352.316 + }, + { + "epoch": 2.455244563418118, + "grad_norm": 0.26278966665267944, + "learning_rate": 5e-06, + "loss": 0.9022, + "num_input_tokens_seen": 921551424, + "step": 2028, + "train_runtime": 145068.6754, + "train_tokens_per_second": 6352.518 + }, + { + "epoch": 2.4564553202357193, + "grad_norm": 0.2415189892053604, + "learning_rate": 5e-06, + "loss": 0.8962, + "num_input_tokens_seen": 921997752, + "step": 2029, + "train_runtime": 145134.5917, + "train_tokens_per_second": 6352.708 + }, + { + "epoch": 2.4576660770533207, + "grad_norm": 0.23638790845870972, + "learning_rate": 5e-06, + "loss": 0.8967, + "num_input_tokens_seen": 922462192, + "step": 2030, + "train_runtime": 145203.6043, + "train_tokens_per_second": 6352.888 + }, + { + "epoch": 2.458876833870922, + "grad_norm": 0.27461180090904236, + "learning_rate": 5e-06, + "loss": 0.9226, + "num_input_tokens_seen": 922906560, + "step": 2031, + "train_runtime": 145269.2551, + "train_tokens_per_second": 6353.076 + }, + { + "epoch": 2.4600875906885236, + "grad_norm": 0.24975448846817017, + "learning_rate": 5e-06, + "loss": 0.9153, + "num_input_tokens_seen": 923371312, + "step": 2032, + "train_runtime": 145338.224, + "train_tokens_per_second": 6353.259 + }, + { + "epoch": 2.4612983475061245, + "grad_norm": 0.23750756680965424, + "learning_rate": 5e-06, + "loss": 0.8991, + "num_input_tokens_seen": 923838344, + "step": 2033, + "train_runtime": 145407.6286, + "train_tokens_per_second": 6353.438 + }, + { + "epoch": 2.462509104323726, + "grad_norm": 0.2277829796075821, + "learning_rate": 5e-06, + "loss": 0.884, + "num_input_tokens_seen": 924294392, + "step": 2034, + "train_runtime": 145475.5205, + "train_tokens_per_second": 6353.608 + }, + { + "epoch": 2.4637198611413273, + "grad_norm": 0.2631170451641083, + "learning_rate": 5e-06, + "loss": 0.9468, + "num_input_tokens_seen": 924720120, + "step": 2035, + "train_runtime": 145538.1668, + "train_tokens_per_second": 6353.798 + }, + { + "epoch": 2.4649306179589288, + "grad_norm": 0.24038782715797424, + "learning_rate": 5e-06, + "loss": 0.8965, + "num_input_tokens_seen": 925176312, + "step": 2036, + "train_runtime": 145605.7781, + "train_tokens_per_second": 6353.981 + }, + { + "epoch": 2.46614137477653, + "grad_norm": 0.2566758692264557, + "learning_rate": 5e-06, + "loss": 0.9498, + "num_input_tokens_seen": 925601920, + "step": 2037, + "train_runtime": 145668.7144, + "train_tokens_per_second": 6354.157 + }, + { + "epoch": 2.4673521315941316, + "grad_norm": 0.24463194608688354, + "learning_rate": 5e-06, + "loss": 0.9263, + "num_input_tokens_seen": 926052952, + "step": 2038, + "train_runtime": 145735.808, + "train_tokens_per_second": 6354.327 + }, + { + "epoch": 2.468562888411733, + "grad_norm": 0.2460647076368332, + "learning_rate": 5e-06, + "loss": 0.8975, + "num_input_tokens_seen": 926514368, + "step": 2039, + "train_runtime": 145804.5927, + "train_tokens_per_second": 6354.494 + }, + { + "epoch": 2.4697736452293344, + "grad_norm": 0.2725125253200531, + "learning_rate": 5e-06, + "loss": 0.9171, + "num_input_tokens_seen": 926978616, + "step": 2040, + "train_runtime": 145872.9181, + "train_tokens_per_second": 6354.7 + }, + { + "epoch": 2.470984402046936, + "grad_norm": 0.245590940117836, + "learning_rate": 5e-06, + "loss": 0.9422, + "num_input_tokens_seen": 927427896, + "step": 2041, + "train_runtime": 145940.2315, + "train_tokens_per_second": 6354.847 + }, + { + "epoch": 2.4721951588645372, + "grad_norm": 0.22320939600467682, + "learning_rate": 5e-06, + "loss": 0.8796, + "num_input_tokens_seen": 927878992, + "step": 2042, + "train_runtime": 146008.6109, + "train_tokens_per_second": 6354.961 + }, + { + "epoch": 2.4734059156821386, + "grad_norm": 0.23028956353664398, + "learning_rate": 5e-06, + "loss": 0.8894, + "num_input_tokens_seen": 928332296, + "step": 2043, + "train_runtime": 146075.3686, + "train_tokens_per_second": 6355.16 + }, + { + "epoch": 2.47461667249974, + "grad_norm": 0.2439645677804947, + "learning_rate": 5e-06, + "loss": 0.9377, + "num_input_tokens_seen": 928771176, + "step": 2044, + "train_runtime": 146140.3819, + "train_tokens_per_second": 6355.336 + }, + { + "epoch": 2.4758274293173415, + "grad_norm": 0.26759254932403564, + "learning_rate": 5e-06, + "loss": 0.9297, + "num_input_tokens_seen": 929245128, + "step": 2045, + "train_runtime": 146210.4408, + "train_tokens_per_second": 6355.532 + }, + { + "epoch": 2.4770381861349424, + "grad_norm": 0.26013848185539246, + "learning_rate": 5e-06, + "loss": 0.8866, + "num_input_tokens_seen": 929700048, + "step": 2046, + "train_runtime": 146277.6773, + "train_tokens_per_second": 6355.721 + }, + { + "epoch": 2.478248942952544, + "grad_norm": 0.2415570616722107, + "learning_rate": 5e-06, + "loss": 0.9105, + "num_input_tokens_seen": 930155200, + "step": 2047, + "train_runtime": 146345.2032, + "train_tokens_per_second": 6355.898 + }, + { + "epoch": 2.4794596997701452, + "grad_norm": 0.23563383519649506, + "learning_rate": 5e-06, + "loss": 0.924, + "num_input_tokens_seen": 930593920, + "step": 2048, + "train_runtime": 146410.2731, + "train_tokens_per_second": 6356.07 + }, + { + "epoch": 2.4806704565877467, + "grad_norm": 0.2751082479953766, + "learning_rate": 5e-06, + "loss": 0.9136, + "num_input_tokens_seen": 931067912, + "step": 2049, + "train_runtime": 146480.5438, + "train_tokens_per_second": 6356.257 + }, + { + "epoch": 2.481881213405348, + "grad_norm": 0.24614796042442322, + "learning_rate": 5e-06, + "loss": 0.9161, + "num_input_tokens_seen": 931535648, + "step": 2050, + "train_runtime": 146549.9494, + "train_tokens_per_second": 6356.438 + }, + { + "epoch": 2.4830919702229495, + "grad_norm": 0.2446848303079605, + "learning_rate": 5e-06, + "loss": 0.9223, + "num_input_tokens_seen": 931996664, + "step": 2051, + "train_runtime": 146617.8963, + "train_tokens_per_second": 6356.636 + }, + { + "epoch": 2.484302727040551, + "grad_norm": 0.24272581934928894, + "learning_rate": 5e-06, + "loss": 0.9565, + "num_input_tokens_seen": 932455392, + "step": 2052, + "train_runtime": 146685.9301, + "train_tokens_per_second": 6356.815 + }, + { + "epoch": 2.4855134838581523, + "grad_norm": 0.24807094037532806, + "learning_rate": 5e-06, + "loss": 0.9369, + "num_input_tokens_seen": 932892240, + "step": 2053, + "train_runtime": 146750.5502, + "train_tokens_per_second": 6356.993 + }, + { + "epoch": 2.4867242406757537, + "grad_norm": 0.24745595455169678, + "learning_rate": 5e-06, + "loss": 0.9749, + "num_input_tokens_seen": 933347928, + "step": 2054, + "train_runtime": 146817.8809, + "train_tokens_per_second": 6357.182 + }, + { + "epoch": 2.487934997493355, + "grad_norm": 0.2522644102573395, + "learning_rate": 5e-06, + "loss": 0.9338, + "num_input_tokens_seen": 933807000, + "step": 2055, + "train_runtime": 146885.7573, + "train_tokens_per_second": 6357.369 + }, + { + "epoch": 2.4891457543109565, + "grad_norm": 0.24964258074760437, + "learning_rate": 5e-06, + "loss": 0.916, + "num_input_tokens_seen": 934241376, + "step": 2056, + "train_runtime": 146949.703, + "train_tokens_per_second": 6357.559 + }, + { + "epoch": 2.490356511128558, + "grad_norm": 0.24483104050159454, + "learning_rate": 5e-06, + "loss": 0.9004, + "num_input_tokens_seen": 934681960, + "step": 2057, + "train_runtime": 147014.7916, + "train_tokens_per_second": 6357.741 + }, + { + "epoch": 2.4915672679461593, + "grad_norm": 0.23873838782310486, + "learning_rate": 5e-06, + "loss": 0.9291, + "num_input_tokens_seen": 935154456, + "step": 2058, + "train_runtime": 147084.9039, + "train_tokens_per_second": 6357.923 + }, + { + "epoch": 2.4927780247637603, + "grad_norm": 0.23718827962875366, + "learning_rate": 5e-06, + "loss": 0.8689, + "num_input_tokens_seen": 935605960, + "step": 2059, + "train_runtime": 147152.0711, + "train_tokens_per_second": 6358.089 + }, + { + "epoch": 2.4939887815813617, + "grad_norm": 0.2440134733915329, + "learning_rate": 5e-06, + "loss": 0.9532, + "num_input_tokens_seen": 936070728, + "step": 2060, + "train_runtime": 147221.0941, + "train_tokens_per_second": 6358.265 + }, + { + "epoch": 2.495199538398963, + "grad_norm": 0.24385643005371094, + "learning_rate": 5e-06, + "loss": 0.8583, + "num_input_tokens_seen": 936518568, + "step": 2061, + "train_runtime": 147287.537, + "train_tokens_per_second": 6358.437 + }, + { + "epoch": 2.4964102952165645, + "grad_norm": 0.23779569566249847, + "learning_rate": 5e-06, + "loss": 0.9048, + "num_input_tokens_seen": 936975120, + "step": 2062, + "train_runtime": 147355.2483, + "train_tokens_per_second": 6358.614 + }, + { + "epoch": 2.497621052034166, + "grad_norm": 0.25262778997421265, + "learning_rate": 5e-06, + "loss": 0.9233, + "num_input_tokens_seen": 937426104, + "step": 2063, + "train_runtime": 147422.2915, + "train_tokens_per_second": 6358.781 + }, + { + "epoch": 2.4988318088517674, + "grad_norm": 0.24266116321086884, + "learning_rate": 5e-06, + "loss": 0.9427, + "num_input_tokens_seen": 937903712, + "step": 2064, + "train_runtime": 147492.5493, + "train_tokens_per_second": 6358.99 + }, + { + "epoch": 2.5000425656693688, + "grad_norm": 0.24687156081199646, + "learning_rate": 5e-06, + "loss": 0.9445, + "num_input_tokens_seen": 938338192, + "step": 2065, + "train_runtime": 147556.7522, + "train_tokens_per_second": 6359.168 + }, + { + "epoch": 2.50125332248697, + "grad_norm": 0.2338828295469284, + "learning_rate": 5e-06, + "loss": 0.9168, + "num_input_tokens_seen": 938795784, + "step": 2066, + "train_runtime": 147625.0006, + "train_tokens_per_second": 6359.328 + }, + { + "epoch": 2.5024640793045716, + "grad_norm": 0.22915604710578918, + "learning_rate": 5e-06, + "loss": 0.8868, + "num_input_tokens_seen": 939266392, + "step": 2067, + "train_runtime": 147694.5446, + "train_tokens_per_second": 6359.52 + }, + { + "epoch": 2.503674836122173, + "grad_norm": 0.23226921260356903, + "learning_rate": 5e-06, + "loss": 0.9124, + "num_input_tokens_seen": 939729888, + "step": 2068, + "train_runtime": 147763.2806, + "train_tokens_per_second": 6359.698 + }, + { + "epoch": 2.5048855929397744, + "grad_norm": 0.24283023178577423, + "learning_rate": 5e-06, + "loss": 0.9313, + "num_input_tokens_seen": 940204240, + "step": 2069, + "train_runtime": 147833.4404, + "train_tokens_per_second": 6359.889 + }, + { + "epoch": 2.506096349757376, + "grad_norm": 0.24000869691371918, + "learning_rate": 5e-06, + "loss": 0.9282, + "num_input_tokens_seen": 940655272, + "step": 2070, + "train_runtime": 147899.9283, + "train_tokens_per_second": 6360.079 + }, + { + "epoch": 2.5073071065749772, + "grad_norm": 0.2344265878200531, + "learning_rate": 5e-06, + "loss": 0.9358, + "num_input_tokens_seen": 941102232, + "step": 2071, + "train_runtime": 147966.0637, + "train_tokens_per_second": 6360.257 + }, + { + "epoch": 2.508517863392578, + "grad_norm": 0.22600269317626953, + "learning_rate": 5e-06, + "loss": 0.8906, + "num_input_tokens_seen": 941580024, + "step": 2072, + "train_runtime": 148036.6621, + "train_tokens_per_second": 6360.452 + }, + { + "epoch": 2.50972862021018, + "grad_norm": 0.23776832222938538, + "learning_rate": 5e-06, + "loss": 0.9121, + "num_input_tokens_seen": 942039728, + "step": 2073, + "train_runtime": 148104.8937, + "train_tokens_per_second": 6360.625 + }, + { + "epoch": 2.510939377027781, + "grad_norm": 0.22566133737564087, + "learning_rate": 5e-06, + "loss": 0.9624, + "num_input_tokens_seen": 942489560, + "step": 2074, + "train_runtime": 148171.3227, + "train_tokens_per_second": 6360.81 + }, + { + "epoch": 2.5121501338453824, + "grad_norm": 0.24746361374855042, + "learning_rate": 5e-06, + "loss": 0.8688, + "num_input_tokens_seen": 942940600, + "step": 2075, + "train_runtime": 148237.979, + "train_tokens_per_second": 6360.992 + }, + { + "epoch": 2.513360890662984, + "grad_norm": 0.2346951961517334, + "learning_rate": 5e-06, + "loss": 0.9307, + "num_input_tokens_seen": 943417360, + "step": 2076, + "train_runtime": 148308.1213, + "train_tokens_per_second": 6361.198 + }, + { + "epoch": 2.5145716474805853, + "grad_norm": 0.2295297533273697, + "learning_rate": 5e-06, + "loss": 0.958, + "num_input_tokens_seen": 943867744, + "step": 2077, + "train_runtime": 148374.8559, + "train_tokens_per_second": 6361.373 + }, + { + "epoch": 2.5157824042981867, + "grad_norm": 0.2399854212999344, + "learning_rate": 5e-06, + "loss": 0.8859, + "num_input_tokens_seen": 944347752, + "step": 2078, + "train_runtime": 148445.8736, + "train_tokens_per_second": 6361.563 + }, + { + "epoch": 2.516993161115788, + "grad_norm": 0.25821027159690857, + "learning_rate": 5e-06, + "loss": 0.9398, + "num_input_tokens_seen": 944819984, + "step": 2079, + "train_runtime": 148515.6507, + "train_tokens_per_second": 6361.754 + }, + { + "epoch": 2.5182039179333895, + "grad_norm": 0.24560266733169556, + "learning_rate": 5e-06, + "loss": 0.9439, + "num_input_tokens_seen": 945260464, + "step": 2080, + "train_runtime": 148580.7954, + "train_tokens_per_second": 6361.929 + }, + { + "epoch": 2.519414674750991, + "grad_norm": 0.23482368886470795, + "learning_rate": 5e-06, + "loss": 0.9254, + "num_input_tokens_seen": 945718168, + "step": 2081, + "train_runtime": 148648.5841, + "train_tokens_per_second": 6362.107 + }, + { + "epoch": 2.5206254315685923, + "grad_norm": 0.24930696189403534, + "learning_rate": 5e-06, + "loss": 0.9276, + "num_input_tokens_seen": 946169448, + "step": 2082, + "train_runtime": 148715.3849, + "train_tokens_per_second": 6362.284 + }, + { + "epoch": 2.5218361883861937, + "grad_norm": 0.27057182788848877, + "learning_rate": 5e-06, + "loss": 0.9687, + "num_input_tokens_seen": 946606096, + "step": 2083, + "train_runtime": 148779.659, + "train_tokens_per_second": 6362.47 + }, + { + "epoch": 2.523046945203795, + "grad_norm": 0.2677522301673889, + "learning_rate": 5e-06, + "loss": 0.955, + "num_input_tokens_seen": 947064336, + "step": 2084, + "train_runtime": 148847.4866, + "train_tokens_per_second": 6362.649 + }, + { + "epoch": 2.524257702021396, + "grad_norm": 0.24084171652793884, + "learning_rate": 5e-06, + "loss": 0.9752, + "num_input_tokens_seen": 947511856, + "step": 2085, + "train_runtime": 148913.4171, + "train_tokens_per_second": 6362.837 + }, + { + "epoch": 2.525468458838998, + "grad_norm": 0.2556767165660858, + "learning_rate": 5e-06, + "loss": 0.9244, + "num_input_tokens_seen": 947972304, + "step": 2086, + "train_runtime": 148981.7706, + "train_tokens_per_second": 6363.009 + }, + { + "epoch": 2.526679215656599, + "grad_norm": 0.25337082147598267, + "learning_rate": 5e-06, + "loss": 0.9532, + "num_input_tokens_seen": 948416456, + "step": 2087, + "train_runtime": 149047.4661, + "train_tokens_per_second": 6363.184 + }, + { + "epoch": 2.5278899724742003, + "grad_norm": 0.23191265761852264, + "learning_rate": 5e-06, + "loss": 0.9283, + "num_input_tokens_seen": 948876472, + "step": 2088, + "train_runtime": 149115.6529, + "train_tokens_per_second": 6363.359 + }, + { + "epoch": 2.5291007292918017, + "grad_norm": 0.24382182955741882, + "learning_rate": 5e-06, + "loss": 0.9737, + "num_input_tokens_seen": 949310000, + "step": 2089, + "train_runtime": 149179.5981, + "train_tokens_per_second": 6363.538 + }, + { + "epoch": 2.530311486109403, + "grad_norm": 0.25065913796424866, + "learning_rate": 5e-06, + "loss": 0.9474, + "num_input_tokens_seen": 949744928, + "step": 2090, + "train_runtime": 149243.5236, + "train_tokens_per_second": 6363.726 + }, + { + "epoch": 2.5315222429270046, + "grad_norm": 0.24396991729736328, + "learning_rate": 5e-06, + "loss": 0.895, + "num_input_tokens_seen": 950192536, + "step": 2091, + "train_runtime": 149309.8845, + "train_tokens_per_second": 6363.896 + }, + { + "epoch": 2.532732999744606, + "grad_norm": 0.2316816747188568, + "learning_rate": 5e-06, + "loss": 0.9147, + "num_input_tokens_seen": 950658280, + "step": 2092, + "train_runtime": 149379.1212, + "train_tokens_per_second": 6364.064 + }, + { + "epoch": 2.5339437565622074, + "grad_norm": 0.2816956043243408, + "learning_rate": 5e-06, + "loss": 0.9337, + "num_input_tokens_seen": 951108960, + "step": 2093, + "train_runtime": 149445.9212, + "train_tokens_per_second": 6364.235 + }, + { + "epoch": 2.535154513379809, + "grad_norm": 0.24696174263954163, + "learning_rate": 5e-06, + "loss": 0.9976, + "num_input_tokens_seen": 951526808, + "step": 2094, + "train_runtime": 149507.8536, + "train_tokens_per_second": 6364.393 + }, + { + "epoch": 2.53636527019741, + "grad_norm": 0.23096802830696106, + "learning_rate": 5e-06, + "loss": 0.9126, + "num_input_tokens_seen": 951990024, + "step": 2095, + "train_runtime": 149578.1316, + "train_tokens_per_second": 6364.5 + }, + { + "epoch": 2.5375760270150116, + "grad_norm": 0.2423764020204544, + "learning_rate": 5e-06, + "loss": 0.9433, + "num_input_tokens_seen": 952442632, + "step": 2096, + "train_runtime": 149645.4898, + "train_tokens_per_second": 6364.66 + }, + { + "epoch": 2.538786783832613, + "grad_norm": 0.2315463125705719, + "learning_rate": 5e-06, + "loss": 0.9252, + "num_input_tokens_seen": 952889608, + "step": 2097, + "train_runtime": 149711.4793, + "train_tokens_per_second": 6364.84 + }, + { + "epoch": 2.539997540650214, + "grad_norm": 0.23303987085819244, + "learning_rate": 5e-06, + "loss": 0.9057, + "num_input_tokens_seen": 953335880, + "step": 2098, + "train_runtime": 149777.8167, + "train_tokens_per_second": 6365.001 + }, + { + "epoch": 2.541208297467816, + "grad_norm": 0.22208106517791748, + "learning_rate": 5e-06, + "loss": 0.8773, + "num_input_tokens_seen": 953792496, + "step": 2099, + "train_runtime": 149845.1928, + "train_tokens_per_second": 6365.186 + }, + { + "epoch": 2.542419054285417, + "grad_norm": 0.27171242237091064, + "learning_rate": 5e-06, + "loss": 0.9345, + "num_input_tokens_seen": 954254432, + "step": 2100, + "train_runtime": 149913.5984, + "train_tokens_per_second": 6365.363 + }, + { + "epoch": 2.5436298111030182, + "grad_norm": 0.24096918106079102, + "learning_rate": 5e-06, + "loss": 0.9452, + "num_input_tokens_seen": 954702408, + "step": 2101, + "train_runtime": 149979.536, + "train_tokens_per_second": 6365.551 + }, + { + "epoch": 2.5448405679206196, + "grad_norm": 0.22997787594795227, + "learning_rate": 5e-06, + "loss": 0.882, + "num_input_tokens_seen": 955166928, + "step": 2102, + "train_runtime": 150048.5149, + "train_tokens_per_second": 6365.721 + }, + { + "epoch": 2.546051324738221, + "grad_norm": 0.23458710312843323, + "learning_rate": 5e-06, + "loss": 0.8823, + "num_input_tokens_seen": 955611512, + "step": 2103, + "train_runtime": 150113.7698, + "train_tokens_per_second": 6365.915 + }, + { + "epoch": 2.5472620815558225, + "grad_norm": 0.2433023899793625, + "learning_rate": 5e-06, + "loss": 0.959, + "num_input_tokens_seen": 956040800, + "step": 2104, + "train_runtime": 150176.961, + "train_tokens_per_second": 6366.095 + }, + { + "epoch": 2.548472838373424, + "grad_norm": 0.24917687475681305, + "learning_rate": 5e-06, + "loss": 0.9486, + "num_input_tokens_seen": 956490904, + "step": 2105, + "train_runtime": 150243.2865, + "train_tokens_per_second": 6366.28 + }, + { + "epoch": 2.5496835951910253, + "grad_norm": 0.24541530013084412, + "learning_rate": 5e-06, + "loss": 0.9464, + "num_input_tokens_seen": 956941400, + "step": 2106, + "train_runtime": 150309.8385, + "train_tokens_per_second": 6366.459 + }, + { + "epoch": 2.5508943520086267, + "grad_norm": 0.23959662020206451, + "learning_rate": 5e-06, + "loss": 0.884, + "num_input_tokens_seen": 957382960, + "step": 2107, + "train_runtime": 150375.1278, + "train_tokens_per_second": 6366.631 + }, + { + "epoch": 2.552105108826228, + "grad_norm": 0.24745765328407288, + "learning_rate": 5e-06, + "loss": 0.9583, + "num_input_tokens_seen": 957828816, + "step": 2108, + "train_runtime": 150441.1512, + "train_tokens_per_second": 6366.801 + }, + { + "epoch": 2.5533158656438295, + "grad_norm": 0.3009890019893646, + "learning_rate": 5e-06, + "loss": 0.9098, + "num_input_tokens_seen": 958242640, + "step": 2109, + "train_runtime": 150502.4514, + "train_tokens_per_second": 6366.957 + }, + { + "epoch": 2.554526622461431, + "grad_norm": 0.23632347583770752, + "learning_rate": 5e-06, + "loss": 0.9395, + "num_input_tokens_seen": 958689592, + "step": 2110, + "train_runtime": 150568.8159, + "train_tokens_per_second": 6367.119 + }, + { + "epoch": 2.555737379279032, + "grad_norm": 0.2490553855895996, + "learning_rate": 5e-06, + "loss": 0.8992, + "num_input_tokens_seen": 959154704, + "step": 2111, + "train_runtime": 150637.4185, + "train_tokens_per_second": 6367.307 + }, + { + "epoch": 2.5569481360966337, + "grad_norm": 0.2370673418045044, + "learning_rate": 5e-06, + "loss": 0.9111, + "num_input_tokens_seen": 959608064, + "step": 2112, + "train_runtime": 150704.8437, + "train_tokens_per_second": 6367.467 + }, + { + "epoch": 2.5581588929142347, + "grad_norm": 0.24848392605781555, + "learning_rate": 5e-06, + "loss": 0.9228, + "num_input_tokens_seen": 960056176, + "step": 2113, + "train_runtime": 150771.1853, + "train_tokens_per_second": 6367.637 + }, + { + "epoch": 2.5593696497318366, + "grad_norm": 0.22288735210895538, + "learning_rate": 5e-06, + "loss": 0.876, + "num_input_tokens_seen": 960532728, + "step": 2114, + "train_runtime": 150842.3276, + "train_tokens_per_second": 6367.793 + }, + { + "epoch": 2.5605804065494375, + "grad_norm": 0.2513042986392975, + "learning_rate": 5e-06, + "loss": 1.0092, + "num_input_tokens_seen": 960985864, + "step": 2115, + "train_runtime": 150909.5334, + "train_tokens_per_second": 6367.96 + }, + { + "epoch": 2.561791163367039, + "grad_norm": 0.28590673208236694, + "learning_rate": 5e-06, + "loss": 0.9171, + "num_input_tokens_seen": 961433280, + "step": 2116, + "train_runtime": 150975.5143, + "train_tokens_per_second": 6368.14 + }, + { + "epoch": 2.5630019201846403, + "grad_norm": 0.26200953125953674, + "learning_rate": 5e-06, + "loss": 0.9099, + "num_input_tokens_seen": 961888288, + "step": 2117, + "train_runtime": 151042.961, + "train_tokens_per_second": 6368.309 + }, + { + "epoch": 2.5642126770022418, + "grad_norm": 0.24091939628124237, + "learning_rate": 5e-06, + "loss": 0.905, + "num_input_tokens_seen": 962325576, + "step": 2118, + "train_runtime": 151107.3832, + "train_tokens_per_second": 6368.488 + }, + { + "epoch": 2.565423433819843, + "grad_norm": 0.305169016122818, + "learning_rate": 5e-06, + "loss": 0.9368, + "num_input_tokens_seen": 962790480, + "step": 2119, + "train_runtime": 151176.0572, + "train_tokens_per_second": 6368.67 + }, + { + "epoch": 2.5666341906374446, + "grad_norm": 0.23745113611221313, + "learning_rate": 5e-06, + "loss": 0.912, + "num_input_tokens_seen": 963234680, + "step": 2120, + "train_runtime": 151241.8959, + "train_tokens_per_second": 6368.835 + }, + { + "epoch": 2.567844947455046, + "grad_norm": 0.26773974299430847, + "learning_rate": 5e-06, + "loss": 0.9186, + "num_input_tokens_seen": 963693880, + "step": 2121, + "train_runtime": 151309.6612, + "train_tokens_per_second": 6369.017 + }, + { + "epoch": 2.5690557042726474, + "grad_norm": 0.23392970860004425, + "learning_rate": 5e-06, + "loss": 0.9131, + "num_input_tokens_seen": 964156400, + "step": 2122, + "train_runtime": 151377.9456, + "train_tokens_per_second": 6369.2 + }, + { + "epoch": 2.570266461090249, + "grad_norm": 0.24104426801204681, + "learning_rate": 5e-06, + "loss": 0.9098, + "num_input_tokens_seen": 964603680, + "step": 2123, + "train_runtime": 151443.8686, + "train_tokens_per_second": 6369.381 + }, + { + "epoch": 2.57147721790785, + "grad_norm": 0.27819520235061646, + "learning_rate": 5e-06, + "loss": 0.9433, + "num_input_tokens_seen": 965051904, + "step": 2124, + "train_runtime": 151510.0321, + "train_tokens_per_second": 6369.558 + }, + { + "epoch": 2.5726879747254516, + "grad_norm": 0.23141422867774963, + "learning_rate": 5e-06, + "loss": 0.9482, + "num_input_tokens_seen": 965491424, + "step": 2125, + "train_runtime": 151574.8146, + "train_tokens_per_second": 6369.735 + }, + { + "epoch": 2.5738987315430526, + "grad_norm": 0.2362852543592453, + "learning_rate": 5e-06, + "loss": 0.8856, + "num_input_tokens_seen": 965942960, + "step": 2126, + "train_runtime": 151641.7588, + "train_tokens_per_second": 6369.901 + }, + { + "epoch": 2.5751094883606545, + "grad_norm": 0.2561604380607605, + "learning_rate": 5e-06, + "loss": 0.931, + "num_input_tokens_seen": 966402048, + "step": 2127, + "train_runtime": 151710.0948, + "train_tokens_per_second": 6370.058 + }, + { + "epoch": 2.5763202451782554, + "grad_norm": 0.2245933711528778, + "learning_rate": 5e-06, + "loss": 0.8996, + "num_input_tokens_seen": 966854136, + "step": 2128, + "train_runtime": 151777.079, + "train_tokens_per_second": 6370.225 + }, + { + "epoch": 2.577531001995857, + "grad_norm": 0.2425384670495987, + "learning_rate": 5e-06, + "loss": 0.9431, + "num_input_tokens_seen": 967327272, + "step": 2129, + "train_runtime": 151847.5021, + "train_tokens_per_second": 6370.386 + }, + { + "epoch": 2.5787417588134582, + "grad_norm": 0.2366553246974945, + "learning_rate": 5e-06, + "loss": 0.9586, + "num_input_tokens_seen": 967784400, + "step": 2130, + "train_runtime": 151915.0318, + "train_tokens_per_second": 6370.564 + }, + { + "epoch": 2.5799525156310597, + "grad_norm": 0.24099212884902954, + "learning_rate": 5e-06, + "loss": 0.9046, + "num_input_tokens_seen": 968237464, + "step": 2131, + "train_runtime": 151982.077, + "train_tokens_per_second": 6370.735 + }, + { + "epoch": 2.581163272448661, + "grad_norm": 0.25684481859207153, + "learning_rate": 5e-06, + "loss": 0.9971, + "num_input_tokens_seen": 968699672, + "step": 2132, + "train_runtime": 152050.2017, + "train_tokens_per_second": 6370.92 + }, + { + "epoch": 2.5823740292662625, + "grad_norm": 0.23455548286437988, + "learning_rate": 5e-06, + "loss": 0.9301, + "num_input_tokens_seen": 969139440, + "step": 2133, + "train_runtime": 152115.1846, + "train_tokens_per_second": 6371.089 + }, + { + "epoch": 2.583584786083864, + "grad_norm": 0.2384224683046341, + "learning_rate": 5e-06, + "loss": 0.9473, + "num_input_tokens_seen": 969581296, + "step": 2134, + "train_runtime": 152180.5835, + "train_tokens_per_second": 6371.255 + }, + { + "epoch": 2.5847955429014653, + "grad_norm": 0.2409534454345703, + "learning_rate": 5e-06, + "loss": 0.88, + "num_input_tokens_seen": 970046112, + "step": 2135, + "train_runtime": 152248.9143, + "train_tokens_per_second": 6371.448 + }, + { + "epoch": 2.5860062997190667, + "grad_norm": 0.25234049558639526, + "learning_rate": 5e-06, + "loss": 0.952, + "num_input_tokens_seen": 970487920, + "step": 2136, + "train_runtime": 152314.0961, + "train_tokens_per_second": 6371.622 + }, + { + "epoch": 2.587217056536668, + "grad_norm": 0.22701537609100342, + "learning_rate": 5e-06, + "loss": 0.8846, + "num_input_tokens_seen": 970944456, + "step": 2137, + "train_runtime": 152381.1973, + "train_tokens_per_second": 6371.813 + }, + { + "epoch": 2.5884278133542695, + "grad_norm": 0.22347159683704376, + "learning_rate": 5e-06, + "loss": 0.896, + "num_input_tokens_seen": 971420496, + "step": 2138, + "train_runtime": 152452.0329, + "train_tokens_per_second": 6371.975 + }, + { + "epoch": 2.5896385701718705, + "grad_norm": 0.23158830404281616, + "learning_rate": 5e-06, + "loss": 0.9145, + "num_input_tokens_seen": 971864936, + "step": 2139, + "train_runtime": 152517.7639, + "train_tokens_per_second": 6372.143 + }, + { + "epoch": 2.5908493269894723, + "grad_norm": 0.24082769453525543, + "learning_rate": 5e-06, + "loss": 0.9517, + "num_input_tokens_seen": 972316368, + "step": 2140, + "train_runtime": 152584.4859, + "train_tokens_per_second": 6372.315 + }, + { + "epoch": 2.5920600838070733, + "grad_norm": 0.2324010580778122, + "learning_rate": 5e-06, + "loss": 0.9356, + "num_input_tokens_seen": 972751424, + "step": 2141, + "train_runtime": 152648.8383, + "train_tokens_per_second": 6372.478 + }, + { + "epoch": 2.5932708406246747, + "grad_norm": 0.24019919335842133, + "learning_rate": 5e-06, + "loss": 0.9627, + "num_input_tokens_seen": 973202808, + "step": 2142, + "train_runtime": 152715.4644, + "train_tokens_per_second": 6372.654 + }, + { + "epoch": 2.594481597442276, + "grad_norm": 0.23422782123088837, + "learning_rate": 5e-06, + "loss": 0.8628, + "num_input_tokens_seen": 973689168, + "step": 2143, + "train_runtime": 152787.9495, + "train_tokens_per_second": 6372.814 + }, + { + "epoch": 2.5956923542598775, + "grad_norm": 0.2257990539073944, + "learning_rate": 5e-06, + "loss": 0.8893, + "num_input_tokens_seen": 974141104, + "step": 2144, + "train_runtime": 152854.4834, + "train_tokens_per_second": 6372.997 + }, + { + "epoch": 2.596903111077479, + "grad_norm": 0.2405652552843094, + "learning_rate": 5e-06, + "loss": 0.9073, + "num_input_tokens_seen": 974607168, + "step": 2145, + "train_runtime": 152923.3103, + "train_tokens_per_second": 6373.176 + }, + { + "epoch": 2.5981138678950804, + "grad_norm": 0.25007355213165283, + "learning_rate": 5e-06, + "loss": 0.9286, + "num_input_tokens_seen": 975054616, + "step": 2146, + "train_runtime": 152989.2769, + "train_tokens_per_second": 6373.353 + }, + { + "epoch": 2.5993246247126818, + "grad_norm": 0.2606528103351593, + "learning_rate": 5e-06, + "loss": 0.9382, + "num_input_tokens_seen": 975497336, + "step": 2147, + "train_runtime": 153054.2115, + "train_tokens_per_second": 6373.541 + }, + { + "epoch": 2.600535381530283, + "grad_norm": 0.22446538507938385, + "learning_rate": 5e-06, + "loss": 0.8875, + "num_input_tokens_seen": 975941776, + "step": 2148, + "train_runtime": 153120.0609, + "train_tokens_per_second": 6373.703 + }, + { + "epoch": 2.6017461383478846, + "grad_norm": 0.2446034997701645, + "learning_rate": 5e-06, + "loss": 0.9442, + "num_input_tokens_seen": 976394880, + "step": 2149, + "train_runtime": 153188.6693, + "train_tokens_per_second": 6373.806 + }, + { + "epoch": 2.602956895165486, + "grad_norm": 0.22049270570278168, + "learning_rate": 5e-06, + "loss": 0.8881, + "num_input_tokens_seen": 976866832, + "step": 2150, + "train_runtime": 153258.796, + "train_tokens_per_second": 6373.969 + }, + { + "epoch": 2.6041676519830874, + "grad_norm": 0.22306476533412933, + "learning_rate": 5e-06, + "loss": 0.8982, + "num_input_tokens_seen": 977324560, + "step": 2151, + "train_runtime": 153326.3775, + "train_tokens_per_second": 6374.145 + }, + { + "epoch": 2.6053784088006884, + "grad_norm": 0.25735023617744446, + "learning_rate": 5e-06, + "loss": 0.9181, + "num_input_tokens_seen": 977761376, + "step": 2152, + "train_runtime": 153390.9315, + "train_tokens_per_second": 6374.31 + }, + { + "epoch": 2.6065891656182902, + "grad_norm": 0.23419278860092163, + "learning_rate": 5e-06, + "loss": 0.9264, + "num_input_tokens_seen": 978205792, + "step": 2153, + "train_runtime": 153457.0095, + "train_tokens_per_second": 6374.461 + }, + { + "epoch": 2.607799922435891, + "grad_norm": 0.23010873794555664, + "learning_rate": 5e-06, + "loss": 0.9316, + "num_input_tokens_seen": 978649992, + "step": 2154, + "train_runtime": 153522.7683, + "train_tokens_per_second": 6374.624 + }, + { + "epoch": 2.6090106792534926, + "grad_norm": 0.2428400218486786, + "learning_rate": 5e-06, + "loss": 0.9585, + "num_input_tokens_seen": 979073824, + "step": 2155, + "train_runtime": 153585.4951, + "train_tokens_per_second": 6374.781 + }, + { + "epoch": 2.610221436071094, + "grad_norm": 0.24073754251003265, + "learning_rate": 5e-06, + "loss": 0.9351, + "num_input_tokens_seen": 979532472, + "step": 2156, + "train_runtime": 153653.4719, + "train_tokens_per_second": 6374.945 + }, + { + "epoch": 2.6114321928886954, + "grad_norm": 0.24380506575107574, + "learning_rate": 5e-06, + "loss": 0.8991, + "num_input_tokens_seen": 979999864, + "step": 2157, + "train_runtime": 153722.7985, + "train_tokens_per_second": 6375.111 + }, + { + "epoch": 2.612642949706297, + "grad_norm": 0.23714521527290344, + "learning_rate": 5e-06, + "loss": 0.9145, + "num_input_tokens_seen": 980473120, + "step": 2158, + "train_runtime": 153793.1517, + "train_tokens_per_second": 6375.272 + }, + { + "epoch": 2.6138537065238983, + "grad_norm": 0.2587903141975403, + "learning_rate": 5e-06, + "loss": 0.919, + "num_input_tokens_seen": 980913512, + "step": 2159, + "train_runtime": 153858.4779, + "train_tokens_per_second": 6375.427 + }, + { + "epoch": 2.6150644633414997, + "grad_norm": 0.2620103359222412, + "learning_rate": 5e-06, + "loss": 0.9178, + "num_input_tokens_seen": 981375896, + "step": 2160, + "train_runtime": 153927.0515, + "train_tokens_per_second": 6375.591 + }, + { + "epoch": 2.616275220159101, + "grad_norm": 0.24088148772716522, + "learning_rate": 5e-06, + "loss": 0.9379, + "num_input_tokens_seen": 981832360, + "step": 2161, + "train_runtime": 153994.5461, + "train_tokens_per_second": 6375.761 + }, + { + "epoch": 2.6174859769767025, + "grad_norm": 0.22991852462291718, + "learning_rate": 5e-06, + "loss": 0.9617, + "num_input_tokens_seen": 982288048, + "step": 2162, + "train_runtime": 154062.1977, + "train_tokens_per_second": 6375.919 + }, + { + "epoch": 2.618696733794304, + "grad_norm": 0.24822624027729034, + "learning_rate": 5e-06, + "loss": 0.9686, + "num_input_tokens_seen": 982743976, + "step": 2163, + "train_runtime": 154129.7489, + "train_tokens_per_second": 6376.082 + }, + { + "epoch": 2.6199074906119053, + "grad_norm": 0.25701308250427246, + "learning_rate": 5e-06, + "loss": 0.9104, + "num_input_tokens_seen": 983187152, + "step": 2164, + "train_runtime": 154195.5626, + "train_tokens_per_second": 6376.235 + }, + { + "epoch": 2.6211182474295063, + "grad_norm": 0.2755506932735443, + "learning_rate": 5e-06, + "loss": 0.9725, + "num_input_tokens_seen": 983623800, + "step": 2165, + "train_runtime": 154260.2571, + "train_tokens_per_second": 6376.392 + }, + { + "epoch": 2.622329004247108, + "grad_norm": 0.25525012612342834, + "learning_rate": 5e-06, + "loss": 0.9259, + "num_input_tokens_seen": 984073384, + "step": 2166, + "train_runtime": 154327.0854, + "train_tokens_per_second": 6376.544 + }, + { + "epoch": 2.623539761064709, + "grad_norm": 0.26483437418937683, + "learning_rate": 5e-06, + "loss": 0.9128, + "num_input_tokens_seen": 984513016, + "step": 2167, + "train_runtime": 154392.1294, + "train_tokens_per_second": 6376.705 + }, + { + "epoch": 2.6247505178823105, + "grad_norm": 0.23384696245193481, + "learning_rate": 5e-06, + "loss": 0.957, + "num_input_tokens_seen": 984968520, + "step": 2168, + "train_runtime": 154459.688, + "train_tokens_per_second": 6376.865 + }, + { + "epoch": 2.625961274699912, + "grad_norm": 0.233501136302948, + "learning_rate": 5e-06, + "loss": 0.8981, + "num_input_tokens_seen": 985445104, + "step": 2169, + "train_runtime": 154529.8475, + "train_tokens_per_second": 6377.053 + }, + { + "epoch": 2.6271720315175133, + "grad_norm": 0.2487708330154419, + "learning_rate": 5e-06, + "loss": 0.8331, + "num_input_tokens_seen": 985877536, + "step": 2170, + "train_runtime": 154593.5382, + "train_tokens_per_second": 6377.223 + }, + { + "epoch": 2.6283827883351147, + "grad_norm": 0.23218314349651337, + "learning_rate": 5e-06, + "loss": 0.95, + "num_input_tokens_seen": 986316704, + "step": 2171, + "train_runtime": 154657.7931, + "train_tokens_per_second": 6377.414 + }, + { + "epoch": 2.629593545152716, + "grad_norm": 0.2570416033267975, + "learning_rate": 5e-06, + "loss": 0.8875, + "num_input_tokens_seen": 986752352, + "step": 2172, + "train_runtime": 154721.7442, + "train_tokens_per_second": 6377.593 + }, + { + "epoch": 2.6308043019703176, + "grad_norm": 0.2248729020357132, + "learning_rate": 5e-06, + "loss": 0.8952, + "num_input_tokens_seen": 987227168, + "step": 2173, + "train_runtime": 154792.0587, + "train_tokens_per_second": 6377.764 + }, + { + "epoch": 2.632015058787919, + "grad_norm": 0.23463650047779083, + "learning_rate": 5e-06, + "loss": 0.9381, + "num_input_tokens_seen": 987681176, + "step": 2174, + "train_runtime": 154858.7972, + "train_tokens_per_second": 6377.947 + }, + { + "epoch": 2.6332258156055204, + "grad_norm": 0.24335210025310516, + "learning_rate": 5e-06, + "loss": 0.9145, + "num_input_tokens_seen": 988153656, + "step": 2175, + "train_runtime": 154928.7355, + "train_tokens_per_second": 6378.117 + }, + { + "epoch": 2.634436572423122, + "grad_norm": 0.24974526464939117, + "learning_rate": 5e-06, + "loss": 0.9623, + "num_input_tokens_seen": 988604512, + "step": 2176, + "train_runtime": 154995.4793, + "train_tokens_per_second": 6378.28 + }, + { + "epoch": 2.635647329240723, + "grad_norm": 0.23491837084293365, + "learning_rate": 5e-06, + "loss": 0.925, + "num_input_tokens_seen": 989041648, + "step": 2177, + "train_runtime": 155060.32, + "train_tokens_per_second": 6378.432 + }, + { + "epoch": 2.636858086058324, + "grad_norm": 0.2458321899175644, + "learning_rate": 5e-06, + "loss": 0.9523, + "num_input_tokens_seen": 989484256, + "step": 2178, + "train_runtime": 155126.0643, + "train_tokens_per_second": 6378.582 + }, + { + "epoch": 2.638068842875926, + "grad_norm": 0.28002414107322693, + "learning_rate": 5e-06, + "loss": 0.9521, + "num_input_tokens_seen": 989918248, + "step": 2179, + "train_runtime": 155190.3341, + "train_tokens_per_second": 6378.736 + }, + { + "epoch": 2.639279599693527, + "grad_norm": 0.2300572544336319, + "learning_rate": 5e-06, + "loss": 0.9063, + "num_input_tokens_seen": 990388592, + "step": 2180, + "train_runtime": 155259.9905, + "train_tokens_per_second": 6378.904 + }, + { + "epoch": 2.6404903565111284, + "grad_norm": 0.23866574466228485, + "learning_rate": 5e-06, + "loss": 0.9428, + "num_input_tokens_seen": 990856224, + "step": 2181, + "train_runtime": 155329.3298, + "train_tokens_per_second": 6379.067 + }, + { + "epoch": 2.64170111332873, + "grad_norm": 0.2776472270488739, + "learning_rate": 5e-06, + "loss": 0.8795, + "num_input_tokens_seen": 991300536, + "step": 2182, + "train_runtime": 155395.1153, + "train_tokens_per_second": 6379.226 + }, + { + "epoch": 2.642911870146331, + "grad_norm": 0.24262697994709015, + "learning_rate": 5e-06, + "loss": 0.8967, + "num_input_tokens_seen": 991741200, + "step": 2183, + "train_runtime": 155459.8715, + "train_tokens_per_second": 6379.403 + }, + { + "epoch": 2.6441226269639326, + "grad_norm": 0.25825032591819763, + "learning_rate": 5e-06, + "loss": 0.9596, + "num_input_tokens_seen": 992195600, + "step": 2184, + "train_runtime": 155526.9125, + "train_tokens_per_second": 6379.575 + }, + { + "epoch": 2.645333383781534, + "grad_norm": 0.25043049454689026, + "learning_rate": 5e-06, + "loss": 0.9482, + "num_input_tokens_seen": 992628344, + "step": 2185, + "train_runtime": 155590.6849, + "train_tokens_per_second": 6379.741 + }, + { + "epoch": 2.6465441405991355, + "grad_norm": 0.24327807128429413, + "learning_rate": 5e-06, + "loss": 0.9272, + "num_input_tokens_seen": 993091536, + "step": 2186, + "train_runtime": 155659.1615, + "train_tokens_per_second": 6379.911 + }, + { + "epoch": 2.647754897416737, + "grad_norm": 0.26455357670783997, + "learning_rate": 5e-06, + "loss": 0.9241, + "num_input_tokens_seen": 993539712, + "step": 2187, + "train_runtime": 155725.1034, + "train_tokens_per_second": 6380.087 + }, + { + "epoch": 2.6489656542343383, + "grad_norm": 0.24340102076530457, + "learning_rate": 5e-06, + "loss": 0.9586, + "num_input_tokens_seen": 993981440, + "step": 2188, + "train_runtime": 155790.4247, + "train_tokens_per_second": 6380.247 + }, + { + "epoch": 2.6501764110519397, + "grad_norm": 0.24760021269321442, + "learning_rate": 5e-06, + "loss": 0.9187, + "num_input_tokens_seen": 994453512, + "step": 2189, + "train_runtime": 155860.8542, + "train_tokens_per_second": 6380.393 + }, + { + "epoch": 2.651387167869541, + "grad_norm": 0.25143691897392273, + "learning_rate": 5e-06, + "loss": 0.9341, + "num_input_tokens_seen": 994882632, + "step": 2190, + "train_runtime": 155923.9754, + "train_tokens_per_second": 6380.562 + }, + { + "epoch": 2.652597924687142, + "grad_norm": 0.2322501242160797, + "learning_rate": 5e-06, + "loss": 0.8817, + "num_input_tokens_seen": 995341712, + "step": 2191, + "train_runtime": 155992.0321, + "train_tokens_per_second": 6380.721 + }, + { + "epoch": 2.653808681504744, + "grad_norm": 0.23527227342128754, + "learning_rate": 5e-06, + "loss": 0.9028, + "num_input_tokens_seen": 995823768, + "step": 2192, + "train_runtime": 156063.4226, + "train_tokens_per_second": 6380.892 + }, + { + "epoch": 2.655019438322345, + "grad_norm": 0.2262798249721527, + "learning_rate": 5e-06, + "loss": 0.9217, + "num_input_tokens_seen": 996285904, + "step": 2193, + "train_runtime": 156131.4608, + "train_tokens_per_second": 6381.071 + }, + { + "epoch": 2.6562301951399467, + "grad_norm": 0.24012240767478943, + "learning_rate": 5e-06, + "loss": 0.9692, + "num_input_tokens_seen": 996740120, + "step": 2194, + "train_runtime": 156198.4805, + "train_tokens_per_second": 6381.241 + }, + { + "epoch": 2.6574409519575477, + "grad_norm": 0.2367800921201706, + "learning_rate": 5e-06, + "loss": 0.8777, + "num_input_tokens_seen": 997206712, + "step": 2195, + "train_runtime": 156267.5628, + "train_tokens_per_second": 6381.406 + }, + { + "epoch": 2.658651708775149, + "grad_norm": 0.22343586385250092, + "learning_rate": 5e-06, + "loss": 0.8786, + "num_input_tokens_seen": 997680768, + "step": 2196, + "train_runtime": 156337.5139, + "train_tokens_per_second": 6381.583 + }, + { + "epoch": 2.6598624655927505, + "grad_norm": 0.2318398803472519, + "learning_rate": 5e-06, + "loss": 0.9135, + "num_input_tokens_seen": 998140896, + "step": 2197, + "train_runtime": 156405.329, + "train_tokens_per_second": 6381.758 + }, + { + "epoch": 2.661073222410352, + "grad_norm": 0.23731204867362976, + "learning_rate": 5e-06, + "loss": 0.9568, + "num_input_tokens_seen": 998581544, + "step": 2198, + "train_runtime": 156470.6646, + "train_tokens_per_second": 6381.909 + }, + { + "epoch": 2.6622839792279533, + "grad_norm": 0.2556219696998596, + "learning_rate": 5e-06, + "loss": 0.9705, + "num_input_tokens_seen": 999021544, + "step": 2199, + "train_runtime": 156535.5906, + "train_tokens_per_second": 6382.073 + }, + { + "epoch": 2.6634947360455548, + "grad_norm": 0.25406965613365173, + "learning_rate": 5e-06, + "loss": 0.9043, + "num_input_tokens_seen": 999456000, + "step": 2200, + "train_runtime": 156600.1967, + "train_tokens_per_second": 6382.214 + }, + { + "epoch": 2.664705492863156, + "grad_norm": 0.2429000586271286, + "learning_rate": 5e-06, + "loss": 1.0054, + "num_input_tokens_seen": 999917048, + "step": 2201, + "train_runtime": 156667.8114, + "train_tokens_per_second": 6382.403 + }, + { + "epoch": 2.6659162496807576, + "grad_norm": 0.24307996034622192, + "learning_rate": 5e-06, + "loss": 0.9031, + "num_input_tokens_seen": 1000345496, + "step": 2202, + "train_runtime": 156731.558, + "train_tokens_per_second": 6382.54 + }, + { + "epoch": 2.667127006498359, + "grad_norm": 0.2613001763820648, + "learning_rate": 5e-06, + "loss": 0.9146, + "num_input_tokens_seen": 1000775024, + "step": 2203, + "train_runtime": 156796.6198, + "train_tokens_per_second": 6382.631 + }, + { + "epoch": 2.6683377633159604, + "grad_norm": 0.2521812617778778, + "learning_rate": 5e-06, + "loss": 0.9449, + "num_input_tokens_seen": 1001214208, + "step": 2204, + "train_runtime": 156861.3258, + "train_tokens_per_second": 6382.798 + }, + { + "epoch": 2.669548520133562, + "grad_norm": 0.22570690512657166, + "learning_rate": 5e-06, + "loss": 0.8841, + "num_input_tokens_seen": 1001691928, + "step": 2205, + "train_runtime": 156931.9112, + "train_tokens_per_second": 6382.972 + }, + { + "epoch": 2.6707592769511628, + "grad_norm": 0.2318730354309082, + "learning_rate": 5e-06, + "loss": 0.89, + "num_input_tokens_seen": 1002167032, + "step": 2206, + "train_runtime": 157002.068, + "train_tokens_per_second": 6383.145 + }, + { + "epoch": 2.6719700337687646, + "grad_norm": 0.26219817996025085, + "learning_rate": 5e-06, + "loss": 0.8752, + "num_input_tokens_seen": 1002621280, + "step": 2207, + "train_runtime": 157069.3885, + "train_tokens_per_second": 6383.302 + }, + { + "epoch": 2.6731807905863656, + "grad_norm": 0.23726919293403625, + "learning_rate": 5e-06, + "loss": 0.8883, + "num_input_tokens_seen": 1003092128, + "step": 2208, + "train_runtime": 157139.0605, + "train_tokens_per_second": 6383.468 + }, + { + "epoch": 2.674391547403967, + "grad_norm": 0.24438372254371643, + "learning_rate": 5e-06, + "loss": 0.9029, + "num_input_tokens_seen": 1003551736, + "step": 2209, + "train_runtime": 157207.0617, + "train_tokens_per_second": 6383.63 + }, + { + "epoch": 2.6756023042215684, + "grad_norm": 0.2408195585012436, + "learning_rate": 5e-06, + "loss": 0.9428, + "num_input_tokens_seen": 1003992272, + "step": 2210, + "train_runtime": 157272.0435, + "train_tokens_per_second": 6383.794 + }, + { + "epoch": 2.67681306103917, + "grad_norm": 0.23207896947860718, + "learning_rate": 5e-06, + "loss": 0.9647, + "num_input_tokens_seen": 1004464200, + "step": 2211, + "train_runtime": 157341.891, + "train_tokens_per_second": 6383.959 + }, + { + "epoch": 2.6780238178567712, + "grad_norm": 0.26204124093055725, + "learning_rate": 5e-06, + "loss": 0.911, + "num_input_tokens_seen": 1004918912, + "step": 2212, + "train_runtime": 157409.1733, + "train_tokens_per_second": 6384.119 + }, + { + "epoch": 2.6792345746743726, + "grad_norm": 0.24040430784225464, + "learning_rate": 5e-06, + "loss": 0.8979, + "num_input_tokens_seen": 1005365120, + "step": 2213, + "train_runtime": 157475.3659, + "train_tokens_per_second": 6384.269 + }, + { + "epoch": 2.680445331491974, + "grad_norm": 0.2542877495288849, + "learning_rate": 5e-06, + "loss": 0.9251, + "num_input_tokens_seen": 1005804512, + "step": 2214, + "train_runtime": 157540.2892, + "train_tokens_per_second": 6384.427 + }, + { + "epoch": 2.6816560883095755, + "grad_norm": 0.2377696931362152, + "learning_rate": 5e-06, + "loss": 0.8792, + "num_input_tokens_seen": 1006260000, + "step": 2215, + "train_runtime": 157607.9347, + "train_tokens_per_second": 6384.577 + }, + { + "epoch": 2.682866845127177, + "grad_norm": 0.23569363355636597, + "learning_rate": 5e-06, + "loss": 0.9299, + "num_input_tokens_seen": 1006738640, + "step": 2216, + "train_runtime": 157678.5377, + "train_tokens_per_second": 6384.754 + }, + { + "epoch": 2.6840776019447783, + "grad_norm": 0.2451499104499817, + "learning_rate": 5e-06, + "loss": 0.9518, + "num_input_tokens_seen": 1007184656, + "step": 2217, + "train_runtime": 157744.1772, + "train_tokens_per_second": 6384.924 + }, + { + "epoch": 2.6852883587623797, + "grad_norm": 0.2470230609178543, + "learning_rate": 5e-06, + "loss": 0.927, + "num_input_tokens_seen": 1007627432, + "step": 2218, + "train_runtime": 157809.6053, + "train_tokens_per_second": 6385.083 + }, + { + "epoch": 2.6864991155799807, + "grad_norm": 0.24628578126430511, + "learning_rate": 5e-06, + "loss": 0.9285, + "num_input_tokens_seen": 1008077632, + "step": 2219, + "train_runtime": 157876.7498, + "train_tokens_per_second": 6385.219 + }, + { + "epoch": 2.6877098723975825, + "grad_norm": 0.2287086695432663, + "learning_rate": 5e-06, + "loss": 0.9016, + "num_input_tokens_seen": 1008546760, + "step": 2220, + "train_runtime": 157946.0167, + "train_tokens_per_second": 6385.389 + }, + { + "epoch": 2.6889206292151835, + "grad_norm": 0.25545141100883484, + "learning_rate": 5e-06, + "loss": 0.9121, + "num_input_tokens_seen": 1009016192, + "step": 2221, + "train_runtime": 158015.3011, + "train_tokens_per_second": 6385.56 + }, + { + "epoch": 2.690131386032785, + "grad_norm": 0.22718414664268494, + "learning_rate": 5e-06, + "loss": 0.9043, + "num_input_tokens_seen": 1009470936, + "step": 2222, + "train_runtime": 158082.415, + "train_tokens_per_second": 6385.726 + }, + { + "epoch": 2.6913421428503863, + "grad_norm": 0.2230096310377121, + "learning_rate": 5e-06, + "loss": 0.9253, + "num_input_tokens_seen": 1009952464, + "step": 2223, + "train_runtime": 158154.0362, + "train_tokens_per_second": 6385.879 + }, + { + "epoch": 2.6925528996679877, + "grad_norm": 0.23132304847240448, + "learning_rate": 5e-06, + "loss": 0.9229, + "num_input_tokens_seen": 1010405368, + "step": 2224, + "train_runtime": 158220.9026, + "train_tokens_per_second": 6386.042 + }, + { + "epoch": 2.693763656485589, + "grad_norm": 0.22348402440547943, + "learning_rate": 5e-06, + "loss": 0.8622, + "num_input_tokens_seen": 1010875144, + "step": 2225, + "train_runtime": 158290.9411, + "train_tokens_per_second": 6386.184 + }, + { + "epoch": 2.6949744133031905, + "grad_norm": 0.25180912017822266, + "learning_rate": 5e-06, + "loss": 0.934, + "num_input_tokens_seen": 1011328552, + "step": 2226, + "train_runtime": 158358.6024, + "train_tokens_per_second": 6386.319 + }, + { + "epoch": 2.696185170120792, + "grad_norm": 0.23059271275997162, + "learning_rate": 5e-06, + "loss": 0.9463, + "num_input_tokens_seen": 1011765960, + "step": 2227, + "train_runtime": 158423.3929, + "train_tokens_per_second": 6386.468 + }, + { + "epoch": 2.6973959269383934, + "grad_norm": 0.23937790095806122, + "learning_rate": 5e-06, + "loss": 0.9551, + "num_input_tokens_seen": 1012221776, + "step": 2228, + "train_runtime": 158490.551, + "train_tokens_per_second": 6386.638 + }, + { + "epoch": 2.6986066837559948, + "grad_norm": 0.23659993708133698, + "learning_rate": 5e-06, + "loss": 0.8974, + "num_input_tokens_seen": 1012684104, + "step": 2229, + "train_runtime": 158559.0434, + "train_tokens_per_second": 6386.795 + }, + { + "epoch": 2.699817440573596, + "grad_norm": 0.2516309320926666, + "learning_rate": 5e-06, + "loss": 0.965, + "num_input_tokens_seen": 1013148096, + "step": 2230, + "train_runtime": 158627.7065, + "train_tokens_per_second": 6386.955 + }, + { + "epoch": 2.7010281973911976, + "grad_norm": 0.25234147906303406, + "learning_rate": 5e-06, + "loss": 0.8918, + "num_input_tokens_seen": 1013602984, + "step": 2231, + "train_runtime": 158694.9766, + "train_tokens_per_second": 6387.114 + }, + { + "epoch": 2.7022389542087986, + "grad_norm": 0.2293567955493927, + "learning_rate": 5e-06, + "loss": 0.9286, + "num_input_tokens_seen": 1014069312, + "step": 2232, + "train_runtime": 158764.1805, + "train_tokens_per_second": 6387.268 + }, + { + "epoch": 2.7034497110264004, + "grad_norm": 0.2538798153400421, + "learning_rate": 5e-06, + "loss": 0.9303, + "num_input_tokens_seen": 1014506672, + "step": 2233, + "train_runtime": 158828.7542, + "train_tokens_per_second": 6387.424 + }, + { + "epoch": 2.7046604678440014, + "grad_norm": 0.25850167870521545, + "learning_rate": 5e-06, + "loss": 0.9683, + "num_input_tokens_seen": 1014949832, + "step": 2234, + "train_runtime": 158894.4091, + "train_tokens_per_second": 6387.574 + }, + { + "epoch": 2.705871224661603, + "grad_norm": 0.23657569289207458, + "learning_rate": 5e-06, + "loss": 0.9358, + "num_input_tokens_seen": 1015406592, + "step": 2235, + "train_runtime": 158962.0909, + "train_tokens_per_second": 6387.728 + }, + { + "epoch": 2.707081981479204, + "grad_norm": 0.2708401381969452, + "learning_rate": 5e-06, + "loss": 0.8852, + "num_input_tokens_seen": 1015841432, + "step": 2236, + "train_runtime": 159026.4419, + "train_tokens_per_second": 6387.878 + }, + { + "epoch": 2.7082927382968056, + "grad_norm": 0.24284714460372925, + "learning_rate": 5e-06, + "loss": 0.9317, + "num_input_tokens_seen": 1016289424, + "step": 2237, + "train_runtime": 159092.4722, + "train_tokens_per_second": 6388.042 + }, + { + "epoch": 2.709503495114407, + "grad_norm": 0.25480154156684875, + "learning_rate": 5e-06, + "loss": 0.9192, + "num_input_tokens_seen": 1016757504, + "step": 2238, + "train_runtime": 159161.1907, + "train_tokens_per_second": 6388.225 + }, + { + "epoch": 2.7107142519320084, + "grad_norm": 0.24498331546783447, + "learning_rate": 5e-06, + "loss": 0.8707, + "num_input_tokens_seen": 1017194928, + "step": 2239, + "train_runtime": 159226.0946, + "train_tokens_per_second": 6388.368 + }, + { + "epoch": 2.71192500874961, + "grad_norm": 0.26153630018234253, + "learning_rate": 5e-06, + "loss": 0.9109, + "num_input_tokens_seen": 1017660048, + "step": 2240, + "train_runtime": 159295.3482, + "train_tokens_per_second": 6388.511 + }, + { + "epoch": 2.7131357655672113, + "grad_norm": 0.28027719259262085, + "learning_rate": 5e-06, + "loss": 0.9673, + "num_input_tokens_seen": 1018086808, + "step": 2241, + "train_runtime": 159358.1538, + "train_tokens_per_second": 6388.671 + }, + { + "epoch": 2.7143465223848127, + "grad_norm": 0.3030099868774414, + "learning_rate": 5e-06, + "loss": 0.9419, + "num_input_tokens_seen": 1018528160, + "step": 2242, + "train_runtime": 159423.698, + "train_tokens_per_second": 6388.813 + }, + { + "epoch": 2.715557279202414, + "grad_norm": 0.2761872112751007, + "learning_rate": 5e-06, + "loss": 0.9203, + "num_input_tokens_seen": 1018979560, + "step": 2243, + "train_runtime": 159490.4403, + "train_tokens_per_second": 6388.97 + }, + { + "epoch": 2.7167680360200155, + "grad_norm": 0.2496478110551834, + "learning_rate": 5e-06, + "loss": 0.9394, + "num_input_tokens_seen": 1019421000, + "step": 2244, + "train_runtime": 159555.4925, + "train_tokens_per_second": 6389.131 + }, + { + "epoch": 2.7179787928376165, + "grad_norm": 0.33068129420280457, + "learning_rate": 5e-06, + "loss": 0.917, + "num_input_tokens_seen": 1019848624, + "step": 2245, + "train_runtime": 159619.1337, + "train_tokens_per_second": 6389.263 + }, + { + "epoch": 2.7191895496552183, + "grad_norm": 0.26431363821029663, + "learning_rate": 5e-06, + "loss": 0.957, + "num_input_tokens_seen": 1020290632, + "step": 2246, + "train_runtime": 159684.4414, + "train_tokens_per_second": 6389.418 + }, + { + "epoch": 2.7204003064728193, + "grad_norm": 0.25907760858535767, + "learning_rate": 5e-06, + "loss": 0.9241, + "num_input_tokens_seen": 1020772456, + "step": 2247, + "train_runtime": 159755.9211, + "train_tokens_per_second": 6389.575 + }, + { + "epoch": 2.7216110632904207, + "grad_norm": 0.26979854702949524, + "learning_rate": 5e-06, + "loss": 0.9457, + "num_input_tokens_seen": 1021232408, + "step": 2248, + "train_runtime": 159823.991, + "train_tokens_per_second": 6389.732 + }, + { + "epoch": 2.722821820108022, + "grad_norm": 0.27133068442344666, + "learning_rate": 5e-06, + "loss": 0.9257, + "num_input_tokens_seen": 1021703000, + "step": 2249, + "train_runtime": 159893.8238, + "train_tokens_per_second": 6389.884 + }, + { + "epoch": 2.7240325769256235, + "grad_norm": 0.2623973786830902, + "learning_rate": 5e-06, + "loss": 0.9326, + "num_input_tokens_seen": 1022143904, + "step": 2250, + "train_runtime": 159958.597, + "train_tokens_per_second": 6390.053 + }, + { + "epoch": 2.725243333743225, + "grad_norm": 0.2727581262588501, + "learning_rate": 5e-06, + "loss": 0.8614, + "num_input_tokens_seen": 1022607144, + "step": 2251, + "train_runtime": 160027.1697, + "train_tokens_per_second": 6390.21 + }, + { + "epoch": 2.7264540905608263, + "grad_norm": 0.2389581948518753, + "learning_rate": 5e-06, + "loss": 0.9231, + "num_input_tokens_seen": 1023053400, + "step": 2252, + "train_runtime": 160093.3628, + "train_tokens_per_second": 6390.355 + }, + { + "epoch": 2.7276648473784277, + "grad_norm": 0.2514803409576416, + "learning_rate": 5e-06, + "loss": 0.9391, + "num_input_tokens_seen": 1023527608, + "step": 2253, + "train_runtime": 160163.4694, + "train_tokens_per_second": 6390.518 + }, + { + "epoch": 2.728875604196029, + "grad_norm": 0.24334073066711426, + "learning_rate": 5e-06, + "loss": 0.9595, + "num_input_tokens_seen": 1023989968, + "step": 2254, + "train_runtime": 160231.8243, + "train_tokens_per_second": 6390.678 + }, + { + "epoch": 2.7300863610136306, + "grad_norm": 0.2730535864830017, + "learning_rate": 5e-06, + "loss": 0.9543, + "num_input_tokens_seen": 1024436392, + "step": 2255, + "train_runtime": 160298.2361, + "train_tokens_per_second": 6390.815 + }, + { + "epoch": 2.731297117831232, + "grad_norm": 0.257646769285202, + "learning_rate": 5e-06, + "loss": 0.9654, + "num_input_tokens_seen": 1024877272, + "step": 2256, + "train_runtime": 160364.5888, + "train_tokens_per_second": 6390.92 + }, + { + "epoch": 2.7325078746488334, + "grad_norm": 0.2575959265232086, + "learning_rate": 5e-06, + "loss": 0.9102, + "num_input_tokens_seen": 1025345304, + "step": 2257, + "train_runtime": 160434.8659, + "train_tokens_per_second": 6391.038 + }, + { + "epoch": 2.7337186314664343, + "grad_norm": 0.24817879498004913, + "learning_rate": 5e-06, + "loss": 0.9336, + "num_input_tokens_seen": 1025789800, + "step": 2258, + "train_runtime": 160500.5173, + "train_tokens_per_second": 6391.193 + }, + { + "epoch": 2.734929388284036, + "grad_norm": 0.23942458629608154, + "learning_rate": 5e-06, + "loss": 0.9363, + "num_input_tokens_seen": 1026251136, + "step": 2259, + "train_runtime": 160569.0846, + "train_tokens_per_second": 6391.337 + }, + { + "epoch": 2.736140145101637, + "grad_norm": 0.24998879432678223, + "learning_rate": 5e-06, + "loss": 0.9369, + "num_input_tokens_seen": 1026706720, + "step": 2260, + "train_runtime": 160636.2779, + "train_tokens_per_second": 6391.5 + }, + { + "epoch": 2.7373509019192386, + "grad_norm": 0.2262594848871231, + "learning_rate": 5e-06, + "loss": 0.8807, + "num_input_tokens_seen": 1027191856, + "step": 2261, + "train_runtime": 160708.6042, + "train_tokens_per_second": 6391.642 + }, + { + "epoch": 2.73856165873684, + "grad_norm": 0.25167331099510193, + "learning_rate": 5e-06, + "loss": 0.9173, + "num_input_tokens_seen": 1027631680, + "step": 2262, + "train_runtime": 160773.4123, + "train_tokens_per_second": 6391.801 + }, + { + "epoch": 2.7397724155544414, + "grad_norm": 0.25715553760528564, + "learning_rate": 5e-06, + "loss": 0.8853, + "num_input_tokens_seen": 1028082536, + "step": 2263, + "train_runtime": 160840.2689, + "train_tokens_per_second": 6391.947 + }, + { + "epoch": 2.740983172372043, + "grad_norm": 0.23995672166347504, + "learning_rate": 5e-06, + "loss": 0.9307, + "num_input_tokens_seen": 1028562968, + "step": 2264, + "train_runtime": 160911.4001, + "train_tokens_per_second": 6392.108 + }, + { + "epoch": 2.742193929189644, + "grad_norm": 0.24075527489185333, + "learning_rate": 5e-06, + "loss": 0.8959, + "num_input_tokens_seen": 1028999072, + "step": 2265, + "train_runtime": 160975.9822, + "train_tokens_per_second": 6392.252 + }, + { + "epoch": 2.7434046860072456, + "grad_norm": 0.24111104011535645, + "learning_rate": 5e-06, + "loss": 0.9342, + "num_input_tokens_seen": 1029436168, + "step": 2266, + "train_runtime": 161040.4703, + "train_tokens_per_second": 6392.407 + }, + { + "epoch": 2.744615442824847, + "grad_norm": 0.24185071885585785, + "learning_rate": 5e-06, + "loss": 0.9124, + "num_input_tokens_seen": 1029895528, + "step": 2267, + "train_runtime": 161108.4302, + "train_tokens_per_second": 6392.561 + }, + { + "epoch": 2.7458261996424485, + "grad_norm": 0.2595217525959015, + "learning_rate": 5e-06, + "loss": 0.9333, + "num_input_tokens_seen": 1030313776, + "step": 2268, + "train_runtime": 161170.4004, + "train_tokens_per_second": 6392.698 + }, + { + "epoch": 2.74703695646005, + "grad_norm": 0.2608698308467865, + "learning_rate": 5e-06, + "loss": 0.9352, + "num_input_tokens_seen": 1030773200, + "step": 2269, + "train_runtime": 161238.33, + "train_tokens_per_second": 6392.855 + }, + { + "epoch": 2.7482477132776513, + "grad_norm": 0.2337881624698639, + "learning_rate": 5e-06, + "loss": 0.903, + "num_input_tokens_seen": 1031251608, + "step": 2270, + "train_runtime": 161309.7518, + "train_tokens_per_second": 6392.99 + }, + { + "epoch": 2.7494584700952522, + "grad_norm": 0.23849591612815857, + "learning_rate": 5e-06, + "loss": 0.9358, + "num_input_tokens_seen": 1031721848, + "step": 2271, + "train_runtime": 161379.4792, + "train_tokens_per_second": 6393.142 + }, + { + "epoch": 2.750669226912854, + "grad_norm": 0.2555829882621765, + "learning_rate": 5e-06, + "loss": 0.9064, + "num_input_tokens_seen": 1032189752, + "step": 2272, + "train_runtime": 161448.9849, + "train_tokens_per_second": 6393.287 + }, + { + "epoch": 2.751879983730455, + "grad_norm": 0.2797653079032898, + "learning_rate": 5e-06, + "loss": 0.8848, + "num_input_tokens_seen": 1032639608, + "step": 2273, + "train_runtime": 161515.5469, + "train_tokens_per_second": 6393.438 + }, + { + "epoch": 2.753090740548057, + "grad_norm": 0.23317913711071014, + "learning_rate": 5e-06, + "loss": 0.9238, + "num_input_tokens_seen": 1033080176, + "step": 2274, + "train_runtime": 161580.6508, + "train_tokens_per_second": 6393.588 + }, + { + "epoch": 2.754301497365658, + "grad_norm": 0.24477601051330566, + "learning_rate": 5e-06, + "loss": 0.9416, + "num_input_tokens_seen": 1033520656, + "step": 2275, + "train_runtime": 161645.5728, + "train_tokens_per_second": 6393.746 + }, + { + "epoch": 2.7555122541832593, + "grad_norm": 0.2364787459373474, + "learning_rate": 5e-06, + "loss": 0.9096, + "num_input_tokens_seen": 1033971520, + "step": 2276, + "train_runtime": 161712.1981, + "train_tokens_per_second": 6393.899 + }, + { + "epoch": 2.7567230110008607, + "grad_norm": 0.2605726718902588, + "learning_rate": 5e-06, + "loss": 0.8921, + "num_input_tokens_seen": 1034395088, + "step": 2277, + "train_runtime": 161774.2815, + "train_tokens_per_second": 6394.064 + }, + { + "epoch": 2.757933767818462, + "grad_norm": 0.25480276346206665, + "learning_rate": 5e-06, + "loss": 0.9177, + "num_input_tokens_seen": 1034838552, + "step": 2278, + "train_runtime": 161839.7302, + "train_tokens_per_second": 6394.218 + }, + { + "epoch": 2.7591445246360635, + "grad_norm": 0.23550397157669067, + "learning_rate": 5e-06, + "loss": 0.8798, + "num_input_tokens_seen": 1035301976, + "step": 2279, + "train_runtime": 161908.0151, + "train_tokens_per_second": 6394.384 + }, + { + "epoch": 2.760355281453665, + "grad_norm": 0.23179112374782562, + "learning_rate": 5e-06, + "loss": 0.8953, + "num_input_tokens_seen": 1035774664, + "step": 2280, + "train_runtime": 161978.1853, + "train_tokens_per_second": 6394.532 + }, + { + "epoch": 2.7615660382712663, + "grad_norm": 0.28745490312576294, + "learning_rate": 5e-06, + "loss": 0.9014, + "num_input_tokens_seen": 1036207656, + "step": 2281, + "train_runtime": 162042.7376, + "train_tokens_per_second": 6394.657 + }, + { + "epoch": 2.7627767950888678, + "grad_norm": 0.2519856095314026, + "learning_rate": 5e-06, + "loss": 0.906, + "num_input_tokens_seen": 1036649640, + "step": 2282, + "train_runtime": 162107.9902, + "train_tokens_per_second": 6394.809 + }, + { + "epoch": 2.763987551906469, + "grad_norm": 0.2556043863296509, + "learning_rate": 5e-06, + "loss": 0.9393, + "num_input_tokens_seen": 1037124656, + "step": 2283, + "train_runtime": 162178.7126, + "train_tokens_per_second": 6394.949 + }, + { + "epoch": 2.7651983087240706, + "grad_norm": 0.22922824323177338, + "learning_rate": 5e-06, + "loss": 0.8837, + "num_input_tokens_seen": 1037595840, + "step": 2284, + "train_runtime": 162248.143, + "train_tokens_per_second": 6395.117 + }, + { + "epoch": 2.766409065541672, + "grad_norm": 0.24666735529899597, + "learning_rate": 5e-06, + "loss": 0.9671, + "num_input_tokens_seen": 1038036336, + "step": 2285, + "train_runtime": 162313.37, + "train_tokens_per_second": 6395.261 + }, + { + "epoch": 2.767619822359273, + "grad_norm": 0.27610307931900024, + "learning_rate": 5e-06, + "loss": 0.8669, + "num_input_tokens_seen": 1038496960, + "step": 2286, + "train_runtime": 162381.4019, + "train_tokens_per_second": 6395.418 + }, + { + "epoch": 2.768830579176875, + "grad_norm": 0.2348206490278244, + "learning_rate": 5e-06, + "loss": 0.9206, + "num_input_tokens_seen": 1038953384, + "step": 2287, + "train_runtime": 162448.8975, + "train_tokens_per_second": 6395.571 + }, + { + "epoch": 2.7700413359944758, + "grad_norm": 0.2505703270435333, + "learning_rate": 5e-06, + "loss": 0.88, + "num_input_tokens_seen": 1039421160, + "step": 2288, + "train_runtime": 162517.7659, + "train_tokens_per_second": 6395.739 + }, + { + "epoch": 2.771252092812077, + "grad_norm": 0.23638983070850372, + "learning_rate": 5e-06, + "loss": 0.8718, + "num_input_tokens_seen": 1039873192, + "step": 2289, + "train_runtime": 162584.5274, + "train_tokens_per_second": 6395.893 + }, + { + "epoch": 2.7724628496296786, + "grad_norm": 0.23076026141643524, + "learning_rate": 5e-06, + "loss": 0.8999, + "num_input_tokens_seen": 1040363160, + "step": 2290, + "train_runtime": 162657.0792, + "train_tokens_per_second": 6396.052 + }, + { + "epoch": 2.77367360644728, + "grad_norm": 0.25392135977745056, + "learning_rate": 5e-06, + "loss": 0.9451, + "num_input_tokens_seen": 1040790536, + "step": 2291, + "train_runtime": 162720.1261, + "train_tokens_per_second": 6396.2 + }, + { + "epoch": 2.7748843632648814, + "grad_norm": 0.24415422976016998, + "learning_rate": 5e-06, + "loss": 0.8886, + "num_input_tokens_seen": 1041230312, + "step": 2292, + "train_runtime": 162785.4545, + "train_tokens_per_second": 6396.335 + }, + { + "epoch": 2.776095120082483, + "grad_norm": 0.2522631585597992, + "learning_rate": 5e-06, + "loss": 0.9343, + "num_input_tokens_seen": 1041675088, + "step": 2293, + "train_runtime": 162851.2507, + "train_tokens_per_second": 6396.482 + }, + { + "epoch": 2.7773058769000842, + "grad_norm": 0.23644526302814484, + "learning_rate": 5e-06, + "loss": 0.9147, + "num_input_tokens_seen": 1042146576, + "step": 2294, + "train_runtime": 162920.8304, + "train_tokens_per_second": 6396.644 + }, + { + "epoch": 2.7785166337176856, + "grad_norm": 0.25499239563941956, + "learning_rate": 5e-06, + "loss": 0.9219, + "num_input_tokens_seen": 1042611024, + "step": 2295, + "train_runtime": 162989.4272, + "train_tokens_per_second": 6396.802 + }, + { + "epoch": 2.779727390535287, + "grad_norm": 0.2416430562734604, + "learning_rate": 5e-06, + "loss": 0.9671, + "num_input_tokens_seen": 1043078696, + "step": 2296, + "train_runtime": 163058.9679, + "train_tokens_per_second": 6396.942 + }, + { + "epoch": 2.7809381473528885, + "grad_norm": 0.2677612900733948, + "learning_rate": 5e-06, + "loss": 0.8946, + "num_input_tokens_seen": 1043533320, + "step": 2297, + "train_runtime": 163126.3482, + "train_tokens_per_second": 6397.086 + }, + { + "epoch": 2.78214890417049, + "grad_norm": 0.25840872526168823, + "learning_rate": 5e-06, + "loss": 0.9096, + "num_input_tokens_seen": 1043990936, + "step": 2298, + "train_runtime": 163194.0314, + "train_tokens_per_second": 6397.237 + }, + { + "epoch": 2.783359660988091, + "grad_norm": 0.2687901556491852, + "learning_rate": 5e-06, + "loss": 0.9439, + "num_input_tokens_seen": 1044449824, + "step": 2299, + "train_runtime": 163262.0571, + "train_tokens_per_second": 6397.382 + }, + { + "epoch": 2.7845704178056927, + "grad_norm": 0.27363818883895874, + "learning_rate": 5e-06, + "loss": 0.9365, + "num_input_tokens_seen": 1044906680, + "step": 2300, + "train_runtime": 163329.4795, + "train_tokens_per_second": 6397.539 + }, + { + "epoch": 2.7857811746232937, + "grad_norm": 0.2355838119983673, + "learning_rate": 5e-06, + "loss": 0.8774, + "num_input_tokens_seen": 1045377512, + "step": 2301, + "train_runtime": 163398.8973, + "train_tokens_per_second": 6397.702 + }, + { + "epoch": 2.786991931440895, + "grad_norm": 0.24392828345298767, + "learning_rate": 5e-06, + "loss": 0.9311, + "num_input_tokens_seen": 1045821864, + "step": 2302, + "train_runtime": 163465.0191, + "train_tokens_per_second": 6397.833 + }, + { + "epoch": 2.7882026882584965, + "grad_norm": 0.22444923222064972, + "learning_rate": 5e-06, + "loss": 0.9476, + "num_input_tokens_seen": 1046281000, + "step": 2303, + "train_runtime": 163532.6383, + "train_tokens_per_second": 6397.995 + }, + { + "epoch": 2.789413445076098, + "grad_norm": 0.23800964653491974, + "learning_rate": 5e-06, + "loss": 0.9248, + "num_input_tokens_seen": 1046761784, + "step": 2304, + "train_runtime": 163603.0082, + "train_tokens_per_second": 6398.182 + }, + { + "epoch": 2.7906242018936993, + "grad_norm": 0.2515329420566559, + "learning_rate": 5e-06, + "loss": 0.8946, + "num_input_tokens_seen": 1047209968, + "step": 2305, + "train_runtime": 163669.4581, + "train_tokens_per_second": 6398.322 + }, + { + "epoch": 2.7918349587113007, + "grad_norm": 0.27853265404701233, + "learning_rate": 5e-06, + "loss": 1.03, + "num_input_tokens_seen": 1047644576, + "step": 2306, + "train_runtime": 163733.387, + "train_tokens_per_second": 6398.479 + }, + { + "epoch": 2.793045715528902, + "grad_norm": 0.23654435575008392, + "learning_rate": 5e-06, + "loss": 0.8594, + "num_input_tokens_seen": 1048113472, + "step": 2307, + "train_runtime": 163802.8984, + "train_tokens_per_second": 6398.626 + }, + { + "epoch": 2.7942564723465035, + "grad_norm": 0.24784903228282928, + "learning_rate": 5e-06, + "loss": 0.9906, + "num_input_tokens_seen": 1048549400, + "step": 2308, + "train_runtime": 163867.387, + "train_tokens_per_second": 6398.768 + }, + { + "epoch": 2.795467229164105, + "grad_norm": 0.24880841374397278, + "learning_rate": 5e-06, + "loss": 0.9033, + "num_input_tokens_seen": 1048995416, + "step": 2309, + "train_runtime": 163934.0955, + "train_tokens_per_second": 6398.885 + }, + { + "epoch": 2.7966779859817064, + "grad_norm": 0.24870271980762482, + "learning_rate": 5e-06, + "loss": 0.8977, + "num_input_tokens_seen": 1049470880, + "step": 2310, + "train_runtime": 164006.5254, + "train_tokens_per_second": 6398.958 + }, + { + "epoch": 2.7978887427993078, + "grad_norm": 0.23734253644943237, + "learning_rate": 5e-06, + "loss": 0.9153, + "num_input_tokens_seen": 1049914560, + "step": 2311, + "train_runtime": 164071.9182, + "train_tokens_per_second": 6399.112 + }, + { + "epoch": 2.7990994996169087, + "grad_norm": 0.25134560465812683, + "learning_rate": 5e-06, + "loss": 0.9128, + "num_input_tokens_seen": 1050357984, + "step": 2312, + "train_runtime": 164137.1428, + "train_tokens_per_second": 6399.271 + }, + { + "epoch": 2.8003102564345106, + "grad_norm": 0.2300664782524109, + "learning_rate": 5e-06, + "loss": 0.9098, + "num_input_tokens_seen": 1050819792, + "step": 2313, + "train_runtime": 164205.6195, + "train_tokens_per_second": 6399.414 + }, + { + "epoch": 2.8015210132521116, + "grad_norm": 0.22979731857776642, + "learning_rate": 5e-06, + "loss": 0.9221, + "num_input_tokens_seen": 1051285056, + "step": 2314, + "train_runtime": 164274.736, + "train_tokens_per_second": 6399.554 + }, + { + "epoch": 2.802731770069713, + "grad_norm": 0.23814600706100464, + "learning_rate": 5e-06, + "loss": 0.9126, + "num_input_tokens_seen": 1051755944, + "step": 2315, + "train_runtime": 164344.435, + "train_tokens_per_second": 6399.705 + }, + { + "epoch": 2.8039425268873144, + "grad_norm": 0.21878504753112793, + "learning_rate": 5e-06, + "loss": 0.9152, + "num_input_tokens_seen": 1052229800, + "step": 2316, + "train_runtime": 164414.3881, + "train_tokens_per_second": 6399.864 + }, + { + "epoch": 2.805153283704916, + "grad_norm": 0.24749340116977692, + "learning_rate": 5e-06, + "loss": 0.9028, + "num_input_tokens_seen": 1052673152, + "step": 2317, + "train_runtime": 164480.1965, + "train_tokens_per_second": 6399.999 + }, + { + "epoch": 2.806364040522517, + "grad_norm": 0.23459599912166595, + "learning_rate": 5e-06, + "loss": 0.9071, + "num_input_tokens_seen": 1053111560, + "step": 2318, + "train_runtime": 164545.2472, + "train_tokens_per_second": 6400.134 + }, + { + "epoch": 2.8075747973401186, + "grad_norm": 0.24665674567222595, + "learning_rate": 5e-06, + "loss": 0.9421, + "num_input_tokens_seen": 1053560808, + "step": 2319, + "train_runtime": 164611.4841, + "train_tokens_per_second": 6400.287 + }, + { + "epoch": 2.80878555415772, + "grad_norm": 0.24288515746593475, + "learning_rate": 5e-06, + "loss": 0.9171, + "num_input_tokens_seen": 1054028536, + "step": 2320, + "train_runtime": 164680.3862, + "train_tokens_per_second": 6400.45 + }, + { + "epoch": 2.8099963109753214, + "grad_norm": 0.24747171998023987, + "learning_rate": 5e-06, + "loss": 0.9457, + "num_input_tokens_seen": 1054488288, + "step": 2321, + "train_runtime": 164748.1304, + "train_tokens_per_second": 6400.609 + }, + { + "epoch": 2.811207067792923, + "grad_norm": 0.23851259052753448, + "learning_rate": 5e-06, + "loss": 0.9325, + "num_input_tokens_seen": 1054961000, + "step": 2322, + "train_runtime": 164818.1256, + "train_tokens_per_second": 6400.758 + }, + { + "epoch": 2.8124178246105243, + "grad_norm": 0.24482108652591705, + "learning_rate": 5e-06, + "loss": 0.8696, + "num_input_tokens_seen": 1055393152, + "step": 2323, + "train_runtime": 164882.1388, + "train_tokens_per_second": 6400.894 + }, + { + "epoch": 2.8136285814281257, + "grad_norm": 0.24118374288082123, + "learning_rate": 5e-06, + "loss": 0.9024, + "num_input_tokens_seen": 1055852976, + "step": 2324, + "train_runtime": 164950.2745, + "train_tokens_per_second": 6401.038 + }, + { + "epoch": 2.8148393382457266, + "grad_norm": 0.2338990718126297, + "learning_rate": 5e-06, + "loss": 0.8967, + "num_input_tokens_seen": 1056302096, + "step": 2325, + "train_runtime": 165016.6859, + "train_tokens_per_second": 6401.184 + }, + { + "epoch": 2.8160500950633285, + "grad_norm": 0.249686598777771, + "learning_rate": 5e-06, + "loss": 0.9129, + "num_input_tokens_seen": 1056769208, + "step": 2326, + "train_runtime": 165086.0367, + "train_tokens_per_second": 6401.324 + }, + { + "epoch": 2.8172608518809295, + "grad_norm": 0.24016061425209045, + "learning_rate": 5e-06, + "loss": 0.8875, + "num_input_tokens_seen": 1057246168, + "step": 2327, + "train_runtime": 165156.8351, + "train_tokens_per_second": 6401.468 + }, + { + "epoch": 2.818471608698531, + "grad_norm": 0.2340596616268158, + "learning_rate": 5e-06, + "loss": 0.871, + "num_input_tokens_seen": 1057694632, + "step": 2328, + "train_runtime": 165223.3174, + "train_tokens_per_second": 6401.606 + }, + { + "epoch": 2.8196823655161323, + "grad_norm": 0.23390509188175201, + "learning_rate": 5e-06, + "loss": 0.9014, + "num_input_tokens_seen": 1058166320, + "step": 2329, + "train_runtime": 165292.9359, + "train_tokens_per_second": 6401.764 + }, + { + "epoch": 2.8208931223337337, + "grad_norm": 0.240423783659935, + "learning_rate": 5e-06, + "loss": 0.9042, + "num_input_tokens_seen": 1058610264, + "step": 2330, + "train_runtime": 165358.5343, + "train_tokens_per_second": 6401.909 + }, + { + "epoch": 2.822103879151335, + "grad_norm": 0.23276259005069733, + "learning_rate": 5e-06, + "loss": 0.956, + "num_input_tokens_seen": 1059081800, + "step": 2331, + "train_runtime": 165428.3498, + "train_tokens_per_second": 6402.057 + }, + { + "epoch": 2.8233146359689365, + "grad_norm": 0.23118719458580017, + "learning_rate": 5e-06, + "loss": 0.9006, + "num_input_tokens_seen": 1059516272, + "step": 2332, + "train_runtime": 165492.7144, + "train_tokens_per_second": 6402.193 + }, + { + "epoch": 2.824525392786538, + "grad_norm": 0.24484090507030487, + "learning_rate": 5e-06, + "loss": 0.899, + "num_input_tokens_seen": 1059975872, + "step": 2333, + "train_runtime": 165560.5144, + "train_tokens_per_second": 6402.347 + }, + { + "epoch": 2.8257361496041393, + "grad_norm": 0.26148274540901184, + "learning_rate": 5e-06, + "loss": 0.8908, + "num_input_tokens_seen": 1060424984, + "step": 2334, + "train_runtime": 165626.6035, + "train_tokens_per_second": 6402.504 + }, + { + "epoch": 2.8269469064217407, + "grad_norm": 0.23295333981513977, + "learning_rate": 5e-06, + "loss": 0.9084, + "num_input_tokens_seen": 1060878880, + "step": 2335, + "train_runtime": 165693.7989, + "train_tokens_per_second": 6402.647 + }, + { + "epoch": 2.828157663239342, + "grad_norm": 0.2560044229030609, + "learning_rate": 5e-06, + "loss": 0.9266, + "num_input_tokens_seen": 1061345072, + "step": 2336, + "train_runtime": 165762.4034, + "train_tokens_per_second": 6402.809 + }, + { + "epoch": 2.8293684200569436, + "grad_norm": 0.293335497379303, + "learning_rate": 5e-06, + "loss": 0.9615, + "num_input_tokens_seen": 1061791296, + "step": 2337, + "train_runtime": 165827.814, + "train_tokens_per_second": 6402.975 + }, + { + "epoch": 2.8305791768745445, + "grad_norm": 0.250169038772583, + "learning_rate": 5e-06, + "loss": 0.8951, + "num_input_tokens_seen": 1062258544, + "step": 2338, + "train_runtime": 165897.2325, + "train_tokens_per_second": 6403.112 + }, + { + "epoch": 2.8317899336921464, + "grad_norm": 0.24368995428085327, + "learning_rate": 5e-06, + "loss": 0.8965, + "num_input_tokens_seen": 1062713712, + "step": 2339, + "train_runtime": 165964.4766, + "train_tokens_per_second": 6403.26 + }, + { + "epoch": 2.8330006905097473, + "grad_norm": 0.26345351338386536, + "learning_rate": 5e-06, + "loss": 0.9173, + "num_input_tokens_seen": 1063171088, + "step": 2340, + "train_runtime": 166032.2225, + "train_tokens_per_second": 6403.402 + }, + { + "epoch": 2.8342114473273488, + "grad_norm": 0.26429590582847595, + "learning_rate": 5e-06, + "loss": 0.9012, + "num_input_tokens_seen": 1063627344, + "step": 2341, + "train_runtime": 166099.2017, + "train_tokens_per_second": 6403.567 + }, + { + "epoch": 2.83542220414495, + "grad_norm": 0.2443588227033615, + "learning_rate": 5e-06, + "loss": 0.8997, + "num_input_tokens_seen": 1064085144, + "step": 2342, + "train_runtime": 166167.5411, + "train_tokens_per_second": 6403.688 + }, + { + "epoch": 2.8366329609625516, + "grad_norm": 0.2589036226272583, + "learning_rate": 5e-06, + "loss": 0.9135, + "num_input_tokens_seen": 1064548048, + "step": 2343, + "train_runtime": 166236.1917, + "train_tokens_per_second": 6403.828 + }, + { + "epoch": 2.837843717780153, + "grad_norm": 0.24053068459033966, + "learning_rate": 5e-06, + "loss": 0.9705, + "num_input_tokens_seen": 1065004800, + "step": 2344, + "train_runtime": 166304.0312, + "train_tokens_per_second": 6403.963 + }, + { + "epoch": 2.8390544745977544, + "grad_norm": 0.2785547971725464, + "learning_rate": 5e-06, + "loss": 0.9527, + "num_input_tokens_seen": 1065466952, + "step": 2345, + "train_runtime": 166372.1789, + "train_tokens_per_second": 6404.117 + }, + { + "epoch": 2.840265231415356, + "grad_norm": 0.2642555236816406, + "learning_rate": 5e-06, + "loss": 0.8824, + "num_input_tokens_seen": 1065920384, + "step": 2346, + "train_runtime": 166439.2664, + "train_tokens_per_second": 6404.26 + }, + { + "epoch": 2.841475988232957, + "grad_norm": 0.24180537462234497, + "learning_rate": 5e-06, + "loss": 0.9517, + "num_input_tokens_seen": 1066358144, + "step": 2347, + "train_runtime": 166504.0514, + "train_tokens_per_second": 6404.398 + }, + { + "epoch": 2.8426867450505586, + "grad_norm": 0.2499978095293045, + "learning_rate": 5e-06, + "loss": 0.9408, + "num_input_tokens_seen": 1066833672, + "step": 2348, + "train_runtime": 166574.2519, + "train_tokens_per_second": 6404.553 + }, + { + "epoch": 2.84389750186816, + "grad_norm": 0.24067756533622742, + "learning_rate": 5e-06, + "loss": 0.9379, + "num_input_tokens_seen": 1067297648, + "step": 2349, + "train_runtime": 166642.8687, + "train_tokens_per_second": 6404.7 + }, + { + "epoch": 2.8451082586857614, + "grad_norm": 0.242728590965271, + "learning_rate": 5e-06, + "loss": 0.9005, + "num_input_tokens_seen": 1067762344, + "step": 2350, + "train_runtime": 166711.6274, + "train_tokens_per_second": 6404.846 + }, + { + "epoch": 2.8463190155033624, + "grad_norm": 0.23392565548419952, + "learning_rate": 5e-06, + "loss": 0.9443, + "num_input_tokens_seen": 1068211064, + "step": 2351, + "train_runtime": 166778.0705, + "train_tokens_per_second": 6404.985 + }, + { + "epoch": 2.8475297723209643, + "grad_norm": 0.24133005738258362, + "learning_rate": 5e-06, + "loss": 0.9389, + "num_input_tokens_seen": 1068666112, + "step": 2352, + "train_runtime": 166845.1732, + "train_tokens_per_second": 6405.137 + }, + { + "epoch": 2.8487405291385652, + "grad_norm": 0.25078195333480835, + "learning_rate": 5e-06, + "loss": 0.8236, + "num_input_tokens_seen": 1069135528, + "step": 2353, + "train_runtime": 166914.9802, + "train_tokens_per_second": 6405.27 + }, + { + "epoch": 2.849951285956167, + "grad_norm": 0.2609178125858307, + "learning_rate": 5e-06, + "loss": 0.9244, + "num_input_tokens_seen": 1069602576, + "step": 2354, + "train_runtime": 166984.0381, + "train_tokens_per_second": 6405.418 + }, + { + "epoch": 2.851162042773768, + "grad_norm": 0.25105518102645874, + "learning_rate": 5e-06, + "loss": 0.907, + "num_input_tokens_seen": 1070071800, + "step": 2355, + "train_runtime": 167053.8585, + "train_tokens_per_second": 6405.55 + }, + { + "epoch": 2.8523727995913695, + "grad_norm": 0.2414802759885788, + "learning_rate": 5e-06, + "loss": 0.9481, + "num_input_tokens_seen": 1070521528, + "step": 2356, + "train_runtime": 167120.4676, + "train_tokens_per_second": 6405.688 + }, + { + "epoch": 2.853583556408971, + "grad_norm": 0.25838810205459595, + "learning_rate": 5e-06, + "loss": 0.9217, + "num_input_tokens_seen": 1070945224, + "step": 2357, + "train_runtime": 167183.2228, + "train_tokens_per_second": 6405.818 + }, + { + "epoch": 2.8547943132265723, + "grad_norm": 0.24007445573806763, + "learning_rate": 5e-06, + "loss": 0.9555, + "num_input_tokens_seen": 1071419872, + "step": 2358, + "train_runtime": 167253.3886, + "train_tokens_per_second": 6405.968 + }, + { + "epoch": 2.8560050700441737, + "grad_norm": 0.23948095738887787, + "learning_rate": 5e-06, + "loss": 0.9238, + "num_input_tokens_seen": 1071871960, + "step": 2359, + "train_runtime": 167320.3656, + "train_tokens_per_second": 6406.106 + }, + { + "epoch": 2.857215826861775, + "grad_norm": 0.24351708590984344, + "learning_rate": 5e-06, + "loss": 0.9199, + "num_input_tokens_seen": 1072315552, + "step": 2360, + "train_runtime": 167386.0141, + "train_tokens_per_second": 6406.243 + }, + { + "epoch": 2.8584265836793765, + "grad_norm": 0.2502671778202057, + "learning_rate": 5e-06, + "loss": 0.9532, + "num_input_tokens_seen": 1072785312, + "step": 2361, + "train_runtime": 167455.5199, + "train_tokens_per_second": 6406.39 + }, + { + "epoch": 2.859637340496978, + "grad_norm": 0.2391587793827057, + "learning_rate": 5e-06, + "loss": 0.9096, + "num_input_tokens_seen": 1073222072, + "step": 2362, + "train_runtime": 167520.3372, + "train_tokens_per_second": 6406.518 + }, + { + "epoch": 2.8608480973145793, + "grad_norm": 0.2657223641872406, + "learning_rate": 5e-06, + "loss": 0.9262, + "num_input_tokens_seen": 1073678976, + "step": 2363, + "train_runtime": 167589.7726, + "train_tokens_per_second": 6406.59 + }, + { + "epoch": 2.8620588541321808, + "grad_norm": 0.22655223309993744, + "learning_rate": 5e-06, + "loss": 0.9214, + "num_input_tokens_seen": 1074141744, + "step": 2364, + "train_runtime": 167658.8468, + "train_tokens_per_second": 6406.711 + }, + { + "epoch": 2.863269610949782, + "grad_norm": 0.22646227478981018, + "learning_rate": 5e-06, + "loss": 0.918, + "num_input_tokens_seen": 1074598560, + "step": 2365, + "train_runtime": 167726.4287, + "train_tokens_per_second": 6406.853 + }, + { + "epoch": 2.864480367767383, + "grad_norm": 0.23975032567977905, + "learning_rate": 5e-06, + "loss": 0.9442, + "num_input_tokens_seen": 1075063080, + "step": 2366, + "train_runtime": 167794.848, + "train_tokens_per_second": 6407.009 + }, + { + "epoch": 2.865691124584985, + "grad_norm": 0.2874181568622589, + "learning_rate": 5e-06, + "loss": 0.9366, + "num_input_tokens_seen": 1075494480, + "step": 2367, + "train_runtime": 167858.4872, + "train_tokens_per_second": 6407.15 + }, + { + "epoch": 2.866901881402586, + "grad_norm": 0.23288311064243317, + "learning_rate": 5e-06, + "loss": 0.8927, + "num_input_tokens_seen": 1075946304, + "step": 2368, + "train_runtime": 167925.2658, + "train_tokens_per_second": 6407.293 + }, + { + "epoch": 2.8681126382201874, + "grad_norm": 0.22665363550186157, + "learning_rate": 5e-06, + "loss": 0.9149, + "num_input_tokens_seen": 1076407848, + "step": 2369, + "train_runtime": 167993.6545, + "train_tokens_per_second": 6407.432 + }, + { + "epoch": 2.8693233950377888, + "grad_norm": 0.2529769539833069, + "learning_rate": 5e-06, + "loss": 0.9091, + "num_input_tokens_seen": 1076849656, + "step": 2370, + "train_runtime": 168058.8499, + "train_tokens_per_second": 6407.575 + }, + { + "epoch": 2.87053415185539, + "grad_norm": 0.2684330344200134, + "learning_rate": 5e-06, + "loss": 0.9756, + "num_input_tokens_seen": 1077281544, + "step": 2371, + "train_runtime": 168122.0355, + "train_tokens_per_second": 6407.736 + }, + { + "epoch": 2.8717449086729916, + "grad_norm": 0.2409277856349945, + "learning_rate": 5e-06, + "loss": 0.8855, + "num_input_tokens_seen": 1077754688, + "step": 2372, + "train_runtime": 168192.2219, + "train_tokens_per_second": 6407.875 + }, + { + "epoch": 2.872955665490593, + "grad_norm": 0.28829601407051086, + "learning_rate": 5e-06, + "loss": 0.9238, + "num_input_tokens_seen": 1078223256, + "step": 2373, + "train_runtime": 168261.3837, + "train_tokens_per_second": 6408.026 + }, + { + "epoch": 2.8741664223081944, + "grad_norm": 0.2507815361022949, + "learning_rate": 5e-06, + "loss": 0.8819, + "num_input_tokens_seen": 1078704120, + "step": 2374, + "train_runtime": 168332.5096, + "train_tokens_per_second": 6408.175 + }, + { + "epoch": 2.875377179125796, + "grad_norm": 0.24085399508476257, + "learning_rate": 5e-06, + "loss": 0.9693, + "num_input_tokens_seen": 1079153752, + "step": 2375, + "train_runtime": 168398.802, + "train_tokens_per_second": 6408.322 + }, + { + "epoch": 2.8765879359433972, + "grad_norm": 0.23706606030464172, + "learning_rate": 5e-06, + "loss": 0.931, + "num_input_tokens_seen": 1079604888, + "step": 2376, + "train_runtime": 168465.9087, + "train_tokens_per_second": 6408.447 + }, + { + "epoch": 2.8777986927609986, + "grad_norm": 0.2508695721626282, + "learning_rate": 5e-06, + "loss": 0.9629, + "num_input_tokens_seen": 1080043568, + "step": 2377, + "train_runtime": 168530.2954, + "train_tokens_per_second": 6408.602 + }, + { + "epoch": 2.8790094495786, + "grad_norm": 0.25791847705841064, + "learning_rate": 5e-06, + "loss": 0.9672, + "num_input_tokens_seen": 1080495520, + "step": 2378, + "train_runtime": 168597.8644, + "train_tokens_per_second": 6408.714 + }, + { + "epoch": 2.880220206396201, + "grad_norm": 0.24105577170848846, + "learning_rate": 5e-06, + "loss": 0.9289, + "num_input_tokens_seen": 1080951824, + "step": 2379, + "train_runtime": 168665.4403, + "train_tokens_per_second": 6408.852 + }, + { + "epoch": 2.881430963213803, + "grad_norm": 0.2576942443847656, + "learning_rate": 5e-06, + "loss": 0.953, + "num_input_tokens_seen": 1081410648, + "step": 2380, + "train_runtime": 168733.6066, + "train_tokens_per_second": 6408.982 + }, + { + "epoch": 2.882641720031404, + "grad_norm": 0.2406541258096695, + "learning_rate": 5e-06, + "loss": 0.96, + "num_input_tokens_seen": 1081845248, + "step": 2381, + "train_runtime": 168797.6162, + "train_tokens_per_second": 6409.126 + }, + { + "epoch": 2.8838524768490053, + "grad_norm": 0.24252809584140778, + "learning_rate": 5e-06, + "loss": 0.9154, + "num_input_tokens_seen": 1082325608, + "step": 2382, + "train_runtime": 168868.6096, + "train_tokens_per_second": 6409.276 + }, + { + "epoch": 2.8850632336666067, + "grad_norm": 0.23159775137901306, + "learning_rate": 5e-06, + "loss": 0.9114, + "num_input_tokens_seen": 1082791040, + "step": 2383, + "train_runtime": 168937.7744, + "train_tokens_per_second": 6409.408 + }, + { + "epoch": 2.886273990484208, + "grad_norm": 0.22753025591373444, + "learning_rate": 5e-06, + "loss": 0.8676, + "num_input_tokens_seen": 1083225872, + "step": 2384, + "train_runtime": 169002.5358, + "train_tokens_per_second": 6409.524 + }, + { + "epoch": 2.8874847473018095, + "grad_norm": 0.2409481555223465, + "learning_rate": 5e-06, + "loss": 0.9381, + "num_input_tokens_seen": 1083672288, + "step": 2385, + "train_runtime": 169068.1993, + "train_tokens_per_second": 6409.675 + }, + { + "epoch": 2.888695504119411, + "grad_norm": 0.2493268996477127, + "learning_rate": 5e-06, + "loss": 0.9092, + "num_input_tokens_seen": 1084097152, + "step": 2386, + "train_runtime": 169130.9436, + "train_tokens_per_second": 6409.81 + }, + { + "epoch": 2.8899062609370123, + "grad_norm": 0.23205333948135376, + "learning_rate": 5e-06, + "loss": 0.8994, + "num_input_tokens_seen": 1084576688, + "step": 2387, + "train_runtime": 169202.0957, + "train_tokens_per_second": 6409.948 + }, + { + "epoch": 2.8911170177546137, + "grad_norm": 0.2507234811782837, + "learning_rate": 5e-06, + "loss": 0.8826, + "num_input_tokens_seen": 1085033800, + "step": 2388, + "train_runtime": 169269.553, + "train_tokens_per_second": 6410.094 + }, + { + "epoch": 2.892327774572215, + "grad_norm": 0.23068372905254364, + "learning_rate": 5e-06, + "loss": 0.8812, + "num_input_tokens_seen": 1085492936, + "step": 2389, + "train_runtime": 169337.2832, + "train_tokens_per_second": 6410.242 + }, + { + "epoch": 2.8935385313898165, + "grad_norm": 0.2602866590023041, + "learning_rate": 5e-06, + "loss": 0.9405, + "num_input_tokens_seen": 1085947584, + "step": 2390, + "train_runtime": 169404.5365, + "train_tokens_per_second": 6410.381 + }, + { + "epoch": 2.894749288207418, + "grad_norm": 0.24214865267276764, + "learning_rate": 5e-06, + "loss": 0.9443, + "num_input_tokens_seen": 1086393744, + "step": 2391, + "train_runtime": 169470.7089, + "train_tokens_per_second": 6410.51 + }, + { + "epoch": 2.895960045025019, + "grad_norm": 0.24468237161636353, + "learning_rate": 5e-06, + "loss": 0.9122, + "num_input_tokens_seen": 1086862296, + "step": 2392, + "train_runtime": 169540.7414, + "train_tokens_per_second": 6410.626 + }, + { + "epoch": 2.8971708018426208, + "grad_norm": 0.23451176285743713, + "learning_rate": 5e-06, + "loss": 0.9174, + "num_input_tokens_seen": 1087323160, + "step": 2393, + "train_runtime": 169608.3951, + "train_tokens_per_second": 6410.786 + }, + { + "epoch": 2.8983815586602217, + "grad_norm": 0.2530493140220642, + "learning_rate": 5e-06, + "loss": 0.893, + "num_input_tokens_seen": 1087787360, + "step": 2394, + "train_runtime": 169677.6888, + "train_tokens_per_second": 6410.904 + }, + { + "epoch": 2.899592315477823, + "grad_norm": 0.24401098489761353, + "learning_rate": 5e-06, + "loss": 0.959, + "num_input_tokens_seen": 1088250688, + "step": 2395, + "train_runtime": 169746.5348, + "train_tokens_per_second": 6411.033 + }, + { + "epoch": 2.9008030722954246, + "grad_norm": 0.25914639234542847, + "learning_rate": 5e-06, + "loss": 0.9092, + "num_input_tokens_seen": 1088720056, + "step": 2396, + "train_runtime": 169816.2236, + "train_tokens_per_second": 6411.166 + }, + { + "epoch": 2.902013829113026, + "grad_norm": 0.24759583175182343, + "learning_rate": 5e-06, + "loss": 0.9658, + "num_input_tokens_seen": 1089175536, + "step": 2397, + "train_runtime": 169884.2797, + "train_tokens_per_second": 6411.279 + }, + { + "epoch": 2.9032245859306274, + "grad_norm": 0.23406663537025452, + "learning_rate": 5e-06, + "loss": 0.9147, + "num_input_tokens_seen": 1089629544, + "step": 2398, + "train_runtime": 169951.7513, + "train_tokens_per_second": 6411.405 + }, + { + "epoch": 2.904435342748229, + "grad_norm": 0.23380409181118011, + "learning_rate": 5e-06, + "loss": 0.8736, + "num_input_tokens_seen": 1090090760, + "step": 2399, + "train_runtime": 170020.1649, + "train_tokens_per_second": 6411.538 + }, + { + "epoch": 2.90564609956583, + "grad_norm": 0.2372436821460724, + "learning_rate": 5e-06, + "loss": 0.9888, + "num_input_tokens_seen": 1090533784, + "step": 2400, + "train_runtime": 170085.5892, + "train_tokens_per_second": 6411.677 + }, + { + "epoch": 2.9068568563834316, + "grad_norm": 0.23042653501033783, + "learning_rate": 5e-06, + "loss": 0.8597, + "num_input_tokens_seen": 1090995800, + "step": 2401, + "train_runtime": 170153.6357, + "train_tokens_per_second": 6411.828 + }, + { + "epoch": 2.908067613201033, + "grad_norm": 0.23908060789108276, + "learning_rate": 5e-06, + "loss": 0.9105, + "num_input_tokens_seen": 1091454616, + "step": 2402, + "train_runtime": 170221.8303, + "train_tokens_per_second": 6411.954 + }, + { + "epoch": 2.9092783700186344, + "grad_norm": 0.2331104278564453, + "learning_rate": 5e-06, + "loss": 0.8998, + "num_input_tokens_seen": 1091927576, + "step": 2403, + "train_runtime": 170290.973, + "train_tokens_per_second": 6412.128 + }, + { + "epoch": 2.910489126836236, + "grad_norm": 0.24983853101730347, + "learning_rate": 5e-06, + "loss": 0.9253, + "num_input_tokens_seen": 1092399736, + "step": 2404, + "train_runtime": 170360.7078, + "train_tokens_per_second": 6412.275 + }, + { + "epoch": 2.911699883653837, + "grad_norm": 0.2480890452861786, + "learning_rate": 5e-06, + "loss": 0.8912, + "num_input_tokens_seen": 1092848752, + "step": 2405, + "train_runtime": 170427.5587, + "train_tokens_per_second": 6412.395 + }, + { + "epoch": 2.9129106404714387, + "grad_norm": 0.27480000257492065, + "learning_rate": 5e-06, + "loss": 0.9469, + "num_input_tokens_seen": 1093302472, + "step": 2406, + "train_runtime": 170494.7997, + "train_tokens_per_second": 6412.527 + }, + { + "epoch": 2.9141213972890396, + "grad_norm": 0.23799914121627808, + "learning_rate": 5e-06, + "loss": 0.8758, + "num_input_tokens_seen": 1093755192, + "step": 2407, + "train_runtime": 170561.4757, + "train_tokens_per_second": 6412.674 + }, + { + "epoch": 2.915332154106641, + "grad_norm": 0.24148832261562347, + "learning_rate": 5e-06, + "loss": 0.9984, + "num_input_tokens_seen": 1094210248, + "step": 2408, + "train_runtime": 170628.6094, + "train_tokens_per_second": 6412.818 + }, + { + "epoch": 2.9165429109242424, + "grad_norm": 0.2666292190551758, + "learning_rate": 5e-06, + "loss": 0.9139, + "num_input_tokens_seen": 1094648584, + "step": 2409, + "train_runtime": 170693.3721, + "train_tokens_per_second": 6412.953 + }, + { + "epoch": 2.917753667741844, + "grad_norm": 0.26424267888069153, + "learning_rate": 5e-06, + "loss": 0.9111, + "num_input_tokens_seen": 1095095048, + "step": 2410, + "train_runtime": 170759.9277, + "train_tokens_per_second": 6413.068 + }, + { + "epoch": 2.9189644245594453, + "grad_norm": 0.24830442667007446, + "learning_rate": 5e-06, + "loss": 0.9129, + "num_input_tokens_seen": 1095568936, + "step": 2411, + "train_runtime": 170830.0067, + "train_tokens_per_second": 6413.211 + }, + { + "epoch": 2.9201751813770467, + "grad_norm": 0.2383262813091278, + "learning_rate": 5e-06, + "loss": 0.9718, + "num_input_tokens_seen": 1096018672, + "step": 2412, + "train_runtime": 170896.6645, + "train_tokens_per_second": 6413.342 + }, + { + "epoch": 2.921385938194648, + "grad_norm": 0.24399027228355408, + "learning_rate": 5e-06, + "loss": 0.8699, + "num_input_tokens_seen": 1096467472, + "step": 2413, + "train_runtime": 170962.5005, + "train_tokens_per_second": 6413.497 + }, + { + "epoch": 2.9225966950122495, + "grad_norm": 0.25469908118247986, + "learning_rate": 5e-06, + "loss": 0.9116, + "num_input_tokens_seen": 1096916056, + "step": 2414, + "train_runtime": 171028.7924, + "train_tokens_per_second": 6413.634 + }, + { + "epoch": 2.923807451829851, + "grad_norm": 0.24257248640060425, + "learning_rate": 5e-06, + "loss": 0.9307, + "num_input_tokens_seen": 1097360792, + "step": 2415, + "train_runtime": 171094.5732, + "train_tokens_per_second": 6413.767 + }, + { + "epoch": 2.9250182086474523, + "grad_norm": 0.2582697570323944, + "learning_rate": 5e-06, + "loss": 0.9129, + "num_input_tokens_seen": 1097812096, + "step": 2416, + "train_runtime": 171162.4853, + "train_tokens_per_second": 6413.859 + }, + { + "epoch": 2.9262289654650537, + "grad_norm": 0.25255024433135986, + "learning_rate": 5e-06, + "loss": 0.9374, + "num_input_tokens_seen": 1098273232, + "step": 2417, + "train_runtime": 171231.7513, + "train_tokens_per_second": 6413.958 + }, + { + "epoch": 2.9274397222826547, + "grad_norm": 0.2257550060749054, + "learning_rate": 5e-06, + "loss": 0.8717, + "num_input_tokens_seen": 1098747232, + "step": 2418, + "train_runtime": 171301.7027, + "train_tokens_per_second": 6414.106 + }, + { + "epoch": 2.9286504791002566, + "grad_norm": 0.2679274380207062, + "learning_rate": 5e-06, + "loss": 0.9413, + "num_input_tokens_seen": 1099191672, + "step": 2419, + "train_runtime": 171368.2395, + "train_tokens_per_second": 6414.209 + }, + { + "epoch": 2.9298612359178575, + "grad_norm": 0.2332017421722412, + "learning_rate": 5e-06, + "loss": 0.8987, + "num_input_tokens_seen": 1099651328, + "step": 2420, + "train_runtime": 171440.7464, + "train_tokens_per_second": 6414.177 + }, + { + "epoch": 2.931071992735459, + "grad_norm": 0.24200941622257233, + "learning_rate": 5e-06, + "loss": 0.901, + "num_input_tokens_seen": 1100118920, + "step": 2421, + "train_runtime": 171514.9266, + "train_tokens_per_second": 6414.129 + }, + { + "epoch": 2.9322827495530603, + "grad_norm": 0.25546711683273315, + "learning_rate": 5e-06, + "loss": 0.9191, + "num_input_tokens_seen": 1100554296, + "step": 2422, + "train_runtime": 171583.4479, + "train_tokens_per_second": 6414.105 + }, + { + "epoch": 2.9334935063706618, + "grad_norm": 0.25552940368652344, + "learning_rate": 5e-06, + "loss": 0.8448, + "num_input_tokens_seen": 1101022976, + "step": 2423, + "train_runtime": 171657.7679, + "train_tokens_per_second": 6414.059 + }, + { + "epoch": 2.934704263188263, + "grad_norm": 0.23404908180236816, + "learning_rate": 5e-06, + "loss": 0.8804, + "num_input_tokens_seen": 1101486248, + "step": 2424, + "train_runtime": 171731.0221, + "train_tokens_per_second": 6414.02 + }, + { + "epoch": 2.9359150200058646, + "grad_norm": 0.23336048424243927, + "learning_rate": 5e-06, + "loss": 0.8998, + "num_input_tokens_seen": 1101941064, + "step": 2425, + "train_runtime": 171803.1314, + "train_tokens_per_second": 6413.975 + }, + { + "epoch": 2.937125776823466, + "grad_norm": 0.24817214906215668, + "learning_rate": 5e-06, + "loss": 0.9435, + "num_input_tokens_seen": 1102393528, + "step": 2426, + "train_runtime": 171874.1839, + "train_tokens_per_second": 6413.956 + }, + { + "epoch": 2.9383365336410674, + "grad_norm": 0.25328731536865234, + "learning_rate": 5e-06, + "loss": 0.9053, + "num_input_tokens_seen": 1102852280, + "step": 2427, + "train_runtime": 171946.8179, + "train_tokens_per_second": 6413.915 + }, + { + "epoch": 2.939547290458669, + "grad_norm": 0.26048070192337036, + "learning_rate": 5e-06, + "loss": 0.9076, + "num_input_tokens_seen": 1103317024, + "step": 2428, + "train_runtime": 172020.4148, + "train_tokens_per_second": 6413.873 + }, + { + "epoch": 2.94075804727627, + "grad_norm": 0.262016236782074, + "learning_rate": 5e-06, + "loss": 0.9485, + "num_input_tokens_seen": 1103764712, + "step": 2429, + "train_runtime": 172091.0589, + "train_tokens_per_second": 6413.841 + }, + { + "epoch": 2.9419688040938716, + "grad_norm": 0.26942306756973267, + "learning_rate": 5e-06, + "loss": 0.955, + "num_input_tokens_seen": 1104217608, + "step": 2430, + "train_runtime": 172162.6557, + "train_tokens_per_second": 6413.804 + }, + { + "epoch": 2.9431795609114726, + "grad_norm": 0.23582886159420013, + "learning_rate": 5e-06, + "loss": 0.9273, + "num_input_tokens_seen": 1104678152, + "step": 2431, + "train_runtime": 172234.9096, + "train_tokens_per_second": 6413.788 + }, + { + "epoch": 2.9443903177290744, + "grad_norm": 0.23661422729492188, + "learning_rate": 5e-06, + "loss": 0.8838, + "num_input_tokens_seen": 1105153976, + "step": 2432, + "train_runtime": 172310.273, + "train_tokens_per_second": 6413.744 + }, + { + "epoch": 2.9456010745466754, + "grad_norm": 0.2636778652667999, + "learning_rate": 5e-06, + "loss": 0.9245, + "num_input_tokens_seen": 1105621224, + "step": 2433, + "train_runtime": 172384.3139, + "train_tokens_per_second": 6413.7 + }, + { + "epoch": 2.9468118313642773, + "grad_norm": 0.26190289855003357, + "learning_rate": 5e-06, + "loss": 0.9535, + "num_input_tokens_seen": 1106056184, + "step": 2434, + "train_runtime": 172452.7145, + "train_tokens_per_second": 6413.678 + }, + { + "epoch": 2.9480225881818782, + "grad_norm": 0.23981881141662598, + "learning_rate": 5e-06, + "loss": 0.9348, + "num_input_tokens_seen": 1106502344, + "step": 2435, + "train_runtime": 172523.4959, + "train_tokens_per_second": 6413.633 + }, + { + "epoch": 2.9492333449994796, + "grad_norm": 0.23015964031219482, + "learning_rate": 5e-06, + "loss": 0.9253, + "num_input_tokens_seen": 1106974256, + "step": 2436, + "train_runtime": 172598.6333, + "train_tokens_per_second": 6413.575 + }, + { + "epoch": 2.950444101817081, + "grad_norm": 0.22270654141902924, + "learning_rate": 5e-06, + "loss": 0.88, + "num_input_tokens_seen": 1107434112, + "step": 2437, + "train_runtime": 172671.4841, + "train_tokens_per_second": 6413.532 + }, + { + "epoch": 2.9516548586346825, + "grad_norm": 0.23962879180908203, + "learning_rate": 5e-06, + "loss": 0.9206, + "num_input_tokens_seen": 1107898816, + "step": 2438, + "train_runtime": 172745.1854, + "train_tokens_per_second": 6413.486 + }, + { + "epoch": 2.952865615452284, + "grad_norm": 0.2439015656709671, + "learning_rate": 5e-06, + "loss": 0.9122, + "num_input_tokens_seen": 1108361840, + "step": 2439, + "train_runtime": 172818.5353, + "train_tokens_per_second": 6413.443 + }, + { + "epoch": 2.9540763722698853, + "grad_norm": 0.22247134149074554, + "learning_rate": 5e-06, + "loss": 0.8976, + "num_input_tokens_seen": 1108842328, + "step": 2440, + "train_runtime": 172894.6773, + "train_tokens_per_second": 6413.398 + }, + { + "epoch": 2.9552871290874867, + "grad_norm": 0.22744810581207275, + "learning_rate": 5e-06, + "loss": 0.9221, + "num_input_tokens_seen": 1109297992, + "step": 2441, + "train_runtime": 172966.821, + "train_tokens_per_second": 6413.357 + }, + { + "epoch": 2.956497885905088, + "grad_norm": 0.2583228051662445, + "learning_rate": 5e-06, + "loss": 0.9173, + "num_input_tokens_seen": 1109745168, + "step": 2442, + "train_runtime": 173037.3433, + "train_tokens_per_second": 6413.328 + }, + { + "epoch": 2.9577086427226895, + "grad_norm": 0.2402677983045578, + "learning_rate": 5e-06, + "loss": 0.878, + "num_input_tokens_seen": 1110207960, + "step": 2443, + "train_runtime": 173110.5564, + "train_tokens_per_second": 6413.289 + }, + { + "epoch": 2.958919399540291, + "grad_norm": 0.23672647774219513, + "learning_rate": 5e-06, + "loss": 0.8765, + "num_input_tokens_seen": 1110649312, + "step": 2444, + "train_runtime": 173180.6289, + "train_tokens_per_second": 6413.242 + }, + { + "epoch": 2.9601301563578923, + "grad_norm": 0.2683030068874359, + "learning_rate": 5e-06, + "loss": 0.9375, + "num_input_tokens_seen": 1111089320, + "step": 2445, + "train_runtime": 173250.1673, + "train_tokens_per_second": 6413.208 + }, + { + "epoch": 2.9613409131754933, + "grad_norm": 0.25095537304878235, + "learning_rate": 5e-06, + "loss": 0.9354, + "num_input_tokens_seen": 1111530136, + "step": 2446, + "train_runtime": 173320.2042, + "train_tokens_per_second": 6413.16 + }, + { + "epoch": 2.962551669993095, + "grad_norm": 0.25182783603668213, + "learning_rate": 5e-06, + "loss": 0.9433, + "num_input_tokens_seen": 1111995088, + "step": 2447, + "train_runtime": 173393.9827, + "train_tokens_per_second": 6413.112 + }, + { + "epoch": 2.963762426810696, + "grad_norm": 0.270939439535141, + "learning_rate": 5e-06, + "loss": 0.8994, + "num_input_tokens_seen": 1112439680, + "step": 2448, + "train_runtime": 173464.0985, + "train_tokens_per_second": 6413.083 + }, + { + "epoch": 2.9649731836282975, + "grad_norm": 0.25373977422714233, + "learning_rate": 5e-06, + "loss": 0.9124, + "num_input_tokens_seen": 1112881368, + "step": 2449, + "train_runtime": 173533.7189, + "train_tokens_per_second": 6413.055 + }, + { + "epoch": 2.966183940445899, + "grad_norm": 0.24695639312267303, + "learning_rate": 5e-06, + "loss": 0.8949, + "num_input_tokens_seen": 1113344440, + "step": 2450, + "train_runtime": 173606.9361, + "train_tokens_per_second": 6413.018 + }, + { + "epoch": 2.9673946972635004, + "grad_norm": 0.24027635157108307, + "learning_rate": 5e-06, + "loss": 0.9779, + "num_input_tokens_seen": 1113813744, + "step": 2451, + "train_runtime": 173681.1928, + "train_tokens_per_second": 6412.978 + }, + { + "epoch": 2.9686054540811018, + "grad_norm": 0.2398044615983963, + "learning_rate": 5e-06, + "loss": 0.8824, + "num_input_tokens_seen": 1114270416, + "step": 2452, + "train_runtime": 173753.6, + "train_tokens_per_second": 6412.934 + }, + { + "epoch": 2.969816210898703, + "grad_norm": 0.27489855885505676, + "learning_rate": 5e-06, + "loss": 0.8882, + "num_input_tokens_seen": 1114724384, + "step": 2453, + "train_runtime": 173825.6733, + "train_tokens_per_second": 6412.887 + }, + { + "epoch": 2.9710269677163046, + "grad_norm": 0.26074662804603577, + "learning_rate": 5e-06, + "loss": 0.9524, + "num_input_tokens_seen": 1115177168, + "step": 2454, + "train_runtime": 173897.6763, + "train_tokens_per_second": 6412.835 + }, + { + "epoch": 2.972237724533906, + "grad_norm": 0.24579590559005737, + "learning_rate": 5e-06, + "loss": 0.9598, + "num_input_tokens_seen": 1115634232, + "step": 2455, + "train_runtime": 173970.4648, + "train_tokens_per_second": 6412.78 + }, + { + "epoch": 2.9734484813515074, + "grad_norm": 0.22661468386650085, + "learning_rate": 5e-06, + "loss": 0.8578, + "num_input_tokens_seen": 1116124864, + "step": 2456, + "train_runtime": 174048.669, + "train_tokens_per_second": 6412.717 + }, + { + "epoch": 2.974659238169109, + "grad_norm": 0.26638656854629517, + "learning_rate": 5e-06, + "loss": 0.9083, + "num_input_tokens_seen": 1116575904, + "step": 2457, + "train_runtime": 174120.1916, + "train_tokens_per_second": 6412.673 + }, + { + "epoch": 2.9758699949867102, + "grad_norm": 0.2577857971191406, + "learning_rate": 5e-06, + "loss": 0.8847, + "num_input_tokens_seen": 1117030408, + "step": 2458, + "train_runtime": 174191.0573, + "train_tokens_per_second": 6412.674 + }, + { + "epoch": 2.977080751804311, + "grad_norm": 0.23168501257896423, + "learning_rate": 5e-06, + "loss": 0.9056, + "num_input_tokens_seen": 1117520192, + "step": 2459, + "train_runtime": 174267.8102, + "train_tokens_per_second": 6412.66 + }, + { + "epoch": 2.978291508621913, + "grad_norm": 0.25029903650283813, + "learning_rate": 5e-06, + "loss": 0.9101, + "num_input_tokens_seen": 1117976192, + "step": 2460, + "train_runtime": 174339.6479, + "train_tokens_per_second": 6412.633 + }, + { + "epoch": 2.979502265439514, + "grad_norm": 0.23616862297058105, + "learning_rate": 5e-06, + "loss": 0.9479, + "num_input_tokens_seen": 1118420800, + "step": 2461, + "train_runtime": 174409.7079, + "train_tokens_per_second": 6412.606 + }, + { + "epoch": 2.9807130222571154, + "grad_norm": 0.24392381310462952, + "learning_rate": 5e-06, + "loss": 0.9767, + "num_input_tokens_seen": 1118884408, + "step": 2462, + "train_runtime": 174483.4587, + "train_tokens_per_second": 6412.553 + }, + { + "epoch": 2.981923779074717, + "grad_norm": 0.23490194976329803, + "learning_rate": 5e-06, + "loss": 0.8814, + "num_input_tokens_seen": 1119381392, + "step": 2463, + "train_runtime": 174562.9683, + "train_tokens_per_second": 6412.479 + }, + { + "epoch": 2.9831345358923183, + "grad_norm": 0.2503698170185089, + "learning_rate": 5e-06, + "loss": 0.9589, + "num_input_tokens_seen": 1119802384, + "step": 2464, + "train_runtime": 174629.2291, + "train_tokens_per_second": 6412.457 + }, + { + "epoch": 2.9843452927099197, + "grad_norm": 0.2408633977174759, + "learning_rate": 5e-06, + "loss": 0.967, + "num_input_tokens_seen": 1120232368, + "step": 2465, + "train_runtime": 174696.7418, + "train_tokens_per_second": 6412.44 + }, + { + "epoch": 2.985556049527521, + "grad_norm": 0.23038393259048462, + "learning_rate": 5e-06, + "loss": 0.8323, + "num_input_tokens_seen": 1120694192, + "step": 2466, + "train_runtime": 174769.9103, + "train_tokens_per_second": 6412.398 + }, + { + "epoch": 2.9867668063451225, + "grad_norm": 0.23888365924358368, + "learning_rate": 5e-06, + "loss": 0.9043, + "num_input_tokens_seen": 1121142168, + "step": 2467, + "train_runtime": 174840.0255, + "train_tokens_per_second": 6412.388 + }, + { + "epoch": 2.987977563162724, + "grad_norm": 0.2362690418958664, + "learning_rate": 5e-06, + "loss": 0.9153, + "num_input_tokens_seen": 1121609584, + "step": 2468, + "train_runtime": 174913.8632, + "train_tokens_per_second": 6412.354 + }, + { + "epoch": 2.9891883199803253, + "grad_norm": 0.2894575595855713, + "learning_rate": 5e-06, + "loss": 0.939, + "num_input_tokens_seen": 1122044904, + "step": 2469, + "train_runtime": 174982.5392, + "train_tokens_per_second": 6412.325 + }, + { + "epoch": 2.9903990767979267, + "grad_norm": 0.24327421188354492, + "learning_rate": 5e-06, + "loss": 0.8949, + "num_input_tokens_seen": 1122496600, + "step": 2470, + "train_runtime": 175053.549, + "train_tokens_per_second": 6412.304 + }, + { + "epoch": 2.991609833615528, + "grad_norm": 0.2470681220293045, + "learning_rate": 5e-06, + "loss": 0.8861, + "num_input_tokens_seen": 1122965344, + "step": 2471, + "train_runtime": 175127.8031, + "train_tokens_per_second": 6412.262 + }, + { + "epoch": 2.992820590433129, + "grad_norm": 0.2589993476867676, + "learning_rate": 5e-06, + "loss": 0.8901, + "num_input_tokens_seen": 1123437960, + "step": 2472, + "train_runtime": 175202.8631, + "train_tokens_per_second": 6412.212 + }, + { + "epoch": 2.994031347250731, + "grad_norm": 0.2532251179218292, + "learning_rate": 5e-06, + "loss": 0.902, + "num_input_tokens_seen": 1123894464, + "step": 2473, + "train_runtime": 175275.4878, + "train_tokens_per_second": 6412.16 + }, + { + "epoch": 2.995242104068332, + "grad_norm": 0.23978720605373383, + "learning_rate": 5e-06, + "loss": 0.892, + "num_input_tokens_seen": 1124361120, + "step": 2474, + "train_runtime": 175348.7702, + "train_tokens_per_second": 6412.141 + }, + { + "epoch": 2.9964528608859333, + "grad_norm": 0.24950125813484192, + "learning_rate": 5e-06, + "loss": 0.918, + "num_input_tokens_seen": 1124811584, + "step": 2475, + "train_runtime": 175417.2461, + "train_tokens_per_second": 6412.206 + }, + { + "epoch": 2.9976636177035347, + "grad_norm": 0.2536337971687317, + "learning_rate": 5e-06, + "loss": 0.9361, + "num_input_tokens_seen": 1125259760, + "step": 2476, + "train_runtime": 175487.7712, + "train_tokens_per_second": 6412.183 + }, + { + "epoch": 2.998874374521136, + "grad_norm": 0.23223650455474854, + "learning_rate": 5e-06, + "loss": 0.8352, + "num_input_tokens_seen": 1125711656, + "step": 2477, + "train_runtime": 175556.6609, + "train_tokens_per_second": 6412.241 + }, + { + "epoch": 3.0, + "grad_norm": 0.25418493151664734, + "learning_rate": 5e-06, + "loss": 0.9392, + "num_input_tokens_seen": 1126151688, + "step": 2478, + "train_runtime": 175624.5645, + "train_tokens_per_second": 6412.268 + }, + { + "epoch": 3.0, + "num_input_tokens_seen": 1126151688, + "step": 2478, + "total_flos": 2.4182853777648783e+18, + "train_loss": 0.9592450147342836, + "train_runtime": 175626.0126, + "train_samples_per_second": 3.612, + "train_steps_per_second": 0.014 + } + ], + "logging_steps": 1, + "max_steps": 2478, + "num_input_tokens_seen": 1126151688, + "num_train_epochs": 3, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.4182853777648783e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}