diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20043 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.9912565575818135, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0019985011241568824, + "grad_norm": 42.184392372160474, + "learning_rate": 0.0, + "loss": 2.5206, + "num_tokens": 4185045.0, + "step": 1 + }, + { + "epoch": 0.003997002248313765, + "grad_norm": 42.20630313521732, + "learning_rate": 5.333333333333335e-07, + "loss": 2.4922, + "num_tokens": 8364385.0, + "step": 2 + }, + { + "epoch": 0.005995503372470647, + "grad_norm": 41.51791736538307, + "learning_rate": 1.066666666666667e-06, + "loss": 2.5083, + "num_tokens": 12550189.0, + "step": 3 + }, + { + "epoch": 0.00799400449662753, + "grad_norm": 49.41857102158481, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.5249, + "num_tokens": 16734554.0, + "step": 4 + }, + { + "epoch": 0.009992505620784412, + "grad_norm": 66.53734233126855, + "learning_rate": 2.133333333333334e-06, + "loss": 2.4239, + "num_tokens": 20920706.0, + "step": 5 + }, + { + "epoch": 0.011991006744941295, + "grad_norm": 441.73277224850347, + "learning_rate": 2.666666666666667e-06, + "loss": 2.2906, + "num_tokens": 25107011.0, + "step": 6 + }, + { + "epoch": 0.013989507869098177, + "grad_norm": 82.71605325094072, + "learning_rate": 3.2000000000000003e-06, + "loss": 2.1937, + "num_tokens": 29291841.0, + "step": 7 + }, + { + "epoch": 0.01598800899325506, + "grad_norm": 326.43177993994016, + "learning_rate": 3.7333333333333337e-06, + "loss": 1.9083, + "num_tokens": 33476563.0, + "step": 8 + }, + { + "epoch": 0.017986510117411942, + "grad_norm": 100.30090500904903, + "learning_rate": 4.266666666666668e-06, + "loss": 1.8005, + "num_tokens": 37658935.0, + "step": 9 + }, + { + "epoch": 0.019985011241568824, + "grad_norm": 24.214612966011444, + "learning_rate": 4.800000000000001e-06, + "loss": 1.4608, + "num_tokens": 41819937.0, + "step": 10 + }, + { + "epoch": 0.021983512365725707, + "grad_norm": 23.446788112863842, + "learning_rate": 5.333333333333334e-06, + "loss": 1.3913, + "num_tokens": 45979393.0, + "step": 11 + }, + { + "epoch": 0.02398201348988259, + "grad_norm": 40.716008763489825, + "learning_rate": 5.8666666666666675e-06, + "loss": 1.3421, + "num_tokens": 50164917.0, + "step": 12 + }, + { + "epoch": 0.025980514614039472, + "grad_norm": 37.03131911025415, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.3001, + "num_tokens": 54350700.0, + "step": 13 + }, + { + "epoch": 0.027979015738196354, + "grad_norm": 28.034773566345486, + "learning_rate": 6.9333333333333344e-06, + "loss": 1.18, + "num_tokens": 58537298.0, + "step": 14 + }, + { + "epoch": 0.029977516862353237, + "grad_norm": 15.379002437693554, + "learning_rate": 7.4666666666666675e-06, + "loss": 1.1383, + "num_tokens": 62712376.0, + "step": 15 + }, + { + "epoch": 0.03197601798651012, + "grad_norm": 3.69636236361517, + "learning_rate": 8.000000000000001e-06, + "loss": 1.1008, + "num_tokens": 66897355.0, + "step": 16 + }, + { + "epoch": 0.033974519110667, + "grad_norm": 3.2777095914200216, + "learning_rate": 8.533333333333335e-06, + "loss": 1.0766, + "num_tokens": 71071673.0, + "step": 17 + }, + { + "epoch": 0.035973020234823884, + "grad_norm": 3.8235286837847893, + "learning_rate": 9.066666666666667e-06, + "loss": 1.0183, + "num_tokens": 75255431.0, + "step": 18 + }, + { + "epoch": 0.03797152135898076, + "grad_norm": 2.069211156911732, + "learning_rate": 9.600000000000001e-06, + "loss": 0.9929, + "num_tokens": 79433086.0, + "step": 19 + }, + { + "epoch": 0.03997002248313765, + "grad_norm": 1.6468644709985536, + "learning_rate": 1.0133333333333335e-05, + "loss": 0.9661, + "num_tokens": 83563738.0, + "step": 20 + }, + { + "epoch": 0.04196852360729453, + "grad_norm": 1.5370993256632604, + "learning_rate": 1.0666666666666667e-05, + "loss": 0.9461, + "num_tokens": 87750079.0, + "step": 21 + }, + { + "epoch": 0.043967024731451414, + "grad_norm": 1.2767961326056532, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.908, + "num_tokens": 91932366.0, + "step": 22 + }, + { + "epoch": 0.04596552585560829, + "grad_norm": 1.138074548776672, + "learning_rate": 1.1733333333333335e-05, + "loss": 0.8927, + "num_tokens": 96068067.0, + "step": 23 + }, + { + "epoch": 0.04796402697976518, + "grad_norm": 0.7628186122130527, + "learning_rate": 1.2266666666666667e-05, + "loss": 0.8755, + "num_tokens": 100173007.0, + "step": 24 + }, + { + "epoch": 0.04996252810392206, + "grad_norm": 0.8212233150244571, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.8518, + "num_tokens": 104359276.0, + "step": 25 + }, + { + "epoch": 0.051961029228078943, + "grad_norm": 0.8683695854768825, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.8216, + "num_tokens": 108545948.0, + "step": 26 + }, + { + "epoch": 0.05395953035223582, + "grad_norm": 0.582183435601648, + "learning_rate": 1.3866666666666669e-05, + "loss": 0.818, + "num_tokens": 112728623.0, + "step": 27 + }, + { + "epoch": 0.05595803147639271, + "grad_norm": 0.7010940094733009, + "learning_rate": 1.4400000000000001e-05, + "loss": 0.803, + "num_tokens": 116897360.0, + "step": 28 + }, + { + "epoch": 0.05795653260054959, + "grad_norm": 0.758404907048536, + "learning_rate": 1.4933333333333335e-05, + "loss": 0.8127, + "num_tokens": 121056098.0, + "step": 29 + }, + { + "epoch": 0.05995503372470647, + "grad_norm": 0.5172675776873745, + "learning_rate": 1.546666666666667e-05, + "loss": 0.7754, + "num_tokens": 125228058.0, + "step": 30 + }, + { + "epoch": 0.06195353484886335, + "grad_norm": 0.5056586897440846, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.8001, + "num_tokens": 129385966.0, + "step": 31 + }, + { + "epoch": 0.06395203597302024, + "grad_norm": 0.639531325496852, + "learning_rate": 1.6533333333333333e-05, + "loss": 0.7686, + "num_tokens": 133515694.0, + "step": 32 + }, + { + "epoch": 0.06595053709717712, + "grad_norm": 0.4465403081527611, + "learning_rate": 1.706666666666667e-05, + "loss": 0.7582, + "num_tokens": 137701566.0, + "step": 33 + }, + { + "epoch": 0.067949038221334, + "grad_norm": 0.511675779939411, + "learning_rate": 1.76e-05, + "loss": 0.7549, + "num_tokens": 141885477.0, + "step": 34 + }, + { + "epoch": 0.06994753934549088, + "grad_norm": 0.5528015238899816, + "learning_rate": 1.8133333333333335e-05, + "loss": 0.7601, + "num_tokens": 146036089.0, + "step": 35 + }, + { + "epoch": 0.07194604046964777, + "grad_norm": 0.5407835393956167, + "learning_rate": 1.866666666666667e-05, + "loss": 0.7467, + "num_tokens": 150197895.0, + "step": 36 + }, + { + "epoch": 0.07394454159380465, + "grad_norm": 0.40166338838979404, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.7245, + "num_tokens": 154365126.0, + "step": 37 + }, + { + "epoch": 0.07594304271796153, + "grad_norm": 0.587497742867939, + "learning_rate": 1.9733333333333336e-05, + "loss": 0.7255, + "num_tokens": 158536441.0, + "step": 38 + }, + { + "epoch": 0.0779415438421184, + "grad_norm": 0.5959839464054549, + "learning_rate": 2.026666666666667e-05, + "loss": 0.7442, + "num_tokens": 162721995.0, + "step": 39 + }, + { + "epoch": 0.0799400449662753, + "grad_norm": 0.5944236801895781, + "learning_rate": 2.08e-05, + "loss": 0.738, + "num_tokens": 166873555.0, + "step": 40 + }, + { + "epoch": 0.08193854609043218, + "grad_norm": 0.454605058167223, + "learning_rate": 2.1333333333333335e-05, + "loss": 0.7165, + "num_tokens": 171059799.0, + "step": 41 + }, + { + "epoch": 0.08393704721458906, + "grad_norm": 0.46491742256056895, + "learning_rate": 2.186666666666667e-05, + "loss": 0.7332, + "num_tokens": 175244476.0, + "step": 42 + }, + { + "epoch": 0.08593554833874593, + "grad_norm": 0.3246477172492339, + "learning_rate": 2.2400000000000002e-05, + "loss": 0.7061, + "num_tokens": 179429104.0, + "step": 43 + }, + { + "epoch": 0.08793404946290283, + "grad_norm": 0.6945635024523399, + "learning_rate": 2.2933333333333336e-05, + "loss": 0.7211, + "num_tokens": 183614314.0, + "step": 44 + }, + { + "epoch": 0.0899325505870597, + "grad_norm": 0.4377701074925824, + "learning_rate": 2.346666666666667e-05, + "loss": 0.7145, + "num_tokens": 187798434.0, + "step": 45 + }, + { + "epoch": 0.09193105171121659, + "grad_norm": 0.7545688342856701, + "learning_rate": 2.4e-05, + "loss": 0.7112, + "num_tokens": 191963193.0, + "step": 46 + }, + { + "epoch": 0.09392955283537346, + "grad_norm": 0.6672644703622103, + "learning_rate": 2.4533333333333334e-05, + "loss": 0.6907, + "num_tokens": 196149267.0, + "step": 47 + }, + { + "epoch": 0.09592805395953036, + "grad_norm": 0.552485053095052, + "learning_rate": 2.5066666666666672e-05, + "loss": 0.6968, + "num_tokens": 200334652.0, + "step": 48 + }, + { + "epoch": 0.09792655508368724, + "grad_norm": 0.7375437533571079, + "learning_rate": 2.5600000000000002e-05, + "loss": 0.7029, + "num_tokens": 204520129.0, + "step": 49 + }, + { + "epoch": 0.09992505620784412, + "grad_norm": 0.4441602363241456, + "learning_rate": 2.6133333333333336e-05, + "loss": 0.6971, + "num_tokens": 208685165.0, + "step": 50 + }, + { + "epoch": 0.101923557332001, + "grad_norm": 0.6015504695122488, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.687, + "num_tokens": 212840363.0, + "step": 51 + }, + { + "epoch": 0.10392205845615789, + "grad_norm": 0.5059797154434139, + "learning_rate": 2.7200000000000004e-05, + "loss": 0.6855, + "num_tokens": 217001316.0, + "step": 52 + }, + { + "epoch": 0.10592055958031477, + "grad_norm": 0.3812096340708665, + "learning_rate": 2.7733333333333338e-05, + "loss": 0.7071, + "num_tokens": 221187284.0, + "step": 53 + }, + { + "epoch": 0.10791906070447164, + "grad_norm": 0.5126227473905077, + "learning_rate": 2.8266666666666668e-05, + "loss": 0.6903, + "num_tokens": 225372396.0, + "step": 54 + }, + { + "epoch": 0.10991756182862852, + "grad_norm": 0.4511387712464445, + "learning_rate": 2.8800000000000002e-05, + "loss": 0.6933, + "num_tokens": 229534726.0, + "step": 55 + }, + { + "epoch": 0.11191606295278542, + "grad_norm": 0.41830907130044265, + "learning_rate": 2.9333333333333333e-05, + "loss": 0.6974, + "num_tokens": 233703104.0, + "step": 56 + }, + { + "epoch": 0.1139145640769423, + "grad_norm": 0.5182445015844716, + "learning_rate": 2.986666666666667e-05, + "loss": 0.6684, + "num_tokens": 237889742.0, + "step": 57 + }, + { + "epoch": 0.11591306520109917, + "grad_norm": 0.3844728446034073, + "learning_rate": 3.0400000000000004e-05, + "loss": 0.6947, + "num_tokens": 242074830.0, + "step": 58 + }, + { + "epoch": 0.11791156632525605, + "grad_norm": 0.550685559076612, + "learning_rate": 3.093333333333334e-05, + "loss": 0.6734, + "num_tokens": 246234251.0, + "step": 59 + }, + { + "epoch": 0.11991006744941295, + "grad_norm": 0.6272318096449185, + "learning_rate": 3.146666666666667e-05, + "loss": 0.6999, + "num_tokens": 250418326.0, + "step": 60 + }, + { + "epoch": 0.12190856857356983, + "grad_norm": 0.35331643148680686, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.6776, + "num_tokens": 254604095.0, + "step": 61 + }, + { + "epoch": 0.1239070696977267, + "grad_norm": 0.661983609069102, + "learning_rate": 3.2533333333333336e-05, + "loss": 0.6918, + "num_tokens": 258775359.0, + "step": 62 + }, + { + "epoch": 0.12590557082188358, + "grad_norm": 0.4678568810652704, + "learning_rate": 3.3066666666666666e-05, + "loss": 0.6702, + "num_tokens": 262960329.0, + "step": 63 + }, + { + "epoch": 0.12790407194604048, + "grad_norm": 0.5448545798663242, + "learning_rate": 3.3600000000000004e-05, + "loss": 0.6744, + "num_tokens": 267101016.0, + "step": 64 + }, + { + "epoch": 0.12990257307019734, + "grad_norm": 0.4317377653291983, + "learning_rate": 3.413333333333334e-05, + "loss": 0.6697, + "num_tokens": 271285305.0, + "step": 65 + }, + { + "epoch": 0.13190107419435423, + "grad_norm": 0.42982155444729425, + "learning_rate": 3.466666666666667e-05, + "loss": 0.6782, + "num_tokens": 275471617.0, + "step": 66 + }, + { + "epoch": 0.13389957531851113, + "grad_norm": 0.5494519682358964, + "learning_rate": 3.52e-05, + "loss": 0.6913, + "num_tokens": 279658458.0, + "step": 67 + }, + { + "epoch": 0.135898076442668, + "grad_norm": 0.46520667234659624, + "learning_rate": 3.573333333333333e-05, + "loss": 0.6832, + "num_tokens": 283827352.0, + "step": 68 + }, + { + "epoch": 0.13789657756682488, + "grad_norm": 0.5814419543118462, + "learning_rate": 3.626666666666667e-05, + "loss": 0.674, + "num_tokens": 288012653.0, + "step": 69 + }, + { + "epoch": 0.13989507869098175, + "grad_norm": 0.5964732367666356, + "learning_rate": 3.680000000000001e-05, + "loss": 0.6544, + "num_tokens": 292171718.0, + "step": 70 + }, + { + "epoch": 0.14189357981513864, + "grad_norm": 0.32870758222365803, + "learning_rate": 3.733333333333334e-05, + "loss": 0.6774, + "num_tokens": 296343783.0, + "step": 71 + }, + { + "epoch": 0.14389208093929554, + "grad_norm": 1.1754403484560965, + "learning_rate": 3.786666666666667e-05, + "loss": 0.6834, + "num_tokens": 300514629.0, + "step": 72 + }, + { + "epoch": 0.1458905820634524, + "grad_norm": 0.9117700363108912, + "learning_rate": 3.8400000000000005e-05, + "loss": 0.6466, + "num_tokens": 304657280.0, + "step": 73 + }, + { + "epoch": 0.1478890831876093, + "grad_norm": 0.8516768469922108, + "learning_rate": 3.8933333333333336e-05, + "loss": 0.6579, + "num_tokens": 308841265.0, + "step": 74 + }, + { + "epoch": 0.1498875843117662, + "grad_norm": 0.6499476675515055, + "learning_rate": 3.946666666666667e-05, + "loss": 0.6694, + "num_tokens": 313004700.0, + "step": 75 + }, + { + "epoch": 0.15188608543592305, + "grad_norm": 1.0443407190916916, + "learning_rate": 4e-05, + "loss": 0.6562, + "num_tokens": 317191438.0, + "step": 76 + }, + { + "epoch": 0.15388458656007994, + "grad_norm": 0.8176404393705897, + "learning_rate": 3.999998489507031e-05, + "loss": 0.669, + "num_tokens": 321333038.0, + "step": 77 + }, + { + "epoch": 0.1558830876842368, + "grad_norm": 0.9979156618466608, + "learning_rate": 3.999993958030657e-05, + "loss": 0.657, + "num_tokens": 325505427.0, + "step": 78 + }, + { + "epoch": 0.1578815888083937, + "grad_norm": 0.9565404762010582, + "learning_rate": 3.9999864055784856e-05, + "loss": 0.6619, + "num_tokens": 329650202.0, + "step": 79 + }, + { + "epoch": 0.1598800899325506, + "grad_norm": 0.7386636077901623, + "learning_rate": 3.99997583216319e-05, + "loss": 0.6558, + "num_tokens": 333837391.0, + "step": 80 + }, + { + "epoch": 0.16187859105670746, + "grad_norm": 0.8350003156683696, + "learning_rate": 3.999962237802518e-05, + "loss": 0.6611, + "num_tokens": 338019775.0, + "step": 81 + }, + { + "epoch": 0.16387709218086435, + "grad_norm": 0.502324445247342, + "learning_rate": 3.999945622519284e-05, + "loss": 0.6618, + "num_tokens": 342204226.0, + "step": 82 + }, + { + "epoch": 0.16587559330502125, + "grad_norm": 0.5367971149234448, + "learning_rate": 3.999925986341374e-05, + "loss": 0.6468, + "num_tokens": 346370542.0, + "step": 83 + }, + { + "epoch": 0.1678740944291781, + "grad_norm": 0.4916318475084931, + "learning_rate": 3.999903329301744e-05, + "loss": 0.659, + "num_tokens": 350553761.0, + "step": 84 + }, + { + "epoch": 0.169872595553335, + "grad_norm": 0.4055326874299623, + "learning_rate": 3.99987765143842e-05, + "loss": 0.6528, + "num_tokens": 354736899.0, + "step": 85 + }, + { + "epoch": 0.17187109667749187, + "grad_norm": 0.38285857152124575, + "learning_rate": 3.999848952794498e-05, + "loss": 0.6788, + "num_tokens": 358917821.0, + "step": 86 + }, + { + "epoch": 0.17386959780164876, + "grad_norm": 0.45743069586532376, + "learning_rate": 3.999817233418143e-05, + "loss": 0.6552, + "num_tokens": 363102220.0, + "step": 87 + }, + { + "epoch": 0.17586809892580565, + "grad_norm": 0.36511149486977523, + "learning_rate": 3.999782493362591e-05, + "loss": 0.6576, + "num_tokens": 367267943.0, + "step": 88 + }, + { + "epoch": 0.17786660004996252, + "grad_norm": 0.4219481554324682, + "learning_rate": 3.999744732686147e-05, + "loss": 0.6708, + "num_tokens": 371449446.0, + "step": 89 + }, + { + "epoch": 0.1798651011741194, + "grad_norm": 0.5843722686419192, + "learning_rate": 3.999703951452185e-05, + "loss": 0.6458, + "num_tokens": 375636821.0, + "step": 90 + }, + { + "epoch": 0.1818636022982763, + "grad_norm": 0.39578550317298455, + "learning_rate": 3.999660149729151e-05, + "loss": 0.6616, + "num_tokens": 379807256.0, + "step": 91 + }, + { + "epoch": 0.18386210342243317, + "grad_norm": 0.4915683698988975, + "learning_rate": 3.999613327590557e-05, + "loss": 0.6502, + "num_tokens": 383990813.0, + "step": 92 + }, + { + "epoch": 0.18586060454659006, + "grad_norm": 0.6097620158895595, + "learning_rate": 3.999563485114986e-05, + "loss": 0.6628, + "num_tokens": 388174694.0, + "step": 93 + }, + { + "epoch": 0.18785910567074693, + "grad_norm": 0.44358496938934394, + "learning_rate": 3.99951062238609e-05, + "loss": 0.6679, + "num_tokens": 392353707.0, + "step": 94 + }, + { + "epoch": 0.18985760679490382, + "grad_norm": 0.6170260507512995, + "learning_rate": 3.9994547394925906e-05, + "loss": 0.6397, + "num_tokens": 396523169.0, + "step": 95 + }, + { + "epoch": 0.19185610791906071, + "grad_norm": 0.45849097070258554, + "learning_rate": 3.9993958365282764e-05, + "loss": 0.6609, + "num_tokens": 400706031.0, + "step": 96 + }, + { + "epoch": 0.19385460904321758, + "grad_norm": 0.5805104300942704, + "learning_rate": 3.9993339135920066e-05, + "loss": 0.6658, + "num_tokens": 404889804.0, + "step": 97 + }, + { + "epoch": 0.19585311016737447, + "grad_norm": 0.5690545682005914, + "learning_rate": 3.999268970787707e-05, + "loss": 0.6527, + "num_tokens": 409062572.0, + "step": 98 + }, + { + "epoch": 0.19785161129153137, + "grad_norm": 0.4535623307472101, + "learning_rate": 3.999201008224374e-05, + "loss": 0.6388, + "num_tokens": 413225829.0, + "step": 99 + }, + { + "epoch": 0.19985011241568823, + "grad_norm": 0.724053506757067, + "learning_rate": 3.99913002601607e-05, + "loss": 0.6659, + "num_tokens": 417410328.0, + "step": 100 + }, + { + "epoch": 0.20184861353984512, + "grad_norm": 0.4552881564775425, + "learning_rate": 3.9990560242819274e-05, + "loss": 0.6474, + "num_tokens": 421577922.0, + "step": 101 + }, + { + "epoch": 0.203847114664002, + "grad_norm": 0.8085241946471347, + "learning_rate": 3.998979003146143e-05, + "loss": 0.6342, + "num_tokens": 425762688.0, + "step": 102 + }, + { + "epoch": 0.20584561578815888, + "grad_norm": 0.6613662828692533, + "learning_rate": 3.998898962737986e-05, + "loss": 0.6548, + "num_tokens": 429947426.0, + "step": 103 + }, + { + "epoch": 0.20784411691231577, + "grad_norm": 0.7251990236161243, + "learning_rate": 3.998815903191788e-05, + "loss": 0.6428, + "num_tokens": 434132386.0, + "step": 104 + }, + { + "epoch": 0.20984261803647264, + "grad_norm": 0.6971142609307405, + "learning_rate": 3.998729824646952e-05, + "loss": 0.6425, + "num_tokens": 438317426.0, + "step": 105 + }, + { + "epoch": 0.21184111916062953, + "grad_norm": 0.49878185284932325, + "learning_rate": 3.998640727247943e-05, + "loss": 0.6475, + "num_tokens": 442502668.0, + "step": 106 + }, + { + "epoch": 0.2138396202847864, + "grad_norm": 0.5087470689967984, + "learning_rate": 3.998548611144299e-05, + "loss": 0.6282, + "num_tokens": 446686608.0, + "step": 107 + }, + { + "epoch": 0.2158381214089433, + "grad_norm": 0.506373471246999, + "learning_rate": 3.998453476490619e-05, + "loss": 0.6289, + "num_tokens": 450872771.0, + "step": 108 + }, + { + "epoch": 0.21783662253310018, + "grad_norm": 0.4481612369433036, + "learning_rate": 3.9983553234465685e-05, + "loss": 0.6449, + "num_tokens": 455058319.0, + "step": 109 + }, + { + "epoch": 0.21983512365725705, + "grad_norm": 0.4822646150495586, + "learning_rate": 3.9982541521768824e-05, + "loss": 0.6432, + "num_tokens": 459240525.0, + "step": 110 + }, + { + "epoch": 0.22183362478141394, + "grad_norm": 0.3865138293701518, + "learning_rate": 3.9981499628513586e-05, + "loss": 0.6494, + "num_tokens": 463421837.0, + "step": 111 + }, + { + "epoch": 0.22383212590557083, + "grad_norm": 0.48200203157279325, + "learning_rate": 3.99804275564486e-05, + "loss": 0.6584, + "num_tokens": 467605570.0, + "step": 112 + }, + { + "epoch": 0.2258306270297277, + "grad_norm": 0.3637196811355306, + "learning_rate": 3.997932530737315e-05, + "loss": 0.6619, + "num_tokens": 471792759.0, + "step": 113 + }, + { + "epoch": 0.2278291281538846, + "grad_norm": 0.6027803551801617, + "learning_rate": 3.9978192883137186e-05, + "loss": 0.6213, + "num_tokens": 475978184.0, + "step": 114 + }, + { + "epoch": 0.22982762927804146, + "grad_norm": 0.4885796086302507, + "learning_rate": 3.9977030285641264e-05, + "loss": 0.6176, + "num_tokens": 480164230.0, + "step": 115 + }, + { + "epoch": 0.23182613040219835, + "grad_norm": 0.5288740424939445, + "learning_rate": 3.997583751683662e-05, + "loss": 0.6454, + "num_tokens": 484350095.0, + "step": 116 + }, + { + "epoch": 0.23382463152635524, + "grad_norm": 0.48261660281618485, + "learning_rate": 3.997461457872509e-05, + "loss": 0.6485, + "num_tokens": 488504290.0, + "step": 117 + }, + { + "epoch": 0.2358231326505121, + "grad_norm": 0.44735137984375534, + "learning_rate": 3.997336147335917e-05, + "loss": 0.6493, + "num_tokens": 492688269.0, + "step": 118 + }, + { + "epoch": 0.237821633774669, + "grad_norm": 0.4971076060684779, + "learning_rate": 3.997207820284199e-05, + "loss": 0.6481, + "num_tokens": 496865874.0, + "step": 119 + }, + { + "epoch": 0.2398201348988259, + "grad_norm": 0.38364725805706, + "learning_rate": 3.9970764769327285e-05, + "loss": 0.6367, + "num_tokens": 501044624.0, + "step": 120 + }, + { + "epoch": 0.24181863602298276, + "grad_norm": 0.395839882495478, + "learning_rate": 3.996942117501941e-05, + "loss": 0.6372, + "num_tokens": 505228033.0, + "step": 121 + }, + { + "epoch": 0.24381713714713965, + "grad_norm": 0.495943950726676, + "learning_rate": 3.996804742217338e-05, + "loss": 0.6339, + "num_tokens": 509393845.0, + "step": 122 + }, + { + "epoch": 0.24581563827129652, + "grad_norm": 0.27458609223323094, + "learning_rate": 3.996664351309478e-05, + "loss": 0.6324, + "num_tokens": 513544307.0, + "step": 123 + }, + { + "epoch": 0.2478141393954534, + "grad_norm": 0.6549217038127465, + "learning_rate": 3.996520945013984e-05, + "loss": 0.63, + "num_tokens": 517730914.0, + "step": 124 + }, + { + "epoch": 0.2498126405196103, + "grad_norm": 0.4209512401128276, + "learning_rate": 3.996374523571537e-05, + "loss": 0.646, + "num_tokens": 521917299.0, + "step": 125 + }, + { + "epoch": 0.25181114164376717, + "grad_norm": 0.6572833094038069, + "learning_rate": 3.996225087227881e-05, + "loss": 0.648, + "num_tokens": 526077417.0, + "step": 126 + }, + { + "epoch": 0.25380964276792406, + "grad_norm": 0.5250997907994038, + "learning_rate": 3.9960726362338194e-05, + "loss": 0.6441, + "num_tokens": 530261351.0, + "step": 127 + }, + { + "epoch": 0.25580814389208095, + "grad_norm": 0.5671750680312675, + "learning_rate": 3.995917170845213e-05, + "loss": 0.636, + "num_tokens": 534443198.0, + "step": 128 + }, + { + "epoch": 0.25780664501623785, + "grad_norm": 0.45891413136093295, + "learning_rate": 3.995758691322983e-05, + "loss": 0.6245, + "num_tokens": 538608956.0, + "step": 129 + }, + { + "epoch": 0.2598051461403947, + "grad_norm": 0.428126384491859, + "learning_rate": 3.995597197933112e-05, + "loss": 0.6306, + "num_tokens": 542796010.0, + "step": 130 + }, + { + "epoch": 0.2618036472645516, + "grad_norm": 0.3260876320222859, + "learning_rate": 3.9954326909466366e-05, + "loss": 0.638, + "num_tokens": 546962543.0, + "step": 131 + }, + { + "epoch": 0.26380214838870847, + "grad_norm": 0.4658246899707976, + "learning_rate": 3.995265170639654e-05, + "loss": 0.6556, + "num_tokens": 551120664.0, + "step": 132 + }, + { + "epoch": 0.26580064951286536, + "grad_norm": 0.35544484680232896, + "learning_rate": 3.995094637293317e-05, + "loss": 0.6278, + "num_tokens": 555288163.0, + "step": 133 + }, + { + "epoch": 0.26779915063702225, + "grad_norm": 0.4899250175132269, + "learning_rate": 3.994921091193836e-05, + "loss": 0.6225, + "num_tokens": 559471115.0, + "step": 134 + }, + { + "epoch": 0.2697976517611791, + "grad_norm": 0.39117589792907287, + "learning_rate": 3.994744532632479e-05, + "loss": 0.6438, + "num_tokens": 563654947.0, + "step": 135 + }, + { + "epoch": 0.271796152885336, + "grad_norm": 0.4211561602703111, + "learning_rate": 3.994564961905568e-05, + "loss": 0.6285, + "num_tokens": 567809766.0, + "step": 136 + }, + { + "epoch": 0.2737946540094929, + "grad_norm": 0.4930339754927668, + "learning_rate": 3.994382379314481e-05, + "loss": 0.6368, + "num_tokens": 571957856.0, + "step": 137 + }, + { + "epoch": 0.27579315513364977, + "grad_norm": 0.3239303290940829, + "learning_rate": 3.9941967851656515e-05, + "loss": 0.6336, + "num_tokens": 576114571.0, + "step": 138 + }, + { + "epoch": 0.27779165625780666, + "grad_norm": 0.430445686535215, + "learning_rate": 3.994008179770566e-05, + "loss": 0.6507, + "num_tokens": 580299647.0, + "step": 139 + }, + { + "epoch": 0.2797901573819635, + "grad_norm": 0.3322449898807073, + "learning_rate": 3.993816563445766e-05, + "loss": 0.6262, + "num_tokens": 584451308.0, + "step": 140 + }, + { + "epoch": 0.2817886585061204, + "grad_norm": 0.28536389131256207, + "learning_rate": 3.993621936512848e-05, + "loss": 0.6303, + "num_tokens": 588619792.0, + "step": 141 + }, + { + "epoch": 0.2837871596302773, + "grad_norm": 0.2885070501464784, + "learning_rate": 3.993424299298457e-05, + "loss": 0.6371, + "num_tokens": 592782883.0, + "step": 142 + }, + { + "epoch": 0.2857856607544342, + "grad_norm": 0.31043604862744967, + "learning_rate": 3.993223652134293e-05, + "loss": 0.6319, + "num_tokens": 596967974.0, + "step": 143 + }, + { + "epoch": 0.28778416187859107, + "grad_norm": 0.4558075239549601, + "learning_rate": 3.993019995357108e-05, + "loss": 0.6164, + "num_tokens": 601098759.0, + "step": 144 + }, + { + "epoch": 0.28978266300274796, + "grad_norm": 0.3368801261122649, + "learning_rate": 3.992813329308704e-05, + "loss": 0.623, + "num_tokens": 605283584.0, + "step": 145 + }, + { + "epoch": 0.2917811641269048, + "grad_norm": 0.37737164772277193, + "learning_rate": 3.992603654335934e-05, + "loss": 0.626, + "num_tokens": 609465251.0, + "step": 146 + }, + { + "epoch": 0.2937796652510617, + "grad_norm": 0.40897451444010796, + "learning_rate": 3.9923909707907e-05, + "loss": 0.631, + "num_tokens": 613650325.0, + "step": 147 + }, + { + "epoch": 0.2957781663752186, + "grad_norm": 0.3647409820710091, + "learning_rate": 3.992175279029956e-05, + "loss": 0.6181, + "num_tokens": 617833081.0, + "step": 148 + }, + { + "epoch": 0.2977766674993755, + "grad_norm": 0.34616075612115077, + "learning_rate": 3.9919565794157004e-05, + "loss": 0.6226, + "num_tokens": 622019302.0, + "step": 149 + }, + { + "epoch": 0.2997751686235324, + "grad_norm": 0.31790558639813443, + "learning_rate": 3.9917348723149855e-05, + "loss": 0.6263, + "num_tokens": 626202391.0, + "step": 150 + }, + { + "epoch": 0.3017736697476892, + "grad_norm": 0.3423802791561228, + "learning_rate": 3.991510158099905e-05, + "loss": 0.6088, + "num_tokens": 630387451.0, + "step": 151 + }, + { + "epoch": 0.3037721708718461, + "grad_norm": 0.3898815323588864, + "learning_rate": 3.991282437147605e-05, + "loss": 0.6084, + "num_tokens": 634541607.0, + "step": 152 + }, + { + "epoch": 0.305770671996003, + "grad_norm": 0.3529555032097221, + "learning_rate": 3.991051709840274e-05, + "loss": 0.6327, + "num_tokens": 638728728.0, + "step": 153 + }, + { + "epoch": 0.3077691731201599, + "grad_norm": 0.37447172207246693, + "learning_rate": 3.990817976565147e-05, + "loss": 0.6361, + "num_tokens": 642915241.0, + "step": 154 + }, + { + "epoch": 0.3097676742443168, + "grad_norm": 0.36383449387241823, + "learning_rate": 3.9905812377145065e-05, + "loss": 0.6222, + "num_tokens": 647098991.0, + "step": 155 + }, + { + "epoch": 0.3117661753684736, + "grad_norm": 0.2907464117479951, + "learning_rate": 3.990341493685676e-05, + "loss": 0.6233, + "num_tokens": 651281815.0, + "step": 156 + }, + { + "epoch": 0.3137646764926305, + "grad_norm": 0.2577763506249385, + "learning_rate": 3.9900987448810233e-05, + "loss": 0.6144, + "num_tokens": 655408278.0, + "step": 157 + }, + { + "epoch": 0.3157631776167874, + "grad_norm": 0.3731739701610645, + "learning_rate": 3.9898529917079614e-05, + "loss": 0.6375, + "num_tokens": 659590711.0, + "step": 158 + }, + { + "epoch": 0.3177616787409443, + "grad_norm": 0.29962442418452706, + "learning_rate": 3.989604234578944e-05, + "loss": 0.645, + "num_tokens": 663775106.0, + "step": 159 + }, + { + "epoch": 0.3197601798651012, + "grad_norm": 0.36233631422698886, + "learning_rate": 3.989352473911465e-05, + "loss": 0.6446, + "num_tokens": 667959356.0, + "step": 160 + }, + { + "epoch": 0.3217586809892581, + "grad_norm": 0.39316284786573613, + "learning_rate": 3.989097710128062e-05, + "loss": 0.6279, + "num_tokens": 672144194.0, + "step": 161 + }, + { + "epoch": 0.3237571821134149, + "grad_norm": 0.4372961836222143, + "learning_rate": 3.988839943656312e-05, + "loss": 0.6395, + "num_tokens": 676307121.0, + "step": 162 + }, + { + "epoch": 0.3257556832375718, + "grad_norm": 0.26270417995365725, + "learning_rate": 3.98857917492883e-05, + "loss": 0.6315, + "num_tokens": 680490742.0, + "step": 163 + }, + { + "epoch": 0.3277541843617287, + "grad_norm": 0.5664466960996606, + "learning_rate": 3.9883154043832714e-05, + "loss": 0.6131, + "num_tokens": 684638422.0, + "step": 164 + }, + { + "epoch": 0.3297526854858856, + "grad_norm": 0.353428542171955, + "learning_rate": 3.9880486324623284e-05, + "loss": 0.6148, + "num_tokens": 688811997.0, + "step": 165 + }, + { + "epoch": 0.3317511866100425, + "grad_norm": 0.5270825018704551, + "learning_rate": 3.987778859613732e-05, + "loss": 0.5994, + "num_tokens": 692967699.0, + "step": 166 + }, + { + "epoch": 0.33374968773419933, + "grad_norm": 0.3998131867430412, + "learning_rate": 3.987506086290249e-05, + "loss": 0.621, + "num_tokens": 697125874.0, + "step": 167 + }, + { + "epoch": 0.3357481888583562, + "grad_norm": 0.4514495465382229, + "learning_rate": 3.9872303129496805e-05, + "loss": 0.6217, + "num_tokens": 701278911.0, + "step": 168 + }, + { + "epoch": 0.3377466899825131, + "grad_norm": 0.39198494433635367, + "learning_rate": 3.986951540054865e-05, + "loss": 0.6301, + "num_tokens": 705462341.0, + "step": 169 + }, + { + "epoch": 0.33974519110667, + "grad_norm": 0.4529588973817943, + "learning_rate": 3.986669768073674e-05, + "loss": 0.6298, + "num_tokens": 709647728.0, + "step": 170 + }, + { + "epoch": 0.3417436922308269, + "grad_norm": 0.34432290008262123, + "learning_rate": 3.9863849974790124e-05, + "loss": 0.6223, + "num_tokens": 713835408.0, + "step": 171 + }, + { + "epoch": 0.34374219335498374, + "grad_norm": 0.5761299608961877, + "learning_rate": 3.9860972287488187e-05, + "loss": 0.6359, + "num_tokens": 718009528.0, + "step": 172 + }, + { + "epoch": 0.34574069447914063, + "grad_norm": 0.43475649993243937, + "learning_rate": 3.985806462366061e-05, + "loss": 0.6215, + "num_tokens": 722145337.0, + "step": 173 + }, + { + "epoch": 0.3477391956032975, + "grad_norm": 0.46653259991679885, + "learning_rate": 3.98551269881874e-05, + "loss": 0.623, + "num_tokens": 726330240.0, + "step": 174 + }, + { + "epoch": 0.3497376967274544, + "grad_norm": 0.5455165131636455, + "learning_rate": 3.985215938599889e-05, + "loss": 0.6315, + "num_tokens": 730488772.0, + "step": 175 + }, + { + "epoch": 0.3517361978516113, + "grad_norm": 0.39044858442268515, + "learning_rate": 3.9849161822075655e-05, + "loss": 0.6181, + "num_tokens": 734659467.0, + "step": 176 + }, + { + "epoch": 0.35373469897576815, + "grad_norm": 0.685246163388862, + "learning_rate": 3.9846134301448595e-05, + "loss": 0.6158, + "num_tokens": 738843674.0, + "step": 177 + }, + { + "epoch": 0.35573320009992504, + "grad_norm": 0.6678976774847737, + "learning_rate": 3.984307682919888e-05, + "loss": 0.6289, + "num_tokens": 743030843.0, + "step": 178 + }, + { + "epoch": 0.35773170122408193, + "grad_norm": 0.48842118979557564, + "learning_rate": 3.983998941045793e-05, + "loss": 0.6123, + "num_tokens": 747215745.0, + "step": 179 + }, + { + "epoch": 0.3597302023482388, + "grad_norm": 0.42804216160279124, + "learning_rate": 3.983687205040746e-05, + "loss": 0.6193, + "num_tokens": 751399303.0, + "step": 180 + }, + { + "epoch": 0.3617287034723957, + "grad_norm": 0.49930157791740554, + "learning_rate": 3.9833724754279394e-05, + "loss": 0.6083, + "num_tokens": 755583229.0, + "step": 181 + }, + { + "epoch": 0.3637272045965526, + "grad_norm": 0.33792926405890616, + "learning_rate": 3.983054752735592e-05, + "loss": 0.6133, + "num_tokens": 759763400.0, + "step": 182 + }, + { + "epoch": 0.36572570572070945, + "grad_norm": 0.47694376987275006, + "learning_rate": 3.982734037496947e-05, + "loss": 0.6262, + "num_tokens": 763922050.0, + "step": 183 + }, + { + "epoch": 0.36772420684486634, + "grad_norm": 0.3311073662053782, + "learning_rate": 3.982410330250269e-05, + "loss": 0.6097, + "num_tokens": 768097871.0, + "step": 184 + }, + { + "epoch": 0.36972270796902323, + "grad_norm": 0.30958672892327066, + "learning_rate": 3.982083631538844e-05, + "loss": 0.6102, + "num_tokens": 772238646.0, + "step": 185 + }, + { + "epoch": 0.3717212090931801, + "grad_norm": 0.43785051420681853, + "learning_rate": 3.981753941910978e-05, + "loss": 0.6064, + "num_tokens": 776402034.0, + "step": 186 + }, + { + "epoch": 0.373719710217337, + "grad_norm": 0.30648776536634315, + "learning_rate": 3.981421261919997e-05, + "loss": 0.635, + "num_tokens": 780587443.0, + "step": 187 + }, + { + "epoch": 0.37571821134149386, + "grad_norm": 0.4889169323642654, + "learning_rate": 3.9810855921242485e-05, + "loss": 0.622, + "num_tokens": 784774793.0, + "step": 188 + }, + { + "epoch": 0.37771671246565075, + "grad_norm": 0.411604179302514, + "learning_rate": 3.980746933087095e-05, + "loss": 0.634, + "num_tokens": 788937328.0, + "step": 189 + }, + { + "epoch": 0.37971521358980764, + "grad_norm": 0.37145722351687765, + "learning_rate": 3.980405285376915e-05, + "loss": 0.623, + "num_tokens": 793056467.0, + "step": 190 + }, + { + "epoch": 0.38171371471396454, + "grad_norm": 0.3130779308773784, + "learning_rate": 3.980060649567106e-05, + "loss": 0.612, + "num_tokens": 797234808.0, + "step": 191 + }, + { + "epoch": 0.38371221583812143, + "grad_norm": 0.3598572204763115, + "learning_rate": 3.9797130262360786e-05, + "loss": 0.6032, + "num_tokens": 801390807.0, + "step": 192 + }, + { + "epoch": 0.38571071696227827, + "grad_norm": 0.2821764015497079, + "learning_rate": 3.9793624159672585e-05, + "loss": 0.627, + "num_tokens": 805541163.0, + "step": 193 + }, + { + "epoch": 0.38770921808643516, + "grad_norm": 0.5026273269115951, + "learning_rate": 3.979008819349084e-05, + "loss": 0.6277, + "num_tokens": 809723804.0, + "step": 194 + }, + { + "epoch": 0.38970771921059205, + "grad_norm": 0.24646657643956257, + "learning_rate": 3.978652236975003e-05, + "loss": 0.6239, + "num_tokens": 813904028.0, + "step": 195 + }, + { + "epoch": 0.39170622033474894, + "grad_norm": 0.5472953460812967, + "learning_rate": 3.978292669443479e-05, + "loss": 0.6239, + "num_tokens": 818073146.0, + "step": 196 + }, + { + "epoch": 0.39370472145890584, + "grad_norm": 0.3876108262714518, + "learning_rate": 3.9779301173579836e-05, + "loss": 0.6072, + "num_tokens": 822259281.0, + "step": 197 + }, + { + "epoch": 0.39570322258306273, + "grad_norm": 0.510698577210308, + "learning_rate": 3.9775645813269966e-05, + "loss": 0.6072, + "num_tokens": 826411208.0, + "step": 198 + }, + { + "epoch": 0.39770172370721957, + "grad_norm": 0.4645570167956041, + "learning_rate": 3.977196061964006e-05, + "loss": 0.6188, + "num_tokens": 830593491.0, + "step": 199 + }, + { + "epoch": 0.39970022483137646, + "grad_norm": 0.35957603750715683, + "learning_rate": 3.976824559887508e-05, + "loss": 0.6132, + "num_tokens": 834761339.0, + "step": 200 + }, + { + "epoch": 0.40169872595553335, + "grad_norm": 0.4908156422049382, + "learning_rate": 3.976450075721003e-05, + "loss": 0.5928, + "num_tokens": 838874571.0, + "step": 201 + }, + { + "epoch": 0.40369722707969025, + "grad_norm": 0.31940438015697753, + "learning_rate": 3.976072610092999e-05, + "loss": 0.6326, + "num_tokens": 843042131.0, + "step": 202 + }, + { + "epoch": 0.40569572820384714, + "grad_norm": 0.592916733598508, + "learning_rate": 3.975692163637005e-05, + "loss": 0.612, + "num_tokens": 847227371.0, + "step": 203 + }, + { + "epoch": 0.407694229328004, + "grad_norm": 0.5255772273816999, + "learning_rate": 3.9753087369915336e-05, + "loss": 0.6097, + "num_tokens": 851412340.0, + "step": 204 + }, + { + "epoch": 0.40969273045216087, + "grad_norm": 0.3907565466971485, + "learning_rate": 3.9749223308001e-05, + "loss": 0.6061, + "num_tokens": 855570641.0, + "step": 205 + }, + { + "epoch": 0.41169123157631776, + "grad_norm": 0.44401773014750034, + "learning_rate": 3.9745329457112206e-05, + "loss": 0.5896, + "num_tokens": 859756041.0, + "step": 206 + }, + { + "epoch": 0.41368973270047465, + "grad_norm": 0.3018170905035139, + "learning_rate": 3.974140582378408e-05, + "loss": 0.6201, + "num_tokens": 863909391.0, + "step": 207 + }, + { + "epoch": 0.41568823382463155, + "grad_norm": 0.5934381011431151, + "learning_rate": 3.973745241460178e-05, + "loss": 0.6142, + "num_tokens": 868053245.0, + "step": 208 + }, + { + "epoch": 0.4176867349487884, + "grad_norm": 0.425772366901439, + "learning_rate": 3.9733469236200406e-05, + "loss": 0.6178, + "num_tokens": 872223781.0, + "step": 209 + }, + { + "epoch": 0.4196852360729453, + "grad_norm": 0.6714379193560002, + "learning_rate": 3.972945629526502e-05, + "loss": 0.6319, + "num_tokens": 876407400.0, + "step": 210 + }, + { + "epoch": 0.42168373719710217, + "grad_norm": 0.5381494117385783, + "learning_rate": 3.9725413598530645e-05, + "loss": 0.6141, + "num_tokens": 880591881.0, + "step": 211 + }, + { + "epoch": 0.42368223832125906, + "grad_norm": 0.6923853994505028, + "learning_rate": 3.9721341152782254e-05, + "loss": 0.6147, + "num_tokens": 884776205.0, + "step": 212 + }, + { + "epoch": 0.42568073944541596, + "grad_norm": 0.7039647712934992, + "learning_rate": 3.9717238964854726e-05, + "loss": 0.6142, + "num_tokens": 888959594.0, + "step": 213 + }, + { + "epoch": 0.4276792405695728, + "grad_norm": 0.37750569176586096, + "learning_rate": 3.971310704163287e-05, + "loss": 0.6106, + "num_tokens": 893144658.0, + "step": 214 + }, + { + "epoch": 0.4296777416937297, + "grad_norm": 0.5301507559669968, + "learning_rate": 3.970894539005139e-05, + "loss": 0.6086, + "num_tokens": 897319028.0, + "step": 215 + }, + { + "epoch": 0.4316762428178866, + "grad_norm": 0.3906099695142284, + "learning_rate": 3.970475401709492e-05, + "loss": 0.6255, + "num_tokens": 901456951.0, + "step": 216 + }, + { + "epoch": 0.4336747439420435, + "grad_norm": 0.46579089459020595, + "learning_rate": 3.970053292979791e-05, + "loss": 0.6159, + "num_tokens": 905638109.0, + "step": 217 + }, + { + "epoch": 0.43567324506620037, + "grad_norm": 0.30738397320261857, + "learning_rate": 3.969628213524475e-05, + "loss": 0.6202, + "num_tokens": 909798598.0, + "step": 218 + }, + { + "epoch": 0.43767174619035726, + "grad_norm": 0.33627417249104985, + "learning_rate": 3.9692001640569644e-05, + "loss": 0.6097, + "num_tokens": 913945308.0, + "step": 219 + }, + { + "epoch": 0.4396702473145141, + "grad_norm": 0.33460509715274295, + "learning_rate": 3.968769145295664e-05, + "loss": 0.6245, + "num_tokens": 918120402.0, + "step": 220 + }, + { + "epoch": 0.441668748438671, + "grad_norm": 0.2692153073663313, + "learning_rate": 3.968335157963966e-05, + "loss": 0.6043, + "num_tokens": 922305581.0, + "step": 221 + }, + { + "epoch": 0.4436672495628279, + "grad_norm": 0.4364827117500931, + "learning_rate": 3.9678982027902426e-05, + "loss": 0.611, + "num_tokens": 926489145.0, + "step": 222 + }, + { + "epoch": 0.4456657506869848, + "grad_norm": 0.3044629558739248, + "learning_rate": 3.967458280507845e-05, + "loss": 0.6272, + "num_tokens": 930665498.0, + "step": 223 + }, + { + "epoch": 0.44766425181114167, + "grad_norm": 0.4451741383227151, + "learning_rate": 3.9670153918551066e-05, + "loss": 0.6025, + "num_tokens": 934821902.0, + "step": 224 + }, + { + "epoch": 0.4496627529352985, + "grad_norm": 0.34775737714403027, + "learning_rate": 3.9665695375753385e-05, + "loss": 0.5972, + "num_tokens": 938977174.0, + "step": 225 + }, + { + "epoch": 0.4516612540594554, + "grad_norm": 0.4539111039653282, + "learning_rate": 3.9661207184168305e-05, + "loss": 0.6016, + "num_tokens": 943163282.0, + "step": 226 + }, + { + "epoch": 0.4536597551836123, + "grad_norm": 0.3683691344819752, + "learning_rate": 3.965668935132846e-05, + "loss": 0.6087, + "num_tokens": 947349535.0, + "step": 227 + }, + { + "epoch": 0.4556582563077692, + "grad_norm": 0.43082046442134825, + "learning_rate": 3.965214188481626e-05, + "loss": 0.6058, + "num_tokens": 951534207.0, + "step": 228 + }, + { + "epoch": 0.4576567574319261, + "grad_norm": 0.35320838274135236, + "learning_rate": 3.964756479226381e-05, + "loss": 0.5907, + "num_tokens": 955719775.0, + "step": 229 + }, + { + "epoch": 0.4596552585560829, + "grad_norm": 0.4477623134616741, + "learning_rate": 3.964295808135297e-05, + "loss": 0.6094, + "num_tokens": 959902829.0, + "step": 230 + }, + { + "epoch": 0.4616537596802398, + "grad_norm": 0.39038150491501405, + "learning_rate": 3.963832175981532e-05, + "loss": 0.6107, + "num_tokens": 964089303.0, + "step": 231 + }, + { + "epoch": 0.4636522608043967, + "grad_norm": 0.4178556610436171, + "learning_rate": 3.9633655835432096e-05, + "loss": 0.6085, + "num_tokens": 968240625.0, + "step": 232 + }, + { + "epoch": 0.4656507619285536, + "grad_norm": 0.4149459661080102, + "learning_rate": 3.962896031603424e-05, + "loss": 0.6219, + "num_tokens": 972424747.0, + "step": 233 + }, + { + "epoch": 0.4676492630527105, + "grad_norm": 0.32588060735259444, + "learning_rate": 3.962423520950237e-05, + "loss": 0.6094, + "num_tokens": 976609079.0, + "step": 234 + }, + { + "epoch": 0.4696477641768674, + "grad_norm": 0.36137869505308146, + "learning_rate": 3.961948052376675e-05, + "loss": 0.5931, + "num_tokens": 980793654.0, + "step": 235 + }, + { + "epoch": 0.4716462653010242, + "grad_norm": 0.2799425004116182, + "learning_rate": 3.961469626680728e-05, + "loss": 0.6049, + "num_tokens": 984977489.0, + "step": 236 + }, + { + "epoch": 0.4736447664251811, + "grad_norm": 0.3052395401876023, + "learning_rate": 3.9609882446653516e-05, + "loss": 0.6074, + "num_tokens": 989162584.0, + "step": 237 + }, + { + "epoch": 0.475643267549338, + "grad_norm": 0.266448900057142, + "learning_rate": 3.96050390713846e-05, + "loss": 0.5911, + "num_tokens": 993321766.0, + "step": 238 + }, + { + "epoch": 0.4776417686734949, + "grad_norm": 0.3634525362746061, + "learning_rate": 3.960016614912931e-05, + "loss": 0.5799, + "num_tokens": 997505462.0, + "step": 239 + }, + { + "epoch": 0.4796402697976518, + "grad_norm": 0.31473398286711857, + "learning_rate": 3.959526368806599e-05, + "loss": 0.5944, + "num_tokens": 1001689503.0, + "step": 240 + }, + { + "epoch": 0.4816387709218086, + "grad_norm": 0.29954680402507255, + "learning_rate": 3.959033169642255e-05, + "loss": 0.5825, + "num_tokens": 1005874377.0, + "step": 241 + }, + { + "epoch": 0.4836372720459655, + "grad_norm": 0.3725074228099601, + "learning_rate": 3.9585370182476504e-05, + "loss": 0.5963, + "num_tokens": 1010058327.0, + "step": 242 + }, + { + "epoch": 0.4856357731701224, + "grad_norm": 0.3655874753987347, + "learning_rate": 3.9580379154554863e-05, + "loss": 0.607, + "num_tokens": 1014244438.0, + "step": 243 + }, + { + "epoch": 0.4876342742942793, + "grad_norm": 0.30102040657163753, + "learning_rate": 3.9575358621034215e-05, + "loss": 0.5973, + "num_tokens": 1018429467.0, + "step": 244 + }, + { + "epoch": 0.4896327754184362, + "grad_norm": 0.3523635574286666, + "learning_rate": 3.9570308590340634e-05, + "loss": 0.6202, + "num_tokens": 1022561586.0, + "step": 245 + }, + { + "epoch": 0.49163127654259303, + "grad_norm": 0.2541533269514933, + "learning_rate": 3.956522907094973e-05, + "loss": 0.5905, + "num_tokens": 1026713331.0, + "step": 246 + }, + { + "epoch": 0.4936297776667499, + "grad_norm": 0.3844135490178722, + "learning_rate": 3.956012007138657e-05, + "loss": 0.5982, + "num_tokens": 1030897192.0, + "step": 247 + }, + { + "epoch": 0.4956282787909068, + "grad_norm": 0.3569390695661858, + "learning_rate": 3.955498160022574e-05, + "loss": 0.6051, + "num_tokens": 1035081053.0, + "step": 248 + }, + { + "epoch": 0.4976267799150637, + "grad_norm": 0.33744868104358444, + "learning_rate": 3.9549813666091254e-05, + "loss": 0.5988, + "num_tokens": 1039218287.0, + "step": 249 + }, + { + "epoch": 0.4996252810392206, + "grad_norm": 0.32907784893952186, + "learning_rate": 3.95446162776566e-05, + "loss": 0.601, + "num_tokens": 1043401575.0, + "step": 250 + }, + { + "epoch": 0.5016237821633774, + "grad_norm": 0.3429781492702601, + "learning_rate": 3.953938944364467e-05, + "loss": 0.6163, + "num_tokens": 1047574504.0, + "step": 251 + }, + { + "epoch": 0.5036222832875343, + "grad_norm": 0.26605381835164577, + "learning_rate": 3.953413317282781e-05, + "loss": 0.5867, + "num_tokens": 1051757461.0, + "step": 252 + }, + { + "epoch": 0.5056207844116912, + "grad_norm": 0.29294388624721795, + "learning_rate": 3.952884747402774e-05, + "loss": 0.5888, + "num_tokens": 1055944824.0, + "step": 253 + }, + { + "epoch": 0.5076192855358481, + "grad_norm": 0.3386275275516605, + "learning_rate": 3.952353235611559e-05, + "loss": 0.5975, + "num_tokens": 1060129191.0, + "step": 254 + }, + { + "epoch": 0.509617786660005, + "grad_norm": 0.28697327849072773, + "learning_rate": 3.951818782801187e-05, + "loss": 0.5838, + "num_tokens": 1064267328.0, + "step": 255 + }, + { + "epoch": 0.5116162877841619, + "grad_norm": 0.2522160338721744, + "learning_rate": 3.9512813898686413e-05, + "loss": 0.6034, + "num_tokens": 1068427061.0, + "step": 256 + }, + { + "epoch": 0.5136147889083188, + "grad_norm": 0.3228149370680137, + "learning_rate": 3.950741057715843e-05, + "loss": 0.5879, + "num_tokens": 1072611431.0, + "step": 257 + }, + { + "epoch": 0.5156132900324757, + "grad_norm": 0.2419343167580712, + "learning_rate": 3.950197787249647e-05, + "loss": 0.6072, + "num_tokens": 1076721925.0, + "step": 258 + }, + { + "epoch": 0.5176117911566325, + "grad_norm": 0.43631356033868607, + "learning_rate": 3.949651579381836e-05, + "loss": 0.6043, + "num_tokens": 1080906254.0, + "step": 259 + }, + { + "epoch": 0.5196102922807894, + "grad_norm": 0.33437821141619156, + "learning_rate": 3.949102435029125e-05, + "loss": 0.6047, + "num_tokens": 1085022968.0, + "step": 260 + }, + { + "epoch": 0.5216087934049463, + "grad_norm": 0.3237549721286124, + "learning_rate": 3.948550355113158e-05, + "loss": 0.6037, + "num_tokens": 1089161145.0, + "step": 261 + }, + { + "epoch": 0.5236072945291032, + "grad_norm": 0.41876372880938895, + "learning_rate": 3.947995340560504e-05, + "loss": 0.5951, + "num_tokens": 1093328778.0, + "step": 262 + }, + { + "epoch": 0.52560579565326, + "grad_norm": 0.2804215401281971, + "learning_rate": 3.9474373923026584e-05, + "loss": 0.5965, + "num_tokens": 1097463662.0, + "step": 263 + }, + { + "epoch": 0.5276042967774169, + "grad_norm": 0.3277763178250504, + "learning_rate": 3.9468765112760395e-05, + "loss": 0.6013, + "num_tokens": 1101649602.0, + "step": 264 + }, + { + "epoch": 0.5296027979015738, + "grad_norm": 0.3668377210003374, + "learning_rate": 3.9463126984219883e-05, + "loss": 0.6113, + "num_tokens": 1105832792.0, + "step": 265 + }, + { + "epoch": 0.5316012990257307, + "grad_norm": 0.3079693232078478, + "learning_rate": 3.945745954686767e-05, + "loss": 0.5881, + "num_tokens": 1109994053.0, + "step": 266 + }, + { + "epoch": 0.5335998001498876, + "grad_norm": 0.2869896274705637, + "learning_rate": 3.945176281021556e-05, + "loss": 0.6041, + "num_tokens": 1114166972.0, + "step": 267 + }, + { + "epoch": 0.5355983012740445, + "grad_norm": 0.41763871425619986, + "learning_rate": 3.9446036783824516e-05, + "loss": 0.6116, + "num_tokens": 1118349585.0, + "step": 268 + }, + { + "epoch": 0.5375968023982014, + "grad_norm": 0.30885112256272945, + "learning_rate": 3.9440281477304696e-05, + "loss": 0.5793, + "num_tokens": 1122499819.0, + "step": 269 + }, + { + "epoch": 0.5395953035223582, + "grad_norm": 0.42370720070567874, + "learning_rate": 3.9434496900315364e-05, + "loss": 0.6032, + "num_tokens": 1126683498.0, + "step": 270 + }, + { + "epoch": 0.5415938046465151, + "grad_norm": 0.35242409172151246, + "learning_rate": 3.942868306256493e-05, + "loss": 0.6226, + "num_tokens": 1130822468.0, + "step": 271 + }, + { + "epoch": 0.543592305770672, + "grad_norm": 0.3031515853917845, + "learning_rate": 3.94228399738109e-05, + "loss": 0.5847, + "num_tokens": 1134975487.0, + "step": 272 + }, + { + "epoch": 0.5455908068948289, + "grad_norm": 0.3780140143060919, + "learning_rate": 3.9416967643859886e-05, + "loss": 0.5897, + "num_tokens": 1139135646.0, + "step": 273 + }, + { + "epoch": 0.5475893080189858, + "grad_norm": 0.3427933925854614, + "learning_rate": 3.941106608256756e-05, + "loss": 0.5946, + "num_tokens": 1143320988.0, + "step": 274 + }, + { + "epoch": 0.5495878091431426, + "grad_norm": 0.3154786015792477, + "learning_rate": 3.9405135299838675e-05, + "loss": 0.5925, + "num_tokens": 1147498934.0, + "step": 275 + }, + { + "epoch": 0.5515863102672995, + "grad_norm": 0.3875953036578235, + "learning_rate": 3.939917530562701e-05, + "loss": 0.5955, + "num_tokens": 1151683592.0, + "step": 276 + }, + { + "epoch": 0.5535848113914564, + "grad_norm": 0.25963095967580646, + "learning_rate": 3.9393186109935365e-05, + "loss": 0.6022, + "num_tokens": 1155836585.0, + "step": 277 + }, + { + "epoch": 0.5555833125156133, + "grad_norm": 0.3852684901765339, + "learning_rate": 3.938716772281557e-05, + "loss": 0.5947, + "num_tokens": 1160020647.0, + "step": 278 + }, + { + "epoch": 0.5575818136397702, + "grad_norm": 0.2867632919636252, + "learning_rate": 3.938112015436845e-05, + "loss": 0.5996, + "num_tokens": 1164191648.0, + "step": 279 + }, + { + "epoch": 0.559580314763927, + "grad_norm": 0.44893822923580007, + "learning_rate": 3.9375043414743766e-05, + "loss": 0.6054, + "num_tokens": 1168377299.0, + "step": 280 + }, + { + "epoch": 0.5615788158880839, + "grad_norm": 0.39840937523802644, + "learning_rate": 3.936893751414028e-05, + "loss": 0.6012, + "num_tokens": 1172562565.0, + "step": 281 + }, + { + "epoch": 0.5635773170122408, + "grad_norm": 0.3671938651946383, + "learning_rate": 3.9362802462805684e-05, + "loss": 0.6054, + "num_tokens": 1176716397.0, + "step": 282 + }, + { + "epoch": 0.5655758181363977, + "grad_norm": 0.3884080564118209, + "learning_rate": 3.935663827103659e-05, + "loss": 0.5962, + "num_tokens": 1180901375.0, + "step": 283 + }, + { + "epoch": 0.5675743192605546, + "grad_norm": 0.3507339364847573, + "learning_rate": 3.9350444949178516e-05, + "loss": 0.6089, + "num_tokens": 1185076053.0, + "step": 284 + }, + { + "epoch": 0.5695728203847115, + "grad_norm": 0.3172023946979428, + "learning_rate": 3.9344222507625865e-05, + "loss": 0.5837, + "num_tokens": 1189260077.0, + "step": 285 + }, + { + "epoch": 0.5715713215088684, + "grad_norm": 0.315718364365409, + "learning_rate": 3.9337970956821936e-05, + "loss": 0.5724, + "num_tokens": 1193421832.0, + "step": 286 + }, + { + "epoch": 0.5735698226330253, + "grad_norm": 0.2469016557979294, + "learning_rate": 3.933169030725885e-05, + "loss": 0.5911, + "num_tokens": 1197607857.0, + "step": 287 + }, + { + "epoch": 0.5755683237571821, + "grad_norm": 0.42892799107081664, + "learning_rate": 3.932538056947759e-05, + "loss": 0.5923, + "num_tokens": 1201775951.0, + "step": 288 + }, + { + "epoch": 0.577566824881339, + "grad_norm": 0.27225200744618727, + "learning_rate": 3.931904175406795e-05, + "loss": 0.5894, + "num_tokens": 1205947392.0, + "step": 289 + }, + { + "epoch": 0.5795653260054959, + "grad_norm": 0.4228889369193428, + "learning_rate": 3.931267387166852e-05, + "loss": 0.588, + "num_tokens": 1210051745.0, + "step": 290 + }, + { + "epoch": 0.5815638271296527, + "grad_norm": 0.31529135680903564, + "learning_rate": 3.9306276932966684e-05, + "loss": 0.5931, + "num_tokens": 1214235450.0, + "step": 291 + }, + { + "epoch": 0.5835623282538096, + "grad_norm": 0.452107820357542, + "learning_rate": 3.929985094869858e-05, + "loss": 0.6008, + "num_tokens": 1218404520.0, + "step": 292 + }, + { + "epoch": 0.5855608293779665, + "grad_norm": 0.3943672552750794, + "learning_rate": 3.9293395929649124e-05, + "loss": 0.5747, + "num_tokens": 1222587386.0, + "step": 293 + }, + { + "epoch": 0.5875593305021234, + "grad_norm": 0.37021358537277566, + "learning_rate": 3.928691188665191e-05, + "loss": 0.5985, + "num_tokens": 1226721589.0, + "step": 294 + }, + { + "epoch": 0.5895578316262803, + "grad_norm": 0.42307584054660957, + "learning_rate": 3.928039883058929e-05, + "loss": 0.5941, + "num_tokens": 1230908146.0, + "step": 295 + }, + { + "epoch": 0.5915563327504372, + "grad_norm": 0.36823744235917416, + "learning_rate": 3.927385677239229e-05, + "loss": 0.5905, + "num_tokens": 1235082978.0, + "step": 296 + }, + { + "epoch": 0.5935548338745941, + "grad_norm": 0.3991510220455802, + "learning_rate": 3.9267285723040614e-05, + "loss": 0.583, + "num_tokens": 1239244880.0, + "step": 297 + }, + { + "epoch": 0.595553334998751, + "grad_norm": 0.3214181754177232, + "learning_rate": 3.926068569356262e-05, + "loss": 0.5902, + "num_tokens": 1243421034.0, + "step": 298 + }, + { + "epoch": 0.5975518361229079, + "grad_norm": 0.3882984490396843, + "learning_rate": 3.92540566950353e-05, + "loss": 0.5903, + "num_tokens": 1247578402.0, + "step": 299 + }, + { + "epoch": 0.5995503372470647, + "grad_norm": 0.3347190885884958, + "learning_rate": 3.924739873858428e-05, + "loss": 0.6046, + "num_tokens": 1251763421.0, + "step": 300 + }, + { + "epoch": 0.6015488383712216, + "grad_norm": 0.42212915165041925, + "learning_rate": 3.9240711835383766e-05, + "loss": 0.6027, + "num_tokens": 1255948162.0, + "step": 301 + }, + { + "epoch": 0.6035473394953784, + "grad_norm": 0.44665317274438415, + "learning_rate": 3.9233995996656585e-05, + "loss": 0.5758, + "num_tokens": 1260126533.0, + "step": 302 + }, + { + "epoch": 0.6055458406195353, + "grad_norm": 0.2675043030898904, + "learning_rate": 3.9227251233674065e-05, + "loss": 0.5969, + "num_tokens": 1264311937.0, + "step": 303 + }, + { + "epoch": 0.6075443417436922, + "grad_norm": 0.4430456332397782, + "learning_rate": 3.922047755775614e-05, + "loss": 0.6019, + "num_tokens": 1268463695.0, + "step": 304 + }, + { + "epoch": 0.6095428428678491, + "grad_norm": 0.323633075517029, + "learning_rate": 3.921367498027124e-05, + "loss": 0.6079, + "num_tokens": 1272615950.0, + "step": 305 + }, + { + "epoch": 0.611541343992006, + "grad_norm": 0.41501171693540334, + "learning_rate": 3.920684351263629e-05, + "loss": 0.6026, + "num_tokens": 1276787415.0, + "step": 306 + }, + { + "epoch": 0.6135398451161629, + "grad_norm": 0.35426240907927786, + "learning_rate": 3.919998316631673e-05, + "loss": 0.5872, + "num_tokens": 1280941610.0, + "step": 307 + }, + { + "epoch": 0.6155383462403198, + "grad_norm": 0.29313517131153527, + "learning_rate": 3.919309395282645e-05, + "loss": 0.5981, + "num_tokens": 1285127301.0, + "step": 308 + }, + { + "epoch": 0.6175368473644767, + "grad_norm": 0.38551062865935515, + "learning_rate": 3.918617588372779e-05, + "loss": 0.6033, + "num_tokens": 1289307966.0, + "step": 309 + }, + { + "epoch": 0.6195353484886336, + "grad_norm": 0.2658272123589417, + "learning_rate": 3.917922897063153e-05, + "loss": 0.5705, + "num_tokens": 1293488157.0, + "step": 310 + }, + { + "epoch": 0.6215338496127905, + "grad_norm": 0.3862823559002034, + "learning_rate": 3.9172253225196844e-05, + "loss": 0.5831, + "num_tokens": 1297645446.0, + "step": 311 + }, + { + "epoch": 0.6235323507369472, + "grad_norm": 0.40279783908754246, + "learning_rate": 3.916524865913131e-05, + "loss": 0.589, + "num_tokens": 1301828676.0, + "step": 312 + }, + { + "epoch": 0.6255308518611041, + "grad_norm": 0.2898238914743625, + "learning_rate": 3.915821528419086e-05, + "loss": 0.604, + "num_tokens": 1306015273.0, + "step": 313 + }, + { + "epoch": 0.627529352985261, + "grad_norm": 0.4324778861951147, + "learning_rate": 3.9151153112179794e-05, + "loss": 0.5802, + "num_tokens": 1310175227.0, + "step": 314 + }, + { + "epoch": 0.6295278541094179, + "grad_norm": 0.3073898158633527, + "learning_rate": 3.914406215495074e-05, + "loss": 0.5703, + "num_tokens": 1314332718.0, + "step": 315 + }, + { + "epoch": 0.6315263552335748, + "grad_norm": 0.447470111447466, + "learning_rate": 3.913694242440462e-05, + "loss": 0.5717, + "num_tokens": 1318518753.0, + "step": 316 + }, + { + "epoch": 0.6335248563577317, + "grad_norm": 0.3551624360958809, + "learning_rate": 3.9129793932490664e-05, + "loss": 0.5827, + "num_tokens": 1322704458.0, + "step": 317 + }, + { + "epoch": 0.6355233574818886, + "grad_norm": 0.3504326164356333, + "learning_rate": 3.912261669120638e-05, + "loss": 0.5885, + "num_tokens": 1326889229.0, + "step": 318 + }, + { + "epoch": 0.6375218586060455, + "grad_norm": 0.3636900580399306, + "learning_rate": 3.9115410712597504e-05, + "loss": 0.596, + "num_tokens": 1331077314.0, + "step": 319 + }, + { + "epoch": 0.6395203597302024, + "grad_norm": 0.2236139709971572, + "learning_rate": 3.9108176008758e-05, + "loss": 0.5814, + "num_tokens": 1335260881.0, + "step": 320 + }, + { + "epoch": 0.6415188608543593, + "grad_norm": 0.3403723139825135, + "learning_rate": 3.910091259183009e-05, + "loss": 0.5936, + "num_tokens": 1339447806.0, + "step": 321 + }, + { + "epoch": 0.6435173619785162, + "grad_norm": 0.3611729893331964, + "learning_rate": 3.909362047400413e-05, + "loss": 0.5951, + "num_tokens": 1343601763.0, + "step": 322 + }, + { + "epoch": 0.645515863102673, + "grad_norm": 0.2660675617916867, + "learning_rate": 3.908629966751867e-05, + "loss": 0.5908, + "num_tokens": 1347783275.0, + "step": 323 + }, + { + "epoch": 0.6475143642268298, + "grad_norm": 0.6176400767906435, + "learning_rate": 3.90789501846604e-05, + "loss": 0.6054, + "num_tokens": 1351969607.0, + "step": 324 + }, + { + "epoch": 0.6495128653509867, + "grad_norm": 0.4991444003666476, + "learning_rate": 3.907157203776416e-05, + "loss": 0.5894, + "num_tokens": 1356146061.0, + "step": 325 + }, + { + "epoch": 0.6515113664751436, + "grad_norm": 0.42877138573169626, + "learning_rate": 3.9064165239212874e-05, + "loss": 0.5838, + "num_tokens": 1360327347.0, + "step": 326 + }, + { + "epoch": 0.6535098675993005, + "grad_norm": 0.4240811621307827, + "learning_rate": 3.905672980143756e-05, + "loss": 0.5951, + "num_tokens": 1364512836.0, + "step": 327 + }, + { + "epoch": 0.6555083687234574, + "grad_norm": 0.3944685100540348, + "learning_rate": 3.90492657369173e-05, + "loss": 0.5781, + "num_tokens": 1368683699.0, + "step": 328 + }, + { + "epoch": 0.6575068698476143, + "grad_norm": 0.3243933984963071, + "learning_rate": 3.904177305817923e-05, + "loss": 0.5879, + "num_tokens": 1372841253.0, + "step": 329 + }, + { + "epoch": 0.6595053709717712, + "grad_norm": 0.525971745659925, + "learning_rate": 3.903425177779851e-05, + "loss": 0.5854, + "num_tokens": 1377012079.0, + "step": 330 + }, + { + "epoch": 0.6615038720959281, + "grad_norm": 0.4538398929151208, + "learning_rate": 3.902670190839828e-05, + "loss": 0.5935, + "num_tokens": 1381180575.0, + "step": 331 + }, + { + "epoch": 0.663502373220085, + "grad_norm": 0.43371173123423473, + "learning_rate": 3.90191234626497e-05, + "loss": 0.6111, + "num_tokens": 1385361484.0, + "step": 332 + }, + { + "epoch": 0.6655008743442418, + "grad_norm": 0.440190236148942, + "learning_rate": 3.901151645327185e-05, + "loss": 0.5928, + "num_tokens": 1389535770.0, + "step": 333 + }, + { + "epoch": 0.6674993754683987, + "grad_norm": 0.3267095581547485, + "learning_rate": 3.900388089303177e-05, + "loss": 0.5879, + "num_tokens": 1393689578.0, + "step": 334 + }, + { + "epoch": 0.6694978765925556, + "grad_norm": 0.31667555355278, + "learning_rate": 3.8996216794744426e-05, + "loss": 0.5878, + "num_tokens": 1397873191.0, + "step": 335 + }, + { + "epoch": 0.6714963777167124, + "grad_norm": 0.3891023786091309, + "learning_rate": 3.898852417127266e-05, + "loss": 0.5904, + "num_tokens": 1402033297.0, + "step": 336 + }, + { + "epoch": 0.6734948788408693, + "grad_norm": 0.31436744347617784, + "learning_rate": 3.8980803035527204e-05, + "loss": 0.5957, + "num_tokens": 1406178064.0, + "step": 337 + }, + { + "epoch": 0.6754933799650262, + "grad_norm": 0.46667603194041524, + "learning_rate": 3.897305340046663e-05, + "loss": 0.5811, + "num_tokens": 1410364958.0, + "step": 338 + }, + { + "epoch": 0.6774918810891831, + "grad_norm": 0.40615134811693643, + "learning_rate": 3.8965275279097366e-05, + "loss": 0.5899, + "num_tokens": 1414526599.0, + "step": 339 + }, + { + "epoch": 0.67949038221334, + "grad_norm": 0.4144896730826761, + "learning_rate": 3.8957468684473616e-05, + "loss": 0.6035, + "num_tokens": 1418658553.0, + "step": 340 + }, + { + "epoch": 0.6814888833374969, + "grad_norm": 0.3522730949183637, + "learning_rate": 3.8949633629697386e-05, + "loss": 0.588, + "num_tokens": 1422789118.0, + "step": 341 + }, + { + "epoch": 0.6834873844616538, + "grad_norm": 0.4424310054093142, + "learning_rate": 3.8941770127918446e-05, + "loss": 0.5968, + "num_tokens": 1426948762.0, + "step": 342 + }, + { + "epoch": 0.6854858855858107, + "grad_norm": 0.36412435236667523, + "learning_rate": 3.8933878192334324e-05, + "loss": 0.5775, + "num_tokens": 1431134726.0, + "step": 343 + }, + { + "epoch": 0.6874843867099675, + "grad_norm": 0.35586701701349377, + "learning_rate": 3.892595783619026e-05, + "loss": 0.603, + "num_tokens": 1435312598.0, + "step": 344 + }, + { + "epoch": 0.6894828878341244, + "grad_norm": 0.3592846007090845, + "learning_rate": 3.891800907277917e-05, + "loss": 0.5909, + "num_tokens": 1439497814.0, + "step": 345 + }, + { + "epoch": 0.6914813889582813, + "grad_norm": 0.32671922923210994, + "learning_rate": 3.891003191544168e-05, + "loss": 0.5821, + "num_tokens": 1443649096.0, + "step": 346 + }, + { + "epoch": 0.6934798900824382, + "grad_norm": 0.28796451145658697, + "learning_rate": 3.890202637756607e-05, + "loss": 0.5978, + "num_tokens": 1447834045.0, + "step": 347 + }, + { + "epoch": 0.695478391206595, + "grad_norm": 0.365617299654531, + "learning_rate": 3.88939924725882e-05, + "loss": 0.578, + "num_tokens": 1451995666.0, + "step": 348 + }, + { + "epoch": 0.6974768923307519, + "grad_norm": 0.27028342613849154, + "learning_rate": 3.8885930213991614e-05, + "loss": 0.5833, + "num_tokens": 1456149851.0, + "step": 349 + }, + { + "epoch": 0.6994753934549088, + "grad_norm": 0.40078332355924706, + "learning_rate": 3.887783961530739e-05, + "loss": 0.5906, + "num_tokens": 1460333832.0, + "step": 350 + }, + { + "epoch": 0.7014738945790657, + "grad_norm": 0.30305579327747306, + "learning_rate": 3.886972069011419e-05, + "loss": 0.6012, + "num_tokens": 1464517905.0, + "step": 351 + }, + { + "epoch": 0.7034723957032226, + "grad_norm": 0.44745499187156573, + "learning_rate": 3.886157345203821e-05, + "loss": 0.5848, + "num_tokens": 1468692642.0, + "step": 352 + }, + { + "epoch": 0.7054708968273795, + "grad_norm": 0.37547065470584884, + "learning_rate": 3.885339791475317e-05, + "loss": 0.5907, + "num_tokens": 1472869579.0, + "step": 353 + }, + { + "epoch": 0.7074693979515363, + "grad_norm": 0.37799858941409564, + "learning_rate": 3.8845194091980286e-05, + "loss": 0.5862, + "num_tokens": 1477040372.0, + "step": 354 + }, + { + "epoch": 0.7094678990756932, + "grad_norm": 0.31342260211569895, + "learning_rate": 3.883696199748824e-05, + "loss": 0.5709, + "num_tokens": 1481171409.0, + "step": 355 + }, + { + "epoch": 0.7114664001998501, + "grad_norm": 0.40530361074757787, + "learning_rate": 3.882870164509317e-05, + "loss": 0.5856, + "num_tokens": 1485355682.0, + "step": 356 + }, + { + "epoch": 0.713464901324007, + "grad_norm": 0.32936761082321353, + "learning_rate": 3.882041304865863e-05, + "loss": 0.5921, + "num_tokens": 1489542022.0, + "step": 357 + }, + { + "epoch": 0.7154634024481639, + "grad_norm": 0.38609043703688, + "learning_rate": 3.8812096222095586e-05, + "loss": 0.5866, + "num_tokens": 1493728758.0, + "step": 358 + }, + { + "epoch": 0.7174619035723208, + "grad_norm": 0.34106679624230807, + "learning_rate": 3.880375117936238e-05, + "loss": 0.572, + "num_tokens": 1497911743.0, + "step": 359 + }, + { + "epoch": 0.7194604046964777, + "grad_norm": 0.3774166108941244, + "learning_rate": 3.8795377934464706e-05, + "loss": 0.589, + "num_tokens": 1502098310.0, + "step": 360 + }, + { + "epoch": 0.7214589058206345, + "grad_norm": 0.35460679730124733, + "learning_rate": 3.87869765014556e-05, + "loss": 0.5982, + "num_tokens": 1506283200.0, + "step": 361 + }, + { + "epoch": 0.7234574069447914, + "grad_norm": 0.38713056744774005, + "learning_rate": 3.877854689443541e-05, + "loss": 0.6008, + "num_tokens": 1510467955.0, + "step": 362 + }, + { + "epoch": 0.7254559080689483, + "grad_norm": 0.3101275024651486, + "learning_rate": 3.877008912755174e-05, + "loss": 0.5834, + "num_tokens": 1514655157.0, + "step": 363 + }, + { + "epoch": 0.7274544091931052, + "grad_norm": 0.41028733810879237, + "learning_rate": 3.876160321499949e-05, + "loss": 0.587, + "num_tokens": 1518816213.0, + "step": 364 + }, + { + "epoch": 0.729452910317262, + "grad_norm": 0.33907055850072887, + "learning_rate": 3.875308917102079e-05, + "loss": 0.5748, + "num_tokens": 1522999762.0, + "step": 365 + }, + { + "epoch": 0.7314514114414189, + "grad_norm": 0.38489477995131877, + "learning_rate": 3.8744547009904956e-05, + "loss": 0.5742, + "num_tokens": 1527185654.0, + "step": 366 + }, + { + "epoch": 0.7334499125655758, + "grad_norm": 0.3824649551819308, + "learning_rate": 3.873597674598853e-05, + "loss": 0.5753, + "num_tokens": 1531371240.0, + "step": 367 + }, + { + "epoch": 0.7354484136897327, + "grad_norm": 0.3180386515355124, + "learning_rate": 3.8727378393655206e-05, + "loss": 0.5832, + "num_tokens": 1535555253.0, + "step": 368 + }, + { + "epoch": 0.7374469148138896, + "grad_norm": 0.33881230771604226, + "learning_rate": 3.871875196733581e-05, + "loss": 0.6056, + "num_tokens": 1539742491.0, + "step": 369 + }, + { + "epoch": 0.7394454159380465, + "grad_norm": 0.2791604332330073, + "learning_rate": 3.871009748150829e-05, + "loss": 0.5675, + "num_tokens": 1543879296.0, + "step": 370 + }, + { + "epoch": 0.7414439170622034, + "grad_norm": 0.2896341341430996, + "learning_rate": 3.8701414950697705e-05, + "loss": 0.5965, + "num_tokens": 1548061842.0, + "step": 371 + }, + { + "epoch": 0.7434424181863603, + "grad_norm": 0.2896742133279401, + "learning_rate": 3.869270438947616e-05, + "loss": 0.5776, + "num_tokens": 1552216540.0, + "step": 372 + }, + { + "epoch": 0.7454409193105171, + "grad_norm": 0.21072944282981565, + "learning_rate": 3.868396581246281e-05, + "loss": 0.5678, + "num_tokens": 1556401992.0, + "step": 373 + }, + { + "epoch": 0.747439420434674, + "grad_norm": 0.37585730652352145, + "learning_rate": 3.867519923432384e-05, + "loss": 0.5732, + "num_tokens": 1560560350.0, + "step": 374 + }, + { + "epoch": 0.7494379215588308, + "grad_norm": 0.3009231450710855, + "learning_rate": 3.8666404669772415e-05, + "loss": 0.5707, + "num_tokens": 1564744029.0, + "step": 375 + }, + { + "epoch": 0.7514364226829877, + "grad_norm": 0.35790083820989355, + "learning_rate": 3.865758213356868e-05, + "loss": 0.5881, + "num_tokens": 1568905369.0, + "step": 376 + }, + { + "epoch": 0.7534349238071446, + "grad_norm": 0.27607212973916767, + "learning_rate": 3.8648731640519714e-05, + "loss": 0.5788, + "num_tokens": 1573090361.0, + "step": 377 + }, + { + "epoch": 0.7554334249313015, + "grad_norm": 0.35792964554886975, + "learning_rate": 3.863985320547954e-05, + "loss": 0.5683, + "num_tokens": 1577274155.0, + "step": 378 + }, + { + "epoch": 0.7574319260554584, + "grad_norm": 0.29206891273643293, + "learning_rate": 3.8630946843349054e-05, + "loss": 0.5902, + "num_tokens": 1581443514.0, + "step": 379 + }, + { + "epoch": 0.7594304271796153, + "grad_norm": 0.3466737945597925, + "learning_rate": 3.8622012569076026e-05, + "loss": 0.5861, + "num_tokens": 1585627980.0, + "step": 380 + }, + { + "epoch": 0.7614289283037722, + "grad_norm": 0.28464176935104146, + "learning_rate": 3.8613050397655095e-05, + "loss": 0.5855, + "num_tokens": 1589815041.0, + "step": 381 + }, + { + "epoch": 0.7634274294279291, + "grad_norm": 0.2968473325616499, + "learning_rate": 3.8604060344127676e-05, + "loss": 0.5616, + "num_tokens": 1594002021.0, + "step": 382 + }, + { + "epoch": 0.765425930552086, + "grad_norm": 0.2560296093069984, + "learning_rate": 3.859504242358202e-05, + "loss": 0.5799, + "num_tokens": 1598172632.0, + "step": 383 + }, + { + "epoch": 0.7674244316762429, + "grad_norm": 0.31414139034662997, + "learning_rate": 3.8585996651153145e-05, + "loss": 0.5884, + "num_tokens": 1602357814.0, + "step": 384 + }, + { + "epoch": 0.7694229328003998, + "grad_norm": 0.2929082856590142, + "learning_rate": 3.857692304202278e-05, + "loss": 0.5877, + "num_tokens": 1606539914.0, + "step": 385 + }, + { + "epoch": 0.7714214339245565, + "grad_norm": 0.28507495543308264, + "learning_rate": 3.8567821611419394e-05, + "loss": 0.5754, + "num_tokens": 1610722821.0, + "step": 386 + }, + { + "epoch": 0.7734199350487134, + "grad_norm": 0.32153702931909567, + "learning_rate": 3.855869237461817e-05, + "loss": 0.5936, + "num_tokens": 1614910552.0, + "step": 387 + }, + { + "epoch": 0.7754184361728703, + "grad_norm": 0.2037661082136033, + "learning_rate": 3.8549535346940924e-05, + "loss": 0.58, + "num_tokens": 1619094471.0, + "step": 388 + }, + { + "epoch": 0.7774169372970272, + "grad_norm": 0.3146281899909916, + "learning_rate": 3.854035054375614e-05, + "loss": 0.5945, + "num_tokens": 1623237061.0, + "step": 389 + }, + { + "epoch": 0.7794154384211841, + "grad_norm": 0.24534110629688863, + "learning_rate": 3.85311379804789e-05, + "loss": 0.5624, + "num_tokens": 1627422764.0, + "step": 390 + }, + { + "epoch": 0.781413939545341, + "grad_norm": 0.3020989097928363, + "learning_rate": 3.852189767257088e-05, + "loss": 0.59, + "num_tokens": 1631600758.0, + "step": 391 + }, + { + "epoch": 0.7834124406694979, + "grad_norm": 0.21479518412331008, + "learning_rate": 3.8512629635540334e-05, + "loss": 0.5789, + "num_tokens": 1635765628.0, + "step": 392 + }, + { + "epoch": 0.7854109417936548, + "grad_norm": 0.38290690087264806, + "learning_rate": 3.8503333884942034e-05, + "loss": 0.5871, + "num_tokens": 1639953468.0, + "step": 393 + }, + { + "epoch": 0.7874094429178117, + "grad_norm": 0.27814524976848587, + "learning_rate": 3.849401043637729e-05, + "loss": 0.5863, + "num_tokens": 1644125942.0, + "step": 394 + }, + { + "epoch": 0.7894079440419686, + "grad_norm": 0.3652512557556841, + "learning_rate": 3.848465930549388e-05, + "loss": 0.5695, + "num_tokens": 1648299393.0, + "step": 395 + }, + { + "epoch": 0.7914064451661255, + "grad_norm": 0.3448454807333739, + "learning_rate": 3.8475280507986034e-05, + "loss": 0.5741, + "num_tokens": 1652480831.0, + "step": 396 + }, + { + "epoch": 0.7934049462902822, + "grad_norm": 0.3002238719300878, + "learning_rate": 3.8465874059594436e-05, + "loss": 0.5828, + "num_tokens": 1656635108.0, + "step": 397 + }, + { + "epoch": 0.7954034474144391, + "grad_norm": 0.35474072959981595, + "learning_rate": 3.845643997610617e-05, + "loss": 0.576, + "num_tokens": 1660819746.0, + "step": 398 + }, + { + "epoch": 0.797401948538596, + "grad_norm": 0.3055506957081295, + "learning_rate": 3.8446978273354696e-05, + "loss": 0.5826, + "num_tokens": 1665005345.0, + "step": 399 + }, + { + "epoch": 0.7994004496627529, + "grad_norm": 0.31174755305106405, + "learning_rate": 3.843748896721984e-05, + "loss": 0.5882, + "num_tokens": 1669173044.0, + "step": 400 + }, + { + "epoch": 0.8013989507869098, + "grad_norm": 0.2638216111731479, + "learning_rate": 3.8427972073627724e-05, + "loss": 0.5903, + "num_tokens": 1673359254.0, + "step": 401 + }, + { + "epoch": 0.8033974519110667, + "grad_norm": 0.22595090522915456, + "learning_rate": 3.841842760855082e-05, + "loss": 0.5862, + "num_tokens": 1677511908.0, + "step": 402 + }, + { + "epoch": 0.8053959530352236, + "grad_norm": 0.27449872796803426, + "learning_rate": 3.840885558800783e-05, + "loss": 0.5911, + "num_tokens": 1681688122.0, + "step": 403 + }, + { + "epoch": 0.8073944541593805, + "grad_norm": 0.227363625115527, + "learning_rate": 3.8399256028063716e-05, + "loss": 0.5925, + "num_tokens": 1685875301.0, + "step": 404 + }, + { + "epoch": 0.8093929552835374, + "grad_norm": 0.2982874849449248, + "learning_rate": 3.838962894482969e-05, + "loss": 0.5867, + "num_tokens": 1690061440.0, + "step": 405 + }, + { + "epoch": 0.8113914564076943, + "grad_norm": 0.21407171048737647, + "learning_rate": 3.837997435446311e-05, + "loss": 0.5642, + "num_tokens": 1694248582.0, + "step": 406 + }, + { + "epoch": 0.8133899575318511, + "grad_norm": 0.34858562251576175, + "learning_rate": 3.837029227316753e-05, + "loss": 0.5722, + "num_tokens": 1698434606.0, + "step": 407 + }, + { + "epoch": 0.815388458656008, + "grad_norm": 0.23933925851034552, + "learning_rate": 3.8360582717192626e-05, + "loss": 0.5682, + "num_tokens": 1702616104.0, + "step": 408 + }, + { + "epoch": 0.8173869597801648, + "grad_norm": 0.38084460761833544, + "learning_rate": 3.83508457028342e-05, + "loss": 0.5715, + "num_tokens": 1706772651.0, + "step": 409 + }, + { + "epoch": 0.8193854609043217, + "grad_norm": 0.3381150210066301, + "learning_rate": 3.834108124643413e-05, + "loss": 0.5941, + "num_tokens": 1710958973.0, + "step": 410 + }, + { + "epoch": 0.8213839620284786, + "grad_norm": 0.3700601738640218, + "learning_rate": 3.833128936438036e-05, + "loss": 0.5896, + "num_tokens": 1715143066.0, + "step": 411 + }, + { + "epoch": 0.8233824631526355, + "grad_norm": 0.3427777194497715, + "learning_rate": 3.832147007310684e-05, + "loss": 0.5682, + "num_tokens": 1719329473.0, + "step": 412 + }, + { + "epoch": 0.8253809642767924, + "grad_norm": 0.258354722665756, + "learning_rate": 3.8311623389093546e-05, + "loss": 0.5686, + "num_tokens": 1723484610.0, + "step": 413 + }, + { + "epoch": 0.8273794654009493, + "grad_norm": 0.33550975588078685, + "learning_rate": 3.830174932886642e-05, + "loss": 0.5765, + "num_tokens": 1727669708.0, + "step": 414 + }, + { + "epoch": 0.8293779665251062, + "grad_norm": 0.2996175519529202, + "learning_rate": 3.829184790899735e-05, + "loss": 0.5863, + "num_tokens": 1731833780.0, + "step": 415 + }, + { + "epoch": 0.8313764676492631, + "grad_norm": 0.27029325880528965, + "learning_rate": 3.828191914610414e-05, + "loss": 0.575, + "num_tokens": 1736000490.0, + "step": 416 + }, + { + "epoch": 0.83337496877342, + "grad_norm": 0.3757578253402653, + "learning_rate": 3.8271963056850474e-05, + "loss": 0.5749, + "num_tokens": 1740186114.0, + "step": 417 + }, + { + "epoch": 0.8353734698975768, + "grad_norm": 0.2930180918377146, + "learning_rate": 3.8261979657945943e-05, + "loss": 0.5696, + "num_tokens": 1744340213.0, + "step": 418 + }, + { + "epoch": 0.8373719710217337, + "grad_norm": 0.25077829871008367, + "learning_rate": 3.82519689661459e-05, + "loss": 0.5579, + "num_tokens": 1748499961.0, + "step": 419 + }, + { + "epoch": 0.8393704721458906, + "grad_norm": 0.3603559964468532, + "learning_rate": 3.824193099825158e-05, + "loss": 0.5891, + "num_tokens": 1752636145.0, + "step": 420 + }, + { + "epoch": 0.8413689732700474, + "grad_norm": 0.19140001167151618, + "learning_rate": 3.8231865771109935e-05, + "loss": 0.5718, + "num_tokens": 1756814760.0, + "step": 421 + }, + { + "epoch": 0.8433674743942043, + "grad_norm": 0.39039752200183836, + "learning_rate": 3.822177330161371e-05, + "loss": 0.5807, + "num_tokens": 1761001570.0, + "step": 422 + }, + { + "epoch": 0.8453659755183612, + "grad_norm": 0.34246455282152527, + "learning_rate": 3.821165360670134e-05, + "loss": 0.5788, + "num_tokens": 1765152759.0, + "step": 423 + }, + { + "epoch": 0.8473644766425181, + "grad_norm": 0.3953713557296321, + "learning_rate": 3.8201506703356976e-05, + "loss": 0.5781, + "num_tokens": 1769338497.0, + "step": 424 + }, + { + "epoch": 0.849362977766675, + "grad_norm": 0.33092681930673473, + "learning_rate": 3.819133260861042e-05, + "loss": 0.5822, + "num_tokens": 1773523245.0, + "step": 425 + }, + { + "epoch": 0.8513614788908319, + "grad_norm": 0.3621224600333994, + "learning_rate": 3.818113133953712e-05, + "loss": 0.565, + "num_tokens": 1777706153.0, + "step": 426 + }, + { + "epoch": 0.8533599800149888, + "grad_norm": 0.2772116026807886, + "learning_rate": 3.8170902913258126e-05, + "loss": 0.5879, + "num_tokens": 1781889597.0, + "step": 427 + }, + { + "epoch": 0.8553584811391456, + "grad_norm": 0.3318630474481867, + "learning_rate": 3.816064734694006e-05, + "loss": 0.5735, + "num_tokens": 1786074286.0, + "step": 428 + }, + { + "epoch": 0.8573569822633025, + "grad_norm": 0.1955439774394788, + "learning_rate": 3.81503646577951e-05, + "loss": 0.5708, + "num_tokens": 1790233680.0, + "step": 429 + }, + { + "epoch": 0.8593554833874594, + "grad_norm": 0.2991857728749293, + "learning_rate": 3.814005486308097e-05, + "loss": 0.581, + "num_tokens": 1794418347.0, + "step": 430 + }, + { + "epoch": 0.8613539845116163, + "grad_norm": 0.2614619784564939, + "learning_rate": 3.812971798010082e-05, + "loss": 0.5613, + "num_tokens": 1798603944.0, + "step": 431 + }, + { + "epoch": 0.8633524856357732, + "grad_norm": 0.22623317083939004, + "learning_rate": 3.8119354026203344e-05, + "loss": 0.5774, + "num_tokens": 1802739813.0, + "step": 432 + }, + { + "epoch": 0.86535098675993, + "grad_norm": 0.3740202395304368, + "learning_rate": 3.810896301878261e-05, + "loss": 0.5949, + "num_tokens": 1806922115.0, + "step": 433 + }, + { + "epoch": 0.867349487884087, + "grad_norm": 0.21878627479712906, + "learning_rate": 3.809854497527811e-05, + "loss": 0.5905, + "num_tokens": 1811082457.0, + "step": 434 + }, + { + "epoch": 0.8693479890082438, + "grad_norm": 0.3246474851297919, + "learning_rate": 3.8088099913174724e-05, + "loss": 0.5719, + "num_tokens": 1815268661.0, + "step": 435 + }, + { + "epoch": 0.8713464901324007, + "grad_norm": 0.2662943327139358, + "learning_rate": 3.807762785000265e-05, + "loss": 0.5656, + "num_tokens": 1819454182.0, + "step": 436 + }, + { + "epoch": 0.8733449912565576, + "grad_norm": 0.25652788755521355, + "learning_rate": 3.806712880333743e-05, + "loss": 0.5707, + "num_tokens": 1823641044.0, + "step": 437 + }, + { + "epoch": 0.8753434923807145, + "grad_norm": 0.3674469699340125, + "learning_rate": 3.8056602790799876e-05, + "loss": 0.56, + "num_tokens": 1827827742.0, + "step": 438 + }, + { + "epoch": 0.8773419935048713, + "grad_norm": 0.27024735855011045, + "learning_rate": 3.804604983005607e-05, + "loss": 0.5778, + "num_tokens": 1832013892.0, + "step": 439 + }, + { + "epoch": 0.8793404946290282, + "grad_norm": 0.3993691351198341, + "learning_rate": 3.803546993881732e-05, + "loss": 0.5783, + "num_tokens": 1836198204.0, + "step": 440 + }, + { + "epoch": 0.8813389957531851, + "grad_norm": 0.2686259499363446, + "learning_rate": 3.802486313484012e-05, + "loss": 0.5847, + "num_tokens": 1840348244.0, + "step": 441 + }, + { + "epoch": 0.883337496877342, + "grad_norm": 0.44968604979216686, + "learning_rate": 3.8014229435926135e-05, + "loss": 0.5771, + "num_tokens": 1844516848.0, + "step": 442 + }, + { + "epoch": 0.8853359980014989, + "grad_norm": 0.32648342735648306, + "learning_rate": 3.8003568859922175e-05, + "loss": 0.5696, + "num_tokens": 1848701706.0, + "step": 443 + }, + { + "epoch": 0.8873344991256558, + "grad_norm": 0.5113476381746866, + "learning_rate": 3.799288142472016e-05, + "loss": 0.566, + "num_tokens": 1852858324.0, + "step": 444 + }, + { + "epoch": 0.8893330002498127, + "grad_norm": 0.5051972339848061, + "learning_rate": 3.798216714825709e-05, + "loss": 0.553, + "num_tokens": 1857044703.0, + "step": 445 + }, + { + "epoch": 0.8913315013739695, + "grad_norm": 0.29149209287309596, + "learning_rate": 3.7971426048515e-05, + "loss": 0.5879, + "num_tokens": 1861207444.0, + "step": 446 + }, + { + "epoch": 0.8933300024981264, + "grad_norm": 0.4063668735534789, + "learning_rate": 3.796065814352096e-05, + "loss": 0.5646, + "num_tokens": 1865393576.0, + "step": 447 + }, + { + "epoch": 0.8953285036222833, + "grad_norm": 0.33993787901596684, + "learning_rate": 3.7949863451347014e-05, + "loss": 0.5741, + "num_tokens": 1869578639.0, + "step": 448 + }, + { + "epoch": 0.8973270047464401, + "grad_norm": 0.32300541215074324, + "learning_rate": 3.793904199011017e-05, + "loss": 0.5913, + "num_tokens": 1873764972.0, + "step": 449 + }, + { + "epoch": 0.899325505870597, + "grad_norm": 0.33247050804313294, + "learning_rate": 3.792819377797237e-05, + "loss": 0.5718, + "num_tokens": 1877919772.0, + "step": 450 + }, + { + "epoch": 0.9013240069947539, + "grad_norm": 0.2886120765195873, + "learning_rate": 3.791731883314043e-05, + "loss": 0.5863, + "num_tokens": 1882105004.0, + "step": 451 + }, + { + "epoch": 0.9033225081189108, + "grad_norm": 0.34140434204781595, + "learning_rate": 3.7906417173866055e-05, + "loss": 0.5843, + "num_tokens": 1886281414.0, + "step": 452 + }, + { + "epoch": 0.9053210092430677, + "grad_norm": 0.30927519667165465, + "learning_rate": 3.7895488818445786e-05, + "loss": 0.5974, + "num_tokens": 1890448138.0, + "step": 453 + }, + { + "epoch": 0.9073195103672246, + "grad_norm": 0.2944639114136576, + "learning_rate": 3.788453378522095e-05, + "loss": 0.5621, + "num_tokens": 1894632878.0, + "step": 454 + }, + { + "epoch": 0.9093180114913815, + "grad_norm": 0.3129226352162425, + "learning_rate": 3.7873552092577663e-05, + "loss": 0.5779, + "num_tokens": 1898803914.0, + "step": 455 + }, + { + "epoch": 0.9113165126155384, + "grad_norm": 0.24499021892465545, + "learning_rate": 3.786254375894678e-05, + "loss": 0.5616, + "num_tokens": 1902989057.0, + "step": 456 + }, + { + "epoch": 0.9133150137396953, + "grad_norm": 0.25166897952969813, + "learning_rate": 3.785150880280387e-05, + "loss": 0.5747, + "num_tokens": 1907173402.0, + "step": 457 + }, + { + "epoch": 0.9153135148638522, + "grad_norm": 0.3171137829087611, + "learning_rate": 3.784044724266918e-05, + "loss": 0.5642, + "num_tokens": 1911349282.0, + "step": 458 + }, + { + "epoch": 0.917312015988009, + "grad_norm": 0.2379520612630094, + "learning_rate": 3.78293590971076e-05, + "loss": 0.567, + "num_tokens": 1915531823.0, + "step": 459 + }, + { + "epoch": 0.9193105171121658, + "grad_norm": 0.3033910967936305, + "learning_rate": 3.781824438472867e-05, + "loss": 0.5664, + "num_tokens": 1919690686.0, + "step": 460 + }, + { + "epoch": 0.9213090182363227, + "grad_norm": 0.26113708896194915, + "learning_rate": 3.7807103124186476e-05, + "loss": 0.5773, + "num_tokens": 1923876236.0, + "step": 461 + }, + { + "epoch": 0.9233075193604796, + "grad_norm": 0.2572697634593496, + "learning_rate": 3.7795935334179676e-05, + "loss": 0.5801, + "num_tokens": 1928061310.0, + "step": 462 + }, + { + "epoch": 0.9253060204846365, + "grad_norm": 0.23999410919052488, + "learning_rate": 3.778474103345148e-05, + "loss": 0.569, + "num_tokens": 1932224732.0, + "step": 463 + }, + { + "epoch": 0.9273045216087934, + "grad_norm": 0.33809697667623434, + "learning_rate": 3.777352024078955e-05, + "loss": 0.5788, + "num_tokens": 1936408099.0, + "step": 464 + }, + { + "epoch": 0.9293030227329503, + "grad_norm": 0.23095999453463134, + "learning_rate": 3.7762272975026035e-05, + "loss": 0.5538, + "num_tokens": 1940589083.0, + "step": 465 + }, + { + "epoch": 0.9313015238571072, + "grad_norm": 0.41900000733790005, + "learning_rate": 3.77509992550375e-05, + "loss": 0.5648, + "num_tokens": 1944772645.0, + "step": 466 + }, + { + "epoch": 0.9333000249812641, + "grad_norm": 0.3557502865608081, + "learning_rate": 3.773969909974493e-05, + "loss": 0.5673, + "num_tokens": 1948935030.0, + "step": 467 + }, + { + "epoch": 0.935298526105421, + "grad_norm": 0.32239761610297973, + "learning_rate": 3.772837252811366e-05, + "loss": 0.5595, + "num_tokens": 1953093478.0, + "step": 468 + }, + { + "epoch": 0.9372970272295779, + "grad_norm": 0.3341433959555827, + "learning_rate": 3.7717019559153354e-05, + "loss": 0.5676, + "num_tokens": 1957279169.0, + "step": 469 + }, + { + "epoch": 0.9392955283537348, + "grad_norm": 0.2833850156432731, + "learning_rate": 3.7705640211918004e-05, + "loss": 0.5765, + "num_tokens": 1961463795.0, + "step": 470 + }, + { + "epoch": 0.9412940294778915, + "grad_norm": 0.31328736223664455, + "learning_rate": 3.769423450550585e-05, + "loss": 0.589, + "num_tokens": 1965634340.0, + "step": 471 + }, + { + "epoch": 0.9432925306020484, + "grad_norm": 0.232726479927616, + "learning_rate": 3.768280245905938e-05, + "loss": 0.5718, + "num_tokens": 1969822755.0, + "step": 472 + }, + { + "epoch": 0.9452910317262053, + "grad_norm": 0.2628508288114439, + "learning_rate": 3.76713440917653e-05, + "loss": 0.5715, + "num_tokens": 1973933652.0, + "step": 473 + }, + { + "epoch": 0.9472895328503622, + "grad_norm": 0.2690259801401941, + "learning_rate": 3.765985942285446e-05, + "loss": 0.5734, + "num_tokens": 1978116960.0, + "step": 474 + }, + { + "epoch": 0.9492880339745191, + "grad_norm": 0.2282814932263068, + "learning_rate": 3.76483484716019e-05, + "loss": 0.5602, + "num_tokens": 1982302539.0, + "step": 475 + }, + { + "epoch": 0.951286535098676, + "grad_norm": 0.282794333114189, + "learning_rate": 3.763681125732672e-05, + "loss": 0.5702, + "num_tokens": 1986462103.0, + "step": 476 + }, + { + "epoch": 0.9532850362228329, + "grad_norm": 0.22231294771941096, + "learning_rate": 3.762524779939214e-05, + "loss": 0.5561, + "num_tokens": 1990607482.0, + "step": 477 + }, + { + "epoch": 0.9552835373469898, + "grad_norm": 0.26604748221386126, + "learning_rate": 3.7613658117205386e-05, + "loss": 0.5616, + "num_tokens": 1994791646.0, + "step": 478 + }, + { + "epoch": 0.9572820384711467, + "grad_norm": 0.2480170554198706, + "learning_rate": 3.7602042230217726e-05, + "loss": 0.5837, + "num_tokens": 1998953755.0, + "step": 479 + }, + { + "epoch": 0.9592805395953036, + "grad_norm": 0.27220400421463886, + "learning_rate": 3.7590400157924406e-05, + "loss": 0.56, + "num_tokens": 2003110189.0, + "step": 480 + }, + { + "epoch": 0.9612790407194604, + "grad_norm": 0.23698691105318342, + "learning_rate": 3.7578731919864606e-05, + "loss": 0.5752, + "num_tokens": 2007278059.0, + "step": 481 + }, + { + "epoch": 0.9632775418436172, + "grad_norm": 0.3485997122442459, + "learning_rate": 3.756703753562143e-05, + "loss": 0.5619, + "num_tokens": 2011463119.0, + "step": 482 + }, + { + "epoch": 0.9652760429677741, + "grad_norm": 0.28381874029966003, + "learning_rate": 3.755531702482186e-05, + "loss": 0.5727, + "num_tokens": 2015649422.0, + "step": 483 + }, + { + "epoch": 0.967274544091931, + "grad_norm": 0.32189205212663696, + "learning_rate": 3.7543570407136736e-05, + "loss": 0.5785, + "num_tokens": 2019833300.0, + "step": 484 + }, + { + "epoch": 0.9692730452160879, + "grad_norm": 0.277787476243753, + "learning_rate": 3.753179770228069e-05, + "loss": 0.5749, + "num_tokens": 2023996385.0, + "step": 485 + }, + { + "epoch": 0.9712715463402448, + "grad_norm": 0.265982027732585, + "learning_rate": 3.751999893001217e-05, + "loss": 0.5494, + "num_tokens": 2028179201.0, + "step": 486 + }, + { + "epoch": 0.9732700474644017, + "grad_norm": 0.2504290980166547, + "learning_rate": 3.7508174110133355e-05, + "loss": 0.5693, + "num_tokens": 2032361560.0, + "step": 487 + }, + { + "epoch": 0.9752685485885586, + "grad_norm": 0.21257605722240813, + "learning_rate": 3.749632326249013e-05, + "loss": 0.5627, + "num_tokens": 2036548539.0, + "step": 488 + }, + { + "epoch": 0.9772670497127155, + "grad_norm": 0.33282333824430455, + "learning_rate": 3.7484446406972086e-05, + "loss": 0.5782, + "num_tokens": 2040735354.0, + "step": 489 + }, + { + "epoch": 0.9792655508368724, + "grad_norm": 0.2938991322837538, + "learning_rate": 3.747254356351245e-05, + "loss": 0.578, + "num_tokens": 2044922842.0, + "step": 490 + }, + { + "epoch": 0.9812640519610293, + "grad_norm": 0.25938938408755347, + "learning_rate": 3.746061475208806e-05, + "loss": 0.5663, + "num_tokens": 2049092044.0, + "step": 491 + }, + { + "epoch": 0.9832625530851861, + "grad_norm": 0.31943598145969004, + "learning_rate": 3.7448659992719355e-05, + "loss": 0.5678, + "num_tokens": 2053247952.0, + "step": 492 + }, + { + "epoch": 0.985261054209343, + "grad_norm": 0.20875335965154002, + "learning_rate": 3.7436679305470306e-05, + "loss": 0.568, + "num_tokens": 2057431246.0, + "step": 493 + }, + { + "epoch": 0.9872595553334998, + "grad_norm": 0.24594157089084487, + "learning_rate": 3.742467271044841e-05, + "loss": 0.536, + "num_tokens": 2061590543.0, + "step": 494 + }, + { + "epoch": 0.9892580564576567, + "grad_norm": 0.26241163946807, + "learning_rate": 3.7412640227804643e-05, + "loss": 0.5795, + "num_tokens": 2065746811.0, + "step": 495 + }, + { + "epoch": 0.9912565575818136, + "grad_norm": 0.1905274701104775, + "learning_rate": 3.740058187773342e-05, + "loss": 0.5678, + "num_tokens": 2069893116.0, + "step": 496 + }, + { + "epoch": 0.9932550587059705, + "grad_norm": 0.3485667910027832, + "learning_rate": 3.738849768047259e-05, + "loss": 0.5657, + "num_tokens": 2074056904.0, + "step": 497 + }, + { + "epoch": 0.9952535598301274, + "grad_norm": 0.20942587885672417, + "learning_rate": 3.737638765630336e-05, + "loss": 0.5642, + "num_tokens": 2078233077.0, + "step": 498 + }, + { + "epoch": 0.9972520609542843, + "grad_norm": 0.4256065121030731, + "learning_rate": 3.7364251825550305e-05, + "loss": 0.5822, + "num_tokens": 2082408622.0, + "step": 499 + }, + { + "epoch": 0.9992505620784412, + "grad_norm": 0.37313844646335925, + "learning_rate": 3.735209020858128e-05, + "loss": 0.571, + "num_tokens": 2086562586.0, + "step": 500 + }, + { + "epoch": 1.0, + "grad_norm": 0.37313844646335925, + "learning_rate": 3.733990282580745e-05, + "loss": 0.5743, + "num_tokens": 2088117842.0, + "step": 501 + }, + { + "epoch": 1.0019985011241568, + "grad_norm": 0.5688081134254176, + "learning_rate": 3.732768969768321e-05, + "loss": 0.5588, + "num_tokens": 2092300994.0, + "step": 502 + }, + { + "epoch": 1.0039970022483138, + "grad_norm": 0.3204244147899268, + "learning_rate": 3.7315450844706157e-05, + "loss": 0.5369, + "num_tokens": 2096461819.0, + "step": 503 + }, + { + "epoch": 1.0059955033724706, + "grad_norm": 0.29868750350808204, + "learning_rate": 3.730318628741708e-05, + "loss": 0.6669, + "num_tokens": 2100647196.0, + "step": 504 + }, + { + "epoch": 1.0079940044966276, + "grad_norm": 68.5750351221874, + "learning_rate": 3.729089604639989e-05, + "loss": 0.546, + "num_tokens": 2104797591.0, + "step": 505 + }, + { + "epoch": 1.0099925056207844, + "grad_norm": 0.46461466072190494, + "learning_rate": 3.7278580142281624e-05, + "loss": 0.5483, + "num_tokens": 2108975764.0, + "step": 506 + }, + { + "epoch": 1.0119910067449414, + "grad_norm": 0.25286477235900306, + "learning_rate": 3.726623859573237e-05, + "loss": 0.5549, + "num_tokens": 2113157780.0, + "step": 507 + }, + { + "epoch": 1.0139895078690981, + "grad_norm": 0.4157433405196153, + "learning_rate": 3.725387142746525e-05, + "loss": 0.5252, + "num_tokens": 2117313674.0, + "step": 508 + }, + { + "epoch": 1.0159880089932551, + "grad_norm": 0.4583211459164858, + "learning_rate": 3.7241478658236424e-05, + "loss": 0.5408, + "num_tokens": 2121498417.0, + "step": 509 + }, + { + "epoch": 1.017986510117412, + "grad_norm": 0.22346577360605901, + "learning_rate": 3.722906030884497e-05, + "loss": 0.5292, + "num_tokens": 2125648678.0, + "step": 510 + }, + { + "epoch": 1.019985011241569, + "grad_norm": 0.4515979407982383, + "learning_rate": 3.721661640013293e-05, + "loss": 0.5569, + "num_tokens": 2129817902.0, + "step": 511 + }, + { + "epoch": 1.0219835123657257, + "grad_norm": 0.41745096095101375, + "learning_rate": 3.720414695298523e-05, + "loss": 0.5546, + "num_tokens": 2133978507.0, + "step": 512 + }, + { + "epoch": 1.0239820134898825, + "grad_norm": 0.2644943891458544, + "learning_rate": 3.719165198832967e-05, + "loss": 0.5475, + "num_tokens": 2138164210.0, + "step": 513 + }, + { + "epoch": 1.0259805146140395, + "grad_norm": 0.30933058657090723, + "learning_rate": 3.7179131527136856e-05, + "loss": 0.5543, + "num_tokens": 2142323523.0, + "step": 514 + }, + { + "epoch": 1.0279790157381963, + "grad_norm": 0.2444816695090602, + "learning_rate": 3.7166585590420206e-05, + "loss": 0.5462, + "num_tokens": 2146510763.0, + "step": 515 + }, + { + "epoch": 1.0299775168623533, + "grad_norm": 0.24388824154335598, + "learning_rate": 3.7154014199235893e-05, + "loss": 0.5476, + "num_tokens": 2150696088.0, + "step": 516 + }, + { + "epoch": 1.03197601798651, + "grad_norm": 0.30178497066295223, + "learning_rate": 3.714141737468278e-05, + "loss": 0.5481, + "num_tokens": 2154878261.0, + "step": 517 + }, + { + "epoch": 1.033974519110667, + "grad_norm": 0.23514301459379902, + "learning_rate": 3.712879513790246e-05, + "loss": 0.5494, + "num_tokens": 2159008478.0, + "step": 518 + }, + { + "epoch": 1.0359730202348238, + "grad_norm": 0.3024846179352475, + "learning_rate": 3.7116147510079145e-05, + "loss": 0.552, + "num_tokens": 2163193292.0, + "step": 519 + }, + { + "epoch": 1.0379715213589809, + "grad_norm": 0.27930457961652083, + "learning_rate": 3.710347451243968e-05, + "loss": 0.5391, + "num_tokens": 2167363184.0, + "step": 520 + }, + { + "epoch": 1.0399700224831376, + "grad_norm": 0.2545791801563356, + "learning_rate": 3.7090776166253474e-05, + "loss": 0.5248, + "num_tokens": 2171533451.0, + "step": 521 + }, + { + "epoch": 1.0419685236072946, + "grad_norm": 0.26168872170789403, + "learning_rate": 3.707805249283248e-05, + "loss": 0.5635, + "num_tokens": 2175716711.0, + "step": 522 + }, + { + "epoch": 1.0439670247314514, + "grad_norm": 0.24737776065059125, + "learning_rate": 3.7065303513531174e-05, + "loss": 0.5408, + "num_tokens": 2179898137.0, + "step": 523 + }, + { + "epoch": 1.0459655258556082, + "grad_norm": 0.22858684704029436, + "learning_rate": 3.7052529249746496e-05, + "loss": 0.5465, + "num_tokens": 2184083005.0, + "step": 524 + }, + { + "epoch": 1.0479640269797652, + "grad_norm": 0.2707286869379139, + "learning_rate": 3.70397297229178e-05, + "loss": 0.5605, + "num_tokens": 2188269020.0, + "step": 525 + }, + { + "epoch": 1.049962528103922, + "grad_norm": 0.2593639345029228, + "learning_rate": 3.7026904954526884e-05, + "loss": 0.5414, + "num_tokens": 2192441352.0, + "step": 526 + }, + { + "epoch": 1.051961029228079, + "grad_norm": 0.25138382193799985, + "learning_rate": 3.701405496609787e-05, + "loss": 0.5417, + "num_tokens": 2196598068.0, + "step": 527 + }, + { + "epoch": 1.0539595303522358, + "grad_norm": 0.24829118542109116, + "learning_rate": 3.700117977919722e-05, + "loss": 0.5487, + "num_tokens": 2200782841.0, + "step": 528 + }, + { + "epoch": 1.0559580314763928, + "grad_norm": 0.22217070114421816, + "learning_rate": 3.698827941543369e-05, + "loss": 0.5537, + "num_tokens": 2204970131.0, + "step": 529 + }, + { + "epoch": 1.0579565326005496, + "grad_norm": 0.20549147359291053, + "learning_rate": 3.697535389645829e-05, + "loss": 0.5484, + "num_tokens": 2209146817.0, + "step": 530 + }, + { + "epoch": 1.0599550337247066, + "grad_norm": 0.20468491422618534, + "learning_rate": 3.696240324396426e-05, + "loss": 0.5423, + "num_tokens": 2213303889.0, + "step": 531 + }, + { + "epoch": 1.0619535348488633, + "grad_norm": 0.24543174138044282, + "learning_rate": 3.694942747968699e-05, + "loss": 0.5414, + "num_tokens": 2217488148.0, + "step": 532 + }, + { + "epoch": 1.0639520359730201, + "grad_norm": 0.21565287142499098, + "learning_rate": 3.693642662540405e-05, + "loss": 0.5407, + "num_tokens": 2221643010.0, + "step": 533 + }, + { + "epoch": 1.0659505370971771, + "grad_norm": 0.22497070657235657, + "learning_rate": 3.69234007029351e-05, + "loss": 0.5422, + "num_tokens": 2225809986.0, + "step": 534 + }, + { + "epoch": 1.067949038221334, + "grad_norm": 0.39996780672186893, + "learning_rate": 3.6910349734141885e-05, + "loss": 0.5469, + "num_tokens": 2229994065.0, + "step": 535 + }, + { + "epoch": 1.069947539345491, + "grad_norm": 0.2780154644181402, + "learning_rate": 3.6897273740928175e-05, + "loss": 0.5605, + "num_tokens": 2234179558.0, + "step": 536 + }, + { + "epoch": 1.0719460404696477, + "grad_norm": 0.4066429921730585, + "learning_rate": 3.6884172745239745e-05, + "loss": 0.5429, + "num_tokens": 2238333531.0, + "step": 537 + }, + { + "epoch": 1.0739445415938047, + "grad_norm": 0.2914016447133343, + "learning_rate": 3.6871046769064316e-05, + "loss": 0.5519, + "num_tokens": 2242519893.0, + "step": 538 + }, + { + "epoch": 1.0759430427179615, + "grad_norm": 0.42071997625312346, + "learning_rate": 3.685789583443157e-05, + "loss": 0.5393, + "num_tokens": 2246697892.0, + "step": 539 + }, + { + "epoch": 1.0779415438421185, + "grad_norm": 0.2795874192088721, + "learning_rate": 3.6844719963413047e-05, + "loss": 0.5214, + "num_tokens": 2250872585.0, + "step": 540 + }, + { + "epoch": 1.0799400449662753, + "grad_norm": 0.4235354385318699, + "learning_rate": 3.6831519178122145e-05, + "loss": 0.5529, + "num_tokens": 2255049528.0, + "step": 541 + }, + { + "epoch": 1.0819385460904323, + "grad_norm": 0.3156491369025634, + "learning_rate": 3.681829350071409e-05, + "loss": 0.5614, + "num_tokens": 2259234708.0, + "step": 542 + }, + { + "epoch": 1.083937047214589, + "grad_norm": 0.43033176292447733, + "learning_rate": 3.6805042953385854e-05, + "loss": 0.5442, + "num_tokens": 2263421122.0, + "step": 543 + }, + { + "epoch": 1.0859355483387458, + "grad_norm": 0.3451957198753569, + "learning_rate": 3.6791767558376186e-05, + "loss": 0.5457, + "num_tokens": 2267606253.0, + "step": 544 + }, + { + "epoch": 1.0879340494629028, + "grad_norm": 0.4572135973353961, + "learning_rate": 3.677846733796551e-05, + "loss": 0.5579, + "num_tokens": 2271791722.0, + "step": 545 + }, + { + "epoch": 1.0899325505870596, + "grad_norm": 0.4005936162101148, + "learning_rate": 3.6765142314475937e-05, + "loss": 0.537, + "num_tokens": 2275965884.0, + "step": 546 + }, + { + "epoch": 1.0919310517112166, + "grad_norm": 0.40150720490408964, + "learning_rate": 3.675179251027118e-05, + "loss": 0.547, + "num_tokens": 2280137711.0, + "step": 547 + }, + { + "epoch": 1.0939295528353734, + "grad_norm": 0.43266661740077195, + "learning_rate": 3.673841794775657e-05, + "loss": 0.5533, + "num_tokens": 2284308424.0, + "step": 548 + }, + { + "epoch": 1.0959280539595304, + "grad_norm": 0.30691021919133643, + "learning_rate": 3.672501864937896e-05, + "loss": 0.5452, + "num_tokens": 2288492520.0, + "step": 549 + }, + { + "epoch": 1.0979265550836872, + "grad_norm": 0.3714932090682662, + "learning_rate": 3.671159463762676e-05, + "loss": 0.5463, + "num_tokens": 2292675848.0, + "step": 550 + }, + { + "epoch": 1.0999250562078442, + "grad_norm": 0.25969976098279196, + "learning_rate": 3.6698145935029794e-05, + "loss": 0.5383, + "num_tokens": 2296860030.0, + "step": 551 + }, + { + "epoch": 1.101923557332001, + "grad_norm": 0.33341943909110483, + "learning_rate": 3.6684672564159404e-05, + "loss": 0.5365, + "num_tokens": 2301020961.0, + "step": 552 + }, + { + "epoch": 1.103922058456158, + "grad_norm": 0.26323564501628915, + "learning_rate": 3.6671174547628265e-05, + "loss": 0.5418, + "num_tokens": 2305206077.0, + "step": 553 + }, + { + "epoch": 1.1059205595803148, + "grad_norm": 0.3240532090652202, + "learning_rate": 3.665765190809045e-05, + "loss": 0.5432, + "num_tokens": 2309377155.0, + "step": 554 + }, + { + "epoch": 1.1079190607044715, + "grad_norm": 0.25962076119043925, + "learning_rate": 3.664410466824134e-05, + "loss": 0.5534, + "num_tokens": 2313563616.0, + "step": 555 + }, + { + "epoch": 1.1099175618286286, + "grad_norm": 0.3451386824697679, + "learning_rate": 3.663053285081764e-05, + "loss": 0.539, + "num_tokens": 2317747935.0, + "step": 556 + }, + { + "epoch": 1.1119160629527853, + "grad_norm": 0.3099227452285639, + "learning_rate": 3.661693647859726e-05, + "loss": 0.5336, + "num_tokens": 2321919334.0, + "step": 557 + }, + { + "epoch": 1.1139145640769423, + "grad_norm": 0.28461538328644265, + "learning_rate": 3.660331557439933e-05, + "loss": 0.5307, + "num_tokens": 2326089660.0, + "step": 558 + }, + { + "epoch": 1.1159130652010991, + "grad_norm": 0.3031689057391982, + "learning_rate": 3.658967016108418e-05, + "loss": 0.5354, + "num_tokens": 2330273383.0, + "step": 559 + }, + { + "epoch": 1.1179115663252561, + "grad_norm": 0.2554393066948755, + "learning_rate": 3.6576000261553234e-05, + "loss": 0.539, + "num_tokens": 2334448693.0, + "step": 560 + }, + { + "epoch": 1.119910067449413, + "grad_norm": 0.29259806137148026, + "learning_rate": 3.656230589874905e-05, + "loss": 0.5314, + "num_tokens": 2338633371.0, + "step": 561 + }, + { + "epoch": 1.12190856857357, + "grad_norm": 0.2284136177491733, + "learning_rate": 3.6548587095655236e-05, + "loss": 0.5541, + "num_tokens": 2342798466.0, + "step": 562 + }, + { + "epoch": 1.1239070696977267, + "grad_norm": 0.28435091987329736, + "learning_rate": 3.653484387529638e-05, + "loss": 0.5323, + "num_tokens": 2346945915.0, + "step": 563 + }, + { + "epoch": 1.1259055708218835, + "grad_norm": 0.18902439321861078, + "learning_rate": 3.65210762607381e-05, + "loss": 0.5289, + "num_tokens": 2351127748.0, + "step": 564 + }, + { + "epoch": 1.1279040719460405, + "grad_norm": 0.3232954463883714, + "learning_rate": 3.650728427508693e-05, + "loss": 0.5287, + "num_tokens": 2355271677.0, + "step": 565 + }, + { + "epoch": 1.1299025730701973, + "grad_norm": 0.21093143720368357, + "learning_rate": 3.649346794149031e-05, + "loss": 0.5549, + "num_tokens": 2359454840.0, + "step": 566 + }, + { + "epoch": 1.1319010741943543, + "grad_norm": 0.3990267448992818, + "learning_rate": 3.6479627283136554e-05, + "loss": 0.5378, + "num_tokens": 2363598962.0, + "step": 567 + }, + { + "epoch": 1.133899575318511, + "grad_norm": 0.35215763437757136, + "learning_rate": 3.646576232325477e-05, + "loss": 0.5396, + "num_tokens": 2367765951.0, + "step": 568 + }, + { + "epoch": 1.135898076442668, + "grad_norm": 0.2754597759292642, + "learning_rate": 3.64518730851149e-05, + "loss": 0.552, + "num_tokens": 2371934096.0, + "step": 569 + }, + { + "epoch": 1.1378965775668248, + "grad_norm": 0.31855160871121746, + "learning_rate": 3.6437959592027594e-05, + "loss": 0.559, + "num_tokens": 2376095629.0, + "step": 570 + }, + { + "epoch": 1.1398950786909818, + "grad_norm": 0.24266193408916265, + "learning_rate": 3.642402186734423e-05, + "loss": 0.5395, + "num_tokens": 2380282244.0, + "step": 571 + }, + { + "epoch": 1.1418935798151386, + "grad_norm": 0.2989687047031166, + "learning_rate": 3.641005993445683e-05, + "loss": 0.5441, + "num_tokens": 2384468122.0, + "step": 572 + }, + { + "epoch": 1.1438920809392956, + "grad_norm": 0.2239755803884637, + "learning_rate": 3.639607381679808e-05, + "loss": 0.5392, + "num_tokens": 2388651683.0, + "step": 573 + }, + { + "epoch": 1.1458905820634524, + "grad_norm": 0.3487229852200295, + "learning_rate": 3.6382063537841226e-05, + "loss": 0.5521, + "num_tokens": 2392834468.0, + "step": 574 + }, + { + "epoch": 1.1478890831876094, + "grad_norm": 0.2960969659301572, + "learning_rate": 3.636802912110009e-05, + "loss": 0.5457, + "num_tokens": 2396952468.0, + "step": 575 + }, + { + "epoch": 1.1498875843117662, + "grad_norm": 0.35607891728556423, + "learning_rate": 3.6353970590128975e-05, + "loss": 0.5514, + "num_tokens": 2401136300.0, + "step": 576 + }, + { + "epoch": 1.151886085435923, + "grad_norm": 0.3648784569359035, + "learning_rate": 3.633988796852269e-05, + "loss": 0.5501, + "num_tokens": 2405319525.0, + "step": 577 + }, + { + "epoch": 1.15388458656008, + "grad_norm": 0.33921757838165034, + "learning_rate": 3.632578127991645e-05, + "loss": 0.5416, + "num_tokens": 2409502845.0, + "step": 578 + }, + { + "epoch": 1.1558830876842368, + "grad_norm": 0.3108203646716946, + "learning_rate": 3.631165054798587e-05, + "loss": 0.5376, + "num_tokens": 2413687039.0, + "step": 579 + }, + { + "epoch": 1.1578815888083938, + "grad_norm": 0.297004977959581, + "learning_rate": 3.629749579644691e-05, + "loss": 0.5285, + "num_tokens": 2417870587.0, + "step": 580 + }, + { + "epoch": 1.1598800899325505, + "grad_norm": 0.3015000988483337, + "learning_rate": 3.6283317049055864e-05, + "loss": 0.5405, + "num_tokens": 2422006351.0, + "step": 581 + }, + { + "epoch": 1.1618785910567075, + "grad_norm": 0.30298039092497603, + "learning_rate": 3.6269114329609285e-05, + "loss": 0.5416, + "num_tokens": 2426191871.0, + "step": 582 + }, + { + "epoch": 1.1638770921808643, + "grad_norm": 0.21830031381546727, + "learning_rate": 3.625488766194395e-05, + "loss": 0.546, + "num_tokens": 2430349906.0, + "step": 583 + }, + { + "epoch": 1.1658755933050213, + "grad_norm": 0.28840818133520285, + "learning_rate": 3.6240637069936855e-05, + "loss": 0.5416, + "num_tokens": 2434533561.0, + "step": 584 + }, + { + "epoch": 1.1678740944291781, + "grad_norm": 0.19554598368667836, + "learning_rate": 3.6226362577505104e-05, + "loss": 0.5415, + "num_tokens": 2438717764.0, + "step": 585 + }, + { + "epoch": 1.169872595553335, + "grad_norm": 0.3132150706053422, + "learning_rate": 3.6212064208605966e-05, + "loss": 0.5395, + "num_tokens": 2442873633.0, + "step": 586 + }, + { + "epoch": 1.171871096677492, + "grad_norm": 0.2263751480277752, + "learning_rate": 3.6197741987236754e-05, + "loss": 0.537, + "num_tokens": 2447032145.0, + "step": 587 + }, + { + "epoch": 1.1738695978016487, + "grad_norm": 0.2211529124490697, + "learning_rate": 3.618339593743482e-05, + "loss": 0.5447, + "num_tokens": 2451217485.0, + "step": 588 + }, + { + "epoch": 1.1758680989258057, + "grad_norm": 0.26575912330416285, + "learning_rate": 3.616902608327749e-05, + "loss": 0.5325, + "num_tokens": 2455401047.0, + "step": 589 + }, + { + "epoch": 1.1778666000499625, + "grad_norm": 0.18998838079185706, + "learning_rate": 3.6154632448882077e-05, + "loss": 0.5474, + "num_tokens": 2459535116.0, + "step": 590 + }, + { + "epoch": 1.1798651011741195, + "grad_norm": 0.22348794870649755, + "learning_rate": 3.6140215058405765e-05, + "loss": 0.5368, + "num_tokens": 2463688799.0, + "step": 591 + }, + { + "epoch": 1.1818636022982763, + "grad_norm": 0.24235797865825928, + "learning_rate": 3.6125773936045646e-05, + "loss": 0.5451, + "num_tokens": 2467873578.0, + "step": 592 + }, + { + "epoch": 1.1838621034224333, + "grad_norm": 0.24433895174325188, + "learning_rate": 3.611130910603861e-05, + "loss": 0.5364, + "num_tokens": 2472058042.0, + "step": 593 + }, + { + "epoch": 1.18586060454659, + "grad_norm": 0.1983249646885123, + "learning_rate": 3.609682059266137e-05, + "loss": 0.5561, + "num_tokens": 2476244685.0, + "step": 594 + }, + { + "epoch": 1.1878591056707468, + "grad_norm": 0.27856696567819395, + "learning_rate": 3.608230842023035e-05, + "loss": 0.5383, + "num_tokens": 2480429690.0, + "step": 595 + }, + { + "epoch": 1.1898576067949038, + "grad_norm": 0.19706505117729467, + "learning_rate": 3.60677726131017e-05, + "loss": 0.5383, + "num_tokens": 2484614635.0, + "step": 596 + }, + { + "epoch": 1.1918561079190608, + "grad_norm": 0.23578881200617707, + "learning_rate": 3.605321319567125e-05, + "loss": 0.519, + "num_tokens": 2488801336.0, + "step": 597 + }, + { + "epoch": 1.1938546090432176, + "grad_norm": 0.21158022869889037, + "learning_rate": 3.603863019237442e-05, + "loss": 0.5444, + "num_tokens": 2492988539.0, + "step": 598 + }, + { + "epoch": 1.1958531101673744, + "grad_norm": 0.2053219054572145, + "learning_rate": 3.602402362768626e-05, + "loss": 0.5261, + "num_tokens": 2497148615.0, + "step": 599 + }, + { + "epoch": 1.1978516112915314, + "grad_norm": 0.2461698169668525, + "learning_rate": 3.6009393526121324e-05, + "loss": 0.5453, + "num_tokens": 2501335037.0, + "step": 600 + }, + { + "epoch": 1.1998501124156882, + "grad_norm": 0.24540177013569567, + "learning_rate": 3.599473991223369e-05, + "loss": 0.5608, + "num_tokens": 2505493608.0, + "step": 601 + }, + { + "epoch": 1.2018486135398452, + "grad_norm": 0.1902369321598082, + "learning_rate": 3.5980062810616894e-05, + "loss": 0.5384, + "num_tokens": 2509660336.0, + "step": 602 + }, + { + "epoch": 1.203847114664002, + "grad_norm": 0.286659338345752, + "learning_rate": 3.596536224590389e-05, + "loss": 0.5637, + "num_tokens": 2513819332.0, + "step": 603 + }, + { + "epoch": 1.205845615788159, + "grad_norm": 0.25325964660774936, + "learning_rate": 3.595063824276701e-05, + "loss": 0.5351, + "num_tokens": 2517987297.0, + "step": 604 + }, + { + "epoch": 1.2078441169123157, + "grad_norm": 0.18423340030427962, + "learning_rate": 3.593589082591792e-05, + "loss": 0.5344, + "num_tokens": 2522171128.0, + "step": 605 + }, + { + "epoch": 1.2098426180364728, + "grad_norm": 0.28016101845453195, + "learning_rate": 3.592112002010759e-05, + "loss": 0.548, + "num_tokens": 2526355746.0, + "step": 606 + }, + { + "epoch": 1.2118411191606295, + "grad_norm": 0.2549892026351929, + "learning_rate": 3.5906325850126244e-05, + "loss": 0.5562, + "num_tokens": 2530540036.0, + "step": 607 + }, + { + "epoch": 1.2138396202847863, + "grad_norm": 0.19683433714843346, + "learning_rate": 3.5891508340803315e-05, + "loss": 0.524, + "num_tokens": 2534723668.0, + "step": 608 + }, + { + "epoch": 1.2158381214089433, + "grad_norm": 0.4574807296060874, + "learning_rate": 3.5876667517007394e-05, + "loss": 0.5432, + "num_tokens": 2538908962.0, + "step": 609 + }, + { + "epoch": 1.2178366225331, + "grad_norm": 0.38277103248684696, + "learning_rate": 3.586180340364623e-05, + "loss": 0.5578, + "num_tokens": 2543092812.0, + "step": 610 + }, + { + "epoch": 1.219835123657257, + "grad_norm": 0.3992758919786219, + "learning_rate": 3.584691602566664e-05, + "loss": 0.5544, + "num_tokens": 2547277568.0, + "step": 611 + }, + { + "epoch": 1.2218336247814139, + "grad_norm": 0.37815248012554437, + "learning_rate": 3.583200540805448e-05, + "loss": 0.5422, + "num_tokens": 2551462139.0, + "step": 612 + }, + { + "epoch": 1.2238321259055709, + "grad_norm": 0.3529029567463777, + "learning_rate": 3.5817071575834634e-05, + "loss": 0.5301, + "num_tokens": 2555597643.0, + "step": 613 + }, + { + "epoch": 1.2258306270297277, + "grad_norm": 0.28707058821579684, + "learning_rate": 3.580211455407093e-05, + "loss": 0.5558, + "num_tokens": 2559761016.0, + "step": 614 + }, + { + "epoch": 1.2278291281538847, + "grad_norm": 0.39246718110314005, + "learning_rate": 3.578713436786611e-05, + "loss": 0.5331, + "num_tokens": 2563944682.0, + "step": 615 + }, + { + "epoch": 1.2298276292780415, + "grad_norm": 0.37566943087566407, + "learning_rate": 3.577213104236181e-05, + "loss": 0.546, + "num_tokens": 2568130553.0, + "step": 616 + }, + { + "epoch": 1.2318261304021982, + "grad_norm": 0.36315544067988004, + "learning_rate": 3.57571046027385e-05, + "loss": 0.565, + "num_tokens": 2572292940.0, + "step": 617 + }, + { + "epoch": 1.2338246315263552, + "grad_norm": 0.33483693743337084, + "learning_rate": 3.5742055074215436e-05, + "loss": 0.5395, + "num_tokens": 2576457164.0, + "step": 618 + }, + { + "epoch": 1.235823132650512, + "grad_norm": 0.35452116450917204, + "learning_rate": 3.572698248205061e-05, + "loss": 0.5412, + "num_tokens": 2580641674.0, + "step": 619 + }, + { + "epoch": 1.237821633774669, + "grad_norm": 0.33266178864925766, + "learning_rate": 3.571188685154075e-05, + "loss": 0.5469, + "num_tokens": 2584796149.0, + "step": 620 + }, + { + "epoch": 1.2398201348988258, + "grad_norm": 0.31302422701129007, + "learning_rate": 3.569676820802124e-05, + "loss": 0.5332, + "num_tokens": 2588967015.0, + "step": 621 + }, + { + "epoch": 1.2418186360229828, + "grad_norm": 0.25294982723165016, + "learning_rate": 3.568162657686609e-05, + "loss": 0.5486, + "num_tokens": 2593136338.0, + "step": 622 + }, + { + "epoch": 1.2438171371471396, + "grad_norm": 0.3365264643494846, + "learning_rate": 3.566646198348787e-05, + "loss": 0.5488, + "num_tokens": 2597320398.0, + "step": 623 + }, + { + "epoch": 1.2458156382712966, + "grad_norm": 0.2663038421020362, + "learning_rate": 3.5651274453337716e-05, + "loss": 0.529, + "num_tokens": 2601499198.0, + "step": 624 + }, + { + "epoch": 1.2478141393954534, + "grad_norm": 0.32865690553752036, + "learning_rate": 3.5636064011905235e-05, + "loss": 0.5642, + "num_tokens": 2605659045.0, + "step": 625 + }, + { + "epoch": 1.2498126405196104, + "grad_norm": 0.23916308918535173, + "learning_rate": 3.5620830684718515e-05, + "loss": 0.5481, + "num_tokens": 2609813693.0, + "step": 626 + }, + { + "epoch": 1.2518111416437672, + "grad_norm": 0.30044363193260704, + "learning_rate": 3.5605574497344034e-05, + "loss": 0.5534, + "num_tokens": 2613999303.0, + "step": 627 + }, + { + "epoch": 1.2538096427679242, + "grad_norm": 0.19793321333224345, + "learning_rate": 3.559029547538663e-05, + "loss": 0.5402, + "num_tokens": 2618165377.0, + "step": 628 + }, + { + "epoch": 1.255808143892081, + "grad_norm": 0.2969956581274978, + "learning_rate": 3.557499364448949e-05, + "loss": 0.5437, + "num_tokens": 2622351977.0, + "step": 629 + }, + { + "epoch": 1.2578066450162377, + "grad_norm": 0.1917176281916505, + "learning_rate": 3.555966903033406e-05, + "loss": 0.5552, + "num_tokens": 2626536555.0, + "step": 630 + }, + { + "epoch": 1.2598051461403947, + "grad_norm": 0.27273575051825577, + "learning_rate": 3.554432165864003e-05, + "loss": 0.5445, + "num_tokens": 2630722003.0, + "step": 631 + }, + { + "epoch": 1.2618036472645515, + "grad_norm": 0.18947609997720868, + "learning_rate": 3.5528951555165286e-05, + "loss": 0.5443, + "num_tokens": 2634909032.0, + "step": 632 + }, + { + "epoch": 1.2638021483887085, + "grad_norm": 0.2575348925978812, + "learning_rate": 3.551355874570586e-05, + "loss": 0.5288, + "num_tokens": 2639076656.0, + "step": 633 + }, + { + "epoch": 1.2658006495128653, + "grad_norm": 0.19229878270516787, + "learning_rate": 3.5498143256095915e-05, + "loss": 0.5362, + "num_tokens": 2643263536.0, + "step": 634 + }, + { + "epoch": 1.2677991506370223, + "grad_norm": 0.2688869638250526, + "learning_rate": 3.548270511220764e-05, + "loss": 0.5328, + "num_tokens": 2647369011.0, + "step": 635 + }, + { + "epoch": 1.269797651761179, + "grad_norm": 0.1957926013713345, + "learning_rate": 3.546724433995127e-05, + "loss": 0.5408, + "num_tokens": 2651546590.0, + "step": 636 + }, + { + "epoch": 1.271796152885336, + "grad_norm": 0.26188209817931873, + "learning_rate": 3.545176096527503e-05, + "loss": 0.541, + "num_tokens": 2655713416.0, + "step": 637 + }, + { + "epoch": 1.2737946540094929, + "grad_norm": 0.22650419316601228, + "learning_rate": 3.543625501416504e-05, + "loss": 0.5246, + "num_tokens": 2659899074.0, + "step": 638 + }, + { + "epoch": 1.2757931551336497, + "grad_norm": 0.24136395590471008, + "learning_rate": 3.5420726512645355e-05, + "loss": 0.5412, + "num_tokens": 2664070272.0, + "step": 639 + }, + { + "epoch": 1.2777916562578067, + "grad_norm": 0.22744262895331696, + "learning_rate": 3.5405175486777844e-05, + "loss": 0.5187, + "num_tokens": 2668230911.0, + "step": 640 + }, + { + "epoch": 1.2797901573819634, + "grad_norm": 0.24469696091259618, + "learning_rate": 3.538960196266219e-05, + "loss": 0.5318, + "num_tokens": 2672386739.0, + "step": 641 + }, + { + "epoch": 1.2817886585061204, + "grad_norm": 0.19700995932762588, + "learning_rate": 3.5374005966435825e-05, + "loss": 0.5389, + "num_tokens": 2676527028.0, + "step": 642 + }, + { + "epoch": 1.2837871596302772, + "grad_norm": 0.2320411805757896, + "learning_rate": 3.535838752427393e-05, + "loss": 0.5587, + "num_tokens": 2680677728.0, + "step": 643 + }, + { + "epoch": 1.2857856607544342, + "grad_norm": 0.17105989444277683, + "learning_rate": 3.534274666238932e-05, + "loss": 0.5436, + "num_tokens": 2684862930.0, + "step": 644 + }, + { + "epoch": 1.287784161878591, + "grad_norm": 0.19770140517825788, + "learning_rate": 3.532708340703247e-05, + "loss": 0.5293, + "num_tokens": 2689046435.0, + "step": 645 + }, + { + "epoch": 1.289782663002748, + "grad_norm": 0.179437303899049, + "learning_rate": 3.5311397784491396e-05, + "loss": 0.5352, + "num_tokens": 2693230806.0, + "step": 646 + }, + { + "epoch": 1.2917811641269048, + "grad_norm": 0.17736736095323424, + "learning_rate": 3.52956898210917e-05, + "loss": 0.5282, + "num_tokens": 2697416613.0, + "step": 647 + }, + { + "epoch": 1.2937796652510616, + "grad_norm": 0.2733256183873301, + "learning_rate": 3.5279959543196444e-05, + "loss": 0.5303, + "num_tokens": 2701602076.0, + "step": 648 + }, + { + "epoch": 1.2957781663752186, + "grad_norm": 0.20564823644684693, + "learning_rate": 3.526420697720616e-05, + "loss": 0.5404, + "num_tokens": 2705758161.0, + "step": 649 + }, + { + "epoch": 1.2977766674993756, + "grad_norm": 0.24192635739818374, + "learning_rate": 3.5248432149558785e-05, + "loss": 0.5437, + "num_tokens": 2709928062.0, + "step": 650 + }, + { + "epoch": 1.2997751686235324, + "grad_norm": 0.3071632637612425, + "learning_rate": 3.523263508672961e-05, + "loss": 0.548, + "num_tokens": 2714110406.0, + "step": 651 + }, + { + "epoch": 1.3017736697476892, + "grad_norm": 0.21614182755563038, + "learning_rate": 3.521681581523125e-05, + "loss": 0.5309, + "num_tokens": 2718283088.0, + "step": 652 + }, + { + "epoch": 1.3037721708718462, + "grad_norm": 0.3734077804412869, + "learning_rate": 3.520097436161359e-05, + "loss": 0.5637, + "num_tokens": 2722454072.0, + "step": 653 + }, + { + "epoch": 1.305770671996003, + "grad_norm": 0.32041058284209806, + "learning_rate": 3.5185110752463755e-05, + "loss": 0.5475, + "num_tokens": 2726641274.0, + "step": 654 + }, + { + "epoch": 1.30776917312016, + "grad_norm": 0.3001123677257565, + "learning_rate": 3.5169225014406035e-05, + "loss": 0.5385, + "num_tokens": 2730816897.0, + "step": 655 + }, + { + "epoch": 1.3097676742443167, + "grad_norm": 0.27865701812486243, + "learning_rate": 3.515331717410187e-05, + "loss": 0.5396, + "num_tokens": 2735003483.0, + "step": 656 + }, + { + "epoch": 1.3117661753684735, + "grad_norm": 0.22850601011226407, + "learning_rate": 3.5137387258249806e-05, + "loss": 0.555, + "num_tokens": 2739188648.0, + "step": 657 + }, + { + "epoch": 1.3137646764926305, + "grad_norm": 0.17646629988830936, + "learning_rate": 3.512143529358541e-05, + "loss": 0.5288, + "num_tokens": 2743346266.0, + "step": 658 + }, + { + "epoch": 1.3157631776167875, + "grad_norm": 0.17630437075003697, + "learning_rate": 3.510546130688128e-05, + "loss": 0.5259, + "num_tokens": 2747509373.0, + "step": 659 + }, + { + "epoch": 1.3177616787409443, + "grad_norm": 0.20252680075902343, + "learning_rate": 3.508946532494697e-05, + "loss": 0.534, + "num_tokens": 2751681713.0, + "step": 660 + }, + { + "epoch": 1.319760179865101, + "grad_norm": 0.21135150507132397, + "learning_rate": 3.507344737462894e-05, + "loss": 0.5324, + "num_tokens": 2755862019.0, + "step": 661 + }, + { + "epoch": 1.321758680989258, + "grad_norm": 0.24589523678930644, + "learning_rate": 3.505740748281053e-05, + "loss": 0.5401, + "num_tokens": 2760045698.0, + "step": 662 + }, + { + "epoch": 1.3237571821134149, + "grad_norm": 0.17765480490556895, + "learning_rate": 3.5041345676411885e-05, + "loss": 0.5389, + "num_tokens": 2764230085.0, + "step": 663 + }, + { + "epoch": 1.3257556832375719, + "grad_norm": 0.18912239452631915, + "learning_rate": 3.5025261982389956e-05, + "loss": 0.529, + "num_tokens": 2768414363.0, + "step": 664 + }, + { + "epoch": 1.3277541843617287, + "grad_norm": 0.296086099364385, + "learning_rate": 3.500915642773842e-05, + "loss": 0.5464, + "num_tokens": 2772568456.0, + "step": 665 + }, + { + "epoch": 1.3297526854858857, + "grad_norm": 0.17397461103141135, + "learning_rate": 3.499302903948764e-05, + "loss": 0.5244, + "num_tokens": 2776752764.0, + "step": 666 + }, + { + "epoch": 1.3317511866100424, + "grad_norm": 0.27240910568725474, + "learning_rate": 3.497687984470461e-05, + "loss": 0.5263, + "num_tokens": 2780937170.0, + "step": 667 + }, + { + "epoch": 1.3337496877341994, + "grad_norm": 0.274128107292034, + "learning_rate": 3.4960708870492955e-05, + "loss": 0.545, + "num_tokens": 2785119531.0, + "step": 668 + }, + { + "epoch": 1.3357481888583562, + "grad_norm": 0.21478806765941447, + "learning_rate": 3.494451614399282e-05, + "loss": 0.5316, + "num_tokens": 2789302425.0, + "step": 669 + }, + { + "epoch": 1.337746689982513, + "grad_norm": 0.3686322056496915, + "learning_rate": 3.492830169238088e-05, + "loss": 0.5457, + "num_tokens": 2793487078.0, + "step": 670 + }, + { + "epoch": 1.33974519110667, + "grad_norm": 0.2760769612014662, + "learning_rate": 3.4912065542870245e-05, + "loss": 0.5355, + "num_tokens": 2797668959.0, + "step": 671 + }, + { + "epoch": 1.341743692230827, + "grad_norm": 0.3599159018650345, + "learning_rate": 3.489580772271048e-05, + "loss": 0.5471, + "num_tokens": 2801854794.0, + "step": 672 + }, + { + "epoch": 1.3437421933549838, + "grad_norm": 0.3028954129852853, + "learning_rate": 3.4879528259187495e-05, + "loss": 0.552, + "num_tokens": 2806040698.0, + "step": 673 + }, + { + "epoch": 1.3457406944791406, + "grad_norm": 0.3494448489416874, + "learning_rate": 3.4863227179623524e-05, + "loss": 0.545, + "num_tokens": 2810225052.0, + "step": 674 + }, + { + "epoch": 1.3477391956032976, + "grad_norm": 0.2653396154711516, + "learning_rate": 3.4846904511377085e-05, + "loss": 0.5362, + "num_tokens": 2814409672.0, + "step": 675 + }, + { + "epoch": 1.3497376967274544, + "grad_norm": 0.35623763851618384, + "learning_rate": 3.483056028184293e-05, + "loss": 0.5457, + "num_tokens": 2818594177.0, + "step": 676 + }, + { + "epoch": 1.3517361978516114, + "grad_norm": 0.28827369960359694, + "learning_rate": 3.4814194518451994e-05, + "loss": 0.5271, + "num_tokens": 2822751667.0, + "step": 677 + }, + { + "epoch": 1.3537346989757681, + "grad_norm": 0.31548016499014486, + "learning_rate": 3.479780724867137e-05, + "loss": 0.546, + "num_tokens": 2826915487.0, + "step": 678 + }, + { + "epoch": 1.355733200099925, + "grad_norm": 0.2993704000712479, + "learning_rate": 3.4781398500004206e-05, + "loss": 0.5303, + "num_tokens": 2831101239.0, + "step": 679 + }, + { + "epoch": 1.357731701224082, + "grad_norm": 0.29631577765472056, + "learning_rate": 3.4764968299989745e-05, + "loss": 0.5353, + "num_tokens": 2835275476.0, + "step": 680 + }, + { + "epoch": 1.359730202348239, + "grad_norm": 0.22126446655664733, + "learning_rate": 3.47485166762032e-05, + "loss": 0.541, + "num_tokens": 2839460623.0, + "step": 681 + }, + { + "epoch": 1.3617287034723957, + "grad_norm": 0.352267834609651, + "learning_rate": 3.4732043656255754e-05, + "loss": 0.5499, + "num_tokens": 2843645945.0, + "step": 682 + }, + { + "epoch": 1.3637272045965525, + "grad_norm": 0.24714041294792166, + "learning_rate": 3.47155492677945e-05, + "loss": 0.5317, + "num_tokens": 2847763340.0, + "step": 683 + }, + { + "epoch": 1.3657257057207095, + "grad_norm": 0.3928172451164973, + "learning_rate": 3.469903353850239e-05, + "loss": 0.5412, + "num_tokens": 2851909788.0, + "step": 684 + }, + { + "epoch": 1.3677242068448663, + "grad_norm": 0.4072954831716879, + "learning_rate": 3.4682496496098186e-05, + "loss": 0.5364, + "num_tokens": 2856094069.0, + "step": 685 + }, + { + "epoch": 1.3697227079690233, + "grad_norm": 0.2538947488362466, + "learning_rate": 3.4665938168336435e-05, + "loss": 0.5324, + "num_tokens": 2860256678.0, + "step": 686 + }, + { + "epoch": 1.37172120909318, + "grad_norm": 0.2912311574469873, + "learning_rate": 3.46493585830074e-05, + "loss": 0.5223, + "num_tokens": 2864411515.0, + "step": 687 + }, + { + "epoch": 1.373719710217337, + "grad_norm": 0.24994106709176955, + "learning_rate": 3.463275776793703e-05, + "loss": 0.539, + "num_tokens": 2868565390.0, + "step": 688 + }, + { + "epoch": 1.3757182113414939, + "grad_norm": 0.283186531630279, + "learning_rate": 3.461613575098688e-05, + "loss": 0.5383, + "num_tokens": 2872752825.0, + "step": 689 + }, + { + "epoch": 1.3777167124656509, + "grad_norm": 0.21339077591351635, + "learning_rate": 3.459949256005412e-05, + "loss": 0.5261, + "num_tokens": 2876937731.0, + "step": 690 + }, + { + "epoch": 1.3797152135898076, + "grad_norm": 0.24115842479671049, + "learning_rate": 3.458282822307144e-05, + "loss": 0.5211, + "num_tokens": 2881121454.0, + "step": 691 + }, + { + "epoch": 1.3817137147139644, + "grad_norm": 0.17119049938009376, + "learning_rate": 3.456614276800701e-05, + "loss": 0.5313, + "num_tokens": 2885305476.0, + "step": 692 + }, + { + "epoch": 1.3837122158381214, + "grad_norm": 0.19719906137522128, + "learning_rate": 3.4549436222864475e-05, + "loss": 0.5371, + "num_tokens": 2889489061.0, + "step": 693 + }, + { + "epoch": 1.3857107169622782, + "grad_norm": 0.20655932044124167, + "learning_rate": 3.453270861568283e-05, + "loss": 0.5374, + "num_tokens": 2893648955.0, + "step": 694 + }, + { + "epoch": 1.3877092180864352, + "grad_norm": 0.18297250578862567, + "learning_rate": 3.451595997453647e-05, + "loss": 0.5309, + "num_tokens": 2897773255.0, + "step": 695 + }, + { + "epoch": 1.389707719210592, + "grad_norm": 0.24571452843893957, + "learning_rate": 3.449919032753505e-05, + "loss": 0.5449, + "num_tokens": 2901958243.0, + "step": 696 + }, + { + "epoch": 1.391706220334749, + "grad_norm": 0.14156700566356475, + "learning_rate": 3.448239970282351e-05, + "loss": 0.5334, + "num_tokens": 2906145597.0, + "step": 697 + }, + { + "epoch": 1.3937047214589058, + "grad_norm": 0.2846654163681826, + "learning_rate": 3.446558812858198e-05, + "loss": 0.5503, + "num_tokens": 2910305723.0, + "step": 698 + }, + { + "epoch": 1.3957032225830628, + "grad_norm": 0.1393388361254038, + "learning_rate": 3.444875563302574e-05, + "loss": 0.5442, + "num_tokens": 2914491781.0, + "step": 699 + }, + { + "epoch": 1.3977017237072196, + "grad_norm": 0.29797574367680346, + "learning_rate": 3.443190224440522e-05, + "loss": 0.5329, + "num_tokens": 2918675364.0, + "step": 700 + }, + { + "epoch": 1.3997002248313763, + "grad_norm": 0.19263250769947246, + "learning_rate": 3.441502799100588e-05, + "loss": 0.5383, + "num_tokens": 2922859014.0, + "step": 701 + }, + { + "epoch": 1.4016987259555334, + "grad_norm": 0.2860476957839862, + "learning_rate": 3.439813290114821e-05, + "loss": 0.5366, + "num_tokens": 2927027110.0, + "step": 702 + }, + { + "epoch": 1.4036972270796904, + "grad_norm": 0.23443202238321698, + "learning_rate": 3.4381217003187665e-05, + "loss": 0.5462, + "num_tokens": 2931204721.0, + "step": 703 + }, + { + "epoch": 1.4056957282038471, + "grad_norm": 0.2771466280835855, + "learning_rate": 3.436428032551464e-05, + "loss": 0.5485, + "num_tokens": 2935372525.0, + "step": 704 + }, + { + "epoch": 1.407694229328004, + "grad_norm": 0.2429025974661291, + "learning_rate": 3.434732289655438e-05, + "loss": 0.5397, + "num_tokens": 2939557063.0, + "step": 705 + }, + { + "epoch": 1.409692730452161, + "grad_norm": 0.2921825229256543, + "learning_rate": 3.4330344744766986e-05, + "loss": 0.5374, + "num_tokens": 2943740343.0, + "step": 706 + }, + { + "epoch": 1.4116912315763177, + "grad_norm": 0.2554932395453374, + "learning_rate": 3.431334589864732e-05, + "loss": 0.5343, + "num_tokens": 2947923008.0, + "step": 707 + }, + { + "epoch": 1.4136897327004747, + "grad_norm": 0.2869070949121316, + "learning_rate": 3.4296326386724964e-05, + "loss": 0.5459, + "num_tokens": 2952105054.0, + "step": 708 + }, + { + "epoch": 1.4156882338246315, + "grad_norm": 0.2690638685602334, + "learning_rate": 3.427928623756422e-05, + "loss": 0.5446, + "num_tokens": 2956263027.0, + "step": 709 + }, + { + "epoch": 1.4176867349487883, + "grad_norm": 0.2799002861472886, + "learning_rate": 3.4262225479763995e-05, + "loss": 0.5436, + "num_tokens": 2960447397.0, + "step": 710 + }, + { + "epoch": 1.4196852360729453, + "grad_norm": 0.22699051896447278, + "learning_rate": 3.4245144141957784e-05, + "loss": 0.5329, + "num_tokens": 2964633849.0, + "step": 711 + }, + { + "epoch": 1.4216837371971023, + "grad_norm": 0.28911478694902193, + "learning_rate": 3.422804225281365e-05, + "loss": 0.5459, + "num_tokens": 2968804053.0, + "step": 712 + }, + { + "epoch": 1.423682238321259, + "grad_norm": 0.2102151739240794, + "learning_rate": 3.421091984103413e-05, + "loss": 0.535, + "num_tokens": 2972961761.0, + "step": 713 + }, + { + "epoch": 1.4256807394454158, + "grad_norm": 0.35334453020364476, + "learning_rate": 3.4193776935356195e-05, + "loss": 0.5406, + "num_tokens": 2977131482.0, + "step": 714 + }, + { + "epoch": 1.4276792405695728, + "grad_norm": 0.30666357863424965, + "learning_rate": 3.417661356455122e-05, + "loss": 0.5406, + "num_tokens": 2981316299.0, + "step": 715 + }, + { + "epoch": 1.4296777416937296, + "grad_norm": 0.2728058521350272, + "learning_rate": 3.4159429757424935e-05, + "loss": 0.5436, + "num_tokens": 2985451607.0, + "step": 716 + }, + { + "epoch": 1.4316762428178866, + "grad_norm": 0.31291072955051913, + "learning_rate": 3.414222554281736e-05, + "loss": 0.5414, + "num_tokens": 2989611562.0, + "step": 717 + }, + { + "epoch": 1.4336747439420434, + "grad_norm": 0.21031889727418127, + "learning_rate": 3.412500094960277e-05, + "loss": 0.5406, + "num_tokens": 2993778380.0, + "step": 718 + }, + { + "epoch": 1.4356732450662004, + "grad_norm": 0.29149086144875336, + "learning_rate": 3.4107756006689634e-05, + "loss": 0.5382, + "num_tokens": 2997960995.0, + "step": 719 + }, + { + "epoch": 1.4376717461903572, + "grad_norm": 0.20788649291317107, + "learning_rate": 3.4090490743020575e-05, + "loss": 0.5424, + "num_tokens": 3002145095.0, + "step": 720 + }, + { + "epoch": 1.4396702473145142, + "grad_norm": 0.26994783988837684, + "learning_rate": 3.407320518757234e-05, + "loss": 0.5414, + "num_tokens": 3006329732.0, + "step": 721 + }, + { + "epoch": 1.441668748438671, + "grad_norm": 0.2225904779735929, + "learning_rate": 3.4055899369355714e-05, + "loss": 0.5456, + "num_tokens": 3010514218.0, + "step": 722 + }, + { + "epoch": 1.4436672495628278, + "grad_norm": 0.2546214905560001, + "learning_rate": 3.403857331741548e-05, + "loss": 0.5375, + "num_tokens": 3014625924.0, + "step": 723 + }, + { + "epoch": 1.4456657506869848, + "grad_norm": 0.21887727145304914, + "learning_rate": 3.402122706083041e-05, + "loss": 0.5397, + "num_tokens": 3018813478.0, + "step": 724 + }, + { + "epoch": 1.4476642518111418, + "grad_norm": 0.19902884728563808, + "learning_rate": 3.400386062871316e-05, + "loss": 0.5325, + "num_tokens": 3022942887.0, + "step": 725 + }, + { + "epoch": 1.4496627529352986, + "grad_norm": 0.21124908389341482, + "learning_rate": 3.398647405021026e-05, + "loss": 0.5363, + "num_tokens": 3027127976.0, + "step": 726 + }, + { + "epoch": 1.4516612540594553, + "grad_norm": 0.20612599846629429, + "learning_rate": 3.396906735450205e-05, + "loss": 0.5433, + "num_tokens": 3031314147.0, + "step": 727 + }, + { + "epoch": 1.4536597551836123, + "grad_norm": 0.18472758030287995, + "learning_rate": 3.3951640570802626e-05, + "loss": 0.523, + "num_tokens": 3035476176.0, + "step": 728 + }, + { + "epoch": 1.4556582563077691, + "grad_norm": 0.23120808942594148, + "learning_rate": 3.39341937283598e-05, + "loss": 0.5458, + "num_tokens": 3039661956.0, + "step": 729 + }, + { + "epoch": 1.4576567574319261, + "grad_norm": 0.15209040134113508, + "learning_rate": 3.391672685645507e-05, + "loss": 0.5379, + "num_tokens": 3043848614.0, + "step": 730 + }, + { + "epoch": 1.459655258556083, + "grad_norm": 0.2244793689378212, + "learning_rate": 3.389923998440352e-05, + "loss": 0.5417, + "num_tokens": 3048021732.0, + "step": 731 + }, + { + "epoch": 1.4616537596802397, + "grad_norm": 0.1534466015861308, + "learning_rate": 3.388173314155381e-05, + "loss": 0.5219, + "num_tokens": 3052207306.0, + "step": 732 + }, + { + "epoch": 1.4636522608043967, + "grad_norm": 0.19259042154622014, + "learning_rate": 3.386420635728813e-05, + "loss": 0.5397, + "num_tokens": 3056382961.0, + "step": 733 + }, + { + "epoch": 1.4656507619285537, + "grad_norm": 0.16155318377760625, + "learning_rate": 3.3846659661022124e-05, + "loss": 0.524, + "num_tokens": 3060564174.0, + "step": 734 + }, + { + "epoch": 1.4676492630527105, + "grad_norm": 0.1977675386823489, + "learning_rate": 3.382909308220487e-05, + "loss": 0.5188, + "num_tokens": 3064695575.0, + "step": 735 + }, + { + "epoch": 1.4696477641768673, + "grad_norm": 0.13460535547746427, + "learning_rate": 3.38115066503188e-05, + "loss": 0.5162, + "num_tokens": 3068876011.0, + "step": 736 + }, + { + "epoch": 1.4716462653010243, + "grad_norm": 0.18792047870125067, + "learning_rate": 3.379390039487966e-05, + "loss": 0.5311, + "num_tokens": 3073060536.0, + "step": 737 + }, + { + "epoch": 1.473644766425181, + "grad_norm": 0.19354806913317735, + "learning_rate": 3.377627434543649e-05, + "loss": 0.5525, + "num_tokens": 3077246700.0, + "step": 738 + }, + { + "epoch": 1.475643267549338, + "grad_norm": 0.17075647406492994, + "learning_rate": 3.3758628531571534e-05, + "loss": 0.5142, + "num_tokens": 3081431871.0, + "step": 739 + }, + { + "epoch": 1.4776417686734948, + "grad_norm": 0.1462940911279408, + "learning_rate": 3.374096298290022e-05, + "loss": 0.5448, + "num_tokens": 3085618767.0, + "step": 740 + }, + { + "epoch": 1.4796402697976518, + "grad_norm": 0.16890692275014912, + "learning_rate": 3.372327772907108e-05, + "loss": 0.5166, + "num_tokens": 3089784800.0, + "step": 741 + }, + { + "epoch": 1.4816387709218086, + "grad_norm": 0.1327876893640545, + "learning_rate": 3.3705572799765725e-05, + "loss": 0.5336, + "num_tokens": 3093969530.0, + "step": 742 + }, + { + "epoch": 1.4836372720459656, + "grad_norm": 0.14583518720098712, + "learning_rate": 3.3687848224698804e-05, + "loss": 0.5376, + "num_tokens": 3098139540.0, + "step": 743 + }, + { + "epoch": 1.4856357731701224, + "grad_norm": 0.15222532976557268, + "learning_rate": 3.367010403361791e-05, + "loss": 0.5487, + "num_tokens": 3102323581.0, + "step": 744 + }, + { + "epoch": 1.4876342742942792, + "grad_norm": 0.14444432361443127, + "learning_rate": 3.3652340256303577e-05, + "loss": 0.5342, + "num_tokens": 3106508843.0, + "step": 745 + }, + { + "epoch": 1.4896327754184362, + "grad_norm": 0.15140342748049335, + "learning_rate": 3.363455692256921e-05, + "loss": 0.5515, + "num_tokens": 3110691626.0, + "step": 746 + }, + { + "epoch": 1.491631276542593, + "grad_norm": 0.16682305877523157, + "learning_rate": 3.3616754062261015e-05, + "loss": 0.5327, + "num_tokens": 3114874477.0, + "step": 747 + }, + { + "epoch": 1.49362977766675, + "grad_norm": 0.1659882823101331, + "learning_rate": 3.3598931705258e-05, + "loss": 0.5043, + "num_tokens": 3119056305.0, + "step": 748 + }, + { + "epoch": 1.4956282787909068, + "grad_norm": 0.1593383135319266, + "learning_rate": 3.358108988147189e-05, + "loss": 0.533, + "num_tokens": 3123240687.0, + "step": 749 + }, + { + "epoch": 1.4976267799150638, + "grad_norm": 0.14850530021719002, + "learning_rate": 3.3563228620847063e-05, + "loss": 0.5272, + "num_tokens": 3127426721.0, + "step": 750 + }, + { + "epoch": 1.4996252810392205, + "grad_norm": 0.13739325584827425, + "learning_rate": 3.354534795336052e-05, + "loss": 0.5236, + "num_tokens": 3131610815.0, + "step": 751 + }, + { + "epoch": 1.5016237821633776, + "grad_norm": 0.15888745347504543, + "learning_rate": 3.3527447909021856e-05, + "loss": 0.528, + "num_tokens": 3135763789.0, + "step": 752 + }, + { + "epoch": 1.5036222832875343, + "grad_norm": 0.15971442897467153, + "learning_rate": 3.350952851787317e-05, + "loss": 0.5235, + "num_tokens": 3139948746.0, + "step": 753 + }, + { + "epoch": 1.5056207844116911, + "grad_norm": 0.13321165263723092, + "learning_rate": 3.3491589809989025e-05, + "loss": 0.5322, + "num_tokens": 3144109073.0, + "step": 754 + }, + { + "epoch": 1.5076192855358481, + "grad_norm": 0.1361572403988464, + "learning_rate": 3.347363181547642e-05, + "loss": 0.534, + "num_tokens": 3148265386.0, + "step": 755 + }, + { + "epoch": 1.5096177866600051, + "grad_norm": 0.14002985725474024, + "learning_rate": 3.345565456447471e-05, + "loss": 0.5341, + "num_tokens": 3152448885.0, + "step": 756 + }, + { + "epoch": 1.511616287784162, + "grad_norm": 0.12097496675814558, + "learning_rate": 3.343765808715558e-05, + "loss": 0.5485, + "num_tokens": 3156633416.0, + "step": 757 + }, + { + "epoch": 1.5136147889083187, + "grad_norm": 0.14358587016487076, + "learning_rate": 3.341964241372297e-05, + "loss": 0.5366, + "num_tokens": 3160797975.0, + "step": 758 + }, + { + "epoch": 1.5156132900324757, + "grad_norm": 0.13861365958498134, + "learning_rate": 3.340160757441306e-05, + "loss": 0.5464, + "num_tokens": 3164953105.0, + "step": 759 + }, + { + "epoch": 1.5176117911566325, + "grad_norm": 0.1486843955663931, + "learning_rate": 3.338355359949416e-05, + "loss": 0.5429, + "num_tokens": 3169126725.0, + "step": 760 + }, + { + "epoch": 1.5196102922807895, + "grad_norm": 0.15195050337033714, + "learning_rate": 3.3365480519266734e-05, + "loss": 0.5334, + "num_tokens": 3173301183.0, + "step": 761 + }, + { + "epoch": 1.5216087934049463, + "grad_norm": 0.16131057949383662, + "learning_rate": 3.334738836406327e-05, + "loss": 0.5198, + "num_tokens": 3177464664.0, + "step": 762 + }, + { + "epoch": 1.523607294529103, + "grad_norm": 0.17144379904344276, + "learning_rate": 3.332927716424833e-05, + "loss": 0.5264, + "num_tokens": 3181651080.0, + "step": 763 + }, + { + "epoch": 1.52560579565326, + "grad_norm": 0.1506741611396248, + "learning_rate": 3.331114695021838e-05, + "loss": 0.5332, + "num_tokens": 3185836006.0, + "step": 764 + }, + { + "epoch": 1.527604296777417, + "grad_norm": 0.13979818821294218, + "learning_rate": 3.3292997752401816e-05, + "loss": 0.5433, + "num_tokens": 3190021961.0, + "step": 765 + }, + { + "epoch": 1.5296027979015738, + "grad_norm": 0.14821605325234133, + "learning_rate": 3.327482960125892e-05, + "loss": 0.5321, + "num_tokens": 3194167294.0, + "step": 766 + }, + { + "epoch": 1.5316012990257306, + "grad_norm": 0.16211000173201034, + "learning_rate": 3.325664252728174e-05, + "loss": 0.5345, + "num_tokens": 3198324700.0, + "step": 767 + }, + { + "epoch": 1.5335998001498876, + "grad_norm": 0.140789644798288, + "learning_rate": 3.3238436560994124e-05, + "loss": 0.5422, + "num_tokens": 3202508216.0, + "step": 768 + }, + { + "epoch": 1.5355983012740446, + "grad_norm": 0.15954957397922745, + "learning_rate": 3.322021173295161e-05, + "loss": 0.5314, + "num_tokens": 3206680398.0, + "step": 769 + }, + { + "epoch": 1.5375968023982014, + "grad_norm": 0.1344129454539413, + "learning_rate": 3.320196807374138e-05, + "loss": 0.5417, + "num_tokens": 3210864436.0, + "step": 770 + }, + { + "epoch": 1.5395953035223582, + "grad_norm": 0.14893771224254299, + "learning_rate": 3.318370561398224e-05, + "loss": 0.5349, + "num_tokens": 3215021016.0, + "step": 771 + }, + { + "epoch": 1.541593804646515, + "grad_norm": 0.14718003700488197, + "learning_rate": 3.3165424384324535e-05, + "loss": 0.5442, + "num_tokens": 3219174304.0, + "step": 772 + }, + { + "epoch": 1.543592305770672, + "grad_norm": 0.1770237908257867, + "learning_rate": 3.314712441545014e-05, + "loss": 0.5376, + "num_tokens": 3223300353.0, + "step": 773 + }, + { + "epoch": 1.545590806894829, + "grad_norm": 0.17746814568727926, + "learning_rate": 3.312880573807233e-05, + "loss": 0.5349, + "num_tokens": 3227464298.0, + "step": 774 + }, + { + "epoch": 1.5475893080189858, + "grad_norm": 0.16249266242248428, + "learning_rate": 3.311046838293584e-05, + "loss": 0.5275, + "num_tokens": 3231634345.0, + "step": 775 + }, + { + "epoch": 1.5495878091431425, + "grad_norm": 0.19464406469420673, + "learning_rate": 3.3092112380816696e-05, + "loss": 0.5445, + "num_tokens": 3235819585.0, + "step": 776 + }, + { + "epoch": 1.5515863102672995, + "grad_norm": 0.17902707208814247, + "learning_rate": 3.307373776252225e-05, + "loss": 0.5389, + "num_tokens": 3239966499.0, + "step": 777 + }, + { + "epoch": 1.5535848113914565, + "grad_norm": 0.15490279638571913, + "learning_rate": 3.3055344558891104e-05, + "loss": 0.5309, + "num_tokens": 3244152438.0, + "step": 778 + }, + { + "epoch": 1.5555833125156133, + "grad_norm": 0.13958554413877844, + "learning_rate": 3.3036932800793035e-05, + "loss": 0.529, + "num_tokens": 3248329440.0, + "step": 779 + }, + { + "epoch": 1.55758181363977, + "grad_norm": 0.18113875795603132, + "learning_rate": 3.3018502519128964e-05, + "loss": 0.537, + "num_tokens": 3252481437.0, + "step": 780 + }, + { + "epoch": 1.559580314763927, + "grad_norm": 0.17027080088212254, + "learning_rate": 3.300005374483091e-05, + "loss": 0.5322, + "num_tokens": 3256666009.0, + "step": 781 + }, + { + "epoch": 1.561578815888084, + "grad_norm": 0.15238813771604415, + "learning_rate": 3.298158650886191e-05, + "loss": 0.5418, + "num_tokens": 3260849463.0, + "step": 782 + }, + { + "epoch": 1.563577317012241, + "grad_norm": 0.12496286833264052, + "learning_rate": 3.2963100842216e-05, + "loss": 0.5319, + "num_tokens": 3264987948.0, + "step": 783 + }, + { + "epoch": 1.5655758181363977, + "grad_norm": 0.1656577931201619, + "learning_rate": 3.294459677591816e-05, + "loss": 0.5315, + "num_tokens": 3269171365.0, + "step": 784 + }, + { + "epoch": 1.5675743192605545, + "grad_norm": 0.15716619069661478, + "learning_rate": 3.292607434102422e-05, + "loss": 0.5294, + "num_tokens": 3273329349.0, + "step": 785 + }, + { + "epoch": 1.5695728203847115, + "grad_norm": 0.15602226771762356, + "learning_rate": 3.290753356862086e-05, + "loss": 0.5339, + "num_tokens": 3277517410.0, + "step": 786 + }, + { + "epoch": 1.5715713215088685, + "grad_norm": 0.1459443577595303, + "learning_rate": 3.2888974489825545e-05, + "loss": 0.5432, + "num_tokens": 3281701904.0, + "step": 787 + }, + { + "epoch": 1.5735698226330253, + "grad_norm": 0.18086649802259785, + "learning_rate": 3.287039713578643e-05, + "loss": 0.5292, + "num_tokens": 3285885663.0, + "step": 788 + }, + { + "epoch": 1.575568323757182, + "grad_norm": 0.13923851911626456, + "learning_rate": 3.285180153768238e-05, + "loss": 0.5471, + "num_tokens": 3290069709.0, + "step": 789 + }, + { + "epoch": 1.577566824881339, + "grad_norm": 0.13320629688932917, + "learning_rate": 3.283318772672284e-05, + "loss": 0.553, + "num_tokens": 3294229157.0, + "step": 790 + }, + { + "epoch": 1.579565326005496, + "grad_norm": 0.13409944980548694, + "learning_rate": 3.281455573414787e-05, + "loss": 0.5387, + "num_tokens": 3298408495.0, + "step": 791 + }, + { + "epoch": 1.5815638271296528, + "grad_norm": 0.14482180280393073, + "learning_rate": 3.2795905591228004e-05, + "loss": 0.5289, + "num_tokens": 3302590578.0, + "step": 792 + }, + { + "epoch": 1.5835623282538096, + "grad_norm": 0.15433413268080523, + "learning_rate": 3.277723732926425e-05, + "loss": 0.5348, + "num_tokens": 3306771376.0, + "step": 793 + }, + { + "epoch": 1.5855608293779664, + "grad_norm": 0.14073136246698495, + "learning_rate": 3.275855097958803e-05, + "loss": 0.5434, + "num_tokens": 3310921018.0, + "step": 794 + }, + { + "epoch": 1.5875593305021234, + "grad_norm": 0.1427751427794272, + "learning_rate": 3.273984657356113e-05, + "loss": 0.5434, + "num_tokens": 3315092308.0, + "step": 795 + }, + { + "epoch": 1.5895578316262804, + "grad_norm": 0.1313474678354926, + "learning_rate": 3.272112414257563e-05, + "loss": 0.5262, + "num_tokens": 3319278184.0, + "step": 796 + }, + { + "epoch": 1.5915563327504372, + "grad_norm": 0.15384654547181845, + "learning_rate": 3.2702383718053865e-05, + "loss": 0.5221, + "num_tokens": 3323437732.0, + "step": 797 + }, + { + "epoch": 1.593554833874594, + "grad_norm": 0.13299497933142726, + "learning_rate": 3.268362533144835e-05, + "loss": 0.5371, + "num_tokens": 3327620829.0, + "step": 798 + }, + { + "epoch": 1.595553334998751, + "grad_norm": 0.1385293470474629, + "learning_rate": 3.266484901424178e-05, + "loss": 0.5378, + "num_tokens": 3331804914.0, + "step": 799 + }, + { + "epoch": 1.597551836122908, + "grad_norm": 0.13749123008809777, + "learning_rate": 3.2646054797946934e-05, + "loss": 0.5252, + "num_tokens": 3335942340.0, + "step": 800 + }, + { + "epoch": 1.5995503372470647, + "grad_norm": 0.14611814146114052, + "learning_rate": 3.262724271410661e-05, + "loss": 0.5349, + "num_tokens": 3340125153.0, + "step": 801 + }, + { + "epoch": 1.6015488383712215, + "grad_norm": 0.15764938305030837, + "learning_rate": 3.260841279429361e-05, + "loss": 0.5193, + "num_tokens": 3344311157.0, + "step": 802 + }, + { + "epoch": 1.6035473394953783, + "grad_norm": 0.12571619289907465, + "learning_rate": 3.258956507011069e-05, + "loss": 0.5253, + "num_tokens": 3348495238.0, + "step": 803 + }, + { + "epoch": 1.6055458406195353, + "grad_norm": 0.12931928780633312, + "learning_rate": 3.2570699573190435e-05, + "loss": 0.5178, + "num_tokens": 3352681156.0, + "step": 804 + }, + { + "epoch": 1.6075443417436923, + "grad_norm": 0.14137241114058924, + "learning_rate": 3.2551816335195304e-05, + "loss": 0.5238, + "num_tokens": 3356825535.0, + "step": 805 + }, + { + "epoch": 1.609542842867849, + "grad_norm": 0.13808313640057787, + "learning_rate": 3.253291538781752e-05, + "loss": 0.5349, + "num_tokens": 3361001110.0, + "step": 806 + }, + { + "epoch": 1.6115413439920059, + "grad_norm": 0.14034377587621064, + "learning_rate": 3.251399676277903e-05, + "loss": 0.5297, + "num_tokens": 3365133429.0, + "step": 807 + }, + { + "epoch": 1.6135398451161629, + "grad_norm": 0.15323804286881654, + "learning_rate": 3.249506049183143e-05, + "loss": 0.5443, + "num_tokens": 3369319708.0, + "step": 808 + }, + { + "epoch": 1.61553834624032, + "grad_norm": 0.14723002407167393, + "learning_rate": 3.247610660675596e-05, + "loss": 0.5291, + "num_tokens": 3373497825.0, + "step": 809 + }, + { + "epoch": 1.6175368473644767, + "grad_norm": 0.13563911386323982, + "learning_rate": 3.245713513936341e-05, + "loss": 0.5421, + "num_tokens": 3377680203.0, + "step": 810 + }, + { + "epoch": 1.6195353484886335, + "grad_norm": 0.16155918502315506, + "learning_rate": 3.2438146121494065e-05, + "loss": 0.5336, + "num_tokens": 3381862405.0, + "step": 811 + }, + { + "epoch": 1.6215338496127905, + "grad_norm": 0.13215669616777126, + "learning_rate": 3.241913958501768e-05, + "loss": 0.5372, + "num_tokens": 3386044860.0, + "step": 812 + }, + { + "epoch": 1.6235323507369472, + "grad_norm": 0.1511604312995185, + "learning_rate": 3.2400115561833426e-05, + "loss": 0.5406, + "num_tokens": 3390231719.0, + "step": 813 + }, + { + "epoch": 1.6255308518611042, + "grad_norm": 0.15409216374478052, + "learning_rate": 3.238107408386979e-05, + "loss": 0.5401, + "num_tokens": 3394374713.0, + "step": 814 + }, + { + "epoch": 1.627529352985261, + "grad_norm": 0.1500289086495232, + "learning_rate": 3.236201518308458e-05, + "loss": 0.5326, + "num_tokens": 3398535996.0, + "step": 815 + }, + { + "epoch": 1.6295278541094178, + "grad_norm": 0.1583241281829279, + "learning_rate": 3.234293889146483e-05, + "loss": 0.5388, + "num_tokens": 3402723311.0, + "step": 816 + }, + { + "epoch": 1.6315263552335748, + "grad_norm": 0.15700102944782507, + "learning_rate": 3.2323845241026776e-05, + "loss": 0.5246, + "num_tokens": 3406892926.0, + "step": 817 + }, + { + "epoch": 1.6335248563577318, + "grad_norm": 0.19101442285828182, + "learning_rate": 3.2304734263815766e-05, + "loss": 0.5253, + "num_tokens": 3411061743.0, + "step": 818 + }, + { + "epoch": 1.6355233574818886, + "grad_norm": 0.18707867659298466, + "learning_rate": 3.228560599190625e-05, + "loss": 0.5604, + "num_tokens": 3415208226.0, + "step": 819 + }, + { + "epoch": 1.6375218586060454, + "grad_norm": 0.16353073735962917, + "learning_rate": 3.226646045740169e-05, + "loss": 0.515, + "num_tokens": 3419394884.0, + "step": 820 + }, + { + "epoch": 1.6395203597302024, + "grad_norm": 0.1464826766426585, + "learning_rate": 3.224729769243453e-05, + "loss": 0.5288, + "num_tokens": 3423529653.0, + "step": 821 + }, + { + "epoch": 1.6415188608543594, + "grad_norm": 0.17831760845323547, + "learning_rate": 3.222811772916612e-05, + "loss": 0.548, + "num_tokens": 3427716126.0, + "step": 822 + }, + { + "epoch": 1.6435173619785162, + "grad_norm": 0.1908167035916003, + "learning_rate": 3.22089205997867e-05, + "loss": 0.5343, + "num_tokens": 3431903037.0, + "step": 823 + }, + { + "epoch": 1.645515863102673, + "grad_norm": 0.14832071877510056, + "learning_rate": 3.218970633651528e-05, + "loss": 0.5259, + "num_tokens": 3436044527.0, + "step": 824 + }, + { + "epoch": 1.6475143642268297, + "grad_norm": 0.17113763536581567, + "learning_rate": 3.2170474971599654e-05, + "loss": 0.5286, + "num_tokens": 3440227372.0, + "step": 825 + }, + { + "epoch": 1.6495128653509867, + "grad_norm": 0.12965763158442112, + "learning_rate": 3.2151226537316315e-05, + "loss": 0.5213, + "num_tokens": 3444414003.0, + "step": 826 + }, + { + "epoch": 1.6515113664751437, + "grad_norm": 0.15259800255333686, + "learning_rate": 3.213196106597041e-05, + "loss": 0.5336, + "num_tokens": 3448598982.0, + "step": 827 + }, + { + "epoch": 1.6535098675993005, + "grad_norm": 0.13452704396859363, + "learning_rate": 3.211267858989566e-05, + "loss": 0.5243, + "num_tokens": 3452756703.0, + "step": 828 + }, + { + "epoch": 1.6555083687234573, + "grad_norm": 0.17517084524796359, + "learning_rate": 3.209337914145434e-05, + "loss": 0.5282, + "num_tokens": 3456942182.0, + "step": 829 + }, + { + "epoch": 1.6575068698476143, + "grad_norm": 0.14405643468929474, + "learning_rate": 3.2074062753037214e-05, + "loss": 0.5336, + "num_tokens": 3461099270.0, + "step": 830 + }, + { + "epoch": 1.6595053709717713, + "grad_norm": 0.1643075509939601, + "learning_rate": 3.205472945706345e-05, + "loss": 0.5188, + "num_tokens": 3465279271.0, + "step": 831 + }, + { + "epoch": 1.661503872095928, + "grad_norm": 0.17422819300914535, + "learning_rate": 3.2035379285980635e-05, + "loss": 0.5282, + "num_tokens": 3469458240.0, + "step": 832 + }, + { + "epoch": 1.6635023732200849, + "grad_norm": 0.15271950698093725, + "learning_rate": 3.2016012272264646e-05, + "loss": 0.5285, + "num_tokens": 3473584614.0, + "step": 833 + }, + { + "epoch": 1.6655008743442417, + "grad_norm": 0.15008445824779457, + "learning_rate": 3.199662844841963e-05, + "loss": 0.547, + "num_tokens": 3477762155.0, + "step": 834 + }, + { + "epoch": 1.6674993754683987, + "grad_norm": 0.13634926048776133, + "learning_rate": 3.1977227846977954e-05, + "loss": 0.5181, + "num_tokens": 3481947010.0, + "step": 835 + }, + { + "epoch": 1.6694978765925557, + "grad_norm": 0.15452857219691793, + "learning_rate": 3.1957810500500156e-05, + "loss": 0.5242, + "num_tokens": 3486096452.0, + "step": 836 + }, + { + "epoch": 1.6714963777167124, + "grad_norm": 0.14675813909634358, + "learning_rate": 3.193837644157486e-05, + "loss": 0.5261, + "num_tokens": 3490280512.0, + "step": 837 + }, + { + "epoch": 1.6734948788408692, + "grad_norm": 0.1555894296878722, + "learning_rate": 3.191892570281872e-05, + "loss": 0.5317, + "num_tokens": 3494451873.0, + "step": 838 + }, + { + "epoch": 1.6754933799650262, + "grad_norm": 0.18119952145315954, + "learning_rate": 3.1899458316876435e-05, + "loss": 0.5335, + "num_tokens": 3498635782.0, + "step": 839 + }, + { + "epoch": 1.6774918810891832, + "grad_norm": 0.13872861136169595, + "learning_rate": 3.187997431642061e-05, + "loss": 0.54, + "num_tokens": 3502820747.0, + "step": 840 + }, + { + "epoch": 1.67949038221334, + "grad_norm": 0.18353617267316755, + "learning_rate": 3.1860473734151737e-05, + "loss": 0.5199, + "num_tokens": 3507004210.0, + "step": 841 + }, + { + "epoch": 1.6814888833374968, + "grad_norm": 0.17241435374930097, + "learning_rate": 3.1840956602798134e-05, + "loss": 0.5151, + "num_tokens": 3511171539.0, + "step": 842 + }, + { + "epoch": 1.6834873844616538, + "grad_norm": 0.18668153425846076, + "learning_rate": 3.182142295511592e-05, + "loss": 0.5096, + "num_tokens": 3515358117.0, + "step": 843 + }, + { + "epoch": 1.6854858855858108, + "grad_norm": 0.15384323970094196, + "learning_rate": 3.1801872823888885e-05, + "loss": 0.5443, + "num_tokens": 3519519555.0, + "step": 844 + }, + { + "epoch": 1.6874843867099676, + "grad_norm": 0.19656935179480048, + "learning_rate": 3.178230624192854e-05, + "loss": 0.5442, + "num_tokens": 3523706603.0, + "step": 845 + }, + { + "epoch": 1.6894828878341244, + "grad_norm": 0.17970184911713014, + "learning_rate": 3.176272324207396e-05, + "loss": 0.5348, + "num_tokens": 3527888018.0, + "step": 846 + }, + { + "epoch": 1.6914813889582812, + "grad_norm": 0.12691409565747744, + "learning_rate": 3.174312385719181e-05, + "loss": 0.5381, + "num_tokens": 3532072538.0, + "step": 847 + }, + { + "epoch": 1.6934798900824382, + "grad_norm": 0.21396490914341537, + "learning_rate": 3.172350812017623e-05, + "loss": 0.5293, + "num_tokens": 3536240479.0, + "step": 848 + }, + { + "epoch": 1.6954783912065952, + "grad_norm": 0.15299288290245397, + "learning_rate": 3.1703876063948806e-05, + "loss": 0.5412, + "num_tokens": 3540426268.0, + "step": 849 + }, + { + "epoch": 1.697476892330752, + "grad_norm": 0.18762072499499766, + "learning_rate": 3.168422772145853e-05, + "loss": 0.5375, + "num_tokens": 3544611690.0, + "step": 850 + }, + { + "epoch": 1.6994753934549087, + "grad_norm": 0.16787310365940641, + "learning_rate": 3.166456312568171e-05, + "loss": 0.5267, + "num_tokens": 3548765086.0, + "step": 851 + }, + { + "epoch": 1.7014738945790657, + "grad_norm": 0.15069845036099602, + "learning_rate": 3.164488230962194e-05, + "loss": 0.5338, + "num_tokens": 3552948327.0, + "step": 852 + }, + { + "epoch": 1.7034723957032227, + "grad_norm": 0.1406578357486688, + "learning_rate": 3.162518530631004e-05, + "loss": 0.5153, + "num_tokens": 3557122448.0, + "step": 853 + }, + { + "epoch": 1.7054708968273795, + "grad_norm": 0.15952371318155179, + "learning_rate": 3.160547214880398e-05, + "loss": 0.5217, + "num_tokens": 3561305720.0, + "step": 854 + }, + { + "epoch": 1.7074693979515363, + "grad_norm": 0.12795215976882596, + "learning_rate": 3.158574287018888e-05, + "loss": 0.5316, + "num_tokens": 3565488743.0, + "step": 855 + }, + { + "epoch": 1.709467899075693, + "grad_norm": 0.15682853331285299, + "learning_rate": 3.156599750357687e-05, + "loss": 0.53, + "num_tokens": 3569644191.0, + "step": 856 + }, + { + "epoch": 1.71146640019985, + "grad_norm": 0.15105540697131017, + "learning_rate": 3.154623608210711e-05, + "loss": 0.5261, + "num_tokens": 3573830745.0, + "step": 857 + }, + { + "epoch": 1.713464901324007, + "grad_norm": 0.15066625892217364, + "learning_rate": 3.15264586389457e-05, + "loss": 0.5341, + "num_tokens": 3577989698.0, + "step": 858 + }, + { + "epoch": 1.7154634024481639, + "grad_norm": 0.17341348154838568, + "learning_rate": 3.150666520728562e-05, + "loss": 0.5241, + "num_tokens": 3582174860.0, + "step": 859 + }, + { + "epoch": 1.7174619035723206, + "grad_norm": 0.1502511967648011, + "learning_rate": 3.14868558203467e-05, + "loss": 0.5128, + "num_tokens": 3586358695.0, + "step": 860 + }, + { + "epoch": 1.7194604046964777, + "grad_norm": 0.14497390855280992, + "learning_rate": 3.146703051137553e-05, + "loss": 0.5324, + "num_tokens": 3590538296.0, + "step": 861 + }, + { + "epoch": 1.7214589058206347, + "grad_norm": 0.1683860418725973, + "learning_rate": 3.144718931364545e-05, + "loss": 0.5367, + "num_tokens": 3594713958.0, + "step": 862 + }, + { + "epoch": 1.7234574069447914, + "grad_norm": 0.14281961256857562, + "learning_rate": 3.142733226045643e-05, + "loss": 0.5389, + "num_tokens": 3598896016.0, + "step": 863 + }, + { + "epoch": 1.7254559080689482, + "grad_norm": 0.18163886066130847, + "learning_rate": 3.1407459385135075e-05, + "loss": 0.5326, + "num_tokens": 3603082591.0, + "step": 864 + }, + { + "epoch": 1.7274544091931052, + "grad_norm": 0.14459035911064683, + "learning_rate": 3.138757072103455e-05, + "loss": 0.5393, + "num_tokens": 3607261306.0, + "step": 865 + }, + { + "epoch": 1.729452910317262, + "grad_norm": 0.14754040218923906, + "learning_rate": 3.136766630153449e-05, + "loss": 0.5185, + "num_tokens": 3611440113.0, + "step": 866 + }, + { + "epoch": 1.731451411441419, + "grad_norm": 0.14550615532525696, + "learning_rate": 3.134774616004099e-05, + "loss": 0.5628, + "num_tokens": 3615624497.0, + "step": 867 + }, + { + "epoch": 1.7334499125655758, + "grad_norm": 0.1476962661517269, + "learning_rate": 3.132781032998655e-05, + "loss": 0.5191, + "num_tokens": 3619808701.0, + "step": 868 + }, + { + "epoch": 1.7354484136897326, + "grad_norm": 0.14619777662002523, + "learning_rate": 3.1307858844829974e-05, + "loss": 0.5226, + "num_tokens": 3623970670.0, + "step": 869 + }, + { + "epoch": 1.7374469148138896, + "grad_norm": 0.14248004284659824, + "learning_rate": 3.1287891738056336e-05, + "loss": 0.5481, + "num_tokens": 3628158971.0, + "step": 870 + }, + { + "epoch": 1.7394454159380466, + "grad_norm": 0.14837993875331823, + "learning_rate": 3.1267909043176966e-05, + "loss": 0.5292, + "num_tokens": 3632343906.0, + "step": 871 + }, + { + "epoch": 1.7414439170622034, + "grad_norm": 0.15188990896107107, + "learning_rate": 3.124791079372931e-05, + "loss": 0.5331, + "num_tokens": 3636508738.0, + "step": 872 + }, + { + "epoch": 1.7434424181863601, + "grad_norm": 0.13850340921230345, + "learning_rate": 3.1227897023276945e-05, + "loss": 0.5292, + "num_tokens": 3640681942.0, + "step": 873 + }, + { + "epoch": 1.7454409193105171, + "grad_norm": 0.1527511207312414, + "learning_rate": 3.12078677654095e-05, + "loss": 0.5249, + "num_tokens": 3644836975.0, + "step": 874 + }, + { + "epoch": 1.7474394204346742, + "grad_norm": 0.15790596101364213, + "learning_rate": 3.118782305374257e-05, + "loss": 0.5323, + "num_tokens": 3648979033.0, + "step": 875 + }, + { + "epoch": 1.749437921558831, + "grad_norm": 0.16129763840134664, + "learning_rate": 3.116776292191774e-05, + "loss": 0.5425, + "num_tokens": 3653164987.0, + "step": 876 + }, + { + "epoch": 1.7514364226829877, + "grad_norm": 0.15627055555131417, + "learning_rate": 3.114768740360241e-05, + "loss": 0.5187, + "num_tokens": 3657350000.0, + "step": 877 + }, + { + "epoch": 1.7534349238071445, + "grad_norm": 0.17530803746689888, + "learning_rate": 3.112759653248985e-05, + "loss": 0.526, + "num_tokens": 3661535792.0, + "step": 878 + }, + { + "epoch": 1.7554334249313015, + "grad_norm": 0.16646783579192315, + "learning_rate": 3.110749034229908e-05, + "loss": 0.5385, + "num_tokens": 3665690694.0, + "step": 879 + }, + { + "epoch": 1.7574319260554585, + "grad_norm": 0.18362595129260045, + "learning_rate": 3.108736886677483e-05, + "loss": 0.5186, + "num_tokens": 3669819825.0, + "step": 880 + }, + { + "epoch": 1.7594304271796153, + "grad_norm": 0.18895220770440066, + "learning_rate": 3.106723213968748e-05, + "loss": 0.5386, + "num_tokens": 3673954806.0, + "step": 881 + }, + { + "epoch": 1.761428928303772, + "grad_norm": 0.17888783212605994, + "learning_rate": 3.1047080194833016e-05, + "loss": 0.5253, + "num_tokens": 3678140482.0, + "step": 882 + }, + { + "epoch": 1.763427429427929, + "grad_norm": 0.2269450007163213, + "learning_rate": 3.1026913066032976e-05, + "loss": 0.5365, + "num_tokens": 3682327400.0, + "step": 883 + }, + { + "epoch": 1.765425930552086, + "grad_norm": 0.16637271996705125, + "learning_rate": 3.100673078713435e-05, + "loss": 0.5125, + "num_tokens": 3686513372.0, + "step": 884 + }, + { + "epoch": 1.7674244316762429, + "grad_norm": 0.13817005582122155, + "learning_rate": 3.098653339200958e-05, + "loss": 0.52, + "num_tokens": 3690698415.0, + "step": 885 + }, + { + "epoch": 1.7694229328003996, + "grad_norm": 0.15359822779260038, + "learning_rate": 3.096632091455647e-05, + "loss": 0.5219, + "num_tokens": 3694855465.0, + "step": 886 + }, + { + "epoch": 1.7714214339245564, + "grad_norm": 0.14619810882817905, + "learning_rate": 3.094609338869813e-05, + "loss": 0.5346, + "num_tokens": 3699006494.0, + "step": 887 + }, + { + "epoch": 1.7734199350487134, + "grad_norm": 0.17117957109137738, + "learning_rate": 3.092585084838294e-05, + "loss": 0.5363, + "num_tokens": 3703192441.0, + "step": 888 + }, + { + "epoch": 1.7754184361728704, + "grad_norm": 0.1756167212301527, + "learning_rate": 3.090559332758446e-05, + "loss": 0.5168, + "num_tokens": 3707346088.0, + "step": 889 + }, + { + "epoch": 1.7774169372970272, + "grad_norm": 0.15910247523887247, + "learning_rate": 3.088532086030142e-05, + "loss": 0.5356, + "num_tokens": 3711516057.0, + "step": 890 + }, + { + "epoch": 1.779415438421184, + "grad_norm": 0.16248247817078162, + "learning_rate": 3.086503348055761e-05, + "loss": 0.5241, + "num_tokens": 3715699801.0, + "step": 891 + }, + { + "epoch": 1.781413939545341, + "grad_norm": 0.17345900061967892, + "learning_rate": 3.084473122240186e-05, + "loss": 0.5263, + "num_tokens": 3719885457.0, + "step": 892 + }, + { + "epoch": 1.783412440669498, + "grad_norm": 0.14124705045401392, + "learning_rate": 3.082441411990797e-05, + "loss": 0.5254, + "num_tokens": 3724046993.0, + "step": 893 + }, + { + "epoch": 1.7854109417936548, + "grad_norm": 0.17809639608138275, + "learning_rate": 3.0804082207174646e-05, + "loss": 0.5182, + "num_tokens": 3728225568.0, + "step": 894 + }, + { + "epoch": 1.7874094429178116, + "grad_norm": 0.15229210552684172, + "learning_rate": 3.0783735518325465e-05, + "loss": 0.5377, + "num_tokens": 3732382456.0, + "step": 895 + }, + { + "epoch": 1.7894079440419686, + "grad_norm": 0.15824461838441917, + "learning_rate": 3.0763374087508776e-05, + "loss": 0.5321, + "num_tokens": 3736567925.0, + "step": 896 + }, + { + "epoch": 1.7914064451661256, + "grad_norm": 0.18383145548923005, + "learning_rate": 3.074299794889771e-05, + "loss": 0.5274, + "num_tokens": 3740726151.0, + "step": 897 + }, + { + "epoch": 1.7934049462902824, + "grad_norm": 0.18357339521050006, + "learning_rate": 3.0722607136690046e-05, + "loss": 0.5226, + "num_tokens": 3744909062.0, + "step": 898 + }, + { + "epoch": 1.7954034474144391, + "grad_norm": 0.1495496854798841, + "learning_rate": 3.070220168510821e-05, + "loss": 0.5485, + "num_tokens": 3749095444.0, + "step": 899 + }, + { + "epoch": 1.797401948538596, + "grad_norm": 0.19015628416878827, + "learning_rate": 3.06817816283992e-05, + "loss": 0.5362, + "num_tokens": 3753280863.0, + "step": 900 + }, + { + "epoch": 1.799400449662753, + "grad_norm": 0.18904937603378233, + "learning_rate": 3.0661347000834496e-05, + "loss": 0.5252, + "num_tokens": 3757466848.0, + "step": 901 + }, + { + "epoch": 1.80139895078691, + "grad_norm": 0.14841015126125265, + "learning_rate": 3.0640897836710074e-05, + "loss": 0.5354, + "num_tokens": 3761649485.0, + "step": 902 + }, + { + "epoch": 1.8033974519110667, + "grad_norm": 0.18368230227701243, + "learning_rate": 3.0620434170346276e-05, + "loss": 0.5372, + "num_tokens": 3765834637.0, + "step": 903 + }, + { + "epoch": 1.8053959530352235, + "grad_norm": 0.19079929238104532, + "learning_rate": 3.0599956036087816e-05, + "loss": 0.5291, + "num_tokens": 3770021126.0, + "step": 904 + }, + { + "epoch": 1.8073944541593805, + "grad_norm": 0.13371104941588838, + "learning_rate": 3.057946346830365e-05, + "loss": 0.5303, + "num_tokens": 3774183928.0, + "step": 905 + }, + { + "epoch": 1.8093929552835375, + "grad_norm": 0.17568634995090215, + "learning_rate": 3.055895650138698e-05, + "loss": 0.5243, + "num_tokens": 3778368988.0, + "step": 906 + }, + { + "epoch": 1.8113914564076943, + "grad_norm": 0.17840191641048195, + "learning_rate": 3.0538435169755175e-05, + "loss": 0.5329, + "num_tokens": 3782529318.0, + "step": 907 + }, + { + "epoch": 1.813389957531851, + "grad_norm": 0.15002660938665635, + "learning_rate": 3.0517899507849715e-05, + "loss": 0.551, + "num_tokens": 3786714422.0, + "step": 908 + }, + { + "epoch": 1.8153884586560078, + "grad_norm": 0.1818387358357631, + "learning_rate": 3.0497349550136116e-05, + "loss": 0.5272, + "num_tokens": 3790900106.0, + "step": 909 + }, + { + "epoch": 1.8173869597801648, + "grad_norm": 0.1615750338557447, + "learning_rate": 3.0476785331103903e-05, + "loss": 0.5225, + "num_tokens": 3795074701.0, + "step": 910 + }, + { + "epoch": 1.8193854609043218, + "grad_norm": 0.19179243874244412, + "learning_rate": 3.0456206885266525e-05, + "loss": 0.5272, + "num_tokens": 3799260830.0, + "step": 911 + }, + { + "epoch": 1.8213839620284786, + "grad_norm": 0.18196129966337476, + "learning_rate": 3.0435614247161323e-05, + "loss": 0.5284, + "num_tokens": 3803446813.0, + "step": 912 + }, + { + "epoch": 1.8233824631526354, + "grad_norm": 0.18134055066983687, + "learning_rate": 3.041500745134943e-05, + "loss": 0.5513, + "num_tokens": 3807630030.0, + "step": 913 + }, + { + "epoch": 1.8253809642767924, + "grad_norm": 0.20311211478052715, + "learning_rate": 3.0394386532415766e-05, + "loss": 0.5263, + "num_tokens": 3811768869.0, + "step": 914 + }, + { + "epoch": 1.8273794654009494, + "grad_norm": 0.15044022511332186, + "learning_rate": 3.037375152496895e-05, + "loss": 0.5257, + "num_tokens": 3815954430.0, + "step": 915 + }, + { + "epoch": 1.8293779665251062, + "grad_norm": 0.24071522199239043, + "learning_rate": 3.0353102463641235e-05, + "loss": 0.5116, + "num_tokens": 3820107698.0, + "step": 916 + }, + { + "epoch": 1.831376467649263, + "grad_norm": 0.1617354717831046, + "learning_rate": 3.033243938308847e-05, + "loss": 0.5473, + "num_tokens": 3824292997.0, + "step": 917 + }, + { + "epoch": 1.83337496877342, + "grad_norm": 0.1679074536526969, + "learning_rate": 3.0311762317990032e-05, + "loss": 0.5113, + "num_tokens": 3828476291.0, + "step": 918 + }, + { + "epoch": 1.8353734698975768, + "grad_norm": 0.212599383233638, + "learning_rate": 3.029107130304877e-05, + "loss": 0.5269, + "num_tokens": 3832659575.0, + "step": 919 + }, + { + "epoch": 1.8373719710217338, + "grad_norm": 0.12607377871800104, + "learning_rate": 3.0270366372990936e-05, + "loss": 0.5217, + "num_tokens": 3836837243.0, + "step": 920 + }, + { + "epoch": 1.8393704721458906, + "grad_norm": 0.15000689872896422, + "learning_rate": 3.0249647562566142e-05, + "loss": 0.5336, + "num_tokens": 3841011520.0, + "step": 921 + }, + { + "epoch": 1.8413689732700473, + "grad_norm": 0.21675724178172576, + "learning_rate": 3.0228914906547304e-05, + "loss": 0.5304, + "num_tokens": 3845194591.0, + "step": 922 + }, + { + "epoch": 1.8433674743942043, + "grad_norm": 0.11201014382210286, + "learning_rate": 3.020816843973056e-05, + "loss": 0.5328, + "num_tokens": 3849381632.0, + "step": 923 + }, + { + "epoch": 1.8453659755183613, + "grad_norm": 0.21797087604301393, + "learning_rate": 3.018740819693524e-05, + "loss": 0.5609, + "num_tokens": 3853558709.0, + "step": 924 + }, + { + "epoch": 1.8473644766425181, + "grad_norm": 0.16562518456251152, + "learning_rate": 3.016663421300378e-05, + "loss": 0.5458, + "num_tokens": 3857735759.0, + "step": 925 + }, + { + "epoch": 1.849362977766675, + "grad_norm": 0.14289674947356634, + "learning_rate": 3.0145846522801703e-05, + "loss": 0.5319, + "num_tokens": 3861921235.0, + "step": 926 + }, + { + "epoch": 1.851361478890832, + "grad_norm": 0.18252870141504618, + "learning_rate": 3.0125045161217502e-05, + "loss": 0.5178, + "num_tokens": 3866087893.0, + "step": 927 + }, + { + "epoch": 1.853359980014989, + "grad_norm": 0.16690914726600448, + "learning_rate": 3.0104230163162653e-05, + "loss": 0.5143, + "num_tokens": 3870274702.0, + "step": 928 + }, + { + "epoch": 1.8553584811391457, + "grad_norm": 0.13154317409923139, + "learning_rate": 3.008340156357148e-05, + "loss": 0.541, + "num_tokens": 3874458777.0, + "step": 929 + }, + { + "epoch": 1.8573569822633025, + "grad_norm": 0.2190270373025752, + "learning_rate": 3.0062559397401164e-05, + "loss": 0.5473, + "num_tokens": 3878615803.0, + "step": 930 + }, + { + "epoch": 1.8593554833874593, + "grad_norm": 0.12668375490210115, + "learning_rate": 3.0041703699631646e-05, + "loss": 0.5171, + "num_tokens": 3882774812.0, + "step": 931 + }, + { + "epoch": 1.8613539845116163, + "grad_norm": 0.1585243509665725, + "learning_rate": 3.002083450526556e-05, + "loss": 0.5185, + "num_tokens": 3886950986.0, + "step": 932 + }, + { + "epoch": 1.8633524856357733, + "grad_norm": 0.14369259213462093, + "learning_rate": 2.9999951849328237e-05, + "loss": 0.5336, + "num_tokens": 3891136420.0, + "step": 933 + }, + { + "epoch": 1.86535098675993, + "grad_norm": 0.11080796195002368, + "learning_rate": 2.9979055766867536e-05, + "loss": 0.5255, + "num_tokens": 3895290274.0, + "step": 934 + }, + { + "epoch": 1.8673494878840868, + "grad_norm": 0.12828060577909428, + "learning_rate": 2.9958146292953913e-05, + "loss": 0.5178, + "num_tokens": 3899447363.0, + "step": 935 + }, + { + "epoch": 1.8693479890082438, + "grad_norm": 0.12848514280071746, + "learning_rate": 2.9937223462680253e-05, + "loss": 0.5363, + "num_tokens": 3903630954.0, + "step": 936 + }, + { + "epoch": 1.8713464901324008, + "grad_norm": 0.142217653133302, + "learning_rate": 2.991628731116189e-05, + "loss": 0.5514, + "num_tokens": 3907810459.0, + "step": 937 + }, + { + "epoch": 1.8733449912565576, + "grad_norm": 0.12445794218586209, + "learning_rate": 2.98953378735365e-05, + "loss": 0.5242, + "num_tokens": 3911994016.0, + "step": 938 + }, + { + "epoch": 1.8753434923807144, + "grad_norm": 0.1594796701356054, + "learning_rate": 2.9874375184964045e-05, + "loss": 0.5255, + "num_tokens": 3916157324.0, + "step": 939 + }, + { + "epoch": 1.8773419935048712, + "grad_norm": 0.15688882545755897, + "learning_rate": 2.9853399280626757e-05, + "loss": 0.519, + "num_tokens": 3920339889.0, + "step": 940 + }, + { + "epoch": 1.8793404946290282, + "grad_norm": 0.1464367608367313, + "learning_rate": 2.983241019572902e-05, + "loss": 0.5257, + "num_tokens": 3924523673.0, + "step": 941 + }, + { + "epoch": 1.8813389957531852, + "grad_norm": 0.18177157792213205, + "learning_rate": 2.9811407965497362e-05, + "loss": 0.5281, + "num_tokens": 3928681350.0, + "step": 942 + }, + { + "epoch": 1.883337496877342, + "grad_norm": 0.12720780275997307, + "learning_rate": 2.9790392625180348e-05, + "loss": 0.5198, + "num_tokens": 3932837013.0, + "step": 943 + }, + { + "epoch": 1.8853359980014988, + "grad_norm": 0.17078082994746768, + "learning_rate": 2.9769364210048577e-05, + "loss": 0.5111, + "num_tokens": 3936999285.0, + "step": 944 + }, + { + "epoch": 1.8873344991256558, + "grad_norm": 0.16208539925976226, + "learning_rate": 2.974832275539455e-05, + "loss": 0.5061, + "num_tokens": 3941153258.0, + "step": 945 + }, + { + "epoch": 1.8893330002498128, + "grad_norm": 0.1338958398600285, + "learning_rate": 2.972726829653271e-05, + "loss": 0.5352, + "num_tokens": 3945338448.0, + "step": 946 + }, + { + "epoch": 1.8913315013739695, + "grad_norm": 0.17854154498209812, + "learning_rate": 2.9706200868799265e-05, + "loss": 0.5305, + "num_tokens": 3949523642.0, + "step": 947 + }, + { + "epoch": 1.8933300024981263, + "grad_norm": 0.21744416362070348, + "learning_rate": 2.968512050755223e-05, + "loss": 0.5358, + "num_tokens": 3953671757.0, + "step": 948 + }, + { + "epoch": 1.8953285036222833, + "grad_norm": 0.13600907110364807, + "learning_rate": 2.966402724817131e-05, + "loss": 0.5254, + "num_tokens": 3957836007.0, + "step": 949 + }, + { + "epoch": 1.8973270047464401, + "grad_norm": 0.21644854882854725, + "learning_rate": 2.9642921126057857e-05, + "loss": 0.5122, + "num_tokens": 3962017816.0, + "step": 950 + }, + { + "epoch": 1.8993255058705971, + "grad_norm": 0.17993078323121936, + "learning_rate": 2.962180217663483e-05, + "loss": 0.532, + "num_tokens": 3966155949.0, + "step": 951 + }, + { + "epoch": 1.901324006994754, + "grad_norm": 0.1486867314643031, + "learning_rate": 2.9600670435346672e-05, + "loss": 0.5336, + "num_tokens": 3970314292.0, + "step": 952 + }, + { + "epoch": 1.9033225081189107, + "grad_norm": 0.15142223544029465, + "learning_rate": 2.957952593765936e-05, + "loss": 0.5352, + "num_tokens": 3974461760.0, + "step": 953 + }, + { + "epoch": 1.9053210092430677, + "grad_norm": 0.15417961025048316, + "learning_rate": 2.9558368719060205e-05, + "loss": 0.5336, + "num_tokens": 3978644196.0, + "step": 954 + }, + { + "epoch": 1.9073195103672247, + "grad_norm": 0.14577449563047148, + "learning_rate": 2.9537198815057938e-05, + "loss": 0.5229, + "num_tokens": 3982786938.0, + "step": 955 + }, + { + "epoch": 1.9093180114913815, + "grad_norm": 0.1597651486581622, + "learning_rate": 2.9516016261182535e-05, + "loss": 0.5463, + "num_tokens": 3986970977.0, + "step": 956 + }, + { + "epoch": 1.9113165126155383, + "grad_norm": 0.16906843648659758, + "learning_rate": 2.9494821092985222e-05, + "loss": 0.52, + "num_tokens": 3991155496.0, + "step": 957 + }, + { + "epoch": 1.9133150137396953, + "grad_norm": 0.1625731009850065, + "learning_rate": 2.947361334603839e-05, + "loss": 0.5325, + "num_tokens": 3995327042.0, + "step": 958 + }, + { + "epoch": 1.9153135148638523, + "grad_norm": 0.15916051037874362, + "learning_rate": 2.9452393055935542e-05, + "loss": 0.5306, + "num_tokens": 3999509205.0, + "step": 959 + }, + { + "epoch": 1.917312015988009, + "grad_norm": 0.17899849597635692, + "learning_rate": 2.9431160258291226e-05, + "loss": 0.5319, + "num_tokens": 4003693362.0, + "step": 960 + }, + { + "epoch": 1.9193105171121658, + "grad_norm": 0.14397156170082645, + "learning_rate": 2.9409914988740996e-05, + "loss": 0.5435, + "num_tokens": 4007837304.0, + "step": 961 + }, + { + "epoch": 1.9213090182363226, + "grad_norm": 0.1403901437324228, + "learning_rate": 2.938865728294132e-05, + "loss": 0.5278, + "num_tokens": 4012024289.0, + "step": 962 + }, + { + "epoch": 1.9233075193604796, + "grad_norm": 0.1724001136298181, + "learning_rate": 2.9367387176569565e-05, + "loss": 0.5415, + "num_tokens": 4016208060.0, + "step": 963 + }, + { + "epoch": 1.9253060204846366, + "grad_norm": 0.131186384313383, + "learning_rate": 2.9346104705323875e-05, + "loss": 0.539, + "num_tokens": 4020330281.0, + "step": 964 + }, + { + "epoch": 1.9273045216087934, + "grad_norm": 0.1583833584470951, + "learning_rate": 2.9324809904923182e-05, + "loss": 0.5306, + "num_tokens": 4024494981.0, + "step": 965 + }, + { + "epoch": 1.9293030227329502, + "grad_norm": 0.17460667019590054, + "learning_rate": 2.9303502811107082e-05, + "loss": 0.5241, + "num_tokens": 4028682144.0, + "step": 966 + }, + { + "epoch": 1.9313015238571072, + "grad_norm": 0.12013520569327267, + "learning_rate": 2.9282183459635808e-05, + "loss": 0.5375, + "num_tokens": 4032866154.0, + "step": 967 + }, + { + "epoch": 1.9333000249812642, + "grad_norm": 0.1276072528223653, + "learning_rate": 2.9260851886290192e-05, + "loss": 0.539, + "num_tokens": 4037045067.0, + "step": 968 + }, + { + "epoch": 1.935298526105421, + "grad_norm": 0.1646008672871739, + "learning_rate": 2.9239508126871534e-05, + "loss": 0.5216, + "num_tokens": 4041230991.0, + "step": 969 + }, + { + "epoch": 1.9372970272295778, + "grad_norm": 0.12742068450254185, + "learning_rate": 2.9218152217201633e-05, + "loss": 0.5286, + "num_tokens": 4045394164.0, + "step": 970 + }, + { + "epoch": 1.9392955283537348, + "grad_norm": 0.14617535186916644, + "learning_rate": 2.9196784193122646e-05, + "loss": 0.5428, + "num_tokens": 4049578724.0, + "step": 971 + }, + { + "epoch": 1.9412940294778915, + "grad_norm": 0.1413070592500873, + "learning_rate": 2.917540409049707e-05, + "loss": 0.5148, + "num_tokens": 4053754336.0, + "step": 972 + }, + { + "epoch": 1.9432925306020485, + "grad_norm": 0.11965402300512984, + "learning_rate": 2.9154011945207683e-05, + "loss": 0.5354, + "num_tokens": 4057940092.0, + "step": 973 + }, + { + "epoch": 1.9452910317262053, + "grad_norm": 0.13121637860893096, + "learning_rate": 2.9132607793157466e-05, + "loss": 0.5352, + "num_tokens": 4062125010.0, + "step": 974 + }, + { + "epoch": 1.947289532850362, + "grad_norm": 0.13067756250066348, + "learning_rate": 2.9111191670269557e-05, + "loss": 0.5218, + "num_tokens": 4066309602.0, + "step": 975 + }, + { + "epoch": 1.949288033974519, + "grad_norm": 0.1469570612897502, + "learning_rate": 2.908976361248717e-05, + "loss": 0.5372, + "num_tokens": 4070492700.0, + "step": 976 + }, + { + "epoch": 1.9512865350986761, + "grad_norm": 0.15423183283105196, + "learning_rate": 2.906832365577358e-05, + "loss": 0.5143, + "num_tokens": 4074659642.0, + "step": 977 + }, + { + "epoch": 1.953285036222833, + "grad_norm": 0.17345376188487596, + "learning_rate": 2.904687183611199e-05, + "loss": 0.5314, + "num_tokens": 4078841994.0, + "step": 978 + }, + { + "epoch": 1.9552835373469897, + "grad_norm": 0.14476761424173132, + "learning_rate": 2.9025408189505563e-05, + "loss": 0.5036, + "num_tokens": 4083005452.0, + "step": 979 + }, + { + "epoch": 1.9572820384711467, + "grad_norm": 0.13347103023966514, + "learning_rate": 2.9003932751977263e-05, + "loss": 0.5266, + "num_tokens": 4087149132.0, + "step": 980 + }, + { + "epoch": 1.9592805395953037, + "grad_norm": 0.14371315496187684, + "learning_rate": 2.898244555956988e-05, + "loss": 0.5273, + "num_tokens": 4091335389.0, + "step": 981 + }, + { + "epoch": 1.9612790407194605, + "grad_norm": 0.1202141420881177, + "learning_rate": 2.8960946648345908e-05, + "loss": 0.5386, + "num_tokens": 4095491270.0, + "step": 982 + }, + { + "epoch": 1.9632775418436172, + "grad_norm": 0.165729438992163, + "learning_rate": 2.893943605438752e-05, + "loss": 0.5274, + "num_tokens": 4099677685.0, + "step": 983 + }, + { + "epoch": 1.965276042967774, + "grad_norm": 0.1360697208402388, + "learning_rate": 2.8917913813796498e-05, + "loss": 0.5112, + "num_tokens": 4103795522.0, + "step": 984 + }, + { + "epoch": 1.967274544091931, + "grad_norm": 0.19324843817154516, + "learning_rate": 2.8896379962694164e-05, + "loss": 0.519, + "num_tokens": 4107930762.0, + "step": 985 + }, + { + "epoch": 1.969273045216088, + "grad_norm": 0.1277713892684782, + "learning_rate": 2.8874834537221336e-05, + "loss": 0.5341, + "num_tokens": 4112110401.0, + "step": 986 + }, + { + "epoch": 1.9712715463402448, + "grad_norm": 0.1523229913888909, + "learning_rate": 2.885327757353824e-05, + "loss": 0.5138, + "num_tokens": 4116289856.0, + "step": 987 + }, + { + "epoch": 1.9732700474644016, + "grad_norm": 0.14999757262079796, + "learning_rate": 2.883170910782449e-05, + "loss": 0.5257, + "num_tokens": 4120462696.0, + "step": 988 + }, + { + "epoch": 1.9752685485885586, + "grad_norm": 0.12312735956034046, + "learning_rate": 2.8810129176278988e-05, + "loss": 0.5265, + "num_tokens": 4124629347.0, + "step": 989 + }, + { + "epoch": 1.9772670497127156, + "grad_norm": 0.1596957623461568, + "learning_rate": 2.878853781511988e-05, + "loss": 0.5285, + "num_tokens": 4128760527.0, + "step": 990 + }, + { + "epoch": 1.9792655508368724, + "grad_norm": 0.13403897098640438, + "learning_rate": 2.8766935060584506e-05, + "loss": 0.5489, + "num_tokens": 4132946103.0, + "step": 991 + }, + { + "epoch": 1.9812640519610292, + "grad_norm": 0.15124144376189988, + "learning_rate": 2.8745320948929315e-05, + "loss": 0.531, + "num_tokens": 4137111630.0, + "step": 992 + }, + { + "epoch": 1.983262553085186, + "grad_norm": 0.1479875637947502, + "learning_rate": 2.8723695516429825e-05, + "loss": 0.531, + "num_tokens": 4141298908.0, + "step": 993 + }, + { + "epoch": 1.985261054209343, + "grad_norm": 0.156740646384333, + "learning_rate": 2.870205879938055e-05, + "loss": 0.5227, + "num_tokens": 4145450798.0, + "step": 994 + }, + { + "epoch": 1.9872595553335, + "grad_norm": 0.13037407520996772, + "learning_rate": 2.8680410834094944e-05, + "loss": 0.5355, + "num_tokens": 4149588102.0, + "step": 995 + }, + { + "epoch": 1.9892580564576567, + "grad_norm": 0.17668522105671147, + "learning_rate": 2.865875165690534e-05, + "loss": 0.5412, + "num_tokens": 4153769485.0, + "step": 996 + }, + { + "epoch": 1.9912565575818135, + "grad_norm": 0.11634279667539706, + "learning_rate": 2.8637081304162886e-05, + "loss": 0.5251, + "num_tokens": 4157927240.0, + "step": 997 + }, + { + "epoch": 1.9932550587059705, + "grad_norm": 0.14658355643769622, + "learning_rate": 2.8615399812237492e-05, + "loss": 0.5352, + "num_tokens": 4162110818.0, + "step": 998 + }, + { + "epoch": 1.9952535598301275, + "grad_norm": 0.14066546148186873, + "learning_rate": 2.8593707217517752e-05, + "loss": 0.5211, + "num_tokens": 4166294278.0, + "step": 999 + }, + { + "epoch": 1.9972520609542843, + "grad_norm": 0.12724170021846187, + "learning_rate": 2.8572003556410913e-05, + "loss": 0.5227, + "num_tokens": 4170480993.0, + "step": 1000 + }, + { + "epoch": 1.999250562078441, + "grad_norm": 0.13169475351475682, + "learning_rate": 2.855028886534278e-05, + "loss": 0.5183, + "num_tokens": 4174664993.0, + "step": 1001 + }, + { + "epoch": 2.0, + "grad_norm": 0.13169475351475682, + "learning_rate": 2.8528563180757663e-05, + "loss": 0.5274, + "num_tokens": 4176235933.0, + "step": 1002 + }, + { + "epoch": 2.001998501124157, + "grad_norm": 0.21160571831247074, + "learning_rate": 2.8506826539118342e-05, + "loss": 0.5106, + "num_tokens": 4180387160.0, + "step": 1003 + }, + { + "epoch": 2.0039970022483136, + "grad_norm": 0.1652016956673078, + "learning_rate": 2.8485078976905977e-05, + "loss": 0.4967, + "num_tokens": 4184549654.0, + "step": 1004 + }, + { + "epoch": 2.005995503372471, + "grad_norm": 0.18651985671555207, + "learning_rate": 2.8463320530620057e-05, + "loss": 0.5125, + "num_tokens": 4188732765.0, + "step": 1005 + }, + { + "epoch": 2.0079940044966276, + "grad_norm": 0.18302451963010355, + "learning_rate": 2.8441551236778328e-05, + "loss": 0.503, + "num_tokens": 4192917693.0, + "step": 1006 + }, + { + "epoch": 2.0099925056207844, + "grad_norm": 0.19607281747014288, + "learning_rate": 2.8419771131916768e-05, + "loss": 0.4918, + "num_tokens": 4197100985.0, + "step": 1007 + }, + { + "epoch": 2.011991006744941, + "grad_norm": 0.20031237089126383, + "learning_rate": 2.8397980252589474e-05, + "loss": 0.4955, + "num_tokens": 4201285695.0, + "step": 1008 + }, + { + "epoch": 2.0139895078690984, + "grad_norm": 0.16511349311435142, + "learning_rate": 2.8376178635368624e-05, + "loss": 0.5035, + "num_tokens": 4205449294.0, + "step": 1009 + }, + { + "epoch": 2.015988008993255, + "grad_norm": 0.1577687907803843, + "learning_rate": 2.8354366316844452e-05, + "loss": 0.4969, + "num_tokens": 4209580909.0, + "step": 1010 + }, + { + "epoch": 2.017986510117412, + "grad_norm": 0.20293774243482518, + "learning_rate": 2.8332543333625103e-05, + "loss": 0.4955, + "num_tokens": 4213765893.0, + "step": 1011 + }, + { + "epoch": 2.0199850112415687, + "grad_norm": 0.139286266428699, + "learning_rate": 2.8310709722336675e-05, + "loss": 0.5074, + "num_tokens": 4217919206.0, + "step": 1012 + }, + { + "epoch": 2.0219835123657255, + "grad_norm": 0.17947707758412956, + "learning_rate": 2.8288865519623058e-05, + "loss": 0.4915, + "num_tokens": 4222030201.0, + "step": 1013 + }, + { + "epoch": 2.0239820134898827, + "grad_norm": 0.15652821092814692, + "learning_rate": 2.826701076214593e-05, + "loss": 0.5191, + "num_tokens": 4226213469.0, + "step": 1014 + }, + { + "epoch": 2.0259805146140395, + "grad_norm": 0.17307056360280587, + "learning_rate": 2.8245145486584702e-05, + "loss": 0.5011, + "num_tokens": 4230400404.0, + "step": 1015 + }, + { + "epoch": 2.0279790157381963, + "grad_norm": 0.13394423392197233, + "learning_rate": 2.8223269729636413e-05, + "loss": 0.5192, + "num_tokens": 4234567050.0, + "step": 1016 + }, + { + "epoch": 2.029977516862353, + "grad_norm": 0.1662561364806371, + "learning_rate": 2.8201383528015712e-05, + "loss": 0.4918, + "num_tokens": 4238750987.0, + "step": 1017 + }, + { + "epoch": 2.0319760179865103, + "grad_norm": 0.13367603311210982, + "learning_rate": 2.817948691845476e-05, + "loss": 0.5127, + "num_tokens": 4242937095.0, + "step": 1018 + }, + { + "epoch": 2.033974519110667, + "grad_norm": 0.1604916812489208, + "learning_rate": 2.815757993770321e-05, + "loss": 0.4788, + "num_tokens": 4247114474.0, + "step": 1019 + }, + { + "epoch": 2.035973020234824, + "grad_norm": 0.1511051885792761, + "learning_rate": 2.8135662622528093e-05, + "loss": 0.5072, + "num_tokens": 4251297683.0, + "step": 1020 + }, + { + "epoch": 2.0379715213589806, + "grad_norm": 0.14665980088330285, + "learning_rate": 2.8113735009713812e-05, + "loss": 0.487, + "num_tokens": 4255458975.0, + "step": 1021 + }, + { + "epoch": 2.039970022483138, + "grad_norm": 0.17696806784884658, + "learning_rate": 2.8091797136062022e-05, + "loss": 0.4907, + "num_tokens": 4259633653.0, + "step": 1022 + }, + { + "epoch": 2.0419685236072946, + "grad_norm": 0.13459749631058618, + "learning_rate": 2.8069849038391634e-05, + "loss": 0.4941, + "num_tokens": 4263807303.0, + "step": 1023 + }, + { + "epoch": 2.0439670247314514, + "grad_norm": 0.16286364865881886, + "learning_rate": 2.8047890753538693e-05, + "loss": 0.4755, + "num_tokens": 4267990183.0, + "step": 1024 + }, + { + "epoch": 2.045965525855608, + "grad_norm": 0.13099167575298715, + "learning_rate": 2.802592231835634e-05, + "loss": 0.4975, + "num_tokens": 4272160752.0, + "step": 1025 + }, + { + "epoch": 2.047964026979765, + "grad_norm": 0.1604104248100875, + "learning_rate": 2.8003943769714776e-05, + "loss": 0.4903, + "num_tokens": 4276346756.0, + "step": 1026 + }, + { + "epoch": 2.049962528103922, + "grad_norm": 0.11249640765072437, + "learning_rate": 2.798195514450115e-05, + "loss": 0.4925, + "num_tokens": 4280534902.0, + "step": 1027 + }, + { + "epoch": 2.051961029228079, + "grad_norm": 0.1196703560136032, + "learning_rate": 2.795995647961953e-05, + "loss": 0.5037, + "num_tokens": 4284720064.0, + "step": 1028 + }, + { + "epoch": 2.0539595303522358, + "grad_norm": 0.11469763529076021, + "learning_rate": 2.7937947811990853e-05, + "loss": 0.4988, + "num_tokens": 4288902205.0, + "step": 1029 + }, + { + "epoch": 2.0559580314763926, + "grad_norm": 0.119598189476521, + "learning_rate": 2.791592917855281e-05, + "loss": 0.5075, + "num_tokens": 4293027495.0, + "step": 1030 + }, + { + "epoch": 2.05795653260055, + "grad_norm": 0.11652651048953226, + "learning_rate": 2.789390061625984e-05, + "loss": 0.4875, + "num_tokens": 4297214151.0, + "step": 1031 + }, + { + "epoch": 2.0599550337247066, + "grad_norm": 0.12687019491862397, + "learning_rate": 2.787186216208305e-05, + "loss": 0.4867, + "num_tokens": 4301379766.0, + "step": 1032 + }, + { + "epoch": 2.0619535348488633, + "grad_norm": 0.13103266845775943, + "learning_rate": 2.784981385301013e-05, + "loss": 0.5095, + "num_tokens": 4305565854.0, + "step": 1033 + }, + { + "epoch": 2.06395203597302, + "grad_norm": 0.11767544182571255, + "learning_rate": 2.7827755726045328e-05, + "loss": 0.5025, + "num_tokens": 4309742805.0, + "step": 1034 + }, + { + "epoch": 2.065950537097177, + "grad_norm": 0.1333652339303225, + "learning_rate": 2.7805687818209356e-05, + "loss": 0.5197, + "num_tokens": 4313927579.0, + "step": 1035 + }, + { + "epoch": 2.067949038221334, + "grad_norm": 0.11784877542960838, + "learning_rate": 2.7783610166539342e-05, + "loss": 0.4874, + "num_tokens": 4318112066.0, + "step": 1036 + }, + { + "epoch": 2.069947539345491, + "grad_norm": 0.1267796046650927, + "learning_rate": 2.7761522808088787e-05, + "loss": 0.4885, + "num_tokens": 4322292706.0, + "step": 1037 + }, + { + "epoch": 2.0719460404696477, + "grad_norm": 0.1445030812910346, + "learning_rate": 2.7739425779927452e-05, + "loss": 0.5049, + "num_tokens": 4326460044.0, + "step": 1038 + }, + { + "epoch": 2.0739445415938045, + "grad_norm": 0.13331656191970195, + "learning_rate": 2.7717319119141356e-05, + "loss": 0.4908, + "num_tokens": 4330645930.0, + "step": 1039 + }, + { + "epoch": 2.0759430427179617, + "grad_norm": 0.1538620925671774, + "learning_rate": 2.7695202862832667e-05, + "loss": 0.5056, + "num_tokens": 4334827534.0, + "step": 1040 + }, + { + "epoch": 2.0779415438421185, + "grad_norm": 0.15900536851042454, + "learning_rate": 2.7673077048119662e-05, + "loss": 0.5109, + "num_tokens": 4339012664.0, + "step": 1041 + }, + { + "epoch": 2.0799400449662753, + "grad_norm": 0.1271281199605633, + "learning_rate": 2.765094171213665e-05, + "loss": 0.4903, + "num_tokens": 4343180496.0, + "step": 1042 + }, + { + "epoch": 2.081938546090432, + "grad_norm": 0.12754751421829605, + "learning_rate": 2.762879689203396e-05, + "loss": 0.5018, + "num_tokens": 4347361835.0, + "step": 1043 + }, + { + "epoch": 2.0839370472145893, + "grad_norm": 0.13581152354845086, + "learning_rate": 2.7606642624977777e-05, + "loss": 0.5062, + "num_tokens": 4351538483.0, + "step": 1044 + }, + { + "epoch": 2.085935548338746, + "grad_norm": 0.11378717033336214, + "learning_rate": 2.7584478948150192e-05, + "loss": 0.5055, + "num_tokens": 4355725954.0, + "step": 1045 + }, + { + "epoch": 2.087934049462903, + "grad_norm": 0.14385224474337197, + "learning_rate": 2.7562305898749054e-05, + "loss": 0.4944, + "num_tokens": 4359852483.0, + "step": 1046 + }, + { + "epoch": 2.0899325505870596, + "grad_norm": 0.1205901040054997, + "learning_rate": 2.754012351398797e-05, + "loss": 0.5035, + "num_tokens": 4363997191.0, + "step": 1047 + }, + { + "epoch": 2.0919310517112164, + "grad_norm": 0.12195575571775899, + "learning_rate": 2.7517931831096213e-05, + "loss": 0.4846, + "num_tokens": 4368072794.0, + "step": 1048 + }, + { + "epoch": 2.0939295528353736, + "grad_norm": 0.1065069456026774, + "learning_rate": 2.749573088731862e-05, + "loss": 0.4743, + "num_tokens": 4372238663.0, + "step": 1049 + }, + { + "epoch": 2.0959280539595304, + "grad_norm": 0.1260674573565864, + "learning_rate": 2.7473520719915635e-05, + "loss": 0.523, + "num_tokens": 4376421527.0, + "step": 1050 + }, + { + "epoch": 2.097926555083687, + "grad_norm": 0.11243116826598415, + "learning_rate": 2.7451301366163116e-05, + "loss": 0.4908, + "num_tokens": 4380556634.0, + "step": 1051 + }, + { + "epoch": 2.099925056207844, + "grad_norm": 0.1439711949210315, + "learning_rate": 2.7429072863352394e-05, + "loss": 0.507, + "num_tokens": 4384742019.0, + "step": 1052 + }, + { + "epoch": 2.101923557332001, + "grad_norm": 5.570446877908358, + "learning_rate": 2.740683524879013e-05, + "loss": 0.5035, + "num_tokens": 4388927519.0, + "step": 1053 + }, + { + "epoch": 2.103922058456158, + "grad_norm": 0.2967579443145661, + "learning_rate": 2.7384588559798274e-05, + "loss": 0.4986, + "num_tokens": 4393111857.0, + "step": 1054 + }, + { + "epoch": 2.1059205595803148, + "grad_norm": 0.1285194130261887, + "learning_rate": 2.7362332833714016e-05, + "loss": 0.4934, + "num_tokens": 4397297352.0, + "step": 1055 + }, + { + "epoch": 2.1079190607044715, + "grad_norm": 0.22290631620781143, + "learning_rate": 2.7340068107889702e-05, + "loss": 0.4921, + "num_tokens": 4401436140.0, + "step": 1056 + }, + { + "epoch": 2.1099175618286283, + "grad_norm": 0.14072679829074522, + "learning_rate": 2.7317794419692806e-05, + "loss": 0.4964, + "num_tokens": 4405620919.0, + "step": 1057 + }, + { + "epoch": 2.1119160629527856, + "grad_norm": 0.17296830766315072, + "learning_rate": 2.729551180650581e-05, + "loss": 0.5041, + "num_tokens": 4409775990.0, + "step": 1058 + }, + { + "epoch": 2.1139145640769423, + "grad_norm": 0.13990964274670054, + "learning_rate": 2.7273220305726198e-05, + "loss": 0.5029, + "num_tokens": 4413960458.0, + "step": 1059 + }, + { + "epoch": 2.115913065201099, + "grad_norm": 0.13405009989189542, + "learning_rate": 2.7250919954766365e-05, + "loss": 0.5014, + "num_tokens": 4418129016.0, + "step": 1060 + }, + { + "epoch": 2.117911566325256, + "grad_norm": 0.15174067764623186, + "learning_rate": 2.7228610791053562e-05, + "loss": 0.5056, + "num_tokens": 4422313686.0, + "step": 1061 + }, + { + "epoch": 2.119910067449413, + "grad_norm": 0.13859101679853622, + "learning_rate": 2.7206292852029824e-05, + "loss": 0.4925, + "num_tokens": 4426477371.0, + "step": 1062 + }, + { + "epoch": 2.12190856857357, + "grad_norm": 0.14626897180566167, + "learning_rate": 2.7183966175151922e-05, + "loss": 0.4973, + "num_tokens": 4430641849.0, + "step": 1063 + }, + { + "epoch": 2.1239070696977267, + "grad_norm": 0.1498892105685055, + "learning_rate": 2.7161630797891284e-05, + "loss": 0.5168, + "num_tokens": 4434829095.0, + "step": 1064 + }, + { + "epoch": 2.1259055708218835, + "grad_norm": 0.13195167534116636, + "learning_rate": 2.713928675773394e-05, + "loss": 0.5162, + "num_tokens": 4439010110.0, + "step": 1065 + }, + { + "epoch": 2.1279040719460403, + "grad_norm": 0.16946253022415825, + "learning_rate": 2.7116934092180476e-05, + "loss": 0.512, + "num_tokens": 4443194839.0, + "step": 1066 + }, + { + "epoch": 2.1299025730701975, + "grad_norm": 0.16931003876440823, + "learning_rate": 2.7094572838745926e-05, + "loss": 0.4961, + "num_tokens": 4447381777.0, + "step": 1067 + }, + { + "epoch": 2.1319010741943543, + "grad_norm": 0.12949769720473964, + "learning_rate": 2.7072203034959767e-05, + "loss": 0.5001, + "num_tokens": 4451556416.0, + "step": 1068 + }, + { + "epoch": 2.133899575318511, + "grad_norm": 0.14179790367740872, + "learning_rate": 2.7049824718365792e-05, + "loss": 0.4916, + "num_tokens": 4455737915.0, + "step": 1069 + }, + { + "epoch": 2.135898076442668, + "grad_norm": 0.1453099444969698, + "learning_rate": 2.702743792652212e-05, + "loss": 0.4972, + "num_tokens": 4459925078.0, + "step": 1070 + }, + { + "epoch": 2.137896577566825, + "grad_norm": 0.10549374150670274, + "learning_rate": 2.7005042697001066e-05, + "loss": 0.4891, + "num_tokens": 4464088367.0, + "step": 1071 + }, + { + "epoch": 2.139895078690982, + "grad_norm": 0.13957231933225944, + "learning_rate": 2.6982639067389116e-05, + "loss": 0.5165, + "num_tokens": 4468272890.0, + "step": 1072 + }, + { + "epoch": 2.1418935798151386, + "grad_norm": 0.10922830895884021, + "learning_rate": 2.6960227075286854e-05, + "loss": 0.5008, + "num_tokens": 4472456604.0, + "step": 1073 + }, + { + "epoch": 2.1438920809392954, + "grad_norm": 0.12045594250040216, + "learning_rate": 2.6937806758308897e-05, + "loss": 0.5133, + "num_tokens": 4476640444.0, + "step": 1074 + }, + { + "epoch": 2.145890582063452, + "grad_norm": 0.12642441492379425, + "learning_rate": 2.691537815408384e-05, + "loss": 0.5013, + "num_tokens": 4480822325.0, + "step": 1075 + }, + { + "epoch": 2.1478890831876094, + "grad_norm": 0.12314040989019641, + "learning_rate": 2.6892941300254176e-05, + "loss": 0.5101, + "num_tokens": 4484983220.0, + "step": 1076 + }, + { + "epoch": 2.149887584311766, + "grad_norm": 0.1220632648149086, + "learning_rate": 2.687049623447625e-05, + "loss": 0.5087, + "num_tokens": 4489168608.0, + "step": 1077 + }, + { + "epoch": 2.151886085435923, + "grad_norm": 0.1429304656836273, + "learning_rate": 2.6848042994420196e-05, + "loss": 0.5127, + "num_tokens": 4493344521.0, + "step": 1078 + }, + { + "epoch": 2.1538845865600798, + "grad_norm": 0.11322037107544294, + "learning_rate": 2.6825581617769847e-05, + "loss": 0.4984, + "num_tokens": 4497528964.0, + "step": 1079 + }, + { + "epoch": 2.155883087684237, + "grad_norm": 0.12370798744552106, + "learning_rate": 2.6803112142222717e-05, + "loss": 0.5113, + "num_tokens": 4501695541.0, + "step": 1080 + }, + { + "epoch": 2.1578815888083938, + "grad_norm": 0.12984446399743696, + "learning_rate": 2.6780634605489897e-05, + "loss": 0.4989, + "num_tokens": 4505882396.0, + "step": 1081 + }, + { + "epoch": 2.1598800899325505, + "grad_norm": 0.12534974362053256, + "learning_rate": 2.6758149045296003e-05, + "loss": 0.4983, + "num_tokens": 4510066504.0, + "step": 1082 + }, + { + "epoch": 2.1618785910567073, + "grad_norm": 0.13509112361878045, + "learning_rate": 2.6735655499379142e-05, + "loss": 0.4949, + "num_tokens": 4514251538.0, + "step": 1083 + }, + { + "epoch": 2.1638770921808645, + "grad_norm": 0.13493909605224993, + "learning_rate": 2.671315400549078e-05, + "loss": 0.4976, + "num_tokens": 4518436789.0, + "step": 1084 + }, + { + "epoch": 2.1658755933050213, + "grad_norm": 0.15279597651873503, + "learning_rate": 2.6690644601395773e-05, + "loss": 0.5044, + "num_tokens": 4522619130.0, + "step": 1085 + }, + { + "epoch": 2.167874094429178, + "grad_norm": 0.1366957077236854, + "learning_rate": 2.6668127324872216e-05, + "loss": 0.4885, + "num_tokens": 4526802202.0, + "step": 1086 + }, + { + "epoch": 2.169872595553335, + "grad_norm": 0.1331454252945968, + "learning_rate": 2.6645602213711434e-05, + "loss": 0.5118, + "num_tokens": 4530987658.0, + "step": 1087 + }, + { + "epoch": 2.1718710966774917, + "grad_norm": 0.11744126219027522, + "learning_rate": 2.6623069305717885e-05, + "loss": 0.5075, + "num_tokens": 4535172244.0, + "step": 1088 + }, + { + "epoch": 2.173869597801649, + "grad_norm": 0.12050690596463969, + "learning_rate": 2.6600528638709135e-05, + "loss": 0.5066, + "num_tokens": 4539345598.0, + "step": 1089 + }, + { + "epoch": 2.1758680989258057, + "grad_norm": 0.13201667397050387, + "learning_rate": 2.6577980250515768e-05, + "loss": 0.4979, + "num_tokens": 4543516446.0, + "step": 1090 + }, + { + "epoch": 2.1778666000499625, + "grad_norm": 0.12671243244530014, + "learning_rate": 2.6555424178981293e-05, + "loss": 0.508, + "num_tokens": 4547700623.0, + "step": 1091 + }, + { + "epoch": 2.1798651011741192, + "grad_norm": 0.10890655586226086, + "learning_rate": 2.6532860461962166e-05, + "loss": 0.4973, + "num_tokens": 4551884171.0, + "step": 1092 + }, + { + "epoch": 2.1818636022982765, + "grad_norm": 0.12457790006253697, + "learning_rate": 2.6510289137327635e-05, + "loss": 0.5073, + "num_tokens": 4556067751.0, + "step": 1093 + }, + { + "epoch": 2.1838621034224333, + "grad_norm": 0.10702714053212128, + "learning_rate": 2.6487710242959744e-05, + "loss": 0.4846, + "num_tokens": 4560253832.0, + "step": 1094 + }, + { + "epoch": 2.18586060454659, + "grad_norm": 0.1280328851016457, + "learning_rate": 2.6465123816753208e-05, + "loss": 0.4999, + "num_tokens": 4564439974.0, + "step": 1095 + }, + { + "epoch": 2.187859105670747, + "grad_norm": 0.11129964980542939, + "learning_rate": 2.6442529896615416e-05, + "loss": 0.5026, + "num_tokens": 4568625215.0, + "step": 1096 + }, + { + "epoch": 2.189857606794904, + "grad_norm": 0.1127904020648982, + "learning_rate": 2.6419928520466324e-05, + "loss": 0.5046, + "num_tokens": 4572811818.0, + "step": 1097 + }, + { + "epoch": 2.191856107919061, + "grad_norm": 0.1298752860219889, + "learning_rate": 2.6397319726238382e-05, + "loss": 0.5065, + "num_tokens": 4576995357.0, + "step": 1098 + }, + { + "epoch": 2.1938546090432176, + "grad_norm": 0.17256594665999342, + "learning_rate": 2.6374703551876522e-05, + "loss": 0.4987, + "num_tokens": 4581178794.0, + "step": 1099 + }, + { + "epoch": 2.1958531101673744, + "grad_norm": 0.11180931667885022, + "learning_rate": 2.6352080035338042e-05, + "loss": 0.5087, + "num_tokens": 4585344051.0, + "step": 1100 + }, + { + "epoch": 2.197851611291531, + "grad_norm": 0.18180055656341254, + "learning_rate": 2.6329449214592568e-05, + "loss": 0.4945, + "num_tokens": 4589500141.0, + "step": 1101 + }, + { + "epoch": 2.1998501124156884, + "grad_norm": 0.12636360774736677, + "learning_rate": 2.630681112762198e-05, + "loss": 0.4994, + "num_tokens": 4593672369.0, + "step": 1102 + }, + { + "epoch": 2.201848613539845, + "grad_norm": 0.12385335406128187, + "learning_rate": 2.628416581242036e-05, + "loss": 0.4958, + "num_tokens": 4597854122.0, + "step": 1103 + }, + { + "epoch": 2.203847114664002, + "grad_norm": 0.13079995994017318, + "learning_rate": 2.6261513306993917e-05, + "loss": 0.4944, + "num_tokens": 4602038615.0, + "step": 1104 + }, + { + "epoch": 2.2058456157881587, + "grad_norm": 0.1162056114215574, + "learning_rate": 2.623885364936093e-05, + "loss": 0.5024, + "num_tokens": 4606222791.0, + "step": 1105 + }, + { + "epoch": 2.207844116912316, + "grad_norm": 0.1095076309219436, + "learning_rate": 2.621618687755168e-05, + "loss": 0.4995, + "num_tokens": 4610408471.0, + "step": 1106 + }, + { + "epoch": 2.2098426180364728, + "grad_norm": 0.12243840368512335, + "learning_rate": 2.6193513029608394e-05, + "loss": 0.4934, + "num_tokens": 4614550685.0, + "step": 1107 + }, + { + "epoch": 2.2118411191606295, + "grad_norm": 0.12763297027650575, + "learning_rate": 2.6170832143585164e-05, + "loss": 0.4832, + "num_tokens": 4618695500.0, + "step": 1108 + }, + { + "epoch": 2.2138396202847863, + "grad_norm": 0.1170618453603016, + "learning_rate": 2.6148144257547898e-05, + "loss": 0.5023, + "num_tokens": 4622876704.0, + "step": 1109 + }, + { + "epoch": 2.215838121408943, + "grad_norm": 0.11164514515656015, + "learning_rate": 2.6125449409574257e-05, + "loss": 0.504, + "num_tokens": 4627060892.0, + "step": 1110 + }, + { + "epoch": 2.2178366225331003, + "grad_norm": 0.12688953269182784, + "learning_rate": 2.6102747637753588e-05, + "loss": 0.4871, + "num_tokens": 4631246591.0, + "step": 1111 + }, + { + "epoch": 2.219835123657257, + "grad_norm": 0.11824222000658793, + "learning_rate": 2.608003898018684e-05, + "loss": 0.508, + "num_tokens": 4635380859.0, + "step": 1112 + }, + { + "epoch": 2.221833624781414, + "grad_norm": 0.12665767259217112, + "learning_rate": 2.605732347498655e-05, + "loss": 0.5013, + "num_tokens": 4639505578.0, + "step": 1113 + }, + { + "epoch": 2.2238321259055707, + "grad_norm": 0.11438837451492373, + "learning_rate": 2.6034601160276722e-05, + "loss": 0.5097, + "num_tokens": 4643688469.0, + "step": 1114 + }, + { + "epoch": 2.225830627029728, + "grad_norm": 0.11098479679505477, + "learning_rate": 2.6011872074192796e-05, + "loss": 0.4884, + "num_tokens": 4647856581.0, + "step": 1115 + }, + { + "epoch": 2.2278291281538847, + "grad_norm": 0.1120275773383183, + "learning_rate": 2.598913625488158e-05, + "loss": 0.4886, + "num_tokens": 4652038261.0, + "step": 1116 + }, + { + "epoch": 2.2298276292780415, + "grad_norm": 0.11807044032500531, + "learning_rate": 2.5966393740501173e-05, + "loss": 0.5054, + "num_tokens": 4656222249.0, + "step": 1117 + }, + { + "epoch": 2.2318261304021982, + "grad_norm": 0.11678099177358182, + "learning_rate": 2.5943644569220934e-05, + "loss": 0.5123, + "num_tokens": 4660374030.0, + "step": 1118 + }, + { + "epoch": 2.233824631526355, + "grad_norm": 0.1394513075187605, + "learning_rate": 2.5920888779221355e-05, + "loss": 0.5, + "num_tokens": 4664509804.0, + "step": 1119 + }, + { + "epoch": 2.2358231326505122, + "grad_norm": 0.11488823107482765, + "learning_rate": 2.5898126408694074e-05, + "loss": 0.5213, + "num_tokens": 4668685653.0, + "step": 1120 + }, + { + "epoch": 2.237821633774669, + "grad_norm": 0.15350632537818815, + "learning_rate": 2.587535749584175e-05, + "loss": 0.5095, + "num_tokens": 4672866562.0, + "step": 1121 + }, + { + "epoch": 2.239820134898826, + "grad_norm": 0.11437938746839421, + "learning_rate": 2.5852582078878037e-05, + "loss": 0.5068, + "num_tokens": 4677051273.0, + "step": 1122 + }, + { + "epoch": 2.2418186360229826, + "grad_norm": 0.12795611471893875, + "learning_rate": 2.5829800196027508e-05, + "loss": 0.5075, + "num_tokens": 4681233711.0, + "step": 1123 + }, + { + "epoch": 2.24381713714714, + "grad_norm": 0.11469418274377176, + "learning_rate": 2.580701188552555e-05, + "loss": 0.4868, + "num_tokens": 4685377109.0, + "step": 1124 + }, + { + "epoch": 2.2458156382712966, + "grad_norm": 0.1213799568739611, + "learning_rate": 2.5784217185618396e-05, + "loss": 0.5095, + "num_tokens": 4689562189.0, + "step": 1125 + }, + { + "epoch": 2.2478141393954534, + "grad_norm": 0.1096159931571787, + "learning_rate": 2.5761416134562955e-05, + "loss": 0.5043, + "num_tokens": 4693725774.0, + "step": 1126 + }, + { + "epoch": 2.24981264051961, + "grad_norm": 0.11601329848557264, + "learning_rate": 2.573860877062682e-05, + "loss": 0.4989, + "num_tokens": 4697910993.0, + "step": 1127 + }, + { + "epoch": 2.251811141643767, + "grad_norm": 0.12099551191470348, + "learning_rate": 2.571579513208817e-05, + "loss": 0.4798, + "num_tokens": 4702096575.0, + "step": 1128 + }, + { + "epoch": 2.253809642767924, + "grad_norm": 0.11400429970216962, + "learning_rate": 2.5692975257235708e-05, + "loss": 0.5007, + "num_tokens": 4706265579.0, + "step": 1129 + }, + { + "epoch": 2.255808143892081, + "grad_norm": 0.1254871967126555, + "learning_rate": 2.5670149184368634e-05, + "loss": 0.4865, + "num_tokens": 4710450077.0, + "step": 1130 + }, + { + "epoch": 2.2578066450162377, + "grad_norm": 0.11417932737972498, + "learning_rate": 2.5647316951796504e-05, + "loss": 0.492, + "num_tokens": 4714635148.0, + "step": 1131 + }, + { + "epoch": 2.2598051461403945, + "grad_norm": 0.10491498756688282, + "learning_rate": 2.5624478597839257e-05, + "loss": 0.4848, + "num_tokens": 4718798311.0, + "step": 1132 + }, + { + "epoch": 2.2618036472645517, + "grad_norm": 0.11646012558892381, + "learning_rate": 2.560163416082707e-05, + "loss": 0.5107, + "num_tokens": 4722967853.0, + "step": 1133 + }, + { + "epoch": 2.2638021483887085, + "grad_norm": 0.11136918549799038, + "learning_rate": 2.5578783679100358e-05, + "loss": 0.5027, + "num_tokens": 4727153871.0, + "step": 1134 + }, + { + "epoch": 2.2658006495128653, + "grad_norm": 0.11625507127716775, + "learning_rate": 2.555592719100965e-05, + "loss": 0.4808, + "num_tokens": 4731340329.0, + "step": 1135 + }, + { + "epoch": 2.267799150637022, + "grad_norm": 0.09327357954151835, + "learning_rate": 2.5533064734915597e-05, + "loss": 0.5023, + "num_tokens": 4735512107.0, + "step": 1136 + }, + { + "epoch": 2.269797651761179, + "grad_norm": 0.12784623210275625, + "learning_rate": 2.551019634918883e-05, + "loss": 0.5133, + "num_tokens": 4739685871.0, + "step": 1137 + }, + { + "epoch": 2.271796152885336, + "grad_norm": 0.11271021640431086, + "learning_rate": 2.548732207220993e-05, + "loss": 0.4977, + "num_tokens": 4743806597.0, + "step": 1138 + }, + { + "epoch": 2.273794654009493, + "grad_norm": 0.1396918543211418, + "learning_rate": 2.546444194236941e-05, + "loss": 0.4944, + "num_tokens": 4747990554.0, + "step": 1139 + }, + { + "epoch": 2.2757931551336497, + "grad_norm": 0.11790989671380048, + "learning_rate": 2.5441555998067552e-05, + "loss": 0.5005, + "num_tokens": 4752175457.0, + "step": 1140 + }, + { + "epoch": 2.277791656257807, + "grad_norm": 0.17397670252903383, + "learning_rate": 2.5418664277714425e-05, + "loss": 0.4929, + "num_tokens": 4756315947.0, + "step": 1141 + }, + { + "epoch": 2.2797901573819637, + "grad_norm": 0.1519919406519971, + "learning_rate": 2.539576681972979e-05, + "loss": 0.5061, + "num_tokens": 4760497277.0, + "step": 1142 + }, + { + "epoch": 2.2817886585061204, + "grad_norm": 0.13219896388538036, + "learning_rate": 2.5372863662543047e-05, + "loss": 0.5089, + "num_tokens": 4764681917.0, + "step": 1143 + }, + { + "epoch": 2.2837871596302772, + "grad_norm": 0.1779977127881492, + "learning_rate": 2.5349954844593134e-05, + "loss": 0.498, + "num_tokens": 4768866010.0, + "step": 1144 + }, + { + "epoch": 2.285785660754434, + "grad_norm": 0.11429249830712224, + "learning_rate": 2.53270404043285e-05, + "loss": 0.5064, + "num_tokens": 4773027841.0, + "step": 1145 + }, + { + "epoch": 2.2877841618785912, + "grad_norm": 0.1332158915202159, + "learning_rate": 2.530412038020705e-05, + "loss": 0.4973, + "num_tokens": 4777211236.0, + "step": 1146 + }, + { + "epoch": 2.289782663002748, + "grad_norm": 0.13097067308983817, + "learning_rate": 2.528119481069604e-05, + "loss": 0.4959, + "num_tokens": 4781396084.0, + "step": 1147 + }, + { + "epoch": 2.291781164126905, + "grad_norm": 0.12309790978237727, + "learning_rate": 2.525826373427204e-05, + "loss": 0.5061, + "num_tokens": 4785577399.0, + "step": 1148 + }, + { + "epoch": 2.2937796652510616, + "grad_norm": 0.11760001263549823, + "learning_rate": 2.5235327189420856e-05, + "loss": 0.4976, + "num_tokens": 4789759961.0, + "step": 1149 + }, + { + "epoch": 2.295778166375219, + "grad_norm": 0.15768955349835734, + "learning_rate": 2.5212385214637483e-05, + "loss": 0.5046, + "num_tokens": 4793943313.0, + "step": 1150 + }, + { + "epoch": 2.2977766674993756, + "grad_norm": 0.11633950189839977, + "learning_rate": 2.5189437848426016e-05, + "loss": 0.5063, + "num_tokens": 4798115414.0, + "step": 1151 + }, + { + "epoch": 2.2997751686235324, + "grad_norm": 0.1424606452780357, + "learning_rate": 2.5166485129299606e-05, + "loss": 0.4904, + "num_tokens": 4802271549.0, + "step": 1152 + }, + { + "epoch": 2.301773669747689, + "grad_norm": 0.10958428261666366, + "learning_rate": 2.514352709578039e-05, + "loss": 0.5052, + "num_tokens": 4806454573.0, + "step": 1153 + }, + { + "epoch": 2.303772170871846, + "grad_norm": 0.13911165026036013, + "learning_rate": 2.5120563786399422e-05, + "loss": 0.4986, + "num_tokens": 4810597444.0, + "step": 1154 + }, + { + "epoch": 2.305770671996003, + "grad_norm": 0.1253585387208923, + "learning_rate": 2.5097595239696607e-05, + "loss": 0.5108, + "num_tokens": 4814782228.0, + "step": 1155 + }, + { + "epoch": 2.30776917312016, + "grad_norm": 0.11670433844870828, + "learning_rate": 2.5074621494220644e-05, + "loss": 0.5107, + "num_tokens": 4818936587.0, + "step": 1156 + }, + { + "epoch": 2.3097676742443167, + "grad_norm": 0.13068885516402956, + "learning_rate": 2.5051642588528965e-05, + "loss": 0.4906, + "num_tokens": 4823121850.0, + "step": 1157 + }, + { + "epoch": 2.3117661753684735, + "grad_norm": 0.12035758477526445, + "learning_rate": 2.5028658561187644e-05, + "loss": 0.4915, + "num_tokens": 4827274781.0, + "step": 1158 + }, + { + "epoch": 2.3137646764926307, + "grad_norm": 0.1262917034911545, + "learning_rate": 2.5005669450771363e-05, + "loss": 0.4992, + "num_tokens": 4831461195.0, + "step": 1159 + }, + { + "epoch": 2.3157631776167875, + "grad_norm": 0.1331081020017094, + "learning_rate": 2.4982675295863335e-05, + "loss": 0.5104, + "num_tokens": 4835634132.0, + "step": 1160 + }, + { + "epoch": 2.3177616787409443, + "grad_norm": 0.12507095898471088, + "learning_rate": 2.4959676135055235e-05, + "loss": 0.4843, + "num_tokens": 4839809469.0, + "step": 1161 + }, + { + "epoch": 2.319760179865101, + "grad_norm": 0.11967922875293811, + "learning_rate": 2.493667200694714e-05, + "loss": 0.5076, + "num_tokens": 4843995445.0, + "step": 1162 + }, + { + "epoch": 2.321758680989258, + "grad_norm": 0.12078043507784701, + "learning_rate": 2.4913662950147466e-05, + "loss": 0.4904, + "num_tokens": 4848177584.0, + "step": 1163 + }, + { + "epoch": 2.323757182113415, + "grad_norm": 0.10833521725958634, + "learning_rate": 2.4890649003272897e-05, + "loss": 0.5104, + "num_tokens": 4852364549.0, + "step": 1164 + }, + { + "epoch": 2.325755683237572, + "grad_norm": 0.15334459984771, + "learning_rate": 2.4867630204948344e-05, + "loss": 0.4857, + "num_tokens": 4856541320.0, + "step": 1165 + }, + { + "epoch": 2.3277541843617287, + "grad_norm": 0.10596048749316737, + "learning_rate": 2.4844606593806816e-05, + "loss": 0.5071, + "num_tokens": 4860728023.0, + "step": 1166 + }, + { + "epoch": 2.3297526854858854, + "grad_norm": 0.16808451881115094, + "learning_rate": 2.4821578208489458e-05, + "loss": 0.4865, + "num_tokens": 4864889512.0, + "step": 1167 + }, + { + "epoch": 2.3317511866100427, + "grad_norm": 0.11404108020055757, + "learning_rate": 2.4798545087645375e-05, + "loss": 0.5064, + "num_tokens": 4869045502.0, + "step": 1168 + }, + { + "epoch": 2.3337496877341994, + "grad_norm": 0.13254995076925558, + "learning_rate": 2.4775507269931646e-05, + "loss": 0.5025, + "num_tokens": 4873231796.0, + "step": 1169 + }, + { + "epoch": 2.3357481888583562, + "grad_norm": 0.12042032918737071, + "learning_rate": 2.475246479401323e-05, + "loss": 0.4968, + "num_tokens": 4877386399.0, + "step": 1170 + }, + { + "epoch": 2.337746689982513, + "grad_norm": 0.13007459034638735, + "learning_rate": 2.4729417698562905e-05, + "loss": 0.5159, + "num_tokens": 4881569330.0, + "step": 1171 + }, + { + "epoch": 2.33974519110667, + "grad_norm": 0.12574501270594438, + "learning_rate": 2.47063660222612e-05, + "loss": 0.5046, + "num_tokens": 4885750943.0, + "step": 1172 + }, + { + "epoch": 2.341743692230827, + "grad_norm": 0.13108977766135002, + "learning_rate": 2.4683309803796323e-05, + "loss": 0.4961, + "num_tokens": 4889927872.0, + "step": 1173 + }, + { + "epoch": 2.343742193354984, + "grad_norm": 0.12447358992433769, + "learning_rate": 2.4660249081864124e-05, + "loss": 0.4984, + "num_tokens": 4894115324.0, + "step": 1174 + }, + { + "epoch": 2.3457406944791406, + "grad_norm": 0.12263153789249465, + "learning_rate": 2.4637183895167992e-05, + "loss": 0.5112, + "num_tokens": 4898256833.0, + "step": 1175 + }, + { + "epoch": 2.3477391956032974, + "grad_norm": 0.13327876791080132, + "learning_rate": 2.461411428241883e-05, + "loss": 0.4952, + "num_tokens": 4902442956.0, + "step": 1176 + }, + { + "epoch": 2.3497376967274546, + "grad_norm": 0.1427814716406302, + "learning_rate": 2.4591040282334945e-05, + "loss": 0.4908, + "num_tokens": 4906623789.0, + "step": 1177 + }, + { + "epoch": 2.3517361978516114, + "grad_norm": 0.1242228137296894, + "learning_rate": 2.4567961933642028e-05, + "loss": 0.4929, + "num_tokens": 4910810088.0, + "step": 1178 + }, + { + "epoch": 2.353734698975768, + "grad_norm": 0.13105015904152717, + "learning_rate": 2.4544879275073067e-05, + "loss": 0.4948, + "num_tokens": 4914991681.0, + "step": 1179 + }, + { + "epoch": 2.355733200099925, + "grad_norm": 0.10907033503498091, + "learning_rate": 2.452179234536826e-05, + "loss": 0.5017, + "num_tokens": 4919173027.0, + "step": 1180 + }, + { + "epoch": 2.3577317012240817, + "grad_norm": 0.12888831170856316, + "learning_rate": 2.4498701183275008e-05, + "loss": 0.4885, + "num_tokens": 4923357387.0, + "step": 1181 + }, + { + "epoch": 2.359730202348239, + "grad_norm": 0.10641316403425177, + "learning_rate": 2.4475605827547778e-05, + "loss": 0.4738, + "num_tokens": 4927543918.0, + "step": 1182 + }, + { + "epoch": 2.3617287034723957, + "grad_norm": 0.11937078425027635, + "learning_rate": 2.4452506316948114e-05, + "loss": 0.4931, + "num_tokens": 4931679282.0, + "step": 1183 + }, + { + "epoch": 2.3637272045965525, + "grad_norm": 0.1075076206200448, + "learning_rate": 2.44294026902445e-05, + "loss": 0.5058, + "num_tokens": 4935819833.0, + "step": 1184 + }, + { + "epoch": 2.3657257057207093, + "grad_norm": 0.1306047466018927, + "learning_rate": 2.4406294986212357e-05, + "loss": 0.5022, + "num_tokens": 4940005624.0, + "step": 1185 + }, + { + "epoch": 2.3677242068448665, + "grad_norm": 0.1051232598350472, + "learning_rate": 2.438318324363392e-05, + "loss": 0.5137, + "num_tokens": 4944161254.0, + "step": 1186 + }, + { + "epoch": 2.3697227079690233, + "grad_norm": 0.11679400991700495, + "learning_rate": 2.4360067501298222e-05, + "loss": 0.4945, + "num_tokens": 4948346632.0, + "step": 1187 + }, + { + "epoch": 2.37172120909318, + "grad_norm": 0.10829714199527313, + "learning_rate": 2.4336947798001e-05, + "loss": 0.4962, + "num_tokens": 4952532073.0, + "step": 1188 + }, + { + "epoch": 2.373719710217337, + "grad_norm": 0.1328541164400745, + "learning_rate": 2.431382417254466e-05, + "loss": 0.5023, + "num_tokens": 4956696377.0, + "step": 1189 + }, + { + "epoch": 2.3757182113414936, + "grad_norm": 0.10861955777012862, + "learning_rate": 2.429069666373815e-05, + "loss": 0.5035, + "num_tokens": 4960879895.0, + "step": 1190 + }, + { + "epoch": 2.377716712465651, + "grad_norm": 0.12669868668512083, + "learning_rate": 2.4267565310396967e-05, + "loss": 0.5176, + "num_tokens": 4965063099.0, + "step": 1191 + }, + { + "epoch": 2.3797152135898076, + "grad_norm": 0.16525823935202236, + "learning_rate": 2.4244430151343053e-05, + "loss": 0.5123, + "num_tokens": 4969207170.0, + "step": 1192 + }, + { + "epoch": 2.3817137147139644, + "grad_norm": 0.10896796671093058, + "learning_rate": 2.422129122540473e-05, + "loss": 0.5078, + "num_tokens": 4973357014.0, + "step": 1193 + }, + { + "epoch": 2.3837122158381217, + "grad_norm": 0.17130647964355425, + "learning_rate": 2.419814857141666e-05, + "loss": 0.4892, + "num_tokens": 4977532650.0, + "step": 1194 + }, + { + "epoch": 2.3857107169622784, + "grad_norm": 0.10955494843143793, + "learning_rate": 2.4175002228219736e-05, + "loss": 0.5189, + "num_tokens": 4981707808.0, + "step": 1195 + }, + { + "epoch": 2.387709218086435, + "grad_norm": 0.14270727690795754, + "learning_rate": 2.4151852234661063e-05, + "loss": 0.4856, + "num_tokens": 4985893727.0, + "step": 1196 + }, + { + "epoch": 2.389707719210592, + "grad_norm": 0.10244074670500411, + "learning_rate": 2.4128698629593867e-05, + "loss": 0.4823, + "num_tokens": 4990078822.0, + "step": 1197 + }, + { + "epoch": 2.391706220334749, + "grad_norm": 0.11953742782257755, + "learning_rate": 2.4105541451877438e-05, + "loss": 0.4984, + "num_tokens": 4994260453.0, + "step": 1198 + }, + { + "epoch": 2.393704721458906, + "grad_norm": 0.12134697400585774, + "learning_rate": 2.408238074037704e-05, + "loss": 0.5067, + "num_tokens": 4998441573.0, + "step": 1199 + }, + { + "epoch": 2.395703222583063, + "grad_norm": 0.11869637919150698, + "learning_rate": 2.4059216533963917e-05, + "loss": 0.4922, + "num_tokens": 5002628255.0, + "step": 1200 + }, + { + "epoch": 2.3977017237072196, + "grad_norm": 0.1297473530784867, + "learning_rate": 2.403604887151512e-05, + "loss": 0.5011, + "num_tokens": 5006785629.0, + "step": 1201 + }, + { + "epoch": 2.3997002248313763, + "grad_norm": 0.1130199931757571, + "learning_rate": 2.4012877791913544e-05, + "loss": 0.4992, + "num_tokens": 5010964798.0, + "step": 1202 + }, + { + "epoch": 2.4016987259555336, + "grad_norm": 0.13239269237181875, + "learning_rate": 2.398970333404779e-05, + "loss": 0.4993, + "num_tokens": 5015116002.0, + "step": 1203 + }, + { + "epoch": 2.4036972270796904, + "grad_norm": 0.16562864943503894, + "learning_rate": 2.396652553681216e-05, + "loss": 0.4998, + "num_tokens": 5019295459.0, + "step": 1204 + }, + { + "epoch": 2.405695728203847, + "grad_norm": 0.1277690076542446, + "learning_rate": 2.3943344439106527e-05, + "loss": 0.5, + "num_tokens": 5023481673.0, + "step": 1205 + }, + { + "epoch": 2.407694229328004, + "grad_norm": 0.1874862512987499, + "learning_rate": 2.3920160079836317e-05, + "loss": 0.493, + "num_tokens": 5027668517.0, + "step": 1206 + }, + { + "epoch": 2.4096927304521607, + "grad_norm": 0.13015305822459008, + "learning_rate": 2.3896972497912453e-05, + "loss": 0.5124, + "num_tokens": 5031852646.0, + "step": 1207 + }, + { + "epoch": 2.411691231576318, + "grad_norm": 0.1526061893096601, + "learning_rate": 2.3873781732251224e-05, + "loss": 0.4962, + "num_tokens": 5036036149.0, + "step": 1208 + }, + { + "epoch": 2.4136897327004747, + "grad_norm": 0.12812564844213037, + "learning_rate": 2.3850587821774292e-05, + "loss": 0.4924, + "num_tokens": 5040156285.0, + "step": 1209 + }, + { + "epoch": 2.4156882338246315, + "grad_norm": 0.13432844172030003, + "learning_rate": 2.382739080540859e-05, + "loss": 0.5031, + "num_tokens": 5044311143.0, + "step": 1210 + }, + { + "epoch": 2.4176867349487883, + "grad_norm": 0.13155130909605728, + "learning_rate": 2.380419072208626e-05, + "loss": 0.5034, + "num_tokens": 5048495876.0, + "step": 1211 + }, + { + "epoch": 2.4196852360729455, + "grad_norm": 0.13771668515685392, + "learning_rate": 2.3780987610744598e-05, + "loss": 0.5172, + "num_tokens": 5052681816.0, + "step": 1212 + }, + { + "epoch": 2.4216837371971023, + "grad_norm": 0.11929725453169247, + "learning_rate": 2.3757781510325976e-05, + "loss": 0.5054, + "num_tokens": 5056858989.0, + "step": 1213 + }, + { + "epoch": 2.423682238321259, + "grad_norm": 0.12107491122345616, + "learning_rate": 2.3734572459777786e-05, + "loss": 0.4921, + "num_tokens": 5061017735.0, + "step": 1214 + }, + { + "epoch": 2.425680739445416, + "grad_norm": 0.10145502719862441, + "learning_rate": 2.371136049805236e-05, + "loss": 0.5051, + "num_tokens": 5065180241.0, + "step": 1215 + }, + { + "epoch": 2.4276792405695726, + "grad_norm": 0.13211435714341319, + "learning_rate": 2.3688145664106937e-05, + "loss": 0.5023, + "num_tokens": 5069365609.0, + "step": 1216 + }, + { + "epoch": 2.42967774169373, + "grad_norm": 0.11148224789351474, + "learning_rate": 2.366492799690356e-05, + "loss": 0.5002, + "num_tokens": 5073552456.0, + "step": 1217 + }, + { + "epoch": 2.4316762428178866, + "grad_norm": 0.127905331705515, + "learning_rate": 2.364170753540903e-05, + "loss": 0.5173, + "num_tokens": 5077736765.0, + "step": 1218 + }, + { + "epoch": 2.4336747439420434, + "grad_norm": 0.09710013903830438, + "learning_rate": 2.3618484318594855e-05, + "loss": 0.4875, + "num_tokens": 5081909719.0, + "step": 1219 + }, + { + "epoch": 2.4356732450662, + "grad_norm": 0.12691182086581754, + "learning_rate": 2.3595258385437134e-05, + "loss": 0.5168, + "num_tokens": 5086027394.0, + "step": 1220 + }, + { + "epoch": 2.4376717461903574, + "grad_norm": 0.10769002358371045, + "learning_rate": 2.3572029774916555e-05, + "loss": 0.5143, + "num_tokens": 5090210961.0, + "step": 1221 + }, + { + "epoch": 2.439670247314514, + "grad_norm": 0.12428778215726967, + "learning_rate": 2.354879852601828e-05, + "loss": 0.5013, + "num_tokens": 5094395987.0, + "step": 1222 + }, + { + "epoch": 2.441668748438671, + "grad_norm": 0.11227000092051866, + "learning_rate": 2.3525564677731917e-05, + "loss": 0.5164, + "num_tokens": 5098580706.0, + "step": 1223 + }, + { + "epoch": 2.4436672495628278, + "grad_norm": 0.12291227600727747, + "learning_rate": 2.350232826905142e-05, + "loss": 0.5055, + "num_tokens": 5102733216.0, + "step": 1224 + }, + { + "epoch": 2.4456657506869846, + "grad_norm": 0.12599242194686705, + "learning_rate": 2.3479089338975044e-05, + "loss": 0.4942, + "num_tokens": 5106892043.0, + "step": 1225 + }, + { + "epoch": 2.4476642518111418, + "grad_norm": 0.12535029015001073, + "learning_rate": 2.3455847926505283e-05, + "loss": 0.4974, + "num_tokens": 5111076856.0, + "step": 1226 + }, + { + "epoch": 2.4496627529352986, + "grad_norm": 0.109396569030111, + "learning_rate": 2.3432604070648783e-05, + "loss": 0.496, + "num_tokens": 5115261421.0, + "step": 1227 + }, + { + "epoch": 2.4516612540594553, + "grad_norm": 0.1100106452333304, + "learning_rate": 2.3409357810416315e-05, + "loss": 0.5084, + "num_tokens": 5119445698.0, + "step": 1228 + }, + { + "epoch": 2.453659755183612, + "grad_norm": 0.10937879615197418, + "learning_rate": 2.3386109184822668e-05, + "loss": 0.5079, + "num_tokens": 5123594718.0, + "step": 1229 + }, + { + "epoch": 2.4556582563077693, + "grad_norm": 0.12490743510974332, + "learning_rate": 2.3362858232886598e-05, + "loss": 0.5128, + "num_tokens": 5127779612.0, + "step": 1230 + }, + { + "epoch": 2.457656757431926, + "grad_norm": 0.12112217600028634, + "learning_rate": 2.3339604993630776e-05, + "loss": 0.4969, + "num_tokens": 5131961006.0, + "step": 1231 + }, + { + "epoch": 2.459655258556083, + "grad_norm": 0.11429654602415301, + "learning_rate": 2.3316349506081698e-05, + "loss": 0.4979, + "num_tokens": 5136144632.0, + "step": 1232 + }, + { + "epoch": 2.4616537596802397, + "grad_norm": 0.12456331197313228, + "learning_rate": 2.3293091809269655e-05, + "loss": 0.4897, + "num_tokens": 5140329700.0, + "step": 1233 + }, + { + "epoch": 2.4636522608043965, + "grad_norm": 0.120908817869444, + "learning_rate": 2.3269831942228622e-05, + "loss": 0.4987, + "num_tokens": 5144514382.0, + "step": 1234 + }, + { + "epoch": 2.4656507619285537, + "grad_norm": 0.1159243335379237, + "learning_rate": 2.3246569943996235e-05, + "loss": 0.5068, + "num_tokens": 5148697648.0, + "step": 1235 + }, + { + "epoch": 2.4676492630527105, + "grad_norm": 0.09998210788138946, + "learning_rate": 2.3223305853613694e-05, + "loss": 0.5046, + "num_tokens": 5152882457.0, + "step": 1236 + }, + { + "epoch": 2.4696477641768673, + "grad_norm": 0.12569567747627472, + "learning_rate": 2.3200039710125716e-05, + "loss": 0.4976, + "num_tokens": 5157063108.0, + "step": 1237 + }, + { + "epoch": 2.471646265301024, + "grad_norm": 0.10324840114309243, + "learning_rate": 2.317677155258047e-05, + "loss": 0.5015, + "num_tokens": 5161224524.0, + "step": 1238 + }, + { + "epoch": 2.4736447664251813, + "grad_norm": 0.11903093158035, + "learning_rate": 2.315350142002949e-05, + "loss": 0.5016, + "num_tokens": 5165408457.0, + "step": 1239 + }, + { + "epoch": 2.475643267549338, + "grad_norm": 0.11467358903279407, + "learning_rate": 2.3130229351527645e-05, + "loss": 0.4991, + "num_tokens": 5169591286.0, + "step": 1240 + }, + { + "epoch": 2.477641768673495, + "grad_norm": 0.12835477909915038, + "learning_rate": 2.310695538613303e-05, + "loss": 0.4912, + "num_tokens": 5173718116.0, + "step": 1241 + }, + { + "epoch": 2.4796402697976516, + "grad_norm": 0.11140949204443108, + "learning_rate": 2.308367956290694e-05, + "loss": 0.5194, + "num_tokens": 5177901785.0, + "step": 1242 + }, + { + "epoch": 2.4816387709218084, + "grad_norm": 0.12703759755041857, + "learning_rate": 2.306040192091378e-05, + "loss": 0.5192, + "num_tokens": 5182088009.0, + "step": 1243 + }, + { + "epoch": 2.4836372720459656, + "grad_norm": 0.11252958271420434, + "learning_rate": 2.3037122499221013e-05, + "loss": 0.4939, + "num_tokens": 5186273382.0, + "step": 1244 + }, + { + "epoch": 2.4856357731701224, + "grad_norm": 0.10657171703940937, + "learning_rate": 2.3013841336899086e-05, + "loss": 0.5111, + "num_tokens": 5190459307.0, + "step": 1245 + }, + { + "epoch": 2.487634274294279, + "grad_norm": 0.10763564384881176, + "learning_rate": 2.299055847302137e-05, + "loss": 0.4925, + "num_tokens": 5194643367.0, + "step": 1246 + }, + { + "epoch": 2.4896327754184364, + "grad_norm": 0.11356577645738612, + "learning_rate": 2.2967273946664092e-05, + "loss": 0.5054, + "num_tokens": 5198823986.0, + "step": 1247 + }, + { + "epoch": 2.491631276542593, + "grad_norm": 0.13920597380875663, + "learning_rate": 2.2943987796906253e-05, + "loss": 0.5038, + "num_tokens": 5203011040.0, + "step": 1248 + }, + { + "epoch": 2.49362977766675, + "grad_norm": 0.11007369949003722, + "learning_rate": 2.2920700062829613e-05, + "loss": 0.4953, + "num_tokens": 5207198825.0, + "step": 1249 + }, + { + "epoch": 2.4956282787909068, + "grad_norm": 0.13215965799029347, + "learning_rate": 2.2897410783518547e-05, + "loss": 0.5095, + "num_tokens": 5211385511.0, + "step": 1250 + }, + { + "epoch": 2.4976267799150635, + "grad_norm": 0.10676941032599821, + "learning_rate": 2.287411999806007e-05, + "loss": 0.4854, + "num_tokens": 5215561699.0, + "step": 1251 + }, + { + "epoch": 2.4996252810392208, + "grad_norm": 0.1080030920246531, + "learning_rate": 2.2850827745543694e-05, + "loss": 0.4945, + "num_tokens": 5219692734.0, + "step": 1252 + }, + { + "epoch": 2.5016237821633776, + "grad_norm": 0.1093574668953215, + "learning_rate": 2.2827534065061388e-05, + "loss": 0.494, + "num_tokens": 5223877496.0, + "step": 1253 + }, + { + "epoch": 2.5036222832875343, + "grad_norm": 0.1168432670432424, + "learning_rate": 2.280423899570755e-05, + "loss": 0.4957, + "num_tokens": 5228019974.0, + "step": 1254 + }, + { + "epoch": 2.505620784411691, + "grad_norm": 0.10514353263926626, + "learning_rate": 2.278094257657887e-05, + "loss": 0.5042, + "num_tokens": 5232180436.0, + "step": 1255 + }, + { + "epoch": 2.5076192855358483, + "grad_norm": 0.1074301856677173, + "learning_rate": 2.2757644846774342e-05, + "loss": 0.4969, + "num_tokens": 5236342389.0, + "step": 1256 + }, + { + "epoch": 2.509617786660005, + "grad_norm": 0.10091473229419083, + "learning_rate": 2.273434584539512e-05, + "loss": 0.5038, + "num_tokens": 5240517850.0, + "step": 1257 + }, + { + "epoch": 2.511616287784162, + "grad_norm": 0.12241845669102427, + "learning_rate": 2.2711045611544537e-05, + "loss": 0.499, + "num_tokens": 5244685606.0, + "step": 1258 + }, + { + "epoch": 2.5136147889083187, + "grad_norm": 0.11873622080687184, + "learning_rate": 2.2687744184327952e-05, + "loss": 0.5029, + "num_tokens": 5248868275.0, + "step": 1259 + }, + { + "epoch": 2.5156132900324755, + "grad_norm": 0.10606789610689146, + "learning_rate": 2.266444160285275e-05, + "loss": 0.5001, + "num_tokens": 5253027963.0, + "step": 1260 + }, + { + "epoch": 2.5176117911566323, + "grad_norm": 0.13198387253554786, + "learning_rate": 2.2641137906228248e-05, + "loss": 0.4903, + "num_tokens": 5257200829.0, + "step": 1261 + }, + { + "epoch": 2.5196102922807895, + "grad_norm": 0.10926446882388384, + "learning_rate": 2.2617833133565633e-05, + "loss": 0.4873, + "num_tokens": 5261375945.0, + "step": 1262 + }, + { + "epoch": 2.5216087934049463, + "grad_norm": 0.13164067404588786, + "learning_rate": 2.2594527323977908e-05, + "loss": 0.5018, + "num_tokens": 5265562079.0, + "step": 1263 + }, + { + "epoch": 2.523607294529103, + "grad_norm": 0.10582202438105051, + "learning_rate": 2.2571220516579794e-05, + "loss": 0.49, + "num_tokens": 5269749038.0, + "step": 1264 + }, + { + "epoch": 2.5256057956532603, + "grad_norm": 0.1111191852986855, + "learning_rate": 2.254791275048771e-05, + "loss": 0.4967, + "num_tokens": 5273908211.0, + "step": 1265 + }, + { + "epoch": 2.527604296777417, + "grad_norm": 0.12741177200974957, + "learning_rate": 2.252460406481968e-05, + "loss": 0.4941, + "num_tokens": 5278041427.0, + "step": 1266 + }, + { + "epoch": 2.529602797901574, + "grad_norm": 0.10503444852318541, + "learning_rate": 2.2501294498695253e-05, + "loss": 0.4958, + "num_tokens": 5282226862.0, + "step": 1267 + }, + { + "epoch": 2.5316012990257306, + "grad_norm": 0.112004089723139, + "learning_rate": 2.2477984091235477e-05, + "loss": 0.496, + "num_tokens": 5286410380.0, + "step": 1268 + }, + { + "epoch": 2.5335998001498874, + "grad_norm": 0.10854720062651563, + "learning_rate": 2.24546728815628e-05, + "loss": 0.4921, + "num_tokens": 5290557985.0, + "step": 1269 + }, + { + "epoch": 2.5355983012740446, + "grad_norm": 0.11892884936208002, + "learning_rate": 2.243136090880102e-05, + "loss": 0.4902, + "num_tokens": 5294740882.0, + "step": 1270 + }, + { + "epoch": 2.5375968023982014, + "grad_norm": 0.13497188002668686, + "learning_rate": 2.240804821207522e-05, + "loss": 0.4966, + "num_tokens": 5298926915.0, + "step": 1271 + }, + { + "epoch": 2.539595303522358, + "grad_norm": 0.1265593374728195, + "learning_rate": 2.2384734830511694e-05, + "loss": 0.5029, + "num_tokens": 5303084661.0, + "step": 1272 + }, + { + "epoch": 2.541593804646515, + "grad_norm": 0.1337669534021803, + "learning_rate": 2.236142080323788e-05, + "loss": 0.5006, + "num_tokens": 5307269738.0, + "step": 1273 + }, + { + "epoch": 2.543592305770672, + "grad_norm": 0.12159958072026333, + "learning_rate": 2.2338106169382315e-05, + "loss": 0.4966, + "num_tokens": 5311438728.0, + "step": 1274 + }, + { + "epoch": 2.545590806894829, + "grad_norm": 0.1058510185294233, + "learning_rate": 2.2314790968074533e-05, + "loss": 0.5045, + "num_tokens": 5315593420.0, + "step": 1275 + }, + { + "epoch": 2.5475893080189858, + "grad_norm": 0.13279946653274927, + "learning_rate": 2.2291475238445033e-05, + "loss": 0.4919, + "num_tokens": 5319766777.0, + "step": 1276 + }, + { + "epoch": 2.5495878091431425, + "grad_norm": 0.10307598491354347, + "learning_rate": 2.2268159019625197e-05, + "loss": 0.4986, + "num_tokens": 5323953734.0, + "step": 1277 + }, + { + "epoch": 2.5515863102672993, + "grad_norm": 0.12388259697915918, + "learning_rate": 2.2244842350747242e-05, + "loss": 0.4951, + "num_tokens": 5328139638.0, + "step": 1278 + }, + { + "epoch": 2.5535848113914565, + "grad_norm": 0.12919219517872702, + "learning_rate": 2.2221525270944117e-05, + "loss": 0.4885, + "num_tokens": 5332326404.0, + "step": 1279 + }, + { + "epoch": 2.5555833125156133, + "grad_norm": 0.10960243429302427, + "learning_rate": 2.219820781934948e-05, + "loss": 0.4853, + "num_tokens": 5336472601.0, + "step": 1280 + }, + { + "epoch": 2.55758181363977, + "grad_norm": 0.13044989496854326, + "learning_rate": 2.2174890035097592e-05, + "loss": 0.5079, + "num_tokens": 5340645440.0, + "step": 1281 + }, + { + "epoch": 2.559580314763927, + "grad_norm": 0.12752801161939256, + "learning_rate": 2.2151571957323305e-05, + "loss": 0.5132, + "num_tokens": 5344830629.0, + "step": 1282 + }, + { + "epoch": 2.561578815888084, + "grad_norm": 0.11254049166560397, + "learning_rate": 2.2128253625161934e-05, + "loss": 0.5003, + "num_tokens": 5349010667.0, + "step": 1283 + }, + { + "epoch": 2.563577317012241, + "grad_norm": 0.12744588063828424, + "learning_rate": 2.2104935077749223e-05, + "loss": 0.4987, + "num_tokens": 5353196064.0, + "step": 1284 + }, + { + "epoch": 2.5655758181363977, + "grad_norm": 0.10080555048962779, + "learning_rate": 2.2081616354221297e-05, + "loss": 0.4995, + "num_tokens": 5357376084.0, + "step": 1285 + }, + { + "epoch": 2.5675743192605545, + "grad_norm": 0.11260986619747329, + "learning_rate": 2.2058297493714563e-05, + "loss": 0.5045, + "num_tokens": 5361543268.0, + "step": 1286 + }, + { + "epoch": 2.5695728203847112, + "grad_norm": 0.10225665843411035, + "learning_rate": 2.203497853536565e-05, + "loss": 0.4959, + "num_tokens": 5365715254.0, + "step": 1287 + }, + { + "epoch": 2.5715713215088685, + "grad_norm": 0.11227282363043731, + "learning_rate": 2.2011659518311372e-05, + "loss": 0.5027, + "num_tokens": 5369877630.0, + "step": 1288 + }, + { + "epoch": 2.5735698226330253, + "grad_norm": 0.1031806350767425, + "learning_rate": 2.198834048168863e-05, + "loss": 0.504, + "num_tokens": 5374036455.0, + "step": 1289 + }, + { + "epoch": 2.575568323757182, + "grad_norm": 0.10483959278381469, + "learning_rate": 2.196502146463435e-05, + "loss": 0.5074, + "num_tokens": 5378222967.0, + "step": 1290 + }, + { + "epoch": 2.5775668248813393, + "grad_norm": 0.10118019667568633, + "learning_rate": 2.194170250628545e-05, + "loss": 0.4856, + "num_tokens": 5382408911.0, + "step": 1291 + }, + { + "epoch": 2.579565326005496, + "grad_norm": 0.0983308825976211, + "learning_rate": 2.191838364577871e-05, + "loss": 0.5019, + "num_tokens": 5386587492.0, + "step": 1292 + }, + { + "epoch": 2.581563827129653, + "grad_norm": 0.1096704869383505, + "learning_rate": 2.189506492225079e-05, + "loss": 0.4882, + "num_tokens": 5390774254.0, + "step": 1293 + }, + { + "epoch": 2.5835623282538096, + "grad_norm": 0.10390448601160149, + "learning_rate": 2.1871746374838075e-05, + "loss": 0.494, + "num_tokens": 5394953446.0, + "step": 1294 + }, + { + "epoch": 2.5855608293779664, + "grad_norm": 0.10731396046914424, + "learning_rate": 2.1848428042676697e-05, + "loss": 0.5072, + "num_tokens": 5399128803.0, + "step": 1295 + }, + { + "epoch": 2.587559330502123, + "grad_norm": 0.10468382245949627, + "learning_rate": 2.1825109964902413e-05, + "loss": 0.4915, + "num_tokens": 5403306647.0, + "step": 1296 + }, + { + "epoch": 2.5895578316262804, + "grad_norm": 0.12400665831702144, + "learning_rate": 2.180179218065053e-05, + "loss": 0.5086, + "num_tokens": 5407478292.0, + "step": 1297 + }, + { + "epoch": 2.591556332750437, + "grad_norm": 0.10743215025246532, + "learning_rate": 2.1778474729055885e-05, + "loss": 0.5131, + "num_tokens": 5411661602.0, + "step": 1298 + }, + { + "epoch": 2.593554833874594, + "grad_norm": 0.12136113766944304, + "learning_rate": 2.175515764925276e-05, + "loss": 0.493, + "num_tokens": 5415846722.0, + "step": 1299 + }, + { + "epoch": 2.595553334998751, + "grad_norm": 0.10594714276706019, + "learning_rate": 2.1731840980374805e-05, + "loss": 0.4943, + "num_tokens": 5420027736.0, + "step": 1300 + }, + { + "epoch": 2.597551836122908, + "grad_norm": 0.11754410962370701, + "learning_rate": 2.1708524761554973e-05, + "loss": 0.5014, + "num_tokens": 5424169223.0, + "step": 1301 + }, + { + "epoch": 2.5995503372470647, + "grad_norm": 0.11410418057805868, + "learning_rate": 2.168520903192548e-05, + "loss": 0.5007, + "num_tokens": 5428335165.0, + "step": 1302 + }, + { + "epoch": 2.6015488383712215, + "grad_norm": 0.10651138424998591, + "learning_rate": 2.1661893830617694e-05, + "loss": 0.501, + "num_tokens": 5432512587.0, + "step": 1303 + }, + { + "epoch": 2.6035473394953783, + "grad_norm": 0.12346942348420746, + "learning_rate": 2.163857919676212e-05, + "loss": 0.4913, + "num_tokens": 5436697613.0, + "step": 1304 + }, + { + "epoch": 2.605545840619535, + "grad_norm": 0.10839793270952533, + "learning_rate": 2.161526516948831e-05, + "loss": 0.4815, + "num_tokens": 5440859494.0, + "step": 1305 + }, + { + "epoch": 2.6075443417436923, + "grad_norm": 0.11624061274437088, + "learning_rate": 2.1591951787924784e-05, + "loss": 0.5032, + "num_tokens": 5445046101.0, + "step": 1306 + }, + { + "epoch": 2.609542842867849, + "grad_norm": 0.11813304663741825, + "learning_rate": 2.1568639091198983e-05, + "loss": 0.4909, + "num_tokens": 5449233555.0, + "step": 1307 + }, + { + "epoch": 2.611541343992006, + "grad_norm": 0.09667790844421108, + "learning_rate": 2.1545327118437204e-05, + "loss": 0.5022, + "num_tokens": 5453420781.0, + "step": 1308 + }, + { + "epoch": 2.613539845116163, + "grad_norm": 0.14420338460079674, + "learning_rate": 2.152201590876453e-05, + "loss": 0.5145, + "num_tokens": 5457607700.0, + "step": 1309 + }, + { + "epoch": 2.61553834624032, + "grad_norm": 0.10756105011986507, + "learning_rate": 2.1498705501304756e-05, + "loss": 0.4928, + "num_tokens": 5461793846.0, + "step": 1310 + }, + { + "epoch": 2.6175368473644767, + "grad_norm": 0.13847510387435397, + "learning_rate": 2.1475395935180333e-05, + "loss": 0.4895, + "num_tokens": 5465957149.0, + "step": 1311 + }, + { + "epoch": 2.6195353484886335, + "grad_norm": 0.09806964432683413, + "learning_rate": 2.1452087249512293e-05, + "loss": 0.4891, + "num_tokens": 5470122061.0, + "step": 1312 + }, + { + "epoch": 2.6215338496127902, + "grad_norm": 0.14823862953550845, + "learning_rate": 2.142877948342021e-05, + "loss": 0.4851, + "num_tokens": 5474266473.0, + "step": 1313 + }, + { + "epoch": 2.623532350736947, + "grad_norm": 0.11312177084761268, + "learning_rate": 2.1405472676022105e-05, + "loss": 0.4963, + "num_tokens": 5478416202.0, + "step": 1314 + }, + { + "epoch": 2.6255308518611042, + "grad_norm": 0.1282469303478099, + "learning_rate": 2.1382166866434373e-05, + "loss": 0.4894, + "num_tokens": 5482602511.0, + "step": 1315 + }, + { + "epoch": 2.627529352985261, + "grad_norm": 0.11186655408495733, + "learning_rate": 2.135886209377176e-05, + "loss": 0.4896, + "num_tokens": 5486786255.0, + "step": 1316 + }, + { + "epoch": 2.629527854109418, + "grad_norm": 0.11128582769886873, + "learning_rate": 2.1335558397147255e-05, + "loss": 0.4986, + "num_tokens": 5490946032.0, + "step": 1317 + }, + { + "epoch": 2.631526355233575, + "grad_norm": 0.12014713448577527, + "learning_rate": 2.1312255815672053e-05, + "loss": 0.5074, + "num_tokens": 5495128159.0, + "step": 1318 + }, + { + "epoch": 2.633524856357732, + "grad_norm": 0.1147625518811947, + "learning_rate": 2.1288954388455466e-05, + "loss": 0.497, + "num_tokens": 5499312496.0, + "step": 1319 + }, + { + "epoch": 2.6355233574818886, + "grad_norm": 0.11505230563704935, + "learning_rate": 2.126565415460488e-05, + "loss": 0.5019, + "num_tokens": 5503497369.0, + "step": 1320 + }, + { + "epoch": 2.6375218586060454, + "grad_norm": 0.11398030022286365, + "learning_rate": 2.124235515322567e-05, + "loss": 0.4858, + "num_tokens": 5507682313.0, + "step": 1321 + }, + { + "epoch": 2.639520359730202, + "grad_norm": 0.09575749836527689, + "learning_rate": 2.1219057423421132e-05, + "loss": 0.5099, + "num_tokens": 5511867113.0, + "step": 1322 + }, + { + "epoch": 2.6415188608543594, + "grad_norm": 0.13921590140306442, + "learning_rate": 2.119576100429246e-05, + "loss": 0.4886, + "num_tokens": 5516051667.0, + "step": 1323 + }, + { + "epoch": 2.643517361978516, + "grad_norm": 0.09985723232323349, + "learning_rate": 2.1172465934938618e-05, + "loss": 0.493, + "num_tokens": 5520216699.0, + "step": 1324 + }, + { + "epoch": 2.645515863102673, + "grad_norm": 0.10935546380603876, + "learning_rate": 2.1149172254456318e-05, + "loss": 0.5005, + "num_tokens": 5524400566.0, + "step": 1325 + }, + { + "epoch": 2.6475143642268297, + "grad_norm": 0.11071029689738932, + "learning_rate": 2.112588000193994e-05, + "loss": 0.4985, + "num_tokens": 5528570564.0, + "step": 1326 + }, + { + "epoch": 2.649512865350987, + "grad_norm": 0.11018253110205624, + "learning_rate": 2.1102589216481455e-05, + "loss": 0.5094, + "num_tokens": 5532724913.0, + "step": 1327 + }, + { + "epoch": 2.6515113664751437, + "grad_norm": 0.12331236640004721, + "learning_rate": 2.1079299937170396e-05, + "loss": 0.519, + "num_tokens": 5536908741.0, + "step": 1328 + }, + { + "epoch": 2.6535098675993005, + "grad_norm": 0.10718152046626414, + "learning_rate": 2.1056012203093753e-05, + "loss": 0.4903, + "num_tokens": 5541063811.0, + "step": 1329 + }, + { + "epoch": 2.6555083687234573, + "grad_norm": 0.1299204968380154, + "learning_rate": 2.103272605333592e-05, + "loss": 0.4924, + "num_tokens": 5545249099.0, + "step": 1330 + }, + { + "epoch": 2.657506869847614, + "grad_norm": 0.10443889512923875, + "learning_rate": 2.1009441526978632e-05, + "loss": 0.513, + "num_tokens": 5549434219.0, + "step": 1331 + }, + { + "epoch": 2.6595053709717713, + "grad_norm": 0.11663345690338625, + "learning_rate": 2.098615866310092e-05, + "loss": 0.5008, + "num_tokens": 5553618546.0, + "step": 1332 + }, + { + "epoch": 2.661503872095928, + "grad_norm": 0.10468694023448878, + "learning_rate": 2.0962877500778993e-05, + "loss": 0.5075, + "num_tokens": 5557791422.0, + "step": 1333 + }, + { + "epoch": 2.663502373220085, + "grad_norm": 0.10739603410308576, + "learning_rate": 2.0939598079086226e-05, + "loss": 0.5113, + "num_tokens": 5561970947.0, + "step": 1334 + }, + { + "epoch": 2.6655008743442417, + "grad_norm": 0.11835936538864736, + "learning_rate": 2.0916320437093073e-05, + "loss": 0.5037, + "num_tokens": 5566155350.0, + "step": 1335 + }, + { + "epoch": 2.667499375468399, + "grad_norm": 0.10495638753233863, + "learning_rate": 2.0893044613866978e-05, + "loss": 0.492, + "num_tokens": 5570342026.0, + "step": 1336 + }, + { + "epoch": 2.6694978765925557, + "grad_norm": 0.1158623130028439, + "learning_rate": 2.0869770648472364e-05, + "loss": 0.4892, + "num_tokens": 5574468427.0, + "step": 1337 + }, + { + "epoch": 2.6714963777167124, + "grad_norm": 0.11369181463856701, + "learning_rate": 2.0846498579970515e-05, + "loss": 0.5275, + "num_tokens": 5578653675.0, + "step": 1338 + }, + { + "epoch": 2.6734948788408692, + "grad_norm": 0.10622061848169399, + "learning_rate": 2.0823228447419534e-05, + "loss": 0.4862, + "num_tokens": 5582830953.0, + "step": 1339 + }, + { + "epoch": 2.675493379965026, + "grad_norm": 0.10620484632071024, + "learning_rate": 2.0799960289874293e-05, + "loss": 0.505, + "num_tokens": 5586961605.0, + "step": 1340 + }, + { + "epoch": 2.6774918810891832, + "grad_norm": 0.11211391673927218, + "learning_rate": 2.0776694146386308e-05, + "loss": 0.5001, + "num_tokens": 5591134208.0, + "step": 1341 + }, + { + "epoch": 2.67949038221334, + "grad_norm": 0.10586869590999144, + "learning_rate": 2.0753430056003774e-05, + "loss": 0.5037, + "num_tokens": 5595320268.0, + "step": 1342 + }, + { + "epoch": 2.681488883337497, + "grad_norm": 0.10085649185630421, + "learning_rate": 2.0730168057771383e-05, + "loss": 0.5015, + "num_tokens": 5599506276.0, + "step": 1343 + }, + { + "epoch": 2.683487384461654, + "grad_norm": 0.09824458784627471, + "learning_rate": 2.0706908190730358e-05, + "loss": 0.5042, + "num_tokens": 5603691581.0, + "step": 1344 + }, + { + "epoch": 2.685485885585811, + "grad_norm": 0.0965108474375328, + "learning_rate": 2.0683650493918308e-05, + "loss": 0.4803, + "num_tokens": 5607876713.0, + "step": 1345 + }, + { + "epoch": 2.6874843867099676, + "grad_norm": 0.11193760255114597, + "learning_rate": 2.0660395006369233e-05, + "loss": 0.5051, + "num_tokens": 5612021944.0, + "step": 1346 + }, + { + "epoch": 2.6894828878341244, + "grad_norm": 0.10678476120613194, + "learning_rate": 2.0637141767113408e-05, + "loss": 0.5007, + "num_tokens": 5616208900.0, + "step": 1347 + }, + { + "epoch": 2.691481388958281, + "grad_norm": 0.13276772331483633, + "learning_rate": 2.061389081517734e-05, + "loss": 0.5006, + "num_tokens": 5620393913.0, + "step": 1348 + }, + { + "epoch": 2.693479890082438, + "grad_norm": 0.11482818349218914, + "learning_rate": 2.059064218958369e-05, + "loss": 0.5061, + "num_tokens": 5624579265.0, + "step": 1349 + }, + { + "epoch": 2.695478391206595, + "grad_norm": 0.1295011378537167, + "learning_rate": 2.056739592935122e-05, + "loss": 0.5048, + "num_tokens": 5628763026.0, + "step": 1350 + }, + { + "epoch": 2.697476892330752, + "grad_norm": 0.10116065219495526, + "learning_rate": 2.054415207349473e-05, + "loss": 0.4881, + "num_tokens": 5632945939.0, + "step": 1351 + }, + { + "epoch": 2.6994753934549087, + "grad_norm": 0.12138274824132843, + "learning_rate": 2.0520910661024965e-05, + "loss": 0.4977, + "num_tokens": 5637113017.0, + "step": 1352 + }, + { + "epoch": 2.701473894579066, + "grad_norm": 0.1076445489138784, + "learning_rate": 2.0497671730948592e-05, + "loss": 0.5128, + "num_tokens": 5641300408.0, + "step": 1353 + }, + { + "epoch": 2.7034723957032227, + "grad_norm": 0.10147754550452394, + "learning_rate": 2.0474435322268095e-05, + "loss": 0.4806, + "num_tokens": 5645468150.0, + "step": 1354 + }, + { + "epoch": 2.7054708968273795, + "grad_norm": 0.09968046307780418, + "learning_rate": 2.045120147398172e-05, + "loss": 0.5186, + "num_tokens": 5649652964.0, + "step": 1355 + }, + { + "epoch": 2.7074693979515363, + "grad_norm": 0.09587770616146357, + "learning_rate": 2.0427970225083454e-05, + "loss": 0.5036, + "num_tokens": 5653815282.0, + "step": 1356 + }, + { + "epoch": 2.709467899075693, + "grad_norm": 0.10913455420990575, + "learning_rate": 2.0404741614562868e-05, + "loss": 0.4937, + "num_tokens": 5657989344.0, + "step": 1357 + }, + { + "epoch": 2.71146640019985, + "grad_norm": 0.09663981779478398, + "learning_rate": 2.0381515681405154e-05, + "loss": 0.5106, + "num_tokens": 5662153008.0, + "step": 1358 + }, + { + "epoch": 2.713464901324007, + "grad_norm": 0.09923103113194517, + "learning_rate": 2.035829246459097e-05, + "loss": 0.4926, + "num_tokens": 5666338656.0, + "step": 1359 + }, + { + "epoch": 2.715463402448164, + "grad_norm": 0.10748470350399128, + "learning_rate": 2.033507200309645e-05, + "loss": 0.5086, + "num_tokens": 5670510146.0, + "step": 1360 + }, + { + "epoch": 2.7174619035723206, + "grad_norm": 0.12240132245010168, + "learning_rate": 2.0311854335893072e-05, + "loss": 0.4998, + "num_tokens": 5674684293.0, + "step": 1361 + }, + { + "epoch": 2.719460404696478, + "grad_norm": 0.11378432614457229, + "learning_rate": 2.028863950194765e-05, + "loss": 0.492, + "num_tokens": 5678857544.0, + "step": 1362 + }, + { + "epoch": 2.7214589058206347, + "grad_norm": 0.11526148007007117, + "learning_rate": 2.0265427540222226e-05, + "loss": 0.505, + "num_tokens": 5683013276.0, + "step": 1363 + }, + { + "epoch": 2.7234574069447914, + "grad_norm": 0.12887869404711708, + "learning_rate": 2.024221848967403e-05, + "loss": 0.5024, + "num_tokens": 5687198293.0, + "step": 1364 + }, + { + "epoch": 2.725455908068948, + "grad_norm": 0.10376344378785779, + "learning_rate": 2.0219012389255407e-05, + "loss": 0.4856, + "num_tokens": 5691367418.0, + "step": 1365 + }, + { + "epoch": 2.727454409193105, + "grad_norm": 0.09876291532392181, + "learning_rate": 2.0195809277913745e-05, + "loss": 0.4963, + "num_tokens": 5695519650.0, + "step": 1366 + }, + { + "epoch": 2.729452910317262, + "grad_norm": 0.11842697232391951, + "learning_rate": 2.017260919459142e-05, + "loss": 0.5044, + "num_tokens": 5699704295.0, + "step": 1367 + }, + { + "epoch": 2.731451411441419, + "grad_norm": 0.09763875610397992, + "learning_rate": 2.0149412178225717e-05, + "loss": 0.4857, + "num_tokens": 5703839181.0, + "step": 1368 + }, + { + "epoch": 2.733449912565576, + "grad_norm": 0.11272332253425842, + "learning_rate": 2.012621826774878e-05, + "loss": 0.4921, + "num_tokens": 5708012210.0, + "step": 1369 + }, + { + "epoch": 2.7354484136897326, + "grad_norm": 0.12217440334845911, + "learning_rate": 2.0103027502087556e-05, + "loss": 0.513, + "num_tokens": 5712196713.0, + "step": 1370 + }, + { + "epoch": 2.73744691481389, + "grad_norm": 0.09730307256290921, + "learning_rate": 2.0079839920163685e-05, + "loss": 0.4981, + "num_tokens": 5716370223.0, + "step": 1371 + }, + { + "epoch": 2.7394454159380466, + "grad_norm": 0.11214704753194889, + "learning_rate": 2.0056655560893485e-05, + "loss": 0.5063, + "num_tokens": 5720555176.0, + "step": 1372 + }, + { + "epoch": 2.7414439170622034, + "grad_norm": 0.10615330069078441, + "learning_rate": 2.003347446318785e-05, + "loss": 0.4991, + "num_tokens": 5724740715.0, + "step": 1373 + }, + { + "epoch": 2.74344241818636, + "grad_norm": 0.10464369740120202, + "learning_rate": 2.001029666595221e-05, + "loss": 0.4924, + "num_tokens": 5728926564.0, + "step": 1374 + }, + { + "epoch": 2.745440919310517, + "grad_norm": 0.11286182554416609, + "learning_rate": 1.9987122208086465e-05, + "loss": 0.5066, + "num_tokens": 5733089427.0, + "step": 1375 + }, + { + "epoch": 2.747439420434674, + "grad_norm": 0.11299995445792203, + "learning_rate": 1.9963951128484886e-05, + "loss": 0.497, + "num_tokens": 5737244617.0, + "step": 1376 + }, + { + "epoch": 2.749437921558831, + "grad_norm": 0.10740934382363586, + "learning_rate": 1.9940783466036095e-05, + "loss": 0.4892, + "num_tokens": 5741400958.0, + "step": 1377 + }, + { + "epoch": 2.7514364226829877, + "grad_norm": 0.10546637811432323, + "learning_rate": 1.9917619259622957e-05, + "loss": 0.5005, + "num_tokens": 5745584805.0, + "step": 1378 + }, + { + "epoch": 2.7534349238071445, + "grad_norm": 0.11733764460261821, + "learning_rate": 1.989445854812257e-05, + "loss": 0.4926, + "num_tokens": 5749766453.0, + "step": 1379 + }, + { + "epoch": 2.7554334249313017, + "grad_norm": 0.10318122432789108, + "learning_rate": 1.987130137040614e-05, + "loss": 0.5045, + "num_tokens": 5753948733.0, + "step": 1380 + }, + { + "epoch": 2.7574319260554585, + "grad_norm": 0.12119298304278756, + "learning_rate": 1.9848147765338943e-05, + "loss": 0.4972, + "num_tokens": 5758133766.0, + "step": 1381 + }, + { + "epoch": 2.7594304271796153, + "grad_norm": 0.11030203751355822, + "learning_rate": 1.9824997771780276e-05, + "loss": 0.505, + "num_tokens": 5762319292.0, + "step": 1382 + }, + { + "epoch": 2.761428928303772, + "grad_norm": 0.09012310066266788, + "learning_rate": 1.9801851428583344e-05, + "loss": 0.5137, + "num_tokens": 5766502449.0, + "step": 1383 + }, + { + "epoch": 2.763427429427929, + "grad_norm": 0.11506286239496573, + "learning_rate": 1.9778708774595272e-05, + "loss": 0.5086, + "num_tokens": 5770687055.0, + "step": 1384 + }, + { + "epoch": 2.765425930552086, + "grad_norm": 0.10547558095100516, + "learning_rate": 1.9755569848656956e-05, + "loss": 0.4959, + "num_tokens": 5774871125.0, + "step": 1385 + }, + { + "epoch": 2.767424431676243, + "grad_norm": 0.10559444400419582, + "learning_rate": 1.9732434689603042e-05, + "loss": 0.5073, + "num_tokens": 5779057844.0, + "step": 1386 + }, + { + "epoch": 2.7694229328003996, + "grad_norm": 0.11631705050742168, + "learning_rate": 1.970930333626186e-05, + "loss": 0.5091, + "num_tokens": 5783217101.0, + "step": 1387 + }, + { + "epoch": 2.7714214339245564, + "grad_norm": 0.10819455344175978, + "learning_rate": 1.9686175827455342e-05, + "loss": 0.4967, + "num_tokens": 5787398367.0, + "step": 1388 + }, + { + "epoch": 2.7734199350487136, + "grad_norm": 0.10450087302463126, + "learning_rate": 1.9663052201999e-05, + "loss": 0.5068, + "num_tokens": 5791580744.0, + "step": 1389 + }, + { + "epoch": 2.7754184361728704, + "grad_norm": 0.13423666354027372, + "learning_rate": 1.9639932498701783e-05, + "loss": 0.4963, + "num_tokens": 5795766984.0, + "step": 1390 + }, + { + "epoch": 2.777416937297027, + "grad_norm": 0.10782711491283269, + "learning_rate": 1.9616816756366092e-05, + "loss": 0.4946, + "num_tokens": 5799862047.0, + "step": 1391 + }, + { + "epoch": 2.779415438421184, + "grad_norm": 0.12633584181294033, + "learning_rate": 1.959370501378765e-05, + "loss": 0.5013, + "num_tokens": 5804048213.0, + "step": 1392 + }, + { + "epoch": 2.7814139395453408, + "grad_norm": 0.1184370613933914, + "learning_rate": 1.95705973097555e-05, + "loss": 0.4955, + "num_tokens": 5808231927.0, + "step": 1393 + }, + { + "epoch": 2.783412440669498, + "grad_norm": 0.10643822388556146, + "learning_rate": 1.954749368305189e-05, + "loss": 0.4852, + "num_tokens": 5812416568.0, + "step": 1394 + }, + { + "epoch": 2.785410941793655, + "grad_norm": 0.11949186982596503, + "learning_rate": 1.952439417245223e-05, + "loss": 0.4871, + "num_tokens": 5816601237.0, + "step": 1395 + }, + { + "epoch": 2.7874094429178116, + "grad_norm": 0.11544425205509622, + "learning_rate": 1.9501298816725004e-05, + "loss": 0.5007, + "num_tokens": 5820785877.0, + "step": 1396 + }, + { + "epoch": 2.789407944041969, + "grad_norm": 0.1108035861610097, + "learning_rate": 1.9478207654631746e-05, + "loss": 0.51, + "num_tokens": 5824957996.0, + "step": 1397 + }, + { + "epoch": 2.7914064451661256, + "grad_norm": 0.11685695096566748, + "learning_rate": 1.9455120724926942e-05, + "loss": 0.4887, + "num_tokens": 5829142565.0, + "step": 1398 + }, + { + "epoch": 2.7934049462902824, + "grad_norm": 0.11317438867345131, + "learning_rate": 1.9432038066357974e-05, + "loss": 0.4966, + "num_tokens": 5833319007.0, + "step": 1399 + }, + { + "epoch": 2.795403447414439, + "grad_norm": 0.11313460327740728, + "learning_rate": 1.9408959717665058e-05, + "loss": 0.5047, + "num_tokens": 5837504166.0, + "step": 1400 + }, + { + "epoch": 2.797401948538596, + "grad_norm": 0.11911523539460424, + "learning_rate": 1.9385885717581182e-05, + "loss": 0.4944, + "num_tokens": 5841687405.0, + "step": 1401 + }, + { + "epoch": 2.7994004496627527, + "grad_norm": 0.11446255249600344, + "learning_rate": 1.936281610483201e-05, + "loss": 0.4959, + "num_tokens": 5845850765.0, + "step": 1402 + }, + { + "epoch": 2.80139895078691, + "grad_norm": 0.13279463145540496, + "learning_rate": 1.9339750918135882e-05, + "loss": 0.5099, + "num_tokens": 5850011118.0, + "step": 1403 + }, + { + "epoch": 2.8033974519110667, + "grad_norm": 0.11058637494739637, + "learning_rate": 1.9316690196203683e-05, + "loss": 0.4816, + "num_tokens": 5854196825.0, + "step": 1404 + }, + { + "epoch": 2.8053959530352235, + "grad_norm": 0.1559821255030015, + "learning_rate": 1.929363397773881e-05, + "loss": 0.5051, + "num_tokens": 5858355653.0, + "step": 1405 + }, + { + "epoch": 2.8073944541593807, + "grad_norm": 0.10839705485954405, + "learning_rate": 1.9270582301437097e-05, + "loss": 0.5129, + "num_tokens": 5862513779.0, + "step": 1406 + }, + { + "epoch": 2.8093929552835375, + "grad_norm": 0.13633951162934305, + "learning_rate": 1.9247535205986775e-05, + "loss": 0.5048, + "num_tokens": 5866698627.0, + "step": 1407 + }, + { + "epoch": 2.8113914564076943, + "grad_norm": 0.09922965624977526, + "learning_rate": 1.922449273006836e-05, + "loss": 0.4951, + "num_tokens": 5870854595.0, + "step": 1408 + }, + { + "epoch": 2.813389957531851, + "grad_norm": 0.12267683241381508, + "learning_rate": 1.9201454912354634e-05, + "loss": 0.4896, + "num_tokens": 5875037784.0, + "step": 1409 + }, + { + "epoch": 2.815388458656008, + "grad_norm": 0.11374414096445844, + "learning_rate": 1.9178421791510554e-05, + "loss": 0.5032, + "num_tokens": 5879223389.0, + "step": 1410 + }, + { + "epoch": 2.8173869597801646, + "grad_norm": 0.12503401570296588, + "learning_rate": 1.915539340619318e-05, + "loss": 0.4862, + "num_tokens": 5883409657.0, + "step": 1411 + }, + { + "epoch": 2.819385460904322, + "grad_norm": 0.11754355739257327, + "learning_rate": 1.9132369795051665e-05, + "loss": 0.4998, + "num_tokens": 5887564682.0, + "step": 1412 + }, + { + "epoch": 2.8213839620284786, + "grad_norm": 0.1236943953390612, + "learning_rate": 1.9109350996727105e-05, + "loss": 0.5144, + "num_tokens": 5891726016.0, + "step": 1413 + }, + { + "epoch": 2.8233824631526354, + "grad_norm": 0.10378219750948206, + "learning_rate": 1.9086337049852543e-05, + "loss": 0.4898, + "num_tokens": 5895910693.0, + "step": 1414 + }, + { + "epoch": 2.8253809642767926, + "grad_norm": 0.11867301642572158, + "learning_rate": 1.9063327993052874e-05, + "loss": 0.5009, + "num_tokens": 5900093271.0, + "step": 1415 + }, + { + "epoch": 2.8273794654009494, + "grad_norm": 0.10012404730539574, + "learning_rate": 1.9040323864944774e-05, + "loss": 0.5012, + "num_tokens": 5904250499.0, + "step": 1416 + }, + { + "epoch": 2.829377966525106, + "grad_norm": 0.1049377840823038, + "learning_rate": 1.9017324704136674e-05, + "loss": 0.4986, + "num_tokens": 5908432560.0, + "step": 1417 + }, + { + "epoch": 2.831376467649263, + "grad_norm": 0.11891338510484177, + "learning_rate": 1.8994330549228642e-05, + "loss": 0.4911, + "num_tokens": 5912618211.0, + "step": 1418 + }, + { + "epoch": 2.8333749687734198, + "grad_norm": 0.10622524871899107, + "learning_rate": 1.8971341438812362e-05, + "loss": 0.487, + "num_tokens": 5916804597.0, + "step": 1419 + }, + { + "epoch": 2.8353734698975765, + "grad_norm": 0.11006742286946046, + "learning_rate": 1.894835741147104e-05, + "loss": 0.5065, + "num_tokens": 5920951841.0, + "step": 1420 + }, + { + "epoch": 2.8373719710217338, + "grad_norm": 0.0987473263111996, + "learning_rate": 1.892537850577935e-05, + "loss": 0.499, + "num_tokens": 5925090068.0, + "step": 1421 + }, + { + "epoch": 2.8393704721458906, + "grad_norm": 0.12058147570857128, + "learning_rate": 1.8902404760303402e-05, + "loss": 0.4934, + "num_tokens": 5929234868.0, + "step": 1422 + }, + { + "epoch": 2.8413689732700473, + "grad_norm": 0.1022969147262882, + "learning_rate": 1.8879436213600587e-05, + "loss": 0.5061, + "num_tokens": 5933394596.0, + "step": 1423 + }, + { + "epoch": 2.8433674743942046, + "grad_norm": 0.11857229627205663, + "learning_rate": 1.8856472904219623e-05, + "loss": 0.496, + "num_tokens": 5937546324.0, + "step": 1424 + }, + { + "epoch": 2.8453659755183613, + "grad_norm": 0.12206184714348256, + "learning_rate": 1.8833514870700396e-05, + "loss": 0.4919, + "num_tokens": 5941691202.0, + "step": 1425 + }, + { + "epoch": 2.847364476642518, + "grad_norm": 0.1013398669574232, + "learning_rate": 1.8810562151573993e-05, + "loss": 0.4877, + "num_tokens": 5945861017.0, + "step": 1426 + }, + { + "epoch": 2.849362977766675, + "grad_norm": 0.11045220611379837, + "learning_rate": 1.8787614785362526e-05, + "loss": 0.489, + "num_tokens": 5950038104.0, + "step": 1427 + }, + { + "epoch": 2.8513614788908317, + "grad_norm": 0.11537964053236831, + "learning_rate": 1.8764672810579153e-05, + "loss": 0.515, + "num_tokens": 5954197559.0, + "step": 1428 + }, + { + "epoch": 2.853359980014989, + "grad_norm": 0.10350261092999508, + "learning_rate": 1.8741736265727967e-05, + "loss": 0.4855, + "num_tokens": 5958349664.0, + "step": 1429 + }, + { + "epoch": 2.8553584811391457, + "grad_norm": 0.12761317067669722, + "learning_rate": 1.871880518930396e-05, + "loss": 0.4945, + "num_tokens": 5962520654.0, + "step": 1430 + }, + { + "epoch": 2.8573569822633025, + "grad_norm": 0.11127592300075087, + "learning_rate": 1.869587961979295e-05, + "loss": 0.4897, + "num_tokens": 5966705152.0, + "step": 1431 + }, + { + "epoch": 2.8593554833874593, + "grad_norm": 0.10133969928472411, + "learning_rate": 1.8672959595671502e-05, + "loss": 0.4866, + "num_tokens": 5970887632.0, + "step": 1432 + }, + { + "epoch": 2.8613539845116165, + "grad_norm": 0.12998013385998713, + "learning_rate": 1.865004515540688e-05, + "loss": 0.5122, + "num_tokens": 5975041606.0, + "step": 1433 + }, + { + "epoch": 2.8633524856357733, + "grad_norm": 0.1098455159255085, + "learning_rate": 1.8627136337456956e-05, + "loss": 0.5149, + "num_tokens": 5979194449.0, + "step": 1434 + }, + { + "epoch": 2.86535098675993, + "grad_norm": 0.12282371575107907, + "learning_rate": 1.8604233180270208e-05, + "loss": 0.487, + "num_tokens": 5983346888.0, + "step": 1435 + }, + { + "epoch": 2.867349487884087, + "grad_norm": 0.0955523488735877, + "learning_rate": 1.8581335722285577e-05, + "loss": 0.5006, + "num_tokens": 5987511581.0, + "step": 1436 + }, + { + "epoch": 2.8693479890082436, + "grad_norm": 0.11711295251596898, + "learning_rate": 1.855844400193246e-05, + "loss": 0.5102, + "num_tokens": 5991664254.0, + "step": 1437 + }, + { + "epoch": 2.871346490132401, + "grad_norm": 0.10889380728710601, + "learning_rate": 1.85355580576306e-05, + "loss": 0.4857, + "num_tokens": 5995812119.0, + "step": 1438 + }, + { + "epoch": 2.8733449912565576, + "grad_norm": 0.11986932917887771, + "learning_rate": 1.851267792779007e-05, + "loss": 0.5062, + "num_tokens": 5999992315.0, + "step": 1439 + }, + { + "epoch": 2.8753434923807144, + "grad_norm": 0.11039458393262425, + "learning_rate": 1.848980365081118e-05, + "loss": 0.4839, + "num_tokens": 6004180571.0, + "step": 1440 + }, + { + "epoch": 2.877341993504871, + "grad_norm": 0.1285018898028311, + "learning_rate": 1.8466935265084406e-05, + "loss": 0.4934, + "num_tokens": 6008328479.0, + "step": 1441 + }, + { + "epoch": 2.8793404946290284, + "grad_norm": 0.11328411416046454, + "learning_rate": 1.844407280899035e-05, + "loss": 0.5, + "num_tokens": 6012471072.0, + "step": 1442 + }, + { + "epoch": 2.881338995753185, + "grad_norm": 0.12392802753220365, + "learning_rate": 1.842121632089965e-05, + "loss": 0.4856, + "num_tokens": 6016636944.0, + "step": 1443 + }, + { + "epoch": 2.883337496877342, + "grad_norm": 0.11992529473765998, + "learning_rate": 1.8398365839172933e-05, + "loss": 0.5098, + "num_tokens": 6020770476.0, + "step": 1444 + }, + { + "epoch": 2.8853359980014988, + "grad_norm": 0.10265781245170351, + "learning_rate": 1.8375521402160748e-05, + "loss": 0.5003, + "num_tokens": 6024952358.0, + "step": 1445 + }, + { + "epoch": 2.8873344991256555, + "grad_norm": 0.14787497281206705, + "learning_rate": 1.83526830482035e-05, + "loss": 0.4859, + "num_tokens": 6029108285.0, + "step": 1446 + }, + { + "epoch": 2.8893330002498128, + "grad_norm": 0.09983910092460482, + "learning_rate": 1.8329850815631375e-05, + "loss": 0.4914, + "num_tokens": 6033291450.0, + "step": 1447 + }, + { + "epoch": 2.8913315013739695, + "grad_norm": 0.12955920856339584, + "learning_rate": 1.830702474276429e-05, + "loss": 0.4981, + "num_tokens": 6037476524.0, + "step": 1448 + }, + { + "epoch": 2.8933300024981263, + "grad_norm": 0.10816297183035459, + "learning_rate": 1.828420486791184e-05, + "loss": 0.5112, + "num_tokens": 6041662001.0, + "step": 1449 + }, + { + "epoch": 2.8953285036222836, + "grad_norm": 0.11052477182651287, + "learning_rate": 1.8261391229373188e-05, + "loss": 0.4953, + "num_tokens": 6045847795.0, + "step": 1450 + }, + { + "epoch": 2.8973270047464403, + "grad_norm": 0.10735265935014966, + "learning_rate": 1.823858386543705e-05, + "loss": 0.4902, + "num_tokens": 6050033932.0, + "step": 1451 + }, + { + "epoch": 2.899325505870597, + "grad_norm": 0.09065938551073906, + "learning_rate": 1.8215782814381616e-05, + "loss": 0.504, + "num_tokens": 6054218325.0, + "step": 1452 + }, + { + "epoch": 2.901324006994754, + "grad_norm": 0.10043588028292214, + "learning_rate": 1.8192988114474447e-05, + "loss": 0.5041, + "num_tokens": 6058371341.0, + "step": 1453 + }, + { + "epoch": 2.9033225081189107, + "grad_norm": 0.11328461831075223, + "learning_rate": 1.8170199803972504e-05, + "loss": 0.488, + "num_tokens": 6062553107.0, + "step": 1454 + }, + { + "epoch": 2.9053210092430675, + "grad_norm": 0.10305388449666712, + "learning_rate": 1.8147417921121965e-05, + "loss": 0.5097, + "num_tokens": 6066736875.0, + "step": 1455 + }, + { + "epoch": 2.9073195103672247, + "grad_norm": 0.09865214773315144, + "learning_rate": 1.8124642504158253e-05, + "loss": 0.5, + "num_tokens": 6070887313.0, + "step": 1456 + }, + { + "epoch": 2.9093180114913815, + "grad_norm": 0.11553450240786832, + "learning_rate": 1.8101873591305938e-05, + "loss": 0.4973, + "num_tokens": 6075048577.0, + "step": 1457 + }, + { + "epoch": 2.9113165126155383, + "grad_norm": 0.11739723548900258, + "learning_rate": 1.807911122077865e-05, + "loss": 0.4859, + "num_tokens": 6079208166.0, + "step": 1458 + }, + { + "epoch": 2.9133150137396955, + "grad_norm": 0.09314165964367735, + "learning_rate": 1.805635543077908e-05, + "loss": 0.4847, + "num_tokens": 6083377663.0, + "step": 1459 + }, + { + "epoch": 2.9153135148638523, + "grad_norm": 0.12611476323755166, + "learning_rate": 1.803360625949883e-05, + "loss": 0.4894, + "num_tokens": 6087564547.0, + "step": 1460 + }, + { + "epoch": 2.917312015988009, + "grad_norm": 0.09986272075271208, + "learning_rate": 1.8010863745118427e-05, + "loss": 0.5093, + "num_tokens": 6091727964.0, + "step": 1461 + }, + { + "epoch": 2.919310517112166, + "grad_norm": 0.11043993713020298, + "learning_rate": 1.798812792580721e-05, + "loss": 0.4895, + "num_tokens": 6095896977.0, + "step": 1462 + }, + { + "epoch": 2.9213090182363226, + "grad_norm": 0.08882026097698854, + "learning_rate": 1.796539883972328e-05, + "loss": 0.5013, + "num_tokens": 6100084039.0, + "step": 1463 + }, + { + "epoch": 2.9233075193604794, + "grad_norm": 0.09845744317670786, + "learning_rate": 1.7942676525013455e-05, + "loss": 0.4964, + "num_tokens": 6104270234.0, + "step": 1464 + }, + { + "epoch": 2.9253060204846366, + "grad_norm": 0.0950311119928611, + "learning_rate": 1.7919961019813162e-05, + "loss": 0.4977, + "num_tokens": 6108430553.0, + "step": 1465 + }, + { + "epoch": 2.9273045216087934, + "grad_norm": 0.08935170170457987, + "learning_rate": 1.7897252362246424e-05, + "loss": 0.4869, + "num_tokens": 6112615277.0, + "step": 1466 + }, + { + "epoch": 2.92930302273295, + "grad_norm": 0.1142143351870689, + "learning_rate": 1.787455059042575e-05, + "loss": 0.5008, + "num_tokens": 6116788103.0, + "step": 1467 + }, + { + "epoch": 2.9313015238571074, + "grad_norm": 0.10160626257890931, + "learning_rate": 1.7851855742452108e-05, + "loss": 0.5186, + "num_tokens": 6120975343.0, + "step": 1468 + }, + { + "epoch": 2.933300024981264, + "grad_norm": 0.11413494225282075, + "learning_rate": 1.7829167856414842e-05, + "loss": 0.5084, + "num_tokens": 6125159277.0, + "step": 1469 + }, + { + "epoch": 2.935298526105421, + "grad_norm": 0.10364982878333265, + "learning_rate": 1.7806486970391615e-05, + "loss": 0.496, + "num_tokens": 6129316929.0, + "step": 1470 + }, + { + "epoch": 2.9372970272295778, + "grad_norm": 0.10665769869460046, + "learning_rate": 1.7783813122448323e-05, + "loss": 0.495, + "num_tokens": 6133476940.0, + "step": 1471 + }, + { + "epoch": 2.9392955283537345, + "grad_norm": 0.09884881568814151, + "learning_rate": 1.776114635063907e-05, + "loss": 0.499, + "num_tokens": 6137624501.0, + "step": 1472 + }, + { + "epoch": 2.9412940294778913, + "grad_norm": 0.10073831650818577, + "learning_rate": 1.773848669300609e-05, + "loss": 0.4961, + "num_tokens": 6141803086.0, + "step": 1473 + }, + { + "epoch": 2.9432925306020485, + "grad_norm": 0.11069654726383452, + "learning_rate": 1.7715834187579644e-05, + "loss": 0.495, + "num_tokens": 6145939078.0, + "step": 1474 + }, + { + "epoch": 2.9452910317262053, + "grad_norm": 0.09617937271091713, + "learning_rate": 1.7693188872378032e-05, + "loss": 0.5015, + "num_tokens": 6150096420.0, + "step": 1475 + }, + { + "epoch": 2.947289532850362, + "grad_norm": 0.11287206598657469, + "learning_rate": 1.7670550785407444e-05, + "loss": 0.4884, + "num_tokens": 6154269847.0, + "step": 1476 + }, + { + "epoch": 2.9492880339745193, + "grad_norm": 0.11005640504998486, + "learning_rate": 1.764791996466196e-05, + "loss": 0.5255, + "num_tokens": 6158423789.0, + "step": 1477 + }, + { + "epoch": 2.951286535098676, + "grad_norm": 0.11409339184169917, + "learning_rate": 1.762529644812348e-05, + "loss": 0.4984, + "num_tokens": 6162606890.0, + "step": 1478 + }, + { + "epoch": 2.953285036222833, + "grad_norm": 0.12748127459589018, + "learning_rate": 1.7602680273761623e-05, + "loss": 0.4898, + "num_tokens": 6166791289.0, + "step": 1479 + }, + { + "epoch": 2.9552835373469897, + "grad_norm": 0.13447574461851614, + "learning_rate": 1.7580071479533685e-05, + "loss": 0.4912, + "num_tokens": 6170940419.0, + "step": 1480 + }, + { + "epoch": 2.9572820384711465, + "grad_norm": 0.11077979259278173, + "learning_rate": 1.7557470103384583e-05, + "loss": 0.5145, + "num_tokens": 6175125171.0, + "step": 1481 + }, + { + "epoch": 2.9592805395953037, + "grad_norm": 0.11115121466477386, + "learning_rate": 1.7534876183246798e-05, + "loss": 0.5296, + "num_tokens": 6179311181.0, + "step": 1482 + }, + { + "epoch": 2.9612790407194605, + "grad_norm": 0.12467406375280517, + "learning_rate": 1.7512289757040265e-05, + "loss": 0.4909, + "num_tokens": 6183498864.0, + "step": 1483 + }, + { + "epoch": 2.9632775418436172, + "grad_norm": 0.11004435905728702, + "learning_rate": 1.748971086267237e-05, + "loss": 0.5008, + "num_tokens": 6187683744.0, + "step": 1484 + }, + { + "epoch": 2.965276042967774, + "grad_norm": 0.12470112198949071, + "learning_rate": 1.7467139538037843e-05, + "loss": 0.4966, + "num_tokens": 6191859556.0, + "step": 1485 + }, + { + "epoch": 2.9672745440919313, + "grad_norm": 0.10768582687332875, + "learning_rate": 1.744457582101871e-05, + "loss": 0.4831, + "num_tokens": 6196044707.0, + "step": 1486 + }, + { + "epoch": 2.969273045216088, + "grad_norm": 0.10929395321932324, + "learning_rate": 1.742201974948424e-05, + "loss": 0.4997, + "num_tokens": 6200231032.0, + "step": 1487 + }, + { + "epoch": 2.971271546340245, + "grad_norm": 0.11635720970718297, + "learning_rate": 1.7399471361290868e-05, + "loss": 0.5019, + "num_tokens": 6204389108.0, + "step": 1488 + }, + { + "epoch": 2.9732700474644016, + "grad_norm": 0.1103955990459972, + "learning_rate": 1.737693069428212e-05, + "loss": 0.5097, + "num_tokens": 6208562499.0, + "step": 1489 + }, + { + "epoch": 2.9752685485885584, + "grad_norm": 0.12618861143017077, + "learning_rate": 1.735439778628858e-05, + "loss": 0.5015, + "num_tokens": 6212748314.0, + "step": 1490 + }, + { + "epoch": 2.9772670497127156, + "grad_norm": 0.1202767588624781, + "learning_rate": 1.7331872675127793e-05, + "loss": 0.4888, + "num_tokens": 6216880522.0, + "step": 1491 + }, + { + "epoch": 2.9792655508368724, + "grad_norm": 0.11078386880239656, + "learning_rate": 1.7309355398604233e-05, + "loss": 0.4845, + "num_tokens": 6221010950.0, + "step": 1492 + }, + { + "epoch": 2.981264051961029, + "grad_norm": 0.107859977347645, + "learning_rate": 1.728684599450923e-05, + "loss": 0.4966, + "num_tokens": 6225174632.0, + "step": 1493 + }, + { + "epoch": 2.983262553085186, + "grad_norm": 0.10243078094089993, + "learning_rate": 1.726434450062087e-05, + "loss": 0.4767, + "num_tokens": 6229359407.0, + "step": 1494 + }, + { + "epoch": 2.985261054209343, + "grad_norm": 0.10835740267054576, + "learning_rate": 1.7241850954703996e-05, + "loss": 0.505, + "num_tokens": 6233543454.0, + "step": 1495 + }, + { + "epoch": 2.9872595553335, + "grad_norm": 0.09893302030517133, + "learning_rate": 1.721936539451011e-05, + "loss": 0.4819, + "num_tokens": 6237728166.0, + "step": 1496 + }, + { + "epoch": 2.9892580564576567, + "grad_norm": 0.10627611101245146, + "learning_rate": 1.7196887857777288e-05, + "loss": 0.4885, + "num_tokens": 6241912480.0, + "step": 1497 + }, + { + "epoch": 2.9912565575818135, + "grad_norm": 0.10732013377674111, + "learning_rate": 1.7174418382230155e-05, + "loss": 0.4914, + "num_tokens": 6246098068.0, + "step": 1498 + }, + { + "epoch": 2.9932550587059703, + "grad_norm": 0.10126496883530146, + "learning_rate": 1.7151957005579816e-05, + "loss": 0.5017, + "num_tokens": 6250281090.0, + "step": 1499 + }, + { + "epoch": 2.9952535598301275, + "grad_norm": 0.11297641564325976, + "learning_rate": 1.7129503765523754e-05, + "loss": 0.4936, + "num_tokens": 6254446692.0, + "step": 1500 + }, + { + "epoch": 2.9972520609542843, + "grad_norm": 0.09558811531625477, + "learning_rate": 1.710705869974583e-05, + "loss": 0.5033, + "num_tokens": 6258624474.0, + "step": 1501 + }, + { + "epoch": 2.999250562078441, + "grad_norm": 0.10420858366041354, + "learning_rate": 1.7084621845916168e-05, + "loss": 0.4895, + "num_tokens": 6262810904.0, + "step": 1502 + }, + { + "epoch": 3.0, + "grad_norm": 0.15417320075616178, + "learning_rate": 1.7062193241691112e-05, + "loss": 0.5039, + "num_tokens": 6264321679.0, + "step": 1503 + }, + { + "epoch": 3.001998501124157, + "grad_norm": 0.16036864900016376, + "learning_rate": 1.703977292471316e-05, + "loss": 0.4656, + "num_tokens": 6268482605.0, + "step": 1504 + }, + { + "epoch": 3.0039970022483136, + "grad_norm": 0.1195290984125463, + "learning_rate": 1.7017360932610886e-05, + "loss": 0.4806, + "num_tokens": 6272669132.0, + "step": 1505 + }, + { + "epoch": 3.005995503372471, + "grad_norm": 0.12980892579976402, + "learning_rate": 1.6994957302998943e-05, + "loss": 0.4829, + "num_tokens": 6276854261.0, + "step": 1506 + }, + { + "epoch": 3.0079940044966276, + "grad_norm": 0.13309927817743913, + "learning_rate": 1.6972562073477886e-05, + "loss": 0.4592, + "num_tokens": 6281036590.0, + "step": 1507 + }, + { + "epoch": 3.0099925056207844, + "grad_norm": 0.12457292791344261, + "learning_rate": 1.6950175281634217e-05, + "loss": 0.4674, + "num_tokens": 6285196163.0, + "step": 1508 + }, + { + "epoch": 3.011991006744941, + "grad_norm": 0.1053540252664737, + "learning_rate": 1.6927796965040242e-05, + "loss": 0.4774, + "num_tokens": 6289356175.0, + "step": 1509 + }, + { + "epoch": 3.0139895078690984, + "grad_norm": 0.12593087231428596, + "learning_rate": 1.6905427161254077e-05, + "loss": 0.4722, + "num_tokens": 6293514387.0, + "step": 1510 + }, + { + "epoch": 3.015988008993255, + "grad_norm": 0.129865679789465, + "learning_rate": 1.6883065907819527e-05, + "loss": 0.467, + "num_tokens": 6297664515.0, + "step": 1511 + }, + { + "epoch": 3.017986510117412, + "grad_norm": 0.10860445550027141, + "learning_rate": 1.686071324226606e-05, + "loss": 0.4806, + "num_tokens": 6301775904.0, + "step": 1512 + }, + { + "epoch": 3.0199850112415687, + "grad_norm": 0.12504027985886676, + "learning_rate": 1.6838369202108725e-05, + "loss": 0.4689, + "num_tokens": 6305959826.0, + "step": 1513 + }, + { + "epoch": 3.0219835123657255, + "grad_norm": 0.12647028424161558, + "learning_rate": 1.681603382484808e-05, + "loss": 0.4685, + "num_tokens": 6310032006.0, + "step": 1514 + }, + { + "epoch": 3.0239820134898827, + "grad_norm": 0.12594542180166135, + "learning_rate": 1.679370714797018e-05, + "loss": 0.4753, + "num_tokens": 6314216704.0, + "step": 1515 + }, + { + "epoch": 3.0259805146140395, + "grad_norm": 0.13550519141556702, + "learning_rate": 1.677138920894644e-05, + "loss": 0.4797, + "num_tokens": 6318401630.0, + "step": 1516 + }, + { + "epoch": 3.0279790157381963, + "grad_norm": 0.11776921272182717, + "learning_rate": 1.6749080045233644e-05, + "loss": 0.4726, + "num_tokens": 6322585865.0, + "step": 1517 + }, + { + "epoch": 3.029977516862353, + "grad_norm": 0.11732958043769925, + "learning_rate": 1.6726779694273814e-05, + "loss": 0.4739, + "num_tokens": 6326744436.0, + "step": 1518 + }, + { + "epoch": 3.0319760179865103, + "grad_norm": 0.11686980131618442, + "learning_rate": 1.67044881934942e-05, + "loss": 0.4692, + "num_tokens": 6330902017.0, + "step": 1519 + }, + { + "epoch": 3.033974519110667, + "grad_norm": 0.11887561188984713, + "learning_rate": 1.6682205580307203e-05, + "loss": 0.477, + "num_tokens": 6335086484.0, + "step": 1520 + }, + { + "epoch": 3.035973020234824, + "grad_norm": 0.10906019697497839, + "learning_rate": 1.6659931892110304e-05, + "loss": 0.4751, + "num_tokens": 6339273430.0, + "step": 1521 + }, + { + "epoch": 3.0379715213589806, + "grad_norm": 0.10445871448420332, + "learning_rate": 1.6637667166285993e-05, + "loss": 0.4736, + "num_tokens": 6343460886.0, + "step": 1522 + }, + { + "epoch": 3.039970022483138, + "grad_norm": 0.0983316329185456, + "learning_rate": 1.6615411440201735e-05, + "loss": 0.4662, + "num_tokens": 6347644647.0, + "step": 1523 + }, + { + "epoch": 3.0419685236072946, + "grad_norm": 0.11350823998229811, + "learning_rate": 1.6593164751209876e-05, + "loss": 0.4833, + "num_tokens": 6351828391.0, + "step": 1524 + }, + { + "epoch": 3.0439670247314514, + "grad_norm": 0.09892205025300979, + "learning_rate": 1.657092713664761e-05, + "loss": 0.4705, + "num_tokens": 6356003069.0, + "step": 1525 + }, + { + "epoch": 3.045965525855608, + "grad_norm": 0.1025733205175973, + "learning_rate": 1.6548698633836893e-05, + "loss": 0.4678, + "num_tokens": 6360143514.0, + "step": 1526 + }, + { + "epoch": 3.047964026979765, + "grad_norm": 0.09796915133299221, + "learning_rate": 1.652647928008438e-05, + "loss": 0.4766, + "num_tokens": 6364328293.0, + "step": 1527 + }, + { + "epoch": 3.049962528103922, + "grad_norm": 0.10737330577806369, + "learning_rate": 1.6504269112681378e-05, + "loss": 0.4802, + "num_tokens": 6368485942.0, + "step": 1528 + }, + { + "epoch": 3.051961029228079, + "grad_norm": 0.11223522994332939, + "learning_rate": 1.6482068168903796e-05, + "loss": 0.474, + "num_tokens": 6372643612.0, + "step": 1529 + }, + { + "epoch": 3.0539595303522358, + "grad_norm": 0.10791919032288208, + "learning_rate": 1.645987648601203e-05, + "loss": 0.4758, + "num_tokens": 6376812830.0, + "step": 1530 + }, + { + "epoch": 3.0559580314763926, + "grad_norm": 0.09174789884066566, + "learning_rate": 1.643769410125095e-05, + "loss": 0.4702, + "num_tokens": 6380955918.0, + "step": 1531 + }, + { + "epoch": 3.05795653260055, + "grad_norm": 0.10232768119418681, + "learning_rate": 1.641552105184982e-05, + "loss": 0.4799, + "num_tokens": 6385122206.0, + "step": 1532 + }, + { + "epoch": 3.0599550337247066, + "grad_norm": 0.09848521365730865, + "learning_rate": 1.639335737502223e-05, + "loss": 0.467, + "num_tokens": 6389305315.0, + "step": 1533 + }, + { + "epoch": 3.0619535348488633, + "grad_norm": 0.09340781555109079, + "learning_rate": 1.637120310796605e-05, + "loss": 0.4719, + "num_tokens": 6393488634.0, + "step": 1534 + }, + { + "epoch": 3.06395203597302, + "grad_norm": 0.0988520947168479, + "learning_rate": 1.6349058287863355e-05, + "loss": 0.4676, + "num_tokens": 6397664330.0, + "step": 1535 + }, + { + "epoch": 3.065950537097177, + "grad_norm": 0.11016851617499394, + "learning_rate": 1.632692295188035e-05, + "loss": 0.4838, + "num_tokens": 6401837059.0, + "step": 1536 + }, + { + "epoch": 3.067949038221334, + "grad_norm": 0.0942572165906265, + "learning_rate": 1.6304797137167342e-05, + "loss": 0.4743, + "num_tokens": 6406013789.0, + "step": 1537 + }, + { + "epoch": 3.069947539345491, + "grad_norm": 0.11413568696674053, + "learning_rate": 1.6282680880858647e-05, + "loss": 0.4848, + "num_tokens": 6410197315.0, + "step": 1538 + }, + { + "epoch": 3.0719460404696477, + "grad_norm": 0.10644891542371293, + "learning_rate": 1.6260574220072553e-05, + "loss": 0.4846, + "num_tokens": 6414383417.0, + "step": 1539 + }, + { + "epoch": 3.0739445415938045, + "grad_norm": 0.11415220877655034, + "learning_rate": 1.623847719191122e-05, + "loss": 0.4846, + "num_tokens": 6418562501.0, + "step": 1540 + }, + { + "epoch": 3.0759430427179617, + "grad_norm": 0.10646184991798362, + "learning_rate": 1.6216389833460663e-05, + "loss": 0.4782, + "num_tokens": 6422747694.0, + "step": 1541 + }, + { + "epoch": 3.0779415438421185, + "grad_norm": 0.10446774996434671, + "learning_rate": 1.619431218179065e-05, + "loss": 0.4758, + "num_tokens": 6426905752.0, + "step": 1542 + }, + { + "epoch": 3.0799400449662753, + "grad_norm": 0.11999479406077533, + "learning_rate": 1.6172244273954678e-05, + "loss": 0.4721, + "num_tokens": 6431091746.0, + "step": 1543 + }, + { + "epoch": 3.081938546090432, + "grad_norm": 0.10191647919649603, + "learning_rate": 1.6150186146989874e-05, + "loss": 0.4727, + "num_tokens": 6435276639.0, + "step": 1544 + }, + { + "epoch": 3.0839370472145893, + "grad_norm": 0.11082448692259324, + "learning_rate": 1.6128137837916952e-05, + "loss": 0.4707, + "num_tokens": 6439460607.0, + "step": 1545 + }, + { + "epoch": 3.085935548338746, + "grad_norm": 0.10111137123658841, + "learning_rate": 1.6106099383740164e-05, + "loss": 0.4733, + "num_tokens": 6443639416.0, + "step": 1546 + }, + { + "epoch": 3.087934049462903, + "grad_norm": 0.09720020025538365, + "learning_rate": 1.6084070821447194e-05, + "loss": 0.4745, + "num_tokens": 6447803625.0, + "step": 1547 + }, + { + "epoch": 3.0899325505870596, + "grad_norm": 0.12132611752374103, + "learning_rate": 1.6062052188009153e-05, + "loss": 0.4765, + "num_tokens": 6451948288.0, + "step": 1548 + }, + { + "epoch": 3.0919310517112164, + "grad_norm": 0.10024363504756684, + "learning_rate": 1.6040043520380475e-05, + "loss": 0.4692, + "num_tokens": 6456111856.0, + "step": 1549 + }, + { + "epoch": 3.0939295528353736, + "grad_norm": 0.11197837703381142, + "learning_rate": 1.601804485549886e-05, + "loss": 0.4641, + "num_tokens": 6460297835.0, + "step": 1550 + }, + { + "epoch": 3.0959280539595304, + "grad_norm": 0.10885237016532925, + "learning_rate": 1.5996056230285237e-05, + "loss": 0.4726, + "num_tokens": 6464450465.0, + "step": 1551 + }, + { + "epoch": 3.097926555083687, + "grad_norm": 0.10450124864527013, + "learning_rate": 1.597407768164366e-05, + "loss": 0.4726, + "num_tokens": 6468635615.0, + "step": 1552 + }, + { + "epoch": 3.099925056207844, + "grad_norm": 0.11433145782102483, + "learning_rate": 1.5952109246461316e-05, + "loss": 0.4619, + "num_tokens": 6472820349.0, + "step": 1553 + }, + { + "epoch": 3.101923557332001, + "grad_norm": 0.10327376025736441, + "learning_rate": 1.5930150961608372e-05, + "loss": 0.4643, + "num_tokens": 6477005799.0, + "step": 1554 + }, + { + "epoch": 3.103922058456158, + "grad_norm": 0.14645309824590644, + "learning_rate": 1.590820286393798e-05, + "loss": 0.4847, + "num_tokens": 6481192902.0, + "step": 1555 + }, + { + "epoch": 3.1059205595803148, + "grad_norm": 0.11502414507816616, + "learning_rate": 1.588626499028619e-05, + "loss": 0.4885, + "num_tokens": 6485362611.0, + "step": 1556 + }, + { + "epoch": 3.1079190607044715, + "grad_norm": 0.10900053879202123, + "learning_rate": 1.586433737747191e-05, + "loss": 0.4651, + "num_tokens": 6489520003.0, + "step": 1557 + }, + { + "epoch": 3.1099175618286283, + "grad_norm": 0.13586160595440383, + "learning_rate": 1.5842420062296794e-05, + "loss": 0.4676, + "num_tokens": 6493700632.0, + "step": 1558 + }, + { + "epoch": 3.1119160629527856, + "grad_norm": 0.09558981428775175, + "learning_rate": 1.5820513081545245e-05, + "loss": 0.475, + "num_tokens": 6497887826.0, + "step": 1559 + }, + { + "epoch": 3.1139145640769423, + "grad_norm": 0.12299624174755748, + "learning_rate": 1.5798616471984297e-05, + "loss": 0.4774, + "num_tokens": 6502071004.0, + "step": 1560 + }, + { + "epoch": 3.115913065201099, + "grad_norm": 0.10314424391015087, + "learning_rate": 1.577673027036359e-05, + "loss": 0.494, + "num_tokens": 6506255883.0, + "step": 1561 + }, + { + "epoch": 3.117911566325256, + "grad_norm": 0.09317870657280344, + "learning_rate": 1.57548545134153e-05, + "loss": 0.4648, + "num_tokens": 6510424126.0, + "step": 1562 + }, + { + "epoch": 3.119910067449413, + "grad_norm": 0.11238232890033041, + "learning_rate": 1.573298923785407e-05, + "loss": 0.4819, + "num_tokens": 6514604409.0, + "step": 1563 + }, + { + "epoch": 3.12190856857357, + "grad_norm": 0.09471330219283683, + "learning_rate": 1.571113448037695e-05, + "loss": 0.4701, + "num_tokens": 6518788644.0, + "step": 1564 + }, + { + "epoch": 3.1239070696977267, + "grad_norm": 0.09773051171785539, + "learning_rate": 1.5689290277663334e-05, + "loss": 0.465, + "num_tokens": 6522961920.0, + "step": 1565 + }, + { + "epoch": 3.1259055708218835, + "grad_norm": 0.10198283927611919, + "learning_rate": 1.5667456666374896e-05, + "loss": 0.4811, + "num_tokens": 6527144113.0, + "step": 1566 + }, + { + "epoch": 3.1279040719460403, + "grad_norm": 0.0904553632773699, + "learning_rate": 1.5645633683155553e-05, + "loss": 0.4677, + "num_tokens": 6531326964.0, + "step": 1567 + }, + { + "epoch": 3.1299025730701975, + "grad_norm": 0.08931877508500682, + "learning_rate": 1.5623821364631382e-05, + "loss": 0.4682, + "num_tokens": 6535512112.0, + "step": 1568 + }, + { + "epoch": 3.1319010741943543, + "grad_norm": 0.1033203264740534, + "learning_rate": 1.560201974741054e-05, + "loss": 0.4595, + "num_tokens": 6539697079.0, + "step": 1569 + }, + { + "epoch": 3.133899575318511, + "grad_norm": 0.10364181620491959, + "learning_rate": 1.5580228868083238e-05, + "loss": 0.4633, + "num_tokens": 6543883230.0, + "step": 1570 + }, + { + "epoch": 3.135898076442668, + "grad_norm": 0.10023877177256785, + "learning_rate": 1.5558448763221674e-05, + "loss": 0.4707, + "num_tokens": 6548067646.0, + "step": 1571 + }, + { + "epoch": 3.137896577566825, + "grad_norm": 0.1009602083214303, + "learning_rate": 1.5536679469379953e-05, + "loss": 0.4832, + "num_tokens": 6552241438.0, + "step": 1572 + }, + { + "epoch": 3.139895078690982, + "grad_norm": 0.11307470375499838, + "learning_rate": 1.5514921023094028e-05, + "loss": 0.4789, + "num_tokens": 6556371686.0, + "step": 1573 + }, + { + "epoch": 3.1418935798151386, + "grad_norm": 0.09939335806908302, + "learning_rate": 1.5493173460881664e-05, + "loss": 0.4867, + "num_tokens": 6560520582.0, + "step": 1574 + }, + { + "epoch": 3.1438920809392954, + "grad_norm": 0.10696839774422116, + "learning_rate": 1.5471436819242343e-05, + "loss": 0.4726, + "num_tokens": 6564704440.0, + "step": 1575 + }, + { + "epoch": 3.145890582063452, + "grad_norm": 0.1172468872810591, + "learning_rate": 1.5449711134657224e-05, + "loss": 0.479, + "num_tokens": 6568880729.0, + "step": 1576 + }, + { + "epoch": 3.1478890831876094, + "grad_norm": 0.098625299597037, + "learning_rate": 1.5427996443589092e-05, + "loss": 0.4576, + "num_tokens": 6573047584.0, + "step": 1577 + }, + { + "epoch": 3.149887584311766, + "grad_norm": 0.11929701081344218, + "learning_rate": 1.540629278248225e-05, + "loss": 0.4794, + "num_tokens": 6577193461.0, + "step": 1578 + }, + { + "epoch": 3.151886085435923, + "grad_norm": 0.11894353186132106, + "learning_rate": 1.538460018776252e-05, + "loss": 0.4774, + "num_tokens": 6581363034.0, + "step": 1579 + }, + { + "epoch": 3.1538845865600798, + "grad_norm": 0.09829683522804339, + "learning_rate": 1.5362918695837116e-05, + "loss": 0.4659, + "num_tokens": 6585513376.0, + "step": 1580 + }, + { + "epoch": 3.155883087684237, + "grad_norm": 0.13086794426973639, + "learning_rate": 1.534124834309467e-05, + "loss": 0.4724, + "num_tokens": 6589699037.0, + "step": 1581 + }, + { + "epoch": 3.1578815888083938, + "grad_norm": 0.0986043941199627, + "learning_rate": 1.5319589165905062e-05, + "loss": 0.4735, + "num_tokens": 6593863329.0, + "step": 1582 + }, + { + "epoch": 3.1598800899325505, + "grad_norm": 0.10268845652439035, + "learning_rate": 1.5297941200619462e-05, + "loss": 0.4811, + "num_tokens": 6598048749.0, + "step": 1583 + }, + { + "epoch": 3.1618785910567073, + "grad_norm": 0.10384214328066296, + "learning_rate": 1.527630448357018e-05, + "loss": 0.476, + "num_tokens": 6602233517.0, + "step": 1584 + }, + { + "epoch": 3.1638770921808645, + "grad_norm": 0.10529111058810559, + "learning_rate": 1.5254679051070688e-05, + "loss": 0.4691, + "num_tokens": 6606404871.0, + "step": 1585 + }, + { + "epoch": 3.1658755933050213, + "grad_norm": 0.11129453385593088, + "learning_rate": 1.5233064939415498e-05, + "loss": 0.4883, + "num_tokens": 6610568102.0, + "step": 1586 + }, + { + "epoch": 3.167874094429178, + "grad_norm": 0.09083564174903641, + "learning_rate": 1.5211462184880126e-05, + "loss": 0.4765, + "num_tokens": 6614744565.0, + "step": 1587 + }, + { + "epoch": 3.169872595553335, + "grad_norm": 0.10251709733611353, + "learning_rate": 1.5189870823721018e-05, + "loss": 0.4628, + "num_tokens": 6618912693.0, + "step": 1588 + }, + { + "epoch": 3.1718710966774917, + "grad_norm": 0.11487722861248023, + "learning_rate": 1.5168290892175512e-05, + "loss": 0.4884, + "num_tokens": 6623097603.0, + "step": 1589 + }, + { + "epoch": 3.173869597801649, + "grad_norm": 0.10112856045955053, + "learning_rate": 1.5146722426461767e-05, + "loss": 0.4815, + "num_tokens": 6627250385.0, + "step": 1590 + }, + { + "epoch": 3.1758680989258057, + "grad_norm": 0.10773861717556799, + "learning_rate": 1.5125165462778673e-05, + "loss": 0.4732, + "num_tokens": 6631435483.0, + "step": 1591 + }, + { + "epoch": 3.1778666000499625, + "grad_norm": 0.09784936124709007, + "learning_rate": 1.5103620037305847e-05, + "loss": 0.4894, + "num_tokens": 6635618351.0, + "step": 1592 + }, + { + "epoch": 3.1798651011741192, + "grad_norm": 0.09186209497155128, + "learning_rate": 1.5082086186203508e-05, + "loss": 0.4696, + "num_tokens": 6639790351.0, + "step": 1593 + }, + { + "epoch": 3.1818636022982765, + "grad_norm": 0.09203667946890617, + "learning_rate": 1.5060563945612483e-05, + "loss": 0.4771, + "num_tokens": 6643976321.0, + "step": 1594 + }, + { + "epoch": 3.1838621034224333, + "grad_norm": 0.09214895685374125, + "learning_rate": 1.5039053351654098e-05, + "loss": 0.4804, + "num_tokens": 6648153516.0, + "step": 1595 + }, + { + "epoch": 3.18586060454659, + "grad_norm": 0.1029543254307162, + "learning_rate": 1.5017554440430125e-05, + "loss": 0.4711, + "num_tokens": 6652303078.0, + "step": 1596 + }, + { + "epoch": 3.187859105670747, + "grad_norm": 0.09672745544578937, + "learning_rate": 1.499606724802274e-05, + "loss": 0.4834, + "num_tokens": 6656486066.0, + "step": 1597 + }, + { + "epoch": 3.189857606794904, + "grad_norm": 0.10505776412205806, + "learning_rate": 1.4974591810494446e-05, + "loss": 0.4963, + "num_tokens": 6660672053.0, + "step": 1598 + }, + { + "epoch": 3.191856107919061, + "grad_norm": 0.10329762234055953, + "learning_rate": 1.4953128163888008e-05, + "loss": 0.4723, + "num_tokens": 6664855881.0, + "step": 1599 + }, + { + "epoch": 3.1938546090432176, + "grad_norm": 0.1080642653858654, + "learning_rate": 1.4931676344226427e-05, + "loss": 0.4788, + "num_tokens": 6669040989.0, + "step": 1600 + }, + { + "epoch": 3.1958531101673744, + "grad_norm": 0.1076652921904872, + "learning_rate": 1.4910236387512837e-05, + "loss": 0.4762, + "num_tokens": 6673211652.0, + "step": 1601 + }, + { + "epoch": 3.197851611291531, + "grad_norm": 0.0946031341354181, + "learning_rate": 1.4888808329730454e-05, + "loss": 0.4889, + "num_tokens": 6677398688.0, + "step": 1602 + }, + { + "epoch": 3.1998501124156884, + "grad_norm": 0.09427627701966766, + "learning_rate": 1.4867392206842536e-05, + "loss": 0.4866, + "num_tokens": 6681581867.0, + "step": 1603 + }, + { + "epoch": 3.201848613539845, + "grad_norm": 0.09616641142561122, + "learning_rate": 1.4845988054792324e-05, + "loss": 0.4741, + "num_tokens": 6685740305.0, + "step": 1604 + }, + { + "epoch": 3.203847114664002, + "grad_norm": 0.09172205887678048, + "learning_rate": 1.4824595909502937e-05, + "loss": 0.4779, + "num_tokens": 6689881945.0, + "step": 1605 + }, + { + "epoch": 3.2058456157881587, + "grad_norm": 0.09536866952342064, + "learning_rate": 1.4803215806877358e-05, + "loss": 0.4769, + "num_tokens": 6694044777.0, + "step": 1606 + }, + { + "epoch": 3.207844116912316, + "grad_norm": 0.09764840744875572, + "learning_rate": 1.4781847782798373e-05, + "loss": 0.4751, + "num_tokens": 6698210501.0, + "step": 1607 + }, + { + "epoch": 3.2098426180364728, + "grad_norm": 0.09487561391742766, + "learning_rate": 1.4760491873128464e-05, + "loss": 0.4775, + "num_tokens": 6702317452.0, + "step": 1608 + }, + { + "epoch": 3.2118411191606295, + "grad_norm": 0.10669647234121941, + "learning_rate": 1.4739148113709813e-05, + "loss": 0.4751, + "num_tokens": 6706499330.0, + "step": 1609 + }, + { + "epoch": 3.2138396202847863, + "grad_norm": 0.08879754767317388, + "learning_rate": 1.4717816540364198e-05, + "loss": 0.475, + "num_tokens": 6710685791.0, + "step": 1610 + }, + { + "epoch": 3.215838121408943, + "grad_norm": 0.10531363369503895, + "learning_rate": 1.469649718889293e-05, + "loss": 0.477, + "num_tokens": 6714864217.0, + "step": 1611 + }, + { + "epoch": 3.2178366225331003, + "grad_norm": 0.10135814995625692, + "learning_rate": 1.467519009507683e-05, + "loss": 0.4721, + "num_tokens": 6719049228.0, + "step": 1612 + }, + { + "epoch": 3.219835123657257, + "grad_norm": 0.09240651513974406, + "learning_rate": 1.4653895294676127e-05, + "loss": 0.4695, + "num_tokens": 6723236663.0, + "step": 1613 + }, + { + "epoch": 3.221833624781414, + "grad_norm": 0.1055273373387148, + "learning_rate": 1.4632612823430443e-05, + "loss": 0.4689, + "num_tokens": 6727397390.0, + "step": 1614 + }, + { + "epoch": 3.2238321259055707, + "grad_norm": 0.10153312829435415, + "learning_rate": 1.4611342717058686e-05, + "loss": 0.4817, + "num_tokens": 6731580568.0, + "step": 1615 + }, + { + "epoch": 3.225830627029728, + "grad_norm": 0.09418474670000197, + "learning_rate": 1.4590085011259016e-05, + "loss": 0.4877, + "num_tokens": 6735726161.0, + "step": 1616 + }, + { + "epoch": 3.2278291281538847, + "grad_norm": 0.10701462054362072, + "learning_rate": 1.4568839741708778e-05, + "loss": 0.4752, + "num_tokens": 6739913358.0, + "step": 1617 + }, + { + "epoch": 3.2298276292780415, + "grad_norm": 0.1069305086068009, + "learning_rate": 1.4547606944064465e-05, + "loss": 0.4737, + "num_tokens": 6744070363.0, + "step": 1618 + }, + { + "epoch": 3.2318261304021982, + "grad_norm": 0.09149138664473537, + "learning_rate": 1.4526386653961618e-05, + "loss": 0.4774, + "num_tokens": 6748254537.0, + "step": 1619 + }, + { + "epoch": 3.233824631526355, + "grad_norm": 0.11266807392268342, + "learning_rate": 1.450517890701478e-05, + "loss": 0.4739, + "num_tokens": 6752416851.0, + "step": 1620 + }, + { + "epoch": 3.2358231326505122, + "grad_norm": 0.09545200447043767, + "learning_rate": 1.448398373881747e-05, + "loss": 0.4585, + "num_tokens": 6756576618.0, + "step": 1621 + }, + { + "epoch": 3.237821633774669, + "grad_norm": 0.09860825558506366, + "learning_rate": 1.4462801184942064e-05, + "loss": 0.4786, + "num_tokens": 6760747100.0, + "step": 1622 + }, + { + "epoch": 3.239820134898826, + "grad_norm": 0.09792589727489313, + "learning_rate": 1.4441631280939792e-05, + "loss": 0.4669, + "num_tokens": 6764932709.0, + "step": 1623 + }, + { + "epoch": 3.2418186360229826, + "grad_norm": 0.10131410061527249, + "learning_rate": 1.4420474062340653e-05, + "loss": 0.4587, + "num_tokens": 6769117632.0, + "step": 1624 + }, + { + "epoch": 3.24381713714714, + "grad_norm": 0.10649366515507427, + "learning_rate": 1.439932956465333e-05, + "loss": 0.4981, + "num_tokens": 6773288274.0, + "step": 1625 + }, + { + "epoch": 3.2458156382712966, + "grad_norm": 0.09507328562354353, + "learning_rate": 1.4378197823365186e-05, + "loss": 0.4845, + "num_tokens": 6777441498.0, + "step": 1626 + }, + { + "epoch": 3.2478141393954534, + "grad_norm": 0.10217885322913138, + "learning_rate": 1.4357078873942145e-05, + "loss": 0.4745, + "num_tokens": 6781624151.0, + "step": 1627 + }, + { + "epoch": 3.24981264051961, + "grad_norm": 0.09784076407099532, + "learning_rate": 1.4335972751828693e-05, + "loss": 0.4694, + "num_tokens": 6785756371.0, + "step": 1628 + }, + { + "epoch": 3.251811141643767, + "grad_norm": 0.08577267644389926, + "learning_rate": 1.4314879492447773e-05, + "loss": 0.4603, + "num_tokens": 6789919471.0, + "step": 1629 + }, + { + "epoch": 3.253809642767924, + "grad_norm": 0.101150741932617, + "learning_rate": 1.4293799131200742e-05, + "loss": 0.4766, + "num_tokens": 6794073624.0, + "step": 1630 + }, + { + "epoch": 3.255808143892081, + "grad_norm": 0.09761427300946367, + "learning_rate": 1.4272731703467298e-05, + "loss": 0.4797, + "num_tokens": 6798256382.0, + "step": 1631 + }, + { + "epoch": 3.2578066450162377, + "grad_norm": 0.09366190231787755, + "learning_rate": 1.4251677244605446e-05, + "loss": 0.4604, + "num_tokens": 6802419351.0, + "step": 1632 + }, + { + "epoch": 3.2598051461403945, + "grad_norm": 0.11346782396694165, + "learning_rate": 1.4230635789951434e-05, + "loss": 0.4803, + "num_tokens": 6806605890.0, + "step": 1633 + }, + { + "epoch": 3.2618036472645517, + "grad_norm": 0.10120306072443554, + "learning_rate": 1.4209607374819658e-05, + "loss": 0.4696, + "num_tokens": 6810759425.0, + "step": 1634 + }, + { + "epoch": 3.2638021483887085, + "grad_norm": 0.0856716259277241, + "learning_rate": 1.418859203450265e-05, + "loss": 0.4816, + "num_tokens": 6814943770.0, + "step": 1635 + }, + { + "epoch": 3.2658006495128653, + "grad_norm": 0.12911609822950873, + "learning_rate": 1.4167589804270985e-05, + "loss": 0.5032, + "num_tokens": 6819094950.0, + "step": 1636 + }, + { + "epoch": 3.267799150637022, + "grad_norm": 0.10835125785581412, + "learning_rate": 1.4146600719373247e-05, + "loss": 0.4786, + "num_tokens": 6823276299.0, + "step": 1637 + }, + { + "epoch": 3.269797651761179, + "grad_norm": 0.1185763000196714, + "learning_rate": 1.412562481503596e-05, + "loss": 0.4698, + "num_tokens": 6827441265.0, + "step": 1638 + }, + { + "epoch": 3.271796152885336, + "grad_norm": 0.09674540204250572, + "learning_rate": 1.4104662126463513e-05, + "loss": 0.4821, + "num_tokens": 6831622732.0, + "step": 1639 + }, + { + "epoch": 3.273794654009493, + "grad_norm": 0.12359710269229467, + "learning_rate": 1.4083712688838116e-05, + "loss": 0.4835, + "num_tokens": 6835794693.0, + "step": 1640 + }, + { + "epoch": 3.2757931551336497, + "grad_norm": 0.0927622512549958, + "learning_rate": 1.4062776537319747e-05, + "loss": 0.4893, + "num_tokens": 6839962897.0, + "step": 1641 + }, + { + "epoch": 3.277791656257807, + "grad_norm": 0.10918934196815418, + "learning_rate": 1.4041853707046094e-05, + "loss": 0.4779, + "num_tokens": 6844148831.0, + "step": 1642 + }, + { + "epoch": 3.2797901573819637, + "grad_norm": 0.12407114649473029, + "learning_rate": 1.4020944233132471e-05, + "loss": 0.4862, + "num_tokens": 6848326195.0, + "step": 1643 + }, + { + "epoch": 3.2817886585061204, + "grad_norm": 0.09364792223975672, + "learning_rate": 1.4000048150671779e-05, + "loss": 0.4701, + "num_tokens": 6852512446.0, + "step": 1644 + }, + { + "epoch": 3.2837871596302772, + "grad_norm": 0.12315595333003404, + "learning_rate": 1.397916549473444e-05, + "loss": 0.487, + "num_tokens": 6856697162.0, + "step": 1645 + }, + { + "epoch": 3.285785660754434, + "grad_norm": 0.11794478867484492, + "learning_rate": 1.395829630036836e-05, + "loss": 0.4868, + "num_tokens": 6860882143.0, + "step": 1646 + }, + { + "epoch": 3.2877841618785912, + "grad_norm": 0.10794925036678407, + "learning_rate": 1.3937440602598838e-05, + "loss": 0.4704, + "num_tokens": 6865027323.0, + "step": 1647 + }, + { + "epoch": 3.289782663002748, + "grad_norm": 0.09457329508930971, + "learning_rate": 1.3916598436428528e-05, + "loss": 0.4743, + "num_tokens": 6869188557.0, + "step": 1648 + }, + { + "epoch": 3.291781164126905, + "grad_norm": 0.08721524094716418, + "learning_rate": 1.3895769836837352e-05, + "loss": 0.4559, + "num_tokens": 6873372257.0, + "step": 1649 + }, + { + "epoch": 3.2937796652510616, + "grad_norm": 0.09081749617164846, + "learning_rate": 1.3874954838782493e-05, + "loss": 0.4782, + "num_tokens": 6877526737.0, + "step": 1650 + }, + { + "epoch": 3.295778166375219, + "grad_norm": 0.0916597905879028, + "learning_rate": 1.3854153477198305e-05, + "loss": 0.4596, + "num_tokens": 6881712305.0, + "step": 1651 + }, + { + "epoch": 3.2977766674993756, + "grad_norm": 0.08988939910596194, + "learning_rate": 1.3833365786996224e-05, + "loss": 0.4746, + "num_tokens": 6885887726.0, + "step": 1652 + }, + { + "epoch": 3.2997751686235324, + "grad_norm": 0.08634895669570197, + "learning_rate": 1.381259180306477e-05, + "loss": 0.4718, + "num_tokens": 6890047313.0, + "step": 1653 + }, + { + "epoch": 3.301773669747689, + "grad_norm": 0.09206756613652155, + "learning_rate": 1.3791831560269447e-05, + "loss": 0.487, + "num_tokens": 6894230048.0, + "step": 1654 + }, + { + "epoch": 3.303772170871846, + "grad_norm": 0.0938768671077906, + "learning_rate": 1.3771085093452696e-05, + "loss": 0.4803, + "num_tokens": 6898415457.0, + "step": 1655 + }, + { + "epoch": 3.305770671996003, + "grad_norm": 0.09620760923827759, + "learning_rate": 1.375035243743386e-05, + "loss": 0.4594, + "num_tokens": 6902591037.0, + "step": 1656 + }, + { + "epoch": 3.30776917312016, + "grad_norm": 0.10076707918018396, + "learning_rate": 1.3729633627009072e-05, + "loss": 0.4812, + "num_tokens": 6906775104.0, + "step": 1657 + }, + { + "epoch": 3.3097676742443167, + "grad_norm": 0.1041401254234214, + "learning_rate": 1.3708928696951232e-05, + "loss": 0.4832, + "num_tokens": 6910957992.0, + "step": 1658 + }, + { + "epoch": 3.3117661753684735, + "grad_norm": 0.0963613474565679, + "learning_rate": 1.368823768200997e-05, + "loss": 0.4817, + "num_tokens": 6915136443.0, + "step": 1659 + }, + { + "epoch": 3.3137646764926307, + "grad_norm": 0.09034707018474156, + "learning_rate": 1.366756061691153e-05, + "loss": 0.4826, + "num_tokens": 6919290448.0, + "step": 1660 + }, + { + "epoch": 3.3157631776167875, + "grad_norm": 0.08424742546785416, + "learning_rate": 1.3646897536358772e-05, + "loss": 0.4595, + "num_tokens": 6923478318.0, + "step": 1661 + }, + { + "epoch": 3.3177616787409443, + "grad_norm": 0.0897170499097556, + "learning_rate": 1.362624847503106e-05, + "loss": 0.4697, + "num_tokens": 6927634469.0, + "step": 1662 + }, + { + "epoch": 3.319760179865101, + "grad_norm": 0.08748351974320019, + "learning_rate": 1.360561346758424e-05, + "loss": 0.4738, + "num_tokens": 6931801708.0, + "step": 1663 + }, + { + "epoch": 3.321758680989258, + "grad_norm": 0.10314490790912212, + "learning_rate": 1.3584992548650575e-05, + "loss": 0.4697, + "num_tokens": 6935958875.0, + "step": 1664 + }, + { + "epoch": 3.323757182113415, + "grad_norm": 0.1102261451430303, + "learning_rate": 1.3564385752838686e-05, + "loss": 0.4764, + "num_tokens": 6940127822.0, + "step": 1665 + }, + { + "epoch": 3.325755683237572, + "grad_norm": 0.09770312093376765, + "learning_rate": 1.3543793114733484e-05, + "loss": 0.4755, + "num_tokens": 6944274257.0, + "step": 1666 + }, + { + "epoch": 3.3277541843617287, + "grad_norm": 0.09599417547182514, + "learning_rate": 1.35232146688961e-05, + "loss": 0.4887, + "num_tokens": 6948428087.0, + "step": 1667 + }, + { + "epoch": 3.3297526854858854, + "grad_norm": 0.1327973587669142, + "learning_rate": 1.3502650449863893e-05, + "loss": 0.4859, + "num_tokens": 6952613022.0, + "step": 1668 + }, + { + "epoch": 3.3317511866100427, + "grad_norm": 0.08761071017407525, + "learning_rate": 1.3482100492150289e-05, + "loss": 0.4779, + "num_tokens": 6956790817.0, + "step": 1669 + }, + { + "epoch": 3.3337496877341994, + "grad_norm": 0.1320012423010374, + "learning_rate": 1.3461564830244829e-05, + "loss": 0.4793, + "num_tokens": 6960945760.0, + "step": 1670 + }, + { + "epoch": 3.3357481888583562, + "grad_norm": 0.10507641046840172, + "learning_rate": 1.3441043498613029e-05, + "loss": 0.4711, + "num_tokens": 6965132319.0, + "step": 1671 + }, + { + "epoch": 3.337746689982513, + "grad_norm": 0.10229134309119334, + "learning_rate": 1.3420536531696357e-05, + "loss": 0.4658, + "num_tokens": 6969317594.0, + "step": 1672 + }, + { + "epoch": 3.33974519110667, + "grad_norm": 0.09817443856128726, + "learning_rate": 1.3400043963912194e-05, + "loss": 0.4737, + "num_tokens": 6973503491.0, + "step": 1673 + }, + { + "epoch": 3.341743692230827, + "grad_norm": 0.1046303993957827, + "learning_rate": 1.3379565829653722e-05, + "loss": 0.4716, + "num_tokens": 6977667812.0, + "step": 1674 + }, + { + "epoch": 3.343742193354984, + "grad_norm": 0.1046938907901531, + "learning_rate": 1.3359102163289937e-05, + "loss": 0.4812, + "num_tokens": 6981849540.0, + "step": 1675 + }, + { + "epoch": 3.3457406944791406, + "grad_norm": 0.09314906056201656, + "learning_rate": 1.3338652999165511e-05, + "loss": 0.4705, + "num_tokens": 6986037345.0, + "step": 1676 + }, + { + "epoch": 3.3477391956032974, + "grad_norm": 0.08625631094234694, + "learning_rate": 1.3318218371600815e-05, + "loss": 0.4602, + "num_tokens": 6990216383.0, + "step": 1677 + }, + { + "epoch": 3.3497376967274546, + "grad_norm": 0.10278623959877564, + "learning_rate": 1.3297798314891792e-05, + "loss": 0.4809, + "num_tokens": 6994377938.0, + "step": 1678 + }, + { + "epoch": 3.3517361978516114, + "grad_norm": 0.09461864900051034, + "learning_rate": 1.3277392863309962e-05, + "loss": 0.4885, + "num_tokens": 6998564477.0, + "step": 1679 + }, + { + "epoch": 3.353734698975768, + "grad_norm": 0.10849803660990438, + "learning_rate": 1.3257002051102301e-05, + "loss": 0.4859, + "num_tokens": 7002751692.0, + "step": 1680 + }, + { + "epoch": 3.355733200099925, + "grad_norm": 0.09904309036030533, + "learning_rate": 1.323662591249123e-05, + "loss": 0.4638, + "num_tokens": 7006936827.0, + "step": 1681 + }, + { + "epoch": 3.3577317012240817, + "grad_norm": 0.11661092679073488, + "learning_rate": 1.321626448167455e-05, + "loss": 0.4744, + "num_tokens": 7011119261.0, + "step": 1682 + }, + { + "epoch": 3.359730202348239, + "grad_norm": 0.09313601875188222, + "learning_rate": 1.319591779282536e-05, + "loss": 0.477, + "num_tokens": 7015304998.0, + "step": 1683 + }, + { + "epoch": 3.3617287034723957, + "grad_norm": 0.10537738144243167, + "learning_rate": 1.3175585880092037e-05, + "loss": 0.485, + "num_tokens": 7019489949.0, + "step": 1684 + }, + { + "epoch": 3.3637272045965525, + "grad_norm": 0.10333866510384446, + "learning_rate": 1.315526877759814e-05, + "loss": 0.4906, + "num_tokens": 7023674163.0, + "step": 1685 + }, + { + "epoch": 3.3657257057207093, + "grad_norm": 0.10119966010284001, + "learning_rate": 1.3134966519442393e-05, + "loss": 0.4911, + "num_tokens": 7027852506.0, + "step": 1686 + }, + { + "epoch": 3.3677242068448665, + "grad_norm": 0.08811485255788981, + "learning_rate": 1.3114679139698587e-05, + "loss": 0.4811, + "num_tokens": 7031989617.0, + "step": 1687 + }, + { + "epoch": 3.3697227079690233, + "grad_norm": 0.10483018671711934, + "learning_rate": 1.3094406672415543e-05, + "loss": 0.4735, + "num_tokens": 7036174123.0, + "step": 1688 + }, + { + "epoch": 3.37172120909318, + "grad_norm": 0.0965321042953571, + "learning_rate": 1.3074149151617071e-05, + "loss": 0.4702, + "num_tokens": 7040360826.0, + "step": 1689 + }, + { + "epoch": 3.373719710217337, + "grad_norm": 0.09156546176750575, + "learning_rate": 1.3053906611301876e-05, + "loss": 0.4566, + "num_tokens": 7044520517.0, + "step": 1690 + }, + { + "epoch": 3.3757182113414936, + "grad_norm": 0.09287093544246161, + "learning_rate": 1.3033679085443538e-05, + "loss": 0.4863, + "num_tokens": 7048652185.0, + "step": 1691 + }, + { + "epoch": 3.377716712465651, + "grad_norm": 0.08942244584934715, + "learning_rate": 1.301346660799042e-05, + "loss": 0.4847, + "num_tokens": 7052837968.0, + "step": 1692 + }, + { + "epoch": 3.3797152135898076, + "grad_norm": 0.08512203905570148, + "learning_rate": 1.2993269212865651e-05, + "loss": 0.4834, + "num_tokens": 7057024049.0, + "step": 1693 + }, + { + "epoch": 3.3817137147139644, + "grad_norm": 0.09085740946210719, + "learning_rate": 1.2973086933967025e-05, + "loss": 0.4646, + "num_tokens": 7061205259.0, + "step": 1694 + }, + { + "epoch": 3.3837122158381217, + "grad_norm": 0.0999109309731767, + "learning_rate": 1.2952919805166983e-05, + "loss": 0.4791, + "num_tokens": 7065364151.0, + "step": 1695 + }, + { + "epoch": 3.3857107169622784, + "grad_norm": 0.09995163077731538, + "learning_rate": 1.2932767860312529e-05, + "loss": 0.4715, + "num_tokens": 7069522202.0, + "step": 1696 + }, + { + "epoch": 3.387709218086435, + "grad_norm": 0.09722383973779727, + "learning_rate": 1.291263113322518e-05, + "loss": 0.4821, + "num_tokens": 7073684181.0, + "step": 1697 + }, + { + "epoch": 3.389707719210592, + "grad_norm": 0.0947563732857468, + "learning_rate": 1.2892509657700931e-05, + "loss": 0.4741, + "num_tokens": 7077850808.0, + "step": 1698 + }, + { + "epoch": 3.391706220334749, + "grad_norm": 0.09474256258181336, + "learning_rate": 1.2872403467510158e-05, + "loss": 0.4728, + "num_tokens": 7082036258.0, + "step": 1699 + }, + { + "epoch": 3.393704721458906, + "grad_norm": 0.10270409887075256, + "learning_rate": 1.2852312596397597e-05, + "loss": 0.4737, + "num_tokens": 7086161944.0, + "step": 1700 + }, + { + "epoch": 3.395703222583063, + "grad_norm": 0.08824969462835636, + "learning_rate": 1.2832237078082272e-05, + "loss": 0.467, + "num_tokens": 7090345317.0, + "step": 1701 + }, + { + "epoch": 3.3977017237072196, + "grad_norm": 0.09808100537181817, + "learning_rate": 1.2812176946257431e-05, + "loss": 0.4896, + "num_tokens": 7094497585.0, + "step": 1702 + }, + { + "epoch": 3.3997002248313763, + "grad_norm": 0.0953848926006688, + "learning_rate": 1.2792132234590508e-05, + "loss": 0.4656, + "num_tokens": 7098655781.0, + "step": 1703 + }, + { + "epoch": 3.4016987259555336, + "grad_norm": 0.08831848847397593, + "learning_rate": 1.2772102976723063e-05, + "loss": 0.4797, + "num_tokens": 7102840867.0, + "step": 1704 + }, + { + "epoch": 3.4036972270796904, + "grad_norm": 0.08695314105807792, + "learning_rate": 1.2752089206270701e-05, + "loss": 0.4695, + "num_tokens": 7107022806.0, + "step": 1705 + }, + { + "epoch": 3.405695728203847, + "grad_norm": 0.10054224655223681, + "learning_rate": 1.273209095682304e-05, + "loss": 0.4768, + "num_tokens": 7111195215.0, + "step": 1706 + }, + { + "epoch": 3.407694229328004, + "grad_norm": 0.09133840895692288, + "learning_rate": 1.2712108261943658e-05, + "loss": 0.4671, + "num_tokens": 7115382286.0, + "step": 1707 + }, + { + "epoch": 3.4096927304521607, + "grad_norm": 0.0875600443008626, + "learning_rate": 1.2692141155170033e-05, + "loss": 0.4805, + "num_tokens": 7119560362.0, + "step": 1708 + }, + { + "epoch": 3.411691231576318, + "grad_norm": 0.09615825281467709, + "learning_rate": 1.2672189670013455e-05, + "loss": 0.4917, + "num_tokens": 7123724318.0, + "step": 1709 + }, + { + "epoch": 3.4136897327004747, + "grad_norm": 0.09630094580653657, + "learning_rate": 1.2652253839959017e-05, + "loss": 0.4795, + "num_tokens": 7127865227.0, + "step": 1710 + }, + { + "epoch": 3.4156882338246315, + "grad_norm": 0.08748962086554168, + "learning_rate": 1.2632333698465523e-05, + "loss": 0.4611, + "num_tokens": 7132020783.0, + "step": 1711 + }, + { + "epoch": 3.4176867349487883, + "grad_norm": 0.09424498035816632, + "learning_rate": 1.2612429278965457e-05, + "loss": 0.4887, + "num_tokens": 7136191355.0, + "step": 1712 + }, + { + "epoch": 3.4196852360729455, + "grad_norm": 0.09329160486402194, + "learning_rate": 1.2592540614864928e-05, + "loss": 0.4833, + "num_tokens": 7140346897.0, + "step": 1713 + }, + { + "epoch": 3.4216837371971023, + "grad_norm": 0.09348444361600926, + "learning_rate": 1.257266773954358e-05, + "loss": 0.479, + "num_tokens": 7144530421.0, + "step": 1714 + }, + { + "epoch": 3.423682238321259, + "grad_norm": 0.08856753534108844, + "learning_rate": 1.2552810686354567e-05, + "loss": 0.4667, + "num_tokens": 7148665769.0, + "step": 1715 + }, + { + "epoch": 3.425680739445416, + "grad_norm": 0.0977843261711538, + "learning_rate": 1.2532969488624466e-05, + "loss": 0.4714, + "num_tokens": 7152852983.0, + "step": 1716 + }, + { + "epoch": 3.4276792405695726, + "grad_norm": 0.0842462194672848, + "learning_rate": 1.2513144179653307e-05, + "loss": 0.4888, + "num_tokens": 7157007863.0, + "step": 1717 + }, + { + "epoch": 3.42967774169373, + "grad_norm": 9.349549522570788, + "learning_rate": 1.2493334792714389e-05, + "loss": 0.5362, + "num_tokens": 7161193032.0, + "step": 1718 + }, + { + "epoch": 3.4316762428178866, + "grad_norm": 0.12775231613065965, + "learning_rate": 1.2473541361054315e-05, + "loss": 0.4787, + "num_tokens": 7165379338.0, + "step": 1719 + }, + { + "epoch": 3.4336747439420434, + "grad_norm": 0.09530326339968853, + "learning_rate": 1.2453763917892898e-05, + "loss": 0.4667, + "num_tokens": 7169561028.0, + "step": 1720 + }, + { + "epoch": 3.4356732450662, + "grad_norm": 0.10596525751806056, + "learning_rate": 1.2434002496423134e-05, + "loss": 0.478, + "num_tokens": 7173745060.0, + "step": 1721 + }, + { + "epoch": 3.4376717461903574, + "grad_norm": 0.1011374534011761, + "learning_rate": 1.2414257129811124e-05, + "loss": 0.4729, + "num_tokens": 7177928140.0, + "step": 1722 + }, + { + "epoch": 3.439670247314514, + "grad_norm": 0.11868883465530357, + "learning_rate": 1.2394527851196021e-05, + "loss": 0.479, + "num_tokens": 7182114309.0, + "step": 1723 + }, + { + "epoch": 3.441668748438671, + "grad_norm": 0.1341196980810507, + "learning_rate": 1.2374814693689972e-05, + "loss": 0.4779, + "num_tokens": 7186292633.0, + "step": 1724 + }, + { + "epoch": 3.4436672495628278, + "grad_norm": 0.10265979539413406, + "learning_rate": 1.2355117690378057e-05, + "loss": 0.4803, + "num_tokens": 7190451767.0, + "step": 1725 + }, + { + "epoch": 3.4456657506869846, + "grad_norm": 0.10557774338896987, + "learning_rate": 1.2335436874318293e-05, + "loss": 0.4742, + "num_tokens": 7194636955.0, + "step": 1726 + }, + { + "epoch": 3.4476642518111418, + "grad_norm": 0.09980522601699256, + "learning_rate": 1.2315772278541474e-05, + "loss": 0.4807, + "num_tokens": 7198809501.0, + "step": 1727 + }, + { + "epoch": 3.4496627529352986, + "grad_norm": 0.10672180951174881, + "learning_rate": 1.2296123936051201e-05, + "loss": 0.4718, + "num_tokens": 7202992376.0, + "step": 1728 + }, + { + "epoch": 3.4516612540594553, + "grad_norm": 0.10525384881630279, + "learning_rate": 1.2276491879823777e-05, + "loss": 0.492, + "num_tokens": 7207178989.0, + "step": 1729 + }, + { + "epoch": 3.453659755183612, + "grad_norm": 0.11989550313915855, + "learning_rate": 1.2256876142808192e-05, + "loss": 0.4699, + "num_tokens": 7211362074.0, + "step": 1730 + }, + { + "epoch": 3.4556582563077693, + "grad_norm": 0.0888510928049302, + "learning_rate": 1.223727675792604e-05, + "loss": 0.4752, + "num_tokens": 7215535348.0, + "step": 1731 + }, + { + "epoch": 3.457656757431926, + "grad_norm": 0.1103067194010573, + "learning_rate": 1.221769375807147e-05, + "loss": 0.4695, + "num_tokens": 7219719420.0, + "step": 1732 + }, + { + "epoch": 3.459655258556083, + "grad_norm": 0.10272945243507546, + "learning_rate": 1.2198127176111121e-05, + "loss": 0.4668, + "num_tokens": 7223875921.0, + "step": 1733 + }, + { + "epoch": 3.4616537596802397, + "grad_norm": 0.10496056939539242, + "learning_rate": 1.2178577044884094e-05, + "loss": 0.4607, + "num_tokens": 7228062535.0, + "step": 1734 + }, + { + "epoch": 3.4636522608043965, + "grad_norm": 0.10057807774693137, + "learning_rate": 1.2159043397201868e-05, + "loss": 0.4711, + "num_tokens": 7232249842.0, + "step": 1735 + }, + { + "epoch": 3.4656507619285537, + "grad_norm": 0.11106071293020163, + "learning_rate": 1.213952626584827e-05, + "loss": 0.4646, + "num_tokens": 7236434257.0, + "step": 1736 + }, + { + "epoch": 3.4676492630527105, + "grad_norm": 0.09937933880863992, + "learning_rate": 1.2120025683579398e-05, + "loss": 0.4801, + "num_tokens": 7240619600.0, + "step": 1737 + }, + { + "epoch": 3.4696477641768673, + "grad_norm": 0.11950996947205864, + "learning_rate": 1.2100541683123565e-05, + "loss": 0.4759, + "num_tokens": 7244775318.0, + "step": 1738 + }, + { + "epoch": 3.471646265301024, + "grad_norm": 0.10424035422040243, + "learning_rate": 1.2081074297181278e-05, + "loss": 0.4817, + "num_tokens": 7248926277.0, + "step": 1739 + }, + { + "epoch": 3.4736447664251813, + "grad_norm": 0.10636606059628359, + "learning_rate": 1.206162355842515e-05, + "loss": 0.4743, + "num_tokens": 7253111484.0, + "step": 1740 + }, + { + "epoch": 3.475643267549338, + "grad_norm": 0.117616702384278, + "learning_rate": 1.2042189499499852e-05, + "loss": 0.4588, + "num_tokens": 7257296219.0, + "step": 1741 + }, + { + "epoch": 3.477641768673495, + "grad_norm": 0.08276259500736806, + "learning_rate": 1.2022772153022046e-05, + "loss": 0.4641, + "num_tokens": 7261459441.0, + "step": 1742 + }, + { + "epoch": 3.4796402697976516, + "grad_norm": 0.11502112715761516, + "learning_rate": 1.2003371551580378e-05, + "loss": 0.4673, + "num_tokens": 7265639168.0, + "step": 1743 + }, + { + "epoch": 3.4816387709218084, + "grad_norm": 0.09452872978988718, + "learning_rate": 1.198398772773536e-05, + "loss": 0.4905, + "num_tokens": 7269792766.0, + "step": 1744 + }, + { + "epoch": 3.4836372720459656, + "grad_norm": 0.08833965833070045, + "learning_rate": 1.1964620714019371e-05, + "loss": 0.48, + "num_tokens": 7273976066.0, + "step": 1745 + }, + { + "epoch": 3.4856357731701224, + "grad_norm": 0.0957217190294918, + "learning_rate": 1.1945270542936556e-05, + "loss": 0.4664, + "num_tokens": 7278151198.0, + "step": 1746 + }, + { + "epoch": 3.487634274294279, + "grad_norm": 0.08516031657150966, + "learning_rate": 1.1925937246962793e-05, + "loss": 0.4732, + "num_tokens": 7282333009.0, + "step": 1747 + }, + { + "epoch": 3.4896327754184364, + "grad_norm": 0.08651501453106118, + "learning_rate": 1.1906620858545665e-05, + "loss": 0.4778, + "num_tokens": 7286519185.0, + "step": 1748 + }, + { + "epoch": 3.491631276542593, + "grad_norm": 0.09346187935812623, + "learning_rate": 1.188732141010434e-05, + "loss": 0.4666, + "num_tokens": 7290699203.0, + "step": 1749 + }, + { + "epoch": 3.49362977766675, + "grad_norm": 0.0837768148239929, + "learning_rate": 1.1868038934029595e-05, + "loss": 0.4636, + "num_tokens": 7294883811.0, + "step": 1750 + }, + { + "epoch": 3.4956282787909068, + "grad_norm": 0.0897055526972062, + "learning_rate": 1.1848773462683684e-05, + "loss": 0.4805, + "num_tokens": 7299067246.0, + "step": 1751 + }, + { + "epoch": 3.4976267799150635, + "grad_norm": 0.09150871195444034, + "learning_rate": 1.1829525028400353e-05, + "loss": 0.4818, + "num_tokens": 7303250988.0, + "step": 1752 + }, + { + "epoch": 3.4996252810392208, + "grad_norm": 0.08585963298924593, + "learning_rate": 1.1810293663484725e-05, + "loss": 0.4651, + "num_tokens": 7307435848.0, + "step": 1753 + }, + { + "epoch": 3.5016237821633776, + "grad_norm": 0.100997411147579, + "learning_rate": 1.1791079400213309e-05, + "loss": 0.4864, + "num_tokens": 7311618743.0, + "step": 1754 + }, + { + "epoch": 3.5036222832875343, + "grad_norm": 0.08411795564468888, + "learning_rate": 1.1771882270833883e-05, + "loss": 0.4771, + "num_tokens": 7315777239.0, + "step": 1755 + }, + { + "epoch": 3.505620784411691, + "grad_norm": 0.0879826584543144, + "learning_rate": 1.1752702307565472e-05, + "loss": 0.4691, + "num_tokens": 7319934021.0, + "step": 1756 + }, + { + "epoch": 3.5076192855358483, + "grad_norm": 0.09140865814871416, + "learning_rate": 1.1733539542598314e-05, + "loss": 0.4792, + "num_tokens": 7324117845.0, + "step": 1757 + }, + { + "epoch": 3.509617786660005, + "grad_norm": 0.09185591711805183, + "learning_rate": 1.171439400809375e-05, + "loss": 0.4736, + "num_tokens": 7328280099.0, + "step": 1758 + }, + { + "epoch": 3.511616287784162, + "grad_norm": 0.0891899059876296, + "learning_rate": 1.1695265736184238e-05, + "loss": 0.4712, + "num_tokens": 7332464095.0, + "step": 1759 + }, + { + "epoch": 3.5136147889083187, + "grad_norm": 0.08910086388374985, + "learning_rate": 1.1676154758973226e-05, + "loss": 0.4787, + "num_tokens": 7336649801.0, + "step": 1760 + }, + { + "epoch": 3.5156132900324755, + "grad_norm": 0.10132590714389968, + "learning_rate": 1.1657061108535172e-05, + "loss": 0.481, + "num_tokens": 7340835019.0, + "step": 1761 + }, + { + "epoch": 3.5176117911566323, + "grad_norm": 0.08563191541888726, + "learning_rate": 1.1637984816915426e-05, + "loss": 0.4962, + "num_tokens": 7345018564.0, + "step": 1762 + }, + { + "epoch": 3.5196102922807895, + "grad_norm": 0.09509417507492443, + "learning_rate": 1.1618925916130212e-05, + "loss": 0.4926, + "num_tokens": 7349204722.0, + "step": 1763 + }, + { + "epoch": 3.5216087934049463, + "grad_norm": 0.08317601120804083, + "learning_rate": 1.1599884438166584e-05, + "loss": 0.4729, + "num_tokens": 7353379094.0, + "step": 1764 + }, + { + "epoch": 3.523607294529103, + "grad_norm": 0.09176727443355805, + "learning_rate": 1.158086041498232e-05, + "loss": 0.4734, + "num_tokens": 7357553097.0, + "step": 1765 + }, + { + "epoch": 3.5256057956532603, + "grad_norm": 0.0871054193543384, + "learning_rate": 1.1561853878505947e-05, + "loss": 0.4647, + "num_tokens": 7361725570.0, + "step": 1766 + }, + { + "epoch": 3.527604296777417, + "grad_norm": 0.10714170707716661, + "learning_rate": 1.15428648606366e-05, + "loss": 0.4746, + "num_tokens": 7365910911.0, + "step": 1767 + }, + { + "epoch": 3.529602797901574, + "grad_norm": 0.09363153757481958, + "learning_rate": 1.1523893393244045e-05, + "loss": 0.4711, + "num_tokens": 7370095703.0, + "step": 1768 + }, + { + "epoch": 3.5316012990257306, + "grad_norm": 0.08891316649713714, + "learning_rate": 1.150493950816857e-05, + "loss": 0.4687, + "num_tokens": 7374281653.0, + "step": 1769 + }, + { + "epoch": 3.5335998001498874, + "grad_norm": 0.10732354792044134, + "learning_rate": 1.1486003237220977e-05, + "loss": 0.4799, + "num_tokens": 7378467385.0, + "step": 1770 + }, + { + "epoch": 3.5355983012740446, + "grad_norm": 0.08172821655585842, + "learning_rate": 1.1467084612182483e-05, + "loss": 0.4718, + "num_tokens": 7382653335.0, + "step": 1771 + }, + { + "epoch": 3.5375968023982014, + "grad_norm": 0.09053611967575677, + "learning_rate": 1.14481836648047e-05, + "loss": 0.4833, + "num_tokens": 7386838323.0, + "step": 1772 + }, + { + "epoch": 3.539595303522358, + "grad_norm": 0.08981865004137597, + "learning_rate": 1.1429300426809576e-05, + "loss": 0.4706, + "num_tokens": 7390996094.0, + "step": 1773 + }, + { + "epoch": 3.541593804646515, + "grad_norm": 0.09370376921977419, + "learning_rate": 1.141043492988932e-05, + "loss": 0.4806, + "num_tokens": 7395152483.0, + "step": 1774 + }, + { + "epoch": 3.543592305770672, + "grad_norm": 0.09751122285581082, + "learning_rate": 1.1391587205706392e-05, + "loss": 0.4762, + "num_tokens": 7399338588.0, + "step": 1775 + }, + { + "epoch": 3.545590806894829, + "grad_norm": 0.08587632708792235, + "learning_rate": 1.13727572858934e-05, + "loss": 0.4756, + "num_tokens": 7403438295.0, + "step": 1776 + }, + { + "epoch": 3.5475893080189858, + "grad_norm": 0.08941321540450448, + "learning_rate": 1.1353945202053072e-05, + "loss": 0.4794, + "num_tokens": 7407591835.0, + "step": 1777 + }, + { + "epoch": 3.5495878091431425, + "grad_norm": 0.09267265978819407, + "learning_rate": 1.1335150985758219e-05, + "loss": 0.472, + "num_tokens": 7411777768.0, + "step": 1778 + }, + { + "epoch": 3.5515863102672993, + "grad_norm": 0.10495293296461373, + "learning_rate": 1.1316374668551658e-05, + "loss": 0.4745, + "num_tokens": 7415951028.0, + "step": 1779 + }, + { + "epoch": 3.5535848113914565, + "grad_norm": 0.09323526633017573, + "learning_rate": 1.129761628194615e-05, + "loss": 0.4671, + "num_tokens": 7420124467.0, + "step": 1780 + }, + { + "epoch": 3.5555833125156133, + "grad_norm": 0.10376937997335371, + "learning_rate": 1.1278875857424381e-05, + "loss": 0.4618, + "num_tokens": 7424292625.0, + "step": 1781 + }, + { + "epoch": 3.55758181363977, + "grad_norm": 0.08804021447760442, + "learning_rate": 1.1260153426438874e-05, + "loss": 0.4711, + "num_tokens": 7428464066.0, + "step": 1782 + }, + { + "epoch": 3.559580314763927, + "grad_norm": 0.08730939483810296, + "learning_rate": 1.1241449020411972e-05, + "loss": 0.4783, + "num_tokens": 7432619461.0, + "step": 1783 + }, + { + "epoch": 3.561578815888084, + "grad_norm": 0.1008707734200512, + "learning_rate": 1.1222762670735761e-05, + "loss": 0.4796, + "num_tokens": 7436803149.0, + "step": 1784 + }, + { + "epoch": 3.563577317012241, + "grad_norm": 0.10091796922519079, + "learning_rate": 1.120409440877201e-05, + "loss": 0.4823, + "num_tokens": 7440912381.0, + "step": 1785 + }, + { + "epoch": 3.5655758181363977, + "grad_norm": 0.09683416272121902, + "learning_rate": 1.118544426585214e-05, + "loss": 0.4817, + "num_tokens": 7445067829.0, + "step": 1786 + }, + { + "epoch": 3.5675743192605545, + "grad_norm": 0.08717069950839276, + "learning_rate": 1.1166812273277161e-05, + "loss": 0.487, + "num_tokens": 7449253033.0, + "step": 1787 + }, + { + "epoch": 3.5695728203847112, + "grad_norm": 0.09181644742913563, + "learning_rate": 1.1148198462317632e-05, + "loss": 0.4858, + "num_tokens": 7453412298.0, + "step": 1788 + }, + { + "epoch": 3.5715713215088685, + "grad_norm": 0.09958227130874907, + "learning_rate": 1.1129602864213575e-05, + "loss": 0.4772, + "num_tokens": 7457595341.0, + "step": 1789 + }, + { + "epoch": 3.5735698226330253, + "grad_norm": 0.09122722460241375, + "learning_rate": 1.1111025510174467e-05, + "loss": 0.4697, + "num_tokens": 7461778685.0, + "step": 1790 + }, + { + "epoch": 3.575568323757182, + "grad_norm": 0.09124689647137613, + "learning_rate": 1.1092466431379143e-05, + "loss": 0.4726, + "num_tokens": 7465965631.0, + "step": 1791 + }, + { + "epoch": 3.5775668248813393, + "grad_norm": 0.08978231854045206, + "learning_rate": 1.107392565897578e-05, + "loss": 0.4894, + "num_tokens": 7470153217.0, + "step": 1792 + }, + { + "epoch": 3.579565326005496, + "grad_norm": 0.08329740940033344, + "learning_rate": 1.1055403224081847e-05, + "loss": 0.4689, + "num_tokens": 7474300260.0, + "step": 1793 + }, + { + "epoch": 3.581563827129653, + "grad_norm": 0.08599897101272523, + "learning_rate": 1.1036899157784005e-05, + "loss": 0.4773, + "num_tokens": 7478469054.0, + "step": 1794 + }, + { + "epoch": 3.5835623282538096, + "grad_norm": 0.09391798621193263, + "learning_rate": 1.1018413491138099e-05, + "loss": 0.4743, + "num_tokens": 7482653758.0, + "step": 1795 + }, + { + "epoch": 3.5855608293779664, + "grad_norm": 0.08402136852633707, + "learning_rate": 1.0999946255169096e-05, + "loss": 0.4639, + "num_tokens": 7486823291.0, + "step": 1796 + }, + { + "epoch": 3.587559330502123, + "grad_norm": 0.09517453417729986, + "learning_rate": 1.098149748087104e-05, + "loss": 0.4708, + "num_tokens": 7491004623.0, + "step": 1797 + }, + { + "epoch": 3.5895578316262804, + "grad_norm": 0.09275196544341986, + "learning_rate": 1.0963067199206972e-05, + "loss": 0.4731, + "num_tokens": 7495188674.0, + "step": 1798 + }, + { + "epoch": 3.591556332750437, + "grad_norm": 0.09008916055578602, + "learning_rate": 1.0944655441108903e-05, + "loss": 0.462, + "num_tokens": 7499374064.0, + "step": 1799 + }, + { + "epoch": 3.593554833874594, + "grad_norm": 0.09072553001865884, + "learning_rate": 1.0926262237477752e-05, + "loss": 0.4686, + "num_tokens": 7503535242.0, + "step": 1800 + }, + { + "epoch": 3.595553334998751, + "grad_norm": 0.08601198489404335, + "learning_rate": 1.0907887619183308e-05, + "loss": 0.4806, + "num_tokens": 7507716740.0, + "step": 1801 + }, + { + "epoch": 3.597551836122908, + "grad_norm": 0.09644007554248991, + "learning_rate": 1.0889531617064169e-05, + "loss": 0.4659, + "num_tokens": 7511877459.0, + "step": 1802 + }, + { + "epoch": 3.5995503372470647, + "grad_norm": 0.09947596418640496, + "learning_rate": 1.0871194261927673e-05, + "loss": 0.4824, + "num_tokens": 7516064106.0, + "step": 1803 + }, + { + "epoch": 3.6015488383712215, + "grad_norm": 0.0870656832889935, + "learning_rate": 1.0852875584549869e-05, + "loss": 0.4632, + "num_tokens": 7520251402.0, + "step": 1804 + }, + { + "epoch": 3.6035473394953783, + "grad_norm": 0.09764254913265406, + "learning_rate": 1.0834575615675462e-05, + "loss": 0.4854, + "num_tokens": 7524399992.0, + "step": 1805 + }, + { + "epoch": 3.605545840619535, + "grad_norm": 0.09671927428737186, + "learning_rate": 1.0816294386017766e-05, + "loss": 0.4735, + "num_tokens": 7528584660.0, + "step": 1806 + }, + { + "epoch": 3.6075443417436923, + "grad_norm": 0.0971839272594111, + "learning_rate": 1.079803192625863e-05, + "loss": 0.4842, + "num_tokens": 7532765801.0, + "step": 1807 + }, + { + "epoch": 3.609542842867849, + "grad_norm": 0.08935230274893348, + "learning_rate": 1.07797882670484e-05, + "loss": 0.4793, + "num_tokens": 7536950250.0, + "step": 1808 + }, + { + "epoch": 3.611541343992006, + "grad_norm": 0.08766219482429666, + "learning_rate": 1.0761563439005878e-05, + "loss": 0.4647, + "num_tokens": 7541135119.0, + "step": 1809 + }, + { + "epoch": 3.613539845116163, + "grad_norm": 0.08669031797244502, + "learning_rate": 1.074335747271826e-05, + "loss": 0.4662, + "num_tokens": 7545296599.0, + "step": 1810 + }, + { + "epoch": 3.61553834624032, + "grad_norm": 0.09014187453182838, + "learning_rate": 1.0725170398741087e-05, + "loss": 0.4719, + "num_tokens": 7549455588.0, + "step": 1811 + }, + { + "epoch": 3.6175368473644767, + "grad_norm": 0.0950726925423079, + "learning_rate": 1.0707002247598188e-05, + "loss": 0.4794, + "num_tokens": 7553639683.0, + "step": 1812 + }, + { + "epoch": 3.6195353484886335, + "grad_norm": 0.08908516232603424, + "learning_rate": 1.0688853049781627e-05, + "loss": 0.4746, + "num_tokens": 7557807643.0, + "step": 1813 + }, + { + "epoch": 3.6215338496127902, + "grad_norm": 0.09271838842813897, + "learning_rate": 1.067072283575167e-05, + "loss": 0.4853, + "num_tokens": 7561940185.0, + "step": 1814 + }, + { + "epoch": 3.623532350736947, + "grad_norm": 0.09618695486096934, + "learning_rate": 1.0652611635936725e-05, + "loss": 0.4859, + "num_tokens": 7566127313.0, + "step": 1815 + }, + { + "epoch": 3.6255308518611042, + "grad_norm": 0.08768552083394789, + "learning_rate": 1.0634519480733276e-05, + "loss": 0.475, + "num_tokens": 7570314147.0, + "step": 1816 + }, + { + "epoch": 3.627529352985261, + "grad_norm": 0.09826133027504994, + "learning_rate": 1.061644640050585e-05, + "loss": 0.4893, + "num_tokens": 7574486988.0, + "step": 1817 + }, + { + "epoch": 3.629527854109418, + "grad_norm": 0.09494798689619627, + "learning_rate": 1.0598392425586947e-05, + "loss": 0.4738, + "num_tokens": 7578644665.0, + "step": 1818 + }, + { + "epoch": 3.631526355233575, + "grad_norm": 0.09783161924023522, + "learning_rate": 1.0580357586277028e-05, + "loss": 0.4885, + "num_tokens": 7582823863.0, + "step": 1819 + }, + { + "epoch": 3.633524856357732, + "grad_norm": 0.08649363383969437, + "learning_rate": 1.0562341912844425e-05, + "loss": 0.4746, + "num_tokens": 7587007551.0, + "step": 1820 + }, + { + "epoch": 3.6355233574818886, + "grad_norm": 0.09239446102678339, + "learning_rate": 1.0544345435525298e-05, + "loss": 0.4696, + "num_tokens": 7591194758.0, + "step": 1821 + }, + { + "epoch": 3.6375218586060454, + "grad_norm": 0.09241125085657588, + "learning_rate": 1.0526368184523585e-05, + "loss": 0.4733, + "num_tokens": 7595357031.0, + "step": 1822 + }, + { + "epoch": 3.639520359730202, + "grad_norm": 0.09396707460696171, + "learning_rate": 1.0508410190010984e-05, + "loss": 0.4956, + "num_tokens": 7599512502.0, + "step": 1823 + }, + { + "epoch": 3.6415188608543594, + "grad_norm": 0.08382424553810054, + "learning_rate": 1.0490471482126838e-05, + "loss": 0.4559, + "num_tokens": 7603646782.0, + "step": 1824 + }, + { + "epoch": 3.643517361978516, + "grad_norm": 0.08723229241346493, + "learning_rate": 1.047255209097815e-05, + "loss": 0.4701, + "num_tokens": 7607782906.0, + "step": 1825 + }, + { + "epoch": 3.645515863102673, + "grad_norm": 0.09787927475556657, + "learning_rate": 1.0454652046639486e-05, + "loss": 0.4749, + "num_tokens": 7611916044.0, + "step": 1826 + }, + { + "epoch": 3.6475143642268297, + "grad_norm": 0.08267752058322317, + "learning_rate": 1.0436771379152946e-05, + "loss": 0.4713, + "num_tokens": 7616057023.0, + "step": 1827 + }, + { + "epoch": 3.649512865350987, + "grad_norm": 0.0898877741811075, + "learning_rate": 1.0418910118528109e-05, + "loss": 0.4739, + "num_tokens": 7620238678.0, + "step": 1828 + }, + { + "epoch": 3.6515113664751437, + "grad_norm": 0.08891415108630303, + "learning_rate": 1.0401068294741999e-05, + "loss": 0.4649, + "num_tokens": 7624422177.0, + "step": 1829 + }, + { + "epoch": 3.6535098675993005, + "grad_norm": 0.09458818905481704, + "learning_rate": 1.0383245937738992e-05, + "loss": 0.4583, + "num_tokens": 7628604641.0, + "step": 1830 + }, + { + "epoch": 3.6555083687234573, + "grad_norm": 0.09707166327137819, + "learning_rate": 1.0365443077430802e-05, + "loss": 0.4777, + "num_tokens": 7632789226.0, + "step": 1831 + }, + { + "epoch": 3.657506869847614, + "grad_norm": 0.09281454584747394, + "learning_rate": 1.034765974369643e-05, + "loss": 0.4838, + "num_tokens": 7636974146.0, + "step": 1832 + }, + { + "epoch": 3.6595053709717713, + "grad_norm": 0.09340871386229709, + "learning_rate": 1.0329895966382092e-05, + "loss": 0.4761, + "num_tokens": 7641156525.0, + "step": 1833 + }, + { + "epoch": 3.661503872095928, + "grad_norm": 0.09755950973885769, + "learning_rate": 1.0312151775301202e-05, + "loss": 0.4799, + "num_tokens": 7645342719.0, + "step": 1834 + }, + { + "epoch": 3.663502373220085, + "grad_norm": 0.08969618382250054, + "learning_rate": 1.0294427200234272e-05, + "loss": 0.4827, + "num_tokens": 7649525397.0, + "step": 1835 + }, + { + "epoch": 3.6655008743442417, + "grad_norm": 0.09088910369849834, + "learning_rate": 1.0276722270928927e-05, + "loss": 0.5005, + "num_tokens": 7653707673.0, + "step": 1836 + }, + { + "epoch": 3.667499375468399, + "grad_norm": 0.08961375139588866, + "learning_rate": 1.0259037017099792e-05, + "loss": 0.4734, + "num_tokens": 7657891119.0, + "step": 1837 + }, + { + "epoch": 3.6694978765925557, + "grad_norm": 0.09242547684072316, + "learning_rate": 1.024137146842847e-05, + "loss": 0.4886, + "num_tokens": 7662048393.0, + "step": 1838 + }, + { + "epoch": 3.6714963777167124, + "grad_norm": 0.09371152352495986, + "learning_rate": 1.022372565456352e-05, + "loss": 0.4749, + "num_tokens": 7666233451.0, + "step": 1839 + }, + { + "epoch": 3.6734948788408692, + "grad_norm": 0.09143173310400146, + "learning_rate": 1.0206099605120347e-05, + "loss": 0.4634, + "num_tokens": 7670419203.0, + "step": 1840 + }, + { + "epoch": 3.675493379965026, + "grad_norm": 0.08120446873272698, + "learning_rate": 1.0188493349681215e-05, + "loss": 0.4744, + "num_tokens": 7674603793.0, + "step": 1841 + }, + { + "epoch": 3.6774918810891832, + "grad_norm": 0.09936625352954642, + "learning_rate": 1.0170906917795134e-05, + "loss": 0.4839, + "num_tokens": 7678789777.0, + "step": 1842 + }, + { + "epoch": 3.67949038221334, + "grad_norm": 0.09579180714959472, + "learning_rate": 1.015334033897788e-05, + "loss": 0.4747, + "num_tokens": 7682928739.0, + "step": 1843 + }, + { + "epoch": 3.681488883337497, + "grad_norm": 0.08289376479180005, + "learning_rate": 1.0135793642711871e-05, + "loss": 0.4665, + "num_tokens": 7687095165.0, + "step": 1844 + }, + { + "epoch": 3.683487384461654, + "grad_norm": 0.09515582876329189, + "learning_rate": 1.0118266858446196e-05, + "loss": 0.4766, + "num_tokens": 7691280098.0, + "step": 1845 + }, + { + "epoch": 3.685485885585811, + "grad_norm": 0.09140860348861266, + "learning_rate": 1.0100760015596491e-05, + "loss": 0.475, + "num_tokens": 7695447425.0, + "step": 1846 + }, + { + "epoch": 3.6874843867099676, + "grad_norm": 0.0892314505985872, + "learning_rate": 1.0083273143544934e-05, + "loss": 0.4764, + "num_tokens": 7699633489.0, + "step": 1847 + }, + { + "epoch": 3.6894828878341244, + "grad_norm": 0.08757661290739793, + "learning_rate": 1.0065806271640204e-05, + "loss": 0.4707, + "num_tokens": 7703816736.0, + "step": 1848 + }, + { + "epoch": 3.691481388958281, + "grad_norm": 0.08953013198384913, + "learning_rate": 1.004835942919738e-05, + "loss": 0.4766, + "num_tokens": 7707997611.0, + "step": 1849 + }, + { + "epoch": 3.693479890082438, + "grad_norm": 0.09932204168517068, + "learning_rate": 1.0030932645497958e-05, + "loss": 0.4667, + "num_tokens": 7712169250.0, + "step": 1850 + }, + { + "epoch": 3.695478391206595, + "grad_norm": 0.105093159460942, + "learning_rate": 1.0013525949789745e-05, + "loss": 0.4881, + "num_tokens": 7716355857.0, + "step": 1851 + }, + { + "epoch": 3.697476892330752, + "grad_norm": 0.10141530354533775, + "learning_rate": 9.996139371286842e-06, + "loss": 0.4714, + "num_tokens": 7720526157.0, + "step": 1852 + }, + { + "epoch": 3.6994753934549087, + "grad_norm": 0.09317147289500843, + "learning_rate": 9.978772939169592e-06, + "loss": 0.4816, + "num_tokens": 7724651998.0, + "step": 1853 + }, + { + "epoch": 3.701473894579066, + "grad_norm": 0.08182242626200527, + "learning_rate": 9.961426682584521e-06, + "loss": 0.4545, + "num_tokens": 7728827594.0, + "step": 1854 + }, + { + "epoch": 3.7034723957032227, + "grad_norm": 0.13297262004837254, + "learning_rate": 9.944100630644295e-06, + "loss": 0.4829, + "num_tokens": 7733008567.0, + "step": 1855 + }, + { + "epoch": 3.7054708968273795, + "grad_norm": 0.08489400437226004, + "learning_rate": 9.926794812427665e-06, + "loss": 0.4816, + "num_tokens": 7737192781.0, + "step": 1856 + }, + { + "epoch": 3.7074693979515363, + "grad_norm": 0.08881233726237753, + "learning_rate": 9.909509256979427e-06, + "loss": 0.469, + "num_tokens": 7741367767.0, + "step": 1857 + }, + { + "epoch": 3.709467899075693, + "grad_norm": 0.08628741123152059, + "learning_rate": 9.89224399331037e-06, + "loss": 0.471, + "num_tokens": 7745528248.0, + "step": 1858 + }, + { + "epoch": 3.71146640019985, + "grad_norm": 0.0948535100207662, + "learning_rate": 9.874999050397236e-06, + "loss": 0.4704, + "num_tokens": 7749711096.0, + "step": 1859 + }, + { + "epoch": 3.713464901324007, + "grad_norm": 0.0803730002540711, + "learning_rate": 9.857774457182645e-06, + "loss": 0.4889, + "num_tokens": 7753869132.0, + "step": 1860 + }, + { + "epoch": 3.715463402448164, + "grad_norm": 0.08494025102717362, + "learning_rate": 9.840570242575067e-06, + "loss": 0.4905, + "num_tokens": 7758017766.0, + "step": 1861 + }, + { + "epoch": 3.7174619035723206, + "grad_norm": 0.08861794935991386, + "learning_rate": 9.823386435448777e-06, + "loss": 0.4824, + "num_tokens": 7762178333.0, + "step": 1862 + }, + { + "epoch": 3.719460404696478, + "grad_norm": 0.09139042266708886, + "learning_rate": 9.806223064643809e-06, + "loss": 0.4773, + "num_tokens": 7766363988.0, + "step": 1863 + }, + { + "epoch": 3.7214589058206347, + "grad_norm": 0.08877182665421425, + "learning_rate": 9.789080158965876e-06, + "loss": 0.4837, + "num_tokens": 7770548922.0, + "step": 1864 + }, + { + "epoch": 3.7234574069447914, + "grad_norm": 0.09366490421574755, + "learning_rate": 9.771957747186353e-06, + "loss": 0.4841, + "num_tokens": 7774735410.0, + "step": 1865 + }, + { + "epoch": 3.725455908068948, + "grad_norm": 0.08906433358607357, + "learning_rate": 9.754855858042218e-06, + "loss": 0.4887, + "num_tokens": 7778919443.0, + "step": 1866 + }, + { + "epoch": 3.727454409193105, + "grad_norm": 0.08847517855812413, + "learning_rate": 9.737774520236012e-06, + "loss": 0.4659, + "num_tokens": 7783102438.0, + "step": 1867 + }, + { + "epoch": 3.729452910317262, + "grad_norm": 0.10102748580075348, + "learning_rate": 9.720713762435788e-06, + "loss": 0.4697, + "num_tokens": 7787286975.0, + "step": 1868 + }, + { + "epoch": 3.731451411441419, + "grad_norm": 0.09655315867918719, + "learning_rate": 9.703673613275042e-06, + "loss": 0.4721, + "num_tokens": 7791471892.0, + "step": 1869 + }, + { + "epoch": 3.733449912565576, + "grad_norm": 0.0879430123784072, + "learning_rate": 9.686654101352689e-06, + "loss": 0.4787, + "num_tokens": 7795634250.0, + "step": 1870 + }, + { + "epoch": 3.7354484136897326, + "grad_norm": 0.1035892781816886, + "learning_rate": 9.669655255233014e-06, + "loss": 0.4778, + "num_tokens": 7799738171.0, + "step": 1871 + }, + { + "epoch": 3.73744691481389, + "grad_norm": 0.10883718795586811, + "learning_rate": 9.652677103445622e-06, + "loss": 0.4653, + "num_tokens": 7803917598.0, + "step": 1872 + }, + { + "epoch": 3.7394454159380466, + "grad_norm": 0.08729136117160077, + "learning_rate": 9.63571967448537e-06, + "loss": 0.4776, + "num_tokens": 7808100466.0, + "step": 1873 + }, + { + "epoch": 3.7414439170622034, + "grad_norm": 0.09647464628166579, + "learning_rate": 9.618782996812344e-06, + "loss": 0.4823, + "num_tokens": 7812252418.0, + "step": 1874 + }, + { + "epoch": 3.74344241818636, + "grad_norm": 0.09184765029966978, + "learning_rate": 9.601867098851801e-06, + "loss": 0.4715, + "num_tokens": 7816408324.0, + "step": 1875 + }, + { + "epoch": 3.745440919310517, + "grad_norm": 0.0936386453131224, + "learning_rate": 9.584972008994123e-06, + "loss": 0.4754, + "num_tokens": 7820586621.0, + "step": 1876 + }, + { + "epoch": 3.747439420434674, + "grad_norm": 0.0983983661084726, + "learning_rate": 9.568097755594783e-06, + "loss": 0.4626, + "num_tokens": 7824769257.0, + "step": 1877 + }, + { + "epoch": 3.749437921558831, + "grad_norm": 0.1100774980639209, + "learning_rate": 9.551244366974261e-06, + "loss": 0.4841, + "num_tokens": 7828951653.0, + "step": 1878 + }, + { + "epoch": 3.7514364226829877, + "grad_norm": 0.08153146876491112, + "learning_rate": 9.534411871418026e-06, + "loss": 0.4644, + "num_tokens": 7833103549.0, + "step": 1879 + }, + { + "epoch": 3.7534349238071445, + "grad_norm": 0.11433791087073801, + "learning_rate": 9.517600297176487e-06, + "loss": 0.4749, + "num_tokens": 7837256179.0, + "step": 1880 + }, + { + "epoch": 3.7554334249313017, + "grad_norm": 0.09914452008001229, + "learning_rate": 9.500809672464947e-06, + "loss": 0.4731, + "num_tokens": 7841442347.0, + "step": 1881 + }, + { + "epoch": 3.7574319260554585, + "grad_norm": 0.08715726728337825, + "learning_rate": 9.484040025463537e-06, + "loss": 0.4766, + "num_tokens": 7845625360.0, + "step": 1882 + }, + { + "epoch": 3.7594304271796153, + "grad_norm": 0.10879615688858842, + "learning_rate": 9.467291384317177e-06, + "loss": 0.4765, + "num_tokens": 7849811351.0, + "step": 1883 + }, + { + "epoch": 3.761428928303772, + "grad_norm": 0.0829248541774169, + "learning_rate": 9.450563777135537e-06, + "loss": 0.4715, + "num_tokens": 7853983917.0, + "step": 1884 + }, + { + "epoch": 3.763427429427929, + "grad_norm": 0.09959350025256329, + "learning_rate": 9.43385723199299e-06, + "loss": 0.4756, + "num_tokens": 7858140311.0, + "step": 1885 + }, + { + "epoch": 3.765425930552086, + "grad_norm": 0.09195259737426816, + "learning_rate": 9.417171776928568e-06, + "loss": 0.4657, + "num_tokens": 7862325465.0, + "step": 1886 + }, + { + "epoch": 3.767424431676243, + "grad_norm": 0.08852284060783404, + "learning_rate": 9.400507439945886e-06, + "loss": 0.4805, + "num_tokens": 7866511164.0, + "step": 1887 + }, + { + "epoch": 3.7694229328003996, + "grad_norm": 0.10232061448626426, + "learning_rate": 9.38386424901312e-06, + "loss": 0.4676, + "num_tokens": 7870694773.0, + "step": 1888 + }, + { + "epoch": 3.7714214339245564, + "grad_norm": 0.10023304138113351, + "learning_rate": 9.367242232062974e-06, + "loss": 0.4845, + "num_tokens": 7874880829.0, + "step": 1889 + }, + { + "epoch": 3.7734199350487136, + "grad_norm": 0.08846396972494391, + "learning_rate": 9.350641416992603e-06, + "loss": 0.4687, + "num_tokens": 7879061537.0, + "step": 1890 + }, + { + "epoch": 3.7754184361728704, + "grad_norm": 0.13447531225438122, + "learning_rate": 9.33406183166357e-06, + "loss": 0.4732, + "num_tokens": 7883245707.0, + "step": 1891 + }, + { + "epoch": 3.777416937297027, + "grad_norm": 0.08829544303649121, + "learning_rate": 9.317503503901823e-06, + "loss": 0.4736, + "num_tokens": 7887429053.0, + "step": 1892 + }, + { + "epoch": 3.779415438421184, + "grad_norm": 0.10135114461068966, + "learning_rate": 9.300966461497619e-06, + "loss": 0.4716, + "num_tokens": 7891591078.0, + "step": 1893 + }, + { + "epoch": 3.7814139395453408, + "grad_norm": 0.12856356087696438, + "learning_rate": 9.284450732205502e-06, + "loss": 0.4641, + "num_tokens": 7895775323.0, + "step": 1894 + }, + { + "epoch": 3.783412440669498, + "grad_norm": 0.09582413033722738, + "learning_rate": 9.26795634374425e-06, + "loss": 0.4749, + "num_tokens": 7899962793.0, + "step": 1895 + }, + { + "epoch": 3.785410941793655, + "grad_norm": 0.09789961096194957, + "learning_rate": 9.251483323796806e-06, + "loss": 0.485, + "num_tokens": 7904148822.0, + "step": 1896 + }, + { + "epoch": 3.7874094429178116, + "grad_norm": 0.1163324378009875, + "learning_rate": 9.23503170001026e-06, + "loss": 0.4816, + "num_tokens": 7908336728.0, + "step": 1897 + }, + { + "epoch": 3.789407944041969, + "grad_norm": 0.09812457862160255, + "learning_rate": 9.218601499995798e-06, + "loss": 0.472, + "num_tokens": 7912524665.0, + "step": 1898 + }, + { + "epoch": 3.7914064451661256, + "grad_norm": 0.11416098401202499, + "learning_rate": 9.202192751328637e-06, + "loss": 0.4866, + "num_tokens": 7916709078.0, + "step": 1899 + }, + { + "epoch": 3.7934049462902824, + "grad_norm": 0.09698358557563833, + "learning_rate": 9.185805481548007e-06, + "loss": 0.4619, + "num_tokens": 7920887814.0, + "step": 1900 + }, + { + "epoch": 3.795403447414439, + "grad_norm": 0.0891234517111331, + "learning_rate": 9.16943971815708e-06, + "loss": 0.4768, + "num_tokens": 7925075299.0, + "step": 1901 + }, + { + "epoch": 3.797401948538596, + "grad_norm": 0.08905568651654885, + "learning_rate": 9.153095488622921e-06, + "loss": 0.4702, + "num_tokens": 7929260929.0, + "step": 1902 + }, + { + "epoch": 3.7994004496627527, + "grad_norm": 0.08589079112314177, + "learning_rate": 9.136772820376479e-06, + "loss": 0.4748, + "num_tokens": 7933447196.0, + "step": 1903 + }, + { + "epoch": 3.80139895078691, + "grad_norm": 0.09752076663340482, + "learning_rate": 9.12047174081251e-06, + "loss": 0.4706, + "num_tokens": 7937629559.0, + "step": 1904 + }, + { + "epoch": 3.8033974519110667, + "grad_norm": 0.0926153650702368, + "learning_rate": 9.104192277289523e-06, + "loss": 0.4672, + "num_tokens": 7941813012.0, + "step": 1905 + }, + { + "epoch": 3.8053959530352235, + "grad_norm": 0.0837805982379237, + "learning_rate": 9.087934457129756e-06, + "loss": 0.4437, + "num_tokens": 7945956438.0, + "step": 1906 + }, + { + "epoch": 3.8073944541593807, + "grad_norm": 0.09168448933434845, + "learning_rate": 9.071698307619132e-06, + "loss": 0.4745, + "num_tokens": 7950116349.0, + "step": 1907 + }, + { + "epoch": 3.8093929552835375, + "grad_norm": 0.08596470111977912, + "learning_rate": 9.055483856007182e-06, + "loss": 0.4686, + "num_tokens": 7954271152.0, + "step": 1908 + }, + { + "epoch": 3.8113914564076943, + "grad_norm": 0.08763296809191264, + "learning_rate": 9.039291129507049e-06, + "loss": 0.4728, + "num_tokens": 7958427809.0, + "step": 1909 + }, + { + "epoch": 3.813389957531851, + "grad_norm": 0.10285299948009391, + "learning_rate": 9.023120155295395e-06, + "loss": 0.4782, + "num_tokens": 7962612816.0, + "step": 1910 + }, + { + "epoch": 3.815388458656008, + "grad_norm": 0.08792670087534506, + "learning_rate": 9.006970960512365e-06, + "loss": 0.475, + "num_tokens": 7966756113.0, + "step": 1911 + }, + { + "epoch": 3.8173869597801646, + "grad_norm": 0.08509744039393145, + "learning_rate": 8.990843572261581e-06, + "loss": 0.4652, + "num_tokens": 7970904097.0, + "step": 1912 + }, + { + "epoch": 3.819385460904322, + "grad_norm": 0.08895292309246493, + "learning_rate": 8.974738017610043e-06, + "loss": 0.4642, + "num_tokens": 7975089015.0, + "step": 1913 + }, + { + "epoch": 3.8213839620284786, + "grad_norm": 0.08808301968621683, + "learning_rate": 8.95865432358812e-06, + "loss": 0.4781, + "num_tokens": 7979246691.0, + "step": 1914 + }, + { + "epoch": 3.8233824631526354, + "grad_norm": 0.09021686211022321, + "learning_rate": 8.942592517189476e-06, + "loss": 0.4781, + "num_tokens": 7983433528.0, + "step": 1915 + }, + { + "epoch": 3.8253809642767926, + "grad_norm": 0.09426201161960111, + "learning_rate": 8.926552625371065e-06, + "loss": 0.4911, + "num_tokens": 7987617141.0, + "step": 1916 + }, + { + "epoch": 3.8273794654009494, + "grad_norm": 0.10293174485190552, + "learning_rate": 8.910534675053035e-06, + "loss": 0.476, + "num_tokens": 7991802281.0, + "step": 1917 + }, + { + "epoch": 3.829377966525106, + "grad_norm": 0.08563560304392488, + "learning_rate": 8.89453869311872e-06, + "loss": 0.477, + "num_tokens": 7995960446.0, + "step": 1918 + }, + { + "epoch": 3.831376467649263, + "grad_norm": 0.09284095303219703, + "learning_rate": 8.878564706414597e-06, + "loss": 0.4817, + "num_tokens": 8000128585.0, + "step": 1919 + }, + { + "epoch": 3.8333749687734198, + "grad_norm": 0.10189153467606044, + "learning_rate": 8.862612741750201e-06, + "loss": 0.4718, + "num_tokens": 8004300062.0, + "step": 1920 + }, + { + "epoch": 3.8353734698975765, + "grad_norm": 0.09524487787168208, + "learning_rate": 8.846682825898134e-06, + "loss": 0.4752, + "num_tokens": 8008456011.0, + "step": 1921 + }, + { + "epoch": 3.8373719710217338, + "grad_norm": 0.08534026076964447, + "learning_rate": 8.830774985593969e-06, + "loss": 0.4815, + "num_tokens": 8012639363.0, + "step": 1922 + }, + { + "epoch": 3.8393704721458906, + "grad_norm": 0.10546547609086629, + "learning_rate": 8.81488924753625e-06, + "loss": 0.4762, + "num_tokens": 8016812227.0, + "step": 1923 + }, + { + "epoch": 3.8413689732700473, + "grad_norm": 0.10221321791836291, + "learning_rate": 8.79902563838641e-06, + "loss": 0.4763, + "num_tokens": 8020974438.0, + "step": 1924 + }, + { + "epoch": 3.8433674743942046, + "grad_norm": 0.0849723600461828, + "learning_rate": 8.783184184768756e-06, + "loss": 0.4681, + "num_tokens": 8025124218.0, + "step": 1925 + }, + { + "epoch": 3.8453659755183613, + "grad_norm": 0.09008370678291194, + "learning_rate": 8.767364913270399e-06, + "loss": 0.4875, + "num_tokens": 8029306164.0, + "step": 1926 + }, + { + "epoch": 3.847364476642518, + "grad_norm": 0.09207492394715112, + "learning_rate": 8.751567850441219e-06, + "loss": 0.4721, + "num_tokens": 8033486205.0, + "step": 1927 + }, + { + "epoch": 3.849362977766675, + "grad_norm": 0.08869361037485754, + "learning_rate": 8.735793022793846e-06, + "loss": 0.479, + "num_tokens": 8037671475.0, + "step": 1928 + }, + { + "epoch": 3.8513614788908317, + "grad_norm": 0.08465920616688292, + "learning_rate": 8.72004045680356e-06, + "loss": 0.4823, + "num_tokens": 8041853123.0, + "step": 1929 + }, + { + "epoch": 3.853359980014989, + "grad_norm": 0.08476193625837164, + "learning_rate": 8.704310178908308e-06, + "loss": 0.4824, + "num_tokens": 8046036336.0, + "step": 1930 + }, + { + "epoch": 3.8553584811391457, + "grad_norm": 0.08692680673101093, + "learning_rate": 8.688602215508613e-06, + "loss": 0.4971, + "num_tokens": 8050223533.0, + "step": 1931 + }, + { + "epoch": 3.8573569822633025, + "grad_norm": 0.0911799121897557, + "learning_rate": 8.67291659296754e-06, + "loss": 0.479, + "num_tokens": 8054407780.0, + "step": 1932 + }, + { + "epoch": 3.8593554833874593, + "grad_norm": 0.08746572542506817, + "learning_rate": 8.657253337610677e-06, + "loss": 0.4602, + "num_tokens": 8058592419.0, + "step": 1933 + }, + { + "epoch": 3.8613539845116165, + "grad_norm": 0.08049049041500553, + "learning_rate": 8.641612475726075e-06, + "loss": 0.4684, + "num_tokens": 8062775649.0, + "step": 1934 + }, + { + "epoch": 3.8633524856357733, + "grad_norm": 0.10076315452711623, + "learning_rate": 8.625994033564182e-06, + "loss": 0.484, + "num_tokens": 8066934239.0, + "step": 1935 + }, + { + "epoch": 3.86535098675993, + "grad_norm": 0.08358592550618879, + "learning_rate": 8.610398037337823e-06, + "loss": 0.4716, + "num_tokens": 8071099334.0, + "step": 1936 + }, + { + "epoch": 3.867349487884087, + "grad_norm": 0.08924657398408478, + "learning_rate": 8.594824513222169e-06, + "loss": 0.5023, + "num_tokens": 8075283932.0, + "step": 1937 + }, + { + "epoch": 3.8693479890082436, + "grad_norm": 0.09472377398251605, + "learning_rate": 8.579273487354649e-06, + "loss": 0.4803, + "num_tokens": 8079467666.0, + "step": 1938 + }, + { + "epoch": 3.871346490132401, + "grad_norm": 0.09268851647650359, + "learning_rate": 8.563744985834962e-06, + "loss": 0.4703, + "num_tokens": 8083630852.0, + "step": 1939 + }, + { + "epoch": 3.8733449912565576, + "grad_norm": 0.08906900371421828, + "learning_rate": 8.548239034724979e-06, + "loss": 0.4843, + "num_tokens": 8087817488.0, + "step": 1940 + }, + { + "epoch": 3.8753434923807144, + "grad_norm": 0.14898381333354238, + "learning_rate": 8.532755660048731e-06, + "loss": 0.4833, + "num_tokens": 8092001561.0, + "step": 1941 + }, + { + "epoch": 3.877341993504871, + "grad_norm": 0.09406442832949885, + "learning_rate": 8.517294887792365e-06, + "loss": 0.4858, + "num_tokens": 8096161809.0, + "step": 1942 + }, + { + "epoch": 3.8793404946290284, + "grad_norm": 0.09219533912384972, + "learning_rate": 8.501856743904094e-06, + "loss": 0.4743, + "num_tokens": 8100345810.0, + "step": 1943 + }, + { + "epoch": 3.881338995753185, + "grad_norm": 0.08843140826173951, + "learning_rate": 8.486441254294144e-06, + "loss": 0.4771, + "num_tokens": 8104532650.0, + "step": 1944 + }, + { + "epoch": 3.883337496877342, + "grad_norm": 0.08664220116574353, + "learning_rate": 8.471048444834726e-06, + "loss": 0.4737, + "num_tokens": 8108718451.0, + "step": 1945 + }, + { + "epoch": 3.8853359980014988, + "grad_norm": 0.09375192498957986, + "learning_rate": 8.455678341359976e-06, + "loss": 0.4704, + "num_tokens": 8112898121.0, + "step": 1946 + }, + { + "epoch": 3.8873344991256555, + "grad_norm": 0.08662898204634341, + "learning_rate": 8.440330969665945e-06, + "loss": 0.4638, + "num_tokens": 8117082833.0, + "step": 1947 + }, + { + "epoch": 3.8893330002498128, + "grad_norm": 0.08685995894257448, + "learning_rate": 8.425006355510514e-06, + "loss": 0.4629, + "num_tokens": 8121253334.0, + "step": 1948 + }, + { + "epoch": 3.8913315013739695, + "grad_norm": 0.0914927447089017, + "learning_rate": 8.409704524613376e-06, + "loss": 0.4776, + "num_tokens": 8125437680.0, + "step": 1949 + }, + { + "epoch": 3.8933300024981263, + "grad_norm": 0.08705372051968746, + "learning_rate": 8.394425502655972e-06, + "loss": 0.4542, + "num_tokens": 8129608955.0, + "step": 1950 + }, + { + "epoch": 3.8953285036222836, + "grad_norm": 0.08257052642012766, + "learning_rate": 8.379169315281485e-06, + "loss": 0.4756, + "num_tokens": 8133796719.0, + "step": 1951 + }, + { + "epoch": 3.8973270047464403, + "grad_norm": 0.08913065971036556, + "learning_rate": 8.363935988094767e-06, + "loss": 0.4826, + "num_tokens": 8137980974.0, + "step": 1952 + }, + { + "epoch": 3.899325505870597, + "grad_norm": 0.0935670492826562, + "learning_rate": 8.348725546662298e-06, + "loss": 0.4725, + "num_tokens": 8142164781.0, + "step": 1953 + }, + { + "epoch": 3.901324006994754, + "grad_norm": 0.08402861634847582, + "learning_rate": 8.333538016512141e-06, + "loss": 0.4623, + "num_tokens": 8146333886.0, + "step": 1954 + }, + { + "epoch": 3.9033225081189107, + "grad_norm": 0.0901132963342818, + "learning_rate": 8.318373423133914e-06, + "loss": 0.4735, + "num_tokens": 8150520018.0, + "step": 1955 + }, + { + "epoch": 3.9053210092430675, + "grad_norm": 0.093419203907287, + "learning_rate": 8.30323179197876e-06, + "loss": 0.4774, + "num_tokens": 8154696107.0, + "step": 1956 + }, + { + "epoch": 3.9073195103672247, + "grad_norm": 0.08585709459914513, + "learning_rate": 8.288113148459253e-06, + "loss": 0.4637, + "num_tokens": 8158880534.0, + "step": 1957 + }, + { + "epoch": 3.9093180114913815, + "grad_norm": 0.09721624303123978, + "learning_rate": 8.2730175179494e-06, + "loss": 0.47, + "num_tokens": 8163067693.0, + "step": 1958 + }, + { + "epoch": 3.9113165126155383, + "grad_norm": 0.09559668687274804, + "learning_rate": 8.257944925784573e-06, + "loss": 0.4909, + "num_tokens": 8167252421.0, + "step": 1959 + }, + { + "epoch": 3.9133150137396955, + "grad_norm": 0.09352061469083638, + "learning_rate": 8.242895397261502e-06, + "loss": 0.4909, + "num_tokens": 8171436533.0, + "step": 1960 + }, + { + "epoch": 3.9153135148638523, + "grad_norm": 0.09189838718895571, + "learning_rate": 8.22786895763819e-06, + "loss": 0.4824, + "num_tokens": 8175617825.0, + "step": 1961 + }, + { + "epoch": 3.917312015988009, + "grad_norm": 0.09849201468825315, + "learning_rate": 8.212865632133895e-06, + "loss": 0.4763, + "num_tokens": 8179802644.0, + "step": 1962 + }, + { + "epoch": 3.919310517112166, + "grad_norm": 0.08489248296997962, + "learning_rate": 8.197885445929081e-06, + "loss": 0.4777, + "num_tokens": 8183988035.0, + "step": 1963 + }, + { + "epoch": 3.9213090182363226, + "grad_norm": 0.08882168488573183, + "learning_rate": 8.182928424165368e-06, + "loss": 0.472, + "num_tokens": 8188150561.0, + "step": 1964 + }, + { + "epoch": 3.9233075193604794, + "grad_norm": 0.08889052081722236, + "learning_rate": 8.167994591945523e-06, + "loss": 0.471, + "num_tokens": 8192336698.0, + "step": 1965 + }, + { + "epoch": 3.9253060204846366, + "grad_norm": 0.08555927741619743, + "learning_rate": 8.15308397433337e-06, + "loss": 0.4705, + "num_tokens": 8196494972.0, + "step": 1966 + }, + { + "epoch": 3.9273045216087934, + "grad_norm": 0.08186003853767657, + "learning_rate": 8.138196596353775e-06, + "loss": 0.4848, + "num_tokens": 8200681665.0, + "step": 1967 + }, + { + "epoch": 3.92930302273295, + "grad_norm": 0.0887755314930543, + "learning_rate": 8.12333248299261e-06, + "loss": 0.4926, + "num_tokens": 8204849779.0, + "step": 1968 + }, + { + "epoch": 3.9313015238571074, + "grad_norm": 0.08298171534037738, + "learning_rate": 8.10849165919669e-06, + "loss": 0.4686, + "num_tokens": 8208981809.0, + "step": 1969 + }, + { + "epoch": 3.933300024981264, + "grad_norm": 0.09479587906526042, + "learning_rate": 8.093674149873758e-06, + "loss": 0.4791, + "num_tokens": 8213155284.0, + "step": 1970 + }, + { + "epoch": 3.935298526105421, + "grad_norm": 0.08988148466647304, + "learning_rate": 8.078879979892412e-06, + "loss": 0.4847, + "num_tokens": 8217337994.0, + "step": 1971 + }, + { + "epoch": 3.9372970272295778, + "grad_norm": 0.08643070546106753, + "learning_rate": 8.064109174082083e-06, + "loss": 0.4755, + "num_tokens": 8221523792.0, + "step": 1972 + }, + { + "epoch": 3.9392955283537345, + "grad_norm": 0.08902090161092303, + "learning_rate": 8.049361757232997e-06, + "loss": 0.4808, + "num_tokens": 8225676404.0, + "step": 1973 + }, + { + "epoch": 3.9412940294778913, + "grad_norm": 0.08813043753763553, + "learning_rate": 8.034637754096113e-06, + "loss": 0.4804, + "num_tokens": 8229862907.0, + "step": 1974 + }, + { + "epoch": 3.9432925306020485, + "grad_norm": 0.08500408581649317, + "learning_rate": 8.01993718938311e-06, + "loss": 0.4809, + "num_tokens": 8234048253.0, + "step": 1975 + }, + { + "epoch": 3.9452910317262053, + "grad_norm": 0.09132204867580028, + "learning_rate": 8.005260087766318e-06, + "loss": 0.4687, + "num_tokens": 8238232399.0, + "step": 1976 + }, + { + "epoch": 3.947289532850362, + "grad_norm": 0.08071092295957874, + "learning_rate": 7.990606473878682e-06, + "loss": 0.48, + "num_tokens": 8242417235.0, + "step": 1977 + }, + { + "epoch": 3.9492880339745193, + "grad_norm": 0.08569039632283407, + "learning_rate": 7.975976372313748e-06, + "loss": 0.4868, + "num_tokens": 8246570162.0, + "step": 1978 + }, + { + "epoch": 3.951286535098676, + "grad_norm": 0.0975835212783791, + "learning_rate": 7.961369807625581e-06, + "loss": 0.4792, + "num_tokens": 8250733236.0, + "step": 1979 + }, + { + "epoch": 3.953285036222833, + "grad_norm": 0.08914380413390571, + "learning_rate": 7.946786804328761e-06, + "loss": 0.4724, + "num_tokens": 8254918925.0, + "step": 1980 + }, + { + "epoch": 3.9552835373469897, + "grad_norm": 0.09176582482064566, + "learning_rate": 7.932227386898301e-06, + "loss": 0.4877, + "num_tokens": 8259027533.0, + "step": 1981 + }, + { + "epoch": 3.9572820384711465, + "grad_norm": 0.08926175577395853, + "learning_rate": 7.917691579769657e-06, + "loss": 0.4664, + "num_tokens": 8263183088.0, + "step": 1982 + }, + { + "epoch": 3.9592805395953037, + "grad_norm": 0.08923785395986454, + "learning_rate": 7.903179407338634e-06, + "loss": 0.471, + "num_tokens": 8267369446.0, + "step": 1983 + }, + { + "epoch": 3.9612790407194605, + "grad_norm": 0.1045459389088874, + "learning_rate": 7.888690893961389e-06, + "loss": 0.4747, + "num_tokens": 8271552680.0, + "step": 1984 + }, + { + "epoch": 3.9632775418436172, + "grad_norm": 0.08284016847518944, + "learning_rate": 7.87422606395436e-06, + "loss": 0.4645, + "num_tokens": 8275688434.0, + "step": 1985 + }, + { + "epoch": 3.965276042967774, + "grad_norm": 0.08347655778116733, + "learning_rate": 7.859784941594237e-06, + "loss": 0.4778, + "num_tokens": 8279875951.0, + "step": 1986 + }, + { + "epoch": 3.9672745440919313, + "grad_norm": 0.09947949832105578, + "learning_rate": 7.845367551117934e-06, + "loss": 0.4771, + "num_tokens": 8284057296.0, + "step": 1987 + }, + { + "epoch": 3.969273045216088, + "grad_norm": 0.09026790434082681, + "learning_rate": 7.830973916722513e-06, + "loss": 0.4713, + "num_tokens": 8288244226.0, + "step": 1988 + }, + { + "epoch": 3.971271546340245, + "grad_norm": 0.08233517814523184, + "learning_rate": 7.816604062565189e-06, + "loss": 0.4684, + "num_tokens": 8292426906.0, + "step": 1989 + }, + { + "epoch": 3.9732700474644016, + "grad_norm": 0.091106634121088, + "learning_rate": 7.802258012763247e-06, + "loss": 0.4538, + "num_tokens": 8296577103.0, + "step": 1990 + }, + { + "epoch": 3.9752685485885584, + "grad_norm": 0.08607234867961372, + "learning_rate": 7.787935791394034e-06, + "loss": 0.4726, + "num_tokens": 8300719966.0, + "step": 1991 + }, + { + "epoch": 3.9772670497127156, + "grad_norm": 0.08358509354297748, + "learning_rate": 7.773637422494902e-06, + "loss": 0.4869, + "num_tokens": 8304904105.0, + "step": 1992 + }, + { + "epoch": 3.9792655508368724, + "grad_norm": 0.0764291514820361, + "learning_rate": 7.759362930063155e-06, + "loss": 0.4754, + "num_tokens": 8309091563.0, + "step": 1993 + }, + { + "epoch": 3.981264051961029, + "grad_norm": 0.07888997666451532, + "learning_rate": 7.745112338056054e-06, + "loss": 0.4715, + "num_tokens": 8313276764.0, + "step": 1994 + }, + { + "epoch": 3.983262553085186, + "grad_norm": 0.09571113815020472, + "learning_rate": 7.73088567039072e-06, + "loss": 0.489, + "num_tokens": 8317429420.0, + "step": 1995 + }, + { + "epoch": 3.985261054209343, + "grad_norm": 0.08082804057593175, + "learning_rate": 7.716682950944141e-06, + "loss": 0.4661, + "num_tokens": 8321616362.0, + "step": 1996 + }, + { + "epoch": 3.9872595553335, + "grad_norm": 0.08310728928640784, + "learning_rate": 7.702504203553092e-06, + "loss": 0.4747, + "num_tokens": 8325803190.0, + "step": 1997 + }, + { + "epoch": 3.9892580564576567, + "grad_norm": 0.0914993561264746, + "learning_rate": 7.688349452014141e-06, + "loss": 0.4711, + "num_tokens": 8329987339.0, + "step": 1998 + }, + { + "epoch": 3.9912565575818135, + "grad_norm": 0.08382100499629373, + "learning_rate": 7.674218720083557e-06, + "loss": 0.4676, + "num_tokens": 8334158541.0, + "step": 1999 + }, + { + "epoch": 3.9932550587059703, + "grad_norm": 0.08247137497415682, + "learning_rate": 7.660112031477314e-06, + "loss": 0.4815, + "num_tokens": 8338330443.0, + "step": 2000 + }, + { + "epoch": 3.9952535598301275, + "grad_norm": 0.08531226191316629, + "learning_rate": 7.646029409871029e-06, + "loss": 0.467, + "num_tokens": 8342515116.0, + "step": 2001 + }, + { + "epoch": 3.9972520609542843, + "grad_norm": 0.08853533850293883, + "learning_rate": 7.631970878899916e-06, + "loss": 0.4677, + "num_tokens": 8346685356.0, + "step": 2002 + }, + { + "epoch": 3.999250562078441, + "grad_norm": 0.08399152309434592, + "learning_rate": 7.6179364621587796e-06, + "loss": 0.4721, + "num_tokens": 8350869258.0, + "step": 2003 + }, + { + "epoch": 4.0, + "grad_norm": 0.08399152309434592, + "learning_rate": 7.603926183201928e-06, + "loss": 0.4748, + "num_tokens": 8352439440.0, + "step": 2004 + }, + { + "epoch": 4.001998501124157, + "grad_norm": 0.1678408669324915, + "learning_rate": 7.589940065543178e-06, + "loss": 0.4472, + "num_tokens": 8356625671.0, + "step": 2005 + }, + { + "epoch": 4.003997002248314, + "grad_norm": 0.10953711969479378, + "learning_rate": 7.575978132655784e-06, + "loss": 0.4813, + "num_tokens": 8360788046.0, + "step": 2006 + }, + { + "epoch": 4.00599550337247, + "grad_norm": 0.10873681978693991, + "learning_rate": 7.562040407972411e-06, + "loss": 0.4805, + "num_tokens": 8364974397.0, + "step": 2007 + }, + { + "epoch": 4.007994004496627, + "grad_norm": 0.09342664613502027, + "learning_rate": 7.548126914885101e-06, + "loss": 0.452, + "num_tokens": 8369159868.0, + "step": 2008 + }, + { + "epoch": 4.009992505620785, + "grad_norm": 0.09761665785245682, + "learning_rate": 7.534237676745231e-06, + "loss": 0.4637, + "num_tokens": 8373316720.0, + "step": 2009 + }, + { + "epoch": 4.011991006744942, + "grad_norm": 0.10092643558910115, + "learning_rate": 7.5203727168634575e-06, + "loss": 0.463, + "num_tokens": 8377499677.0, + "step": 2010 + }, + { + "epoch": 4.013989507869098, + "grad_norm": 0.10984225728293848, + "learning_rate": 7.506532058509692e-06, + "loss": 0.4628, + "num_tokens": 8381659891.0, + "step": 2011 + }, + { + "epoch": 4.015988008993255, + "grad_norm": 0.11025697517359895, + "learning_rate": 7.492715724913075e-06, + "loss": 0.4483, + "num_tokens": 8385844237.0, + "step": 2012 + }, + { + "epoch": 4.017986510117412, + "grad_norm": 0.09733798276027214, + "learning_rate": 7.478923739261903e-06, + "loss": 0.4664, + "num_tokens": 8390014167.0, + "step": 2013 + }, + { + "epoch": 4.019985011241569, + "grad_norm": 0.09475323723190539, + "learning_rate": 7.465156124703624e-06, + "loss": 0.4456, + "num_tokens": 8394198460.0, + "step": 2014 + }, + { + "epoch": 4.0219835123657255, + "grad_norm": 0.09805433756513812, + "learning_rate": 7.451412904344774e-06, + "loss": 0.4721, + "num_tokens": 8398355076.0, + "step": 2015 + }, + { + "epoch": 4.023982013489882, + "grad_norm": 0.09883799416893838, + "learning_rate": 7.437694101250949e-06, + "loss": 0.4568, + "num_tokens": 8402541752.0, + "step": 2016 + }, + { + "epoch": 4.025980514614039, + "grad_norm": 0.09009783358431889, + "learning_rate": 7.423999738446765e-06, + "loss": 0.449, + "num_tokens": 8406725281.0, + "step": 2017 + }, + { + "epoch": 4.027979015738197, + "grad_norm": 0.1042443064815284, + "learning_rate": 7.410329838915828e-06, + "loss": 0.4709, + "num_tokens": 8410909498.0, + "step": 2018 + }, + { + "epoch": 4.0299775168623535, + "grad_norm": 0.09855138754117264, + "learning_rate": 7.396684425600677e-06, + "loss": 0.4466, + "num_tokens": 8415064369.0, + "step": 2019 + }, + { + "epoch": 4.03197601798651, + "grad_norm": 0.08875118293449565, + "learning_rate": 7.383063521402754e-06, + "loss": 0.4549, + "num_tokens": 8419244758.0, + "step": 2020 + }, + { + "epoch": 4.033974519110667, + "grad_norm": 0.09205630187326966, + "learning_rate": 7.369467149182366e-06, + "loss": 0.4518, + "num_tokens": 8423413344.0, + "step": 2021 + }, + { + "epoch": 4.035973020234824, + "grad_norm": 0.09082737861540806, + "learning_rate": 7.3558953317586555e-06, + "loss": 0.4578, + "num_tokens": 8427592338.0, + "step": 2022 + }, + { + "epoch": 4.037971521358981, + "grad_norm": 0.09301739148918264, + "learning_rate": 7.342348091909559e-06, + "loss": 0.4715, + "num_tokens": 8431776308.0, + "step": 2023 + }, + { + "epoch": 4.039970022483137, + "grad_norm": 0.09672109654392977, + "learning_rate": 7.328825452371744e-06, + "loss": 0.4558, + "num_tokens": 8435934992.0, + "step": 2024 + }, + { + "epoch": 4.041968523607294, + "grad_norm": 0.09945300980180044, + "learning_rate": 7.315327435840602e-06, + "loss": 0.4535, + "num_tokens": 8440119162.0, + "step": 2025 + }, + { + "epoch": 4.043967024731451, + "grad_norm": 0.09581360937514857, + "learning_rate": 7.301854064970202e-06, + "loss": 0.4633, + "num_tokens": 8444291327.0, + "step": 2026 + }, + { + "epoch": 4.045965525855609, + "grad_norm": 0.08917515736060927, + "learning_rate": 7.2884053623732494e-06, + "loss": 0.4789, + "num_tokens": 8448474135.0, + "step": 2027 + }, + { + "epoch": 4.047964026979765, + "grad_norm": 0.09211325889261932, + "learning_rate": 7.274981350621042e-06, + "loss": 0.4675, + "num_tokens": 8452644947.0, + "step": 2028 + }, + { + "epoch": 4.049962528103922, + "grad_norm": 0.09837663046932305, + "learning_rate": 7.261582052243441e-06, + "loss": 0.4483, + "num_tokens": 8456830507.0, + "step": 2029 + }, + { + "epoch": 4.051961029228079, + "grad_norm": 0.08541936297352741, + "learning_rate": 7.248207489728824e-06, + "loss": 0.4736, + "num_tokens": 8460984691.0, + "step": 2030 + }, + { + "epoch": 4.053959530352236, + "grad_norm": 0.09352515958881838, + "learning_rate": 7.2348576855240674e-06, + "loss": 0.4433, + "num_tokens": 8465166195.0, + "step": 2031 + }, + { + "epoch": 4.0559580314763926, + "grad_norm": 0.09150446726243736, + "learning_rate": 7.221532662034494e-06, + "loss": 0.453, + "num_tokens": 8469326727.0, + "step": 2032 + }, + { + "epoch": 4.057956532600549, + "grad_norm": 0.08923508105310497, + "learning_rate": 7.208232441623822e-06, + "loss": 0.4718, + "num_tokens": 8473512296.0, + "step": 2033 + }, + { + "epoch": 4.059955033724706, + "grad_norm": 0.10149473981241972, + "learning_rate": 7.1949570466141505e-06, + "loss": 0.473, + "num_tokens": 8477697352.0, + "step": 2034 + }, + { + "epoch": 4.061953534848864, + "grad_norm": 0.0886436761930713, + "learning_rate": 7.181706499285916e-06, + "loss": 0.447, + "num_tokens": 8481859767.0, + "step": 2035 + }, + { + "epoch": 4.063952035973021, + "grad_norm": 0.10068017011074913, + "learning_rate": 7.168480821877857e-06, + "loss": 0.4596, + "num_tokens": 8486042377.0, + "step": 2036 + }, + { + "epoch": 4.065950537097177, + "grad_norm": 0.09327774218435521, + "learning_rate": 7.15528003658696e-06, + "loss": 0.4488, + "num_tokens": 8490229176.0, + "step": 2037 + }, + { + "epoch": 4.067949038221334, + "grad_norm": 0.10485660286390354, + "learning_rate": 7.142104165568436e-06, + "loss": 0.4587, + "num_tokens": 8494404264.0, + "step": 2038 + }, + { + "epoch": 4.069947539345491, + "grad_norm": 0.09306698701197264, + "learning_rate": 7.128953230935687e-06, + "loss": 0.4452, + "num_tokens": 8498577376.0, + "step": 2039 + }, + { + "epoch": 4.071946040469648, + "grad_norm": 0.08619649777429153, + "learning_rate": 7.115827254760263e-06, + "loss": 0.4528, + "num_tokens": 8502709845.0, + "step": 2040 + }, + { + "epoch": 4.0739445415938045, + "grad_norm": 0.09174851979176973, + "learning_rate": 7.10272625907183e-06, + "loss": 0.4611, + "num_tokens": 8506874869.0, + "step": 2041 + }, + { + "epoch": 4.075943042717961, + "grad_norm": 0.08444440652044159, + "learning_rate": 7.089650265858121e-06, + "loss": 0.4576, + "num_tokens": 8511059655.0, + "step": 2042 + }, + { + "epoch": 4.077941543842118, + "grad_norm": 0.08745029167415286, + "learning_rate": 7.076599297064902e-06, + "loss": 0.4604, + "num_tokens": 8515237281.0, + "step": 2043 + }, + { + "epoch": 4.079940044966276, + "grad_norm": 0.0882329475452593, + "learning_rate": 7.063573374595951e-06, + "loss": 0.4591, + "num_tokens": 8519421393.0, + "step": 2044 + }, + { + "epoch": 4.0819385460904325, + "grad_norm": 0.09220093305017184, + "learning_rate": 7.050572520313015e-06, + "loss": 0.4706, + "num_tokens": 8523575966.0, + "step": 2045 + }, + { + "epoch": 4.083937047214589, + "grad_norm": 0.08528096765296074, + "learning_rate": 7.037596756035751e-06, + "loss": 0.4512, + "num_tokens": 8527719345.0, + "step": 2046 + }, + { + "epoch": 4.085935548338746, + "grad_norm": 0.08476602870392555, + "learning_rate": 7.024646103541715e-06, + "loss": 0.4465, + "num_tokens": 8531880446.0, + "step": 2047 + }, + { + "epoch": 4.087934049462903, + "grad_norm": 0.09257822968525853, + "learning_rate": 7.011720584566317e-06, + "loss": 0.4416, + "num_tokens": 8536046477.0, + "step": 2048 + }, + { + "epoch": 4.08993255058706, + "grad_norm": 0.09370407607907506, + "learning_rate": 6.998820220802786e-06, + "loss": 0.4628, + "num_tokens": 8540216685.0, + "step": 2049 + }, + { + "epoch": 4.091931051711216, + "grad_norm": 0.08322413110583164, + "learning_rate": 6.985945033902137e-06, + "loss": 0.461, + "num_tokens": 8544399011.0, + "step": 2050 + }, + { + "epoch": 4.093929552835373, + "grad_norm": 0.09144557251868184, + "learning_rate": 6.973095045473124e-06, + "loss": 0.4581, + "num_tokens": 8548585066.0, + "step": 2051 + }, + { + "epoch": 4.09592805395953, + "grad_norm": 0.09437383340244458, + "learning_rate": 6.960270277082199e-06, + "loss": 0.4651, + "num_tokens": 8552743534.0, + "step": 2052 + }, + { + "epoch": 4.097926555083688, + "grad_norm": 0.08785660995430017, + "learning_rate": 6.9474707502535135e-06, + "loss": 0.4624, + "num_tokens": 8556927887.0, + "step": 2053 + }, + { + "epoch": 4.099925056207844, + "grad_norm": 0.08433361846203925, + "learning_rate": 6.9346964864688285e-06, + "loss": 0.4568, + "num_tokens": 8561112344.0, + "step": 2054 + }, + { + "epoch": 4.101923557332001, + "grad_norm": 0.08542812953969876, + "learning_rate": 6.9219475071675256e-06, + "loss": 0.4668, + "num_tokens": 8565300336.0, + "step": 2055 + }, + { + "epoch": 4.103922058456158, + "grad_norm": 0.09221776487644438, + "learning_rate": 6.909223833746537e-06, + "loss": 0.4627, + "num_tokens": 8569483700.0, + "step": 2056 + }, + { + "epoch": 4.105920559580315, + "grad_norm": 0.09185179653174277, + "learning_rate": 6.896525487560328e-06, + "loss": 0.4722, + "num_tokens": 8573654100.0, + "step": 2057 + }, + { + "epoch": 4.1079190607044715, + "grad_norm": 0.08136909449838411, + "learning_rate": 6.883852489920857e-06, + "loss": 0.4678, + "num_tokens": 8577811367.0, + "step": 2058 + }, + { + "epoch": 4.109917561828628, + "grad_norm": 0.085618363973538, + "learning_rate": 6.871204862097545e-06, + "loss": 0.4659, + "num_tokens": 8581972683.0, + "step": 2059 + }, + { + "epoch": 4.111916062952785, + "grad_norm": 0.08896101601730436, + "learning_rate": 6.858582625317227e-06, + "loss": 0.4644, + "num_tokens": 8586158643.0, + "step": 2060 + }, + { + "epoch": 4.113914564076942, + "grad_norm": 0.09084284610732078, + "learning_rate": 6.8459858007641155e-06, + "loss": 0.4591, + "num_tokens": 8590321567.0, + "step": 2061 + }, + { + "epoch": 4.1159130652011, + "grad_norm": 0.08953743509755453, + "learning_rate": 6.833414409579795e-06, + "loss": 0.4671, + "num_tokens": 8594498194.0, + "step": 2062 + }, + { + "epoch": 4.117911566325256, + "grad_norm": 0.08455150119150497, + "learning_rate": 6.8208684728631445e-06, + "loss": 0.4598, + "num_tokens": 8598671113.0, + "step": 2063 + }, + { + "epoch": 4.119910067449413, + "grad_norm": 0.08951969521344197, + "learning_rate": 6.8083480116703346e-06, + "loss": 0.4724, + "num_tokens": 8602826131.0, + "step": 2064 + }, + { + "epoch": 4.12190856857357, + "grad_norm": 0.08479377749063113, + "learning_rate": 6.795853047014774e-06, + "loss": 0.4645, + "num_tokens": 8606987244.0, + "step": 2065 + }, + { + "epoch": 4.123907069697727, + "grad_norm": 0.0920317346263397, + "learning_rate": 6.783383599867075e-06, + "loss": 0.4532, + "num_tokens": 8611173287.0, + "step": 2066 + }, + { + "epoch": 4.1259055708218835, + "grad_norm": 0.07727074882504806, + "learning_rate": 6.770939691155037e-06, + "loss": 0.4682, + "num_tokens": 8615357248.0, + "step": 2067 + }, + { + "epoch": 4.12790407194604, + "grad_norm": 0.0830649797164594, + "learning_rate": 6.758521341763583e-06, + "loss": 0.4714, + "num_tokens": 8619540546.0, + "step": 2068 + }, + { + "epoch": 4.129902573070197, + "grad_norm": 0.093567062365839, + "learning_rate": 6.746128572534752e-06, + "loss": 0.4546, + "num_tokens": 8623695532.0, + "step": 2069 + }, + { + "epoch": 4.131901074194354, + "grad_norm": 0.08416658427086349, + "learning_rate": 6.733761404267637e-06, + "loss": 0.4628, + "num_tokens": 8627882439.0, + "step": 2070 + }, + { + "epoch": 4.1338995753185115, + "grad_norm": 0.08562478694037359, + "learning_rate": 6.7214198577183806e-06, + "loss": 0.4593, + "num_tokens": 8632044370.0, + "step": 2071 + }, + { + "epoch": 4.135898076442668, + "grad_norm": 0.0860275380890234, + "learning_rate": 6.70910395360011e-06, + "loss": 0.4762, + "num_tokens": 8636225079.0, + "step": 2072 + }, + { + "epoch": 4.137896577566825, + "grad_norm": 0.0870908002559957, + "learning_rate": 6.696813712582926e-06, + "loss": 0.4499, + "num_tokens": 8640356436.0, + "step": 2073 + }, + { + "epoch": 4.139895078690982, + "grad_norm": 0.0829619456604381, + "learning_rate": 6.684549155293849e-06, + "loss": 0.4536, + "num_tokens": 8644518736.0, + "step": 2074 + }, + { + "epoch": 4.141893579815139, + "grad_norm": 0.08287111657457748, + "learning_rate": 6.672310302316798e-06, + "loss": 0.4631, + "num_tokens": 8648654950.0, + "step": 2075 + }, + { + "epoch": 4.143892080939295, + "grad_norm": 0.0864007142819277, + "learning_rate": 6.660097174192556e-06, + "loss": 0.4538, + "num_tokens": 8652819348.0, + "step": 2076 + }, + { + "epoch": 4.145890582063452, + "grad_norm": 0.0854759649887541, + "learning_rate": 6.647909791418723e-06, + "loss": 0.4716, + "num_tokens": 8656980818.0, + "step": 2077 + }, + { + "epoch": 4.147889083187609, + "grad_norm": 0.09444077696605212, + "learning_rate": 6.635748174449703e-06, + "loss": 0.4409, + "num_tokens": 8661142833.0, + "step": 2078 + }, + { + "epoch": 4.149887584311767, + "grad_norm": 0.08824157865729854, + "learning_rate": 6.62361234369664e-06, + "loss": 0.4696, + "num_tokens": 8665278802.0, + "step": 2079 + }, + { + "epoch": 4.151886085435923, + "grad_norm": 0.08055835438539266, + "learning_rate": 6.611502319527412e-06, + "loss": 0.4674, + "num_tokens": 8669455808.0, + "step": 2080 + }, + { + "epoch": 4.15388458656008, + "grad_norm": 0.0791140166288833, + "learning_rate": 6.599418122266581e-06, + "loss": 0.4592, + "num_tokens": 8673640799.0, + "step": 2081 + }, + { + "epoch": 4.155883087684237, + "grad_norm": 0.09378787786645718, + "learning_rate": 6.5873597721953605e-06, + "loss": 0.4593, + "num_tokens": 8677825720.0, + "step": 2082 + }, + { + "epoch": 4.157881588808394, + "grad_norm": 0.08034318676851503, + "learning_rate": 6.575327289551592e-06, + "loss": 0.4581, + "num_tokens": 8682009702.0, + "step": 2083 + }, + { + "epoch": 4.1598800899325505, + "grad_norm": 0.08132135107151746, + "learning_rate": 6.563320694529699e-06, + "loss": 0.4582, + "num_tokens": 8686194126.0, + "step": 2084 + }, + { + "epoch": 4.161878591056707, + "grad_norm": 0.08252612837955697, + "learning_rate": 6.551340007280655e-06, + "loss": 0.4599, + "num_tokens": 8690379969.0, + "step": 2085 + }, + { + "epoch": 4.163877092180864, + "grad_norm": 0.08461856188469075, + "learning_rate": 6.5393852479119444e-06, + "loss": 0.4508, + "num_tokens": 8694537333.0, + "step": 2086 + }, + { + "epoch": 4.165875593305021, + "grad_norm": 0.081295445470372, + "learning_rate": 6.52745643648756e-06, + "loss": 0.4486, + "num_tokens": 8698690681.0, + "step": 2087 + }, + { + "epoch": 4.1678740944291786, + "grad_norm": 0.07836494262270052, + "learning_rate": 6.515553593027919e-06, + "loss": 0.4636, + "num_tokens": 8702875863.0, + "step": 2088 + }, + { + "epoch": 4.169872595553335, + "grad_norm": 0.08524500046844545, + "learning_rate": 6.503676737509874e-06, + "loss": 0.4554, + "num_tokens": 8707061698.0, + "step": 2089 + }, + { + "epoch": 4.171871096677492, + "grad_norm": 0.08385527993136961, + "learning_rate": 6.491825889866653e-06, + "loss": 0.4579, + "num_tokens": 8711246562.0, + "step": 2090 + }, + { + "epoch": 4.173869597801649, + "grad_norm": 0.08901344054681935, + "learning_rate": 6.4800010699878295e-06, + "loss": 0.4619, + "num_tokens": 8715394195.0, + "step": 2091 + }, + { + "epoch": 4.175868098925806, + "grad_norm": 0.07850850138159676, + "learning_rate": 6.468202297719308e-06, + "loss": 0.4664, + "num_tokens": 8719576489.0, + "step": 2092 + }, + { + "epoch": 4.1778666000499625, + "grad_norm": 0.08683449846827379, + "learning_rate": 6.45642959286327e-06, + "loss": 0.4602, + "num_tokens": 8723764006.0, + "step": 2093 + }, + { + "epoch": 4.179865101174119, + "grad_norm": 0.09559131378350413, + "learning_rate": 6.444682975178143e-06, + "loss": 0.4781, + "num_tokens": 8727933784.0, + "step": 2094 + }, + { + "epoch": 4.181863602298276, + "grad_norm": 0.08330537716757586, + "learning_rate": 6.432962464378578e-06, + "loss": 0.454, + "num_tokens": 8732078018.0, + "step": 2095 + }, + { + "epoch": 4.183862103422433, + "grad_norm": 0.09746456887834151, + "learning_rate": 6.421268080135401e-06, + "loss": 0.4728, + "num_tokens": 8736263446.0, + "step": 2096 + }, + { + "epoch": 4.1858606045465905, + "grad_norm": 0.08863015305947229, + "learning_rate": 6.4095998420756e-06, + "loss": 0.4553, + "num_tokens": 8740450045.0, + "step": 2097 + }, + { + "epoch": 4.187859105670747, + "grad_norm": 0.0843777728684334, + "learning_rate": 6.397957769782281e-06, + "loss": 0.4717, + "num_tokens": 8744627018.0, + "step": 2098 + }, + { + "epoch": 4.189857606794904, + "grad_norm": 0.085180659280093, + "learning_rate": 6.386341882794625e-06, + "loss": 0.4603, + "num_tokens": 8748814614.0, + "step": 2099 + }, + { + "epoch": 4.191856107919061, + "grad_norm": 0.07637136224448578, + "learning_rate": 6.374752200607871e-06, + "loss": 0.4464, + "num_tokens": 8752973023.0, + "step": 2100 + }, + { + "epoch": 4.193854609043218, + "grad_norm": 0.10045509958720439, + "learning_rate": 6.363188742673281e-06, + "loss": 0.4562, + "num_tokens": 8757156624.0, + "step": 2101 + }, + { + "epoch": 4.195853110167374, + "grad_norm": 0.0865311471155953, + "learning_rate": 6.351651528398104e-06, + "loss": 0.4756, + "num_tokens": 8761343591.0, + "step": 2102 + }, + { + "epoch": 4.197851611291531, + "grad_norm": 0.0872674439566041, + "learning_rate": 6.3401405771455395e-06, + "loss": 0.4434, + "num_tokens": 8765528586.0, + "step": 2103 + }, + { + "epoch": 4.199850112415688, + "grad_norm": 0.08609696405242658, + "learning_rate": 6.3286559082347075e-06, + "loss": 0.4608, + "num_tokens": 8769675939.0, + "step": 2104 + }, + { + "epoch": 4.201848613539845, + "grad_norm": 0.08141944512350337, + "learning_rate": 6.317197540940622e-06, + "loss": 0.4695, + "num_tokens": 8773827165.0, + "step": 2105 + }, + { + "epoch": 4.203847114664002, + "grad_norm": 0.08662841519395317, + "learning_rate": 6.305765494494154e-06, + "loss": 0.4558, + "num_tokens": 8778012285.0, + "step": 2106 + }, + { + "epoch": 4.205845615788159, + "grad_norm": 0.08940927123866879, + "learning_rate": 6.294359788082003e-06, + "loss": 0.4659, + "num_tokens": 8782180431.0, + "step": 2107 + }, + { + "epoch": 4.207844116912316, + "grad_norm": 0.09592043269661273, + "learning_rate": 6.282980440846652e-06, + "loss": 0.4668, + "num_tokens": 8786365238.0, + "step": 2108 + }, + { + "epoch": 4.209842618036473, + "grad_norm": 0.08991409581153968, + "learning_rate": 6.271627471886348e-06, + "loss": 0.4625, + "num_tokens": 8790526254.0, + "step": 2109 + }, + { + "epoch": 4.2118411191606295, + "grad_norm": 0.08356732174527103, + "learning_rate": 6.260300900255073e-06, + "loss": 0.4697, + "num_tokens": 8794652240.0, + "step": 2110 + }, + { + "epoch": 4.213839620284786, + "grad_norm": 0.08060352924735409, + "learning_rate": 6.249000744962504e-06, + "loss": 0.4648, + "num_tokens": 8798836924.0, + "step": 2111 + }, + { + "epoch": 4.215838121408943, + "grad_norm": 0.09032599058149125, + "learning_rate": 6.2377270249739754e-06, + "loss": 0.4569, + "num_tokens": 8803018469.0, + "step": 2112 + }, + { + "epoch": 4.2178366225331, + "grad_norm": 0.08838443042255853, + "learning_rate": 6.22647975921046e-06, + "loss": 0.4558, + "num_tokens": 8807202392.0, + "step": 2113 + }, + { + "epoch": 4.219835123657257, + "grad_norm": 0.0804753192152578, + "learning_rate": 6.215258966548527e-06, + "loss": 0.4515, + "num_tokens": 8811385412.0, + "step": 2114 + }, + { + "epoch": 4.221833624781414, + "grad_norm": 0.08430652135055682, + "learning_rate": 6.204064665820322e-06, + "loss": 0.4622, + "num_tokens": 8815570127.0, + "step": 2115 + }, + { + "epoch": 4.223832125905571, + "grad_norm": 0.084072130599991, + "learning_rate": 6.192896875813533e-06, + "loss": 0.4683, + "num_tokens": 8819754777.0, + "step": 2116 + }, + { + "epoch": 4.225830627029728, + "grad_norm": 0.08737113628543006, + "learning_rate": 6.181755615271338e-06, + "loss": 0.4612, + "num_tokens": 8823941672.0, + "step": 2117 + }, + { + "epoch": 4.227829128153885, + "grad_norm": 0.08196141095331673, + "learning_rate": 6.170640902892399e-06, + "loss": 0.4471, + "num_tokens": 8828075481.0, + "step": 2118 + }, + { + "epoch": 4.2298276292780415, + "grad_norm": 0.07998525151380728, + "learning_rate": 6.159552757330823e-06, + "loss": 0.4644, + "num_tokens": 8832262124.0, + "step": 2119 + }, + { + "epoch": 4.231826130402198, + "grad_norm": 0.07622210554524682, + "learning_rate": 6.148491197196134e-06, + "loss": 0.4573, + "num_tokens": 8836434940.0, + "step": 2120 + }, + { + "epoch": 4.233824631526355, + "grad_norm": 0.08034838102428896, + "learning_rate": 6.137456241053226e-06, + "loss": 0.4694, + "num_tokens": 8840622494.0, + "step": 2121 + }, + { + "epoch": 4.235823132650512, + "grad_norm": 0.09371165479188322, + "learning_rate": 6.126447907422344e-06, + "loss": 0.461, + "num_tokens": 8844808247.0, + "step": 2122 + }, + { + "epoch": 4.237821633774669, + "grad_norm": 0.08445771885744033, + "learning_rate": 6.115466214779056e-06, + "loss": 0.4648, + "num_tokens": 8848992626.0, + "step": 2123 + }, + { + "epoch": 4.239820134898826, + "grad_norm": 0.07909806750048777, + "learning_rate": 6.104511181554218e-06, + "loss": 0.4664, + "num_tokens": 8853148601.0, + "step": 2124 + }, + { + "epoch": 4.241818636022983, + "grad_norm": 0.08381900072648062, + "learning_rate": 6.093582826133948e-06, + "loss": 0.4651, + "num_tokens": 8857330271.0, + "step": 2125 + }, + { + "epoch": 4.24381713714714, + "grad_norm": 0.08367752938258317, + "learning_rate": 6.082681166859579e-06, + "loss": 0.461, + "num_tokens": 8861485292.0, + "step": 2126 + }, + { + "epoch": 4.245815638271297, + "grad_norm": 0.08770270521994358, + "learning_rate": 6.071806222027638e-06, + "loss": 0.4657, + "num_tokens": 8865643877.0, + "step": 2127 + }, + { + "epoch": 4.247814139395453, + "grad_norm": 0.08054227862760213, + "learning_rate": 6.060958009889835e-06, + "loss": 0.4626, + "num_tokens": 8869824721.0, + "step": 2128 + }, + { + "epoch": 4.24981264051961, + "grad_norm": 0.07976173790304993, + "learning_rate": 6.050136548652989e-06, + "loss": 0.4676, + "num_tokens": 8873978373.0, + "step": 2129 + }, + { + "epoch": 4.251811141643767, + "grad_norm": 0.07937137884369226, + "learning_rate": 6.039341856479044e-06, + "loss": 0.4536, + "num_tokens": 8878164614.0, + "step": 2130 + }, + { + "epoch": 4.253809642767924, + "grad_norm": 0.08797756991718589, + "learning_rate": 6.028573951485004e-06, + "loss": 0.467, + "num_tokens": 8882342211.0, + "step": 2131 + }, + { + "epoch": 4.2558081438920805, + "grad_norm": 0.08464747869982109, + "learning_rate": 6.017832851742914e-06, + "loss": 0.4682, + "num_tokens": 8886510128.0, + "step": 2132 + }, + { + "epoch": 4.257806645016238, + "grad_norm": 0.08430006911862846, + "learning_rate": 6.00711857527984e-06, + "loss": 0.4552, + "num_tokens": 8890688056.0, + "step": 2133 + }, + { + "epoch": 4.259805146140395, + "grad_norm": 0.09038133659639613, + "learning_rate": 5.996431140077831e-06, + "loss": 0.4757, + "num_tokens": 8894861031.0, + "step": 2134 + }, + { + "epoch": 4.261803647264552, + "grad_norm": 0.08379629508702194, + "learning_rate": 5.985770564073874e-06, + "loss": 0.4582, + "num_tokens": 8899042382.0, + "step": 2135 + }, + { + "epoch": 4.2638021483887085, + "grad_norm": 0.0896499320206108, + "learning_rate": 5.975136865159888e-06, + "loss": 0.4684, + "num_tokens": 8903187102.0, + "step": 2136 + }, + { + "epoch": 4.265800649512865, + "grad_norm": 0.09754283112480498, + "learning_rate": 5.964530061182684e-06, + "loss": 0.4751, + "num_tokens": 8907370281.0, + "step": 2137 + }, + { + "epoch": 4.267799150637022, + "grad_norm": 0.08604273023349358, + "learning_rate": 5.953950169943928e-06, + "loss": 0.4646, + "num_tokens": 8911496235.0, + "step": 2138 + }, + { + "epoch": 4.269797651761179, + "grad_norm": 0.08877353874468774, + "learning_rate": 5.943397209200126e-06, + "loss": 0.4498, + "num_tokens": 8915681040.0, + "step": 2139 + }, + { + "epoch": 4.271796152885336, + "grad_norm": 0.08240752484480561, + "learning_rate": 5.9328711966625785e-06, + "loss": 0.4737, + "num_tokens": 8919864050.0, + "step": 2140 + }, + { + "epoch": 4.273794654009492, + "grad_norm": 0.09238629216118052, + "learning_rate": 5.9223721499973575e-06, + "loss": 0.456, + "num_tokens": 8924048779.0, + "step": 2141 + }, + { + "epoch": 4.27579315513365, + "grad_norm": 0.08291774322390702, + "learning_rate": 5.9119000868252885e-06, + "loss": 0.4471, + "num_tokens": 8928235363.0, + "step": 2142 + }, + { + "epoch": 4.277791656257807, + "grad_norm": 0.09292764505538663, + "learning_rate": 5.901455024721894e-06, + "loss": 0.4611, + "num_tokens": 8932419276.0, + "step": 2143 + }, + { + "epoch": 4.279790157381964, + "grad_norm": 0.08394931029611591, + "learning_rate": 5.891036981217397e-06, + "loss": 0.4658, + "num_tokens": 8936603684.0, + "step": 2144 + }, + { + "epoch": 4.2817886585061204, + "grad_norm": 0.0806885572485435, + "learning_rate": 5.880645973796661e-06, + "loss": 0.4584, + "num_tokens": 8940768345.0, + "step": 2145 + }, + { + "epoch": 4.283787159630277, + "grad_norm": 0.08370294396542716, + "learning_rate": 5.870282019899179e-06, + "loss": 0.4712, + "num_tokens": 8944921116.0, + "step": 2146 + }, + { + "epoch": 4.285785660754434, + "grad_norm": 0.08508003251630429, + "learning_rate": 5.859945136919037e-06, + "loss": 0.4433, + "num_tokens": 8949046349.0, + "step": 2147 + }, + { + "epoch": 4.287784161878591, + "grad_norm": 0.08410296863220929, + "learning_rate": 5.849635342204898e-06, + "loss": 0.457, + "num_tokens": 8953231541.0, + "step": 2148 + }, + { + "epoch": 4.289782663002748, + "grad_norm": 0.07752389724979034, + "learning_rate": 5.839352653059948e-06, + "loss": 0.4649, + "num_tokens": 8957418465.0, + "step": 2149 + }, + { + "epoch": 4.291781164126904, + "grad_norm": 0.08127040191688513, + "learning_rate": 5.829097086741883e-06, + "loss": 0.4557, + "num_tokens": 8961603803.0, + "step": 2150 + }, + { + "epoch": 4.293779665251062, + "grad_norm": 0.08204882954894809, + "learning_rate": 5.818868660462886e-06, + "loss": 0.4611, + "num_tokens": 8965780124.0, + "step": 2151 + }, + { + "epoch": 4.295778166375219, + "grad_norm": 0.08551195805370122, + "learning_rate": 5.808667391389585e-06, + "loss": 0.4609, + "num_tokens": 8969964970.0, + "step": 2152 + }, + { + "epoch": 4.297776667499376, + "grad_norm": 0.07974512284498489, + "learning_rate": 5.7984932966430326e-06, + "loss": 0.4684, + "num_tokens": 8974149365.0, + "step": 2153 + }, + { + "epoch": 4.299775168623532, + "grad_norm": 0.08006827624926198, + "learning_rate": 5.788346393298666e-06, + "loss": 0.4429, + "num_tokens": 8978308356.0, + "step": 2154 + }, + { + "epoch": 4.301773669747689, + "grad_norm": 0.08481483384370653, + "learning_rate": 5.778226698386299e-06, + "loss": 0.4516, + "num_tokens": 8982493178.0, + "step": 2155 + }, + { + "epoch": 4.303772170871846, + "grad_norm": 0.08316051087876009, + "learning_rate": 5.768134228890072e-06, + "loss": 0.4579, + "num_tokens": 8986650100.0, + "step": 2156 + }, + { + "epoch": 4.305770671996003, + "grad_norm": 0.08043128214017896, + "learning_rate": 5.758069001748426e-06, + "loss": 0.4638, + "num_tokens": 8990833004.0, + "step": 2157 + }, + { + "epoch": 4.3077691731201595, + "grad_norm": 0.0856513651771577, + "learning_rate": 5.748031033854102e-06, + "loss": 0.4591, + "num_tokens": 8995016921.0, + "step": 2158 + }, + { + "epoch": 4.309767674244317, + "grad_norm": 0.08628355275426192, + "learning_rate": 5.738020342054065e-06, + "loss": 0.4485, + "num_tokens": 8999202215.0, + "step": 2159 + }, + { + "epoch": 4.311766175368474, + "grad_norm": 0.08200943417281896, + "learning_rate": 5.728036943149527e-06, + "loss": 0.4488, + "num_tokens": 9003367313.0, + "step": 2160 + }, + { + "epoch": 4.313764676492631, + "grad_norm": 0.08716105837120905, + "learning_rate": 5.718080853895868e-06, + "loss": 0.4592, + "num_tokens": 9007551065.0, + "step": 2161 + }, + { + "epoch": 4.3157631776167875, + "grad_norm": 0.08209240468802427, + "learning_rate": 5.708152091002657e-06, + "loss": 0.4728, + "num_tokens": 9011710227.0, + "step": 2162 + }, + { + "epoch": 4.317761678740944, + "grad_norm": 0.08191195294029326, + "learning_rate": 5.698250671133582e-06, + "loss": 0.4578, + "num_tokens": 9015864240.0, + "step": 2163 + }, + { + "epoch": 4.319760179865101, + "grad_norm": 0.08197031082163751, + "learning_rate": 5.688376610906458e-06, + "loss": 0.4485, + "num_tokens": 9020048937.0, + "step": 2164 + }, + { + "epoch": 4.321758680989258, + "grad_norm": 0.09104880476111918, + "learning_rate": 5.678529926893167e-06, + "loss": 0.4603, + "num_tokens": 9024204681.0, + "step": 2165 + }, + { + "epoch": 4.323757182113415, + "grad_norm": 0.07820971291653062, + "learning_rate": 5.668710635619649e-06, + "loss": 0.4471, + "num_tokens": 9028387337.0, + "step": 2166 + }, + { + "epoch": 4.325755683237571, + "grad_norm": 0.08361064894726762, + "learning_rate": 5.658918753565874e-06, + "loss": 0.4536, + "num_tokens": 9032546608.0, + "step": 2167 + }, + { + "epoch": 4.327754184361729, + "grad_norm": 0.08392932159583494, + "learning_rate": 5.649154297165802e-06, + "loss": 0.4573, + "num_tokens": 9036660844.0, + "step": 2168 + }, + { + "epoch": 4.329752685485886, + "grad_norm": 0.08883933630295655, + "learning_rate": 5.639417282807379e-06, + "loss": 0.446, + "num_tokens": 9040847024.0, + "step": 2169 + }, + { + "epoch": 4.331751186610043, + "grad_norm": 0.08356081499025061, + "learning_rate": 5.629707726832479e-06, + "loss": 0.455, + "num_tokens": 9045022945.0, + "step": 2170 + }, + { + "epoch": 4.333749687734199, + "grad_norm": 0.08116387440678524, + "learning_rate": 5.620025645536894e-06, + "loss": 0.4626, + "num_tokens": 9049157797.0, + "step": 2171 + }, + { + "epoch": 4.335748188858356, + "grad_norm": 0.08155419367219145, + "learning_rate": 5.6103710551703115e-06, + "loss": 0.4692, + "num_tokens": 9053344555.0, + "step": 2172 + }, + { + "epoch": 4.337746689982513, + "grad_norm": 0.08919963687096896, + "learning_rate": 5.600743971936282e-06, + "loss": 0.4767, + "num_tokens": 9057530332.0, + "step": 2173 + }, + { + "epoch": 4.33974519110667, + "grad_norm": 0.08979578509886059, + "learning_rate": 5.591144411992179e-06, + "loss": 0.4582, + "num_tokens": 9061716622.0, + "step": 2174 + }, + { + "epoch": 4.341743692230827, + "grad_norm": 0.08182291472802933, + "learning_rate": 5.5815723914491904e-06, + "loss": 0.4465, + "num_tokens": 9065904126.0, + "step": 2175 + }, + { + "epoch": 4.343742193354983, + "grad_norm": 0.07773276236714965, + "learning_rate": 5.5720279263722795e-06, + "loss": 0.4631, + "num_tokens": 9070062210.0, + "step": 2176 + }, + { + "epoch": 4.345740694479141, + "grad_norm": 0.08136305069181794, + "learning_rate": 5.562511032780169e-06, + "loss": 0.4608, + "num_tokens": 9074206213.0, + "step": 2177 + }, + { + "epoch": 4.347739195603298, + "grad_norm": 0.0796821971654157, + "learning_rate": 5.553021726645308e-06, + "loss": 0.4554, + "num_tokens": 9078390154.0, + "step": 2178 + }, + { + "epoch": 4.349737696727455, + "grad_norm": 0.0792827910797729, + "learning_rate": 5.543560023893836e-06, + "loss": 0.4833, + "num_tokens": 9082545101.0, + "step": 2179 + }, + { + "epoch": 4.351736197851611, + "grad_norm": 0.0857108481535421, + "learning_rate": 5.534125940405567e-06, + "loss": 0.4472, + "num_tokens": 9086730630.0, + "step": 2180 + }, + { + "epoch": 4.353734698975768, + "grad_norm": 0.08411932762204377, + "learning_rate": 5.52471949201397e-06, + "loss": 0.4628, + "num_tokens": 9090908204.0, + "step": 2181 + }, + { + "epoch": 4.355733200099925, + "grad_norm": 0.08361923581451426, + "learning_rate": 5.515340694506128e-06, + "loss": 0.4532, + "num_tokens": 9095047311.0, + "step": 2182 + }, + { + "epoch": 4.357731701224082, + "grad_norm": 0.0805040043134718, + "learning_rate": 5.505989563622715e-06, + "loss": 0.457, + "num_tokens": 9099230532.0, + "step": 2183 + }, + { + "epoch": 4.3597302023482385, + "grad_norm": 0.07577507340612212, + "learning_rate": 5.496666115057971e-06, + "loss": 0.4629, + "num_tokens": 9103413945.0, + "step": 2184 + }, + { + "epoch": 4.361728703472396, + "grad_norm": 0.0834588815506543, + "learning_rate": 5.487370364459674e-06, + "loss": 0.4633, + "num_tokens": 9107567648.0, + "step": 2185 + }, + { + "epoch": 4.363727204596553, + "grad_norm": 0.08726095463829618, + "learning_rate": 5.478102327429127e-06, + "loss": 0.4494, + "num_tokens": 9111742802.0, + "step": 2186 + }, + { + "epoch": 4.36572570572071, + "grad_norm": 0.07991685282487997, + "learning_rate": 5.4688620195211114e-06, + "loss": 0.4638, + "num_tokens": 9115921870.0, + "step": 2187 + }, + { + "epoch": 4.3677242068448665, + "grad_norm": 0.07793338149962858, + "learning_rate": 5.4596494562438684e-06, + "loss": 0.464, + "num_tokens": 9120081406.0, + "step": 2188 + }, + { + "epoch": 4.369722707969023, + "grad_norm": 0.07357552493862538, + "learning_rate": 5.450464653059081e-06, + "loss": 0.4708, + "num_tokens": 9124265962.0, + "step": 2189 + }, + { + "epoch": 4.37172120909318, + "grad_norm": 0.08044017243677802, + "learning_rate": 5.441307625381834e-06, + "loss": 0.474, + "num_tokens": 9128396248.0, + "step": 2190 + }, + { + "epoch": 4.373719710217337, + "grad_norm": 0.07991730953854363, + "learning_rate": 5.432178388580609e-06, + "loss": 0.4707, + "num_tokens": 9132560930.0, + "step": 2191 + }, + { + "epoch": 4.375718211341494, + "grad_norm": 0.07927224365861475, + "learning_rate": 5.423076957977231e-06, + "loss": 0.4571, + "num_tokens": 9136744612.0, + "step": 2192 + }, + { + "epoch": 4.37771671246565, + "grad_norm": 0.08015858645373634, + "learning_rate": 5.414003348846868e-06, + "loss": 0.4586, + "num_tokens": 9140902133.0, + "step": 2193 + }, + { + "epoch": 4.379715213589808, + "grad_norm": 0.08121879254602811, + "learning_rate": 5.404957576417977e-06, + "loss": 0.4616, + "num_tokens": 9145075751.0, + "step": 2194 + }, + { + "epoch": 4.381713714713965, + "grad_norm": 0.0894587721915389, + "learning_rate": 5.395939655872327e-06, + "loss": 0.4756, + "num_tokens": 9149259634.0, + "step": 2195 + }, + { + "epoch": 4.383712215838122, + "grad_norm": 0.07957438504663333, + "learning_rate": 5.386949602344913e-06, + "loss": 0.4557, + "num_tokens": 9153415833.0, + "step": 2196 + }, + { + "epoch": 4.385710716962278, + "grad_norm": 0.08323777597861971, + "learning_rate": 5.377987430923975e-06, + "loss": 0.4412, + "num_tokens": 9157557141.0, + "step": 2197 + }, + { + "epoch": 4.387709218086435, + "grad_norm": 0.09566296271007205, + "learning_rate": 5.36905315665095e-06, + "loss": 0.4533, + "num_tokens": 9161710389.0, + "step": 2198 + }, + { + "epoch": 4.389707719210592, + "grad_norm": 0.0807084379745404, + "learning_rate": 5.360146794520461e-06, + "loss": 0.4577, + "num_tokens": 9165894833.0, + "step": 2199 + }, + { + "epoch": 4.391706220334749, + "grad_norm": 0.07910617468609689, + "learning_rate": 5.351268359480286e-06, + "loss": 0.4678, + "num_tokens": 9170081070.0, + "step": 2200 + }, + { + "epoch": 4.393704721458906, + "grad_norm": 0.08519440092354957, + "learning_rate": 5.342417866431326e-06, + "loss": 0.4631, + "num_tokens": 9174268500.0, + "step": 2201 + }, + { + "epoch": 4.395703222583062, + "grad_norm": 0.08772681610841604, + "learning_rate": 5.3335953302275925e-06, + "loss": 0.4595, + "num_tokens": 9178454342.0, + "step": 2202 + }, + { + "epoch": 4.39770172370722, + "grad_norm": 0.07739940492829597, + "learning_rate": 5.324800765676164e-06, + "loss": 0.4678, + "num_tokens": 9182640453.0, + "step": 2203 + }, + { + "epoch": 4.399700224831377, + "grad_norm": 0.08498430377906475, + "learning_rate": 5.31603418753719e-06, + "loss": 0.4707, + "num_tokens": 9186811745.0, + "step": 2204 + }, + { + "epoch": 4.401698725955534, + "grad_norm": 0.08225650376824156, + "learning_rate": 5.307295610523844e-06, + "loss": 0.4588, + "num_tokens": 9190994548.0, + "step": 2205 + }, + { + "epoch": 4.40369722707969, + "grad_norm": 0.09187329713689486, + "learning_rate": 5.298585049302301e-06, + "loss": 0.4792, + "num_tokens": 9195149428.0, + "step": 2206 + }, + { + "epoch": 4.405695728203847, + "grad_norm": 0.08683649813089037, + "learning_rate": 5.289902518491713e-06, + "loss": 0.4659, + "num_tokens": 9199306265.0, + "step": 2207 + }, + { + "epoch": 4.407694229328004, + "grad_norm": 0.08564171765758376, + "learning_rate": 5.281248032664199e-06, + "loss": 0.4629, + "num_tokens": 9203490546.0, + "step": 2208 + }, + { + "epoch": 4.409692730452161, + "grad_norm": 0.09209424981085884, + "learning_rate": 5.272621606344803e-06, + "loss": 0.4656, + "num_tokens": 9207654097.0, + "step": 2209 + }, + { + "epoch": 4.4116912315763175, + "grad_norm": 0.08399129296366127, + "learning_rate": 5.264023254011476e-06, + "loss": 0.4682, + "num_tokens": 9211841376.0, + "step": 2210 + }, + { + "epoch": 4.413689732700474, + "grad_norm": 0.08051180974144811, + "learning_rate": 5.255452990095049e-06, + "loss": 0.4632, + "num_tokens": 9216026148.0, + "step": 2211 + }, + { + "epoch": 4.415688233824632, + "grad_norm": 0.07820566736122424, + "learning_rate": 5.246910828979221e-06, + "loss": 0.4421, + "num_tokens": 9220209203.0, + "step": 2212 + }, + { + "epoch": 4.417686734948789, + "grad_norm": 0.07862785591527738, + "learning_rate": 5.238396785000509e-06, + "loss": 0.4684, + "num_tokens": 9224393764.0, + "step": 2213 + }, + { + "epoch": 4.4196852360729455, + "grad_norm": 0.07883539821011126, + "learning_rate": 5.229910872448262e-06, + "loss": 0.4679, + "num_tokens": 9228580576.0, + "step": 2214 + }, + { + "epoch": 4.421683737197102, + "grad_norm": 0.08632872433929903, + "learning_rate": 5.221453105564599e-06, + "loss": 0.4658, + "num_tokens": 9232756264.0, + "step": 2215 + }, + { + "epoch": 4.423682238321259, + "grad_norm": 0.07834189243921814, + "learning_rate": 5.213023498544399e-06, + "loss": 0.4667, + "num_tokens": 9236908159.0, + "step": 2216 + }, + { + "epoch": 4.425680739445416, + "grad_norm": 0.08687902331479995, + "learning_rate": 5.204622065535298e-06, + "loss": 0.4534, + "num_tokens": 9241091475.0, + "step": 2217 + }, + { + "epoch": 4.427679240569573, + "grad_norm": 0.08127767397571975, + "learning_rate": 5.196248820637626e-06, + "loss": 0.4672, + "num_tokens": 9245277497.0, + "step": 2218 + }, + { + "epoch": 4.429677741693729, + "grad_norm": 0.07970996192897009, + "learning_rate": 5.187903777904421e-06, + "loss": 0.4563, + "num_tokens": 9249435270.0, + "step": 2219 + }, + { + "epoch": 4.431676242817886, + "grad_norm": 0.07693554490660816, + "learning_rate": 5.179586951341376e-06, + "loss": 0.4573, + "num_tokens": 9253607502.0, + "step": 2220 + }, + { + "epoch": 4.433674743942044, + "grad_norm": 0.08427632558689903, + "learning_rate": 5.171298354906834e-06, + "loss": 0.4624, + "num_tokens": 9257773812.0, + "step": 2221 + }, + { + "epoch": 4.435673245066201, + "grad_norm": 0.08477934744247452, + "learning_rate": 5.163038002511759e-06, + "loss": 0.4591, + "num_tokens": 9261927785.0, + "step": 2222 + }, + { + "epoch": 4.437671746190357, + "grad_norm": 0.08771411197545027, + "learning_rate": 5.1548059080197156e-06, + "loss": 0.4581, + "num_tokens": 9266113727.0, + "step": 2223 + }, + { + "epoch": 4.439670247314514, + "grad_norm": 0.0822652777689173, + "learning_rate": 5.146602085246831e-06, + "loss": 0.4694, + "num_tokens": 9270298676.0, + "step": 2224 + }, + { + "epoch": 4.441668748438671, + "grad_norm": 0.08509172698108791, + "learning_rate": 5.138426547961794e-06, + "loss": 0.4588, + "num_tokens": 9274454082.0, + "step": 2225 + }, + { + "epoch": 4.443667249562828, + "grad_norm": 0.0789651262897308, + "learning_rate": 5.130279309885817e-06, + "loss": 0.4632, + "num_tokens": 9278616225.0, + "step": 2226 + }, + { + "epoch": 4.4456657506869846, + "grad_norm": 0.0930013599017268, + "learning_rate": 5.122160384692614e-06, + "loss": 0.4805, + "num_tokens": 9282802951.0, + "step": 2227 + }, + { + "epoch": 4.447664251811141, + "grad_norm": 0.09398865737273274, + "learning_rate": 5.114069786008391e-06, + "loss": 0.4595, + "num_tokens": 9286988997.0, + "step": 2228 + }, + { + "epoch": 4.449662752935298, + "grad_norm": 0.08414361526471471, + "learning_rate": 5.1060075274118e-06, + "loss": 0.4526, + "num_tokens": 9291161974.0, + "step": 2229 + }, + { + "epoch": 4.451661254059456, + "grad_norm": 0.07948527192204735, + "learning_rate": 5.097973622433943e-06, + "loss": 0.4738, + "num_tokens": 9295331516.0, + "step": 2230 + }, + { + "epoch": 4.453659755183613, + "grad_norm": 0.09292569749683627, + "learning_rate": 5.08996808455832e-06, + "loss": 0.4539, + "num_tokens": 9299488270.0, + "step": 2231 + }, + { + "epoch": 4.455658256307769, + "grad_norm": 0.09493994440736106, + "learning_rate": 5.081990927220831e-06, + "loss": 0.4519, + "num_tokens": 9303638928.0, + "step": 2232 + }, + { + "epoch": 4.457656757431926, + "grad_norm": 0.0795833249489959, + "learning_rate": 5.0740421638097455e-06, + "loss": 0.4542, + "num_tokens": 9307819910.0, + "step": 2233 + }, + { + "epoch": 4.459655258556083, + "grad_norm": 0.08823676003203558, + "learning_rate": 5.066121807665676e-06, + "loss": 0.4592, + "num_tokens": 9312006947.0, + "step": 2234 + }, + { + "epoch": 4.46165375968024, + "grad_norm": 0.08224883627064546, + "learning_rate": 5.058229872081557e-06, + "loss": 0.4727, + "num_tokens": 9316145866.0, + "step": 2235 + }, + { + "epoch": 4.4636522608043965, + "grad_norm": 0.07940603984371776, + "learning_rate": 5.050366370302623e-06, + "loss": 0.4598, + "num_tokens": 9320296731.0, + "step": 2236 + }, + { + "epoch": 4.465650761928553, + "grad_norm": 0.42753520834747905, + "learning_rate": 5.0425313155263935e-06, + "loss": 0.5064, + "num_tokens": 9324481177.0, + "step": 2237 + }, + { + "epoch": 4.46764926305271, + "grad_norm": 0.08520683130023436, + "learning_rate": 5.034724720902639e-06, + "loss": 0.4564, + "num_tokens": 9328658346.0, + "step": 2238 + }, + { + "epoch": 4.469647764176868, + "grad_norm": 0.09101303644780959, + "learning_rate": 5.02694659953337e-06, + "loss": 0.4547, + "num_tokens": 9332812859.0, + "step": 2239 + }, + { + "epoch": 4.4716462653010245, + "grad_norm": 0.07859806198066016, + "learning_rate": 5.019196964472801e-06, + "loss": 0.4651, + "num_tokens": 9336997594.0, + "step": 2240 + }, + { + "epoch": 4.473644766425181, + "grad_norm": 0.08971272959600639, + "learning_rate": 5.011475828727345e-06, + "loss": 0.4559, + "num_tokens": 9341178978.0, + "step": 2241 + }, + { + "epoch": 4.475643267549338, + "grad_norm": 0.08505755453439887, + "learning_rate": 5.003783205255581e-06, + "loss": 0.48, + "num_tokens": 9345364157.0, + "step": 2242 + }, + { + "epoch": 4.477641768673495, + "grad_norm": 0.08041870761044213, + "learning_rate": 4.996119106968233e-06, + "loss": 0.4562, + "num_tokens": 9349550088.0, + "step": 2243 + }, + { + "epoch": 4.479640269797652, + "grad_norm": 0.09120747626348416, + "learning_rate": 4.988483546728159e-06, + "loss": 0.4603, + "num_tokens": 9353706490.0, + "step": 2244 + }, + { + "epoch": 4.481638770921808, + "grad_norm": 0.08063833560812612, + "learning_rate": 4.980876537350309e-06, + "loss": 0.464, + "num_tokens": 9357893190.0, + "step": 2245 + }, + { + "epoch": 4.483637272045965, + "grad_norm": 0.0864336465471372, + "learning_rate": 4.973298091601719e-06, + "loss": 0.4769, + "num_tokens": 9362054434.0, + "step": 2246 + }, + { + "epoch": 4.485635773170122, + "grad_norm": 0.07659107435895804, + "learning_rate": 4.965748222201492e-06, + "loss": 0.4706, + "num_tokens": 9366220811.0, + "step": 2247 + }, + { + "epoch": 4.48763427429428, + "grad_norm": 0.07820550726796875, + "learning_rate": 4.9582269418207694e-06, + "loss": 0.4592, + "num_tokens": 9370405402.0, + "step": 2248 + }, + { + "epoch": 4.489632775418436, + "grad_norm": 0.07999855693953102, + "learning_rate": 4.950734263082703e-06, + "loss": 0.4495, + "num_tokens": 9374592563.0, + "step": 2249 + }, + { + "epoch": 4.491631276542593, + "grad_norm": 0.08174483809577716, + "learning_rate": 4.94327019856245e-06, + "loss": 0.4566, + "num_tokens": 9378743714.0, + "step": 2250 + }, + { + "epoch": 4.49362977766675, + "grad_norm": 0.07748520664573011, + "learning_rate": 4.935834760787133e-06, + "loss": 0.4625, + "num_tokens": 9382929553.0, + "step": 2251 + }, + { + "epoch": 4.495628278790907, + "grad_norm": 0.07917946232834804, + "learning_rate": 4.928427962235844e-06, + "loss": 0.4674, + "num_tokens": 9387089327.0, + "step": 2252 + }, + { + "epoch": 4.4976267799150635, + "grad_norm": 0.08918138404372505, + "learning_rate": 4.921049815339605e-06, + "loss": 0.4564, + "num_tokens": 9391272675.0, + "step": 2253 + }, + { + "epoch": 4.49962528103922, + "grad_norm": 0.08326963627894575, + "learning_rate": 4.913700332481342e-06, + "loss": 0.4648, + "num_tokens": 9395457556.0, + "step": 2254 + }, + { + "epoch": 4.501623782163377, + "grad_norm": 0.07825218879005974, + "learning_rate": 4.906379525995877e-06, + "loss": 0.4641, + "num_tokens": 9399591709.0, + "step": 2255 + }, + { + "epoch": 4.503622283287534, + "grad_norm": 0.09560824228778608, + "learning_rate": 4.899087408169914e-06, + "loss": 0.4639, + "num_tokens": 9403754362.0, + "step": 2256 + }, + { + "epoch": 4.505620784411692, + "grad_norm": 0.09278790582852686, + "learning_rate": 4.891823991241999e-06, + "loss": 0.4665, + "num_tokens": 9407939743.0, + "step": 2257 + }, + { + "epoch": 4.507619285535848, + "grad_norm": 0.07603897206620108, + "learning_rate": 4.884589287402507e-06, + "loss": 0.4724, + "num_tokens": 9412122333.0, + "step": 2258 + }, + { + "epoch": 4.509617786660005, + "grad_norm": 0.08172297260630153, + "learning_rate": 4.8773833087936285e-06, + "loss": 0.4548, + "num_tokens": 9416307250.0, + "step": 2259 + }, + { + "epoch": 4.511616287784162, + "grad_norm": 0.0828557007214124, + "learning_rate": 4.870206067509339e-06, + "loss": 0.4495, + "num_tokens": 9420469253.0, + "step": 2260 + }, + { + "epoch": 4.513614788908319, + "grad_norm": 0.07939049537777713, + "learning_rate": 4.863057575595384e-06, + "loss": 0.4632, + "num_tokens": 9424654693.0, + "step": 2261 + }, + { + "epoch": 4.5156132900324755, + "grad_norm": 0.08080758674696703, + "learning_rate": 4.855937845049266e-06, + "loss": 0.4466, + "num_tokens": 9428836365.0, + "step": 2262 + }, + { + "epoch": 4.517611791156632, + "grad_norm": 0.08630982674655933, + "learning_rate": 4.84884688782021e-06, + "loss": 0.4633, + "num_tokens": 9433002851.0, + "step": 2263 + }, + { + "epoch": 4.519610292280789, + "grad_norm": 0.08365583605326357, + "learning_rate": 4.841784715809143e-06, + "loss": 0.4696, + "num_tokens": 9437191663.0, + "step": 2264 + }, + { + "epoch": 4.521608793404946, + "grad_norm": 0.08434288180950525, + "learning_rate": 4.834751340868693e-06, + "loss": 0.4652, + "num_tokens": 9441376875.0, + "step": 2265 + }, + { + "epoch": 4.5236072945291035, + "grad_norm": 0.0871530473272188, + "learning_rate": 4.827746774803158e-06, + "loss": 0.4609, + "num_tokens": 9445556653.0, + "step": 2266 + }, + { + "epoch": 4.52560579565326, + "grad_norm": 0.08430537687554278, + "learning_rate": 4.820771029368472e-06, + "loss": 0.4456, + "num_tokens": 9449710583.0, + "step": 2267 + }, + { + "epoch": 4.527604296777417, + "grad_norm": 0.08290678204955983, + "learning_rate": 4.813824116272211e-06, + "loss": 0.4642, + "num_tokens": 9453865214.0, + "step": 2268 + }, + { + "epoch": 4.529602797901574, + "grad_norm": 0.0806960062303819, + "learning_rate": 4.806906047173553e-06, + "loss": 0.4749, + "num_tokens": 9458035110.0, + "step": 2269 + }, + { + "epoch": 4.531601299025731, + "grad_norm": 0.07711730498234191, + "learning_rate": 4.800016833683271e-06, + "loss": 0.4621, + "num_tokens": 9462220471.0, + "step": 2270 + }, + { + "epoch": 4.533599800149887, + "grad_norm": 0.08071681448944149, + "learning_rate": 4.793156487363713e-06, + "loss": 0.4626, + "num_tokens": 9466404368.0, + "step": 2271 + }, + { + "epoch": 4.535598301274044, + "grad_norm": 0.08534312999473234, + "learning_rate": 4.786325019728767e-06, + "loss": 0.4678, + "num_tokens": 9470591100.0, + "step": 2272 + }, + { + "epoch": 4.537596802398202, + "grad_norm": 0.08436452255248686, + "learning_rate": 4.77952244224386e-06, + "loss": 0.4656, + "num_tokens": 9474774370.0, + "step": 2273 + }, + { + "epoch": 4.539595303522358, + "grad_norm": 0.08523199573812595, + "learning_rate": 4.772748766325934e-06, + "loss": 0.4508, + "num_tokens": 9478959961.0, + "step": 2274 + }, + { + "epoch": 4.541593804646515, + "grad_norm": 0.07992179447056276, + "learning_rate": 4.7660040033434236e-06, + "loss": 0.4428, + "num_tokens": 9483124720.0, + "step": 2275 + }, + { + "epoch": 4.543592305770672, + "grad_norm": 0.07509322743723303, + "learning_rate": 4.7592881646162336e-06, + "loss": 0.4569, + "num_tokens": 9487310253.0, + "step": 2276 + }, + { + "epoch": 4.545590806894829, + "grad_norm": 0.07909664543345085, + "learning_rate": 4.752601261415729e-06, + "loss": 0.4649, + "num_tokens": 9491492624.0, + "step": 2277 + }, + { + "epoch": 4.547589308018986, + "grad_norm": 0.08256323823846344, + "learning_rate": 4.7459433049647054e-06, + "loss": 0.4568, + "num_tokens": 9495674961.0, + "step": 2278 + }, + { + "epoch": 4.5495878091431425, + "grad_norm": 0.08043341628180715, + "learning_rate": 4.739314306437386e-06, + "loss": 0.4669, + "num_tokens": 9499832326.0, + "step": 2279 + }, + { + "epoch": 4.551586310267299, + "grad_norm": 0.09675337466416903, + "learning_rate": 4.732714276959391e-06, + "loss": 0.4767, + "num_tokens": 9503970062.0, + "step": 2280 + }, + { + "epoch": 4.553584811391456, + "grad_norm": 0.08557568195962119, + "learning_rate": 4.726143227607712e-06, + "loss": 0.4475, + "num_tokens": 9508153381.0, + "step": 2281 + }, + { + "epoch": 4.555583312515614, + "grad_norm": 0.07555127084095331, + "learning_rate": 4.719601169410711e-06, + "loss": 0.4815, + "num_tokens": 9512337986.0, + "step": 2282 + }, + { + "epoch": 4.5575818136397706, + "grad_norm": 0.07910761070085748, + "learning_rate": 4.713088113348091e-06, + "loss": 0.4538, + "num_tokens": 9516508890.0, + "step": 2283 + }, + { + "epoch": 4.559580314763927, + "grad_norm": 0.09113336873568002, + "learning_rate": 4.706604070350882e-06, + "loss": 0.453, + "num_tokens": 9520694583.0, + "step": 2284 + }, + { + "epoch": 4.561578815888084, + "grad_norm": 0.08677303692057628, + "learning_rate": 4.7001490513014195e-06, + "loss": 0.4491, + "num_tokens": 9524878507.0, + "step": 2285 + }, + { + "epoch": 4.563577317012241, + "grad_norm": 0.07715461351534983, + "learning_rate": 4.693723067033322e-06, + "loss": 0.4604, + "num_tokens": 9529059562.0, + "step": 2286 + }, + { + "epoch": 4.565575818136398, + "grad_norm": 0.07745218265397848, + "learning_rate": 4.687326128331485e-06, + "loss": 0.4734, + "num_tokens": 9533238797.0, + "step": 2287 + }, + { + "epoch": 4.5675743192605545, + "grad_norm": 0.08314476636440271, + "learning_rate": 4.680958245932056e-06, + "loss": 0.457, + "num_tokens": 9537384143.0, + "step": 2288 + }, + { + "epoch": 4.569572820384711, + "grad_norm": 0.07714874399173359, + "learning_rate": 4.674619430522415e-06, + "loss": 0.4595, + "num_tokens": 9541568076.0, + "step": 2289 + }, + { + "epoch": 4.571571321508868, + "grad_norm": 0.08325822283664289, + "learning_rate": 4.668309692741156e-06, + "loss": 0.4616, + "num_tokens": 9545748376.0, + "step": 2290 + }, + { + "epoch": 4.573569822633026, + "grad_norm": 0.0821600055395162, + "learning_rate": 4.662029043178069e-06, + "loss": 0.459, + "num_tokens": 9549916504.0, + "step": 2291 + }, + { + "epoch": 4.5755683237571825, + "grad_norm": 0.07943965864831604, + "learning_rate": 4.655777492374135e-06, + "loss": 0.4781, + "num_tokens": 9554079335.0, + "step": 2292 + }, + { + "epoch": 4.577566824881339, + "grad_norm": 0.08490615146139399, + "learning_rate": 4.649555050821488e-06, + "loss": 0.4575, + "num_tokens": 9558237359.0, + "step": 2293 + }, + { + "epoch": 4.579565326005496, + "grad_norm": 0.09302803376823923, + "learning_rate": 4.643361728963413e-06, + "loss": 0.4654, + "num_tokens": 9562422812.0, + "step": 2294 + }, + { + "epoch": 4.581563827129653, + "grad_norm": 0.08186687590894134, + "learning_rate": 4.637197537194317e-06, + "loss": 0.4666, + "num_tokens": 9566606772.0, + "step": 2295 + }, + { + "epoch": 4.58356232825381, + "grad_norm": 0.07640301620648923, + "learning_rate": 4.63106248585972e-06, + "loss": 0.4545, + "num_tokens": 9570769735.0, + "step": 2296 + }, + { + "epoch": 4.585560829377966, + "grad_norm": 0.08259185379615468, + "learning_rate": 4.624956585256239e-06, + "loss": 0.4602, + "num_tokens": 9574953727.0, + "step": 2297 + }, + { + "epoch": 4.587559330502123, + "grad_norm": 0.08120034172442327, + "learning_rate": 4.61887984563156e-06, + "loss": 0.456, + "num_tokens": 9579125723.0, + "step": 2298 + }, + { + "epoch": 4.58955783162628, + "grad_norm": 0.07978094323990086, + "learning_rate": 4.6128322771844315e-06, + "loss": 0.4512, + "num_tokens": 9583310899.0, + "step": 2299 + }, + { + "epoch": 4.591556332750438, + "grad_norm": 0.08169242280300013, + "learning_rate": 4.6068138900646395e-06, + "loss": 0.4565, + "num_tokens": 9587498417.0, + "step": 2300 + }, + { + "epoch": 4.593554833874594, + "grad_norm": 0.07706608939352524, + "learning_rate": 4.600824694373e-06, + "loss": 0.454, + "num_tokens": 9591683241.0, + "step": 2301 + }, + { + "epoch": 4.595553334998751, + "grad_norm": 0.08256555694944098, + "learning_rate": 4.5948647001613285e-06, + "loss": 0.4454, + "num_tokens": 9595862902.0, + "step": 2302 + }, + { + "epoch": 4.597551836122908, + "grad_norm": 0.0776374216360398, + "learning_rate": 4.588933917432442e-06, + "loss": 0.4586, + "num_tokens": 9600044353.0, + "step": 2303 + }, + { + "epoch": 4.599550337247065, + "grad_norm": 0.08021092523385938, + "learning_rate": 4.583032356140122e-06, + "loss": 0.485, + "num_tokens": 9604229205.0, + "step": 2304 + }, + { + "epoch": 4.6015488383712215, + "grad_norm": 0.08122224927874182, + "learning_rate": 4.577160026189106e-06, + "loss": 0.459, + "num_tokens": 9608414237.0, + "step": 2305 + }, + { + "epoch": 4.603547339495378, + "grad_norm": 0.0782373741145892, + "learning_rate": 4.5713169374350765e-06, + "loss": 0.4718, + "num_tokens": 9612598152.0, + "step": 2306 + }, + { + "epoch": 4.605545840619535, + "grad_norm": 0.08146603286811971, + "learning_rate": 4.565503099684641e-06, + "loss": 0.4579, + "num_tokens": 9616784848.0, + "step": 2307 + }, + { + "epoch": 4.607544341743692, + "grad_norm": 0.08639609529876467, + "learning_rate": 4.55971852269531e-06, + "loss": 0.4561, + "num_tokens": 9620943229.0, + "step": 2308 + }, + { + "epoch": 4.6095428428678495, + "grad_norm": 0.07956491580715348, + "learning_rate": 4.553963216175487e-06, + "loss": 0.4716, + "num_tokens": 9625129112.0, + "step": 2309 + }, + { + "epoch": 4.611541343992006, + "grad_norm": 0.07838961620283345, + "learning_rate": 4.548237189784451e-06, + "loss": 0.4748, + "num_tokens": 9629315433.0, + "step": 2310 + }, + { + "epoch": 4.613539845116163, + "grad_norm": 0.08087976747565317, + "learning_rate": 4.542540453132337e-06, + "loss": 0.4511, + "num_tokens": 9633483985.0, + "step": 2311 + }, + { + "epoch": 4.61553834624032, + "grad_norm": 0.0793362400318148, + "learning_rate": 4.53687301578012e-06, + "loss": 0.4507, + "num_tokens": 9637664610.0, + "step": 2312 + }, + { + "epoch": 4.617536847364477, + "grad_norm": 0.0793166974664218, + "learning_rate": 4.531234887239613e-06, + "loss": 0.4637, + "num_tokens": 9641850740.0, + "step": 2313 + }, + { + "epoch": 4.6195353484886335, + "grad_norm": 0.08218603960007444, + "learning_rate": 4.5256260769734235e-06, + "loss": 0.4701, + "num_tokens": 9646028181.0, + "step": 2314 + }, + { + "epoch": 4.62153384961279, + "grad_norm": 0.07914784155981568, + "learning_rate": 4.5200465943949666e-06, + "loss": 0.4637, + "num_tokens": 9650200793.0, + "step": 2315 + }, + { + "epoch": 4.623532350736947, + "grad_norm": 0.07956839999190543, + "learning_rate": 4.514496448868423e-06, + "loss": 0.4698, + "num_tokens": 9654386391.0, + "step": 2316 + }, + { + "epoch": 4.625530851861104, + "grad_norm": 0.08244770995059672, + "learning_rate": 4.508975649708752e-06, + "loss": 0.4508, + "num_tokens": 9658541380.0, + "step": 2317 + }, + { + "epoch": 4.6275293529852615, + "grad_norm": 0.08197628894820744, + "learning_rate": 4.503484206181644e-06, + "loss": 0.4753, + "num_tokens": 9662717173.0, + "step": 2318 + }, + { + "epoch": 4.629527854109418, + "grad_norm": 0.07982386461145714, + "learning_rate": 4.498022127503538e-06, + "loss": 0.4652, + "num_tokens": 9666903589.0, + "step": 2319 + }, + { + "epoch": 4.631526355233575, + "grad_norm": 0.08166400321574971, + "learning_rate": 4.492589422841571e-06, + "loss": 0.4632, + "num_tokens": 9671070766.0, + "step": 2320 + }, + { + "epoch": 4.633524856357732, + "grad_norm": 0.09121799123690044, + "learning_rate": 4.487186101313593e-06, + "loss": 0.4534, + "num_tokens": 9675247475.0, + "step": 2321 + }, + { + "epoch": 4.635523357481889, + "grad_norm": 0.08464164744729089, + "learning_rate": 4.481812171988139e-06, + "loss": 0.4631, + "num_tokens": 9679431052.0, + "step": 2322 + }, + { + "epoch": 4.637521858606045, + "grad_norm": 0.08049828191305053, + "learning_rate": 4.476467643884408e-06, + "loss": 0.4556, + "num_tokens": 9683609947.0, + "step": 2323 + }, + { + "epoch": 4.639520359730202, + "grad_norm": 0.0784599622345703, + "learning_rate": 4.4711525259722635e-06, + "loss": 0.457, + "num_tokens": 9687793702.0, + "step": 2324 + }, + { + "epoch": 4.641518860854359, + "grad_norm": 0.08330003050166694, + "learning_rate": 4.465866827172197e-06, + "loss": 0.4468, + "num_tokens": 9691978717.0, + "step": 2325 + }, + { + "epoch": 4.643517361978516, + "grad_norm": 0.07830537681047742, + "learning_rate": 4.460610556355333e-06, + "loss": 0.4463, + "num_tokens": 9696165572.0, + "step": 2326 + }, + { + "epoch": 4.645515863102673, + "grad_norm": 0.0823169594621546, + "learning_rate": 4.455383722343406e-06, + "loss": 0.4507, + "num_tokens": 9700348991.0, + "step": 2327 + }, + { + "epoch": 4.64751436422683, + "grad_norm": 0.087185501695246, + "learning_rate": 4.450186333908747e-06, + "loss": 0.4628, + "num_tokens": 9704502314.0, + "step": 2328 + }, + { + "epoch": 4.649512865350987, + "grad_norm": 0.08573754584668461, + "learning_rate": 4.4450183997742645e-06, + "loss": 0.4733, + "num_tokens": 9708685289.0, + "step": 2329 + }, + { + "epoch": 4.651511366475144, + "grad_norm": 0.08358600093835475, + "learning_rate": 4.439879928613432e-06, + "loss": 0.4564, + "num_tokens": 9712869766.0, + "step": 2330 + }, + { + "epoch": 4.6535098675993005, + "grad_norm": 0.0762113269942252, + "learning_rate": 4.4347709290502775e-06, + "loss": 0.4608, + "num_tokens": 9717027498.0, + "step": 2331 + }, + { + "epoch": 4.655508368723457, + "grad_norm": 0.08299271822343414, + "learning_rate": 4.429691409659371e-06, + "loss": 0.4649, + "num_tokens": 9721183095.0, + "step": 2332 + }, + { + "epoch": 4.657506869847614, + "grad_norm": 0.0902675732530822, + "learning_rate": 4.424641378965792e-06, + "loss": 0.4621, + "num_tokens": 9725367241.0, + "step": 2333 + }, + { + "epoch": 4.659505370971771, + "grad_norm": 0.08159836815739849, + "learning_rate": 4.41962084544514e-06, + "loss": 0.459, + "num_tokens": 9729552694.0, + "step": 2334 + }, + { + "epoch": 4.661503872095928, + "grad_norm": 0.08424082356563076, + "learning_rate": 4.414629817523501e-06, + "loss": 0.4414, + "num_tokens": 9733714930.0, + "step": 2335 + }, + { + "epoch": 4.663502373220085, + "grad_norm": 0.078238224894497, + "learning_rate": 4.409668303577446e-06, + "loss": 0.4643, + "num_tokens": 9737898117.0, + "step": 2336 + }, + { + "epoch": 4.665500874344242, + "grad_norm": 0.08128293802747705, + "learning_rate": 4.4047363119340135e-06, + "loss": 0.4756, + "num_tokens": 9742084435.0, + "step": 2337 + }, + { + "epoch": 4.667499375468399, + "grad_norm": 0.08831078324790219, + "learning_rate": 4.399833850870691e-06, + "loss": 0.4629, + "num_tokens": 9746255568.0, + "step": 2338 + }, + { + "epoch": 4.669497876592556, + "grad_norm": 0.08310395891018961, + "learning_rate": 4.394960928615399e-06, + "loss": 0.4581, + "num_tokens": 9750406518.0, + "step": 2339 + }, + { + "epoch": 4.6714963777167124, + "grad_norm": 0.08214657466021678, + "learning_rate": 4.390117553346487e-06, + "loss": 0.4608, + "num_tokens": 9754590492.0, + "step": 2340 + }, + { + "epoch": 4.673494878840869, + "grad_norm": 0.08318745018345805, + "learning_rate": 4.385303733192722e-06, + "loss": 0.4629, + "num_tokens": 9758771220.0, + "step": 2341 + }, + { + "epoch": 4.675493379965026, + "grad_norm": 0.07924133402471346, + "learning_rate": 4.380519476233257e-06, + "loss": 0.4743, + "num_tokens": 9762931022.0, + "step": 2342 + }, + { + "epoch": 4.677491881089183, + "grad_norm": 0.08742861706199205, + "learning_rate": 4.3757647904976345e-06, + "loss": 0.4523, + "num_tokens": 9767116427.0, + "step": 2343 + }, + { + "epoch": 4.67949038221334, + "grad_norm": 0.07791784561170385, + "learning_rate": 4.371039683965763e-06, + "loss": 0.4588, + "num_tokens": 9771301065.0, + "step": 2344 + }, + { + "epoch": 4.681488883337497, + "grad_norm": 0.07802416223656931, + "learning_rate": 4.366344164567907e-06, + "loss": 0.4391, + "num_tokens": 9775480968.0, + "step": 2345 + }, + { + "epoch": 4.683487384461654, + "grad_norm": 0.08603735704861482, + "learning_rate": 4.361678240184683e-06, + "loss": 0.4501, + "num_tokens": 9779653154.0, + "step": 2346 + }, + { + "epoch": 4.685485885585811, + "grad_norm": 0.08502051052238216, + "learning_rate": 4.357041918647027e-06, + "loss": 0.455, + "num_tokens": 9783838836.0, + "step": 2347 + }, + { + "epoch": 4.687484386709968, + "grad_norm": 0.07353187052605699, + "learning_rate": 4.352435207736196e-06, + "loss": 0.4524, + "num_tokens": 9788026558.0, + "step": 2348 + }, + { + "epoch": 4.689482887834124, + "grad_norm": 0.07618218213965094, + "learning_rate": 4.347858115183749e-06, + "loss": 0.4626, + "num_tokens": 9792215380.0, + "step": 2349 + }, + { + "epoch": 4.691481388958281, + "grad_norm": 0.07587548533041623, + "learning_rate": 4.34331064867154e-06, + "loss": 0.4647, + "num_tokens": 9796402577.0, + "step": 2350 + }, + { + "epoch": 4.693479890082438, + "grad_norm": 0.0797954773065478, + "learning_rate": 4.338792815831698e-06, + "loss": 0.4681, + "num_tokens": 9800554695.0, + "step": 2351 + }, + { + "epoch": 4.695478391206595, + "grad_norm": 0.09327579999568719, + "learning_rate": 4.334304624246616e-06, + "loss": 0.4659, + "num_tokens": 9804699974.0, + "step": 2352 + }, + { + "epoch": 4.6974768923307515, + "grad_norm": 0.0792833644958866, + "learning_rate": 4.3298460814489395e-06, + "loss": 0.4726, + "num_tokens": 9808852754.0, + "step": 2353 + }, + { + "epoch": 4.699475393454909, + "grad_norm": 0.07989459278266088, + "learning_rate": 4.325417194921557e-06, + "loss": 0.4636, + "num_tokens": 9813035207.0, + "step": 2354 + }, + { + "epoch": 4.701473894579066, + "grad_norm": 0.07716403248029134, + "learning_rate": 4.321017972097582e-06, + "loss": 0.4631, + "num_tokens": 9817219683.0, + "step": 2355 + }, + { + "epoch": 4.703472395703223, + "grad_norm": 0.08914523741851985, + "learning_rate": 4.31664842036034e-06, + "loss": 0.4635, + "num_tokens": 9821402026.0, + "step": 2356 + }, + { + "epoch": 4.7054708968273795, + "grad_norm": 0.09158560519018799, + "learning_rate": 4.312308547043359e-06, + "loss": 0.4534, + "num_tokens": 9825569529.0, + "step": 2357 + }, + { + "epoch": 4.707469397951536, + "grad_norm": 0.07081929655430133, + "learning_rate": 4.307998359430366e-06, + "loss": 0.458, + "num_tokens": 9829753094.0, + "step": 2358 + }, + { + "epoch": 4.709467899075693, + "grad_norm": 0.0867151170606644, + "learning_rate": 4.303717864755255e-06, + "loss": 0.4509, + "num_tokens": 9833939066.0, + "step": 2359 + }, + { + "epoch": 4.71146640019985, + "grad_norm": 0.07867682819094092, + "learning_rate": 4.29946707020209e-06, + "loss": 0.4629, + "num_tokens": 9838099975.0, + "step": 2360 + }, + { + "epoch": 4.713464901324007, + "grad_norm": 0.07139784026877023, + "learning_rate": 4.295245982905087e-06, + "loss": 0.4462, + "num_tokens": 9842283582.0, + "step": 2361 + }, + { + "epoch": 4.715463402448163, + "grad_norm": 0.07805978485814657, + "learning_rate": 4.291054609948605e-06, + "loss": 0.4566, + "num_tokens": 9846468211.0, + "step": 2362 + }, + { + "epoch": 4.717461903572321, + "grad_norm": 0.08044702820446833, + "learning_rate": 4.2868929583671325e-06, + "loss": 0.46, + "num_tokens": 9850629363.0, + "step": 2363 + }, + { + "epoch": 4.719460404696478, + "grad_norm": 0.07501026973706838, + "learning_rate": 4.282761035145278e-06, + "loss": 0.4573, + "num_tokens": 9854801716.0, + "step": 2364 + }, + { + "epoch": 4.721458905820635, + "grad_norm": 0.07987776546061222, + "learning_rate": 4.27865884721775e-06, + "loss": 0.4479, + "num_tokens": 9858986442.0, + "step": 2365 + }, + { + "epoch": 4.723457406944791, + "grad_norm": 0.07393802363327223, + "learning_rate": 4.274586401469355e-06, + "loss": 0.4537, + "num_tokens": 9863172058.0, + "step": 2366 + }, + { + "epoch": 4.725455908068948, + "grad_norm": 0.07789995754567201, + "learning_rate": 4.270543704734985e-06, + "loss": 0.4639, + "num_tokens": 9867315013.0, + "step": 2367 + }, + { + "epoch": 4.727454409193105, + "grad_norm": 0.0759081913506952, + "learning_rate": 4.2665307637995996e-06, + "loss": 0.4583, + "num_tokens": 9871474149.0, + "step": 2368 + }, + { + "epoch": 4.729452910317262, + "grad_norm": 0.07657498329412235, + "learning_rate": 4.262547585398221e-06, + "loss": 0.4524, + "num_tokens": 9875659658.0, + "step": 2369 + }, + { + "epoch": 4.731451411441419, + "grad_norm": 0.08361749340825197, + "learning_rate": 4.258594176215921e-06, + "loss": 0.4515, + "num_tokens": 9879766298.0, + "step": 2370 + }, + { + "epoch": 4.733449912565575, + "grad_norm": 0.0769173451385298, + "learning_rate": 4.254670542887803e-06, + "loss": 0.4812, + "num_tokens": 9883950902.0, + "step": 2371 + }, + { + "epoch": 4.735448413689733, + "grad_norm": 0.08115508396117947, + "learning_rate": 4.250776691999004e-06, + "loss": 0.4693, + "num_tokens": 9888135144.0, + "step": 2372 + }, + { + "epoch": 4.73744691481389, + "grad_norm": 0.08028376653722624, + "learning_rate": 4.24691263008467e-06, + "loss": 0.4541, + "num_tokens": 9892320991.0, + "step": 2373 + }, + { + "epoch": 4.739445415938047, + "grad_norm": 0.07703867409035207, + "learning_rate": 4.243078363629959e-06, + "loss": 0.458, + "num_tokens": 9896487244.0, + "step": 2374 + }, + { + "epoch": 4.741443917062203, + "grad_norm": 0.0779094745041638, + "learning_rate": 4.2392738990700146e-06, + "loss": 0.4584, + "num_tokens": 9900671760.0, + "step": 2375 + }, + { + "epoch": 4.74344241818636, + "grad_norm": 0.07838874617559573, + "learning_rate": 4.2354992427899674e-06, + "loss": 0.4785, + "num_tokens": 9904814385.0, + "step": 2376 + }, + { + "epoch": 4.745440919310517, + "grad_norm": 0.08378754176495713, + "learning_rate": 4.231754401124922e-06, + "loss": 0.4574, + "num_tokens": 9908995919.0, + "step": 2377 + }, + { + "epoch": 4.747439420434674, + "grad_norm": 0.07486717509095892, + "learning_rate": 4.228039380359941e-06, + "loss": 0.4663, + "num_tokens": 9913182725.0, + "step": 2378 + }, + { + "epoch": 4.7494379215588305, + "grad_norm": 0.08058989522882419, + "learning_rate": 4.224354186730037e-06, + "loss": 0.451, + "num_tokens": 9917366633.0, + "step": 2379 + }, + { + "epoch": 4.751436422682987, + "grad_norm": 0.078800427047986, + "learning_rate": 4.220698826420165e-06, + "loss": 0.4565, + "num_tokens": 9921526830.0, + "step": 2380 + }, + { + "epoch": 4.753434923807145, + "grad_norm": 0.08513584662781054, + "learning_rate": 4.217073305565209e-06, + "loss": 0.4665, + "num_tokens": 9925709838.0, + "step": 2381 + }, + { + "epoch": 4.755433424931302, + "grad_norm": 0.07760375131007463, + "learning_rate": 4.213477630249974e-06, + "loss": 0.471, + "num_tokens": 9929878606.0, + "step": 2382 + }, + { + "epoch": 4.7574319260554585, + "grad_norm": 0.08143012930447328, + "learning_rate": 4.2099118065091735e-06, + "loss": 0.4583, + "num_tokens": 9934032943.0, + "step": 2383 + }, + { + "epoch": 4.759430427179615, + "grad_norm": 0.08555038512733447, + "learning_rate": 4.206375840327419e-06, + "loss": 0.4615, + "num_tokens": 9938207038.0, + "step": 2384 + }, + { + "epoch": 4.761428928303772, + "grad_norm": 0.08597792325970649, + "learning_rate": 4.2028697376392175e-06, + "loss": 0.4798, + "num_tokens": 9942361462.0, + "step": 2385 + }, + { + "epoch": 4.763427429427929, + "grad_norm": 0.07912966162624527, + "learning_rate": 4.1993935043289445e-06, + "loss": 0.4546, + "num_tokens": 9946531769.0, + "step": 2386 + }, + { + "epoch": 4.765425930552086, + "grad_norm": 0.07737792340754036, + "learning_rate": 4.195947146230854e-06, + "loss": 0.4711, + "num_tokens": 9950718288.0, + "step": 2387 + }, + { + "epoch": 4.767424431676243, + "grad_norm": 0.08349738815907609, + "learning_rate": 4.192530669129059e-06, + "loss": 0.4647, + "num_tokens": 9954902117.0, + "step": 2388 + }, + { + "epoch": 4.7694229328004, + "grad_norm": 0.08640277924122673, + "learning_rate": 4.189144078757516e-06, + "loss": 0.4684, + "num_tokens": 9959066637.0, + "step": 2389 + }, + { + "epoch": 4.771421433924557, + "grad_norm": 0.08075089387390046, + "learning_rate": 4.18578738080003e-06, + "loss": 0.4589, + "num_tokens": 9963249564.0, + "step": 2390 + }, + { + "epoch": 4.773419935048714, + "grad_norm": 0.07548724792702305, + "learning_rate": 4.182460580890228e-06, + "loss": 0.4668, + "num_tokens": 9967434105.0, + "step": 2391 + }, + { + "epoch": 4.77541843617287, + "grad_norm": 0.08459295943517056, + "learning_rate": 4.179163684611567e-06, + "loss": 0.4681, + "num_tokens": 9971601523.0, + "step": 2392 + }, + { + "epoch": 4.777416937297027, + "grad_norm": 0.0875000796081113, + "learning_rate": 4.175896697497313e-06, + "loss": 0.4614, + "num_tokens": 9975774132.0, + "step": 2393 + }, + { + "epoch": 4.779415438421184, + "grad_norm": 0.08270338467862189, + "learning_rate": 4.172659625030529e-06, + "loss": 0.4637, + "num_tokens": 9979957017.0, + "step": 2394 + }, + { + "epoch": 4.781413939545341, + "grad_norm": 0.08963698196265002, + "learning_rate": 4.169452472644083e-06, + "loss": 0.4659, + "num_tokens": 9984118566.0, + "step": 2395 + }, + { + "epoch": 4.783412440669498, + "grad_norm": 0.08365416277227566, + "learning_rate": 4.166275245720614e-06, + "loss": 0.4642, + "num_tokens": 9988303105.0, + "step": 2396 + }, + { + "epoch": 4.785410941793655, + "grad_norm": 0.07984860470407805, + "learning_rate": 4.163127949592547e-06, + "loss": 0.4686, + "num_tokens": 9992487457.0, + "step": 2397 + }, + { + "epoch": 4.787409442917812, + "grad_norm": 0.07712401496817445, + "learning_rate": 4.160010589542069e-06, + "loss": 0.4569, + "num_tokens": 9996670390.0, + "step": 2398 + }, + { + "epoch": 4.789407944041969, + "grad_norm": 0.077609085478871, + "learning_rate": 4.156923170801123e-06, + "loss": 0.4553, + "num_tokens": 10000856858.0, + "step": 2399 + }, + { + "epoch": 4.791406445166126, + "grad_norm": 0.0827521293168371, + "learning_rate": 4.153865698551406e-06, + "loss": 0.467, + "num_tokens": 10005023939.0, + "step": 2400 + }, + { + "epoch": 4.793404946290282, + "grad_norm": 0.0775744318677009, + "learning_rate": 4.150838177924349e-06, + "loss": 0.4781, + "num_tokens": 10009197844.0, + "step": 2401 + }, + { + "epoch": 4.795403447414439, + "grad_norm": 0.08214959524565223, + "learning_rate": 4.147840614001118e-06, + "loss": 0.4672, + "num_tokens": 10013383582.0, + "step": 2402 + }, + { + "epoch": 4.797401948538596, + "grad_norm": 0.07923636176595424, + "learning_rate": 4.1448730118126e-06, + "loss": 0.4655, + "num_tokens": 10017567233.0, + "step": 2403 + }, + { + "epoch": 4.799400449662753, + "grad_norm": 0.07798111215451825, + "learning_rate": 4.141935376339401e-06, + "loss": 0.4575, + "num_tokens": 10021724431.0, + "step": 2404 + }, + { + "epoch": 4.8013989507869095, + "grad_norm": 0.08330467951857846, + "learning_rate": 4.139027712511823e-06, + "loss": 0.4534, + "num_tokens": 10025908569.0, + "step": 2405 + }, + { + "epoch": 4.803397451911067, + "grad_norm": 0.07788777990498198, + "learning_rate": 4.136150025209881e-06, + "loss": 0.4572, + "num_tokens": 10030095672.0, + "step": 2406 + }, + { + "epoch": 4.805395953035224, + "grad_norm": 0.08324908819039767, + "learning_rate": 4.133302319263264e-06, + "loss": 0.4623, + "num_tokens": 10034281031.0, + "step": 2407 + }, + { + "epoch": 4.807394454159381, + "grad_norm": 0.0787187487139754, + "learning_rate": 4.130484599451352e-06, + "loss": 0.4523, + "num_tokens": 10038436166.0, + "step": 2408 + }, + { + "epoch": 4.8093929552835375, + "grad_norm": 0.08040550896441936, + "learning_rate": 4.127696870503199e-06, + "loss": 0.4762, + "num_tokens": 10042620896.0, + "step": 2409 + }, + { + "epoch": 4.811391456407694, + "grad_norm": 0.08236141623929394, + "learning_rate": 4.124939137097515e-06, + "loss": 0.476, + "num_tokens": 10046807880.0, + "step": 2410 + }, + { + "epoch": 4.813389957531851, + "grad_norm": 0.0899289898772436, + "learning_rate": 4.1222114038626786e-06, + "loss": 0.4626, + "num_tokens": 10050991452.0, + "step": 2411 + }, + { + "epoch": 4.815388458656008, + "grad_norm": 0.07727246891410085, + "learning_rate": 4.119513675376715e-06, + "loss": 0.4506, + "num_tokens": 10055175791.0, + "step": 2412 + }, + { + "epoch": 4.817386959780165, + "grad_norm": 0.08706058540586974, + "learning_rate": 4.11684595616729e-06, + "loss": 0.4809, + "num_tokens": 10059331841.0, + "step": 2413 + }, + { + "epoch": 4.819385460904321, + "grad_norm": 0.07888763723650122, + "learning_rate": 4.114208250711703e-06, + "loss": 0.467, + "num_tokens": 10063515498.0, + "step": 2414 + }, + { + "epoch": 4.821383962028479, + "grad_norm": 0.08946316504688427, + "learning_rate": 4.111600563436884e-06, + "loss": 0.4659, + "num_tokens": 10067700426.0, + "step": 2415 + }, + { + "epoch": 4.823382463152636, + "grad_norm": 0.09256346213375835, + "learning_rate": 4.10902289871938e-06, + "loss": 0.4603, + "num_tokens": 10071883529.0, + "step": 2416 + }, + { + "epoch": 4.825380964276793, + "grad_norm": 0.08258739587791905, + "learning_rate": 4.1064752608853544e-06, + "loss": 0.4739, + "num_tokens": 10076067225.0, + "step": 2417 + }, + { + "epoch": 4.827379465400949, + "grad_norm": 0.07785864085626243, + "learning_rate": 4.10395765421057e-06, + "loss": 0.4631, + "num_tokens": 10080240175.0, + "step": 2418 + }, + { + "epoch": 4.829377966525106, + "grad_norm": 0.09734775047569656, + "learning_rate": 4.101470082920389e-06, + "loss": 0.4622, + "num_tokens": 10084395838.0, + "step": 2419 + }, + { + "epoch": 4.831376467649263, + "grad_norm": 0.07759986630858144, + "learning_rate": 4.09901255118977e-06, + "loss": 0.4564, + "num_tokens": 10088518739.0, + "step": 2420 + }, + { + "epoch": 4.83337496877342, + "grad_norm": 0.07488381115889184, + "learning_rate": 4.096585063143246e-06, + "loss": 0.4675, + "num_tokens": 10092675689.0, + "step": 2421 + }, + { + "epoch": 4.8353734698975765, + "grad_norm": 0.08214629073390511, + "learning_rate": 4.094187622854939e-06, + "loss": 0.4558, + "num_tokens": 10096861450.0, + "step": 2422 + }, + { + "epoch": 4.837371971021733, + "grad_norm": 0.08013402711508429, + "learning_rate": 4.091820234348529e-06, + "loss": 0.4696, + "num_tokens": 10101049582.0, + "step": 2423 + }, + { + "epoch": 4.839370472145891, + "grad_norm": 0.07949703135819265, + "learning_rate": 4.089482901597263e-06, + "loss": 0.455, + "num_tokens": 10105233530.0, + "step": 2424 + }, + { + "epoch": 4.841368973270048, + "grad_norm": 0.0904064588833247, + "learning_rate": 4.087175628523953e-06, + "loss": 0.4567, + "num_tokens": 10109393375.0, + "step": 2425 + }, + { + "epoch": 4.843367474394205, + "grad_norm": 0.08285285417047968, + "learning_rate": 4.0848984190009495e-06, + "loss": 0.4597, + "num_tokens": 10113578730.0, + "step": 2426 + }, + { + "epoch": 4.845365975518361, + "grad_norm": 0.08881125478450773, + "learning_rate": 4.082651276850152e-06, + "loss": 0.4536, + "num_tokens": 10117763251.0, + "step": 2427 + }, + { + "epoch": 4.847364476642518, + "grad_norm": 0.08243327209103994, + "learning_rate": 4.080434205842995e-06, + "loss": 0.4416, + "num_tokens": 10121891155.0, + "step": 2428 + }, + { + "epoch": 4.849362977766675, + "grad_norm": 0.08233532814595196, + "learning_rate": 4.078247209700449e-06, + "loss": 0.4766, + "num_tokens": 10126074740.0, + "step": 2429 + }, + { + "epoch": 4.851361478890832, + "grad_norm": 0.08121511238493269, + "learning_rate": 4.076090292093003e-06, + "loss": 0.467, + "num_tokens": 10130261281.0, + "step": 2430 + }, + { + "epoch": 4.8533599800149885, + "grad_norm": 0.08458460143193895, + "learning_rate": 4.073963456640667e-06, + "loss": 0.4633, + "num_tokens": 10134445665.0, + "step": 2431 + }, + { + "epoch": 4.855358481139145, + "grad_norm": 0.0868502213791275, + "learning_rate": 4.071866706912964e-06, + "loss": 0.4695, + "num_tokens": 10138601691.0, + "step": 2432 + }, + { + "epoch": 4.857356982263303, + "grad_norm": 0.0892366472579914, + "learning_rate": 4.069800046428923e-06, + "loss": 0.4735, + "num_tokens": 10142789060.0, + "step": 2433 + }, + { + "epoch": 4.85935548338746, + "grad_norm": 0.08666039340955368, + "learning_rate": 4.0677634786570715e-06, + "loss": 0.4611, + "num_tokens": 10146965446.0, + "step": 2434 + }, + { + "epoch": 4.8613539845116165, + "grad_norm": 0.08234099270588065, + "learning_rate": 4.065757007015436e-06, + "loss": 0.4605, + "num_tokens": 10151151856.0, + "step": 2435 + }, + { + "epoch": 4.863352485635773, + "grad_norm": 0.07766101770333754, + "learning_rate": 4.063780634871524e-06, + "loss": 0.4618, + "num_tokens": 10155307522.0, + "step": 2436 + }, + { + "epoch": 4.86535098675993, + "grad_norm": 0.08601989105733232, + "learning_rate": 4.0618343655423356e-06, + "loss": 0.4659, + "num_tokens": 10159490449.0, + "step": 2437 + }, + { + "epoch": 4.867349487884087, + "grad_norm": 0.08241728154440774, + "learning_rate": 4.059918202294343e-06, + "loss": 0.4416, + "num_tokens": 10163676270.0, + "step": 2438 + }, + { + "epoch": 4.869347989008244, + "grad_norm": 0.07904732659092921, + "learning_rate": 4.058032148343491e-06, + "loss": 0.453, + "num_tokens": 10167853062.0, + "step": 2439 + }, + { + "epoch": 4.8713464901324, + "grad_norm": 0.0841113907103377, + "learning_rate": 4.056176206855195e-06, + "loss": 0.4599, + "num_tokens": 10172038373.0, + "step": 2440 + }, + { + "epoch": 4.873344991256557, + "grad_norm": 0.08738768333351289, + "learning_rate": 4.054350380944325e-06, + "loss": 0.4583, + "num_tokens": 10176211270.0, + "step": 2441 + }, + { + "epoch": 4.875343492380715, + "grad_norm": 0.08889665980695668, + "learning_rate": 4.052554673675214e-06, + "loss": 0.4615, + "num_tokens": 10180396599.0, + "step": 2442 + }, + { + "epoch": 4.877341993504872, + "grad_norm": 0.0822158552684951, + "learning_rate": 4.050789088061641e-06, + "loss": 0.4525, + "num_tokens": 10184577541.0, + "step": 2443 + }, + { + "epoch": 4.879340494629028, + "grad_norm": 0.0949505838363434, + "learning_rate": 4.049053627066837e-06, + "loss": 0.4584, + "num_tokens": 10188741831.0, + "step": 2444 + }, + { + "epoch": 4.881338995753185, + "grad_norm": 0.07943766794832341, + "learning_rate": 4.047348293603467e-06, + "loss": 0.4595, + "num_tokens": 10192927982.0, + "step": 2445 + }, + { + "epoch": 4.883337496877342, + "grad_norm": 0.07942292215038912, + "learning_rate": 4.045673090533637e-06, + "loss": 0.4599, + "num_tokens": 10197111215.0, + "step": 2446 + }, + { + "epoch": 4.885335998001499, + "grad_norm": 0.0818785796704941, + "learning_rate": 4.044028020668884e-06, + "loss": 0.4555, + "num_tokens": 10201293785.0, + "step": 2447 + }, + { + "epoch": 4.8873344991256555, + "grad_norm": 0.07384879298048826, + "learning_rate": 4.04241308677017e-06, + "loss": 0.4592, + "num_tokens": 10205452739.0, + "step": 2448 + }, + { + "epoch": 4.889333000249812, + "grad_norm": 0.0874592569828942, + "learning_rate": 4.04082829154788e-06, + "loss": 0.4641, + "num_tokens": 10209639852.0, + "step": 2449 + }, + { + "epoch": 4.891331501373969, + "grad_norm": 0.08734336107272321, + "learning_rate": 4.0392736376618135e-06, + "loss": 0.4551, + "num_tokens": 10213810863.0, + "step": 2450 + }, + { + "epoch": 4.893330002498127, + "grad_norm": 0.08622532289861123, + "learning_rate": 4.037749127721191e-06, + "loss": 0.475, + "num_tokens": 10217998386.0, + "step": 2451 + }, + { + "epoch": 4.8953285036222836, + "grad_norm": 0.08027518084688585, + "learning_rate": 4.0362547642846315e-06, + "loss": 0.4549, + "num_tokens": 10222180409.0, + "step": 2452 + }, + { + "epoch": 4.89732700474644, + "grad_norm": 0.08558312082531176, + "learning_rate": 4.034790549860166e-06, + "loss": 0.4723, + "num_tokens": 10226365964.0, + "step": 2453 + }, + { + "epoch": 4.899325505870597, + "grad_norm": 0.08467492704730256, + "learning_rate": 4.033356486905224e-06, + "loss": 0.4514, + "num_tokens": 10230539092.0, + "step": 2454 + }, + { + "epoch": 4.901324006994754, + "grad_norm": 0.07388361412321685, + "learning_rate": 4.031952577826626e-06, + "loss": 0.4558, + "num_tokens": 10234712743.0, + "step": 2455 + }, + { + "epoch": 4.903322508118911, + "grad_norm": 0.08053295726868283, + "learning_rate": 4.030578824980593e-06, + "loss": 0.456, + "num_tokens": 10238844596.0, + "step": 2456 + }, + { + "epoch": 4.9053210092430675, + "grad_norm": 0.08282290200920918, + "learning_rate": 4.029235230672725e-06, + "loss": 0.4679, + "num_tokens": 10243006140.0, + "step": 2457 + }, + { + "epoch": 4.907319510367224, + "grad_norm": 0.0835429043869806, + "learning_rate": 4.027921797158014e-06, + "loss": 0.4573, + "num_tokens": 10247145476.0, + "step": 2458 + }, + { + "epoch": 4.909318011491381, + "grad_norm": 0.08115739199693724, + "learning_rate": 4.026638526640826e-06, + "loss": 0.4549, + "num_tokens": 10251281196.0, + "step": 2459 + }, + { + "epoch": 4.911316512615539, + "grad_norm": 0.08893032316913552, + "learning_rate": 4.025385421274912e-06, + "loss": 0.443, + "num_tokens": 10255409088.0, + "step": 2460 + }, + { + "epoch": 4.9133150137396955, + "grad_norm": 0.07836566984552798, + "learning_rate": 4.024162483163386e-06, + "loss": 0.4644, + "num_tokens": 10259567760.0, + "step": 2461 + }, + { + "epoch": 4.915313514863852, + "grad_norm": 0.08159548898474091, + "learning_rate": 4.0229697143587366e-06, + "loss": 0.4647, + "num_tokens": 10263752082.0, + "step": 2462 + }, + { + "epoch": 4.917312015988009, + "grad_norm": 0.07695496634946732, + "learning_rate": 4.021807116862818e-06, + "loss": 0.4652, + "num_tokens": 10267883470.0, + "step": 2463 + }, + { + "epoch": 4.919310517112166, + "grad_norm": 0.07531459889149043, + "learning_rate": 4.020674692626849e-06, + "loss": 0.4514, + "num_tokens": 10272067221.0, + "step": 2464 + }, + { + "epoch": 4.921309018236323, + "grad_norm": 0.08355734521944173, + "learning_rate": 4.019572443551406e-06, + "loss": 0.4679, + "num_tokens": 10276215612.0, + "step": 2465 + }, + { + "epoch": 4.923307519360479, + "grad_norm": 0.08088046467550143, + "learning_rate": 4.018500371486421e-06, + "loss": 0.4593, + "num_tokens": 10280390352.0, + "step": 2466 + }, + { + "epoch": 4.925306020484636, + "grad_norm": 0.09114788952318058, + "learning_rate": 4.0174584782311794e-06, + "loss": 0.4722, + "num_tokens": 10284576128.0, + "step": 2467 + }, + { + "epoch": 4.927304521608793, + "grad_norm": 0.08148929981397511, + "learning_rate": 4.016446765534319e-06, + "loss": 0.4593, + "num_tokens": 10288758694.0, + "step": 2468 + }, + { + "epoch": 4.929303022732951, + "grad_norm": 0.08058839229509071, + "learning_rate": 4.015465235093821e-06, + "loss": 0.4628, + "num_tokens": 10292921660.0, + "step": 2469 + }, + { + "epoch": 4.931301523857107, + "grad_norm": 0.07812787457671364, + "learning_rate": 4.014513888557014e-06, + "loss": 0.4778, + "num_tokens": 10297094660.0, + "step": 2470 + }, + { + "epoch": 4.933300024981264, + "grad_norm": 0.07896056669299431, + "learning_rate": 4.013592727520567e-06, + "loss": 0.4456, + "num_tokens": 10301273633.0, + "step": 2471 + }, + { + "epoch": 4.935298526105421, + "grad_norm": 0.08160120345720576, + "learning_rate": 4.012701753530487e-06, + "loss": 0.4584, + "num_tokens": 10305428725.0, + "step": 2472 + }, + { + "epoch": 4.937297027229578, + "grad_norm": 0.08025575151179334, + "learning_rate": 4.0118409680821205e-06, + "loss": 0.4654, + "num_tokens": 10309614049.0, + "step": 2473 + }, + { + "epoch": 4.9392955283537345, + "grad_norm": 0.08060128656887039, + "learning_rate": 4.011010372620145e-06, + "loss": 0.4639, + "num_tokens": 10313742423.0, + "step": 2474 + }, + { + "epoch": 4.941294029477891, + "grad_norm": 0.07600650260392076, + "learning_rate": 4.01020996853857e-06, + "loss": 0.4521, + "num_tokens": 10317930797.0, + "step": 2475 + }, + { + "epoch": 4.943292530602048, + "grad_norm": 0.07496063631553203, + "learning_rate": 4.009439757180732e-06, + "loss": 0.4638, + "num_tokens": 10322115564.0, + "step": 2476 + }, + { + "epoch": 4.945291031726205, + "grad_norm": 0.07658162238641406, + "learning_rate": 4.008699739839298e-06, + "loss": 0.4533, + "num_tokens": 10326302810.0, + "step": 2477 + }, + { + "epoch": 4.9472895328503625, + "grad_norm": 0.07710203626266972, + "learning_rate": 4.007989917756261e-06, + "loss": 0.4713, + "num_tokens": 10330436003.0, + "step": 2478 + }, + { + "epoch": 4.949288033974519, + "grad_norm": 0.07900373179743704, + "learning_rate": 4.00731029212293e-06, + "loss": 0.4589, + "num_tokens": 10334591128.0, + "step": 2479 + }, + { + "epoch": 4.951286535098676, + "grad_norm": 0.07640820041929632, + "learning_rate": 4.0066608640799375e-06, + "loss": 0.4532, + "num_tokens": 10338725271.0, + "step": 2480 + }, + { + "epoch": 4.953285036222833, + "grad_norm": 0.08402813578844821, + "learning_rate": 4.006041634717237e-06, + "loss": 0.462, + "num_tokens": 10342911355.0, + "step": 2481 + }, + { + "epoch": 4.95528353734699, + "grad_norm": 0.08253266717068031, + "learning_rate": 4.005452605074097e-06, + "loss": 0.4776, + "num_tokens": 10347078407.0, + "step": 2482 + }, + { + "epoch": 4.9572820384711465, + "grad_norm": 0.08891397207983275, + "learning_rate": 4.0048937761391e-06, + "loss": 0.463, + "num_tokens": 10351237405.0, + "step": 2483 + }, + { + "epoch": 4.959280539595303, + "grad_norm": 0.08104126264947094, + "learning_rate": 4.004365148850141e-06, + "loss": 0.4669, + "num_tokens": 10355422391.0, + "step": 2484 + }, + { + "epoch": 4.96127904071946, + "grad_norm": 0.08757763751757536, + "learning_rate": 4.003866724094433e-06, + "loss": 0.459, + "num_tokens": 10359606140.0, + "step": 2485 + }, + { + "epoch": 4.963277541843617, + "grad_norm": 0.08281192455502377, + "learning_rate": 4.003398502708494e-06, + "loss": 0.4664, + "num_tokens": 10363757171.0, + "step": 2486 + }, + { + "epoch": 4.9652760429677745, + "grad_norm": 0.08892744680495838, + "learning_rate": 4.002960485478149e-06, + "loss": 0.4553, + "num_tokens": 10367926497.0, + "step": 2487 + }, + { + "epoch": 4.967274544091931, + "grad_norm": 0.07917568477291156, + "learning_rate": 4.002552673138536e-06, + "loss": 0.4659, + "num_tokens": 10372099635.0, + "step": 2488 + }, + { + "epoch": 4.969273045216088, + "grad_norm": 0.07458228936403684, + "learning_rate": 4.002175066374095e-06, + "loss": 0.4569, + "num_tokens": 10376284014.0, + "step": 2489 + }, + { + "epoch": 4.971271546340245, + "grad_norm": 0.07944875309646282, + "learning_rate": 4.001827665818576e-06, + "loss": 0.4435, + "num_tokens": 10380469210.0, + "step": 2490 + }, + { + "epoch": 4.973270047464402, + "grad_norm": 0.08177820293827792, + "learning_rate": 4.001510472055027e-06, + "loss": 0.4786, + "num_tokens": 10384655119.0, + "step": 2491 + }, + { + "epoch": 4.975268548588558, + "grad_norm": 0.07787627494658757, + "learning_rate": 4.001223485615804e-06, + "loss": 0.4564, + "num_tokens": 10388839473.0, + "step": 2492 + }, + { + "epoch": 4.977267049712715, + "grad_norm": 0.08218977640007874, + "learning_rate": 4.000966706982564e-06, + "loss": 0.455, + "num_tokens": 10393024601.0, + "step": 2493 + }, + { + "epoch": 4.979265550836873, + "grad_norm": 0.07999511476554591, + "learning_rate": 4.000740136586263e-06, + "loss": 0.4633, + "num_tokens": 10397212347.0, + "step": 2494 + }, + { + "epoch": 4.98126405196103, + "grad_norm": 0.07301417681382097, + "learning_rate": 4.0005437748071655e-06, + "loss": 0.4487, + "num_tokens": 10401397169.0, + "step": 2495 + }, + { + "epoch": 4.983262553085186, + "grad_norm": 0.07943403777895011, + "learning_rate": 4.000377621974822e-06, + "loss": 0.4536, + "num_tokens": 10405566481.0, + "step": 2496 + }, + { + "epoch": 4.985261054209343, + "grad_norm": 0.07938579352612819, + "learning_rate": 4.0002416783680985e-06, + "loss": 0.4636, + "num_tokens": 10409752833.0, + "step": 2497 + }, + { + "epoch": 4.9872595553335, + "grad_norm": 0.08031863132793292, + "learning_rate": 4.000135944215148e-06, + "loss": 0.4356, + "num_tokens": 10413938214.0, + "step": 2498 + }, + { + "epoch": 4.989258056457657, + "grad_norm": 0.07798767591880035, + "learning_rate": 4.000060419693429e-06, + "loss": 0.4535, + "num_tokens": 10418098028.0, + "step": 2499 + }, + { + "epoch": 4.9912565575818135, + "grad_norm": 0.07560476013315333, + "learning_rate": 4.000015104929695e-06, + "loss": 0.4585, + "num_tokens": 10422284019.0, + "step": 2500 + }, + { + "epoch": 4.9912565575818135, + "step": 2500, + "total_flos": 3.886169068471593e+20, + "train_loss": 0.5261877428531647, + "train_runtime": 254522.3147, + "train_samples_per_second": 1.258, + "train_steps_per_second": 0.01 + } + ], + "logging_steps": 1, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.886169068471593e+20, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}